diff --git a/.dockerignore b/.dockerignore new file mode 100644 index 00000000000..4afd9fdf497 --- /dev/null +++ b/.dockerignore @@ -0,0 +1,39 @@ +# OSX leaves these everywhere on SMB shares +._* + +# OSX trash +.DS_Store + +# Eclipse files +.classpath +.project +.settings/** + +# Vim swap files +*.swp + +# Files generated by JetBrains IDEs, e.g. IntelliJ IDEA +.idea/ +*.iml +out/ + +# Vscode files +.vscode/** + +target +dist +tmp +/bin + +# fuzzing hack, see fuzz/cli.rs +fuzz-incremental/ + +# cargo configuration. We presently use this to create custom cargo profiles +# that should not be checked in at this location. +.cargo/ + +# Files generated by tikv-server +/LOCK +/db/ +/last_tikv.toml +/raft/ diff --git a/.github/pull_request_template.md b/.github/pull_request_template.md index 8e4b30c299a..35c561124f5 100644 --- a/.github/pull_request_template.md +++ b/.github/pull_request_template.md @@ -1,10 +1,41 @@ -### What problem does this PR solve? + ### What is changed and how it works? + + +Issue Number: Close #xxx + + +What's Changed: + +```commit-message + +``` + +### Related changes + +- [ ] PR to update `pingcap/docs`/`pingcap/docs-cn`: +- [ ] Need to cherry-pick to the release branch ### Check List @@ -21,18 +52,16 @@ Side effects - [ ] Performance regression: Consumes more Memory - [ ] Breaking backward compatibility -Documentation - -- [ ] Affects user behaviors -- [ ] Contains syntax changes -- [ ] Contains variable changes -- [ ] Contains experimental features -- [ ] Changes MySQL compatibility - ### Release note + +If you don't think this PR needs a release note then fill it with None. +If this PR will be picked to release branch, then a release note is probably required. 
+--> ```release-note -None + ``` diff --git a/Cargo.lock b/Cargo.lock index 9af204564a7..4f24e2523a9 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -31,13 +31,13 @@ checksum = "5d2e7343e7fc9de883d1b0341e0b13970f764c14101234857d2ddafa1cb1cac2" [[package]] name = "afl" -version = "0.6.0" +version = "0.14.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "59206260f98d163b3ca42fb29fe551dbcda1d43cf70a244066b2a0666a8fb2a9" +checksum = "8c80b57a86234ee3e9238f5f2d33d37f8fd5c7ff168c07f2d5147d410e86db33" dependencies = [ - "cc", - "clap", - "rustc_version 0.2.3", + "home", + "libc 0.2.146", + "rustc_version 0.4.0", "xdg", ] @@ -89,9 +89,9 @@ dependencies = [ [[package]] name = "anyhow" -version = "1.0.26" +version = "1.0.75" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7825f6833612eb2414095684fcf6c635becf3ce97fe48cf6421321e93bfbd53c" +checksum = "a4668cab20f66d8d020e1fbc0ebe47217433c1b6c8f2040faf858554e394ace6" [[package]] name = "api_version" @@ -147,7 +147,7 @@ dependencies = [ "lazy_static", "lexical-core", "multiversion", - "num 0.4.0", + "num 0.4.1", "rand 0.8.5", "regex", "serde", @@ -214,17 +214,7 @@ version = "0.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "58982858be7540a465c790b95aaea6710e5139bf8956b1d1344d014fa40100b0" dependencies = [ - "async-stream-impl 0.2.0", - "futures-core", -] - -[[package]] -name = "async-stream" -version = "0.3.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dad5c83079eae9969be7fadefe640a1c566901f05ff91ab221de4b6f68d9507e" -dependencies = [ - "async-stream-impl 0.3.3", + "async-stream-impl", "futures-core", ] @@ -239,17 +229,6 @@ dependencies = [ "syn 1.0.103", ] -[[package]] -name = "async-stream-impl" -version = "0.3.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "10f203db73a71dfa2fb6dd22763990fa26f3d2625a6da2da900d23b87d26be27" -dependencies = [ - "proc-macro2", - "quote", 
- "syn 1.0.103", -] - [[package]] name = "async-trait" version = "0.1.58" @@ -329,51 +308,6 @@ dependencies = [ "uuid 0.8.2", ] -[[package]] -name = "axum" -version = "0.5.17" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "acee9fd5073ab6b045a275b3e709c163dd36c90685219cb21804a147b58dba43" -dependencies = [ - "async-trait", - "axum-core", - "bitflags", - "bytes", - "futures-util", - "http", - "http-body", - "hyper", - "itoa 1.0.1", - "matchit", - "memchr", - "mime", - "percent-encoding", - "pin-project-lite", - "serde", - "sync_wrapper", - "tokio", - "tower", - "tower-http", - "tower-layer", - "tower-service", -] - -[[package]] -name = "axum-core" -version = "0.2.9" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "37e5939e02c56fecd5c017c37df4238c0a839fa76b7f97acdd7efb804fd181cc" -dependencies = [ - "async-trait", - "bytes", - "futures-util", - "http", - "http-body", - "mime", - "tower-layer", - "tower-service", -] - [[package]] name = "azure" version = "0.0.1" @@ -406,7 +340,7 @@ dependencies = [ [[package]] name = "azure_core" version = "0.12.0" -source = "git+https://github.com/Azure/azure-sdk-for-rust#69431158e9d39f2064fe207cf241d3fc748c851c" +source = "git+https://github.com/tikv/azure-sdk-for-rust?branch=release-7.5-fips#e3dc3e02573e60e70f00418255c417aa80b8e26b" dependencies = [ "async-trait", "base64 0.21.0", @@ -432,7 +366,7 @@ dependencies = [ [[package]] name = "azure_identity" version = "0.12.0" -source = "git+https://github.com/Azure/azure-sdk-for-rust#69431158e9d39f2064fe207cf241d3fc748c851c" +source = "git+https://github.com/tikv/azure-sdk-for-rust?branch=release-7.5-fips#e3dc3e02573e60e70f00418255c417aa80b8e26b" dependencies = [ "async-lock", "async-trait", @@ -452,7 +386,7 @@ dependencies = [ [[package]] name = "azure_security_keyvault" version = "0.12.0" -source = "git+https://github.com/Azure/azure-sdk-for-rust#69431158e9d39f2064fe207cf241d3fc748c851c" +source = 
"git+https://github.com/tikv/azure-sdk-for-rust?branch=release-7.5-fips#e3dc3e02573e60e70f00418255c417aa80b8e26b" dependencies = [ "async-trait", "azure_core", @@ -467,20 +401,19 @@ dependencies = [ [[package]] name = "azure_storage" version = "0.12.0" -source = "git+https://github.com/Azure/azure-sdk-for-rust#69431158e9d39f2064fe207cf241d3fc748c851c" +source = "git+https://github.com/tikv/azure-sdk-for-rust?branch=release-7.5-fips#e3dc3e02573e60e70f00418255c417aa80b8e26b" dependencies = [ "RustyXML", "async-trait", "azure_core", "bytes", "futures 0.3.15", - "hmac 0.12.1", "log", "once_cell", + "openssl", "serde", "serde_derive", "serde_json", - "sha2 0.10.6", "time 0.3.20", "url", "uuid 1.2.1", @@ -489,7 +422,7 @@ dependencies = [ [[package]] name = "azure_storage_blobs" version = "0.12.0" -source = "git+https://github.com/Azure/azure-sdk-for-rust#69431158e9d39f2064fe207cf241d3fc748c851c" +source = "git+https://github.com/tikv/azure-sdk-for-rust?branch=release-7.5-fips#e3dc3e02573e60e70f00418255c417aa80b8e26b" dependencies = [ "RustyXML", "azure_core", @@ -535,7 +468,6 @@ dependencies = [ "engine_traits", "error_code", "external_storage", - "external_storage_export", "file_system", "futures 0.3.15", "futures-util", @@ -576,7 +508,6 @@ dependencies = [ "async-compression", "async-trait", "bytes", - "cfg-if 1.0.0", "chrono", "concurrency_manager", "crossbeam", @@ -587,9 +518,7 @@ dependencies = [ "engine_test", "engine_traits", "error_code", - "etcd-client", "external_storage", - "external_storage_export", "fail", "file_system", "futures 0.3.15", @@ -630,7 +559,6 @@ dependencies = [ "tokio", "tokio-stream", "tokio-util", - "tonic", "txn_types", "url", "uuid 0.8.2", @@ -657,6 +585,7 @@ dependencies = [ "collections", "criterion", "crossbeam", + "dashmap", "derive_more", "fail", "file_system", @@ -751,7 +680,7 @@ dependencies = [ "lazy_static", "lazycell", "peeking_take_while", - "prettyplease 0.2.6", + "prettyplease", "proc-macro2", "quote", "regex", @@ -787,15 
+716,6 @@ dependencies = [ "generic-array", ] -[[package]] -name = "block-buffer" -version = "0.10.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3078c7629b62d3f0439517fa394996acacc5cbc91c5a20d8c658e77abd503a71" -dependencies = [ - "generic-array", -] - [[package]] name = "boolinator" version = "2.4.0" @@ -828,9 +748,9 @@ checksum = "cdead85bdec19c194affaeeb670c0e41fe23de31459efd1c174d049269cf02cc" [[package]] name = "byteorder" -version = "1.3.4" +version = "1.4.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "08c48aae112d48ed9f069b33538ea9e3e90aa263cfa3d1c24309612b1f7472de" +checksum = "14c189c53d098945499cdfa7ecc63567cf3886b3332b312a5b4585d8d3a6a610" [[package]] name = "bytes" @@ -1208,15 +1128,6 @@ dependencies = [ "winapi 0.3.9", ] -[[package]] -name = "cpufeatures" -version = "0.2.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "280a9f2d8b3a38871a3c8a46fb80db65e5e5ed97da80c4d08bf27fb63e35e181" -dependencies = [ - "libc 0.2.146", -] - [[package]] name = "cpuid-bool" version = "0.1.2" @@ -1305,7 +1216,7 @@ dependencies = [ "crossbeam-deque", "crossbeam-epoch", "crossbeam-queue", - "crossbeam-utils 0.8.8", + "crossbeam-utils", ] [[package]] @@ -1315,7 +1226,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c2dd04ddaf88237dc3b8d8f9a3c1004b506b54b3313403944054d23c0870c521" dependencies = [ "cfg-if 1.0.0", - "crossbeam-utils 0.8.8", + "crossbeam-utils", ] [[package]] @@ -1326,7 +1237,7 @@ checksum = "ce6fd6f855243022dcecf8702fef0c297d4338e226845fe067f6341ad9fa0cef" dependencies = [ "cfg-if 1.0.0", "crossbeam-epoch", - "crossbeam-utils 0.8.8", + "crossbeam-utils", ] [[package]] @@ -1337,7 +1248,7 @@ checksum = "1145cf131a2c6ba0615079ab6a638f7e1973ac9c2634fcbeaaad6114246efe8c" dependencies = [ "autocfg", "cfg-if 1.0.0", - "crossbeam-utils 0.8.8", + "crossbeam-utils", "lazy_static", "memoffset 0.6.4", "scopeguard", @@ -1350,7 +1261,7 @@ 
source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1f25d8400f4a7a5778f0e4e52384a48cbd9b5c495d110786187fc750075277a2" dependencies = [ "cfg-if 1.0.0", - "crossbeam-utils 0.8.8", + "crossbeam-utils", ] [[package]] @@ -1361,21 +1272,10 @@ checksum = "883a5821d7d079fcf34ac55f27a833ee61678110f6b97637cc74513c0d0b42fc" dependencies = [ "cfg-if 1.0.0", "crossbeam-epoch", - "crossbeam-utils 0.8.8", + "crossbeam-utils", "scopeguard", ] -[[package]] -name = "crossbeam-utils" -version = "0.7.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c3c7c73a2d1e9fc0886a08b93e98eb643461230d5f1925e4036204d5f2e261a8" -dependencies = [ - "autocfg", - "cfg-if 0.1.10", - "lazy_static", -] - [[package]] name = "crossbeam-utils" version = "0.8.8" @@ -1387,23 +1287,13 @@ dependencies = [ ] [[package]] -name = "crypto-common" -version = "0.1.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1bfb12502f3fc46cca1bb51ac28df9d618d813cdc3d2f25b9fe775a34af26bb3" -dependencies = [ - "generic-array", - "typenum", -] - -[[package]] -name = "crypto-mac" -version = "0.10.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4857fd85a0c34b3c3297875b747c1e02e06b6a0ea32dd892d8192b9ce0813ea6" +name = "crypto" +version = "0.0.1" dependencies = [ - "generic-array", - "subtle", + "openssl", + "openssl-sys", + "slog", + "slog-global", ] [[package]] @@ -1465,9 +1355,9 @@ dependencies = [ [[package]] name = "dashmap" -version = "5.1.0" +version = "5.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c0834a35a3fce649144119e18da2a4d8ed12ef3862f47183fd46f625d072d96c" +checksum = "4c8858831f7781322e539ea39e72449c46b059638250c14344fec8d0aa6e539c" dependencies = [ "cfg-if 1.0.0", "num_cpus", @@ -1514,17 +1404,6 @@ dependencies = [ "generic-array", ] -[[package]] -name = "digest" -version = "0.10.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"8168378f4e5023e7218c89c891c0fd8ecdb5e5e4f18cb78f38cf245dd021e76f" -dependencies = [ - "block-buffer 0.10.4", - "crypto-common", - "subtle", -] - [[package]] name = "dirs-next" version = "2.0.0" @@ -1585,8 +1464,8 @@ dependencies = [ "cloud", "crc32fast", "crossbeam", + "crypto", "derive_more", - "engine_traits", "error_code", "fail", "file_system", @@ -1600,7 +1479,6 @@ dependencies = [ "openssl", "prometheus", "protobuf", - "rand 0.8.5", "serde", "serde_derive", "slog", @@ -1641,6 +1519,7 @@ dependencies = [ name = "engine_panic" version = "0.0.1" dependencies = [ + "encryption", "engine_traits", "kvproto", "raft", @@ -1745,6 +1624,7 @@ dependencies = [ "log", "log_wrappers", "online_config", + "openssl", "ordered-float", "panic_hook", "parking_lot 0.12.1", @@ -1816,6 +1696,7 @@ dependencies = [ "log_wrappers", "num_cpus", "online_config", + "openssl", "portable-atomic 0.3.20", "prometheus", "prometheus-static-metric", @@ -1846,6 +1727,7 @@ version = "0.0.1" dependencies = [ "case_macros", "collections", + "encryption", "error_code", "fail", "file_system", @@ -1948,25 +1830,6 @@ dependencies = [ "tikv_alloc", ] -[[package]] -name = "etcd-client" -version = "0.10.2" -source = "git+https://github.com/pingcap/etcd-client?rev=41d393c32a7a7c728550cee1d9a138dafe6f3e27#41d393c32a7a7c728550cee1d9a138dafe6f3e27" -dependencies = [ - "http", - "hyper", - "hyper-openssl", - "openssl", - "prost", - "tokio", - "tokio-stream", - "tonic", - "tonic-build", - "tower", - "tower-service", - "visible", -] - [[package]] name = "event-listener" version = "2.5.1" @@ -1983,44 +1846,6 @@ dependencies = [ [[package]] name = "external_storage" version = "0.0.1" -dependencies = [ - "async-compression", - "async-trait", - "bytes", - "encryption", - "engine_traits", - "fail", - "ffi-support", - "file_system", - "futures 0.3.15", - "futures-executor", - "futures-io", - "futures-util", - "grpcio", - "kvproto", - "lazy_static", - "libloading", - "matches", - "openssl", - "prometheus", - 
"protobuf", - "rand 0.8.5", - "rusoto_core", - "rust-ini", - "slog", - "slog-global", - "structopt", - "tempfile", - "tikv_alloc", - "tikv_util", - "tokio", - "tokio-util", - "url", -] - -[[package]] -name = "external_storage_export" -version = "0.0.1" dependencies = [ "async-compression", "async-trait", @@ -2029,30 +1854,23 @@ dependencies = [ "cloud", "encryption", "engine_traits", - "external_storage", - "ffi-support", "file_system", "futures 0.3.15", - "futures-executor", "futures-io", "futures-util", "gcp", - "grpcio", "kvproto", "lazy_static", - "libc 0.2.146", - "libloading", "matches", - "nix 0.24.1", - "once_cell", - "protobuf", + "openssl", + "prometheus", + "rand 0.8.5", "rust-ini", - "signal-hook", "slog", "slog-global", - "slog-term", "structopt", "tempfile", + "tikv_alloc", "tikv_util", "tokio", "tokio-util", @@ -2085,16 +1903,6 @@ dependencies = [ "instant", ] -[[package]] -name = "ffi-support" -version = "0.4.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f85d4d1be103c0b2d86968f0b0690dc09ac0ba205b90adb0389b552869e5000e" -dependencies = [ - "lazy_static", - "log", -] - [[package]] name = "file_system" version = "0.1.0" @@ -2102,7 +1910,7 @@ dependencies = [ "bcc", "collections", "crc32fast", - "crossbeam-utils 0.8.8", + "crossbeam-utils", "fs2", "lazy_static", "libc 0.2.146", @@ -2179,12 +1987,6 @@ dependencies = [ "syn 1.0.103", ] -[[package]] -name = "fixedbitset" -version = "0.4.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0ce7134b9999ecaf8bcd65542e436736ef32ddca1b3e06094cb6ec5755203b80" - [[package]] name = "flatbuffers" version = "2.1.2" @@ -2231,9 +2033,9 @@ checksum = "00b0228411908ca8685dba7fc2cdd70ec9990a6e753e89b6ac91a84c40fbaf4b" [[package]] name = "form_urlencoded" -version = "1.2.0" +version = "1.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a62bc1cf6f830c2ec14a513a9fb124d0a213a629668a4186f329db21fe045652" +checksum = 
"e13624c2627564efccf4934284bdd98cbaa14e79b0b5a141218e507b3a823456" dependencies = [ "percent-encoding", ] @@ -2712,22 +2514,12 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "644f9158b2f133fd50f5fb3242878846d9eb792e445c893805ff0e3824006e35" [[package]] -name = "hmac" -version = "0.10.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c1441c6b1e930e2817404b5046f1f989899143a12bf92de603b69f4e0aee1e15" -dependencies = [ - "crypto-mac", - "digest 0.9.0", -] - -[[package]] -name = "hmac" -version = "0.12.1" +name = "home" +version = "0.5.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6c49c37c09c17a53d937dfbb742eb3a961d65a994e6bcdcf37e7399d0cc8ab5e" +checksum = "5444c27eef6923071f7ebcc33e3444508466a76f7a2b93da00ed6e19f30c1ddb" dependencies = [ - "digest 0.10.6", + "windows-sys 0.48.0", ] [[package]] @@ -2763,12 +2555,6 @@ dependencies = [ "pin-project-lite", ] -[[package]] -name = "http-range-header" -version = "0.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0bfe8eed0a9285ef776bb792479ea3834e8b94e13d615c2f66d03dd50a435a29" - [[package]] name = "http-types" version = "2.12.0" @@ -2807,6 +2593,18 @@ version = "2.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9a3a5bfb195931eeb336b2a7b4d761daec841b97f947d34394601737a7bba5e4" +[[package]] +name = "hybrid_engine" +version = "0.0.1" +dependencies = [ + "engine_rocks", + "engine_traits", + "region_cache_memory_engine", + "tempfile", + "tikv_util", + "txn_types", +] + [[package]] name = "hyper" version = "0.14.23" @@ -2849,18 +2647,6 @@ dependencies = [ "tower-layer", ] -[[package]] -name = "hyper-timeout" -version = "0.4.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bbb958482e8c7be4bc3cf272a766a2b0bf1a6755e7a6ae777f017a31d11b13b1" -dependencies = [ - "hyper", - "pin-project-lite", - "tokio", - "tokio-io-timeout", -] - 
[[package]] name = "hyper-tls" version = "0.5.0" @@ -2882,9 +2668,9 @@ checksum = "b9e0384b61958566e926dc50660321d12159025e767c18e043daf26b70104c39" [[package]] name = "idna" -version = "0.4.0" +version = "0.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7d20d6b07bfbc108882d88ed8e37d39636dcc260e15e30c45e6ba089610b917c" +checksum = "634d9b1461af396cad843f47fdba5597a4f9e6ddd4bfb6ff5d85028c25cb12f6" dependencies = [ "unicode-bidi", "unicode-normalization", @@ -2908,9 +2694,9 @@ dependencies = [ [[package]] name = "indexmap" -version = "2.0.0" +version = "2.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d5477fe2230a79769d8dc68e0eabf5437907c0457a5614a9e8dddb67f65eb65d" +checksum = "ad227c3af19d4914570ad36d30409928b75967c298feb9ea1969db3a610bb14e" dependencies = [ "equivalent", "hashbrown 0.14.0", @@ -3099,7 +2885,7 @@ dependencies = [ [[package]] name = "kvproto" version = "0.0.2" -source = "git+https://github.com/pingcap/kvproto.git#7b612d935bf96f9daf7a537db379bcc88b4644e0" +source = "git+https://github.com/pingcap/kvproto.git#96c40585233f176393213dbd4c04d76259bad8f9" dependencies = [ "futures 0.3.15", "grpcio", @@ -3122,9 +2908,9 @@ checksum = "830d08ce1d1d941e6b30645f1a0eb5643013d835ce3779a5fc208261dbe10f55" [[package]] name = "lexical-core" -version = "0.8.3" +version = "0.8.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "92912c4af2e7d9075be3e5e3122c4d7263855fa6cce34fbece4dd08e5884624d" +checksum = "2cde5de06e8d4c2faabc400238f9ae1c74d5412d03a7bd067645ccbc47070e46" dependencies = [ "lexical-parse-float", "lexical-parse-integer", @@ -3135,9 +2921,9 @@ dependencies = [ [[package]] name = "lexical-parse-float" -version = "0.8.3" +version = "0.8.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f518eed87c3be6debe6d26b855c97358d8a11bf05acec137e5f53080f5ad2dd8" +checksum = "683b3a5ebd0130b8fb52ba0bdc718cc56815b6a097e28ae5a6997d0ad17dc05f" 
dependencies = [ "lexical-parse-integer", "lexical-util", @@ -3146,9 +2932,9 @@ dependencies = [ [[package]] name = "lexical-parse-integer" -version = "0.8.3" +version = "0.8.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "afc852ec67c6538bbb2b9911116a385b24510e879a69ab516e6a151b15a79168" +checksum = "6d0994485ed0c312f6d965766754ea177d07f9c00c9b82a5ee62ed5b47945ee9" dependencies = [ "lexical-util", "static_assertions", @@ -3156,18 +2942,18 @@ dependencies = [ [[package]] name = "lexical-util" -version = "0.8.3" +version = "0.8.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c72a9d52c5c4e62fa2cdc2cb6c694a39ae1382d9c2a17a466f18e272a0930eb1" +checksum = "5255b9ff16ff898710eb9eb63cb39248ea8a5bb036bea8085b1a767ff6c4e3fc" dependencies = [ "static_assertions", ] [[package]] name = "lexical-write-float" -version = "0.8.4" +version = "0.8.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8a89ec1d062e481210c309b672f73a0567b7855f21e7d2fae636df44d12e97f9" +checksum = "accabaa1c4581f05a3923d1b4cfd124c329352288b7b9da09e766b0668116862" dependencies = [ "lexical-util", "lexical-write-integer", @@ -3176,9 +2962,9 @@ dependencies = [ [[package]] name = "lexical-write-integer" -version = "0.8.3" +version = "0.8.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "094060bd2a7c2ff3a16d5304a6ae82727cb3cc9d1c70f813cc73f744c319337e" +checksum = "e1b6f3d1f4422866b68192d62f77bc5c700bee84f3069f2469d7bc8c77852446" dependencies = [ "lexical-util", "static_assertions", @@ -3228,7 +3014,7 @@ dependencies = [ [[package]] name = "librocksdb_sys" version = "0.1.0" -source = "git+https://github.com/tikv/rust-rocksdb.git#0c78f4072d766b152e83b25d3068b5c72b5feca1" +source = "git+https://github.com/tikv/rust-rocksdb.git#c4b7047314a9b27926a1b7b25d2e6d1a37a48d2b" dependencies = [ "bindgen 0.65.1", "bzip2-sys", @@ -3247,7 +3033,7 @@ dependencies = [ [[package]] name = "libtitan_sys" 
version = "0.0.1" -source = "git+https://github.com/tikv/rust-rocksdb.git#0c78f4072d766b152e83b25d3068b5c72b5feca1" +source = "git+https://github.com/tikv/rust-rocksdb.git#c4b7047314a9b27926a1b7b25d2e6d1a37a48d2b" dependencies = [ "bzip2-sys", "cc", @@ -3317,9 +3103,9 @@ dependencies = [ [[package]] name = "lz4-sys" -version = "1.9.2" +version = "1.9.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dca79aa95d8b3226213ad454d328369853be3a1382d89532a854f4d69640acae" +checksum = "57d27b317e207b10f69f5e75494119e391a96f48861ae870d1da6edac98ca900" dependencies = [ "cc", "libc 0.2.146", @@ -3348,23 +3134,6 @@ version = "0.1.8" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7ffc5c5338469d4d3ea17d269fa8ea3512ad247247c30bd2df69e68309ed0a08" -[[package]] -name = "matchit" -version = "0.5.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "73cbba799671b762df5a175adf59ce145165747bb891505c43d09aefbbf38beb" - -[[package]] -name = "md-5" -version = "0.9.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7b5a279bb9607f9f53c22d496eade00d138d1bdcccd07d74650387cf94942a15" -dependencies = [ - "block-buffer 0.9.0", - "digest 0.9.0", - "opaque-debug", -] - [[package]] name = "md5" version = "0.7.0" @@ -3392,9 +3161,9 @@ dependencies = [ [[package]] name = "memmap2" -version = "0.5.3" +version = "0.5.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "057a3db23999c867821a7a59feb06a578fcb03685e983dff90daf9e7d24ac08f" +checksum = "83faa42c0a078c393f6b29d5db232d8be22776a891f8f56e5284faee4a20b327" dependencies = [ "libc 0.2.146", ] @@ -3575,6 +3344,7 @@ dependencies = [ "kvproto", "lazy_static", "log_wrappers", + "openssl", "pd_client", "protobuf", "proxy_server", @@ -3810,23 +3580,23 @@ dependencies = [ [[package]] name = "num" -version = "0.4.0" +version = "0.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"43db66d1170d347f9a065114077f7dccb00c1b9478c89384490a3425279a4606" +checksum = "b05180d69e3da0e530ba2a1dae5110317e49e3b7f3d41be227dc5f92e49ee7af" dependencies = [ "num-bigint", - "num-complex 0.4.1", + "num-complex 0.4.4", "num-integer", "num-iter", - "num-rational 0.4.0", + "num-rational 0.4.1", "num-traits", ] [[package]] name = "num-bigint" -version = "0.4.3" +version = "0.4.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f93ab6289c7b344a8a9f60f88d80aa20032336fe78da341afc91c8a2341fc75f" +checksum = "608e7659b5c3d7cba262d894801b9ec9d00de989e8a82bd4bef91d08da45cdc0" dependencies = [ "autocfg", "num-integer", @@ -3844,9 +3614,9 @@ dependencies = [ [[package]] name = "num-complex" -version = "0.4.1" +version = "0.4.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "97fbc387afefefd5e9e39493299f3069e14a140dd34dc19b4c1c1a8fddb6a790" +checksum = "1ba157ca0885411de85d6ca030ba7e2a83a28636056c7c699b07c8b6f7383214" dependencies = [ "num-traits", ] @@ -3862,6 +3632,17 @@ dependencies = [ "syn 1.0.103", ] +[[package]] +name = "num-derive" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9e6a0fd4f737c707bd9086cc16c925f294943eb62eb71499e9fd4cf71f8b9f4e" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.18", +] + [[package]] name = "num-format" version = "0.4.0" @@ -3874,9 +3655,9 @@ dependencies = [ [[package]] name = "num-integer" -version = "0.1.44" +version = "0.1.45" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d2cc698a63b549a70bc047073d2949cce27cd1c7b0a4a862d08a8031bc2801db" +checksum = "225d3389fb3509a24c93f5c29eb6bde2586b98d9f016636dff58d7c6f7569cd9" dependencies = [ "autocfg", "num-traits", @@ -3884,9 +3665,9 @@ dependencies = [ [[package]] name = "num-iter" -version = "0.1.42" +version = "0.1.43" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"b2021c8337a54d21aca0d59a92577a029af9431cb59b909b03252b9c164fad59" +checksum = "7d03e6c028c5dc5cac6e2dec0efda81fc887605bb3d884578bb6d6bf7514e252" dependencies = [ "autocfg", "num-integer", @@ -3906,9 +3687,9 @@ dependencies = [ [[package]] name = "num-rational" -version = "0.4.0" +version = "0.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d41702bd167c2df5520b384281bc111a4b5efcf7fbc4c9c222c815b07e0a6a6a" +checksum = "0638a1c9d0a3c0914158145bc76cff373a75a627e6ecbfb71cbe6f453a5a19b0" dependencies = [ "autocfg", "num-bigint", @@ -3918,9 +3699,9 @@ dependencies = [ [[package]] name = "num-traits" -version = "0.2.14" +version = "0.2.17" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9a64b1ec5cda2586e284722486d802acf1f7dbdc623e2bfc57e65ca1cd099290" +checksum = "39e3200413f237f41ab11ad6d161bc7239c84dcb631773ccd7de3dfe4b5c267c" dependencies = [ "autocfg", ] @@ -3958,7 +3739,7 @@ dependencies = [ "serde", "serde_json", "serde_path_to_error", - "sha2 0.9.1", + "sha2", "thiserror", "url", ] @@ -3982,6 +3763,7 @@ checksum = "86f0b0d4bf799edbc74508c1e8bf170ff5f41238e5f8225603ca7caaae2b7860" name = "online_config" version = "0.1.0" dependencies = [ + "chrono", "online_config_derive", "serde", "serde_derive", @@ -4199,9 +3981,9 @@ checksum = "19b17cddbe7ec3f8bc800887bab5e717348c95ea2ca0b1bf0837fb964dc67099" [[package]] name = "percent-encoding" -version = "2.3.0" +version = "2.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9b2a4787296e9989611394c33f193f676704af1686e70b8f8033ab5ba9a35a94" +checksum = "e3148f5046208a5d56bcfc03053e3ca6334e51da8dfb19b6cdc8b306fae3283e" [[package]] name = "perfcnt" @@ -4227,16 +4009,6 @@ dependencies = [ "ucd-trie", ] -[[package]] -name = "petgraph" -version = "0.6.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4a13a2fa9d0b63e5f22328828741e523766fff0ee9e779316902290dff3f824f" -dependencies = [ - "fixedbitset", - 
"indexmap 1.6.2", -] - [[package]] name = "phf" version = "0.9.0" @@ -4377,14 +4149,14 @@ version = "0.3.20" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e30165d31df606f5726b090ec7592c308a0eaf61721ff64c9a3018e344a8753e" dependencies = [ - "portable-atomic 1.4.2", + "portable-atomic 1.6.0", ] [[package]] name = "portable-atomic" -version = "1.4.2" +version = "1.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f32154ba0af3a075eefa1eda8bb414ee928f62303a54ea85b8d6638ff1a6ee9e" +checksum = "7170ef9988bc169ba16dd36a7fa041e5c4cbeb6a35b76d4c03daded371eae7c0" [[package]] name = "pprof" @@ -4415,16 +4187,6 @@ version = "0.2.10" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ac74c624d6b2d21f425f752262f42188365d7b8ff1aff74c82e45136510a4857" -[[package]] -name = "prettyplease" -version = "0.1.21" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c142c0e46b57171fe0c528bee8c5b7569e80f0c17e377cd0e30ea57dbc11bb51" -dependencies = [ - "proc-macro2", - "syn 1.0.103", -] - [[package]] name = "prettyplease" version = "0.2.6" @@ -4564,61 +4326,6 @@ dependencies = [ "syn 1.0.103", ] -[[package]] -name = "prost" -version = "0.11.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a0841812012b2d4a6145fae9a6af1534873c32aa67fff26bd09f8fa42c83f95a" -dependencies = [ - "bytes", - "prost-derive", -] - -[[package]] -name = "prost-build" -version = "0.11.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1d8b442418ea0822409d9e7d047cbf1e7e9e1760b172bf9982cf29d517c93511" -dependencies = [ - "bytes", - "heck 0.4.1", - "itertools", - "lazy_static", - "log", - "multimap", - "petgraph", - "prettyplease 0.1.21", - "prost", - "prost-types", - "regex", - "syn 1.0.103", - "tempfile", - "which 4.2.4", -] - -[[package]] -name = "prost-derive" -version = "0.11.2" -source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "164ae68b6587001ca506d3bf7f1000bfa248d0e1217b618108fba4ec1d0cc306" -dependencies = [ - "anyhow", - "itertools", - "proc-macro2", - "quote", - "syn 1.0.103", -] - -[[package]] -name = "prost-types" -version = "0.11.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "747761bc3dc48f9a34553bf65605cf6cb6288ba219f3450b4275dbd81539551a" -dependencies = [ - "bytes", - "prost", -] - [[package]] name = "protobuf" version = "2.8.0" @@ -4687,6 +4394,7 @@ dependencies = [ "keys", "kvproto", "lazy_static", + "openssl", "pd_client", "protobuf", "raftstore", @@ -4707,7 +4415,7 @@ name = "proxy_server" version = "0.0.1" dependencies = [ "api_version", - "async-stream 0.2.0", + "async-stream", "backup", "backup-stream", "causal_ts", @@ -4799,6 +4507,7 @@ dependencies = [ "criterion-perf-events", "crossbeam", "encryption", + "encryption_export", "engine_rocks", "engine_rocks_helper", "engine_store_ffi", @@ -4806,7 +4515,6 @@ dependencies = [ "engine_tiflash", "engine_traits", "error_code", - "external_storage_export", "fail", "file_system", "futures 0.3.15", @@ -4901,7 +4609,7 @@ dependencies = [ [[package]] name = "raft" version = "0.7.0" -source = "git+https://github.com/tikv/raft-rs?branch=master#9d360a3b0cdb691da8e500a4f73c457b605a1d73" +source = "git+https://github.com/tikv/raft-rs?branch=master#f60fb9e143e5b93f7db8917ea376cda04effcbb4" dependencies = [ "bytes", "fxhash", @@ -4915,8 +4623,8 @@ dependencies = [ [[package]] name = "raft-engine" -version = "0.3.0" -source = "git+https://github.com/tikv/raft-engine.git#de3ad04a2db9cdf795b1c82d7413b9b53bac92a8" +version = "0.4.1" +source = "git+https://github.com/tikv/raft-engine.git#fa56f891fdf0b1cb5b7849b7bee3c5dadbb96103" dependencies = [ "byteorder", "crc32fast", @@ -4932,13 +4640,14 @@ dependencies = [ "lz4-sys", "memmap2 0.7.0", "nix 0.26.2", - "num-derive", + "num-derive 0.4.0", "num-traits", "parking_lot 0.12.1", "prometheus", 
"prometheus-static-metric", "protobuf", "rayon", + "rhai", "scopeguard", "serde", "serde_repr", @@ -4949,7 +4658,7 @@ dependencies = [ [[package]] name = "raft-proto" version = "0.7.0" -source = "git+https://github.com/tikv/raft-rs?branch=master#9d360a3b0cdb691da8e500a4f73c457b605a1d73" +source = "git+https://github.com/tikv/raft-rs?branch=master#f60fb9e143e5b93f7db8917ea376cda04effcbb4" dependencies = [ "bytes", "protobuf", @@ -4990,6 +4699,7 @@ dependencies = [ "byteorder", "bytes", "causal_ts", + "chrono", "collections", "concurrency_manager", "crc32fast", @@ -5009,6 +4719,7 @@ dependencies = [ "futures-util", "getset", "grpcio-health", + "hybrid_engine", "into_other", "itertools", "keys", @@ -5029,6 +4740,7 @@ dependencies = [ "raft", "raft-proto", "rand 0.8.5", + "region_cache_memory_engine", "resource_control", "resource_metering", "serde", @@ -5264,7 +4976,7 @@ checksum = "258bcdb5ac6dad48491bb2992db6b7cf74878b0384908af124823d118c99683f" dependencies = [ "crossbeam-channel", "crossbeam-deque", - "crossbeam-utils 0.8.8", + "crossbeam-utils", "num_cpus", ] @@ -5328,6 +5040,17 @@ version = "0.6.26" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "49b3de9ec5dc0a3417da371aab17d729997c15010e7fd24ff707773a33bddb64" +[[package]] +name = "region_cache_memory_engine" +version = "0.0.1" +dependencies = [ + "bytes", + "collections", + "engine_traits", + "skiplist-rs", + "tikv_util", +] + [[package]] name = "remove_dir_all" version = "0.5.2" @@ -5483,24 +5206,35 @@ dependencies = [ ] [[package]] -name = "ring" -version = "0.16.16" +name = "rhai" +version = "1.7.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b72b84d47e8ec5a4f2872e8262b8f8256c5be1c938a7d6d3a867a3ba8f722f74" +checksum = "9f06953bb8b9e4307cb7ccc0d9d018e2ddd25a30d32831f631ce4fe8f17671f7" dependencies = [ - "cc", - "libc 0.2.146", - "once_cell", - "spin", - "untrusted", - "web-sys", - "winapi 0.3.9", + "ahash 0.7.4", + "bitflags", + "instant", + 
"num-traits", + "rhai_codegen", + "smallvec", + "smartstring", +] + +[[package]] +name = "rhai_codegen" +version = "1.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "75a39bc2aa9258b282ee5518dac493491a9c4c11a6d7361b9d2644c922fc6488" +dependencies = [ + "proc-macro2", + "quote", + "syn 1.0.103", ] [[package]] name = "rocksdb" version = "0.3.0" -source = "git+https://github.com/tikv/rust-rocksdb.git#0c78f4072d766b152e83b25d3068b5c72b5feca1" +source = "git+https://github.com/tikv/rust-rocksdb.git#c4b7047314a9b27926a1b7b25d2e6d1a37a48d2b" dependencies = [ "libc 0.2.146", "librocksdb_sys", @@ -5509,7 +5243,7 @@ dependencies = [ [[package]] name = "rusoto_core" version = "0.46.0" -source = "git+https://github.com/tikv/rusoto?branch=gh1482-s3-addr-styles#0d6df7b119c4e757daaa715f261c3150c7ae0a3b" +source = "git+https://github.com/tikv/rusoto?branch=gh1482-s3-addr-styles#cc733208600bdb15a13940d6930c1fbd4ab604f2" dependencies = [ "async-trait", "base64 0.13.0", @@ -5533,7 +5267,7 @@ dependencies = [ [[package]] name = "rusoto_credential" version = "0.46.0" -source = "git+https://github.com/tikv/rusoto?branch=gh1482-s3-addr-styles#0d6df7b119c4e757daaa715f261c3150c7ae0a3b" +source = "git+https://github.com/tikv/rusoto?branch=gh1482-s3-addr-styles#cc733208600bdb15a13940d6930c1fbd4ab604f2" dependencies = [ "async-trait", "chrono", @@ -5550,7 +5284,7 @@ dependencies = [ [[package]] name = "rusoto_kms" version = "0.46.0" -source = "git+https://github.com/tikv/rusoto?branch=gh1482-s3-addr-styles#0d6df7b119c4e757daaa715f261c3150c7ae0a3b" +source = "git+https://github.com/tikv/rusoto?branch=gh1482-s3-addr-styles#cc733208600bdb15a13940d6930c1fbd4ab604f2" dependencies = [ "async-trait", "bytes", @@ -5563,7 +5297,7 @@ dependencies = [ [[package]] name = "rusoto_mock" version = "0.46.0" -source = "git+https://github.com/tikv/rusoto?branch=gh1482-s3-addr-styles#0d6df7b119c4e757daaa715f261c3150c7ae0a3b" +source = 
"git+https://github.com/tikv/rusoto?branch=gh1482-s3-addr-styles#cc733208600bdb15a13940d6930c1fbd4ab604f2" dependencies = [ "async-trait", "chrono", @@ -5577,7 +5311,7 @@ dependencies = [ [[package]] name = "rusoto_s3" version = "0.46.0" -source = "git+https://github.com/tikv/rusoto?branch=gh1482-s3-addr-styles#0d6df7b119c4e757daaa715f261c3150c7ae0a3b" +source = "git+https://github.com/tikv/rusoto?branch=gh1482-s3-addr-styles#cc733208600bdb15a13940d6930c1fbd4ab604f2" dependencies = [ "async-trait", "bytes", @@ -5591,32 +5325,29 @@ dependencies = [ [[package]] name = "rusoto_signature" version = "0.46.0" -source = "git+https://github.com/tikv/rusoto?branch=gh1482-s3-addr-styles#0d6df7b119c4e757daaa715f261c3150c7ae0a3b" +source = "git+https://github.com/tikv/rusoto?branch=gh1482-s3-addr-styles#cc733208600bdb15a13940d6930c1fbd4ab604f2" dependencies = [ "base64 0.13.0", "bytes", "chrono", - "digest 0.9.0", "futures 0.3.15", "hex 0.4.2", - "hmac 0.10.1", "http", "hyper", "log", - "md-5", + "openssl", "percent-encoding", "pin-project-lite", "rusoto_credential", "rustc_version 0.3.3", "serde", - "sha2 0.9.1", "tokio", ] [[package]] name = "rusoto_sts" version = "0.46.0" -source = "git+https://github.com/tikv/rusoto?branch=gh1482-s3-addr-styles#0d6df7b119c4e757daaa715f261c3150c7ae0a3b" +source = "git+https://github.com/tikv/rusoto?branch=gh1482-s3-addr-styles#cc733208600bdb15a13940d6930c1fbd4ab604f2" dependencies = [ "async-trait", "bytes", @@ -5956,6 +5687,7 @@ dependencies = [ "grpcio", "grpcio-health", "hex 0.4.2", + "hybrid_engine", "keys", "kvproto", "libc 0.2.146", @@ -5968,7 +5700,7 @@ dependencies = [ "raft_log_engine", "raftstore", "raftstore-v2", - "rand 0.8.5", + "region_cache_memory_engine", "resolved_ts", "resource_control", "resource_metering", @@ -6004,24 +5736,13 @@ version = "0.9.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2933378ddfeda7ea26f48c555bdad8bb446bf8a3d17832dc83e380d444cfb8c1" dependencies = [ - "block-buffer 
0.9.0", + "block-buffer", "cfg-if 0.1.10", "cpuid-bool", - "digest 0.9.0", + "digest", "opaque-debug", ] -[[package]] -name = "sha2" -version = "0.10.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "82e6b795fe2e3b1e845bafcb27aa35405c4d47cdfc92af5fc8d3002f76cebdc0" -dependencies = [ - "cfg-if 1.0.0", - "cpufeatures", - "digest 0.10.6", -] - [[package]] name = "shlex" version = "0.1.1" @@ -6069,6 +5790,16 @@ version = "0.3.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "fa8f3741c7372e75519bd9346068370c9cdaabcc1f9599cbcf2a2719352286b7" +[[package]] +name = "skiplist-rs" +version = "0.1.0" +source = "git+https://github.com/tikv/skiplist-rs.git?branch=main#618af619d9348ef89eaa71c5f6fbddbd9a5c09bf" +dependencies = [ + "bytes", + "rand 0.8.5", + "slog", +] + [[package]] name = "slab" version = "0.4.2" @@ -6146,6 +5877,17 @@ version = "1.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f2dd574626839106c320a323308629dcb1acfc96e32a8cba364ddc61ac23ee83" +[[package]] +name = "smartstring" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3fb72c633efbaa2dd666986505016c32c3044395ceaf881518399d2f4127ee29" +dependencies = [ + "autocfg", + "static_assertions", + "version_check 0.9.4", +] + [[package]] name = "snap_recovery" version = "0.1.0" @@ -6219,12 +5961,6 @@ dependencies = [ "winapi 0.3.9", ] -[[package]] -name = "spin" -version = "0.5.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6e63cff320ae2c57904679ba7cb63280a3dc4613885beafb148ee7bf9aa9042d" - [[package]] name = "sst_importer" version = "0.1.0" @@ -6238,7 +5974,7 @@ dependencies = [ "engine_test", "engine_traits", "error_code", - "external_storage_export", + "external_storage", "file_system", "futures 0.3.15", "futures-util", @@ -6364,12 +6100,6 @@ dependencies = [ "syn 2.0.18", ] -[[package]] -name = "subtle" -version = "2.4.1" -source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "6bdef32e8150c2a081110b42772ffe7d7c9032b606bc226c8260fd97e0976601" - [[package]] name = "symbolic-common" version = "10.1.1" @@ -6377,7 +6107,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ac457d054f793cedfde6f32d21d692b8351cfec9084fefd0470c0373f6d799bc" dependencies = [ "debugid", - "memmap2 0.5.3", + "memmap2 0.5.10", "stable_deref_trait", "uuid 1.2.1", ] @@ -6464,15 +6194,14 @@ dependencies = [ [[package]] name = "tame-oauth" version = "0.4.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9435c9348e480fad0f2215d5602e2dfad03df8a6398c4e7ceaeaa42758f26a8a" +source = "git+https://github.com/tikv/tame-oauth?branch=fips#176e3c69e9b5cd04b4248824ae6ee38ef57385be" dependencies = [ "base64 0.13.0", "chrono", "http", "lock_api", + "openssl", "parking_lot 0.11.1", - "ring", "serde", "serde_json", "twox-hash", @@ -6547,8 +6276,9 @@ dependencies = [ "collections", "concurrency_manager", "crc64fast", + "engine_rocks", "engine_traits", - "external_storage_export", + "external_storage", "file_system", "futures 0.3.15", "futures-executor", @@ -6646,6 +6376,7 @@ dependencies = [ "futures 0.3.15", "grpcio", "grpcio-health", + "hybrid_engine", "keys", "kvproto", "lazy_static", @@ -6655,6 +6386,7 @@ dependencies = [ "raft", "raftstore", "rand 0.8.5", + "region_cache_memory_engine", "resolved_ts", "resource_control", "resource_metering", @@ -6749,6 +6481,7 @@ version = "0.0.1" dependencies = [ "api_version", "collections", + "engine_rocks", "futures 0.3.15", "kvproto", "pd_client", @@ -6785,7 +6518,6 @@ name = "tests" version = "0.0.1" dependencies = [ "api_version", - "arrow", "async-trait", "batch-system", "byteorder", @@ -6804,7 +6536,7 @@ dependencies = [ "engine_test", "engine_traits", "error_code", - "external_storage_export", + "external_storage", "fail", "file_system", "futures 0.3.15", @@ -6979,7 +6711,7 @@ dependencies = [ "match-template", 
"nom 7.1.0", "num 0.3.0", - "num-derive", + "num-derive 0.3.0", "num-traits", "ordered-float", "protobuf", @@ -7035,6 +6767,7 @@ dependencies = [ "byteorder", "chrono", "codec", + "crypto", "file_system", "flate2", "hex 0.4.2", @@ -7046,7 +6779,6 @@ dependencies = [ "panic_hook", "profiler", "protobuf", - "rand 0.8.5", "regex", "safemem", "serde", @@ -7065,11 +6797,11 @@ dependencies = [ [[package]] name = "tikv" -version = "7.4.0-alpha" +version = "7.6.0-alpha" dependencies = [ "anyhow", "api_version", - "async-stream 0.2.0", + "async-stream", "async-trait", "backtrace", "batch-system", @@ -7084,6 +6816,7 @@ dependencies = [ "crc32fast", "crc64fast", "crossbeam", + "crypto", "dashmap", "encryption_export", "engine_panic", @@ -7106,6 +6839,7 @@ dependencies = [ "grpcio-health", "hex 0.4.2", "http", + "hybrid_engine", "hyper", "hyper-openssl", "hyper-tls", @@ -7147,6 +6881,7 @@ dependencies = [ "raftstore-v2", "rand 0.7.3", "regex", + "region_cache_memory_engine", "reqwest", "resource_control", "resource_metering", @@ -7247,6 +6982,7 @@ version = "0.1.0" dependencies = [ "backtrace", "collections", + "encryption", "engine_panic", "engine_rocks", "engine_test", @@ -7328,6 +7064,7 @@ dependencies = [ "slog-global", "slog-json", "slog-term", + "strum 0.20.0", "sysinfo", "tempfile", "thiserror", @@ -7450,24 +7187,13 @@ dependencies = [ [[package]] name = "tokio-executor" -version = "0.1.10" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fb2d1b8f4548dbf5e1f7818512e9c406860678f29c300cdf0ebac72d1a3a1671" +version = "0.1.9" +source = "git+https://github.com/tikv/tokio?branch=tokio-timer-hotfix#4394380fa3c1f7f2c702a4ccc5ff01384746fdfd" dependencies = [ - "crossbeam-utils 0.7.2", + "crossbeam-utils", "futures 0.1.31", ] -[[package]] -name = "tokio-io-timeout" -version = "1.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "30b74022ada614a1b4834de765f9bb43877f910cc8ce4be40e89042c9223a8bf" -dependencies = [ - 
"pin-project-lite", - "tokio", -] - [[package]] name = "tokio-macros" version = "1.7.0" @@ -7515,9 +7241,9 @@ dependencies = [ [[package]] name = "tokio-timer" version = "0.2.13" -source = "git+https://github.com/tikv/tokio?branch=tokio-timer-hotfix#e8ac149d93f4a9bf49ea569d8d313ee40c5eb448" +source = "git+https://github.com/tikv/tokio?branch=tokio-timer-hotfix#4394380fa3c1f7f2c702a4ccc5ff01384746fdfd" dependencies = [ - "crossbeam-utils 0.7.2", + "crossbeam-utils", "futures 0.1.31", "slab", "tokio-executor", @@ -7549,9 +7275,9 @@ dependencies = [ [[package]] name = "toml_datetime" -version = "0.6.3" +version = "0.6.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7cda73e2f1397b1262d6dfdcef8aafae14d1de7748d66822d3bfeeb6d03e5e4b" +checksum = "3550f4e9685620ac18a50ed434eb3aec30db8ba93b0287467bca5826ea25baf1" [[package]] name = "toml_edit" @@ -7559,95 +7285,11 @@ version = "0.19.12" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c500344a19072298cd05a7224b3c0c629348b78692bf48466c5238656e315a78" dependencies = [ - "indexmap 2.0.0", + "indexmap 2.0.1", "toml_datetime", "winnow", ] -[[package]] -name = "tonic" -version = "0.8.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "55b9af819e54b8f33d453655bef9b9acc171568fb49523078d0cc4e7484200ec" -dependencies = [ - "async-stream 0.3.3", - "async-trait", - "axum", - "base64 0.13.0", - "bytes", - "futures-core", - "futures-util", - "h2", - "http", - "http-body", - "hyper", - "hyper-timeout", - "percent-encoding", - "pin-project", - "prost", - "prost-derive", - "tokio", - "tokio-stream", - "tokio-util", - "tower", - "tower-layer", - "tower-service", - "tracing", - "tracing-futures", -] - -[[package]] -name = "tonic-build" -version = "0.8.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "48c6fd7c2581e36d63388a9e04c350c21beb7a8b059580b2e93993c526899ddc" -dependencies = [ - "prettyplease 0.1.21", - 
"proc-macro2", - "prost-build", - "quote", - "syn 1.0.103", -] - -[[package]] -name = "tower" -version = "0.4.13" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b8fa9be0de6cf49e536ce1851f987bd21a43b771b09473c3549a6c853db37c1c" -dependencies = [ - "futures-core", - "futures-util", - "indexmap 1.6.2", - "pin-project", - "pin-project-lite", - "rand 0.8.5", - "slab", - "tokio", - "tokio-util", - "tower-layer", - "tower-service", - "tracing", -] - -[[package]] -name = "tower-http" -version = "0.3.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3c530c8675c1dbf98facee631536fa116b5fb6382d7dd6dc1b118d970eafe3ba" -dependencies = [ - "bitflags", - "bytes", - "futures-core", - "futures-util", - "http", - "http-body", - "http-range-header", - "pin-project-lite", - "tower", - "tower-layer", - "tower-service", -] - [[package]] name = "tower-layer" version = "0.3.1" @@ -7667,7 +7309,6 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "01ebdc2bb4498ab1ab5f5b73c5803825e60199229ccba0698170e3be0e7f959f" dependencies = [ "cfg-if 1.0.0", - "log", "pin-project-lite", "tracing-attributes", "tracing-core", @@ -7693,22 +7334,12 @@ dependencies = [ "lazy_static", ] -[[package]] -name = "tracing-futures" -version = "0.2.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "97d095ae15e245a057c8e8451bab9b3ee1e1f68e9ba2b4fbc18d0ac5237835f2" -dependencies = [ - "pin-project", - "tracing", -] - [[package]] name = "tracker" version = "0.0.1" dependencies = [ "collections", - "crossbeam-utils 0.8.8", + "crossbeam-utils", "kvproto", "lazy_static", "parking_lot 0.12.1", @@ -7735,9 +7366,13 @@ dependencies = [ [[package]] name = "twox-hash" -version = "1.5.0" +version = "1.6.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3bfd5b7557925ce778ff9b9ef90e3ade34c524b5ff10e239c69a42d546d2af56" +checksum = 
"97fee6b57c6a41524a810daee9286c02d7752c4253064d0b05472833a438f675" +dependencies = [ + "cfg-if 1.0.0", + "static_assertions", +] [[package]] name = "txn_types" @@ -7779,9 +7414,9 @@ checksum = "eeba86d422ce181a719445e51872fa30f1f7413b62becb52e95ec91aa262d85c" [[package]] name = "unicode-bidi" -version = "0.3.13" +version = "0.3.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "92888ba5573ff080736b3648696b70cafad7d250551175acbaa4e0385b3e1460" +checksum = "6f2528f27a9eb2b21e69c95319b30bd0efd85d09c379741b0f78ea1d86be2416" [[package]] name = "unicode-ident" @@ -7816,17 +7451,11 @@ version = "0.2.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f962df74c8c05a667b5ee8bcf162993134c104e96440b663c8daa176dc772d8c" -[[package]] -name = "untrusted" -version = "0.7.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a156c684c91ea7d62626509bce3cb4e1d9ed5c4d978f7b4352658f96a4c26b4a" - [[package]] name = "url" -version = "2.4.0" +version = "2.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "50bff7831e19200a85b17131d085c25d7811bc4e186efdaf54bbd132994a88cb" +checksum = "31e6302e3bb753d46e83516cae55ae196fc0c309407cf11ab35cc51a4c2a4633" dependencies = [ "form_urlencoded", "idna", @@ -7894,16 +7523,6 @@ version = "0.9.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "49874b5167b65d7193b8aba1567f5c7d93d001cafc34600cee003eda787e483f" -[[package]] -name = "visible" -version = "0.0.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a044005fd5c0fc1ebd79c622e5606431c6b879a6a19acafb754be9926a2de73e" -dependencies = [ - "quote", - "syn 1.0.103", -] - [[package]] name = "void" version = "1.0.2" @@ -8115,21 +7734,51 @@ version = "0.42.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5a3e1820f08b8513f676f7ab6c1f99ff312fb97b553d30ff4dd86f9f15728aa7" dependencies = [ - 
"windows_aarch64_gnullvm", + "windows_aarch64_gnullvm 0.42.0", "windows_aarch64_msvc 0.42.0", "windows_i686_gnu 0.42.0", "windows_i686_msvc 0.42.0", "windows_x86_64_gnu 0.42.0", - "windows_x86_64_gnullvm", + "windows_x86_64_gnullvm 0.42.0", "windows_x86_64_msvc 0.42.0", ] +[[package]] +name = "windows-sys" +version = "0.48.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "677d2418bec65e3338edb076e806bc1ec15693c5d0104683f2efe857f61056a9" +dependencies = [ + "windows-targets", +] + +[[package]] +name = "windows-targets" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9a2fa6e2155d7247be68c096456083145c183cbbbc2764150dda45a87197940c" +dependencies = [ + "windows_aarch64_gnullvm 0.48.5", + "windows_aarch64_msvc 0.48.5", + "windows_i686_gnu 0.48.5", + "windows_i686_msvc 0.48.5", + "windows_x86_64_gnu 0.48.5", + "windows_x86_64_gnullvm 0.48.5", + "windows_x86_64_msvc 0.48.5", +] + [[package]] name = "windows_aarch64_gnullvm" version = "0.42.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "41d2aa71f6f0cbe00ae5167d90ef3cfe66527d6f613ca78ac8024c3ccab9a19e" +[[package]] +name = "windows_aarch64_gnullvm" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2b38e32f0abccf9987a4e3079dfb67dcd799fb61361e53e2882c3cbaf0d905d8" + [[package]] name = "windows_aarch64_msvc" version = "0.32.0" @@ -8142,6 +7791,12 @@ version = "0.42.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "dd0f252f5a35cac83d6311b2e795981f5ee6e67eb1f9a7f64eb4500fbc4dcdb4" +[[package]] +name = "windows_aarch64_msvc" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dc35310971f3b2dbbf3f0690a219f40e2d9afcf64f9ab7cc1be722937c26b4bc" + [[package]] name = "windows_i686_gnu" version = "0.32.0" @@ -8154,6 +7809,12 @@ version = "0.42.0" source = 
"registry+https://github.com/rust-lang/crates.io-index" checksum = "fbeae19f6716841636c28d695375df17562ca208b2b7d0dc47635a50ae6c5de7" +[[package]] +name = "windows_i686_gnu" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a75915e7def60c94dcef72200b9a8e58e5091744960da64ec734a6c6e9b3743e" + [[package]] name = "windows_i686_msvc" version = "0.32.0" @@ -8166,6 +7827,12 @@ version = "0.42.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "84c12f65daa39dd2babe6e442988fc329d6243fdce47d7d2d155b8d874862246" +[[package]] +name = "windows_i686_msvc" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8f55c233f70c4b27f66c523580f78f1004e8b5a8b659e05a4eb49d4166cca406" + [[package]] name = "windows_x86_64_gnu" version = "0.32.0" @@ -8178,12 +7845,24 @@ version = "0.42.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "bf7b1b21b5362cbc318f686150e5bcea75ecedc74dd157d874d754a2ca44b0ed" +[[package]] +name = "windows_x86_64_gnu" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "53d40abd2583d23e4718fddf1ebec84dbff8381c07cae67ff7768bbf19c6718e" + [[package]] name = "windows_x86_64_gnullvm" version = "0.42.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "09d525d2ba30eeb3297665bd434a54297e4170c7f1a44cad4ef58095b4cd2028" +[[package]] +name = "windows_x86_64_gnullvm" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0b7b52767868a23d5bab768e390dc5f5c55825b6d30b86c844ff2dc7414044cc" + [[package]] name = "windows_x86_64_msvc" version = "0.32.0" @@ -8196,6 +7875,12 @@ version = "0.42.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f40009d85759725a34da6d89a94e63d7bdc50a862acf0dbc7c8e488f1edcb6f5" +[[package]] +name = "windows_x86_64_msvc" +version = "0.48.5" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "ed94fce61571a4006852b7389a063ab983c02eb1bb37b47f8272ce92d06d9538" + [[package]] name = "winnow" version = "0.4.7" @@ -8241,9 +7926,9 @@ dependencies = [ [[package]] name = "xdg" -version = "2.2.0" +version = "2.5.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d089681aa106a86fade1b0128fb5daf07d5867a509ab036d99988dec80429a57" +checksum = "213b7324336b53d2414b2db8537e56544d981803139155afa84f76eeebb7a546" [[package]] name = "xml-rs" @@ -8258,7 +7943,7 @@ source = "git+https://github.com/tikv/yatp.git?branch=master#5572a78702572087cab dependencies = [ "crossbeam-deque", "crossbeam-skiplist", - "crossbeam-utils 0.8.8", + "crossbeam-utils", "dashmap", "fail", "lazy_static", diff --git a/Cargo.toml b/Cargo.toml index b8839bcb1f4..8beb2a573a0 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "tikv" -version = "7.4.0-alpha" +version = "7.6.0-alpha" authors = ["The TiKV Authors"] description = "A distributed transactional key-value database powered by Rust and Raft" license = "Apache-2.0" @@ -21,18 +21,27 @@ snmalloc = ["tikv_alloc/snmalloc"] portable = ["engine_rocks/portable"] sse = ["engine_rocks/sse"] mem-profiling = ["tikv_alloc/mem-profiling"] -failpoints = ["fail/failpoints", "raftstore/failpoints", "tikv_util/failpoints", "engine_rocks/failpoints"] -cloud-aws = ["encryption_export/cloud-aws", "sst_importer/cloud-aws"] -cloud-gcp = ["encryption_export/cloud-gcp", "sst_importer/cloud-gcp"] -cloud-azure = ["encryption_export/cloud-azure", "sst_importer/cloud-azure"] -testexport = ["raftstore/testexport", "api_version/testexport", "causal_ts/testexport", "engine_traits/testexport", "engine_rocks/testexport", "engine_panic/testexport"] +failpoints = ["fail/failpoints", "raftstore/failpoints", "tikv_util/failpoints", "engine_rocks/failpoints", "raft_log_engine/failpoints"] +cloud-aws = ["encryption_export/cloud-aws"] +cloud-gcp = 
["encryption_export/cloud-gcp"] +cloud-azure = ["encryption_export/cloud-azure"] +testexport = ["raftstore/testexport", "api_version/testexport", "causal_ts/testexport", "engine_traits/testexport", "engine_rocks/testexport", "engine_panic/testexport", "hybrid_engine/testexport"] test-engine-kv-rocksdb = ["engine_test/test-engine-kv-rocksdb"] test-engine-raft-raft-engine = ["engine_test/test-engine-raft-raft-engine"] test-engines-rocksdb = ["engine_test/test-engines-rocksdb"] test-engines-panic = ["engine_test/test-engines-panic"] -cloud-storage-grpc = ["sst_importer/cloud-storage-grpc"] -cloud-storage-dylib = ["sst_importer/cloud-storage-dylib"] pprof-fp = ["pprof/frame-pointer"] +openssl-vendored = [ + "openssl/vendored", + "hyper-tls/vendored", + # NB: the "openssl" feature does not make grpcio-sys v0.10 depends on + # openssl-sys, and it can not find the static openssl built by openssl-sys. + # Enabling "grpcio/openssl-vendored" explicitly makes grpcio-sys depends on + # openssl-sys and correctly links to the static openssl. + "grpcio/openssl-vendored", + # NB: Enable SM4 support if OpenSSL is built from source and statically linked. 
+ "encryption_export/sm4", +] # for testing configure propegate to other crates # https://stackoverflow.com/questions/41700543/can-we-share-test-utilites-between-crates @@ -59,6 +68,7 @@ coprocessor_plugin_api = { workspace = true } crc32fast = "1.2" crc64fast = "0.1" crossbeam = "0.8" +crypto = { workspace = true } dashmap = "5" encryption_export = { workspace = true } engine_panic = { workspace = true } @@ -80,6 +90,7 @@ grpcio = { workspace = true } grpcio-health = { workspace = true } hex = "0.4" http = "0" +hybrid_engine = { workspace = true } hyper = { version = "0.14", features = ["full"] } hyper-tls = "0.5" into_other = { workspace = true } @@ -102,7 +113,7 @@ notify = "4" num-traits = "0.2.14" num_cpus = "1" online_config = { workspace = true } -openssl = "0.10" +openssl = { workspace = true } parking_lot = "0.12" paste = "1.0" pd_client = { workspace = true } @@ -118,6 +129,7 @@ raftstore = { workspace = true, features = ["engine_rocks"] } raftstore-v2 = { workspace = true } rand = "0.7.3" regex = "1.3" +region_cache_memory_engine = { workspace = true } resource_control = { workspace = true } resource_metering = { workspace = true } rev_lines = "0.2.1" @@ -182,6 +194,7 @@ protobuf = { git = "https://github.com/pingcap/rust-protobuf", branch = "v2.8" } protobuf-codegen = { git = "https://github.com/pingcap/rust-protobuf", branch = "v2.8" } # TODO: remove this replacement after rusoto_s3 truly supports virtual-host style (https://github.com/rusoto/rusoto/pull/1823). 
+# UPDATE: use openssl for signature to support fips 140 rusoto_core = { git = "https://github.com/tikv/rusoto", branch = "gh1482-s3-addr-styles" } rusoto_credential = { git = "https://github.com/tikv/rusoto", branch = "gh1482-s3-addr-styles" } rusoto_kms = { git = "https://github.com/tikv/rusoto", branch = "gh1482-s3-addr-styles" } @@ -189,6 +202,9 @@ rusoto_mock = { git = "https://github.com/tikv/rusoto", branch = "gh1482-s3-addr rusoto_s3 = { git = "https://github.com/tikv/rusoto", branch = "gh1482-s3-addr-styles" } rusoto_sts = { git = "https://github.com/tikv/rusoto", branch = "gh1482-s3-addr-styles" } +# NOTICE: use openssl for signature to support fips 140 +tame-oauth = { git = "https://github.com/tikv/tame-oauth", branch = "fips" } + snappy-sys = { git = "https://github.com/busyjay/rust-snappy.git", branch = "static-link" } # remove this when https://github.com/danburkert/fs2-rs/pull/42 is merged. @@ -231,6 +247,7 @@ members = [ "components/collections", "components/concurrency_manager", "components/coprocessor_plugin_api", + "components/crypto", "components/encryption", "components/encryption/export", "components/engine_rocks_helper", @@ -239,7 +256,6 @@ members = [ # "components/engine_tirocks", "components/error_code", "components/external_storage", - "components/external_storage/export", "components/file_system", "components/into_other", "components/keys", @@ -318,14 +334,16 @@ encryption = { path = "components/encryption" } encryption_export = { path = "components/encryption/export" } engine_panic = { path = "components/engine_panic" } engine_rocks = { path = "components/engine_rocks" } +hybrid_engine = { path = "components/hybrid_engine" } +region_cache_memory_engine = { path = "components/region_cache_memory_engine" } engine_rocks_helper = { path = "components/engine_rocks_helper" } engine_test = { path = "components/engine_test", default-features = false } engine_traits = { path = "components/engine_traits" } engine_traits_tests = { path = 
"components/engine_traits_tests", default-features = false } error_code = { path = "components/error_code" } external_storage = { path = "components/external_storage" } -external_storage_export = { path = "components/external_storage/export" } file_system = { path = "components/file_system" } +crypto = { path = "components/crypto" } gcp = { path = "components/cloud/gcp" } into_other = { path = "components/into_other" } keys = { path = "components/keys" } @@ -344,7 +362,7 @@ resource_metering = { path = "components/resource_metering" } security = { path = "components/security" } server = { path = "components/server" } service = { path = "components/service" } -snap_recovery = { path = "components/snap_recovery" } +snap_recovery = { path = "components/snap_recovery", default-features = false } sst_importer = { path = "components/sst_importer", default-features = false } test_backup = { path = "components/test_backup" } test_coprocessor = { path = "components/test_coprocessor", default-features = false } @@ -372,14 +390,17 @@ tracker = { path = "components/tracker" } txn_types = { path = "components/txn_types" } # External libs raft = { version = "0.7.0", default-features = false, features = ["protobuf-codec"] } -grpcio = { version = "0.10.4", default-features = false, features = ["openssl-vendored", "protobuf-codec", "nightly"] } +grpcio = { version = "0.10.4", default-features = false, features = ["openssl", "protobuf-codec", "nightly"] } grpcio-health = { version = "0.10.4", default-features = false, features = ["protobuf-codec"] } tipb = { git = "https://github.com/pingcap/tipb.git" } kvproto = { git = "https://github.com/pingcap/kvproto.git" } yatp = { git = "https://github.com/tikv/yatp.git", branch = "master" } tokio-timer = { git = "https://github.com/tikv/tokio", branch = "tokio-timer-hotfix" } +tokio-executor = { git = "https://github.com/tikv/tokio", branch = "tokio-timer-hotfix" } slog = { version = "2.3", features = ["max_level_trace", 
"release_max_level_debug"] } slog-global = { version = "0.1", git = "https://github.com/breeswish/slog-global.git", rev = "d592f88e4dbba5eb439998463054f1a44fbf17b9" } +openssl = "0.10" +openssl-sys = "0.9" # TiFlash libs engine_store_ffi = { path = "proxy_components/engine_store_ffi", default-features = false } @@ -401,10 +422,6 @@ opt-level = 1 debug = false opt-level = 1 -[profile.dev.package.tirocks-sys] -debug = false -opt-level = 1 - [profile.dev.package.tests] debug = 1 opt-level = 1 diff --git a/Dockerfile.FIPS b/Dockerfile.FIPS new file mode 100644 index 00000000000..03195d4cf5b --- /dev/null +++ b/Dockerfile.FIPS @@ -0,0 +1,45 @@ +# This Docker image contains a minimal build environment for a FIPS compliant TiKV. + +FROM rockylinux:9 as builder + +RUN dnf install -y openssl-devel + +RUN dnf install -y \ + gcc \ + gcc-c++ \ + make \ + cmake \ + perl \ + git \ + findutils \ + curl \ + python3 --allowerasing && \ + dnf --enablerepo=crb install -y \ + libstdc++-static && \ + dnf clean all + +# Install Rustup +RUN curl https://sh.rustup.rs -sSf | sh -s -- --no-modify-path --default-toolchain none -y +ENV PATH /root/.cargo/bin/:$PATH + +# Checkout TiKV source code. +WORKDIR /tikv +COPY .git .git +ARG GIT_HASH +RUN git checkout ${GIT_HASH} && git checkout . + +# Do not static link OpenSSL. 
+ENV ENABLE_FIPS 1 +RUN make build_dist_release + +# Export to a clean image +FROM rockylinux:9-minimal + +RUN microdnf install -y openssl + +COPY --from=builder /tikv/target/release/tikv-server /tikv-server +COPY --from=builder /tikv/target/release/tikv-ctl /tikv-ctl + +EXPOSE 20160 20180 + +ENTRYPOINT ["/tikv-server"] diff --git a/Makefile b/Makefile index 352ac76cbdb..fd5b6fbb061 100644 --- a/Makefile +++ b/Makefile @@ -122,6 +122,19 @@ ENABLE_FEATURES += cloud-gcp ENABLE_FEATURES += cloud-azure endif +export DOCKER_FILE ?= Dockerfile +export DOCKER_IMAGE_NAME ?= pingcap/tikv +export DOCKER_IMAGE_TAG ?= latest +export DEV_DOCKER_IMAGE_NAME ?= pingcap/tikv_dev +export ENABLE_FIPS ?= 0 + +ifeq ($(ENABLE_FIPS),1) +DOCKER_IMAGE_TAG := ${DOCKER_IMAGE_TAG}-fips +DOCKER_FILE := ${DOCKER_FILE}.FIPS +else +ENABLE_FEATURES += openssl-vendored +endif + PROJECT_DIR:=$(shell dirname $(realpath $(lastword $(MAKEFILE_LIST)))) BIN_PATH = $(CURDIR)/bin @@ -135,10 +148,6 @@ export PROXY_BUILD_RUSTC_VERSION := $(shell rustc --version 2> /dev/null || echo export PROXY_BUILD_GIT_HASH ?= $(shell git rev-parse HEAD 2> /dev/null || echo ${BUILD_INFO_GIT_FALLBACK}) export PROXY_BUILD_GIT_BRANCH ?= $(shell git rev-parse --abbrev-ref HEAD 2> /dev/null || echo ${BUILD_INFO_GIT_FALLBACK}) -export DOCKER_IMAGE_NAME ?= "pingcap/tikv" -export DOCKER_IMAGE_TAG ?= "latest" -export DEV_DOCKER_IMAGE_NAME ?= "pingcap/tikv_dev" - # Turn on cargo pipelining to add more build parallelism. This has shown decent # speedups in TiKV. # @@ -155,6 +164,9 @@ ifeq ($(TIKV_BUILD_RUSTC_TARGET),aarch64-unknown-linux-gnu) export RUSTFLAGS := $(RUSTFLAGS) -Ctarget-feature=-outline-atomics endif +# If both python and python3 are installed, it will choose python as a preferred option. 
+PYTHON := $(shell command -v python 2> /dev/null || command -v python3 2> /dev/null) + # Almost all the rules in this Makefile are PHONY # Declaring a rule as PHONY could improve correctness # But probably instead just improves performance by a little bit diff --git a/cmd/tikv-ctl/Cargo.toml b/cmd/tikv-ctl/Cargo.toml index a36e72b3c64..9504c3a4eae 100644 --- a/cmd/tikv-ctl/Cargo.toml +++ b/cmd/tikv-ctl/Cargo.toml @@ -17,18 +17,14 @@ mem-profiling = ["tikv/mem-profiling"] failpoints = ["tikv/failpoints"] cloud-aws = [ "encryption_export/cloud-aws", - "backup/cloud-aws", ] cloud-gcp = [ "encryption_export/cloud-gcp", - "backup/cloud-gcp", ] cloud-azure = [ "encryption_export/cloud-azure", - "backup/cloud-azure", ] -cloud-storage-grpc = ["backup/cloud-storage-grpc"] -cloud-storage-dylib = ["backup/cloud-storage-dylib"] +openssl-vendored = ["tikv/openssl-vendored"] test-engine-kv-rocksdb = [ "tikv/test-engine-kv-rocksdb" ] @@ -53,6 +49,7 @@ clap = "2.32" collections = { workspace = true } concurrency_manager = { workspace = true } crossbeam = "0.8" +crypto = { workspace = true } encryption_export = { workspace = true } engine_rocks = { workspace = true } engine_traits = { workspace = true } @@ -75,7 +72,6 @@ raft-engine = { git = "https://github.com/tikv/raft-engine.git" } raft-engine-ctl = { git = "https://github.com/tikv/raft-engine.git" } raft_log_engine = { workspace = true } raftstore = { workspace = true } -rand = "0.8" regex = "1" security = { workspace = true } serde_json = "1.0" diff --git a/cmd/tikv-ctl/src/executor.rs b/cmd/tikv-ctl/src/executor.rs index a145118acea..3e4e505a32a 100644 --- a/cmd/tikv-ctl/src/executor.rs +++ b/cmd/tikv-ctl/src/executor.rs @@ -715,7 +715,7 @@ pub trait DebugExecutor { _key_range: KeyRange, _start_ts: u64, _commit_ts: u64, - ) -> Result<(), KeyRange>; + ) -> Result<(), (KeyRange, grpcio::Error)>; fn get_region_read_progress(&self, region_id: u64, log: bool, min_start_ts: u64); } @@ -948,7 +948,7 @@ impl DebugExecutor for 
DebugClient { key_range: KeyRange, start_ts: u64, commit_ts: u64, - ) -> Result<(), KeyRange> { + ) -> Result<(), (KeyRange, grpcio::Error)> { let mut req = FlashbackToVersionRequest::default(); req.set_version(version); req.set_region_id(region_id); @@ -963,7 +963,7 @@ impl DebugExecutor for DebugClient { "flashback key_range {:?} with start_ts {:?}, commit_ts {:?} need to retry, err is {:?}", key_range, start_ts, commit_ts, err ); - Err(key_range) + Err((key_range, err)) } } } @@ -1293,7 +1293,7 @@ where _key_range: KeyRange, _start_ts: u64, _commit_ts: u64, - ) -> Result<(), KeyRange> { + ) -> Result<(), (KeyRange, grpcio::Error)> { unimplemented!("only available for remote mode"); } @@ -1332,11 +1332,16 @@ impl DebugExecutor for DebuggerImplV2 { } fn get_region_size(&self, region: u64, cfs: Vec<&str>) -> Vec<(String, usize)> { - self.region_size(region, cfs) - .unwrap_or_else(|e| perror_and_exit("Debugger::region_size", e)) - .into_iter() - .map(|(cf, size)| (cf.to_owned(), size)) - .collect() + match self.region_size(region, cfs) { + Ok(v) => v + .into_iter() + .map(|(cf, size)| (cf.to_owned(), size)) + .collect(), + Err(e) => { + println!("Debugger::region_size: {}", e); + vec![] + } + } } fn get_region_info(&self, region: u64) -> RegionInfo { @@ -1510,7 +1515,7 @@ impl DebugExecutor for DebuggerImplV2 { _key_range: KeyRange, _start_ts: u64, _commit_ts: u64, - ) -> Result<(), KeyRange> { + ) -> Result<(), (KeyRange, grpcio::Error)> { unimplemented!("only available for remote mode"); } diff --git a/cmd/tikv-ctl/src/main.rs b/cmd/tikv-ctl/src/main.rs index 6baa1fe6c39..25f8cc1337b 100644 --- a/cmd/tikv-ctl/src/main.rs +++ b/cmd/tikv-ctl/src/main.rs @@ -20,12 +20,12 @@ use std::{ }; use collections::HashMap; +use crypto::fips; use encryption_export::{ - create_backend, data_key_manager_from_config, from_engine_encryption_method, DataKeyManager, - DecrypterReader, Iv, + create_backend, data_key_manager_from_config, DataKeyManager, DecrypterReader, Iv, }; use 
engine_rocks::get_env; -use engine_traits::{EncryptionKeyManager, Peekable}; +use engine_traits::Peekable; use file_system::calc_crc32; use futures::{executor::block_on, future::try_join_all}; use gag::BufferRedirect; @@ -61,11 +61,17 @@ mod fork_readonly_tikv; mod util; fn main() { + // OpenSSL FIPS mode should be enabled at the very start. + fips::maybe_enable(); + let opt = Opt::from_args(); // Initialize logger. init_ctl_logger(&opt.log_level); + // Print OpenSSL FIPS mode status. + fips::log_status(); + // Initialize configuration and security manager. let cfg_path = opt.config.as_ref(); let mut cfg = cfg_path.map_or_else( @@ -115,6 +121,9 @@ fn main() { } } Cmd::RaftEngineCtl { args } => { + if !validate_storage_data_dir(&mut cfg, opt.data_dir) { + return; + } let key_manager = data_key_manager_from_config(&cfg.security.encryption, &cfg.storage.data_dir) .expect("data_key_manager_from_config should success"); @@ -136,6 +145,9 @@ fn main() { dump_snap_meta_file(path); } Cmd::DecryptFile { file, out_file } => { + if !validate_storage_data_dir(&mut cfg, opt.data_dir) { + return; + } let message = "This action will expose sensitive data as plaintext on persistent storage"; if !warning_prompt(message) { @@ -160,7 +172,7 @@ fn main() { let infile1 = Path::new(infile).canonicalize().unwrap(); let file_info = key_manager.get_file(infile1.to_str().unwrap()).unwrap(); - let mthd = from_engine_encryption_method(file_info.method); + let mthd = file_info.method; if mthd == EncryptionMethod::Plaintext { println!( "{} is not encrypted, skip to decrypt it into {}", @@ -184,28 +196,36 @@ fn main() { io::copy(&mut reader, &mut outf).unwrap(); println!("crc32: {}", calc_crc32(outfile).unwrap()); } - Cmd::EncryptionMeta { cmd: subcmd } => match subcmd { - EncryptionMetaCmd::DumpKey { ids } => { - let message = "This action will expose encryption key(s) as plaintext. 
Do not output the \ + Cmd::EncryptionMeta { cmd: subcmd } => { + if !validate_storage_data_dir(&mut cfg, opt.data_dir) { + return; + } + match subcmd { + EncryptionMetaCmd::DumpKey { ids } => { + let message = "This action will expose encryption key(s) as plaintext. Do not output the \ result in file on disk."; - if !warning_prompt(message) { - return; + if !warning_prompt(message) { + return; + } + DataKeyManager::dump_key_dict( + create_backend(&cfg.security.encryption.master_key) + .expect("encryption-meta master key creation"), + &cfg.storage.data_dir, + ids, + ) + .unwrap(); + } + EncryptionMetaCmd::DumpFile { path } => { + let path = path + .map(|path| fs::canonicalize(path).unwrap().to_str().unwrap().to_owned()); + DataKeyManager::dump_file_dict(&cfg.storage.data_dir, path.as_deref()).unwrap(); } - DataKeyManager::dump_key_dict( - create_backend(&cfg.security.encryption.master_key) - .expect("encryption-meta master key creation"), - &cfg.storage.data_dir, - ids, - ) - .unwrap(); - } - EncryptionMetaCmd::DumpFile { path } => { - let path = - path.map(|path| fs::canonicalize(path).unwrap().to_str().unwrap().to_owned()); - DataKeyManager::dump_file_dict(&cfg.storage.data_dir, path.as_deref()).unwrap(); } - }, + } Cmd::CleanupEncryptionMeta {} => { + if !validate_storage_data_dir(&mut cfg, opt.data_dir) { + return; + } let key_manager = match data_key_manager_from_config(&cfg.security.encryption, &cfg.storage.data_dir) .expect("data_key_manager_from_config should success") @@ -906,7 +926,7 @@ fn flashback_whole_cluster( .await { Ok(res) => { - if let Err(key_range) = res { + if let Err((key_range, _)) = res { // Retry specific key range to prepare flashback. 
let stale_key_range = (key_range.start_key.clone(), key_range.end_key.clone()); let mut key_range_to_prepare = key_range_to_prepare.write().unwrap(); @@ -986,7 +1006,21 @@ fn flashback_whole_cluster( { Ok(res) => match res { Ok(_) => break, - Err(_) => { + Err((key_range, err)) => { + // Retry `NotLeader` or `RegionNotFound`. + if err.to_string().contains("not leader") || err.to_string().contains("not found") { + // When finished `PrepareFlashback`, the region may change leader in the `flashback in progress` + // Need to retry specific key range to finish flashback. + let stale_key_range = (key_range.start_key.clone(), key_range.end_key.clone()); + let mut key_range_to_finish = key_range_to_finish.write().unwrap(); + // Remove stale key range. + key_range_to_finish.remove(&stale_key_range); + load_key_range(&pd_client, stale_key_range.0.clone(), stale_key_range.1.clone()) + .into_iter().for_each(|(key_range, region_info)| { + // Need to update `key_range_to_finish` to replace stale key range. 
+ key_range_to_finish.insert(key_range, region_info); + }); + } thread::sleep(Duration::from_micros(WAIT_APPLY_FLASHBACK_STATE)); continue; } @@ -1048,7 +1082,7 @@ fn build_rocks_opts(cfg: &TikvConfig) -> engine_rocks::RocksDbOptions { .unwrap() .map(Arc::new); let env = get_env(key_manager, None /* io_rate_limiter */).unwrap(); - let resource = cfg.rocksdb.build_resources(env); + let resource = cfg.rocksdb.build_resources(env, cfg.storage.engine); cfg.rocksdb.build_opt(&resource, cfg.storage.engine) } @@ -1310,3 +1344,17 @@ fn read_cluster_id(config: &TikvConfig) -> Result { .unwrap(); Ok(ident.cluster_id) } + +fn validate_storage_data_dir(config: &mut TikvConfig, data_dir: Option) -> bool { + if let Some(data_dir) = data_dir { + if !Path::new(&data_dir).exists() { + eprintln!("--data-dir {:?} not exists", data_dir); + return false; + } + config.storage.data_dir = data_dir; + } else if config.storage.data_dir.is_empty() { + eprintln!("--data-dir or data-dir in the config file should not be empty"); + return false; + } + true +} diff --git a/cmd/tikv-server/Cargo.toml b/cmd/tikv-server/Cargo.toml index 409dc84a62d..cc99e05fb58 100644 --- a/cmd/tikv-server/Cargo.toml +++ b/cmd/tikv-server/Cargo.toml @@ -18,6 +18,7 @@ failpoints = ["server/failpoints"] cloud-aws = ["server/cloud-aws"] cloud-gcp = ["server/cloud-gcp"] cloud-azure = ["server/cloud-azure"] +openssl-vendored = ["tikv/openssl-vendored"] test-engine-kv-rocksdb = [ "server/test-engine-kv-rocksdb" ] @@ -33,6 +34,7 @@ pprof-fp = ["tikv/pprof-fp"] [dependencies] clap = "2.32" +crypto = { workspace = true } encryption_export = { workspace = true } engine_traits = { workspace = true } keys = { workspace = true } diff --git a/cmd/tikv-server/src/main.rs b/cmd/tikv-server/src/main.rs index 9fdcad81c58..4c1eb4fc2c5 100644 --- a/cmd/tikv-server/src/main.rs +++ b/cmd/tikv-server/src/main.rs @@ -5,6 +5,7 @@ use std::{path::Path, process}; use clap::{crate_authors, App, Arg}; +use crypto::fips; use serde_json::{Map, 
Value}; use server::setup::{ensure_no_unrecognized_config, validate_and_persist_config}; use tikv::{ @@ -13,6 +14,9 @@ use tikv::{ }; fn main() { + // OpenSSL FIPS mode should be enabled at the very start. + fips::maybe_enable(); + let build_timestamp = option_env!("TIKV_BUILD_TIME"); let version_info = tikv::tikv_version_info(build_timestamp); @@ -217,6 +221,20 @@ fn main() { process::exit(1) } + // Sets the global logger ASAP. + // It is okay to use the config w/o `validate()`, + // because `initial_logger()` handles various conditions. + server::setup::initial_logger(&config); + + // Print version information. + tikv::log_tikv_info(build_timestamp); + + // Print OpenSSL FIPS mode status. + fips::log_status(); + + // Init memory related settings. + config.memory.init(); + let (service_event_tx, service_event_rx) = tikv_util::mpsc::unbounded(); // pipe for controling service match config.storage.engine { EngineType::RaftKv => server::server::run_tikv(config, service_event_tx, service_event_rx), diff --git a/components/backup-stream/Cargo.toml b/components/backup-stream/Cargo.toml index 8c1edc89a48..a91b3fb071d 100644 --- a/components/backup-stream/Cargo.toml +++ b/components/backup-stream/Cargo.toml @@ -11,8 +11,6 @@ test-engines-rocksdb = ["tikv/test-engines-rocksdb"] failpoints = ["tikv/failpoints", "fail/failpoints"] backup-stream-debug = [] -metastore-etcd = ["tonic", "etcd-client"] - [[test]] name = "integration" path = "tests/integration/mod.rs" @@ -30,7 +28,6 @@ harness = true async-compression = { version = "0.3.14", features = ["tokio", "zstd"] } async-trait = { version = "0.1" } bytes = "1" -cfg-if = "1" chrono = "0.4" concurrency_manager = { workspace = true } crossbeam = "0.8" @@ -39,11 +36,7 @@ dashmap = "5" engine_rocks = { workspace = true } engine_traits = { workspace = true } error_code = { workspace = true } -# We cannot update the etcd-client to latest version because of the cyclic requirement. 
-# Also we need wait until https://github.com/etcdv3/etcd-client/pull/43/files to be merged. -etcd-client = { git = "https://github.com/pingcap/etcd-client", rev = "41d393c32a7a7c728550cee1d9a138dafe6f3e27", features = ["pub-response-field", "tls-openssl-vendored"], optional = true } external_storage = { workspace = true } -external_storage_export = { workspace = true } fail = "0.5" file_system = { workspace = true } futures = "0.3" @@ -56,7 +49,7 @@ kvproto = { workspace = true } lazy_static = "1.4" log_wrappers = { workspace = true } online_config = { workspace = true } -openssl = "0.10" +openssl = { workspace = true } pd_client = { workspace = true } pin-project = "1.0" prometheus = { version = "0.13", default-features = false, features = ["nightly"] } @@ -64,7 +57,6 @@ prometheus-static-metric = "0.5" protobuf = { version = "2.8", features = ["bytes"] } raft = { workspace = true } raftstore = { workspace = true } -rand = "0.8.0" regex = "1" resolved_ts = { workspace = true } security = { path = "../security" } @@ -78,8 +70,7 @@ tikv_kv = { workspace = true } tikv_util = { workspace = true } tokio = { version = "1.5", features = ["rt-multi-thread", "macros", "time", "sync"] } tokio-stream = "0.1" -tokio-util = { version = "0.7", features = ["compat"] } -tonic = { version = "0.8", optional = true } +tokio-util = { version = "0.7", features = ["compat"] } txn_types = { workspace = true } uuid = "0.8" yatp = { workspace = true } @@ -91,6 +82,7 @@ engine_test = { workspace = true } grpcio = { workspace = true } hex = "0.4" protobuf = { version = "2.8", features = ["bytes"] } +rand = "0.8.0" tempdir = "0.3" tempfile = "3.0" test_pd = { workspace = true } diff --git a/components/backup-stream/src/endpoint.rs b/components/backup-stream/src/endpoint.rs index b11259d5be6..f453469768c 100644 --- a/components/backup-stream/src/endpoint.rs +++ b/components/backup-stream/src/endpoint.rs @@ -1,16 +1,24 @@ // Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. 
-use std::{any::Any, collections::HashSet, fmt, marker::PhantomData, sync::Arc, time::Duration}; +use std::{ + any::Any, + collections::HashSet, + fmt, + marker::PhantomData, + sync::{Arc, Mutex}, + time::Duration, +}; use concurrency_manager::ConcurrencyManager; use engine_traits::KvEngine; use error_code::ErrorCodeExt; -use futures::{stream::AbortHandle, FutureExt}; +use futures::{stream::AbortHandle, FutureExt, TryFutureExt}; use kvproto::{ brpb::{StreamBackupError, StreamBackupTaskInfo}, metapb::Region, }; use pd_client::PdClient; +use raft::StateRole; use raftstore::{ coprocessor::{CmdBatch, ObserveHandle, RegionInfoProvider}, router::CdcHandle, @@ -21,6 +29,7 @@ use tikv_util::{ box_err, config::ReadableDuration, debug, defer, info, + memory::MemoryQuota, sys::thread::ThreadBuildWrapper, time::{Instant, Limiter}, warn, @@ -30,7 +39,7 @@ use tikv_util::{ use tokio::{ io::Result as TokioResult, runtime::{Handle, Runtime}, - sync::oneshot, + sync::{oneshot, Semaphore}, }; use tokio_stream::StreamExt; use txn_types::TimeStamp; @@ -43,7 +52,7 @@ use crate::{ GetCheckpointResult, RegionIdWithVersion, Subscription, }, errors::{Error, Result}, - event_loader::{InitialDataLoader, PendingMemoryQuota}, + event_loader::InitialDataLoader, future, metadata::{store::MetaStore, MetadataClient, MetadataEvent, StreamTask}, metrics::{self, TaskStatus}, @@ -60,7 +69,7 @@ const SLOW_EVENT_THRESHOLD: f64 = 120.0; /// task has fatal error. const CHECKPOINT_SAFEPOINT_TTL_IF_ERROR: u64 = 24; -pub struct Endpoint { +pub struct Endpoint { // Note: those fields are more like a shared context between components. // For now, we copied them everywhere, maybe we'd better extract them into a // context type. 
@@ -69,7 +78,6 @@ pub struct Endpoint { pub(crate) store_id: u64, pub(crate) regions: R, pub(crate) engine: PhantomData, - pub(crate) router: RT, pub(crate) pd_client: Arc, pub(crate) subs: SubscriptionTracer, pub(crate) concurrency_manager: ConcurrencyManager, @@ -78,8 +86,6 @@ pub struct Endpoint { pub range_router: Router, observer: BackupStreamObserver, pool: Runtime, - initial_scan_memory_quota: PendingMemoryQuota, - initial_scan_throughput_quota: Limiter, region_operator: RegionSubscriptionManager, failover_time: Option, // We holds the config before, even it is useless for now, @@ -92,17 +98,17 @@ pub struct Endpoint { /// This is used for simulating an asynchronous background worker. /// Each time we spawn a task, once time goes by, we abort that task. pub abort_last_storage_save: Option, + pub initial_scan_semaphore: Arc, } -impl Endpoint +impl Endpoint where R: RegionInfoProvider + 'static + Clone, E: KvEngine, - RT: CdcHandle + 'static, PDC: PdClient + 'static, S: MetaStore + 'static, { - pub fn new( + pub fn new + 'static>( store_id: u64, store: S, config: BackupStreamConfig, @@ -134,8 +140,9 @@ where pool.spawn(Self::starts_flush_ticks(range_router.clone())); - let initial_scan_memory_quota = - PendingMemoryQuota::new(config.initial_scan_pending_memory_quota.0 as _); + let initial_scan_memory_quota = Arc::new(MemoryQuota::new( + config.initial_scan_pending_memory_quota.0 as _, + )); let limit = if config.initial_scan_rate_limit.0 > 0 { config.initial_scan_rate_limit.0 as f64 } else { @@ -145,17 +152,21 @@ where info!("the endpoint of stream backup started"; "path" => %config.temp_path); let subs = SubscriptionTracer::default(); + let initial_scan_semaphore = Arc::new(Semaphore::new(config.initial_scan_concurrency)); let (region_operator, op_loop) = RegionSubscriptionManager::start( InitialDataLoader::new( - router.clone(), - accessor.clone(), range_router.clone(), subs.clone(), scheduler.clone(), - initial_scan_memory_quota.clone(), - 
pool.handle().clone(), - initial_scan_throughput_quota.clone(), + initial_scan_memory_quota, + initial_scan_throughput_quota, + // NOTE: in fact we can get rid of the `Arc`. Just need to wrap the router when the + // scanner pool is created. But at that time the handle has been sealed in the + // `InitialScan` trait -- we cannot do that. + Arc::new(Mutex::new(router)), + Arc::clone(&initial_scan_semaphore), ), + accessor.clone(), observer.clone(), meta_client.clone(), pd_client.clone(), @@ -166,6 +177,7 @@ where let mut checkpoint_mgr = CheckpointManager::default(); pool.spawn(checkpoint_mgr.spawn_subscription_mgr()); let ep = Endpoint { + initial_scan_semaphore, meta_client, range_router, scheduler, @@ -174,12 +186,9 @@ where store_id, regions: accessor, engine: PhantomData, - router, pd_client, subs, concurrency_manager, - initial_scan_memory_quota, - initial_scan_throughput_quota, region_operator, failover_time: None, config, @@ -191,18 +200,64 @@ where } } -impl Endpoint +impl Endpoint where S: MetaStore + 'static, R: RegionInfoProvider + Clone + 'static, E: KvEngine, - RT: CdcHandle + 'static, PDC: PdClient + 'static, { fn get_meta_client(&self) -> MetadataClient { self.meta_client.clone() } + fn on_fatal_error_of_task(&self, task: &str, err: &Error) -> future![()] { + metrics::update_task_status(TaskStatus::Error, task); + let meta_cli = self.get_meta_client(); + let pdc = self.pd_client.clone(); + let store_id = self.store_id; + let sched = self.scheduler.clone(); + let safepoint_name = self.pause_guard_id_for_task(task); + let safepoint_ttl = self.pause_guard_duration(); + let code = err.error_code().code.to_owned(); + let msg = err.to_string(); + let task = task.to_owned(); + async move { + let err_fut = async { + let safepoint = meta_cli.global_progress_of_task(&task).await?; + pdc.update_service_safe_point( + safepoint_name, + TimeStamp::new(safepoint.saturating_sub(1)), + safepoint_ttl, + ) + .await?; + meta_cli.pause(&task).await?; + let mut last_error 
= StreamBackupError::new(); + last_error.set_error_code(code); + last_error.set_error_message(msg.clone()); + last_error.set_store_id(store_id); + last_error.set_happen_at(TimeStamp::physical_now()); + meta_cli.report_last_error(&task, last_error).await?; + Result::Ok(()) + }; + if let Err(err_report) = err_fut.await { + err_report.report(format_args!("failed to upload error {}", err_report)); + let name = task.to_owned(); + // Let's retry reporting after 5s. + tokio::task::spawn(async move { + tokio::time::sleep(Duration::from_secs(5)).await; + try_send!( + sched, + Task::FatalError( + TaskSelector::ByName(name), + Box::new(annotate!(err_report, "origin error: {}", msg)) + ) + ); + }); + } + } + } + fn on_fatal_error(&self, select: TaskSelector, err: Box) { err.report_fatal(); let tasks = self @@ -212,49 +267,7 @@ where for task in tasks { // Let's pause the task first. self.unload_task(&task); - metrics::update_task_status(TaskStatus::Error, &task); - - let meta_cli = self.get_meta_client(); - let pdc = self.pd_client.clone(); - let store_id = self.store_id; - let sched = self.scheduler.clone(); - let safepoint_name = self.pause_guard_id_for_task(&task); - let safepoint_ttl = self.pause_guard_duration(); - let code = err.error_code().code.to_owned(); - let msg = err.to_string(); - self.pool.block_on(async move { - let err_fut = async { - let safepoint = meta_cli.global_progress_of_task(&task).await?; - pdc.update_service_safe_point( - safepoint_name, - TimeStamp::new(safepoint.saturating_sub(1)), - safepoint_ttl, - ) - .await?; - meta_cli.pause(&task).await?; - let mut last_error = StreamBackupError::new(); - last_error.set_error_code(code); - last_error.set_error_message(msg.clone()); - last_error.set_store_id(store_id); - last_error.set_happen_at(TimeStamp::physical_now()); - meta_cli.report_last_error(&task, last_error).await?; - Result::Ok(()) - }; - if let Err(err_report) = err_fut.await { - err_report.report(format_args!("failed to upload error {}", 
err_report)); - // Let's retry reporting after 5s. - tokio::task::spawn(async move { - tokio::time::sleep(Duration::from_secs(5)).await; - try_send!( - sched, - Task::FatalError( - TaskSelector::ByName(task.to_owned()), - Box::new(annotate!(err_report, "origin error: {}", msg)) - ) - ); - }); - } - }); + self.pool.block_on(self.on_fatal_error_of_task(&task, &err)); } } @@ -494,20 +507,6 @@ where }); } - /// Make an initial data loader using the resource of the endpoint. - pub fn make_initial_loader(&self) -> InitialDataLoader { - InitialDataLoader::new( - self.router.clone(), - self.regions.clone(), - self.range_router.clone(), - self.subs.clone(), - self.scheduler.clone(), - self.initial_scan_memory_quota.clone(), - self.pool.handle().clone(), - self.initial_scan_throughput_quota.clone(), - ) - } - pub fn handle_watch_task(&self, op: TaskOp) { match op { TaskOp::AddTask(task) => { @@ -525,13 +524,12 @@ where } } - async fn observe_and_scan_region( + async fn observe_regions_in_range( &self, - init: InitialDataLoader, task: &StreamTask, start_key: Vec, end_key: Vec, - ) -> Result<()> { + ) { let start = Instant::now_coarse(); let success = self .observer @@ -549,7 +547,9 @@ where // directly and this would be fast. If this gets slow, maybe make it async // again. (Will that bring race conditions? say `Start` handled after // `ResfreshResolver` of some region.) - let range_init_result = init.initialize_range(start_key.clone(), end_key.clone()); + let range_init_result = self + .initialize_range(start_key.clone(), end_key.clone()) + .await; match range_init_result { Ok(()) => { info!("backup stream success to initialize"; @@ -561,6 +561,45 @@ where e.report("backup stream initialize failed"); } } + } + + /// initialize a range: it simply scan the regions with leader role and send + /// them to [`initialize_region`]. + pub async fn initialize_range(&self, start_key: Vec, end_key: Vec) -> Result<()> { + // Generally we will be very very fast to consume. 
+ // Directly clone the initial data loader to the background thread looks a + // little heavier than creating a new channel. TODO: Perhaps we need a + // handle to the `InitialDataLoader`. Making it a `Runnable` worker might be a + // good idea. + let (tx, mut rx) = tokio::sync::mpsc::channel(1); + self.regions + .seek_region( + &start_key, + Box::new(move |i| { + // Ignore the error, this can only happen while the server is shutting down, the + // future has been canceled. + let _ = i + .filter(|r| r.role == StateRole::Leader) + .take_while(|r| r.region.start_key < end_key) + .try_for_each(|r| { + tx.blocking_send(ObserveOp::Start { + region: r.region.clone(), + }) + }); + }), + ) + .map_err(|err| { + Error::Other(box_err!( + "failed to seek region for start key {}: {}", + utils::redact(&start_key), + err + )) + })?; + // Don't reschedule this command: or once the endpoint's mailbox gets + // full, the system might deadlock. + while let Some(cmd) = rx.recv().await { + self.region_operator.request(cmd).await; + } Ok(()) } @@ -578,7 +617,6 @@ where /// Load the task into memory: this would make the endpint start to observe. fn load_task(&self, task: StreamTask) { let cli = self.meta_client.clone(); - let init = self.make_initial_loader(); let range_router = self.range_router.clone(); info!( @@ -604,6 +642,9 @@ where let run = async move { let task_name = task.info.get_name(); let ranges = cli.ranges_of_task(task_name).await?; + fail::fail_point!("load_task::error_when_fetching_ranges", |_| { + Err(Error::Other("what range? no such thing, go away.".into())) + }); info!( "register backup stream ranges"; "task" => ?task, @@ -621,10 +662,8 @@ where .await?; for (start_key, end_key) in ranges { - let init = init.clone(); - - self.observe_and_scan_region(init, &task, start_key, end_key) - .await? 
+ self.observe_regions_in_range(&task, start_key, end_key) + .await } info!( "finish register backup stream ranges"; @@ -633,10 +672,8 @@ where Result::Ok(()) }; if let Err(e) = run.await { - e.report(format!( - "failed to register backup stream task {} to router: ranges not found", - task_clone.info.get_name() - )); + self.on_fatal_error_of_task(&task_clone.info.name, &Box::new(e)) + .await; } }); metrics::update_task_status(TaskStatus::Running, &task_name); @@ -859,11 +896,16 @@ where } fn on_update_change_config(&mut self, cfg: BackupStreamConfig) { + let concurrency_diff = + cfg.initial_scan_concurrency as isize - self.config.initial_scan_concurrency as isize; info!( "update log backup config"; "config" => ?cfg, + "concurrency_diff" => concurrency_diff, ); self.range_router.udpate_config(&cfg); + self.update_semaphore_capacity(&self.initial_scan_semaphore, concurrency_diff); + self.config = cfg; } @@ -873,6 +915,24 @@ where self.pool.block_on(self.region_operator.request(op)); } + fn update_semaphore_capacity(&self, sema: &Arc, diff: isize) { + use std::cmp::Ordering::*; + match diff.cmp(&0) { + Less => { + self.pool.spawn( + Arc::clone(sema) + .acquire_many_owned(-diff as _) + // It is OK to trivially ignore the Error case (semaphore has been closed, we are shutting down the server.) 
+ .map_ok(|p| p.forget()), + ); + } + Equal => {} + Greater => { + sema.add_permits(diff as _); + } + } + } + pub fn run_task(&mut self, task: Task) { debug!("run backup stream task"; "task" => ?task, "store_id" => %self.store_id); let now = Instant::now_coarse(); @@ -1279,12 +1339,11 @@ impl Task { } } -impl Runnable for Endpoint +impl Runnable for Endpoint where S: MetaStore + 'static, R: RegionInfoProvider + Clone + 'static, E: KvEngine, - RT: CdcHandle + 'static, PDC: PdClient + 'static, { type Task = Task; @@ -1297,10 +1356,7 @@ where #[cfg(test)] mod test { use engine_rocks::RocksEngine; - use raftstore::{ - coprocessor::region_info_accessor::MockRegionInfoProvider, router::CdcRaftRouter, - }; - use test_raftstore::MockRaftStoreRouter; + use raftstore::coprocessor::region_info_accessor::MockRegionInfoProvider; use tikv_util::worker::dummy_scheduler; use crate::{ @@ -1315,13 +1371,9 @@ mod test { cli.insert_task_with_range(&task, &[]).await.unwrap(); fail::cfg("failed_to_get_tasks", "1*return").unwrap(); - Endpoint::< - _, - MockRegionInfoProvider, - RocksEngine, - CdcRaftRouter, - MockPdClient, - >::start_and_watch_tasks(cli, sched) + Endpoint::<_, MockRegionInfoProvider, RocksEngine, MockPdClient>::start_and_watch_tasks( + cli, sched, + ) .await .unwrap(); fail::remove("failed_to_get_tasks"); diff --git a/components/backup-stream/src/errors.rs b/components/backup-stream/src/errors.rs index c3cc91da9ff..67461e2978b 100644 --- a/components/backup-stream/src/errors.rs +++ b/components/backup-stream/src/errors.rs @@ -5,8 +5,6 @@ use std::{ }; use error_code::ErrorCodeExt; -#[cfg(feature = "metastore-etcd")] -use etcd_client::Error as EtcdError; use grpcio::Error as GrpcError; use kvproto::{errorpb::Error as StoreError, metapb::*}; use pd_client::Error as PdError; @@ -22,9 +20,6 @@ use crate::{endpoint::Task, metrics}; pub enum Error { #[error("gRPC meet error {0}")] Grpc(#[from] GrpcError), - #[cfg(feature = "metasotre-etcd")] - #[error("Etcd meet error {0}")] - 
Etcd(#[from] EtcdErrorExt), #[error("Protobuf meet error {0}")] Protobuf(#[from] ProtobufError), #[error("No such task {task_name:?}")] @@ -54,30 +49,10 @@ pub enum Error { Other(#[from] Box), } -#[cfg(feature = "metastore-etcd")] -impl From for Error { - fn from(value: EtcdError) -> Self { - Self::Etcd(value.into()) - } -} - -#[cfg(feature = "metastore-etcd")] -#[derive(ThisError, Debug)] -pub enum EtcdErrorExt { - #[error("{0}")] - Normal(#[from] EtcdError), - #[error("the watch canceled")] - WatchCanceled, - #[error("the required revision has been compacted, current is {current}")] - RevisionCompacted { current: i64 }, -} - impl ErrorCodeExt for Error { fn error_code(&self) -> error_code::ErrorCode { use error_code::backup_stream::*; match self { - #[cfg(feature = "metastore-etcd")] - Error::Etcd(_) => ETCD, Error::Protobuf(_) => PROTO, Error::NoSuchTask { .. } => NO_SUCH_TASK, Error::MalformedMetadata(_) => MALFORMED_META, diff --git a/components/backup-stream/src/event_loader.rs b/components/backup-stream/src/event_loader.rs index 1b663c0e982..0a957ea87ed 100644 --- a/components/backup-stream/src/event_loader.rs +++ b/components/backup-stream/src/event_loader.rs @@ -3,77 +3,49 @@ use std::{marker::PhantomData, sync::Arc, time::Duration}; use engine_traits::{KvEngine, CF_DEFAULT, CF_WRITE}; -use futures::executor::block_on; use kvproto::{kvrpcpb::ExtraOp, metapb::Region, raft_cmdpb::CmdType}; use raftstore::{ - coprocessor::{ObserveHandle, RegionInfoProvider}, + coprocessor::ObserveHandle, router::CdcHandle, store::{fsm::ChangeObserver, Callback}, }; use tikv::storage::{ kv::StatisticsSummary, mvcc::{DeltaScanner, ScannerBuilder}, - txn::{EntryBatch, TxnEntry, TxnEntryScanner}, + txn::{TxnEntry, TxnEntryScanner}, Snapshot, Statistics, }; use tikv_util::{ box_err, + memory::{MemoryQuota, OwnedAllocated}, time::{Instant, Limiter}, worker::Scheduler, }; -use tokio::{ - runtime::Handle, - sync::{OwnedSemaphorePermit, Semaphore}, -}; +use tokio::sync::Semaphore; use 
txn_types::{Key, Lock, TimeStamp}; use crate::{ annotate, debug, - endpoint::ObserveOp, errors::{ContextualResultExt, Error, Result}, metrics, router::{ApplyEvent, ApplyEvents, Router}, subscription_track::{Ref, RefMut, SubscriptionTracer, TwoPhaseResolver}, - try_send, - utils::{self, RegionPager}, - Task, + utils, Task, }; const MAX_GET_SNAPSHOT_RETRY: usize = 5; -#[derive(Clone)] -pub struct PendingMemoryQuota(Arc); - -impl std::fmt::Debug for PendingMemoryQuota { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - f.debug_struct("PendingMemoryQuota") - .field("remain", &self.0.available_permits()) - .field("total", &self.0) - .finish() - } -} - -pub struct PendingMemory(OwnedSemaphorePermit); - -impl PendingMemoryQuota { - pub fn new(quota: usize) -> Self { - Self(Arc::new(Semaphore::new(quota))) - } - - pub fn pending(&self, size: usize) -> PendingMemory { - PendingMemory( - Handle::current() - .block_on(self.0.clone().acquire_many_owned(size as _)) - .expect("BUG: the semaphore is closed unexpectedly."), - ) - } +struct ScanResult { + more: bool, + out_of_memory: bool, + statistics: Statistics, } /// EventLoader transforms data from the snapshot into ApplyEvent. pub struct EventLoader { scanner: DeltaScanner, // pooling the memory. - entry_batch: EntryBatch, + entry_batch: Vec, } const ENTRY_BATCH_SIZE: usize = 1024; @@ -102,20 +74,48 @@ impl EventLoader { Ok(Self { scanner, - entry_batch: EntryBatch::with_capacity(ENTRY_BATCH_SIZE), + entry_batch: Vec::with_capacity(ENTRY_BATCH_SIZE), }) } + fn scan_result(&mut self, more: bool) -> ScanResult { + ScanResult { + more, + out_of_memory: false, + statistics: self.scanner.take_statistics(), + } + } + + fn out_of_memory(&mut self) -> ScanResult { + ScanResult { + more: true, + out_of_memory: true, + statistics: self.scanner.take_statistics(), + } + } + /// Scan a batch of events from the snapshot, and save them into the /// internal buffer. 
- fn fill_entries(&mut self) -> Result { + fn fill_entries(&mut self, memory_quota: &mut OwnedAllocated) -> Result { assert!( self.entry_batch.is_empty(), - "EventLoader: the entry batch isn't empty when filling entries, which is error-prone, please call `omit_entries` first. (len = {})", + "EventLoader: the entry batch isn't empty when filling entries, which is error-prone, please call `emit_entries_to` first. (len = {})", self.entry_batch.len() ); - self.scanner.scan_entries(&mut self.entry_batch)?; - Ok(self.scanner.take_statistics()) + let batch = &mut self.entry_batch; + while batch.len() < batch.capacity() { + match self.scanner.next_entry()? { + Some(entry) => { + let size = entry.size(); + batch.push(entry); + if memory_quota.alloc(size).is_err() { + return Ok(self.out_of_memory()); + } + } + None => return Ok(self.scan_result(false)), + } + } + Ok(self.scan_result(true)) } /// Drain the internal buffer, converting them to the [`ApplyEvents`], @@ -125,7 +125,7 @@ impl EventLoader { result: &mut ApplyEvents, resolver: &mut TwoPhaseResolver, ) -> Result<()> { - for entry in self.entry_batch.drain() { + for entry in self.entry_batch.drain(..) { match entry { TxnEntry::Prewrite { default: (key, value), @@ -175,121 +175,64 @@ impl EventLoader { } /// The context for loading incremental data between range. -/// Like [`cdc::Initializer`], but supports initialize over range. +/// Like [`cdc::Initializer`]. /// Note: maybe we can merge those two structures? -/// Note': maybe extract more fields to trait so it would be easier to test. #[derive(Clone)] -pub struct InitialDataLoader { +pub struct InitialDataLoader { // Note: maybe we can make it an abstract thing like `EventSink` with // method `async (KvEvent) -> Result<()>`? pub(crate) sink: Router, pub(crate) tracing: SubscriptionTracer, pub(crate) scheduler: Scheduler, - // Note: this is only for `init_range`, maybe make it an argument? 
- pub(crate) regions: R, - // Note: Maybe move those fields about initial scanning into some trait? - pub(crate) router: RT, - pub(crate) quota: PendingMemoryQuota, + + pub(crate) quota: Arc, pub(crate) limit: Limiter, + // If there are too many concurrent initial scanning, the limit of disk speed or pending memory + // quota will probably be triggered. Then the whole scanning will be pretty slow. And when + // we are holding a iterator for a long time, the memtable may not be able to be flushed. + // Using this to restrict the possibility of that. + concurrency_limit: Arc, + + cdc_handle: H, - pub(crate) handle: Handle, _engine: PhantomData, } -impl InitialDataLoader +impl InitialDataLoader where E: KvEngine, - R: RegionInfoProvider + Clone + 'static, - RT: CdcHandle, + H: CdcHandle + Sync, { pub fn new( - router: RT, - regions: R, sink: Router, tracing: SubscriptionTracer, sched: Scheduler, - quota: PendingMemoryQuota, - handle: Handle, + quota: Arc, limiter: Limiter, + cdc_handle: H, + concurrency_limit: Arc, ) -> Self { Self { - router, - regions, sink, tracing, scheduler: sched, _engine: PhantomData, quota, - handle, + cdc_handle, + concurrency_limit, limit: limiter, } } - pub fn observe_over_with_retry( + pub async fn capture_change( &self, region: &Region, - mut cmd: impl FnMut() -> ChangeObserver, + cmd: ChangeObserver, ) -> Result { - let mut last_err = None; - for _ in 0..MAX_GET_SNAPSHOT_RETRY { - let c = cmd(); - let r = self.observe_over(region, c); - match r { - Ok(s) => { - return Ok(s); - } - Err(e) => { - let can_retry = match e.without_context() { - Error::RaftRequest(pbe) => { - !(pbe.has_epoch_not_match() - || pbe.has_not_leader() - || pbe.get_message().contains("stale observe id") - || pbe.has_region_not_found()) - } - Error::RaftStore(raftstore::Error::RegionNotFound(_)) - | Error::RaftStore(raftstore::Error::NotLeader(..)) => false, - _ => true, - }; - e.report(format_args!( - "during getting initial snapshot for region {:?}; can retry = {}", 
- region, can_retry - )); - last_err = match last_err { - None => Some(e), - Some(err) => Some(Error::Contextual { - context: format!("and error {}", err), - inner_error: Box::new(e), - }), - }; - - if !can_retry { - break; - } - std::thread::sleep(Duration::from_secs(1)); - continue; - } - } - } - Err(last_err.expect("BUG: max retry time exceed but no error")) - } - - /// Start observe over some region. - /// This will register the region to the raftstore as observing, - /// and return the current snapshot of that region. - fn observe_over(&self, region: &Region, cmd: ChangeObserver) -> Result { - // There are 2 ways for getting the initial snapshot of a region: - // - the BR method: use the interface in the RaftKv interface, read the - // key-values directly. - // - the CDC method: use the raftstore message `SignificantMsg::CaptureChange` - // to register the region to CDC observer and get a snapshot at the same time. - // Registering the observer to the raftstore is necessary because we should only - // listen events from leader. In CDC, the change observer is - // per-delegate(i.e. per-region), we can create the command per-region here too. 
- let (callback, fut) = tikv_util::future::paired_future_callback::>(); - self.router + self.cdc_handle .capture_change( region.get_id(), region.get_region_epoch().clone(), @@ -315,7 +258,8 @@ where region.get_id() ))?; - let snap = block_on(fut) + let snap = fut + .await .map_err(|err| { annotate!( err, @@ -332,6 +276,54 @@ where Ok(snap) } + pub async fn observe_over_with_retry( + &self, + region: &Region, + mut cmd: impl FnMut() -> ChangeObserver, + ) -> Result { + let mut last_err = None; + for _ in 0..MAX_GET_SNAPSHOT_RETRY { + let c = cmd(); + let r = self.capture_change(region, c).await; + match r { + Ok(s) => { + return Ok(s); + } + Err(e) => { + let can_retry = match e.without_context() { + Error::RaftRequest(pbe) => { + !(pbe.has_epoch_not_match() + || pbe.has_not_leader() + || pbe.get_message().contains("stale observe id") + || pbe.has_region_not_found()) + } + Error::RaftStore(raftstore::Error::RegionNotFound(_)) + | Error::RaftStore(raftstore::Error::NotLeader(..)) => false, + _ => true, + }; + e.report(format_args!( + "during getting initial snapshot for region {:?}; can retry = {}", + region, can_retry + )); + last_err = match last_err { + None => Some(e), + Some(err) => Some(Error::Contextual { + context: format!("and error {}", err), + inner_error: Box::new(e), + }), + }; + + if !can_retry { + break; + } + tokio::time::sleep(Duration::from_secs(1)).await; + continue; + } + } + } + Err(last_err.expect("BUG: max retry time exceed but no error")) + } + fn with_resolver( &self, region: &Region, @@ -381,7 +373,7 @@ where f(v.value_mut().resolver()) } - fn scan_and_async_send( + async fn scan_and_async_send( &self, region: &Region, handle: &ObserveHandle, @@ -397,44 +389,48 @@ where let mut events = ApplyEvents::with_capacity(1024, region.id); // Note: the call of `fill_entries` is the only step which would read the disk. // we only need to record the disk throughput of this. 
- let (stat, disk_read) = - utils::with_record_read_throughput(|| event_loader.fill_entries()); - // We must use the size of entry batch here to check whether we have progress. - // Or we may exit too early if there are only records: - // - can be inlined to `write` CF (hence it won't be written to default CF) - // - are prewritten. (hence it will only contains `Prewrite` records). - // In this condition, ALL records generate no ApplyEvent(only lock change), - // and we would exit after the first run of loop :( - let no_progress = event_loader.entry_batch.is_empty(); - let stat = stat?; + let mut allocated = OwnedAllocated::new(Arc::clone(&self.quota)); + let (res, disk_read) = + utils::with_record_read_throughput(|| event_loader.fill_entries(&mut allocated)); + let res = res?; self.with_resolver(region, handle, |r| { event_loader.emit_entries_to(&mut events, r) })?; - if no_progress { - metrics::INITIAL_SCAN_DURATION.observe(start.saturating_elapsed_secs()); - return Ok(stats.stat); - } - stats.add_statistics(&stat); + stats.add_statistics(&res.statistics); let region_id = region.get_id(); let sink = self.sink.clone(); let event_size = events.size(); let sched = self.scheduler.clone(); - let permit = self.quota.pending(event_size); - self.limit.blocking_consume(disk_read as _); + self.limit.consume(disk_read as _).await; debug!("sending events to router"; "size" => %event_size, "region" => %region_id); metrics::INCREMENTAL_SCAN_SIZE.observe(event_size as f64); metrics::INCREMENTAL_SCAN_DISK_READ.inc_by(disk_read as f64); metrics::HEAP_MEMORY.add(event_size as _); + fail::fail_point!("scan_and_async_send::about_to_consume"); join_handles.push(tokio::spawn(async move { utils::handle_on_event_result(&sched, sink.on_events(events).await); metrics::HEAP_MEMORY.sub(event_size as _); + drop(allocated); debug!("apply event done"; "size" => %event_size, "region" => %region_id); - drop(permit); })); + if !res.more { + 
metrics::INITIAL_SCAN_DURATION.observe(start.saturating_elapsed_secs()); + return Ok(stats.stat); + } + if res.out_of_memory { + futures::future::try_join_all(join_handles.drain(..)) + .await + .map_err(|err| { + annotate!( + err, + "failed to join tokio runtime during out-of-memory-quota" + ) + })?; + } } } - pub fn do_initial_scan( + pub async fn do_initial_scan( &self, region: &Region, // We are using this handle for checking whether the initial scan is stale. @@ -442,18 +438,25 @@ where start_ts: TimeStamp, snap: impl Snapshot, ) -> Result { - let _guard = self.handle.enter(); let tr = self.tracing.clone(); let region_id = region.get_id(); let mut join_handles = Vec::with_capacity(8); + let permit = self + .concurrency_limit + .acquire() + .await + .expect("BUG: semaphore closed"); // It is ok to sink more data than needed. So scan to +inf TS for convenance. let event_loader = EventLoader::load_from(snap, start_ts, TimeStamp::max(), region)?; - let stats = self.scan_and_async_send(region, &handle, event_loader, &mut join_handles)?; + let stats = self + .scan_and_async_send(region, &handle, event_loader, &mut join_handles) + .await?; + drop(permit); - Handle::current() - .block_on(futures::future::try_join_all(join_handles)) + futures::future::try_join_all(join_handles) + .await .map_err(|err| annotate!(err, "tokio runtime failed to join consuming threads"))?; Self::with_resolver_by(&tr, region, &handle, |r| { @@ -467,39 +470,17 @@ where Ok(stats) } - - /// initialize a range: it simply scan the regions with leader role and send - /// them to [`initialize_region`]. 
- pub fn initialize_range(&self, start_key: Vec, end_key: Vec) -> Result<()> { - let mut pager = RegionPager::scan_from(self.regions.clone(), start_key, end_key); - loop { - let regions = pager.next_page(8)?; - debug!("scanning for entries in region."; "regions" => ?regions); - if regions.is_empty() { - break; - } - for r in regions { - // Note: Even we did the initial scanning, and blocking resolved ts from - // advancing, if the next_backup_ts was updated in some extreme condition, there - // is still little chance to lost data: For example, if a region cannot elect - // the leader for long time. (say, net work partition) At that time, we have - // nowhere to record the lock status of this region. - try_send!( - self.scheduler, - Task::ModifyObserve(ObserveOp::Start { region: r.region }) - ); - } - } - Ok(()) - } } #[cfg(test)] mod tests { + use std::sync::Arc; + use futures::executor::block_on; use kvproto::metapb::*; use tikv::storage::{txn::tests::*, TestEngineBuilder}; use tikv_kv::SnapContext; + use tikv_util::memory::{MemoryQuota, OwnedAllocated}; use txn_types::TimeStamp; use super::EventLoader; @@ -529,10 +510,13 @@ mod tests { let snap = block_on(async { tikv_kv::snapshot(&mut engine, SnapContext::default()).await }) .unwrap(); + let quota_inf = Arc::new(MemoryQuota::new(usize::MAX)); let mut loader = EventLoader::load_from(snap, TimeStamp::zero(), TimeStamp::max(), &r).unwrap(); - let (r, data_load) = with_record_read_throughput(|| loader.fill_entries()); + let (r, data_load) = with_record_read_throughput(|| { + loader.fill_entries(&mut OwnedAllocated::new(quota_inf)) + }); r.unwrap(); let mut events = ApplyEvents::with_capacity(1024, 42); let mut res = TwoPhaseResolver::new(42, None); diff --git a/components/backup-stream/src/lib.rs b/components/backup-stream/src/lib.rs index 3d4690d7f48..0402e5d2ee3 100644 --- a/components/backup-stream/src/lib.rs +++ b/components/backup-stream/src/lib.rs @@ -10,7 +10,7 @@ mod endpoint; pub mod errors; mod 
event_loader; pub mod metadata; -pub(crate) mod metrics; +pub mod metrics; pub mod observer; pub mod router; mod service; diff --git a/components/backup-stream/src/metadata/client.rs b/components/backup-stream/src/metadata/client.rs index 1fdc1b3b1e8..2232770915f 100644 --- a/components/backup-stream/src/metadata/client.rs +++ b/components/backup-stream/src/metadata/client.rs @@ -286,7 +286,19 @@ impl MetadataClient { Ok(()) } - pub async fn get_last_error( + pub async fn get_last_error(&self, name: &str) -> Result> { + let key = MetaKey::last_errors_of(name); + + let r = self.meta_store.get_latest(Keys::Prefix(key)).await?.inner; + if r.is_empty() { + return Ok(None); + } + let r = &r[0]; + let err = protobuf::parse_from_bytes(r.value())?; + Ok(Some(err)) + } + + pub async fn get_last_error_of( &self, name: &str, store_id: u64, diff --git a/components/backup-stream/src/metadata/mod.rs b/components/backup-stream/src/metadata/mod.rs index a96e2f9bcb6..1150c2932bd 100644 --- a/components/backup-stream/src/metadata/mod.rs +++ b/components/backup-stream/src/metadata/mod.rs @@ -8,5 +8,3 @@ pub mod store; pub mod test; pub use client::{Checkpoint, CheckpointProvider, MetadataClient, MetadataEvent, StreamTask}; -#[cfg(feature = "metastore-etcd")] -pub use store::lazy_etcd::{ConnectionConfig, LazyEtcdClient}; diff --git a/components/backup-stream/src/metadata/store/etcd.rs b/components/backup-stream/src/metadata/store/etcd.rs deleted file mode 100644 index 62a246a08ef..00000000000 --- a/components/backup-stream/src/metadata/store/etcd.rs +++ /dev/null @@ -1,627 +0,0 @@ -// Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. 
- -use std::{ - cmp::Ordering, - collections::{HashMap, HashSet}, - pin::Pin, - sync::{Arc, Weak}, - time::Duration, -}; - -use async_trait::async_trait; -use etcd_client::{ - Client, Compare, CompareOp, DeleteOptions, EventType, GetOptions, Member, PutOptions, - SortOrder, SortTarget, Txn, TxnOp, WatchOptions, -}; -use futures::StreamExt; -use tikv_util::{info, warn}; -use tokio::sync::Mutex; -use tokio_stream::Stream; - -use super::{ - GetExtra, GetResponse, Keys, KvChangeSubscription, KvEventType, MetaStore, Snapshot, - TransactionOp, -}; -use crate::{ - annotate, - errors::{Error, EtcdErrorExt, Result}, - metadata::{ - keys::{KeyValue, MetaKey}, - metrics::METADATA_KEY_OPERATION, - store::{KvEvent, Subscription}, - }, -}; -// Can we get rid of the mutex? (which means, we must use a singleton client.) -// Or make a pool of clients? -#[derive(Clone)] -pub struct EtcdStore(Arc>); - -#[derive(Default)] -pub(super) struct TopologyUpdater { - last_urls: HashSet, - client: Weak>, - - // back off configs - pub(super) loop_interval: Duration, - pub(super) loop_failure_back_off: Duration, -} - -impl std::fmt::Debug for TopologyUpdater { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - f.debug_struct("TopologyUpdater") - .field("last_urls", &self.last_urls) - .finish() - } -} - -#[async_trait] -pub(super) trait ClusterInfoProvider { - async fn get_members(&mut self) -> Result>; - async fn add_endpoint(&mut self, endpoint: &str) -> Result<()>; - async fn remove_endpoint(&mut self, endpoint: &str) -> Result<()>; -} - -#[async_trait] -impl ClusterInfoProvider for Client { - async fn get_members(&mut self) -> Result> { - let result = self.member_list().await?; - Ok(result.members().to_vec()) - } - - async fn add_endpoint(&mut self, endpoint: &str) -> Result<()> { - Client::add_endpoint(self, endpoint) - .await - .map_err(|err| annotate!(err, "during adding the endpoint {}", endpoint))?; - Ok(()) - } - - async fn remove_endpoint(&mut self, endpoint: 
&str) -> Result<()> { - Client::remove_endpoint(self, endpoint) - .await - .map_err(|err| annotate!(err, "during removing the endpoint {}", endpoint))?; - Ok(()) - } -} - -#[derive(Debug, Clone, Copy)] -enum DiffType { - Add, - Remove, -} - -#[derive(Clone)] -struct Diff { - diff_type: DiffType, - url: String, -} - -impl std::fmt::Debug for Diff { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - let syn = match self.diff_type { - DiffType::Add => "+", - DiffType::Remove => "-", - }; - write!(f, "{}{}", syn, self.url) - } -} - -impl TopologyUpdater { - // Note: we may require the initial endpoints from the arguments directly. - // So the internal map won't get inconsistent when the cluster config changed - // during initializing. - // But that is impossible for now because we cannot query the node ID before - // connecting. - pub fn new(cluster_ref: Weak>) -> Self { - Self { - last_urls: Default::default(), - client: cluster_ref, - - loop_interval: Duration::from_secs(60), - loop_failure_back_off: Duration::from_secs(10), - } - } - - pub fn init(&mut self, members: impl Iterator) { - for mem in members { - self.last_urls.insert(mem); - } - } - - fn diff(&self, incoming: &[Member]) -> Vec { - let newer = incoming - .iter() - .flat_map(|mem| mem.client_urls().iter()) - .collect::>(); - let mut result = vec![]; - for url in &newer { - if !self.last_urls.contains(*url) { - result.push(Diff { - diff_type: DiffType::Add, - url: String::clone(url), - }) - } - } - for url in &self.last_urls { - if !newer.contains(url) { - result.push(Diff { - diff_type: DiffType::Remove, - url: String::clone(url), - }) - } - } - result - } - - fn apply(&mut self, diff: &Diff) -> Option { - match diff.diff_type { - DiffType::Add => match self.last_urls.insert(diff.url.clone()) { - true => None, - false => Some(format!( - "the member to adding with url {} overrides existing urls.", - diff.url - )), - }, - DiffType::Remove => match self.last_urls.remove(&diff.url) { - 
true => None, - false => Some(format!( - "the member to remove with url {} hasn't been added.", - diff.url - )), - }, - } - } - - async fn update_topology_by(&mut self, cli: &mut C, diff: &Diff) -> Result<()> { - match diff.diff_type { - DiffType::Add => cli.add_endpoint(&diff.url).await?, - DiffType::Remove => cli.remove_endpoint(&diff.url).await?, - } - Ok(()) - } - - async fn do_update(&mut self, cli: &mut C) -> Result<()> { - let cluster = cli.get_members().await?; - let diffs = self.diff(cluster.as_slice()); - if !diffs.is_empty() { - info!("log backup updating store topology."; "diffs" => ?diffs, "current_state" => ?self); - } - for diff in diffs { - match self.apply(&diff) { - Some(warning) => { - warn!("log backup meet some wrong status when updating PD clients, skipping this update."; "warn" => %warning); - } - None => self.update_topology_by(cli, &diff).await?, - } - } - Result::Ok(()) - } - - pub(super) async fn update_topology_loop(&mut self) { - while let Some(cli) = self.client.upgrade() { - let mut lock = cli.lock().await; - let result = self.do_update(&mut lock).await; - drop(lock); - match result { - Ok(_) => tokio::time::sleep(self.loop_interval).await, - Err(err) => { - err.report("during updating etcd topology"); - tokio::time::sleep(self.loop_failure_back_off).await; - } - } - } - } - - pub async fn main_loop(mut self) { - info!("log backup topology updater finish initialization."; "current_state" => ?self); - self.update_topology_loop().await - } -} - -impl EtcdStore { - pub fn connect, S: AsRef<[E]>>(endpoints: S) -> Self { - // TODO remove block_on - let cli = - futures::executor::block_on(etcd_client::Client::connect(&endpoints, None)).unwrap(); - Self(Arc::new(Mutex::new(cli))) - } - - pub fn inner(&self) -> &Arc> { - &self.0 - } -} - -impl From for EtcdStore { - fn from(cli: etcd_client::Client) -> Self { - Self(Arc::new(Mutex::new(cli))) - } -} - -impl From for KvEventType { - fn from(e: EventType) -> Self { - match e { - EventType::Put 
=> Self::Put, - EventType::Delete => Self::Delete, - } - } -} - -impl From for KeyValue { - fn from(kv: etcd_client::KeyValue) -> Self { - // TODO: we can move out the vector in the KeyValue struct here. (instead of - // copying.) But that isn't possible for now because: - // - The raw KV pair(defined by the protocol buffer of etcd) is private. - // - That did could be exported by `pub-fields` feature of the client. However - // that feature isn't published in theirs Cargo.toml (Is that a mistake?). - // - Indeed, we can use `mem::transmute` here because `etcd_client::KeyValue` - // has `#[repr(transparent)]`. But before here become a known bottle neck, I'm - // not sure whether it's worthwhile for involving unsafe code. - KeyValue(MetaKey(kv.key().to_owned()), kv.value().to_owned()) - } -} - -/// Prepare the etcd options required by the keys. -/// Return the start key for requesting. -macro_rules! prepare_opt { - ($opt:ident, $keys:expr) => { - match $keys { - Keys::Prefix(key) => { - $opt = $opt.with_prefix(); - key - } - Keys::Range(key, end_key) => { - $opt = $opt.with_range(end_key); - key - } - Keys::Key(key) => key, - } - }; -} - -#[async_trait] -impl MetaStore for EtcdStore { - type Snap = EtcdSnapshot; - - async fn snapshot(&self) -> Result { - let status = self.0.lock().await.status().await?; - Ok(EtcdSnapshot { - store: self.clone(), - revision: status.header().unwrap().revision(), - }) - } - - async fn watch(&self, keys: Keys, start_rev: i64) -> Result { - let mut opt = WatchOptions::new(); - let key = prepare_opt!(opt, keys); - opt = opt.with_start_revision(start_rev); - let (mut watcher, stream) = self.0.lock().await.watch(key, Some(opt)).await?; - Ok(Subscription { - stream: Box::pin(stream.flat_map( - |events| -> Pin> + Send>> { - match events { - Err(err) => Box::pin(tokio_stream::once(Err(err.into()))), - Ok(events) => { - if events.compact_revision() > 0 && events.canceled() { - return Box::pin(tokio_stream::once(Err(Error::Etcd( - 
EtcdErrorExt::RevisionCompacted { - current: events.compact_revision(), - }, - )))); - } - if events.canceled() { - return Box::pin(tokio_stream::once(Err(Error::Etcd( - EtcdErrorExt::WatchCanceled, - )))); - } - Box::pin(tokio_stream::iter( - // TODO: remove the copy here via access the protobuf field - // directly. - #[allow(clippy::unnecessary_to_owned)] - events.events().to_owned().into_iter().filter_map(|event| { - let kv = event.kv()?; - Some(Ok(KvEvent { - kind: event.event_type().into(), - pair: kv.clone().into(), - })) - }), - )) - } - } - }, - )), - cancel: Box::pin(async move { - if let Err(err) = watcher.cancel().await { - warn!("failed to cancel watch stream!"; "err" => %err); - } - }), - }) - } - - async fn txn(&self, t: super::Transaction) -> Result<()> { - let mut cli = self.0.lock().await; - let txns = Self::make_txn(&mut cli, t).await?; - for txn in txns { - cli.txn(txn).await?; - } - Ok(()) - } - - async fn set(&self, pair: KeyValue) -> Result<()> { - self.0.lock().await.put(pair.0, pair.1, None).await?; - Ok(()) - } - - async fn delete(&self, keys: Keys) -> Result<()> { - let mut opt = DeleteOptions::new(); - let key = prepare_opt!(opt, keys); - - self.0.lock().await.delete(key, Some(opt)).await?; - Ok(()) - } - - async fn txn_cond(&self, txn: super::CondTransaction) -> Result<()> { - let mut cli = self.0.lock().await; - let txn = Self::make_conditional_txn(&mut cli, txn).await?; - cli.txn(txn).await?; - Ok(()) - } -} - -impl EtcdStore { - fn collect_leases_needed(txn: &super::Transaction) -> HashSet { - txn.ops - .iter() - .filter_map(|op| match op { - TransactionOp::Put(_, opt) if opt.ttl.as_secs() > 0 => Some(opt.ttl), - _ => None, - }) - .collect() - } - - async fn make_leases( - cli: &mut Client, - needed: HashSet, - ) -> Result> { - let mut map = HashMap::with_capacity(needed.len()); - for lease_time in needed { - let lease_id = cli.lease_grant(lease_time.as_secs() as _, None).await?.id(); - map.insert(lease_time, lease_id); - } - Ok(map) 
- } - - fn partition_txns(mut txn: super::Transaction, leases: HashMap) -> Vec { - txn.ops - .chunks_mut(128) - .map(|txn| Txn::default().and_then(Self::to_txn(txn, &leases))) - .collect() - } - - fn to_compare(cond: super::Condition) -> Compare { - let op = match cond.result { - Ordering::Less => CompareOp::Less, - Ordering::Equal => CompareOp::Equal, - Ordering::Greater => CompareOp::Greater, - }; - Compare::value(cond.over_key, op, cond.arg) - } - - /// Convert the transaction operations to etcd transaction ops. - fn to_txn(ops: &mut [super::TransactionOp], leases: &HashMap) -> Vec { - ops.iter_mut().map(|op| match op { - TransactionOp::Put(key, opt) => { - let opts = if opt.ttl.as_secs() > 0 { - let lease = leases.get(&opt.ttl); - match lease { - None => { - warn!("lease not found, the request key may not have a ttl"; "dur" => ?opt.ttl); - None - } - Some(lease_id) => { - Some(PutOptions::new().with_lease(*lease_id)) - } - } - } else { - None - }; - TxnOp::put(key.take_key(), key.take_value(), opts) - }, - TransactionOp::Delete(rng) => { - let rng = std::mem::replace(rng, Keys::Key(MetaKey(vec![]))); - let mut opt = DeleteOptions::new(); - let key = prepare_opt!(opt, rng); - TxnOp::delete(key, Some(opt)) - }, - }).collect::>() - } - - /// Make a conditional txn. - /// For now, this wouldn't split huge transaction into smaller ones, - /// so when playing with etcd in PD, conditional transaction should be - /// small. 
- async fn make_conditional_txn( - cli: &mut Client, - mut txn: super::CondTransaction, - ) -> Result { - let cond = Self::to_compare(txn.cond); - - let mut leases_needed = Self::collect_leases_needed(&txn.success); - leases_needed.extend(Self::collect_leases_needed(&txn.failure).into_iter()); - let leases = Self::make_leases(cli, leases_needed).await?; - let success = Self::to_txn(&mut txn.success.ops, &leases); - let failure = Self::to_txn(&mut txn.failure.ops, &leases); - Ok(Txn::new().when([cond]).and_then(success).or_else(failure)) - } - - async fn make_txn(cli: &mut Client, etcd_txn: super::Transaction) -> Result> { - let (put_cnt, delete_cnt) = etcd_txn.ops.iter().fold((0, 0), |(p, d), item| match item { - TransactionOp::Put(..) => (p + 1, d), - TransactionOp::Delete(_) => (p, d + 1), - }); - METADATA_KEY_OPERATION - .with_label_values(&["put"]) - .inc_by(put_cnt); - METADATA_KEY_OPERATION - .with_label_values(&["del"]) - .inc_by(delete_cnt); - let needed_leases = Self::collect_leases_needed(&etcd_txn); - let leases = Self::make_leases(cli, needed_leases).await?; - let txns = Self::partition_txns(etcd_txn, leases); - Ok(txns) - } -} - -pub struct EtcdSnapshot { - store: EtcdStore, - revision: i64, -} - -#[async_trait] -impl Snapshot for EtcdSnapshot { - async fn get_extra(&self, keys: Keys, extra: GetExtra) -> Result { - let mut opt = GetOptions::new(); - let key = prepare_opt!(opt, keys); - opt = opt.with_revision(self.revision); - if extra.desc_order { - opt = opt.with_sort(SortTarget::Key, SortOrder::Descend); - } - if extra.limit > 0 { - opt = opt.with_limit(extra.limit as _); - } - let resp = self.store.0.lock().await.get(key.0, Some(opt)).await?; - Ok(GetResponse { - kvs: resp - .kvs() - .iter() - .map(|kv| KeyValue(MetaKey(kv.key().to_owned()), kv.value().to_owned())) - .collect(), - more: resp.more(), - }) - } - - fn revision(&self) -> i64 { - self.revision - } -} - -#[cfg(test)] -mod test { - use std::{ - collections::{HashMap, HashSet}, - 
fmt::Display, - sync::Arc, - time::Duration, - }; - - use async_trait::async_trait; - use etcd_client::{proto::PbMember, Member}; - use tokio::{sync::Mutex, time::timeout}; - - use super::{ClusterInfoProvider, TopologyUpdater}; - use crate::errors::Result; - - #[derive(Default, Debug)] - struct FakeCluster { - id_alloc: u64, - members: HashMap, - endpoints: HashSet, - } - - #[async_trait] - impl ClusterInfoProvider for FakeCluster { - async fn get_members(&mut self) -> Result> { - let members = self.members.values().cloned().collect(); - Ok(members) - } - - async fn add_endpoint(&mut self, endpoint: &str) -> Result<()> { - self.endpoints.insert(endpoint.to_owned()); - Ok(()) - } - - async fn remove_endpoint(&mut self, endpoint: &str) -> Result<()> { - self.endpoints.remove(endpoint); - Ok(()) - } - } - - impl FakeCluster { - fn new_id(&mut self) -> u64 { - let i = self.id_alloc; - self.id_alloc += 1; - i - } - - fn init_with_member(&mut self, n: usize) -> Vec { - let mut endpoints = Vec::with_capacity(n); - for _ in 0..n { - let mem = self.add_member(); - let url = format!("fakestore://{}", mem); - self.endpoints.insert(url.clone()); - endpoints.push(url); - } - endpoints - } - - fn add_member(&mut self) -> u64 { - let id = self.new_id(); - let mut mem = PbMember::default(); - mem.id = id; - mem.client_ur_ls = vec![format!("fakestore://{}", id)]; - // Safety: `Member` is #[repr(transparent)]. 
- self.members.insert(id, unsafe { std::mem::transmute(mem) }); - id - } - - fn remove_member(&mut self, id: u64) -> bool { - self.members.remove(&id).is_some() - } - - fn check_consistency(&self, message: impl Display) { - let urls = self - .members - .values() - .flat_map(|mem| mem.client_urls().iter().cloned()) - .collect::>(); - assert_eq!( - urls, self.endpoints, - "{}: consistency check not passed.", - message - ); - } - } - - #[test] - fn test_topology_updater() { - let mut c = FakeCluster::default(); - let eps = c.init_with_member(3); - let rt = tokio::runtime::Builder::new_current_thread() - .enable_all() - .build() - .unwrap(); - - let sc = Arc::new(Mutex::new(c)); - let mut tu = TopologyUpdater::new(Arc::downgrade(&sc)); - tu.loop_failure_back_off = Duration::ZERO; - tu.loop_interval = Duration::from_millis(100); - tu.init(eps.into_iter()); - - { - let mut sc = sc.blocking_lock(); - sc.check_consistency("after init"); - sc.add_member(); - rt.block_on(tu.do_update(&mut sc)).unwrap(); - sc.check_consistency("adding nodes"); - sc.add_member(); - sc.add_member(); - rt.block_on(tu.do_update(&mut sc)).unwrap(); - sc.check_consistency("adding more nodes"); - assert!(sc.remove_member(0), "{:?}", sc); - rt.block_on(tu.do_update(&mut sc)).unwrap(); - sc.check_consistency("removing nodes"); - } - - drop(sc); - rt.block_on(async { timeout(Duration::from_secs(1), tu.update_topology_loop()).await }) - .unwrap() - } -} diff --git a/components/backup-stream/src/metadata/store/lazy_etcd.rs b/components/backup-stream/src/metadata/store/lazy_etcd.rs deleted file mode 100644 index 7dacf45e697..00000000000 --- a/components/backup-stream/src/metadata/store/lazy_etcd.rs +++ /dev/null @@ -1,316 +0,0 @@ -// Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. 
- -use std::{ - sync::Arc, - time::{Duration, SystemTime}, -}; - -use etcd_client::{ConnectOptions, Error as EtcdError, OpenSslClientConfig}; -use futures::Future; -use openssl::{ - pkey::PKey, - x509::{verify::X509VerifyFlags, X509}, -}; -use security::SecurityManager; -use tikv_util::{ - info, - stream::{RetryError, RetryExt}, - warn, -}; -use tokio::sync::Mutex as AsyncMutex; - -use super::{ - etcd::{EtcdSnapshot, TopologyUpdater}, - EtcdStore, MetaStore, -}; -use crate::errors::{ContextualResultExt, Result}; - -const RPC_TIMEOUT: Duration = Duration::from_secs(5); - -#[derive(Clone)] -pub struct LazyEtcdClient(Arc>); - -#[derive(Clone)] -pub struct ConnectionConfig { - pub tls: Arc, - pub keep_alive_interval: Duration, - pub keep_alive_timeout: Duration, -} - -impl Default for ConnectionConfig { - fn default() -> Self { - Self { - tls: Default::default(), - keep_alive_interval: Duration::from_secs(10), - keep_alive_timeout: Duration::from_secs(3), - } - } -} - -impl std::fmt::Debug for ConnectionConfig { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - f.debug_struct("ConnectionConfig") - .field("keep_alive_interval", &self.keep_alive_interval) - .field("keep_alive_timeout", &self.keep_alive_timeout) - .finish() - } -} - -impl ConnectionConfig { - /// Convert the config to the connection option. - fn to_connection_options(&self) -> ConnectOptions { - let mut opts = ConnectOptions::new(); - if let Some(tls) = &self - .tls - .client_suite() - .map_err(|err| warn!("failed to load client suite!"; "err" => %err)) - .ok() - { - opts = opts.with_openssl_tls( - OpenSslClientConfig::default() - .ca_cert_pem(&tls.ca) - // Some of users may prefer using multi-level self-signed certs. - // In this scenario, we must set this flag or openssl would probably complain it cannot found the root CA. - // (Because the flags we provide allows users providing exactly one CA cert.) 
- // We haven't make it configurable because it is enabled in gRPC by default too. - // TODO: Perhaps implement grpc-io based etcd client, fully remove the difference between gRPC TLS and our custom TLS? - .manually(|c| c.cert_store_mut().set_flags(X509VerifyFlags::PARTIAL_CHAIN)) - .manually(|c| { - let mut client_certs= X509::stack_from_pem(&tls.client_cert)?; - let client_key = PKey::private_key_from_pem(&tls.client_key.0)?; - if !client_certs.is_empty() { - c.set_certificate(&client_certs[0])?; - } - if client_certs.len() > 1 { - for i in client_certs.drain(1..) { - c.add_extra_chain_cert(i)?; - } - } - c.set_private_key(&client_key)?; - Ok(()) - }), - ) - } - opts = opts - .with_keep_alive(self.keep_alive_interval, self.keep_alive_timeout) - .with_keep_alive_while_idle(false) - .with_timeout(RPC_TIMEOUT); - - opts - } -} - -impl LazyEtcdClient { - pub fn new(endpoints: &[String], conf: ConnectionConfig) -> Self { - let mut inner = LazyEtcdClientInner::new(endpoints, conf); - inner.normalize_urls(); - Self(Arc::new(AsyncMutex::new(inner))) - } - - // For testing -- check whether the endpoints are properly normalized. 
- #[cfg(test)] - pub(super) fn endpoints(&self) -> Vec { - self.0.blocking_lock().endpoints.clone() - } - - async fn get_cli(&self) -> Result { - let mut l = self.0.lock().await; - l.get_cli().await.cloned() - } -} - -#[derive(Clone)] -pub struct LazyEtcdClientInner { - conf: ConnectionConfig, - endpoints: Vec, - - last_modified: Option, - cli: Option, -} - -impl LazyEtcdClientInner { - fn new(endpoints: &[String], conf: ConnectionConfig) -> Self { - LazyEtcdClientInner { - conf, - endpoints: endpoints.iter().map(ToString::to_string).collect(), - last_modified: None, - cli: None, - } - } -} - -fn etcd_error_is_retryable(etcd_err: &EtcdError) -> bool { - match etcd_err { - EtcdError::InvalidArgs(_) - | EtcdError::InvalidUri(_) - | EtcdError::Utf8Error(_) - | EtcdError::InvalidHeaderValue(_) - | EtcdError::EndpointError(_) - | EtcdError::OpenSsl(_) => false, - EtcdError::TransportError(_) - | EtcdError::IoError(_) - | EtcdError::WatchError(_) - | EtcdError::LeaseKeepAliveError(_) - | EtcdError::ElectError(_) => true, - EtcdError::GRpcStatus(grpc) => matches!( - grpc.code(), - tonic::Code::Unavailable - | tonic::Code::Aborted - | tonic::Code::Internal - | tonic::Code::ResourceExhausted - ), - } -} - -#[derive(Debug)] -struct RetryableEtcdError(EtcdError); - -impl RetryError for RetryableEtcdError { - fn is_retryable(&self) -> bool { - etcd_error_is_retryable(&self.0) - } -} - -impl From for RetryableEtcdError { - fn from(e: EtcdError) -> Self { - Self(e) - } -} - -pub async fn retry(mut action: impl FnMut() -> F) -> Result -where - F: Future>, -{ - use futures::TryFutureExt; - let r = tikv_util::stream::retry_ext( - move || action().err_into::(), - RetryExt::default().with_fail_hook(|err| info!("retry it"; "err" => ?err)), - ) - .await; - r.map_err(|err| err.0.into()) -} - -impl LazyEtcdClientInner { - fn normalize_urls(&mut self) { - let enabled_tls = self.conf.tls.client_suite().is_ok(); - for endpoint in self.endpoints.iter_mut() { - // Don't touch them when the 
schemes already provided. - // Given etcd is based on gRPC (which relies on HTTP/2), - // there shouldn't be other schemes available (Hopefully...) - if endpoint.starts_with("http://") || endpoint.starts_with("https://") { - continue; - } - let expected_scheme = if enabled_tls { "https" } else { "http" }; - *endpoint = format!("{}://{}", expected_scheme, endpoint) - } - info!("log backup normalized etcd endpoints"; "endpoints" => ?self.endpoints); - } - - async fn connect(&mut self) -> Result<&EtcdStore> { - let store = retry(|| { - // For now, the interface of the `etcd_client` doesn't us to control - // how to create channels when connecting, hence we cannot update the tls config - // at runtime, now what we did is manually check that each time we are getting - // the clients. - etcd_client::Client::connect( - self.endpoints.clone(), - Some(self.conf.to_connection_options()), - ) - }) - .await - .context("during connecting to the etcd")?; - let store = EtcdStore::from(store); - let mut updater = TopologyUpdater::new(Arc::downgrade(store.inner())); - self.cli = Some(store); - updater.init(self.endpoints.iter().cloned()); - tokio::task::spawn(updater.main_loop()); - Ok(self.cli.as_ref().unwrap()) - } - - pub async fn get_cli(&mut self) -> Result<&EtcdStore> { - let modified = self.conf.tls.get_config().is_modified(&mut self.last_modified) - // Don't reload once we cannot check whether it is modified. - // Because when TLS disabled, this would always fail. 
- .unwrap_or(false); - if !modified && self.cli.is_some() { - return Ok(self.cli.as_ref().unwrap()); - } - info!("log backup reconnecting to the etcd service."; "tls_modified" => %modified, "connected_before" => %self.cli.is_some()); - self.connect().await - } -} - -#[async_trait::async_trait] -impl MetaStore for LazyEtcdClient { - type Snap = EtcdSnapshot; - - async fn snapshot(&self) -> Result { - self.get_cli().await?.snapshot().await - } - - async fn watch( - &self, - keys: super::Keys, - start_rev: i64, - ) -> Result { - self.get_cli().await?.watch(keys, start_rev).await - } - - async fn txn(&self, txn: super::Transaction) -> Result<()> { - self.get_cli().await?.txn(txn).await - } - - async fn txn_cond(&self, txn: super::CondTransaction) -> Result<()> { - self.get_cli().await?.txn_cond(txn).await - } -} - -#[cfg(test)] -mod tests { - use std::{fs::File, io::Write, path::PathBuf, sync::Arc}; - - use security::{SecurityConfig, SecurityManager}; - use tempfile::TempDir; - - use super::LazyEtcdClient; - use crate::{errors::Result, metadata::ConnectionConfig}; - - #[test] - fn test_normalize_url() -> Result<()> { - let endpoints = ["http://pd-1".to_owned(), "pd-2".to_owned()]; - let le = LazyEtcdClient::new(&endpoints, Default::default()); - assert_eq!(le.endpoints(), &["http://pd-1", "http://pd-2"]); - - let tempdir = TempDir::new()?; - let write_all = |path: &PathBuf, content| { - let mut f = File::create(path)?; - f.write_all(content)?; - Result::Ok(()) - }; - let ca = tempdir.path().join("ca"); - let cert = tempdir.path().join("cert"); - let key = tempdir.path().join("key"); - write_all(&ca, b"CA :3")?; - write_all(&cert, b"Cert :D")?; - write_all(&key, b"Key X)")?; - - let cfg = SecurityConfig { - ca_path: ca.to_string_lossy().into_owned(), - cert_path: cert.to_string_lossy().into_owned(), - key_path: key.to_string_lossy().into_owned(), - - ..Default::default() - }; - let sm = SecurityManager::new(&cfg).unwrap(); - let endpoints = ["https://pd-1".to_owned(), 
"pd-2".to_owned()]; - let le = LazyEtcdClient::new( - &endpoints, - ConnectionConfig { - tls: Arc::new(sm), - ..Default::default() - }, - ); - assert_eq!(le.endpoints(), &["https://pd-1", "https://pd-2"]); - Result::Ok(()) - } -} diff --git a/components/backup-stream/src/metadata/store/mod.rs b/components/backup-stream/src/metadata/store/mod.rs index 7cecda9720e..00f18c47042 100644 --- a/components/backup-stream/src/metadata/store/mod.rs +++ b/components/backup-stream/src/metadata/store/mod.rs @@ -1,13 +1,5 @@ // Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. -cfg_if::cfg_if! { - if #[cfg(feature = "metastore-etcd")] { - pub mod etcd; - pub mod lazy_etcd; - pub use etcd::EtcdStore; - } -} - // Note: these mods also used for integration tests, // so we cannot compile them only when `#[cfg(test)]`. // (See https://github.com/rust-lang/rust/issues/84629) diff --git a/components/backup-stream/src/router.rs b/components/backup-stream/src/router.rs index b0d3453c958..00ce93635e8 100644 --- a/components/backup-stream/src/router.rs +++ b/components/backup-stream/src/router.rs @@ -14,8 +14,7 @@ use std::{ }; use engine_traits::{CfName, CF_DEFAULT, CF_LOCK, CF_WRITE}; -use external_storage::{BackendConfig, UnpinReader}; -use external_storage_export::{create_storage, ExternalStorage}; +use external_storage::{create_storage, BackendConfig, ExternalStorage, UnpinReader}; use futures::io::Cursor; use kvproto::{ brpb::{ @@ -540,6 +539,15 @@ impl RouterInner { let task_info = self.get_task_info(&task).await?; task_info.on_events(events).await?; let file_size_limit = self.temp_file_size_limit.load(Ordering::SeqCst); + #[cfg(features = "failpoints")] + { + let delayed = (|| { + fail::fail_point!("router_on_event_delay_ms", |v| { + v.and_then(|v| v.parse::().ok()).unwrap_or(0) + }) + })(); + tokio::time::sleep(Duration::from_millis(delayed)).await; + } // When this event make the size of temporary files exceeds the size limit, make // a flush. 
Note that we only flush if the size is less than the limit before @@ -708,22 +716,25 @@ impl TempFileKey { /// The full name of the file owns the key. fn temp_file_name(&self) -> String { + let timestamp = (|| { + fail::fail_point!("temp_file_name_timestamp", |t| t.map_or_else( + || TimeStamp::physical_now(), + |v| + // reduce the precision of timestamp + v.parse::().ok().map_or(0, |u| TimeStamp::physical_now() / u) + )); + TimeStamp::physical_now() + })(); + let uuid = uuid::Uuid::new_v4(); if self.is_meta { format!( - "meta_{:08}_{}_{:?}_{}.temp.log", - self.region_id, - self.cf, - self.cmd_type, - TimeStamp::physical_now(), + "meta_{:08}_{}_{:?}_{:?}_{}.temp.log", + self.region_id, self.cf, self.cmd_type, uuid, timestamp, ) } else { format!( - "{:08}_{:08}_{}_{:?}_{}.temp.log", - self.table_id, - self.region_id, - self.cf, - self.cmd_type, - TimeStamp::physical_now(), + "{:08}_{:08}_{}_{:?}_{:?}_{}.temp.log", + self.table_id, self.region_id, self.cf, self.cmd_type, uuid, timestamp, ) } } @@ -819,6 +830,28 @@ pub struct StreamTaskInfo { temp_file_pool: Arc, } +impl Drop for StreamTaskInfo { + fn drop(&mut self) { + let (success, failed): (Vec<_>, Vec<_>) = self + .flushing_files + .get_mut() + .drain(..) 
+ .chain(self.flushing_meta_files.get_mut().drain(..)) + .map(|(_, f, _)| f.inner.path().to_owned()) + .map(|p| self.temp_file_pool.remove(&p)) + .partition(|r| *r); + info!("stream task info dropped[1/2], removing flushing_temp files"; "success" => %success.len(), "failure" => %failed.len()); + let (success, failed): (Vec<_>, Vec<_>) = self + .files + .get_mut() + .drain() + .map(|(_, f)| f.into_inner().inner.path().to_owned()) + .map(|p| self.temp_file_pool.remove(&p)) + .partition(|r| *r); + info!("stream task info dropped[2/2], removing temp files"; "success" => %success.len(), "failure" => %failed.len()); + } +} + impl std::fmt::Debug for StreamTaskInfo { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { f.debug_struct("StreamTaskInfo") @@ -864,6 +897,7 @@ impl StreamTaskInfo { } async fn on_events_of_key(&self, key: TempFileKey, events: ApplyEvents) -> Result<()> { + fail::fail_point!("before_generate_temp_file"); if let Some(f) = self.files.read().await.get(&key) { self.total_size .fetch_add(f.lock().await.on_events(events).await?, Ordering::SeqCst); @@ -886,6 +920,7 @@ impl StreamTaskInfo { let f = w.get(&key).unwrap(); self.total_size .fetch_add(f.lock().await.on_events(events).await?, Ordering::SeqCst); + fail::fail_point!("after_write_to_file"); Ok(()) } @@ -968,7 +1003,9 @@ impl StreamTaskInfo { pub async fn move_to_flushing_files(&self) -> Result<&Self> { // if flushing_files is not empty, which represents this flush is a retry // operation. - if !self.flushing_files.read().await.is_empty() { + if !self.flushing_files.read().await.is_empty() + || !self.flushing_meta_files.read().await.is_empty() + { return Ok(self); } @@ -1030,7 +1067,12 @@ impl StreamTaskInfo { // and push it into merged_file_info(DataFileGroup). 
file_info_clone.set_range_offset(stat_length); data_files_open.push({ - let file = shared_pool.open_raw_for_read(data_file.inner.path())?; + let file = shared_pool + .open_raw_for_read(data_file.inner.path()) + .context(format_args!( + "failed to open read file {:?}", + data_file.inner.path() + ))?; let compress_length = file.len().await?; stat_length += compress_length; file_info_clone.set_range_length(compress_length); @@ -1095,7 +1137,6 @@ impl StreamTaskInfo { .await?; self.merge_log(metadata, storage.clone(), &self.flushing_meta_files, true) .await?; - Ok(()) } @@ -1155,7 +1196,8 @@ impl StreamTaskInfo { UnpinReader(Box::new(Cursor::new(meta_buff))), buflen as _, ) - .await?; + .await + .context(format_args!("flush meta {:?}", meta_path))?; } Ok(()) } @@ -1189,13 +1231,14 @@ impl StreamTaskInfo { .await? .generate_metadata(store_id) .await?; + + fail::fail_point!("after_moving_to_flushing_files"); crate::metrics::FLUSH_DURATION .with_label_values(&["generate_metadata"]) .observe(sw.lap().as_secs_f64()); // flush log file to storage. self.flush_log(&mut metadata_info).await?; - // the field `min_resolved_ts` of metadata will be updated // only after flush is done. metadata_info.min_resolved_ts = metadata_info @@ -1855,7 +1898,7 @@ mod tests { #[tokio::test] async fn test_do_flush() { let tmp_dir = tempfile::tempdir().unwrap(); - let backend = external_storage_export::make_local_backend(tmp_dir.path()); + let backend = external_storage::make_local_backend(tmp_dir.path()); let mut task_info = StreamBackupTaskInfo::default(); task_info.set_storage(backend); let stream_task = StreamTask { @@ -2068,6 +2111,12 @@ mod tests { let (task, _path) = task("cleanup_test".to_owned()).await?; must_register_table(&router, task, 1).await; write_simple_data(&router).await; + let tempfiles = router + .get_task_info("cleanup_test") + .await + .unwrap() + .temp_file_pool + .clone(); router .get_task_info("cleanup_test") .await? 
@@ -2076,6 +2125,7 @@ mod tests { write_simple_data(&router).await; let mut w = walkdir::WalkDir::new(&tmp).into_iter(); assert!(w.next().is_some(), "the temp files doesn't created"); + assert!(tempfiles.mem_used() > 0, "the temp files doesn't created."); drop(router); let w = walkdir::WalkDir::new(&tmp) .into_iter() @@ -2093,6 +2143,11 @@ mod tests { "the temp files should be removed, but it is {:?}", w ); + assert_eq!( + tempfiles.mem_used(), + 0, + "the temp files hasn't been cleared." + ); Ok(()) } @@ -2228,7 +2283,7 @@ mod tests { async fn test_update_global_checkpoint() -> Result<()> { // create local storage let tmp_dir = tempfile::tempdir().unwrap(); - let backend = external_storage_export::make_local_backend(tmp_dir.path()); + let backend = external_storage::make_local_backend(tmp_dir.path()); // build a StreamTaskInfo let mut task_info = StreamBackupTaskInfo::default(); @@ -2411,4 +2466,91 @@ mod tests { let r = cfg_manager.dispatch(changed); assert!(r.is_err()); } + + #[tokio::test(flavor = "multi_thread", worker_threads = 4)] + async fn test_flush_on_events_race() -> Result<()> { + let (tx, _rx) = dummy_scheduler(); + let tmp = std::env::temp_dir().join(format!("{}", uuid::Uuid::new_v4())); + let router = Arc::new(RouterInner::new( + tx, + Config { + prefix: tmp.clone(), + // disable auto flush. + temp_file_size_limit: 1000, + temp_file_memory_quota: 2, + max_flush_interval: Duration::from_secs(300), + }, + )); + + let (task, _path) = task("race".to_owned()).await?; + must_register_table(router.as_ref(), task, 1).await; + router + .must_mut_task_info("race", |i| { + i.storage = Arc::new(NoopStorage::default()); + }) + .await; + let mut b = KvEventsBuilder::new(42, 0); + b.put_table(CF_DEFAULT, 1, b"k1", b"v1"); + let events_before_flush = b.finish(); + + b.put_table(CF_DEFAULT, 1, b"k1", b"v1"); + let events_after_flush = b.finish(); + + // make timestamp precision to 1 seconds. 
+ fail::cfg("temp_file_name_timestamp", "return(1000)").unwrap(); + + let (trigger_tx, trigger_rx) = std::sync::mpsc::sync_channel(0); + let trigger_rx = std::sync::Mutex::new(trigger_rx); + + let (fp_tx, fp_rx) = std::sync::mpsc::sync_channel(0); + let fp_rx = std::sync::Mutex::new(fp_rx); + + let t = router.get_task_info("race").await.unwrap(); + let _ = router.on_events(events_before_flush).await; + + // make generate temp files ***happen after*** moving files to flushing_files + // and read flush file ***happen between*** generate file name and + // write kv to file. T1 is write thread. T2 is flush thread + // The order looks like: + // [T1] generate file name -> [T2] moving files to flushing_files -> [T1] write + // kv to file -> [T2] read flush file. + fail::cfg_callback("after_write_to_file", move || { + fp_tx.send(()).unwrap(); + }) + .unwrap(); + + fail::cfg_callback("before_generate_temp_file", move || { + trigger_rx.lock().unwrap().recv().unwrap(); + }) + .unwrap(); + + fail::cfg_callback("after_moving_to_flushing_files", move || { + trigger_tx.send(()).unwrap(); + fp_rx.lock().unwrap().recv().unwrap(); + }) + .unwrap(); + + // set flush status to true, because we disabled the auto flush. + t.set_flushing_status(true); + let router_clone = router.clone(); + let _ = tokio::join!( + // do flush in another thread + tokio::spawn(async move { + router_clone.do_flush("race", 42, TimeStamp::max()).await; + }), + router.on_events(events_after_flush) + ); + fail::remove("after_write_to_file"); + fail::remove("before_generate_temp_file"); + fail::remove("after_moving_to_flushing_files"); + fail::remove("temp_file_name_timestamp"); + + // set flush status to true, because we disabled the auto flush. + t.set_flushing_status(true); + let res = router.do_flush("race", 42, TimeStamp::max()).await; + // this time the flush should succeed.
+ assert!(res.is_some()); + assert_eq!(t.files.read().await.len(), 0,); + Ok(()) + } } diff --git a/components/backup-stream/src/subscription_manager.rs b/components/backup-stream/src/subscription_manager.rs index e418d59029d..7aeecb775cc 100644 --- a/components/backup-stream/src/subscription_manager.rs +++ b/components/backup-stream/src/subscription_manager.rs @@ -1,15 +1,7 @@ // Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. -use std::{ - sync::{ - atomic::{AtomicBool, Ordering}, - Arc, - }, - time::Duration, -}; +use std::{sync::Arc, time::Duration}; -use crossbeam::channel::{Receiver as SyncReceiver, Sender as SyncSender}; -use crossbeam_channel::SendError; use engine_traits::KvEngine; use error_code::ErrorCodeExt; use futures::FutureExt; @@ -22,10 +14,11 @@ use raftstore::{ store::fsm::ChangeObserver, }; use tikv::storage::Statistics; -use tikv_util::{box_err, debug, info, time::Instant, warn, worker::Scheduler}; -use tokio::sync::mpsc::{channel, Receiver, Sender}; +use tikv_util::{ + box_err, debug, info, sys::thread::ThreadBuildWrapper, time::Instant, warn, worker::Scheduler, +}; +use tokio::sync::mpsc::{channel, error::SendError, Receiver, Sender}; use txn_types::TimeStamp; -use yatp::task::callback::Handle as YatpHandle; use crate::{ annotate, @@ -43,7 +36,7 @@ use crate::{ Task, }; -type ScanPool = yatp::ThreadPool; +type ScanPool = tokio::runtime::Runtime; const INITIAL_SCAN_FAILURE_MAX_RETRY_TIME: usize = 10; @@ -128,8 +121,9 @@ fn should_retry(err: &Error) -> bool { } /// the abstraction over a "DB" which provides the initial scanning. 
-trait InitialScan: Clone { - fn do_initial_scan( +#[async_trait::async_trait] +trait InitialScan: Clone + Sync + Send + 'static { + async fn do_initial_scan( &self, region: &Region, start_ts: TimeStamp, @@ -139,13 +133,13 @@ trait InitialScan: Clone { fn handle_fatal_error(&self, region: &Region, err: Error); } -impl InitialScan for InitialDataLoader +#[async_trait::async_trait] +impl InitialScan for InitialDataLoader where E: KvEngine, - R: RegionInfoProvider + Clone + 'static, - RT: CdcHandle, + RT: CdcHandle + Sync + 'static, { - fn do_initial_scan( + async fn do_initial_scan( &self, region: &Region, start_ts: TimeStamp, @@ -155,12 +149,14 @@ where let h = handle.clone(); // Note: we have external retry at `ScanCmd::exec_by_with_retry`, should we keep // retrying here? - let snap = self.observe_over_with_retry(region, move || { - ChangeObserver::from_pitr(region_id, handle.clone()) - })?; + let snap = self + .observe_over_with_retry(region, move || { + ChangeObserver::from_pitr(region_id, handle.clone()) + }) + .await?; #[cfg(feature = "failpoints")] fail::fail_point!("scan_after_get_snapshot"); - let stat = self.do_initial_scan(region, h, start_ts, snap)?; + let stat = self.do_initial_scan(region, h, start_ts, snap).await?; Ok(stat) } @@ -180,7 +176,7 @@ where impl ScanCmd { /// execute the initial scanning via the specificated [`InitialDataLoader`]. - fn exec_by(&self, initial_scan: impl InitialScan) -> Result<()> { + async fn exec_by(&self, initial_scan: impl InitialScan) -> Result<()> { let Self { region, handle, @@ -188,7 +184,9 @@ impl ScanCmd { .. 
} = self; let begin = Instant::now_coarse(); - let stat = initial_scan.do_initial_scan(region, *last_checkpoint, handle.clone())?; + let stat = initial_scan + .do_initial_scan(region, *last_checkpoint, handle.clone()) + .await?; info!("initial scanning finished!"; "takes" => ?begin.saturating_elapsed(), "from_ts" => %last_checkpoint, utils::slog_region(region)); utils::record_cf_stat("lock", &stat.lock); utils::record_cf_stat("write", &stat.write); @@ -197,17 +195,12 @@ impl ScanCmd { } /// execute the command, when meeting error, retrying. - fn exec_by_with_retry(self, init: impl InitialScan, cancel: &AtomicBool) { + async fn exec_by_with_retry(self, init: impl InitialScan) { let mut retry_time = INITIAL_SCAN_FAILURE_MAX_RETRY_TIME; loop { - if cancel.load(Ordering::SeqCst) { - return; - } - match self.exec_by(init.clone()) { + match self.exec_by(init.clone()).await { Err(err) if should_retry(&err) && retry_time > 0 => { - // NOTE: blocking this thread may stick the process. - // Maybe spawn a task to tokio and reschedule the task then? 
- std::thread::sleep(Duration::from_millis(500)); + tokio::time::sleep(Duration::from_millis(500)).await; warn!("meet retryable error"; "err" => %err, "retry_time" => retry_time); retry_time -= 1; continue; @@ -223,82 +216,62 @@ impl ScanCmd { } } -fn scan_executor_loop( - init: impl InitialScan, - cmds: SyncReceiver, - canceled: Arc, -) { - while let Ok(cmd) = cmds.recv() { - fail::fail_point!("execute_scan_command"); +async fn scan_executor_loop(init: impl InitialScan, mut cmds: Receiver) { + while let Some(cmd) = cmds.recv().await { debug!("handling initial scan request"; "region_id" => %cmd.region.get_id()); metrics::PENDING_INITIAL_SCAN_LEN .with_label_values(&["queuing"]) .dec(); - if canceled.load(Ordering::Acquire) { - return; + #[cfg(feature = "failpoints")] + { + let sleep = (|| { + fail::fail_point!("execute_scan_command_sleep_100", |_| { 100 }); + 0 + })(); + tokio::time::sleep(std::time::Duration::from_secs(sleep)).await; } - metrics::PENDING_INITIAL_SCAN_LEN - .with_label_values(&["executing"]) - .inc(); - cmd.exec_by_with_retry(init.clone(), &canceled); - metrics::PENDING_INITIAL_SCAN_LEN - .with_label_values(&["executing"]) - .dec(); + let init = init.clone(); + tokio::task::spawn(async move { + metrics::PENDING_INITIAL_SCAN_LEN + .with_label_values(&["executing"]) + .inc(); + cmd.exec_by_with_retry(init).await; + metrics::PENDING_INITIAL_SCAN_LEN + .with_label_values(&["executing"]) + .dec(); + }); } } /// spawn the executors in the scan pool. -/// we make workers thread instead of spawn scan task directly into the pool -/// because the [`InitialDataLoader`] isn't `Sync` hence we must use it very -/// carefully or rustc (along with tokio) would complain that we made a `!Send` -/// future. so we have moved the data loader to the synchronous context so its -/// reference won't be shared between threads any more. 
-fn spawn_executors(init: impl InitialScan + Send + 'static, number: usize) -> ScanPoolHandle { - let (tx, rx) = crossbeam::channel::bounded(MESSAGE_BUFFER_SIZE); +fn spawn_executors( + init: impl InitialScan + Send + Sync + 'static, + number: usize, +) -> ScanPoolHandle { + let (tx, rx) = tokio::sync::mpsc::channel(MESSAGE_BUFFER_SIZE); let pool = create_scan_pool(number); - let stopped = Arc::new(AtomicBool::new(false)); - for _ in 0..number { - let init = init.clone(); - let rx = rx.clone(); - let stopped = stopped.clone(); - pool.spawn(move |_: &mut YatpHandle<'_>| { - let _io_guard = file_system::WithIoType::new(file_system::IoType::Replication); - scan_executor_loop(init, rx, stopped); - }) - } - ScanPoolHandle { - tx, - _pool: pool, - stopped, - } + pool.spawn(async move { + scan_executor_loop(init, rx).await; + }); + ScanPoolHandle { tx, _pool: pool } } struct ScanPoolHandle { - tx: SyncSender, - stopped: Arc, + // Theoretically, we can get rid of the sender, and spawn a new task via initial loader in each + // thread. But that will make `SubscribeManager` hold a reference to the implementation of + // `InitialScan`, which will make the type information a mess. + tx: Sender, - // in fact, we won't use the pool any more. - // but we should hold the reference to the pool so it won't try to join the threads running.
_pool: ScanPool, } -impl Drop for ScanPoolHandle { - fn drop(&mut self) { - self.stopped.store(true, Ordering::Release); - } -} - impl ScanPoolHandle { - fn request(&self, cmd: ScanCmd) -> std::result::Result<(), SendError> { - if self.stopped.load(Ordering::Acquire) { - warn!("scan pool is stopped, ignore the scan command"; "region" => %cmd.region.get_id()); - return Ok(()); - } + async fn request(&self, cmd: ScanCmd) -> std::result::Result<(), SendError> { metrics::PENDING_INITIAL_SCAN_LEN .with_label_values(&["queuing"]) .inc(); - self.tx.send(cmd) + self.tx.send(cmd).await } } @@ -348,11 +321,20 @@ where } } -/// Create a yatp pool for doing initial scanning. +/// Create a pool for doing initial scanning. fn create_scan_pool(num_threads: usize) -> ScanPool { - yatp::Builder::new("log-backup-scan") - .max_thread_count(num_threads) - .build_callback_pool() + tokio::runtime::Builder::new_multi_thread() + .with_sys_and_custom_hooks( + move || { + file_system::set_io_type(file_system::IoType::Replication); + }, + || {}, + ) + .thread_name("log-backup-scan") + .enable_time() + .worker_threads(num_threads) + .build() + .unwrap() } impl RegionSubscriptionManager @@ -367,22 +349,24 @@ where /// /// a two-tuple, the first is the handle to the manager, the second is the /// operator loop future. 
- pub fn start( - initial_loader: InitialDataLoader, + pub fn start( + initial_loader: InitialDataLoader, + regions: R, observer: BackupStreamObserver, meta_cli: MetadataClient, pd_client: Arc, scan_pool_size: usize, - resolver: BackupStreamResolver, + resolver: BackupStreamResolver, ) -> (Self, future![()]) where E: KvEngine, - RT: CdcHandle + 'static, + HInit: CdcHandle + Sync + 'static, + HChkLd: CdcHandle + 'static, { let (tx, rx) = channel(MESSAGE_BUFFER_SIZE); let scan_pool_handle = spawn_executors(initial_loader.clone(), scan_pool_size); let op = Self { - regions: initial_loader.regions.clone(), + regions, meta_cli, pd_client, range_router: initial_loader.sink.clone(), @@ -522,7 +506,8 @@ where region, self.get_last_checkpoint_of(&for_task, region).await?, handle.clone(), - ); + ) + .await; Result::Ok(()) } .await; @@ -567,7 +552,8 @@ where Err(Error::Other(box_err!("Nature is boring"))) }); let tso = self.get_last_checkpoint_of(&for_task, region).await?; - self.observe_over_with_initial_data_from_checkpoint(region, tso, handle.clone()); + self.observe_over_with_initial_data_from_checkpoint(region, tso, handle.clone()) + .await; } } Ok(()) @@ -702,13 +688,13 @@ where Ok(cp.ts) } - fn spawn_scan(&self, cmd: ScanCmd) { + async fn spawn_scan(&self, cmd: ScanCmd) { // we should not spawn initial scanning tasks to the tokio blocking pool // because it is also used for converting sync File I/O to async. (for now!) // In that condition, if we blocking for some resources(for example, the // `MemoryQuota`) at the block threads, we may meet some ghosty // deadlock. 
- let s = self.scan_pool_handle.request(cmd); + let s = self.scan_pool_handle.request(cmd).await; if let Err(err) = s { let region_id = err.0.region.get_id(); annotate!(err, "BUG: scan_pool closed") @@ -716,7 +702,7 @@ where } } - fn observe_over_with_initial_data_from_checkpoint( + async fn observe_over_with_initial_data_from_checkpoint( &self, region: &Region, last_checkpoint: TimeStamp, @@ -730,6 +716,7 @@ where last_checkpoint, _work: self.scans.clone().work(), }) + .await } fn find_task_by_region(&self, r: &Region) -> Option { @@ -748,8 +735,9 @@ mod test { #[derive(Clone, Copy)] struct NoopInitialScan; + #[async_trait::async_trait] impl InitialScan for NoopInitialScan { - fn do_initial_scan( + async fn do_initial_scan( &self, _region: &Region, _start_ts: txn_types::TimeStamp, @@ -787,17 +775,20 @@ mod test { let pool = spawn_executors(NoopInitialScan, 1); let wg = CallbackWaitGroup::new(); - fail::cfg("execute_scan_command", "sleep(100)").unwrap(); + fail::cfg("execute_scan_command_sleep_100", "return").unwrap(); for _ in 0..100 { let wg = wg.clone(); - pool.request(ScanCmd { - region: Default::default(), - handle: Default::default(), - last_checkpoint: Default::default(), - // Note: Maybe make here a Box or some other trait? - _work: wg.work(), - }) - .unwrap() + assert!( + pool._pool + .block_on(pool.request(ScanCmd { + region: Default::default(), + handle: Default::default(), + last_checkpoint: Default::default(), + // Note: Maybe make here a Box or some other trait? 
+ _work: wg.work(), + })) + .is_ok() + ) } should_finish_in(move || drop(pool), Duration::from_secs(5)); diff --git a/components/backup-stream/src/subscription_track.rs b/components/backup-stream/src/subscription_track.rs index e92759bc2b2..c70ad9c8038 100644 --- a/components/backup-stream/src/subscription_track.rs +++ b/components/backup-stream/src/subscription_track.rs @@ -8,8 +8,8 @@ use dashmap::{ }; use kvproto::metapb::Region; use raftstore::coprocessor::*; -use resolved_ts::Resolver; -use tikv_util::{info, warn}; +use resolved_ts::{Resolver, TsSource, TxnLocks}; +use tikv_util::{info, memory::MemoryQuota, warn}; use txn_types::TimeStamp; use crate::{debug, metrics::TRACK_REGION, utils}; @@ -82,6 +82,7 @@ impl ActiveSubscription { self.handle.stop_observing(); } + #[cfg(test)] pub fn is_observing(&self) -> bool { self.handle.is_observing() } @@ -99,7 +100,7 @@ impl ActiveSubscription { pub enum CheckpointType { MinTs, StartTsOfInitialScan, - StartTsOfTxn(Option>), + StartTsOfTxn(Option<(TimeStamp, TxnLocks)>), } impl std::fmt::Debug for CheckpointType { @@ -109,10 +110,7 @@ impl std::fmt::Debug for CheckpointType { Self::StartTsOfInitialScan => write!(f, "StartTsOfInitialScan"), Self::StartTsOfTxn(arg0) => f .debug_tuple("StartTsOfTxn") - .field(&format_args!( - "{}", - utils::redact(&arg0.as_ref().map(|x| x.as_ref()).unwrap_or(&[])) - )) + .field(&format_args!("{:?}", arg0)) .finish(), } } @@ -322,6 +320,7 @@ impl SubscriptionTracer { } /// check whether the region_id should be observed by this observer. + #[cfg(test)] pub fn is_observing(&self, region_id: u64) -> bool { let sub = self.0.get_mut(&region_id); match sub { @@ -401,7 +400,7 @@ impl<'a> SubscriptionRef<'a> { } } -/// This enhanced version of `Resolver` allow some unordered lock events. +/// This enhanced version of `Resolver` allow some unordered lock events. /// The name "2-phase" means this is used for 2 *concurrency* phases of /// observing a region: /// 1. Doing the initial scanning.
@@ -466,9 +465,11 @@ impl std::fmt::Debug for FutureLock { impl TwoPhaseResolver { /// try to get one of the key of the oldest lock in the resolver. - pub fn sample_far_lock(&self) -> Option> { - let (_, keys) = self.resolver.locks().first_key_value()?; - keys.iter().next().cloned() + pub fn sample_far_lock(&self) -> Option<(TimeStamp, TxnLocks)> { + self.resolver + .locks() + .first_key_value() + .map(|(ts, txn_locks)| (*ts, txn_locks.clone())) } pub fn in_phase_one(&self) -> bool { @@ -479,7 +480,8 @@ impl TwoPhaseResolver { if !self.in_phase_one() { warn!("backup stream tracking lock as if in phase one"; "start_ts" => %start_ts, "key" => %utils::redact(&key)) } - self.resolver.track_lock(start_ts, key, None) + // TODO: handle memory quota exceed, for now, quota is set to usize::MAX. + self.resolver.track_lock(start_ts, key, None).unwrap(); } pub fn track_lock(&mut self, start_ts: TimeStamp, key: Vec) { @@ -487,7 +489,8 @@ impl TwoPhaseResolver { self.future_locks.push(FutureLock::Lock(key, start_ts)); return; } - self.resolver.track_lock(start_ts, key, None) + // TODO: handle memory quota exceed, for now, quota is set to usize::MAX. + self.resolver.track_lock(start_ts, key, None).unwrap(); } pub fn untrack_lock(&mut self, key: &[u8]) { @@ -501,7 +504,10 @@ impl TwoPhaseResolver { fn handle_future_lock(&mut self, lock: FutureLock) { match lock { - FutureLock::Lock(key, ts) => self.resolver.track_lock(ts, key, None), + FutureLock::Lock(key, ts) => { + // TODO: handle memory quota exceed, for now, quota is set to usize::MAX. 
+ self.resolver.track_lock(ts, key, None).unwrap(); + } FutureLock::Unlock(key) => self.resolver.untrack_lock(&key, None), } } @@ -511,7 +517,7 @@ impl TwoPhaseResolver { return min_ts.min(stable_ts); } - self.resolver.resolve(min_ts, None) + self.resolver.resolve(min_ts, None, TsSource::BackupStream) } pub fn resolved_ts(&self) -> TimeStamp { @@ -523,8 +529,10 @@ impl TwoPhaseResolver { } pub fn new(region_id: u64, stable_ts: Option) -> Self { + // TODO: limit the memory usage of the resolver. + let memory_quota = Arc::new(MemoryQuota::new(std::usize::MAX)); Self { - resolver: Resolver::new(region_id), + resolver: Resolver::new(region_id, memory_quota), future_locks: Default::default(), stable_ts, } @@ -541,7 +549,7 @@ impl TwoPhaseResolver { // advance the internal resolver. // the start ts of initial scanning would be a safe ts for min ts // -- because is used to be a resolved ts. - self.resolver.resolve(ts, None); + self.resolver.resolve(ts, None, TsSource::BackupStream); } None => { warn!("BUG: a two-phase resolver is executing phase_one_done when not in phase one"; "resolver" => ?self) @@ -565,6 +573,7 @@ mod test { use kvproto::metapb::{Region, RegionEpoch}; use raftstore::coprocessor::ObserveHandle; + use resolved_ts::TxnLocks; use txn_types::TimeStamp; use super::{SubscriptionTracer, TwoPhaseResolver}; @@ -667,7 +676,13 @@ mod test { ( region(4, 8, 1), 128.into(), - StartTsOfTxn(Some(Arc::from(b"Alpi".as_slice()))) + StartTsOfTxn(Some(( + TimeStamp::new(128), + TxnLocks { + lock_count: 1, + sample_lock: Some(Arc::from(b"Alpi".as_slice())), + } + ))) ), ] ); diff --git a/components/backup-stream/src/tempfiles.rs b/components/backup-stream/src/tempfiles.rs index add1ee67c12..b8f9c9e1120 100644 --- a/components/backup-stream/src/tempfiles.rs +++ b/components/backup-stream/src/tempfiles.rs @@ -259,6 +259,11 @@ impl TempFilePool { &self.cfg } + #[cfg(test)] + pub fn mem_used(&self) -> usize { + self.current.load(Ordering::Acquire) + } + /// Create a file for 
writting. /// This function is synchronous so we can call it easier in the polling /// context. (Anyway, it is really hard to call an async function in the diff --git a/components/backup-stream/src/utils.rs b/components/backup-stream/src/utils.rs index 1b150eaa1f0..ed8b7579587 100644 --- a/components/backup-stream/src/utils.rs +++ b/components/backup-stream/src/utils.rs @@ -18,14 +18,12 @@ use std::{ use async_compression::{tokio::write::ZstdEncoder, Level}; use engine_rocks::ReadPerfInstant; use engine_traits::{CfName, CF_DEFAULT, CF_LOCK, CF_RAFT, CF_WRITE}; -use futures::{channel::mpsc, executor::block_on, ready, task::Poll, FutureExt, StreamExt}; +use futures::{ready, task::Poll, FutureExt}; use kvproto::{ brpb::CompressionType, metapb::Region, raft_cmdpb::{CmdType, Request}, }; -use raft::StateRole; -use raftstore::{coprocessor::RegionInfoProvider, RegionInfo}; use tikv::storage::CfStatistics; use tikv_util::{ box_err, @@ -33,7 +31,6 @@ use tikv_util::{ self_thread_inspector, IoStat, ThreadInspector, ThreadInspectorImpl as OsInspector, }, time::Instant, - warn, worker::Scheduler, Either, }; @@ -79,65 +76,6 @@ pub fn redact(key: &impl AsRef<[u8]>) -> log_wrappers::Value<'_> { log_wrappers::Value::key(key.as_ref()) } -/// RegionPager seeks regions with leader role in the range. -pub struct RegionPager

{ - regions: P, - start_key: Vec, - end_key: Vec, - reach_last_region: bool, -} - -impl RegionPager

{ - pub fn scan_from(regions: P, start_key: Vec, end_key: Vec) -> Self { - Self { - regions, - start_key, - end_key, - reach_last_region: false, - } - } - - pub fn next_page(&mut self, size: usize) -> Result> { - if self.start_key >= self.end_key || self.reach_last_region { - return Ok(vec![]); - } - - let (mut tx, rx) = mpsc::channel(size); - let end_key = self.end_key.clone(); - self.regions - .seek_region( - &self.start_key, - Box::new(move |i| { - let r = i - .filter(|r| r.role == StateRole::Leader) - .take(size) - .take_while(|r| r.region.start_key < end_key) - .try_for_each(|r| tx.try_send(r.clone())); - if let Err(_err) = r { - warn!("failed to scan region and send to initlizer") - } - }), - ) - .map_err(|err| { - Error::Other(box_err!( - "failed to seek region for start key {}: {}", - redact(&self.start_key), - err - )) - })?; - let collected_regions = block_on(rx.collect::>()); - self.start_key = collected_regions - .last() - .map(|region| region.region.end_key.to_owned()) - // no leader region found. - .unwrap_or_default(); - if self.start_key.is_empty() { - self.reach_last_region = true; - } - Ok(collected_regions) - } -} - /// StopWatch is a utility for record time cost in multi-stage tasks. /// NOTE: Maybe it should be generic over somewhat Clock type? 
pub struct StopWatch(Instant); @@ -1058,7 +996,7 @@ mod test { let (items, size) = super::with_record_read_throughput(|| { let mut items = vec![]; - let snap = engine.snapshot(); + let snap = engine.snapshot(None); snap.scan(CF_DEFAULT, b"", b"", false, |k, v| { items.push((k.to_owned(), v.to_owned())); Ok(true) diff --git a/components/backup-stream/tests/failpoints/mod.rs b/components/backup-stream/tests/failpoints/mod.rs index ff9b9f82ba1..ea09e9c7a1f 100644 --- a/components/backup-stream/tests/failpoints/mod.rs +++ b/components/backup-stream/tests/failpoints/mod.rs @@ -9,7 +9,13 @@ pub use suite::*; mod all { - use std::time::Duration; + use std::{ + sync::{ + atomic::{AtomicBool, Ordering}, + Arc, + }, + time::Duration, + }; use backup_stream::{ metadata::{ @@ -19,18 +25,44 @@ mod all { GetCheckpointResult, RegionCheckpointOperation, RegionSet, Task, }; use futures::executor::block_on; - use tikv_util::defer; + use tikv_util::{config::ReadableSize, defer}; use super::{ make_record_key, make_split_key_at_record, mutation, run_async_test, SuiteBuilder, }; + use crate::make_table_key; + + #[test] + fn failed_register_task() { + let suite = SuiteBuilder::new_named("failed_register_task").build(); + fail::cfg("load_task::error_when_fetching_ranges", "return").unwrap(); + let cli = suite.get_meta_cli(); + block_on(cli.insert_task_with_range( + &suite.simple_task("failed_register_task"), + &[(&make_table_key(1, b""), &make_table_key(2, b""))], + )) + .unwrap(); + + for _ in 0..10 { + if block_on(cli.get_last_error_of("failed_register_task", 1)) + .unwrap() + .is_some() + { + return; + } + std::thread::sleep(Duration::from_millis(100)); + } + + suite.dump_slash_etc(); + panic!("No error uploaded when failed to comminate to PD."); + } #[test] fn basic() { let mut suite = SuiteBuilder::new_named("basic").build(); fail::cfg("try_start_observe", "1*return").unwrap(); - run_async_test(async { + let (round1, round2) = run_async_test(async { // write data before the task 
starting, for testing incremental scanning. let round1 = suite.write_records(0, 128, 1).await; suite.must_register_task(1, "test_basic"); @@ -38,13 +70,13 @@ mod all { let round2 = suite.write_records(256, 128, 1).await; suite.force_flush_files("test_basic"); suite.wait_for_flush(); - suite - .check_for_write_records( - suite.flushed_files.path(), - round1.union(&round2).map(Vec::as_slice), - ) - .await; + (round1, round2) }); + suite.check_for_write_records( + suite.flushed_files.path(), + round1.union(&round2).map(Vec::as_slice), + ); + suite.cluster.shutdown(); } #[test] @@ -97,10 +129,10 @@ mod all { let keys2 = run_async_test(suite.write_records(256, 128, 1)); suite.force_flush_files("region_failure"); suite.wait_for_flush(); - run_async_test(suite.check_for_write_records( + suite.check_for_write_records( suite.flushed_files.path(), keys.union(&keys2).map(|s| s.as_slice()), - )); + ); } #[test] fn initial_scan_failure() { @@ -121,10 +153,10 @@ mod all { let keys2 = run_async_test(suite.write_records(256, 128, 1)); suite.force_flush_files("initial_scan_failure"); suite.wait_for_flush(); - run_async_test(suite.check_for_write_records( + suite.check_for_write_records( suite.flushed_files.path(), keys.union(&keys2).map(|s| s.as_slice()), - )); + ); } #[test] fn failed_during_refresh_region() { @@ -147,10 +179,10 @@ mod all { let keys2 = run_async_test(suite.write_records(256, 128, 1)); suite.force_flush_files("fail_to_refresh_region"); suite.wait_for_flush(); - run_async_test(suite.check_for_write_records( + suite.check_for_write_records( suite.flushed_files.path(), keys.union(&keys2).map(|s| s.as_slice()), - )); + ); let leader = suite.cluster.leader_of_region(1).unwrap().store_id; let (tx, rx) = std::sync::mpsc::channel(); suite.endpoints[&leader] @@ -186,7 +218,8 @@ mod all { suite.must_split(&make_split_key_at_record(1, 42)); std::thread::sleep(Duration::from_secs(2)); - let error = run_async_test(suite.get_meta_cli().get_last_error("retry_abort", 
1)).unwrap(); + let error = + run_async_test(suite.get_meta_cli().get_last_error_of("retry_abort", 1)).unwrap(); let error = error.expect("no error uploaded"); error .get_error_message() @@ -212,12 +245,7 @@ mod all { let items = run_async_test(suite.write_records(0, 128, 1)); suite.force_flush_files("retry_abort"); suite.wait_for_flush(); - run_async_test( - suite.check_for_write_records( - suite.flushed_files.path(), - items.iter().map(Vec::as_slice), - ), - ); + suite.check_for_write_records(suite.flushed_files.path(), items.iter().map(Vec::as_slice)); } #[test] fn failure_and_split() { @@ -240,12 +268,42 @@ mod all { let round2 = run_async_test(suite.write_records(256, 128, 1)); suite.force_flush_files("failure_and_split"); suite.wait_for_flush(); - run_async_test(suite.check_for_write_records( + suite.check_for_write_records( suite.flushed_files.path(), round1.union(&round2).map(Vec::as_slice), - )); + ); let cp = suite.global_checkpoint(); assert!(cp > 512, "it is {}", cp); suite.cluster.shutdown(); } + + #[test] + fn memory_quota() { + let mut suite = SuiteBuilder::new_named("memory_quota") + .cfg(|cfg| cfg.initial_scan_pending_memory_quota = ReadableSize::kb(2)) + .build(); + let keys = run_async_test(suite.write_records(0, 128, 1)); + let failed = Arc::new(AtomicBool::new(false)); + fail::cfg("router_on_event_delay_ms", "6*return(1000)").unwrap(); + fail::cfg_callback("scan_and_async_send::about_to_consume", { + let failed = failed.clone(); + move || { + let v = backup_stream::metrics::HEAP_MEMORY.get(); + // Not greater than max key length * concurrent initial scan number. 
+ if v > 4096 * 6 { + println!("[[ FAILED ]] The memory usage is {v} which exceeds the quota"); + failed.store(true, Ordering::SeqCst); + } + } + }) + .unwrap(); + suite.must_register_task(1, "memory_quota"); + suite.force_flush_files("memory_quota"); + suite.wait_for_flush(); + suite.check_for_write_records( + suite.flushed_files.path(), + keys.iter().map(|v| v.as_slice()), + ); + assert!(!failed.load(Ordering::SeqCst)); + } } diff --git a/components/backup-stream/tests/integration/mod.rs b/components/backup-stream/tests/integration/mod.rs index a209572c6d8..04fee6b2c09 100644 --- a/components/backup-stream/tests/integration/mod.rs +++ b/components/backup-stream/tests/integration/mod.rs @@ -16,6 +16,7 @@ mod all { use futures::{Stream, StreamExt}; use pd_client::PdClient; use test_raftstore::IsolationFilterFactory; + use tikv::config::BackupStreamConfig; use tikv_util::{box_err, defer, info, HandyRwLock}; use tokio::time::timeout; use txn_types::{Key, TimeStamp}; @@ -27,20 +28,19 @@ mod all { #[test] fn with_split() { let mut suite = SuiteBuilder::new_named("with_split").build(); - run_async_test(async { + let (round1, round2) = run_async_test(async { let round1 = suite.write_records(0, 128, 1).await; suite.must_split(&make_split_key_at_record(1, 42)); suite.must_register_task(1, "test_with_split"); let round2 = suite.write_records(256, 128, 1).await; - suite.force_flush_files("test_with_split"); - suite.wait_for_flush(); - suite - .check_for_write_records( - suite.flushed_files.path(), - round1.union(&round2).map(Vec::as_slice), - ) - .await; + (round1, round2) }); + suite.force_flush_files("test_with_split"); + suite.wait_for_flush(); + suite.check_for_write_records( + suite.flushed_files.path(), + round1.union(&round2).map(Vec::as_slice), + ); suite.cluster.shutdown(); } @@ -62,7 +62,7 @@ mod all { #[test] fn with_split_txn() { let mut suite = SuiteBuilder::new_named("split_txn").build(); - run_async_test(async { + let (commit_ts, start_ts, keys) = 
run_async_test(async { let start_ts = suite.cluster.pd_client.get_tso().await.unwrap(); let keys = (1..1960).map(|i| make_record_key(1, i)).collect::>(); suite.must_kv_prewrite( @@ -75,26 +75,25 @@ mod all { start_ts, ); let commit_ts = suite.cluster.pd_client.get_tso().await.unwrap(); - suite.commit_keys(keys[1913..].to_vec(), start_ts, commit_ts); - suite.must_register_task(1, "test_split_txn"); - suite.commit_keys(keys[..1913].to_vec(), start_ts, commit_ts); - suite.force_flush_files("test_split_txn"); - suite.wait_for_flush(); - let keys_encoded = keys - .iter() - .map(|v| { - Key::from_raw(v.as_slice()) - .append_ts(commit_ts) - .into_encoded() - }) - .collect::>(); - suite - .check_for_write_records( - suite.flushed_files.path(), - keys_encoded.iter().map(Vec::as_slice), - ) - .await; + (commit_ts, start_ts, keys) }); + suite.commit_keys(keys[1913..].to_vec(), start_ts, commit_ts); + suite.must_register_task(1, "test_split_txn"); + suite.commit_keys(keys[..1913].to_vec(), start_ts, commit_ts); + suite.force_flush_files("test_split_txn"); + suite.wait_for_flush(); + let keys_encoded = keys + .iter() + .map(|v| { + Key::from_raw(v.as_slice()) + .append_ts(commit_ts) + .into_encoded() + }) + .collect::>(); + suite.check_for_write_records( + suite.flushed_files.path(), + keys_encoded.iter().map(Vec::as_slice), + ); suite.cluster.shutdown(); } @@ -110,10 +109,10 @@ mod all { let round2 = run_async_test(suite.write_records(256, 128, 1)); suite.force_flush_files("test_leader_down"); suite.wait_for_flush(); - run_async_test(suite.check_for_write_records( + suite.check_for_write_records( suite.flushed_files.path(), round1.union(&round2).map(Vec::as_slice), - )); + ); suite.cluster.shutdown(); } @@ -161,7 +160,7 @@ mod all { let err = run_async_test( suite .get_meta_cli() - .get_last_error("test_fatal_error", *victim), + .get_last_error_of("test_fatal_error", *victim), ) .unwrap() .unwrap(); @@ -345,10 +344,10 @@ mod all { } assert_eq!(items.last().unwrap().end_key, 
Vec::::default()); - run_async_test(suite.check_for_write_records( + suite.check_for_write_records( suite.flushed_files.path(), round1.union(&round2).map(|x| x.as_slice()), - )); + ); } #[test] @@ -372,18 +371,18 @@ mod all { .unwrap(); suite.sync(); std::thread::sleep(Duration::from_secs(2)); - run_async_test(suite.check_for_write_records( + suite.check_for_write_records( suite.flushed_files.path(), round1.iter().map(|x| x.as_slice()), - )); + ); assert!(suite.global_checkpoint() > 256); suite.force_flush_files("r"); suite.wait_for_flush(); assert!(suite.global_checkpoint() > 512); - run_async_test(suite.check_for_write_records( + suite.check_for_write_records( suite.flushed_files.path(), round1.union(&round2).map(|x| x.as_slice()), - )); + ); } #[test] @@ -425,9 +424,30 @@ mod all { ts, cps ); - run_async_test(suite.check_for_write_records( + suite.check_for_write_records( suite.flushed_files.path(), round1.iter().map(|k| k.as_slice()), - )) + ) + } + + #[test] + fn update_config() { + let suite = SuiteBuilder::new_named("network_partition") + .nodes(1) + .build(); + let mut basic_config = BackupStreamConfig::default(); + basic_config.initial_scan_concurrency = 4; + suite.run(|| Task::ChangeConfig(basic_config.clone())); + suite.wait_with(|e| { + assert_eq!(e.initial_scan_semaphore.available_permits(), 4,); + true + }); + + basic_config.initial_scan_concurrency = 16; + suite.run(|| Task::ChangeConfig(basic_config.clone())); + suite.wait_with(|e| { + assert_eq!(e.initial_scan_semaphore.available_permits(), 16,); + true + }); } } diff --git a/components/backup-stream/tests/suite.rs b/components/backup-stream/tests/suite.rs index e1df628d76b..434d81fff48 100644 --- a/components/backup-stream/tests/suite.rs +++ b/components/backup-stream/tests/suite.rs @@ -2,7 +2,8 @@ use std::{ collections::{HashMap, HashSet}, - path::Path, + fmt::Display, + path::{Path, PathBuf}, sync::Arc, time::Duration, }; @@ -20,6 +21,7 @@ use backup_stream::{ utils, BackupStreamResolver, 
Endpoint, GetCheckpointResult, RegionCheckpointOperation, RegionSet, Service, Task, }; +use engine_rocks::RocksEngine; use futures::{executor::block_on, AsyncWriteExt, Future, Stream, StreamExt}; use grpcio::{ChannelBuilder, Server, ServerBuilder}; use kvproto::{ @@ -30,15 +32,11 @@ use kvproto::{ tikvpb::*, }; use pd_client::PdClient; -use protobuf::parse_from_bytes; -use raftstore::{ - router::{CdcRaftRouter, ServerRaftStoreRouter}, - RegionInfoAccessor, -}; +use raftstore::{router::CdcRaftRouter, RegionInfoAccessor}; use resolved_ts::LeadershipResolver; use tempdir::TempDir; use test_pd_client::TestPdClient; -use test_raftstore::{new_server_cluster, Cluster, ServerCluster, SimulateTransport}; +use test_raftstore::{new_server_cluster, Cluster, ServerCluster}; use test_util::retry; use tikv::config::BackupStreamConfig; use tikv_util::{ @@ -46,22 +44,29 @@ use tikv_util::{ number::NumberEncoder, stream_event::{EventIterator, Iterator}, }, - info, + debug, info, worker::LazyWorker, HandyRwLock, }; use txn_types::{Key, TimeStamp, WriteRef}; use walkdir::WalkDir; +#[derive(Debug)] +pub struct FileSegments { + path: PathBuf, + segments: Vec<(usize, usize)>, +} + +#[derive(Default, Debug)] +pub struct LogFiles { + default_cf: Vec, + write_cf: Vec, +} + pub type TestEndpoint = Endpoint< ErrorStore, RegionInfoAccessor, engine_test::kv::KvTestEngine, - CdcRaftRouter< - SimulateTransport< - ServerRaftStoreRouter, - >, - >, TestPdClient, >; @@ -245,7 +250,7 @@ impl MetaStore for ErrorStore { pub struct Suite { pub endpoints: HashMap>, pub meta_store: ErrorStore, - pub cluster: Cluster, + pub cluster: Cluster>, tikv_cli: HashMap, log_backup_cli: HashMap, obs: HashMap, @@ -391,6 +396,11 @@ impl Suite { MetadataClient::new(self.meta_store.clone(), 0) } + #[allow(dead_code)] + pub fn dump_slash_etc(&self) { + self.meta_store.inner.blocking_lock().dump(); + } + pub fn must_split(&mut self, key: &[u8]) { let region = self.cluster.get_region(key); self.cluster.must_split(®ion, 
key); @@ -461,7 +471,12 @@ impl Suite { for ts in (from..(from + n)).map(|x| x * 2) { let ts = ts as u64; let key = make_record_key(for_table, ts); - let muts = vec![mutation(key.clone(), b"hello, world".to_vec())]; + let value = if ts % 4 == 0 { + b"hello, world".to_vec() + } else { + [0xdd; 4096].to_vec() + }; + let muts = vec![mutation(key.clone(), value)]; let enc_key = Key::from_raw(&key).into_encoded(); let region = self.cluster.get_region_id(&enc_key); let start_ts = self.cluster.pd_client.get_tso().await.unwrap(); @@ -518,45 +533,53 @@ impl Suite { } } - pub fn load_metadata_for_write_records( - &self, - path: &Path, - ) -> HashMap> { - let mut meta_map: HashMap> = HashMap::new(); - for entry in WalkDir::new(path) { - let entry = entry.unwrap(); - if entry.file_type().is_file() - && entry - .file_name() - .to_str() - .map_or(false, |s| s.ends_with(".meta")) - { - let content = std::fs::read(entry.path()).unwrap(); - let meta = parse_from_bytes::(content.as_ref()).unwrap(); - for g in meta.file_groups.into_iter() { - let path = g.path.split('/').last().unwrap(); - for f in g.data_files_info.into_iter() { - let file_info = meta_map.get_mut(path); - if let Some(v) = file_info { - v.push(( - f.range_offset as usize, - (f.range_offset + f.range_length) as usize, - )); + pub fn get_files_to_check(&self, path: &Path) -> std::io::Result { + let mut res = LogFiles::default(); + for entry in WalkDir::new(path.join("v1/backupmeta")) { + let entry = entry?; + println!("reading {}", entry.path().display()); + if entry.file_name().to_str().unwrap().ends_with(".meta") { + let content = std::fs::read(entry.path())?; + let meta = protobuf::parse_from_bytes::(&content)?; + for fg in meta.get_file_groups() { + let mut default_segs = vec![]; + let mut write_segs = vec![]; + for file in fg.get_data_files_info() { + let v = if file.cf == "default" || file.cf.is_empty() { + Some(&mut default_segs) + } else if file.cf == "write" { + Some(&mut write_segs) } else { - let v = vec![( 
- f.range_offset as usize, - (f.range_offset + f.range_length) as usize, - )]; - meta_map.insert(String::from(path), v); - } + None + }; + v.into_iter().for_each(|v| { + v.push(( + file.get_range_offset() as usize, + (file.get_range_offset() + file.get_range_length()) as usize, + )) + }); + } + let p = path.join(fg.get_path()); + if !default_segs.is_empty() { + res.default_cf.push(FileSegments { + path: p.clone(), + segments: default_segs, + }) + } + if !write_segs.is_empty() { + res.write_cf.push(FileSegments { + path: p, + segments: write_segs, + }) } } } } - meta_map + Ok(res) } - pub async fn check_for_write_records<'a>( + #[track_caller] + pub fn check_for_write_records<'a>( &self, path: &Path, key_set: impl std::iter::Iterator, @@ -565,45 +588,72 @@ impl Suite { let n = remain_keys.len(); let mut extra_key = 0; let mut extra_len = 0; - let meta_map = self.load_metadata_for_write_records(path); - for entry in WalkDir::new(path) { - let entry = entry.unwrap(); - println!("checking: {:?}", entry); - if entry.file_type().is_file() - && entry - .file_name() - .to_str() - .map_or(false, |s| s.ends_with(".log")) - { - let buf = std::fs::read(entry.path()).unwrap(); - let file_infos = meta_map.get(entry.file_name().to_str().unwrap()).unwrap(); - for &file_info in file_infos { - let mut decoder = ZstdDecoder::new(Vec::new()); - let pbuf: &[u8] = &buf[file_info.0..file_info.1]; - decoder.write_all(pbuf).await.unwrap(); - decoder.flush().await.unwrap(); - decoder.close().await.unwrap(); - let content = decoder.into_inner(); - - let mut iter = EventIterator::new(&content); - loop { - if !iter.valid() { - break; - } - iter.next().unwrap(); - if !remain_keys.remove(iter.key()) { - extra_key += 1; - extra_len += iter.key().len() + iter.value().len(); - } + let files = self.get_files_to_check(path).unwrap_or_default(); + let mut default_keys = HashSet::new(); + let content_of = |buf: &[u8], range: (usize, usize)| { + let mut decoder = ZstdDecoder::new(Vec::new()); + let 
pbuf: &[u8] = &buf[range.0..range.1]; + run_async_test(async { + decoder.write_all(pbuf).await.unwrap(); + decoder.flush().await.unwrap(); + decoder.close().await.unwrap(); + }); + decoder.into_inner() + }; + for entry in files.write_cf { + debug!("checking write: {:?}", entry); + + let buf = std::fs::read(&entry.path).unwrap(); + for &file_info in entry.segments.iter() { + let data = content_of(&buf, file_info); + let mut iter = EventIterator::new(&data); + loop { + if !iter.valid() { + break; + } + iter.next().unwrap(); + if !remain_keys.remove(iter.key()) { + extra_key += 1; + extra_len += iter.key().len() + iter.value().len(); + } - let value = iter.value(); - let wf = WriteRef::parse(value).unwrap(); + let value = iter.value(); + let wf = WriteRef::parse(value).unwrap(); + if wf.short_value.is_none() { + let mut key = Key::from_encoded_slice(iter.key()).truncate_ts().unwrap(); + key.append_ts_inplace(wf.start_ts); + + default_keys.insert(key.into_encoded()); + } else { assert_eq!(wf.short_value, Some(b"hello, world" as &[u8])); } } } } + for entry in files.default_cf { + debug!("checking default: {:?}", entry); + + let buf = std::fs::read(&entry.path).unwrap(); + for &file_info in entry.segments.iter() { + let data = content_of(&buf, file_info); + let mut iter = EventIterator::new(&data); + loop { + if !iter.valid() { + break; + } + iter.next().unwrap(); + if !default_keys.remove(iter.key()) { + extra_key += 1; + extra_len += iter.key().len() + iter.value().len(); + } + + let value = iter.value(); + assert_eq!(value, &[0xdd; 4096]); + } + } + } + if extra_key != 0 { println!( "check_for_write_records of “{}”: extra {} keys ({:.02}% of recorded keys), extra {} bytes.", @@ -613,17 +663,19 @@ impl Suite { extra_len ) } - if !remain_keys.is_empty() { - panic!( - "not all keys are recorded: it remains {:?} (total = {})", - remain_keys - .iter() - .take(3) - .map(|v| hex::encode(v)) - .collect::>(), - remain_keys.len() - ); - } + assert_empty(&remain_keys, "not all 
keys are recorded"); + assert_empty(&default_keys, "some keys don't have default entry"); + } +} + +#[track_caller] +fn assert_empty(v: &HashSet>, msg: impl Display) { + if !v.is_empty() { + panic!( + "{msg}: it remains {:?}... (total = {})", + v.iter().take(3).map(|v| hex::encode(v)).collect::>(), + v.len() + ); } } diff --git a/components/backup/Cargo.toml b/components/backup/Cargo.toml index 225a88a3e8f..03b6e439879 100644 --- a/components/backup/Cargo.toml +++ b/components/backup/Cargo.toml @@ -5,12 +5,7 @@ edition = "2021" publish = false [features] -default = ["test-engine-kv-rocksdb", "test-engine-raft-raft-engine", "cloud-aws", "cloud-gcp", "cloud-azure"] -cloud-aws = ["external_storage_export/cloud-aws"] -cloud-gcp = ["external_storage_export/cloud-gcp"] -cloud-azure = ["external_storage_export/cloud-azure"] -cloud-storage-grpc = ["external_storage_export/cloud-storage-grpc"] -cloud-storage-dylib = ["external_storage_export/cloud-storage-dylib"] +default = ["test-engine-kv-rocksdb", "test-engine-raft-raft-engine"] test-engine-kv-rocksdb = [ "tikv/test-engine-kv-rocksdb" ] @@ -45,7 +40,6 @@ engine_rocks = { workspace = true } engine_traits = { workspace = true } error_code = { workspace = true } external_storage = { workspace = true } -external_storage_export = { workspace = true } file_system = { workspace = true } futures = "0.3" futures-util = { version = "0.3", default-features = false, features = ["io"] } diff --git a/components/backup/src/endpoint.rs b/components/backup/src/endpoint.rs index a4efc162092..5c243a1e8d8 100644 --- a/components/backup/src/endpoint.rs +++ b/components/backup/src/endpoint.rs @@ -12,8 +12,7 @@ use async_channel::SendError; use causal_ts::{CausalTsProvider, CausalTsProviderImpl}; use concurrency_manager::ConcurrencyManager; use engine_traits::{name_to_cf, raw_ttl::ttl_current_ts, CfName, KvEngine, SstCompressionType}; -use external_storage::{BackendConfig, HdfsConfig}; -use external_storage_export::{create_storage, 
ExternalStorage}; +use external_storage::{create_storage, BackendConfig, ExternalStorage, HdfsConfig}; use futures::{channel::mpsc::*, executor::block_on}; use kvproto::{ brpb::*, @@ -928,7 +927,7 @@ impl Endpoint { let sst_max_size = self.config_manager.0.read().unwrap().sst_max_size.0; let limit = self.softlimit.limit(); let resource_limiter = self.resource_ctl.as_ref().and_then(|r| { - r.get_resource_limiter(&request.resource_group_name, &request.source_tag) + r.get_background_resource_limiter(&request.resource_group_name, &request.source_tag) }); self.pool.borrow_mut().spawn(async move { @@ -1302,7 +1301,7 @@ pub mod tests { use api_version::{api_v2::RAW_KEY_PREFIX, dispatch_api_version, KvFormat, RawValue}; use collections::HashSet; use engine_traits::MiscExt; - use external_storage_export::{make_local_backend, make_noop_backend}; + use external_storage::{make_local_backend, make_noop_backend}; use file_system::{IoOp, IoRateLimiter, IoType}; use futures::{executor::block_on, stream::StreamExt}; use kvproto::metapb; diff --git a/components/backup/src/service.rs b/components/backup/src/service.rs index 237234c061e..8420b7ded9c 100644 --- a/components/backup/src/service.rs +++ b/components/backup/src/service.rs @@ -144,7 +144,7 @@ mod tests { use std::{sync::Arc, time::Duration}; use engine_rocks::RocksEngine; - use external_storage_export::make_local_backend; + use external_storage::make_local_backend; use tikv::storage::txn::tests::{must_commit, must_prewrite_put}; use tikv_util::worker::{dummy_scheduler, ReceiverWrapper}; use txn_types::TimeStamp; diff --git a/components/backup/src/writer.rs b/components/backup/src/writer.rs index 715c4f68291..a2d8a31f0ea 100644 --- a/components/backup/src/writer.rs +++ b/components/backup/src/writer.rs @@ -7,7 +7,7 @@ use engine_traits::{ CfName, ExternalSstFileInfo, KvEngine, SstCompressionType, SstExt, SstWriter, SstWriterBuilder, CF_DEFAULT, CF_WRITE, }; -use external_storage_export::{ExternalStorage, UnpinReader}; +use 
external_storage::{ExternalStorage, UnpinReader}; use file_system::Sha256Reader; use futures_util::io::AllowStdIo; use kvproto::{ @@ -121,7 +121,7 @@ impl Writer { .with_label_values(&[cf.into()]) .inc_by(self.total_kvs); let file_name = format!("{}_{}.sst", name, cf); - let iv = Iv::new_ctr(); + let iv = Iv::new_ctr().map_err(|e| Error::Other(box_err!("new IV error: {:?}", e)))?; let encrypter_reader = EncrypterReader::new(sst_reader, cipher.cipher_type, &cipher.cipher_key, iv) .map_err(|e| Error::Other(box_err!("new EncrypterReader error: {:?}", e)))?; @@ -485,9 +485,8 @@ mod tests { .build() .unwrap(); let db = rocks.get_rocksdb(); - let backend = external_storage_export::make_local_backend(temp.path()); - let storage = - external_storage_export::create_storage(&backend, Default::default()).unwrap(); + let backend = external_storage::make_local_backend(temp.path()); + let storage = external_storage::create_storage(&backend, Default::default()).unwrap(); // Test empty file. let mut r = kvproto::metapb::Region::default(); diff --git a/components/batch-system/Cargo.toml b/components/batch-system/Cargo.toml index ac69d544a21..bd1ae6c56b4 100644 --- a/components/batch-system/Cargo.toml +++ b/components/batch-system/Cargo.toml @@ -10,6 +10,7 @@ test-runner = ["derive_more"] [dependencies] collections = { workspace = true } crossbeam = "0.8" +dashmap = "5.2" derive_more = { version = "0.99", optional = true } fail = "0.5" file_system = { workspace = true } diff --git a/components/batch-system/src/router.rs b/components/batch-system/src/router.rs index 119b7875506..4f886fe3b3d 100644 --- a/components/batch-system/src/router.rs +++ b/components/batch-system/src/router.rs @@ -1,21 +1,17 @@ // Copyright 2020 TiKV Project Authors. Licensed under Apache-2.0. 
// #[PerformanceCriticalPath] -use std::{ - cell::Cell, - mem, - sync::{ - atomic::{AtomicBool, AtomicUsize, Ordering}, - Arc, Mutex, - }, +use std::sync::{ + atomic::{AtomicBool, AtomicUsize, Ordering}, + Arc, }; -use collections::HashMap; use crossbeam::channel::{SendError, TrySendError}; -use tikv_util::{debug, info, lru::LruCache, time::Instant, Either}; +use dashmap::DashMap; +use tikv_util::{debug, info, time::Instant, Either}; use crate::{ - fsm::{Fsm, FsmScheduler, FsmState}, + fsm::{Fsm, FsmScheduler}, mailbox::{BasicMailbox, Mailbox}, metrics::*, }; @@ -27,18 +23,14 @@ pub struct RouterTrace { pub leak: usize, } -struct NormalMailMap { - map: HashMap>, - // Count of Mailboxes that is stored in `map`. - alive_cnt: Arc, -} - enum CheckDoResult { NotExist, Invalid, Valid(T), } +const ROUTER_SHRINK_SIZE: usize = 1000; + /// Router routes messages to its target FSM's mailbox. /// /// In our abstract model, every batch system has two different kind of @@ -54,8 +46,7 @@ enum CheckDoResult { /// Normal FSM and control FSM can have different scheduler, but this is not /// required. pub struct Router { - normals: Arc>>, - caches: Cell>>, + normals: Arc>>, pub(super) control_box: BasicMailbox, // TODO: These two schedulers should be unified as single one. However // it's not possible to write FsmScheduler + FsmScheduler @@ -85,11 +76,7 @@ where state_cnt: Arc, ) -> Router { Router { - normals: Arc::new(Mutex::new(NormalMailMap { - map: HashMap::default(), - alive_cnt: Arc::default(), - })), - caches: Cell::new(LruCache::with_capacity_and_sample(1024, 7)), + normals: Arc::new(DashMap::default()), control_box, normal_scheduler, control_scheduler, @@ -106,72 +93,32 @@ where /// A helper function that tries to unify a common access pattern to /// mailbox. /// - /// Generally, when sending a message to a mailbox, cache should be - /// check first, if not found, lock should be acquired. - /// /// Returns None means there is no mailbox inside the normal registry. 
/// Some(None) means there is expected mailbox inside the normal registry /// but it returns None after apply the given function. Some(Some) means - /// the given function returns Some and cache is updated if it's invalid. + /// the given function returns Some. #[inline] fn check_do(&self, addr: u64, mut f: F) -> CheckDoResult where F: FnMut(&BasicMailbox) -> Option, { - let caches = unsafe { &mut *self.caches.as_ptr() }; - let mut connected = true; - if let Some(mailbox) = caches.get(&addr) { - match f(mailbox) { - Some(r) => return CheckDoResult::Valid(r), - None => { - connected = false; - } - } - } - - let (cnt, mailbox) = { - let mut boxes = self.normals.lock().unwrap(); - let cnt = boxes.map.len(); - let b = match boxes.map.get_mut(&addr) { - Some(mailbox) => mailbox.clone(), - None => { - drop(boxes); - if !connected { - caches.remove(&addr); - } - return CheckDoResult::NotExist; - } - }; - (cnt, b) - }; - if cnt > caches.capacity() || cnt < caches.capacity() / 2 { - caches.resize(cnt); - } - - let res = f(&mailbox); - match res { - Some(r) => { - caches.insert(addr, mailbox); - CheckDoResult::Valid(r) - } + let mailbox = match self.normals.get_mut(&addr) { + Some(mailbox) => mailbox, None => { - if !connected { - caches.remove(&addr); - } - CheckDoResult::Invalid + return CheckDoResult::NotExist; } + }; + match f(&mailbox) { + Some(r) => CheckDoResult::Valid(r), + None => CheckDoResult::Invalid, } } /// Register a mailbox with given address. pub fn register(&self, addr: u64, mailbox: BasicMailbox) { - let mut normals = self.normals.lock().unwrap(); - if let Some(mailbox) = normals.map.insert(addr, mailbox) { + if let Some(mailbox) = self.normals.insert(addr, mailbox) { mailbox.close(); } - normals - .alive_cnt - .store(normals.map.len(), Ordering::Relaxed); } /// Same as send a message and then register the mailbox. 
@@ -183,32 +130,22 @@ where mailbox: BasicMailbox, msg: N::Message, ) -> Result<(), (BasicMailbox, N::Message)> { - let mut normals = self.normals.lock().unwrap(); - // Send has to be done within lock, otherwise the message may be handled - // before the mailbox is register. + if let Some(mailbox) = self.normals.insert(addr, mailbox.clone()) { + mailbox.close(); + } if let Err(SendError(m)) = mailbox.force_send(msg, &self.normal_scheduler) { + self.normals.remove(&addr); return Err((mailbox, m)); } - if let Some(mailbox) = normals.map.insert(addr, mailbox) { - mailbox.close(); - } - normals - .alive_cnt - .store(normals.map.len(), Ordering::Relaxed); Ok(()) } pub fn register_all(&self, mailboxes: Vec<(u64, BasicMailbox)>) { - let mut normals = self.normals.lock().unwrap(); - normals.map.reserve(mailboxes.len()); for (addr, mailbox) in mailboxes { - if let Some(m) = normals.map.insert(addr, mailbox) { + if let Some(m) = self.normals.insert(addr, mailbox) { m.close(); } } - normals - .alive_cnt - .store(normals.map.len(), Ordering::Relaxed); } /// Get the mailbox of specified address. @@ -280,13 +217,11 @@ where pub fn force_send(&self, addr: u64, msg: N::Message) -> Result<(), SendError> { match self.send(addr, msg) { Ok(()) => Ok(()), - Err(TrySendError::Full(m)) => { - let caches = unsafe { &mut *self.caches.as_ptr() }; - caches - .get(&addr) - .unwrap() - .force_send(m, &self.normal_scheduler) - } + Err(TrySendError::Full(m)) => self + .normals + .get(&addr) + .unwrap() + .force_send(m, &self.normal_scheduler), Err(TrySendError::Disconnected(m)) => { if self.is_shutdown() { Ok(()) @@ -321,10 +256,9 @@ where /// Try to notify all normal FSMs a message. 
pub fn broadcast_normal(&self, mut msg_gen: impl FnMut() -> N::Message) { let timer = Instant::now_coarse(); - let mailboxes = self.normals.lock().unwrap(); - for mailbox in mailboxes.map.values() { + self.normals.iter().for_each(|mailbox| { let _ = mailbox.force_send(msg_gen(), &self.normal_scheduler); - } + }); BROADCAST_NORMAL_DURATION.observe(timer.saturating_elapsed_secs()); } @@ -332,12 +266,13 @@ where pub fn broadcast_shutdown(&self) { info!("broadcasting shutdown"); self.shutdown.store(true, Ordering::SeqCst); - unsafe { &mut *self.caches.as_ptr() }.clear(); - let mut mailboxes = self.normals.lock().unwrap(); - for (addr, mailbox) in mailboxes.map.drain() { + for e in self.normals.iter() { + let addr = e.key(); + let mailbox = e.value(); debug!("[region {}] shutdown mailbox", addr); mailbox.close(); } + self.normals.clear(); self.control_box.close(); self.normal_scheduler.shutdown(); self.control_scheduler.shutdown(); @@ -346,51 +281,32 @@ where /// Close the mailbox of address. 
pub fn close(&self, addr: u64) { info!("shutdown mailbox"; "region_id" => addr); - unsafe { &mut *self.caches.as_ptr() }.remove(&addr); - let mut mailboxes = self.normals.lock().unwrap(); - if let Some(mb) = mailboxes.map.remove(&addr) { + if let Some((_, mb)) = self.normals.remove(&addr) { mb.close(); } - mailboxes - .alive_cnt - .store(mailboxes.map.len(), Ordering::Relaxed); - } - - pub fn clear_cache(&self) { - unsafe { &mut *self.caches.as_ptr() }.clear(); + if self.normals.capacity() - self.normals.len() > ROUTER_SHRINK_SIZE { + self.normals.shrink_to_fit(); + } } pub fn state_cnt(&self) -> &Arc { &self.state_cnt } - pub fn alive_cnt(&self) -> Arc { - self.normals.lock().unwrap().alive_cnt.clone() + pub fn alive_cnt(&self) -> usize { + self.normals.len() } pub fn trace(&self) -> RouterTrace { - let alive = self.normals.lock().unwrap().alive_cnt.clone(); + let alive = self.alive_cnt(); let total = self.state_cnt.load(Ordering::Relaxed); - let alive = alive.load(Ordering::Relaxed); // 1 represents the control fsm. let leak = if total > alive + 1 { total - alive - 1 } else { 0 }; - let mailbox_unit = mem::size_of::<(u64, BasicMailbox)>(); - let state_unit = mem::size_of::>(); - // Every message in crossbeam sender needs 8 bytes to store state. - let message_unit = mem::size_of::() + 8; - // crossbeam unbounded channel sender has a list of blocks. Every block has 31 - // unit and every sender has at least one sender. - let sender_block_unit = 31; - RouterTrace { - alive: (mailbox_unit * 8 / 7 // hashmap uses 7/8 of allocated memory. - + state_unit + message_unit * sender_block_unit) - * alive, - leak: (state_unit + message_unit * sender_block_unit) * leak, - } + RouterTrace { alive, leak } } } @@ -398,7 +314,6 @@ impl Clone for Router { fn clone(&self) -> Router { Router { normals: self.normals.clone(), - caches: Cell::new(LruCache::with_capacity_and_sample(1024, 7)), control_box: self.control_box.clone(), // These two schedulers should be unified as single one. 
However // it's not possible to write FsmScheduler + FsmScheduler diff --git a/components/batch-system/tests/cases/router.rs b/components/batch-system/tests/cases/router.rs index d746dfad5cb..66d0770d544 100644 --- a/components/batch-system/tests/cases/router.rs +++ b/components/batch-system/tests/cases/router.rs @@ -143,25 +143,19 @@ fn test_router_trace() { router.close(addr); }; - let router_clone = router.clone(); + let mut mailboxes = vec![]; for i in 0..10 { register_runner(i); - // Read mailbox to cache. - router_clone.mailbox(i).unwrap(); + mailboxes.push(router.mailbox(i).unwrap()); } - assert_eq!(router.alive_cnt().load(Ordering::Relaxed), 10); + assert_eq!(router.alive_cnt(), 10); assert_eq!(router.state_cnt().load(Ordering::Relaxed), 11); - // Routers closed but exist in the cache. for i in 0..10 { close_runner(i); } - assert_eq!(router.alive_cnt().load(Ordering::Relaxed), 0); + assert_eq!(router.alive_cnt(), 0); assert_eq!(router.state_cnt().load(Ordering::Relaxed), 11); - for i in 0..1024 { - register_runner(i); - // Read mailbox to cache, closed routers should be evicted. - router_clone.mailbox(i).unwrap(); - } - assert_eq!(router.alive_cnt().load(Ordering::Relaxed), 1024); - assert_eq!(router.state_cnt().load(Ordering::Relaxed), 1025); + drop(mailboxes); + assert_eq!(router.alive_cnt(), 0); + assert_eq!(router.state_cnt().load(Ordering::Relaxed), 1); } diff --git a/components/cdc/src/channel.rs b/components/cdc/src/channel.rs index b11799d87c1..b386c3561bb 100644 --- a/components/cdc/src/channel.rs +++ b/components/cdc/src/channel.rs @@ -1,13 +1,6 @@ // Copyright 2021 TiKV Project Authors. Licensed under Apache-2.0. 
-use std::{ - fmt, - sync::{ - atomic::{AtomicUsize, Ordering}, - Arc, - }, - time::Duration, -}; +use std::{fmt, sync::Arc, time::Duration}; use futures::{ channel::mpsc::{ @@ -20,7 +13,13 @@ use futures::{ use grpcio::WriteFlags; use kvproto::cdcpb::{ChangeDataEvent, Event, ResolvedTs}; use protobuf::Message; -use tikv_util::{future::block_on_timeout, impl_display_as_debug, time::Instant, warn}; +use tikv_util::{ + future::block_on_timeout, + impl_display_as_debug, + memory::{MemoryQuota, MemoryQuotaExceeded}, + time::Instant, + warn, +}; use crate::metrics::*; @@ -57,6 +56,9 @@ pub enum CdcEvent { impl CdcEvent { pub fn size(&self) -> u32 { + fail::fail_point!("cdc_event_size", |size| size + .map(|s| s.parse::().unwrap()) + .unwrap_or(0)); match self { CdcEvent::ResolvedTs(ref r) => { // For region id, it is unlikely to exceed 100,000,000 which is @@ -185,71 +187,7 @@ impl EventBatcher { } } -#[derive(Clone)] -pub struct MemoryQuota { - capacity: Arc, - in_use: Arc, -} - -impl MemoryQuota { - pub fn new(capacity: usize) -> MemoryQuota { - MemoryQuota { - capacity: Arc::new(AtomicUsize::new(capacity)), - in_use: Arc::new(AtomicUsize::new(0)), - } - } - - pub fn in_use(&self) -> usize { - self.in_use.load(Ordering::Relaxed) - } - - pub(crate) fn capacity(&self) -> usize { - self.capacity.load(Ordering::Acquire) - } - - pub(crate) fn set_capacity(&self, capacity: usize) { - self.capacity.store(capacity, Ordering::Release) - } - - fn alloc(&self, bytes: usize) -> bool { - let mut in_use_bytes = self.in_use.load(Ordering::Relaxed); - let capacity = self.capacity.load(Ordering::Acquire); - loop { - if in_use_bytes + bytes > capacity { - return false; - } - let new_in_use_bytes = in_use_bytes + bytes; - match self.in_use.compare_exchange_weak( - in_use_bytes, - new_in_use_bytes, - Ordering::Acquire, - Ordering::Relaxed, - ) { - Ok(_) => return true, - Err(current) => in_use_bytes = current, - } - } - } - - fn free(&self, bytes: usize) { - let mut in_use_bytes = 
self.in_use.load(Ordering::Relaxed); - loop { - // Saturating at the numeric bounds instead of overflowing. - let new_in_use_bytes = in_use_bytes - std::cmp::min(bytes, in_use_bytes); - match self.in_use.compare_exchange_weak( - in_use_bytes, - new_in_use_bytes, - Ordering::Acquire, - Ordering::Relaxed, - ) { - Ok(_) => return, - Err(current) => in_use_bytes = current, - } - } - } -} - -pub fn channel(buffer: usize, memory_quota: MemoryQuota) -> (Sink, Drain) { +pub fn channel(buffer: usize, memory_quota: Arc) -> (Sink, Drain) { let (unbounded_sender, unbounded_receiver) = unbounded(); let (bounded_sender, bounded_receiver) = bounded(buffer); ( @@ -300,19 +238,25 @@ impl_from_future_send_error! { TrySendError<(CdcEvent, usize)>, } +impl From for SendError { + fn from(_: MemoryQuotaExceeded) -> Self { + SendError::Congested + } +} + #[derive(Clone)] pub struct Sink { unbounded_sender: UnboundedSender<(CdcEvent, usize)>, bounded_sender: Sender<(CdcEvent, usize)>, - memory_quota: MemoryQuota, + memory_quota: Arc, } impl Sink { pub fn unbounded_send(&self, event: CdcEvent, force: bool) -> Result<(), SendError> { // Try it's best to send error events. 
let bytes = if !force { event.size() as usize } else { 0 }; - if bytes != 0 && !self.memory_quota.alloc(bytes) { - return Err(SendError::Congested); + if bytes != 0 { + self.memory_quota.alloc(bytes)?; } match self.unbounded_sender.unbounded_send((event, bytes)) { Ok(_) => Ok(()), @@ -331,9 +275,7 @@ impl Sink { let bytes = event.size(); total_bytes += bytes; } - if !self.memory_quota.alloc(total_bytes as _) { - return Err(SendError::Congested); - } + self.memory_quota.alloc(total_bytes as _)?; for event in events { let bytes = event.size() as usize; if let Err(e) = self.bounded_sender.feed((event, bytes)).await { @@ -354,7 +296,7 @@ impl Sink { pub struct Drain { unbounded_receiver: UnboundedReceiver<(CdcEvent, usize)>, bounded_receiver: Receiver<(CdcEvent, usize)>, - memory_quota: MemoryQuota, + memory_quota: Arc, } impl<'a> Drain { @@ -451,7 +393,7 @@ mod tests { type Send = Box Result<(), SendError>>; fn new_test_channel(buffer: usize, capacity: usize, force_send: bool) -> (Send, Drain) { - let memory_quota = MemoryQuota::new(capacity); + let memory_quota = Arc::new(MemoryQuota::new(capacity)); let (mut tx, rx) = channel(buffer, memory_quota); let mut flag = true; let send = move |event| { @@ -599,7 +541,7 @@ mod tests { // 1KB let max_pending_bytes = 1024; let buffer = max_pending_bytes / event.size(); - let memory_quota = MemoryQuota::new(max_pending_bytes as _); + let memory_quota = Arc::new(MemoryQuota::new(max_pending_bytes as _)); let (tx, _rx) = channel(buffer as _, memory_quota); for _ in 0..buffer { tx.unbounded_send(CdcEvent::Event(e.clone()), false) @@ -636,9 +578,9 @@ mod tests { } } let memory_quota = rx.memory_quota.clone(); - assert_eq!(memory_quota.alloc(event.size() as _), false,); + memory_quota.alloc(event.size() as _).unwrap_err(); drop(rx); - assert_eq!(memory_quota.alloc(1024), true); + memory_quota.alloc(1024).unwrap(); } // Make sure memory quota is freed when tx is dropped before rx. 
{ @@ -653,10 +595,10 @@ mod tests { } } let memory_quota = rx.memory_quota.clone(); - assert_eq!(memory_quota.alloc(event.size() as _), false,); + memory_quota.alloc(event.size() as _).unwrap_err(); drop(send); drop(rx); - assert_eq!(memory_quota.alloc(1024), true); + memory_quota.alloc(1024).unwrap(); } // Make sure sending message to a closed channel does not leak memory quota. { @@ -668,7 +610,7 @@ mod tests { send(CdcEvent::Event(e.clone())).unwrap_err(); } assert_eq!(memory_quota.in_use(), 0); - assert_eq!(memory_quota.alloc(1024), true); + memory_quota.alloc(1024).unwrap(); // Freeing bytes should not cause overflow. memory_quota.free(1024); diff --git a/components/cdc/src/delegate.rs b/components/cdc/src/delegate.rs index 4c8b2226f49..637ecab0440 100644 --- a/components/cdc/src/delegate.rs +++ b/components/cdc/src/delegate.rs @@ -28,9 +28,13 @@ use raftstore::{ store::util::compare_region_epoch, Error as RaftStoreError, }; -use resolved_ts::Resolver; +use resolved_ts::{Resolver, TsSource, ON_DROP_WARN_HEAP_SIZE}; use tikv::storage::{txn::TxnEntry, Statistics}; -use tikv_util::{debug, info, warn}; +use tikv_util::{ + debug, info, + memory::{HeapSize, MemoryQuota}, + warn, +}; use txn_types::{Key, Lock, LockType, TimeStamp, WriteBatchFlags, WriteRef, WriteType}; use crate::{ @@ -202,6 +206,12 @@ impl Downstream { self.sink_error_event(region_id, err_event) } + pub fn sink_server_is_busy(&self, region_id: u64, reason: String) -> Result<()> { + let mut err_event = EventError::default(); + err_event.mut_server_is_busy().reason = reason; + self.sink_error_event(region_id, err_event) + } + pub fn set_sink(&mut self, sink: Sink) { self.sink = Some(sink); } @@ -226,16 +236,73 @@ impl Downstream { } } -#[derive(Default)] struct Pending { - pub downstreams: Vec, - pub locks: Vec, - pub pending_bytes: usize, + downstreams: Vec, + locks: Vec, + pending_bytes: usize, + memory_quota: Arc, +} + +impl Pending { + fn new(memory_quota: Arc) -> Pending { + Pending { + 
downstreams: vec![], + locks: vec![], + pending_bytes: 0, + memory_quota, + } + } + + fn push_pending_lock(&mut self, lock: PendingLock) -> Result<()> { + let bytes = lock.heap_size(); + self.memory_quota.alloc(bytes)?; + self.locks.push(lock); + self.pending_bytes += bytes; + CDC_PENDING_BYTES_GAUGE.add(bytes as i64); + Ok(()) + } + + fn on_region_ready(&mut self, resolver: &mut Resolver) -> Result<()> { + fail::fail_point!("cdc_pending_on_region_ready", |_| Err( + Error::MemoryQuotaExceeded(tikv_util::memory::MemoryQuotaExceeded) + )); + // Must take locks, otherwise it may double free memory quota on drop. + for lock in mem::take(&mut self.locks) { + self.memory_quota.free(lock.heap_size()); + match lock { + PendingLock::Track { key, start_ts } => { + resolver.track_lock(start_ts, key, None)?; + } + PendingLock::Untrack { key } => resolver.untrack_lock(&key, None), + } + } + Ok(()) + } } impl Drop for Pending { fn drop(&mut self) { CDC_PENDING_BYTES_GAUGE.sub(self.pending_bytes as i64); + let locks = mem::take(&mut self.locks); + if locks.is_empty() { + return; + } + + // Free memory quota used by pending locks and unlocks. + let mut bytes = 0; + let num_locks = locks.len(); + for lock in locks { + bytes += lock.heap_size(); + } + if bytes > ON_DROP_WARN_HEAP_SIZE { + warn!("cdc drop huge Pending"; + "bytes" => bytes, + "num_locks" => num_locks, + "memory_quota_in_use" => self.memory_quota.in_use(), + "memory_quota_capacity" => self.memory_quota.capacity(), + ); + } + self.memory_quota.free(bytes); } } @@ -244,6 +311,14 @@ enum PendingLock { Untrack { key: Vec }, } +impl HeapSize for PendingLock { + fn heap_size(&self) -> usize { + match self { + PendingLock::Track { key, .. } | PendingLock::Untrack { key } => key.heap_size(), + } + } +} + /// A CDC delegate of a raftstore region peer. /// /// It converts raft commands into CDC events and broadcast to downstreams. @@ -265,14 +340,18 @@ pub struct Delegate { impl Delegate { /// Create a Delegate the given region. 
- pub fn new(region_id: u64, txn_extra_op: Arc>) -> Delegate { + pub fn new( + region_id: u64, + txn_extra_op: Arc>, + memory_quota: Arc, + ) -> Delegate { Delegate { region_id, handle: ObserveHandle::new(), resolver: None, region: None, resolved_downstreams: Vec::new(), - pending: Some(Pending::default()), + pending: Some(Pending::new(memory_quota)), txn_extra_op, failed: false, } @@ -350,10 +429,15 @@ impl Delegate { downstream.state.store(DownstreamState::Stopped); let error_event = error.clone(); if let Err(err) = downstream.sink_error_event(region_id, error_event) { - warn!("cdc broadcast error failed"; + warn!("cdc send region error failed"; "region_id" => region_id, "error" => ?err, "origin_error" => ?error, "downstream_id" => ?downstream.id, "downstream" => ?downstream.peer, "request_id" => downstream.req_id, "conn_id" => ?downstream.conn_id); + } else { + info!("cdc send region error success"; + "region_id" => region_id, "origin_error" => ?error, + "downstream_id" => ?downstream.id, "downstream" => ?downstream.peer, + "request_id" => downstream.req_id, "conn_id" => ?downstream.conn_id); } Ok(()) }; @@ -395,7 +479,7 @@ impl Delegate { &mut self, mut resolver: Resolver, region: Region, - ) -> Vec<(&Downstream, Error)> { + ) -> Result> { assert!( self.resolver.is_none(), "region {} resolver should not be ready", @@ -408,26 +492,24 @@ impl Delegate { } // Mark the delegate as initialized. - let mut pending = self.pending.take().unwrap(); - self.region = Some(region); info!("cdc region is ready"; "region_id" => self.region_id); + // Downstreams in pending must be moved to resolved_downstreams + // immediately and must not return in the middle, otherwise the delegate + // loses downstreams. 
+ let mut pending = self.pending.take().unwrap(); + self.resolved_downstreams = mem::take(&mut pending.downstreams); - for lock in mem::take(&mut pending.locks) { - match lock { - PendingLock::Track { key, start_ts } => resolver.track_lock(start_ts, key, None), - PendingLock::Untrack { key } => resolver.untrack_lock(&key, None), - } - } + pending.on_region_ready(&mut resolver)?; self.resolver = Some(resolver); + self.region = Some(region); - self.resolved_downstreams = mem::take(&mut pending.downstreams); let mut failed_downstreams = Vec::new(); for downstream in self.downstreams() { if let Err(e) = self.check_epoch_on_ready(downstream) { failed_downstreams.push((downstream, e)); } } - failed_downstreams + Ok(failed_downstreams) } /// Try advance and broadcast resolved ts. @@ -439,7 +521,7 @@ impl Delegate { } debug!("cdc try to advance ts"; "region_id" => self.region_id, "min_ts" => min_ts); let resolver = self.resolver.as_mut().unwrap(); - let resolved_ts = resolver.resolve(min_ts, None); + let resolved_ts = resolver.resolve(min_ts, None, TsSource::Cdc); debug!("cdc resolved ts updated"; "region_id" => self.region_id, "resolved_ts" => resolved_ts); Some(resolved_ts) @@ -608,16 +690,14 @@ impl Delegate { let mut txn_rows: HashMap, (EventRow, bool)> = HashMap::default(); let mut raw_rows: Vec = Vec::new(); for mut req in requests { - match req.get_cmd_type() { - CmdType::Put => { - self.sink_put( - req.take_put(), - is_one_pc, - &mut txn_rows, - &mut raw_rows, - &mut read_old_value, - )?; - } + let res = match req.get_cmd_type() { + CmdType::Put => self.sink_put( + req.take_put(), + is_one_pc, + &mut txn_rows, + &mut raw_rows, + &mut read_old_value, + ), CmdType::Delete => self.sink_delete(req.take_delete()), _ => { debug!( @@ -625,7 +705,12 @@ impl Delegate { "region_id" => self.region_id, "command" => ?req, ); + Ok(()) } + }; + if res.is_err() { + self.mark_failed(); + return res; } } @@ -822,17 +907,15 @@ impl Delegate { // In order to compute resolved ts, we 
must track inflight txns. match self.resolver { Some(ref mut resolver) => { - resolver.track_lock(row.start_ts.into(), row.key.clone(), None) + resolver.track_lock(row.start_ts.into(), row.key.clone(), None)?; } None => { assert!(self.pending.is_some(), "region resolver not ready"); let pending = self.pending.as_mut().unwrap(); - pending.locks.push(PendingLock::Track { + pending.push_pending_lock(PendingLock::Track { key: row.key.clone(), start_ts: row.start_ts.into(), - }); - pending.pending_bytes += row.key.len(); - CDC_PENDING_BYTES_GAUGE.add(row.key.len() as i64); + })?; } } @@ -854,7 +937,7 @@ impl Delegate { Ok(()) } - fn sink_delete(&mut self, mut delete: DeleteRequest) { + fn sink_delete(&mut self, mut delete: DeleteRequest) -> Result<()> { match delete.cf.as_str() { "lock" => { let raw_key = Key::from_encoded(delete.take_key()).into_raw().unwrap(); @@ -862,11 +945,8 @@ impl Delegate { Some(ref mut resolver) => resolver.untrack_lock(&raw_key, None), None => { assert!(self.pending.is_some(), "region resolver not ready"); - let key_len = raw_key.len(); let pending = self.pending.as_mut().unwrap(); - pending.locks.push(PendingLock::Untrack { key: raw_key }); - pending.pending_bytes += key_len; - CDC_PENDING_BYTES_GAUGE.add(key_len as i64); + pending.push_pending_lock(PendingLock::Untrack { key: raw_key })?; } } } @@ -875,6 +955,7 @@ impl Delegate { panic!("invalid cf {}", other); } } + Ok(()) } fn sink_admin(&mut self, request: AdminRequest, mut response: AdminResponse) -> Result<()> { @@ -945,7 +1026,7 @@ impl Delegate { } fn stop_observing(&self) { - info!("stop observing"; "region_id" => self.region_id, "failed" => self.failed); + info!("cdc stop observing"; "region_id" => self.region_id, "failed" => self.failed); // Stop observe further events. self.handle.stop_observing(); // To inform transaction layer no more old values are required for the region. 
@@ -1151,9 +1232,10 @@ mod tests { use api_version::RawValue; use futures::{executor::block_on, stream::StreamExt}; use kvproto::{errorpb::Error as ErrorHeader, metapb::Region}; + use tikv_util::memory::MemoryQuota; use super::*; - use crate::channel::{channel, recv_timeout, MemoryQuota}; + use crate::channel::{channel, recv_timeout}; #[test] fn test_error() { @@ -1165,7 +1247,7 @@ mod tests { region.mut_region_epoch().set_conf_ver(2); let region_epoch = region.get_region_epoch().clone(); - let quota = crate::channel::MemoryQuota::new(usize::MAX); + let quota = Arc::new(MemoryQuota::new(usize::MAX)); let (sink, mut drain) = crate::channel::channel(1, quota); let rx = drain.drain(); let request_id = 123; @@ -1179,11 +1261,18 @@ mod tests { ObservedRange::default(), ); downstream.set_sink(sink); - let mut delegate = Delegate::new(region_id, Default::default()); + let memory_quota = Arc::new(MemoryQuota::new(usize::MAX)); + let mut delegate = Delegate::new(region_id, Default::default(), memory_quota); delegate.subscribe(downstream).unwrap(); assert!(delegate.handle.is_observing()); - let resolver = Resolver::new(region_id); - assert!(delegate.on_region_ready(resolver, region).is_empty()); + let memory_quota = Arc::new(MemoryQuota::new(std::usize::MAX)); + let resolver = Resolver::new(region_id, memory_quota); + assert!( + delegate + .on_region_ready(resolver, region) + .unwrap() + .is_empty() + ); assert!(delegate.downstreams()[0].observed_range.all_key_covered); let rx_wrap = Cell::new(Some(rx)); @@ -1307,8 +1396,9 @@ mod tests { }; // Create a new delegate. 
+ let memory_quota = Arc::new(MemoryQuota::new(usize::MAX)); let txn_extra_op = Arc::new(AtomicCell::new(TxnExtraOp::Noop)); - let mut delegate = Delegate::new(1, txn_extra_op.clone()); + let mut delegate = Delegate::new(1, txn_extra_op.clone(), memory_quota); assert_eq!(txn_extra_op.load(), TxnExtraOp::Noop); assert!(delegate.handle.is_observing()); @@ -1333,7 +1423,10 @@ mod tests { region.mut_region_epoch().set_conf_ver(1); region.mut_region_epoch().set_version(1); { - let failures = delegate.on_region_ready(Resolver::new(1), region); + let memory_quota = Arc::new(MemoryQuota::new(std::usize::MAX)); + let failures = delegate + .on_region_ready(Resolver::new(1, memory_quota), region) + .unwrap(); assert_eq!(failures.len(), 1); let id = failures[0].0.id; delegate.unsubscribe(id, None); @@ -1424,8 +1517,9 @@ mod tests { Key::from_raw(b"d").into_encoded(), ) .unwrap(); + let memory_quota = Arc::new(MemoryQuota::new(usize::MAX)); let txn_extra_op = Arc::new(AtomicCell::new(TxnExtraOp::Noop)); - let mut delegate = Delegate::new(1, txn_extra_op); + let mut delegate = Delegate::new(1, txn_extra_op, memory_quota); assert!(delegate.handle.is_observing()); let mut map = HashMap::default(); @@ -1456,7 +1550,7 @@ mod tests { } assert_eq!(map.len(), 5); - let (sink, mut drain) = channel(1, MemoryQuota::new(1024)); + let (sink, mut drain) = channel(1, Arc::new(MemoryQuota::new(1024))); let downstream = Downstream { id: DownstreamId::new(), req_id: 1, @@ -1493,8 +1587,9 @@ mod tests { Key::from_raw(b"f").into_encoded(), ) .unwrap(); + let memory_quota = Arc::new(MemoryQuota::new(usize::MAX)); let txn_extra_op = Arc::new(AtomicCell::new(TxnExtraOp::Noop)); - let mut delegate = Delegate::new(1, txn_extra_op); + let mut delegate = Delegate::new(1, txn_extra_op, memory_quota); assert!(delegate.handle.is_observing()); let mut map = HashMap::default(); @@ -1529,7 +1624,7 @@ mod tests { } assert_eq!(map.len(), 5); - let (sink, mut drain) = channel(1, MemoryQuota::new(1024)); + let 
(sink, mut drain) = channel(1, Arc::new(MemoryQuota::new(1024))); let downstream = Downstream { id: DownstreamId::new(), req_id: 1, diff --git a/components/cdc/src/endpoint.rs b/components/cdc/src/endpoint.rs index 23a3e410467..9f840ab49d5 100644 --- a/components/cdc/src/endpoint.rs +++ b/components/cdc/src/endpoint.rs @@ -5,7 +5,10 @@ use std::{ cmp::{Ord, Ordering as CmpOrdering, PartialOrd, Reverse}, collections::BinaryHeap, fmt, - sync::{Arc, Mutex as StdMutex}, + sync::{ + atomic::{AtomicIsize, Ordering}, + Arc, Mutex as StdMutex, + }, time::Duration, }; @@ -41,6 +44,7 @@ use tikv::{ }; use tikv_util::{ debug, defer, error, impl_display_as_debug, info, + memory::MemoryQuota, mpsc::bounded, slow_log, sys::thread::ThreadBuildWrapper, @@ -56,7 +60,7 @@ use tokio::{ use txn_types::{TimeStamp, TxnExtra, TxnExtraScheduler}; use crate::{ - channel::{CdcEvent, MemoryQuota, SendError}, + channel::{CdcEvent, SendError}, delegate::{on_init_downstream, Delegate, Downstream, DownstreamId, DownstreamState}, initializer::Initializer, metrics::*, @@ -79,6 +83,11 @@ pub enum Deregister { conn_id: ConnId, request_id: u64, }, + Region { + conn_id: ConnId, + request_id: u64, + region_id: u64, + }, Downstream { conn_id: ConnId, request_id: u64, @@ -111,6 +120,16 @@ impl fmt::Debug for Deregister { .field("conn_id", conn_id) .field("request_id", request_id) .finish(), + Deregister::Region { + ref conn_id, + ref request_id, + ref region_id, + } => de + .field("deregister", &"region") + .field("conn_id", conn_id) + .field("request_id", request_id) + .field("region_id", region_id) + .finish(), Deregister::Downstream { ref conn_id, ref request_id, @@ -366,11 +385,14 @@ pub struct Endpoint { // Incremental scan workers: Runtime, + // The total number of scan tasks including running and pending. 
+ scan_task_counter: Arc, scan_concurrency_semaphore: Arc, scan_speed_limiter: Limiter, + fetch_speed_limiter: Limiter, max_scan_batch_bytes: usize, max_scan_batch_size: usize, - sink_memory_quota: MemoryQuota, + sink_memory_quota: Arc, old_value_cache: OldValueCache, resolved_region_heap: RefCell, @@ -401,7 +423,7 @@ impl, E: KvEngine, S: StoreRegionMeta> Endpoint, security_mgr: Arc, - sink_memory_quota: MemoryQuota, + sink_memory_quota: Arc, causal_ts_provider: Option>, ) -> Endpoint { let workers = Builder::new_multi_thread() @@ -423,11 +445,16 @@ impl, E: KvEngine, S: StoreRegionMeta> Endpoint 0 { + let scan_speed_limiter = Limiter::new(if config.incremental_scan_speed_limit.0 > 0 { config.incremental_scan_speed_limit.0 as f64 } else { f64::INFINITY }); + let fetch_speed_limiter = Limiter::new(if config.incremental_fetch_speed_limit.0 > 0 { + config.incremental_fetch_speed_limit.0 as f64 + } else { + f64::INFINITY + }); CDC_SINK_CAP.set(sink_memory_quota.capacity() as i64); // For scan efficiency, the scan batch bytes should be around 1MB. 
@@ -453,7 +480,9 @@ impl, E: KvEngine, S: StoreRegionMeta> Endpoint, E: KvEngine, S: StoreRegionMeta> Endpoint 0 { + self.config.incremental_fetch_speed_limit.0 as f64 + } else { + f64::INFINITY + }; + + self.fetch_speed_limiter.set_speed_limit(new_speed_limit); + } } pub fn set_max_scan_batch_size(&mut self, max_scan_batch_size: usize) { @@ -582,8 +620,20 @@ impl, E: KvEngine, S: StoreRegionMeta> Endpoint { let conn = self.connections.get_mut(&conn_id).unwrap(); - for (region, downstream) in conn.unsubscribe_request(request_id) { - self.deregister_downstream(region, downstream, None); + for (region_id, downstream) in conn.unsubscribe_request(request_id) { + let err = Some(Error::Other("region not found".into())); + self.deregister_downstream(region_id, downstream, err); + } + } + Deregister::Region { + conn_id, + request_id, + region_id, + } => { + let conn = self.connections.get_mut(&conn_id).unwrap(); + if let Some(downstream) = conn.unsubscribe(request_id, region_id) { + let err = Some(Error::Other("region not found".into())); + self.deregister_downstream(region_id, downstream, err); } } Deregister::Downstream { @@ -677,6 +727,26 @@ impl, E: KvEngine, S: StoreRegionMeta> Endpoint self.config.incremental_scan_concurrency_limit as isize { + debug!("cdc rejects registration, too many scan tasks"; + "region_id" => region_id, + "conn_id" => ?conn_id, + "req_id" => request_id, + "scan_task_count" => scan_task_count, + "incremental_scan_concurrency_limit" => self.config.incremental_scan_concurrency_limit, + ); + // To avoid OOM (e.g., https://github.com/tikv/tikv/issues/16035), + // TiKV needs to reject and return error immediately. 
+ let _ = downstream + .sink_server_is_busy(region_id, "too many pending incremental scans".to_owned()); + return; + } + let txn_extra_op = match self.store_meta.lock().unwrap().reader(region_id) { Some(reader) => reader.txn_extra_op.clone(), None => { @@ -708,7 +778,11 @@ impl, E: KvEngine, S: StoreRegionMeta> Endpoint e.into_mut(), HashMapEntry::Vacant(e) => { is_new_delegate = true; - e.insert(Delegate::new(region_id, txn_extra_op)) + e.insert(Delegate::new( + region_id, + txn_extra_op, + self.sink_memory_quota.clone(), + )) } }; @@ -761,7 +835,8 @@ impl, E: KvEngine, S: StoreRegionMeta> Endpoint, E: KvEngine, S: StoreRegionMeta> Endpoint { @@ -792,6 +868,7 @@ impl, E: KvEngine, S: StoreRegionMeta> Endpoint, E: KvEngine, S: StoreRegionMeta> Endpoint { + for (downstream, e) in fails { + deregisters.push(Deregister::Downstream { + conn_id: downstream.get_conn_id(), + request_id: downstream.get_req_id(), + region_id, + downstream_id: downstream.get_id(), + err: Some(e), + }); + } + } + Err(e) => deregisters.push(Deregister::Delegate { region_id, - downstream_id: downstream.get_id(), - err: Some(e), - }); + observe_id, + err: e, + }), } } else { debug!("cdc stale region ready"; @@ -855,7 +940,7 @@ impl, E: KvEngine, S: StoreRegionMeta> Endpoint, E: KvEngine, S: StoreRegionMeta + Send> Runnable for Endpoint { fn on_timeout(&mut self) { - CDC_ENDPOINT_PENDING_TASKS.set(self.scheduler.pending_tasks() as _); - // Reclaim resolved_region_heap memory. 
self.resolved_region_heap .borrow_mut() .reset_and_shrink_to(self.capture_regions.len()); + CDC_ENDPOINT_PENDING_TASKS.set(self.scheduler.pending_tasks() as _); CDC_CAPTURED_REGION_COUNT.set(self.capture_regions.len() as i64); CDC_REGION_RESOLVE_STATUS_GAUGE_VEC .with_label_values(&["unresolved"]) @@ -1261,6 +1345,7 @@ impl, E: KvEngine, S: StoreRegionMeta + Send> Runnable CDC_REGION_RESOLVE_STATUS_GAUGE_VEC .with_label_values(&["resolved"]) .set(self.resolved_region_count as _); + if self.min_resolved_ts != TimeStamp::max() { CDC_MIN_RESOLVED_TS_REGION.set(self.min_ts_region_id as i64); CDC_MIN_RESOLVED_TS.set(self.min_resolved_ts.physical() as i64); @@ -1341,7 +1426,7 @@ mod tests { recv_timeout, }; - fn set_conn_verion_task(conn_id: ConnId, version: semver::Version) -> Task { + fn set_conn_version_task(conn_id: ConnId, version: semver::Version) -> Task { Task::SetConnVersion { conn_id, version, @@ -1455,7 +1540,7 @@ mod tests { ConcurrencyManager::new(1.into()), env, security_mgr, - MemoryQuota::new(usize::MAX), + Arc::new(MemoryQuota::new(usize::MAX)), causal_ts_provider, ); @@ -1476,14 +1561,14 @@ mod tests { let mut suite = mock_endpoint(&cfg, None, ApiVersion::V1); suite.add_region(1, 100); - let quota = crate::channel::MemoryQuota::new(usize::MAX); + let quota = Arc::new(MemoryQuota::new(usize::MAX)); let (tx, mut rx) = channel::channel(1, quota); let mut rx = rx.drain(); let conn = Conn::new(tx, String::new()); let conn_id = conn.get_id(); suite.run(Task::OpenConn { conn }); - suite.run(set_conn_verion_task( + suite.run(set_conn_version_task( conn_id, FeatureGate::batch_resolved_ts(), )); @@ -1728,11 +1813,38 @@ mod tests { < f64::EPSILON ); } + + // Modify incremental_fetch_speed_limit. 
+ { + let mut updated_cfg = cfg.clone(); + { + updated_cfg.incremental_fetch_speed_limit = ReadableSize::mb(2048); + } + let diff = cfg.diff(&updated_cfg); + + assert_eq!( + ep.config.incremental_fetch_speed_limit, + ReadableSize::mb(512) + ); + assert!( + (ep.fetch_speed_limiter.speed_limit() - ReadableSize::mb(512).0 as f64).abs() + < f64::EPSILON + ); + ep.run(Task::ChangeConfig(diff)); + assert_eq!( + ep.config.incremental_fetch_speed_limit, + ReadableSize::mb(2048) + ); + assert!( + (ep.fetch_speed_limiter.speed_limit() - ReadableSize::mb(2048).0 as f64).abs() + < f64::EPSILON + ); + } } #[test] fn test_raftstore_is_busy() { - let quota = crate::channel::MemoryQuota::new(usize::MAX); + let quota = Arc::new(MemoryQuota::new(usize::MAX)); let (tx, _rx) = channel::channel(1, quota); let mut suite = mock_endpoint(&CdcConfig::default(), None, ApiVersion::V1); @@ -1743,7 +1855,10 @@ mod tests { let conn = Conn::new(tx, String::new()); let conn_id = conn.get_id(); suite.run(Task::OpenConn { conn }); - suite.run(set_conn_verion_task(conn_id, semver::Version::new(0, 0, 0))); + suite.run(set_conn_version_task( + conn_id, + semver::Version::new(0, 0, 0), + )); let mut req_header = Header::default(); req_header.set_cluster_id(0); @@ -1785,7 +1900,7 @@ mod tests { }; let mut suite = mock_endpoint(&cfg, None, ApiVersion::V1); suite.add_region(1, 100); - let quota = crate::channel::MemoryQuota::new(usize::MAX); + let quota = Arc::new(MemoryQuota::new(usize::MAX)); let (tx, mut rx) = channel::channel(1, quota); let mut rx = rx.drain(); @@ -1795,7 +1910,7 @@ mod tests { // Enable batch resolved ts in the test. 
let version = FeatureGate::batch_resolved_ts(); - suite.run(set_conn_verion_task(conn_id, version)); + suite.run(set_conn_version_task(conn_id, version)); let mut req_header = Header::default(); req_header.set_cluster_id(0); @@ -1931,6 +2046,97 @@ mod tests { } } + #[test] + fn test_too_many_scan_tasks() { + let cfg = CdcConfig { + min_ts_interval: ReadableDuration(Duration::from_secs(60)), + incremental_scan_concurrency: 1, + incremental_scan_concurrency_limit: 1, + ..Default::default() + }; + let mut suite = mock_endpoint(&cfg, None, ApiVersion::V1); + + // Pause scan task runtime. + suite.endpoint.workers = Builder::new_multi_thread() + .worker_threads(1) + .build() + .unwrap(); + let (pause_tx, pause_rx) = std::sync::mpsc::channel::<()>(); + suite.endpoint.workers.spawn(async move { + let _ = pause_rx.recv(); + }); + + suite.add_region(1, 100); + let quota = Arc::new(MemoryQuota::new(usize::MAX)); + let (tx, mut rx) = channel::channel(1, quota); + let mut rx = rx.drain(); + + let conn = Conn::new(tx, String::new()); + let conn_id = conn.get_id(); + suite.run(Task::OpenConn { conn }); + + // Enable batch resolved ts in the test. + let version = FeatureGate::batch_resolved_ts(); + suite.run(set_conn_version_task(conn_id, version)); + + let mut req_header = Header::default(); + req_header.set_cluster_id(0); + let mut req = ChangeDataRequest::default(); + req.set_region_id(1); + req.set_request_id(1); + let region_epoch = req.get_region_epoch().clone(); + let downstream = Downstream::new( + "".to_string(), + region_epoch.clone(), + 1, + conn_id, + ChangeDataRequestKvApi::TiDb, + false, + ObservedRange::default(), + ); + suite.run(Task::Register { + request: req.clone(), + downstream, + conn_id, + }); + assert_eq!(suite.endpoint.capture_regions.len(), 1); + + // Test too many scan tasks error. 
+ req.set_request_id(2); + let downstream = Downstream::new( + "".to_string(), + region_epoch, + 2, + conn_id, + ChangeDataRequestKvApi::TiDb, + false, + ObservedRange::default(), + ); + suite.run(Task::Register { + request: req.clone(), + downstream, + conn_id, + }); + let cdc_event = channel::recv_timeout(&mut rx, Duration::from_millis(500)) + .unwrap() + .unwrap(); + if let CdcEvent::Event(mut e) = cdc_event.0 { + assert_eq!(e.region_id, 1); + assert_eq!(e.request_id, 2); + let event = e.event.take().unwrap(); + match event { + Event_oneof_event::Error(err) => { + assert!(err.has_server_is_busy()); + } + other => panic!("unknown event {:?}", other), + } + } else { + panic!("unknown cdc event {:?}", cdc_event); + } + + drop(pause_tx); + } + #[test] fn test_raw_causal_min_ts() { let sleep_interval = Duration::from_secs(1); @@ -1966,7 +2172,7 @@ mod tests { let mut suite = mock_endpoint(&cfg, None, ApiVersion::V1); suite.add_region(1, 100); - let quota = crate::channel::MemoryQuota::new(usize::MAX); + let quota = Arc::new(MemoryQuota::new(usize::MAX)); let (tx, mut rx) = channel::channel(1, quota); let mut rx = rx.drain(); let mut region = Region::default(); @@ -1977,7 +2183,7 @@ mod tests { // Enable batch resolved ts in the test. 
let version = FeatureGate::batch_resolved_ts(); - suite.run(set_conn_verion_task(conn_id, version)); + suite.run(set_conn_version_task(conn_id, version)); let mut req_header = Header::default(); req_header.set_cluster_id(0); @@ -1999,7 +2205,8 @@ mod tests { downstream, conn_id, }); - let resolver = Resolver::new(1); + let memory_quota = Arc::new(MemoryQuota::new(std::usize::MAX)); + let resolver = Resolver::new(1, memory_quota); let observe_id = suite.endpoint.capture_regions[&1].handle.id; suite.on_region_ready(observe_id, resolver, region.clone()); suite.run(Task::MinTs { @@ -2035,7 +2242,8 @@ mod tests { downstream, conn_id, }); - let resolver = Resolver::new(2); + let memory_quota = Arc::new(MemoryQuota::new(std::usize::MAX)); + let resolver = Resolver::new(2, memory_quota); region.set_id(2); let observe_id = suite.endpoint.capture_regions[&2].handle.id; suite.on_region_ready(observe_id, resolver, region); @@ -2056,7 +2264,7 @@ mod tests { } // Register region 3 to another conn which is not support batch resolved ts. 
- let quota = crate::channel::MemoryQuota::new(usize::MAX); + let quota = Arc::new(MemoryQuota::new(usize::MAX)); let (tx, mut rx2) = channel::channel(1, quota); let mut rx2 = rx2.drain(); let mut region = Region::default(); @@ -2064,7 +2272,10 @@ mod tests { let conn = Conn::new(tx, String::new()); let conn_id = conn.get_id(); suite.run(Task::OpenConn { conn }); - suite.run(set_conn_verion_task(conn_id, semver::Version::new(4, 0, 5))); + suite.run(set_conn_version_task( + conn_id, + semver::Version::new(4, 0, 5), + )); req.set_region_id(3); req.set_request_id(3); @@ -2084,7 +2295,8 @@ mod tests { downstream, conn_id, }); - let resolver = Resolver::new(3); + let memory_quota = Arc::new(MemoryQuota::new(std::usize::MAX)); + let resolver = Resolver::new(3, memory_quota); region.set_id(3); let observe_id = suite.endpoint.capture_regions[&3].handle.id; suite.on_region_ready(observe_id, resolver, region); @@ -2127,14 +2339,17 @@ mod tests { fn test_deregister() { let mut suite = mock_endpoint(&CdcConfig::default(), None, ApiVersion::V1); suite.add_region(1, 100); - let quota = crate::channel::MemoryQuota::new(usize::MAX); + let quota = Arc::new(MemoryQuota::new(usize::MAX)); let (tx, mut rx) = channel::channel(1, quota); let mut rx = rx.drain(); let conn = Conn::new(tx, String::new()); let conn_id = conn.get_id(); suite.run(Task::OpenConn { conn }); - suite.run(set_conn_verion_task(conn_id, semver::Version::new(0, 0, 0))); + suite.run(set_conn_version_task( + conn_id, + semver::Version::new(0, 0, 0), + )); let mut req_header = Header::default(); req_header.set_cluster_id(0); @@ -2279,7 +2494,7 @@ mod tests { // Open two connections a and b, registers region 1, 2 to conn a and // region 3 to conn b. 
let mut conn_rxs = vec![]; - let quota = channel::MemoryQuota::new(usize::MAX); + let quota = Arc::new(MemoryQuota::new(usize::MAX)); for region_ids in vec![vec![1, 2], vec![3]] { let (tx, rx) = channel::channel(1, quota.clone()); conn_rxs.push(rx); @@ -2287,7 +2502,7 @@ mod tests { let conn_id = conn.get_id(); suite.run(Task::OpenConn { conn }); let version = FeatureGate::batch_resolved_ts(); - suite.run(set_conn_verion_task(conn_id, version)); + suite.run(set_conn_version_task(conn_id, version)); for region_id in region_ids { suite.add_region(region_id, 100); @@ -2311,7 +2526,8 @@ mod tests { downstream, conn_id, }); - let resolver = Resolver::new(region_id); + let memory_quota = Arc::new(MemoryQuota::new(std::usize::MAX)); + let resolver = Resolver::new(region_id, memory_quota); let observe_id = suite.endpoint.capture_regions[®ion_id].handle.id; let mut region = Region::default(); region.set_id(region_id); @@ -2392,14 +2608,14 @@ mod tests { fn test_deregister_conn_then_delegate() { let mut suite = mock_endpoint(&CdcConfig::default(), None, ApiVersion::V1); suite.add_region(1, 100); - let quota = crate::channel::MemoryQuota::new(usize::MAX); + let quota = Arc::new(MemoryQuota::new(usize::MAX)); // Open conn a let (tx1, _rx1) = channel::channel(1, quota.clone()); let conn_a = Conn::new(tx1, String::new()); let conn_id_a = conn_a.get_id(); suite.run(Task::OpenConn { conn: conn_a }); - suite.run(set_conn_verion_task( + suite.run(set_conn_version_task( conn_id_a, semver::Version::new(0, 0, 0), )); @@ -2410,7 +2626,7 @@ mod tests { let conn_b = Conn::new(tx2, String::new()); let conn_id_b = conn_b.get_id(); suite.run(Task::OpenConn { conn: conn_b }); - suite.run(set_conn_verion_task( + suite.run(set_conn_version_task( conn_id_b, semver::Version::new(0, 0, 0), )); @@ -2470,10 +2686,11 @@ mod tests { let mut region = Region::default(); region.id = 1; region.set_region_epoch(region_epoch_2); + let memory_quota = Arc::new(MemoryQuota::new(std::usize::MAX)); 
suite.run(Task::ResolverReady { observe_id, region: region.clone(), - resolver: Resolver::new(1), + resolver: Resolver::new(1, memory_quota), }); // Deregister deletgate due to epoch not match for conn b. @@ -2557,7 +2774,7 @@ mod tests { ..Default::default() }; let mut suite = mock_endpoint(&cfg, None, ApiVersion::V1); - let quota = crate::channel::MemoryQuota::new(usize::MAX); + let quota = Arc::new(MemoryQuota::new(usize::MAX)); let (tx, mut rx) = channel::channel(1, quota); let mut rx = rx.drain(); @@ -2566,7 +2783,7 @@ mod tests { suite.run(Task::OpenConn { conn }); // Enable batch resolved ts in the test. let version = FeatureGate::batch_resolved_ts(); - suite.run(set_conn_verion_task(conn_id, version)); + suite.run(set_conn_version_task(conn_id, version)); let mut req_header = Header::default(); req_header.set_cluster_id(0); @@ -2596,8 +2813,11 @@ mod tests { conn_id, }); - let mut resolver = Resolver::new(id); - resolver.track_lock(TimeStamp::compose(0, id), vec![], None); + let memory_quota = Arc::new(MemoryQuota::new(std::usize::MAX)); + let mut resolver = Resolver::new(id, memory_quota); + resolver + .track_lock(TimeStamp::compose(0, id), vec![], None) + .unwrap(); let mut region = Region::default(); region.id = id; region.set_region_epoch(region_epoch); @@ -2605,7 +2825,8 @@ mod tests { .capture_regions .get_mut(&id) .unwrap() - .on_region_ready(resolver, region); + .on_region_ready(resolver, region) + .unwrap(); assert!(failed.is_empty()); } suite @@ -2646,7 +2867,7 @@ mod tests { }; let mut suite = mock_endpoint(&cfg, None, ApiVersion::V1); suite.add_region(1, 100); - let quota = crate::channel::MemoryQuota::new(usize::MAX); + let quota = Arc::new(MemoryQuota::new(usize::MAX)); let (tx, mut rx) = channel::channel(1, quota); let mut rx = rx.drain(); @@ -2655,7 +2876,7 @@ mod tests { suite.run(Task::OpenConn { conn }); let version = FeatureGate::batch_resolved_ts(); - suite.run(set_conn_verion_task(conn_id, version)); + 
suite.run(set_conn_version_task(conn_id, version)); let mut req_header = Header::default(); req_header.set_cluster_id(0); @@ -2834,5 +3055,67 @@ mod tests { })); assert_eq!(suite.connections[&conn_id].downstreams_count(), 0); assert_eq!(suite.capture_regions.len(), 0); + for _ in 0..2 { + let cdc_event = channel::recv_timeout(&mut rx, Duration::from_millis(500)) + .unwrap() + .unwrap(); + let check = matches!(cdc_event.0, CdcEvent::Event(e) if { + matches!(e.event, Some(Event_oneof_event::Error(ref err)) if { + err.has_region_not_found() + }) + }); + assert!(check); + } + + // Resubscribe the region. + suite.add_region(2, 100); + for i in 1..=2 { + req.set_request_id(1); + req.set_region_id(i); + let downstream = Downstream::new( + "".to_string(), + region_epoch.clone(), + 1, + conn_id, + ChangeDataRequestKvApi::TiDb, + false, + ObservedRange::default(), + ); + suite.run(Task::Register { + request: req.clone(), + downstream, + conn_id, + }); + assert_eq!(suite.connections[&conn_id].downstreams_count(), i as usize); + } + + // Deregister regions one by one in the request. 
+ suite.run(Task::Deregister(Deregister::Region { + conn_id, + request_id: 1, + region_id: 1, + })); + assert_eq!(suite.connections[&conn_id].downstreams_count(), 1); + assert_eq!(suite.capture_regions.len(), 1); + + suite.run(Task::Deregister(Deregister::Region { + conn_id, + request_id: 1, + region_id: 2, + })); + assert_eq!(suite.connections[&conn_id].downstreams_count(), 0); + assert_eq!(suite.capture_regions.len(), 0); + + for _ in 0..2 { + let cdc_event = channel::recv_timeout(&mut rx, Duration::from_millis(500)) + .unwrap() + .unwrap(); + let check = matches!(cdc_event.0, CdcEvent::Event(e) if { + matches!(e.event, Some(Event_oneof_event::Error(ref err)) if { + err.has_region_not_found() + }) + }); + assert!(check); + } } } diff --git a/components/cdc/src/errors.rs b/components/cdc/src/errors.rs index c9a61c73dc4..e7bd7605e7d 100644 --- a/components/cdc/src/errors.rs +++ b/components/cdc/src/errors.rs @@ -10,6 +10,7 @@ use tikv::storage::{ mvcc::{Error as MvccError, ErrorInner as MvccErrorInner}, txn::{Error as TxnError, ErrorInner as TxnErrorInner}, }; +use tikv_util::memory::MemoryQuotaExceeded; use txn_types::Error as TxnTypesError; use crate::channel::SendError; @@ -35,6 +36,8 @@ pub enum Error { EngineTraits(#[from] EngineTraitsError), #[error("Sink send error {0:?}")] Sink(#[from] SendError), + #[error("Memory quota exceeded")] + MemoryQuotaExceeded(#[from] MemoryQuotaExceeded), } macro_rules! 
impl_from { diff --git a/components/cdc/src/initializer.rs b/components/cdc/src/initializer.rs index 2c0884bb303..ef39a693e3e 100644 --- a/components/cdc/src/initializer.rs +++ b/components/cdc/src/initializer.rs @@ -23,7 +23,7 @@ use raftstore::{ msg::{Callback, ReadResponse}, }, }; -use resolved_ts::Resolver; +use resolved_ts::{Resolver, TsSource}; use tikv::storage::{ kv::Snapshot, mvcc::{DeltaScanner, ScannerBuilder}, @@ -35,7 +35,8 @@ use tikv_kv::Iterator; use tikv_util::{ box_err, codec::number, - debug, error, info, + debug, defer, error, info, + memory::MemoryQuota, sys::inspector::{self_thread_inspector, ThreadInspector}, time::{Instant, Limiter}, warn, @@ -89,7 +90,9 @@ pub(crate) struct Initializer { pub(crate) request_id: u64, pub(crate) checkpoint_ts: TimeStamp, - pub(crate) speed_limiter: Limiter, + pub(crate) scan_speed_limiter: Limiter, + pub(crate) fetch_speed_limiter: Limiter, + pub(crate) max_scan_batch_bytes: usize, pub(crate) max_scan_batch_size: usize, @@ -107,29 +110,11 @@ impl Initializer { change_observer: ChangeObserver, cdc_handle: T, concurrency_semaphore: Arc, + memory_quota: Arc, ) -> Result<()> { fail_point!("cdc_before_initialize"); let _permit = concurrency_semaphore.acquire().await; - // When downstream_state is Stopped, it means the corresponding delegate - // is stopped. The initialization can be safely canceled. - // - // Acquiring a permit may take some time, it is possible that - // initialization can be canceled. 
- if self.downstream_state.load() == DownstreamState::Stopped { - info!("cdc async incremental scan canceled"; - "region_id" => self.region_id, - "downstream_id" => ?self.downstream_id, - "observe_id" => ?self.observe_id, - "conn_id" => ?self.conn_id); - return Err(box_err!("scan canceled")); - } - - CDC_SCAN_TASKS.with_label_values(&["ongoing"]).inc(); - tikv_util::defer!({ - CDC_SCAN_TASKS.with_label_values(&["ongoing"]).dec(); - }); - // To avoid holding too many snapshots and holding them too long, // we need to acquire scan concurrency permit before taking snapshot. let sched = self.sched.clone(); @@ -172,7 +157,7 @@ impl Initializer { } match fut.await { - Ok(resp) => self.on_change_cmd_response(resp).await, + Ok(resp) => self.on_change_cmd_response(resp, memory_quota).await, Err(e) => Err(Error::Other(box_err!(e))), } } @@ -180,11 +165,13 @@ impl Initializer { pub(crate) async fn on_change_cmd_response( &mut self, mut resp: ReadResponse, + memory_quota: Arc, ) -> Result<()> { if let Some(region_snapshot) = resp.snapshot { - assert_eq!(self.region_id, region_snapshot.get_region().get_id()); let region = region_snapshot.get_region().clone(); - self.async_incremental_scan(region_snapshot, region).await + assert_eq!(self.region_id, region.get_id()); + self.async_incremental_scan(region_snapshot, region, memory_quota) + .await } else { assert!( resp.response.get_header().has_error(), @@ -200,11 +187,29 @@ impl Initializer { &mut self, snap: S, region: Region, + memory_quota: Arc, ) -> Result<()> { - let downstream_id = self.downstream_id; + CDC_SCAN_TASKS.with_label_values(&["ongoing"]).inc(); + defer!(CDC_SCAN_TASKS.with_label_values(&["ongoing"]).dec()); + let region_id = region.get_id(); + let downstream_id = self.downstream_id; let observe_id = self.observe_id; + let conn_id = self.conn_id; let kv_api = self.kv_api; + let on_cancel = || -> Result<()> { + info!("cdc async incremental scan canceled"; + "region_id" => region_id, + "downstream_id" => 
?downstream_id, + "observe_id" => ?observe_id, + "conn_id" => ?conn_id); + Err(box_err!("scan canceled")) + }; + + if self.downstream_state.load() == DownstreamState::Stopped { + return on_cancel(); + } + self.observed_range.update_region_key_range(®ion); debug!("cdc async incremental scan"; "region_id" => region_id, @@ -215,7 +220,7 @@ impl Initializer { "end_key" => log_wrappers::Value::key(snap.upper_bound().unwrap_or_default())); let mut resolver = if self.build_resolver { - Some(Resolver::new(region_id)) + Some(Resolver::new(region_id, memory_quota)) } else { None }; @@ -253,7 +258,6 @@ impl Initializer { }; fail_point!("cdc_incremental_scan_start"); - let conn_id = self.conn_id; let mut done = false; let start = Instant::now_coarse(); @@ -263,15 +267,6 @@ impl Initializer { DownstreamState::Initializing | DownstreamState::Stopped )); - let on_cancel = || -> Result<()> { - info!("cdc async incremental scan canceled"; - "region_id" => region_id, - "downstream_id" => ?downstream_id, - "observe_id" => ?observe_id, - "conn_id" => ?conn_id); - Err(box_err!("scan canceled")) - }; - while !done { // When downstream_state is Stopped, it means the corresponding // delegate is stopped. The initialization can be safely canceled. @@ -399,16 +394,14 @@ impl Initializer { perf_delta, } = self.do_scan(scanner, old_value_cursors, &mut entries)?; - CDC_SCAN_BYTES.inc_by(emit as _); TLS_CDC_PERF_STATS.with(|x| *x.borrow_mut() += perf_delta); tls_flush_perf_stats(); - let require = if let Some(bytes) = disk_read { + if let Some(bytes) = disk_read { CDC_SCAN_DISK_READ_BYTES.inc_by(bytes as _); - bytes - } else { - perf_delta.block_read_byte as usize - }; - self.speed_limiter.consume(require).await; + self.scan_speed_limiter.consume(bytes).await; + } + CDC_SCAN_BYTES.inc_by(emit as _); + self.fetch_speed_limiter.consume(emit as _).await; if let Some(resolver) = resolver { // Track the locks. 
@@ -418,7 +411,9 @@ impl Initializer { let key = Key::from_encoded_slice(encoded_key).into_raw().unwrap(); let lock = Lock::parse(value)?; match lock.lock_type { - LockType::Put | LockType::Delete => resolver.track_lock(lock.ts, key, None), + LockType::Put | LockType::Delete => { + resolver.track_lock(lock.ts, key, None)?; + } _ => (), }; } @@ -458,7 +453,7 @@ impl Initializer { fn finish_building_resolver(&self, mut resolver: Resolver, region: Region) { let observe_id = self.observe_id; - let rts = resolver.resolve(TimeStamp::zero(), None); + let rts = resolver.resolve(TimeStamp::zero(), None, TsSource::Cdc); info!( "cdc resolver initialized and schedule resolver ready"; "region_id" => region.get_id(), @@ -568,7 +563,6 @@ mod tests { time::Duration, }; - use collections::HashSet; use engine_rocks::RocksEngine; use engine_traits::{MiscExt, CF_WRITE}; use futures::{executor::block_on, StreamExt}; @@ -577,6 +571,7 @@ mod tests { errorpb::Error as ErrorHeader, }; use raftstore::{coprocessor::ObserveHandle, router::CdcRaftRouter, store::RegionSnapshot}; + use resolved_ts::TxnLocks; use test_raftstore::MockRaftStoreRouter; use tikv::storage::{ kv::Engine, @@ -587,6 +582,7 @@ mod tests { TestEngineBuilder, }; use tikv_util::{ + memory::MemoryQuota, sys::thread::ThreadBuildWrapper, worker::{LazyWorker, Runnable}, }; @@ -616,7 +612,8 @@ mod tests { } fn mock_initializer( - speed_limit: usize, + scan_limit: usize, + fetch_limit: usize, buffer: usize, engine: Option, kv_api: ChangeDataRequestKvApi, @@ -629,7 +626,7 @@ mod tests { crate::channel::Drain, ) { let (receiver_worker, rx) = new_receiver_worker(); - let quota = crate::channel::MemoryQuota::new(usize::MAX); + let quota = Arc::new(MemoryQuota::new(usize::MAX)); let (sink, drain) = crate::channel::channel(buffer, quota); let pool = Builder::new_multi_thread() @@ -657,7 +654,8 @@ mod tests { conn_id: ConnId::new(), request_id: 0, checkpoint_ts: 1.into(), - speed_limiter: Limiter::new(speed_limit as _), + 
scan_speed_limiter: Limiter::new(scan_limit as _), + fetch_speed_limiter: Limiter::new(fetch_limit as _), max_scan_batch_bytes: 1024 * 1024, max_scan_batch_size: 1024, build_resolver: true, @@ -673,7 +671,7 @@ mod tests { fn test_initializer_build_resolver() { let mut engine = TestEngineBuilder::new().build_without_cache().unwrap(); - let mut expected_locks = BTreeMap::>>::new(); + let mut expected_locks = BTreeMap::::new(); // Only observe ["", "b\0x90"] let observed_range = ObservedRange::new( @@ -696,10 +694,12 @@ mod tests { total_bytes += v.len(); let ts = TimeStamp::new(i as _); must_prewrite_put(&mut engine, k, v, k, ts); - expected_locks - .entry(ts) - .or_default() - .insert(k.to_vec().into()); + let txn_locks = expected_locks.entry(ts).or_insert_with(|| { + let mut txn_locks = TxnLocks::default(); + txn_locks.sample_lock = Some(k.to_vec().into()); + txn_locks + }); + txn_locks.lock_count += 1; } let region = Region::default(); @@ -707,6 +707,7 @@ mod tests { // Buffer must be large enough to unblock async incremental scan. 
let buffer = 1000; let (mut worker, pool, mut initializer, rx, mut drain) = mock_initializer( + total_bytes, total_bytes, buffer, engine.kv_engine(), @@ -737,21 +738,37 @@ mod tests { } }); - block_on(initializer.async_incremental_scan(snap.clone(), region.clone())).unwrap(); + let memory_quota = Arc::new(MemoryQuota::new(usize::MAX)); + block_on(initializer.async_incremental_scan( + snap.clone(), + region.clone(), + memory_quota.clone(), + )) + .unwrap(); check_result(); initializer .downstream_state .store(DownstreamState::Initializing); initializer.max_scan_batch_bytes = total_bytes; - block_on(initializer.async_incremental_scan(snap.clone(), region.clone())).unwrap(); + block_on(initializer.async_incremental_scan( + snap.clone(), + region.clone(), + memory_quota.clone(), + )) + .unwrap(); check_result(); initializer .downstream_state .store(DownstreamState::Initializing); initializer.build_resolver = false; - block_on(initializer.async_incremental_scan(snap.clone(), region.clone())).unwrap(); + block_on(initializer.async_incremental_scan( + snap.clone(), + region.clone(), + memory_quota.clone(), + )) + .unwrap(); loop { let task = rx.recv_timeout(Duration::from_millis(100)); @@ -764,7 +781,8 @@ mod tests { // Test cancellation. initializer.downstream_state.store(DownstreamState::Stopped); - block_on(initializer.async_incremental_scan(snap.clone(), region)).unwrap_err(); + block_on(initializer.async_incremental_scan(snap.clone(), region, memory_quota.clone())) + .unwrap_err(); // Cancel error should trigger a deregsiter. let mut region = Region::default(); @@ -776,14 +794,15 @@ mod tests { response: Default::default(), txn_extra_op: Default::default(), }; - block_on(initializer.on_change_cmd_response(resp.clone())).unwrap_err(); + block_on(initializer.on_change_cmd_response(resp.clone(), memory_quota.clone())) + .unwrap_err(); // Disconnect sink by dropping runtime (it also drops drain). 
drop(pool); initializer .downstream_state .store(DownstreamState::Initializing); - block_on(initializer.on_change_cmd_response(resp)).unwrap_err(); + block_on(initializer.on_change_cmd_response(resp, memory_quota)).unwrap_err(); worker.stop(); } @@ -804,6 +823,7 @@ mod tests { // Buffer must be large enough to unblock async incremental scan. let buffer = 1000; let (mut worker, pool, mut initializer, _rx, mut drain) = mock_initializer( + total_bytes, total_bytes, buffer, engine.kv_engine(), @@ -811,8 +831,9 @@ mod tests { filter_loop, ); let th = pool.spawn(async move { + let memory_quota = Arc::new(MemoryQuota::new(usize::MAX)); initializer - .async_incremental_scan(snap, Region::default()) + .async_incremental_scan(snap, Region::default(), memory_quota) .await .unwrap(); }); @@ -885,6 +906,7 @@ mod tests { // Do incremental scan with different `hint_min_ts` values. for checkpoint_ts in [200, 100, 150] { let (mut worker, pool, mut initializer, _rx, mut drain) = mock_initializer( + usize::MAX, usize::MAX, 1000, engine.kv_engine(), @@ -896,8 +918,9 @@ mod tests { let snap = engine.snapshot(Default::default()).unwrap(); let th = pool.spawn(async move { + let memory_qutoa = Arc::new(MemoryQuota::new(usize::MAX)); initializer - .async_incremental_scan(snap, Region::default()) + .async_incremental_scan(snap, Region::default(), memory_qutoa) .await .unwrap(); }); @@ -949,6 +972,7 @@ mod tests { let total_bytes = 1; let buffer = 1; let (mut worker, _pool, mut initializer, rx, _drain) = mock_initializer( + total_bytes, total_bytes, buffer, None, @@ -1004,17 +1028,19 @@ mod tests { let total_bytes = 1; let buffer = 1; let (mut worker, pool, mut initializer, _rx, _drain) = - mock_initializer(total_bytes, buffer, None, kv_api, false); + mock_initializer(total_bytes, total_bytes, buffer, None, kv_api, false); let change_cmd = ChangeObserver::from_cdc(1, ObserveHandle::new()); let raft_router = CdcRaftRouter(MockRaftStoreRouter::new()); let concurrency_semaphore = 
Arc::new(Semaphore::new(1)); + let memory_quota = Arc::new(MemoryQuota::new(usize::MAX)); initializer.downstream_state.store(DownstreamState::Stopped); block_on(initializer.initialize( change_cmd, raft_router.clone(), concurrency_semaphore.clone(), + memory_quota.clone(), )) .unwrap_err(); @@ -1040,7 +1066,7 @@ mod tests { &concurrency_semaphore, ); let res = initializer - .initialize(change_cmd, raft_router, concurrency_semaphore) + .initialize(change_cmd, raft_router, concurrency_semaphore, memory_quota) .await; tx1.send(res).unwrap(); }); diff --git a/components/cdc/src/lib.rs b/components/cdc/src/lib.rs index c913cefb92e..64f110f5c45 100644 --- a/components/cdc/src/lib.rs +++ b/components/cdc/src/lib.rs @@ -15,7 +15,7 @@ mod old_value; mod service; mod txn_source; -pub use channel::{recv_timeout, CdcEvent, MemoryQuota}; +pub use channel::{recv_timeout, CdcEvent}; pub use config::CdcConfigManager; pub use delegate::Delegate; pub use endpoint::{CdcTxnExtraScheduler, Endpoint, Task, Validate}; diff --git a/components/cdc/src/observer.rs b/components/cdc/src/observer.rs index aac2842e404..965a31ac7ff 100644 --- a/components/cdc/src/observer.rs +++ b/components/cdc/src/observer.rs @@ -121,7 +121,8 @@ impl CmdObserver for CdcObserver { // Create a snapshot here for preventing the old value was GC-ed. // TODO: only need it after enabling old value, may add a flag to indicate // whether to get it. - let snapshot = RegionSnapshot::from_snapshot(Arc::new(engine.snapshot()), Arc::new(region)); + let snapshot = + RegionSnapshot::from_snapshot(Arc::new(engine.snapshot(None)), Arc::new(region)); let get_old_value = move |key, query_ts, old_value_cache: &mut OldValueCache, @@ -177,20 +178,26 @@ impl RegionChangeObserver for CdcObserver { event: RegionChangeEvent, _: StateRole, ) { - if let RegionChangeEvent::Destroy = event { - let region_id = ctx.region().get_id(); - if let Some(observe_id) = self.is_subscribed(region_id) { - // Unregister all downstreams. 
- let store_err = RaftStoreError::RegionNotFound(region_id); - let deregister = Deregister::Delegate { - region_id, - observe_id, - err: CdcError::request(store_err.into()), - }; - if let Err(e) = self.sched.schedule(Task::Deregister(deregister)) { - error!("cdc schedule cdc task failed"; "error" => ?e); + match event { + RegionChangeEvent::Destroy + | RegionChangeEvent::Update( + RegionChangeReason::Split | RegionChangeReason::CommitMerge, + ) => { + let region_id = ctx.region().get_id(); + if let Some(observe_id) = self.is_subscribed(region_id) { + // Unregister all downstreams. + let store_err = RaftStoreError::RegionNotFound(region_id); + let deregister = Deregister::Delegate { + region_id, + observe_id, + err: CdcError::request(store_err.into()), + }; + if let Err(e) = self.sched.schedule(Task::Deregister(deregister)) { + error!("cdc schedule cdc task failed"; "error" => ?e); + } } } + _ => {} } } } diff --git a/components/cdc/src/old_value.rs b/components/cdc/src/old_value.rs index e343ccc226f..02f1bd00507 100644 --- a/components/cdc/src/old_value.rs +++ b/components/cdc/src/old_value.rs @@ -308,7 +308,7 @@ mod tests { value: Option, ) -> Statistics { let key = key.clone().append_ts(ts.into()); - let snapshot = Arc::new(kv_engine.snapshot()); + let snapshot = Arc::new(kv_engine.snapshot(None)); let mut cursor = new_write_cursor_on_key(&snapshot, &key); let load_default = Either::Left(&snapshot); let mut stats = Statistics::default(); @@ -527,7 +527,7 @@ mod tests { must_commit(&mut engine, &key, 200, 201); } - let snapshot = Arc::new(kv_engine.snapshot()); + let snapshot = Arc::new(kv_engine.snapshot(None)); let mut cursor = new_old_value_cursor(&snapshot, CF_WRITE); let mut default_cursor = new_old_value_cursor(&snapshot, CF_DEFAULT); let mut load_default = |use_default_cursor: bool| { @@ -598,7 +598,7 @@ mod tests { } let key = format!("zkey-{:0>3}", 0).into_bytes(); - let snapshot = Arc::new(kv_engine.snapshot()); + let snapshot = 
Arc::new(kv_engine.snapshot(None)); let perf_instant = ReadPerfInstant::new(); let value = get_old_value( &snapshot, diff --git a/components/cdc/src/service.rs b/components/cdc/src/service.rs index d07b5283380..7cbf268f2b7 100644 --- a/components/cdc/src/service.rs +++ b/components/cdc/src/service.rs @@ -16,10 +16,10 @@ use kvproto::{ }, kvrpcpb::ApiVersion, }; -use tikv_util::{error, info, warn, worker::*}; +use tikv_util::{error, info, memory::MemoryQuota, warn, worker::*}; use crate::{ - channel::{channel, MemoryQuota, Sink, CDC_CHANNLE_CAPACITY}, + channel::{channel, Sink, CDC_CHANNLE_CAPACITY}, delegate::{Downstream, DownstreamId, DownstreamState, ObservedRange}, endpoint::{Deregister, Task}, }; @@ -244,14 +244,14 @@ impl EventFeedHeaders { #[derive(Clone)] pub struct Service { scheduler: Scheduler, - memory_quota: MemoryQuota, + memory_quota: Arc, } impl Service { /// Create a ChangeData service. /// /// It requires a scheduler of an `Endpoint` in order to schedule tasks. - pub fn new(scheduler: Scheduler, memory_quota: MemoryQuota) -> Service { + pub fn new(scheduler: Scheduler, memory_quota: Arc) -> Service { Service { scheduler, memory_quota, @@ -304,6 +304,13 @@ impl Service { scheduler.schedule(task).map_err(|e| format!("{:?}", e)) } + // ### Command types: + // * Register registers a region. 1) both `request_id` and `region_id` must be + // specified; 2) `request_id` can be 0 but `region_id` can not. + // * Deregister deregisters some regions in one same `request_id` or just one + // region. 1) if both `request_id` and `region_id` are specified, just + // deregister the region; 2) if only `request_id` is specified, all region + // subscriptions with the same `request_id` will be deregistered. 
fn handle_request( scheduler: &Scheduler, peer: &str, @@ -361,10 +368,18 @@ impl Service { request: ChangeDataRequest, conn_id: ConnId, ) -> Result<(), String> { - let task = Task::Deregister(Deregister::Request { - conn_id, - request_id: request.request_id, - }); + let task = if request.region_id != 0 { + Task::Deregister(Deregister::Region { + conn_id, + request_id: request.request_id, + region_id: request.region_id, + }) + } else { + Task::Deregister(Deregister::Request { + conn_id, + request_id: request.request_id, + }) + }; scheduler.schedule(task).map_err(|e| format!("{:?}", e)) } @@ -518,7 +533,7 @@ mod tests { use crate::channel::{recv_timeout, CdcEvent}; fn new_rpc_suite(capacity: usize) -> (Server, ChangeDataClient, ReceiverWrapper) { - let memory_quota = MemoryQuota::new(capacity); + let memory_quota = Arc::new(MemoryQuota::new(capacity)); let (scheduler, rx) = dummy_scheduler(); let cdc_service = Service::new(scheduler, memory_quota); let env = Arc::new(EnvBuilder::new().build()); diff --git a/components/cdc/tests/failpoints/mod.rs b/components/cdc/tests/failpoints/mod.rs index 082b1c15f67..619ee200985 100644 --- a/components/cdc/tests/failpoints/mod.rs +++ b/components/cdc/tests/failpoints/mod.rs @@ -4,6 +4,7 @@ #![test_runner(test_util::run_failpoint_tests)] mod test_endpoint; +mod test_memory_quota; mod test_observe; mod test_register; mod test_resolve; diff --git a/components/cdc/tests/failpoints/test_endpoint.rs b/components/cdc/tests/failpoints/test_endpoint.rs index f7cc387625d..42977cc3856 100644 --- a/components/cdc/tests/failpoints/test_endpoint.rs +++ b/components/cdc/tests/failpoints/test_endpoint.rs @@ -569,3 +569,29 @@ fn test_cdc_stream_multiplexing() { } assert!(request_2_ready); } + +// This case tests pending regions can still get region split/merge +// notifications. 
+#[test] +fn test_cdc_notify_pending_regions() { + let cluster = new_server_cluster(0, 1); + cluster.pd_client.disable_default_operator(); + let mut suite = TestSuiteBuilder::new().cluster(cluster).build(); + let region = suite.cluster.get_region(&[]); + let rid = region.id; + let (mut req_tx, _, receive_event) = new_event_feed_v2(suite.get_region_cdc_client(rid)); + + fail::cfg("cdc_before_initialize", "pause").unwrap(); + let mut req = suite.new_changedata_request(rid); + req.request_id = 1; + block_on(req_tx.send((req, WriteFlags::default()))).unwrap(); + + thread::sleep(Duration::from_millis(100)); + suite.cluster.must_split(&region, b"x"); + let event = receive_event(false); + matches!( + event.get_events()[0].event, + Some(Event_oneof_event::Error(ref e)) if e.has_region_not_found(), + ); + fail::remove("cdc_before_initialize"); +} diff --git a/components/cdc/tests/failpoints/test_memory_quota.rs b/components/cdc/tests/failpoints/test_memory_quota.rs new file mode 100644 index 00000000000..5b564ba61ec --- /dev/null +++ b/components/cdc/tests/failpoints/test_memory_quota.rs @@ -0,0 +1,289 @@ +// Copyright 2019 TiKV Project Authors. Licensed under Apache-2.0. + +use std::{sync::*, time::Duration}; + +use cdc::{Task, Validate}; +use futures::{executor::block_on, SinkExt}; +use grpcio::WriteFlags; +use kvproto::{cdcpb::*, kvrpcpb::*}; +use pd_client::PdClient; +use test_raftstore::*; + +use crate::{new_event_feed, TestSuiteBuilder}; + +#[test] +fn test_resolver_track_lock_memory_quota_exceeded() { + let mut cluster = new_server_cluster(1, 1); + // Increase the Raft tick interval to make this test case running reliably. + configure_for_lease_read(&mut cluster.cfg, Some(100), None); + let memory_quota = 1024; // 1KB + let mut suite = TestSuiteBuilder::new() + .cluster(cluster) + .memory_quota(memory_quota) + .build(); + + // Let CdcEvent size be 0 to effectively disable memory quota for CdcEvent.
+ fail::cfg("cdc_event_size", "return(0)").unwrap(); + + let req = suite.new_changedata_request(1); + let (mut req_tx, _event_feed_wrap, receive_event) = + new_event_feed(suite.get_region_cdc_client(1)); + block_on(req_tx.send((req, WriteFlags::default()))).unwrap(); + let event = receive_event(false); + event.events.into_iter().for_each(|e| { + match e.event.unwrap() { + // Even if there is no write, + // it should always outputs an Initialized event. + Event_oneof_event::Entries(es) => { + assert!(es.entries.len() == 1, "{:?}", es); + let e = &es.entries[0]; + assert_eq!(e.get_type(), EventLogType::Initialized, "{:?}", es); + } + other => panic!("unknown event {:?}", other), + } + }); + + // Client must receive messages when there is no congest error. + let key_size = memory_quota / 2; + let (k, v) = (vec![1; key_size], vec![5]); + // Prewrite + let start_ts = block_on(suite.cluster.pd_client.get_tso()).unwrap(); + let mut mutation = Mutation::default(); + mutation.set_op(Op::Put); + mutation.key = k.clone(); + mutation.value = v; + suite.must_kv_prewrite(1, vec![mutation], k, start_ts); + let mut events = receive_event(false).events.to_vec(); + assert_eq!(events.len(), 1, "{:?}", events); + match events.pop().unwrap().event.unwrap() { + Event_oneof_event::Entries(entries) => { + assert_eq!(entries.entries.len(), 1); + assert_eq!(entries.entries[0].get_type(), EventLogType::Prewrite); + } + other => panic!("unknown event {:?}", other), + } + + // Trigger congest error. 
+ let key_size = memory_quota * 2; + let (k, v) = (vec![2; key_size], vec![5]); + // Prewrite + let start_ts = block_on(suite.cluster.pd_client.get_tso()).unwrap(); + let mut mutation = Mutation::default(); + mutation.set_op(Op::Put); + mutation.key = k.clone(); + mutation.value = v; + suite.must_kv_prewrite(1, vec![mutation], k, start_ts); + let mut events = receive_event(false).events.to_vec(); + assert_eq!(events.len(), 1, "{:?}", events); + match events.pop().unwrap().event.unwrap() { + Event_oneof_event::Error(e) => { + // Unknown errors are translated into region_not_found. + assert!(e.has_region_not_found(), "{:?}", e); + } + other => panic!("unknown event {:?}", other), + } + + // The delegate must be removed. + let scheduler = suite.endpoints.values().next().unwrap().scheduler(); + let (tx, rx) = mpsc::channel(); + scheduler + .schedule(Task::Validate(Validate::Region( + 1, + Box::new(move |delegate| { + tx.send(delegate.is_none()).unwrap(); + }), + ))) + .unwrap(); + + assert!( + rx.recv_timeout(Duration::from_millis(1000)).unwrap(), + "find unexpected delegate" + ); + + suite.stop(); +} + +#[test] +fn test_pending_on_region_ready_memory_quota_exceeded() { + let mut cluster = new_server_cluster(1, 1); + // Increase the Raft tick interval to make this test case running reliably. + configure_for_lease_read(&mut cluster.cfg, Some(100), None); + let memory_quota = 1024; // 1KB + let mut suite = TestSuiteBuilder::new() + .cluster(cluster) + .memory_quota(memory_quota) + .build(); + + // Let CdcEvent size be 0 to effectively disable memory quota for CdcEvent. + fail::cfg("cdc_event_size", "return(0)").unwrap(); + + // Trigger memory quota exceeded error. 
+ fail::cfg("cdc_pending_on_region_ready", "return").unwrap(); + let req = suite.new_changedata_request(1); + let (mut req_tx, _event_feed_wrap, receive_event) = + new_event_feed(suite.get_region_cdc_client(1)); + block_on(req_tx.send((req, WriteFlags::default()))).unwrap(); + let event = receive_event(false); + event.events.into_iter().for_each(|e| { + match e.event.unwrap() { + // Even if there is no write, + // it should always outputs an Initialized event. + Event_oneof_event::Entries(es) => { + assert!(es.entries.len() == 1, "{:?}", es); + let e = &es.entries[0]; + assert_eq!(e.get_type(), EventLogType::Initialized, "{:?}", es); + } + other => panic!("unknown event {:?}", other), + } + }); + // MemoryQuotaExceeded error is triggered on_region_ready. + let mut events = receive_event(false).events.to_vec(); + assert_eq!(events.len(), 1, "{:?}", events); + match events.pop().unwrap().event.unwrap() { + Event_oneof_event::Error(e) => { + // Unknown errors are translated into region_not_found. + assert!(e.has_region_not_found(), "{:?}", e); + } + other => panic!("unknown event {:?}", other), + } + + // The delegate must be removed. + let scheduler = suite.endpoints.values().next().unwrap().scheduler(); + let (tx, rx) = mpsc::channel(); + scheduler + .schedule(Task::Validate(Validate::Region( + 1, + Box::new(move |delegate| { + tx.send(delegate.is_none()).unwrap(); + }), + ))) + .unwrap(); + + assert!( + rx.recv_timeout(Duration::from_millis(1000)).unwrap(), + "find unexpected delegate" + ); + + fail::remove("cdc_incremental_scan_start"); + suite.stop(); +} + +#[test] +fn test_pending_push_lock_memory_quota_exceeded() { + let mut cluster = new_server_cluster(1, 1); + // Increase the Raft tick interval to make this test case running reliably. 
+ configure_for_lease_read(&mut cluster.cfg, Some(100), None); + let memory_quota = 1024; // 1KB + let mut suite = TestSuiteBuilder::new() + .cluster(cluster) + .memory_quota(memory_quota) + .build(); + + // Let CdcEvent size be 0 to effectively disable memory quota for CdcEvent. + fail::cfg("cdc_event_size", "return(0)").unwrap(); + + // Pause scan so that no region can be initialized, and all locks will be + // put in pending locks. + fail::cfg("cdc_incremental_scan_start", "pause").unwrap(); + + let req = suite.new_changedata_request(1); + let (mut req_tx, _event_feed_wrap, receive_event) = + new_event_feed(suite.get_region_cdc_client(1)); + block_on(req_tx.send((req, WriteFlags::default()))).unwrap(); + + // Trigger congest error. + let key_size = memory_quota * 2; + let (k, v) = (vec![1; key_size], vec![5]); + let start_ts = block_on(suite.cluster.pd_client.get_tso()).unwrap(); + let mut mutation = Mutation::default(); + mutation.set_op(Op::Put); + mutation.key = k.clone(); + mutation.value = v; + suite.must_kv_prewrite(1, vec![mutation], k, start_ts); + let mut events = receive_event(false).events.to_vec(); + assert_eq!(events.len(), 1, "{:?}", events); + match events.pop().unwrap().event.unwrap() { + Event_oneof_event::Error(e) => { + // Unknown errors are translated into region_not_found. + assert!(e.has_region_not_found(), "{:?}", e); + } + other => panic!("unknown event {:?}", other), + } + + // The delegate must be removed. 
+ let scheduler = suite.endpoints.values().next().unwrap().scheduler(); + let (tx, rx) = mpsc::channel(); + scheduler + .schedule(Task::Validate(Validate::Region( + 1, + Box::new(move |delegate| { + tx.send(delegate.is_none()).unwrap(); + }), + ))) + .unwrap(); + + assert!( + rx.recv_timeout(Duration::from_millis(1000)).unwrap(), + "find unexpected delegate" + ); + + fail::remove("cdc_incremental_scan_start"); + suite.stop(); +} + +#[test] +fn test_scan_lock_memory_quota_exceeded() { + let mut cluster = new_server_cluster(1, 1); + // Increase the Raft tick interval to make this test case running reliably. + configure_for_lease_read(&mut cluster.cfg, Some(100), None); + let memory_quota = 1024; // 1KB + let mut suite = TestSuiteBuilder::new() + .cluster(cluster) + .memory_quota(memory_quota) + .build(); + + // Let CdcEvent size be 0 to effectively disable memory quota for CdcEvent. + fail::cfg("cdc_event_size", "return(0)").unwrap(); + + // Put a lock that exceeds memory quota. + let key_size = memory_quota * 2; + let (k, v) = (vec![1; key_size], vec![5]); + let start_ts = block_on(suite.cluster.pd_client.get_tso()).unwrap(); + let mut mutation = Mutation::default(); + mutation.set_op(Op::Put); + mutation.key = k.clone(); + mutation.value = v; + suite.must_kv_prewrite(1, vec![mutation], k, start_ts); + + // No region can be initialized. + let req = suite.new_changedata_request(1); + let (mut req_tx, _event_feed_wrap, receive_event) = + new_event_feed(suite.get_region_cdc_client(1)); + block_on(req_tx.send((req, WriteFlags::default()))).unwrap(); + let mut events = receive_event(false).events.to_vec(); + assert_eq!(events.len(), 1, "{:?}", events); + match events.pop().unwrap().event.unwrap() { + Event_oneof_event::Error(e) => { + // Unknown errors are translated into region_not_found. 
+ assert!(e.has_region_not_found(), "{:?}", e); + } + other => panic!("unknown event {:?}", other), + } + let scheduler = suite.endpoints.values().next().unwrap().scheduler(); + let (tx, rx) = mpsc::channel(); + scheduler + .schedule(Task::Validate(Validate::Region( + 1, + Box::new(move |delegate| { + tx.send(delegate.is_none()).unwrap(); + }), + ))) + .unwrap(); + + assert!( + rx.recv_timeout(Duration::from_millis(1000)).unwrap(), + "find unexpected delegate" + ); + + suite.stop(); +} diff --git a/components/cdc/tests/failpoints/test_register.rs b/components/cdc/tests/failpoints/test_register.rs index 4558397f8a9..2b6be3744af 100644 --- a/components/cdc/tests/failpoints/test_register.rs +++ b/components/cdc/tests/failpoints/test_register.rs @@ -165,7 +165,11 @@ fn test_connections_register_impl() { let mut events = receive_event(false).events.to_vec(); match events.pop().unwrap().event.unwrap() { Event_oneof_event::Error(err) => { - assert!(err.has_epoch_not_match(), "{:?}", err); + assert!( + err.has_epoch_not_match() || err.has_region_not_found(), + "{:?}", + err + ); } other => panic!("unknown event {:?}", other), } diff --git a/components/cdc/tests/mod.rs b/components/cdc/tests/mod.rs index d2c4519d50d..b85c1db4493 100644 --- a/components/cdc/tests/mod.rs +++ b/components/cdc/tests/mod.rs @@ -6,7 +6,7 @@ use std::{ }; use causal_ts::CausalTsProvider; -use cdc::{recv_timeout, CdcObserver, Delegate, FeatureGate, MemoryQuota, Task, Validate}; +use cdc::{recv_timeout, CdcObserver, Delegate, FeatureGate, Task, Validate}; use collections::HashMap; use concurrency_manager::ConcurrencyManager; use engine_rocks::RocksEngine; @@ -26,6 +26,7 @@ use test_raftstore::*; use tikv::{config::CdcConfig, server::DEFAULT_CLUSTER_ID, storage::kv::LocalTablets}; use tikv_util::{ config::ReadableDuration, + memory::MemoryQuota, worker::{LazyWorker, Runnable}, HandyRwLock, }; @@ -129,7 +130,7 @@ fn create_event_feed( } pub struct TestSuiteBuilder { - cluster: Option>, + cluster: 
Option>>, memory_quota: Option, } @@ -142,7 +143,10 @@ impl TestSuiteBuilder { } #[must_use] - pub fn cluster(mut self, cluster: Cluster) -> TestSuiteBuilder { + pub fn cluster( + mut self, + cluster: Cluster>, + ) -> TestSuiteBuilder { self.cluster = Some(cluster); self } @@ -159,7 +163,7 @@ impl TestSuiteBuilder { pub fn build_with_cluster_runner(self, mut runner: F) -> TestSuite where - F: FnMut(&mut Cluster), + F: FnMut(&mut Cluster>), { init(); let memory_quota = self.memory_quota.unwrap_or(usize::MAX); @@ -167,6 +171,7 @@ impl TestSuiteBuilder { let count = cluster.count; let pd_cli = cluster.pd_client.clone(); let mut endpoints = HashMap::default(); + let mut quotas = HashMap::default(); let mut obs = HashMap::default(); let mut concurrency_managers = HashMap::default(); // Hack! node id are generated from 1..count+1. @@ -176,15 +181,14 @@ impl TestSuiteBuilder { let mut sim = cluster.sim.wl(); // Register cdc service to gRPC server. + let memory_quota = Arc::new(MemoryQuota::new(memory_quota)); + let memory_quota_ = memory_quota.clone(); let scheduler = worker.scheduler(); sim.pending_services .entry(id) .or_default() .push(Box::new(move || { - create_change_data(cdc::Service::new( - scheduler.clone(), - MemoryQuota::new(memory_quota), - )) + create_change_data(cdc::Service::new(scheduler.clone(), memory_quota_.clone())) })); sim.txn_extra_schedulers.insert( id, @@ -199,6 +203,7 @@ impl TestSuiteBuilder { }, )); endpoints.insert(id, worker); + quotas.insert(id, memory_quota); } runner(&mut cluster); @@ -223,7 +228,7 @@ impl TestSuiteBuilder { cm.clone(), env, sim.security_mgr.clone(), - MemoryQuota::new(usize::MAX), + quotas[id].clone(), sim.get_causal_ts_provider(*id), ); let mut updated_cfg = cfg.clone(); @@ -247,7 +252,7 @@ impl TestSuiteBuilder { } pub struct TestSuite { - pub cluster: Cluster, + pub cluster: Cluster>, pub endpoints: HashMap>, pub obs: HashMap, tikv_cli: HashMap, diff --git a/components/cloud/Cargo.toml b/components/cloud/Cargo.toml 
index 3931370390e..b5f1e8faffd 100644 --- a/components/cloud/Cargo.toml +++ b/components/cloud/Cargo.toml @@ -11,7 +11,7 @@ error_code = { workspace = true } futures-io = "0.3" kvproto = { workspace = true } lazy_static = "1.3" -openssl = "0.10" +openssl = { workspace = true } prometheus = { version = "0.13", default-features = false, features = ["nightly"] } protobuf = { version = "2.8", features = ["bytes"] } rusoto_core = "0.46.0" diff --git a/components/cloud/aws/src/s3.rs b/components/cloud/aws/src/s3.rs index a7ea47ec9d2..f06d86b37cb 100644 --- a/components/cloud/aws/src/s3.rs +++ b/components/cloud/aws/src/s3.rs @@ -1,5 +1,9 @@ // Copyright 2019 TiKV Project Authors. Licensed under Apache-2.0. -use std::{error::Error as StdError, io, time::Duration}; +use std::{ + error::Error as StdError, + io, + time::{Duration, SystemTime}, +}; use async_trait::async_trait; use cloud::{ @@ -12,10 +16,11 @@ use futures_util::{ io::{AsyncRead, AsyncReadExt}, stream::TryStreamExt, }; -pub use kvproto::brpb::{Bucket as InputBucket, CloudDynamic, S3 as InputConfig}; +pub use kvproto::brpb::{Bucket as InputBucket, S3 as InputConfig}; use rusoto_core::{request::DispatchSignedRequest, ByteStream, RusotoError}; use rusoto_credential::{ProvideAwsCredentials, StaticProvider}; use rusoto_s3::{util::AddressingStyle, *}; +use rusoto_sts::{StsAssumeRoleSessionCredentialsProvider, StsClient}; use thiserror::Error; use tikv_util::{debug, stream::error_stream, time::Instant}; use tokio::time::{sleep, timeout}; @@ -29,6 +34,7 @@ pub const STORAGE_VENDOR_NAME_AWS: &str = "aws"; pub struct AccessKeyPair { pub access_key: StringNonEmpty, pub secret_access_key: StringNonEmpty, + pub session_token: Option, } impl std::fmt::Debug for AccessKeyPair { @@ -36,6 +42,7 @@ impl std::fmt::Debug for AccessKeyPair { f.debug_struct("AccessKeyPair") .field("access_key", &self.access_key) .field("secret_access_key", &"?") + .field("session_token", &self.session_token) .finish() } } @@ -51,6 +58,8 @@ pub 
struct Config { storage_class: Option, multi_part_size: usize, object_lock_enabled: bool, + role_arn: Option, + external_id: Option, } impl Config { @@ -66,42 +75,11 @@ impl Config { storage_class: None, multi_part_size: MINIMUM_PART_SIZE, object_lock_enabled: false, + role_arn: None, + external_id: None, } } - pub fn from_cloud_dynamic(cloud_dynamic: &CloudDynamic) -> io::Result { - let bucket = BucketConf::from_cloud_dynamic(cloud_dynamic)?; - let attrs = &cloud_dynamic.attrs; - let def = &String::new(); - let force_path_style_str = attrs.get("force_path_style").unwrap_or(def).clone(); - let force_path_style = force_path_style_str == "true" || force_path_style_str == "True"; - let access_key_opt = attrs.get("access_key"); - let access_key_pair = if let Some(access_key) = access_key_opt { - let secret_access_key = attrs.get("secret_access_key").unwrap_or(def).clone(); - Some(AccessKeyPair { - access_key: StringNonEmpty::required_field(access_key.clone(), "access_key")?, - secret_access_key: StringNonEmpty::required_field( - secret_access_key, - "secret_access_key", - )?, - }) - } else { - None - }; - let storage_class = bucket.storage_class.clone(); - Ok(Config { - bucket, - storage_class, - sse: StringNonEmpty::opt(attrs.get("sse").unwrap_or(def).clone()), - acl: StringNonEmpty::opt(attrs.get("acl").unwrap_or(def).clone()), - access_key_pair, - force_path_style, - sse_kms_key_id: StringNonEmpty::opt(attrs.get("sse_kms_key_id").unwrap_or(def).clone()), - multi_part_size: MINIMUM_PART_SIZE, - object_lock_enabled: false, - }) - } - pub fn from_input(input: InputConfig) -> io::Result { let storage_class = StringNonEmpty::opt(input.storage_class); let endpoint = StringNonEmpty::opt(input.endpoint); @@ -114,13 +92,17 @@ impl Config { }; let access_key_pair = match StringNonEmpty::opt(input.access_key) { None => None, - Some(ak) => Some(AccessKeyPair { - access_key: ak, - secret_access_key: StringNonEmpty::required_field( - input.secret_access_key, - 
"secret_access_key", - )?, - }), + Some(ak) => { + let session_token = StringNonEmpty::opt(input.session_token); + Some(AccessKeyPair { + access_key: ak, + secret_access_key: StringNonEmpty::required_field( + input.secret_access_key, + "secret_access_key", + )?, + session_token, + }) + } }; Ok(Config { storage_class, @@ -132,6 +114,8 @@ impl Config { sse_kms_key_id: StringNonEmpty::opt(input.sse_kms_key_id), multi_part_size: MINIMUM_PART_SIZE, object_lock_enabled: input.object_lock_enabled, + role_arn: StringNonEmpty::opt(input.role_arn), + external_id: StringNonEmpty::opt(input.external_id), }) } } @@ -162,10 +146,6 @@ impl S3Storage { Self::new(Config::from_input(input)?) } - pub fn from_cloud_dynamic(cloud_dynamic: &CloudDynamic) -> io::Result { - Self::new(Config::from_cloud_dynamic(cloud_dynamic)?) - } - pub fn set_multi_part_size(&mut self, mut size: usize) { if size < MINIMUM_PART_SIZE { // default multi_part_size is 5MB, S3 cannot allow a smaller size. @@ -198,20 +178,59 @@ impl S3Storage { Ok(S3Storage { config, client }) } + fn maybe_assume_role( + config: Config, + cred_provider: P, + dispatcher: D, + ) -> io::Result + where + P: ProvideAwsCredentials + Send + Sync + 'static, + D: DispatchSignedRequest + Send + Sync + 'static, + { + if config.role_arn.is_some() { + // try use role arn anyway with current creds when it's not nil. + let bucket_region = none_to_empty(config.bucket.region.clone()); + let bucket_endpoint = config.bucket.endpoint.clone(); + let region = util::get_region(&bucket_region, &none_to_empty(bucket_endpoint))?; + // cannot use the same dispatcher because of move, so use another http client. 
+ let sts = StsClient::new_with(util::new_http_client()?, cred_provider, region); + let duration_since_epoch = SystemTime::now() + .duration_since(SystemTime::UNIX_EPOCH) + .unwrap(); + let timestamp_secs = duration_since_epoch.as_secs(); + let cred_provider = StsAssumeRoleSessionCredentialsProvider::new( + sts, + String::clone(config.role_arn.as_deref().unwrap()), + format!("{}", timestamp_secs), + config.external_id.as_deref().map(String::clone), + // default duration is 15min + None, + None, + None, + ); + Self::new_creds_dispatcher(config, dispatcher, cred_provider) + } else { + // or just use original cred_provider to access s3. + Self::new_creds_dispatcher(config, dispatcher, cred_provider) + } + } + pub fn with_request_dispatcher(config: Config, dispatcher: D) -> io::Result where D: DispatchSignedRequest + Send + Sync + 'static, { // static credentials are used with minio if let Some(access_key_pair) = &config.access_key_pair { - let cred_provider = StaticProvider::new_minimal( + let cred_provider = StaticProvider::new( (*access_key_pair.access_key).to_owned(), (*access_key_pair.secret_access_key).to_owned(), + access_key_pair.session_token.as_deref().map(String::clone), + None, ); - Self::new_creds_dispatcher(config, dispatcher, cred_provider) + Self::maybe_assume_role(config, cred_provider, dispatcher) } else { let cred_provider = util::CredentialsProvider::new()?; - Self::new_creds_dispatcher(config, dispatcher, cred_provider) + Self::maybe_assume_role(config, cred_provider, dispatcher) } } @@ -290,7 +309,7 @@ impl From> for UploadError { } } -/// try_read_exact tries to read exact length data as the buffer size. +/// try_read_exact tries to read exact length data as the buffer size. /// like [`std::io::Read::read_exact`], but won't return `UnexpectedEof` when /// cannot read anything more from the `Read`. once returning a size less than /// the buffer length, implies a EOF was meet, or nothing read. 
@@ -311,6 +330,8 @@ async fn try_read_exact( } } +// NOTICE: the openssl fips doesn't support md5, therefore use md5 package to +// hash fn get_content_md5(object_lock_enabled: bool, content: &[u8]) -> Option { object_lock_enabled.then(|| { let digest = md5::compute(content); @@ -637,6 +658,7 @@ mod tests { config.access_key_pair = Some(AccessKeyPair { access_key: StringNonEmpty::required("abc".to_string()).unwrap(), secret_access_key: StringNonEmpty::required("xyz".to_string()).unwrap(), + session_token: Some(StringNonEmpty::required("token".to_string()).unwrap()), }); let mut s = S3Storage::new(config.clone()).unwrap(); // set a less than 5M value not work @@ -871,66 +893,6 @@ mod tests { ); } - #[test] - fn test_config_round_trip() { - let mut input = InputConfig::default(); - input.set_bucket("bucket".to_owned()); - input.set_prefix("backup 02/prefix/".to_owned()); - input.set_region("us-west-2".to_owned()); - let c1 = Config::from_input(input.clone()).unwrap(); - let c2 = Config::from_cloud_dynamic(&cloud_dynamic_from_input(input)).unwrap(); - assert_eq!(c1.bucket.bucket, c2.bucket.bucket); - assert_eq!(c1.bucket.prefix, c2.bucket.prefix); - assert_eq!(c1.bucket.region, c2.bucket.region); - assert_eq!( - c1.bucket.region, - StringNonEmpty::opt("us-west-2".to_owned()) - ); - } - - fn cloud_dynamic_from_input(mut s3: InputConfig) -> CloudDynamic { - let mut bucket = InputBucket::default(); - if !s3.endpoint.is_empty() { - bucket.endpoint = s3.take_endpoint(); - } - if !s3.region.is_empty() { - bucket.region = s3.take_region(); - } - if !s3.prefix.is_empty() { - bucket.prefix = s3.take_prefix(); - } - if !s3.storage_class.is_empty() { - bucket.storage_class = s3.take_storage_class(); - } - if !s3.bucket.is_empty() { - bucket.bucket = s3.take_bucket(); - } - let mut attrs = std::collections::HashMap::new(); - if !s3.sse.is_empty() { - attrs.insert("sse".to_owned(), s3.take_sse()); - } - if !s3.acl.is_empty() { - attrs.insert("acl".to_owned(), s3.take_acl()); - } -
if !s3.access_key.is_empty() { - attrs.insert("access_key".to_owned(), s3.take_access_key()); - } - if !s3.secret_access_key.is_empty() { - attrs.insert("secret_access_key".to_owned(), s3.take_secret_access_key()); - } - if !s3.sse_kms_key_id.is_empty() { - attrs.insert("sse_kms_key_id".to_owned(), s3.take_sse_kms_key_id()); - } - if s3.force_path_style { - attrs.insert("force_path_style".to_owned(), "true".to_owned()); - } - let mut cd = CloudDynamic::default(); - cd.set_provider_name("aws".to_owned()); - cd.set_attrs(attrs); - cd.set_bucket(bucket); - cd - } - #[tokio::test] async fn test_try_read_exact() { use std::io::{self, Cursor, Read}; diff --git a/components/cloud/azure/Cargo.toml b/components/cloud/azure/Cargo.toml index 04f00c4bb60..07a4752451e 100644 --- a/components/cloud/azure/Cargo.toml +++ b/components/cloud/azure/Cargo.toml @@ -9,11 +9,13 @@ failpoints = ["fail/failpoints"] [dependencies] async-trait = "0.1" -azure_core = { version = "0.12.0", git = "https://github.com/Azure/azure-sdk-for-rust" } -azure_identity = { version = "0.12.0", git = "https://github.com/Azure/azure-sdk-for-rust" } -azure_security_keyvault = { version = "0.12.0", git = "https://github.com/Azure/azure-sdk-for-rust", default-features = false } -azure_storage = { version = "0.12.0", git = "https://github.com/Azure/azure-sdk-for-rust", default-features = false } -azure_storage_blobs = { version = "0.12.0", git = "https://github.com/Azure/azure-sdk-for-rust" } +# TODO: The azure sdk with the newest version needs the rustc v1.70, but current version of rustc in TiKV is v1.67. +# Therefore use the patch to update sdk to support fips 140. 
+azure_core = { git = "https://github.com/tikv/azure-sdk-for-rust", branch = "release-7.5-fips" } +azure_identity = { git = "https://github.com/tikv/azure-sdk-for-rust", branch = "release-7.5-fips" } +azure_security_keyvault = { git = "https://github.com/tikv/azure-sdk-for-rust", branch = "release-7.5-fips", default-features = false } +azure_storage = { git = "https://github.com/tikv/azure-sdk-for-rust", branch = "release-7.5-fips", default-features = false } +azure_storage_blobs = { git = "https://github.com/tikv/azure-sdk-for-rust", branch = "release-7.5-fips" } base64 = "0.13" cloud = { workspace = true } fail = "0.5" @@ -21,7 +23,7 @@ futures = "0.3" futures-util = { version = "0.3", default-features = false, features = ["io"] } kvproto = { workspace = true } oauth2 = { version = "4.0.0", default-features = false } -openssl = { version = "0.10.50" } +openssl = { workspace = true } serde = { version = "1.0", features = ["derive"] } serde_json = "1.0" slog = { workspace = true } diff --git a/components/cloud/azure/src/azblob.rs b/components/cloud/azure/src/azblob.rs index d88020aa944..5a806c54faf 100644 --- a/components/cloud/azure/src/azblob.rs +++ b/components/cloud/azure/src/azblob.rs @@ -24,11 +24,8 @@ use futures_util::{ stream::StreamExt, TryStreamExt, }; -pub use kvproto::brpb::{ - AzureBlobStorage as InputConfig, AzureCustomerKey, Bucket as InputBucket, CloudDynamic, -}; +pub use kvproto::brpb::{AzureBlobStorage as InputConfig, AzureCustomerKey, Bucket as InputBucket}; use oauth2::{ClientId, ClientSecret}; -use openssl::sha::Sha256; use tikv_util::{ debug, stream::{retry, RetryError}, @@ -62,18 +59,6 @@ struct EncryptionCustomer { encryption_key_sha256: String, } -impl EncryptionCustomer { - fn new(encryption_key: &str) -> Self { - let mut hasher = Sha256::new(); - hasher.update(encryption_key.as_bytes()); - let encryption_key_sha256 = base64::encode(hasher.finish()); - EncryptionCustomer { - encryption_key: base64::encode(encryption_key), - 
encryption_key_sha256, - } - } -} - impl From for EncryptionCustomer { fn from(value: AzureCustomerKey) -> Self { EncryptionCustomer { @@ -164,28 +149,6 @@ impl Config { env::var(ENV_SHARED_KEY).ok().and_then(StringNonEmpty::opt) } - pub fn from_cloud_dynamic(cloud_dynamic: &CloudDynamic) -> io::Result { - let bucket = BucketConf::from_cloud_dynamic(cloud_dynamic)?; - let attrs = &cloud_dynamic.attrs; - let def = &String::new(); - - Ok(Config { - bucket, - account_name: StringNonEmpty::opt(attrs.get("account_name").unwrap_or(def).clone()), - shared_key: StringNonEmpty::opt(attrs.get("shared_key").unwrap_or(def).clone()), - sas_token: StringNonEmpty::opt(attrs.get("sas_token").unwrap_or(def).clone()), - credential_info: Self::load_credential_info(), - env_account_name: Self::load_env_account_name(), - env_shared_key: Self::load_env_shared_key(), - encryption_scope: StringNonEmpty::opt( - attrs.get("encryption_scope").unwrap_or(def).clone(), - ), - encryption_customer: attrs - .get("encryption_key") - .map(|encryption_key| EncryptionCustomer::new(encryption_key)), - }) - } - pub fn from_input(input: InputConfig) -> io::Result { let bucket = BucketConf { endpoint: StringNonEmpty::opt(input.endpoint), @@ -574,10 +537,6 @@ impl AzureStorage { }) } - pub fn from_cloud_dynamic(cloud_dynamic: &CloudDynamic) -> io::Result { - Self::new(Config::from_cloud_dynamic(cloud_dynamic)?) 
- } - pub fn new(config: Config) -> io::Result { Self::check_config(&config)?; @@ -900,47 +859,6 @@ mod tests { assert_eq!(get_size, size); } - #[test] - fn test_config_round_trip() { - let mut input = InputConfig::default(); - input.set_bucket("bucket".to_owned()); - input.set_prefix("backup 02/prefix/".to_owned()); - input.set_account_name("user".to_owned()); - let c1 = Config::from_input(input.clone()).unwrap(); - let c2 = Config::from_cloud_dynamic(&cloud_dynamic_from_input(input)).unwrap(); - assert_eq!(c1.bucket.bucket, c2.bucket.bucket); - assert_eq!(c1.bucket.prefix, c2.bucket.prefix); - assert_eq!(c1.account_name, c2.account_name); - } - - fn cloud_dynamic_from_input(mut azure: InputConfig) -> CloudDynamic { - let mut bucket = InputBucket::default(); - if !azure.endpoint.is_empty() { - bucket.endpoint = azure.take_endpoint(); - } - if !azure.prefix.is_empty() { - bucket.prefix = azure.take_prefix(); - } - if !azure.storage_class.is_empty() { - bucket.storage_class = azure.take_storage_class(); - } - if !azure.bucket.is_empty() { - bucket.bucket = azure.take_bucket(); - } - let mut attrs = std::collections::HashMap::new(); - if !azure.account_name.is_empty() { - attrs.insert("account_name".to_owned(), azure.take_account_name()); - } - if !azure.shared_key.is_empty() { - attrs.insert("shared_key".to_owned(), azure.take_shared_key()); - } - let mut cd = CloudDynamic::default(); - cd.set_provider_name("azure".to_owned()); - cd.set_attrs(attrs); - cd.set_bucket(bucket); - cd - } - #[test] fn test_config_check() { { diff --git a/components/cloud/gcp/src/gcs.rs b/components/cloud/gcp/src/gcs.rs index c43e4e63969..56cd317c3f8 100644 --- a/components/cloud/gcp/src/gcs.rs +++ b/components/cloud/gcp/src/gcs.rs @@ -14,7 +14,7 @@ use futures_util::{ use http::HeaderValue; use hyper::{client::HttpConnector, Body, Client, Request, Response, StatusCode}; use hyper_tls::HttpsConnector; -pub use kvproto::brpb::{Bucket as InputBucket, CloudDynamic, Gcs as InputConfig}; +pub 
use kvproto::brpb::{Bucket as InputBucket, Gcs as InputConfig}; use tame_gcs::{ common::{PredefinedAcl, StorageClass}, objects::{InsertObjectOptional, Metadata, Object}, @@ -54,35 +54,6 @@ impl Config { io::Error::new(io::ErrorKind::InvalidInput, "missing credentials") } - pub fn from_cloud_dynamic(cloud_dynamic: &CloudDynamic) -> io::Result { - let bucket = BucketConf::from_cloud_dynamic(cloud_dynamic)?; - let attrs = &cloud_dynamic.attrs; - let def = &String::new(); - let predefined_acl = parse_predefined_acl(attrs.get("predefined_acl").unwrap_or(def)) - .or_invalid_input("invalid predefined_acl")?; - let storage_class = parse_storage_class(&none_to_empty(bucket.storage_class.clone())) - .or_invalid_input("invalid storage_class")?; - - let credentials_blob_opt = StringNonEmpty::opt( - attrs - .get("credentials_blob") - .unwrap_or(&"".to_string()) - .to_string(), - ); - let svc_info = if let Some(cred) = credentials_blob_opt { - Some(deserialize_service_account_info(cred)?) - } else { - None - }; - - Ok(Config { - bucket, - predefined_acl, - svc_info, - storage_class, - }) - } - pub fn from_input(input: InputConfig) -> io::Result { let endpoint = StringNonEmpty::opt(input.endpoint); let bucket = BucketConf { @@ -241,10 +212,6 @@ impl GcsStorage { Self::new(Config::from_input(input)?) } - pub fn from_cloud_dynamic(cloud_dynamic: &CloudDynamic) -> io::Result { - Self::new(Config::from_cloud_dynamic(cloud_dynamic)?) - } - /// Create a new GCS storage for the given config. 
pub fn new(config: Config) -> io::Result { let svc_access = if let Some(si) = &config.svc_info { @@ -618,17 +585,6 @@ mod tests { ); } - #[test] - fn test_config_round_trip() { - let mut input = InputConfig::default(); - input.set_bucket("bucket".to_owned()); - input.set_prefix("backup 02/prefix/".to_owned()); - let c1 = Config::from_input(input.clone()).unwrap(); - let c2 = Config::from_cloud_dynamic(&cloud_dynamic_from_input(input)).unwrap(); - assert_eq!(c1.bucket.bucket, c2.bucket.bucket); - assert_eq!(c1.bucket.prefix, c2.bucket.prefix); - } - enum ThrottleReadState { Spawning, Emitting, @@ -706,32 +662,4 @@ mod tests { assert_eq!(dst.len(), BENCH_READ_SIZE) }) } - - fn cloud_dynamic_from_input(mut gcs: InputConfig) -> CloudDynamic { - let mut bucket = InputBucket::default(); - if !gcs.endpoint.is_empty() { - bucket.endpoint = gcs.take_endpoint(); - } - if !gcs.prefix.is_empty() { - bucket.prefix = gcs.take_prefix(); - } - if !gcs.storage_class.is_empty() { - bucket.storage_class = gcs.take_storage_class(); - } - if !gcs.bucket.is_empty() { - bucket.bucket = gcs.take_bucket(); - } - let mut attrs = std::collections::HashMap::new(); - if !gcs.predefined_acl.is_empty() { - attrs.insert("predefined_acl".to_owned(), gcs.take_predefined_acl()); - } - if !gcs.credentials_blob.is_empty() { - attrs.insert("credentials_blob".to_owned(), gcs.take_credentials_blob()); - } - let mut cd = CloudDynamic::default(); - cd.set_provider_name("gcp".to_owned()); - cd.set_attrs(attrs); - cd.set_bucket(bucket); - cd - } } diff --git a/components/cloud/src/blob.rs b/components/cloud/src/blob.rs index 84ca77042d7..a0b5c26953c 100644 --- a/components/cloud/src/blob.rs +++ b/components/cloud/src/blob.rs @@ -4,7 +4,6 @@ use std::{io, marker::Unpin, pin::Pin, task::Poll}; use async_trait::async_trait; use futures_io::AsyncRead; -pub use kvproto::brpb::CloudDynamic; pub trait BlobConfig: 'static + Send + Sync { fn name(&self) -> &'static str; @@ -177,20 +176,6 @@ impl BucketConf { Ok(u) } 
} - - pub fn from_cloud_dynamic(cloud_dynamic: &CloudDynamic) -> io::Result { - let bucket = cloud_dynamic.bucket.clone().into_option().ok_or_else(|| { - io::Error::new(io::ErrorKind::Other, "Required field bucket is missing") - })?; - - Ok(Self { - endpoint: StringNonEmpty::opt(bucket.endpoint), - bucket: StringNonEmpty::required_field(bucket.bucket, "bucket")?, - prefix: StringNonEmpty::opt(bucket.prefix), - storage_class: StringNonEmpty::opt(bucket.storage_class), - region: StringNonEmpty::opt(bucket.region), - }) - } } pub fn none_to_empty(opt: Option) -> String { diff --git a/components/concurrency_manager/src/lib.rs b/components/concurrency_manager/src/lib.rs index ce77cb87a42..1c6bdb8dbf1 100644 --- a/components/concurrency_manager/src/lib.rs +++ b/components/concurrency_manager/src/lib.rs @@ -124,6 +124,23 @@ impl ConcurrencyManager { }); min_lock_ts } + + pub fn global_min_lock(&self) -> Option<(TimeStamp, Key)> { + let mut min_lock: Option<(TimeStamp, Key)> = None; + // TODO: The iteration looks not so efficient. It's better to be optimized. + self.lock_table.for_each_kv(|key, handle| { + if let Some(curr_ts) = handle.with_lock(|lock| lock.as_ref().map(|l| l.ts)) { + if min_lock + .as_ref() + .map(|(ts, _)| ts > &curr_ts) + .unwrap_or(true) + { + min_lock = Some((curr_ts, key.clone())); + } + } + }); + min_lock + } } #[cfg(test)] diff --git a/components/concurrency_manager/src/lock_table.rs b/components/concurrency_manager/src/lock_table.rs index db6995fa1d0..8f4fb8952c3 100644 --- a/components/concurrency_manager/src/lock_table.rs +++ b/components/concurrency_manager/src/lock_table.rs @@ -115,6 +115,14 @@ impl LockTable { } } + pub fn for_each_kv(&self, mut f: impl FnMut(&Key, Arc)) { + for entry in self.0.iter() { + if let Some(handle) = entry.value().upgrade() { + f(entry.key(), handle); + } + } + } + /// Removes the key and its key handle from the map. 
pub fn remove(&self, key: &Key) { self.0.remove(key); diff --git a/components/crypto/Cargo.toml b/components/crypto/Cargo.toml new file mode 100644 index 00000000000..26eb77ee057 --- /dev/null +++ b/components/crypto/Cargo.toml @@ -0,0 +1,12 @@ +[package] +name = "crypto" +version = "0.0.1" +edition = "2021" +publish = false + +[dependencies] +openssl = { workspace = true } +openssl-sys = { workspace = true } +slog = { workspace = true } +# better to not use slog-global, but pass in the logger +slog-global = { workspace = true } diff --git a/components/crypto/build.rs b/components/crypto/build.rs new file mode 100644 index 00000000000..5bfe4920c2d --- /dev/null +++ b/components/crypto/build.rs @@ -0,0 +1,32 @@ +// Copyright 2023 TiKV Project Authors. Licensed under Apache-2.0. + +use std::env; + +fn main() { + if !option_env!("ENABLE_FIPS").map_or(false, |v| v == "1") { + println!("cargo:rustc-cfg=disable_fips"); + return; + } + if let Ok(version) = env::var("DEP_OPENSSL_VERSION_NUMBER") { + let version = u64::from_str_radix(&version, 16).unwrap(); + + #[allow(clippy::unusual_byte_groupings)] + // Follow OpenSSL numeric release version identifier style: + // MNNFFPPS: major minor fix patch status + // See https://github.com/openssl/openssl/blob/OpenSSL_1_0_0-stable/crypto/opensslv.h + if version >= 0x3_00_00_00_0 { + println!("cargo:rustc-cfg=ossl3"); + } else { + println!("cargo:rustc-cfg=ossl1"); + } + } else { + panic!( + " + +The DEP_OPENSSL_VERSION_NUMBER environment variable is not found. +Please make sure \"openssl-sys\" is in fips's dependencies. + +" + ) + } +} diff --git a/components/crypto/src/fips.rs b/components/crypto/src/fips.rs new file mode 100644 index 00000000000..b466401af4f --- /dev/null +++ b/components/crypto/src/fips.rs @@ -0,0 +1,44 @@ +// Copyright 2023 TiKV Project Authors. Licensed under Apache-2.0. 
+ +use std::sync::atomic::{AtomicUsize, Ordering}; + +static FIPS_VERSION: AtomicUsize = AtomicUsize::new(0); + +/// Enable OpenSSL FIPS mode if `can_enable` returns true. +/// It should be called at the very start of a program. +pub fn maybe_enable() { + if !can_enable() { + return; + } + #[cfg(ossl1)] + { + openssl::fips::enable(true).unwrap(); + FIPS_VERSION.store(1, Ordering::SeqCst); + return; + } + #[cfg(ossl3)] + { + std::mem::forget(openssl::provider::Provider::load(None, "fips").unwrap()); + FIPS_VERSION.store(3, Ordering::SeqCst); + return; + } + #[allow(unreachable_code)] + { + slog_global::warn!("OpenSSL FIPS mode is disabled unexpectedly"); + } +} + +/// Return true if it is built for FIPS mode. +pub fn can_enable() -> bool { + !cfg!(disable_fips) +} + +/// Prints OpenSSL FIPS mode status. +pub fn log_status() { + let ver = FIPS_VERSION.load(Ordering::SeqCst); + if ver == 0 { + slog_global::info!("OpenSSL FIPS mode is disabled"); + } else { + slog_global::info!("OpenSSL FIPS mode is enabled"; "openssl_major_version" => ver); + } +} diff --git a/components/crypto/src/lib.rs b/components/crypto/src/lib.rs new file mode 100644 index 00000000000..5afb174040c --- /dev/null +++ b/components/crypto/src/lib.rs @@ -0,0 +1,13 @@ +// Copyright 2023 TiKV Project Authors. Licensed under Apache-2.0. + +//! A shim crate for cryptographic operations, with special considerations for +//! meeting FIPS 140 requirements. +//! +//! This crate provides a set of cryptographic functionalities, including +//! RNG (random number generator). It has been meticulously crafted +//! to adhere to the FIPS 140 standards, ensuring a secure and compliant +//! environment for cryptographic operations in regulated environments. +// TODO: add message digest. 
+ +pub mod fips; +pub mod rand; diff --git a/components/crypto/src/rand.rs b/components/crypto/src/rand.rs new file mode 100644 index 00000000000..d0f97594f49 --- /dev/null +++ b/components/crypto/src/rand.rs @@ -0,0 +1,17 @@ +// Copyright 2023 TiKV Project Authors. Licensed under Apache-2.0. + +//! Utilities for cryptographically strong random number generation. + +use openssl::{error::ErrorStack, rand}; + +/// Fill buffer with cryptographically strong pseudo-random bytes. +pub fn rand_bytes(buf: &mut [u8]) -> Result<(), ErrorStack> { + rand::rand_bytes(buf) +} + +/// Return a random u64. +pub fn rand_u64() -> Result { + let mut rand_id = [0u8; 8]; + rand_bytes(&mut rand_id)?; + Ok(u64::from_ne_bytes(rand_id)) +} diff --git a/components/encryption/Cargo.toml b/components/encryption/Cargo.toml index 021c9f23002..7375a9c0b20 100644 --- a/components/encryption/Cargo.toml +++ b/components/encryption/Cargo.toml @@ -6,6 +6,9 @@ publish = false [features] failpoints = ["fail/failpoints"] +# openssl/vendored is necessary in order to conditionally build SM4 encryption +# support, as SM4 is disabled on various openssl distributions, such as Rocky Linux 9. +sm4 = ["openssl/vendored"] [dependencies] async-trait = "0.1" @@ -14,8 +17,8 @@ bytes = "1.0" cloud = { workspace = true } crc32fast = "1.2" crossbeam = "0.8" +crypto = { workspace = true } derive_more = "0.99.3" -engine_traits = { workspace = true } error_code = { workspace = true } fail = "0.5" file_system = { workspace = true } @@ -25,10 +28,12 @@ hex = "0.4.2" kvproto = { workspace = true } lazy_static = "1.3" online_config = { workspace = true } -openssl = "0.10" +openssl = { workspace = true } prometheus = { version = "0.13", features = ["nightly"] } protobuf = { version = "2.8", features = ["bytes"] } -rand = "0.8" +# For simplicity and compliance with FIPS 140 requirements for random number +# generation, do not use the 'rand' crate in encryption-related code. 
+# rand = "*" serde = "1.0" serde_derive = "1.0" slog = { workspace = true } diff --git a/components/encryption/export/Cargo.toml b/components/encryption/export/Cargo.toml index c1891a93480..1a7b64eb7be 100644 --- a/components/encryption/export/Cargo.toml +++ b/components/encryption/export/Cargo.toml @@ -9,6 +9,7 @@ default = ["cloud-aws", "cloud-gcp", "cloud-azure"] cloud-aws = ["aws"] cloud-gcp = [] cloud-azure = ["azure"] +sm4 = ["encryption/sm4"] [dependencies] async-trait = "0.1" @@ -20,7 +21,7 @@ encryption = { workspace = true } error_code = { workspace = true } file_system = { workspace = true } kvproto = { workspace = true } -openssl = "0.10" +openssl = { workspace = true } protobuf = { version = "2.8", features = ["bytes"] } slog = { workspace = true } # better to not use slog-global, but pass in the logger diff --git a/components/encryption/export/src/lib.rs b/components/encryption/export/src/lib.rs index 8820402be6b..a36406d44ea 100644 --- a/components/encryption/export/src/lib.rs +++ b/components/encryption/export/src/lib.rs @@ -9,9 +9,9 @@ use cloud::kms::Config as CloudConfig; #[cfg(feature = "cloud-aws")] pub use encryption::KmsBackend; pub use encryption::{ - clean_up_dir, clean_up_trash, from_engine_encryption_method, trash_dir_all, AzureConfig, - Backend, DataKeyImporter, DataKeyManager, DataKeyManagerArgs, DecrypterReader, - EncryptionConfig, Error, FileConfig, Iv, KmsConfig, MasterKeyConfig, Result, + clean_up_dir, clean_up_trash, trash_dir_all, AzureConfig, Backend, DataKeyImporter, + DataKeyManager, DataKeyManagerArgs, DecrypterReader, EncryptionConfig, Error, FileConfig, Iv, + KmsConfig, MasterKeyConfig, Result, }; use encryption::{cloud_convert_error, FileBackend, PlaintextBackend}; use tikv_util::{box_err, error, info}; diff --git a/components/encryption/src/crypter.rs b/components/encryption/src/crypter.rs index 3940d392be6..a60b9c9c20b 100644 --- a/components/encryption/src/crypter.rs +++ b/components/encryption/src/crypter.rs @@ -1,37 
+1,18 @@ // Copyright 2020 TiKV Project Authors. Licensed under Apache-2.0. +use std::fmt::{self, Debug, Formatter}; + use byteorder::{BigEndian, ByteOrder}; use cloud::kms::PlainKey; -use engine_traits::EncryptionMethod as EtEncryptionMethod; use kvproto::encryptionpb::EncryptionMethod; -use openssl::symm::{self, Cipher as OCipher}; -use rand::{rngs::OsRng, RngCore}; +use openssl::{ + rand, + symm::{self, Cipher as OCipher}, +}; use tikv_util::box_err; use crate::{Error, Result}; -pub fn to_engine_encryption_method(method: EncryptionMethod) -> EtEncryptionMethod { - match method { - EncryptionMethod::Plaintext => EtEncryptionMethod::Plaintext, - EncryptionMethod::Aes128Ctr => EtEncryptionMethod::Aes128Ctr, - EncryptionMethod::Aes192Ctr => EtEncryptionMethod::Aes192Ctr, - EncryptionMethod::Aes256Ctr => EtEncryptionMethod::Aes256Ctr, - EncryptionMethod::Sm4Ctr => EtEncryptionMethod::Sm4Ctr, - EncryptionMethod::Unknown => EtEncryptionMethod::Unknown, - } -} - -pub fn from_engine_encryption_method(method: EtEncryptionMethod) -> EncryptionMethod { - match method { - EtEncryptionMethod::Plaintext => EncryptionMethod::Plaintext, - EtEncryptionMethod::Aes128Ctr => EncryptionMethod::Aes128Ctr, - EtEncryptionMethod::Aes192Ctr => EncryptionMethod::Aes192Ctr, - EtEncryptionMethod::Aes256Ctr => EncryptionMethod::Aes256Ctr, - EtEncryptionMethod::Sm4Ctr => EncryptionMethod::Sm4Ctr, - EtEncryptionMethod::Unknown => EncryptionMethod::Unknown, - } -} - pub fn get_method_key_length(method: EncryptionMethod) -> usize { match method { EncryptionMethod::Plaintext => 0, @@ -43,6 +24,40 @@ pub fn get_method_key_length(method: EncryptionMethod) -> usize { } } +#[derive(Clone, PartialEq)] +pub struct FileEncryptionInfo { + pub method: EncryptionMethod, + pub key: Vec, + pub iv: Vec, +} +impl Default for FileEncryptionInfo { + fn default() -> Self { + FileEncryptionInfo { + method: EncryptionMethod::Unknown, + key: vec![], + iv: vec![], + } + } +} + +impl Debug for FileEncryptionInfo { + fn 
fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { + write!( + f, + "FileEncryptionInfo [method={:?}, key=...<{} bytes>, iv=...<{} bytes>]", + self.method, + self.key.len(), + self.iv.len() + ) + } +} + +impl FileEncryptionInfo { + pub fn is_empty(&self) -> bool { + self.key.is_empty() && self.iv.is_empty() + } +} + // IV's the length should be 12 btyes for GCM mode. const GCM_IV_12: usize = 12; // IV's the length should be 16 btyes for CTR mode. @@ -57,17 +72,17 @@ pub enum Iv { impl Iv { /// Generate a random IV for AES-GCM. - pub fn new_gcm() -> Iv { + pub fn new_gcm() -> Result { let mut iv = [0u8; GCM_IV_12]; - OsRng.fill_bytes(&mut iv); - Iv::Gcm(iv) + rand::rand_bytes(&mut iv)?; + Ok(Iv::Gcm(iv)) } /// Generate a random IV for AES-CTR. - pub fn new_ctr() -> Iv { + pub fn new_ctr() -> Result { let mut iv = [0u8; CTR_IV_16]; - OsRng.fill_bytes(&mut iv); - Iv::Ctr(iv) + rand::rand_bytes(&mut iv)?; + Ok(Iv::Ctr(iv)) } pub fn from_slice(src: &[u8]) -> Result { @@ -199,9 +214,9 @@ mod tests { let mut ivs = Vec::with_capacity(100); for c in 0..100 { if c % 2 == 0 { - ivs.push(Iv::new_ctr()); + ivs.push(Iv::new_ctr().unwrap()); } else { - ivs.push(Iv::new_gcm()); + ivs.push(Iv::new_gcm().unwrap()); } } ivs.dedup_by(|a, b| a.as_slice() == b.as_slice()); diff --git a/components/encryption/src/encrypted_file/mod.rs b/components/encryption/src/encrypted_file/mod.rs index 9c76b857c70..8cac47077f4 100644 --- a/components/encryption/src/encrypted_file/mod.rs +++ b/components/encryption/src/encrypted_file/mod.rs @@ -5,10 +5,10 @@ use std::{ path::Path, }; +use crypto::rand; use file_system::{rename, File, OpenOptions}; use kvproto::encryptionpb::EncryptedContent; use protobuf::Message; -use rand::{thread_rng, RngCore}; use slog_global::error; use tikv_util::time::Instant; @@ -66,7 +66,7 @@ impl<'a> EncryptedFile<'a> { // TODO what if a tmp file already exists? 
let origin_path = self.base.join(self.name); let mut tmp_path = origin_path.clone(); - tmp_path.set_extension(format!("{}.{}", thread_rng().next_u64(), TMP_FILE_SUFFIX)); + tmp_path.set_extension(format!("{}.{}", rand::rand_u64()?, TMP_FILE_SUFFIX)); let mut tmp_file = OpenOptions::new() .create(true) .write(true) diff --git a/components/encryption/src/file_dict_file.rs b/components/encryption/src/file_dict_file.rs index 0d1dcbbbd6e..a40fb912b3b 100644 --- a/components/encryption/src/file_dict_file.rs +++ b/components/encryption/src/file_dict_file.rs @@ -6,10 +6,10 @@ use std::{ }; use byteorder::{BigEndian, ByteOrder}; +use crypto::rand; use file_system::{rename, File, OpenOptions}; use kvproto::encryptionpb::{EncryptedContent, FileDictionary, FileInfo}; use protobuf::Message; -use rand::{thread_rng, RngCore}; use tikv_util::{box_err, info, set_panic_mark, warn}; use crate::{ @@ -127,7 +127,7 @@ impl FileDictionaryFile { if self.enable_log { let origin_path = self.file_path(); let mut tmp_path = origin_path.clone(); - tmp_path.set_extension(format!("{}.{}", thread_rng().next_u64(), TMP_FILE_SUFFIX)); + tmp_path.set_extension(format!("{}.{}", rand::rand_u64()?, TMP_FILE_SUFFIX)); let mut tmp_file = OpenOptions::new() .create(true) .write(true) diff --git a/components/encryption/src/io.rs b/components/encryption/src/io.rs index d7b7eb76b8a..4884fc68b92 100644 --- a/components/encryption/src/io.rs +++ b/components/encryption/src/io.rs @@ -390,7 +390,18 @@ pub fn create_aes_ctr_crypter( EncryptionMethod::Aes128Ctr => OCipher::aes_128_ctr(), EncryptionMethod::Aes192Ctr => OCipher::aes_192_ctr(), EncryptionMethod::Aes256Ctr => OCipher::aes_256_ctr(), - EncryptionMethod::Sm4Ctr => OCipher::sm4_ctr(), + EncryptionMethod::Sm4Ctr => { + #[cfg(feature = "sm4")] + { + OCipher::sm4_ctr() + } + #[cfg(not(feature = "sm4"))] + { + return Err(box_err!( + "sm4-ctr is not supported by dynamically linked openssl" + )); + } + } }; let crypter = OCrypter::new(cipher, mode, key, 
Some(iv.as_slice()))?; Ok((cipher, crypter)) @@ -543,17 +554,10 @@ mod tests { use std::{cmp::min, io::Cursor}; use byteorder::{BigEndian, ByteOrder}; - use rand::{rngs::OsRng, RngCore}; + use openssl::rand; use super::*; - use crate::crypter; - - fn generate_data_key(method: EncryptionMethod) -> Vec { - let key_length = crypter::get_method_key_length(method); - let mut key = vec![0; key_length]; - OsRng.fill_bytes(&mut key); - key - } + use crate::manager::generate_data_key; struct DecoratedCursor { cursor: Cursor>, @@ -617,7 +621,7 @@ mod tests { EncryptionMethod::Sm4Ctr, ]; let ivs = [ - Iv::new_ctr(), + Iv::new_ctr().unwrap(), // Iv overflow Iv::from_slice(&{ let mut v = vec![0; 16]; @@ -634,10 +638,10 @@ mod tests { ]; for method in methods { for iv in ivs { - let key = generate_data_key(method); + let (_, key) = generate_data_key(method).unwrap(); let mut plaintext = vec![0; 1024]; - OsRng.fill_bytes(&mut plaintext); + rand::rand_bytes(&mut plaintext).unwrap(); let mut encrypter = EncrypterWriter::new( DecoratedCursor::new(plaintext.clone(), 1), method, @@ -693,12 +697,12 @@ mod tests { EncryptionMethod::Sm4Ctr, ]; let mut plaintext = vec![0; 10240]; - OsRng.fill_bytes(&mut plaintext); + rand::rand_bytes(&mut plaintext).unwrap(); let offsets = [1024, 1024 + 1, 10240 - 1, 10240, 10240 + 1]; let sizes = [1024, 10240]; for method in methods { - let key = generate_data_key(method); - let iv = Iv::new_ctr(); + let (_, key) = generate_data_key(method).unwrap(); + let iv = Iv::new_ctr().unwrap(); let encrypter = EncrypterReader::new(DecoratedCursor::new(plaintext.clone(), 1), method, &key, iv) .unwrap(); @@ -730,13 +734,13 @@ mod tests { EncryptionMethod::Sm4Ctr, ]; let mut plaintext = vec![0; 10240]; - OsRng.fill_bytes(&mut plaintext); + rand::rand_bytes(&mut plaintext).unwrap(); let offsets = [1024, 1024 + 1, 10240 - 1]; let sizes = [1024, 8000]; let written = vec![0; 10240]; for method in methods { - let key = generate_data_key(method); - let iv = Iv::new_ctr(); 
+ let (_, key) = generate_data_key(method).unwrap(); + let iv = Iv::new_ctr().unwrap(); let encrypter = EncrypterWriter::new(DecoratedCursor::new(written.clone(), 1), method, &key, iv) .unwrap(); @@ -776,12 +780,12 @@ mod tests { EncryptionMethod::Aes256Ctr, EncryptionMethod::Sm4Ctr, ]; - let iv = Iv::new_ctr(); + let iv = Iv::new_ctr().unwrap(); let mut plain_text = vec![0; 10240]; - OsRng.fill_bytes(&mut plain_text); + rand::rand_bytes(&mut plain_text).unwrap(); for method in methods { - let key = generate_data_key(method); + let (_, key) = generate_data_key(method).unwrap(); // encrypt plaintext into encrypt_text let read_once = 16; let mut encrypt_reader = EncrypterReader::new( diff --git a/components/encryption/src/lib.rs b/components/encryption/src/lib.rs index 38c38108dc5..2a9ad4c6f44 100644 --- a/components/encryption/src/lib.rs +++ b/components/encryption/src/lib.rs @@ -16,10 +16,7 @@ use std::{io::ErrorKind, path::Path}; pub use self::{ config::*, - crypter::{ - from_engine_encryption_method, to_engine_encryption_method, verify_encryption_config, - AesGcmCrypter, Iv, - }, + crypter::{verify_encryption_config, AesGcmCrypter, FileEncryptionInfo, Iv}, encrypted_file::EncryptedFile, errors::{cloud_convert_error, Error, Result, RetryCodedError}, file_dict_file::FileDictionaryFile, diff --git a/components/encryption/src/manager/mod.rs b/components/encryption/src/manager/mod.rs index f3594e8a96b..0f20741e841 100644 --- a/components/encryption/src/manager/mod.rs +++ b/components/encryption/src/manager/mod.rs @@ -13,9 +13,7 @@ use std::{ }; use crossbeam::channel::{self, select, tick}; -use engine_traits::{ - EncryptionKeyManager, EncryptionMethod as EtEncryptionMethod, FileEncryptionInfo, -}; +use crypto::rand; use fail::fail_point; use file_system::File; use kvproto::encryptionpb::{DataKey, EncryptionMethod, FileDictionary, FileInfo, KeyDictionary}; @@ -24,7 +22,7 @@ use tikv_util::{box_err, debug, error, info, sys::thread::StdThreadBuildWrapper, use crate::{ 
config::EncryptionConfig, - crypter::{self, Iv}, + crypter::{self, FileEncryptionInfo, Iv}, encrypted_file::EncryptedFile, file_dict_file::FileDictionaryFile, io::{DecrypterReader, EncrypterWriter}, @@ -203,7 +201,7 @@ impl Dicts { fn new_file(&self, fname: &str, method: EncryptionMethod, sync: bool) -> Result { let mut file_dict_file = self.file_dict_file.lock().unwrap(); let iv = if method != EncryptionMethod::Plaintext { - Iv::new_ctr() + Iv::new_ctr()? } else { Iv::Empty }; @@ -351,7 +349,9 @@ impl Dicts { // Generate new data key. for _ in 0..GENERATE_DATA_KEY_LIMIT { - let (key_id, key) = generate_data_key(method); + let Ok((key_id, key)) = generate_data_key(method) else { + continue; + }; if key_id == 0 { // 0 is invalid continue; @@ -439,14 +439,12 @@ fn run_background_rotate_work( } } -fn generate_data_key(method: EncryptionMethod) -> (u64, Vec) { - use rand::{rngs::OsRng, RngCore}; - - let key_id = OsRng.next_u64(); +pub(crate) fn generate_data_key(method: EncryptionMethod) -> Result<(u64, Vec)> { + let key_id = rand::rand_u64()?; let key_length = crypter::get_method_key_length(method); let mut key = vec![0; key_length]; - OsRng.fill_bytes(&mut key); - (key_id, key) + rand::rand_bytes(&mut key)?; + Ok((key_id, key)) } pub struct DataKeyManager { @@ -661,9 +659,9 @@ impl DataKeyManager { }; EncrypterWriter::new( writer, - crypter::from_engine_encryption_method(file.method), + file.method, &file.key, - if file.method == EtEncryptionMethod::Plaintext { + if file.method == EncryptionMethod::Plaintext { debug_assert!(file.iv.is_empty()); Iv::Empty } else { @@ -691,9 +689,9 @@ impl DataKeyManager { let file = self.get_file(fname)?; DecrypterReader::new( reader, - crypter::from_engine_encryption_method(file.method), + file.method, &file.key, - if file.method == EtEncryptionMethod::Plaintext { + if file.method == EncryptionMethod::Plaintext { debug_assert!(file.iv.is_empty()); Iv::Empty } else { @@ -767,11 +765,7 @@ impl DataKeyManager { } } }; - let 
encrypted_file = FileEncryptionInfo { - key, - method: crypter::to_engine_encryption_method(method), - iv, - }; + let encrypted_file = FileEncryptionInfo { key, method, iv }; Ok(Some(encrypted_file)) } @@ -844,8 +838,8 @@ impl DataKeyManager { } /// Return which method this manager is using. - pub fn encryption_method(&self) -> engine_traits::EncryptionMethod { - crypter::to_engine_encryption_method(self.method) + pub fn encryption_method(&self) -> EncryptionMethod { + self.method } /// For tests. @@ -869,9 +863,9 @@ impl Drop for DataKeyManager { } } -impl EncryptionKeyManager for DataKeyManager { +impl DataKeyManager { // Get key to open existing file. - fn get_file(&self, fname: &str) -> IoResult { + pub fn get_file(&self, fname: &str) -> IoResult { match self.get_file_exists(fname) { Ok(Some(result)) => Ok(result), Ok(None) => { @@ -881,7 +875,7 @@ impl EncryptionKeyManager for DataKeyManager { let method = EncryptionMethod::Plaintext; Ok(FileEncryptionInfo { key: vec![], - method: crypter::to_engine_encryption_method(method), + method, iv: file.iv, }) } @@ -889,21 +883,25 @@ impl EncryptionKeyManager for DataKeyManager { } } - fn new_file(&self, fname: &str) -> IoResult { + pub fn new_file(&self, fname: &str) -> IoResult { let (_, data_key) = self.dicts.current_data_key(); let key = data_key.get_key().to_owned(); let file = self.dicts.new_file(fname, self.method, true)?; let encrypted_file = FileEncryptionInfo { key, - method: crypter::to_engine_encryption_method(file.method), + method: file.method, iv: file.get_iv().to_owned(), }; Ok(encrypted_file) } - // See comments of `remove_dir` for more details when using this with a - // directory. - fn delete_file(&self, fname: &str, physical_fname: Option<&str>) -> IoResult<()> { + // Can be used with both file and directory. See comments of `remove_dir` for + // more details when using this with a directory. + // + // `physical_fname` is a hint when `fname` was renamed physically. 
+ // Depending on the implementation, providing false negative or false + // positive value may result in leaking encryption keys. + pub fn delete_file(&self, fname: &str, physical_fname: Option<&str>) -> IoResult<()> { fail_point!("key_manager_fails_before_delete_file", |_| IoResult::Err( io::ErrorKind::Other.into() )); @@ -924,7 +922,7 @@ impl EncryptionKeyManager for DataKeyManager { Ok(()) } - fn link_file(&self, src_fname: &str, dst_fname: &str) -> IoResult<()> { + pub fn link_file(&self, src_fname: &str, dst_fname: &str) -> IoResult<()> { let src_path = Path::new(src_fname); let dst_path = Path::new(dst_fname); if src_path.is_dir() { @@ -1006,8 +1004,7 @@ impl<'a> DataKeyImporter<'a> { if key_id.is_none() { for _ in 0..GENERATE_DATA_KEY_LIMIT { // Match `generate_data_key`. - use rand::{rngs::OsRng, RngCore}; - let id = OsRng.next_u64(); + let id = rand::rand_u64()?; if let Entry::Vacant(e) = key_dict.keys.entry(id) { key_id = Some(id); e.insert(new_key); @@ -1120,8 +1117,8 @@ impl<'a> Drop for DataKeyImporter<'a> { #[cfg(test)] mod tests { - use engine_traits::EncryptionMethod as EtEncryptionMethod; use file_system::{remove_file, File}; + use kvproto::encryptionpb::EncryptionMethod; use matches::assert_matches; use tempfile::TempDir; use test_util::create_test_key_file; @@ -1243,7 +1240,7 @@ mod tests { let foo3 = manager.get_file("foo").unwrap(); assert_eq!(foo1, foo3); let bar = manager.new_file("bar").unwrap(); - assert_eq!(bar.method, EtEncryptionMethod::Plaintext); + assert_eq!(bar.method, EncryptionMethod::Plaintext); } // When enabling encryption, using insecure master key is not allowed. 
@@ -1861,11 +1858,11 @@ mod tests { ) .unwrap(); // different key - let (_, key2) = generate_data_key(EncryptionMethod::Aes192Ctr); + let (_, key2) = generate_data_key(EncryptionMethod::Aes192Ctr).unwrap(); importer .add( "2", - Iv::new_ctr().as_slice().to_owned(), + Iv::new_ctr().unwrap().as_slice().to_owned(), DataKey { key: key2.clone(), method: EncryptionMethod::Aes192Ctr, @@ -1899,7 +1896,7 @@ mod tests { importer .add( "2", - Iv::new_ctr().as_slice().to_owned(), + Iv::new_ctr().unwrap().as_slice().to_owned(), DataKey { key: key2.clone(), method: EncryptionMethod::Aes192Ctr, @@ -1921,7 +1918,7 @@ mod tests { let tmp_dir = tempfile::TempDir::new().unwrap(); let manager = new_key_manager_def(&tmp_dir, Some(EncryptionMethod::Aes192Ctr)).unwrap(); - let (_, key) = generate_data_key(EncryptionMethod::Aes192Ctr); + let (_, key) = generate_data_key(EncryptionMethod::Aes192Ctr).unwrap(); let file0 = manager.new_file("0").unwrap(); let now = SystemTime::now() .duration_since(UNIX_EPOCH) diff --git a/components/encryption/src/master_key/file.rs b/components/encryption/src/master_key/file.rs index ad1bfb75a87..1b24a95e497 100644 --- a/components/encryption/src/master_key/file.rs +++ b/components/encryption/src/master_key/file.rs @@ -49,7 +49,7 @@ impl FileBackend { impl Backend for FileBackend { fn encrypt(&self, plaintext: &[u8]) -> Result { - let iv = Iv::new_gcm(); + let iv = Iv::new_gcm()?; self.backend.encrypt_content(plaintext, iv) } diff --git a/components/encryption/src/master_key/kms.rs b/components/encryption/src/master_key/kms.rs index 643cb08a0c6..db3c62194fd 100644 --- a/components/encryption/src/master_key/kms.rs +++ b/components/encryption/src/master_key/kms.rs @@ -158,7 +158,7 @@ impl KmsBackend { impl Backend for KmsBackend { fn encrypt(&self, plaintext: &[u8]) -> Result { - self.encrypt_content(plaintext, Iv::new_gcm()) + self.encrypt_content(plaintext, Iv::new_gcm()?) 
} fn decrypt(&self, content: &EncryptedContent) -> Result> { diff --git a/components/encryption/src/master_key/mem.rs b/components/encryption/src/master_key/mem.rs index 619acc38ebf..c19351f5dc7 100644 --- a/components/encryption/src/master_key/mem.rs +++ b/components/encryption/src/master_key/mem.rs @@ -128,7 +128,9 @@ mod tests { .unwrap(); let backend = MemAesGcmBackend::new(key).unwrap(); - let encrypted_content = backend.encrypt_content(&pt, Iv::new_gcm()).unwrap(); + let encrypted_content = backend + .encrypt_content(&pt, Iv::new_gcm().unwrap()) + .unwrap(); let plaintext = backend.decrypt_content(&encrypted_content).unwrap(); assert_eq!(plaintext, pt); diff --git a/components/engine_panic/Cargo.toml b/components/engine_panic/Cargo.toml index f5da1dad550..1ad0679f1b6 100644 --- a/components/engine_panic/Cargo.toml +++ b/components/engine_panic/Cargo.toml @@ -9,6 +9,7 @@ publish = false testexport = [] [dependencies] +encryption = { workspace = true } engine_traits = { workspace = true } kvproto = { workspace = true } raft = { workspace = true } diff --git a/components/engine_panic/src/compact.rs b/components/engine_panic/src/compact.rs index 988bec790de..f64c97ff5b0 100644 --- a/components/engine_panic/src/compact.rs +++ b/components/engine_panic/src/compact.rs @@ -44,6 +44,10 @@ impl CompactExt for PanicEngine { ) -> Result<()> { panic!() } + + fn check_in_range(&self, start: Option<&[u8]>, end: Option<&[u8]>) -> Result<()> { + panic!() + } } pub struct PanicCompactedEvent; diff --git a/components/engine_panic/src/db_options.rs b/components/engine_panic/src/db_options.rs index c081a5c1d12..05147ca06fb 100644 --- a/components/engine_panic/src/db_options.rs +++ b/components/engine_panic/src/db_options.rs @@ -40,6 +40,10 @@ impl DbOptions for PanicDbOptions { panic!() } + fn get_flush_size(&self) -> Result { + panic!() + } + fn set_rate_limiter_auto_tuned(&mut self, rate_limiter_auto_tuned: bool) -> Result<()> { panic!() } diff --git 
a/components/engine_panic/src/engine.rs b/components/engine_panic/src/engine.rs index b5ce0d1516e..7b8546af111 100644 --- a/components/engine_panic/src/engine.rs +++ b/components/engine_panic/src/engine.rs @@ -1,8 +1,8 @@ // Copyright 2019 TiKV Project Authors. Licensed under Apache-2.0. use engine_traits::{ - IterOptions, Iterable, Iterator, KvEngine, Peekable, ReadOptions, Result, SyncMutable, - WriteOptions, + IterOptions, Iterable, Iterator, KvEngine, Peekable, ReadOptions, Result, SnapshotContext, + SyncMutable, WriteOptions, }; use crate::{db_vector::PanicDbVector, snapshot::PanicSnapshot, write_batch::PanicWriteBatch}; @@ -13,7 +13,7 @@ pub struct PanicEngine; impl KvEngine for PanicEngine { type Snapshot = PanicSnapshot; - fn snapshot(&self) -> Self::Snapshot { + fn snapshot(&self, _: Option) -> Self::Snapshot { panic!() } fn sync(&self) -> Result<()> { diff --git a/components/engine_panic/src/misc.rs b/components/engine_panic/src/misc.rs index 8da5c48d3e6..6ebecd58a09 100644 --- a/components/engine_panic/src/misc.rs +++ b/components/engine_panic/src/misc.rs @@ -129,4 +129,9 @@ impl MiscExt for PanicEngine { fn get_accumulated_flush_count_cf(cf: &str) -> Result { panic!() } + + type DiskEngine = PanicEngine; + fn get_disk_engine(&self) -> &Self::DiskEngine { + panic!() + } } diff --git a/components/engine_panic/src/sst.rs b/components/engine_panic/src/sst.rs index 119cd5884a3..59c23e67636 100644 --- a/components/engine_panic/src/sst.rs +++ b/components/engine_panic/src/sst.rs @@ -2,6 +2,7 @@ use std::{marker::PhantomData, path::PathBuf, sync::Arc}; +use ::encryption::DataKeyManager; use engine_traits::{ CfName, ExternalSstFileInfo, IterOptions, Iterable, Iterator, RefIterable, Result, SstCompressionType, SstExt, SstReader, SstWriter, SstWriterBuilder, @@ -18,16 +19,13 @@ impl SstExt for PanicEngine { pub struct PanicSstReader; impl SstReader for PanicSstReader { - fn open(path: &str) -> Result { + fn open(path: &str, mgr: Option>) -> Result { panic!() } - 
fn open_encrypted( - path: &str, - mgr: Arc, - ) -> Result { + fn verify_checksum(&self) -> Result<()> { panic!() } - fn verify_checksum(&self) -> Result<()> { + fn kv_count_and_size(&self) -> (u64, u64) { panic!() } } diff --git a/components/engine_rocks/src/cf_options.rs b/components/engine_rocks/src/cf_options.rs index 1162c67f210..6a2372fb31f 100644 --- a/components/engine_rocks/src/cf_options.rs +++ b/components/engine_rocks/src/cf_options.rs @@ -40,6 +40,23 @@ impl RocksCfOptions { pub fn into_raw(self) -> RawCfOptions { self.0 } + + pub fn set_flush_size(&mut self, f: usize) -> Result<()> { + if let Some(m) = self.0.get_write_buffer_manager() { + m.set_flush_size(f); + } else { + return Err(box_err!("write buffer manager not found")); + } + Ok(()) + } + + pub fn get_flush_size(&self) -> Result { + if let Some(m) = self.0.get_write_buffer_manager() { + return Ok(m.flush_size() as u64); + } + + Err(box_err!("write buffer manager not found")) + } } impl Deref for RocksCfOptions { diff --git a/components/engine_rocks/src/compact.rs b/components/engine_rocks/src/compact.rs index 199b7d9f3be..f64c9a7d49e 100644 --- a/components/engine_rocks/src/compact.rs +++ b/components/engine_rocks/src/compact.rs @@ -121,6 +121,10 @@ impl CompactExt for RocksEngine { db.compact_files_cf(handle, &opts, &files, output_level) .map_err(r2e) } + + fn check_in_range(&self, start: Option<&[u8]>, end: Option<&[u8]>) -> Result<()> { + self.as_inner().check_in_range(start, end).map_err(r2e) + } } #[cfg(test)] diff --git a/components/engine_rocks/src/config.rs b/components/engine_rocks/src/config.rs index e121a1cea18..d55c5cb3dfc 100644 --- a/components/engine_rocks/src/config.rs +++ b/components/engine_rocks/src/config.rs @@ -340,9 +340,9 @@ pub enum BlobRunMode { impl From for ConfigValue { fn from(mode: BlobRunMode) -> ConfigValue { let str_value = match mode { - BlobRunMode::Normal => "normal", - BlobRunMode::ReadOnly => "read-only", - BlobRunMode::Fallback => "fallback", + 
BlobRunMode::Normal => "kNormal", + BlobRunMode::ReadOnly => "kReadOnly", + BlobRunMode::Fallback => "kFallback", }; ConfigValue::String(str_value.into()) } @@ -366,8 +366,11 @@ impl FromStr for BlobRunMode { "normal" => Ok(BlobRunMode::Normal), "read-only" => Ok(BlobRunMode::ReadOnly), "fallback" => Ok(BlobRunMode::Fallback), + "kNormal" => Ok(BlobRunMode::Normal), + "kReadOnly" => Ok(BlobRunMode::ReadOnly), + "kFallback" => Ok(BlobRunMode::Fallback), m => Err(format!( - "expect: normal, read-only or fallback, got: {:?}", + "expect: normal, kNormal, read-only, kReadOnly, kFallback or fallback, got: {:?}", m )), } diff --git a/components/engine_rocks/src/db_options.rs b/components/engine_rocks/src/db_options.rs index c9ef2cfda98..38587663084 100644 --- a/components/engine_rocks/src/db_options.rs +++ b/components/engine_rocks/src/db_options.rs @@ -100,6 +100,14 @@ impl DbOptions for RocksDbOptions { Ok(()) } + fn get_flush_size(&self) -> Result { + if let Some(m) = self.0.get_write_buffer_manager() { + return Ok(m.flush_size() as u64); + } + + Err(box_err!("write buffer manager not found")) + } + fn set_flush_oldest_first(&mut self, f: bool) -> Result<()> { if let Some(m) = self.0.get_write_buffer_manager() { m.set_flush_oldest_first(f); diff --git a/components/engine_rocks/src/encryption.rs b/components/engine_rocks/src/encryption.rs index 4dbe3ab10d2..75dc407e3c3 100644 --- a/components/engine_rocks/src/encryption.rs +++ b/components/engine_rocks/src/encryption.rs @@ -2,11 +2,10 @@ use std::{io::Result, sync::Arc}; -use encryption::{self, DataKeyManager}; -use engine_traits::{EncryptionKeyManager, EncryptionMethod, FileEncryptionInfo}; +use encryption::{DataKeyManager, FileEncryptionInfo}; +use kvproto::encryptionpb::EncryptionMethod; use rocksdb::{ - DBEncryptionMethod, EncryptionKeyManager as DBEncryptionKeyManager, - FileEncryptionInfo as DBFileEncryptionInfo, + DBEncryptionMethod, EncryptionKeyManager, FileEncryptionInfo as DBFileEncryptionInfo, }; use 
crate::{r2e, raw::Env}; @@ -15,29 +14,29 @@ use crate::{r2e, raw::Env}; pub(crate) fn get_env( base_env: Option>, key_manager: Option>, -) -> engine_traits::Result> { - let base_env = base_env.unwrap_or_else(|| Arc::new(Env::default())); +) -> engine_traits::Result>> { if let Some(manager) = key_manager { - Ok(Arc::new( + let base_env = base_env.unwrap_or_else(|| Arc::new(Env::default())); + Ok(Some(Arc::new( Env::new_key_managed_encrypted_env(base_env, WrappedEncryptionKeyManager { manager }) .map_err(r2e)?, - )) + ))) } else { Ok(base_env) } } -pub struct WrappedEncryptionKeyManager { - manager: Arc, +pub struct WrappedEncryptionKeyManager { + manager: Arc, } -impl WrappedEncryptionKeyManager { - pub fn new(manager: Arc) -> Self { +impl WrappedEncryptionKeyManager { + pub fn new(manager: Arc) -> Self { Self { manager } } } -impl DBEncryptionKeyManager for WrappedEncryptionKeyManager { +impl EncryptionKeyManager for WrappedEncryptionKeyManager { fn get_file(&self, fname: &str) -> Result { self.manager .get_file(fname) diff --git a/components/engine_rocks/src/engine.rs b/components/engine_rocks/src/engine.rs index 293b74e3bca..7de0ffd0dbe 100644 --- a/components/engine_rocks/src/engine.rs +++ b/components/engine_rocks/src/engine.rs @@ -2,7 +2,9 @@ use std::{any::Any, sync::Arc}; -use engine_traits::{IterOptions, Iterable, KvEngine, Peekable, ReadOptions, Result, SyncMutable}; +use engine_traits::{ + IterOptions, Iterable, KvEngine, Peekable, ReadOptions, Result, SnapshotContext, SyncMutable, +}; use rocksdb::{DBIterator, Writable, DB}; use crate::{ @@ -182,7 +184,7 @@ impl RocksEngine { impl KvEngine for RocksEngine { type Snapshot = RocksSnapshot; - fn snapshot(&self) -> RocksSnapshot { + fn snapshot(&self, _: Option) -> RocksSnapshot { RocksSnapshot::new(self.db.clone()) } @@ -292,7 +294,7 @@ mod tests { engine.put_msg(key, &r).unwrap(); engine.put_msg_cf(cf, key, &r).unwrap(); - let snap = engine.snapshot(); + let snap = engine.snapshot(None); let mut r1: Region 
= engine.get_msg(key).unwrap().unwrap(); assert_eq!(r, r1); diff --git a/components/engine_rocks/src/event_listener.rs b/components/engine_rocks/src/event_listener.rs index 9628c61c23f..03a40d005c8 100644 --- a/components/engine_rocks/src/event_listener.rs +++ b/components/engine_rocks/src/event_listener.rs @@ -194,8 +194,15 @@ impl rocksdb::EventListener for RocksPersistenceListener { fn on_memtable_sealed(&self, info: &MemTableInfo) { // Note: first_seqno is effectively the smallest seqno of memtable. // earliest_seqno has ambiguous semantics. - self.0 - .on_memtable_sealed(info.cf_name().to_string(), info.first_seqno()); + self.0.on_memtable_sealed( + info.cf_name().to_string(), + info.first_seqno(), + info.largest_seqno(), + ); + } + + fn on_flush_begin(&self, _: &FlushJobInfo) { + fail::fail_point!("on_flush_begin"); } fn on_flush_completed(&self, job: &FlushJobInfo) { diff --git a/components/engine_rocks/src/lib.rs b/components/engine_rocks/src/lib.rs index b5561b3de42..5afa5452344 100644 --- a/components/engine_rocks/src/lib.rs +++ b/components/engine_rocks/src/lib.rs @@ -113,14 +113,15 @@ pub use rocksdb::{ }; pub mod flow_control_factors; +use ::encryption::DataKeyManager; pub use flow_control_factors::*; pub mod raw; pub fn get_env( - key_manager: Option>, + key_manager: Option>, limiter: Option>, ) -> engine_traits::Result> { let env = encryption::get_env(None /* base_env */, key_manager)?; - file_system::get_env(Some(env), limiter) + file_system::get_env(env, limiter) } diff --git a/components/engine_rocks/src/misc.rs b/components/engine_rocks/src/misc.rs index b1406cacdb8..f82e1e68832 100644 --- a/components/engine_rocks/src/misc.rs +++ b/components/engine_rocks/src/misc.rs @@ -448,6 +448,11 @@ impl MiscExt for RocksEngine { .get(); Ok(n) } + + type DiskEngine = RocksEngine; + fn get_disk_engine(&self) -> &Self::DiskEngine { + self + } } #[cfg(test)] diff --git a/components/engine_rocks/src/rocks_metrics.rs 
b/components/engine_rocks/src/rocks_metrics.rs index 522696cb150..2b32af111ec 100644 --- a/components/engine_rocks/src/rocks_metrics.rs +++ b/components/engine_rocks/src/rocks_metrics.rs @@ -920,6 +920,7 @@ struct CfStats { blob_cache_size: Option, readers_mem: Option, mem_tables: Option, + mem_tables_all: Option, num_keys: Option, pending_compaction_bytes: Option, num_immutable_mem_table: Option, @@ -978,6 +979,9 @@ impl StatisticsReporter for RocksStatisticsReporter { if let Some(v) = db.get_property_int_cf(handle, ROCKSDB_CUR_SIZE_ALL_MEM_TABLES) { *cf_stats.mem_tables.get_or_insert_default() += v; } + if let Some(v) = db.get_property_int_cf(handle, ROCKSDB_SIZE_ALL_MEM_TABLES) { + *cf_stats.mem_tables_all.get_or_insert_default() += v; + } // TODO: add cache usage and pinned usage. if let Some(v) = db.get_property_int_cf(handle, ROCKSDB_ESTIMATE_NUM_KEYS) { *cf_stats.num_keys.get_or_insert_default() += v; @@ -1119,6 +1123,11 @@ impl StatisticsReporter for RocksStatisticsReporter { .with_label_values(&[&self.name, cf, "mem-tables"]) .set(v as i64); } + if let Some(v) = cf_stats.mem_tables_all { + STORE_ENGINE_MEMORY_GAUGE_VEC + .with_label_values(&[&self.name, cf, "mem-tables-all"]) + .set(v as i64); + } if let Some(v) = cf_stats.num_keys { STORE_ENGINE_ESTIMATE_NUM_KEYS_VEC .with_label_values(&[&self.name, cf]) @@ -1538,9 +1547,9 @@ lazy_static! 
{ "Number of times titan blob file sync is done", &["db"] ).unwrap(); - pub static ref STORE_ENGINE_BLOB_FILE_SYNCED: SimpleEngineTickerMetrics = - auto_flush_from!(STORE_ENGINE_BLOB_FILE_SYNCED_VEC, SimpleEngineTickerMetrics); - + pub static ref STORE_ENGINE_BLOB_FILE_SYNCED: SimpleEngineTickerMetrics = + auto_flush_from!(STORE_ENGINE_BLOB_FILE_SYNCED_VEC, SimpleEngineTickerMetrics); + pub static ref STORE_ENGINE_BLOB_CACHE_EFFICIENCY_VEC: IntCounterVec = register_int_counter_vec!( "tikv_engine_blob_cache_efficiency", "Efficiency of titan's blob cache", diff --git a/components/engine_rocks/src/rocks_metrics_defs.rs b/components/engine_rocks/src/rocks_metrics_defs.rs index 042949f1c09..5bbc6245c72 100644 --- a/components/engine_rocks/src/rocks_metrics_defs.rs +++ b/components/engine_rocks/src/rocks_metrics_defs.rs @@ -5,6 +5,7 @@ use rocksdb::{DBStatisticsHistogramType as HistType, DBStatisticsTickerType as T pub const ROCKSDB_TOTAL_SST_FILES_SIZE: &str = "rocksdb.total-sst-files-size"; pub const ROCKSDB_TABLE_READERS_MEM: &str = "rocksdb.estimate-table-readers-mem"; pub const ROCKSDB_CUR_SIZE_ALL_MEM_TABLES: &str = "rocksdb.cur-size-all-mem-tables"; +pub const ROCKSDB_SIZE_ALL_MEM_TABLES: &str = "rocksdb.size-all-mem-tables"; pub const ROCKSDB_ESTIMATE_NUM_KEYS: &str = "rocksdb.estimate-num-keys"; pub const ROCKSDB_PENDING_COMPACTION_BYTES: &str = "rocksdb.\ estimate-pending-compaction-bytes"; diff --git a/components/engine_rocks/src/sst.rs b/components/engine_rocks/src/sst.rs index 145fa9a7bce..1030b7aa17f 100644 --- a/components/engine_rocks/src/sst.rs +++ b/components/engine_rocks/src/sst.rs @@ -2,22 +2,20 @@ use std::{path::PathBuf, sync::Arc}; +use ::encryption::DataKeyManager; use engine_traits::{ - EncryptionKeyManager, Error, ExternalSstFileInfo, IterOptions, Iterator, RefIterable, Result, - SstCompressionType, SstExt, SstMetaInfo, SstReader, SstWriter, SstWriterBuilder, CF_DEFAULT, + Error, ExternalSstFileInfo, IterOptions, Iterator, RefIterable, Result, 
SstCompressionType, + SstExt, SstReader, SstWriter, SstWriterBuilder, CF_DEFAULT, }; use fail::fail_point; -use kvproto::import_sstpb::SstMeta; +use file_system::get_io_rate_limiter; use rocksdb::{ rocksdb::supported_compression, ColumnFamilyOptions, DBCompressionType, DBIterator, Env, EnvOptions, ExternalSstFileInfo as RawExternalSstFileInfo, SequentialFile, SstFileReader, SstFileWriter, DB, }; -use tikv_util::box_err; -use crate::{ - encryption::WrappedEncryptionKeyManager, engine::RocksEngine, options::RocksReadOptions, r2e, -}; +use crate::{engine::RocksEngine, get_env, options::RocksReadOptions, r2e}; impl SstExt for RocksEngine { type SstReader = RocksSstReader; @@ -30,19 +28,6 @@ pub struct RocksSstReader { } impl RocksSstReader { - pub fn sst_meta_info(&self, sst: SstMeta) -> SstMetaInfo { - let mut meta = SstMetaInfo { - total_kvs: 0, - total_bytes: 0, - meta: sst, - }; - self.inner.read_table_properties(|p| { - meta.total_kvs = p.num_entries(); - meta.total_bytes = p.raw_key_size() + p.raw_value_size(); - }); - meta - } - pub fn open_with_env(path: &str, env: Option>) -> Result { let mut cf_options = ColumnFamilyOptions::new(); if let Some(env) = env { @@ -63,20 +48,23 @@ impl RocksSstReader { } impl SstReader for RocksSstReader { - fn open(path: &str) -> Result { - Self::open_with_env(path, None) - } - fn open_encrypted(path: &str, mgr: Arc) -> Result { - let env = Env::new_key_managed_encrypted_env( - Arc::default(), - WrappedEncryptionKeyManager::new(mgr), - ) - .map_err(|err| Error::Other(box_err!("failed to open encrypted env: {}", err)))?; - Self::open_with_env(path, Some(Arc::new(env))) + fn open(path: &str, mgr: Option>) -> Result { + let env = get_env(mgr, get_io_rate_limiter())?; + Self::open_with_env(path, Some(env)) } + fn verify_checksum(&self) -> Result<()> { - self.inner.verify_checksum().map_err(r2e)?; - Ok(()) + self.inner.verify_checksum().map_err(r2e) + } + + fn kv_count_and_size(&self) -> (u64, u64) { + let mut count = 0; + let mut 
bytes = 0; + self.inner.read_table_properties(|p| { + count = p.num_entries(); + bytes = p.raw_key_size() + p.raw_value_size(); + }); + (count, bytes) } } diff --git a/components/engine_rocks/src/sst_partitioner.rs b/components/engine_rocks/src/sst_partitioner.rs index fc1dcd40270..f642a94f28f 100644 --- a/components/engine_rocks/src/sst_partitioner.rs +++ b/components/engine_rocks/src/sst_partitioner.rs @@ -23,6 +23,8 @@ impl rocksdb::SstPartitionerFactory output_level: context.output_level, smallest_key: context.smallest_key, largest_key: context.largest_key, + next_level_boundaries: context.next_level_boundaries.clone(), + next_level_sizes: context.next_level_sizes.clone(), }; self.0.create_partitioner(&ctx).map(RocksSstPartitioner) } diff --git a/components/engine_rocks/src/util.rs b/components/engine_rocks/src/util.rs index 225cd1d7f06..e4991419eed 100644 --- a/components/engine_rocks/src/util.rs +++ b/components/engine_rocks/src/util.rs @@ -3,6 +3,7 @@ use std::{ffi::CString, fs, path::Path, str::FromStr, sync::Arc}; use engine_traits::{Engines, Range, Result, CF_DEFAULT}; +use fail::fail_point; use rocksdb::{ load_latest_options, CColumnFamilyDescriptor, CFHandle, ColumnFamilyOptions, CompactionFilter, CompactionFilterContext, CompactionFilterDecision, CompactionFilterFactory, @@ -462,6 +463,13 @@ pub struct RangeCompactionFilterFactory(Arc); impl RangeCompactionFilterFactory { pub fn new(start_key: Box<[u8]>, end_key: Box<[u8]>) -> Self { + fail_point!("unlimited_range_compaction_filter", |_| { + let range = OwnedRange { + start_key: keys::data_key(b"").into_boxed_slice(), + end_key: keys::data_end_key(b"").into_boxed_slice(), + }; + Self(Arc::new(range)) + }); let range = OwnedRange { start_key, end_key }; Self(Arc::new(range)) } diff --git a/components/engine_test/src/lib.rs b/components/engine_test/src/lib.rs index dd56d9a5db4..85d9d4c1b78 100644 --- a/components/engine_test/src/lib.rs +++ b/components/engine_test/src/lib.rs @@ -127,7 +127,7 @@ pub mod 
kv { } fn destroy_tablet(&self, _ctx: TabletContext, path: &Path) -> Result<()> { - encryption::trash_dir_all(path, self.db_opt.key_manager.as_deref())?; + encryption::trash_dir_all(path, self.db_opt.get_key_manager().as_deref())?; Ok(()) } @@ -202,13 +202,17 @@ pub mod ctor { #[derive(Clone, Default)] pub struct DbOptions { - pub(crate) key_manager: Option>, + key_manager: Option>, rate_limiter: Option>, state_storage: Option>, enable_multi_batch_write: bool, } impl DbOptions { + pub fn get_key_manager(&self) -> Option> { + self.key_manager.clone() + } + pub fn set_key_manager(&mut self, key_manager: Option>) { self.key_manager = key_manager; } diff --git a/components/engine_traits/Cargo.toml b/components/engine_traits/Cargo.toml index 2d11b59f623..30f1861be0a 100644 --- a/components/engine_traits/Cargo.toml +++ b/components/engine_traits/Cargo.toml @@ -11,6 +11,7 @@ testexport = [] [dependencies] case_macros = { workspace = true } collections = { workspace = true } +encryption = { workspace = true } error_code = { workspace = true } fail = "0.5" file_system = { workspace = true } diff --git a/components/engine_traits/src/compact.rs b/components/engine_traits/src/compact.rs index 05590a1ff32..2a4341a6788 100644 --- a/components/engine_traits/src/compact.rs +++ b/components/engine_traits/src/compact.rs @@ -71,6 +71,9 @@ pub trait CompactExt: CfNamesExt { max_subcompactions: u32, exclude_l0: bool, ) -> Result<()>; + + // Check all data is in the range [start, end). 
+ fn check_in_range(&self, start: Option<&[u8]>, end: Option<&[u8]>) -> Result<()>; } pub trait CompactedEvent: Send { diff --git a/components/engine_traits/src/db_options.rs b/components/engine_traits/src/db_options.rs index 2c6e9c3d4e8..9713c406978 100644 --- a/components/engine_traits/src/db_options.rs +++ b/components/engine_traits/src/db_options.rs @@ -21,6 +21,7 @@ pub trait DbOptions { fn get_rate_limiter_auto_tuned(&self) -> Option; fn set_rate_limiter_auto_tuned(&mut self, rate_limiter_auto_tuned: bool) -> Result<()>; fn set_flush_size(&mut self, f: usize) -> Result<()>; + fn get_flush_size(&self) -> Result; fn set_flush_oldest_first(&mut self, f: bool) -> Result<()>; fn set_titandb_options(&mut self, opts: &Self::TitanDbOptions); } diff --git a/components/engine_traits/src/encryption.rs b/components/engine_traits/src/encryption.rs deleted file mode 100644 index 7376e2d5592..00000000000 --- a/components/engine_traits/src/encryption.rs +++ /dev/null @@ -1,62 +0,0 @@ -// Copyright 2020 TiKV Project Authors. Licensed under Apache-2.0. - -use std::{ - fmt::{self, Debug, Formatter}, - io::Result, -}; - -pub trait EncryptionKeyManager: Sync + Send { - fn get_file(&self, fname: &str) -> Result; - fn new_file(&self, fname: &str) -> Result; - /// Can be used with both file and directory. - /// - /// `physical_fname` is a hint when `fname` was renamed physically. - /// Depending on the implementation, providing false negative or false - /// positive value may result in leaking encryption keys. 
- fn delete_file(&self, fname: &str, physical_fname: Option<&str>) -> Result<()>; - fn link_file(&self, src_fname: &str, dst_fname: &str) -> Result<()>; -} - -#[derive(Clone, PartialEq)] -pub struct FileEncryptionInfo { - pub method: EncryptionMethod, - pub key: Vec, - pub iv: Vec, -} -impl Default for FileEncryptionInfo { - fn default() -> Self { - FileEncryptionInfo { - method: EncryptionMethod::Unknown, - key: vec![], - iv: vec![], - } - } -} - -impl Debug for FileEncryptionInfo { - fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { - write!( - f, - "FileEncryptionInfo [method={:?}, key=...<{} bytes>, iv=...<{} bytes>]", - self.method, - self.key.len(), - self.iv.len() - ) - } -} - -impl FileEncryptionInfo { - pub fn is_empty(&self) -> bool { - self.key.is_empty() && self.iv.is_empty() - } -} - -#[derive(Copy, Clone, Debug, PartialEq)] -pub enum EncryptionMethod { - Unknown = 0, - Plaintext = 1, - Aes128Ctr = 2, - Aes192Ctr = 3, - Aes256Ctr = 4, - Sm4Ctr = 5, -} diff --git a/components/engine_traits/src/engine.rs b/components/engine_traits/src/engine.rs index cc90f2ce075..83f05180820 100644 --- a/components/engine_traits/src/engine.rs +++ b/components/engine_traits/src/engine.rs @@ -39,7 +39,10 @@ pub trait KvEngine: type Snapshot: Snapshot; /// Create a snapshot - fn snapshot(&self) -> Self::Snapshot; + /// + /// SnapCtx will only be used by some type of trait implementors (ex: + /// HybridEngine) + fn snapshot(&self, snap_ctx: Option) -> Self::Snapshot; /// Syncs any writes to disk fn sync(&self) -> Result<()>; @@ -78,3 +81,9 @@ pub trait KvEngine: #[cfg(feature = "testexport")] fn inner_refcount(&self) -> usize; } + +#[derive(Debug, Clone)] +pub struct SnapshotContext { + pub region_id: u64, + pub read_ts: u64, +} diff --git a/components/engine_traits/src/errors.rs b/components/engine_traits/src/errors.rs index 6ef46ff7a70..6df2ef5a992 100644 --- a/components/engine_traits/src/errors.rs +++ b/components/engine_traits/src/errors.rs @@ -149,6 +149,8 @@ pub 
enum Error { EntriesUnavailable, #[error("The entries of region is compacted")] EntriesCompacted, + #[error("Iterator of RegionCacheSnapshot is only supported with boundary set")] + BoundaryNotSet, } pub type Result = result::Result; @@ -165,6 +167,7 @@ impl ErrorCodeExt for Error { Error::Other(_) => error_code::UNKNOWN, Error::EntriesUnavailable => error_code::engine::DATALOSS, Error::EntriesCompacted => error_code::engine::DATACOMPACTED, + Error::BoundaryNotSet => error_code::engine::BOUNDARY_NOT_SET, } } } diff --git a/components/engine_traits/src/flush.rs b/components/engine_traits/src/flush.rs index eebf0e7c32a..8590236e126 100644 --- a/components/engine_traits/src/flush.rs +++ b/components/engine_traits/src/flush.rs @@ -18,14 +18,17 @@ use std::{ atomic::{AtomicU64, Ordering}, Arc, Mutex, RwLock, }, + time::Duration, }; use kvproto::import_sstpb::SstMeta; -use slog_global::info; -use tikv_util::set_panic_mark; +use slog_global::{info, warn}; +use tikv_util::{set_panic_mark, time::Instant}; use crate::{data_cf_offset, RaftEngine, RaftLogBatch, DATA_CFS_LEN}; +const HEAVY_WORKER_THRESHOLD: Duration = Duration::from_millis(25); + #[derive(Debug)] pub struct ApplyProgress { cf: String, @@ -203,7 +206,16 @@ impl PersistenceListener { /// Called when memtable is frozen. /// /// `smallest_seqno` should be the smallest seqno of the memtable. - pub fn on_memtable_sealed(&self, cf: String, smallest_seqno: u64) { + /// + /// Note: After https://github.com/tikv/rocksdb/pull/347, rocksdb global lock will + /// be held during this method, so we should avoid do heavy things in it. + pub fn on_memtable_sealed(&self, cf: String, smallest_seqno: u64, largest_seqno: u64) { + let t = Instant::now_coarse(); + (|| { + fail_point!("on_memtable_sealed", |t| { + assert_eq!(t.unwrap().as_str(), cf); + }) + })(); // The correctness relies on the assumption that there will be only one // thread writting to the DB and increasing apply index. 
// Apply index will be set within DB lock, so it's correct even with manual @@ -214,8 +226,9 @@ impl PersistenceListener { let flushed = prs.last_flushed[offset]; if flushed > smallest_seqno { panic!( - "sealed seqno has been flushed {} {} {} <= {}", - cf, apply_index, smallest_seqno, flushed + "sealed seqno conflict with latest flushed index, cf {}, + sealed smallest_seqno {}, sealed largest_seqno {}, last_flushed {}, apply_index {}", + cf, smallest_seqno, largest_seqno, flushed, apply_index, ); } prs.prs.push_back(ApplyProgress { @@ -223,13 +236,18 @@ impl PersistenceListener { apply_index, smallest_seqno, }); + if t.saturating_elapsed() > HEAVY_WORKER_THRESHOLD { + warn!( + "heavy work in on_memtable_sealed, the code should be reviewed"; + ); + } } /// Called a memtable finished flushing. /// /// `largest_seqno` should be the largest seqno of the generated file. pub fn on_flush_completed(&self, cf: &str, largest_seqno: u64, file_no: u64) { - fail_point!("on_flush_completed"); + fail_point!("on_flush_completed", |_| {}); // Maybe we should hook the compaction to avoid the file is compacted before // being recorded. let offset = data_cf_offset(cf); @@ -239,7 +257,13 @@ impl PersistenceListener { if flushed >= largest_seqno { // According to facebook/rocksdb#11183, it's possible OnFlushCompleted can be // called out of order. But it's guaranteed files are installed in order. 
- info!("flush complete reorder found"; "flushed" => flushed, "largest_seqno" => largest_seqno, "file_no" => file_no, "cf" => cf); + info!( + "flush complete reorder found"; + "flushed" => flushed, + "largest_seqno" => largest_seqno, + "file_no" => file_no, + "cf" => cf + ); return; } prs.last_flushed[offset] = largest_seqno; diff --git a/components/engine_traits/src/lib.rs b/components/engine_traits/src/lib.rs index e09b1b52733..53708994561 100644 --- a/components/engine_traits/src/lib.rs +++ b/components/engine_traits/src/lib.rs @@ -295,8 +295,6 @@ mod sst; pub use crate::sst::*; mod write_batch; pub use crate::write_batch::*; -mod encryption; -pub use crate::encryption::*; mod mvcc_properties; mod sst_partitioner; pub use crate::sst_partitioner::*; @@ -314,6 +312,8 @@ mod table_properties; pub use crate::table_properties::*; mod checkpoint; pub use crate::checkpoint::*; +mod memory_engine; +pub use memory_engine::RegionCacheEngine; // These modules contain more general traits, some of which may be implemented // by multiple types. diff --git a/components/engine_traits/src/memory_engine.rs b/components/engine_traits/src/memory_engine.rs new file mode 100644 index 00000000000..9babc8580fc --- /dev/null +++ b/components/engine_traits/src/memory_engine.rs @@ -0,0 +1,19 @@ +// Copyright 2023 TiKV Project Authors. Licensed under Apache-2.0. + +use std::fmt::Debug; + +use crate::{Iterable, Snapshot, WriteBatchExt}; + +/// RegionCacheEngine works as a region cache caching some regions (in Memory or +/// NVME for instance) to improve the read performance. +pub trait RegionCacheEngine: + WriteBatchExt + Iterable + Debug + Clone + Unpin + Send + Sync + 'static +{ + type Snapshot: Snapshot; + + // If None is returned, the RegionCacheEngine is currently not readable for this + // region or read_ts. 
+ // Sequence number is shared between RegionCacheEngine and disk KvEnigne to + // provide atomic write + fn snapshot(&self, region_id: u64, read_ts: u64, seq_num: u64) -> Option; +} diff --git a/components/engine_traits/src/misc.rs b/components/engine_traits/src/misc.rs index 7871b3b8ecc..ad93db44231 100644 --- a/components/engine_traits/src/misc.rs +++ b/components/engine_traits/src/misc.rs @@ -178,4 +178,7 @@ pub trait MiscExt: CfNamesExt + FlowControlFactorsExt + WriteBatchExt { } Ok(n) } + + type DiskEngine; + fn get_disk_engine(&self) -> &Self::DiskEngine; } diff --git a/components/engine_traits/src/sst.rs b/components/engine_traits/src/sst.rs index 4a728df1e97..036c8999e3f 100644 --- a/components/engine_traits/src/sst.rs +++ b/components/engine_traits/src/sst.rs @@ -2,9 +2,10 @@ use std::{path::PathBuf, sync::Arc}; +use encryption::DataKeyManager; use kvproto::import_sstpb::SstMeta; -use crate::{errors::Result, EncryptionKeyManager, RefIterable}; +use crate::{errors::Result, RefIterable}; #[derive(Clone, Debug)] pub struct SstMetaInfo { @@ -20,10 +21,10 @@ pub trait SstExt: Sized { } /// SstReader is used to read an SST file. -pub trait SstReader: RefIterable + Sized { - fn open(path: &str) -> Result; - fn open_encrypted(path: &str, mgr: Arc) -> Result; +pub trait SstReader: RefIterable + Sized + Send { + fn open(path: &str, mgr: Option>) -> Result; fn verify_checksum(&self) -> Result<()>; + fn kv_count_and_size(&self) -> (u64, u64); } /// SstWriter is used to create sst files that can be added to database later. 
diff --git a/components/engine_traits/src/sst_partitioner.rs b/components/engine_traits/src/sst_partitioner.rs index bc6ec13a4eb..4a8ee9e71bc 100644 --- a/components/engine_traits/src/sst_partitioner.rs +++ b/components/engine_traits/src/sst_partitioner.rs @@ -22,6 +22,8 @@ pub struct SstPartitionerContext<'a> { pub output_level: i32, pub smallest_key: &'a [u8], pub largest_key: &'a [u8], + pub next_level_boundaries: Vec<&'a [u8]>, + pub next_level_sizes: Vec, } pub trait SstPartitioner { diff --git a/components/engine_traits_tests/src/ctor.rs b/components/engine_traits_tests/src/ctor.rs index 5d987d64858..ba3154d9267 100644 --- a/components/engine_traits_tests/src/ctor.rs +++ b/components/engine_traits_tests/src/ctor.rs @@ -9,7 +9,7 @@ use engine_test::{ ctor::{CfOptions, DbOptions, KvEngineConstructorExt}, kv::KvTestEngine, }; -use engine_traits::{EncryptionKeyManager, KvEngine, Peekable, SyncMutable, ALL_CFS, CF_DEFAULT}; +use engine_traits::{KvEngine, Peekable, SyncMutable, ALL_CFS, CF_DEFAULT}; use super::tempdir; diff --git a/components/engine_traits_tests/src/iterator.rs b/components/engine_traits_tests/src/iterator.rs index 714ca4cb0b4..fee6cda6f02 100644 --- a/components/engine_traits_tests/src/iterator.rs +++ b/components/engine_traits_tests/src/iterator.rs @@ -41,7 +41,9 @@ fn iter_empty_engine() { #[test] fn iter_empty_snapshot() { let db = default_engine(); - iter_empty(&db.engine, |e| e.snapshot().iterator(CF_DEFAULT).unwrap()); + iter_empty(&db.engine, |e| { + e.snapshot(None).iterator(CF_DEFAULT).unwrap() + }); } fn iter_forward(e: &E, i: IF) @@ -99,7 +101,9 @@ fn iter_forward_engine() { #[test] fn iter_forward_snapshot() { let db = default_engine(); - iter_forward(&db.engine, |e| e.snapshot().iterator(CF_DEFAULT).unwrap()); + iter_forward(&db.engine, |e| { + e.snapshot(None).iterator(CF_DEFAULT).unwrap() + }); } fn iter_reverse(e: &E, i: IF) @@ -157,7 +161,9 @@ fn iter_reverse_engine() { #[test] fn iter_reverse_snapshot() { let db = 
default_engine(); - iter_reverse(&db.engine, |e| e.snapshot().iterator(CF_DEFAULT).unwrap()); + iter_reverse(&db.engine, |e| { + e.snapshot(None).iterator(CF_DEFAULT).unwrap() + }); } fn seek_to_key_then_forward(e: &E, i: IF) @@ -198,7 +204,9 @@ fn seek_to_key_then_forward_engine() { #[test] fn seek_to_key_then_forward_snapshot() { let db = default_engine(); - seek_to_key_then_forward(&db.engine, |e| e.snapshot().iterator(CF_DEFAULT).unwrap()); + seek_to_key_then_forward(&db.engine, |e| { + e.snapshot(None).iterator(CF_DEFAULT).unwrap() + }); } fn seek_to_key_then_reverse(e: &E, i: IF) @@ -239,7 +247,9 @@ fn seek_to_key_then_reverse_engine() { #[test] fn seek_to_key_then_reverse_snapshot() { let db = default_engine(); - seek_to_key_then_reverse(&db.engine, |e| e.snapshot().iterator(CF_DEFAULT).unwrap()); + seek_to_key_then_reverse(&db.engine, |e| { + e.snapshot(None).iterator(CF_DEFAULT).unwrap() + }); } fn iter_forward_then_reverse(e: &E, i: IF) @@ -300,7 +310,9 @@ fn iter_forward_then_reverse_engine() { #[test] fn iter_forward_then_reverse_snapshot() { let db = default_engine(); - iter_forward_then_reverse(&db.engine, |e| e.snapshot().iterator(CF_DEFAULT).unwrap()); + iter_forward_then_reverse(&db.engine, |e| { + e.snapshot(None).iterator(CF_DEFAULT).unwrap() + }); } fn iter_reverse_then_forward(e: &E, i: IF) @@ -361,7 +373,9 @@ fn iter_reverse_then_forward_engine() { #[test] fn iter_reverse_then_forward_snapshot() { let db = default_engine(); - iter_reverse_then_forward(&db.engine, |e| e.snapshot().iterator(CF_DEFAULT).unwrap()); + iter_reverse_then_forward(&db.engine, |e| { + e.snapshot(None).iterator(CF_DEFAULT).unwrap() + }); } // When seek finds an exact key then seek_for_prev behaves just like seek @@ -405,7 +419,9 @@ fn seek_for_prev_engine() { #[test] fn seek_for_prev_snapshot() { let db = default_engine(); - seek_for_prev(&db.engine, |e| e.snapshot().iterator(CF_DEFAULT).unwrap()); + seek_for_prev(&db.engine, |e| { + 
e.snapshot(None).iterator(CF_DEFAULT).unwrap() + }); } // When Seek::Key doesn't find an exact match, @@ -440,7 +456,9 @@ fn seek_key_miss_engine() { #[test] fn seek_key_miss_snapshot() { let db = default_engine(); - seek_key_miss(&db.engine, |e| e.snapshot().iterator(CF_DEFAULT).unwrap()); + seek_key_miss(&db.engine, |e| { + e.snapshot(None).iterator(CF_DEFAULT).unwrap() + }); } fn seek_key_prev_miss(e: &E, i: IF) @@ -472,5 +490,7 @@ fn seek_key_prev_miss_engine() { #[test] fn seek_key_prev_miss_snapshot() { let db = default_engine(); - seek_key_prev_miss(&db.engine, |e| e.snapshot().iterator(CF_DEFAULT).unwrap()); + seek_key_prev_miss(&db.engine, |e| { + e.snapshot(None).iterator(CF_DEFAULT).unwrap() + }); } diff --git a/components/engine_traits_tests/src/read_consistency.rs b/components/engine_traits_tests/src/read_consistency.rs index 8c7ab50657f..35d0262fbcb 100644 --- a/components/engine_traits_tests/src/read_consistency.rs +++ b/components/engine_traits_tests/src/read_consistency.rs @@ -12,7 +12,7 @@ fn snapshot_with_writes() { db.engine.put(b"a", b"aa").unwrap(); - let snapshot = db.engine.snapshot(); + let snapshot = db.engine.snapshot(None); assert_eq!(snapshot.get_value(b"a").unwrap().unwrap(), b"aa"); @@ -77,5 +77,7 @@ fn iterator_with_writes_engine() { #[test] fn iterator_with_writes_snapshot() { let db = default_engine(); - iterator_with_writes(&db.engine, |e| e.snapshot().iterator(CF_DEFAULT).unwrap()); + iterator_with_writes(&db.engine, |e| { + e.snapshot(None).iterator(CF_DEFAULT).unwrap() + }); } diff --git a/components/engine_traits_tests/src/snapshot_basic.rs b/components/engine_traits_tests/src/snapshot_basic.rs index c0f93480830..83248abfb6e 100644 --- a/components/engine_traits_tests/src/snapshot_basic.rs +++ b/components/engine_traits_tests/src/snapshot_basic.rs @@ -10,7 +10,7 @@ fn snapshot_get_value() { db.engine.put(b"a", b"aa").unwrap(); - let snap = db.engine.snapshot(); + let snap = db.engine.snapshot(None); let value = 
snap.get_value(b"a").unwrap(); let value = value.unwrap(); @@ -26,7 +26,7 @@ fn snapshot_get_value_after_put() { db.engine.put(b"a", b"aa").unwrap(); - let snap = db.engine.snapshot(); + let snap = db.engine.snapshot(None); db.engine.put(b"a", b"aaa").unwrap(); @@ -41,7 +41,7 @@ fn snapshot_get_value_cf() { db.engine.put_cf(CF_WRITE, b"a", b"aa").unwrap(); - let snap = db.engine.snapshot(); + let snap = db.engine.snapshot(None); let value = snap.get_value_cf(CF_WRITE, b"a").unwrap(); let value = value.unwrap(); @@ -57,7 +57,7 @@ fn snapshot_get_value_cf_after_put() { db.engine.put_cf(CF_WRITE, b"a", b"aa").unwrap(); - let snap = db.engine.snapshot(); + let snap = db.engine.snapshot(None); db.engine.put_cf(CF_WRITE, b"a", b"aaa").unwrap(); diff --git a/components/engine_traits_tests/src/sst.rs b/components/engine_traits_tests/src/sst.rs index 26ed686aad4..77258e649ff 100644 --- a/components/engine_traits_tests/src/sst.rs +++ b/components/engine_traits_tests/src/sst.rs @@ -48,7 +48,7 @@ fn basic() -> Result<()> { sst_writer.put(b"k1", b"v1")?; sst_writer.finish()?; - let sst_reader = ::SstReader::open(&sst_path)?; + let sst_reader = ::SstReader::open(&sst_path, None)?; let mut iter = sst_reader.iter(IterOptions::default()).unwrap(); iter.seek_to_first()?; @@ -77,7 +77,7 @@ fn forward() -> Result<()> { sst_writer.put(b"k2", b"v2")?; sst_writer.finish()?; - let sst_reader = ::SstReader::open(&sst_path)?; + let sst_reader = ::SstReader::open(&sst_path, None)?; let mut iter = sst_reader.iter(IterOptions::default()).unwrap(); iter.seek_to_first()?; @@ -114,7 +114,7 @@ fn reverse() -> Result<()> { sst_writer.put(b"k2", b"v2")?; sst_writer.finish()?; - let sst_reader = ::SstReader::open(&sst_path)?; + let sst_reader = ::SstReader::open(&sst_path, None)?; let mut iter = sst_reader.iter(IterOptions::default()).unwrap(); iter.seek_to_last()?; @@ -152,7 +152,7 @@ fn delete() -> Result<()> { sst_writer.delete(b"k1")?; sst_writer.finish()?; - let sst_reader = 
::SstReader::open(&sst_path)?; + let sst_reader = ::SstReader::open(&sst_path, None)?; let mut iter = sst_reader.iter(IterOptions::default()).unwrap(); iter.seek_to_first()?; @@ -206,7 +206,7 @@ fn same_key() -> Result<()> { sst_writer.finish()?; - let sst_reader = ::SstReader::open(&sst_path)?; + let sst_reader = ::SstReader::open(&sst_path, None)?; let mut iter = sst_reader.iter(IterOptions::default()).unwrap(); iter.seek_to_first()?; @@ -248,7 +248,7 @@ fn reverse_key() -> Result<()> { sst_writer.finish()?; - let sst_reader = ::SstReader::open(&sst_path)?; + let sst_reader = ::SstReader::open(&sst_path, None)?; let mut iter = sst_reader.iter(IterOptions::default()).unwrap(); iter.seek_to_first()?; diff --git a/components/error_code/src/backup_stream.rs b/components/error_code/src/backup_stream.rs index a4b28b0e9ee..78cb544746d 100644 --- a/components/error_code/src/backup_stream.rs +++ b/components/error_code/src/backup_stream.rs @@ -3,9 +3,6 @@ define_error_codes! { "KV:LogBackup:", - ETCD => ("Etcd", - "Error during requesting the meta store(etcd)", - "Please check the connectivity between TiKV and PD."), PROTO => ("Proto", "Error during decode / encoding protocol buffer messages", "Please check the version of TiKV / BR are compatible, or whether data is corrupted." 
diff --git a/components/error_code/src/engine.rs b/components/error_code/src/engine.rs index 4bb66f09753..4ae712ffa58 100644 --- a/components/error_code/src/engine.rs +++ b/components/error_code/src/engine.rs @@ -10,5 +10,6 @@ define_error_codes!( CF_NAME => ("CfName", "", ""), CODEC => ("Codec", "", ""), DATALOSS => ("DataLoss", "", ""), - DATACOMPACTED => ("DataCompacted", "", "") + DATACOMPACTED => ("DataCompacted", "", ""), + BOUNDARY_NOT_SET => ("BoundaryNotSet", "", "") ); diff --git a/components/error_code/src/sst_importer.rs b/components/error_code/src/sst_importer.rs index 001f4f146f6..9e568ee00c1 100644 --- a/components/error_code/src/sst_importer.rs +++ b/components/error_code/src/sst_importer.rs @@ -22,5 +22,10 @@ define_error_codes!( TTL_LEN_NOT_EQUALS_TO_PAIRS => ("TtlLenNotEqualsToPairs", "", ""), INCOMPATIBLE_API_VERSION => ("IncompatibleApiVersion", "", ""), INVALID_KEY_MODE => ("InvalidKeyMode", "", ""), - RESOURCE_NOT_ENOUTH => ("ResourceNotEnough", "", "") + RESOURCE_NOT_ENOUTH => ("ResourceNotEnough", "", ""), + SUSPENDED => ("Suspended", + "this request has been suspended.", + "Probably there are some export tools don't support exporting data inserted by `ingest`(say, snapshot backup). 
Check the user manual and stop them."), + REQUEST_TOO_NEW => ("RequestTooNew", "", ""), + REQUEST_TOO_OLD => ("RequestTooOld", "", "") ); diff --git a/components/external_storage/Cargo.toml b/components/external_storage/Cargo.toml index aed49aad3ab..69de83e5474 100644 --- a/components/external_storage/Cargo.toml +++ b/components/external_storage/Cargo.toml @@ -4,39 +4,24 @@ version = "0.0.1" edition = "2021" publish = false -[features] -cloud-storage-dylib = [ - "ffi-support", - "libloading", - "protobuf", -] -cloud-storage-grpc = [ - "grpcio", -] -failpoints = ["fail/failpoints"] - [dependencies] async-compression = { version = "0.3.14", features = ["futures-io", "zstd"] } async-trait = "0.1" -bytes = "1.0" +aws = { workspace = true } +azure = { workspace = true } +cloud = { workspace = true } encryption = { workspace = true } engine_traits = { workspace = true } -fail = "0.5" -ffi-support = { optional = true, version = "0.4.2" } file_system = { workspace = true } futures = "0.3" -futures-executor = "0.3" futures-io = "0.3" futures-util = { version = "0.3", default-features = false, features = ["io"] } -grpcio = { workspace = true, optional = true } +gcp = { workspace = true } kvproto = { workspace = true } lazy_static = "1.3" -libloading = { optional = true, version = "0.7.0" } -openssl = "0.10" +openssl = { workspace = true } prometheus = { version = "0.13", default-features = false, features = ["nightly", "push"] } -protobuf = { optional = true, version = "2" } rand = "0.8" -rusoto_core = "0.46.0" slog = { workspace = true } # better to not use slog-global, but pass in the logger slog-global = { workspace = true } @@ -52,3 +37,7 @@ rust-ini = "0.14.0" structopt = "0.3" tempfile = "3.1" tokio = { version = "1.5", features = ["macros"] } + +[[example]] +name = "scli" +path = "examples/scli.rs" diff --git a/components/external_storage/export/examples/scli.rs b/components/external_storage/examples/scli.rs similarity index 75% rename from 
components/external_storage/export/examples/scli.rs rename to components/external_storage/examples/scli.rs index 0ab54721b29..9621f840e6c 100644 --- a/components/external_storage/export/examples/scli.rs +++ b/components/external_storage/examples/scli.rs @@ -6,19 +6,13 @@ use std::{ path::Path, }; -#[cfg(feature = "cloud-azure")] -use external_storage_export::make_azblob_backend; -#[cfg(feature = "cloud-gcp")] -use external_storage_export::make_gcs_backend; -#[cfg(feature = "cloud-aws")] -use external_storage_export::make_s3_backend; -use external_storage_export::{ - create_storage, make_cloud_backend, make_hdfs_backend, make_local_backend, make_noop_backend, - ExternalStorage, UnpinReader, +use external_storage::{ + create_storage, make_azblob_backend, make_gcs_backend, make_hdfs_backend, make_local_backend, + make_noop_backend, make_s3_backend, ExternalStorage, UnpinReader, }; use futures_util::io::{copy, AllowStdIo}; use ini::ini::Ini; -use kvproto::brpb::{AzureBlobStorage, Bucket, CloudDynamic, Gcs, StorageBackend, S3}; +use kvproto::brpb::{AzureBlobStorage, Gcs, StorageBackend, S3}; use structopt::{clap::arg_enum, StructOpt}; use tikv_util::stream::block_on_external_io; use tokio::runtime::Runtime; @@ -32,7 +26,6 @@ arg_enum! 
{ S3, GCS, Azure, - Cloud, } } @@ -67,8 +60,6 @@ pub struct Opt { /// Remote path prefix #[structopt(short = "x", long)] prefix: Option, - #[structopt(long)] - cloud_name: Option, #[structopt(subcommand)] command: Command, } @@ -82,35 +73,6 @@ enum Command { Load, } -fn create_cloud_storage(opt: &Opt) -> Result { - let mut bucket = Bucket::default(); - if let Some(endpoint) = &opt.endpoint { - bucket.endpoint = endpoint.to_string(); - } - if let Some(region) = &opt.region { - bucket.region = region.to_string(); - } - if let Some(bucket_name) = &opt.bucket { - bucket.bucket = bucket_name.to_string(); - } else { - return Err(Error::new(ErrorKind::Other, "missing bucket")); - } - if let Some(prefix) = &opt.prefix { - bucket.prefix = prefix.to_string(); - } - let mut config = CloudDynamic::default(); - config.set_bucket(bucket); - let mut attrs = std::collections::HashMap::new(); - if let Some(credential_file) = &opt.credential_file { - attrs.insert("credential_file".to_owned(), credential_file.clone()); - } - config.set_attrs(attrs); - if let Some(cloud_name) = &opt.cloud_name { - config.provider_name = cloud_name.clone(); - } - Ok(make_cloud_backend(config)) -} - fn create_s3_storage(opt: &Opt) -> Result { let mut config = S3::default(); @@ -150,10 +112,7 @@ fn create_s3_storage(opt: &Opt) -> Result { if let Some(prefix) = &opt.prefix { config.prefix = prefix.to_string(); } - #[cfg(feature = "cloud-aws")] - return Ok(make_s3_backend(config)); - #[cfg(not(feature = "cloud-aws"))] - return Err(Error::new(ErrorKind::Other, "missing feature")); + Ok(make_s3_backend(config)) } fn create_gcs_storage(opt: &Opt) -> Result { @@ -173,10 +132,7 @@ fn create_gcs_storage(opt: &Opt) -> Result { if let Some(prefix) = &opt.prefix { config.prefix = prefix.to_string(); } - #[cfg(feature = "cloud-gcp")] - return Ok(make_gcs_backend(config)); - #[cfg(not(feature = "cloud-gcp"))] - return Err(Error::new(ErrorKind::Other, "missing feature")); + Ok(make_gcs_backend(config)) } fn 
create_azure_storage(opt: &Opt) -> Result { @@ -212,10 +168,7 @@ fn create_azure_storage(opt: &Opt) -> Result { if let Some(prefix) = &opt.prefix { config.prefix = prefix.to_string(); } - #[cfg(feature = "cloud-azure")] - return Ok(make_azblob_backend(config)); - #[cfg(not(feature = "cloud-azure"))] - return Err(Error::new(ErrorKind::Other, "missing feature")); + Ok(make_azblob_backend(config)) } fn process() -> Result<()> { @@ -228,7 +181,6 @@ fn process() -> Result<()> { StorageType::S3 => create_s3_storage(&opt)?, StorageType::GCS => create_gcs_storage(&opt)?, StorageType::Azure => create_azure_storage(&opt)?, - StorageType::Cloud => create_cloud_storage(&opt)?, }), Default::default(), )?; diff --git a/components/external_storage/export/Cargo.toml b/components/external_storage/export/Cargo.toml deleted file mode 100644 index 6537eaf8995..00000000000 --- a/components/external_storage/export/Cargo.toml +++ /dev/null @@ -1,96 +0,0 @@ -[package] -name = "external_storage_export" -version = "0.0.1" -edition = "2021" -publish = false - -[[bin]] -name = "tikv-cloud-storage" -path = "src/bin/tikv-cloud-storage.rs" -required-features = ["cloud-storage-grpc"] - -[lib] -name = "external_storage_export" -# Experimental feature to load the cloud storage code dynamically -# crate-type = ["lib", "cdylib"] - -[features] -default = ["cloud-gcp", "cloud-aws", "cloud-azure"] -cloud-aws = ["aws"] -cloud-gcp = ["gcp"] -cloud-azure = ["azure"] -cloud-storage-dylib = [ - "external_storage/cloud-storage-dylib", - "ffi-support", - "file_system", - "futures", - "libloading", - "lazy_static", - "once_cell", - "protobuf", - "slog", - "slog-global", - "tokio", - "tokio-util", -] -cloud-storage-grpc = [ - "external_storage/cloud-storage-grpc", - "grpcio", - "file_system", - "futures", - "futures-executor", - "libc", - "signal-hook", - "slog", - "slog-global", - "slog-term", - "tokio", - "tokio-util", -] - -[dependencies] -async-compression = { version = "0.3.14", features = ["futures-io", 
"zstd"] } -async-trait = "0.1" -aws = { optional = true, workspace = true } -azure = { optional = true, workspace = true } -cloud = { workspace = true } -encryption = { workspace = true } -engine_traits = { workspace = true } -external_storage = { workspace = true } -ffi-support = { optional = true, version = "0.4.2" } -file_system = { workspace = true, optional = true } -futures = { optional = true, version = "0.3" } -futures-executor = { optional = true, version = "0.3" } -futures-io = { version = "0.3" } -futures-util = { version = "0.3", default-features = false, features = ["io"] } -gcp = { optional = true, workspace = true } -grpcio = { workspace = true, optional = true } -kvproto = { workspace = true } -lazy_static = { optional = true, version = "1.3" } -libloading = { optional = true, version = "0.7.0" } -once_cell = { optional = true, version = "1.3.1" } -protobuf = { optional = true, version = "2" } -slog-global = { optional = true, workspace = true } -tikv_util = { workspace = true } -tokio = { version = "1.5", features = ["time", "rt", "net"], optional = true } -tokio-util = { version = "0.7", features = ["compat"], optional = true } -url = "2.0" - -[dev-dependencies] -futures-util = { version = "0.3", default-features = false, features = ["io"] } -matches = "0.1.8" -rust-ini = "0.14.0" -structopt = "0.3" -tempfile = "3.1" -tokio = { version = "1.5", features = ["time"] } - -[[example]] -name = "scli" -path = "examples/scli.rs" - -[target.'cfg(unix)'.dependencies] -nix = { optional = true, version = "0.24" } -signal-hook = { optional = true, version = "0.3" } -libc = { optional = true, version = "0.2" } -slog = { optional = true, version = "2.3", features = ["max_level_trace", "release_max_level_debug"] } -slog-term = { optional = true, version = "2.4" } diff --git a/components/external_storage/export/src/bin/tikv-cloud-storage.rs b/components/external_storage/export/src/bin/tikv-cloud-storage.rs deleted file mode 100644 index 07cd8507948..00000000000 
--- a/components/external_storage/export/src/bin/tikv-cloud-storage.rs +++ /dev/null @@ -1,61 +0,0 @@ -// Copyright 2021 TiKV Project Authors. Licensed under Apache-2.0. - -use external_storage_export::new_service; -use grpcio::{self}; -use slog::{self}; -use slog_global::{info, warn}; -use tikv_util::logger::{self}; - -fn build_logger(drainer: D, log_level: slog::Level) -where - D: slog::Drain + Send + 'static, - ::Err: std::fmt::Display, -{ - // use async drainer and init std log. - logger::init_log(drainer, log_level, true, true, vec![], 100).unwrap_or_else(|e| { - println!("failed to initialize log: {}", e); - }); -} - -fn main() { - println!("starting GRPC cloud-storage service"); - let decorator = slog_term::PlainDecorator::new(std::io::stdout()); - let drain = slog_term::CompactFormat::new(decorator).build(); - build_logger(drain, slog::Level::Debug); - warn!("redirect grpcio logging"); - grpcio::redirect_log(); - info!("slog logging"); - let service = new_service().expect("GRPC service creation for tikv-cloud-storage"); - wait::for_signal(); - info!("service {:?}", service); -} - -#[cfg(unix)] -mod wait { - use libc::c_int; - use signal_hook::{ - consts::{SIGHUP, SIGINT, SIGTERM, SIGUSR1, SIGUSR2}, - iterator::Signals, - Signals, - }; - use slog_global::info; - - pub fn for_signal() { - let mut signals = Signals::new(&[SIGTERM, SIGINT, SIGHUP]).unwrap(); - for signal in &mut signals { - match signal { - SIGTERM | SIGINT | SIGHUP => { - info!("receive signal {}, stopping server...", signal); - break; - } - // TODO: handle more signals - _ => unreachable!(), - } - } - } -} - -#[cfg(not(unix))] -mod wait { - pub fn for_signal() {} -} diff --git a/components/external_storage/export/src/dylib.rs b/components/external_storage/export/src/dylib.rs deleted file mode 100644 index 308973de95e..00000000000 --- a/components/external_storage/export/src/dylib.rs +++ /dev/null @@ -1,247 +0,0 @@ -// Copyright 2021 TiKV Project Authors. Licensed under Apache-2.0. 
- -use std::sync::Mutex; - -use anyhow::Context; -use kvproto::brpb as proto; -pub use kvproto::brpb::StorageBackend_oneof_backend as Backend; -use lazy_static::lazy_static; -use once_cell::sync::OnceCell; -use protobuf::{self}; -use slog_global::{error, info}; -use tokio::runtime::{Builder, Runtime}; - -use crate::request::{restore_receiver, write_receiver}; - -static RUNTIME: OnceCell = OnceCell::new(); -lazy_static! { - static ref RUNTIME_INIT: Mutex<()> = Mutex::new(()); -} - -/// # Safety -/// Deref data pointer, thus unsafe -#[no_mangle] -pub extern "C" fn external_storage_init(error: &mut ffi_support::ExternError) { - ffi_support::call_with_result(error, || { - (|| -> anyhow::Result<()> { - let guarded = RUNTIME_INIT.lock().unwrap(); - if RUNTIME.get().is_some() { - return Ok(()); - } - let runtime = Builder::new() - .basic_scheduler() - .thread_name("external-storage-dylib") - .core_threads(1) - .enable_all() - .build() - .context("build runtime")?; - if RUNTIME.set(runtime).is_err() { - error!("runtime already set") - } - #[allow(clippy::unit_arg)] - Ok(*guarded) - })() - .context("external_storage_init") - .map_err(anyhow_to_extern_err) - }) -} - -/// # Safety -/// Deref data pointer, thus unsafe -#[no_mangle] -pub unsafe extern "C" fn external_storage_write( - data: *const u8, - len: i32, - error: &mut ffi_support::ExternError, -) { - ffi_support::call_with_result(error, || { - (|| -> anyhow::Result<()> { - let runtime = RUNTIME - .get() - .context("must first call external_storage_init")?; - let buffer = get_buffer(data, len); - let req: proto::ExternalStorageWriteRequest = protobuf::parse_from_bytes(buffer)?; - info!("write request {:?}", req.get_object_name()); - write_receiver(&runtime, req) - })() - .context("external_storage_write") - .map_err(anyhow_to_extern_err) - }) -} - -/// # Safety -/// Deref data pointer, thus unsafe -pub unsafe extern "C" fn external_storage_restore( - data: *const u8, - len: i32, - error: &mut ffi_support::ExternError, -) 
{ - ffi_support::call_with_result(error, || { - (|| -> anyhow::Result<()> { - let runtime = RUNTIME - .get() - .context("must first call external_storage_init")?; - let buffer = get_buffer(data, len); - let req: proto::ExternalStorageRestoreRequest = protobuf::parse_from_bytes(buffer)?; - info!("restore request {:?}", req.get_object_name()); - Ok(restore_receiver(runtime, req)?) - })() - .context("external_storage_restore") - .map_err(anyhow_to_extern_err) - }) -} - -unsafe fn get_buffer<'a>(data: *const u8, len: i32) -> &'a [u8] { - assert!(len >= 0, "Bad buffer len: {}", len); - if len == 0 { - // This will still fail, but as a bad protobuf format. - &[] - } else { - assert!(!data.is_null(), "Unexpected null data pointer"); - std::slice::from_raw_parts(data, len as usize) - } -} - -fn anyhow_to_extern_err(e: anyhow::Error) -> ffi_support::ExternError { - ffi_support::ExternError::new_error(ffi_support::ErrorCode::new(1), format!("{:?}", e)) -} - -pub mod staticlib { - use std::{ - io::{self}, - sync::Arc, - }; - - use external_storage::{ - dylib_client::extern_to_io_err, - request::{ - anyhow_to_io_log_error, file_name_for_write, restore_sender, write_sender, DropPath, - }, - ExternalStorage, - }; - use futures_io::AsyncRead; - use protobuf::Message; - use tikv_util::time::Limiter; - - use super::*; - - struct ExternalStorageClient { - backend: Backend, - runtime: Arc, - name: &'static str, - url: url::Url, - } - - pub fn new_client( - backend: Backend, - name: &'static str, - url: url::Url, - ) -> io::Result> { - let runtime = Builder::new() - .basic_scheduler() - .thread_name("external-storage-dylib-client") - .core_threads(1) - .enable_all() - .build()?; - external_storage_init_ffi()?; - Ok(Box::new(ExternalStorageClient { - runtime: Arc::new(runtime), - backend, - name, - url, - }) as _) - } - - impl ExternalStorage for ExternalStorageClient { - fn name(&self) -> &'static str { - self.name - } - - fn url(&self) -> io::Result { - Ok(self.url.clone()) - } - - 
fn write( - &self, - name: &str, - reader: Box, - content_length: u64, - ) -> io::Result<()> { - info!("external storage writing"); - (|| -> anyhow::Result<()> { - let file_path = file_name_for_write(&self.name, &name); - let req = write_sender( - &self.runtime, - self.backend.clone(), - file_path.clone(), - name, - reader, - content_length, - )?; - let bytes = req.write_to_bytes()?; - info!("write request"); - external_storage_write_ffi(bytes)?; - DropPath(file_path); - Ok(()) - })() - .context("external storage write") - .map_err(anyhow_to_io_log_error) - } - - fn read(&self, _name: &str) -> crate::ExternalData<'_> { - unimplemented!("use restore instead of read") - } - - fn restore( - &self, - storage_name: &str, - restore_name: std::path::PathBuf, - expected_length: u64, - speed_limiter: &Limiter, - ) -> io::Result<()> { - info!("external storage restore"); - let req = restore_sender( - self.backend.clone(), - storage_name, - restore_name, - expected_length, - speed_limiter, - )?; - let bytes = req.write_to_bytes()?; - external_storage_restore_ffi(bytes) - } - } - - fn external_storage_write_ffi(bytes: Vec) -> io::Result<()> { - let mut e = ffi_support::ExternError::default(); - unsafe { - external_storage_write(bytes.as_ptr(), bytes.len() as i32, &mut e); - } - if e.get_code() != ffi_support::ErrorCode::SUCCESS { - Err(extern_to_io_err(e)) - } else { - Ok(()) - } - } - - fn external_storage_restore_ffi(bytes: Vec) -> io::Result<()> { - let mut e = ffi_support::ExternError::default(); - unsafe { - external_storage_restore(bytes.as_ptr(), bytes.len() as i32, &mut e); - } - if e.get_code() != ffi_support::ErrorCode::SUCCESS { - Err(extern_to_io_err(e)) - } else { - Ok(()) - } - } - - fn external_storage_init_ffi() -> io::Result<()> { - let mut e = ffi_support::ExternError::default(); - external_storage_init(&mut e); - if e.get_code() != ffi_support::ErrorCode::SUCCESS { - return Err(extern_to_io_err(e)); - } - Ok(()) - } -} diff --git 
a/components/external_storage/export/src/grpc_service.rs b/components/external_storage/export/src/grpc_service.rs deleted file mode 100644 index 7ef2bd093d1..00000000000 --- a/components/external_storage/export/src/grpc_service.rs +++ /dev/null @@ -1,131 +0,0 @@ -// Copyright 2021 TiKV Project Authors. Licensed under Apache-2.0. - -use std::{ - io::{self, ErrorKind}, - sync::Arc, -}; - -use anyhow::Context; -use external_storage::request::anyhow_to_io_log_error; -use grpcio::{self}; -use kvproto::brpb as proto; -use slog_global::{error, info}; -use tokio::runtime::{Builder, Runtime}; - -use crate::request::{restore_receiver, write_receiver}; - -#[derive(Debug)] -pub struct SocketService { - server: grpcio::Server, - listener: std::os::unix::net::UnixListener, -} - -pub fn new_service() -> io::Result { - (|| -> anyhow::Result { - let env = Arc::new(grpcio::EnvBuilder::new().build()); - let storage_service = Service::new().context("new storage service")?; - let builder = grpcio::ServerBuilder::new(env) - .register_service(proto::create_external_storage(storage_service)); - let grpc_socket_path = "/tmp/grpc-external-storage.sock"; - let socket_addr = format!("unix:{}", grpc_socket_path); - let socket_path = std::path::PathBuf::from(grpc_socket_path); - // Keep the listener in scope: otherwise the socket is destroyed - let listener = bind_socket(&socket_path).context("GRPC new service create socket")?; - let mut server = builder - .bind(socket_addr, 0) - .build() - .context("GRPC build server")?; - server.start(); - let (..) = server.bind_addrs().next().context("GRPC bind server")?; - Ok(SocketService { server, listener }) - })() - .context("new service") - .map_err(anyhow_to_io_log_error) -} - -/// Service handles the RPC messages for the `ExternalStorage` service. -#[derive(Clone)] -pub struct Service { - runtime: Arc, -} - -impl Service { - /// Create a new backup service. 
- pub fn new() -> io::Result { - let runtime = Arc::new( - Builder::new() - .basic_scheduler() - .thread_name("external-storage-grpc-service") - .core_threads(1) - .enable_all() - .build()?, - ); - Ok(Service { runtime }) - } -} - -impl proto::ExternalStorage for Service { - fn save( - &mut self, - _ctx: grpcio::RpcContext, - req: proto::ExternalStorageWriteRequest, - sink: grpcio::UnarySink, - ) { - info!("write request {:?}", req.get_object_name()); - let result = write_receiver(&self.runtime, req); - match result { - Ok(_) => { - let rsp = proto::ExternalStorageWriteResponse::default(); - info!("success write"); - sink.success(rsp); - } - Err(e) => { - error!("write {}", e); - sink.fail(make_rpc_error(anyhow_to_io_log_error(e))); - } - } - } - - fn restore( - &mut self, - _ctx: grpcio::RpcContext, - req: proto::ExternalStorageRestoreRequest, - sink: grpcio::UnarySink, - ) { - info!( - "restore request {:?} {:?}", - req.get_object_name(), - req.get_restore_name() - ); - let result = restore_receiver(&self.runtime, req); - match result { - Ok(_) => { - let rsp = proto::ExternalStorageRestoreResponse::default(); - info!("success restore"); - sink.success(rsp); - } - Err(e) => { - error!("restore {}", e); - sink.fail(make_rpc_error(e)); - } - } - } -} - -pub fn make_rpc_error(err: io::Error) -> grpcio::RpcStatus { - grpcio::RpcStatus::new( - match err.kind() { - ErrorKind::NotFound => grpcio::RpcStatusCode::NOT_FOUND, - ErrorKind::InvalidInput => grpcio::RpcStatusCode::INVALID_ARGUMENT, - ErrorKind::PermissionDenied => grpcio::RpcStatusCode::UNAUTHENTICATED, - _ => grpcio::RpcStatusCode::UNKNOWN, - }, - Some(format!("{:?}", err)), - ) -} - -fn bind_socket(socket_path: &std::path::Path) -> anyhow::Result { - let msg = format!("bind socket {:?}", &socket_path); - info!("{}", msg); - std::os::unix::net::UnixListener::bind(&socket_path).context(msg) -} diff --git a/components/external_storage/export/src/lib.rs b/components/external_storage/export/src/lib.rs deleted file 
mode 100644 index e04e5beb695..00000000000 --- a/components/external_storage/export/src/lib.rs +++ /dev/null @@ -1,15 +0,0 @@ -// Copyright 2021 TiKV Project Authors. Licensed under Apache-2.0. - -mod export; -pub use export::*; - -#[cfg(feature = "cloud-storage-grpc")] -mod grpc_service; -#[cfg(feature = "cloud-storage-grpc")] -pub use grpc_service::new_service; - -#[cfg(feature = "cloud-storage-dylib")] -mod dylib; - -#[cfg(any(feature = "cloud-storage-grpc", feature = "cloud-storage-dylib"))] -mod request; diff --git a/components/external_storage/export/src/request.rs b/components/external_storage/export/src/request.rs deleted file mode 100644 index 5623c0732d7..00000000000 --- a/components/external_storage/export/src/request.rs +++ /dev/null @@ -1,91 +0,0 @@ -// Copyright 2021 TiKV Project Authors. Licensed under Apache-2.0. - -use std::io::{self}; - -use anyhow::Context; -use external_storage::request::file_name_for_write; -use file_system::File; -use futures::executor::block_on; -use futures_io::AsyncRead; -use kvproto::brpb as proto; -pub use kvproto::brpb::StorageBackend_oneof_backend as Backend; -use slog_global::info; -use tikv_util::time::Limiter; -use tokio::runtime::Runtime; -use tokio_util::compat::Tokio02AsyncReadCompatExt; - -use crate::export::{create_storage_no_client, read_external_storage_into_file, ExternalStorage}; - -pub fn write_receiver( - runtime: &Runtime, - req: proto::ExternalStorageWriteRequest, -) -> anyhow::Result<()> { - let storage_backend = req.get_storage_backend(); - let object_name = req.get_object_name(); - let content_length = req.get_content_length(); - let storage = create_storage_no_client(storage_backend).context("create storage")?; - let file_path = file_name_for_write(storage.name(), object_name); - let reader = runtime - .enter(|| block_on(open_file_as_async_read(file_path))) - .context("open file")?; - storage - .write(object_name, reader, content_length) - .context("storage write") -} - -pub fn restore_receiver( - 
runtime: &Runtime, - req: proto::ExternalStorageRestoreRequest, -) -> io::Result<()> { - let object_name = req.get_object_name(); - let storage_backend = req.get_storage_backend(); - let file_name = std::path::PathBuf::from(req.get_restore_name()); - let expected_length = req.get_content_length(); - runtime.enter(|| { - block_on(restore_inner( - storage_backend, - object_name, - file_name, - expected_length, - )) - }) -} - -pub async fn restore_inner( - storage_backend: &proto::StorageBackend, - object_name: &str, - file_name: std::path::PathBuf, - expected_length: u64, -) -> io::Result<()> { - let storage = create_storage_no_client(&storage_backend)?; - // TODO: support encryption. The service must be launched with or sent a - // DataKeyManager - let output: &mut dyn io::Write = &mut File::create(file_name)?; - // the minimum speed of reading data, in bytes/second. - // if reading speed is slower than this rate, we will stop with - // a "TimedOut" error. - // (at 8 KB/s for a 2 MB buffer, this means we timeout after 4m16s.) - const MINIMUM_READ_SPEED: usize = 8192; - let limiter = Limiter::new(f64::INFINITY); - let x = read_external_storage_into_file( - &mut storage.read(object_name), - output, - &limiter, - expected_length, - None, - MINIMUM_READ_SPEED, - ) - .await; - x -} - -async fn open_file_as_async_read( - file_path: std::path::PathBuf, -) -> anyhow::Result> { - info!("open file {:?}", &file_path); - let f = tokio::fs::File::open(file_path) - .await - .context("open file")?; - let reader: Box = Box::new(Box::pin(f.compat())); - Ok(reader) -} diff --git a/components/external_storage/src/dylib_client.rs b/components/external_storage/src/dylib_client.rs deleted file mode 100644 index 9e2748c2011..00000000000 --- a/components/external_storage/src/dylib_client.rs +++ /dev/null @@ -1,169 +0,0 @@ -// Copyright 2021 TiKV Project Authors. Licensed under Apache-2.0. 
- -use std::{ - io::{self, ErrorKind}, - sync::Arc, -}; - -use anyhow::Context; -use futures_io::AsyncRead; -pub use kvproto::brpb::StorageBackend_oneof_backend as Backend; -use protobuf::{self, Message}; -use slog_global::info; -use tikv_util::time::Limiter; -use tokio::runtime::{Builder, Runtime}; - -use crate::{ - request::{ - anyhow_to_io_log_error, file_name_for_write, restore_sender, write_sender, DropPath, - }, - ExternalStorage, -}; - -struct ExternalStorageClient { - backend: Backend, - runtime: Arc, - library: libloading::Library, - name: &'static str, - url: url::Url, -} - -pub fn new_client( - backend: Backend, - name: &'static str, - url: url::Url, -) -> io::Result> { - let runtime = Builder::new() - .basic_scheduler() - .thread_name("external-storage-dylib-client") - .core_threads(1) - .enable_all() - .build()?; - let library = unsafe { - libloading::Library::new( - std::path::Path::new("./") - .join(libloading::library_filename("external_storage_export")), - ) - .map_err(libloading_err_to_io)? 
- }; - external_storage_init_ffi_dynamic(&library)?; - Ok(Box::new(ExternalStorageClient { - runtime: Arc::new(runtime), - backend, - library, - name, - url, - }) as _) -} - -impl ExternalStorage for ExternalStorageClient { - fn name(&self) -> &'static str { - self.name - } - - fn url(&self) -> io::Result { - Ok(self.url.clone()) - } - - fn write( - &self, - name: &str, - reader: Box, - content_length: u64, - ) -> io::Result<()> { - info!("external storage writing"); - (|| -> anyhow::Result<()> { - let file_path = file_name_for_write(&self.name, &name); - let req = write_sender( - &self.runtime, - self.backend.clone(), - file_path.clone(), - name, - reader, - content_length, - )?; - let bytes = req.write_to_bytes()?; - info!("write request"); - call_ffi_dynamic(&self.library, b"external_storage_write", bytes)?; - DropPath(file_path); - Ok(()) - })() - .context("external storage write") - .map_err(anyhow_to_io_log_error) - } - - fn read(&self, _name: &str) -> crate::ExternalData<'_> { - unimplemented!("use restore instead of read") - } - - fn restore( - &self, - storage_name: &str, - restore_name: std::path::PathBuf, - expected_length: u64, - speed_limiter: &Limiter, - ) -> io::Result<()> { - info!("external storage restore"); - let req = restore_sender( - self.backend.clone(), - storage_name, - restore_name, - expected_length, - speed_limiter, - )?; - let bytes = req.write_to_bytes()?; - call_ffi_dynamic(&self.library, b"external_storage_restore", bytes) - } -} - -pub fn extern_to_io_err(e: ffi_support::ExternError) -> io::Error { - io::Error::new(io::ErrorKind::Other, format!("{:?}", e)) -} - -type FfiInitFn<'a> = - libloading::Symbol<'a, unsafe extern "C" fn(error: &mut ffi_support::ExternError) -> ()>; -type FfiFn<'a> = libloading::Symbol< - 'a, - unsafe extern "C" fn(error: &mut ffi_support::ExternError, bytes: Vec) -> (), ->; - -fn external_storage_init_ffi_dynamic(library: &libloading::Library) -> io::Result<()> { - let mut e = 
ffi_support::ExternError::default(); - unsafe { - let func: FfiInitFn = library - .get(b"external_storage_init") - .map_err(libloading_err_to_io)?; - func(&mut e); - } - if e.get_code() != ffi_support::ErrorCode::SUCCESS { - return Err(extern_to_io_err(e)); - } - Ok(()) -} - -fn call_ffi_dynamic( - library: &libloading::Library, - fn_name: &[u8], - bytes: Vec, -) -> io::Result<()> { - let mut e = ffi_support::ExternError::default(); - unsafe { - let func: FfiFn = library.get(fn_name).map_err(libloading_err_to_io)?; - func(&mut e, bytes); - } - if e.get_code() != ffi_support::ErrorCode::SUCCESS { - return Err(extern_to_io_err(e)); - } - Ok(()) -} - -fn libloading_err_to_io(e: libloading::Error) -> io::Error { - // TODO: custom error type - let kind = match e { - libloading::Error::DlOpen { .. } | libloading::Error::DlOpenUnknown => { - ErrorKind::AddrNotAvailable - } - _ => ErrorKind::Other, - }; - io::Error::new(kind, format!("{}", e)) -} diff --git a/components/external_storage/export/src/export.rs b/components/external_storage/src/export.rs similarity index 54% rename from components/external_storage/export/src/export.rs rename to components/external_storage/src/export.rs index ad31dc363ae..5b69a793c12 100644 --- a/components/external_storage/export/src/export.rs +++ b/components/external_storage/src/export.rs @@ -1,41 +1,23 @@ // Copyright 2021 TiKV Project Authors. Licensed under Apache-2.0. -//! To use External storage with protobufs as an application, import this -//! module. external_storage contains the actual library code -//! 
Cloud provider backends are under components/cloud use std::{io, path::Path, sync::Arc}; use async_trait::async_trait; -#[cfg(feature = "cloud-aws")] pub use aws::{Config as S3Config, S3Storage}; -#[cfg(feature = "cloud-azure")] pub use azure::{AzureStorage, Config as AzureConfig}; -#[cfg(any(feature = "cloud-storage-dylib", feature = "cloud-storage-grpc"))] -use cloud::blob::BlobConfig; use cloud::blob::{BlobStorage, PutResource}; use encryption::DataKeyManager; -#[cfg(feature = "cloud-storage-dylib")] -use external_storage::dylib_client; -#[cfg(feature = "cloud-storage-grpc")] -use external_storage::grpc_client; -pub use external_storage::{ - compression_reader_dispatcher, encrypt_wrap_reader, read_external_storage_info_buff, - read_external_storage_into_file, record_storage_create, BackendConfig, ExternalData, - ExternalStorage, HdfsStorage, LocalStorage, NoopStorage, RestoreConfig, UnpinReader, - MIN_READ_SPEED, +use gcp::GcsStorage; +use kvproto::brpb::{ + AzureBlobStorage, Gcs, Noop, StorageBackend, StorageBackend_oneof_backend as Backend, S3, }; -#[cfg(feature = "cloud-gcp")] -pub use gcp::{Config as GcsConfig, GcsStorage}; -pub use kvproto::brpb::StorageBackend_oneof_backend as Backend; -#[cfg(any(feature = "cloud-gcp", feature = "cloud-aws", feature = "cloud-azure"))] -use kvproto::brpb::{AzureBlobStorage, Gcs, S3}; -use kvproto::brpb::{CloudDynamic, Noop, StorageBackend}; use tikv_util::time::{Instant, Limiter}; -#[cfg(feature = "cloud-storage-dylib")] -use tikv_util::warn; -#[cfg(feature = "cloud-storage-dylib")] -use crate::dylib; +use crate::{ + compression_reader_dispatcher, encrypt_wrap_reader, read_external_storage_into_file, + record_storage_create, BackendConfig, ExternalData, ExternalStorage, HdfsStorage, LocalStorage, + NoopStorage, RestoreConfig, UnpinReader, +}; pub fn create_storage( storage_backend: &StorageBackend, @@ -48,20 +30,6 @@ pub fn create_storage( } } -// when the flag cloud-storage-dylib or cloud-storage-grpc is set create_storage 
-// is automatically wrapped with a client This function is used by the -// library/server to avoid any wrapping -pub fn create_storage_no_client( - storage_backend: &StorageBackend, - config: BackendConfig, -) -> io::Result> { - if let Some(backend) = &storage_backend.backend { - create_backend_inner(backend, config) - } else { - Err(bad_storage_backend(storage_backend)) - } -} - fn bad_storage_backend(storage_backend: &StorageBackend) -> io::Error { io::Error::new( io::ErrorKind::NotFound, @@ -77,93 +45,11 @@ fn bad_backend(backend: Backend) -> io::Error { bad_storage_backend(&storage_backend) } -#[cfg(any(feature = "cloud-gcp", feature = "cloud-aws", feature = "cloud-azure"))] fn blob_store(store: Blob) -> Box { Box::new(BlobStore::new(store)) as Box } -#[cfg(feature = "cloud-storage-grpc")] -pub fn create_backend(backend: &Backend) -> io::Result> { - match create_config(backend) { - Some(config) => { - let conf = config?; - grpc_client::new_client(backend.clone(), conf.name(), conf.url()?) - } - None => Err(bad_backend(backend.clone())), - } -} - -#[cfg(feature = "cloud-storage-dylib")] -pub fn create_backend(backend: &Backend) -> io::Result> { - match create_config(backend) { - Some(config) => { - let conf = config?; - let r = dylib_client::new_client(backend.clone(), conf.name(), conf.url()?); - match r { - Err(e) if e.kind() == io::ErrorKind::AddrNotAvailable => { - warn!("could not open dll for external_storage_export"); - dylib::staticlib::new_client(backend.clone(), conf.name(), conf.url()?) 
- } - _ => r, - } - } - None => Err(bad_backend(backend.clone())), - } -} - -#[cfg(all( - not(feature = "cloud-storage-grpc"), - not(feature = "cloud-storage-dylib") -))] -pub fn create_backend( - backend: &Backend, - config: BackendConfig, -) -> io::Result> { - create_backend_inner(backend, config) -} - -#[cfg(any(feature = "cloud-storage-dylib", feature = "cloud-storage-grpc"))] -fn create_config(backend: &Backend) -> Option>> { - match backend { - #[cfg(feature = "cloud-aws")] - Backend::S3(config) => { - let conf = S3Config::from_input(config.clone()); - Some(conf.map(|c| Box::new(c) as Box)) - } - #[cfg(feature = "cloud-gcp")] - Backend::Gcs(config) => { - let conf = GcsConfig::from_input(config.clone()); - Some(conf.map(|c| Box::new(c) as Box)) - } - #[cfg(feature = "cloud-azure")] - Backend::AzureBlobStorage(config) => { - let conf = AzureConfig::from_input(config.clone()); - Some(conf.map(|c| Box::new(c) as Box)) - } - Backend::CloudDynamic(dyn_backend) => match dyn_backend.provider_name.as_str() { - #[cfg(feature = "cloud-aws")] - "aws" | "s3" => { - let conf = S3Config::from_cloud_dynamic(&dyn_backend); - Some(conf.map(|c| Box::new(c) as Box)) - } - #[cfg(feature = "cloud-gcp")] - "gcp" | "gcs" => { - let conf = GcsConfig::from_cloud_dynamic(&dyn_backend); - Some(conf.map(|c| Box::new(c) as Box)) - } - #[cfg(feature = "cloud-azure")] - "azure" | "azblob" => { - let conf = AzureConfig::from_cloud_dynamic(&dyn_backend); - Some(conf.map(|c| Box::new(c) as Box)) - } - _ => None, - }, - _ => None, - } -} - -/// Create a new storage from the given storage backend description. -fn create_backend_inner( +fn create_backend( backend: &Backend, backend_config: BackendConfig, ) -> io::Result> { @@ -176,30 +62,18 @@ fn create_backend_inner( Backend::Hdfs(hdfs) => { Box::new(HdfsStorage::new(&hdfs.remote, backend_config.hdfs_config)?) 
} - Backend::Noop(_) => { - Box::::default() as Box - } - #[cfg(feature = "cloud-aws")] + Backend::Noop(_) => Box::::default() as Box, Backend::S3(config) => { let mut s = S3Storage::from_input(config.clone())?; s.set_multi_part_size(backend_config.s3_multi_part_size); blob_store(s) } - #[cfg(feature = "cloud-gcp")] Backend::Gcs(config) => blob_store(GcsStorage::from_input(config.clone())?), - #[cfg(feature = "cloud-azure")] Backend::AzureBlobStorage(config) => blob_store(AzureStorage::from_input(config.clone())?), - Backend::CloudDynamic(dyn_backend) => match dyn_backend.provider_name.as_str() { - #[cfg(feature = "cloud-aws")] - "aws" | "s3" => blob_store(S3Storage::from_cloud_dynamic(dyn_backend)?), - #[cfg(feature = "cloud-gcp")] - "gcp" | "gcs" => blob_store(GcsStorage::from_cloud_dynamic(dyn_backend)?), - #[cfg(feature = "cloud-azure")] - "azure" | "azblob" => blob_store(AzureStorage::from_cloud_dynamic(dyn_backend)?), - _ => { - return Err(bad_backend(Backend::CloudDynamic(dyn_backend.clone()))); - } - }, + Backend::CloudDynamic(dyn_backend) => { + // CloudDynamic backend is no longer supported. 
+ return Err(bad_backend(Backend::CloudDynamic(dyn_backend.clone()))); + } #[allow(unreachable_patterns)] _ => return Err(bad_backend(backend.clone())), }; @@ -207,7 +81,6 @@ fn create_backend_inner( Ok(storage) } -#[cfg(feature = "cloud-aws")] // Creates a S3 `StorageBackend` pub fn make_s3_backend(config: S3) -> StorageBackend { let mut backend = StorageBackend::default(); @@ -236,26 +109,18 @@ pub fn make_noop_backend() -> StorageBackend { backend } -#[cfg(feature = "cloud-gcp")] pub fn make_gcs_backend(config: Gcs) -> StorageBackend { let mut backend = StorageBackend::default(); backend.set_gcs(config); backend } -#[cfg(feature = "cloud-azure")] pub fn make_azblob_backend(config: AzureBlobStorage) -> StorageBackend { let mut backend = StorageBackend::default(); backend.set_azure_blob_storage(config); backend } -pub fn make_cloud_backend(config: CloudDynamic) -> StorageBackend { - let mut backend = StorageBackend::default(); - backend.set_cloud_dynamic(config); - backend -} - #[cfg(test)] mod tests { use tempfile::Builder; diff --git a/components/external_storage/src/grpc_client.rs b/components/external_storage/src/grpc_client.rs deleted file mode 100644 index e836d8fb58a..00000000000 --- a/components/external_storage/src/grpc_client.rs +++ /dev/null @@ -1,134 +0,0 @@ -// Copyright 2021 TiKV Project Authors. Licensed under Apache-2.0. 
- -use std::{ - io::{self, ErrorKind}, - sync::Arc, -}; - -use anyhow::Context; -use futures_io::AsyncRead; -use grpcio::{self}; -use kvproto::brpb as proto; -pub use kvproto::brpb::StorageBackend_oneof_backend as Backend; -use tikv_util::time::Limiter; -use tokio::runtime::{Builder, Runtime}; - -use crate::{ - request::{ - anyhow_to_io_log_error, file_name_for_write, restore_sender, write_sender, DropPath, - }, - ExternalStorage, -}; - -struct ExternalStorageClient { - backend: Backend, - runtime: Arc, - rpc: proto::ExternalStorageClient, - name: &'static str, - url: url::Url, -} - -pub fn new_client( - backend: Backend, - name: &'static str, - url: url::Url, -) -> io::Result> { - let runtime = Builder::new() - .basic_scheduler() - .thread_name("external-storage-grpc-client") - .core_threads(1) - .enable_all() - .build()?; - Ok(Box::new(ExternalStorageClient { - backend, - runtime: Arc::new(runtime), - rpc: new_rpc_client()?, - name, - url, - })) -} - -fn new_rpc_client() -> io::Result { - let env = Arc::new(grpcio::EnvBuilder::new().build()); - let grpc_socket_path = "/tmp/grpc-external-storage.sock"; - let socket_addr = format!("unix:{}", grpc_socket_path); - let channel = grpcio::ChannelBuilder::new(env).connect(&socket_addr); - Ok(proto::ExternalStorageClient::new(channel)) -} - -impl ExternalStorage for ExternalStorageClient { - fn name(&self) -> &'static str { - self.name - } - - fn url(&self) -> io::Result { - Ok(self.url.clone()) - } - - fn write( - &self, - name: &str, - reader: Box, - content_length: u64, - ) -> io::Result<()> { - info!("external storage writing"); - (|| -> anyhow::Result<()> { - let file_path = file_name_for_write(&self.name, &name); - let req = write_sender( - &self.runtime, - self.backend.clone(), - file_path.clone(), - name, - reader, - content_length, - )?; - info!("grpc write request"); - self.rpc - .save(&req) - .map_err(rpc_error_to_io) - .context("rpc write")?; - info!("grpc write request finished"); - DropPath(file_path); - 
Ok(()) - })() - .context("external storage write") - .map_err(anyhow_to_io_log_error) - } - - fn read(&self, _name: &str) -> crate::ExternalData<'_> { - unimplemented!("use restore instead of read") - } - - fn restore( - &self, - storage_name: &str, - restore_name: std::path::PathBuf, - expected_length: u64, - speed_limiter: &Limiter, - ) -> io::Result<()> { - info!("external storage restore"); - let req = restore_sender( - self.backend.clone(), - storage_name, - restore_name, - expected_length, - speed_limiter, - )?; - self.rpc.restore(&req).map_err(rpc_error_to_io).map(|_| ()) - } -} - -pub fn rpc_error_to_io(err: grpcio::Error) -> io::Error { - let msg = format!("{}", err); - match err { - grpcio::Error::RpcFailure(status) => match status.status { - grpcio::RpcStatusCode::NOT_FOUND => io::Error::new(ErrorKind::NotFound, msg), - grpcio::RpcStatusCode::INVALID_ARGUMENT => io::Error::new(ErrorKind::InvalidInput, msg), - grpcio::RpcStatusCode::UNAUTHENTICATED => { - io::Error::new(ErrorKind::PermissionDenied, msg) - } - _ => io::Error::new(ErrorKind::Other, msg), - }, - _ => io::Error::new(ErrorKind::Other, msg), - } -} diff --git a/components/external_storage/src/lib.rs b/components/external_storage/src/lib.rs index dd021f14bf8..05dbf6f965d 100644 --- a/components/external_storage/src/lib.rs +++ b/components/external_storage/src/lib.rs @@ -17,8 +17,7 @@ use std::{ use async_compression::futures::bufread::ZstdDecoder; use async_trait::async_trait; -use encryption::{from_engine_encryption_method, DecrypterReader, Iv}; -use engine_traits::FileEncryptionInfo; +use encryption::{DecrypterReader, FileEncryptionInfo, Iv}; use file_system::File; use futures::io::BufReader; use futures_io::AsyncRead; @@ -40,12 +39,8 @@ mod noop; pub use noop::NoopStorage; mod metrics; use metrics::EXT_STORAGE_CREATE_HISTOGRAM; -#[cfg(feature = "cloud-storage-dylib")] -pub mod dylib_client; -#[cfg(feature = "cloud-storage-grpc")] -pub mod grpc_client; -#[cfg(any(feature = 
"cloud-storage-dylib", feature = "cloud-storage-grpc"))] -pub mod request; +mod export; +pub use export::*; pub fn record_storage_create(start: Instant, storage: &dyn ExternalStorage) { EXT_STORAGE_CREATE_HISTOGRAM @@ -253,7 +248,7 @@ pub fn encrypt_wrap_reader( let input = match file_crypter { Some(x) => Box::new(DecrypterReader::new( reader, - from_engine_encryption_method(x.method), + x.method, &x.key, Iv::from_slice(&x.iv)?, )?), diff --git a/components/external_storage/src/request.rs b/components/external_storage/src/request.rs deleted file mode 100644 index 7f1a81d49b7..00000000000 --- a/components/external_storage/src/request.rs +++ /dev/null @@ -1,101 +0,0 @@ -// Copyright 2021 TiKV Project Authors. Licensed under Apache-2.0. - -use std::io::{self, ErrorKind}; - -use anyhow::Context; -use futures::executor::block_on; -use futures_io::{AsyncRead, AsyncWrite}; -use kvproto::brpb as proto; -pub use kvproto::brpb::StorageBackend_oneof_backend as Backend; -use tikv_util::time::Limiter; -use tokio::runtime::Runtime; -use tokio_util::compat::Tokio02AsyncReadCompatExt; - -pub fn write_sender( - runtime: &Runtime, - backend: Backend, - file_path: std::path::PathBuf, - name: &str, - reader: Box, - content_length: u64, -) -> io::Result { - (|| -> anyhow::Result { - // TODO: the reader should write direct to the file_path - // currently it is copying into an intermediate buffer - // Writing to a file here uses up disk space - // But as a positive it gets the backup data out of the DB the fastest - // Currently this waits for the file to be completely written before sending to - // storage - runtime.enter(|| { - block_on(async { - let msg = |action: &str| format!("{} file {:?}", action, &file_path); - let f = tokio::fs::File::create(file_path.clone()) - .await - .context(msg("create"))?; - let mut writer: Box = Box::new(Box::pin(f.compat())); - futures_util::io::copy(reader, &mut writer) - .await - .context(msg("copy")) - }) - })?; - let mut req = 
proto::ExternalStorageWriteRequest::default(); - req.set_object_name(name.to_string()); - req.set_content_length(content_length); - let mut sb = proto::StorageBackend::default(); - sb.backend = Some(backend); - req.set_storage_backend(sb); - Ok(req) - })() - .context("write_sender") - .map_err(anyhow_to_io_log_error) -} - -pub fn restore_sender( - backend: Backend, - storage_name: &str, - restore_name: std::path::PathBuf, - expected_length: u64, - _speed_limiter: &Limiter, -) -> io::Result { - // TODO: send speed_limiter - let mut req = proto::ExternalStorageRestoreRequest::default(); - req.set_object_name(storage_name.to_string()); - let restore_str = restore_name.to_str().ok_or_else(|| { - io::Error::new( - ErrorKind::InvalidData, - format!("could not convert to str {:?}", &restore_name), - ) - })?; - req.set_restore_name(restore_str.to_string()); - req.set_content_length(expected_length); - let mut sb = proto::StorageBackend::default(); - sb.backend = Some(backend); - req.set_storage_backend(sb); - Ok(req) -} - -pub fn anyhow_to_io_log_error(err: anyhow::Error) -> io::Error { - let string = format!("{:#}", &err); - match err.downcast::() { - Ok(e) => { - // It will be difficult to propagate the context - // without changing the error type to anyhow or a custom TiKV error - error!("{}", string); - e - } - Err(_) => io::Error::new(ErrorKind::Other, string), - } -} - -pub fn file_name_for_write(storage_name: &str, object_name: &str) -> std::path::PathBuf { - let full_name = format!("{}-{}", storage_name, object_name); - std::env::temp_dir().join(full_name) -} - -pub struct DropPath(pub std::path::PathBuf); - -impl Drop for DropPath { - fn drop(&mut self) { - let _ = std::fs::remove_file(&self.0); - } -} diff --git a/components/file_system/Cargo.toml b/components/file_system/Cargo.toml index fbd96c3c348..ef7df46936d 100644 --- a/components/file_system/Cargo.toml +++ b/components/file_system/Cargo.toml @@ -15,7 +15,7 @@ fs2 = "0.4" lazy_static = "1.3" libc = "0.2" 
online_config = { workspace = true } -openssl = "0.10" +openssl = { workspace = true } parking_lot = "0.12" prometheus = { version = "0.13", features = ["nightly"] } prometheus-static-metric = "0.5" diff --git a/components/hybrid_engine/Cargo.toml b/components/hybrid_engine/Cargo.toml new file mode 100644 index 00000000000..bcdd0f3ce7d --- /dev/null +++ b/components/hybrid_engine/Cargo.toml @@ -0,0 +1,19 @@ +[package] +name = "hybrid_engine" +version = "0.0.1" +edition = "2021" +publish = false + +[features] +testexport = [] + +[dependencies] +engine_rocks = { workspace = true } +engine_traits = { workspace = true } +tikv_util = { workspace = true } +txn_types = { workspace = true } + +[dev-dependencies] +engine_rocks = { workspace = true } +region_cache_memory_engine = { workspace = true } +tempfile = "3.0" diff --git a/components/hybrid_engine/src/cf_names.rs b/components/hybrid_engine/src/cf_names.rs new file mode 100644 index 00000000000..990fb4d0f76 --- /dev/null +++ b/components/hybrid_engine/src/cf_names.rs @@ -0,0 +1,15 @@ +// Copyright 2023 TiKV Project Authors. Licensed under Apache-2.0. + +use engine_traits::{CfNamesExt, KvEngine, RegionCacheEngine}; + +use crate::engine::HybridEngine; + +impl CfNamesExt for HybridEngine +where + EK: KvEngine, + EC: RegionCacheEngine, +{ + fn cf_names(&self) -> Vec<&str> { + self.disk_engine().cf_names() + } +} diff --git a/components/hybrid_engine/src/cf_options.rs b/components/hybrid_engine/src/cf_options.rs new file mode 100644 index 00000000000..61fe08da536 --- /dev/null +++ b/components/hybrid_engine/src/cf_options.rs @@ -0,0 +1,21 @@ +// Copyright 2023 TiKV Project Authors. Licensed under Apache-2.0. 
+ +use engine_traits::{CfOptionsExt, KvEngine, RegionCacheEngine, Result}; + +use crate::engine::HybridEngine; + +impl CfOptionsExt for HybridEngine +where + EK: KvEngine, + EC: RegionCacheEngine, +{ + type CfOptions = EK::CfOptions; + + fn get_options_cf(&self, cf: &str) -> Result { + self.disk_engine().get_options_cf(cf) + } + + fn set_options_cf(&self, cf: &str, options: &[(&str, &str)]) -> Result<()> { + self.disk_engine().set_options_cf(cf, options) + } +} diff --git a/components/hybrid_engine/src/checkpoint.rs b/components/hybrid_engine/src/checkpoint.rs new file mode 100644 index 00000000000..7d9bdb022ea --- /dev/null +++ b/components/hybrid_engine/src/checkpoint.rs @@ -0,0 +1,22 @@ +// Copyright 2023 TiKV Project Authors. Licensed under Apache-2.0. + +use engine_traits::{Checkpointable, KvEngine, RegionCacheEngine, Result}; + +use crate::engine::HybridEngine; + +impl Checkpointable for HybridEngine +where + EK: KvEngine, + EC: RegionCacheEngine, +{ + type Checkpointer = EK::Checkpointer; + + fn new_checkpointer(&self) -> Result { + self.disk_engine().new_checkpointer() + } + + fn merge(&self, dbs: &[&Self]) -> Result<()> { + let disk_dbs: Vec<_> = dbs.iter().map(|&db| db.disk_engine()).collect(); + self.disk_engine().merge(&disk_dbs) + } +} diff --git a/components/hybrid_engine/src/compact.rs b/components/hybrid_engine/src/compact.rs new file mode 100644 index 00000000000..6afbba556b0 --- /dev/null +++ b/components/hybrid_engine/src/compact.rs @@ -0,0 +1,71 @@ +// Copyright 2023 TiKV Project Authors. Licensed under Apache-2.0. 
+ +use engine_traits::{CompactExt, KvEngine, RegionCacheEngine, Result}; + +use crate::engine::HybridEngine; + +impl CompactExt for HybridEngine +where + EK: KvEngine, + EC: RegionCacheEngine, +{ + type CompactedEvent = EK::CompactedEvent; + + fn auto_compactions_is_disabled(&self) -> Result { + self.disk_engine().auto_compactions_is_disabled() + } + + fn compact_range_cf( + &self, + cf: &str, + start_key: Option<&[u8]>, + end_key: Option<&[u8]>, + exclusive_manual: bool, + max_subcompactions: u32, + ) -> Result<()> { + self.disk_engine().compact_range_cf( + cf, + start_key, + end_key, + exclusive_manual, + max_subcompactions, + ) + } + + fn compact_files_in_range_cf( + &self, + cf: &str, + start: Option<&[u8]>, + end: Option<&[u8]>, + output_level: Option, + ) -> Result<()> { + self.disk_engine() + .compact_files_in_range_cf(cf, start, end, output_level) + } + + fn compact_files_in_range( + &self, + start: Option<&[u8]>, + end: Option<&[u8]>, + output_level: Option, + ) -> Result<()> { + self.disk_engine() + .compact_files_in_range(start, end, output_level) + } + + fn compact_files_cf( + &self, + cf: &str, + files: Vec, + output_level: Option, + max_subcompactions: u32, + exclude_l0: bool, + ) -> Result<()> { + self.disk_engine() + .compact_files_cf(cf, files, output_level, max_subcompactions, exclude_l0) + } + + fn check_in_range(&self, start: Option<&[u8]>, end: Option<&[u8]>) -> Result<()> { + self.disk_engine().check_in_range(start, end) + } +} diff --git a/components/hybrid_engine/src/db_options.rs b/components/hybrid_engine/src/db_options.rs new file mode 100644 index 00000000000..6b4be90a43f --- /dev/null +++ b/components/hybrid_engine/src/db_options.rs @@ -0,0 +1,21 @@ +// Copyright 2023 TiKV Project Authors. Licensed under Apache-2.0. 
+ +use engine_traits::{DbOptionsExt, KvEngine, RegionCacheEngine, Result}; + +use crate::engine::HybridEngine; + +impl DbOptionsExt for HybridEngine +where + EK: KvEngine, + EC: RegionCacheEngine, +{ + type DbOptions = EK::DbOptions; + + fn get_db_options(&self) -> Self::DbOptions { + self.disk_engine().get_db_options() + } + + fn set_db_options(&self, options: &[(&str, &str)]) -> Result<()> { + self.disk_engine().set_db_options(options) + } +} diff --git a/components/hybrid_engine/src/engine.rs b/components/hybrid_engine/src/engine.rs new file mode 100644 index 00000000000..b76b999f1c3 --- /dev/null +++ b/components/hybrid_engine/src/engine.rs @@ -0,0 +1,201 @@ +// Copyright 2023 TiKV Project Authors. Licensed under Apache-2.0. + +use engine_traits::{ + KvEngine, Peekable, ReadOptions, RegionCacheEngine, Result, SnapshotContext, SnapshotMiscExt, + SyncMutable, +}; + +use crate::snapshot::HybridEngineSnapshot; + +/// This engine is structured with both a disk engine and an region cache +/// engine. The disk engine houses the complete database data, whereas the +/// region cache engine functions as a region cache, selectively caching certain +/// regions (in a better performance storage device such as NVME or RAM) to +/// enhance read performance. For the regions that are cached, region cache +/// engine retains all data that has not been garbage collected. 
+#[derive(Clone, Debug)] +pub struct HybridEngine +where + EK: KvEngine, + EC: RegionCacheEngine, +{ + disk_engine: EK, + region_cache_engine: EC, +} + +impl HybridEngine +where + EK: KvEngine, + EC: RegionCacheEngine, +{ + pub fn disk_engine(&self) -> &EK { + &self.disk_engine + } + + pub fn mut_disk_engine(&mut self) -> &mut EK { + &mut self.disk_engine + } + + pub fn region_cache_engine(&self) -> &EC { + &self.region_cache_engine + } + + pub fn mut_region_cache_engine(&mut self) -> &mut EC { + &mut self.region_cache_engine + } +} + +impl HybridEngine +where + EK: KvEngine, + EC: RegionCacheEngine, +{ + pub fn new(disk_engine: EK, region_cache_engine: EC) -> Self { + Self { + disk_engine, + region_cache_engine, + } + } +} + +// todo: implement KvEngine methods as well as it's super traits. +impl KvEngine for HybridEngine +where + EK: KvEngine, + EC: RegionCacheEngine, +{ + type Snapshot = HybridEngineSnapshot; + + fn snapshot(&self, ctx: Option) -> Self::Snapshot { + let disk_snap = self.disk_engine.snapshot(ctx.clone()); + let region_cache_snap = if let Some(ctx) = ctx { + self.region_cache_engine.snapshot( + ctx.region_id, + ctx.read_ts, + disk_snap.sequence_number(), + ) + } else { + None + }; + HybridEngineSnapshot::new(disk_snap, region_cache_snap) + } + + fn sync(&self) -> engine_traits::Result<()> { + self.disk_engine.sync() + } + + fn bad_downcast(&self) -> &T { + self.disk_engine.bad_downcast() + } + + #[cfg(feature = "testexport")] + fn inner_refcount(&self) -> usize { + self.disk_engine.inner_refcount() + } +} + +impl Peekable for HybridEngine +where + EK: KvEngine, + EC: RegionCacheEngine, +{ + type DbVector = EK::DbVector; + + // region cache engine only supports peekable trait in the snapshot of it + fn get_value_opt(&self, opts: &ReadOptions, key: &[u8]) -> Result> { + self.disk_engine.get_value_opt(opts, key) + } + + // region cache engine only supports peekable trait in the snapshot of it + fn get_value_cf_opt( + &self, + opts: &ReadOptions, + 
cf: &str, + key: &[u8], + ) -> Result> { + self.disk_engine.get_value_cf_opt(opts, cf, key) + } +} + +impl SyncMutable for HybridEngine +where + EK: KvEngine, + EC: RegionCacheEngine, +{ + fn put(&self, key: &[u8], value: &[u8]) -> Result<()> { + unimplemented!() + } + + fn put_cf(&self, cf: &str, key: &[u8], value: &[u8]) -> Result<()> { + unimplemented!() + } + + fn delete(&self, key: &[u8]) -> Result<()> { + unimplemented!() + } + + fn delete_cf(&self, cf: &str, key: &[u8]) -> Result<()> { + unimplemented!() + } + + fn delete_range(&self, begin_key: &[u8], end_key: &[u8]) -> Result<()> { + unimplemented!() + } + + fn delete_range_cf(&self, cf: &str, begin_key: &[u8], end_key: &[u8]) -> Result<()> { + unimplemented!() + } +} + +#[cfg(test)] +mod tests { + use engine_rocks::util::new_engine; + use engine_traits::{KvEngine, SnapshotContext, CF_DEFAULT, CF_LOCK, CF_WRITE}; + use region_cache_memory_engine::RegionCacheMemoryEngine; + use tempfile::Builder; + + use crate::HybridEngine; + + #[test] + fn test_engine() { + let path = Builder::new().prefix("temp").tempdir().unwrap(); + let disk_engine = new_engine( + path.path().to_str().unwrap(), + &[CF_DEFAULT, CF_LOCK, CF_WRITE], + ) + .unwrap(); + let memory_engine = RegionCacheMemoryEngine::default(); + memory_engine.new_region(1); + { + let mut core = memory_engine.core().lock().unwrap(); + core.mut_region_meta(1).unwrap().set_can_read(true); + core.mut_region_meta(1).unwrap().set_safe_ts(10); + } + + let hybrid_engine = HybridEngine::new(disk_engine, memory_engine.clone()); + let s = hybrid_engine.snapshot(None); + assert!(!s.region_cache_snapshot_available()); + + let mut snap_ctx = SnapshotContext { + read_ts: 15, + region_id: 1, + }; + let s = hybrid_engine.snapshot(Some(snap_ctx.clone())); + assert!(s.region_cache_snapshot_available()); + + { + let mut core = memory_engine.core().lock().unwrap(); + core.mut_region_meta(1).unwrap().set_can_read(false); + } + let s = 
hybrid_engine.snapshot(Some(snap_ctx.clone())); + assert!(!s.region_cache_snapshot_available()); + + { + let mut core = memory_engine.core().lock().unwrap(); + core.mut_region_meta(1).unwrap().set_can_read(true); + } + snap_ctx.read_ts = 5; + let s = hybrid_engine.snapshot(Some(snap_ctx)); + assert!(!s.region_cache_snapshot_available()); + } +} diff --git a/components/hybrid_engine/src/engine_iterator.rs b/components/hybrid_engine/src/engine_iterator.rs new file mode 100644 index 00000000000..7349240f2a9 --- /dev/null +++ b/components/hybrid_engine/src/engine_iterator.rs @@ -0,0 +1,99 @@ +// Copyright 2023 TiKV Project Authors. Licensed under Apache-2.0. + +use engine_traits::{Iterator, KvEngine, RegionCacheEngine, Result}; +use tikv_util::Either; + +pub struct HybridEngineIterator +where + EK: KvEngine, + EC: RegionCacheEngine, +{ + iter: Either, +} + +impl HybridEngineIterator +where + EK: KvEngine, + EC: RegionCacheEngine, +{ + pub fn disk_engine_iterator(iter: EK::Iterator) -> Self { + Self { + iter: Either::Left(iter), + } + } + + pub fn region_cache_engine_iterator(iter: EC::Iterator) -> Self { + Self { + iter: Either::Right(iter), + } + } +} + +impl Iterator for HybridEngineIterator +where + EK: KvEngine, + EC: RegionCacheEngine, +{ + fn seek(&mut self, key: &[u8]) -> Result { + match self.iter { + Either::Left(ref mut iter) => iter.seek(key), + Either::Right(ref mut iter) => iter.seek(key), + } + } + + fn seek_for_prev(&mut self, key: &[u8]) -> Result { + match self.iter { + Either::Left(ref mut iter) => iter.seek_for_prev(key), + Either::Right(ref mut iter) => iter.seek_for_prev(key), + } + } + + fn seek_to_first(&mut self) -> Result { + match self.iter { + Either::Left(ref mut iter) => iter.seek_to_first(), + Either::Right(ref mut iter) => iter.seek_to_first(), + } + } + + fn seek_to_last(&mut self) -> Result { + match self.iter { + Either::Left(ref mut iter) => iter.seek_to_last(), + Either::Right(ref mut iter) => iter.seek_to_last(), + } + } + + fn 
prev(&mut self) -> Result { + match self.iter { + Either::Left(ref mut iter) => iter.prev(), + Either::Right(ref mut iter) => iter.prev(), + } + } + + fn next(&mut self) -> Result { + match self.iter { + Either::Left(ref mut iter) => iter.next(), + Either::Right(ref mut iter) => iter.next(), + } + } + + fn key(&self) -> &[u8] { + match self.iter { + Either::Left(ref iter) => iter.key(), + Either::Right(ref iter) => iter.key(), + } + } + + fn value(&self) -> &[u8] { + match self.iter { + Either::Left(ref iter) => iter.value(), + Either::Right(ref iter) => iter.value(), + } + } + + fn valid(&self) -> Result { + match self.iter { + Either::Left(ref iter) => iter.valid(), + Either::Right(ref iter) => iter.valid(), + } + } +} diff --git a/components/hybrid_engine/src/flow_control_factors.rs b/components/hybrid_engine/src/flow_control_factors.rs new file mode 100644 index 00000000000..9649671d418 --- /dev/null +++ b/components/hybrid_engine/src/flow_control_factors.rs @@ -0,0 +1,23 @@ +// Copyright 2023 TiKV Project Authors. Licensed under Apache-2.0. + +use engine_traits::{FlowControlFactorsExt, KvEngine, RegionCacheEngine, Result}; + +use crate::engine::HybridEngine; + +impl FlowControlFactorsExt for HybridEngine +where + EK: KvEngine, + EC: RegionCacheEngine, +{ + fn get_cf_num_files_at_level(&self, cf: &str, level: usize) -> Result> { + self.disk_engine().get_cf_num_files_at_level(cf, level) + } + + fn get_cf_num_immutable_mem_table(&self, cf: &str) -> Result> { + self.disk_engine().get_cf_num_immutable_mem_table(cf) + } + + fn get_cf_pending_compaction_bytes(&self, cf: &str) -> Result> { + self.disk_engine().get_cf_pending_compaction_bytes(cf) + } +} diff --git a/components/hybrid_engine/src/hybrid_metrics.rs b/components/hybrid_engine/src/hybrid_metrics.rs new file mode 100644 index 00000000000..2d49d9ad1d9 --- /dev/null +++ b/components/hybrid_engine/src/hybrid_metrics.rs @@ -0,0 +1,25 @@ +// Copyright 2023 TiKV Project Authors. Licensed under Apache-2.0. 
+ +use engine_traits::{KvEngine, RegionCacheEngine, StatisticsReporter}; + +use crate::engine::HybridEngine; + +pub struct HybridEngineStatisticsReporter {} + +impl StatisticsReporter> for HybridEngineStatisticsReporter +where + EK: KvEngine, + EC: RegionCacheEngine, +{ + fn new(name: &str) -> Self { + unimplemented!() + } + + fn collect(&mut self, engine: &HybridEngine) { + unimplemented!() + } + + fn flush(&mut self) { + unimplemented!() + } +} diff --git a/components/hybrid_engine/src/import.rs b/components/hybrid_engine/src/import.rs new file mode 100644 index 00000000000..de40c83d214 --- /dev/null +++ b/components/hybrid_engine/src/import.rs @@ -0,0 +1,17 @@ +// Copyright 2023 TiKV Project Authors. Licensed under Apache-2.0. + +use engine_traits::{ImportExt, KvEngine, RegionCacheEngine}; + +use crate::engine::HybridEngine; + +impl ImportExt for HybridEngine +where + EK: KvEngine, + EC: RegionCacheEngine, +{ + type IngestExternalFileOptions = EK::IngestExternalFileOptions; + + fn ingest_external_file_cf(&self, cf: &str, files: &[&str]) -> engine_traits::Result<()> { + unimplemented!() + } +} diff --git a/components/hybrid_engine/src/iterable.rs b/components/hybrid_engine/src/iterable.rs new file mode 100644 index 00000000000..27a38570f01 --- /dev/null +++ b/components/hybrid_engine/src/iterable.rs @@ -0,0 +1,21 @@ +// Copyright 2023 TiKV Project Authors. Licensed under Apache-2.0. 
+ +use engine_traits::{IterOptions, Iterable, KvEngine, RegionCacheEngine, Result}; + +use crate::{engine::HybridEngine, engine_iterator::HybridEngineIterator}; + +impl Iterable for HybridEngine +where + EK: KvEngine, + EC: RegionCacheEngine, +{ + type Iterator = HybridEngineIterator; + + fn iterator_opt(&self, cf: &str, opts: IterOptions) -> Result { + // Iterator of region cache engine should only be created from the + // snapshot of it + self.disk_engine() + .iterator_opt(cf, opts) + .map(|iter| HybridEngineIterator::disk_engine_iterator(iter)) + } +} diff --git a/components/hybrid_engine/src/lib.rs b/components/hybrid_engine/src/lib.rs new file mode 100644 index 00000000000..0778412a2c9 --- /dev/null +++ b/components/hybrid_engine/src/lib.rs @@ -0,0 +1,27 @@ +// Copyright 2023 TiKV Project Authors. Licensed under Apache-2.0. +#![allow(dead_code)] +#![allow(unused_variables)] + +mod cf_names; +mod cf_options; +mod checkpoint; +mod compact; +mod db_options; +mod engine; +mod engine_iterator; +mod flow_control_factors; +mod hybrid_metrics; +mod import; +mod iterable; +mod misc; +mod mvcc_properties; +mod perf_context; +mod range_properties; +mod snapshot; +mod sst; +mod table_properties; +mod ttl_properties; +mod write_batch; + +pub use engine::HybridEngine; +pub use snapshot::HybridEngineSnapshot; diff --git a/components/hybrid_engine/src/misc.rs b/components/hybrid_engine/src/misc.rs new file mode 100644 index 00000000000..d761322ae76 --- /dev/null +++ b/components/hybrid_engine/src/misc.rs @@ -0,0 +1,132 @@ +// Copyright 2023 TiKV Project Authors. Licensed under Apache-2.0. 
+ +use engine_traits::{KvEngine, MiscExt, RegionCacheEngine, Result}; + +use crate::{engine::HybridEngine, hybrid_metrics::HybridEngineStatisticsReporter}; + +impl MiscExt for HybridEngine +where + EK: KvEngine, + EC: RegionCacheEngine, +{ + type StatisticsReporter = HybridEngineStatisticsReporter; + + fn flush_cf(&self, cf: &str, wait: bool) -> Result<()> { + unimplemented!() + } + + fn flush_cfs(&self, cfs: &[&str], wait: bool) -> Result<()> { + unimplemented!() + } + + fn flush_oldest_cf( + &self, + wait: bool, + threshold: Option, + ) -> Result { + unimplemented!() + } + + fn delete_ranges_cf( + &self, + wopts: &engine_traits::WriteOptions, + cf: &str, + strategy: engine_traits::DeleteStrategy, + ranges: &[engine_traits::Range<'_>], + ) -> Result { + unimplemented!() + } + + fn get_approximate_memtable_stats_cf( + &self, + cf: &str, + range: &engine_traits::Range<'_>, + ) -> Result<(u64, u64)> { + unimplemented!() + } + + fn ingest_maybe_slowdown_writes(&self, cf: &str) -> Result { + unimplemented!() + } + + fn get_sst_key_ranges(&self, cf: &str, level: usize) -> Result, Vec)>> { + unimplemented!() + } + + fn get_engine_used_size(&self) -> Result { + unimplemented!() + } + + fn path(&self) -> &str { + unimplemented!() + } + + fn sync_wal(&self) -> Result<()> { + unimplemented!() + } + + fn pause_background_work(&self) -> Result<()> { + unimplemented!() + } + + fn continue_background_work(&self) -> Result<()> { + unimplemented!() + } + + fn exists(path: &str) -> bool { + unimplemented!() + } + + fn locked(path: &str) -> Result { + unimplemented!() + } + + fn dump_stats(&self) -> Result { + unimplemented!() + } + + fn get_latest_sequence_number(&self) -> u64 { + unimplemented!() + } + + fn get_oldest_snapshot_sequence_number(&self) -> Option { + unimplemented!() + } + + fn get_total_sst_files_size_cf(&self, cf: &str) -> Result> { + unimplemented!() + } + + fn get_num_keys(&self) -> Result { + unimplemented!() + } + + fn get_range_stats( + &self, + cf: &str, + 
start: &[u8], + end: &[u8], + ) -> Result> { + unimplemented!() + } + + fn is_stalled_or_stopped(&self) -> bool { + unimplemented!() + } + + fn get_active_memtable_stats_cf( + &self, + cf: &str, + ) -> Result> { + unimplemented!() + } + + fn get_accumulated_flush_count_cf(cf: &str) -> Result { + unimplemented!() + } + + type DiskEngine = EK::DiskEngine; + fn get_disk_engine(&self) -> &Self::DiskEngine { + self.disk_engine().get_disk_engine() + } +} diff --git a/components/hybrid_engine/src/mvcc_properties.rs b/components/hybrid_engine/src/mvcc_properties.rs new file mode 100644 index 00000000000..0d03258d2de --- /dev/null +++ b/components/hybrid_engine/src/mvcc_properties.rs @@ -0,0 +1,23 @@ +// Copyright 2023 TiKV Project Authors. Licensed under Apache-2.0. + +use engine_traits::{KvEngine, MvccProperties, MvccPropertiesExt, RegionCacheEngine}; +use txn_types::TimeStamp; + +use crate::engine::HybridEngine; + +impl MvccPropertiesExt for HybridEngine +where + EK: KvEngine, + EC: RegionCacheEngine, +{ + fn get_mvcc_properties_cf( + &self, + cf: &str, + safe_point: TimeStamp, + start_key: &[u8], + end_key: &[u8], + ) -> Option { + self.disk_engine() + .get_mvcc_properties_cf(cf, safe_point, start_key, end_key) + } +} diff --git a/components/hybrid_engine/src/perf_context.rs b/components/hybrid_engine/src/perf_context.rs new file mode 100644 index 00000000000..1db4e8c9d27 --- /dev/null +++ b/components/hybrid_engine/src/perf_context.rs @@ -0,0 +1,20 @@ +// Copyright 2023 TiKV Project Authors. Licensed under Apache-2.0. 
+ +use engine_traits::{KvEngine, PerfContextExt, PerfContextKind, RegionCacheEngine}; + +use crate::engine::HybridEngine; + +impl PerfContextExt for HybridEngine +where + EK: KvEngine, + EC: RegionCacheEngine, +{ + type PerfContext = EK::PerfContext; + + fn get_perf_context( + level: engine_traits::PerfLevel, + kind: PerfContextKind, + ) -> Self::PerfContext { + EK::get_perf_context(level, kind) + } +} diff --git a/components/hybrid_engine/src/range_properties.rs b/components/hybrid_engine/src/range_properties.rs new file mode 100644 index 00000000000..7f38379f36d --- /dev/null +++ b/components/hybrid_engine/src/range_properties.rs @@ -0,0 +1,60 @@ +// Copyright 2023 TiKV Project Authors. Licensed under Apache-2.0. + +use engine_traits::{KvEngine, Range, RangePropertiesExt, RegionCacheEngine, Result}; + +use crate::engine::HybridEngine; + +impl RangePropertiesExt for HybridEngine +where + EK: KvEngine, + EC: RegionCacheEngine, +{ + fn get_range_approximate_keys(&self, range: Range<'_>, large_threshold: u64) -> Result { + self.disk_engine() + .get_range_approximate_keys(range, large_threshold) + } + + fn get_range_approximate_keys_cf( + &self, + cfname: &str, + range: Range<'_>, + large_threshold: u64, + ) -> Result { + self.disk_engine() + .get_range_approximate_keys_cf(cfname, range, large_threshold) + } + + fn get_range_approximate_size(&self, range: Range<'_>, large_threshold: u64) -> Result { + self.disk_engine() + .get_range_approximate_size(range, large_threshold) + } + + fn get_range_approximate_size_cf( + &self, + cfname: &str, + range: Range<'_>, + large_threshold: u64, + ) -> Result { + self.disk_engine() + .get_range_approximate_size_cf(cfname, range, large_threshold) + } + + fn get_range_approximate_split_keys( + &self, + range: Range<'_>, + key_count: usize, + ) -> Result>> { + self.disk_engine() + .get_range_approximate_split_keys(range, key_count) + } + + fn get_range_approximate_split_keys_cf( + &self, + cfname: &str, + range: Range<'_>, + 
key_count: usize, + ) -> Result>> { + self.disk_engine() + .get_range_approximate_split_keys_cf(cfname, range, key_count) + } +} diff --git a/components/hybrid_engine/src/snapshot.rs b/components/hybrid_engine/src/snapshot.rs new file mode 100644 index 00000000000..3c7ab875a21 --- /dev/null +++ b/components/hybrid_engine/src/snapshot.rs @@ -0,0 +1,106 @@ +// Copyright 2023 TiKV Project Authors. Licensed under Apache-2.0. + +use std::fmt::{self, Debug, Formatter}; + +use engine_traits::{ + CfNamesExt, IterOptions, Iterable, KvEngine, Peekable, ReadOptions, RegionCacheEngine, Result, + Snapshot, SnapshotMiscExt, +}; + +use crate::engine_iterator::HybridEngineIterator; + +pub struct HybridEngineSnapshot +where + EK: KvEngine, + EC: RegionCacheEngine, +{ + disk_snap: EK::Snapshot, + region_cache_snap: Option, +} + +impl HybridEngineSnapshot +where + EK: KvEngine, + EC: RegionCacheEngine, +{ + pub fn new(disk_snap: EK::Snapshot, region_cache_snap: Option) -> Self { + HybridEngineSnapshot { + disk_snap, + region_cache_snap, + } + } + + pub fn region_cache_snapshot_available(&self) -> bool { + self.region_cache_snap.is_some() + } +} + +impl Snapshot for HybridEngineSnapshot +where + EK: KvEngine, + EC: RegionCacheEngine, +{ +} + +impl Debug for HybridEngineSnapshot +where + EK: KvEngine, + EC: RegionCacheEngine, +{ + fn fmt(&self, fmt: &mut Formatter<'_>) -> fmt::Result { + write!(fmt, "Hybrid Engine Snapshot Impl") + } +} + +impl Iterable for HybridEngineSnapshot +where + EK: KvEngine, + EC: RegionCacheEngine, +{ + type Iterator = HybridEngineIterator; + + fn iterator_opt(&self, cf: &str, opts: IterOptions) -> Result { + unimplemented!() + } +} + +impl Peekable for HybridEngineSnapshot +where + EK: KvEngine, + EC: RegionCacheEngine, +{ + type DbVector = EK::DbVector; + + fn get_value_opt(&self, opts: &ReadOptions, key: &[u8]) -> Result> { + unimplemented!() + } + + fn get_value_cf_opt( + &self, + opts: &ReadOptions, + cf: &str, + key: &[u8], + ) -> Result> { + 
unimplemented!() + } +} + +impl CfNamesExt for HybridEngineSnapshot +where + EK: KvEngine, + EC: RegionCacheEngine, +{ + fn cf_names(&self) -> Vec<&str> { + self.disk_snap.cf_names() + } +} + +impl SnapshotMiscExt for HybridEngineSnapshot +where + EK: KvEngine, + EC: RegionCacheEngine, +{ + fn sequence_number(&self) -> u64 { + self.disk_snap.sequence_number() + } +} diff --git a/components/hybrid_engine/src/sst.rs b/components/hybrid_engine/src/sst.rs new file mode 100644 index 00000000000..2bade295ec3 --- /dev/null +++ b/components/hybrid_engine/src/sst.rs @@ -0,0 +1,53 @@ +// Copyright 2023 TiKV Project Authors. Licensed under Apache-2.0. + +use engine_traits::{ + KvEngine, RegionCacheEngine, Result, SstCompressionType, SstExt, SstWriterBuilder, +}; + +use crate::engine::HybridEngine; + +pub struct HybridEngineSstWriteBuilder {} + +impl SstExt for HybridEngine +where + EK: KvEngine, + EC: RegionCacheEngine, +{ + type SstReader = EK::SstReader; + type SstWriter = EK::SstWriter; + type SstWriterBuilder = HybridEngineSstWriteBuilder; +} + +impl SstWriterBuilder> for HybridEngineSstWriteBuilder +where + EK: KvEngine, + EC: RegionCacheEngine, +{ + fn new() -> Self { + unimplemented!() + } + + fn set_db(self, _db: &HybridEngine) -> Self { + unimplemented!() + } + + fn set_cf(self, _cf: &str) -> Self { + unimplemented!() + } + + fn set_in_memory(self, _in_memory: bool) -> Self { + unimplemented!() + } + + fn set_compression_type(self, _compression: Option) -> Self { + unimplemented!() + } + + fn set_compression_level(self, level: i32) -> Self { + unimplemented!() + } + + fn build(self, _path: &str) -> Result< as SstExt>::SstWriter> { + unimplemented!() + } +} diff --git a/components/hybrid_engine/src/table_properties.rs b/components/hybrid_engine/src/table_properties.rs new file mode 100644 index 00000000000..6ad95e5931a --- /dev/null +++ b/components/hybrid_engine/src/table_properties.rs @@ -0,0 +1,21 @@ +// Copyright 2023 TiKV Project Authors. 
Licensed under Apache-2.0. + +use engine_traits::{KvEngine, Range, RegionCacheEngine, Result, TablePropertiesExt}; + +use crate::engine::HybridEngine; + +impl TablePropertiesExt for HybridEngine +where + EK: KvEngine, + EC: RegionCacheEngine, +{ + type TablePropertiesCollection = EK::TablePropertiesCollection; + + fn table_properties_collection( + &self, + cf: &str, + ranges: &[Range<'_>], + ) -> Result { + self.disk_engine().table_properties_collection(cf, ranges) + } +} diff --git a/components/hybrid_engine/src/ttl_properties.rs b/components/hybrid_engine/src/ttl_properties.rs new file mode 100644 index 00000000000..d5b7d8578b5 --- /dev/null +++ b/components/hybrid_engine/src/ttl_properties.rs @@ -0,0 +1,21 @@ +// Copyright 2023 TiKV Project Authors. Licensed under Apache-2.0. + +use engine_traits::{KvEngine, RegionCacheEngine, Result, TtlProperties, TtlPropertiesExt}; + +use crate::engine::HybridEngine; + +impl TtlPropertiesExt for HybridEngine +where + EK: KvEngine, + EC: RegionCacheEngine, +{ + fn get_range_ttl_properties_cf( + &self, + cf: &str, + start_key: &[u8], + end_key: &[u8], + ) -> Result> { + self.disk_engine() + .get_range_ttl_properties_cf(cf, start_key, end_key) + } +} diff --git a/components/hybrid_engine/src/write_batch.rs b/components/hybrid_engine/src/write_batch.rs new file mode 100644 index 00000000000..3aba34c9c85 --- /dev/null +++ b/components/hybrid_engine/src/write_batch.rs @@ -0,0 +1,101 @@ +// Copyright 2023 TiKV Project Authors. Licensed under Apache-2.0. 
+ +use engine_traits::{ + KvEngine, Mutable, RegionCacheEngine, Result, WriteBatch, WriteBatchExt, WriteOptions, +}; + +use crate::engine::HybridEngine; + +pub struct HybridEngineWriteBatch { + _disk_write_batch: EK::WriteBatch, + // todo: region_cache_engine write batch +} + +impl WriteBatchExt for HybridEngine +where + EK: KvEngine, + EC: RegionCacheEngine, +{ + type WriteBatch = HybridEngineWriteBatch; + const WRITE_BATCH_MAX_KEYS: usize = EK::WRITE_BATCH_MAX_KEYS; + + fn write_batch(&self) -> Self::WriteBatch { + unimplemented!() + } + + fn write_batch_with_cap(&self, _: usize) -> Self::WriteBatch { + unimplemented!() + } +} + +impl WriteBatch for HybridEngineWriteBatch { + fn write_opt(&mut self, _: &WriteOptions) -> Result { + unimplemented!() + } + + fn write_callback_opt(&mut self, _opts: &WriteOptions, _cb: impl FnMut()) -> Result { + unimplemented!() + } + + fn data_size(&self) -> usize { + unimplemented!() + } + + fn count(&self) -> usize { + unimplemented!() + } + + fn is_empty(&self) -> bool { + unimplemented!() + } + + fn should_write_to_engine(&self) -> bool { + unimplemented!() + } + + fn clear(&mut self) { + unimplemented!() + } + + fn set_save_point(&mut self) { + unimplemented!() + } + + fn pop_save_point(&mut self) -> Result<()> { + unimplemented!() + } + + fn rollback_to_save_point(&mut self) -> Result<()> { + unimplemented!() + } + + fn merge(&mut self, _other: Self) -> Result<()> { + unimplemented!() + } +} + +impl Mutable for HybridEngineWriteBatch { + fn put(&mut self, _key: &[u8], _value: &[u8]) -> Result<()> { + unimplemented!() + } + + fn put_cf(&mut self, _cf: &str, _key: &[u8], _value: &[u8]) -> Result<()> { + unimplemented!() + } + + fn delete(&mut self, _key: &[u8]) -> Result<()> { + unimplemented!() + } + + fn delete_cf(&mut self, _cf: &str, _key: &[u8]) -> Result<()> { + unimplemented!() + } + + fn delete_range(&mut self, _begin_key: &[u8], _end_key: &[u8]) -> Result<()> { + unimplemented!() + } + + fn delete_range_cf(&mut self, 
_cf: &str, _begin_key: &[u8], _end_key: &[u8]) -> Result<()> { + unimplemented!() + } +} diff --git a/components/online_config/Cargo.toml b/components/online_config/Cargo.toml index 9d67f1cf1de..47e8996391c 100644 --- a/components/online_config/Cargo.toml +++ b/components/online_config/Cargo.toml @@ -5,6 +5,7 @@ edition = "2021" publish = false [dependencies] +chrono = "0.4" online_config_derive = { path = "./online_config_derive" } serde = { version = "1.0", features = ["derive"] } diff --git a/components/online_config/src/lib.rs b/components/online_config/src/lib.rs index 45694305a5f..5fec0cea9bc 100644 --- a/components/online_config/src/lib.rs +++ b/components/online_config/src/lib.rs @@ -5,9 +5,12 @@ use std::{ fmt::{self, Debug, Display, Formatter}, }; +use chrono::{FixedOffset, NaiveTime}; pub use online_config_derive::*; pub type ConfigChange = HashMap; +pub type OffsetTime = (NaiveTime, FixedOffset); +pub type Schedule = Vec; #[derive(Clone, PartialEq)] pub enum ConfigValue { @@ -21,6 +24,8 @@ pub enum ConfigValue { Bool(bool), String(String), Module(ConfigChange), + OffsetTime(OffsetTime), + Schedule(Schedule), Skip, None, } @@ -38,6 +43,8 @@ impl Display for ConfigValue { ConfigValue::Bool(v) => write!(f, "{}", v), ConfigValue::String(v) => write!(f, "{}", v), ConfigValue::Module(v) => write!(f, "{:?}", v), + ConfigValue::OffsetTime((t, o)) => write!(f, "{} {}", t, o), + ConfigValue::Schedule(v) => write!(f, "{:?}", v), ConfigValue::Skip => write!(f, "ConfigValue::Skip"), ConfigValue::None => write!(f, ""), } diff --git a/components/pd_client/src/client.rs b/components/pd_client/src/client.rs index 06ea6e9055d..80958e151d0 100644 --- a/components/pd_client/src/client.rs +++ b/components/pd_client/src/client.rs @@ -1098,9 +1098,7 @@ impl PdClient for RpcClient { }) as PdFuture<_> }; - self.pd_client - .request(req, executor, LEADER_CHANGE_RETRY) - .execute() + self.pd_client.request(req, executor, NO_RETRY).execute() } fn report_region_buckets(&self, 
bucket_stat: &BucketStat, period: Duration) -> PdFuture<()> { diff --git a/components/pd_client/src/client_v2.rs b/components/pd_client/src/client_v2.rs index 5b0d563f2b8..97b2702fc39 100644 --- a/components/pd_client/src/client_v2.rs +++ b/components/pd_client/src/client_v2.rs @@ -117,7 +117,7 @@ impl RawClient { /// Returns Ok(true) when a new connection is established. async fn maybe_reconnect(&mut self, ctx: &ConnectContext, force: bool) -> Result { - PD_RECONNECT_COUNTER_VEC.with_label_values(&["try"]).inc(); + PD_RECONNECT_COUNTER_VEC.try_connect.inc(); let start = Instant::now(); let members = self.members.clone(); @@ -135,21 +135,15 @@ impl RawClient { .await { Err(e) => { - PD_RECONNECT_COUNTER_VEC - .with_label_values(&["failure"]) - .inc(); + PD_RECONNECT_COUNTER_VEC.failure.inc(); return Err(e); } Ok(None) => { - PD_RECONNECT_COUNTER_VEC - .with_label_values(&["no-need"]) - .inc(); + PD_RECONNECT_COUNTER_VEC.no_need.inc(); return Ok(false); } Ok(Some(tuple)) => { - PD_RECONNECT_COUNTER_VEC - .with_label_values(&["success"]) - .inc(); + PD_RECONNECT_COUNTER_VEC.success.inc(); tuple } }; diff --git a/components/pd_client/src/lib.rs b/components/pd_client/src/lib.rs index 7a9d2cd2a61..21ae61ccd61 100644 --- a/components/pd_client/src/lib.rs +++ b/components/pd_client/src/lib.rs @@ -211,6 +211,15 @@ impl BucketStat { } } + pub fn clean_stats(&mut self, idx: usize) { + self.stats.write_keys[idx] = 0; + self.stats.write_bytes[idx] = 0; + self.stats.read_qps[idx] = 0; + self.stats.write_qps[idx] = 0; + self.stats.read_keys[idx] = 0; + self.stats.read_bytes[idx] = 0; + } + pub fn split(&mut self, idx: usize) { assert!(idx != 0); // inherit the traffic stats for splited bucket diff --git a/components/pd_client/src/metrics.rs b/components/pd_client/src/metrics.rs index d92e334396a..7e7121170d6 100644 --- a/components/pd_client/src/metrics.rs +++ b/components/pd_client/src/metrics.rs @@ -2,7 +2,7 @@ use lazy_static::lazy_static; use prometheus::*; -use 
prometheus_static_metric::{make_static_metric, register_static_histogram_vec}; +use prometheus_static_metric::*; make_static_metric! { pub label_enum PDRequestEventType { @@ -40,9 +40,34 @@ make_static_metric! { meta_storage_watch, } + pub label_enum PDReconnectEventKind { + success, + failure, + no_need, + cancel, + try_connect, + } + + pub label_enum StoreSizeEventType { + capacity, + available, + used, + snap_size, + raft_size, + kv_size, + import_size, + } + + pub struct StoreSizeEventIntrVec: IntGauge { + "type" => StoreSizeEventType, + } + pub struct PDRequestEventHistogramVec: Histogram { "type" => PDRequestEventType, } + pub struct PDReconnectEventCounterVec: IntCounter { + "type" => PDReconnectEventKind, + } } lazy_static! { @@ -66,12 +91,14 @@ lazy_static! { &["type"] ) .unwrap(); - pub static ref PD_RECONNECT_COUNTER_VEC: IntCounterVec = register_int_counter_vec!( - "tikv_pd_reconnect_total", - "Total number of PD reconnections.", - &["type"] - ) - .unwrap(); + pub static ref PD_RECONNECT_COUNTER_VEC: PDReconnectEventCounterVec = + register_static_int_counter_vec!( + PDReconnectEventCounterVec, + "tikv_pd_reconnect_total", + "Total number of PD reconnections.", + &["type"] + ) + .unwrap(); pub static ref PD_PENDING_HEARTBEAT_GAUGE: IntGauge = register_int_gauge!( "tikv_pd_pending_heartbeat_total", "Total number of pending region heartbeat" @@ -88,8 +115,14 @@ lazy_static! 
{ &["type"] ) .unwrap(); - pub static ref STORE_SIZE_GAUGE_VEC: IntGaugeVec = - register_int_gauge_vec!("tikv_store_size_bytes", "Size of storage.", &["type"]).unwrap(); + pub static ref STORE_SIZE_EVENT_INT_VEC: StoreSizeEventIntrVec = + register_static_int_gauge_vec!( + StoreSizeEventIntrVec, + "tikv_store_size_bytes", + "Size of storage.", + &["type"] + ) + .unwrap(); pub static ref REGION_READ_KEYS_HISTOGRAM: Histogram = register_histogram!( "tikv_region_read_keys", "Histogram of keys written for regions", diff --git a/components/pd_client/src/util.rs b/components/pd_client/src/util.rs index 5491a51c047..329448a6ac6 100644 --- a/components/pd_client/src/util.rs +++ b/components/pd_client/src/util.rs @@ -50,6 +50,7 @@ const MAX_RETRY_TIMES: u64 = 5; // The max duration when retrying to connect to leader. No matter if the // MAX_RETRY_TIMES is reached. const MAX_RETRY_DURATION: Duration = Duration::from_secs(10); +const MAX_BACKOFF: Duration = Duration::from_secs(3); // FIXME: Use a request-independent way to handle reconnection. pub const REQUEST_RECONNECT_INTERVAL: Duration = Duration::from_secs(1); // 1s @@ -116,6 +117,7 @@ pub struct Inner { pub rg_resp: Option>, last_try_reconnect: Instant, + bo: ExponentialBackoff, } impl Inner { @@ -168,7 +170,6 @@ pub struct Client { pub(crate) inner: RwLock, pub feature_gate: FeatureGate, enable_forwarding: bool, - retry_interval: Duration, } impl Client { @@ -219,6 +220,7 @@ impl Client { pending_heartbeat: Arc::default(), pending_buckets: Arc::default(), last_try_reconnect: Instant::now(), + bo: ExponentialBackoff::new(retry_interval), tso, meta_storage, rg_sender: Either::Left(Some(rg_sender)), @@ -226,7 +228,6 @@ impl Client { }), feature_gate: FeatureGate::default(), enable_forwarding, - retry_interval, } } @@ -363,17 +364,15 @@ impl Client { /// Note: Retrying too quickly will return an error due to cancellation. /// Please always try to reconnect after sending the request first. 
pub async fn reconnect(&self, force: bool) -> Result<()> { - PD_RECONNECT_COUNTER_VEC.with_label_values(&["try"]).inc(); + PD_RECONNECT_COUNTER_VEC.try_connect.inc(); let start = Instant::now(); let future = { let inner = self.inner.rl(); - if start.saturating_duration_since(inner.last_try_reconnect) < self.retry_interval { + if start.saturating_duration_since(inner.last_try_reconnect) < inner.bo.get_interval() { // Avoid unnecessary updating. // Prevent a large number of reconnections in a short time. - PD_RECONNECT_COUNTER_VEC - .with_label_values(&["cancel"]) - .inc(); + PD_RECONNECT_COUNTER_VEC.cancel.inc(); return Err(box_err!("cancel reconnection due to too small interval")); } let connector = PdConnector::new(inner.env.clone(), inner.security_mgr.clone()); @@ -394,36 +393,38 @@ impl Client { { let mut inner = self.inner.wl(); - if start.saturating_duration_since(inner.last_try_reconnect) < self.retry_interval { + if start.saturating_duration_since(inner.last_try_reconnect) < inner.bo.get_interval() { // There may be multiple reconnections that pass the read lock at the same time. // Check again in the write lock to avoid unnecessary updating. - PD_RECONNECT_COUNTER_VEC - .with_label_values(&["cancel"]) - .inc(); + PD_RECONNECT_COUNTER_VEC.cancel.inc(); return Err(box_err!("cancel reconnection due to too small interval")); } inner.last_try_reconnect = start; + inner.bo.next_backoff(); } slow_log!(start.saturating_elapsed(), "try reconnect pd"); let (client, target_info, members, tso) = match future.await { Err(e) => { - PD_RECONNECT_COUNTER_VEC - .with_label_values(&["failure"]) - .inc(); + PD_RECONNECT_COUNTER_VEC.failure.inc(); return Err(e); } - Ok(None) => { - PD_RECONNECT_COUNTER_VEC - .with_label_values(&["no-need"]) - .inc(); - return Ok(()); - } - Ok(Some(tuple)) => { - PD_RECONNECT_COUNTER_VEC - .with_label_values(&["success"]) - .inc(); - tuple + Ok(res) => { + // Reset the retry count. 
+ { + let mut inner = self.inner.wl(); + inner.bo.reset() + } + match res { + None => { + PD_RECONNECT_COUNTER_VEC.no_need.inc(); + return Ok(()); + } + Some(tuple) => { + PD_RECONNECT_COUNTER_VEC.success.inc(); + tuple + } + } } }; @@ -435,7 +436,7 @@ impl Client { } } -/// The context of sending requets. +/// The context of sending request. pub struct Request { remain_request_count: usize, request_sent: usize, @@ -900,6 +901,33 @@ impl PdConnector { } } +/// Simple backoff strategy. +struct ExponentialBackoff { + base: Duration, + interval: Duration, +} + +impl ExponentialBackoff { + pub fn new(base: Duration) -> Self { + Self { + base, + interval: base, + } + } + pub fn next_backoff(&mut self) -> Duration { + self.interval = std::cmp::min(self.interval * 2, MAX_BACKOFF); + self.interval + } + + pub fn get_interval(&self) -> Duration { + self.interval + } + + pub fn reset(&mut self) { + self.interval = self.base; + } +} + pub fn trim_http_prefix(s: &str) -> &str { s.trim_start_matches("http://") .trim_start_matches("https://") @@ -1045,8 +1073,11 @@ pub fn merge_bucket_stats, I: AsRef<[u8]>>( mod test { use kvproto::metapb::BucketStats; + use super::*; use crate::{merge_bucket_stats, util::find_bucket_index}; + const BASE_BACKOFF: Duration = Duration::from_millis(100); + #[test] fn test_merge_bucket_stats() { #[allow(clippy::type_complexity)] @@ -1162,4 +1193,23 @@ mod test { assert_eq!(find_bucket_index(b"k7", &keys), Some(4)); assert_eq!(find_bucket_index(b"k8", &keys), Some(4)); } + + #[test] + fn test_exponential_backoff() { + let mut backoff = ExponentialBackoff::new(BASE_BACKOFF); + assert_eq!(backoff.get_interval(), BASE_BACKOFF); + + assert_eq!(backoff.next_backoff(), 2 * BASE_BACKOFF); + assert_eq!(backoff.next_backoff(), Duration::from_millis(400)); + assert_eq!(backoff.get_interval(), Duration::from_millis(400)); + + // Should not exceed MAX_BACKOFF + for _ in 0..20 { + backoff.next_backoff(); + } + assert_eq!(backoff.get_interval(), MAX_BACKOFF); + + 
backoff.reset(); + assert_eq!(backoff.get_interval(), BASE_BACKOFF); + } } diff --git a/components/raft_log_engine/Cargo.toml b/components/raft_log_engine/Cargo.toml index 4304e181707..303fc5f24f2 100644 --- a/components/raft_log_engine/Cargo.toml +++ b/components/raft_log_engine/Cargo.toml @@ -4,6 +4,9 @@ version = "0.0.1" publish = false edition = "2021" +[features] +failpoints = ["raft-engine/failpoints"] + [dependencies] codec = { workspace = true } encryption = { workspace = true } diff --git a/components/raft_log_engine/src/engine.rs b/components/raft_log_engine/src/engine.rs index 1f19a161b09..c71b9fd65d9 100644 --- a/components/raft_log_engine/src/engine.rs +++ b/components/raft_log_engine/src/engine.rs @@ -10,12 +10,13 @@ use std::{ use codec::number::NumberCodec; use encryption::{DataKeyManager, DecrypterReader, EncrypterWriter}; use engine_traits::{ - CacheStats, EncryptionKeyManager, EncryptionMethod, PerfContextExt, PerfContextKind, PerfLevel, - RaftEngine, RaftEngineDebug, RaftEngineReadOnly, RaftLogBatch as RaftLogBatchTrait, Result, - CF_DEFAULT, CF_LOCK, CF_RAFT, CF_WRITE, + CacheStats, PerfContextExt, PerfContextKind, PerfLevel, RaftEngine, RaftEngineDebug, + RaftEngineReadOnly, RaftLogBatch as RaftLogBatchTrait, Result, CF_DEFAULT, CF_LOCK, CF_RAFT, + CF_WRITE, }; use file_system::{IoOp, IoRateLimiter, IoType, WithIoType}; use kvproto::{ + encryptionpb::EncryptionMethod, metapb::Region, raft_serverpb::{ RaftApplyState, RaftLocalState, RegionLocalState, StoreIdent, StoreRecoverState, diff --git a/components/raftstore-v2/Cargo.toml b/components/raftstore-v2/Cargo.toml index d281c0eca69..2bd7737ade4 100644 --- a/components/raftstore-v2/Cargo.toml +++ b/components/raftstore-v2/Cargo.toml @@ -24,10 +24,6 @@ test-engines-panic = [ "engine_test/test-engines-panic", ] -cloud-aws = ["raftstore/cloud-aws"] -cloud-gcp = ["raftstore/cloud-gcp"] -cloud-azure = ["raftstore/cloud-azure"] - [dependencies] batch-system = { workspace = true } bytes = "1.0" diff 
--git a/components/raftstore-v2/src/batch/store.rs b/components/raftstore-v2/src/batch/store.rs index 4c142a43abf..a637eca704b 100644 --- a/components/raftstore-v2/src/batch/store.rs +++ b/components/raftstore-v2/src/batch/store.rs @@ -1,7 +1,6 @@ // Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. use std::{ - cmp, ops::{Deref, DerefMut}, path::Path, sync::{ @@ -48,7 +47,7 @@ use tikv_util::{ box_err, config::{Tracker, VersionTrack}, log::SlogFormat, - sys::SysQuota, + sys::{disk::get_disk_status, SysQuota}, time::{duration_to_sec, monotonic_raw_now, Instant as TiInstant, Limiter}, timer::{SteadyTimer, GLOBAL_TIMER_HANDLE}, worker::{Builder, LazyWorker, Scheduler, Worker}, @@ -105,11 +104,15 @@ pub struct StoreContext { /// Disk usage for the store itself. pub self_disk_usage: DiskUsage, + // TODO: how to remove offlined stores? + /// Disk usage for other stores. The store itself is not included. + /// Only contains items which is not `DiskUsage::Normal`. + pub store_disk_usages: HashMap, pub snap_mgr: TabletSnapManager, pub global_stat: GlobalStoreStat, pub store_stat: LocalStoreStat, - pub sst_importer: Arc, + pub sst_importer: Arc>, pub key_manager: Option>, /// Inspector for latency inspecting @@ -140,7 +143,7 @@ impl StoreContext { self.tick_batch[PeerTick::CheckLongUncommitted as usize].wait_duration = self.cfg.check_long_uncommitted_interval.0; self.tick_batch[PeerTick::GcPeer as usize].wait_duration = - 60 * cmp::min(Duration::from_secs(1), self.cfg.raft_base_tick_interval.0); + self.cfg.gc_peer_check_interval.0; } // Return None means it has passed unsafe vote period. 
@@ -229,6 +232,7 @@ impl PollHandler { shutdown: Arc, snap_mgr: TabletSnapManager, global_stat: GlobalStoreStat, - sst_importer: Arc, + sst_importer: Arc>, key_manager: Option>, node_start_time: Timespec, // monotonic_raw_now } @@ -382,7 +386,7 @@ impl StorePollerBuilder { shutdown: Arc, snap_mgr: TabletSnapManager, coprocessor_host: CoprocessorHost, - sst_importer: Arc, + sst_importer: Arc>, key_manager: Option>, node_start_time: Timespec, // monotonic_raw_now ) -> Self { @@ -562,6 +566,7 @@ where apply_pool: self.apply_pool.clone(), high_priority_pool: self.high_priority_pool.clone(), self_disk_usage: DiskUsage::Normal, + store_disk_usages: Default::default(), snap_mgr: self.snap_mgr.clone(), coprocessor_host: self.coprocessor_host.clone(), global_stat: self.global_stat.clone(), @@ -689,7 +694,7 @@ impl StoreSystem { collector_reg_handle: CollectorRegHandle, background: Worker, pd_worker: LazyWorker, - sst_importer: Arc, + sst_importer: Arc>, key_manager: Option>, grpc_service_mgr: GrpcServiceManager, resource_ctl: Option>, @@ -806,7 +811,6 @@ impl StoreSystem { causal_ts_provider, workers.pd.scheduler(), auto_split_controller, - store_meta.lock().unwrap().region_read_progress.clone(), collector_reg_handle, grpc_service_mgr, self.logger.clone(), @@ -985,16 +989,16 @@ impl StoreRouter { msg: Box, ) -> std::result::Result<(), TrySendError>> { let id = msg.get_region_id(); - let peer_msg = PeerMsg::RaftMessage(msg); + let peer_msg = PeerMsg::RaftMessage(msg, Some(TiInstant::now())); let store_msg = match self.router.try_send(id, peer_msg) { Either::Left(Ok(())) => return Ok(()), - Either::Left(Err(TrySendError::Full(PeerMsg::RaftMessage(m)))) => { + Either::Left(Err(TrySendError::Full(PeerMsg::RaftMessage(m, _)))) => { return Err(TrySendError::Full(m)); } - Either::Left(Err(TrySendError::Disconnected(PeerMsg::RaftMessage(m)))) => { + Either::Left(Err(TrySendError::Disconnected(PeerMsg::RaftMessage(m, _)))) => { return Err(TrySendError::Disconnected(m)); } - 
Either::Right(PeerMsg::RaftMessage(m)) => StoreMsg::RaftMessage(m), + Either::Right(PeerMsg::RaftMessage(m, _)) => StoreMsg::RaftMessage(m), _ => unreachable!(), }; match self.router.send_control(store_msg) { diff --git a/components/raftstore-v2/src/fsm/apply.rs b/components/raftstore-v2/src/fsm/apply.rs index e55c143a33a..49530fcd6df 100644 --- a/components/raftstore-v2/src/fsm/apply.rs +++ b/components/raftstore-v2/src/fsm/apply.rs @@ -87,7 +87,7 @@ impl ApplyFsm { log_recovery: Option>, applied_term: u64, buckets: Option, - sst_importer: Arc, + sst_importer: Arc>, coprocessor_host: CoprocessorHost, logger: Logger, ) -> (ApplyScheduler, Self) { diff --git a/components/raftstore-v2/src/fsm/peer.rs b/components/raftstore-v2/src/fsm/peer.rs index f6b9217ecbf..47a1aee1ef4 100644 --- a/components/raftstore-v2/src/fsm/peer.rs +++ b/components/raftstore-v2/src/fsm/peer.rs @@ -9,7 +9,7 @@ use crossbeam::channel::TryRecvError; use encryption_export::DataKeyManager; use engine_traits::{KvEngine, RaftEngine, TabletRegistry}; use kvproto::{errorpb, raft_cmdpb::RaftCmdResponse}; -use raftstore::store::{Config, TabletSnapManager, Transport}; +use raftstore::store::{Config, ReadCallback, TabletSnapManager, Transport}; use slog::{debug, info, trace, Logger}; use tikv_util::{ is_zero_duration, @@ -17,6 +17,7 @@ use tikv_util::{ slog_panic, time::{duration_to_sec, Instant}, }; +use tracker::{TrackerToken, GLOBAL_TRACKERS}; use crate::{ batch::StoreContext, @@ -195,6 +196,7 @@ impl<'a, EK: KvEngine, ER: RaftEngine, T: Transport> PeerFsmDelegate<'a, EK, ER, self.schedule_tick(PeerTick::SplitRegionCheck); self.schedule_tick(PeerTick::PdHeartbeat); self.schedule_tick(PeerTick::CompactLog); + self.fsm.peer.on_check_merge(self.store_ctx); if self.fsm.peer.storage().is_initialized() { self.fsm.peer.schedule_apply_fsm(self.store_ctx); } @@ -206,11 +208,17 @@ impl<'a, EK: KvEngine, ER: RaftEngine, T: Transport> PeerFsmDelegate<'a, EK, ER, } #[inline] - fn on_receive_command(&self, 
send_time: Instant) { + fn on_receive_command(&self, send_time: Instant, read_token: Option) { + let propose_wait_time = send_time.saturating_elapsed(); self.store_ctx .raft_metrics .propose_wait_time - .observe(duration_to_sec(send_time.saturating_elapsed())); + .observe(duration_to_sec(propose_wait_time)); + if let Some(token) = read_token { + GLOBAL_TRACKERS.with_tracker(token, |tracker| { + tracker.metrics.read_index_propose_wait_nanos = propose_wait_time.as_nanos() as u64; + }); + } } fn on_tick(&mut self, tick: PeerTick) { @@ -239,30 +247,33 @@ impl<'a, EK: KvEngine, ER: RaftEngine, T: Transport> PeerFsmDelegate<'a, EK, ER, pub fn on_msgs(&mut self, peer_msgs_buf: &mut Vec) { for msg in peer_msgs_buf.drain(..) { match msg { - PeerMsg::RaftMessage(msg) => { - self.fsm.peer.on_raft_message(self.store_ctx, msg); + PeerMsg::RaftMessage(msg, send_time) => { + self.fsm + .peer + .on_raft_message(self.store_ctx, msg, send_time); } PeerMsg::RaftQuery(cmd) => { - self.on_receive_command(cmd.send_time); + self.on_receive_command(cmd.send_time, cmd.ch.read_tracker()); self.on_query(cmd.request, cmd.ch) } PeerMsg::AdminCommand(cmd) => { - self.on_receive_command(cmd.send_time); + self.on_receive_command(cmd.send_time, None); self.fsm .peer_mut() .on_admin_command(self.store_ctx, cmd.request, cmd.ch) } PeerMsg::SimpleWrite(write) => { - self.on_receive_command(write.send_time); + self.on_receive_command(write.send_time, None); self.fsm.peer_mut().on_simple_write( self.store_ctx, write.header, write.data, write.ch, + Some(write.extra_opts), ); } PeerMsg::UnsafeWrite(write) => { - self.on_receive_command(write.send_time); + self.on_receive_command(write.send_time, None); self.fsm .peer_mut() .on_unsafe_write(self.store_ctx, write.data); @@ -307,6 +318,9 @@ impl<'a, EK: KvEngine, ER: RaftEngine, T: Transport> PeerFsmDelegate<'a, EK, ER, PeerMsg::StoreUnreachable { to_store_id } => { self.fsm.peer_mut().on_store_unreachable(to_store_id) } + PeerMsg::StoreMaybeTombstone { 
store_id } => { + self.fsm.peer_mut().on_store_maybe_tombstone(store_id) + } PeerMsg::SnapshotSent { to_peer_id, status } => { self.fsm.peer_mut().on_snapshot_sent(to_peer_id, status) } @@ -370,9 +384,10 @@ impl<'a, EK: KvEngine, ER: RaftEngine, T: Transport> PeerFsmDelegate<'a, EK, ER, syncer, failed_stores, ), - PeerMsg::ExitForceLeaderState => { - self.fsm.peer_mut().on_exit_force_leader(self.store_ctx) - } + PeerMsg::ExitForceLeaderState => self + .fsm + .peer_mut() + .on_exit_force_leader(self.store_ctx, false), PeerMsg::ExitForceLeaderStateCampaign => { self.fsm.peer_mut().on_exit_force_leader_campaign() } diff --git a/components/raftstore-v2/src/operation/bucket.rs b/components/raftstore-v2/src/operation/bucket.rs index 432ea72456a..920a4e68e8c 100644 --- a/components/raftstore-v2/src/operation/bucket.rs +++ b/components/raftstore-v2/src/operation/bucket.rs @@ -6,15 +6,15 @@ use std::sync::Arc; use engine_traits::{KvEngine, RaftEngine}; use kvproto::{ - metapb::{self, RegionEpoch}, + metapb::RegionEpoch, raft_serverpb::{ExtraMessageType, RaftMessage, RefreshBuckets}, }; -use pd_client::{BucketMeta, BucketStat}; +use pd_client::BucketMeta; use raftstore::{ coprocessor::RegionChangeEvent, store::{util, Bucket, BucketRange, ReadProgress, SplitCheckTask, Transport}, }; -use slog::{error, info, warn}; +use slog::{error, info}; use crate::{ batch::StoreContext, @@ -24,254 +24,40 @@ use crate::{ worker::pd, }; -#[derive(Debug, Clone, Default)] -pub struct BucketStatsInfo { - bucket_stat: Option, - // the last buckets records the stats that the recently refreshed. - last_bucket_stat: Option, - // the report bucket stat records the increment stats after last report pd. - // it will be reset after report pd. - report_bucket_stat: Option, - // last bucket count. 
- // BucketStat.meta is Arc so it cannot be used for last bucket count - last_bucket_count: usize, -} - -impl BucketStatsInfo { - /// returns all bucket ranges those's write_bytes exceed the given - /// diff_size_threshold. - pub fn gen_bucket_range_for_update( - &self, - diff_size_threshold: u64, - ) -> Option> { - let region_buckets = self.bucket_stat.as_ref()?; - let stats = ®ion_buckets.stats; - let keys = ®ion_buckets.meta.keys; - - let empty_last_keys = vec![]; - let empty_last_stats = metapb::BucketStats::default(); - let (last_keys, last_stats, stats_reset) = self - .last_bucket_stat - .as_ref() - .map(|b| { - ( - &b.meta.keys, - &b.stats, - region_buckets.create_time != b.create_time, - ) - }) - .unwrap_or((&empty_last_keys, &empty_last_stats, false)); - - let mut bucket_ranges = vec![]; - let mut j = 0; - assert_eq!(keys.len(), stats.write_bytes.len() + 1); - for i in 0..stats.write_bytes.len() { - let mut diff_in_bytes = stats.write_bytes[i]; - while j < last_keys.len() && keys[i] > last_keys[j] { - j += 1; - } - if j < last_keys.len() && keys[i] == last_keys[j] { - if !stats_reset { - diff_in_bytes -= last_stats.write_bytes[j]; - } - j += 1; - } - if diff_in_bytes >= diff_size_threshold { - bucket_ranges.push(BucketRange(keys[i].clone(), keys[i + 1].clone())); - } - } - Some(bucket_ranges) - } - - #[inline] - pub fn version(&self) -> u64 { - self.bucket_stat - .as_ref() - .or(self.last_bucket_stat.as_ref()) - .map(|b| b.meta.version) - .unwrap_or_default() - } - #[inline] - pub fn add_bucket_flow(&mut self, delta: &Option) { - if let (Some(buckets), Some(report_buckets), Some(delta)) = ( - self.bucket_stat.as_mut(), - self.report_bucket_stat.as_mut(), - delta, - ) { - buckets.merge(delta); - report_buckets.merge(delta); - } - } - - #[inline] - pub fn set_bucket_stat(&mut self, buckets: Option) { - if let Some(b) = self.bucket_stat.take() { - self.last_bucket_stat = Some(b); - } - self.report_bucket_stat = buckets.clone(); - self.bucket_stat = buckets; - 
self.last_bucket_count = self - .bucket_stat - .as_ref() - .map_or(0, |bucket_stat| bucket_stat.meta.keys.len() - 1); - } - - #[inline] - pub fn clear_bucket_stat(&mut self) { - if let Some(bucket) = self.report_bucket_stat.as_mut() { - bucket.clear_stats(); - } - } - - #[inline] - pub fn report_bucket_stat(&mut self) -> BucketStat { - let current = self.report_bucket_stat.as_mut().unwrap(); - let delta = current.clone(); - current.clear_stats(); - delta - } - - #[inline] - pub fn bucket_stat(&self) -> &Option { - &self.bucket_stat - } - - #[inline] - pub fn last_bucket_count(&self) -> usize { - self.last_bucket_count - } -} - impl Peer { #[inline] pub fn on_refresh_region_buckets( &mut self, store_ctx: &mut StoreContext, region_epoch: RegionEpoch, - mut buckets: Vec, + buckets: Vec, bucket_ranges: Option>, ) { - // bucket version layout - // term logical counter - // |-----------|-----------| - // high bits low bits - // term: given 10s election timeout, the 32 bit means 1362 year running time - let gen_bucket_version = |term, current_version| { - let current_version_term = current_version >> 32; - let bucket_version: u64 = if current_version_term == term { - current_version + 1 - } else { - if term > u32::MAX.into() { - error!( - self.logger, - "unexpected term {} more than u32::MAX. Bucket - version will be backward.", - term - ); - } - term << 32 - }; - bucket_version - }; - - let region = self.region(); - let current_version = self.region_buckets_info().version(); - let next_bucket_version = gen_bucket_version(self.term(), current_version); - let mut is_first_refresh = true; - let mut change_bucket_version = false; - let mut region_buckets: BucketStat; - - // The region buckets reset after this region happened split or merge. - // The message should be dropped if it's epoch is lower than the regions. - // The bucket ranges is none when the region buckets is also none. - // So this condition indicates that the region buckets needs to refresh not - // renew. 
- if let (Some(bucket_ranges), Some(peer_region_buckets)) = - (bucket_ranges, self.region_buckets_info().bucket_stat()) - { - is_first_refresh = false; - assert_eq!(buckets.len(), bucket_ranges.len()); - let mut meta_idx = 0; - region_buckets = peer_region_buckets.clone(); - let mut meta = (*region_buckets.meta).clone(); - meta.region_epoch = region_epoch; - for (bucket, bucket_range) in buckets.into_iter().zip(bucket_ranges) { - // the bucket ranges maybe need to split or merge not all the meta keys, so it - // needs to find the first keys. - while meta_idx < meta.keys.len() && meta.keys[meta_idx] != bucket_range.0 { - meta_idx += 1; - } - // meta_idx can't be not the last entry (which is end key) - if meta_idx >= meta.keys.len() - 1 { - warn!( - self.logger, - "can't find the bucket key"; - "bucket_range_key" => log_wrappers::Value::key(&bucket_range.0)); - break; - } - // the bucket size is small and does not have split keys, - // then it should be merged with its left neighbor - let region_bucket_merge_size = store_ctx - .coprocessor_host - .cfg - .region_bucket_merge_size_ratio - * (store_ctx.coprocessor_host.cfg.region_bucket_size.0 as f64); - if bucket.keys.is_empty() && bucket.size <= (region_bucket_merge_size as u64) { - meta.sizes[meta_idx] = bucket.size; - // the region has more than one bucket - // and the left neighbor + current bucket size is not very big - if meta.keys.len() > 2 - && meta_idx != 0 - && meta.sizes[meta_idx - 1] + bucket.size - < store_ctx.coprocessor_host.cfg.region_bucket_size.0 * 2 - { - // bucket is too small - region_buckets.left_merge(meta_idx); - meta.left_merge(meta_idx); - change_bucket_version = true; - continue; - } - } else { - // update size - meta.sizes[meta_idx] = bucket.size / (bucket.keys.len() + 1) as u64; - // insert new bucket keys (split the original bucket) - for bucket_key in bucket.keys { - meta_idx += 1; - region_buckets.split(meta_idx); - meta.split(meta_idx, bucket_key); - change_bucket_version = true; - } - 
} - meta_idx += 1; - } - if self.region_buckets_info().last_bucket_count() != region_buckets.meta.keys.len() - 1 - { - change_bucket_version = true; - } - if change_bucket_version { - meta.version = next_bucket_version; - } - region_buckets.meta = Arc::new(meta); - } else { - // when the region buckets is none, the exclusive buckets includes all the - // bucket keys. - assert_eq!(buckets.len(), 1); - change_bucket_version = true; - let bucket_keys = buckets.pop().unwrap().keys; - let bucket_count = bucket_keys.len() + 1; - let mut meta = BucketMeta { - region_id: self.region_id(), - region_epoch, - version: next_bucket_version, - keys: bucket_keys, - sizes: vec![store_ctx.coprocessor_host.cfg.region_bucket_size.0; bucket_count], - }; - // padding the boundary keys and initialize the flow. - meta.keys.insert(0, region.get_start_key().to_vec()); - meta.keys.push(region.get_end_key().to_vec()); - region_buckets = BucketStat::from_meta(Arc::new(meta)); + if self.term() > u32::MAX.into() { + error!( + self.logger, + "unexpected term {} more than u32::MAX. Bucket version will be backward.", + self.term() + ); } + let current_version = self.region_buckets_info().version(); + let next_bucket_version = util::gen_bucket_version(self.term(), current_version); + let region = self.region().clone(); + let change_bucket_version = self.region_buckets_info_mut().on_refresh_region_buckets( + &store_ctx.coprocessor_host.cfg, + next_bucket_version, + buckets, + region_epoch, + ®ion, + bucket_ranges, + ); + let region_buckets = self + .region_buckets_info() + .bucket_stat() + .as_ref() + .unwrap() + .clone(); let buckets_count = region_buckets.meta.keys.len() - 1; if change_bucket_version { // TODO: we may need to make it debug once the coprocessor timeout is resolved. 
@@ -281,17 +67,18 @@ impl Peer { "bucket_version" => next_bucket_version, "buckets_count" => buckets_count, "estimated_region_size" => region_buckets.meta.total_size(), - "first_refresh" => is_first_refresh, ); + } else { + // it means the buckets key range not any change, so don't need to refresh. + return; } + store_ctx.coprocessor_host.on_region_changed( - region, + self.region(), RegionChangeEvent::UpdateBuckets(buckets_count), self.state_role(), ); let meta = region_buckets.meta.clone(); - self.region_buckets_info_mut() - .set_bucket_stat(Some(region_buckets.clone())); { let mut store_meta = store_ctx.store_meta.lock().unwrap(); if let Some(reader) = store_meta.readers.get_mut(&self.region_id()) { @@ -302,13 +89,13 @@ impl Peer { if let Some(apply_scheduler) = self.apply_scheduler() { apply_scheduler.send(ApplyTask::RefreshBucketStat(region_buckets.meta.clone())); } + if !self.is_leader() { + return; + } let version = region_buckets.meta.version; let keys = region_buckets.meta.keys.clone(); // Notify followers to flush their relevant memtables let peers = self.region().get_peers().to_vec(); - if !self.is_leader() { - return; - } for p in peers { if p == *self.peer() || p.is_witness { continue; @@ -397,9 +184,9 @@ impl Peer { if !ctx.coprocessor_host.cfg.enable_region_bucket() { return None; } - let bucket_update_diff_size_threshold = ctx.coprocessor_host.cfg.region_bucket_size.0 / 2; + let region_bucket_max_size = ctx.coprocessor_host.cfg.region_bucket_size.0 * 2; self.region_buckets_info() - .gen_bucket_range_for_update(bucket_update_diff_size_threshold) + .gen_bucket_range_for_update(region_bucket_max_size) } } diff --git a/components/raftstore-v2/src/operation/command/admin/compact_log.rs b/components/raftstore-v2/src/operation/command/admin/compact_log.rs index 8920ea97e1d..1c4538ab51e 100644 --- a/components/raftstore-v2/src/operation/command/admin/compact_log.rs +++ b/components/raftstore-v2/src/operation/command/admin/compact_log.rs @@ -13,7 +13,13 @@ 
//! Updates truncated index, and compacts logs if the corresponding changes have //! been persisted in kvdb. -use std::path::PathBuf; +use std::{ + path::PathBuf, + sync::{ + atomic::{AtomicU64, Ordering}, + Arc, + }, +}; use engine_traits::{KvEngine, RaftEngine, RaftLogBatch}; use kvproto::raft_cmdpb::{AdminCmdType, AdminRequest, AdminResponse, RaftCmdRequest}; @@ -50,16 +56,21 @@ pub struct CompactLogContext { /// persisted. When persisted_apply is advanced, we need to notify tablet /// worker to destroy them. tombstone_tablets_wait_index: Vec, + /// Sometimes a tombstone tablet can be registered after tablet index is + /// advanced. We should not consider it as an active tablet otherwise it + /// might block peer destroy progress. + persisted_tablet_index: Arc, } impl CompactLogContext { - pub fn new(last_applying_index: u64) -> CompactLogContext { + pub fn new(last_applying_index: u64, persisted_applied: u64) -> CompactLogContext { CompactLogContext { skipped_ticks: 0, approximate_log_size: 0, last_applying_index, last_compacted_idx: 0, tombstone_tablets_wait_index: vec![], + persisted_tablet_index: AtomicU64::new(persisted_applied).into(), } } @@ -134,6 +145,8 @@ impl Peer { store_ctx: &mut StoreContext, force: bool, ) { + fail::fail_point!("maybe_propose_compact_log", |_| {}); + // As leader, we would not keep caches for the peers that didn't response // heartbeat in the last few seconds. That happens probably because // another TiKV is down. In this case if we do not clean up the cache, @@ -379,7 +392,9 @@ impl Peer { )); } - /// Returns if there's any tombstone being removed. + /// Returns if there's any tombstone being removed. `persisted` state may + /// not be persisted yet, caller is responsible for actually destroying the + /// physical tablets afterwards. 
#[inline] pub fn remove_tombstone_tablets(&mut self, persisted: u64) -> bool { let compact_log_context = self.compact_log_context_mut(); @@ -398,11 +413,21 @@ impl Peer { } } + /// User can only increase this counter. + #[inline] + pub fn remember_persisted_tablet_index(&self) -> Arc { + self.compact_log_context().persisted_tablet_index.clone() + } + + /// Returns whether there's any tombstone tablet newer than persisted tablet + /// index. They might still be referenced by inflight apply and cannot be + /// destroyed. pub fn has_pending_tombstone_tablets(&self) -> bool { - !self - .compact_log_context() - .tombstone_tablets_wait_index - .is_empty() + let ctx = self.compact_log_context(); + let persisted = ctx.persisted_tablet_index.load(Ordering::Relaxed); + ctx.tombstone_tablets_wait_index + .iter() + .any(|i| *i > persisted) } #[inline] @@ -411,6 +436,8 @@ impl Peer { ctx: &StoreContext, task: &mut WriteTask, ) { + let applied_index = self.entry_storage().applied_index(); + self.remove_tombstone_tablets(applied_index); assert!( !self.has_pending_tombstone_tablets(), "{} all tombstone should be cleared before being destroyed.", @@ -421,7 +448,6 @@ impl Peer { None => return, }; let region_id = self.region_id(); - let applied_index = self.entry_storage().applied_index(); let sched = ctx.schedulers.tablet.clone(); let _ = sched.schedule(tablet::Task::prepare_destroy( tablet, @@ -557,13 +583,17 @@ impl Peer { } if self.remove_tombstone_tablets(new_persisted) { let sched = store_ctx.schedulers.tablet.clone(); + let counter = self.remember_persisted_tablet_index(); if !task.has_snapshot { task.persisted_cbs.push(Box::new(move || { let _ = sched.schedule(tablet::Task::destroy(region_id, new_persisted)); + // Writer guarantees no race between different callbacks. + counter.store(new_persisted, Ordering::Relaxed); })); } else { // In snapshot, the index is persisted, tablet can be destroyed directly. 
let _ = sched.schedule(tablet::Task::destroy(region_id, new_persisted)); + counter.store(new_persisted, Ordering::Relaxed); } } } diff --git a/components/raftstore-v2/src/operation/command/admin/conf_change.rs b/components/raftstore-v2/src/operation/command/admin/conf_change.rs index 2bd06fca6c2..55cee490e52 100644 --- a/components/raftstore-v2/src/operation/command/admin/conf_change.rs +++ b/components/raftstore-v2/src/operation/command/admin/conf_change.rs @@ -10,6 +10,7 @@ use std::time::Instant; use engine_traits::{KvEngine, RaftEngine, RaftLogBatch}; +use fail::fail_point; use kvproto::{ metapb::{self, PeerRole}, raft_cmdpb::{AdminRequest, AdminResponse, ChangePeerRequest, RaftCmdRequest}, @@ -392,6 +393,14 @@ impl Apply { match change_type { ConfChangeType::AddNode => { + let add_node_fp = || { + fail_point!( + "apply_on_add_node_1_2", + self.peer_id() == 2 && self.region_id() == 1, + |_| {} + ) + }; + add_node_fp(); PEER_ADMIN_CMD_COUNTER_VEC .with_label_values(&["add_peer", "all"]) .inc(); @@ -595,15 +604,22 @@ impl Apply { "update gc peer"; "index" => log_index, "updates" => ?updates, - "gc_peers" => ?removed_records, - "merged_peers" => ?merged_records + "removed_records" => ?removed_records, + "merged_records" => ?merged_records ); removed_records.retain(|p| !updates.contains(&p.get_id())); merged_records.retain_mut(|r| { - let mut sources: Vec<_> = r.take_source_peers().into(); - sources.retain(|p| !updates.contains(&p.get_id())); - r.set_source_peers(sources.into()); - !r.get_source_peers().is_empty() + // Clean up source peers if they acknowledge GcPeerRequest. + let mut source_peers: Vec<_> = r.take_source_peers().into(); + source_peers.retain(|p| !updates.contains(&p.get_id())); + r.set_source_peers(source_peers.into()); + // Clean up source removed records (peers) if they acknowledge GcPeerRequest. 
+ let mut source_removed_records: Vec<_> = r.take_source_removed_records().into(); + source_removed_records.retain(|p| !updates.contains(&p.get_id())); + r.set_source_removed_records(source_removed_records.into()); + // Clean up merged records if all source peers and source removed records are + // empty. + !r.get_source_peers().is_empty() || !r.get_source_removed_records().is_empty() }); self.region_state_mut() .set_removed_records(removed_records.into()); diff --git a/components/raftstore-v2/src/operation/command/admin/merge/commit.rs b/components/raftstore-v2/src/operation/command/admin/merge/commit.rs index 5bd92e3ea1c..da26a423a97 100644 --- a/components/raftstore-v2/src/operation/command/admin/merge/commit.rs +++ b/components/raftstore-v2/src/operation/command/admin/merge/commit.rs @@ -172,11 +172,17 @@ impl Peer { &mut self, store_ctx: &mut StoreContext, ) { + fail::fail_point!("on_schedule_merge", |_| {}); fail::fail_point!( "ask_target_peer_to_commit_merge_2", self.region_id() == 2, |_| {} ); + fail::fail_point!( + "ask_target_peer_to_commit_merge_store_1", + store_ctx.store_id == 1, + |_| {} + ); let state = self.applied_merge_state().unwrap(); let target = state.get_target(); let target_id = target.get_id(); @@ -198,7 +204,7 @@ impl Peer { Ok(ents) => ents, Err(e) => slog_panic!( self.logger, - "failed to get merge entires"; + "failed to get merge entries"; "err" => ?e, "low" => low, "commit" => state.get_commit() @@ -261,6 +267,7 @@ impl Peer { store_ctx: &mut StoreContext, req: RaftCmdRequest, ) { + fail::fail_point!("on_ask_commit_merge", |_| {}); let expected_epoch = req.get_header().get_region_epoch(); let merge = req.get_admin_request().get_commit_merge(); assert!(merge.has_source_state() && merge.get_source_state().has_merge_state()); @@ -293,7 +300,10 @@ impl Peer { target_id: self.region_id(), }, ); - } else if util::is_epoch_stale(expected_epoch, region.get_region_epoch()) { + return; + } + // current region_epoch > region epoch in commit 
merge. + if util::is_epoch_stale(expected_epoch, region.get_region_epoch()) { info!( self.logger, "reject commit merge because of stale"; @@ -304,56 +314,51 @@ impl Peer { let _ = store_ctx .router .force_send(source_id, PeerMsg::RejectCommitMerge { index }); - } else if expected_epoch == region.get_region_epoch() { - assert!( - util::is_sibling_regions(source_region, region), - "{}: {:?}, {:?}", - SlogFormat(&self.logger), - source_region, - region - ); - assert!( - region_on_same_stores(source_region, region), - "{:?}, {:?}", - source_region, - region - ); - assert!(!self.storage().has_dirty_data()); - if self.is_leader() { - let index = commit_of_merge(req.get_admin_request().get_commit_merge()); - if self.proposal_control().is_merging() { - // `on_admin_command` may delay our request indefinitely. It's better to check - // directly. - info!( - self.logger, - "reject commit merge because of target is merging with another region"; - ); - } else { - let (ch, res) = CmdResChannel::pair(); - self.on_admin_command(store_ctx, req, ch); - if let Some(res) = res.take_result() - && res.get_header().has_error() - { - error!( - self.logger, - "failed to propose commit merge"; - "source" => source_id, - "res" => ?res, - ); - } else { - return; - } - } - let _ = store_ctx - .router - .force_send(source_id, PeerMsg::RejectCommitMerge { index }); - } - } else { + return; + } + // current region_epoch < region epoch in commit merge. 
+ if util::is_epoch_stale(region.get_region_epoch(), expected_epoch) { info!( self.logger, - "ignore commit merge because self epoch is stale"; + "target region still not catch up, skip."; "source" => ?source_region, + "target_region_epoch" => ?expected_epoch, + "exist_region_epoch" => ?self.region().get_region_epoch(), + ); + return; + } + assert!( + util::is_sibling_regions(source_region, region), + "{}: {:?}, {:?}", + SlogFormat(&self.logger), + source_region, + region + ); + assert!( + region_on_same_stores(source_region, region), + "{:?}, {:?}", + source_region, + region + ); + assert!(!self.storage().has_dirty_data()); + let (ch, res) = CmdResChannel::pair(); + self.on_admin_command(store_ctx, req, ch); + if let Some(res) = res.take_result() + && res.get_header().has_error() + { + error!( + self.logger, + "failed to propose commit merge"; + "source" => source_id, + "res" => ?res, ); + fail::fail_point!( + "on_propose_commit_merge_fail_store_1", + store_ctx.store_id == 1, + |_| {} + ); + } else { + fail::fail_point!("on_propose_commit_merge_success"); } } @@ -362,10 +367,11 @@ impl Peer { store_ctx: &mut StoreContext, req: RaftCmdRequest, ) -> Result { + (|| fail::fail_point!("propose_commit_merge_1", store_ctx.store_id == 1, |_| {}))(); let mut proposal_ctx = ProposalContext::empty(); proposal_ctx.insert(ProposalContext::COMMIT_MERGE); let data = req.write_to_bytes().unwrap(); - self.propose_with_ctx(store_ctx, data, proposal_ctx.to_vec()) + self.propose_with_ctx(store_ctx, data, proposal_ctx) } } @@ -532,9 +538,6 @@ impl Apply { state.set_state(PeerState::Normal); assert!(!state.has_merge_state()); state.set_tablet_index(index); - let mut removed_records: Vec<_> = state.take_removed_records().into(); - removed_records.append(&mut source_state.get_removed_records().into()); - state.set_removed_records(removed_records.into()); let mut merged_records: Vec<_> = state.take_merged_records().into(); merged_records.append(&mut 
source_state.get_merged_records().into()); state.set_merged_records(merged_records.into()); @@ -542,6 +545,7 @@ impl Apply { merged_record.set_source_region_id(source_region.get_id()); merged_record.set_source_epoch(source_region.get_region_epoch().clone()); merged_record.set_source_peers(source_region.get_peers().into()); + merged_record.set_source_removed_records(source_state.get_removed_records().into()); merged_record.set_target_region_id(region.get_id()); merged_record.set_target_epoch(region.get_region_epoch().clone()); merged_record.set_target_peers(region.get_peers().into()); @@ -683,6 +687,8 @@ impl Peer { info!( self.logger, "become follower for new logs"; + "first_log_term" => first.term, + "first_log_index" => first.index, "new_log_term" => last_log.term, "new_log_index" => last_log.index, "term" => self.term(), @@ -730,6 +736,12 @@ impl Peer { store_ctx: &mut StoreContext, mut res: CommitMergeResult, ) { + fail::fail_point!( + "on_apply_res_commit_merge_2", + self.peer().store_id == 2, + |_| {} + ); + let region = res.region_state.get_region(); assert!( res.source.get_end_key() == region.get_end_key() @@ -815,6 +827,7 @@ impl Peer { "target_region" => ?self.region(), ); self.add_pending_tick(PeerTick::SplitRegionCheck); + self.maybe_schedule_gc_peer_tick(); } } diff --git a/components/raftstore-v2/src/operation/command/admin/merge/prepare.rs b/components/raftstore-v2/src/operation/command/admin/merge/prepare.rs index d3d1896287c..5de1c4cfe01 100644 --- a/components/raftstore-v2/src/operation/command/admin/merge/prepare.rs +++ b/components/raftstore-v2/src/operation/command/admin/merge/prepare.rs @@ -214,27 +214,12 @@ impl Peer { let mut proposal_ctx = ProposalContext::empty(); proposal_ctx.insert(ProposalContext::PREPARE_MERGE); let data = req.write_to_bytes().unwrap(); - self.propose_with_ctx(store_ctx, data, proposal_ctx.to_vec()) + self.propose_with_ctx(store_ctx, data, proposal_ctx) }); if r.is_ok() { 
self.proposal_control_mut().set_pending_prepare_merge(false); } else { - // Match v1::post_propose_fail. - // If we just failed to propose PrepareMerge, the pessimistic locks status - // may become MergingRegion incorrectly. So, we have to revert it here. - // Note: The `is_merging` check from v1 is removed because proposed - // `PrepareMerge` rejects all writes (in `ProposalControl::check_conflict`). - assert!( - !self.proposal_control().is_merging(), - "{}", - SlogFormat(&self.logger) - ); - self.take_merge_context(); - self.proposal_control_mut().set_pending_prepare_merge(false); - let mut pessimistic_locks = self.txn_context().ext().pessimistic_locks.write(); - if pessimistic_locks.status == LocksStatus::MergingRegion { - pessimistic_locks.status = LocksStatus::Normal; - } + self.post_prepare_merge_fail(); } r } @@ -707,6 +692,25 @@ impl Peer { self.propose(store_ctx, cmd.write_to_bytes().unwrap())?; Ok(()) } + + pub fn post_prepare_merge_fail(&mut self) { + // Match v1::post_propose_fail. + // If we just failed to propose PrepareMerge, the pessimistic locks status + // may become MergingRegion incorrectly. So, we have to revert it here. + // Note: The `is_merging` check from v1 is removed because proposed + // `PrepareMerge` rejects all writes (in `ProposalControl::check_conflict`). 
+ assert!( + !self.proposal_control().is_merging(), + "{}", + SlogFormat(&self.logger) + ); + self.take_merge_context(); + self.proposal_control_mut().set_pending_prepare_merge(false); + let mut pessimistic_locks = self.txn_context().ext().pessimistic_locks.write(); + if pessimistic_locks.status == LocksStatus::MergingRegion { + pessimistic_locks.status = LocksStatus::Normal; + } + } } impl Apply { @@ -812,6 +816,8 @@ impl Peer { store_ctx: &mut StoreContext, res: PrepareMergeResult, ) { + fail::fail_point!("on_apply_res_prepare_merge"); + let region = res.region_state.get_region().clone(); { let mut meta = store_ctx.store_meta.lock().unwrap(); diff --git a/components/raftstore-v2/src/operation/command/admin/merge/rollback.rs b/components/raftstore-v2/src/operation/command/admin/merge/rollback.rs index cb45fdcf1cf..adc49a928b3 100644 --- a/components/raftstore-v2/src/operation/command/admin/merge/rollback.rs +++ b/components/raftstore-v2/src/operation/command/admin/merge/rollback.rs @@ -4,9 +4,8 @@ use engine_traits::{KvEngine, RaftEngine, RaftLogBatch}; use kvproto::{ - metapb, raft_cmdpb::{AdminCmdType, AdminRequest, AdminResponse}, - raft_serverpb::PeerState, + raft_serverpb::{PeerState, RegionLocalState}, }; use raftstore::{ coprocessor::RegionChangeReason, @@ -28,7 +27,7 @@ use crate::{ #[derive(Debug)] pub struct RollbackMergeResult { commit: u64, - region: metapb::Region, + region_state: RegionLocalState, } impl Peer { @@ -38,6 +37,7 @@ impl Peer { store_ctx: &mut StoreContext, index: u64, ) { + fail::fail_point!("on_reject_commit_merge_1", store_ctx.store_id == 1, |_| {}); let self_index = self.merge_context().and_then(|c| c.prepare_merge_index()); if self_index != Some(index) { info!( @@ -75,7 +75,7 @@ impl Apply { pub fn apply_rollback_merge( &mut self, req: &AdminRequest, - _index: u64, + index: u64, ) -> Result<(AdminResponse, AdminCmdResult)> { fail::fail_point!("apply_rollback_merge"); PEER_ADMIN_CMD_COUNTER.rollback_merge.all.inc(); @@ -95,6 +95,15 
@@ impl Apply { "state" => ?merge_state, ); } + + let prepare_merge_commit = rollback.commit; + info!( + self.logger, + "execute RollbackMerge"; + "commit" => prepare_merge_commit, + "index" => index, + ); + let mut region = self.region().clone(); let version = region.get_region_epoch().get_version(); // Update version to avoid duplicated rollback requests. @@ -108,7 +117,7 @@ impl Apply { AdminResponse::default(), AdminCmdResult::RollbackMerge(RollbackMergeResult { commit: rollback.get_commit(), - region, + region_state: self.region_state().clone(), }), )) } @@ -121,6 +130,7 @@ impl Peer { store_ctx: &mut StoreContext, res: RollbackMergeResult, ) { + let region = res.region_state.get_region(); assert_ne!(res.commit, 0); let current = self.merge_context().and_then(|c| c.prepare_merge_index()); if current != Some(res.commit) { @@ -133,21 +143,21 @@ impl Peer { } { let mut meta = store_ctx.store_meta.lock().unwrap(); - meta.set_region(&res.region, true, &self.logger); - let (reader, _) = meta.readers.get_mut(&res.region.get_id()).unwrap(); + meta.set_region(region, true, &self.logger); + let (reader, _) = meta.readers.get_mut(&region.get_id()).unwrap(); self.set_region( &store_ctx.coprocessor_host, reader, - res.region.clone(), + region.clone(), RegionChangeReason::RollbackMerge, self.storage().region_state().get_tablet_index(), ); } - let region_state = self.storage().region_state().clone(); let region_id = self.region_id(); self.state_changes_mut() - .put_region_state(region_id, res.commit, &region_state) + .put_region_state(region_id, res.commit, &res.region_state) .unwrap(); + self.storage_mut().set_region_state(res.region_state); self.set_has_extra_write(); self.rollback_merge(store_ctx); diff --git a/components/raftstore-v2/src/operation/command/admin/mod.rs b/components/raftstore-v2/src/operation/command/admin/mod.rs index d59a564c696..b861f86f859 100644 --- a/components/raftstore-v2/src/operation/command/admin/mod.rs +++
b/components/raftstore-v2/src/operation/command/admin/mod.rs @@ -12,6 +12,7 @@ use compact_log::CompactLogResult; use conf_change::{ConfChangeResult, UpdateGcPeersResult}; use engine_traits::{KvEngine, RaftEngine}; use kvproto::{ + kvrpcpb::DiskFullOpt, metapb::{PeerRole, Region}, raft_cmdpb::{AdminCmdType, RaftCmdRequest}, raft_serverpb::{ExtraMessageType, FlushMemtable, RaftMessage}, @@ -29,17 +30,17 @@ use raftstore::{ cmd_resp, fsm::{apply, apply::validate_batch_split}, msg::ErrorCallback, - Transport, + ProposalContext, Transport, }, Error, }; -use slog::{error, info}; +use slog::{debug, error, info}; use split::SplitResult; pub use split::{ report_split_init_finish, temp_split_path, RequestHalfSplit, RequestSplit, SplitFlowControl, SplitInit, SplitPendingAppend, SPLIT_PREFIX, }; -use tikv_util::{box_err, log::SlogFormat, slog_panic}; +use tikv_util::{box_err, log::SlogFormat, slog_panic, sys::disk::DiskUsage}; use txn_types::WriteBatchFlags; use self::flashback::FlashbackResult; @@ -103,6 +104,18 @@ impl Peer { let pre_transfer_leader = cmd_type == AdminCmdType::TransferLeader && !WriteBatchFlags::from_bits_truncate(req.get_header().get_flags()) .contains(WriteBatchFlags::TRANSFER_LEADER_PROPOSAL); + let is_conf_change = apply::is_conf_change_cmd(&req); + + // Check whether the admin request can be proposed when disk full. + let can_skip_check = is_transfer_leader || pre_transfer_leader || is_conf_change; + if !can_skip_check && let Err(e) = + self.check_proposal_with_disk_full_opt(ctx, DiskFullOpt::AllowedOnAlmostFull) + { + let resp = cmd_resp::new_error(e); + ch.report_error(resp); + self.post_propose_fail(cmd_type); + return; + } // The admin request is rejected because it may need to update epoch checker // which introduces an uncertainty and may breaks the correctness of epoch @@ -134,9 +147,11 @@ impl Peer { ch.report_error(resp); return; } + // Prepare Merge need to be broadcast to as many as followers when disk full. 
+ self.on_prepare_merge(cmd_type, ctx); // To maintain propose order, we need to make pending proposal first. self.propose_pending_writes(ctx); - let res = if apply::is_conf_change_cmd(&req) { + let res = if is_conf_change { self.propose_conf_change(ctx, req) } else { // propose other admin command. @@ -222,10 +237,14 @@ impl Peer { } } AdminCmdType::CompactLog => self.propose_compact_log(ctx, req), - AdminCmdType::UpdateGcPeer | AdminCmdType::RollbackMerge => { + AdminCmdType::UpdateGcPeer => { let data = req.write_to_bytes().unwrap(); self.propose(ctx, data) } + AdminCmdType::RollbackMerge => { + let data = req.write_to_bytes().unwrap(); + self.propose_with_ctx(ctx, data, ProposalContext::ROLLBACK_MERGE) + } AdminCmdType::PrepareMerge => self.propose_prepare_merge(ctx, req), AdminCmdType::CommitMerge => self.propose_commit_merge(ctx, req), AdminCmdType::PrepareFlashback | AdminCmdType::FinishFlashback => { @@ -258,6 +277,42 @@ impl Peer { self.post_propose_command(ctx, res, vec![ch], true); } + fn on_prepare_merge( + &mut self, + cmd_type: AdminCmdType, + ctx: &StoreContext, + ) { + let is_merge_cmd = + cmd_type == AdminCmdType::PrepareMerge || cmd_type == AdminCmdType::RollbackMerge; + let has_disk_full_peers = self.abnormal_peer_context().disk_full_peers().is_empty(); + let proposal_index = self.next_proposal_index(); + if is_merge_cmd + && (!matches!(ctx.self_disk_usage, DiskUsage::Normal) || !has_disk_full_peers) + { + self.has_region_merge_proposal = true; + self.region_merge_proposal_index = proposal_index; + let mut peers = vec![]; + self.abnormal_peer_context_mut() + .disk_full_peers_mut() + .peers_mut() + .iter_mut() + .for_each(|(k, v)| { + if !matches!(v.0, DiskUsage::AlreadyFull) { + v.1 = true; + peers.push(*k); + } + }); + debug!( + self.logger, + "adjust max inflight msgs"; + "cmd_type" => ?cmd_type, + "raft_max_inflight_msgs" => ctx.cfg.raft_max_inflight_msgs, + "region" => self.region_id() + ); + self.adjust_peers_max_inflight_msgs(&peers, 
ctx.cfg.raft_max_inflight_msgs); + } + } + fn start_pre_flush( &mut self, ctx: &mut StoreContext, diff --git a/components/raftstore-v2/src/operation/command/admin/split.rs b/components/raftstore-v2/src/operation/command/admin/split.rs index c744c1b9161..cfbd7678c17 100644 --- a/components/raftstore-v2/src/operation/command/admin/split.rs +++ b/components/raftstore-v2/src/operation/command/admin/split.rs @@ -35,6 +35,7 @@ use engine_traits::{ use fail::fail_point; use futures::channel::oneshot; use kvproto::{ + kvrpcpb::DiskFullOpt, metapb::{self, Region, RegionEpoch}, pdpb::CheckPolicy, raft_cmdpb::{AdminRequest, AdminResponse, RaftCmdRequest, SplitRequest}, @@ -75,6 +76,9 @@ pub struct SplitResult { // The index of the derived region in `regions` pub derived_index: usize, pub tablet_index: u64, + // new regions will share the region size if it's true. + // otherwise, the new region's size will be 0. + pub share_source_region_size: bool, // Hack: in common case we should use generic, but split is an infrequent // event that performance is not critical. And using `Any` can avoid polluting // all existing code. @@ -148,6 +152,9 @@ pub struct RequestSplit { pub epoch: RegionEpoch, pub split_keys: Vec>, pub source: Cow<'static, str>, + // new regions will share the region size if it's true. + // otherwise, the new region's size will be 0. + pub share_source_region_size: bool, } #[derive(Debug)] @@ -235,6 +242,7 @@ impl Peer { { return true; } + fail_point!("on_split_region_check_tick", |_| true); if ctx.schedulers.split_check.is_busy() { return false; } @@ -325,6 +333,14 @@ impl Peer { )))); return; } + // Check whether the admin request can be proposed when disk full. 
+ if let Err(e) = + self.check_proposal_with_disk_full_opt(ctx, DiskFullOpt::AllowedOnAlmostFull) + { + info!(self.logger, "disk is full, skip split"; "err" => ?e); + ch.set_result(cmd_resp::new_error(e)); + return; + } if let Err(e) = util::validate_split_region( self.region_id(), self.peer_id(), @@ -336,7 +352,7 @@ impl Peer { ch.set_result(cmd_resp::new_error(e)); return; } - self.ask_batch_split_pd(ctx, rs.split_keys, ch); + self.ask_batch_split_pd(ctx, rs.split_keys, rs.share_source_region_size, ch); } pub fn on_request_half_split( @@ -358,6 +374,13 @@ impl Peer { info!(self.logger, "not leader, skip."); return; } + // Check whether the admin request can be proposed when disk full. + if let Err(e) = + self.check_proposal_with_disk_full_opt(ctx, DiskFullOpt::AllowedOnAlmostFull) + { + info!(self.logger, "disk is full, skip half split"; "err" => ?e); + return; + } let region = self.region(); if util::is_epoch_stale(&rhs.epoch, region.get_region_epoch()) { @@ -479,6 +502,7 @@ impl Apply { let derived_req = &[derived_req]; let right_derive = split_reqs.get_right_derive(); + let share_source_region_size = split_reqs.get_share_source_region_size(); let reqs = if right_derive { split_reqs.get_requests().iter().chain(derived_req) } else { @@ -615,6 +639,7 @@ impl Apply { derived_index, tablet_index: log_index, tablet: Box::new(tablet), + share_source_region_size, }), )) } @@ -665,6 +690,7 @@ impl Peer { fail_point!("on_split", self.peer().get_store_id() == 3, |_| {}); let derived = &res.regions[res.derived_index]; + let share_source_region_size = res.share_source_region_size; let region_id = derived.get_id(); let region_locks = self.txn_context().split(&res.regions, derived); @@ -695,8 +721,14 @@ impl Peer { let new_region_count = res.regions.len() as u64; let control = self.split_flow_control_mut(); - let estimated_size = control.approximate_size.map(|v| v / new_region_count); - let estimated_keys = control.approximate_keys.map(|v| v / new_region_count); + // if 
share_source_region_size is true, it means the new region contains any + // data from the origin region. + let mut share_size = None; + let mut share_keys = None; + if share_source_region_size { + share_size = control.approximate_size.map(|v| v / new_region_count); + share_keys = control.approximate_keys.map(|v| v / new_region_count); + } self.post_split(); @@ -714,8 +746,11 @@ impl Peer { // After split, the peer may need to update its metrics. let control = self.split_flow_control_mut(); control.may_skip_split_check = false; - control.approximate_size = estimated_size; - control.approximate_keys = estimated_keys; + if share_source_region_size { + control.approximate_size = share_size; + control.approximate_keys = share_keys; + } + self.add_pending_tick(PeerTick::SplitRegionCheck); } self.storage_mut().set_has_dirty_data(true); @@ -760,8 +795,8 @@ impl Peer { derived_region_id: region_id, check_split: last_region_id == new_region_id, scheduled: false, - approximate_size: estimated_size, - approximate_keys: estimated_keys, + approximate_size: share_size, + approximate_keys: share_keys, locks, })); diff --git a/components/raftstore-v2/src/operation/command/admin/transfer_leader.rs b/components/raftstore-v2/src/operation/command/admin/transfer_leader.rs index 4cdeba3bc41..bf9cb426255 100644 --- a/components/raftstore-v2/src/operation/command/admin/transfer_leader.rs +++ b/components/raftstore-v2/src/operation/command/admin/transfer_leader.rs @@ -118,7 +118,7 @@ impl Peer { transferee } - fn pre_transfer_leader(&mut self, peer: &metapb::Peer) -> bool { + pub fn pre_transfer_leader(&mut self, peer: &metapb::Peer) -> bool { if self.raft_group().raft.has_pending_conf() { info!( self.logger, diff --git a/components/raftstore-v2/src/operation/command/mod.rs b/components/raftstore-v2/src/operation/command/mod.rs index 0fd88cc987b..b93ea700f80 100644 --- a/components/raftstore-v2/src/operation/command/mod.rs +++ b/components/raftstore-v2/src/operation/command/mod.rs @@ -43,7 
+43,7 @@ use raftstore::{ }, msg::ErrorCallback, util::{self, check_flashback_state}, - Config, Transport, WriteCallback, + Config, ProposalContext, Transport, WriteCallback, }, Error, Result, }; @@ -202,7 +202,8 @@ impl Peer { // progress less error-prone. if !(admin_type.is_some() && (admin_type.unwrap() == AdminCmdType::ChangePeer - || admin_type.unwrap() == AdminCmdType::ChangePeerV2)) + || admin_type.unwrap() == AdminCmdType::ChangePeerV2 + || admin_type.unwrap() == AdminCmdType::RollbackMerge)) { return Err(Error::RecoveryInProgress(self.region_id())); } @@ -239,7 +240,7 @@ impl Peer { store_ctx: &mut StoreContext, data: Vec, ) -> Result { - self.propose_with_ctx(store_ctx, data, vec![]) + self.propose_with_ctx(store_ctx, data, ProposalContext::empty()) } #[inline] @@ -247,12 +248,12 @@ impl Peer { &mut self, store_ctx: &mut StoreContext, data: Vec, - proposal_ctx: Vec, + proposal_ctx: ProposalContext, ) -> Result { // Should not propose normal in force leader state. // In `pre_propose_raft_command`, it rejects all the requests expect // conf-change if in force leader state. - if self.has_force_leader() { + if self.has_force_leader() && proposal_ctx != ProposalContext::ROLLBACK_MERGE { store_ctx.raft_metrics.invalid_proposal.force_leader.inc(); panic!( "[{}] {} propose normal in force leader state {:?}", @@ -274,7 +275,7 @@ impl Peer { }); } let last_index = self.raft_group().raft.raft_log.last_index(); - self.raft_group_mut().propose(proposal_ctx, data)?; + self.raft_group_mut().propose(proposal_ctx.to_vec(), data)?; if self.raft_group().raft.raft_log.last_index() == last_index { // The message is dropped silently, this usually due to leader absence // or transferring leader. Both cases can be considered as NotLeader error. 
@@ -455,6 +456,11 @@ impl Peer { if is_leader { self.retry_pending_prepare_merge(ctx, apply_res.applied_index); } + if !apply_res.sst_applied_index.is_empty() { + self.storage_mut() + .apply_trace_mut() + .on_sst_ingested(&apply_res.sst_applied_index); + } self.on_data_modified(apply_res.modifications); self.handle_read_on_apply( ctx, @@ -476,6 +482,12 @@ impl Peer { } self.check_unsafe_recovery_state(ctx); } + + pub fn post_propose_fail(&mut self, cmd_type: AdminCmdType) { + if cmd_type == AdminCmdType::PrepareMerge { + self.post_prepare_merge_fail(); + } + } } #[derive(Debug)] @@ -583,6 +595,7 @@ impl Apply { fail::fail_point!("APPLY_COMMITTED_ENTRIES"); fail::fail_point!("on_handle_apply_1003", self.peer_id() == 1003, |_| {}); fail::fail_point!("on_handle_apply_2", self.peer_id() == 2, |_| {}); + fail::fail_point!("on_handle_apply", |_| {}); fail::fail_point!("on_handle_apply_store_1", self.store_id() == 1, |_| {}); let now = std::time::Instant::now(); let apply_wait_time = APPLY_TASK_WAIT_TIME_HISTOGRAM.local(); @@ -865,6 +878,7 @@ impl Apply { apply_res.modifications = *self.modifications_mut(); apply_res.metrics = mem::take(&mut self.metrics); apply_res.bucket_stat = self.buckets.clone(); + apply_res.sst_applied_index = self.take_sst_applied_index(); let written_bytes = apply_res.metrics.written_bytes; let skip_report = || -> bool { diff --git a/components/raftstore-v2/src/operation/command/write/ingest.rs b/components/raftstore-v2/src/operation/command/write/ingest.rs index 7e8ed381ad0..45247b3f36f 100644 --- a/components/raftstore-v2/src/operation/command/write/ingest.rs +++ b/components/raftstore-v2/src/operation/command/write/ingest.rs @@ -2,8 +2,9 @@ use collections::HashMap; use crossbeam::channel::TrySendError; -use engine_traits::{data_cf_offset, KvEngine, RaftEngine}; +use engine_traits::{data_cf_offset, KvEngine, RaftEngine, DATA_CFS_LEN}; use kvproto::import_sstpb::SstMeta; +use pd_client::metrics::STORE_SIZE_EVENT_INT_VEC; use raftstore::{ 
store::{check_sst_for_ingestion, metrics::PEER_WRITE_CMD_COUNTER, util}, Result, @@ -16,7 +17,7 @@ use crate::{ batch::StoreContext, fsm::{ApplyResReporter, Store, StoreFsmDelegate}, raft::{Apply, Peer}, - router::{PeerMsg, StoreTick}, + router::{PeerMsg, SstApplyIndex, StoreTick}, worker::tablet, }; @@ -39,7 +40,14 @@ impl Store { &mut self, ctx: &mut StoreContext, ) -> Result<()> { + let import_size = box_try!(ctx.sst_importer.get_total_size()); + STORE_SIZE_EVENT_INT_VEC.import_size.set(import_size as i64); let ssts = box_try!(ctx.sst_importer.list_ssts()); + // filter old version SSTs + let ssts: Vec<_> = ssts + .into_iter() + .filter(|sst| sst.1 >= sst_importer::API_VERSION_2) + .collect(); if ssts.is_empty() { return Ok(()); } @@ -47,9 +55,9 @@ impl Store { let mut region_ssts: HashMap<_, Vec<_>> = HashMap::default(); for sst in ssts { region_ssts - .entry(sst.get_region_id()) + .entry(sst.0.get_region_id()) .or_default() - .push(sst); + .push(sst.0); } let ranges = ctx.sst_importer.ranges_in_import(); @@ -107,10 +115,12 @@ impl Peer { impl Apply { #[inline] pub fn apply_ingest(&mut self, index: u64, ssts: Vec) -> Result<()> { + fail::fail_point!("on_apply_ingest"); PEER_WRITE_CMD_COUNTER.ingest_sst.inc(); let mut infos = Vec::with_capacity(ssts.len()); let mut size: i64 = 0; let mut keys: u64 = 0; + let mut cf_indexes = [u64::MAX; DATA_CFS_LEN]; for sst in &ssts { // This may not be enough as ingest sst may not trigger flush at all. let off = data_cf_offset(sst.get_cf_name()); @@ -138,6 +148,7 @@ impl Apply { slog_panic!(self.logger, "corrupted sst"; "sst" => ?sst, "error" => ?e); } } + cf_indexes[off] = index; } if !infos.is_empty() { // Unlike v1, we can't batch ssts accross regions. 
@@ -154,6 +165,11 @@ impl Apply { self.metrics.size_diff_hint += size; self.metrics.written_bytes += size as u64; self.metrics.written_keys += keys; + for (cf_index, index) in cf_indexes.into_iter().enumerate() { + if index != u64::MAX { + self.push_sst_applied_index(SstApplyIndex { cf_index, index }); + } + } Ok(()) } } diff --git a/components/raftstore-v2/src/operation/command/write/mod.rs b/components/raftstore-v2/src/operation/command/write/mod.rs index a9d8bd664fe..5806614e192 100644 --- a/components/raftstore-v2/src/operation/command/write/mod.rs +++ b/components/raftstore-v2/src/operation/command/write/mod.rs @@ -12,7 +12,8 @@ use raftstore::{ fsm::{apply, MAX_PROPOSAL_SIZE_RATIO}, metrics::PEER_WRITE_CMD_COUNTER, msg::ErrorCallback, - util::{self, NORMAL_REQ_CHECK_CONF_VER, NORMAL_REQ_CHECK_VER}, + util::{self}, + RaftCmdExtraOpts, }, Error, Result, }; @@ -42,6 +43,7 @@ impl Peer { header: Box, data: SimpleWriteBinary, ch: CmdResChannel, + extra_opts: Option, ) { if !self.serving() { apply::notify_req_region_removed(self.region_id(), ch); @@ -59,6 +61,20 @@ impl Peer { ch.report_error(resp); return; } + if let Some(opts) = extra_opts { + if let Some(Err(e)) = opts.deadline.map(|deadline| deadline.check()) { + let resp = cmd_resp::new_error(e.into()); + ch.report_error(resp); + return; + } + // Check whether the write request can be proposed with the given disk full + // option. + if let Err(e) = self.check_proposal_with_disk_full_opt(ctx, opts.disk_full_opt) { + let resp = cmd_resp::new_error(e); + ch.report_error(resp); + return; + } + } // To maintain propose order, we need to make pending proposal first. self.propose_pending_writes(ctx); if let Some(conflict) = self.proposal_control_mut().check_conflict(None) { @@ -72,13 +88,10 @@ impl Peer { ch.report_error(resp); return; } - // ProposalControl is reliable only when applied to current term. 
- let call_proposed_on_success = self.applied_to_current_term(); let mut encoder = SimpleWriteReqEncoder::new( header, data, (ctx.cfg.raft_entry_max_size.0 as f64 * MAX_PROPOSAL_SIZE_RATIO) as usize, - call_proposed_on_success, ); encoder.add_response_channel(ch); self.set_has_ready(); @@ -98,7 +111,6 @@ impl Peer { Box::::default(), data, ctx.cfg.raft_entry_max_size.0 as usize, - false, ) .encode() .0 @@ -110,30 +122,17 @@ impl Peer { pub fn propose_pending_writes(&mut self, ctx: &mut StoreContext) { if let Some(encoder) = self.simple_write_encoder_mut().take() { - let call_proposed_on_success = if encoder.notify_proposed() { - // The request has pass conflict check and called all proposed callbacks. + let header = encoder.header(); + let res = self.validate_command(header, None, &mut ctx.raft_metrics); + let call_proposed_on_success = if matches!(res, Err(Error::EpochNotMatch { .. })) { false } else { - // Epoch may have changed since last check. - let from_epoch = encoder.header().get_region_epoch(); - let res = util::compare_region_epoch( - from_epoch, - self.region(), - NORMAL_REQ_CHECK_CONF_VER, - NORMAL_REQ_CHECK_VER, - true, - ); - if let Err(e) = res { - // TODO: query sibling regions. - ctx.raft_metrics.invalid_proposal.epoch_not_match.inc(); - encoder.encode().1.report_error(cmd_resp::new_error(e)); - return; - } - // Only when it applies to current term, the epoch check can be reliable. self.applied_to_current_term() }; + let (data, chs) = encoder.encode(); - let res = self.propose(ctx, data); + let res = res.and_then(|_| self.propose(ctx, data)); + fail_point!("after_propose_pending_writes"); self.post_propose_command(ctx, res, chs, call_proposed_on_success); diff --git a/components/raftstore-v2/src/operation/life.rs b/components/raftstore-v2/src/operation/life.rs index 52f00d137f8..e9fc84643da 100644 --- a/components/raftstore-v2/src/operation/life.rs +++ b/components/raftstore-v2/src/operation/life.rs @@ -26,28 +26,35 @@ //! 
`merged_records`, to avoid race between destroy and merge, leader needs to //! ask target peer to destroy source peer. -use std::{cmp, mem}; +use std::{cmp, collections::HashSet, mem}; use batch_system::BasicMailbox; use crossbeam::channel::{SendError, TrySendError}; use engine_traits::{KvEngine, RaftEngine, RaftLogBatch}; use kvproto::{ - metapb::{self, Region}, + kvrpcpb::DiskFullOpt, + metapb::{self, PeerRole, Region}, raft_cmdpb::{AdminCmdType, RaftCmdRequest}, raft_serverpb::{ExtraMessage, ExtraMessageType, PeerState, RaftMessage}, }; -use raftstore::store::{ - fsm::{ - apply, - life::{build_peer_destroyed_report, forward_destroy_to_source_peer}, - Proposal, +use raft::eraftpb::MessageType; +use raftstore::{ + store::{ + fsm::{ + apply, + life::{build_peer_destroyed_report, forward_destroy_to_source_peer}, + Proposal, + }, + local_metrics::IoType as InspectIoType, + metrics::RAFT_PEER_PENDING_DURATION, + util, DiskFullPeers, Transport, WriteTask, }, - metrics::RAFT_PEER_PENDING_DURATION, - util, Transport, WriteTask, + Error, Result, }; use slog::{debug, error, info, warn}; use tikv_util::{ store::find_peer, + sys::disk::DiskUsage, time::{duration_to_sec, Instant}, }; @@ -126,16 +133,22 @@ pub struct AbnormalPeerContext { pending_peers: Vec<(u64, Instant)>, /// A inaccurate cache about which peer is marked as down. down_peers: Vec, + // disk full peer set. + disk_full_peers: DiskFullPeers, + // show whether an already disk full TiKV appears in the potential majority set. + dangerous_majority_set: bool, } impl AbnormalPeerContext { #[inline] pub fn is_empty(&self) -> bool { - self.pending_peers.is_empty() && self.down_peers.is_empty() + self.pending_peers.is_empty() && self.down_peers.is_empty() /* && self.disk_full_peers.is_empty() */ } #[inline] pub fn reset(&mut self) { + // No need to refresh disk_full_peers as it will be refreshed + // automatically when the disk usage updated. 
    /// Returns a shared reference to the recorded set of disk-full peers.
    #[inline]
    pub fn disk_full_peers(&self) -> &DiskFullPeers {
        &self.disk_full_peers
    }

    /// Returns a mutable reference to the recorded set of disk-full peers.
    #[inline]
    pub fn disk_full_peers_mut(&mut self) -> &mut DiskFullPeers {
        &mut self.disk_full_peers
    }

    /// Returns the stored `dangerous_majority_set` flag.
    #[inline]
    pub fn is_dangerous_majority_set(&self) -> bool {
        self.dangerous_majority_set
    }

    /// Overwrites the stored `dangerous_majority_set` flag with `is_dangerous`.
    #[inline]
    pub fn setup_dangerous_majority_set(&mut self, is_dangerous: bool) {
        self.dangerous_majority_set = is_dangerous;
    }
+ let msg_type = msg.get_message().get_msg_type(); + if matches!(ctx.self_disk_usage, DiskUsage::AlreadyFull) + && MessageType::MsgTimeoutNow == msg_type + { + debug!( + self.logger(), + "skip {:?} because of disk full", msg_type; + "region_id" => region_id, "peer_id" => to_peer.id, + ); + ctx.raft_metrics.message_dropped.disk_full.inc(); + return false; + } + let destroyed = match check_if_to_peer_destroyed(&ctx.engine, &msg, self.store_id()) { Ok(d) => d, Err(e) => { @@ -424,7 +471,13 @@ impl Store { }; if destroyed { if msg.get_is_tombstone() { + let msg_region_epoch = msg.get_region_epoch().clone(); if let Some(msg) = build_peer_destroyed_report(&mut msg) { + info!(self.logger(), "peer reports destroyed"; + "from_peer" => ?msg.get_from_peer(), + "from_region_epoch" => ?msg_region_epoch, + "region_id" => ?msg.get_region_id(), + "to_peer_id" => ?msg.get_to_peer().get_id()); let _ = ctx.trans.send(msg); } return false; @@ -510,7 +563,7 @@ impl Store { if from_peer.id != raft::INVALID_ID { // For now the peer only exists in memory. It will persist its states when // handling its first readiness. - let _ = ctx.router.send(region_id, PeerMsg::RaftMessage(msg)); + let _ = ctx.router.send(region_id, PeerMsg::RaftMessage(msg, None)); } true } @@ -527,9 +580,9 @@ impl Store { { // Record the last statistics of commit-log-duration and store-write-duration. inspector.record_store_wait(start_ts.saturating_elapsed()); - inspector.record_store_commit(ctx.raft_metrics.stat_commit_log.avg()); - // Reset the stat_commit_log and wait it to be refreshed in the next tick. - ctx.raft_metrics.stat_commit_log.reset(); + inspector.record_store_commit(ctx.raft_metrics.health_stats.avg(InspectIoType::Network)); + // Reset the health_stats and wait it to be refreshed in the next tick. 
+ ctx.raft_metrics.health_stats.reset(); ctx.pending_latency_inspect.push(inspector); } } @@ -581,7 +634,11 @@ impl Peer { .iter() .find(|p| p.id == msg.get_from_peer().get_id()) { - let tombstone_msg = self.tombstone_message_for_same_region(peer.clone()); + let tombstone_msg = self.tombstone_message( + self.region_id(), + self.region().get_region_epoch().clone(), + peer.clone(), + ); self.add_message(tombstone_msg); true } else { @@ -589,13 +646,24 @@ impl Peer { } } - fn tombstone_message_for_same_region(&self, peer: metapb::Peer) -> RaftMessage { - let region_id = self.region_id(); + fn tombstone_message( + &self, + region_id: u64, + region_epoch: metapb::RegionEpoch, + peer: metapb::Peer, + ) -> RaftMessage { let mut tombstone_message = RaftMessage::default(); + if self.region_id() != region_id { + // After merge, target region needs to GC peers of source region. + let extra_msg = tombstone_message.mut_extra_msg(); + extra_msg.set_type(ExtraMessageType::MsgGcPeerRequest); + let check_peer = extra_msg.mut_check_gc_peer(); + check_peer.set_from_region_id(self.region_id()); + } tombstone_message.set_region_id(region_id); tombstone_message.set_from_peer(self.peer().clone()); tombstone_message.set_to_peer(peer); - tombstone_message.set_region_epoch(self.region().get_region_epoch().clone()); + tombstone_message.set_region_epoch(region_epoch); tombstone_message.set_is_tombstone(true); tombstone_message } @@ -604,6 +672,10 @@ impl Peer { match msg.get_to_peer().get_id().cmp(&self.peer_id()) { cmp::Ordering::Less => { if let Some(msg) = build_peer_destroyed_report(msg) { + info!(self.logger, "peer reports destroyed"; + "from_peer" => ?msg.get_from_peer(), + "from_region_epoch" => ?msg.get_region_epoch(), + "to_peer_id" => ?msg.get_to_peer().get_id()); self.add_message(msg); } } @@ -656,6 +728,10 @@ impl Peer { let _ = router.send_raft_message(m.into()); }, ); + } else { + // Source peer is already destroyed. Forward to store, and let + // it report GcPeer response. 
+ let _ = ctx.router.send_raft_message(m.into()); } }); } @@ -671,6 +747,7 @@ impl Peer { && state.get_merged_records().iter().all(|p| { p.get_source_peers() .iter() + .chain(p.get_source_removed_records()) .all(|p| p.get_id() != gc_peer_id) }) { @@ -683,6 +760,37 @@ impl Peer { ctx.confirmed_ids.push(gc_peer_id); } + // Clean up removed and merged records for peers on tombstone stores, + // otherwise it may keep sending gc peer request to the tombstone store. + pub fn on_store_maybe_tombstone_gc_peer(&mut self, store_id: u64) { + let mut peers_on_tombstone = vec![]; + let state = self.storage().region_state(); + for peer in state.get_removed_records() { + if peer.get_store_id() == store_id { + peers_on_tombstone.push(peer.clone()); + } + } + for record in state.get_merged_records() { + for peer in record.get_source_peers() { + if peer.get_store_id() == store_id { + peers_on_tombstone.push(peer.clone()); + } + } + } + if peers_on_tombstone.is_empty() { + return; + } + info!(self.logger, "gc peer on tombstone store"; + "tombstone_store_id" => store_id, + "peers" => ?peers_on_tombstone); + let ctx = self.gc_peer_context_mut(); + for peer in peers_on_tombstone { + if !ctx.confirmed_ids.contains(&peer.get_id()) { + ctx.confirmed_ids.push(peer.get_id()); + } + } + } + // Removes deleted peers from region state by proposing a `UpdateGcPeer` // command. pub fn on_gc_peer_tick(&mut self, ctx: &mut StoreContext) { @@ -695,27 +803,50 @@ impl Peer { } let mut need_gc_ids = Vec::with_capacity(5); let gc_context = self.gc_peer_context(); + let mut tombstone_removed_records = + |region_id, region_epoch: &metapb::RegionEpoch, peer: &metapb::Peer| { + need_gc_ids.push(peer.get_id()); + if gc_context.confirmed_ids.contains(&peer.get_id()) { + return; + } + + let msg = self.tombstone_message(region_id, region_epoch.clone(), peer.clone()); + // For leader, it's OK to send gc message immediately. 
+ let _ = ctx.trans.send(msg); + }; for peer in state.get_removed_records() { - need_gc_ids.push(peer.get_id()); - if gc_context.confirmed_ids.contains(&peer.get_id()) { - continue; + tombstone_removed_records(self.region_id(), self.region().get_region_epoch(), peer); + } + // For merge, we need to + // 1. ask source removed peers to destroy. + for record in state.get_merged_records() { + for peer in record.get_source_removed_records() { + tombstone_removed_records( + record.get_source_region_id(), + record.get_source_epoch(), + peer, + ); } - - let msg = self.tombstone_message_for_same_region(peer.clone()); - // For leader, it's OK to send gc message immediately. - let _ = ctx.trans.send(msg); } + // 2. ask target to check whether source should be deleted. for record in state.get_merged_records() { - // For merge, we ask target to check whether source should be deleted. - for (source, target) in record - .get_source_peers() - .iter() - .zip(record.get_target_peers()) - { + for source in record.get_source_peers() { need_gc_ids.push(source.get_id()); if gc_context.confirmed_ids.contains(&source.get_id()) { continue; } + let Some(target) = record + .get_target_peers() + .iter() + .find(|p| p.get_store_id() == source.get_store_id()) + else { + panic!( + "[region {}] {} target peer not found, {:?}", + self.region_id(), + self.peer_id(), + state + ); + }; let mut msg = RaftMessage::default(); msg.set_region_id(record.get_target_region_id()); @@ -752,6 +883,266 @@ impl Peer { self.maybe_schedule_gc_peer_tick(); } + pub fn adjust_peers_max_inflight_msgs(&mut self, peers: &[u64], raft_max_inflight_msgs: usize) { + peers.iter().for_each(|id| { + self.raft_group_mut() + .raft + .adjust_max_inflight_msgs(*id, raft_max_inflight_msgs); + debug!( + self.logger, + "adjust max inflight msgs"; + "raft_max_inflight_msgs" => raft_max_inflight_msgs, + "peer_id" => id + ); + }); + } + + // Check disk usages for the peer itself and other peers in the raft group. 
+ // The return value indicates whether the proposal is allowed or not. + pub fn check_proposal_with_disk_full_opt( + &mut self, + ctx: &StoreContext, + disk_full_opt: DiskFullOpt, + ) -> Result<()> { + let leader_allowed = match ctx.self_disk_usage { + DiskUsage::Normal => true, + DiskUsage::AlmostFull => !matches!(disk_full_opt, DiskFullOpt::NotAllowedOnFull), + DiskUsage::AlreadyFull => false, + }; + let mut disk_full_stores = Vec::new(); + let abnormal_peer_context = self.abnormal_peer_context(); + let disk_full_peers = abnormal_peer_context.disk_full_peers(); + if !leader_allowed { + disk_full_stores.push(ctx.store_id); + // Try to transfer leader to a node with disk usage normal to maintain write + // availability. If majority node is disk full, to transfer leader or not is not + // necessary. Note: Need to exclude learner node. + if !disk_full_peers.majority() { + let target_peer = self + .region() + .get_peers() + .iter() + .find(|x| { + !disk_full_peers.has(x.get_id()) + && x.get_id() != self.peer_id() + && !self + .abnormal_peer_context() + .down_peers() + .contains(&x.get_id()) + && !matches!(x.get_role(), PeerRole::Learner) + }) + .cloned(); + if let Some(p) = target_peer { + debug!( + self.logger, + "try to transfer leader because of current leader disk full"; + "region_id" => self.region().get_id(), + "peer_id" => self.peer_id(), + "target_peer_id" => p.get_id(), + ); + self.pre_transfer_leader(&p); + } + } + } else { + // Check followers. + if disk_full_peers.is_empty() { + return Ok(()); + } + if !abnormal_peer_context.is_dangerous_majority_set() { + if !disk_full_peers.majority() { + return Ok(()); + } + // Majority peers are in disk full status but the request carries a special + // flag. 
+ if matches!(disk_full_opt, DiskFullOpt::AllowedOnAlmostFull) + && disk_full_peers.peers().values().any(|x| x.1) + { + return Ok(()); + } + } + for peer in self.region().get_peers() { + let (peer_id, store_id) = (peer.get_id(), peer.get_store_id()); + if disk_full_peers.peers().get(&peer_id).is_some() { + disk_full_stores.push(store_id); + } + } + } + let errmsg = format!( + "propose failed: tikv disk full, cmd diskFullOpt={:?}, leader diskUsage={:?}", + disk_full_opt, ctx.self_disk_usage + ); + Err(Error::DiskFull(disk_full_stores, errmsg)) + } + + pub fn clear_disk_full_peers(&mut self, ctx: &StoreContext) { + let disk_full_peers = mem::take(self.abnormal_peer_context_mut().disk_full_peers_mut()); + let raft = &mut self.raft_group_mut().raft; + for peer in disk_full_peers.peers().iter() { + raft.adjust_max_inflight_msgs(*peer.0, ctx.cfg.raft_max_inflight_msgs); + } + } + + pub fn refill_disk_full_peers(&mut self, ctx: &StoreContext) { + self.clear_disk_full_peers(ctx); + debug!( + self.logger, + "region id {}, peer id {}, store id {}: refill disk full peers when peer disk usage status changed or merge triggered", + self.region().get_id(), + self.peer_id(), + ctx.store_id, + ); + + // Collect disk full peers and all peers' `next_idx` to find a potential quorum. + let peers_len = self.region().get_peers().len(); + let mut normal_peers = HashSet::default(); + let mut next_idxs = Vec::with_capacity(peers_len); + let mut min_peer_index = u64::MAX; + for peer in self.region().get_peers() { + let (peer_id, store_id) = (peer.get_id(), peer.get_store_id()); + let usage = ctx.store_disk_usages.get(&store_id); + if usage.is_none() { + // Always treat the leader itself as normal. + normal_peers.insert(peer_id); + } + if let Some(pr) = self.raft_group().raft.prs().get(peer_id) { + // status 3-normal, 2-almostfull, 1-alreadyfull, only for simplying the sort + // func belowing. 
+ let mut status = 3; + if let Some(usg) = usage { + status = match usg { + DiskUsage::Normal => 3, + DiskUsage::AlmostFull => 2, + DiskUsage::AlreadyFull => 1, + }; + } + + if !self.abnormal_peer_context().down_peers().contains(&peer_id) { + next_idxs.push((peer_id, pr.next_idx, usage, status)); + if min_peer_index > pr.next_idx { + min_peer_index = pr.next_idx; + } + } + } + } + if self.has_region_merge_proposal { + debug!( + self.logger, + "region id {}, peer id {}, store id {} has a merge request, with region_merge_proposal_index {}", + self.region_id(), + self.peer_id(), + ctx.store_id, + self.region_merge_proposal_index + ); + if min_peer_index > self.region_merge_proposal_index { + self.has_region_merge_proposal = false; + } + } + + if normal_peers.len() == peers_len { + return; + } + + // Reverse sort peers based on `next_idx`, `usage` and `store healthy status`, + // then try to get a potential quorum. + next_idxs.sort_by(|x, y| { + if x.3 == y.3 { + y.1.cmp(&x.1) + } else { + y.3.cmp(&x.3) + } + }); + + let majority = !self.raft_group().raft.prs().has_quorum(&normal_peers); + self.abnormal_peer_context_mut() + .disk_full_peers_mut() + .set_majority(majority); + // Here set all peers can be sent when merging. + for &(peer, _, usage, ..) 
in &next_idxs { + if let Some(usage) = usage { + if self.has_region_merge_proposal && !matches!(*usage, DiskUsage::AlreadyFull) { + self.abnormal_peer_context_mut() + .disk_full_peers_mut() + .peers_mut() + .insert(peer, (*usage, true)); + self.raft_group_mut() + .raft + .adjust_max_inflight_msgs(peer, ctx.cfg.raft_max_inflight_msgs); + debug!( + self.logger, + "refill disk full peer max inflight to {} on a merging region: region id {}, peer id {}", + ctx.cfg.raft_max_inflight_msgs, + self.region_id(), + peer + ); + } else { + self.abnormal_peer_context_mut() + .disk_full_peers_mut() + .peers_mut() + .insert(peer, (*usage, false)); + self.raft_group_mut().raft.adjust_max_inflight_msgs(peer, 0); + debug!( + self.logger, + "refill disk full peer max inflight to {} on region without merging: region id {}, peer id {}", + 0, + self.region_id(), + peer + ); + } + } + } + + if !self.abnormal_peer_context().disk_full_peers().majority() { + // Less than majority peers are in disk full status. + return; + } + + let (mut potential_quorum, mut quorum_ok) = (HashSet::default(), false); + let mut is_dangerous_set = false; + for &(peer_id, _, _, status) in &next_idxs { + potential_quorum.insert(peer_id); + + if status == 1 { + // already full peer. + is_dangerous_set = true; + } + + if self.raft_group().raft.prs().has_quorum(&potential_quorum) { + quorum_ok = true; + break; + } + } + + self.abnormal_peer_context_mut() + .setup_dangerous_majority_set(is_dangerous_set); + + // For the Peer with AlreadFull in potential quorum set, we still need to send + // logs to it. To support incoming configure change. + if quorum_ok { + let has_region_merge_proposal = self.has_region_merge_proposal; + let peers = self + .abnormal_peer_context_mut() + .disk_full_peers_mut() + .peers_mut(); + let mut inflight_peers = vec![]; + for peer in potential_quorum { + if let Some(x) = peers.get_mut(&peer) { + // It can help to establish a quorum. 
+ x.1 = true; + // for merge region, all peers have been set to the max. + if !has_region_merge_proposal { + inflight_peers.push(peer); + } + } + } + debug!( + self.logger, + "refill disk full peer max inflight to 1 in potential quorum set: region id {}", + self.region_id(), + ); + self.adjust_peers_max_inflight_msgs(&inflight_peers, 1); + } + } + /// A peer can be destroyed in four cases: /// /// 1. Received a gc message; @@ -795,9 +1186,17 @@ impl Peer { } // Wait for critical commands like split. if self.has_pending_tombstone_tablets() { + let applied_index = self.entry_storage().applied_index(); + let last_index = self.entry_storage().last_index(); + let persisted = self + .remember_persisted_tablet_index() + .load(std::sync::atomic::Ordering::Relaxed); info!( self.logger, - "postpone destroy because there're pending tombstone tablets" + "postpone destroy because there're pending tombstone tablets"; + "applied_index" => applied_index, + "last_index" => last_index, + "persisted_applied" => persisted, ); return true; } diff --git a/components/raftstore-v2/src/operation/misc.rs b/components/raftstore-v2/src/operation/misc.rs index 867b4192dac..fafca29ea85 100644 --- a/components/raftstore-v2/src/operation/misc.rs +++ b/components/raftstore-v2/src/operation/misc.rs @@ -102,7 +102,7 @@ impl<'a, EK: KvEngine, ER: RaftEngine, T> StoreFsmDelegate<'a, EK, ER, T> { self.store_ctx.cfg.region_compact_min_tombstones, self.store_ctx.cfg.region_compact_tombstones_percent, self.store_ctx.cfg.region_compact_min_redundant_rows, - self.store_ctx.cfg.region_compact_redundant_rows_percent, + self.store_ctx.cfg.region_compact_redundant_rows_percent(), ), })) { diff --git a/components/raftstore-v2/src/operation/mod.rs b/components/raftstore-v2/src/operation/mod.rs index 8ce592dd753..24d025c0a4d 100644 --- a/components/raftstore-v2/src/operation/mod.rs +++ b/components/raftstore-v2/src/operation/mod.rs @@ -24,7 +24,6 @@ pub use ready::{ }; pub(crate) use self::{ - 
bucket::BucketStatsInfo, command::SplitInit, query::{LocalReader, ReadDelegatePair, SharedReadTablet}, txn_ext::TxnContext, @@ -37,7 +36,7 @@ pub mod test_util { Arc, }; - use engine_traits::{CfName, CF_DEFAULT}; + use engine_traits::{CfName, KvEngine, CF_DEFAULT}; use kvproto::{kvrpcpb::ApiVersion, metapb::RegionEpoch, raft_cmdpb::RaftRequestHeader}; use raft::prelude::{Entry, EntryType}; use raftstore::store::simple_write::SimpleWriteEncoder; @@ -47,7 +46,7 @@ pub mod test_util { use super::{CatchUpLogs, SimpleWriteReqEncoder}; use crate::{fsm::ApplyResReporter, router::ApplyRes}; - pub fn create_tmp_importer() -> (TempDir, Arc) { + pub fn create_tmp_importer() -> (TempDir, Arc>) { let dir = TempDir::new().unwrap(); let importer = Arc::new( SstImporter::new(&Default::default(), dir.path(), None, ApiVersion::V1, true).unwrap(), @@ -87,7 +86,7 @@ pub mod test_util { let mut header = Box::::default(); header.set_region_id(region_id); header.set_region_epoch(region_epoch); - let req_encoder = SimpleWriteReqEncoder::new(header, encoder.encode(), 512, false); + let req_encoder = SimpleWriteReqEncoder::new(header, encoder.encode(), 512); let (bin, _) = req_encoder.encode(); let mut e = Entry::default(); e.set_entry_type(EntryType::EntryNormal); @@ -112,7 +111,7 @@ pub mod test_util { let mut header = Box::::default(); header.set_region_id(region_id); header.set_region_epoch(region_epoch); - let req_encoder = SimpleWriteReqEncoder::new(header, encoder.encode(), 512, false); + let req_encoder = SimpleWriteReqEncoder::new(header, encoder.encode(), 512); let (bin, _) = req_encoder.encode(); let mut e = Entry::default(); e.set_entry_type(EntryType::EntryNormal); diff --git a/components/raftstore-v2/src/operation/pd.rs b/components/raftstore-v2/src/operation/pd.rs index 817b3aa6eb6..8e392755c5e 100644 --- a/components/raftstore-v2/src/operation/pd.rs +++ b/components/raftstore-v2/src/operation/pd.rs @@ -103,7 +103,7 @@ impl Peer { let task = 
pd::Task::RegionHeartbeat(pd::RegionHeartbeatTask { term: self.term(), region: self.region().clone(), - down_peers: self.collect_down_peers(ctx.cfg.max_peer_down_duration.0), + down_peers: self.collect_down_peers(ctx), peer: self.peer().clone(), pending_peers: self.collect_pending_peers(ctx), written_bytes: self.self_stat().written_bytes, @@ -215,6 +215,7 @@ impl Peer { &self, ctx: &StoreContext, split_keys: Vec>, + share_source_region_size: bool, ch: CmdResChannel, ) { let task = pd::Task::AskBatchSplit { @@ -222,6 +223,7 @@ impl Peer { split_keys, peer: self.peer().clone(), right_derive: ctx.cfg.right_derive_when_split, + share_source_region_size, ch, }; if let Err(e) = ctx.schedulers.pd.schedule(task) { diff --git a/components/raftstore-v2/src/operation/query/capture.rs b/components/raftstore-v2/src/operation/query/capture.rs index 5dd43f14e19..bc7e93a394b 100644 --- a/components/raftstore-v2/src/operation/query/capture.rs +++ b/components/raftstore-v2/src/operation/query/capture.rs @@ -116,7 +116,7 @@ impl Apply { self.flush(); let (applied_index, _) = self.apply_progress(); let snap = RegionSnapshot::from_snapshot( - Arc::new(self.tablet().snapshot()), + Arc::new(self.tablet().snapshot(None)), Arc::new(self.region().clone()), ); snap.set_apply_index(applied_index); diff --git a/components/raftstore-v2/src/operation/query/lease.rs b/components/raftstore-v2/src/operation/query/lease.rs index 84a8ad09ed3..189986f93d2 100644 --- a/components/raftstore-v2/src/operation/query/lease.rs +++ b/components/raftstore-v2/src/operation/query/lease.rs @@ -168,7 +168,7 @@ impl Peer { header.set_term(self.term()); let empty_data = SimpleWriteEncoder::with_capacity(0).encode(); let (ch, _) = CmdResChannel::pair(); - self.on_simple_write(ctx, header, empty_data, ch); + self.on_simple_write(ctx, header, empty_data, ch, None); } /// response the read index request diff --git a/components/raftstore-v2/src/operation/query/local.rs 
b/components/raftstore-v2/src/operation/query/local.rs index 36dbb26e4c7..1829628ae48 100644 --- a/components/raftstore-v2/src/operation/query/local.rs +++ b/components/raftstore-v2/src/operation/query/local.rs @@ -28,6 +28,7 @@ use raftstore::{ use slog::{debug, Logger}; use tikv_util::{box_err, codec::number::decode_u64, time::monotonic_raw_now, Either}; use time::Timespec; +use tracker::{get_tls_tracker_token, GLOBAL_TRACKERS}; use txn_types::WriteBatchFlags; use crate::{ @@ -208,7 +209,7 @@ where ReadRequestPolicy::ReadLocal => { let region = Arc::clone(&delegate.region); let snap = RegionSnapshot::from_snapshot( - Arc::new(delegate.cached_tablet.cache().snapshot()), + Arc::new(delegate.cached_tablet.cache().snapshot(None)), region, ); @@ -239,7 +240,7 @@ where let region = Arc::clone(&delegate.region); let snap = RegionSnapshot::from_snapshot( - Arc::new(delegate.cached_tablet.cache().snapshot()), + Arc::new(delegate.cached_tablet.cache().snapshot(None)), region, ); @@ -263,7 +264,7 @@ where let region = Arc::clone(&delegate.region); let snap = RegionSnapshot::from_snapshot( - Arc::new(delegate.cached_tablet.cache().snapshot()), + Arc::new(delegate.cached_tablet.cache().snapshot(None)), region, ); @@ -335,7 +336,12 @@ where async move { let (mut fut, mut reader) = match res { - Either::Left(Ok(snap)) => return Ok(snap), + Either::Left(Ok(snap)) => { + GLOBAL_TRACKERS.with_tracker(get_tls_tracker_token(), |t| { + t.metrics.local_read = true; + }); + return Ok(snap); + } Either::Left(Err(e)) => return Err(e), Either::Right((fut, reader)) => (fut, reader), }; @@ -580,6 +586,10 @@ impl<'r> SnapRequestInspector<'r> { )); } + fail::fail_point!("perform_read_index", |_| Ok(ReadRequestPolicy::ReadIndex)); + + fail::fail_point!("perform_read_local", |_| Ok(ReadRequestPolicy::ReadLocal)); + let flags = WriteBatchFlags::from_bits_check(req.get_header().get_flags()); if flags.contains(WriteBatchFlags::STALE_READ) { return Ok(ReadRequestPolicy::StaleRead); diff --git 
a/components/raftstore-v2/src/operation/query/mod.rs b/components/raftstore-v2/src/operation/query/mod.rs index 2f1b1cd0138..10f6e3279c3 100644 --- a/components/raftstore-v2/src/operation/query/mod.rs +++ b/components/raftstore-v2/src/operation/query/mod.rs @@ -471,6 +471,7 @@ impl Peer { // Only leaders need to update applied_term. if progress_to_be_updated && self.is_leader() { if applied_term == self.term() { + fail::fail_point!("on_applied_current_term"); ctx.coprocessor_host .on_applied_current_term(StateRole::Leader, self.region()); } diff --git a/components/raftstore-v2/src/operation/ready/apply_trace.rs b/components/raftstore-v2/src/operation/ready/apply_trace.rs index d4743448d07..e839089837d 100644 --- a/components/raftstore-v2/src/operation/ready/apply_trace.rs +++ b/components/raftstore-v2/src/operation/ready/apply_trace.rs @@ -29,6 +29,7 @@ use std::{ cmp, + collections::VecDeque, path::Path, sync::{atomic::Ordering, mpsc::SyncSender, Mutex}, }; @@ -46,7 +47,7 @@ use kvproto::{ use raftstore::store::{ util, ReadTask, TabletSnapManager, WriteTask, RAFT_INIT_LOG_INDEX, RAFT_INIT_LOG_TERM, }; -use slog::{info, trace, Logger}; +use slog::{info, trace, warn, Logger}; use tikv_util::{box_err, slog_panic, worker::Scheduler}; use crate::{ @@ -56,7 +57,7 @@ use crate::{ ready::snapshot::{install_tablet, recv_snap_path}, }, raft::{Peer, Storage}, - router::PeerMsg, + router::{PeerMsg, SstApplyIndex}, worker::tablet, Result, StoreRouter, }; @@ -138,7 +139,7 @@ impl engine_traits::StateStorage for StateStorage< /// Mapping from data cf to an u64 index. pub type DataTrace = [u64; DATA_CFS_LEN]; -#[derive(Clone, Copy, Default, Debug)] +#[derive(Clone, Default, Debug)] struct Progress { flushed: u64, /// The index of last entry that has modification to the CF. The value @@ -146,6 +147,20 @@ struct Progress { /// /// If `flushed` == `last_modified`, then all data in the CF is persisted. 
last_modified: u64, + // applied indexes ranges that represent sst is ingested but not flushed indexes. + pending_sst_ranges: VecDeque, +} + +// A range representing [start, end], upper bound inclusive for handling +// convenience. +#[derive(Clone, Copy, Debug, Eq, PartialEq)] +struct IndexRange(u64, u64); + +#[derive(Debug)] +// track the global flushed index related to the write task. +struct ReadyFlushedIndex { + ready_number: u64, + flushed_index: u64, } /// `ApplyTrace` is used to track the indexes of modifications and flushes. @@ -178,6 +193,9 @@ pub struct ApplyTrace { last_flush_trigger: u64, /// `true` means the raft cf record should be persisted in next ready. try_persist: bool, + // Because we persist the global flushed in the write task, so we should track + // the task and handle sst cleanup after the write task finished. + flushed_index_queue: VecDeque, } impl ApplyTrace { @@ -230,6 +248,25 @@ impl ApplyTrace { self.admin.last_modified = index; } + pub fn on_sst_ingested(&mut self, sst_applied_index: &[SstApplyIndex]) { + use std::cmp::Ordering; + for &SstApplyIndex { cf_index, index } in sst_applied_index { + let p = &mut self.data_cfs[cf_index]; + if p.flushed < index { + let max_idx = p.pending_sst_ranges.iter().last().map(|r| r.1).unwrap_or(0) + 1; + match max_idx.cmp(&index) { + Ordering::Less => { + p.pending_sst_ranges.push_back(IndexRange(index, index)); + } + Ordering::Equal => { + p.pending_sst_ranges.iter_mut().last().unwrap().1 = index; + } + _ => {} + } + } + } + } + pub fn persisted_apply_index(&self) -> u64 { self.persisted_applied } @@ -283,17 +320,45 @@ impl ApplyTrace { } }) .min(); + // At best effort, we can only advance the index to `mem_index`. let candidate = cmp::min(mem_index, min_flushed.unwrap_or(u64::MAX)); + // try advance the index if there are any sst ingestion next to the flushed + // index, and always trigger a flush if there is any sst ingestion. 
+ let (candidate, has_ingested_sst) = self.advance_flushed_index_for_ingest(candidate); if candidate > self.admin.flushed { self.admin.flushed = candidate; - if self.admin.flushed > self.persisted_applied + 100 { + if has_ingested_sst || (self.admin.flushed > self.persisted_applied + 100) { self.try_persist = true; } } // TODO: persist admin.flushed every 10 minutes. } + fn advance_flushed_index_for_ingest(&mut self, mut max_index: u64) -> (u64, bool) { + let mut has_ingest = false; + loop { + let mut has_change = false; + for p in self.data_cfs.iter_mut() { + while let Some(r) = p.pending_sst_ranges.front_mut() { + if r.0 > max_index + 1 { + break; + } else if r.1 > max_index { + max_index = r.1; + has_change = true; + } + p.pending_sst_ranges.pop_front(); + has_ingest = true; + } + } + if !has_change { + break; + } + } + + (max_index, has_ingest) + } + /// Get the flushed indexes of all data CF that is needed when recoverying /// logs. /// @@ -348,6 +413,38 @@ impl ApplyTrace { fail_point!("should_persist_apply_trace", |_| true); self.try_persist } + + #[inline] + pub fn register_flush_task(&mut self, ready_number: u64, flushed_index: u64) { + assert!( + self.flushed_index_queue + .iter() + .last() + .map(|f| f.ready_number) + .unwrap_or(0) + < ready_number + ); + self.flushed_index_queue.push_back(ReadyFlushedIndex { + ready_number, + flushed_index, + }); + } + + #[inline] + pub fn take_flush_index(&mut self, ready_number: u64) -> Option { + use std::cmp::Ordering; + while let Some(r) = self.flushed_index_queue.pop_front() { + match r.ready_number.cmp(&ready_number) { + Ordering::Equal => return Some(r.flushed_index), + Ordering::Greater => { + self.flushed_index_queue.push_front(r); + break; + } + _ => {} + } + } + None + } } impl Storage { @@ -546,6 +643,7 @@ impl Storage { .unwrap(); trace.try_persist = false; trace.persisted_applied = trace.admin.flushed; + trace.register_flush_task(write_task.ready_number(), trace.admin.flushed); } } @@ -566,24 +664,7 @@ 
impl Peer { let apply_trace = self.storage_mut().apply_trace_mut(); apply_trace.on_flush(cf, index); apply_trace.maybe_advance_admin_flushed(apply_index); - let stale_ssts = self.sst_apply_state().stale_ssts(cf, index); - if stale_ssts.is_empty() { - return; - } - info!( - self.logger, - "schedule delete stale ssts after flush"; - "stale_ssts" => ?stale_ssts, - "apply_index" => apply_index, - "cf" => cf, - "flushed_index" => index, - ); - let _ = ctx - .schedulers - .tablet - .schedule(tablet::Task::CleanupImportSst( - stale_ssts.into_boxed_slice(), - )); + self.cleanup_stale_ssts(ctx, &[cf], index, apply_index); } pub fn on_data_modified(&mut self, modification: DataTrace) { @@ -598,6 +679,38 @@ impl Peer { apply_trace.maybe_advance_admin_flushed(apply_index); } + pub fn cleanup_stale_ssts( + &mut self, + ctx: &mut StoreContext, + cfs: &[&str], + index: u64, + apply_index: u64, + ) { + let mut stale_ssts = vec![]; + for cf in cfs { + let ssts = self.sst_apply_state().stale_ssts(cf, index); + if !ssts.is_empty() { + info!( + self.logger, + "schedule delete stale ssts after flush"; + "stale_ssts" => ?stale_ssts, + "apply_index" => apply_index, + "cf" => cf, + "flushed_index" => index, + ); + stale_ssts.extend(ssts); + } + } + if !stale_ssts.is_empty() { + _ = ctx + .schedulers + .tablet + .schedule(tablet::Task::CleanupImportSst( + stale_ssts.into_boxed_slice(), + )); + } + } + pub fn flush_before_close(&mut self, ctx: &StoreContext, tx: SyncSender<()>) { info!( self.logger, @@ -605,7 +718,7 @@ impl Peer { ); let region_id = self.region_id(); let flush_threshold: u64 = (|| { - fail_point!("flush_before_cluse_threshold", |t| { + fail_point!("flush_before_close_threshold", |t| { t.unwrap().parse::().unwrap() }); 50 @@ -619,7 +732,18 @@ impl Peer { // flush the oldest cf one by one until we are under the replay count threshold loop { let replay_count = self.storage().estimate_replay_count(); - if replay_count < flush_threshold { + if replay_count < flush_threshold || 
tried_count == 3 { + // Ideally, the replay count should be 0 after three flush_oldest_cf. If not, + // there may exist bug, but it's not desireable to block here, so we at most try + // three times. + if replay_count >= flush_threshold && tried_count == 3 { + warn!( + self.logger, + "after three flush_oldest_cf, the expected replay count still exceeds the threshold"; + "replay_count" => replay_count, + "threshold" => flush_threshold, + ); + } if flushed { let admin_flush = self.storage_mut().apply_trace_mut().admin.flushed; let (_, _, tablet_index) = ctx @@ -678,7 +802,7 @@ impl Peer { #[cfg(test)] mod tests { - use engine_traits::RaftEngineReadOnly; + use engine_traits::{CfName, RaftEngineReadOnly}; use kvproto::metapb::Peer; use tempfile::TempDir; @@ -798,6 +922,93 @@ mod tests { // Because modify is recorded, so we know there should be no admin // modification and index can be advanced. assert_eq!(5, trace.admin.flushed); + + fn range_equals(trace: &ApplyTrace, cf: &str, expected: Vec) { + let pending_ranges = &trace.data_cfs[data_cf_offset(cf)].pending_sst_ranges; + assert_eq!( + pending_ranges.len(), + expected.len(), + "actual: {:?}, expected: {:?}", + pending_ranges, + &expected + ); + pending_ranges + .iter() + .zip(expected.iter()) + .for_each(|(r, e)| { + assert_eq!(r, e); + }); + } + + trace.on_modify(CF_DEFAULT, 8); + let ingested_ssts_idx = + make_sst_apply_index(vec![(CF_DEFAULT, 6), (CF_WRITE, 6), (CF_WRITE, 7)]); + trace.on_sst_ingested(&ingested_ssts_idx); + range_equals(&trace, CF_DEFAULT, vec![IndexRange(6, 6)]); + range_equals(&trace, CF_WRITE, vec![IndexRange(6, 7)]); + trace.maybe_advance_admin_flushed(8); + assert_eq!(7, trace.admin.flushed); + for cf in [CF_DEFAULT, CF_WRITE] { + assert_eq!( + trace.data_cfs[data_cf_offset(cf)].pending_sst_ranges.len(), + 0 + ); + } + trace.on_modify(CF_DEFAULT, 10); + let ingested_ssts_idx = make_sst_apply_index(vec![(CF_DEFAULT, 10)]); + trace.on_sst_ingested(&ingested_ssts_idx); + 
trace.on_flush(CF_DEFAULT, 8); + trace.maybe_advance_admin_flushed(10); + assert_eq!(8, trace.admin.flushed); + range_equals(&trace, CF_DEFAULT, vec![IndexRange(10, 10)]); + + trace.on_modify(CF_DEFAULT, 16); + let ingested_ssts_idx = make_sst_apply_index(vec![ + (CF_DEFAULT, 11), + (CF_WRITE, 12), + (CF_LOCK, 13), + (CF_DEFAULT, 14), + (CF_WRITE, 14), + (CF_WRITE, 15), + (CF_LOCK, 16), + ]); + trace.on_sst_ingested(&ingested_ssts_idx); + range_equals( + &trace, + CF_DEFAULT, + vec![IndexRange(10, 11), IndexRange(14, 14)], + ); + range_equals( + &trace, + CF_WRITE, + vec![IndexRange(12, 12), IndexRange(14, 15)], + ); + range_equals( + &trace, + CF_LOCK, + vec![IndexRange(13, 13), IndexRange(16, 16)], + ); + trace.maybe_advance_admin_flushed(16); + assert_eq!(8, trace.admin.flushed); + + trace.on_flush(CF_DEFAULT, 9); + trace.maybe_advance_admin_flushed(16); + assert_eq!(16, trace.admin.flushed); + for cf in DATA_CFS { + assert_eq!( + trace.data_cfs[data_cf_offset(cf)].pending_sst_ranges.len(), + 0 + ); + } + } + + fn make_sst_apply_index(data: Vec<(CfName, u64)>) -> Vec { + data.into_iter() + .map(|d| SstApplyIndex { + cf_index: data_cf_offset(d.0), + index: d.1, + }) + .collect() } #[test] diff --git a/components/raftstore-v2/src/operation/ready/mod.rs b/components/raftstore-v2/src/operation/ready/mod.rs index b985fd69c27..39ce9707359 100644 --- a/components/raftstore-v2/src/operation/ready/mod.rs +++ b/components/raftstore-v2/src/operation/ready/mod.rs @@ -31,7 +31,7 @@ use std::{ time::Instant, }; -use engine_traits::{KvEngine, RaftEngine}; +use engine_traits::{KvEngine, RaftEngine, DATA_CFS}; use error_code::ErrorCodeExt; use kvproto::{ raft_cmdpb::AdminCmdType, @@ -43,6 +43,7 @@ use raftstore::{ coprocessor::{RegionChangeEvent, RoleChange}, store::{ fsm::store::StoreRegionMeta, + local_metrics::IoType, needs_evict_entry_cache, util::{self, is_first_append_entry, is_initial_msg}, worker_metrics::SNAP_COUNTER, @@ -54,7 +55,8 @@ use tikv_util::{ log::SlogFormat, 
slog_panic, store::find_peer, - time::{duration_to_sec, monotonic_raw_now, Duration}, + sys::disk::DiskUsage, + time::{duration_to_sec, monotonic_raw_now, Duration, Instant as TiInstant}, }; pub use self::{ @@ -247,10 +249,18 @@ impl Peer { } } + pub fn on_store_maybe_tombstone(&mut self, store_id: u64) { + if !self.is_leader() { + return; + } + self.on_store_maybe_tombstone_gc_peer(store_id); + } + pub fn on_raft_message( &mut self, ctx: &mut StoreContext, mut msg: Box, + send_time: Option, ) { debug!( self.logger, @@ -258,7 +268,15 @@ impl Peer { "message_type" => %util::MsgType(&msg), "from_peer_id" => msg.get_from_peer().get_id(), "to_peer_id" => msg.get_to_peer().get_id(), + "disk_usage" => ?msg.disk_usage, ); + if let Some(send_time) = send_time { + let process_wait_time = send_time.saturating_elapsed(); + ctx.raft_metrics + .process_wait_time + .observe(duration_to_sec(process_wait_time)); + } + if self.pause_for_replay() && msg.get_message().get_msg_type() == MessageType::MsgAppend { ctx.raft_metrics.message_dropped.recovery.inc(); return; @@ -280,6 +298,9 @@ impl Peer { return; } } + + self.handle_reported_disk_usage(ctx, &msg); + if msg.get_to_peer().get_store_id() != self.peer().get_store_id() { ctx.raft_metrics.message_dropped.mismatch_store_id.inc(); return; @@ -418,9 +439,10 @@ impl Peer { return; } + let msg_type = msg.get_message().get_msg_type(); // This can be a message that sent when it's still a follower. Nevertheleast, // it's meaningless to continue to handle the request as callbacks are cleared. 
- if msg.get_message().get_msg_type() == MessageType::MsgReadIndex + if msg_type == MessageType::MsgReadIndex && self.is_leader() && (msg.get_message().get_from() == raft::INVALID_ID || msg.get_message().get_from() == self.peer_id()) @@ -429,14 +451,18 @@ impl Peer { return; } - if msg.get_message().get_msg_type() == MessageType::MsgReadIndex + if msg_type == MessageType::MsgReadIndex && self.is_leader() && self.on_step_read_index(ctx, msg.mut_message()) { // Read index has respond in `on_step_read_index`, // No need to step again. } else if let Err(e) = self.raft_group_mut().step(msg.take_message()) { - error!(self.logger, "raft step error"; "err" => ?e); + error!(self.logger, "raft step error"; + "from_peer" => ?msg.get_from_peer(), + "region_epoch" => ?msg.get_region_epoch(), + "message_type" => ?msg_type, + "err" => ?e); } else { let committed_index = self.raft_group().raft.raft_log.committed; self.report_commit_log_duration(ctx, pre_committed_index, committed_index); @@ -503,7 +529,11 @@ impl Peer { /// /// If the recipient can't be found, `None` is returned. #[inline] - fn build_raft_message(&mut self, msg: eraftpb::Message) -> Option { + fn build_raft_message( + &mut self, + msg: eraftpb::Message, + disk_usage: DiskUsage, + ) -> Option { let to_peer = match self.peer_from_cache(msg.to) { Some(p) => p, None => { @@ -518,6 +548,8 @@ impl Peer { }; let mut raft_msg = self.prepare_raft_message(); + // Fill in the disk usage. 
+ raft_msg.set_disk_usage(disk_usage); raft_msg.set_to_peer(to_peer); if msg.from != self.peer().id { @@ -760,8 +792,9 @@ impl Peer { if !ready.messages().is_empty() { debug_assert!(self.is_leader()); + let disk_usage = ctx.self_disk_usage; for msg in ready.take_messages() { - if let Some(msg) = self.build_raft_message(msg) { + if let Some(msg) = self.build_raft_message(msg, disk_usage) { self.send_raft_message_on_leader(ctx, msg); } } @@ -790,10 +823,11 @@ impl Peer { self.on_advance_persisted_apply_index(ctx, prev_persisted, &mut write_task); if !ready.persisted_messages().is_empty() { + let disk_usage = ctx.self_disk_usage; write_task.messages = ready .take_persisted_messages() .into_iter() - .flat_map(|m| self.build_raft_message(m)) + .flat_map(|m| self.build_raft_message(m, disk_usage)) .collect(); } if self.has_pending_messages() { @@ -896,6 +930,14 @@ impl Peer { self.storage_mut() .entry_storage_mut() .update_cache_persisted(persisted_index); + if let Some(idx) = self + .storage_mut() + .apply_trace_mut() + .take_flush_index(ready_number) + { + let apply_index = self.flush_state().applied_index(); + self.cleanup_stale_ssts(ctx, DATA_CFS, idx, apply_index); + } if self.is_in_force_leader() { // forward commit index, the committed entries will be applied in @@ -948,7 +990,7 @@ impl Peer { return; } let now = Instant::now(); - let stat_raft_commit_log = &mut ctx.raft_metrics.stat_commit_log; + let health_stats = &mut ctx.raft_metrics.health_stats; for i in old_index + 1..=new_index { if let Some((term, trackers)) = self.proposals().find_trackers(i) { if self.entry_storage().term(i).map_or(false, |t| t == term) { @@ -961,14 +1003,11 @@ impl Peer { for tracker in trackers { // Collect the metrics related to commit_log // durations. 
- stat_raft_commit_log.record(Duration::from_nanos(tracker.observe( - now, - hist, - |t| { - t.metrics.commit_not_persisted = !commit_persisted; - &mut t.metrics.wf_commit_log_nanos - }, - ))); + let duration = tracker.observe(now, hist, |t| { + t.metrics.commit_not_persisted = !commit_persisted; + &mut t.metrics.wf_commit_log_nanos + }); + health_stats.observe(Duration::from_nanos(duration), IoType::Network); } } } @@ -1049,6 +1088,16 @@ impl Peer { // Exit entry cache warmup state when the peer becomes leader. self.entry_storage_mut().clear_entry_cache_warmup_state(); + if !ctx.store_disk_usages.is_empty() { + self.refill_disk_full_peers(ctx); + debug!( + self.logger, + "become leader refills disk full peers to {:?}", + self.abnormal_peer_context().disk_full_peers(); + "region_id" => self.region_id(), + ); + } + self.region_heartbeat_pd(ctx); self.add_pending_tick(PeerTick::CompactLog); self.add_pending_tick(PeerTick::SplitRegionCheck); @@ -1189,6 +1238,52 @@ impl Peer { ); } } + + fn handle_reported_disk_usage( + &mut self, + ctx: &mut StoreContext, + msg: &RaftMessage, + ) { + let store_id = msg.get_from_peer().get_store_id(); + let peer_id = msg.get_from_peer().get_id(); + let disk_full_peers = self.abnormal_peer_context().disk_full_peers(); + let refill_disk_usages = if matches!(msg.disk_usage, DiskUsage::Normal) { + ctx.store_disk_usages.remove(&store_id); + if !self.is_leader() { + return; + } + disk_full_peers.has(peer_id) + } else { + ctx.store_disk_usages.insert(store_id, msg.disk_usage); + if !self.is_leader() { + return; + } + + disk_full_peers.is_empty() + || disk_full_peers + .get(peer_id) + .map_or(true, |x| x != msg.disk_usage) + }; + + if refill_disk_usages || self.has_region_merge_proposal { + let prev = disk_full_peers.get(peer_id); + if Some(msg.disk_usage) != prev { + info!( + self.logger, + "reported disk usage changes {:?} -> {:?}", prev, msg.disk_usage; + "region_id" => self.region_id(), + "peer_id" => peer_id, + ); + } + 
self.refill_disk_full_peers(ctx); + debug!( + self.logger, + "raft message refills disk full peers to {:?}", + self.abnormal_peer_context().disk_full_peers(); + "region_id" => self.region_id(), + ); + } + } } impl Storage { diff --git a/components/raftstore-v2/src/operation/ready/snapshot.rs b/components/raftstore-v2/src/operation/ready/snapshot.rs index 17deed333c1..c29399ac6a0 100644 --- a/components/raftstore-v2/src/operation/ready/snapshot.rs +++ b/components/raftstore-v2/src/operation/ready/snapshot.rs @@ -30,10 +30,7 @@ use std::{ }; use encryption_export::DataKeyManager; -use engine_traits::{ - EncryptionKeyManager, KvEngine, RaftEngine, RaftLogBatch, TabletContext, TabletRegistry, - ALL_CFS, -}; +use engine_traits::{KvEngine, RaftEngine, RaftLogBatch, TabletContext, TabletRegistry, ALL_CFS}; use fail::fail_point; use kvproto::{ metapb::PeerRole, @@ -343,10 +340,12 @@ impl Peer { } self.schedule_apply_fsm(ctx); if self.remove_tombstone_tablets(snapshot_index) { + let counter = self.remember_persisted_tablet_index(); let _ = ctx .schedulers .tablet .schedule(tablet::Task::destroy(region_id, snapshot_index)); + counter.store(snapshot_index, Ordering::Relaxed); } if let Some(msg) = self.split_pending_append_mut().take_append_message() { let _ = ctx.router.send_raft_message(msg); diff --git a/components/raftstore-v2/src/operation/txn_ext.rs b/components/raftstore-v2/src/operation/txn_ext.rs index 272b2526b39..6a379b9a1a2 100644 --- a/components/raftstore-v2/src/operation/txn_ext.rs +++ b/components/raftstore-v2/src/operation/txn_ext.rs @@ -9,11 +9,15 @@ use std::sync::{atomic::Ordering, Arc}; use crossbeam::atomic::AtomicCell; use engine_traits::{KvEngine, RaftEngine, CF_LOCK}; -use kvproto::{kvrpcpb::ExtraOp, metapb::Region, raft_cmdpb::RaftRequestHeader}; +use kvproto::{ + kvrpcpb::{DiskFullOpt, ExtraOp}, + metapb::Region, + raft_cmdpb::RaftRequestHeader, +}; use parking_lot::RwLockWriteGuard; use raft::eraftpb; use raftstore::store::{ - LocksStatus, 
PeerPessimisticLocks, TxnExt, TRANSFER_LEADER_COMMAND_REPLY_CTX, + LocksStatus, PeerPessimisticLocks, RaftCmdExtraOpts, TxnExt, TRANSFER_LEADER_COMMAND_REPLY_CTX, }; use slog::{error, info, Logger}; @@ -266,8 +270,17 @@ impl Peer { self.logger, "propose {} locks before transferring leader", lock_count; ); - let PeerMsg::SimpleWrite(write) = PeerMsg::simple_write(header, encoder.encode()).0 else {unreachable!()}; - self.on_simple_write(ctx, write.header, write.data, write.ch); + let PeerMsg::SimpleWrite(write) = PeerMsg::simple_write_with_opt(header, encoder.encode(), RaftCmdExtraOpts { + disk_full_opt: DiskFullOpt::AllowedOnAlmostFull, + ..Default::default() + }).0 else {unreachable!()}; + self.on_simple_write( + ctx, + write.header, + write.data, + write.ch, + Some(write.extra_opts), + ); true } } diff --git a/components/raftstore-v2/src/operation/unsafe_recovery/demote.rs b/components/raftstore-v2/src/operation/unsafe_recovery/demote.rs index 37962a45452..20a42b9f978 100644 --- a/components/raftstore-v2/src/operation/unsafe_recovery/demote.rs +++ b/components/raftstore-v2/src/operation/unsafe_recovery/demote.rs @@ -75,6 +75,7 @@ impl Peer { "Unsafe recovery, fail to finish demotion"; "err" => ?resp.get_header().get_error(), ); + *self.unsafe_recovery_state_mut() = Some(UnsafeRecoveryState::Failed); return; } *self.unsafe_recovery_state_mut() = Some(UnsafeRecoveryState::DemoteFailedVoters { @@ -129,6 +130,7 @@ impl Peer { "Unsafe recovery, fail to exit joint state"; "err" => ?resp.get_header().get_error(), ); + *self.unsafe_recovery_state_mut()= Some(UnsafeRecoveryState::Failed); } } else { error!(self.logger, diff --git a/components/raftstore-v2/src/operation/unsafe_recovery/force_leader.rs b/components/raftstore-v2/src/operation/unsafe_recovery/force_leader.rs index ba7e391dbef..e6af0fddb7b 100644 --- a/components/raftstore-v2/src/operation/unsafe_recovery/force_leader.rs +++ b/components/raftstore-v2/src/operation/unsafe_recovery/force_leader.rs @@ -5,7 +5,9 @@ 
use std::mem; use collections::HashSet; use engine_traits::{KvEngine, RaftEngine}; use raft::{eraftpb::MessageType, StateRole, Storage}; -use raftstore::store::{util::LeaseState, ForceLeaderState, UnsafeRecoveryForceLeaderSyncer}; +use raftstore::store::{ + util::LeaseState, ForceLeaderState, UnsafeRecoveryForceLeaderSyncer, UnsafeRecoveryState, +}; use slog::{info, warn}; use tikv_util::time::Instant as TiInstant; @@ -182,11 +184,20 @@ impl Peer { self.set_has_ready(); } - pub fn on_exit_force_leader(&mut self, ctx: &StoreContext) { + // TODO: add exit force leader check tick for raftstore v2 + pub fn on_exit_force_leader(&mut self, ctx: &StoreContext, force: bool) { if !self.has_force_leader() { return; } + if let Some(UnsafeRecoveryState::Failed) = self.unsafe_recovery_state() && !force { + // Skip force leader if the plan failed, so wait for the next retry of plan with force leader state holding + info!( + self.logger, "skip exiting force leader state" + ); + return; + } + info!(self.logger, "exit force leader state"); *self.force_leader_mut() = None; // leader lease shouldn't be renewed in force leader state. 
diff --git a/components/raftstore-v2/src/operation/unsafe_recovery/report.rs b/components/raftstore-v2/src/operation/unsafe_recovery/report.rs index 7173d00363a..90c8e3db34d 100644 --- a/components/raftstore-v2/src/operation/unsafe_recovery/report.rs +++ b/components/raftstore-v2/src/operation/unsafe_recovery/report.rs @@ -44,11 +44,19 @@ impl Peer { self.raft_group().raft.raft_log.committed }; - *self.unsafe_recovery_state_mut() = Some(UnsafeRecoveryState::WaitApply { - target_index, - syncer, - }); - self.unsafe_recovery_maybe_finish_wait_apply(!self.serving()); + if target_index > self.raft_group().raft.raft_log.applied { + info!( + self.logger, + "Unsafe recovery, start wait apply"; + "target_index" => target_index, + "applied" => self.raft_group().raft.raft_log.applied, + ); + *self.unsafe_recovery_state_mut() = Some(UnsafeRecoveryState::WaitApply { + target_index, + syncer, + }); + self.unsafe_recovery_maybe_finish_wait_apply(!self.serving()); + } } pub fn unsafe_recovery_maybe_finish_wait_apply(&mut self, force: bool) { @@ -113,7 +121,7 @@ impl Peer { Some(UnsafeRecoveryState::DemoteFailedVoters { .. 
}) => { self.unsafe_recovery_maybe_finish_demote_failed_voters(ctx) } - Some(UnsafeRecoveryState::Destroy(_)) | None => {} + Some(UnsafeRecoveryState::Destroy(_)) | Some(UnsafeRecoveryState::Failed) | None => {} } } } diff --git a/components/raftstore-v2/src/raft/apply.rs b/components/raftstore-v2/src/raft/apply.rs index 3e660c4549c..35959dd8aea 100644 --- a/components/raftstore-v2/src/raft/apply.rs +++ b/components/raftstore-v2/src/raft/apply.rs @@ -20,7 +20,7 @@ use tikv_util::{log::SlogFormat, worker::Scheduler, yatp_pool::FuturePool}; use crate::{ operation::{AdminCmdResult, ApplyFlowControl, DataTrace}, - router::CmdResChannel, + router::{CmdResChannel, SstApplyIndex}, TabletTask, }; @@ -64,6 +64,7 @@ pub struct Apply { admin_cmd_result: Vec, flush_state: Arc, sst_apply_state: SstApplyState, + sst_applied_index: Vec, /// The flushed indexes of each column family before being restarted. /// /// If an apply index is less than the flushed index, the log can be @@ -75,7 +76,7 @@ pub struct Apply { res_reporter: R, read_scheduler: Scheduler>, - sst_importer: Arc, + sst_importer: Arc>, observe: Observe, coprocessor_host: CoprocessorHost, @@ -101,7 +102,7 @@ impl Apply { log_recovery: Option>, applied_term: u64, buckets: Option, - sst_importer: Arc, + sst_importer: Arc>, coprocessor_host: CoprocessorHost, tablet_scheduler: Scheduler>, high_priority_pool: FuturePool, @@ -138,6 +139,7 @@ impl Apply { res_reporter, flush_state, sst_apply_state, + sst_applied_index: vec![], log_recovery, metrics: ApplyMetrics::default(), buckets, @@ -308,6 +310,16 @@ impl Apply { &self.sst_apply_state } + #[inline] + pub fn push_sst_applied_index(&mut self, sst_index: SstApplyIndex) { + self.sst_applied_index.push(sst_index); + } + + #[inline] + pub fn take_sst_applied_index(&mut self) -> Vec { + mem::take(&mut self.sst_applied_index) + } + #[inline] pub fn log_recovery(&self) -> &Option> { &self.log_recovery @@ -323,7 +335,7 @@ impl Apply { } #[inline] - pub fn sst_importer(&self) -> 
&SstImporter { + pub fn sst_importer(&self) -> &SstImporter { &self.sst_importer } diff --git a/components/raftstore-v2/src/raft/peer.rs b/components/raftstore-v2/src/raft/peer.rs index 2f3a3376fe9..9b095b872e7 100644 --- a/components/raftstore-v2/src/raft/peer.rs +++ b/components/raftstore-v2/src/raft/peer.rs @@ -23,8 +23,9 @@ use raftstore::{ fsm::ApplyMetrics, metrics::RAFT_PEER_PENDING_DURATION, util::{Lease, RegionReadProgress}, - Config, EntryStorage, ForceLeaderState, PeerStat, ProposalQueue, ReadDelegate, - ReadIndexQueue, ReadProgress, TabletSnapManager, UnsafeRecoveryState, WriteTask, + BucketStatsInfo, Config, EntryStorage, ForceLeaderState, PeerStat, ProposalQueue, + ReadDelegate, ReadIndexQueue, ReadProgress, TabletSnapManager, UnsafeRecoveryState, + WriteTask, }, }; use slog::{debug, info, Logger}; @@ -32,11 +33,12 @@ use tikv_util::{slog_panic, time::duration_to_sec}; use super::storage::Storage; use crate::{ + batch::StoreContext, fsm::ApplyScheduler, operation::{ - AbnormalPeerContext, AsyncWriter, BucketStatsInfo, CompactLogContext, DestroyProgress, - GcPeerContext, MergeContext, ProposalControl, ReplayWatch, SimpleWriteReqEncoder, - SplitFlowControl, SplitPendingAppend, TxnContext, + AbnormalPeerContext, AsyncWriter, CompactLogContext, DestroyProgress, GcPeerContext, + MergeContext, ProposalControl, ReplayWatch, SimpleWriteReqEncoder, SplitFlowControl, + SplitPendingAppend, TxnContext, }, router::{ApplyTask, CmdResChannel, PeerTick, QueryResChannel}, Result, @@ -126,6 +128,10 @@ pub struct Peer { abnormal_peer_context: AbnormalPeerContext, + // region merge logic need to be broadcast to all followers when disk full happens. + pub has_region_merge_proposal: bool, + pub region_merge_proposal_index: u64, + /// Force leader state is only used in online recovery when the majority of /// peers are missing. 
In this state, it forces one peer to become leader /// out of accordance with Raft election rule, and forbids any @@ -158,6 +164,7 @@ impl Peer { let region_id = storage.region().get_id(); let tablet_index = storage.region_state().get_tablet_index(); let merge_context = MergeContext::from_region_state(&logger, storage.region_state()); + let persisted_applied = storage.apply_trace().persisted_apply_index(); let raft_group = RawNode::new(&raft_cfg, storage, &logger)?; let region = raft_group.store().region_state().get_region().clone(); @@ -184,7 +191,7 @@ impl Peer { self_stat: PeerStat::default(), peer_cache: vec![], peer_heartbeats: HashMap::default(), - compact_log_context: CompactLogContext::new(applied_index), + compact_log_context: CompactLogContext::new(applied_index, persisted_applied), merge_context: merge_context.map(|c| Box::new(c)), last_sent_snapshot_index: 0, raw_write_encoder: None, @@ -226,10 +233,20 @@ impl Peer { pending_messages: vec![], gc_peer_context: GcPeerContext::default(), abnormal_peer_context: AbnormalPeerContext::default(), + has_region_merge_proposal: false, + region_merge_proposal_index: 0_u64, force_leader_state: None, unsafe_recovery_state: None, }; + // If merge_context is not None, it means the PrepareMerge is applied before + // restart. So we have to neter prepare merge again to prevent all proposals + // except for RollbackMerge. + if let Some(ref state) = peer.merge_context { + peer.proposal_control + .enter_prepare_merge(state.prepare_merge_index().unwrap()); + } + // If this region has only one peer and I am the one, campaign directly. let region = peer.region(); if region.get_peers().len() == 1 @@ -264,9 +281,6 @@ impl Peer { } /// Set the region of a peer. - /// - /// This will update the region of the peer, caller must ensure the region - /// has been preserved in a durable device. 
pub fn set_region( &mut self, host: &CoprocessorHost, @@ -594,7 +608,7 @@ impl Peer { ) } - pub fn collect_down_peers(&mut self, max_duration: Duration) -> Vec { + pub fn collect_down_peers(&mut self, ctx: &StoreContext) -> Vec { let mut down_peers = Vec::new(); let mut down_peer_ids = Vec::new(); let now = Instant::now(); @@ -604,7 +618,7 @@ impl Peer { } if let Some(instant) = self.peer_heartbeats.get(&p.get_id()) { let elapsed = now.saturating_duration_since(*instant); - if elapsed >= max_duration { + if elapsed >= ctx.cfg.max_peer_down_duration.0 { let mut stats = pdpb::PeerStats::default(); stats.set_peer(p.clone()); stats.set_down_seconds(elapsed.as_secs()); @@ -613,8 +627,11 @@ impl Peer { } } } + let exist_down_peers = !down_peer_ids.is_empty(); *self.abnormal_peer_context_mut().down_peers_mut() = down_peer_ids; - // TODO: `refill_disk_full_peers` + if exist_down_peers { + self.refill_disk_full_peers(ctx); + } down_peers } @@ -861,6 +878,16 @@ impl Peer { ) } + #[inline] + pub fn leader_transferee(&self) -> u64 { + self.leader_transferee + } + + #[inline] + pub fn leader_transferring(&self) -> bool { + self.leader_transferee != raft::INVALID_ID + } + #[inline] pub fn long_uncommitted_threshold(&self) -> Duration { Duration::from_secs(self.long_uncommitted_threshold) @@ -909,6 +936,11 @@ impl Peer { self.last_sent_snapshot_index } + #[inline] + pub fn next_proposal_index(&self) -> u64 { + self.raft_group.raft.raft_log.last_index() + 1 + } + #[inline] pub fn index_term(&self, idx: u64) -> u64 { match self.raft_group.raft.raft_log.term(idx) { diff --git a/components/raftstore-v2/src/router/imp.rs b/components/raftstore-v2/src/router/imp.rs index 9c6cca96ae4..e7a63f6d48f 100644 --- a/components/raftstore-v2/src/router/imp.rs +++ b/components/raftstore-v2/src/router/imp.rs @@ -43,12 +43,18 @@ impl AsyncReadNotifier for StoreRouter { } impl raftstore::coprocessor::StoreHandle for StoreRouter { - fn update_approximate_size(&self, region_id: u64, size: u64) { - let 
_ = self.send(region_id, PeerMsg::UpdateRegionSize { size }); + // TODO: add splitable logic in raftstore-v2 + fn update_approximate_size(&self, region_id: u64, size: Option, _may_split: Option) { + if let Some(size) = size { + let _ = self.send(region_id, PeerMsg::UpdateRegionSize { size }); + } } - fn update_approximate_keys(&self, region_id: u64, keys: u64) { - let _ = self.send(region_id, PeerMsg::UpdateRegionKeys { keys }); + // TODO: add splitable logic in raftstore-v2 + fn update_approximate_keys(&self, region_id: u64, keys: Option, _may_split: Option) { + if let Some(keys) = keys { + let _ = self.send(region_id, PeerMsg::UpdateRegionKeys { keys }); + } } fn ask_split( @@ -58,7 +64,7 @@ impl raftstore::coprocessor::StoreHandle for Store split_keys: Vec>, source: Cow<'static, str>, ) { - let (msg, _) = PeerMsg::request_split(region_epoch, split_keys, source.to_string()); + let (msg, _) = PeerMsg::request_split(region_epoch, split_keys, source.to_string(), true); let res = self.send(region_id, msg); if let Err(e) = res { warn!( diff --git a/components/raftstore-v2/src/router/internal_message.rs b/components/raftstore-v2/src/router/internal_message.rs index 6c8d1136b3a..7ac86c3f8c7 100644 --- a/components/raftstore-v2/src/router/internal_message.rs +++ b/components/raftstore-v2/src/router/internal_message.rs @@ -25,4 +25,11 @@ pub struct ApplyRes { pub modifications: DataTrace, pub metrics: ApplyMetrics, pub bucket_stat: Option, + pub sst_applied_index: Vec, +} + +#[derive(Copy, Clone, Debug)] +pub struct SstApplyIndex { + pub cf_index: usize, + pub index: u64, } diff --git a/components/raftstore-v2/src/router/message.rs b/components/raftstore-v2/src/router/message.rs index f09314b4f17..59d1edd8198 100644 --- a/components/raftstore-v2/src/router/message.rs +++ b/components/raftstore-v2/src/router/message.rs @@ -14,7 +14,7 @@ use kvproto::{ }; use raftstore::store::{ fsm::ChangeObserver, metrics::RaftEventDurationType, simple_write::SimpleWriteBinary, - 
util::LatencyInspector, FetchedLogs, GenSnapRes, TabletSnapKey, + util::LatencyInspector, FetchedLogs, GenSnapRes, RaftCmdExtraOpts, TabletSnapKey, UnsafeRecoveryExecutePlanSyncer, UnsafeRecoveryFillOutReportSyncer, UnsafeRecoveryForceLeaderSyncer, UnsafeRecoveryWaitApplySyncer, }; @@ -134,6 +134,7 @@ pub struct SimpleWrite { pub header: Box, pub data: SimpleWriteBinary, pub ch: CmdResChannel, + pub extra_opts: RaftCmdExtraOpts, } #[derive(Debug)] @@ -156,7 +157,7 @@ pub enum PeerMsg { /// Raft message is the message sent between raft nodes in the same /// raft group. Messages need to be redirected to raftstore if target /// peer doesn't exist. - RaftMessage(Box), + RaftMessage(Box, Option), /// Query won't change any state. A typical query is KV read. In most cases, /// it will be processed using lease or read index. RaftQuery(RaftRequest), @@ -197,6 +198,11 @@ pub enum PeerMsg { StoreUnreachable { to_store_id: u64, }, + // A store may be tombstone. Use it with caution, it also means store not + // found, PD can not distinguish them now, as PD may delete tombstone stores. + StoreMaybeTombstone { + store_id: u64, + }, /// Reports whether the snapshot sending is successful or not. 
SnapshotSent { to_peer_id: u64, @@ -291,6 +297,14 @@ impl PeerMsg { pub fn simple_write( header: Box, data: SimpleWriteBinary, + ) -> (Self, CmdResSubscriber) { + PeerMsg::simple_write_with_opt(header, data, RaftCmdExtraOpts::default()) + } + + pub fn simple_write_with_opt( + header: Box, + data: SimpleWriteBinary, + extra_opts: RaftCmdExtraOpts, ) -> (Self, CmdResSubscriber) { let (ch, sub) = CmdResChannel::pair(); ( @@ -299,6 +313,7 @@ impl PeerMsg { header, data, ch, + extra_opts, }), sub, ) @@ -315,6 +330,7 @@ impl PeerMsg { epoch: metapb::RegionEpoch, split_keys: Vec>, source: String, + share_source_region_size: bool, ) -> (Self, CmdResSubscriber) { let (ch, sub) = CmdResChannel::pair(); ( @@ -323,6 +339,7 @@ impl PeerMsg { epoch, split_keys, source: source.into(), + share_source_region_size, }, ch, }, @@ -344,6 +361,7 @@ impl PeerMsg { epoch, split_keys, source: source.into(), + share_source_region_size: false, }, ch, }, diff --git a/components/raftstore-v2/src/router/mod.rs b/components/raftstore-v2/src/router/mod.rs index 7630e35c2a5..83a2497b331 100644 --- a/components/raftstore-v2/src/router/mod.rs +++ b/components/raftstore-v2/src/router/mod.rs @@ -12,7 +12,7 @@ pub use self::response_channel::FlushChannel; pub use self::response_channel::FlushSubscriber; pub use self::{ imp::{RaftRouter, UnsafeRecoveryRouter}, - internal_message::ApplyRes, + internal_message::{ApplyRes, SstApplyIndex}, message::{PeerMsg, PeerTick, RaftRequest, StoreMsg, StoreTick}, response_channel::{ build_any_channel, AnyResChannel, AnyResSubscriber, BaseSubscriber, CmdResChannel, diff --git a/components/raftstore-v2/src/worker/pd/mod.rs b/components/raftstore-v2/src/worker/pd/mod.rs index f89ea75b604..7e07d26e61f 100644 --- a/components/raftstore-v2/src/worker/pd/mod.rs +++ b/components/raftstore-v2/src/worker/pd/mod.rs @@ -14,9 +14,9 @@ use pd_client::{BucketStat, PdClient}; use raftstore::store::{ metrics::STORE_INSPECT_DURATION_HISTOGRAM, util::{KeysInfoFormatter, 
LatencyInspector, RaftstoreDuration}, - AutoSplitController, Config, FlowStatsReporter, PdStatsMonitor, ReadStats, - RegionReadProgressRegistry, SplitInfo, StoreStatsReporter, TabletSnapManager, TxnExt, - WriteStats, NUM_COLLECT_STORE_INFOS_PER_HEARTBEAT, + AutoSplitController, Config, FlowStatsReporter, PdStatsMonitor, ReadStats, SplitInfo, + StoreStatsReporter, TabletSnapManager, TxnExt, WriteStats, + NUM_COLLECT_STORE_INFOS_PER_HEARTBEAT, }; use resource_metering::{Collector, CollectorRegHandle, RawRecords}; use service::service_manager::GrpcServiceManager; @@ -57,7 +57,6 @@ pub enum Task { }, // In region.rs. RegionHeartbeat(RegionHeartbeatTask), - ReportRegionBuckets(BucketStat), UpdateReadStats(ReadStats), UpdateWriteStats(WriteStats), UpdateRegionCpuRecords(Arc), @@ -70,6 +69,7 @@ pub enum Task { split_keys: Vec>, peer: metapb::Peer, right_derive: bool, + share_source_region_size: bool, ch: CmdResChannel, }, ReportBatchSplit { @@ -84,6 +84,7 @@ pub enum Task { initial_status: u64, txn_ext: Arc, }, + // BucketStat is the delta write flow of the bucket. 
ReportBuckets(BucketStat), ReportMinResolvedTs { store_id: u64, @@ -122,7 +123,6 @@ impl Display for Task { hb_task.region, hb_task.peer.get_id(), ), - Task::ReportRegionBuckets(ref buckets) => write!(f, "report buckets: {:?}", buckets), Task::UpdateReadStats(ref stats) => { write!(f, "update read stats: {stats:?}") } @@ -245,7 +245,6 @@ where causal_ts_provider: Option>, // used for rawkv apiv2 pd_scheduler: Scheduler, auto_split_controller: AutoSplitController, - region_read_progress: RegionReadProgressRegistry, collector_reg_handle: CollectorRegHandle, grpc_service_manager: GrpcServiceManager, logger: Logger, @@ -255,16 +254,10 @@ where let store_heartbeat_interval = cfg.value().pd_store_heartbeat_tick_interval.0; let mut stats_monitor = PdStatsMonitor::new( store_heartbeat_interval / NUM_COLLECT_STORE_INFOS_PER_HEARTBEAT, - cfg.value().report_min_resolved_ts_interval.0, cfg.value().inspect_interval.0, PdReporter::new(pd_scheduler, logger.clone()), ); - stats_monitor.start( - auto_split_controller, - region_read_progress, - collector_reg_handle, - store_id, - )?; + stats_monitor.start(auto_split_controller, collector_reg_handle)?; let slowness_stats = slowness::SlownessStatistics::new(&cfg.value()); Ok(Self { store_id, @@ -313,7 +306,6 @@ where write_io_rates, } => self.handle_update_store_infos(cpu_usages, read_io_rates, write_io_rates), Task::RegionHeartbeat(task) => self.handle_region_heartbeat(task), - Task::ReportRegionBuckets(buckets) => self.handle_report_region_buckets(buckets), Task::UpdateReadStats(stats) => self.handle_update_read_stats(stats), Task::UpdateWriteStats(stats) => self.handle_update_write_stats(stats), Task::UpdateRegionCpuRecords(records) => self.handle_update_region_cpu_records(records), @@ -324,7 +316,15 @@ where peer, right_derive, ch, - } => self.handle_ask_batch_split(region, split_keys, peer, right_derive, ch), + share_source_region_size, + } => self.handle_ask_batch_split( + region, + split_keys, + peer, + right_derive, + 
share_source_region_size, + ch, + ), Task::ReportBatchSplit { regions } => self.handle_report_batch_split(regions), Task::AutoSplit { split_infos } => self.handle_auto_split(split_infos), Task::UpdateMaxTimestamp { @@ -332,7 +332,7 @@ where initial_status, txn_ext, } => self.handle_update_max_timestamp(region_id, initial_status, txn_ext), - Task::ReportBuckets(buckets) => self.handle_report_region_buckets(buckets), + Task::ReportBuckets(delta_buckets) => self.handle_report_region_buckets(delta_buckets), Task::ReportMinResolvedTs { store_id, min_resolved_ts, diff --git a/components/raftstore-v2/src/worker/pd/region.rs b/components/raftstore-v2/src/worker/pd/region.rs index e825dd54c32..d3ef54bd75a 100644 --- a/components/raftstore-v2/src/worker/pd/region.rs +++ b/components/raftstore-v2/src/worker/pd/region.rs @@ -288,6 +288,7 @@ where epoch, split_keys: split_region.take_keys().into(), source: "pd".into(), + share_source_region_size: false, }, ch, } @@ -338,9 +339,9 @@ where self.is_hb_receiver_scheduled = true; } - pub fn handle_report_region_buckets(&mut self, region_buckets: BucketStat) { - let region_id = region_buckets.meta.region_id; - self.merge_buckets(region_buckets); + pub fn handle_report_region_buckets(&mut self, delta_buckets: BucketStat) { + let region_id = delta_buckets.meta.region_id; + self.merge_buckets(delta_buckets); let report_buckets = self.region_buckets.get_mut(®ion_id).unwrap(); let last_report_ts = if report_buckets.last_report_ts.is_zero() { self.start_ts @@ -387,8 +388,8 @@ where .engine_total_query_num .add_query_stats(®ion_info.query_stats.0); } - for (_, region_buckets) in std::mem::take(&mut stats.region_buckets) { - self.merge_buckets(region_buckets); + for (_, delta_buckets) in std::mem::take(&mut stats.region_buckets) { + self.merge_buckets(delta_buckets); } if !stats.region_infos.is_empty() { self.stats_monitor.maybe_send_read_stats(stats); @@ -423,18 +424,18 @@ where } } - fn merge_buckets(&mut self, mut buckets: BucketStat) { - 
let region_id = buckets.meta.region_id; + fn merge_buckets(&mut self, mut delta: BucketStat) { + let region_id = delta.meta.region_id; self.region_buckets .entry(region_id) .and_modify(|report_bucket| { let current = &mut report_bucket.current_stat; - if current.meta < buckets.meta { - std::mem::swap(current, &mut buckets); + if current.meta < delta.meta { + std::mem::swap(current, &mut delta); } - current.merge(&buckets); + current.merge(&delta); }) - .or_insert_with(|| ReportBucket::new(buckets)); + .or_insert_with(|| ReportBucket::new(delta)); } fn calculate_region_cpu_records( diff --git a/components/raftstore-v2/src/worker/pd/split.rs b/components/raftstore-v2/src/worker/pd/split.rs index bf13e01120a..7fec5a31bb6 100644 --- a/components/raftstore-v2/src/worker/pd/split.rs +++ b/components/raftstore-v2/src/worker/pd/split.rs @@ -17,10 +17,13 @@ fn new_batch_split_region_request( split_keys: Vec>, ids: Vec, right_derive: bool, + share_source_region_size: bool, ) -> AdminRequest { let mut req = AdminRequest::default(); req.set_cmd_type(AdminCmdType::BatchSplit); req.mut_splits().set_right_derive(right_derive); + req.mut_splits() + .set_share_source_region_size(share_source_region_size); let mut requests = Vec::with_capacity(ids.len()); for (mut id, key) in ids.into_iter().zip(split_keys) { let mut split = SplitRequest::default(); @@ -46,6 +49,7 @@ where split_keys: Vec>, peer: metapb::Peer, right_derive: bool, + share_source_region_size: bool, ch: CmdResChannel, ) { Self::ask_batch_split_imp( @@ -57,6 +61,7 @@ where split_keys, peer, right_derive, + share_source_region_size, Some(ch), ); } @@ -70,6 +75,7 @@ where split_keys: Vec>, peer: metapb::Peer, right_derive: bool, + share_source_region_size: bool, ch: Option, ) { if split_keys.is_empty() { @@ -98,6 +104,7 @@ where split_keys, resp.take_ids().into(), right_derive, + share_source_region_size, ); let region_id = region.get_id(); let epoch = region.take_region_epoch(); @@ -148,6 +155,7 @@ where vec![split_key], 
split_info.peer, true, + false, None, ); // Try to split the region on half within the given key diff --git a/components/raftstore-v2/src/worker/pd/store.rs b/components/raftstore-v2/src/worker/pd/store.rs index a5aad42d85c..b3fd3245be6 100644 --- a/components/raftstore-v2/src/worker/pd/store.rs +++ b/components/raftstore-v2/src/worker/pd/store.rs @@ -9,7 +9,7 @@ use kvproto::pdpb; use pd_client::{ metrics::{ REGION_READ_BYTES_HISTOGRAM, REGION_READ_KEYS_HISTOGRAM, REGION_WRITTEN_BYTES_HISTOGRAM, - REGION_WRITTEN_KEYS_HISTOGRAM, STORE_SIZE_GAUGE_VEC, + REGION_WRITTEN_KEYS_HISTOGRAM, STORE_SIZE_EVENT_INT_VEC, }, PdClient, }; @@ -263,15 +263,9 @@ where self.store_stat.region_bytes_read.flush(); self.store_stat.region_keys_read.flush(); - STORE_SIZE_GAUGE_VEC - .with_label_values(&["capacity"]) - .set(capacity as i64); - STORE_SIZE_GAUGE_VEC - .with_label_values(&["available"]) - .set(available as i64); - STORE_SIZE_GAUGE_VEC - .with_label_values(&["used"]) - .set(used_size as i64); + STORE_SIZE_EVENT_INT_VEC.capacity.set(capacity as i64); + STORE_SIZE_EVENT_INT_VEC.available.set(available as i64); + STORE_SIZE_EVENT_INT_VEC.used.set(used_size as i64); // Update slowness statistics self.update_slowness_in_store_stats(&mut stats, last_query_sum); @@ -473,12 +467,16 @@ where true }); let snap_size = self.snap_mgr.total_snap_size().unwrap(); - let used_size = snap_size - + kv_size - + self - .raft_engine - .get_engine_size() - .expect("raft engine used size"); + let raft_size = self + .raft_engine + .get_engine_size() + .expect("engine used size"); + + STORE_SIZE_EVENT_INT_VEC.kv_size.set(kv_size as i64); + STORE_SIZE_EVENT_INT_VEC.raft_size.set(raft_size as i64); + STORE_SIZE_EVENT_INT_VEC.snap_size.set(snap_size as i64); + + let used_size = snap_size + kv_size + raft_size; let mut available = capacity.checked_sub(used_size).unwrap_or_default(); // We only care about rocksdb SST file size, so we should check disk available // here. 
diff --git a/components/raftstore-v2/src/worker/tablet.rs b/components/raftstore-v2/src/worker/tablet.rs index 183bb33cd34..b2a6d46e39c 100644 --- a/components/raftstore-v2/src/worker/tablet.rs +++ b/components/raftstore-v2/src/worker/tablet.rs @@ -235,7 +235,7 @@ impl Task { pub struct Runner { tablet_registry: TabletRegistry, - sst_importer: Arc, + sst_importer: Arc>, snap_mgr: TabletSnapManager, logger: Logger, @@ -252,7 +252,7 @@ pub struct Runner { impl Runner { pub fn new( tablet_registry: TabletRegistry, - sst_importer: Arc, + sst_importer: Arc>, snap_mgr: TabletSnapManager, logger: Logger, ) -> Self { @@ -298,6 +298,8 @@ impl Runner { .spawn(async move { let range1 = Range::new(&[], &start_key); let range2 = Range::new(&end_key, keys::DATA_MAX_KEY); + // Note: Refer to https://github.com/facebook/rocksdb/pull/11468. There could be + // some files missing from compaction if dynamic_level_bytes is off. for r in [range1, range2] { // When compaction filter is present, trivial move is disallowed. if let Err(e) = @@ -323,6 +325,16 @@ impl Runner { return; } } + if let Err(e) = tablet.check_in_range(Some(&start_key), Some(&end_key)) { + debug_assert!(false, "check_in_range failed {:?}, is titan enabled?", e); + error!( + logger, + "trim did not remove all dirty data"; + "path" => tablet.path(), + "err" => %e, + ); + return; + } // drop before callback. drop(tablet); fail_point!("tablet_trimmed_finished"); @@ -581,6 +593,13 @@ impl Runner { } } +#[cfg(test)] +impl Runner { + pub fn get_running_task_count(&self) -> usize { + self.low_pri_pool.get_running_task_count() + } +} + impl Runnable for Runner where EK: KvEngine, @@ -801,6 +820,14 @@ mod tests { runner.run(Task::destroy(r_1, 100)); assert!(path.exists()); registry.remove(r_1); + // waiting for async `pause_background_work` to be finished, + // this task can block tablet's destroy. 
+ for _i in 0..100 { + if runner.get_running_task_count() == 0 { + break; + } + std::thread::sleep(Duration::from_millis(5)); + } runner.on_timeout(); assert!(!path.exists()); assert!(runner.pending_destroy_tasks.is_empty()); diff --git a/components/raftstore-v2/tests/failpoints/test_merge.rs b/components/raftstore-v2/tests/failpoints/test_merge.rs index 890b8c5e27a..11fe666b49b 100644 --- a/components/raftstore-v2/tests/failpoints/test_merge.rs +++ b/components/raftstore-v2/tests/failpoints/test_merge.rs @@ -7,7 +7,7 @@ use std::{ use engine_traits::Peekable; use raftstore_v2::router::{PeerMsg, PeerTick}; -use tikv_util::store::new_peer; +use tikv_util::{config::ReadableDuration, info, store::new_peer}; use crate::cluster::{ life_helper::assert_peer_not_exist, @@ -179,7 +179,9 @@ fn test_rollback() { // Target is merging. #[test] fn test_merge_conflict_0() { - let mut cluster = Cluster::default(); + let mut cluster = Cluster::with_configs(1, None, None, |cfg| { + cfg.merge_check_tick_interval = ReadableDuration::millis(100); + }); let store_id = cluster.node(0).id(); let router = &mut cluster.routers[0]; @@ -216,6 +218,7 @@ fn test_merge_conflict_0() { format!("k{}", region_3_id).as_bytes(), false, ); + info!("regions: {:?}, {:?}, {:?}", region_1, region_2, region_3); // pause merge progress of 2+3. let fp = fail::FailGuard::new("apply_commit_merge", "pause"); @@ -236,9 +239,9 @@ fn test_merge_conflict_0() { .unwrap(); let region_2 = cluster.routers[0].region_detail(region_2.get_id()); merge_region(&cluster, 0, region_1, peer_1, region_2, false); + drop(fp); // wait for rollback. rx.recv_timeout(std::time::Duration::from_secs(1)).unwrap(); - drop(fp); fail::remove("apply_rollback_merge"); // Check region 1 is not merged and can serve writes. 
diff --git a/components/raftstore-v2/tests/integrations/cluster.rs b/components/raftstore-v2/tests/integrations/cluster.rs index 5b3cc5feb93..88ad9a0e380 100644 --- a/components/raftstore-v2/tests/integrations/cluster.rs +++ b/components/raftstore-v2/tests/integrations/cluster.rs @@ -515,6 +515,7 @@ pub fn disable_all_auto_ticks(cfg: &mut Config) { cfg.region_compact_check_interval = ReadableDuration::ZERO; cfg.pd_heartbeat_tick_interval = ReadableDuration::ZERO; cfg.pd_store_heartbeat_tick_interval = ReadableDuration::ZERO; + cfg.pd_report_min_resolved_ts_interval = ReadableDuration::ZERO; cfg.snap_mgr_gc_tick_interval = ReadableDuration::ZERO; cfg.lock_cf_compact_interval = ReadableDuration::ZERO; cfg.peer_stale_state_check_interval = ReadableDuration::ZERO; @@ -524,7 +525,6 @@ pub fn disable_all_auto_ticks(cfg: &mut Config) { cfg.merge_check_tick_interval = ReadableDuration::ZERO; cfg.cleanup_import_sst_interval = ReadableDuration::ZERO; cfg.inspect_interval = ReadableDuration::ZERO; - cfg.report_min_resolved_ts_interval = ReadableDuration::ZERO; cfg.reactive_memory_lock_tick_interval = ReadableDuration::ZERO; cfg.report_region_buckets_tick_interval = ReadableDuration::ZERO; cfg.check_long_uncommitted_interval = ReadableDuration::ZERO; diff --git a/components/raftstore/Cargo.toml b/components/raftstore/Cargo.toml index 27380a52882..cde5c961f3f 100644 --- a/components/raftstore/Cargo.toml +++ b/components/raftstore/Cargo.toml @@ -23,23 +23,20 @@ test-engines-panic = [ "engine_test/test-engines-panic", ] -cloud-aws = ["sst_importer/cloud-aws"] -cloud-gcp = ["sst_importer/cloud-gcp"] -cloud-azure = ["sst_importer/cloud-azure"] - [dependencies] batch-system = { workspace = true } bitflags = "1.0.1" byteorder = "1.2" bytes = "1.0" causal_ts = { workspace = true } +chrono = "0.4" collections = { workspace = true } concurrency_manager = { workspace = true } crc32fast = "1.2" crossbeam = "0.8" derivative = "2" encryption = { workspace = true } -engine_rocks = { 
workspace = true, optional = true } +engine_rocks = { workspace = true, optional = true } # Should be [dev-dependencies] but we need to control the features # https://github.com/rust-lang/cargo/issues/6915 @@ -62,7 +59,7 @@ log = { version = "0.4", features = ["max_level_trace", "release_max_level_debug log_wrappers = { workspace = true } memory_trace_macros = { workspace = true } online_config = { workspace = true } -openssl = "0.10" +openssl = { workspace = true } ordered-float = "2.6" parking_lot = "0.12" pd_client = { workspace = true } @@ -98,5 +95,7 @@ yatp = { workspace = true } encryption_export = { workspace = true } engine_panic = { workspace = true } engine_rocks = { workspace = true } +hybrid_engine = { workspace = true } panic_hook = { workspace = true } +region_cache_memory_engine = { workspace = true } test_sst_importer = { workspace = true } diff --git a/components/raftstore/src/compacted_event_sender.rs b/components/raftstore/src/compacted_event_sender.rs index 99ba70a0512..736332b52c5 100644 --- a/components/raftstore/src/compacted_event_sender.rs +++ b/components/raftstore/src/compacted_event_sender.rs @@ -1,18 +1,26 @@ // Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. 
use std::sync::Mutex; -use engine_rocks::{CompactedEventSender, RocksCompactedEvent, RocksEngine}; -use engine_traits::RaftEngine; +use engine_rocks::{CompactedEventSender, RocksCompactedEvent}; +use engine_traits::{KvEngine, RaftEngine}; use tikv_util::error_unknown; use crate::store::{fsm::store::RaftRouter, StoreMsg}; // raftstore v1's implementation -pub struct RaftRouterCompactedEventSender { - pub router: Mutex>, +pub struct RaftRouterCompactedEventSender +where + EK: KvEngine, + ER: RaftEngine, +{ + pub router: Mutex>, } -impl CompactedEventSender for RaftRouterCompactedEventSender { +impl CompactedEventSender for RaftRouterCompactedEventSender +where + EK: KvEngine, + ER: RaftEngine, +{ fn send(&self, event: RocksCompactedEvent) { let router = self.router.lock().unwrap(); let event = StoreMsg::CompactedEvent(event); diff --git a/components/raftstore/src/coprocessor/config.rs b/components/raftstore/src/coprocessor/config.rs index e1246e8d59d..b1dc3830bbb 100644 --- a/components/raftstore/src/coprocessor/config.rs +++ b/components/raftstore/src/coprocessor/config.rs @@ -168,7 +168,7 @@ impl Config { Ok(()) } - pub fn validate(&mut self) -> Result<()> { + pub fn validate(&mut self, raft_kv_v2: bool) -> Result<()> { if self.region_split_keys.is_none() { self.region_split_keys = Some((self.region_split_size().as_mb_f64() * 10000.0) as u64); } @@ -199,8 +199,9 @@ impl Config { None => self.region_max_keys = Some(self.region_split_keys() / 2 * 3), } let res = self.validate_bucket_size(); - // If it's OK to enable bucket, we will prefer to enable it if useful. - if let Ok(()) = res && self.enable_region_bucket.is_none() { + // If it's OK to enable bucket, we will prefer to enable it if useful for + // raftstore-v2. 
+ if let Ok(()) = res && self.enable_region_bucket.is_none() && raft_kv_v2 { let useful = self.region_split_size() >= self.region_bucket_size * 2; self.enable_region_bucket = Some(useful); } else if let Err(e) = res && self.enable_region_bucket() { @@ -237,39 +238,39 @@ mod tests { #[test] fn test_config_validate() { let mut cfg = Config::default(); - cfg.validate().unwrap(); + cfg.validate(false).unwrap(); cfg = Config::default(); cfg.region_max_size = Some(ReadableSize(10)); cfg.region_split_size = Some(ReadableSize(20)); - cfg.validate().unwrap_err(); + cfg.validate(false).unwrap_err(); cfg = Config::default(); cfg.region_max_size = None; cfg.region_split_size = Some(ReadableSize(20)); - cfg.validate().unwrap(); + cfg.validate(false).unwrap(); assert_eq!(cfg.region_max_size, Some(ReadableSize(30))); cfg = Config::default(); cfg.region_max_keys = Some(10); cfg.region_split_keys = Some(20); - cfg.validate().unwrap_err(); + cfg.validate(false).unwrap_err(); cfg = Config::default(); cfg.region_max_keys = None; cfg.region_split_keys = Some(20); - cfg.validate().unwrap(); + cfg.validate(false).unwrap(); assert_eq!(cfg.region_max_keys, Some(30)); cfg = Config::default(); cfg.enable_region_bucket = Some(false); cfg.region_split_size = Some(ReadableSize(20)); cfg.region_bucket_size = ReadableSize(30); - cfg.validate().unwrap(); + cfg.validate(false).unwrap(); cfg = Config::default(); cfg.region_split_size = Some(ReadableSize::mb(20)); - cfg.validate().unwrap(); + cfg.validate(false).unwrap(); assert_eq!(cfg.region_split_keys, Some(200000)); } } diff --git a/components/raftstore/src/coprocessor/dispatcher.rs b/components/raftstore/src/coprocessor/dispatcher.rs index d082013cd2c..c7d6731d3e9 100644 --- a/components/raftstore/src/coprocessor/dispatcher.rs +++ b/components/raftstore/src/coprocessor/dispatcher.rs @@ -19,8 +19,8 @@ use crate::store::BucketRange; /// A handle for coprocessor to schedule some command back to raftstore. 
pub trait StoreHandle: Clone + Send { - fn update_approximate_size(&self, region_id: u64, size: u64); - fn update_approximate_keys(&self, region_id: u64, keys: u64); + fn update_approximate_size(&self, region_id: u64, size: Option, splitable: Option); + fn update_approximate_keys(&self, region_id: u64, keys: Option, splitable: Option); fn ask_split( &self, region_id: u64, @@ -48,11 +48,13 @@ pub trait StoreHandle: Clone + Send { pub enum SchedTask { UpdateApproximateSize { region_id: u64, - size: u64, + splitable: Option, + size: Option, }, UpdateApproximateKeys { region_id: u64, - keys: u64, + splitable: Option, + keys: Option, }, AskSplit { region_id: u64, @@ -75,12 +77,20 @@ pub enum SchedTask { } impl StoreHandle for std::sync::mpsc::SyncSender { - fn update_approximate_size(&self, region_id: u64, size: u64) { - let _ = self.try_send(SchedTask::UpdateApproximateSize { region_id, size }); + fn update_approximate_size(&self, region_id: u64, size: Option, splitable: Option) { + let _ = self.try_send(SchedTask::UpdateApproximateSize { + region_id, + splitable, + size, + }); } - fn update_approximate_keys(&self, region_id: u64, keys: u64) { - let _ = self.try_send(SchedTask::UpdateApproximateKeys { region_id, keys }); + fn update_approximate_keys(&self, region_id: u64, keys: Option, splitable: Option) { + let _ = self.try_send(SchedTask::UpdateApproximateKeys { + region_id, + splitable, + keys, + }); } fn ask_split( diff --git a/components/raftstore/src/coprocessor/split_check/keys.rs b/components/raftstore/src/coprocessor/split_check/keys.rs index 2c0e71dd8cb..d6a49175441 100644 --- a/components/raftstore/src/coprocessor/split_check/keys.rs +++ b/components/raftstore/src/coprocessor/split_check/keys.rs @@ -157,9 +157,11 @@ impl SplitCheckObserver for KeysCheckObserver } }; - self.router.update_approximate_keys(region_id, region_keys); + self.router + .update_approximate_keys(region_id, Some(region_keys), None); REGION_KEYS_HISTOGRAM.observe(region_keys as f64); + 
// if bucket checker using scan is added, to utilize the scan, // add keys checker as well for free // It has the assumption that the size's checker is before the keys's check in @@ -299,12 +301,28 @@ mod tests { None, )); // keys has not reached the max_keys 100 yet. - match rx.try_recv() { - Ok(SchedTask::UpdateApproximateSize { region_id, .. }) - | Ok(SchedTask::UpdateApproximateKeys { region_id, .. }) => { - assert_eq!(region_id, region.get_id()); + let mut recv_cnt = 0; + loop { + match rx.try_recv() { + Ok(SchedTask::UpdateApproximateSize { + region_id, + splitable, + .. + }) + | Ok(SchedTask::UpdateApproximateKeys { + region_id, + splitable, + .. + }) => { + assert_eq!(region_id, region.get_id()); + assert!(splitable.is_none()); + recv_cnt += 1; + if recv_cnt == 2 { + break; + } + } + others => panic!("expect recv empty, but got {:?}", others), } - others => panic!("expect recv empty, but got {:?}", others), } put_data(&engine, 90, 160, true); @@ -403,12 +421,28 @@ mod tests { None, )); // keys has not reached the max_keys 100 yet. - match rx.try_recv() { - Ok(SchedTask::UpdateApproximateSize { region_id, .. }) - | Ok(SchedTask::UpdateApproximateKeys { region_id, .. }) => { - assert_eq!(region_id, region.get_id()); + let mut recv_cnt = 0; + loop { + match rx.try_recv() { + Ok(SchedTask::UpdateApproximateSize { + region_id, + splitable, + .. + }) + | Ok(SchedTask::UpdateApproximateKeys { + region_id, + splitable, + .. 
+ }) => { + assert_eq!(region_id, region.get_id()); + assert!(splitable.is_none()); + recv_cnt += 1; + if recv_cnt == 2 { + break; + } + } + others => panic!("expect recv empty, but got {:?}", others), } - others => panic!("expect recv empty, but got {:?}", others), } put_data(&engine, 90, 160, true); diff --git a/components/raftstore/src/coprocessor/split_check/size.rs b/components/raftstore/src/coprocessor/split_check/size.rs index 4b320bef1b6..e5048a83826 100644 --- a/components/raftstore/src/coprocessor/split_check/size.rs +++ b/components/raftstore/src/coprocessor/split_check/size.rs @@ -158,13 +158,14 @@ impl SplitCheckObserver for SizeCheckObserver }; // send it to raftstore to update region approximate size - self.router.update_approximate_size(region_id, region_size); + self.router + .update_approximate_size(region_id, Some(region_size), None); + let need_split_region = region_size >= host.cfg.region_max_size().0; let need_bucket_checker = host.cfg.enable_region_bucket() && region_size >= 2 * host.cfg.region_bucket_size.0; REGION_SIZE_HISTOGRAM.observe(region_size as f64); - let need_split_region = region_size >= host.cfg.region_max_size().0; if need_split_region || need_bucket_checker { // when it's a large region use approximate way to produce split keys if need_split_region { @@ -265,11 +266,23 @@ pub mod tests { exp_split_keys: Vec>, ignore_split_keys: bool, ) { + let mut split = false; loop { match rx.try_recv() { - Ok(SchedTask::UpdateApproximateKeys { region_id, .. }) - | Ok(SchedTask::UpdateApproximateSize { region_id, .. }) - | Ok(SchedTask::RefreshRegionBuckets { region_id, .. }) => { + Ok(SchedTask::UpdateApproximateKeys { + region_id, + splitable, + .. + }) + | Ok(SchedTask::UpdateApproximateSize { + region_id, + splitable, + .. + }) => { + assert_eq!(region_id, exp_region.get_id()); + split = split || splitable.unwrap_or(false); + } + Ok(SchedTask::RefreshRegionBuckets { region_id, .. 
}) => { assert_eq!(region_id, exp_region.get_id()); } Ok(SchedTask::AskSplit { @@ -283,6 +296,7 @@ pub mod tests { if !ignore_split_keys { assert_eq!(split_keys, exp_split_keys); } + assert!(split); break; } others => panic!("expect split check result, but got {:?}", others), @@ -303,11 +317,23 @@ pub mod tests { exp_region: &Region, exp_split_keys_count: usize, ) { + let mut split = false; loop { match rx.try_recv() { - Ok(SchedTask::UpdateApproximateSize { region_id, .. }) - | Ok(SchedTask::UpdateApproximateKeys { region_id, .. }) - | Ok(SchedTask::RefreshRegionBuckets { region_id, .. }) => { + Ok(SchedTask::UpdateApproximateSize { + region_id, + splitable, + .. + }) + | Ok(SchedTask::UpdateApproximateKeys { + region_id, + splitable, + .. + }) => { + assert_eq!(region_id, exp_region.get_id()); + split = split || splitable.unwrap_or(false); + } + Ok(SchedTask::RefreshRegionBuckets { region_id, .. }) => { assert_eq!(region_id, exp_region.get_id()); } Ok(SchedTask::AskSplit { @@ -319,6 +345,7 @@ pub mod tests { assert_eq!(region_id, exp_region.get_id()); assert_eq!(®ion_epoch, exp_region.get_region_epoch()); assert_eq!(split_keys.len(), exp_split_keys_count); + assert!(split); break; } others => panic!("expect split check result, but got {:?}", others), diff --git a/components/raftstore/src/errors.rs b/components/raftstore/src/errors.rs index d1597a77121..49a52de26e1 100644 --- a/components/raftstore/src/errors.rs +++ b/components/raftstore/src/errors.rs @@ -7,7 +7,10 @@ use error_code::{self, ErrorCode, ErrorCodeExt}; use kvproto::{errorpb, metapb, raft_serverpb}; use protobuf::ProtobufError; use thiserror::Error; -use tikv_util::{codec, deadline::DeadlineError}; +use tikv_util::{ + codec, + deadline::{set_deadline_exceeded_busy_error, DeadlineError}, +}; use super::{coprocessor::Error as CopError, store::SnapError}; @@ -287,6 +290,9 @@ impl From for errorpb::Error { e.set_store_peer_id(store_peer_id); errorpb.set_mismatch_peer_id(e); } + Error::DeadlineExceeded => 
{ + set_deadline_exceeded_busy_error(&mut errorpb); + } _ => {} }; @@ -350,3 +356,20 @@ impl ErrorCodeExt for Error { } } } + +#[cfg(test)] +mod tests { + use kvproto::errorpb; + + use crate::Error; + + #[test] + fn test_deadline_exceeded_error() { + let err: errorpb::Error = Error::DeadlineExceeded.into(); + assert_eq!( + err.get_server_is_busy().reason, + "deadline is exceeded".to_string() + ); + assert_eq!(err.get_message(), "Deadline is exceeded"); + } +} diff --git a/components/raftstore/src/router.rs b/components/raftstore/src/router.rs index 3a76a5ad26f..452616caf7e 100644 --- a/components/raftstore/src/router.rs +++ b/components/raftstore/src/router.rs @@ -1,10 +1,13 @@ // Copyright 2019 TiKV Project Authors. Licensed under Apache-2.0. -use std::borrow::Cow; +use std::{ + borrow::Cow, + sync::{Arc, Mutex}, +}; // #[PerformanceCriticalPath] use crossbeam::channel::TrySendError; -use engine_traits::{KvEngine, RaftEngine, Snapshot}; +use engine_traits::{KvEngine, RaftEngine, Snapshot, SnapshotContext}; use error_code::ErrorCodeExt; use kvproto::{metapb, raft_cmdpb::RaftCmdRequest, raft_serverpb::RaftMessage}; use raft::SnapshotStatus; @@ -118,6 +121,7 @@ where { fn read( &mut self, + snap_ctx: Option, read_id: Option, req: RaftCmdRequest, cb: Callback, @@ -248,11 +252,12 @@ impl RaftStoreRouter for ServerRaftStoreRouter impl LocalReadRouter for ServerRaftStoreRouter { fn read( &mut self, + snap_ctx: Option, read_id: Option, req: RaftCmdRequest, cb: Callback, ) -> RaftStoreResult<()> { - self.local_reader.read(read_id, req, cb); + self.local_reader.read(snap_ctx, read_id, req, cb); Ok(()) } @@ -286,11 +291,11 @@ impl RaftStoreRouter for RaftRouter { // duplicated codes. 
impl crate::coprocessor::StoreHandle for RaftRouter { - fn update_approximate_size(&self, region_id: u64, size: u64) { + fn update_approximate_size(&self, region_id: u64, size: Option, splitable: Option) { if let Err(e) = CasualRouter::send( self, region_id, - CasualMessage::RegionApproximateSize { size }, + CasualMessage::RegionApproximateSize { size, splitable }, ) { warn!( "failed to send approximate region size"; @@ -301,11 +306,11 @@ impl crate::coprocessor::StoreHandle for RaftRoute } } - fn update_approximate_keys(&self, region_id: u64, keys: u64) { + fn update_approximate_keys(&self, region_id: u64, keys: Option, splitable: Option) { if let Err(e) = CasualRouter::send( self, region_id, - CasualMessage::RegionApproximateKeys { keys }, + CasualMessage::RegionApproximateKeys { keys, splitable }, ) { warn!( "failed to send approximate region keys"; @@ -331,6 +336,7 @@ impl crate::coprocessor::StoreHandle for RaftRoute split_keys, callback: Callback::None, source, + share_source_region_size: true, }, ) { warn!( @@ -405,6 +411,33 @@ where ) -> RaftStoreResult<()>; } +impl> CdcHandle for Arc> { + fn capture_change( + &self, + region_id: u64, + region_epoch: metapb::RegionEpoch, + change_observer: ChangeObserver, + callback: Callback<::Snapshot>, + ) -> RaftStoreResult<()> { + Mutex::lock(self).unwrap().capture_change( + region_id, + region_epoch, + change_observer, + callback, + ) + } + + fn check_leadership( + &self, + region_id: u64, + callback: Callback<::Snapshot>, + ) -> RaftStoreResult<()> { + Mutex::lock(self) + .unwrap() + .check_leadership(region_id, callback) + } +} + /// A wrapper of SignificantRouter that is specialized for implementing /// CdcHandle. 
#[derive(Clone)] diff --git a/components/raftstore/src/store/async_io/write_tests.rs b/components/raftstore/src/store/async_io/write_tests.rs index 24abf24c4fd..97e865a6bfe 100644 --- a/components/raftstore/src/store/async_io/write_tests.rs +++ b/components/raftstore/src/store/async_io/write_tests.rs @@ -330,7 +330,7 @@ fn test_worker() { t.worker.write_to_db(true); - let snapshot = engines.kv.snapshot(); + let snapshot = engines.kv.snapshot(None); assert_eq!(snapshot.get_value(b"kv_k1").unwrap().unwrap(), b"kv_v1"); assert_eq!(snapshot.get_value(b"kv_k2").unwrap().unwrap(), b"kv_v2"); assert_eq!(snapshot.get_value(b"kv_k3").unwrap().unwrap(), b"kv_v3"); @@ -536,7 +536,7 @@ fn test_basic_flow() { must_wait_same_notifies(vec![(region_1, (1, 15)), (region_2, (2, 20))], &t.notify_rx); - let snapshot = engines.kv.snapshot(); + let snapshot = engines.kv.snapshot(None); assert!(snapshot.get_value(b"kv_k1").unwrap().is_none()); assert_eq!(snapshot.get_value(b"kv_k2").unwrap().unwrap(), b"kv_v2"); assert_eq!(snapshot.get_value(b"kv_k3").unwrap().unwrap(), b"kv_v3"); diff --git a/components/raftstore/src/store/compaction_guard.rs b/components/raftstore/src/store/compaction_guard.rs index efee09be906..161a8f9c4db 100644 --- a/components/raftstore/src/store/compaction_guard.rs +++ b/components/raftstore/src/store/compaction_guard.rs @@ -23,10 +23,16 @@ pub struct CompactionGuardGeneratorFactory { cf_name: CfNames, provider: P, min_output_file_size: u64, + max_compaction_size: u64, } impl CompactionGuardGeneratorFactory

{ - pub fn new(cf: CfName, provider: P, min_output_file_size: u64) -> Result { + pub fn new( + cf: CfName, + provider: P, + min_output_file_size: u64, + max_compaction_size: u64, + ) -> Result { let cf_name = match cf { CF_DEFAULT => CfNames::default, CF_LOCK => CfNames::lock, @@ -43,6 +49,7 @@ impl CompactionGuardGeneratorFactory

{ cf_name, provider, min_output_file_size, + max_compaction_size, }) } } @@ -72,6 +79,15 @@ impl SstPartitionerFactory use_guard: false, boundaries: vec![], pos: 0, + next_level_pos: 0, + next_level_boundaries: context + .next_level_boundaries + .iter() + .map(|v| v.to_vec()) + .collect(), + next_level_size: context.next_level_sizes.clone(), + current_next_level_size: 0, + max_compaction_size: self.max_compaction_size, }) } } @@ -86,7 +102,20 @@ pub struct CompactionGuardGenerator { use_guard: bool, // The boundary keys are exclusive. boundaries: Vec>, + /// The SST boundaries overlapped with the compaction input at the next + /// level of output level (let us call it L+2). When the output level is the + /// bottom-most level (usually L6), this will be empty. The boundaries + /// are the first key of the first SST followed by the end key of every SST. + next_level_boundaries: Vec>, + /// The size of each "segment" of L+2. If the `next_level_boundaries` (let us + /// call it NLB) isn't empty, `next_level_size` will have length + /// `NLB.len() - 1`, and at the position `N` stores the size of range + /// `[NLB[N], NLB[N+1]]` in L+2. + next_level_size: Vec, pos: usize, + next_level_pos: usize, + current_next_level_size: u64, + max_compaction_size: u64, } impl CompactionGuardGenerator

{ @@ -153,27 +182,52 @@ impl SstPartitioner for CompactionGuardGenerator

{ if !self.use_guard { return SstPartitionerResult::NotRequired; } - let mut pos = self.pos; - let mut skip_count = 0; - while pos < self.boundaries.len() && self.boundaries[pos].as_slice() <= req.prev_user_key { - pos += 1; - skip_count += 1; - if skip_count >= COMPACTION_GUARD_MAX_POS_SKIP { - let prev_user_key = req.prev_user_key.to_vec(); - pos = match self.boundaries.binary_search(&prev_user_key) { - Ok(search_pos) => search_pos + 1, - Err(search_pos) => search_pos, - }; - break; - } + self.pos = seek_to(&self.boundaries, req.prev_user_key, self.pos); + // Generally this shall be a noop... because each time we are moving the cursor + // to the previous key. + let left_next_level_pos = seek_to( + &self.next_level_boundaries, + req.prev_user_key, + self.next_level_pos, + ); + let right_next_level_pos = seek_to( + &self.next_level_boundaries, + req.current_user_key, + left_next_level_pos, + ); + // The cursor has been moved. + if right_next_level_pos > left_next_level_pos { + self.current_next_level_size += self.next_level_size + [left_next_level_pos..right_next_level_pos - 1] + .iter() + .map(|x| *x as u64) + .sum::(); } - self.pos = pos; - if pos < self.boundaries.len() && self.boundaries[pos].as_slice() <= req.current_user_key { - if req.current_output_file_size >= self.min_output_file_size { + self.next_level_pos = right_next_level_pos; + + if self.pos < self.boundaries.len() + && self.boundaries[self.pos].as_slice() <= req.current_user_key + { + if req.current_output_file_size >= self.min_output_file_size + // Or, the output file may make a huge compaction even greater than the max compaction size. 
+ || self.current_next_level_size >= self.max_compaction_size + { COMPACTION_GUARD_ACTION_COUNTER .get(self.cf_name) .partition .inc(); + // The current pointer status should be like (let * be the current pos, ^ be + // where the previous user key is): + // boundaries: A B C D + // size: 1 3 2 + // ^ * + // You will notice that the previous user key is between B and C, which indicates + // that there must still be something between previous user key and C. + // We still set `current_next_level_size` to zero here, so the segment will be + // forgotten. I think that will be acceptable given generally a segment won't be + // greater than the `max-sst-size`, which is tiny compared to the + // `max-compaction-size` usually. + self.current_next_level_size = 0; SstPartitionerResult::Required } else { COMPACTION_GUARD_ACTION_COUNTER @@ -193,10 +247,28 @@ impl SstPartitioner for CompactionGuardGenerator

{ } } +fn seek_to(all_data: &Vec>, target_key: &[u8], from_pos: usize) -> usize { + let mut pos = from_pos; + let mut skip_count = 0; + while pos < all_data.len() && all_data[pos].as_slice() <= target_key { + pos += 1; + skip_count += 1; + if skip_count >= COMPACTION_GUARD_MAX_POS_SKIP { + pos = match all_data.binary_search_by(|probe| probe.as_slice().cmp(target_key)) { + Ok(search_pos) => search_pos + 1, + Err(search_pos) => search_pos, + }; + break; + } + } + pos +} + #[cfg(test)] mod tests { - use std::str; + use std::{path::Path, str}; + use collections::HashMap; use engine_rocks::{ raw::{BlockBasedOptions, DBCompressionType}, util::new_engine_opt, @@ -212,6 +284,13 @@ mod tests { use super::*; use crate::coprocessor::region_info_accessor::MockRegionInfoProvider; + impl CompactionGuardGenerator { + fn reset_next_level_size_state(&mut self) { + self.current_next_level_size = 0; + self.next_level_pos = 0; + } + } + #[test] fn test_compaction_guard_non_data() { let mut guard = CompactionGuardGenerator { @@ -224,6 +303,11 @@ mod tests { use_guard: false, boundaries: vec![], pos: 0, + current_next_level_size: 0, + next_level_pos: 0, + next_level_boundaries: vec![], + next_level_size: vec![], + max_compaction_size: 1 << 30, }; guard.smallest_key = keys::LOCAL_MIN_KEY.to_vec(); @@ -267,8 +351,16 @@ mod tests { provider: MockRegionInfoProvider::new(vec![]), initialized: true, use_guard: true, - boundaries: vec![b"bbb".to_vec(), b"ccc".to_vec()], + boundaries: vec![b"bbb".to_vec(), b"ccc".to_vec(), b"ddd".to_vec()], pos: 0, + current_next_level_size: 0, + next_level_pos: 0, + next_level_boundaries: (0..10) + .map(|x| format!("bbb{:02}", x).into_bytes()) + .chain((0..100).map(|x| format!("cccz{:03}", x).into_bytes())) + .collect(), + next_level_size: [&[1 << 18; 99][..], &[1 << 28; 10][..]].concat(), + max_compaction_size: 1 << 30, // 1GB }; // Crossing region boundary. 
let mut req = SstPartitionerRequest { @@ -277,7 +369,11 @@ mod tests { current_output_file_size: 32 << 20, }; assert_eq!(guard.should_partition(&req), SstPartitionerResult::Required); + assert_eq!(guard.next_level_pos, 10); assert_eq!(guard.pos, 0); + assert_eq!(guard.current_next_level_size, 0); + guard.reset_next_level_size_state(); + // Output file size too small. req = SstPartitionerRequest { prev_user_key: b"bba", @@ -289,6 +385,10 @@ mod tests { SstPartitionerResult::NotRequired ); assert_eq!(guard.pos, 0); + assert_eq!(guard.next_level_pos, 10); + assert_eq!(guard.current_next_level_size, 9 << 18); + guard.reset_next_level_size_state(); + // Not crossing boundary. req = SstPartitionerRequest { prev_user_key: b"aaa", @@ -300,6 +400,9 @@ mod tests { SstPartitionerResult::NotRequired ); assert_eq!(guard.pos, 0); + assert_eq!(guard.next_level_pos, 0); + guard.reset_next_level_size_state(); + // Move position req = SstPartitionerRequest { prev_user_key: b"cca", @@ -308,6 +411,30 @@ mod tests { }; assert_eq!(guard.should_partition(&req), SstPartitionerResult::Required); assert_eq!(guard.pos, 1); + assert_eq!(guard.next_level_pos, 110); + guard.reset_next_level_size_state(); + + // Move next level posistion + req = SstPartitionerRequest { + prev_user_key: b"cccz000", + current_user_key: b"cccz042", + current_output_file_size: 1 << 20, + }; + assert_eq!( + guard.should_partition(&req), + SstPartitionerResult::NotRequired + ); + assert_eq!(guard.pos, 2); + assert_eq!(guard.next_level_pos, 53); + + req = SstPartitionerRequest { + prev_user_key: b"cccz090", + current_user_key: b"dde", + current_output_file_size: 1 << 20, + }; + assert_eq!(guard.should_partition(&req), SstPartitionerResult::Required); + assert_eq!(guard.pos, 2); + assert_eq!(guard.next_level_pos, 110); } #[test] @@ -339,6 +466,11 @@ mod tests { b"aaa15".to_vec(), ], pos: 0, + current_next_level_size: 0, + next_level_pos: 0, + next_level_boundaries: vec![], + next_level_size: vec![], + 
max_compaction_size: 1 << 30, }; // Binary search meet exact match. guard.pos = 0; @@ -365,15 +497,23 @@ mod tests { const MIN_OUTPUT_FILE_SIZE: u64 = 1024; const MAX_OUTPUT_FILE_SIZE: u64 = 4096; + const MAX_COMPACTION_SIZE: u64 = 10240; fn new_test_db(provider: MockRegionInfoProvider) -> (RocksEngine, TempDir) { let temp_dir = TempDir::new().unwrap(); let mut cf_opts = RocksCfOptions::default(); + cf_opts.set_max_bytes_for_level_base(MAX_OUTPUT_FILE_SIZE); + cf_opts.set_max_bytes_for_level_multiplier(5); cf_opts.set_target_file_size_base(MAX_OUTPUT_FILE_SIZE); cf_opts.set_sst_partitioner_factory(RocksSstPartitionerFactory( - CompactionGuardGeneratorFactory::new(CF_DEFAULT, provider, MIN_OUTPUT_FILE_SIZE) - .unwrap(), + CompactionGuardGeneratorFactory::new( + CF_DEFAULT, + provider, + MIN_OUTPUT_FILE_SIZE, + MAX_COMPACTION_SIZE, + ) + .unwrap(), )); cf_opts.set_disable_auto_compactions(true); cf_opts.compression_per_level(&[ @@ -401,7 +541,7 @@ mod tests { } fn collect_keys(path: &str) -> Vec> { - let reader = RocksSstReader::open(path).unwrap(); + let reader = RocksSstReader::open(path, None).unwrap(); let mut sst_reader = reader.iter(IterOptions::default()).unwrap(); let mut valid = sst_reader.seek_to_first().unwrap(); let mut ret = vec![]; @@ -412,6 +552,16 @@ mod tests { ret } + fn get_sst_files(dir: &Path) -> Vec { + let files = dir.read_dir().unwrap(); + let mut sst_files = files + .map(|entry| entry.unwrap().path().to_str().unwrap().to_owned()) + .filter(|entry| entry.ends_with(".sst")) + .collect::>(); + sst_files.sort(); + sst_files + } + #[test] fn test_compaction_guard_with_rocks() { let provider = MockRegionInfoProvider::new(vec![ @@ -463,11 +613,7 @@ mod tests { ) .unwrap(); - let files = dir.path().read_dir().unwrap(); - let mut sst_files = files - .map(|entry| entry.unwrap().path().to_str().unwrap().to_owned()) - .filter(|entry| entry.ends_with(".sst")) - .collect::>(); + let mut sst_files = get_sst_files(dir.path()); sst_files.sort(); assert_eq!(3, 
sst_files.len()); assert_eq!(collect_keys(&sst_files[0]), [b"za1", b"zb1", b"zb2"]); @@ -477,4 +623,120 @@ mod tests { ); assert_eq!(collect_keys(&sst_files[2]), [b"zc6"]); } + + fn simple_regions() -> MockRegionInfoProvider { + MockRegionInfoProvider::new(vec![ + Region { + id: 1, + start_key: b"a".to_vec(), + end_key: b"b".to_vec(), + ..Default::default() + }, + Region { + id: 2, + start_key: b"b".to_vec(), + end_key: b"c".to_vec(), + ..Default::default() + }, + Region { + id: 3, + start_key: b"c".to_vec(), + end_key: b"d".to_vec(), + ..Default::default() + }, + ]) + } + + #[test] + fn test_next_level_compaction() { + let provider = simple_regions(); + let (db, _dir) = new_test_db(provider); + assert_eq!(b"z", DATA_PREFIX_KEY); + let tiny_value = [b'v'; 1]; + let value = vec![b'v'; 1024 * 10]; + ['a', 'b', 'c'] + .into_iter() + .flat_map(|x| (1..10).map(move |n| format!("z{x}{n}").into_bytes())) + .for_each(|key| db.put(&key, &value).unwrap()); + db.flush_cfs(&[], true).unwrap(); + db.compact_files_in_range(None, None, Some(2)).unwrap(); + db.put(b"za0", &tiny_value).unwrap(); + db.put(b"zd0", &tiny_value).unwrap(); + db.flush_cfs(&[], true).unwrap(); + db.compact_files_in_range(None, None, Some(1)).unwrap(); + + let level_1 = &level_files(&db)[&1]; + assert_eq!(level_1.len(), 2, "{:?}", level_1); + assert_eq!(level_1[0].smallestkey, b"za0", "{:?}", level_1); + assert_eq!(level_1[0].largestkey, b"za0", "{:?}", level_1); + assert_eq!(level_1[1].smallestkey, b"zd0", "{:?}", level_1); + assert_eq!(level_1[1].largestkey, b"zd0", "{:?}", level_1); + } + + #[test] + fn test_next_level_compaction_no_split() { + let provider = simple_regions(); + let (db, _dir) = new_test_db(provider); + assert_eq!(b"z", DATA_PREFIX_KEY); + let tiny_value = [b'v'; 1]; + let value = vec![b'v'; 1024 * 10]; + ['a', 'b', 'c'] + .into_iter() + .flat_map(|x| (1..10).map(move |n| format!("z{x}{n}").into_bytes())) + .for_each(|key| db.put(&key, &value).unwrap()); + db.flush_cfs(&[], 
true).unwrap(); + db.compact_files_in_range(None, None, Some(2)).unwrap(); + // So... the next-level size will be almost 1024 * 9, which doesn't exceeds the + // compaction size limit. + db.put(b"za0", &tiny_value).unwrap(); + db.put(b"za9", &tiny_value).unwrap(); + db.flush_cfs(&[], true).unwrap(); + db.compact_files_in_range(None, None, Some(1)).unwrap(); + + let level_1 = &level_files(&db)[&1]; + assert_eq!(level_1.len(), 1, "{:?}", level_1); + assert_eq!(level_1[0].smallestkey, b"za0", "{:?}", level_1); + assert_eq!(level_1[0].largestkey, b"za9", "{:?}", level_1); + db.compact_range(None, None, false, 1).unwrap(); + + // So... the next-level size will be almost 1024 * 15, which should reach the + // limit. + db.put(b"za30", &tiny_value).unwrap(); + db.put(b"zb90", &tiny_value).unwrap(); + db.flush_cfs(&[], true).unwrap(); + db.compact_files_in_range(None, None, Some(1)).unwrap(); + + let level_1 = &level_files(&db)[&1]; + assert_eq!(level_1.len(), 2, "{:?}", level_1); + assert_eq!(level_1[0].smallestkey, b"za30", "{:?}", level_1); + assert_eq!(level_1[1].largestkey, b"zb90", "{:?}", level_1); + } + + #[derive(Debug)] + #[allow(dead_code)] + struct OwnedSstFileMetadata { + name: String, + size: usize, + smallestkey: Vec, + largestkey: Vec, + } + + #[allow(unused)] + fn level_files(db: &RocksEngine) -> HashMap> { + let db = db.as_inner(); + let cf = db.cf_handle("default").unwrap(); + let md = db.get_column_family_meta_data(cf); + let mut res: HashMap> = HashMap::default(); + for (i, level) in md.get_levels().into_iter().enumerate() { + for file in level.get_files() { + res.entry(i).or_default().push(OwnedSstFileMetadata { + name: file.get_name(), + size: file.get_size(), + smallestkey: file.get_smallestkey().to_owned(), + largestkey: file.get_largestkey().to_owned(), + }); + } + } + res + } } diff --git a/components/raftstore/src/store/config.rs b/components/raftstore/src/store/config.rs index 817be7eb969..c7c65e80d6c 100644 --- 
a/components/raftstore/src/store/config.rs +++ b/components/raftstore/src/store/config.rs @@ -11,7 +11,7 @@ use serde::{Deserialize, Serialize}; use serde_with::with_prefix; use tikv_util::{ box_err, - config::{ReadableDuration, ReadableSize, VersionTrack}, + config::{ReadableDuration, ReadableSchedule, ReadableSize, VersionTrack}, error, info, sys::SysQuota, warn, @@ -104,12 +104,11 @@ pub struct Config { pub max_manual_flush_rate: f64, // When a peer is not responding for this time, leader will not keep entry cache for it. pub raft_entry_cache_life_time: ReadableDuration, - // Deprecated! The configuration has no effect. - // They are preserved for compatibility check. // When a peer is newly added, reject transferring leader to the peer for a while. #[doc(hidden)] #[serde(skip_serializing)] - #[online_config(skip)] + #[online_config(hidden)] + #[deprecated = "The configuration has been removed. It has no effect"] pub raft_reject_transfer_leader_duration: ReadableDuration, /// Whether to disable checking quorum for the raft group. This will make @@ -140,9 +139,10 @@ pub struct Config { pub region_compact_min_redundant_rows: u64, /// Minimum percentage of redundant rows to trigger manual compaction. /// Should between 1 and 100. - pub region_compact_redundant_rows_percent: u64, + pub region_compact_redundant_rows_percent: Option, pub pd_heartbeat_tick_interval: ReadableDuration, pub pd_store_heartbeat_tick_interval: ReadableDuration, + pub pd_report_min_resolved_ts_interval: ReadableDuration, pub snap_mgr_gc_tick_interval: ReadableDuration, pub snap_gc_timeout: ReadableDuration, /// The duration of snapshot waits for region split. It prevents leader from @@ -152,6 +152,15 @@ pub struct Config { pub lock_cf_compact_interval: ReadableDuration, pub lock_cf_compact_bytes_threshold: ReadableSize, + /// Hours of the day during which we may execute a periodic full compaction. + /// If not set or empty, periodic full compaction will not run. 
In toml this + /// should be a list of times in "HH:MM" format with an optional timezone + /// offset. If no timezone is specified, local timezone is used. E.g., + /// `["23:00 +0000", "03:00 +0700"]` or `["23:00", "03:00"]`. + pub periodic_full_compact_start_times: ReadableSchedule, + /// Do not start a full compaction if cpu utilization exceeds this number. + pub periodic_full_compact_start_max_cpu: f64, + #[online_config(skip)] pub notify_capacity: usize, pub messages_per_tick: usize, @@ -169,6 +178,9 @@ pub struct Config { /// and try to alert monitoring systems, if there is any. pub abnormal_leader_missing_duration: ReadableDuration, pub peer_stale_state_check_interval: ReadableDuration, + /// Interval to check GC peers. + #[doc(hidden)] + pub gc_peer_check_interval: ReadableDuration, #[online_config(hidden)] pub leader_transfer_max_log_lag: u64, @@ -317,39 +329,40 @@ pub struct Config { pub io_reschedule_concurrent_max_count: usize, pub io_reschedule_hotpot_duration: ReadableDuration, - // Deprecated! Batch is done in raft client. #[doc(hidden)] #[serde(skip_serializing)] - #[online_config(skip)] + #[online_config(hidden)] + #[deprecated = "The configuration has been removed. Batch is done in raft client."] pub raft_msg_flush_interval: ReadableDuration, - // Deprecated! These configuration has been moved to Coprocessor. - // They are preserved for compatibility check. #[doc(hidden)] #[serde(skip_serializing)] - #[online_config(skip)] + #[online_config(hidden)] + #[deprecated = "The configuration has been moved to coprocessor.region_max_size."] pub region_max_size: ReadableSize, #[doc(hidden)] #[serde(skip_serializing)] - #[online_config(skip)] + #[online_config(hidden)] + #[deprecated = "The configuration has been moved to coprocessor.region_split_size."] pub region_split_size: ReadableSize, - // Deprecated! The time to clean stale peer safely can be decided based on RocksDB snapshot - // sequence number.
#[doc(hidden)] #[serde(skip_serializing)] - #[online_config(skip)] + #[online_config(hidden)] + #[deprecated = "The configuration has been removed. The time to clean stale peer safely can be decided based on RocksDB snapshot sequence number."] pub clean_stale_peer_delay: ReadableDuration, // Interval to inspect the latency of raftstore for slow store detection. pub inspect_interval: ReadableDuration, + /// Threshold of CPU utilization to inspect for slow store detection. + #[doc(hidden)] + pub inspect_cpu_util_thd: f64, // The unsensitive(increase it to reduce sensitiveness) of the cause-trend detection pub slow_trend_unsensitive_cause: f64, // The unsensitive(increase it to reduce sensitiveness) of the result-trend detection pub slow_trend_unsensitive_result: f64, - - // Interval to report min resolved ts, if it is zero, it means disabled. - pub report_min_resolved_ts_interval: ReadableDuration, + // The sensitiveness of slowness on network-io. + pub slow_trend_network_io_factor: f64, /// Interval to check whether to reactivate in-memory pessimistic lock after /// being disabled before transferring leader. @@ -397,6 +410,7 @@ pub struct Config { } impl Default for Config { + #[allow(deprecated)] fn default() -> Config { Config { prevote: true, @@ -429,9 +443,15 @@ impl Default for Config { region_compact_min_tombstones: 10000, region_compact_tombstones_percent: 30, region_compact_min_redundant_rows: 50000, - region_compact_redundant_rows_percent: 20, + region_compact_redundant_rows_percent: Some(20), pd_heartbeat_tick_interval: ReadableDuration::minutes(1), pd_store_heartbeat_tick_interval: ReadableDuration::secs(10), + pd_report_min_resolved_ts_interval: ReadableDuration::secs(1), + // Disable periodic full compaction by default. + periodic_full_compact_start_times: ReadableSchedule::default(), + // If periodic full compaction is enabled, do not start a full compaction + // if the CPU utilization is over 10%. 
+ periodic_full_compact_start_max_cpu: 0.1, notify_capacity: 40960, snap_mgr_gc_tick_interval: ReadableDuration::minutes(1), snap_gc_timeout: ReadableDuration::hours(4), @@ -500,16 +520,22 @@ impl Default for Config { region_max_size: ReadableSize(0), region_split_size: ReadableSize(0), clean_stale_peer_delay: ReadableDuration::minutes(0), - inspect_interval: ReadableDuration::millis(500), + inspect_interval: ReadableDuration::millis(100), + // The default value of `inspect_cpu_util_thd` is 0.4, which means + // when the cpu utilization is greater than 40%, the store might be + // regarded as a slow node if there exists delayed inspected messages. + // It's good enough for most cases to reduce the false positive rate. + inspect_cpu_util_thd: 0.4, // The param `slow_trend_unsensitive_cause == 2.0` can yield good results, // make it `10.0` to reduce a bit sensitiveness because SpikeFilter is disabled slow_trend_unsensitive_cause: 10.0, slow_trend_unsensitive_result: 0.5, - report_min_resolved_ts_interval: ReadableDuration::secs(1), + slow_trend_network_io_factor: 0.0, check_leader_lease_interval: ReadableDuration::secs(0), renew_leader_lease_advance_duration: ReadableDuration::secs(0), allow_unsafe_vote_after_start: false, report_region_buckets_tick_interval: ReadableDuration::secs(10), + gc_peer_check_interval: ReadableDuration::secs(60), max_snapshot_file_raw_size: ReadableSize::mb(100), unreachable_backoff: ReadableDuration::secs(10), // TODO: make its value reasonable @@ -581,6 +607,10 @@ impl Config { self.region_compact_check_step.unwrap() } + pub fn region_compact_redundant_rows_percent(&self) -> u64 { + self.region_compact_redundant_rows_percent.unwrap() + } + #[inline] pub fn warmup_entry_cache_enabled(&self) -> bool { self.max_entry_cache_warmup_duration.0 != Duration::from_secs(0) @@ -766,6 +796,15 @@ impl Config { )); } + let region_compact_redundant_rows_percent = + self.region_compact_redundant_rows_percent.unwrap(); + if 
!(1..=100).contains(®ion_compact_redundant_rows_percent) { + return Err(box_err!( + "region-compact-redundant-rows-percent must between 1 and 100, current value is {}", + region_compact_redundant_rows_percent + )); + } + if self.local_read_batch_size == 0 { return Err(box_err!("local-read-batch-size must be greater than 0")); } @@ -903,6 +942,12 @@ impl Config { )); } + if self.slow_trend_network_io_factor < 0.0 { + return Err(box_err!( + "slow_trend_network_io_factor must be greater than 0" + )); + } + Ok(()) } @@ -992,14 +1037,20 @@ impl Config { .with_label_values(&["region_compact_min_redundant_rows"]) .set(self.region_compact_min_redundant_rows as f64); CONFIG_RAFTSTORE_GAUGE - .with_label_values(&["region_compact_tombstones_percent"]) - .set(self.region_compact_tombstones_percent as f64); + .with_label_values(&["region_compact_redundant_rows_percent"]) + .set( + self.region_compact_redundant_rows_percent + .unwrap_or_default() as f64, + ); CONFIG_RAFTSTORE_GAUGE .with_label_values(&["pd_heartbeat_tick_interval"]) .set(self.pd_heartbeat_tick_interval.as_secs_f64()); CONFIG_RAFTSTORE_GAUGE .with_label_values(&["pd_store_heartbeat_tick_interval"]) .set(self.pd_store_heartbeat_tick_interval.as_secs_f64()); + CONFIG_RAFTSTORE_GAUGE + .with_label_values(&["pd_report_min_resolved_ts_interval"]) + .set(self.pd_report_min_resolved_ts_interval.as_secs_f64()); CONFIG_RAFTSTORE_GAUGE .with_label_values(&["snap_mgr_gc_tick_interval"]) .set(self.snap_mgr_gc_tick_interval.as_secs_f64()); @@ -1035,6 +1086,9 @@ impl Config { CONFIG_RAFTSTORE_GAUGE .with_label_values(&["leader_transfer_max_log_lag"]) .set(self.leader_transfer_max_log_lag as f64); + CONFIG_RAFTSTORE_GAUGE + .with_label_values(&["gc_peer_check_interval"]) + .set(self.gc_peer_check_interval.as_secs_f64()); CONFIG_RAFTSTORE_GAUGE .with_label_values(&["snap_apply_batch_size"]) diff --git a/components/raftstore/src/store/fsm/apply.rs b/components/raftstore/src/store/fsm/apply.rs index e2b1cedc88d..221e5b1dcea 100644 
--- a/components/raftstore/src/store/fsm/apply.rs +++ b/components/raftstore/src/store/fsm/apply.rs @@ -272,6 +272,7 @@ pub enum ExecResult { regions: Vec, derived: Region, new_split_regions: HashMap, + share_source_region_size: bool, }, PrepareMerge { region: Region, @@ -391,7 +392,7 @@ where tag: String, timer: Option, host: CoprocessorHost, - importer: Arc, + importer: Arc>, region_scheduler: Scheduler>, router: ApplyRouter, notifier: Box>, @@ -474,7 +475,7 @@ where pub fn new( tag: String, host: CoprocessorHost, - importer: Arc, + importer: Arc>, region_scheduler: Scheduler>, engine: EK, router: ApplyRouter, @@ -661,9 +662,7 @@ where results: VecDeque>, ) { if self.host.pre_persist(&delegate.region, true, None) { - if !delegate.pending_remove { - delegate.maybe_write_apply_state(self); - } + delegate.maybe_write_apply_state(self); self.commit_opt(delegate, false); } else { debug!("do not persist when finish_for"; @@ -678,7 +677,7 @@ where exec_res: results, metrics: mem::take(&mut delegate.metrics), applied_term: delegate.applied_term, - bucket_stat: delegate.buckets.clone().map(Box::new), + bucket_stat: delegate.buckets.clone(), }); if !self.kv_wb().is_empty() { // Pending writes not flushed, need to set seqno to following ApplyRes later @@ -2114,14 +2113,14 @@ where match change_type { ConfChangeType::AddNode => { - let add_ndoe_fp = || { + let add_node_fp = || { fail_point!( "apply_on_add_node_1_2", self.id() == 2 && self.region_id() == 1, |_| {} ) }; - add_ndoe_fp(); + add_node_fp(); PEER_ADMIN_CMD_COUNTER_VEC .with_label_values(&["add_peer", "all"]) @@ -2516,6 +2515,9 @@ where admin_req .mut_splits() .set_right_derive(split.get_right_derive()); + admin_req + .mut_split() + .set_share_source_region_size(split.get_share_source_region_size()); admin_req.mut_splits().mut_requests().push(split); // This method is executed only when there are unapplied entries after being // restarted. 
So there will be no callback, it's OK to return a response @@ -2560,6 +2562,7 @@ where derived.mut_region_epoch().set_version(new_version); let right_derive = split_reqs.get_right_derive(); + let share_source_region_size = split_reqs.get_share_source_region_size(); let mut regions = Vec::with_capacity(new_region_cnt + 1); // Note that the split requests only contain ids for new regions, so we need // to handle new regions and old region separately. @@ -2724,6 +2727,7 @@ where regions, derived, new_split_regions, + share_source_region_size, }), )) } @@ -3240,7 +3244,7 @@ where // open files in rocksdb. // TODO: figure out another way to do consistency check without snapshot // or short life snapshot. - snap: ctx.engine.snapshot(), + snap: ctx.engine.snapshot(None), }) }, )) @@ -3868,7 +3872,7 @@ where pub applied_term: u64, pub exec_res: VecDeque>, pub metrics: ApplyMetrics, - pub bucket_stat: Option>, + pub bucket_stat: Option, pub write_seqno: Vec, } @@ -4070,6 +4074,7 @@ where /// Handles peer destroy. When a peer is destroyed, the corresponding apply /// delegate should be removed too. 
fn handle_destroy(&mut self, ctx: &mut ApplyContext, d: Destroy) { + fail_point!("on_apply_handle_destroy"); assert_eq!(d.region_id, self.delegate.region_id()); if d.merge_from_snapshot { assert_eq!(self.delegate.stopped, false); @@ -4193,7 +4198,7 @@ where } if let Err(e) = snap_task.generate_and_schedule_snapshot::( - apply_ctx.engine.snapshot(), + apply_ctx.engine.snapshot(None), self.delegate.applied_term, self.delegate.apply_state.clone(), &apply_ctx.region_scheduler, @@ -4265,7 +4270,7 @@ where ReadResponse { response: Default::default(), snapshot: Some(RegionSnapshot::from_snapshot( - Arc::new(apply_ctx.engine.snapshot()), + Arc::new(apply_ctx.engine.snapshot(None)), Arc::new(self.delegate.region.clone()), )), txn_extra_op: TxnExtraOp::Noop, @@ -4655,7 +4660,7 @@ pub struct Builder { tag: String, cfg: Arc>, coprocessor_host: CoprocessorHost, - importer: Arc, + importer: Arc>, region_scheduler: Scheduler::Snapshot>>, engine: EK, sender: Box>, @@ -5056,7 +5061,7 @@ mod tests { (path, engine) } - pub fn create_tmp_importer(path: &str) -> (TempDir, Arc) { + pub fn create_tmp_importer(path: &str) -> (TempDir, Arc>) { let dir = Builder::new().prefix(path).tempdir().unwrap(); let importer = Arc::new( SstImporter::new( @@ -5526,6 +5531,21 @@ mod tests { ) } + fn cb_conf_change( + idx: u64, + term: u64, + tx: Sender, + ) -> Proposal> { + proposal( + true, + idx, + term, + Callback::write(Box::new(move |resp: WriteResponse| { + tx.send(resp.response).unwrap(); + })), + ) + } + struct EntryBuilder { entry: Entry, req: RaftCmdRequest, @@ -5653,6 +5673,14 @@ mod tests { self } + fn conf_change(mut self, changes: Vec) -> EntryBuilder { + let mut req = AdminRequest::default(); + req.set_cmd_type(AdminCmdType::ChangePeerV2); + req.mut_change_peer_v2().set_changes(changes.into()); + self.req.set_admin_request(req); + self + } + fn build(mut self) -> Entry { self.entry .set_data(self.req.write_to_bytes().unwrap().into()); @@ -5739,7 +5767,6 @@ mod tests { self.header.clone(), 
bin, 1000, - false, ); let (bytes, _) = req_encoder.encode(); self.entry.set_data(bytes.into()); @@ -6924,7 +6951,7 @@ mod tests { router.schedule_task(1, Msg::apply(apply2)); let res = fetch_apply_res(&rx); - let bucket_version = res.bucket_stat.unwrap().as_ref().meta.version; + let bucket_version = res.bucket_stat.unwrap().meta.version; assert_eq!(bucket_version, 2); @@ -7088,6 +7115,7 @@ mod tests { regions, derived: _, new_split_regions: _, + share_source_region_size: _, } = apply_res.exec_res.front().unwrap() { let r8 = regions.get(0).unwrap(); @@ -7650,6 +7678,125 @@ mod tests { system.shutdown(); } + // When a peer is removed, it is necessary to update its apply state because + // this peer may be simultaneously taking a snapshot. An outdated apply state + // invalidates the coprocessor cache assumption (apply state must match data + // in the snapshot) and potentially lead to a violation of linearizability + // (returning stale cache). + #[test] + fn test_conf_change_remove_node_update_apply_state() { + let (_path, engine) = create_tmp_engine("test-delegate"); + let (_import_dir, importer) = create_tmp_importer("test-delegate"); + let peer_id = 3; + let mut reg = Registration { + id: peer_id, + term: 1, + ..Default::default() + }; + reg.region.set_id(1); + reg.region.set_end_key(b"k5".to_vec()); + reg.region.mut_region_epoch().set_version(3); + let peers = vec![new_peer(2, 3), new_peer(4, 5), new_learner_peer(6, 7)]; + reg.region.set_peers(peers.into()); + let (tx, apply_res_rx) = mpsc::channel(); + let sender = Box::new(TestNotifier { tx }); + let coprocessor_host = CoprocessorHost::::default(); + let (region_scheduler, _) = dummy_scheduler(); + let cfg = Arc::new(VersionTrack::new(Config::default())); + let (router, mut system) = create_apply_batch_system(&cfg.value(), None); + let pending_create_peers = Arc::new(Mutex::new(HashMap::default())); + let builder = super::Builder:: { + tag: "test-store".to_owned(), + cfg, + sender, + importer, + 
region_scheduler, + coprocessor_host, + engine: engine.clone(), + router: router.clone(), + store_id: 2, + pending_create_peers, + }; + system.spawn("test-conf-change".to_owned(), builder); + + router.schedule_task(1, Msg::Registration(reg.dup())); + + let mut index_id = 1; + let epoch = reg.region.get_region_epoch().to_owned(); + + // Write some data. + let (capture_tx, capture_rx) = mpsc::channel(); + let put_entry = EntryBuilder::new(index_id, 1) + .put(b"k1", b"v1") + .epoch(epoch.get_conf_ver(), epoch.get_version()) + .build(); + router.schedule_task( + 1, + Msg::apply(apply( + peer_id, + 1, + 1, + vec![put_entry], + vec![cb(index_id, 1, capture_tx)], + )), + ); + let resp = capture_rx.recv_timeout(Duration::from_secs(3)).unwrap(); + assert!(!resp.get_header().has_error(), "{:?}", resp); + let initial_state: RaftApplyState = engine + .get_msg_cf(CF_RAFT, &keys::apply_state_key(1)) + .unwrap() + .unwrap(); + assert_ne!(initial_state.get_applied_index(), 0); + match apply_res_rx.recv_timeout(Duration::from_secs(3)) { + Ok(PeerMsg::ApplyRes { + res: TaskRes::Apply(apply_res), + }) => assert_eq!(apply_res.apply_state, initial_state), + e => panic!("unexpected result: {:?}", e), + } + index_id += 1; + + // Remove itself. 
+ let (capture_tx, capture_rx) = mpsc::channel(); + let mut remove_node = ChangePeerRequest::default(); + remove_node.set_change_type(ConfChangeType::RemoveNode); + remove_node.set_peer(new_peer(2, 3)); + let conf_change = EntryBuilder::new(index_id, 1) + .conf_change(vec![remove_node]) + .epoch(epoch.get_conf_ver(), epoch.get_version()) + .build(); + router.schedule_task( + 1, + Msg::apply(apply( + peer_id, + 1, + 1, + vec![conf_change], + vec![cb_conf_change(index_id, 1, capture_tx)], + )), + ); + let resp = capture_rx.recv_timeout(Duration::from_secs(3)).unwrap(); + assert!(!resp.get_header().has_error(), "{:?}", resp); + + let apply_state: RaftApplyState = engine + .get_msg_cf(CF_RAFT, &keys::apply_state_key(1)) + .unwrap() + .unwrap(); + match apply_res_rx.recv_timeout(Duration::from_secs(3)) { + Ok(PeerMsg::ApplyRes { + res: TaskRes::Apply(apply_res), + }) => assert_eq!(apply_res.apply_state, apply_state), + e => panic!("unexpected result: {:?}", e), + } + assert!( + apply_state.get_applied_index() > initial_state.get_applied_index(), + "\n{:?}\n{:?}", + apply_state, + initial_state + ); + + system.shutdown(); + } + #[test] fn pending_cmd_leak() { let res = panic_hook::recover_safe(|| { diff --git a/components/raftstore/src/store/fsm/peer.rs b/components/raftstore/src/store/fsm/peer.rs index da91e26eb09..7c33bf66b87 100644 --- a/components/raftstore/src/store/fsm/peer.rs +++ b/components/raftstore/src/store/fsm/peer.rs @@ -41,7 +41,7 @@ use kvproto::{ replication_modepb::{DrAutoSyncState, ReplicationMode}, }; use parking_lot::RwLockWriteGuard; -use pd_client::{new_bucket_stats, BucketMeta, BucketStat}; +use pd_client::BucketMeta; use protobuf::Message; use raft::{ self, @@ -51,7 +51,7 @@ use raft::{ use smallvec::SmallVec; use tikv_alloc::trace::TraceEvent; use tikv_util::{ - box_err, debug, defer, error, escape, info, is_zero_duration, + box_err, debug, defer, error, escape, info, info_or_debug, is_zero_duration, mpsc::{self, LooseBoundedSender, Receiver}, 
store::{find_peer, find_peer_by_id, is_learner, region_on_same_stores}, sys::disk::DiskUsage, @@ -97,7 +97,7 @@ use crate::{ UnsafeRecoveryState, UnsafeRecoveryWaitApplySyncer, }, util, - util::{is_region_initialized, KeysInfoFormatter, LeaseState}, + util::{KeysInfoFormatter, LeaseState}, worker::{ Bucket, BucketRange, CleanupTask, ConsistencyCheckTask, GcSnapshotTask, RaftlogGcTask, ReadDelegate, ReadProgress, RegionTask, SplitCheckTask, @@ -206,7 +206,7 @@ where let callback = match msg { PeerMsg::RaftCommand(cmd) => cmd.callback, PeerMsg::CasualMessage(CasualMessage::SplitRegion { callback, .. }) => callback, - PeerMsg::RaftMessage(im) => { + PeerMsg::RaftMessage(im, _) => { raft_messages_size += im.heap_size; continue; } @@ -286,6 +286,7 @@ where region, meta_peer, wait_data, + None, )?, tick_registry: [false; PeerTick::VARIANT_COUNT], missing_ticks: 0, @@ -316,12 +317,16 @@ where engines: Engines, region_id: u64, peer: metapb::Peer, + create_by_peer: metapb::Peer, ) -> Result> { // We will remove tombstone key when apply snapshot info!( "replicate peer"; "region_id" => region_id, "peer_id" => peer.get_id(), + "store_id" => store_id, + "create_by_peer_id" => create_by_peer.get_id(), + "create_by_peer_store_id" => create_by_peer.get_store_id(), ); let mut region = metapb::Region::default(); @@ -341,6 +346,7 @@ where ®ion, peer, false, + Some(create_by_peer), )?, tick_registry: [false; PeerTick::VARIANT_COUNT], missing_ticks: 0, @@ -616,10 +622,16 @@ where let count = msgs.len(); for m in msgs.drain(..) 
{ match m { - PeerMsg::RaftMessage(msg) => { + PeerMsg::RaftMessage(msg, sent_time) => { + if let Some(sent_time) = sent_time { + let wait_time = sent_time.saturating_elapsed().as_secs_f64(); + self.ctx.raft_metrics.process_wait_time.observe(wait_time); + } + if !self.ctx.coprocessor_host.on_raft_message(&msg.msg) { continue; } + if let Err(e) = self.on_raft_message(msg) { error!(%e; "handle raft message err"; @@ -823,6 +835,8 @@ where target_index: self.fsm.peer.raft_group.raft.raft_log.last_index(), demote_after_exit: true, }); + } else { + self.fsm.peer.unsafe_recovery_state = Some(UnsafeRecoveryState::Failed); } } else { self.unsafe_recovery_demote_failed_voters(syncer, failed_voters); @@ -862,6 +876,8 @@ where target_index: self.fsm.peer.raft_group.raft.raft_log.last_index(), demote_after_exit: false, }); + } else { + self.fsm.peer.unsafe_recovery_state = Some(UnsafeRecoveryState::Failed); } } else { warn!( @@ -912,13 +928,22 @@ where self.fsm.peer.raft_group.raft.raft_log.committed }; - self.fsm.peer.unsafe_recovery_state = Some(UnsafeRecoveryState::WaitApply { - target_index, - syncer, - }); - self.fsm - .peer - .unsafe_recovery_maybe_finish_wait_apply(/* force= */ self.fsm.stopped); + if target_index > self.fsm.peer.raft_group.raft.raft_log.applied { + info!( + "Unsafe recovery, start wait apply"; + "region_id" => self.region().get_id(), + "peer_id" => self.fsm.peer_id(), + "target_index" => target_index, + "applied" => self.fsm.peer.raft_group.raft.raft_log.applied, + ); + self.fsm.peer.unsafe_recovery_state = Some(UnsafeRecoveryState::WaitApply { + target_index, + syncer, + }); + self.fsm + .peer + .unsafe_recovery_maybe_finish_wait_apply(/* force= */ self.fsm.stopped); + } } // func be invoked firstly after assigned leader by BR, wait all leader apply to @@ -1048,8 +1073,15 @@ where split_keys, callback, source, + share_source_region_size, } => { - self.on_prepare_split_region(region_epoch, split_keys, callback, &source); + self.on_prepare_split_region( 
+ region_epoch, + split_keys, + callback, + &source, + share_source_region_size, + ); } CasualMessage::ComputeHashResult { index, @@ -1058,11 +1090,11 @@ where } => { self.on_hash_computed(index, context, hash); } - CasualMessage::RegionApproximateSize { size } => { - self.on_approximate_region_size(size); + CasualMessage::RegionApproximateSize { size, splitable } => { + self.on_approximate_region_size(size, splitable); } - CasualMessage::RegionApproximateKeys { keys } => { - self.on_approximate_region_keys(keys); + CasualMessage::RegionApproximateKeys { keys, splitable } => { + self.on_approximate_region_keys(keys, splitable); } CasualMessage::RefreshRegionBuckets { region_epoch, @@ -1340,9 +1372,7 @@ where } fn on_clear_region_size(&mut self) { - self.fsm.peer.approximate_size = None; - self.fsm.peer.approximate_keys = None; - self.fsm.peer.may_skip_split_check = false; + self.fsm.peer.split_check_trigger.on_clear_region_size(); self.register_split_region_check_tick(); } @@ -1458,7 +1488,7 @@ where } => { self.on_enter_pre_force_leader(syncer, failed_stores); } - SignificantMsg::ExitForceLeaderState => self.on_exit_force_leader(), + SignificantMsg::ExitForceLeaderState => self.on_exit_force_leader(false), SignificantMsg::UnsafeRecoveryDemoteFailedVoters { syncer, failed_voters, @@ -1692,10 +1722,19 @@ where self.fsm.has_ready = true; } - fn on_exit_force_leader(&mut self) { + fn on_exit_force_leader(&mut self, force: bool) { if self.fsm.peer.force_leader.is_none() { return; } + if let Some(UnsafeRecoveryState::Failed) = self.fsm.peer.unsafe_recovery_state && !force { + // Skip force leader if the plan failed, so wait for the next retry of plan with force leader state holding + info!( + "skip exiting force leader state"; + "region_id" => self.fsm.region_id(), + "peer_id" => self.fsm.peer_id(), + ); + return; + } info!( "exit force leader state"; @@ -1704,7 +1743,7 @@ where ); self.fsm.peer.force_leader = None; // make sure it's not hibernated - 
assert_eq!(self.fsm.hibernate_state.group_state(), GroupState::Ordered); + assert_ne!(self.fsm.hibernate_state.group_state(), GroupState::Idle); // leader lease shouldn't be renewed in force leader state. assert_eq!( self.fsm.peer.leader_lease().inspect(None), @@ -2266,7 +2305,10 @@ where } } // Destroy does not need be processed, the state is cleaned up together with peer. - Some(_) | None => {} + Some(UnsafeRecoveryState::Destroy { .. }) + | Some(UnsafeRecoveryState::Failed) + | Some(UnsafeRecoveryState::WaitInitialize(..)) + | None => {} } } @@ -2288,10 +2330,11 @@ where return; } let applied_index = res.apply_state.applied_index; - let buckets = self.fsm.peer.region_buckets.as_mut(); - if let (Some(delta), Some(buckets)) = (res.bucket_stat, buckets) { - buckets.merge(&delta); - } + self.fsm + .peer + .region_buckets_info_mut() + .add_bucket_flow(&res.bucket_stat); + self.fsm.has_ready |= self.fsm.peer.post_apply( self.ctx, res.apply_state, @@ -2460,6 +2503,7 @@ where } }); + let is_initialized_peer = self.fsm.peer.is_initialized(); debug!( "handle raft message"; "region_id" => self.region_id(), @@ -2467,6 +2511,7 @@ where "message_type" => %util::MsgType(&msg), "from_peer_id" => msg.get_from_peer().get_id(), "to_peer_id" => msg.get_to_peer().get_id(), + "is_initialized_peer" => is_initialized_peer, ); if self.fsm.peer.pending_remove || self.fsm.stopped { @@ -2849,6 +2894,11 @@ where } fn reset_raft_tick(&mut self, state: GroupState) { + debug!( + "reset raft tick to {:?}", state; + "region_id"=> self.fsm.region_id(), + "peer_id" => self.fsm.peer_id(), + ); self.fsm.reset_hibernate_state(state); self.fsm.missing_ticks = 0; self.fsm.peer.should_wake_up = false; @@ -3664,14 +3714,7 @@ where } let region_id = self.region_id(); - let is_initialized = self.fsm.peer.is_initialized(); - info!( - "starts destroy"; - "region_id" => region_id, - "peer_id" => self.fsm.peer_id(), - "merged_by_target" => merged_by_target, - "is_initialized" => is_initialized, - ); + let 
is_peer_initialized = self.fsm.peer.is_initialized(); // We can't destroy a peer which is handling snapshot. assert!(!self.fsm.peer.is_handling_snapshot()); @@ -3688,27 +3731,40 @@ where .snapshot_recovery_maybe_finish_wait_apply(/* force= */ true); } + (|| { + fail_point!( + "before_destroy_peer_on_peer_1003", + self.fsm.peer.peer_id() == 1003, + |_| {} + ); + })(); let mut meta = self.ctx.store_meta.lock().unwrap(); - let is_region_initialized_in_meta = meta - .regions - .get(®ion_id) - .map_or(false, |region| is_region_initialized(region)); - if !is_initialized && is_region_initialized_in_meta { - let region_in_meta = meta.regions.get(®ion_id).unwrap(); - error!( - "peer is destroyed inconsistently"; - "region_id" => region_id, + let is_latest_initialized = { + if let Some(latest_region_info) = meta.regions.get(®ion_id) { + util::is_region_initialized(latest_region_info) + } else { + false + } + }; + + if !is_peer_initialized && is_latest_initialized { + info!("skip destroy uninitialized peer as it's already initialized in meta"; + "region_id" => self.fsm.region_id(), "peer_id" => self.fsm.peer_id(), - "peers" => ?self.region().get_peers(), "merged_by_target" => merged_by_target, - "is_initialized" => is_initialized, - "is_region_initialized_in_meta" => is_region_initialized_in_meta, - "start_key_in_meta" => log_wrappers::Value::key(region_in_meta.get_start_key()), - "end_key_in_meta" => log_wrappers::Value::key(region_in_meta.get_end_key()), - "peers_in_meta" => ?region_in_meta.get_peers(), ); + return false; } + info!( + "starts destroy"; + "region_id" => self.fsm.region_id(), + "peer_id" => self.fsm.peer_id(), + "merged_by_target" => merged_by_target, + "is_peer_initialized" => is_peer_initialized, + "is_latest_initialized" => is_latest_initialized, + ); + if meta.atomic_snap_regions.contains_key(&self.region_id()) { drop(meta); panic!( @@ -3764,7 +3820,7 @@ where self.ctx.router.close(region_id); self.fsm.stop(); - if is_initialized + if is_peer_initialized 
&& !merged_by_target && meta .region_ranges @@ -3773,6 +3829,7 @@ where { panic!("{} meta corruption detected", self.fsm.peer.tag); } + if meta.regions.remove(®ion_id).is_none() && !merged_by_target { panic!("{} meta corruption detected", self.fsm.peer.tag) } @@ -3785,14 +3842,19 @@ where self.fsm.peer.tag ); } else { + // Remove itself from atomic_snap_regions as it has cleaned both + // data and metadata. let target_region_id = *meta.targets_map.get(®ion_id).unwrap(); - let is_ready = meta - .atomic_snap_regions + meta.atomic_snap_regions .get_mut(&target_region_id) .unwrap() - .get_mut(®ion_id) - .unwrap(); - *is_ready = true; + .remove(®ion_id); + meta.destroyed_region_for_snap.remove(®ion_id); + info!("peer has destroyed, clean up for incoming overlapped snapshot"; + "region_id" => region_id, + "peer_id" => self.fsm.peer_id(), + "target_region_id" => target_region_id, + ); } } @@ -4032,6 +4094,7 @@ where derived: metapb::Region, regions: Vec, new_split_regions: HashMap, + share_source_region_size: bool, ) { fail_point!("on_split", self.ctx.store_id() == 3, |_| {}); @@ -4053,8 +4116,25 @@ where // Roughly estimate the size and keys for new regions. 
let new_region_count = regions.len() as u64; - let estimated_size = self.fsm.peer.approximate_size.map(|v| v / new_region_count); - let estimated_keys = self.fsm.peer.approximate_keys.map(|v| v / new_region_count); + let mut share_size = None; + let mut share_keys = None; + // if share_source_region_size is true, it means the new region contains any + // data from the origin region + if share_source_region_size { + share_size = self + .fsm + .peer + .split_check_trigger + .approximate_size + .map(|v| v / new_region_count); + share_keys = self + .fsm + .peer + .split_check_trigger + .approximate_keys + .map(|v| v / new_region_count); + } + let mut meta = self.ctx.store_meta.lock().unwrap(); meta.set_region( &self.ctx.coprocessor_host, @@ -4064,13 +4144,12 @@ where ); self.fsm.peer.post_split(); - // It's not correct anymore, so set it to false to schedule a split check task. - self.fsm.peer.may_skip_split_check = false; - let is_leader = self.fsm.peer.is_leader(); if is_leader { - self.fsm.peer.approximate_size = estimated_size; - self.fsm.peer.approximate_keys = estimated_keys; + if share_source_region_size { + self.fsm.peer.split_check_trigger.approximate_size = share_size; + self.fsm.peer.split_check_trigger.approximate_keys = share_keys; + } self.fsm.peer.heartbeat_pd(self.ctx); // Notify pd immediately to let it update the region meta. info!( @@ -4098,7 +4177,6 @@ where if meta.region_ranges.remove(&last_key).is_none() { panic!("{} original region should exist", self.fsm.peer.tag); } - let last_region_id = regions.last().unwrap().get_id(); for (new_region, locks) in regions.into_iter().zip(region_locks) { let new_region_id = new_region.get_id(); @@ -4139,6 +4217,7 @@ where // Insert new regions and validation let mut is_uninitialized_peer_exist = false; + let self_store_id = self.ctx.store.get_id(); if let Some(r) = meta.regions.get(&new_region_id) { // Suppose a new node is added by conf change and the snapshot comes slowly. 
// Then, the region splits and the first vote message comes to the new node @@ -4160,6 +4239,7 @@ where "region_id" => new_region_id, "region" => ?new_region, "is_uninitialized_peer_exist" => is_uninitialized_peer_exist, + "store_id" => self_store_id, ); let (sender, mut new_peer) = match PeerFsm::create( @@ -4203,8 +4283,8 @@ where new_peer.has_ready |= campaigned; if is_leader { - new_peer.peer.approximate_size = estimated_size; - new_peer.peer.approximate_keys = estimated_keys; + new_peer.peer.split_check_trigger.approximate_size = share_size; + new_peer.peer.split_check_trigger.approximate_keys = share_keys; *new_peer.peer.txn_ext.pessimistic_locks.write() = locks; // The new peer is likely to become leader, send a heartbeat immediately to // reduce client query miss. @@ -4222,11 +4302,6 @@ where .insert(new_region_id, ReadDelegate::from_peer(new_peer.get_peer())); meta.region_read_progress .insert(new_region_id, new_peer.peer.read_progress.clone()); - if last_region_id == new_region_id { - // To prevent from big region, the right region needs run split - // check again after split. 
- new_peer.peer.size_diff_hint = self.ctx.cfg.region_split_check_diff().0; - } let mailbox = BasicMailbox::new(sender, new_peer, self.ctx.router.state_cnt().clone()); self.ctx.router.register(new_region_id, mailbox); self.ctx @@ -4239,7 +4314,10 @@ where .pending_msgs .swap_remove_front(|m| m.get_to_peer() == &meta_peer) { - let peer_msg = PeerMsg::RaftMessage(InspectedRaftMessage { heap_size: 0, msg }); + let peer_msg = PeerMsg::RaftMessage( + InspectedRaftMessage { heap_size: 0, msg }, + Some(TiInstant::now()), + ); if let Err(e) = self.ctx.router.force_send(new_region_id, peer_msg) { warn!("handle first requset failed"; "region_id" => region_id, "error" => ?e); } @@ -4405,6 +4483,9 @@ where fn schedule_merge(&mut self) -> Result<()> { fail_point!("on_schedule_merge", |_| Ok(())); + fail_point!("on_schedule_merge_ret_err", |_| Err(Error::RegionNotFound( + 1 + ))); let (request, target_id) = { let state = self.fsm.peer.pending_merge_state.as_ref().unwrap(); let expect_region = state.get_target(); @@ -4528,6 +4609,17 @@ where "error_code" => %e.error_code(), ); self.rollback_merge(); + } else if let Some(ForceLeaderState::ForceLeader { .. }) = + &self.fsm.peer.force_leader + { + info!( + "failed to schedule merge, rollback in force leader state"; + "region_id" => self.fsm.region_id(), + "peer_id" => self.fsm.peer_id(), + "err" => %e, + "error_code" => %e.error_code(), + ); + self.rollback_merge(); } } else if !is_learner(&self.fsm.peer.peer) { info!( @@ -4554,6 +4646,7 @@ where } fn on_ready_prepare_merge(&mut self, region: metapb::Region, state: MergeState) { + fail_point!("on_apply_res_prepare_merge"); { let mut meta = self.ctx.store_meta.lock().unwrap(); meta.set_region( @@ -4703,7 +4796,7 @@ where // make approximate size and keys updated in time. // the reason why follower need to update is that there is a issue that after // merge and then transfer leader, the new leader may have stale size and keys. 
- self.fsm.peer.size_diff_hint = self.ctx.cfg.region_split_check_diff().0; + self.fsm.peer.split_check_trigger.reset_skip_check(); self.fsm.peer.reset_region_buckets(); if self.fsm.peer.is_leader() { info!( @@ -4896,6 +4989,7 @@ where "region_id" => self.fsm.region_id(), "peer_id" => self.fsm.peer_id(), "region" => ?region, + "destroy_regions" => ?persist_res.destroy_regions, ); let mut state = self.ctx.global_replication_state.lock().unwrap(); @@ -5031,7 +5125,13 @@ where derived, regions, new_split_regions, - } => self.on_ready_split_region(derived, regions, new_split_regions), + share_source_region_size, + } => self.on_ready_split_region( + derived, + regions, + new_split_regions, + share_source_region_size, + ), ExecResult::PrepareMerge { region, state } => { self.on_ready_prepare_merge(region, state) } @@ -5158,6 +5258,14 @@ where &mut self, msg: &RaftCmdRequest, ) -> Result> { + // failpoint + fail_point!( + "fail_pre_propose_split", + msg.has_admin_request() + && msg.get_admin_request().get_cmd_type() == AdminCmdType::BatchSplit, + |_| Err(Error::Other(box_err!("fail_point"))) + ); + // Check store_id, make sure that the msg is dispatched to the right place. 
if let Err(e) = util::check_store_id(msg.get_header(), self.store_id()) { self.ctx @@ -5193,7 +5301,8 @@ where // error-prone if !(msg.has_admin_request() && (msg.get_admin_request().get_cmd_type() == AdminCmdType::ChangePeer - || msg.get_admin_request().get_cmd_type() == AdminCmdType::ChangePeerV2)) + || msg.get_admin_request().get_cmd_type() == AdminCmdType::ChangePeerV2 + || msg.get_admin_request().get_cmd_type() == AdminCmdType::RollbackMerge)) { return Err(Error::RecoveryInProgress(self.region_id())); } @@ -5381,7 +5490,10 @@ where return; } Err(e) => { - debug!( + // log for admin requests + let is_admin_request = msg.has_admin_request(); + info_or_debug!( + is_admin_request; "failed to propose"; "region_id" => self.region_id(), "peer_id" => self.fsm.peer_id(), @@ -5749,14 +5861,16 @@ where // whether the region should split. // We assume that `may_skip_split_check` is only set true after the split check // task is scheduled. - if self.fsm.peer.may_skip_split_check - && self.fsm.peer.compaction_declined_bytes < self.ctx.cfg.region_split_check_diff().0 - && self.fsm.peer.size_diff_hint < self.ctx.cfg.region_split_check_diff().0 + if self + .fsm + .peer + .split_check_trigger + .should_skip(self.ctx.cfg.region_split_check_diff().0) { return; } - fail_point!("on_split_region_check_tick"); + fail_point!("on_split_region_check_tick", |_| {}); self.register_split_region_check_tick(); // To avoid frequent scan, we only add new scan tasks if all previous tasks @@ -5766,6 +5880,11 @@ where return; } + // To avoid run the check if it's splitting. + if self.fsm.peer.is_splitting() { + return; + } + // When Lightning or BR is importing data to TiKV, their ingest-request may fail // because of region-epoch not matched. So we hope TiKV do not check region size // and split region during importing. @@ -5804,10 +5923,7 @@ where ); return; } - self.fsm.peer.size_diff_hint = 0; - self.fsm.peer.compaction_declined_bytes = 0; - // the task is scheduled, next tick may skip it. 
- self.fsm.peer.may_skip_split_check = true; + self.fsm.peer.split_check_trigger.post_triggered(); } fn on_prepare_split_region( @@ -5816,6 +5932,7 @@ where split_keys: Vec>, cb: Callback, source: &str, + share_source_region_size: bool, ) { info!( "on split"; @@ -5861,6 +5978,7 @@ where split_keys, peer: self.fsm.peer.peer.clone(), right_derive: self.ctx.cfg.right_derive_when_split, + share_source_region_size, callback: cb, }; if let Err(ScheduleError::Stopped(t)) = self.ctx.pd_scheduler.schedule(task) { @@ -5881,15 +5999,21 @@ where } } - fn on_approximate_region_size(&mut self, size: u64) { - self.fsm.peer.approximate_size = Some(size); + fn on_approximate_region_size(&mut self, size: Option, splitable: Option) { + self.fsm + .peer + .split_check_trigger + .on_approximate_region_size(size, splitable); self.register_split_region_check_tick(); self.register_pd_heartbeat_tick(); fail_point!("on_approximate_region_size"); } - fn on_approximate_region_keys(&mut self, keys: u64) { - self.fsm.peer.approximate_keys = Some(keys); + fn on_approximate_region_keys(&mut self, keys: Option, splitable: Option) { + self.fsm + .peer + .split_check_trigger + .on_approximate_region_keys(keys, splitable); self.register_split_region_check_tick(); self.register_pd_heartbeat_tick(); } @@ -5897,7 +6021,7 @@ where fn on_refresh_region_buckets( &mut self, region_epoch: RegionEpoch, - mut buckets: Vec, + buckets: Vec, bucket_ranges: Option>, _cb: Callback, ) { @@ -5912,27 +6036,6 @@ where } }; - // bucket version layout - // term logical counter - // |-----------|-----------| - // high bits low bits - // term: given 10s election timeout, the 32 bit means 1362 year running time - let gen_bucket_version = |term, current_version| { - let current_version_term = current_version >> 32; - let bucket_version: u64 = if current_version_term == term { - current_version + 1 - } else { - if term > u32::MAX.into() { - error!( - "unexpected term {} more than u32::MAX. 
Bucket version will be backward.", - term - ); - } - term << 32 - }; - bucket_version - }; - let region = self.fsm.peer.region(); if util::is_epoch_stale(®ion_epoch, region.get_region_epoch()) { info!( @@ -5946,14 +6049,14 @@ where // test purpose #[cfg(any(test, feature = "testexport"))] { - let default_buckets = BucketStat::default(); test_only_callback( _cb, self.fsm .peer - .region_buckets + .region_buckets_info() + .bucket_stat() .as_ref() - .unwrap_or(&default_buckets) + .unwrap() .meta .clone(), ); @@ -5961,108 +6064,54 @@ where return; } - let mut current_version = self + let current_version = self.fsm.peer.region_buckets_info().version(); + let next_bucket_version = util::gen_bucket_version(self.fsm.peer.term(), current_version); + let region = self.region().clone(); + let change_bucket_version = self + .fsm + .peer + .region_buckets_info_mut() + .on_refresh_region_buckets( + &self.ctx.coprocessor_host.cfg, + next_bucket_version, + buckets, + region_epoch, + ®ion, + bucket_ranges, + ); + let region_buckets = self .fsm .peer - .region_buckets + .region_buckets_info() + .bucket_stat() .as_ref() - .map(|b| b.meta.version) - .unwrap_or_default(); - if current_version == 0 { - current_version = self - .fsm - .peer - .last_region_buckets - .as_ref() - .map(|b| b.meta.version) - .unwrap_or_default(); - } - let mut region_buckets: BucketStat; - if let Some(bucket_ranges) = bucket_ranges { - assert_eq!(buckets.len(), bucket_ranges.len()); - let mut i = 0; - region_buckets = self.fsm.peer.region_buckets.clone().unwrap(); - let mut meta = (*region_buckets.meta).clone(); - if !buckets.is_empty() { - meta.version = gen_bucket_version(self.fsm.peer.term(), current_version); - } - meta.region_epoch = region_epoch; - for (bucket, bucket_range) in buckets.into_iter().zip(bucket_ranges) { - while i < meta.keys.len() && meta.keys[i] != bucket_range.0 { - i += 1; - } - assert!(i != meta.keys.len()); - // the bucket size is small and does not have split keys, - // then it 
should be merged with its left neighbor - let region_bucket_merge_size = - self.ctx.coprocessor_host.cfg.region_bucket_merge_size_ratio - * (self.ctx.coprocessor_host.cfg.region_bucket_size.0 as f64); - if bucket.keys.is_empty() && bucket.size <= (region_bucket_merge_size as u64) { - meta.sizes[i] = bucket.size; - // i is not the last entry (which is end key) - assert!(i < meta.keys.len() - 1); - // the region has more than one bucket - // and the left neighbor + current bucket size is not very big - if meta.keys.len() > 2 - && i != 0 - && meta.sizes[i - 1] + bucket.size - < self.ctx.coprocessor_host.cfg.region_bucket_size.0 * 2 - { - // bucket is too small - region_buckets.left_merge(i); - meta.left_merge(i); - continue; - } - } else { - // update size - meta.sizes[i] = bucket.size / (bucket.keys.len() + 1) as u64; - // insert new bucket keys (split the original bucket) - for bucket_key in bucket.keys { - i += 1; - region_buckets.split(i); - meta.split(i, bucket_key); - } - } - i += 1; - } - region_buckets.meta = Arc::new(meta); - } else { - debug!( - "refresh_region_buckets re-generates buckets"; + .unwrap() + .clone(); + let buckets_count = region_buckets.meta.keys.len() - 1; + if change_bucket_version { + // TODO: we may need to make it debug once the coprocessor timeout is resolved. 
+ info!( + "finished on_refresh_region_buckets"; "region_id" => self.fsm.region_id(), + "buckets_count" => buckets_count, + "buckets_size" => ?region_buckets.meta.sizes, ); - assert_eq!(buckets.len(), 1); - let bucket_keys = buckets.pop().unwrap().keys; - let bucket_count = bucket_keys.len() + 1; - - let mut meta = BucketMeta { - region_id: self.fsm.region_id(), - region_epoch, - version: gen_bucket_version(self.fsm.peer.term(), current_version), - keys: bucket_keys, - sizes: vec![self.ctx.coprocessor_host.cfg.region_bucket_size.0; bucket_count], - }; - meta.keys.insert(0, region.get_start_key().to_vec()); - meta.keys.push(region.get_end_key().to_vec()); - region_buckets = BucketStat::from_meta(Arc::new(meta)); + } else { + // it means the buckets key range not any change, so don't need to refresh. + #[cfg(any(test, feature = "testexport"))] + test_only_callback(_cb, region_buckets.meta); + return; } - - let buckets_count = region_buckets.meta.keys.len() - 1; self.ctx.coprocessor_host.on_region_changed( - region, + self.region(), RegionChangeEvent::UpdateBuckets(buckets_count), self.fsm.peer.get_role(), ); let keys = region_buckets.meta.keys.clone(); - let old_region_buckets: Option = - self.fsm.peer.region_buckets.replace(region_buckets); - self.fsm.peer.last_region_buckets = old_region_buckets; + let version = region_buckets.meta.version; let mut store_meta = self.ctx.store_meta.lock().unwrap(); - let version = self.fsm.peer.region_buckets.as_ref().unwrap().meta.version; if let Some(reader) = store_meta.readers.get_mut(&self.fsm.region_id()) { - reader.update(ReadProgress::region_buckets( - self.fsm.peer.region_buckets.as_ref().unwrap().meta.clone(), - )); + reader.update(ReadProgress::region_buckets(region_buckets.meta.clone())); } // Notify followers to refresh their buckets version @@ -6083,19 +6132,9 @@ where .send_extra_message(extra_msg, &mut self.ctx.trans, &p); } } - - debug!( - "finished on_refresh_region_buckets"; - "region_id" => self.fsm.region_id(), 
- "buckets_count" => buckets_count, - "buckets_size" => ?self.fsm.peer.region_buckets.as_ref().unwrap().meta.sizes, - ); // test purpose #[cfg(any(test, feature = "testexport"))] - test_only_callback( - _cb, - self.fsm.peer.region_buckets.as_ref().unwrap().meta.clone(), - ); + test_only_callback(_cb, region_buckets.meta); } pub fn on_msg_refresh_buckets(&mut self, msg: RaftMessage) { @@ -6122,8 +6161,10 @@ where } fn on_compaction_declined_bytes(&mut self, declined_bytes: u64) { - self.fsm.peer.compaction_declined_bytes += declined_bytes; - if self.fsm.peer.compaction_declined_bytes >= self.ctx.cfg.region_split_check_diff().0 { + self.fsm.peer.split_check_trigger.compaction_declined_bytes += declined_bytes; + if self.fsm.peer.split_check_trigger.compaction_declined_bytes + >= self.ctx.cfg.region_split_check_diff().0 + { UPDATE_REGION_SIZE_BY_COMPACTION_COUNTER.inc(); } self.register_split_region_check_tick(); @@ -6134,50 +6175,11 @@ where if !self.ctx.coprocessor_host.cfg.enable_region_bucket() { return None; } - let region_buckets = self.fsm.peer.region_buckets.as_ref()?; - let stats = ®ion_buckets.stats; - let keys = ®ion_buckets.meta.keys; - - let empty_last_keys = vec![]; - let empty_last_stats = metapb::BucketStats::default(); - let (last_keys, last_stats, stats_reset) = self - .fsm + let region_bucket_max_size = self.ctx.coprocessor_host.cfg.region_bucket_size.0 * 2; + self.fsm .peer - .last_region_buckets - .as_ref() - .map(|b| { - ( - &b.meta.keys, - &b.stats, - region_buckets.create_time != b.create_time, - ) - }) - .unwrap_or((&empty_last_keys, &empty_last_stats, false)); - - let mut bucket_ranges = vec![]; - let mut j = 0; - assert_eq!(keys.len(), stats.write_bytes.len() + 1); - for i in 0..stats.write_bytes.len() { - let mut diff_in_bytes = stats.write_bytes[i]; - while j < last_keys.len() && keys[i] > last_keys[j] { - j += 1; - } - if j < last_keys.len() && keys[i] == last_keys[j] { - if !stats_reset { - diff_in_bytes -= last_stats.write_bytes[j]; - } 
- j += 1; - } - - // if the bucket's write_bytes exceed half of the configured region_bucket_size, - // add it to the bucket_ranges for checking update - let bucket_update_diff_size_threshold = - self.ctx.coprocessor_host.cfg.region_bucket_size.0 / 2; - if diff_in_bytes >= bucket_update_diff_size_threshold { - bucket_ranges.push(BucketRange(keys[i].clone(), keys[i + 1].clone())); - } - } - Some(bucket_ranges) + .region_buckets_info() + .gen_bucket_range_for_update(region_bucket_max_size) } fn on_schedule_half_split_region( @@ -6237,6 +6239,11 @@ where cb(peer_stat); } } + + // only check the suspect buckets, not split region. + if source == "bucket" { + return; + } let task = SplitCheckTask::split_check_key_range( region.clone(), start_key, @@ -6323,13 +6330,6 @@ where return; } - if let Some(ForceLeaderState::ForceLeader { time, .. }) = self.fsm.peer.force_leader { - // Clean up the force leader state after a timeout, since the PD recovery - // process may have been aborted for some reasons. - if time.saturating_elapsed() > UNSAFE_RECOVERY_STATE_TIMEOUT { - self.on_exit_force_leader(); - } - } if let Some(state) = &mut self.fsm.peer.unsafe_recovery_state { let unsafe_recovery_state_timeout_failpoint = || -> bool { fail_point!("unsafe_recovery_state_timeout", |_| true); @@ -6342,6 +6342,15 @@ where { info!("timeout, abort unsafe recovery"; "state" => ?state); state.abort(); + self.fsm.peer.unsafe_recovery_state = None; + } + } + + if let Some(ForceLeaderState::ForceLeader { time, .. }) = self.fsm.peer.force_leader { + // Clean up the force leader state after a timeout, since the PD recovery + // process may have been aborted for some reasons. 
+ if time.saturating_elapsed() > UNSAFE_RECOVERY_STATE_TIMEOUT { + self.on_exit_force_leader(true); } } @@ -6392,19 +6401,26 @@ where fail_point!("peer_check_stale_state", state != StaleState::Valid, |_| {}); match state { StaleState::Valid => (), - StaleState::LeaderMissing => { - warn!( - "leader missing longer than abnormal_leader_missing_duration"; - "region_id" => self.fsm.region_id(), - "peer_id" => self.fsm.peer_id(), - "expect" => %self.ctx.cfg.abnormal_leader_missing_duration, - ); - self.ctx - .raft_metrics - .leader_missing - .lock() - .unwrap() - .insert(self.region_id()); + StaleState::LeaderMissing | StaleState::MaybeLeaderMissing => { + if state == StaleState::LeaderMissing { + warn!( + "leader missing longer than abnormal_leader_missing_duration"; + "region_id" => self.fsm.region_id(), + "peer_id" => self.fsm.peer_id(), + "expect" => %self.ctx.cfg.abnormal_leader_missing_duration, + ); + self.ctx + .raft_metrics + .leader_missing + .lock() + .unwrap() + .insert(self.region_id()); + } + + // It's very likely that this is a stale peer. To prevent + // resolved ts from being blocked for too long, we check stale + // peer eagerly. 
+ self.fsm.peer.bcast_check_stale_peer_message(self.ctx); } StaleState::ToValidate => { // for peer B in case 1 above @@ -6471,7 +6487,7 @@ where fn on_report_region_buckets_tick(&mut self) { if !self.fsm.peer.is_leader() - || self.fsm.peer.region_buckets.is_none() + || self.fsm.peer.region_buckets_info().bucket_stat().is_none() || self.fsm.hibernate_state.group_state() == GroupState::Idle { return; @@ -6479,11 +6495,11 @@ where let region_id = self.region_id(); let peer_id = self.fsm.peer_id(); - let region_buckets = self.fsm.peer.region_buckets.as_mut().unwrap(); + let region_buckets = self.fsm.peer.region_buckets_info_mut().report_bucket_stat(); if let Err(e) = self .ctx .pd_scheduler - .schedule(PdTask::ReportBuckets(region_buckets.clone())) + .schedule(PdTask::ReportBuckets(region_buckets)) { error!( "failed to report region buckets"; @@ -6492,8 +6508,6 @@ where "err" => ?e, ); } - // todo: it will delete in next pr. - region_buckets.stats = new_bucket_stats(®ion_buckets.meta); self.register_report_region_buckets_tick(); } @@ -6562,17 +6576,14 @@ where size += sst.total_bytes; keys += sst.total_kvs; } - self.fsm.peer.approximate_size = - Some(self.fsm.peer.approximate_size.unwrap_or_default() + size); - self.fsm.peer.approximate_keys = - Some(self.fsm.peer.approximate_keys.unwrap_or_default() + keys); + self.fsm + .peer + .split_check_trigger + .on_ingest_sst_result(size, keys); - if let Some(buckets) = &mut self.fsm.peer.region_buckets { + if let Some(buckets) = &mut self.fsm.peer.region_buckets_info_mut().bucket_stat_mut() { buckets.ingest_sst(keys, size); } - // The ingested file may be overlapped with the data in engine, so we need to - // check it again to get the accurate value. 
- self.fsm.peer.may_skip_split_check = false; if self.fsm.peer.is_leader() { self.on_pd_heartbeat_tick(); self.register_split_region_check_tick(); diff --git a/components/raftstore/src/store/fsm/store.rs b/components/raftstore/src/store/fsm/store.rs index c21ea65a589..9c3274d7945 100644 --- a/components/raftstore/src/store/fsm/store.rs +++ b/components/raftstore/src/store/fsm/store.rs @@ -14,7 +14,7 @@ use std::{ atomic::{AtomicU64, Ordering}, Arc, Mutex, }, - time::{Duration, Instant}, + time::{Duration, Instant, SystemTime}, u64, }; @@ -36,7 +36,6 @@ use futures::{compat::Future01CompatExt, FutureExt}; use grpcio_health::HealthService; use keys::{self, data_end_key, data_key, enc_end_key, enc_start_key}; use kvproto::{ - import_sstpb::{SstMeta, SwitchMode}, metapb::{self, Region, RegionEpoch}, pdpb::{self, QueryStats, StoreStats}, raft_cmdpb::{AdminCmdType, AdminRequest}, @@ -60,8 +59,11 @@ use tikv_util::{ mpsc::{self, LooseBoundedSender, Receiver}, slow_log, store::{find_peer, region_on_stores}, - sys as sys_util, - sys::disk::{get_disk_status, DiskUsage}, + sys::{ + self as sys_util, + cpu_time::ProcessStat, + disk::{get_disk_status, DiskUsage}, + }, time::{duration_to_sec, monotonic_raw_now, Instant as TiInstant}, timer::SteadyTimer, warn, @@ -91,7 +93,7 @@ use crate::{ ApplyBatchSystem, ApplyNotifier, ApplyPollerBuilder, ApplyRes, ApplyRouter, ApplyTaskRes, }, - local_metrics::RaftMetrics, + local_metrics::{IoType as InspectIoType, RaftMetrics}, memory::*, metrics::*, peer_storage, @@ -105,9 +107,10 @@ use crate::{ ReadDelegate, RefreshConfigRunner, RefreshConfigTask, RegionRunner, RegionTask, SplitCheckTask, }, - Callback, CasualMessage, CompactThreshold, GlobalReplicationState, InspectedRaftMessage, - MergeResultKind, PdTask, PeerMsg, PeerTick, RaftCommand, SignificantMsg, SnapManager, - StoreMsg, StoreTick, + worker_metrics::PROCESS_STAT_CPU_USAGE, + Callback, CasualMessage, CompactThreshold, FullCompactController, GlobalReplicationState, + 
InspectedRaftMessage, MergeResultKind, PdTask, PeerMsg, PeerTick, RaftCommand, + SignificantMsg, SnapManager, StoreMsg, StoreTick, }, Error, Result, }; @@ -118,6 +121,13 @@ pub const PENDING_MSG_CAP: usize = 100; pub const ENTRY_CACHE_EVICT_TICK_DURATION: Duration = Duration::from_secs(1); pub const MULTI_FILES_SNAPSHOT_FEATURE: Feature = Feature::require(6, 1, 0); // it only makes sense for large region +// Every 30 minutes, check if we can run full compaction. This allows the config +// setting `periodic_full_compact_start_times` to be changed dynamically. +const PERIODIC_FULL_COMPACT_TICK_INTERVAL_DURATION: Duration = Duration::from_secs(30 * 60); +// If periodic full compaction is enabled (`periodic_full_compact_start_times` +// is set), sample load metrics every 10 minutes. +const LOAD_STATS_WINDOW_DURATION: Duration = Duration::from_secs(10 * 60); + pub struct StoreInfo { pub kv_engine: EK, pub raft_engine: ER, @@ -383,7 +393,10 @@ where for e in msg.get_message().get_entries() { heap_size += bytes_capacity(&e.data) + bytes_capacity(&e.context); } - let peer_msg = PeerMsg::RaftMessage(InspectedRaftMessage { heap_size, msg }); + let peer_msg = PeerMsg::RaftMessage( + InspectedRaftMessage { heap_size, msg }, + Some(TiInstant::now()), + ); let event = TraceEvent::Add(heap_size); let send_failed = Cell::new(true); @@ -398,13 +411,13 @@ where send_failed.set(false); return Ok(()); } - Either::Left(Err(TrySendError::Full(PeerMsg::RaftMessage(im)))) => { + Either::Left(Err(TrySendError::Full(PeerMsg::RaftMessage(im, _)))) => { return Err(TrySendError::Full(im.msg)); } - Either::Left(Err(TrySendError::Disconnected(PeerMsg::RaftMessage(im)))) => { + Either::Left(Err(TrySendError::Disconnected(PeerMsg::RaftMessage(im, _)))) => { return Err(TrySendError::Disconnected(im.msg)); } - Either::Right(PeerMsg::RaftMessage(im)) => StoreMsg::RaftMessage(im), + Either::Right(PeerMsg::RaftMessage(im, _)) => StoreMsg::RaftMessage(im), _ => unreachable!(), }; match 
self.send_control(store_msg) { @@ -468,10 +481,6 @@ where self.update_trace(); } - pub fn clear_cache(&self) { - self.router.clear_cache(); - } - fn update_trace(&self) { let router_trace = self.router.trace(); MEMTRACE_RAFT_ROUTER_ALIVE.trace(TraceEvent::Reset(router_trace.alive)); @@ -527,7 +536,7 @@ where pub region_scheduler: Scheduler>, pub apply_router: ApplyRouter, pub router: RaftRouter, - pub importer: Arc, + pub importer: Arc>, pub store_meta: Arc>, pub feature_gate: FeatureGate, /// region_id -> (peer_id, is_splitting) @@ -577,6 +586,8 @@ where pub pending_latency_inspect: Vec, pub safe_point: Arc, + + pub process_stat: Option, } impl PollContext @@ -676,6 +687,8 @@ where "region_id" => region_id, "current_region_epoch" => ?cur_epoch, "msg_type" => ?msg_type, + "to_peer_id" => ?from_peer.get_id(), + "to_peer_store_id" => ?from_peer.get_store_id(), ); self.raft_metrics.message_dropped.stale_msg.inc(); @@ -694,6 +707,8 @@ where error!(?e; "send gc message failed"; "region_id" => region_id, + "to_peer_id" => ?from_peer.get_id(), + "to_peer_store_id" => ?from_peer.get_store_id(), ); } } @@ -770,8 +785,11 @@ impl<'a, EK: KvEngine + 'static, ER: RaftEngine + 'static, T: Transport> StoreTick::SnapGc => self.on_snap_mgr_gc(), StoreTick::CompactLockCf => self.on_compact_lock_cf(), StoreTick::CompactCheck => self.on_compact_check_tick(), + StoreTick::PeriodicFullCompact => self.on_full_compact_tick(), + StoreTick::LoadMetricsWindow => self.on_load_metrics_window_tick(), StoreTick::ConsistencyCheck => self.on_consistency_check_tick(), StoreTick::CleanupImportSst => self.on_cleanup_import_sst_tick(), + StoreTick::PdReportMinResolvedTs => self.on_pd_report_min_resolved_ts_tick(), } let elapsed = timer.saturating_elapsed(); self.ctx @@ -814,9 +832,6 @@ impl<'a, EK: KvEngine + 'static, ER: RaftEngine + 'static, T: Transport> } } StoreMsg::CompactedEvent(event) => self.on_compaction_finished(event), - StoreMsg::ValidateSstResult { invalid_ssts } => { - 
self.on_validate_sst_result(invalid_ssts) - } StoreMsg::ClearRegionSizeInRange { start_key, end_key } => { self.clear_region_size_in_range(&start_key, &end_key) } @@ -832,6 +847,14 @@ impl<'a, EK: KvEngine + 'static, ER: RaftEngine + 'static, T: Transport> mut inspector, } => { inspector.record_store_wait(send_time.saturating_elapsed()); + inspector.record_store_commit( + self.ctx + .raft_metrics + .health_stats + .avg(InspectIoType::Network), + ); + // Reset the health_stats and wait it to be refreshed in the next tick. + self.ctx.raft_metrics.health_stats.reset(); self.ctx.pending_latency_inspect.push(inspector); } StoreMsg::UnsafeRecoveryReport(report) => self.store_heartbeat_pd(Some(report)), @@ -863,7 +886,10 @@ impl<'a, EK: KvEngine + 'static, ER: RaftEngine + 'static, T: Transport> self.fsm.store.start_time = Some(time::get_time()); self.register_cleanup_import_sst_tick(); self.register_compact_check_tick(); + self.register_full_compact_tick(); + self.register_load_metrics_window_tick(); self.register_pd_store_heartbeat_tick(); + self.register_pd_report_min_resolved_ts_tick(); self.register_compact_lock_cf_tick(); self.register_snap_mgr_gc_tick(); self.register_consistency_check_tick(); @@ -1188,7 +1214,7 @@ pub struct RaftPollerBuilder { pub region_scheduler: Scheduler>, apply_router: ApplyRouter, pub router: RaftRouter, - pub importer: Arc, + pub importer: Arc>, pub store_meta: Arc>, pub pending_create_peers: Arc>>, snap_mgr: SnapManager, @@ -1460,6 +1486,7 @@ where sync_write_worker, pending_latency_inspect: vec![], safe_point: self.safe_point.clone(), + process_stat: None, }; ctx.update_ticks_timeout(); let tag = format!("[store {}]", ctx.store.get_id()); @@ -1577,7 +1604,7 @@ impl RaftBatchSystem { pd_worker: LazyWorker>, store_meta: Arc>, coprocessor_host: CoprocessorHost, - importer: Arc, + importer: Arc>, split_check_scheduler: Scheduler, background_worker: Worker, auto_split_controller: AutoSplitController, @@ -1617,7 +1644,7 @@ impl RaftBatchSystem 
{ } else { None }; - + let bgworker_remote = background_worker.remote(); let workers = Workers { pd_worker, background_worker, @@ -1655,13 +1682,8 @@ impl RaftBatchSystem { ReadRunner::new(self.router.clone(), engines.raft.clone()), ); - let compact_runner = CompactRunner::new(engines.kv.clone()); - let cleanup_sst_runner = CleanupSstRunner::new( - meta.get_id(), - self.router.clone(), - Arc::clone(&importer), - Arc::clone(&pd_client), - ); + let compact_runner = CompactRunner::new(engines.kv.clone(), bgworker_remote); + let cleanup_sst_runner = CleanupSstRunner::new(Arc::clone(&importer)); let gc_snapshot_runner = GcSnapshotRunner::new( meta.get_id(), self.router.clone(), // RaftRouter @@ -1687,7 +1709,6 @@ impl RaftBatchSystem { &cfg, )?; - let region_read_progress = store_meta.lock().unwrap().region_read_progress.clone(); let mut builder = RaftPollerBuilder { cfg, store: meta, @@ -1724,7 +1745,6 @@ impl RaftBatchSystem { mgr, pd_client, collector_reg_handle, - region_read_progress, health_service, causal_ts_provider, snap_generator_pool, @@ -1743,7 +1763,6 @@ impl RaftBatchSystem { snap_mgr: SnapManager, pd_client: Arc, collector_reg_handle: CollectorRegHandle, - region_read_progress: RegionReadProgressRegistry, health_service: Option, causal_ts_provider: Option>, // used for rawkv apiv2 snap_generator_pool: FuturePool, @@ -1835,7 +1854,6 @@ impl RaftBatchSystem { snap_mgr, workers.pd_worker.remote(), collector_reg_handle, - region_read_progress, health_service, coprocessor_host, causal_ts_provider, @@ -1847,8 +1865,6 @@ impl RaftBatchSystem { warn!("set thread priority for raftstore failed"; "error" => ?e); } self.workers = Some(workers); - // This router will not be accessed again, free all caches. 
- self.router.clear_cache(); Ok(()) } @@ -1961,7 +1977,8 @@ impl<'a, EK: KvEngine, ER: RaftEngine, T: Transport> StoreFsmDelegate<'a, EK, ER } info!( "region doesn't exist yet, wait for it to be split"; - "region_id" => region_id + "region_id" => region_id, + "to_peer_id" => msg.get_to_peer().get_id(), ); return Ok(CheckMsgStatus::FirstRequest); } @@ -2081,14 +2098,18 @@ impl<'a, EK: KvEngine, ER: RaftEngine, T: Transport> StoreFsmDelegate<'a, EK, ER }); let region_id = msg.msg.get_region_id(); - let msg = match self.ctx.router.send(region_id, PeerMsg::RaftMessage(msg)) { + let msg = match self + .ctx + .router + .send(region_id, PeerMsg::RaftMessage(msg, None)) + { Ok(()) => { forwarded.set(true); return Ok(()); } Err(TrySendError::Full(_)) => return Ok(()), Err(TrySendError::Disconnected(_)) if self.ctx.router.is_shutdown() => return Ok(()), - Err(TrySendError::Disconnected(PeerMsg::RaftMessage(im))) => im.msg, + Err(TrySendError::Disconnected(PeerMsg::RaftMessage(im, None))) => im.msg, Err(_) => unreachable!(), }; @@ -2160,7 +2181,8 @@ impl<'a, EK: KvEngine, ER: RaftEngine, T: Transport> StoreFsmDelegate<'a, EK, ER check_msg_status == CheckMsgStatus::NewPeerFirst, )? { // Peer created, send the message again. 
- let peer_msg = PeerMsg::RaftMessage(InspectedRaftMessage { heap_size, msg }); + let peer_msg = + PeerMsg::RaftMessage(InspectedRaftMessage { heap_size, msg }, None); if self.ctx.router.send(region_id, peer_msg).is_ok() { forwarded.set(true); } @@ -2183,7 +2205,7 @@ impl<'a, EK: KvEngine, ER: RaftEngine, T: Transport> StoreFsmDelegate<'a, EK, ER store_meta.pending_msgs.push(msg); } else { drop(store_meta); - let peer_msg = PeerMsg::RaftMessage(InspectedRaftMessage { heap_size, msg }); + let peer_msg = PeerMsg::RaftMessage(InspectedRaftMessage { heap_size, msg }, None); if let Err(e) = self.ctx.router.force_send(region_id, peer_msg) { warn!("handle first request failed"; "region_id" => region_id, "error" => ?e); } else { @@ -2382,6 +2404,7 @@ impl<'a, EK: KvEngine, ER: RaftEngine, T: Transport> StoreFsmDelegate<'a, EK, ER self.ctx.engines.clone(), region_id, target.clone(), + msg.get_from_peer().clone(), )?; // WARNING: The checking code must be above this line. @@ -2442,6 +2465,127 @@ impl<'a, EK: KvEngine, ER: RaftEngine, T: Transport> StoreFsmDelegate<'a, EK, ER } } + fn register_load_metrics_window_tick(&self) { + // For now, we will only gather these metrics if periodic full compaction is + // enabled.
+ if !self.ctx.cfg.periodic_full_compact_start_times.is_empty() { + self.ctx + .schedule_store_tick(StoreTick::LoadMetricsWindow, LOAD_STATS_WINDOW_DURATION) + } + } + + fn on_load_metrics_window_tick(&mut self) { + self.register_load_metrics_window_tick(); + + let proc_stat = self + .ctx + .process_stat + .get_or_insert_with(|| ProcessStat::cur_proc_stat().unwrap()); + let cpu_usage: f64 = proc_stat.cpu_usage().unwrap(); + PROCESS_STAT_CPU_USAGE.set(cpu_usage); + } + + fn register_full_compact_tick(&self) { + if !self.ctx.cfg.periodic_full_compact_start_times.is_empty() { + self.ctx.schedule_store_tick( + StoreTick::PeriodicFullCompact, + PERIODIC_FULL_COMPACT_TICK_INTERVAL_DURATION, + ) + } + } + + fn on_full_compact_tick(&mut self) { + self.register_full_compact_tick(); + + let local_time = chrono::Local::now(); + if !self + .ctx + .cfg + .periodic_full_compact_start_times + .is_scheduled_this_hour(&local_time) + { + debug!( + "full compaction may not run at this time"; + "local_time" => ?local_time, + "periodic_full_compact_start_times" => ?self.ctx.cfg.periodic_full_compact_start_times, + ); + return; + } + + let compact_predicate_fn = self.is_low_load_for_full_compact(); + // Do not start if the load is high. + if !compact_predicate_fn() { + return; + } + + let ranges = self.ranges_for_full_compact(); + + let compact_load_controller = + FullCompactController::new(1, 15 * 60, Box::new(compact_predicate_fn)); + + // Attempt executing a periodic full compaction. + // Note that full compaction will not run if another full compact tasks has + // started. + if let Err(e) = self.ctx.cleanup_scheduler.schedule(CleanupTask::Compact( + CompactTask::PeriodicFullCompact { + ranges, + compact_load_controller, + }, + )) { + error!( + "failed to schedule a periodic full compaction"; + "store_id" => self.fsm.store.id, + "err" => ?e + ); + } + } + + /// Use ranges assigned to each region as increments for full compaction. 
+ fn ranges_for_full_compact(&self) -> Vec<(Vec, Vec)> { + let meta = self.ctx.store_meta.lock().unwrap(); + let mut ranges = Vec::with_capacity(meta.regions.len()); + + for region in meta.regions.values() { + let start_key = keys::enc_start_key(region); + let end_key = keys::enc_end_key(region); + ranges.push((start_key, end_key)) + } + ranges + } + + /// Returns a predicate `Fn` which is evaluated: + /// 1. Before full compaction runs: if `false`, we return and wait for the + /// next full compaction tick + /// (`PERIODIC_FULL_COMPACT_TICK_INTERVAL_DURATION`) before starting. If + /// true, we begin full compaction, which means the first incremental range + /// will be compacted. See: ``StoreFsmDelegate::on_full_compact_tick`` + /// in this file. + /// + /// 2. After each incremental range finishes and before next one (if any) + /// starts. If `false`, we pause compaction and wait. See: + /// `CompactRunner::full_compact` in `worker/compact.rs`. + fn is_low_load_for_full_compact(&self) -> impl Fn() -> bool { + let max_start_cpu_usage = self.ctx.cfg.periodic_full_compact_start_max_cpu; + let global_stat = self.ctx.global_stat.clone(); + move || { + if global_stat.stat.is_busy.load(Ordering::SeqCst) { + warn!("full compaction may not run at this time, `is_busy` flag is true",); + return false; + } + + let cpu_usage = PROCESS_STAT_CPU_USAGE.get(); + if cpu_usage > max_start_cpu_usage { + warn!( + "full compaction may not run at this time, cpu usage is above max"; + "cpu_usage" => cpu_usage, + "threshold" => max_start_cpu_usage, + ); + return false; + } + true + } + } + fn register_compact_check_tick(&self) { self.ctx.schedule_store_tick( StoreTick::CompactCheck, @@ -2525,7 +2669,7 @@ impl<'a, EK: KvEngine, ER: RaftEngine, T: Transport> StoreFsmDelegate<'a, EK, ER self.ctx.cfg.region_compact_min_tombstones, self.ctx.cfg.region_compact_tombstones_percent, self.ctx.cfg.region_compact_min_redundant_rows, - self.ctx.cfg.region_compact_redundant_rows_percent, +
self.ctx.cfg.region_compact_redundant_rows_percent(), ), }, )) { @@ -2537,6 +2681,25 @@ impl<'a, EK: KvEngine, ER: RaftEngine, T: Transport> StoreFsmDelegate<'a, EK, ER } } + fn report_min_resolved_ts(&self) { + let read_progress = { + let meta = self.ctx.store_meta.lock().unwrap(); + meta.region_read_progress().clone() + }; + let min_resolved_ts = read_progress.get_min_resolved_ts(); + + let task = PdTask::ReportMinResolvedTs { + store_id: self.fsm.store.id, + min_resolved_ts, + }; + if let Err(e) = self.ctx.pd_scheduler.schedule(task) { + error!("failed to send min resolved ts to pd worker"; + "store_id" => self.fsm.store.id, + "err" => ?e + ); + } + } + fn store_heartbeat_pd(&mut self, report: Option) { let mut stats = StoreStats::default(); @@ -2643,6 +2806,11 @@ impl<'a, EK: KvEngine, ER: RaftEngine, T: Transport> StoreFsmDelegate<'a, EK, ER self.register_pd_store_heartbeat_tick(); } + fn on_pd_report_min_resolved_ts_tick(&mut self) { + self.report_min_resolved_ts(); + self.register_pd_report_min_resolved_ts_tick(); + } + fn on_snap_mgr_gc(&mut self) { // refresh multi_snapshot_files enable flag self.ctx.snap_mgr.set_enable_multi_snapshot_files( @@ -2703,16 +2871,17 @@ impl<'a, EK: KvEngine, ER: RaftEngine, T: Transport> StoreFsmDelegate<'a, EK, ER fn on_wake_up_regions(&self, abnormal_stores: Vec) { info!("try to wake up all hibernated regions in this store"; "to_all" => abnormal_stores.is_empty()); + let store_id = self.ctx.store_id(); let meta = self.ctx.store_meta.lock().unwrap(); - for region_id in meta.regions.keys() { - let region = &meta.regions[region_id]; + + for (region_id, region) in &meta.regions { // Check whether the current region is not found on abnormal stores. If so, // this region is not the target to be awaken. 
if !region_on_stores(region, &abnormal_stores) { continue; } let peer = { - match find_peer(region, self.ctx.store_id()) { + match find_peer(region, store_id) { None => continue, Some(p) => p.clone(), } @@ -2746,6 +2915,13 @@ impl<'a, EK: KvEngine, ER: RaftEngine, T: Transport> StoreFsmDelegate<'a, EK, ER ); } + fn register_pd_report_min_resolved_ts_tick(&self) { + self.ctx.schedule_store_tick( + StoreTick::PdReportMinResolvedTs, + self.ctx.cfg.pd_report_min_resolved_ts_interval.0, + ); + } + fn register_snap_mgr_gc_tick(&self) { self.ctx .schedule_store_tick(StoreTick::SnapGc, self.ctx.cfg.snap_mgr_gc_tick_interval.0) @@ -2759,60 +2935,47 @@ impl<'a, EK: KvEngine, ER: RaftEngine, T: Transport> StoreFsmDelegate<'a, EK, ER } } -impl<'a, EK: KvEngine, ER: RaftEngine, T: Transport> StoreFsmDelegate<'a, EK, ER, T> { - fn on_validate_sst_result(&mut self, ssts: Vec) { - if ssts.is_empty() || self.ctx.importer.get_mode() == SwitchMode::Import { - return; - } - // A stale peer can still ingest a stale Sst before it is - // destroyed. We need to make sure that no stale peer exists. - let mut delete_ssts = Vec::new(); - { - let meta = self.ctx.store_meta.lock().unwrap(); - for sst in ssts { - if !meta.regions.contains_key(&sst.get_region_id()) { - delete_ssts.push(sst); - } - } - } - if delete_ssts.is_empty() { - return; - } - - let task = CleanupSstTask::DeleteSst { ssts: delete_ssts }; - if let Err(e) = self - .ctx - .cleanup_scheduler - .schedule(CleanupTask::CleanupSst(task)) - { - error!( - "schedule to delete ssts failed"; - "store_id" => self.fsm.store.id, - "err" => ?e, - ); - } - } +// we will remove 1-week old version 1 SST files. 
+const VERSION_1_SST_CLEANUP_DURATION: Duration = Duration::from_secs(7 * 24 * 60 * 60); +impl<'a, EK: KvEngine, ER: RaftEngine, T: Transport> StoreFsmDelegate<'a, EK, ER, T> { fn on_cleanup_import_sst(&mut self) -> Result<()> { let mut delete_ssts = Vec::new(); - let mut validate_ssts = Vec::new(); let ssts = box_try!(self.ctx.importer.list_ssts()); if ssts.is_empty() { return Ok(()); } + let now = SystemTime::now(); { let meta = self.ctx.store_meta.lock().unwrap(); for sst in ssts { - if let Some(r) = meta.regions.get(&sst.get_region_id()) { + if let Some(r) = meta.regions.get(&sst.0.get_region_id()) { let region_epoch = r.get_region_epoch(); - if util::is_epoch_stale(sst.get_region_epoch(), region_epoch) { + if util::is_epoch_stale(sst.0.get_region_epoch(), region_epoch) { // If the SST epoch is stale, it will not be ingested anymore. - delete_ssts.push(sst); + delete_ssts.push(sst.0); } + } else if sst.1 >= sst_importer::API_VERSION_2 { + // The write RPC of import sst service has made sure the region does exist at + // the write time, and now the region is not found, + // sst can be deleted because it won't be used by + // ingest in future. + delete_ssts.push(sst.0); } else { - // If the peer doesn't exist, we need to validate the SST through PD. - validate_ssts.push(sst); + // in the old protocol, we can't easily know if the SST will be used in the + // committed raft log, so we only delete the SST + // files that have not been modified for 1 week.
+ if let Ok(duration) = now.duration_since(sst.2) { + if duration > VERSION_1_SST_CLEANUP_DURATION { + warn!( + "found 1-week old SST file of version 1, will delete it"; + "sst_meta" => ?sst.0, + "last_modified" => ?sst.2 + ); + delete_ssts.push(sst.0); + } + } } } } @@ -2832,27 +2995,6 @@ impl<'a, EK: KvEngine, ER: RaftEngine, T: Transport> StoreFsmDelegate<'a, EK, ER } } - // When there is an import job running, the region which this sst belongs may - // has not been split from the origin region because the apply thread is so busy - // that it can not apply SplitRequest as soon as possible. So we can not - // delete this sst file. - if !validate_ssts.is_empty() && self.ctx.importer.get_mode() != SwitchMode::Import { - let task = CleanupSstTask::ValidateSst { - ssts: validate_ssts, - }; - if let Err(e) = self - .ctx - .cleanup_scheduler - .schedule(CleanupTask::CleanupSst(task)) - { - error!( - "schedule to validate ssts failed"; - "store_id" => self.fsm.store.id, - "err" => ?e, - ); - } - } - Ok(()) } diff --git a/components/raftstore/src/store/local_metrics.rs b/components/raftstore/src/store/local_metrics.rs index 5460a57ae0f..dc94a3afbe7 100644 --- a/components/raftstore/src/store/local_metrics.rs +++ b/components/raftstore/src/store/local_metrics.rs @@ -4,7 +4,7 @@ use std::sync::{Arc, Mutex}; use collections::HashSet; -use prometheus::local::LocalHistogram; +use prometheus::local::{LocalHistogram, LocalIntCounter}; use raft::eraftpb::MessageType; use tikv_util::time::{Duration, Instant}; use tracker::{Tracker, TrackerToken, GLOBAL_TRACKERS, INVALID_TRACKER_TOKEN}; @@ -68,35 +68,81 @@ impl RaftSendMessageMetrics { } } +/// Buffered statistics for recording local raftstore message duration. +/// +/// As it's only used for recording local raftstore message duration, +/// and it will be manually reset periodically, so it's not necessary +/// to use `LocalHistogram`.
#[derive(Default)] -pub struct RaftCommitLogStatistics { - pub last_commit_log_duration_sum: Duration, - pub last_commit_log_count_sum: u64, +struct LocalHealthStatistics { + duration_sum: Duration, + count: u64, } -impl RaftCommitLogStatistics { +impl LocalHealthStatistics { #[inline] - pub fn record(&mut self, dur: Duration) { - self.last_commit_log_count_sum += 1; - self.last_commit_log_duration_sum += dur; + fn observe(&mut self, dur: Duration) { + self.count += 1; + self.duration_sum += dur; } #[inline] - pub fn avg(&self) -> Duration { - if self.last_commit_log_count_sum > 0 { - Duration::from_micros( - self.last_commit_log_duration_sum.as_micros() as u64 - / self.last_commit_log_count_sum, - ) + fn avg(&self) -> Duration { + if self.count > 0 { + Duration::from_micros(self.duration_sum.as_micros() as u64 / self.count) } else { Duration::default() } } #[inline] + fn reset(&mut self) { + self.count = 0; + self.duration_sum = Duration::default(); + } +} + +#[repr(u8)] +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum IoType { + Disk = 0, + Network = 1, +} + +/// Buffered statistics for recording the health of raftstore. +#[derive(Default)] +pub struct HealthStatistics { + // represents periodic latency on the disk io. + disk_io_dur: LocalHealthStatistics, + // represents the latency of the network io. + network_io_dur: LocalHealthStatistics, +} + +impl HealthStatistics { + #[inline] + pub fn observe(&mut self, dur: Duration, io_type: IoType) { + match io_type { + IoType::Disk => self.disk_io_dur.observe(dur), + IoType::Network => self.network_io_dur.observe(dur), + } + } + + #[inline] + pub fn avg(&self, io_type: IoType) -> Duration { + match io_type { + IoType::Disk => self.disk_io_dur.avg(), + IoType::Network => self.network_io_dur.avg(), + } + } + + #[inline] + /// Reset HealthStatistics. + /// + /// Should be manually reset when the metrics are + /// accepted by slowness inspector. 
pub fn reset(&mut self) { - self.last_commit_log_count_sum = 0; - self.last_commit_log_duration_sum = Duration::default(); + self.disk_io_dur.reset(); + self.network_io_dur.reset(); } } @@ -112,7 +158,10 @@ pub struct RaftMetrics { // local histogram pub store_time: LocalHistogram, + // the wait time for processing a raft command pub propose_wait_time: LocalHistogram, + // the wait time for processing a raft message + pub process_wait_time: LocalHistogram, pub process_ready: LocalHistogram, pub event_time: RaftEventDurationVec, pub peer_msg_len: LocalHistogram, @@ -130,8 +179,9 @@ pub struct RaftMetrics { pub wf_commit_not_persist_log: LocalHistogram, // local statistics for slowness - pub stat_commit_log: RaftCommitLogStatistics, + pub health_stats: HealthStatistics, + pub check_stale_peer: LocalIntCounter, pub leader_missing: Arc>>, last_flush_time: Instant, @@ -152,6 +202,7 @@ impl RaftMetrics { raft_log_gc_skipped: RaftLogGcSkippedCounterVec::from(&RAFT_LOG_GC_SKIPPED_VEC), store_time: STORE_TIME_HISTOGRAM.local(), propose_wait_time: REQUEST_WAIT_TIME_HISTOGRAM.local(), + process_wait_time: RAFT_MESSAGE_WAIT_TIME_HISTOGRAM.local(), process_ready: PEER_RAFT_PROCESS_DURATION .with_label_values(&["ready"]) .local(), @@ -167,7 +218,8 @@ impl RaftMetrics { wf_persist_log: STORE_WF_PERSIST_LOG_DURATION_HISTOGRAM.local(), wf_commit_log: STORE_WF_COMMIT_LOG_DURATION_HISTOGRAM.local(), wf_commit_not_persist_log: STORE_WF_COMMIT_NOT_PERSIST_LOG_DURATION_HISTOGRAM.local(), - stat_commit_log: RaftCommitLogStatistics::default(), + health_stats: HealthStatistics::default(), + check_stale_peer: CHECK_STALE_PEER_COUNTER.local(), leader_missing: Arc::default(), last_flush_time: Instant::now_coarse(), } @@ -190,6 +242,7 @@ impl RaftMetrics { self.store_time.flush(); self.propose_wait_time.flush(); + self.process_wait_time.flush(); self.process_ready.flush(); self.event_time.flush(); self.peer_msg_len.flush(); @@ -206,6 +259,7 @@ impl RaftMetrics { 
self.wf_commit_not_persist_log.flush(); } + self.check_stale_peer.flush(); let mut missing = self.leader_missing.lock().unwrap(); LEADER_MISSING.set(missing.len() as i64); missing.clear(); diff --git a/components/raftstore/src/store/metrics.rs b/components/raftstore/src/store/metrics.rs index a5aa164e63e..908b650469c 100644 --- a/components/raftstore/src/store/metrics.rs +++ b/components/raftstore/src/store/metrics.rs @@ -214,7 +214,10 @@ make_static_metric! { pub label_enum RaftEventDurationType { compact_check, + periodic_full_compact, + load_metrics_window, pd_store_heartbeat, + pd_report_min_resolved_ts, snap_gc, compact_lock_cf, consistency_check, @@ -551,6 +554,13 @@ lazy_static! { exponential_buckets(0.00001, 2.0, 26).unwrap() ).unwrap(); + pub static ref RAFT_MESSAGE_WAIT_TIME_HISTOGRAM: Histogram = + register_histogram!( + "tikv_raftstore_raft_msg_wait_time_duration_secs", + "Bucketed histogram of raft message wait time duration.", + exponential_buckets(0.00001, 2.0, 26).unwrap() + ).unwrap(); + pub static ref PEER_GC_RAFT_LOG_COUNTER: IntCounter = register_int_counter!( "tikv_raftstore_gc_raft_log_total", @@ -651,6 +661,11 @@ lazy_static! { "Total number of leader missed region." ).unwrap(); + pub static ref CHECK_STALE_PEER_COUNTER: IntCounter = register_int_counter!( + "tikv_raftstore_check_stale_peer", + "Total number of checking stale peers." 
+ ).unwrap(); + pub static ref INGEST_SST_DURATION_SECONDS: Histogram = register_histogram!( "tikv_snapshot_ingest_sst_duration_seconds", diff --git a/components/raftstore/src/store/mod.rs b/components/raftstore/src/store/mod.rs index efd149e7c41..123289c2057 100644 --- a/components/raftstore/src/store/mod.rs +++ b/components/raftstore/src/store/mod.rs @@ -57,8 +57,8 @@ pub use self::{ }, peer::{ can_amend_read, get_sync_log_from_request, make_transfer_leader_response, - propose_read_index, should_renew_lease, Peer, PeerStat, ProposalContext, ProposalQueue, - RequestInspector, RequestPolicy, TRANSFER_LEADER_COMMAND_REPLY_CTX, + propose_read_index, should_renew_lease, DiskFullPeers, Peer, PeerStat, ProposalContext, + ProposalQueue, RequestInspector, RequestPolicy, TRANSFER_LEADER_COMMAND_REPLY_CTX, }, peer_storage::{ clear_meta, do_snapshot, write_initial_apply_state, write_initial_raft_state, @@ -85,14 +85,14 @@ pub use self::{ util::{RegionReadProgress, RegionReadProgressRegistry}, worker::{ metrics as worker_metrics, need_compact, AutoSplitController, BatchComponent, Bucket, - BucketRange, CachedReadDelegate, CheckLeaderRunner, CheckLeaderTask, CompactThreshold, - FlowStatistics, FlowStatsReporter, KeyEntry, LocalReadContext, LocalReader, - LocalReaderCore, PdStatsMonitor, PdTask, ReadDelegate, ReadExecutor, ReadExecutorProvider, - ReadProgress, ReadStats, RefreshConfigTask, RegionTask, SplitCheckRunner, SplitCheckTask, - SplitConfig, SplitConfigManager, SplitInfo, StoreMetaDelegate, StoreStatsReporter, - TrackVer, WriteStats, WriterContoller, BIG_REGION_CPU_OVERLOAD_THRESHOLD_RATIO, - DEFAULT_BIG_REGION_BYTE_THRESHOLD, DEFAULT_BIG_REGION_QPS_THRESHOLD, - DEFAULT_BYTE_THRESHOLD, DEFAULT_QPS_THRESHOLD, NUM_COLLECT_STORE_INFOS_PER_HEARTBEAT, - REGION_CPU_OVERLOAD_THRESHOLD_RATIO, + BucketRange, BucketStatsInfo, CachedReadDelegate, CheckLeaderRunner, CheckLeaderTask, + CompactThreshold, FlowStatistics, FlowStatsReporter, FullCompactController, KeyEntry, + 
LocalReadContext, LocalReader, LocalReaderCore, PdStatsMonitor, PdTask, ReadDelegate, + ReadExecutor, ReadExecutorProvider, ReadProgress, ReadStats, RefreshConfigTask, RegionTask, + SplitCheckRunner, SplitCheckTask, SplitConfig, SplitConfigManager, SplitInfo, + StoreMetaDelegate, StoreStatsReporter, TrackVer, WriteStats, WriterContoller, + BIG_REGION_CPU_OVERLOAD_THRESHOLD_RATIO, DEFAULT_BIG_REGION_BYTE_THRESHOLD, + DEFAULT_BIG_REGION_QPS_THRESHOLD, DEFAULT_BYTE_THRESHOLD, DEFAULT_QPS_THRESHOLD, + NUM_COLLECT_STORE_INFOS_PER_HEARTBEAT, REGION_CPU_OVERLOAD_THRESHOLD_RATIO, }, }; diff --git a/components/raftstore/src/store/msg.rs b/components/raftstore/src/store/msg.rs index 1ed8934e0f0..52aed7d424f 100644 --- a/components/raftstore/src/store/msg.rs +++ b/components/raftstore/src/store/msg.rs @@ -10,7 +10,6 @@ use engine_traits::{CompactedEvent, KvEngine, Snapshot}; use futures::channel::mpsc::UnboundedSender; use kvproto::{ brpb::CheckAdminResponse, - import_sstpb::SstMeta, kvrpcpb::{DiskFullOpt, ExtraOp as TxnExtraOp}, metapb, metapb::RegionEpoch, @@ -436,11 +435,14 @@ impl PeerTick { #[derive(Debug, Clone, Copy)] pub enum StoreTick { CompactCheck, + PeriodicFullCompact, + LoadMetricsWindow, PdStoreHeartbeat, SnapGc, CompactLockCf, ConsistencyCheck, CleanupImportSst, + PdReportMinResolvedTs, } impl StoreTick { @@ -448,11 +450,14 @@ impl StoreTick { pub fn tag(self) -> RaftEventDurationType { match self { StoreTick::CompactCheck => RaftEventDurationType::compact_check, + StoreTick::PeriodicFullCompact => RaftEventDurationType::periodic_full_compact, StoreTick::PdStoreHeartbeat => RaftEventDurationType::pd_store_heartbeat, StoreTick::SnapGc => RaftEventDurationType::snap_gc, StoreTick::CompactLockCf => RaftEventDurationType::compact_lock_cf, StoreTick::ConsistencyCheck => RaftEventDurationType::consistency_check, StoreTick::CleanupImportSst => RaftEventDurationType::cleanup_import_sst, + StoreTick::LoadMetricsWindow => RaftEventDurationType::load_metrics_window, + 
StoreTick::PdReportMinResolvedTs => RaftEventDurationType::pd_report_min_resolved_ts, } } } @@ -545,6 +550,7 @@ pub enum CasualMessage { split_keys: Vec>, callback: Callback, source: Cow<'static, str>, + share_source_region_size: bool, }, /// Hash result of ComputeHash command. @@ -557,12 +563,14 @@ pub enum CasualMessage { /// Approximate size of target region. This message can only be sent by /// split-check thread. RegionApproximateSize { - size: u64, + size: Option, + splitable: Option, }, /// Approximate key count of target region. RegionApproximateKeys { - keys: u64, + keys: Option, + splitable: Option, }, CompactionDeclinedBytes { bytes: u64, @@ -647,11 +655,19 @@ impl fmt::Debug for CasualMessage { KeysInfoFormatter(split_keys.iter()), source, ), - CasualMessage::RegionApproximateSize { size } => { - write!(fmt, "Region's approximate size [size: {:?}]", size) + CasualMessage::RegionApproximateSize { size, splitable } => { + write!( + fmt, + "Region's approximate size [size: {:?}], [splitable: {:?}]", + size, splitable + ) } - CasualMessage::RegionApproximateKeys { keys } => { - write!(fmt, "Region's approximate keys [keys: {:?}]", keys) + CasualMessage::RegionApproximateKeys { keys, splitable } => { + write!( + fmt, + "Region's approximate keys [keys: {:?}], [splitable: {:?}", + keys, splitable + ) } CasualMessage::CompactionDeclinedBytes { bytes } => { write!(fmt, "compaction declined bytes {}", bytes) @@ -740,7 +756,7 @@ pub enum PeerMsg { /// Raft message is the message sent between raft nodes in the same /// raft group. Messages need to be redirected to raftstore if target /// peer doesn't exist. - RaftMessage(InspectedRaftMessage), + RaftMessage(InspectedRaftMessage, Option), /// Raft command is the command that is expected to be proposed by the /// leader of the target raft group. If it's failed to be sent, callback /// usually needs to be called before dropping in case of resource leak. 
@@ -778,7 +794,7 @@ impl ResourceMetered for PeerMsg {} impl fmt::Debug for PeerMsg { fn fmt(&self, fmt: &mut fmt::Formatter<'_>) -> fmt::Result { match self { - PeerMsg::RaftMessage(_) => write!(fmt, "Raft Message"), + PeerMsg::RaftMessage(..) => write!(fmt, "Raft Message"), PeerMsg::RaftCommand(_) => write!(fmt, "Raft Command"), PeerMsg::Tick(tick) => write! { fmt, @@ -823,10 +839,6 @@ where { RaftMessage(InspectedRaftMessage), - ValidateSstResult { - invalid_ssts: Vec, - }, - // Clear region size and keys for all regions in the range, so we can force them to // re-calculate their size later. ClearRegionSizeInRange { @@ -883,7 +895,6 @@ where write!(fmt, "Store {} is unreachable", store_id) } StoreMsg::CompactedEvent(ref event) => write!(fmt, "CompactedEvent cf {}", event.cf()), - StoreMsg::ValidateSstResult { .. } => write!(fmt, "Validate SST Result"), StoreMsg::ClearRegionSizeInRange { ref start_key, ref end_key, diff --git a/components/raftstore/src/store/peer.rs b/components/raftstore/src/store/peer.rs index 8c1a7ef61e9..904d35fec2f 100644 --- a/components/raftstore/src/store/peer.rs +++ b/components/raftstore/src/store/peer.rs @@ -19,8 +19,8 @@ use bytes::Bytes; use collections::{HashMap, HashSet}; use crossbeam::{atomic::AtomicCell, channel::TrySendError}; use engine_traits::{ - Engines, KvEngine, PerfContext, RaftEngine, Snapshot, WriteBatch, WriteOptions, CF_DEFAULT, - CF_LOCK, CF_WRITE, + Engines, KvEngine, PerfContext, RaftEngine, Snapshot, SnapshotContext, WriteBatch, + WriteOptions, CF_DEFAULT, CF_LOCK, CF_WRITE, }; use error_code::ErrorCodeExt; use fail::fail_point; @@ -43,7 +43,7 @@ use kvproto::{ }, }; use parking_lot::RwLockUpgradableReadGuard; -use pd_client::{BucketStat, INVALID_ID}; +use pd_client::INVALID_ID; use protobuf::Message; use raft::{ self, @@ -71,7 +71,7 @@ use uuid::Uuid; use super::{ cmd_resp, - local_metrics::RaftMetrics, + local_metrics::{IoType, RaftMetrics}, metrics::*, peer_storage::{write_peer_state, CheckApplyingSnapStatus, 
HandleReadyResult, PeerStorage}, read_queue::{ReadIndexQueue, ReadIndexRequest}, @@ -80,6 +80,7 @@ use super::{ self, check_req_region_epoch, is_initial_msg, AdminCmdEpochState, ChangePeerI, ConfChangeKind, Lease, LeaseState, NORMAL_REQ_CHECK_CONF_VER, NORMAL_REQ_CHECK_VER, }, + worker::BucketStatsInfo, DestroyPeerJob, LocalReadContext, }; use crate::{ @@ -126,6 +127,7 @@ pub enum StaleState { Valid, ToValidate, LeaderMissing, + MaybeLeaderMissing, } #[derive(Debug)] @@ -242,6 +244,7 @@ bitflags! { const SPLIT = 0b0000_0010; const PREPARE_MERGE = 0b0000_0100; const COMMIT_MERGE = 0b0000_1000; + const ROLLBACK_MERGE = 0b0001_0000; } } @@ -569,6 +572,119 @@ pub fn can_amend_read( false } +/// The SplitCheckTrigger maintains the internal status to determine +/// if a split check task should be triggered. +#[derive(Default, Debug)] +pub struct SplitCheckTrigger { + /// An inaccurate difference in region size since last reset. + /// It is used to decide whether split check is needed. + size_diff_hint: u64, + /// An inaccurate difference in region size after compaction. + /// It is used to trigger check split to update approximate size and keys + /// after space reclamation of deleted entries. + pub compaction_declined_bytes: u64, + /// Approximate size of the region. + pub approximate_size: Option, + may_split_size: Option, + /// Approximate keys of the region. + pub approximate_keys: Option, + may_split_keys: Option, + /// Whether this region has scheduled a split check task. If we just + /// split the region or ingested one file which may overlap + /// with the existing data, reset the flag so that the region can be + /// split again. 
+ may_skip_split_check: bool, +} + +impl SplitCheckTrigger { + pub fn should_skip(&self, threshold: u64) -> bool { + self.may_skip_split_check + && self.compaction_declined_bytes < threshold + && self.size_diff_hint < threshold + } + + pub fn post_triggered(&mut self) { + self.size_diff_hint = 0; + self.compaction_declined_bytes = 0; + // The task is scheduled, the next tick may skip it only when the size and keys + // are small. + // If either size or keys are big enough to do a split, + // keep split check tick until split is done + if !matches!(self.may_split_size, Some(true)) && !matches!(self.may_split_keys, Some(true)) + { + self.may_skip_split_check = true; + } + } + + pub fn post_split(&mut self) { + self.size_diff_hint = 0; + self.may_split_keys = None; + self.may_split_size = None; + // It's not correct anymore, so set it to false to schedule a split check task. + self.may_skip_split_check = false; + } + + pub fn add_size_diff(&mut self, size_diff: i64) { + let diff = self.size_diff_hint as i64 + size_diff; + self.size_diff_hint = cmp::max(diff, 0) as u64; + } + + pub fn reset_skip_check(&mut self) { + self.may_skip_split_check = false; + } + + pub fn on_clear_region_size(&mut self) { + self.approximate_size = None; + self.approximate_keys = None; + self.may_split_size = None; + self.may_split_keys = None; + self.may_skip_split_check = false; + } + + pub fn on_approximate_region_size(&mut self, size: Option, splitable: Option) { + // If size is none, it means no estimated size + if size.is_some() { + self.approximate_size = size; + } + + if splitable.is_some() { + self.may_split_size = splitable; + } + + // If the region is truly splitable, + // may_skip_split_check should be false + if matches!(splitable, Some(true)) { + self.may_skip_split_check = false; + } + } + + pub fn on_approximate_region_keys(&mut self, keys: Option, splitable: Option) { + // if keys is none, it means no estimated keys + if keys.is_some() { + self.approximate_keys = keys; + } + + 
if splitable.is_some() { + self.may_split_keys = splitable; + } + + // If the region is truly splitable, + // may_skip_split_check should be false + if matches!(splitable, Some(true)) { + self.may_skip_split_check = false; + } + } + + pub fn on_ingest_sst_result(&mut self, size: u64, keys: u64) { + self.approximate_size = Some(self.approximate_size.unwrap_or_default() + size); + self.approximate_keys = Some(self.approximate_keys.unwrap_or_default() + keys); + + // The ingested file may be overlapped with the data in engine, so we need to + // check it again to get the accurate value. + self.may_skip_split_check = false; + } +} + #[derive(Getters, MutGetters)] pub struct Peer where @@ -593,6 +709,8 @@ where pub peer_heartbeats: HashMap, /// Record the waiting data status of each follower or learner peer. pub wait_data_peers: Vec, + /// This peer is created by a raft message from `create_by_peer`. + create_by_peer: Option, proposals: ProposalQueue>, leader_missing_time: Option, @@ -656,25 +774,10 @@ where pub peers_start_pending_time: Vec<(u64, Instant)>, /// A inaccurate cache about which peer is marked as down. down_peer_ids: Vec, - - /// An inaccurate difference in region size since last reset. - /// It is used to decide whether split check is needed. - pub size_diff_hint: u64, + /// the split check trigger + pub split_check_trigger: SplitCheckTrigger, /// The count of deleted keys since last reset. delete_keys_hint: u64, - /// An inaccurate difference in region size after compaction. - /// It is used to trigger check split to update approximate size and keys - /// after space reclamation of deleted entries. - pub compaction_declined_bytes: u64, - /// Approximate size of the region. - pub approximate_size: Option, - /// Approximate keys of the region. - pub approximate_keys: Option, - /// Whether this region has scheduled a split check task. 
If we just - /// splitted the region or ingested one file which may be overlapped - /// with the existed data, reset the flag so that the region can be - /// splitted again. - pub may_skip_split_check: bool, /// The state for consistency check. pub consistency_state: ConsistencyState, @@ -780,9 +883,8 @@ where persisted_number: u64, /// The context of applying snapshot. apply_snap_ctx: Option, - /// region buckets. - pub region_buckets: Option, - pub last_region_buckets: Option, + /// region buckets info in this region. + region_buckets_info: BucketStatsInfo, /// lead_transferee if this peer(leader) is in a leadership transferring. pub lead_transferee: u64, pub unsafe_recovery_state: Option, @@ -805,6 +907,7 @@ where region: &metapb::Region, peer: metapb::Peer, wait_data: bool, + create_by_peer: Option, ) -> Result> { let peer_id = peer.get_id(); if peer_id == raft::INVALID_ID { @@ -859,14 +962,11 @@ where peer_cache: RefCell::new(HashMap::default()), peer_heartbeats: HashMap::default(), wait_data_peers: Vec::default(), + create_by_peer, peers_start_pending_time: vec![], down_peer_ids: vec![], - size_diff_hint: 0, + split_check_trigger: SplitCheckTrigger::default(), delete_keys_hint: 0, - approximate_size: None, - approximate_keys: None, - may_skip_split_check: false, - compaction_declined_bytes: 0, leader_unreachable: false, pending_remove: false, wait_data, @@ -931,8 +1031,7 @@ where unpersisted_ready: None, persisted_number: 0, apply_snap_ctx: None, - region_buckets: None, - last_region_buckets: None, + region_buckets_info: BucketStatsInfo::default(), lead_transferee: raft::INVALID_ID, unsafe_recovery_state: None, snapshot_recovery_state: None, @@ -965,7 +1064,10 @@ where return; } self.replication_mode_version = state.status().get_dr_auto_sync().state_id; - let enable = state.status().get_dr_auto_sync().get_state() != DrAutoSyncState::Async; + let enable = !matches!( + state.status().get_dr_auto_sync().get_state(), + DrAutoSyncState::Async | 
DrAutoSyncState::SyncRecover + ); self.raft_group.raft.enable_group_commit(enable); self.dr_auto_sync_state = state.status().get_dr_auto_sync().get_state(); } @@ -974,29 +1076,32 @@ where pub fn switch_replication_mode(&mut self, state: &Mutex) { self.replication_sync = false; let guard = state.lock().unwrap(); - let enable_group_commit = if guard.status().get_mode() == ReplicationMode::Majority { - self.replication_mode_version = 0; - self.dr_auto_sync_state = DrAutoSyncState::Async; - false - } else { - self.dr_auto_sync_state = guard.status().get_dr_auto_sync().get_state(); - self.replication_mode_version = guard.status().get_dr_auto_sync().state_id; - match guard.status().get_dr_auto_sync().get_state() { - // SyncRecover will enable group commit after it catches up logs. - DrAutoSyncState::Async | DrAutoSyncState::SyncRecover => false, - _ => true, - } - }; + let (enable_group_commit, calculate_group_id) = + if guard.status().get_mode() == ReplicationMode::Majority { + self.replication_mode_version = 0; + self.dr_auto_sync_state = DrAutoSyncState::Async; + (false, false) + } else { + self.dr_auto_sync_state = guard.status().get_dr_auto_sync().get_state(); + self.replication_mode_version = guard.status().get_dr_auto_sync().state_id; + match guard.status().get_dr_auto_sync().get_state() { + // SyncRecover will enable group commit after it catches up logs. 
+ DrAutoSyncState::Async => (false, false), + DrAutoSyncState::SyncRecover => (false, true), + _ => (true, true), + } + }; drop(guard); - self.switch_group_commit(enable_group_commit, state); + self.switch_group_commit(enable_group_commit, calculate_group_id, state); } fn switch_group_commit( &mut self, enable_group_commit: bool, + calculate_group_id: bool, state: &Mutex, ) { - if enable_group_commit { + if enable_group_commit || calculate_group_id { let mut guard = state.lock().unwrap(); let ids = mem::replace( guard.calculate_commit_group( @@ -1086,6 +1191,8 @@ where // of term explicitly to get correct metadata. info!( "become follower for new logs"; + "first_log_term" => first.term, + "first_log_index" => first.index, "new_log_term" => last_log.term, "new_log_index" => last_log.index, "term" => self.term(), @@ -1312,6 +1419,16 @@ where self.get_store().region() } + #[inline] + pub fn region_buckets_info_mut(&mut self) -> &mut BucketStatsInfo { + &mut self.region_buckets_info + } + + #[inline] + pub fn region_buckets_info(&self) -> &BucketStatsInfo { + &self.region_buckets_info + } + /// Check whether the peer can be hibernated. /// /// This should be used with `check_after_tick` to get a correct conclusion. 
@@ -1721,7 +1838,7 @@ where let has_snap_task = self.get_store().has_gen_snap_task(); let pre_commit_index = self.raft_group.raft.raft_log.committed; self.raft_group.step(m)?; - self.report_commit_log_duration(pre_commit_index, &ctx.raft_metrics); + self.report_commit_log_duration(pre_commit_index, &mut ctx.raft_metrics); let mut for_balance = false; if !has_snap_task && self.get_store().has_gen_snap_task() { @@ -1743,7 +1860,7 @@ where Ok(()) } - fn report_persist_log_duration(&self, pre_persist_index: u64, metrics: &RaftMetrics) { + fn report_persist_log_duration(&self, pre_persist_index: u64, metrics: &mut RaftMetrics) { if !metrics.waterfall_metrics || self.proposals.is_empty() { return; } @@ -1766,7 +1883,7 @@ where } } - fn report_commit_log_duration(&self, pre_commit_index: u64, metrics: &RaftMetrics) { + fn report_commit_log_duration(&self, pre_commit_index: u64, metrics: &mut RaftMetrics) { if !metrics.waterfall_metrics || self.proposals.is_empty() { return; } @@ -1786,10 +1903,21 @@ where &metrics.wf_commit_not_persist_log }; for tracker in trackers { - tracker.observe(now, hist, |t| { + // Collect the metrics related to commit_log + // durations. + let duration = tracker.observe(now, hist, |t| { t.metrics.commit_not_persisted = !commit_persisted; &mut t.metrics.wf_commit_log_nanos }); + // Normally, commit_log_duration contains both the duration of persisting + // raft logs and of transferring raft logs to other nodes. Therefore, it can + // reflect slowness of the node on I/Os, whatever the reason is. + // Here, health_stats uses the recorded commit_log_duration as the + // latency to detect whether there is jitter on the network. It's not + // accurate, but it has proved to be a good approximation. 
+ metrics + .health_stats + .observe(Duration::from_nanos(duration), IoType::Network); } } } @@ -2001,7 +2129,6 @@ where self.leader_missing_time = None; return StaleState::Valid; } - let naive_peer = !self.is_initialized() || !self.raft_group.raft.promotable(); // Updates the `leader_missing_time` according to the current state. // // If we are checking this it means we suspect the leader might be missing. @@ -2021,13 +2148,18 @@ where StaleState::ToValidate } Some(instant) - if instant.saturating_elapsed() >= ctx.cfg.abnormal_leader_missing_duration.0 - && !naive_peer => + if instant.saturating_elapsed() >= ctx.cfg.abnormal_leader_missing_duration.0 => { // A peer is considered as in the leader missing state // if it's initialized but is isolated from its leader or // something bad happens that the raft group can not elect a leader. - StaleState::LeaderMissing + if self.is_initialized() && self.raft_group.raft.promotable() { + StaleState::LeaderMissing + } else { + // Uninitialized peer and learner may not have leader info, + // even if there is a valid leader. 
+ StaleState::MaybeLeaderMissing + } } _ => StaleState::Valid, } @@ -2840,7 +2972,10 @@ where commit_term, committed_entries, cbs, - self.region_buckets.as_ref().map(|b| b.meta.clone()), + self.region_buckets_info() + .bucket_stat() + .as_ref() + .map(|b| b.meta.clone()), ); apply.on_schedule(&ctx.raft_metrics); self.mut_store() @@ -3000,8 +3135,8 @@ where let pre_persist_index = self.raft_group.raft.raft_log.persisted; let pre_commit_index = self.raft_group.raft.raft_log.committed; self.raft_group.on_persist_ready(self.persisted_number); - self.report_persist_log_duration(pre_persist_index, &ctx.raft_metrics); - self.report_commit_log_duration(pre_commit_index, &ctx.raft_metrics); + self.report_persist_log_duration(pre_persist_index, &mut ctx.raft_metrics); + self.report_commit_log_duration(pre_commit_index, &mut ctx.raft_metrics); let persist_index = self.raft_group.raft.raft_log.persisted; self.mut_store().update_cache_persisted(persist_index); @@ -3045,8 +3180,8 @@ where let pre_persist_index = self.raft_group.raft.raft_log.persisted; let pre_commit_index = self.raft_group.raft.raft_log.committed; let mut light_rd = self.raft_group.advance_append(ready); - self.report_persist_log_duration(pre_persist_index, &ctx.raft_metrics); - self.report_commit_log_duration(pre_commit_index, &ctx.raft_metrics); + self.report_persist_log_duration(pre_persist_index, &mut ctx.raft_metrics); + self.report_commit_log_duration(pre_commit_index, &mut ctx.raft_metrics); let persist_index = self.raft_group.raft.raft_log.persisted; if self.is_in_force_leader() { @@ -3340,8 +3475,8 @@ where self.peer_stat.written_keys += apply_metrics.written_keys; self.peer_stat.written_bytes += apply_metrics.written_bytes; self.delete_keys_hint += apply_metrics.delete_keys_hint; - let diff = self.size_diff_hint as i64 + apply_metrics.size_diff_hint; - self.size_diff_hint = cmp::max(diff, 0) as u64; + self.split_check_trigger + .add_size_diff(apply_metrics.size_diff_hint); if 
self.has_pending_snapshot() && self.ready_to_handle_pending_snap() { has_ready = true; @@ -3373,17 +3508,14 @@ where } pub fn post_split(&mut self) { - // Reset delete_keys_hint and size_diff_hint. self.delete_keys_hint = 0; - self.size_diff_hint = 0; + self.split_check_trigger.post_split(); + self.reset_region_buckets(); } pub fn reset_region_buckets(&mut self) { - if self.region_buckets.is_some() { - self.last_region_buckets = self.region_buckets.take(); - self.region_buckets = None; - } + self.region_buckets_info_mut().set_bucket_stat(None); } /// Try to renew leader lease. @@ -4237,7 +4369,9 @@ where // Should not propose normal in force leader state. // In `pre_propose_raft_command`, it rejects all the requests expect conf-change // if in force leader state. - if self.force_leader.is_some() { + if self.force_leader.is_some() + && req.get_admin_request().get_cmd_type() != AdminCmdType::RollbackMerge + { poll_ctx.raft_metrics.invalid_proposal.force_leader.inc(); panic!( "{} propose normal in force leader state {:?}", @@ -4695,10 +4829,23 @@ where } } - let mut resp = reader.execute(&req, &Arc::new(region), read_index, None); + let snap_ctx = if let Ok(read_ts) = decode_u64(&mut req.get_header().get_flag_data()) { + Some(SnapshotContext { + region_id: self.region_id, + read_ts, + }) + } else { + None + }; + + let mut resp = reader.execute(&req, &Arc::new(region), read_index, snap_ctx, None); if let Some(snap) = resp.snapshot.as_mut() { snap.txn_ext = Some(self.txn_ext.clone()); - snap.bucket_meta = self.region_buckets.as_ref().map(|b| b.meta.clone()); + snap.bucket_meta = self + .region_buckets_info() + .bucket_stat() + .as_ref() + .map(|s| s.meta.clone()); } resp.txn_extra_op = self.txn_extra_op.load(); cmd_resp::bind_term(&mut resp.response, self.term()); @@ -5047,6 +5194,15 @@ impl DiskFullPeers { pub fn majority(&self) -> bool { self.majority } + pub fn set_majority(&mut self, majority: bool) { + self.majority = majority; + } + pub fn peers(&self) -> &HashMap 
{ + &self.peers + } + pub fn peers_mut(&mut self) -> &mut HashMap { + &mut self.peers + } pub fn has(&self, peer_id: u64) -> bool { !self.peers.is_empty() && self.peers.contains_key(&peer_id) } @@ -5127,7 +5283,7 @@ where // should enable group commit to promise `IntegrityOverLabel`. then safe // to switch to the `Sync` phase. if self.dr_auto_sync_state == DrAutoSyncState::SyncRecover { - self.switch_group_commit(true, &ctx.global_replication_state) + self.switch_group_commit(true, true, &ctx.global_replication_state) } self.replication_sync = true; } @@ -5178,8 +5334,8 @@ where pending_peers: self.collect_pending_peers(ctx), written_bytes: self.peer_stat.written_bytes, written_keys: self.peer_stat.written_keys, - approximate_size: self.approximate_size, - approximate_keys: self.approximate_keys, + approximate_size: self.split_check_trigger.approximate_size, + approximate_keys: self.split_check_trigger.approximate_keys, replication_status: self.region_replication_status(ctx), wait_data_peers: self.wait_data_peers.clone(), }); @@ -5309,9 +5465,17 @@ where &mut self, ctx: &mut PollContext, ) { - if self.check_stale_conf_ver < self.region().get_region_epoch().get_conf_ver() { + ctx.raft_metrics.check_stale_peer.inc(); + if self.check_stale_conf_ver < self.region().get_region_epoch().get_conf_ver() + || self.region().get_region_epoch().get_conf_ver() == 0 + { self.check_stale_conf_ver = self.region().get_region_epoch().get_conf_ver(); self.check_stale_peers = self.region().get_peers().to_vec(); + if let Some(create_by_peer) = self.create_by_peer.as_ref() { + // Push create_by_peer in case the peer is removed before + // initialization which has no peer in region. 
+ self.check_stale_peers.push(create_by_peer.clone()); + } } for peer in &self.check_stale_peers { if peer.get_id() == self.peer_id() { @@ -5580,8 +5744,12 @@ where &self.engines.kv } - fn get_snapshot(&mut self, _: &Option>) -> Arc { - Arc::new(self.engines.kv.snapshot()) + fn get_snapshot( + &mut self, + snap_ctx: Option, + _: &Option>, + ) -> Arc { + Arc::new(self.engines.kv.snapshot(snap_ctx)) } } diff --git a/components/raftstore/src/store/peer_storage.rs b/components/raftstore/src/store/peer_storage.rs index d89eafc3a46..44ae3543e95 100644 --- a/components/raftstore/src/store/peer_storage.rs +++ b/components/raftstore/src/store/peer_storage.rs @@ -372,7 +372,7 @@ where #[inline] pub fn raw_snapshot(&self) -> EK::Snapshot { - self.engines.kv.snapshot() + self.engines.kv.snapshot(None) } #[inline] @@ -1017,6 +1017,9 @@ where // The `region` is updated after persisting in order to stay consistent with the // one in `StoreMeta::regions` (will be updated soon). // See comments in `apply_snapshot` for more details. 
+ (|| { + fail_point!("before_set_region_on_peer_3", self.peer_id == 3, |_| {}); + })(); self.set_region(res.region.clone()); } } @@ -1604,7 +1607,7 @@ pub mod tests { .unwrap() .unwrap(); gen_task.generate_and_schedule_snapshot::( - engines.kv.clone().snapshot(), + engines.kv.clone().snapshot(None), entry.get_term(), apply_state, sched, diff --git a/components/raftstore/src/store/region_snapshot.rs b/components/raftstore/src/store/region_snapshot.rs index bc22dfbf586..5232675f14a 100644 --- a/components/raftstore/src/store/region_snapshot.rs +++ b/components/raftstore/src/store/region_snapshot.rs @@ -59,7 +59,7 @@ where where EK: KvEngine, { - RegionSnapshot::from_snapshot(Arc::new(db.snapshot()), Arc::new(region)) + RegionSnapshot::from_snapshot(Arc::new(db.snapshot(None)), Arc::new(region)) } pub fn from_snapshot(snap: Arc, region: Arc) -> RegionSnapshot { @@ -175,6 +175,11 @@ where pub fn get_end_key(&self) -> &[u8] { self.region.get_end_key() } + + #[cfg(test)] + pub fn snap(&self) -> Arc { + self.snap.clone() + } } impl Clone for RegionSnapshot diff --git a/components/raftstore/src/store/simple_write.rs b/components/raftstore/src/store/simple_write.rs index a303a586935..dd461e61867 100644 --- a/components/raftstore/src/store/simple_write.rs +++ b/components/raftstore/src/store/simple_write.rs @@ -49,7 +49,6 @@ where channels: Vec, size_limit: usize, write_type: WriteType, - notify_proposed: bool, } impl SimpleWriteReqEncoder @@ -57,14 +56,10 @@ where C: ErrorCallback + WriteCallback, { /// Create a request encoder. - /// - /// If `notify_proposed` is true, channels will be called `notify_proposed` - /// when it's appended. 
pub fn new( header: Box, bin: SimpleWriteBinary, size_limit: usize, - notify_proposed: bool, ) -> SimpleWriteReqEncoder { let mut buf = Vec::with_capacity(256); buf.push(MAGIC_PREFIX); @@ -77,7 +72,6 @@ where channels: vec![], size_limit, write_type: bin.write_type, - notify_proposed, } } @@ -112,18 +106,10 @@ where } #[inline] - pub fn add_response_channel(&mut self, mut ch: C) { - if self.notify_proposed { - ch.notify_proposed(); - } + pub fn add_response_channel(&mut self, ch: C) { self.channels.push(ch); } - #[inline] - pub fn notify_proposed(&self) -> bool { - self.notify_proposed - } - #[inline] pub fn header(&self) -> &RaftRequestHeader { &self.header @@ -558,7 +544,6 @@ mod tests { header.clone(), bin, usize::MAX, - false, ); let mut encoder = SimpleWriteEncoder::with_capacity(512); @@ -570,7 +555,6 @@ mod tests { header.clone(), bin, 0, - false, ); let (bytes, _) = req_encoder.encode(); @@ -619,9 +603,8 @@ mod tests { .collect(); encoder.ingest(exp.clone()); let bin = encoder.encode(); - let req_encoder = SimpleWriteReqEncoder::>::new( - header, bin, 0, false, - ); + let req_encoder = + SimpleWriteReqEncoder::>::new(header, bin, 0); let (bytes, _) = req_encoder.encode(); let mut decoder = SimpleWriteReqDecoder::new(decoder_fallback, &logger, &bytes, 0, 0).unwrap(); @@ -683,7 +666,6 @@ mod tests { header.clone(), bin.clone(), 512, - false, ); let mut header2 = Box::::default(); @@ -700,7 +682,6 @@ mod tests { header.clone(), bin2.clone(), 512, - false, ); assert!(!req_encoder2.amend(&header, &bin)); @@ -735,7 +716,6 @@ mod tests { header.clone(), SimpleWriteEncoder::with_capacity(512).encode(), 512, - false, ); let (bin, _) = req_encoder.encode(); assert_eq!( @@ -753,7 +733,6 @@ mod tests { header.clone(), encoder.encode(), 512, - false, ); let (bin, _) = req_encoder.encode(); let req = SimpleWriteReqDecoder::new(decoder_fallback, &logger, &bin, 0, 0) @@ -771,7 +750,6 @@ mod tests { header.clone(), encoder.encode(), 512, - false, ); let (bin, _) = 
req_encoder.encode(); let req = SimpleWriteReqDecoder::new(decoder_fallback, &logger, &bin, 0, 0) @@ -788,7 +766,6 @@ mod tests { header.clone(), encoder.encode(), 512, - false, ); let (bin, _) = req_encoder.encode(); let req = SimpleWriteReqDecoder::new(decoder_fallback, &logger, &bin, 0, 0) @@ -816,7 +793,6 @@ mod tests { header, encoder.encode(), 512, - false, ); let (bin, _) = req_encoder.encode(); let req = SimpleWriteReqDecoder::new(decoder_fallback, &logger, &bin, 0, 0) diff --git a/components/raftstore/src/store/snap.rs b/components/raftstore/src/store/snap.rs index 6fe21fe9750..e7e7c6ccb10 100644 --- a/components/raftstore/src/store/snap.rs +++ b/components/raftstore/src/store/snap.rs @@ -15,8 +15,8 @@ use std::{ }; use collections::{HashMap, HashMapEntry as Entry}; -use encryption::{create_aes_ctr_crypter, from_engine_encryption_method, DataKeyManager, Iv}; -use engine_traits::{CfName, EncryptionKeyManager, KvEngine, CF_DEFAULT, CF_LOCK, CF_WRITE}; +use encryption::{create_aes_ctr_crypter, DataKeyManager, Iv}; +use engine_traits::{CfName, KvEngine, CF_DEFAULT, CF_LOCK, CF_WRITE}; use error_code::{self, ErrorCode, ErrorCodeExt}; use fail::fail_point; use file_system::{ @@ -92,6 +92,12 @@ impl From for Error { } } +impl From for Error { + fn from(e: engine_traits::Error) -> Self { + Error::Other(Box::new(e)) + } +} + pub type Result = result::Result; impl ErrorCodeExt for Error { @@ -614,7 +620,7 @@ impl Snapshot { if let Some(mgr) = &s.mgr.encryption_key_manager { let enc_info = mgr.new_file(&file_paths[idx])?; - let mthd = from_engine_encryption_method(enc_info.method); + let mthd = enc_info.method; if mthd != EncryptionMethod::Plaintext { let file_for_recving = cf_file.file_for_recving.last_mut().unwrap(); file_for_recving.encrypter = Some( @@ -873,8 +879,13 @@ impl Snapshot { self.switch_to_cf_file(cf)?; let cf_file = &mut self.cf_files[self.cf_index]; let cf_stat = if plain_file_used(cf_file.cf) { - let key_mgr = 
self.mgr.encryption_key_manager.as_ref(); - snap_io::build_plain_cf_file::(cf_file, key_mgr, kv_snap, &begin_key, &end_key)? + snap_io::build_plain_cf_file::( + cf_file, + self.mgr.encryption_key_manager.as_ref(), + kv_snap, + &begin_key, + &end_key, + )? } else { snap_io::build_sst_cf_file_list::( cf_file, @@ -885,6 +896,7 @@ impl Snapshot { self.mgr .get_actual_max_per_file_size(allow_multi_files_snapshot), &self.mgr.limiter, + self.mgr.encryption_key_manager.clone(), )? }; SNAPSHOT_LIMIT_GENERATE_BYTES.inc_by(cf_stat.total_size as u64); @@ -1212,7 +1224,7 @@ impl Snapshot { if file_for_recving.written_size != cf_file.size[i] { return Err(io::Error::new( - ErrorKind::Other, + ErrorKind::InvalidData, format!( "snapshot file {} for cf {} size mismatches, \ real size {}, expected size {}", @@ -1227,7 +1239,7 @@ impl Snapshot { let checksum = file_for_recving.write_digest.finalize(); if checksum != cf_file.checksum[i] { return Err(io::Error::new( - ErrorKind::Other, + ErrorKind::InvalidData, format!( "snapshot file {} for cf {} checksum \ mismatches, real checksum {}, expected \ @@ -2627,7 +2639,7 @@ pub mod tests { .tempdir() .unwrap(); let db = get_db(src_db_dir.path(), None, None).unwrap(); - let snapshot = db.snapshot(); + let snapshot = db.snapshot(None); let src_dir = Builder::new() .prefix("test-snap-file-db-src") @@ -2735,7 +2747,7 @@ pub mod tests { .tempdir() .unwrap(); let db = get_db(db_dir.path(), None, None).unwrap(); - let snapshot = db.snapshot(); + let snapshot = db.snapshot(None); let dir = Builder::new() .prefix("test-snap-validation") @@ -2888,7 +2900,7 @@ pub mod tests { .tempdir() .unwrap(); let db: KvTestEngine = open_test_db(db_dir.path(), None, None).unwrap(); - let snapshot = db.snapshot(); + let snapshot = db.snapshot(None); let dir = Builder::new() .prefix("test-snap-corruption") @@ -2963,7 +2975,7 @@ pub mod tests { .tempdir() .unwrap(); let db: KvTestEngine = open_test_db_with_100keys(db_dir.path(), None, None).unwrap(); - let snapshot = 
db.snapshot(); + let snapshot = db.snapshot(None); let dir = Builder::new() .prefix("test-snap-corruption-meta") @@ -3044,7 +3056,7 @@ pub mod tests { .tempdir() .unwrap(); let db: KvTestEngine = open_test_db(db_dir.path(), None, None).unwrap(); - let snapshot = db.snapshot(); + let snapshot = db.snapshot(None); let key1 = SnapKey::new(1, 1, 1); let mgr_core = create_manager_core(&path, u64::MAX); let mut s1 = Snapshot::new_for_building(&path, &key1, &mgr_core).unwrap(); @@ -3115,7 +3127,7 @@ pub mod tests { .tempdir() .unwrap(); let db: KvTestEngine = open_test_db(src_db_dir.path(), None, None).unwrap(); - let snapshot = db.snapshot(); + let snapshot = db.snapshot(None); let key = SnapKey::new(1, 1, 1); let region = gen_test_region(1, 1, 1); @@ -3197,7 +3209,7 @@ pub mod tests { .max_total_size(max_total_size) .build::<_>(snapfiles_path.path().to_str().unwrap()); snap_mgr.init().unwrap(); - let snapshot = engine.kv.snapshot(); + let snapshot = engine.kv.snapshot(None); // Add an oldest snapshot for receiving. 
let recv_key = SnapKey::new(100, 100, 100); @@ -3322,7 +3334,7 @@ pub mod tests { .tempdir() .unwrap(); let db: KvTestEngine = open_test_db(kv_dir.path(), None, None).unwrap(); - let snapshot = db.snapshot(); + let snapshot = db.snapshot(None); let key = SnapKey::new(1, 1, 1); let region = gen_test_region(1, 1, 1); diff --git a/components/raftstore/src/store/snap/io.rs b/components/raftstore/src/store/snap/io.rs index 3cdee1e40f1..48919474135 100644 --- a/components/raftstore/src/store/snap/io.rs +++ b/components/raftstore/src/store/snap/io.rs @@ -8,18 +8,17 @@ use std::{ usize, }; -use encryption::{ - from_engine_encryption_method, DataKeyManager, DecrypterReader, EncrypterWriter, Iv, -}; +use encryption::{DataKeyManager, DecrypterReader, EncrypterWriter, Iv}; use engine_traits::{ - CfName, EncryptionKeyManager, Error as EngineError, Iterable, KvEngine, Mutable, - SstCompressionType, SstWriter, SstWriterBuilder, WriteBatch, + CfName, Error as EngineError, Iterable, KvEngine, Mutable, SstCompressionType, SstReader, + SstWriter, SstWriterBuilder, WriteBatch, }; +use fail::fail_point; use kvproto::encryptionpb::EncryptionMethod; use tikv_util::{ box_try, codec::bytes::{BytesEncoder, CompactBytesFromFileDecoder}, - debug, info, + debug, error, info, time::{Instant, Limiter}, }; @@ -60,7 +59,7 @@ where if let Some(key_mgr) = key_mgr { let enc_info = box_try!(key_mgr.new_file(path)); - let mthd = from_engine_encryption_method(enc_info.method); + let mthd = enc_info.method; if mthd != EncryptionMethod::Plaintext { let writer = box_try!(EncrypterWriter::new( file.take().unwrap(), @@ -116,6 +115,7 @@ pub fn build_sst_cf_file_list( end_key: &[u8], raw_size_per_file: u64, io_limiter: &Limiter, + key_mgr: Option>, ) -> Result where E: KvEngine, @@ -133,6 +133,48 @@ where let sst_writer = RefCell::new(create_sst_file_writer::(engine, cf, &path)?); let mut file_length: usize = 0; + let finish_sst_writer = |sst_writer: E::SstWriter, + path: String, + key_mgr: Option>| + -> 
Result<(), Error> { + sst_writer.finish()?; + (|| { + fail_point!("inject_sst_file_corruption", |_| { + static CALLED: std::sync::atomic::AtomicBool = + std::sync::atomic::AtomicBool::new(false); + if CALLED + .compare_exchange( + false, + true, + std::sync::atomic::Ordering::SeqCst, + std::sync::atomic::Ordering::SeqCst, + ) + .is_err() + { + return; + } + // overwrite the file to break checksum + let mut f = OpenOptions::new().write(true).open(&path).unwrap(); + f.write_all(b"x").unwrap(); + }); + })(); + + let sst_reader = E::SstReader::open(&path, key_mgr)?; + if let Err(e) = sst_reader.verify_checksum() { + // use sst reader to verify block checksum, it would detect corrupted SST due to + // memory bit-flip + fs::remove_file(&path)?; + error!( + "failed to pass block checksum verification"; + "file" => path, + "err" => ?e, + ); + return Err(io::Error::new(io::ErrorKind::InvalidData, e).into()); + } + File::open(&path).and_then(|f| f.sync_all())?; + Ok(()) + }; + let instant = Instant::now(); box_try!(snap.scan(cf, start_key, end_key, false, |key, value| { let entry_len = key.len() + value.len(); @@ -151,8 +193,7 @@ where match result { Ok(new_sst_writer) => { let old_writer = sst_writer.replace(new_sst_writer); - box_try!(old_writer.finish()); - box_try!(File::open(prev_path).and_then(|f| f.sync_all())); + box_try!(finish_sst_writer(old_writer, prev_path, key_mgr.clone())); } Err(e) => { let io_error = io::Error::new(io::ErrorKind::Other, e); @@ -178,9 +219,8 @@ where Ok(true) })); if stats.key_count > 0 { + box_try!(finish_sst_writer(sst_writer.into_inner(), path, key_mgr)); cf_file.add_file(file_id); - box_try!(sst_writer.into_inner().finish()); - box_try!(File::open(path).and_then(|f| f.sync_all())); info!( "build_sst_cf_file_list builds {} files in cf {}. Total keys {}, total size {}. 
raw_size_per_file {}, total takes {:?}", file_id + 1, @@ -287,7 +327,7 @@ pub fn get_decrypter_reader( encryption_key_manager: &DataKeyManager, ) -> Result, Error> { let enc_info = box_try!(encryption_key_manager.get_file(file)); - let mthd = from_engine_encryption_method(enc_info.method); + let mthd = enc_info.method; debug!( "get_decrypter_reader gets enc_info for {:?}, method: {:?}", file, mthd @@ -338,7 +378,7 @@ mod tests { .unwrap(); let db1: KvTestEngine = open_test_empty_db(dir1.path(), db_opt, None).unwrap(); - let snap = db.snapshot(); + let snap = db.snapshot(None); for cf in SNAPSHOT_CFS { let snap_cf_dir = Builder::new().prefix("test-snap-cf").tempdir().unwrap(); let mut cf_file = CfFile { @@ -422,11 +462,12 @@ mod tests { let stats = build_sst_cf_file_list::( &mut cf_file, &db, - &db.snapshot(), + &db.snapshot(None), &keys::data_key(b"a"), &keys::data_key(b"z"), *max_file_size, &limiter, + db_opt.as_ref().and_then(|opt| opt.get_key_manager()), ) .unwrap(); if stats.key_count == 0 { diff --git a/components/raftstore/src/store/transport.rs b/components/raftstore/src/store/transport.rs index 7f10e7cd249..2ca19fbe5fe 100644 --- a/components/raftstore/src/store/transport.rs +++ b/components/raftstore/src/store/transport.rs @@ -1,7 +1,7 @@ // Copyright 2016 TiKV Project Authors. Licensed under Apache-2.0. // #[PerformanceCriticalPath] -use std::sync::mpsc; +use std::sync::{mpsc, Mutex}; use crossbeam::channel::{SendError, TrySendError}; use engine_traits::{KvEngine, RaftEngine, Snapshot}; @@ -46,6 +46,13 @@ where fn significant_send(&self, region_id: u64, msg: SignificantMsg) -> Result<()>; } +impl<'a, T: SignificantRouter, EK: KvEngine> SignificantRouter for &'a Mutex { + #[inline] + fn significant_send(&self, region_id: u64, msg: SignificantMsg) -> Result<()> { + Mutex::lock(self).unwrap().significant_send(region_id, msg) + } +} + /// Routes proposal to target region. 
pub trait ProposalRouter where @@ -79,6 +86,13 @@ where } } +impl<'a, EK: KvEngine, T: CasualRouter> CasualRouter for &'a Mutex { + #[inline] + fn send(&self, region_id: u64, msg: CasualMessage) -> Result<()> { + CasualRouter::send(&*Mutex::lock(self).unwrap(), region_id, msg) + } +} + impl SignificantRouter for RaftRouter where EK: KvEngine, diff --git a/components/raftstore/src/store/txn_ext.rs b/components/raftstore/src/store/txn_ext.rs index 0091fd4e7bb..ae352ea08ab 100644 --- a/components/raftstore/src/store/txn_ext.rs +++ b/components/raftstore/src/store/txn_ext.rs @@ -1,16 +1,16 @@ // Copyright 2021 TiKV Project Authors. Licensed under Apache-2.0. use std::{ + collections::{BTreeMap, Bound}, fmt, sync::atomic::{AtomicU64, Ordering}, }; -use collections::HashMap; use kvproto::metapb; use lazy_static::lazy_static; use parking_lot::RwLock; use prometheus::{register_int_gauge, IntGauge}; -use txn_types::{Key, PessimisticLock}; +use txn_types::{Key, Lock, PessimisticLock}; /// Transaction extensions related to a peer. #[derive(Default)] @@ -106,7 +106,7 @@ pub struct PeerPessimisticLocks { /// skipped because of version mismatch. So, no lock should be deleted. /// It's correct that we include the locks that are marked deleted in the /// commit merge request. - map: HashMap, + map: BTreeMap, /// Status of the pessimistic lock map. /// The map is writable only in the Normal state. 
pub status: LocksStatus, @@ -143,7 +143,7 @@ impl fmt::Debug for PeerPessimisticLocks { impl Default for PeerPessimisticLocks { fn default() -> Self { PeerPessimisticLocks { - map: HashMap::default(), + map: BTreeMap::default(), status: LocksStatus::Normal, term: 0, version: 0, @@ -192,7 +192,7 @@ impl PeerPessimisticLocks { } pub fn clear(&mut self) { - self.map = HashMap::default(); + self.map = BTreeMap::default(); GLOBAL_MEM_SIZE.sub(self.memory_size as i64); self.memory_size = 0; } @@ -244,12 +244,20 @@ impl PeerPessimisticLocks { // Locks that are marked deleted still need to be moved to the new regions, // and the deleted mark should also be cleared. // Refer to the comment in `PeerPessimisticLocks` for details. - let removed_locks = self.map.drain_filter(|key, _| { - let key = &**key.as_encoded(); + // `BTreeMap` has no `drain_filter`, so the removed entries are cloned out instead. + let mut removed_locks = Vec::new(); + self.map.retain(|key, value| { + let key_ref = key.as_encoded().as_slice(); let (start_key, end_key) = (derived.get_start_key(), derived.get_end_key()); - key < start_key || (!end_key.is_empty() && key >= end_key) + if key_ref < start_key || (!end_key.is_empty() && key_ref >= end_key) { + removed_locks.push((key.clone(), value.clone())); + false + } else { + true + } }); - for (key, (lock, _)) in removed_locks { + + for (key, (lock, _)) in removed_locks.into_iter() { let idx = match regions .binary_search_by_key(&&**key.as_encoded(), |region| region.get_start_key()) { @@ -264,6 +272,37 @@ impl PeerPessimisticLocks { res } + /// Scan and return locks in the current pessimistic lock map, the map + /// should be locked first before calling this method. 
+ pub fn scan_locks( + &self, + start: Option<&Key>, + end: Option<&Key>, + filter: F, + limit: usize, + ) -> (Vec<(Key, Lock)>, bool) + where + F: Fn(&Key, &PessimisticLock) -> bool, + { + if let (Some(start_key), Some(end_key)) = (start, end) { + assert!(end_key >= start_key); + } + let mut locks = Vec::with_capacity(limit); + let mut iter = self.map.range(( + start.map_or(Bound::Unbounded, |k| Bound::Included(k)), + end.map_or(Bound::Unbounded, |k| Bound::Excluded(k)), + )); + while let Some((key, (lock, _))) = iter.next() { + if filter(key, lock) { + locks.push((key.clone(), lock.clone().into_lock())); + } + if limit > 0 && locks.len() >= limit { + return (locks, iter.next().is_some()); + } + } + (locks, false) + } + #[cfg(test)] fn from_locks(locks: impl IntoIterator) -> Self { let mut res = PeerPessimisticLocks::default(); @@ -277,7 +316,7 @@ impl PeerPessimisticLocks { impl<'a> IntoIterator for &'a PeerPessimisticLocks { type Item = (&'a Key, &'a (PessimisticLock, bool)); - type IntoIter = std::collections::hash_map::Iter<'a, Key, (PessimisticLock, bool)>; + type IntoIter = std::collections::btree_map::Iter<'a, Key, (PessimisticLock, bool)>; fn into_iter(self) -> Self::IntoIter { self.map.iter() @@ -331,6 +370,24 @@ mod tests { } } + fn lock_with_key(key: &[u8], deleted: bool) -> (Key, (PessimisticLock, bool)) { + ( + Key::from_raw(key), + ( + PessimisticLock { + primary: key.to_vec().into_boxed_slice(), + start_ts: 10.into(), + ttl: 1000, + for_update_ts: 10.into(), + min_commit_ts: 20.into(), + last_change: LastChange::make_exist(5.into(), 2), + is_locked_with_conflict: false, + }, + deleted, + ), + ) + } + #[test] fn test_memory_size() { let _guard = TEST_MUTEX.lock().unwrap(); @@ -418,23 +475,6 @@ mod tests { #[test] fn test_group_locks_by_regions() { - fn lock(key: &[u8], deleted: bool) -> (Key, (PessimisticLock, bool)) { - ( - Key::from_raw(key), - ( - PessimisticLock { - primary: key.to_vec().into_boxed_slice(), - start_ts: 10.into(), - ttl: 1000, - 
for_update_ts: 10.into(), - min_commit_ts: 20.into(), - last_change: LastChange::make_exist(5.into(), 2), - is_locked_with_conflict: false, - }, - deleted, - ), - ) - } fn region(start_key: &[u8], end_key: &[u8]) -> metapb::Region { let mut region = metapb::Region::default(); region.set_start_key(start_key.to_vec()); @@ -445,11 +485,11 @@ mod tests { defer!(GLOBAL_MEM_SIZE.set(0)); let mut original = PeerPessimisticLocks::from_locks(vec![ - lock(b"a", true), - lock(b"c", false), - lock(b"e", true), - lock(b"g", false), - lock(b"i", false), + lock_with_key(b"a", true), + lock_with_key(b"c", false), + lock_with_key(b"e", true), + lock_with_key(b"g", false), + lock_with_key(b"i", false), ]); let regions = vec![ region(b"", b"b"), // test leftmost region @@ -460,10 +500,10 @@ mod tests { ]; let output = original.group_by_regions(®ions, ®ions[4]); let expected: Vec<_> = vec![ - vec![lock(b"a", false)], + vec![lock_with_key(b"a", false)], vec![], - vec![lock(b"c", false)], - vec![lock(b"e", false), lock(b"g", false)], + vec![lock_with_key(b"c", false)], + vec![lock_with_key(b"e", false), lock_with_key(b"g", false)], vec![], // the position of the derived region is empty ] .into_iter() @@ -473,7 +513,159 @@ mod tests { // The lock that belongs to the derived region is kept in the original map. assert_eq!( original, - PeerPessimisticLocks::from_locks(vec![lock(b"i", false)]) + PeerPessimisticLocks::from_locks(vec![lock_with_key(b"i", false)]) ); } + + #[test] + fn test_scan_memory_lock() { + // Create a sample PeerPessimisticLocks instance with some locks. 
+ let peer_locks = PeerPessimisticLocks::from_locks(vec![ + lock_with_key(b"key1", false), + lock_with_key(b"key2", false), + lock_with_key(b"key3", false), + ]); + + fn txn_lock(key: &[u8], deleted: bool) -> Lock { + let (_, (pessimistic_lock, _)) = lock_with_key(key, deleted); + pessimistic_lock.into_lock() + } + + let filter_pass_all = |_key: &Key, _lock: &PessimisticLock| true; + let filter_pass_key2 = + |key: &Key, _lock: &PessimisticLock| key.as_encoded().starts_with(b"key2"); + + // Case parameter: start_key, end_key, filter, limit, expected results, expected + // has more. + type LockFilter = fn(&Key, &PessimisticLock) -> bool; + let cases: [( + Option, + Option, + LockFilter, + usize, + Vec<(Key, Lock)>, + bool, + ); 12] = [ + ( + None, + None, + filter_pass_all, + 1, + vec![(Key::from_raw(b"key1"), txn_lock(b"key1", false))], + true, + ), + ( + None, + None, + filter_pass_all, + 10, + vec![ + (Key::from_raw(b"key1"), txn_lock(b"key1", false)), + (Key::from_raw(b"key2"), txn_lock(b"key2", false)), + (Key::from_raw(b"key3"), txn_lock(b"key3", false)), + ], + false, + ), + ( + Some(Key::from_raw(b"key0")), + Some(Key::from_raw(b"key1")), + filter_pass_all, + 10, + vec![], + false, + ), + ( + Some(Key::from_raw(b"key0")), + Some(Key::from_raw(b"key2")), + filter_pass_all, + 10, + vec![(Key::from_raw(b"key1"), txn_lock(b"key1", false))], + false, + ), + ( + Some(Key::from_raw(b"key1")), + Some(Key::from_raw(b"key3")), + filter_pass_all, + 10, + vec![ + (Key::from_raw(b"key1"), txn_lock(b"key1", false)), + (Key::from_raw(b"key2"), txn_lock(b"key2", false)), + ], + false, + ), + ( + Some(Key::from_raw(b"key1")), + Some(Key::from_raw(b"key4")), + filter_pass_all, + 2, + vec![ + (Key::from_raw(b"key1"), txn_lock(b"key1", false)), + (Key::from_raw(b"key2"), txn_lock(b"key2", false)), + ], + true, + ), + ( + Some(Key::from_raw(b"key1")), + Some(Key::from_raw(b"key4")), + filter_pass_all, + 10, + vec![ + (Key::from_raw(b"key1"), txn_lock(b"key1", false)), + 
(Key::from_raw(b"key2"), txn_lock(b"key2", false)), + (Key::from_raw(b"key3"), txn_lock(b"key3", false)), + ], + false, + ), + ( + Some(Key::from_raw(b"key2")), + Some(Key::from_raw(b"key4")), + filter_pass_all, + 10, + vec![ + (Key::from_raw(b"key2"), txn_lock(b"key2", false)), + (Key::from_raw(b"key3"), txn_lock(b"key3", false)), + ], + false, + ), + ( + Some(Key::from_raw(b"key4")), + Some(Key::from_raw(b"key4")), + filter_pass_all, + 10, + vec![], + false, + ), + ( + None, + None, + filter_pass_key2, + 10, + vec![(Key::from_raw(b"key2"), txn_lock(b"key2", false))], + false, + ), + ( + Some(Key::from_raw(b"key2")), + None, + filter_pass_key2, + 1, + vec![(Key::from_raw(b"key2"), txn_lock(b"key2", false))], + true, + ), + ( + None, + Some(Key::from_raw(b"key2")), + filter_pass_key2, + 1, + vec![], + false, + ), + ]; + + for (start_key, end_key, filter, limit, expected_locks, expected_has_more) in cases { + let (locks, has_more) = + peer_locks.scan_locks(start_key.as_ref(), end_key.as_ref(), filter, limit); + assert_eq!(locks, expected_locks); + assert_eq!(has_more, expected_has_more); + } + } } diff --git a/components/raftstore/src/store/unsafe_recovery.rs b/components/raftstore/src/store/unsafe_recovery.rs index f98fcaea581..28943ae7339 100644 --- a/components/raftstore/src/store/unsafe_recovery.rs +++ b/components/raftstore/src/store/unsafe_recovery.rs @@ -241,7 +241,7 @@ pub struct UnsafeRecoveryForceLeaderSyncer(Arc); impl UnsafeRecoveryForceLeaderSyncer { pub fn new(report_id: u64, router: Arc) -> Self { let inner = InvokeClosureOnDrop(Some(Box::new(move || { - info!("Unsafe recovery, force leader finished."); + info!("Unsafe recovery, force leader finished."; "report_id" => report_id); start_unsafe_recovery_report(router, report_id, false); }))); UnsafeRecoveryForceLeaderSyncer(Arc::new(inner)) @@ -260,11 +260,11 @@ impl UnsafeRecoveryExecutePlanSyncer { let abort = Arc::new(Mutex::new(false)); let abort_clone = abort.clone(); let closure = 
InvokeClosureOnDrop(Some(Box::new(move || { - info!("Unsafe recovery, plan execution finished"); if *abort_clone.lock().unwrap() { - warn!("Unsafe recovery, plan execution aborted"); + warn!("Unsafe recovery, plan execution aborted"; "report_id" => report_id); return; } + info!("Unsafe recovery, plan execution finished"; "report_id" => report_id); start_unsafe_recovery_report(router, report_id, true); }))); UnsafeRecoveryExecutePlanSyncer { @@ -330,7 +330,7 @@ impl UnsafeRecoveryWaitApplySyncer { let abort_clone = abort.clone(); let closure = InvokeClosureOnDrop(Some(Box::new(move || { if *abort_clone.lock().unwrap() { - warn!("Unsafe recovery, wait apply aborted"); + warn!("Unsafe recovery, wait apply aborted"; "report_id" => report_id); return; } info!("Unsafe recovery, wait apply finished"); @@ -363,7 +363,7 @@ impl UnsafeRecoveryFillOutReportSyncer { let reports = Arc::new(Mutex::new(vec![])); let reports_clone = reports.clone(); let closure = InvokeClosureOnDrop(Some(Box::new(move || { - info!("Unsafe recovery, peer reports collected"); + info!("Unsafe recovery, peer reports collected"; "report_id" => report_id); let mut store_report = StoreReport::default(); { let mut reports_ptr = reports_clone.lock().unwrap(); @@ -420,6 +420,9 @@ pub enum UnsafeRecoveryState { }, Destroy(UnsafeRecoveryExecutePlanSyncer), WaitInitialize(UnsafeRecoveryExecutePlanSyncer), + // DemoteFailedVoter may fail due to some reasons. It's just a marker to avoid exiting force + // leader state + Failed, } impl UnsafeRecoveryState { @@ -429,6 +432,7 @@ impl UnsafeRecoveryState { UnsafeRecoveryState::DemoteFailedVoters { syncer, .. } | UnsafeRecoveryState::Destroy(syncer) | UnsafeRecoveryState::WaitInitialize(syncer) => syncer.time, + UnsafeRecoveryState::Failed => return false, }; time.saturating_elapsed() >= timeout } @@ -439,6 +443,7 @@ impl UnsafeRecoveryState { UnsafeRecoveryState::DemoteFailedVoters { syncer, .. 
} | UnsafeRecoveryState::Destroy(syncer) | UnsafeRecoveryState::WaitInitialize(syncer) => &syncer.abort, + UnsafeRecoveryState::Failed => return true, }; *abort.lock().unwrap() } @@ -449,6 +454,7 @@ impl UnsafeRecoveryState { UnsafeRecoveryState::DemoteFailedVoters { syncer, .. } | UnsafeRecoveryState::Destroy(syncer) | UnsafeRecoveryState::WaitInitialize(syncer) => syncer.abort(), + UnsafeRecoveryState::Failed => (), } } } diff --git a/components/raftstore/src/store/util.rs b/components/raftstore/src/store/util.rs index 880a394fdae..6eef4c61686 100644 --- a/components/raftstore/src/store/util.rs +++ b/components/raftstore/src/store/util.rs @@ -160,6 +160,20 @@ pub fn new_empty_snapshot( snapshot } +pub fn gen_bucket_version(term: u64, current_version: u64) -> u64 { + // term logical counter + // |-----------|-----------| + // high bits low bits + // term: given 10s election timeout, the 32 bit means 1362 year running time + let current_version_term = current_version >> 32; + let bucket_version: u64 = if current_version_term == term { + current_version + 1 + } else { + term << 32 + }; + bucket_version +} + const STR_CONF_CHANGE_ADD_NODE: &str = "AddNode"; const STR_CONF_CHANGE_REMOVE_NODE: &str = "RemoveNode"; const STR_CONF_CHANGE_ADDLEARNER_NODE: &str = "AddLearner"; @@ -306,7 +320,7 @@ pub fn compare_region_epoch( // tells TiDB with a epoch not match error contains the latest target Region // info, TiDB updates its region cache and sends requests to TiKV B, // and TiKV B has not applied commit merge yet, since the region epoch in - // request is higher than TiKV B, the request must be denied due to epoch + // request is higher than TiKV B, the request must be suspended due to epoch // not match, so it does not read on a stale snapshot, thus avoid the // KeyNotInRegion error. 
let current_epoch = region.get_region_epoch(); @@ -1179,14 +1193,15 @@ impl RegionReadProgressRegistry { } // Get the minimum `resolved_ts` which could ensure that there will be no more - // locks whose `start_ts` is greater than it. + // locks whose `commit_ts` is smaller than it. pub fn get_min_resolved_ts(&self) -> u64 { self.registry .lock() .unwrap() .iter() .map(|(_, rrp)| rrp.resolved_ts()) - .filter(|ts| *ts != 0) // ts == 0 means the peer is uninitialized + // TODO: the uninitialized peer should be taken into consideration instead of skipping it (https://github.com/tikv/tikv/issues/15506). + .filter(|ts| *ts != 0) // ts == 0 means the peer is uninitialized .min() .unwrap_or(0) } @@ -1439,7 +1454,6 @@ impl RegionReadProgress { self.safe_ts() } - // Dump the `LeaderInfo` and the peer list pub fn get_core(&self) -> MutexGuard<'_, RegionReadProgressCore> { self.core.lock().unwrap() } @@ -1720,13 +1734,38 @@ pub struct RaftstoreDuration { } impl RaftstoreDuration { + #[inline] pub fn sum(&self) -> std::time::Duration { - self.store_wait_duration.unwrap_or_default() - + self.store_process_duration.unwrap_or_default() + self.delays_on_disk_io(true) + self.delays_on_net_io() + } + + #[inline] + /// Returns the delayed duration on Disk I/O. + pub fn delays_on_disk_io(&self, include_wait_duration: bool) -> std::time::Duration { + let duration = self.store_process_duration.unwrap_or_default() + self.store_write_duration.unwrap_or_default() - + self.store_commit_duration.unwrap_or_default() - + self.apply_wait_duration.unwrap_or_default() - + self.apply_process_duration.unwrap_or_default() + + self.apply_process_duration.unwrap_or_default(); + if include_wait_duration { + duration + + self.store_wait_duration.unwrap_or_default() + + self.apply_wait_duration.unwrap_or_default() + } else { + duration + } + } + + #[inline] + /// Returns the delayed duration on Network I/O. + /// + /// Normally, it can be reflected by the duration of + /// `store_commit_duration`. 
+ pub fn delays_on_net_io(&self) -> std::time::Duration { + // The `store_commit_duration` serves as an indicator for latency + // during the duration of transferring Raft logs to peers and appending + // logs. In most scenarios, instances of latency fluctuations in the + // network are reflected by this duration. Hence, it is selected as a + // representative of network latency. + self.store_commit_duration.unwrap_or_default() } } diff --git a/components/raftstore/src/store/worker/cleanup.rs b/components/raftstore/src/store/worker/cleanup.rs index 632e85f40cc..da2f004f47c 100644 --- a/components/raftstore/src/store/worker/cleanup.rs +++ b/components/raftstore/src/store/worker/cleanup.rs @@ -3,7 +3,6 @@ use std::fmt::{self, Display, Formatter}; use engine_traits::{KvEngine, RaftEngine}; -use pd_client::PdClient; use tikv_util::worker::Runnable; use super::{ @@ -11,7 +10,6 @@ use super::{ cleanup_sst::{Runner as CleanupSstRunner, Task as CleanupSstTask}, compact::{Runner as CompactRunner, Task as CompactTask}, }; -use crate::store::StoreRouter; pub enum Task { Compact(CompactTask), @@ -29,29 +27,26 @@ impl Display for Task { } } -pub struct Runner +pub struct Runner where E: KvEngine, R: RaftEngine, - S: StoreRouter, { compact: CompactRunner, - cleanup_sst: CleanupSstRunner, + cleanup_sst: CleanupSstRunner, gc_snapshot: GcSnapshotRunner, } -impl Runner +impl Runner where E: KvEngine, R: RaftEngine, - C: PdClient, - S: StoreRouter, { pub fn new( compact: CompactRunner, - cleanup_sst: CleanupSstRunner, + cleanup_sst: CleanupSstRunner, gc_snapshot: GcSnapshotRunner, - ) -> Runner { + ) -> Runner { Runner { compact, cleanup_sst, @@ -60,12 +55,10 @@ where } } -impl Runnable for Runner +impl Runnable for Runner where E: KvEngine, R: RaftEngine, - C: PdClient, - S: StoreRouter, { type Task = Task; diff --git a/components/raftstore/src/store/worker/cleanup_sst.rs b/components/raftstore/src/store/worker/cleanup_sst.rs index 8174b872f4b..ca139a562a2 100644 --- 
a/components/raftstore/src/store/worker/cleanup_sst.rs +++ b/components/raftstore/src/store/worker/cleanup_sst.rs @@ -1,62 +1,31 @@ // Copyright 2018 TiKV Project Authors. Licensed under Apache-2.0. -use std::{error::Error, fmt, marker::PhantomData, sync::Arc}; +use std::{fmt, sync::Arc}; use engine_traits::KvEngine; -use kvproto::{import_sstpb::SstMeta, metapb::Region}; -use pd_client::PdClient; +use kvproto::import_sstpb::SstMeta; use sst_importer::SstImporter; -use tikv_util::{error, worker::Runnable}; - -use crate::store::{util::is_epoch_stale, StoreMsg, StoreRouter}; - -type Result = std::result::Result>; +use tikv_util::worker::Runnable; pub enum Task { DeleteSst { ssts: Vec }, - ValidateSst { ssts: Vec }, } impl fmt::Display for Task { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { match *self { Task::DeleteSst { ref ssts } => write!(f, "Delete {} ssts", ssts.len()), - Task::ValidateSst { ref ssts } => write!(f, "Validate {} ssts", ssts.len()), } } } -pub struct Runner -where - EK: KvEngine, - S: StoreRouter, -{ - store_id: u64, - store_router: S, - importer: Arc, - pd_client: Arc, - _engine: PhantomData, +pub struct Runner { + importer: Arc>, } -impl Runner -where - EK: KvEngine, - C: PdClient, - S: StoreRouter, -{ - pub fn new( - store_id: u64, - store_router: S, - importer: Arc, - pd_client: Arc, - ) -> Runner { - Runner { - store_id, - store_router, - importer, - pd_client, - _engine: PhantomData, - } +impl Runner { + pub fn new(importer: Arc>) -> Self { + Runner { importer } } /// Deletes SST files from the importer. @@ -65,78 +34,9 @@ where let _ = self.importer.delete(sst); } } - - fn get_region_by_meta(&self, sst: &SstMeta) -> Result { - // The SST meta has been delivered with a range, use it directly. - // For now, no case will reach this. But this still could be a guard for - // reducing the superise in the future... 
- if !sst.get_range().get_start().is_empty() || !sst.get_range().get_end().is_empty() { - return self - .pd_client - .get_region(sst.get_range().get_start()) - .map_err(Into::into); - } - // Once there isn't range provided. - let query_by_start_key_of_full_meta = || { - let start_key = self - .importer - .load_start_key_by_meta::(sst)? - .ok_or_else(|| -> Box { - "failed to load start key from sst, the sst might be empty".into() - })?; - let region = self.pd_client.get_region(&start_key)?; - Result::Ok(region) - }; - query_by_start_key_of_full_meta() - .map_err(|err| - format!("failed to load full sst meta from disk for {:?} and there isn't extra information provided: {err}", sst.get_uuid()).into() - ) - } - - /// Validates whether the SST is stale or not. - fn handle_validate_sst(&self, ssts: Vec) { - let store_id = self.store_id; - let mut invalid_ssts = Vec::new(); - for sst in ssts { - match self.get_region_by_meta(&sst) { - Ok(r) => { - // The region id may or may not be the same as the - // SST file, but it doesn't matter, because the - // epoch of a range will not decrease anyway. - if is_epoch_stale(r.get_region_epoch(), sst.get_region_epoch()) { - // Region has not been updated. - continue; - } - if r.get_id() == sst.get_region_id() - && r.get_peers().iter().any(|p| p.get_store_id() == store_id) - { - // The SST still belongs to this store. - continue; - } - invalid_ssts.push(sst); - } - Err(e) => { - error!("get region failed"; "err" => %e); - } - } - } - - // We need to send back the result to check for the stale - // peer, which may ingest the stale SST before it is - // destroyed. 
- let msg = StoreMsg::ValidateSstResult { invalid_ssts }; - if let Err(e) = self.store_router.send(msg) { - error!(%e; "send validate sst result failed"); - } - } } -impl Runnable for Runner -where - EK: KvEngine, - C: PdClient, - S: StoreRouter, -{ +impl Runnable for Runner { type Task = Task; fn run(&mut self, task: Task) { @@ -144,9 +44,6 @@ where Task::DeleteSst { ssts } => { self.handle_delete_sst(ssts); } - Task::ValidateSst { ssts } => { - self.handle_validate_sst(ssts); - } } } } diff --git a/components/raftstore/src/store/worker/compact.rs b/components/raftstore/src/store/worker/compact.rs index 3b2a2ec0404..45fd7e586e7 100644 --- a/components/raftstore/src/store/worker/compact.rs +++ b/components/raftstore/src/store/worker/compact.rs @@ -4,18 +4,34 @@ use std::{ collections::VecDeque, error::Error as StdError, fmt::{self, Display, Formatter}, + sync::atomic::{AtomicBool, Ordering}, + time::Duration, }; use engine_traits::{KvEngine, RangeStats, CF_WRITE}; use fail::fail_point; +use futures_util::compat::Future01CompatExt; use thiserror::Error; -use tikv_util::{box_try, error, info, time::Instant, warn, worker::Runnable}; +use tikv_util::{ + box_try, debug, error, info, time::Instant, timer::GLOBAL_TIMER_HANDLE, warn, worker::Runnable, +}; +use yatp::Remote; -use super::metrics::COMPACT_RANGE_CF; +use super::metrics::{ + COMPACT_RANGE_CF, FULL_COMPACT, FULL_COMPACT_INCREMENTAL, FULL_COMPACT_PAUSE, +}; type Key = Vec; +static FULL_COMPACTION_IN_PROCESS: AtomicBool = AtomicBool::new(false); + pub enum Task { + PeriodicFullCompact { + // Ranges, or empty if we wish to compact the entire store + ranges: Vec<(Key, Key)>, + compact_load_controller: FullCompactController, + }, + Compact { cf_name: String, start_key: Option, // None means smallest key @@ -32,6 +48,65 @@ pub enum Task { }, } +type CompactPredicateFn = Box bool + Send + Sync>; + +pub struct FullCompactController { + /// Initial delay between retries for ``FullCompactController::pause``. 
+ pub initial_pause_duration_secs: u64, + /// Max delay between retries. + pub max_pause_duration_secs: u64, + /// Predicate function to evaluate that indicates if we can proceed with + /// full compaction. + pub incremental_compaction_pred: CompactPredicateFn, +} + +impl fmt::Debug for FullCompactController { + fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { + f.debug_struct("FullCompactController") + .field( + "initial_pause_duration_secs", + &self.initial_pause_duration_secs, + ) + .field("max_pause_duration_secs", &self.max_pause_duration_secs) + .finish() + } +} +impl FullCompactController { + pub fn new( + initial_pause_duration_secs: u64, + max_pause_duration_secs: u64, + incremental_compaction_pred: CompactPredicateFn, + ) -> Self { + Self { + initial_pause_duration_secs, + max_pause_duration_secs, + incremental_compaction_pred, + } + } + + /// Pause until `incremental_compaction_pred` evaluates to `true`: delay + /// using exponential backoff (initial value + /// `initial_pause_duration_secs`, max value `max_pause_duration_secs`) + /// between retries. 
+ pub async fn pause(&self) -> Result<(), Error> { + let mut duration_secs = self.initial_pause_duration_secs; + loop { + box_try!( + GLOBAL_TIMER_HANDLE + .delay(std::time::Instant::now() + Duration::from_secs(duration_secs)) + .compat() + .await + ); + if (self.incremental_compaction_pred)() { + break; + }; + duration_secs = self.max_pause_duration_secs.min(duration_secs.saturating_mul(2)); + } + Ok(()) + } +} + +#[derive(Debug)] pub struct CompactThreshold { pub tombstones_num_threshold: u64, pub tombstones_percent_threshold: u64, @@ -58,6 +133,24 @@ impl CompactThreshold { impl Display for Task { fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { match *self { + Task::PeriodicFullCompact { + ref ranges, + ref compact_load_controller, + } => f + .debug_struct("PeriodicFullCompact") + .field( + "ranges", + &( + ranges + .first() + .map(|k| log_wrappers::Value::key(k.0.as_slice())), + ranges + .last() + .map(|k| log_wrappers::Value::key(k.1.as_slice())), + ), + ) + .field("compact_load_controller", compact_load_controller) + .finish(), Task::Compact { ref cf_name, ref start_key, @@ -117,14 +210,89 @@ pub enum Error { pub struct Runner { engine: E, + remote: Remote, } impl Runner where E: KvEngine, { - pub fn new(engine: E) -> Runner { - Runner { engine } + pub fn new(engine: E, remote: Remote) -> Runner { + Runner { engine, remote } + } + + /// Periodic full compaction. + /// Note: this does not accept a `&self` due to async lifetime issues. + /// + /// NOTE this is an experimental feature! + /// + /// TODO: Support stopping a full compaction. 
+ async fn full_compact( + engine: E, + ranges: Vec<(Key, Key)>, + compact_controller: FullCompactController, + ) -> Result<(), Error> { + fail_point!("on_full_compact"); + info!("full compaction started"); + let mut ranges: VecDeque<_> = ranges + .iter() + .map(|(start, end)| (Some(start.as_slice()), Some(end.as_slice()))) + .collect(); + if ranges.is_empty() { + ranges.push_front((None, None)) + } + + let timer = Instant::now(); + let full_compact_timer = FULL_COMPACT.start_coarse_timer(); + + while let Some(range) = ranges.pop_front() { + debug!( + "incremental range full compaction started"; + "start_key" => ?range.0.map(log_wrappers::Value::key), + "end_key" => ?range.1.map(log_wrappers::Value::key), + ); + let incremental_timer = FULL_COMPACT_INCREMENTAL.start_coarse_timer(); + box_try!(engine.compact_range( + range.0, range.1, // Key range of this increment. + false, // non-exclusive + 1, // number of threads + )); + incremental_timer.observe_duration(); + debug!( + "finished incremental range full compaction"; + "remaining" => ranges.len(), + ); + // If there is at least one range remaining in `ranges`, evaluate + // `compact_controller.incremental_compaction_pred`. If `true`, proceed to next + // range; otherwise, pause this task + // (see `FullCompactController::pause` for details) until predicate + // evaluates to true. 
+ if let Some(next_range) = ranges.front() { + if !(compact_controller.incremental_compaction_pred)() { + info!("pausing full compaction before next increment"; + "finished_start_key" => ?range.0.map(log_wrappers::Value::key), + "finished_end_key" => ?range.1.map(log_wrappers::Value::key), + "next_range_start_key" => ?next_range.0.map(log_wrappers::Value::key), + "next_range_end_key" => ?next_range.1.map(log_wrappers::Value::key), + "remaining" => ranges.len(), + ); + let pause_started = Instant::now(); + let pause_timer = FULL_COMPACT_PAUSE.start_coarse_timer(); + compact_controller.pause().await?; + pause_timer.observe_duration(); + info!("resuming incremental full compaction"; + "paused" => ?pause_started.saturating_elapsed(), + ); + } + } + } + + full_compact_timer.observe_duration(); + info!( + "full compaction finished"; + "time_takes" => ?timer.saturating_elapsed(), + ); + Ok(()) } /// Sends a compact range command to RocksDB to compact the range of the cf. @@ -163,6 +331,29 @@ where fn run(&mut self, task: Task) { match task { + Task::PeriodicFullCompact { + ranges, + compact_load_controller, + } => { + // Since periodic full compaction is submitted as a task to the background + // worker pool, verify we will not start full compaction if + // another full compaction is running in the background. 
+ if FULL_COMPACTION_IN_PROCESS.load(Ordering::SeqCst) + || FULL_COMPACTION_IN_PROCESS.swap(true, Ordering::SeqCst) + { + info!("full compaction is already in process, not starting"); + return; + }; + let engine = self.engine.clone(); + self.remote.spawn(async move { + if let Err(e) = + Self::full_compact(engine, ranges, compact_load_controller).await + { + error!("periodic full compaction failed"; "err" => %e); + } + FULL_COMPACTION_IN_PROCESS.store(false, Ordering::SeqCst); + }); + } Task::Compact { cf_name, start_key, @@ -282,10 +473,19 @@ mod tests { }; use keys::data_key; use tempfile::Builder; + use tikv_util::yatp_pool::{DefaultTicker, FuturePool, YatpPoolBuilder}; use txn_types::{Key, TimeStamp, Write, WriteType}; use super::*; + fn make_compact_runner(engine: E) -> (FuturePool, Runner) + where + E: KvEngine, + { + let pool = YatpPoolBuilder::new(DefaultTicker::default()).build_future_pool(); + (pool.clone(), Runner::new(engine, pool.remote().clone())) + } + #[test] fn test_compact_range() { let path = Builder::new() @@ -293,8 +493,7 @@ mod tests { .tempdir() .unwrap(); let db = new_engine(path.path().to_str().unwrap(), &[CF_DEFAULT]).unwrap(); - - let mut runner = Runner::new(db.clone()); + let (_pool, mut runner) = make_compact_runner(db.clone()); // Generate the first SST file. 
let mut wb = db.write_batch(); @@ -456,4 +655,105 @@ mod tests { .unwrap(); assert_eq!(ranges_need_to_compact, expected_ranges); } + + #[test] + fn test_full_compact_deletes() { + let tmp_dir = Builder::new().prefix("test").tempdir().unwrap(); + let engine = open_db(tmp_dir.path().to_str().unwrap()); + let (_pool, mut runner) = make_compact_runner(engine.clone()); + + // mvcc_put 0..5 + for i in 0..5 { + let (k, v) = (format!("k{}", i), format!("value{}", i)); + mvcc_put(&engine, k.as_bytes(), v.as_bytes(), 1.into(), 2.into()); + } + engine.flush_cf(CF_WRITE, true).unwrap(); + + let (start, end) = (data_key(b"k0"), data_key(b"k5")); + let stats = engine + .get_range_stats(CF_WRITE, &start, &end) + .unwrap() + .unwrap(); + assert_eq!(stats.num_entries, stats.num_versions); + + for i in 0..5 { + let k = format!("k{}", i); + delete(&engine, k.as_bytes(), 3.into()); + } + engine.flush_cf(CF_WRITE, true).unwrap(); + + let stats = engine + .get_range_stats(CF_WRITE, &start, &end) + .unwrap() + .unwrap(); + assert_eq!(stats.num_entries - stats.num_versions, 5); + + runner.run(Task::PeriodicFullCompact { + ranges: Vec::new(), + compact_load_controller: FullCompactController::new(0, 0, Box::new(|| true)), + }); + std::thread::sleep(Duration::from_millis(500)); + let stats = engine + .get_range_stats(CF_WRITE, &start, &end) + .unwrap() + .unwrap(); + assert_eq!(stats.num_entries - stats.num_versions, 0); + } + + #[test] + fn test_full_compact_incremental_pausable() { + let tmp_dir = Builder::new().prefix("test").tempdir().unwrap(); + let engine = open_db(tmp_dir.path().to_str().unwrap()); + let (_pool, mut runner) = make_compact_runner(engine.clone()); + + // mvcc_put 0..100 + for i in 0..100 { + let (k, v) = (format!("k{}", i), format!("value{}", i)); + mvcc_put(&engine, k.as_bytes(), v.as_bytes(), 1.into(), 2.into()); + } + engine.flush_cf(CF_WRITE, true).unwrap(); + + let (start, end) = (data_key(b"k0"), data_key(b"k5")); + let stats = engine + .get_range_stats(CF_WRITE, 
&start, &end) + .unwrap() + .unwrap(); + assert_eq!(stats.num_entries, stats.num_versions); + + for i in 0..100 { + let k = format!("k{}", i); + delete(&engine, k.as_bytes(), 3.into()); + } + engine.flush_cf(CF_WRITE, true).unwrap(); + + let stats = engine + .get_range_stats(CF_WRITE, &start, &end) + .unwrap() + .unwrap(); + assert_eq!(stats.num_entries - stats.num_versions, 100); + + let started_at = Instant::now(); + let pred_fn: CompactPredicateFn = + Box::new(move || Instant::now() - started_at > Duration::from_millis(500)); + let ranges = vec![ + (data_key(b"k0"), data_key(b"k25")), + (data_key(b"k25"), data_key(b"k50")), + (data_key(b"k50"), data_key(b"k100")), + ]; + runner.run(Task::PeriodicFullCompact { + ranges, + compact_load_controller: FullCompactController::new(1, 5, pred_fn), + }); + let stats = engine + .get_range_stats(CF_WRITE, &start, &end) + .unwrap() + .unwrap(); + assert_eq!(stats.num_entries - stats.num_versions, 100); + std::thread::sleep(Duration::from_secs(2)); + let stats = engine + .get_range_stats(CF_WRITE, &start, &end) + .unwrap() + .unwrap(); + assert_eq!(stats.num_entries - stats.num_versions, 0); + } } diff --git a/components/raftstore/src/store/worker/consistency_check.rs b/components/raftstore/src/store/worker/consistency_check.rs index fef2bae332c..d034cd8604f 100644 --- a/components/raftstore/src/store/worker/consistency_check.rs +++ b/components/raftstore/src/store/worker/consistency_check.rs @@ -162,7 +162,7 @@ mod tests { index: 10, context: vec![ConsistencyCheckMethod::Raw as u8], region: region.clone(), - snap: db.snapshot(), + snap: db.snapshot(None), }); let mut checksum_bytes = vec![]; checksum_bytes.write_u32::(sum).unwrap(); diff --git a/components/raftstore/src/store/worker/metrics.rs b/components/raftstore/src/store/worker/metrics.rs index fd3f54d239d..2b10bc3e053 100644 --- a/components/raftstore/src/store/worker/metrics.rs +++ b/components/raftstore/src/store/worker/metrics.rs @@ -70,6 +70,8 @@ make_static_metric! 
{ pub struct LocalReadMetrics { pub local_executed_requests: LocalIntCounter, pub local_executed_stale_read_requests: LocalIntCounter, + pub local_executed_stale_read_fallback_success_requests: LocalIntCounter, + pub local_executed_stale_read_fallback_failure_requests: LocalIntCounter, pub local_executed_replica_read_requests: LocalIntCounter, pub local_executed_snapshot_cache_hit: LocalIntCounter, pub reject_reason: LocalReadRejectCounter, @@ -82,6 +84,8 @@ thread_local! { LocalReadMetrics { local_executed_requests: LOCAL_READ_EXECUTED_REQUESTS.local(), local_executed_stale_read_requests: LOCAL_READ_EXECUTED_STALE_READ_REQUESTS.local(), + local_executed_stale_read_fallback_success_requests: LOCAL_READ_EXECUTED_STALE_READ_FALLBACK_SUCCESS_REQUESTS.local(), + local_executed_stale_read_fallback_failure_requests: LOCAL_READ_EXECUTED_STALE_READ_FALLBACK_FAILURE_REQUESTS.local(), local_executed_replica_read_requests: LOCAL_READ_EXECUTED_REPLICA_READ_REQUESTS.local(), local_executed_snapshot_cache_hit: LOCAL_READ_EXECUTED_CACHE_REQUESTS.local(), reject_reason: LocalReadRejectCounter::from(&LOCAL_READ_REJECT_VEC), @@ -100,6 +104,10 @@ pub fn maybe_tls_local_read_metrics_flush() { if m.last_flush_time.saturating_elapsed() >= Duration::from_millis(METRICS_FLUSH_INTERVAL) { m.local_executed_requests.flush(); m.local_executed_stale_read_requests.flush(); + m.local_executed_stale_read_fallback_success_requests + .flush(); + m.local_executed_stale_read_fallback_failure_requests + .flush(); m.local_executed_replica_read_requests.flush(); m.local_executed_snapshot_cache_hit.flush(); m.reject_reason.flush(); @@ -152,6 +160,26 @@ lazy_static! { &["cf"] ) .unwrap(); + pub static ref FULL_COMPACT: Histogram = register_histogram!( + "tikv_storage_full_compact_duration_seconds", + "Bucketed histogram of full compaction for the storage." 
+ ) + .unwrap(); + pub static ref FULL_COMPACT_INCREMENTAL: Histogram = register_histogram!( + "tikv_storage_full_compact_increment_duration_seconds", + "Bucketed histogram of full compaction increments for the storage." + ) + .unwrap(); + pub static ref FULL_COMPACT_PAUSE: Histogram = register_histogram!( + "tikv_storage_full_compact_pause_duration_seconds", + "Bucketed histogram of full compaction pauses for the storage." + ) + .unwrap(); + pub static ref PROCESS_STAT_CPU_USAGE: Gauge = register_gauge!( + "tikv_storage_process_stat_cpu_usage", + "CPU usage measured over a 30 second window", + ) + .unwrap(); pub static ref REGION_HASH_HISTOGRAM: Histogram = register_histogram!( "tikv_raftstore_hash_duration_seconds", "Bucketed histogram of raftstore hash computation duration" @@ -189,6 +217,18 @@ lazy_static! { "Total number of stale read requests directly executed by local reader." ) .unwrap(); + pub static ref LOCAL_READ_EXECUTED_STALE_READ_FALLBACK_SUCCESS_REQUESTS: IntCounter = + register_int_counter!( + "tikv_raftstore_local_read_executed_stale_read_fallback_success_requests", + "Total number of stale read requests executed by local leader peer as snapshot read." + ) + .unwrap(); + pub static ref LOCAL_READ_EXECUTED_STALE_READ_FALLBACK_FAILURE_REQUESTS: IntCounter = + register_int_counter!( + "tikv_raftstore_local_read_executed_stale_read_fallback_failure_requests", + "Total number of stale read requests failed to be executed by local leader peer as snapshot read." + ) + .unwrap(); pub static ref LOCAL_READ_EXECUTED_REPLICA_READ_REQUESTS: IntCounter = register_int_counter!( "tikv_raftstore_local_read_executed_replica_read_requests", "Total number of stale read requests directly executed by local reader." 
diff --git a/components/raftstore/src/store/worker/mod.rs b/components/raftstore/src/store/worker/mod.rs index e79f37a4bc4..c47461d62ff 100644 --- a/components/raftstore/src/store/worker/mod.rs +++ b/components/raftstore/src/store/worker/mod.rs @@ -23,7 +23,10 @@ pub use self::{ cleanup::{Runner as CleanupRunner, Task as CleanupTask}, cleanup_snapshot::{Runner as GcSnapshotRunner, Task as GcSnapshotTask}, cleanup_sst::{Runner as CleanupSstRunner, Task as CleanupSstTask}, - compact::{need_compact, CompactThreshold, Runner as CompactRunner, Task as CompactTask}, + compact::{ + need_compact, CompactThreshold, FullCompactController, Runner as CompactRunner, + Task as CompactTask, + }, consistency_check::{Runner as ConsistencyCheckRunner, Task as ConsistencyCheckTask}, pd::{ new_change_peer_v2_request, FlowStatistics, FlowStatsReporter, HeartbeatTask, @@ -42,7 +45,8 @@ pub use self::{ }, region::{Runner as RegionRunner, Task as RegionTask}, split_check::{ - Bucket, BucketRange, KeyEntry, Runner as SplitCheckRunner, Task as SplitCheckTask, + Bucket, BucketRange, BucketStatsInfo, KeyEntry, Runner as SplitCheckRunner, + Task as SplitCheckTask, }, split_config::{ SplitConfig, SplitConfigManager, BIG_REGION_CPU_OVERLOAD_THRESHOLD_RATIO, diff --git a/components/raftstore/src/store/worker/pd.rs b/components/raftstore/src/store/worker/pd.rs index d812830569a..71ab6a9e2a9 100644 --- a/components/raftstore/src/store/worker/pd.rs +++ b/components/raftstore/src/store/worker/pd.rs @@ -41,7 +41,7 @@ use tikv_util::{ box_err, debug, error, info, metrics::ThreadInfoStatistics, store::QueryStats, - sys::thread::StdThreadBuildWrapper, + sys::{thread::StdThreadBuildWrapper, SysQuota}, thd_name, time::{Instant as TiInstant, UnixSecs}, timer::GLOBAL_TIMER_HANDLE, @@ -68,7 +68,7 @@ use crate::{ AutoSplitController, ReadStats, SplitConfigChange, WriteStats, }, Callback, CasualMessage, Config, PeerMsg, RaftCmdExtraOpts, RaftCommand, RaftRouter, - RegionReadProgressRegistry, SnapManager, 
StoreInfo, StoreMsg, TxnExt, + SnapManager, StoreInfo, StoreMsg, TxnExt, }, }; @@ -141,6 +141,7 @@ where peer: metapb::Peer, // If true, right Region derives origin region_id. right_derive: bool, + share_source_region_size: bool, callback: Callback, }, AskBatchSplit { @@ -149,6 +150,7 @@ where peer: metapb::Peer, // If true, right Region derives origin region_id. right_derive: bool, + share_source_region_size: bool, callback: Callback, }, AutoSplit { @@ -223,6 +225,9 @@ pub struct StoreStat { pub store_cpu_usages: RecordPairVec, pub store_read_io_rates: RecordPairVec, pub store_write_io_rates: RecordPairVec, + + store_cpu_quota: f64, // quota of cpu usage + store_cpu_busy_thd: f64, } impl Default for StoreStat { @@ -247,10 +252,33 @@ impl Default for StoreStat { store_cpu_usages: RecordPairVec::default(), store_read_io_rates: RecordPairVec::default(), store_write_io_rates: RecordPairVec::default(), + + store_cpu_quota: 0.0_f64, + store_cpu_busy_thd: 0.8_f64, } } } +impl StoreStat { + fn set_cpu_quota(&mut self, cpu_cores: f64, busy_thd: f64) { + self.store_cpu_quota = cpu_cores * 100.0; + self.store_cpu_busy_thd = busy_thd; + } + + fn maybe_busy(&self) -> bool { + if self.store_cpu_quota < 1.0 || self.store_cpu_busy_thd > 1.0 { + return false; + } + + let mut cpu_usage = 0_u64; + for record in self.store_cpu_usages.iter() { + cpu_usage += record.get_value(); + } + + (cpu_usage as f64 / self.store_cpu_quota) >= self.store_cpu_busy_thd + } +} + #[derive(Default)] pub struct PeerStat { pub read_bytes: u64, @@ -448,16 +476,6 @@ fn default_collect_tick_interval() -> Duration { DEFAULT_COLLECT_TICK_INTERVAL } -fn config(interval: Duration) -> Duration { - fail_point!("mock_min_resolved_ts_interval", |_| { - Duration::from_millis(50) - }); - fail_point!("mock_min_resolved_ts_interval_disable", |_| { - Duration::from_millis(0) - }); - interval -} - #[inline] fn convert_record_pairs(m: HashMap) -> RecordPairVec { m.into_iter() @@ -560,7 +578,6 @@ where 
collect_store_infos_interval: Duration, load_base_split_check_interval: Duration, collect_tick_interval: Duration, - report_min_resolved_ts_interval: Duration, inspect_latency_interval: Duration, } @@ -568,12 +585,7 @@ impl StatsMonitor where T: StoreStatsReporter, { - pub fn new( - interval: Duration, - report_min_resolved_ts_interval: Duration, - inspect_latency_interval: Duration, - reporter: T, - ) -> Self { + pub fn new(interval: Duration, inspect_latency_interval: Duration, reporter: T) -> Self { StatsMonitor { reporter, handle: None, @@ -585,7 +597,6 @@ where DEFAULT_LOAD_BASE_SPLIT_CHECK_INTERVAL, interval, ), - report_min_resolved_ts_interval: config(report_min_resolved_ts_interval), // Use `inspect_latency_interval` as the minimal limitation for collecting tick. collect_tick_interval: cmp::min( inspect_latency_interval, @@ -600,9 +611,7 @@ where pub fn start( &mut self, mut auto_split_controller: AutoSplitController, - region_read_progress: RegionReadProgressRegistry, collector_reg_handle: CollectorRegHandle, - store_id: u64, ) -> Result<(), io::Error> { if self.collect_tick_interval < cmp::min( @@ -623,9 +632,6 @@ where let load_base_split_check_interval = self .load_base_split_check_interval .div_duration_f64(tick_interval) as u64; - let report_min_resolved_ts_interval = self - .report_min_resolved_ts_interval - .div_duration_f64(tick_interval) as u64; let update_latency_stats_interval = self .inspect_latency_interval .div_duration_f64(tick_interval) as u64; @@ -658,7 +664,7 @@ where // Register the region CPU records collector. 
if auto_split_controller .cfg - .region_cpu_overload_threshold_ratio + .region_cpu_overload_threshold_ratio() > 0.0 { region_cpu_records_collector = @@ -684,12 +690,6 @@ where &mut region_cpu_records_collector, ); } - if is_enable_tick(timer_cnt, report_min_resolved_ts_interval) { - reporter.report_min_resolved_ts( - store_id, - region_read_progress.get_min_resolved_ts(), - ); - } if is_enable_tick(timer_cnt, update_latency_stats_interval) { reporter.update_latency_stats(timer_cnt); } @@ -862,14 +862,14 @@ impl SlowScore { } } - fn record(&mut self, id: u64, duration: Duration) { + fn record(&mut self, id: u64, duration: Duration, not_busy: bool) { self.last_record_time = Instant::now(); if id != self.last_tick_id { return; } self.last_tick_finished = true; self.total_requests += 1; - if duration >= self.inspect_interval { + if not_busy && duration >= self.inspect_interval { self.timeout_requests += 1; } } @@ -917,6 +917,78 @@ impl SlowScore { } } +struct SlowTrendStatistics { + net_io_factor: f64, + /// Detector to detect NetIo&DiskIo jitters. + slow_cause: Trend, + /// Reactor as an assistant detector to detect the QPS jitters. 
+ slow_result: Trend, + slow_result_recorder: RequestPerSecRecorder, +} + +impl SlowTrendStatistics { + #[inline] + fn new(cfg: &Config) -> Self { + Self { + slow_cause: Trend::new( + // Disable SpikeFilter for now + Duration::from_secs(0), + STORE_SLOW_TREND_MISC_GAUGE_VEC.with_label_values(&["spike_filter_value"]), + STORE_SLOW_TREND_MISC_GAUGE_VEC.with_label_values(&["spike_filter_count"]), + Duration::from_secs(180), + Duration::from_secs(30), + Duration::from_secs(120), + Duration::from_secs(600), + 1, + tikv_util::time::duration_to_us(Duration::from_micros(500)), + STORE_SLOW_TREND_MARGIN_ERROR_WINDOW_GAP_GAUGE_VEC.with_label_values(&["L1"]), + STORE_SLOW_TREND_MARGIN_ERROR_WINDOW_GAP_GAUGE_VEC.with_label_values(&["L2"]), + cfg.slow_trend_unsensitive_cause, + ), + slow_result: Trend::new( + // Disable SpikeFilter for now + Duration::from_secs(0), + STORE_SLOW_TREND_RESULT_MISC_GAUGE_VEC.with_label_values(&["spike_filter_value"]), + STORE_SLOW_TREND_RESULT_MISC_GAUGE_VEC.with_label_values(&["spike_filter_count"]), + Duration::from_secs(120), + Duration::from_secs(15), + Duration::from_secs(60), + Duration::from_secs(300), + 1, + 2000, + STORE_SLOW_TREND_RESULT_MARGIN_ERROR_WINDOW_GAP_GAUGE_VEC + .with_label_values(&["L1"]), + STORE_SLOW_TREND_RESULT_MARGIN_ERROR_WINDOW_GAP_GAUGE_VEC + .with_label_values(&["L2"]), + cfg.slow_trend_unsensitive_result, + ), + slow_result_recorder: RequestPerSecRecorder::new(), + net_io_factor: cfg.slow_trend_network_io_factor, /* FIXME: add extra parameter in + * Config to control it. */ + } + } + + #[inline] + fn record(&mut self, duration: RaftstoreDuration) { + // TODO: It's more appropriate to divide the factor into `Disk IO factor` and + // `Net IO factor`. + // Currently, when `network ratio == 1`, it summarizes all factors by `sum` + // simply, which has proven valid for common cases where IO jitter exists on + // Network or Disk. 
+ let latency = || -> u64 { + if self.net_io_factor as u64 >= 1 { + return tikv_util::time::duration_to_us(duration.sum()); + } + let disk_io_latency = + tikv_util::time::duration_to_us(duration.delays_on_disk_io(true)) as f64; + let network_io_latency = + tikv_util::time::duration_to_us(duration.delays_on_net_io()) as f64; + (disk_io_latency + network_io_latency * self.net_io_factor) as u64 + }(); + self.slow_cause.record(latency, Instant::now()); + } +} + pub struct Runner where EK: KvEngine, @@ -947,9 +1019,7 @@ where snap_mgr: SnapManager, remote: Remote, slow_score: SlowScore, - slow_trend_cause: Trend, - slow_trend_result: Trend, - slow_trend_result_recorder: RequestPerSecRecorder, + slow_trend: SlowTrendStatistics, // The health status of the store is updated by the slow score mechanism. health_service: Option, @@ -978,26 +1048,21 @@ where snap_mgr: SnapManager, remote: Remote, collector_reg_handle: CollectorRegHandle, - region_read_progress: RegionReadProgressRegistry, health_service: Option, coprocessor_host: CoprocessorHost, causal_ts_provider: Option>, // used for rawkv apiv2 grpc_service_manager: GrpcServiceManager, ) -> Runner { + let mut store_stat = StoreStat::default(); + store_stat.set_cpu_quota(SysQuota::cpu_cores_quota(), cfg.inspect_cpu_util_thd); let store_heartbeat_interval = cfg.pd_store_heartbeat_tick_interval.0; let interval = store_heartbeat_interval / NUM_COLLECT_STORE_INFOS_PER_HEARTBEAT; let mut stats_monitor = StatsMonitor::new( interval, - cfg.report_min_resolved_ts_interval.0, cfg.inspect_interval.0, WrappedScheduler(scheduler.clone()), ); - if let Err(e) = stats_monitor.start( - auto_split_controller, - region_read_progress, - collector_reg_handle, - store_id, - ) { + if let Err(e) = stats_monitor.start(auto_split_controller, collector_reg_handle) { error!("failed to start stats collector, error = {:?}", e); } @@ -1008,7 +1073,7 @@ where is_hb_receiver_scheduled: false, region_peers: HashMap::default(), region_buckets: 
HashMap::default(), - store_stat: StoreStat::default(), + store_stat, start_ts: UnixSecs::now(), scheduler, store_heartbeat_interval, @@ -1018,39 +1083,7 @@ where snap_mgr, remote, slow_score: SlowScore::new(cfg.inspect_interval.0), - slow_trend_cause: Trend::new( - // Disable SpikeFilter for now - Duration::from_secs(0), - STORE_SLOW_TREND_MISC_GAUGE_VEC.with_label_values(&["spike_filter_value"]), - STORE_SLOW_TREND_MISC_GAUGE_VEC.with_label_values(&["spike_filter_count"]), - Duration::from_secs(180), - Duration::from_secs(30), - Duration::from_secs(120), - Duration::from_secs(600), - 1, - tikv_util::time::duration_to_us(Duration::from_micros(500)), - STORE_SLOW_TREND_MARGIN_ERROR_WINDOW_GAP_GAUGE_VEC.with_label_values(&["L1"]), - STORE_SLOW_TREND_MARGIN_ERROR_WINDOW_GAP_GAUGE_VEC.with_label_values(&["L2"]), - cfg.slow_trend_unsensitive_cause, - ), - slow_trend_result: Trend::new( - // Disable SpikeFilter for now - Duration::from_secs(0), - STORE_SLOW_TREND_RESULT_MISC_GAUGE_VEC.with_label_values(&["spike_filter_value"]), - STORE_SLOW_TREND_RESULT_MISC_GAUGE_VEC.with_label_values(&["spike_filter_count"]), - Duration::from_secs(120), - Duration::from_secs(15), - Duration::from_secs(60), - Duration::from_secs(300), - 1, - 2000, - STORE_SLOW_TREND_RESULT_MARGIN_ERROR_WINDOW_GAP_GAUGE_VEC - .with_label_values(&["L1"]), - STORE_SLOW_TREND_RESULT_MARGIN_ERROR_WINDOW_GAP_GAUGE_VEC - .with_label_values(&["L2"]), - cfg.slow_trend_unsensitive_result, - ), - slow_trend_result_recorder: RequestPerSecRecorder::new(), + slow_trend: SlowTrendStatistics::new(cfg), health_service, curr_health_status: ServingStatus::Serving, coprocessor_host, @@ -1066,6 +1099,7 @@ where split_key: Vec, peer: metapb::Peer, right_derive: bool, + share_source_region_size: bool, callback: Callback, task: String, ) { @@ -1087,6 +1121,7 @@ where resp.get_new_region_id(), resp.take_new_peer_ids(), right_derive, + share_source_region_size, ); let region_id = region.get_id(); let epoch = 
region.take_region_epoch(); @@ -1121,6 +1156,7 @@ where mut split_keys: Vec>, peer: metapb::Peer, right_derive: bool, + share_source_region_size: bool, callback: Callback, task: String, remote: Remote, @@ -1146,6 +1182,7 @@ where split_keys, resp.take_ids().into(), right_derive, + share_source_region_size, ); let region_id = region.get_id(); let epoch = region.take_region_epoch(); @@ -1174,6 +1211,7 @@ where split_key: split_keys.pop().unwrap(), peer, right_derive, + share_source_region_size, callback, }; if let Err(ScheduleError::Stopped(t)) = scheduler.schedule(task) { @@ -1323,7 +1361,8 @@ where .engine_total_query_num .sub_query_stats(&self.store_stat.engine_last_query_num); let total_query_num = self - .slow_trend_result_recorder + .slow_trend + .slow_result_recorder .record_and_get_current_rps(res.get_all_query_num(), Instant::now()); stats.set_query_stats(res.0); @@ -1352,15 +1391,9 @@ where self.store_stat.region_bytes_read.flush(); self.store_stat.region_keys_read.flush(); - STORE_SIZE_GAUGE_VEC - .with_label_values(&["capacity"]) - .set(capacity as i64); - STORE_SIZE_GAUGE_VEC - .with_label_values(&["available"]) - .set(available as i64); - STORE_SIZE_GAUGE_VEC - .with_label_values(&["used"]) - .set(used_size as i64); + STORE_SIZE_EVENT_INT_VEC.capacity.set(capacity as i64); + STORE_SIZE_EVENT_INT_VEC.available.set(available as i64); + STORE_SIZE_EVENT_INT_VEC.used.set(used_size as i64); let slow_score = self.slow_score.get(); stats.set_slow_score(slow_score as u64); @@ -1456,16 +1489,17 @@ where stats: &mut pdpb::StoreStats, total_query_num: Option, ) { - let slow_trend_cause_rate = self.slow_trend_cause.increasing_rate(); + let slow_trend_cause_rate = self.slow_trend.slow_cause.increasing_rate(); STORE_SLOW_TREND_GAUGE.set(slow_trend_cause_rate); let mut slow_trend = pdpb::SlowTrend::default(); slow_trend.set_cause_rate(slow_trend_cause_rate); - slow_trend.set_cause_value(self.slow_trend_cause.l0_avg()); + 
slow_trend.set_cause_value(self.slow_trend.slow_cause.l0_avg()); if let Some(total_query_num) = total_query_num { - self.slow_trend_result + self.slow_trend + .slow_result .record(total_query_num as u64, Instant::now()); - slow_trend.set_result_value(self.slow_trend_result.l0_avg()); - let slow_trend_result_rate = self.slow_trend_result.increasing_rate(); + slow_trend.set_result_value(self.slow_trend.slow_result.l0_avg()); + let slow_trend_result_rate = self.slow_trend.slow_result.increasing_rate(); slow_trend.set_result_rate(slow_trend_result_rate); STORE_SLOW_TREND_RESULT_GAUGE.set(slow_trend_result_rate); STORE_SLOW_TREND_RESULT_VALUE_GAUGE.set(total_query_num); @@ -1478,23 +1512,25 @@ where } fn write_slow_trend_metrics(&mut self) { - STORE_SLOW_TREND_L0_GAUGE.set(self.slow_trend_cause.l0_avg()); - STORE_SLOW_TREND_L1_GAUGE.set(self.slow_trend_cause.l1_avg()); - STORE_SLOW_TREND_L2_GAUGE.set(self.slow_trend_cause.l2_avg()); - STORE_SLOW_TREND_L0_L1_GAUGE.set(self.slow_trend_cause.l0_l1_rate()); - STORE_SLOW_TREND_L1_L2_GAUGE.set(self.slow_trend_cause.l1_l2_rate()); - STORE_SLOW_TREND_L1_MARGIN_ERROR_GAUGE.set(self.slow_trend_cause.l1_margin_error_base()); - STORE_SLOW_TREND_L2_MARGIN_ERROR_GAUGE.set(self.slow_trend_cause.l2_margin_error_base()); + STORE_SLOW_TREND_L0_GAUGE.set(self.slow_trend.slow_cause.l0_avg()); + STORE_SLOW_TREND_L1_GAUGE.set(self.slow_trend.slow_cause.l1_avg()); + STORE_SLOW_TREND_L2_GAUGE.set(self.slow_trend.slow_cause.l2_avg()); + STORE_SLOW_TREND_L0_L1_GAUGE.set(self.slow_trend.slow_cause.l0_l1_rate()); + STORE_SLOW_TREND_L1_L2_GAUGE.set(self.slow_trend.slow_cause.l1_l2_rate()); + STORE_SLOW_TREND_L1_MARGIN_ERROR_GAUGE + .set(self.slow_trend.slow_cause.l1_margin_error_base()); + STORE_SLOW_TREND_L2_MARGIN_ERROR_GAUGE + .set(self.slow_trend.slow_cause.l2_margin_error_base()); // Report results of all slow Trends. 
- STORE_SLOW_TREND_RESULT_L0_GAUGE.set(self.slow_trend_result.l0_avg()); - STORE_SLOW_TREND_RESULT_L1_GAUGE.set(self.slow_trend_result.l1_avg()); - STORE_SLOW_TREND_RESULT_L2_GAUGE.set(self.slow_trend_result.l2_avg()); - STORE_SLOW_TREND_RESULT_L0_L1_GAUGE.set(self.slow_trend_result.l0_l1_rate()); - STORE_SLOW_TREND_RESULT_L1_L2_GAUGE.set(self.slow_trend_result.l1_l2_rate()); + STORE_SLOW_TREND_RESULT_L0_GAUGE.set(self.slow_trend.slow_result.l0_avg()); + STORE_SLOW_TREND_RESULT_L1_GAUGE.set(self.slow_trend.slow_result.l1_avg()); + STORE_SLOW_TREND_RESULT_L2_GAUGE.set(self.slow_trend.slow_result.l2_avg()); + STORE_SLOW_TREND_RESULT_L0_L1_GAUGE.set(self.slow_trend.slow_result.l0_l1_rate()); + STORE_SLOW_TREND_RESULT_L1_L2_GAUGE.set(self.slow_trend.slow_result.l1_l2_rate()); STORE_SLOW_TREND_RESULT_L1_MARGIN_ERROR_GAUGE - .set(self.slow_trend_result.l1_margin_error_base()); + .set(self.slow_trend.slow_result.l1_margin_error_base()); STORE_SLOW_TREND_RESULT_L2_MARGIN_ERROR_GAUGE - .set(self.slow_trend_result.l2_margin_error_base()); + .set(self.slow_trend.slow_result.l2_margin_error_base()); } fn handle_report_batch_split(&self, regions: Vec) { @@ -1566,8 +1602,14 @@ where } } Ok(None) => { - // splitted Region has not yet reported to PD. - // TODO: handle merge + // Splitted region has not yet reported to PD. + // + // Or region has been merged. This case is handled by + // message `MsgCheckStalePeer`, stale peers will be + // removed eventually. 
+ PD_VALIDATE_PEER_COUNTER_VEC + .with_label_values(&["region not found"]) + .inc(); } Err(e) => { error!("get region failed"; "err" => ?e); @@ -1645,6 +1687,7 @@ where split_keys: split_region.take_keys().into(), callback: Callback::None, source: "pd".into(), + share_source_region_size: false, } } else { CasualMessage::HalfSplitRegion { @@ -2048,12 +2091,14 @@ where split_key, peer, right_derive, + share_source_region_size, callback, } => self.handle_ask_split( region, split_key, peer, right_derive, + share_source_region_size, callback, String::from("ask_split"), ), @@ -2062,6 +2107,7 @@ where split_keys, peer, right_derive, + share_source_region_size, callback, } => Self::handle_ask_batch_split( self.router.clone(), @@ -2071,6 +2117,7 @@ where split_keys, peer, right_derive, + share_source_region_size, callback, String::from("batch_split"), self.remote.clone(), @@ -2095,6 +2142,7 @@ where vec![split_key], split_info.peer, true, + false, Callback::None, String::from("auto_split"), remote.clone(), @@ -2248,11 +2296,13 @@ where } => self.handle_update_max_timestamp(region_id, initial_status, txn_ext), Task::QueryRegionLeader { region_id } => self.handle_query_region_leader(region_id), Task::UpdateSlowScore { id, duration } => { - self.slow_score.record(id, duration.sum()); - self.slow_trend_cause.record( - tikv_util::time::duration_to_us(duration.store_wait_duration.unwrap()), - Instant::now(), + // Fine-tuned, `SlowScore` only takes the I/O jitters on the disk into account. 
+ self.slow_score.record( + id, + duration.delays_on_disk_io(false), + !self.store_stat.maybe_busy(), ); + self.slow_trend.record(duration); } Task::RegionCpuRecords(records) => self.handle_region_cpu_records(records), Task::ReportMinResolvedTs { @@ -2281,7 +2331,7 @@ where { fn on_timeout(&mut self) { // Record a fairly great value when timeout - self.slow_trend_cause.record(500_000, Instant::now()); + self.slow_trend.slow_cause.record(500_000, Instant::now()); // The health status is recovered to serving as long as any tick // does not timeout. @@ -2291,7 +2341,12 @@ where self.update_health_status(ServingStatus::Serving); } if !self.slow_score.last_tick_finished { - self.slow_score.record_timeout(); + // If the last tick is not finished, it means that the current store might + // be busy on handling requests or delayed on I/O operations. And only when + // the current store is not busy, it should record the last_tick as a timeout. + if !self.store_stat.maybe_busy() { + self.slow_score.record_timeout(); + } // If the last slow_score already reached abnormal state and was delayed for // reporting by `store-heartbeat` to PD, we should report it here manually as // a FAKE `store-heartbeat`. 
@@ -2320,21 +2375,25 @@ where let inspector = LatencyInspector::new( id, Box::new(move |id, duration| { - let dur = duration.sum(); - STORE_INSPECT_DURATION_HISTOGRAM .with_label_values(&["store_process"]) .observe(tikv_util::time::duration_to_sec( - duration.store_process_duration.unwrap(), + duration.store_process_duration.unwrap_or_default(), )); STORE_INSPECT_DURATION_HISTOGRAM .with_label_values(&["store_wait"]) .observe(tikv_util::time::duration_to_sec( - duration.store_wait_duration.unwrap(), + duration.store_wait_duration.unwrap_or_default(), )); + STORE_INSPECT_DURATION_HISTOGRAM + .with_label_values(&["store_commit"]) + .observe(tikv_util::time::duration_to_sec( + duration.store_commit_duration.unwrap_or_default(), + )); + STORE_INSPECT_DURATION_HISTOGRAM .with_label_values(&["all"]) - .observe(tikv_util::time::duration_to_sec(dur)); + .observe(tikv_util::time::duration_to_sec(duration.sum())); if let Err(e) = scheduler.schedule(Task::UpdateSlowScore { id, duration }) { warn!("schedule pd task failed"; "err" => ?e); } @@ -2385,6 +2444,7 @@ fn new_split_region_request( new_region_id: u64, peer_ids: Vec, right_derive: bool, + share_source_region_size: bool, ) -> AdminRequest { let mut req = AdminRequest::default(); req.set_cmd_type(AdminCmdType::Split); @@ -2392,6 +2452,8 @@ fn new_split_region_request( req.mut_split().set_new_region_id(new_region_id); req.mut_split().set_new_peer_ids(peer_ids); req.mut_split().set_right_derive(right_derive); + req.mut_split() + .set_share_source_region_size(share_source_region_size); req } @@ -2399,10 +2461,13 @@ fn new_batch_split_region_request( split_keys: Vec>, ids: Vec, right_derive: bool, + share_source_region_size: bool, ) -> AdminRequest { let mut req = AdminRequest::default(); req.set_cmd_type(AdminCmdType::BatchSplit); req.mut_splits().set_right_derive(right_derive); + req.mut_splits() + .set_share_source_region_size(share_source_region_size); let mut requests = Vec::with_capacity(ids.len()); for (mut id, key) in 
ids.into_iter().zip(split_keys) { let mut split = SplitRequest::default(); @@ -2577,15 +2642,21 @@ fn collect_engine_size( } else { store_info.capacity }; - let used_size = snap_mgr_size - + store_info - .kv_engine - .get_engine_used_size() - .expect("kv engine used size") - + store_info - .raft_engine - .get_engine_size() - .expect("raft engine used size"); + let raft_size = store_info + .raft_engine + .get_engine_size() + .expect("raft engine used size"); + + let kv_size = store_info + .kv_engine + .get_engine_used_size() + .expect("kv engine used size"); + + STORE_SIZE_EVENT_INT_VEC.raft_size.set(raft_size as i64); + STORE_SIZE_EVENT_INT_VEC.snap_size.set(snap_mgr_size as i64); + STORE_SIZE_EVENT_INT_VEC.kv_size.set(kv_size as i64); + + let used_size = snap_mgr_size + kv_size + raft_size; let mut available = capacity.checked_sub(used_size).unwrap_or_default(); // We only care about rocksdb SST file size, so we should check disk available // here. @@ -2616,8 +2687,6 @@ mod tests { use engine_test::{kv::KvTestEngine, raft::RaftTestEngine}; use tikv_util::worker::LazyWorker; - use crate::store::fsm::StoreMeta; - struct RunnerTest { store_stat: Arc>, stats_monitor: StatsMonitor>, @@ -2631,17 +2700,12 @@ mod tests { ) -> RunnerTest { let mut stats_monitor = StatsMonitor::new( Duration::from_secs(interval), - Duration::from_secs(0), Duration::from_secs(interval), WrappedScheduler(scheduler), ); - let store_meta = Arc::new(Mutex::new(StoreMeta::new(0))); - let region_read_progress = store_meta.lock().unwrap().region_read_progress.clone(); if let Err(e) = stats_monitor.start( AutoSplitController::default(), - region_read_progress, CollectorRegHandle::new_for_test(), - 1, ) { error!("failed to start stats collector, error = {:?}", e); } diff --git a/components/raftstore/src/store/worker/read.rs b/components/raftstore/src/store/worker/read.rs index 0c4641770be..778f4ce45f0 100644 --- a/components/raftstore/src/store/worker/read.rs +++ 
b/components/raftstore/src/store/worker/read.rs @@ -12,7 +12,7 @@ use std::{ }; use crossbeam::{atomic::AtomicCell, channel::TrySendError}; -use engine_traits::{KvEngine, Peekable, RaftEngine}; +use engine_traits::{KvEngine, Peekable, RaftEngine, SnapshotContext}; use fail::fail_point; use kvproto::{ errorpb, @@ -30,7 +30,7 @@ use tikv_util::{ }; use time::Timespec; use tracker::GLOBAL_TRACKERS; -use txn_types::TimeStamp; +use txn_types::{TimeStamp, WriteBatchFlags}; use super::metrics::*; use crate::{ @@ -57,6 +57,7 @@ pub trait ReadExecutor { /// Currently, only multi-rocksdb version may return `None`. fn get_snapshot( &mut self, + snap_ctx: Option, read_context: &Option>, ) -> Arc<::Snapshot>; @@ -64,6 +65,7 @@ pub trait ReadExecutor { &mut self, req: &Request, region: &metapb::Region, + snap_ctx: Option, read_context: &Option>, ) -> Result { let key = req.get_get().get_key(); @@ -71,7 +73,7 @@ pub trait ReadExecutor { util::check_key_in_region(key, region)?; let mut resp = Response::default(); - let snapshot = self.get_snapshot(read_context); + let snapshot = self.get_snapshot(snap_ctx, read_context); let res = if !req.get_get().get_cf().is_empty() { let cf = req.get_get().get_cf(); snapshot @@ -109,6 +111,7 @@ pub trait ReadExecutor { msg: &RaftCmdRequest, region: &Arc, read_index: Option, + snap_ctx: Option, local_read_ctx: Option>, ) -> ReadResponse<::Snapshot> { let requests = msg.get_requests(); @@ -121,20 +124,22 @@ pub trait ReadExecutor { for req in requests { let cmd_type = req.get_cmd_type(); let mut resp = match cmd_type { - CmdType::Get => match self.get_value(req, region.as_ref(), &local_read_ctx) { - Ok(resp) => resp, - Err(e) => { - error!(?e; - "failed to execute get command"; - "region_id" => region.get_id(), - ); - response.response = cmd_resp::new_error(e); - return response; + CmdType::Get => { + match self.get_value(req, region.as_ref(), snap_ctx.clone(), &local_read_ctx) { + Ok(resp) => resp, + Err(e) => { + error!(?e; + "failed to execute 
get command"; + "region_id" => region.get_id(), + ); + response.response = cmd_resp::new_error(e); + return response; + } } - }, + } CmdType::Snap => { let snapshot = RegionSnapshot::from_snapshot( - self.get_snapshot(&local_read_ctx), + self.get_snapshot(snap_ctx.clone(), &local_read_ctx), region.clone(), ); response.snapshot = Some(snapshot); @@ -226,9 +231,16 @@ where } } - /// Update the snapshot in the `snap_cache` if the read_id is None or does - /// not match. - fn maybe_update_snapshot(&mut self, engine: &E, delegate_last_valid_ts: Timespec) -> bool { + // Update the snapshot in the `snap_cache` if the read_id is None or does + // not match. + // snap_ctx is used (if not None) to acquire the snapshot of the relevant region + // from region cache engine + fn maybe_update_snapshot( + &mut self, + engine: &E, + snap_ctx: Option, + delegate_last_valid_ts: Timespec, + ) -> bool { // When the read_id is None, it means the `snap_cache` has been cleared // before and the `cached_read_id` of it is None because only a consecutive // requests will have the same cache and the cache will be cleared after the @@ -242,7 +254,7 @@ where } self.snap_cache.cached_read_id = self.read_id.clone(); - self.snap_cache.snapshot = Some(Arc::new(engine.snapshot())); + self.snap_cache.snapshot = Some(Arc::new(engine.snapshot(snap_ctx))); // Ensures the snapshot is acquired before getting the time atomic::fence(atomic::Ordering::Release); @@ -250,7 +262,7 @@ where } else { // read_id being None means the snapshot acquired will only be used in this // request - self.snapshot = Some(Arc::new(engine.snapshot())); + self.snapshot = Some(Arc::new(engine.snapshot(snap_ctx))); // Ensures the snapshot is acquired before getting the time atomic::fence(atomic::Ordering::Release); @@ -440,7 +452,11 @@ impl ReadDelegate { read_progress: peer.read_progress.clone(), pending_remove: false, wait_data: false, - bucket_meta: peer.region_buckets.as_ref().map(|b| b.meta.clone()), + bucket_meta: peer + 
.region_buckets_info() + .bucket_stat() + .as_ref() + .map(|b| b.meta.clone()), track_ver: TrackVer::new(), } } @@ -824,10 +840,21 @@ where return Ok(None); } - // Check witness - if find_peer_by_id(&delegate.region, delegate.peer_id).map_or(true, |p| p.is_witness) { - TLS_LOCAL_READ_METRICS.with(|m| m.borrow_mut().reject_reason.witness.inc()); - return Err(Error::IsWitness(region_id)); + match find_peer_by_id(&delegate.region, delegate.peer_id) { + // Check witness + Some(peer) => { + if peer.is_witness { + TLS_LOCAL_READ_METRICS.with(|m| m.borrow_mut().reject_reason.witness.inc()); + return Err(Error::IsWitness(region_id)); + } + } + // This (rarely) happen in witness disabled clusters while the conf change applied but + // region not removed. We shouldn't return `IsWitness` here because our client back off + // for a long time while encountering that. + None => { + TLS_LOCAL_READ_METRICS.with(|m| m.borrow_mut().reject_reason.no_region.inc()); + return Err(Error::RegionNotFound(region_id)); + } } // Check non-witness hasn't finish applying snapshot yet. @@ -963,80 +990,162 @@ where cmd.callback.set_result(read_resp); } + /// Try to handle the read request using local read, if the leader is valid + /// the read response is returned, otherwise None is returned. 
+ fn try_local_leader_read( + &mut self, + req: &RaftCmdRequest, + delegate: &mut CachedReadDelegate, + snap_ctx: Option, + read_id: Option, + snap_updated: &mut bool, + last_valid_ts: Timespec, + ) -> Option> { + let mut local_read_ctx = LocalReadContext::new(&mut self.snap_cache, read_id); + + (*snap_updated) = local_read_ctx.maybe_update_snapshot( + delegate.get_tablet(), + snap_ctx.clone(), + last_valid_ts, + ); + + let snapshot_ts = local_read_ctx.snapshot_ts().unwrap(); + if !delegate.is_in_leader_lease(snapshot_ts) { + return None; + } + + let region = Arc::clone(&delegate.region); + let mut response = delegate.execute(req, ®ion, None, snap_ctx, Some(local_read_ctx)); + if let Some(snap) = response.snapshot.as_mut() { + snap.bucket_meta = delegate.bucket_meta.clone(); + } + // Try renew lease in advance + delegate.maybe_renew_lease_advance(&self.router, snapshot_ts); + Some(response) + } + + /// Try to handle the stale read request, if the read_ts < safe_ts the read + /// response is returned, otherwise the raft command response with + /// `DataIsNotReady` error is returned. 
+ fn try_local_stale_read( + &mut self, + req: &RaftCmdRequest, + delegate: &mut CachedReadDelegate, + snap_updated: &mut bool, + last_valid_ts: Timespec, + ) -> std::result::Result, RaftCmdResponse> { + let read_ts = decode_u64(&mut req.get_header().get_flag_data()).unwrap(); + delegate.check_stale_read_safe(read_ts)?; + + // Stale read does not use cache, so we pass None for read_id + let mut local_read_ctx = LocalReadContext::new(&mut self.snap_cache, None); + (*snap_updated) = + local_read_ctx.maybe_update_snapshot(delegate.get_tablet(), None, last_valid_ts); + + let region = Arc::clone(&delegate.region); + // Getting the snapshot + let mut response = delegate.execute(req, ®ion, None, None, Some(local_read_ctx)); + if let Some(snap) = response.snapshot.as_mut() { + snap.bucket_meta = delegate.bucket_meta.clone(); + } + // Double check in case `safe_ts` change after the first check and before + // getting snapshot + delegate.check_stale_read_safe(read_ts)?; + + TLS_LOCAL_READ_METRICS.with(|m| m.borrow_mut().local_executed_stale_read_requests.inc()); + Ok(response) + } + pub fn propose_raft_command( &mut self, + snap_ctx: Option, read_id: Option, - req: RaftCmdRequest, + mut req: RaftCmdRequest, cb: Callback, ) { match self.pre_propose_raft_command(&req) { Ok(Some((mut delegate, policy))) => { - let snap_updated; + let mut snap_updated = false; let last_valid_ts = delegate.last_valid_ts; let mut response = match policy { // Leader can read local if and only if it is in lease. 
RequestPolicy::ReadLocal => { - let mut local_read_ctx = - LocalReadContext::new(&mut self.snap_cache, read_id); - - snap_updated = local_read_ctx - .maybe_update_snapshot(delegate.get_tablet(), last_valid_ts); - - let snapshot_ts = local_read_ctx.snapshot_ts().unwrap(); - if !delegate.is_in_leader_lease(snapshot_ts) { + if let Some(read_resp) = self.try_local_leader_read( + &req, + &mut delegate, + snap_ctx, + read_id, + &mut snap_updated, + last_valid_ts, + ) { + read_resp + } else { fail_point!("localreader_before_redirect", |_| {}); // Forward to raftstore. self.redirect(RaftCommand::new(req, cb)); return; } - - let region = Arc::clone(&delegate.region); - let mut response = - delegate.execute(&req, ®ion, None, Some(local_read_ctx)); - if let Some(snap) = response.snapshot.as_mut() { - snap.bucket_meta = delegate.bucket_meta.clone(); - } - // Try renew lease in advance - delegate.maybe_renew_lease_advance(&self.router, snapshot_ts); - response } // Replica can serve stale read if and only if its `safe_ts` >= `read_ts` RequestPolicy::StaleRead => { - let read_ts = decode_u64(&mut req.get_header().get_flag_data()).unwrap(); - if let Err(resp) = delegate.check_stale_read_safe(read_ts) { - cb.set_result(ReadResponse { - response: resp, - snapshot: None, - txn_extra_op: TxnExtraOp::Noop, - }); - return; - } - - // Stale read does not use cache, so we pass None for read_id - let mut local_read_ctx = LocalReadContext::new(&mut self.snap_cache, None); - snap_updated = local_read_ctx - .maybe_update_snapshot(delegate.get_tablet(), last_valid_ts); - - let region = Arc::clone(&delegate.region); - // Getting the snapshot - let mut response = - delegate.execute(&req, ®ion, None, Some(local_read_ctx)); - if let Some(snap) = response.snapshot.as_mut() { - snap.bucket_meta = delegate.bucket_meta.clone(); + match self.try_local_stale_read( + &req, + &mut delegate, + &mut snap_updated, + last_valid_ts, + ) { + Ok(read_resp) => read_resp, + Err(err_resp) => { + // It's safe to 
change the header of the `RaftCmdRequest`, as it + // would not affect the `SnapCtx` used in upper layer like. + let unset_stale_flag = req.get_header().get_flags() + & (!WriteBatchFlags::STALE_READ.bits()); + req.mut_header().set_flags(unset_stale_flag); + let mut inspector = Inspector { + delegate: &delegate, + }; + // The read request could be handled using snapshot read if the + // local peer is a valid leader. + let allow_fallback_leader_read = inspector + .inspect(&req) + .map_or(false, |r| r == RequestPolicy::ReadLocal); + if !allow_fallback_leader_read { + cb.set_result(ReadResponse { + response: err_resp, + snapshot: None, + txn_extra_op: TxnExtraOp::Noop, + }); + return; + } + if let Some(read_resp) = self.try_local_leader_read( + &req, + &mut delegate, + None, + None, + &mut snap_updated, + last_valid_ts, + ) { + TLS_LOCAL_READ_METRICS.with(|m| { + m.borrow_mut() + .local_executed_stale_read_fallback_success_requests + .inc() + }); + read_resp + } else { + TLS_LOCAL_READ_METRICS.with(|m| { + m.borrow_mut() + .local_executed_stale_read_fallback_failure_requests + .inc() + }); + cb.set_result(ReadResponse { + response: err_resp, + snapshot: None, + txn_extra_op: TxnExtraOp::Noop, + }); + return; + } + } } - // Double check in case `safe_ts` change after the first check and before - // getting snapshot - if let Err(resp) = delegate.check_stale_read_safe(read_ts) { - cb.set_result(ReadResponse { - response: resp, - snapshot: None, - txn_extra_op: TxnExtraOp::Noop, - }); - return; - } - TLS_LOCAL_READ_METRICS - .with(|m| m.borrow_mut().local_executed_stale_read_requests.inc()); - response } _ => unreachable!(), }; @@ -1090,11 +1199,12 @@ where #[inline] pub fn read( &mut self, + snap_ctx: Option, read_id: Option, req: RaftCmdRequest, cb: Callback, ) { - self.propose_raft_command(read_id, req, cb); + self.propose_raft_command(snap_ctx, read_id, req, cb); maybe_tls_local_read_metrics_flush(); } @@ -1128,7 +1238,11 @@ where &self.kv_engine } - fn 
get_snapshot(&mut self, read_context: &Option>) -> Arc { + fn get_snapshot( + &mut self, + _: Option, + read_context: &Option>, + ) -> Arc { read_context.as_ref().unwrap().snapshot().unwrap() } } @@ -1176,7 +1290,9 @@ mod tests { use crossbeam::channel::TrySendError; use engine_test::kv::{KvTestEngine, KvTestSnapshot}; use engine_traits::{MiscExt, Peekable, SyncMutable, ALL_CFS}; + use hybrid_engine::{HybridEngine, HybridEngineSnapshot}; use kvproto::{metapb::RegionEpoch, raft_cmdpb::*}; + use region_cache_memory_engine::RegionCacheMemoryEngine; use tempfile::{Builder, TempDir}; use tikv_util::{codec::number::NumberEncoder, time::monotonic_raw_now}; use time::Duration; @@ -1261,6 +1377,7 @@ mod tests { cmd: RaftCmdRequest, ) { reader.propose_raft_command( + None, None, cmd.clone(), Callback::read(Box::new(|resp| { @@ -1289,7 +1406,7 @@ mod tests { task: RaftCommand, read_id: Option, ) { - reader.propose_raft_command(read_id, task.request, task.callback); + reader.propose_raft_command(None, read_id, task.request, task.callback); assert_eq!(rx.try_recv().unwrap_err(), TryRecvError::Empty); } @@ -1422,6 +1539,7 @@ mod tests { .mut_peer() .set_store_id(store_id + 1); reader.propose_raft_command( + None, None, cmd_store_id, Callback::read(Box::new(move |resp: ReadResponse| { @@ -1446,6 +1564,7 @@ mod tests { .mut_peer() .set_id(leader2.get_id() + 1); reader.propose_raft_command( + None, None, cmd_peer_id, Callback::read(Box::new(move |resp: ReadResponse| { @@ -1471,6 +1590,7 @@ mod tests { let mut cmd_term = cmd.clone(); cmd_term.mut_header().set_term(term6 - 2); reader.propose_raft_command( + None, None, cmd_term, Callback::read(Box::new(move |resp: ReadResponse| { @@ -1507,8 +1627,9 @@ mod tests { ); // Channel full. 
- reader.propose_raft_command(None, cmd.clone(), Callback::None); + reader.propose_raft_command(None, None, cmd.clone(), Callback::None); reader.propose_raft_command( + None, None, cmd.clone(), Callback::read(Box::new(move |resp: ReadResponse| { @@ -1541,6 +1662,7 @@ mod tests { .update(Progress::applied_term(term6 + 3)); } reader.propose_raft_command( + None, None, cmd9.clone(), Callback::read(Box::new(|resp| { @@ -1587,6 +1709,8 @@ mod tests { read_progress.update_safe_ts(1, 1); assert_eq!(read_progress.safe_ts(), 1); + // Expire lease manually to avoid local retry on leader peer. + lease.expire(); let data = { let mut d = [0u8; 8]; (&mut d[..]).encode_u64(2).unwrap(); @@ -1744,13 +1868,14 @@ mod tests { assert_eq!(kv_engine.path(), tablet.path()); } - fn prepare_read_delegate( + fn prepare_read_delegate_with_lease( store_id: u64, region_id: u64, term: u64, pr_ids: Vec, region_epoch: RegionEpoch, store_meta: Arc>, + max_lease: Duration, ) { let mut region = metapb::Region::default(); region.set_id(region_id); @@ -1759,7 +1884,7 @@ mod tests { let leader = prs[0].clone(); region.set_region_epoch(region_epoch); - let mut lease = Lease::new(Duration::seconds(1), Duration::milliseconds(250)); // 1s is long enough. + let mut lease = Lease::new(max_lease, Duration::milliseconds(250)); // 1s is long enough. 
let read_progress = Arc::new(RegionReadProgress::new(®ion, 1, 1, 1)); // Register region @@ -1788,6 +1913,25 @@ mod tests { } } + fn prepare_read_delegate( + store_id: u64, + region_id: u64, + term: u64, + pr_ids: Vec, + region_epoch: RegionEpoch, + store_meta: Arc>, + ) { + prepare_read_delegate_with_lease( + store_id, + region_id, + term, + pr_ids, + region_epoch, + store_meta, + Duration::seconds(1), + ) + } + #[test] fn test_snap_across_regions() { let store_id = 2; @@ -1895,7 +2039,7 @@ mod tests { let compare_ts = monotonic_raw_now(); // Case 1: snap_cache_context.read_id is None - assert!(read_context.maybe_update_snapshot(&db, Timespec::new(0, 0))); + assert!(read_context.maybe_update_snapshot(&db, None, Timespec::new(0, 0))); assert!(read_context.snapshot_ts().unwrap() > compare_ts); assert_eq!( read_context @@ -1910,7 +2054,7 @@ mod tests { // snap_cache_context is *not* created with read_id, so calling // `maybe_update_snapshot` again will update the snapshot let compare_ts = monotonic_raw_now(); - assert!(read_context.maybe_update_snapshot(&db, Timespec::new(0, 0))); + assert!(read_context.maybe_update_snapshot(&db, None, Timespec::new(0, 0))); assert!(read_context.snapshot_ts().unwrap() > compare_ts); let read_id = ThreadReadId::new(); @@ -1920,7 +2064,7 @@ mod tests { let compare_ts = monotonic_raw_now(); // Case 2: snap_cache_context.read_id is not None but not equals to the // snap_cache.cached_read_id - assert!(read_context.maybe_update_snapshot(&db, Timespec::new(0, 0))); + assert!(read_context.maybe_update_snapshot(&db, None, Timespec::new(0, 0))); assert!(read_context.snapshot_ts().unwrap() > compare_ts); let snap_ts = read_context.snapshot_ts().unwrap(); assert_eq!( @@ -1938,7 +2082,7 @@ mod tests { // `maybe_update_snapshot` again will *not* update the snapshot // Case 3: snap_cache_context.read_id is not None and equals to the // snap_cache.cached_read_id - assert!(!read_context.maybe_update_snapshot(&db2, Timespec::new(0, 0))); + 
assert!(!read_context.maybe_update_snapshot(&db2, None, Timespec::new(0, 0))); assert_eq!(read_context.snapshot_ts().unwrap(), snap_ts); assert_eq!( read_context @@ -1953,7 +2097,7 @@ mod tests { // Case 4: delegate.last_valid_ts is larger than create_time of read_id let mut last_valid_ts = read_id_clone.create_time; last_valid_ts = last_valid_ts.add(Duration::nanoseconds(1)); - assert!(read_context.maybe_update_snapshot(&db2, last_valid_ts)); + assert!(read_context.maybe_update_snapshot(&db2, None, last_valid_ts)); assert!(read_context.snapshot_ts().unwrap() > snap_ts); assert!( read_context @@ -2154,4 +2298,306 @@ mod tests { must_not_redirect(&mut reader, &rx, task); notify_rx.recv().unwrap(); } + + #[test] + fn test_stale_read_local_leader_fallback() { + let store_id = 2; + let store_meta = Arc::new(Mutex::new(StoreMeta::new(0))); + let (_tmp, mut reader, rx) = new_reader( + "test-stale-local-leader-fallback", + store_id, + store_meta.clone(), + ); + reader.kv_engine.put(b"key", b"value").unwrap(); + + let epoch13 = { + let mut ep = metapb::RegionEpoch::default(); + ep.set_conf_ver(1); + ep.set_version(3); + ep + }; + let term6 = 6; + + // Register region1. + let pr_ids1 = vec![2, 3, 4]; + let prs1 = new_peers(store_id, pr_ids1.clone()); + // Ensure the leader lease is long enough so the fallback would work. + prepare_read_delegate_with_lease( + store_id, + 1, + term6, + pr_ids1.clone(), + epoch13.clone(), + store_meta.clone(), + Duration::seconds(10), + ); + let leader1 = prs1[0].clone(); + + // Local read. 
+ let mut cmd = RaftCmdRequest::default(); + let mut header = RaftRequestHeader::default(); + header.set_region_id(1); + header.set_peer(leader1); + header.set_region_epoch(epoch13.clone()); + header.set_term(term6); + header.set_flags(header.get_flags() | WriteBatchFlags::STALE_READ.bits()); + cmd.set_header(header.clone()); + let mut req = Request::default(); + req.set_cmd_type(CmdType::Snap); + cmd.set_requests(vec![req].into()); + + // A peer can serve read_ts < safe_ts. + let safe_ts = TimeStamp::compose(2, 0); + { + let mut meta = store_meta.lock().unwrap(); + let delegate = meta.readers.get_mut(&1).unwrap(); + delegate + .read_progress + .update_safe_ts(1, safe_ts.into_inner()); + assert_eq!(delegate.read_progress.safe_ts(), safe_ts.into_inner()); + } + let read_ts_1 = TimeStamp::compose(1, 0); + let mut data = [0u8; 8]; + (&mut data[..]).encode_u64(read_ts_1.into_inner()).unwrap(); + header.set_flag_data(data.into()); + cmd.set_header(header.clone()); + let (snap_tx, snap_rx) = channel(); + let task = RaftCommand::::new( + cmd.clone(), + Callback::read(Box::new(move |resp: ReadResponse| { + snap_tx.send(resp).unwrap(); + })), + ); + must_not_redirect(&mut reader, &rx, task); + snap_rx.recv().unwrap().snapshot.unwrap(); + + // When read_ts > safe_ts, the leader peer could still serve if its lease is + // valid. + let read_ts_2 = TimeStamp::compose(safe_ts.physical() + 201, 0); + let mut data = [0u8; 8]; + (&mut data[..]).encode_u64(read_ts_2.into_inner()).unwrap(); + header.set_flag_data(data.into()); + cmd.set_header(header.clone()); + let (snap_tx, snap_rx) = channel(); + let task = RaftCommand::::new( + cmd.clone(), + Callback::read(Box::new(move |resp: ReadResponse| { + snap_tx.send(resp).unwrap(); + })), + ); + must_not_redirect(&mut reader, &rx, task); + snap_rx.recv().unwrap().snapshot.unwrap(); + + // The fallback would not happen if the lease is not valid. 
+ prepare_read_delegate_with_lease( + store_id, + 1, + term6, + pr_ids1, + epoch13, + store_meta, + Duration::milliseconds(1), + ); + thread::sleep(std::time::Duration::from_millis(50)); + let (snap_tx, snap_rx) = channel(); + let task2 = RaftCommand::::new( + cmd.clone(), + Callback::read(Box::new(move |resp: ReadResponse| { + snap_tx.send(resp).unwrap(); + })), + ); + must_not_redirect(&mut reader, &rx, task2); + assert!( + snap_rx + .recv() + .unwrap() + .response + .get_header() + .get_error() + .has_data_is_not_ready() + ); + } + + type HybridTestEnigne = HybridEngine; + type HybridEngineTestSnapshot = HybridEngineSnapshot; + + struct HybridEngineMockRouter { + p_router: SyncSender>, + c_router: SyncSender<(u64, CasualMessage)>, + } + + impl HybridEngineMockRouter { + #[allow(clippy::type_complexity)] + fn new() -> ( + HybridEngineMockRouter, + Receiver>, + Receiver<(u64, CasualMessage)>, + ) { + let (p_ch, p_rx) = sync_channel(1); + let (c_ch, c_rx) = sync_channel(1); + ( + HybridEngineMockRouter { + p_router: p_ch, + c_router: c_ch, + }, + p_rx, + c_rx, + ) + } + } + + impl ProposalRouter for HybridEngineMockRouter { + fn send( + &self, + cmd: RaftCommand, + ) -> std::result::Result<(), TrySendError>> { + ProposalRouter::send(&self.p_router, cmd) + } + } + + impl CasualRouter for HybridEngineMockRouter { + fn send(&self, region_id: u64, msg: CasualMessage) -> Result<()> { + CasualRouter::send(&self.c_router, region_id, msg) + } + } + + #[allow(clippy::type_complexity)] + fn new_hybrid_engine_reader( + path: &str, + store_id: u64, + store_meta: Arc>, + ) -> ( + TempDir, + LocalReader, + Receiver>, + RegionCacheMemoryEngine, + ) { + let path = Builder::new().prefix(path).tempdir().unwrap(); + let disk_engine = + engine_test::kv::new_engine(path.path().to_str().unwrap(), ALL_CFS).unwrap(); + let (ch, rx, _) = HybridEngineMockRouter::new(); + let memory_engine = RegionCacheMemoryEngine::default(); + let engine = HybridEngine::new(disk_engine, 
memory_engine.clone()); + let mut reader = LocalReader::new( + engine.clone(), + StoreMetaDelegate::new(store_meta, engine), + ch, + ); + reader.local_reader.store_id = Cell::new(Some(store_id)); + (path, reader, rx, memory_engine) + } + + fn get_snapshot( + snap_ctx: Option, + reader: &mut LocalReader, + request: RaftCmdRequest, + rx: &Receiver>, + ) -> Arc { + let (sender, receiver) = channel(); + reader.propose_raft_command( + snap_ctx, + None, + request, + Callback::read(Box::new(move |snap| { + sender.send(snap).unwrap(); + })), + ); + // no direct is expected + assert_eq!(rx.try_recv().unwrap_err(), TryRecvError::Empty); + receiver.recv().unwrap().snapshot.unwrap().snap() + } + + #[test] + fn test_hybrid_engine_read() { + let store_id = 2; + let store_meta = Arc::new(Mutex::new(StoreMeta::new(0))); + let (_tmp, mut reader, rx, memory_engine) = new_hybrid_engine_reader( + "test-local-hybrid-engine-reader", + store_id, + store_meta.clone(), + ); + + // set up region so we can acquire snapshot from local reader + let mut region1 = metapb::Region::default(); + region1.set_id(1); + let prs = new_peers(store_id, vec![2, 3, 4]); + region1.set_peers(prs.clone().into()); + let epoch13 = { + let mut ep = metapb::RegionEpoch::default(); + ep.set_conf_ver(1); + ep.set_version(3); + ep + }; + let leader2 = prs[0].clone(); + region1.set_region_epoch(epoch13.clone()); + let term6 = 6; + let mut lease = Lease::new(Duration::seconds(1), Duration::milliseconds(250)); // 1s is long enough. 
+ let read_progress = Arc::new(RegionReadProgress::new(®ion1, 1, 1, 1)); + + lease.renew(monotonic_raw_now()); + let remote = lease.maybe_new_remote_lease(term6).unwrap(); + { + let mut meta = store_meta.lock().unwrap(); + let read_delegate = ReadDelegate { + tag: String::new(), + region: Arc::new(region1.clone()), + peer_id: leader2.get_id(), + term: term6, + applied_term: term6, + leader_lease: Some(remote), + last_valid_ts: Timespec::new(0, 0), + txn_extra_op: Arc::new(AtomicCell::new(TxnExtraOp::default())), + txn_ext: Arc::new(TxnExt::default()), + read_progress, + pending_remove: false, + wait_data: false, + track_ver: TrackVer::new(), + bucket_meta: None, + }; + meta.readers.insert(1, read_delegate); + } + + let mut cmd = RaftCmdRequest::default(); + let mut header = RaftRequestHeader::default(); + header.set_region_id(1); + header.set_peer(leader2); + header.set_region_epoch(epoch13); + header.set_term(term6); + cmd.set_header(header); + let mut req = Request::default(); + req.set_cmd_type(CmdType::Snap); + cmd.set_requests(vec![req].into()); + + let s = get_snapshot(None, &mut reader, cmd.clone(), &rx); + assert!(!s.region_cache_snapshot_available()); + + memory_engine.new_region(1); + { + let mut core = memory_engine.core().lock().unwrap(); + core.mut_region_meta(1).unwrap().set_can_read(true); + core.mut_region_meta(1).unwrap().set_safe_ts(10); + } + + let mut snap_ctx = SnapshotContext { + read_ts: 15, + region_id: 1, + }; + + let s = get_snapshot(Some(snap_ctx.clone()), &mut reader, cmd.clone(), &rx); + assert!(s.region_cache_snapshot_available()); + + { + let mut core = memory_engine.core().lock().unwrap(); + core.mut_region_meta(1).unwrap().set_can_read(false); + } + let s = get_snapshot(Some(snap_ctx.clone()), &mut reader, cmd.clone(), &rx); + assert!(!s.region_cache_snapshot_available()); + + { + let mut core = memory_engine.core().lock().unwrap(); + core.mut_region_meta(1).unwrap().set_can_read(true); + } + snap_ctx.read_ts = 5; + 
assert!(!s.region_cache_snapshot_available()); + } } diff --git a/components/raftstore/src/store/worker/region.rs b/components/raftstore/src/store/worker/region.rs index 068904b2a67..dd2c8f90de1 100644 --- a/components/raftstore/src/store/worker/region.rs +++ b/components/raftstore/src/store/worker/region.rs @@ -1091,7 +1091,7 @@ pub(crate) mod tests { ranges.push(key); } engine.kv.put(b"k1", b"v1").unwrap(); - let snap = engine.kv.snapshot(); + let snap = engine.kv.snapshot(None); engine.kv.put(b"k2", b"v2").unwrap(); sched @@ -1204,7 +1204,7 @@ pub(crate) mod tests { sched .schedule(Task::Gen { region_id: id, - kv_snap: engine.kv.snapshot(), + kv_snap: engine.kv.snapshot(None), last_applied_term: entry.get_term(), last_applied_state: apply_state, canceled: Arc::new(AtomicBool::new(false)), diff --git a/components/raftstore/src/store/worker/split_check.rs b/components/raftstore/src/store/worker/split_check.rs index 4ff853f70a0..e3c0042acf0 100644 --- a/components/raftstore/src/store/worker/split_check.rs +++ b/components/raftstore/src/store/worker/split_check.rs @@ -5,6 +5,7 @@ use std::{ collections::BinaryHeap, fmt::{self, Display, Formatter}, mem, + sync::Arc, }; use engine_traits::{ @@ -12,21 +13,23 @@ use engine_traits::{ }; use file_system::{IoType, WithIoType}; use itertools::Itertools; -use kvproto::{metapb::Region, pdpb::CheckPolicy}; +use kvproto::{ + metapb::{Region, RegionEpoch}, + pdpb::CheckPolicy, +}; use online_config::{ConfigChange, OnlineConfig}; +use pd_client::{BucketMeta, BucketStat}; use tikv_util::{ box_err, debug, error, info, keybuilder::KeyBuilder, warn, worker::Runnable, Either, }; use txn_types::Key; use super::metrics::*; -#[cfg(any(test, feature = "testexport"))] -use crate::coprocessor::Config; use crate::{ coprocessor::{ dispatcher::StoreHandle, split_observer::{is_valid_split_key, strip_timestamp_if_exists}, - CoprocessorHost, SplitCheckerHost, + Config, CoprocessorHost, SplitCheckerHost, }, Result, }; @@ -144,6 +147,216 @@ pub 
struct Bucket { pub size: u64, } +#[derive(Debug, Clone, Default)] +pub struct BucketStatsInfo { + // the stats is increment flow. + bucket_stat: Option, + // the report bucket stat records the increment stats after last report pd. + // it will be reset after report pd. + report_bucket_stat: Option, + // avoid the version roll back, it record the last bucket version if bucket stat isn't none. + last_bucket_version: u64, +} + +impl BucketStatsInfo { + /// returns all bucket ranges those's write_bytes exceed the given + /// diff_size_threshold. + pub fn gen_bucket_range_for_update( + &self, + region_bucket_max_size: u64, + ) -> Option> { + let region_buckets = self.bucket_stat.as_ref()?; + let stats = ®ion_buckets.stats; + let keys = ®ion_buckets.meta.keys; + let sizes = ®ion_buckets.meta.sizes; + + let mut suspect_bucket_ranges = vec![]; + assert_eq!(keys.len(), stats.write_bytes.len() + 1); + for i in 0..stats.write_bytes.len() { + let estimated_bucket_size = stats.write_bytes[i] + sizes[i]; + if estimated_bucket_size >= region_bucket_max_size { + suspect_bucket_ranges.push(BucketRange(keys[i].clone(), keys[i + 1].clone())); + } + } + Some(suspect_bucket_ranges) + } + + #[inline] + pub fn version(&self) -> u64 { + self.bucket_stat + .as_ref() + .map_or(self.last_bucket_version, |b| b.meta.version) + } + + #[inline] + pub fn add_bucket_flow(&mut self, delta: &Option) { + if let (Some(buckets), Some(report_buckets), Some(delta)) = ( + self.bucket_stat.as_mut(), + self.report_bucket_stat.as_mut(), + delta, + ) { + buckets.merge(delta); + report_buckets.merge(delta); + } + } + + #[inline] + pub fn set_bucket_stat(&mut self, buckets: Option) { + self.bucket_stat = buckets.clone(); + if let Some(new_buckets) = buckets { + self.last_bucket_version = new_buckets.meta.version; + let mut new_report_buckets = BucketStat::from_meta(new_buckets.meta); + if let Some(old) = &mut self.report_bucket_stat { + new_report_buckets.merge(old); + *old = new_report_buckets; + } else { + 
self.report_bucket_stat = Some(new_report_buckets); + } + } else { + self.report_bucket_stat = None; + } + } + + #[inline] + pub fn report_bucket_stat(&mut self) -> BucketStat { + let current = self.report_bucket_stat.as_mut().unwrap(); + let delta = current.clone(); + current.clear_stats(); + delta + } + + #[inline] + pub fn bucket_stat(&self) -> &Option { + &self.bucket_stat + } + + #[inline] + pub fn bucket_stat_mut(&mut self) -> Option<&mut BucketStat> { + self.bucket_stat.as_mut() + } + + pub fn on_refresh_region_buckets( + &mut self, + cfg: &Config, + next_bucket_version: u64, + buckets: Vec, + region_epoch: RegionEpoch, + region: &Region, + bucket_ranges: Option>, + ) -> bool { + let change_bucket_version: bool; + // The region buckets reset after this region happened split or merge. + // The message should be dropped if it's epoch is lower than the regions. + // The bucket ranges is none when the region buckets is also none. + // So this condition indicates that the region buckets needs to refresh not + // renew. + if let Some(bucket_ranges) = bucket_ranges&&self.bucket_stat.is_some(){ + assert_eq!(buckets.len(), bucket_ranges.len()); + change_bucket_version=self.update_buckets(cfg, next_bucket_version, buckets, region_epoch, &bucket_ranges); + }else{ + change_bucket_version = true; + // when the region buckets is none, the exclusive buckets includes all the + // bucket keys. 
+ self.init_buckets(cfg, next_bucket_version, buckets, region_epoch, region); + } + change_bucket_version + } + + fn update_buckets( + &mut self, + cfg: &Config, + next_bucket_version: u64, + buckets: Vec, + region_epoch: RegionEpoch, + bucket_ranges: &Vec, + ) -> bool { + let origin_region_buckets = self.bucket_stat.as_ref().unwrap(); + let mut change_bucket_version = false; + let mut meta_idx = 0; + let mut region_buckets = origin_region_buckets.clone(); + let mut meta = (*region_buckets.meta).clone(); + meta.region_epoch = region_epoch; + + // bucket stats will clean if the bucket size is updated. + for (bucket, bucket_range) in buckets.into_iter().zip(bucket_ranges) { + // the bucket ranges maybe need to split or merge not all the meta keys, so it + // needs to find the first keys. + while meta_idx < meta.keys.len() && meta.keys[meta_idx] != bucket_range.0 { + meta_idx += 1; + } + // meta_idx can't be not the last entry (which is end key) + if meta_idx >= meta.keys.len() - 1 { + break; + } + // the bucket size is small and does not have split keys, + // then it should be merged with its left neighbor + let region_bucket_merge_size = + cfg.region_bucket_merge_size_ratio * (cfg.region_bucket_size.0 as f64); + if bucket.keys.is_empty() && bucket.size <= (region_bucket_merge_size as u64) { + meta.sizes[meta_idx] = bucket.size; + region_buckets.clean_stats(meta_idx); + // the region has more than one bucket + // and the left neighbor + current bucket size is not very big + if meta.keys.len() > 2 + && meta_idx != 0 + && meta.sizes[meta_idx - 1] + bucket.size < cfg.region_bucket_size.0 * 2 + { + // bucket is too small + region_buckets.left_merge(meta_idx); + meta.left_merge(meta_idx); + change_bucket_version = true; + continue; + } + } else { + // update size + meta.sizes[meta_idx] = bucket.size / (bucket.keys.len() + 1) as u64; + region_buckets.clean_stats(meta_idx); + // insert new bucket keys (split the original bucket) + for bucket_key in bucket.keys { + meta_idx 
+= 1; + region_buckets.split(meta_idx); + meta.split(meta_idx, bucket_key); + change_bucket_version = true; + } + } + meta_idx += 1; + } + if change_bucket_version { + meta.version = next_bucket_version; + } + region_buckets.meta = Arc::new(meta); + self.set_bucket_stat(Some(region_buckets)); + change_bucket_version + } + + fn init_buckets( + &mut self, + cfg: &Config, + next_bucket_version: u64, + mut buckets: Vec, + region_epoch: RegionEpoch, + region: &Region, + ) { + // when the region buckets is none, the exclusive buckets includes all the + // bucket keys. + assert_eq!(buckets.len(), 1); + let bucket_keys = buckets.pop().unwrap().keys; + let bucket_count = bucket_keys.len() + 1; + let mut meta = BucketMeta { + region_id: region.get_id(), + region_epoch, + version: next_bucket_version, + keys: bucket_keys, + sizes: vec![cfg.region_bucket_size.0; bucket_count], + }; + // padding the boundary keys and initialize the flow. + meta.keys.insert(0, region.get_start_key().to_vec()); + meta.keys.push(region.get_end_key().to_vec()); + let bucket_stats = BucketStat::from_meta(Arc::new(meta)); + self.set_bucket_stat(Some(bucket_stats)); + } +} + pub enum Task { SplitCheckTask { region: Region, @@ -482,6 +695,19 @@ impl Runner { }; if !split_keys.is_empty() { + // Notify peer that if the region is truly splitable. 
+ // If it's truly splitable, then skip_split_check should be false; + self.router.update_approximate_size( + region.get_id(), + None, + Some(!split_keys.is_empty()), + ); + self.router.update_approximate_keys( + region.get_id(), + None, + Some(!split_keys.is_empty()), + ); + let region_epoch = region.get_region_epoch().clone(); self.router .ask_split(region_id, region_epoch, split_keys, "split checker".into()); @@ -523,6 +749,7 @@ impl Runner { } else { (!host.enable_region_bucket(), &empty_bucket) }; + let mut split_keys = vec![]; MergedIterator::<::Iterator>::new( tablet, LARGE_CFS, start_key, end_key, false, @@ -535,6 +762,7 @@ impl Runner { let mut skip_on_kv = false; while let Some(e) = iter.next() { if skip_on_kv && skip_check_bucket { + split_keys = host.split_keys(); return; } if !skip_on_kv && host.on_kv(region, &e) { @@ -597,6 +825,8 @@ impl Runner { } } + split_keys = host.split_keys(); + // if we scan the whole range, we can update approximate size and keys with // accurate value. if is_key_range { @@ -610,8 +840,17 @@ impl Runner { "bucket_count" => buckets.len(), "bucket_size" => bucket_size, ); - self.router.update_approximate_size(region.get_id(), size); - self.router.update_approximate_keys(region.get_id(), keys); + + self.router.update_approximate_size( + region.get_id(), + Some(size), + Some(!split_keys.is_empty()), + ); + self.router.update_approximate_keys( + region.get_id(), + Some(keys), + Some(!split_keys.is_empty()), + ); })?; if host.enable_region_bucket() { @@ -626,7 +865,7 @@ impl Runner { } timer.observe_duration(); - Ok(host.split_keys()) + Ok(split_keys) } fn change_cfg(&mut self, change: ConfigChange) { @@ -702,3 +941,178 @@ where } } } + +#[cfg(test)] +mod tests { + use super::*; + + // create BucketStatsInfo include three keys: ["","100","200",""]. 
+ fn mock_bucket_stats_info() -> BucketStatsInfo { + let mut bucket_stats_info = BucketStatsInfo::default(); + let cfg = Config::default(); + let next_bucket_version = 1; + let bucket_ranges = None; + let mut region_epoch = RegionEpoch::default(); + region_epoch.set_conf_ver(1); + region_epoch.set_version(1); + let mut region = Region::default(); + region.set_id(1); + + let mut buckets = vec![]; + let mut bucket = Bucket::default(); + bucket.keys.push(vec![100]); + bucket.keys.push(vec![200]); + buckets.insert(0, bucket); + + let _ = bucket_stats_info.on_refresh_region_buckets( + &cfg, + next_bucket_version, + buckets, + region_epoch, + ®ion, + bucket_ranges, + ); + bucket_stats_info + } + + #[test] + pub fn test_version() { + let mut bucket_stats_info = mock_bucket_stats_info(); + assert_eq!(1, bucket_stats_info.version()); + bucket_stats_info.set_bucket_stat(None); + assert_eq!(1, bucket_stats_info.version()); + + let mut meta = BucketMeta::default(); + meta.version = 2; + meta.keys.push(vec![]); + meta.keys.push(vec![]); + let bucket_stat = BucketStat::from_meta(Arc::new(meta)); + bucket_stats_info.set_bucket_stat(Some(bucket_stat)); + assert_eq!(2, bucket_stats_info.version()); + } + + #[test] + pub fn test_insert_new_buckets() { + let bucket_stats_info = mock_bucket_stats_info(); + + let cfg = Config::default(); + let bucket_stat = bucket_stats_info.bucket_stat.unwrap(); + assert_eq!( + vec![vec![], vec![100], vec![200], vec![]], + bucket_stat.meta.keys + ); + for i in 0..bucket_stat.stats.write_bytes.len() { + assert_eq!(cfg.region_bucket_size.0, bucket_stat.meta.sizes[i]); + assert_eq!(0, bucket_stat.stats.write_bytes[i]); + } + } + + #[test] + pub fn test_report_buckets() { + let mut bucket_stats_info = mock_bucket_stats_info(); + let bucket_stats = bucket_stats_info.bucket_stat().as_ref().unwrap(); + let mut delta_bucket_stats = bucket_stats.clone(); + delta_bucket_stats.write_key(&[1], 1); + delta_bucket_stats.write_key(&[201], 1); + 
bucket_stats_info.add_bucket_flow(&Some(delta_bucket_stats.clone())); + let bucket_stats = bucket_stats_info.report_bucket_stat(); + assert_eq!(vec![2, 0, 2], bucket_stats.stats.write_bytes); + + let report_bucket_stats = bucket_stats_info.report_bucket_stat(); + assert_eq!(vec![0, 0, 0], report_bucket_stats.stats.write_bytes); + bucket_stats_info.add_bucket_flow(&Some(delta_bucket_stats)); + assert_eq!(vec![2, 0, 2], bucket_stats.stats.write_bytes); + } + + #[test] + pub fn test_spilt_and_merge_buckets() { + let mut bucket_stats_info = mock_bucket_stats_info(); + let next_bucket_version = 2; + let mut region = Region::default(); + region.set_id(1); + let cfg = Config::default(); + let bucket_size = cfg.region_bucket_size.0; + let bucket_stats = bucket_stats_info.bucket_stat().as_ref().unwrap(); + let region_epoch = bucket_stats.meta.region_epoch.clone(); + + // step1: update buckets flow + let mut delta_bucket_stats = bucket_stats.clone(); + delta_bucket_stats.write_key(&[1], 1); + delta_bucket_stats.write_key(&[201], 1); + bucket_stats_info.add_bucket_flow(&Some(delta_bucket_stats)); + let bucket_stats = bucket_stats_info.bucket_stat().as_ref().unwrap(); + assert_eq!(vec![2, 0, 2], bucket_stats.stats.write_bytes); + + // step2: tick not affect anything + let bucket_ranges = Some(vec![]); + let buckets = vec![]; + let mut change_bucket_version = bucket_stats_info.on_refresh_region_buckets( + &cfg, + next_bucket_version, + buckets, + region_epoch.clone(), + ®ion, + bucket_ranges, + ); + let bucket_stats = bucket_stats_info.bucket_stat().as_ref().unwrap(); + assert!(!change_bucket_version); + assert_eq!(vec![2, 0, 2], bucket_stats.stats.write_bytes); + + // step3: split key 50 + let mut bucket_ranges = Some(vec![BucketRange(vec![], vec![100])]); + let mut bucket = Bucket::default(); + bucket.keys = vec![vec![50]]; + bucket.size = bucket_size; + let mut buckets = vec![bucket]; + change_bucket_version = bucket_stats_info.on_refresh_region_buckets( + &cfg, + 
next_bucket_version, + buckets.clone(), + region_epoch.clone(), + ®ion, + bucket_ranges.clone(), + ); + assert!(change_bucket_version); + let bucket_stats = bucket_stats_info.bucket_stat().as_ref().unwrap(); + assert_eq!( + vec![vec![], vec![50], vec![100], vec![200], vec![]], + bucket_stats.meta.keys + ); + assert_eq!( + vec![bucket_size / 2, bucket_size / 2, bucket_size, bucket_size], + bucket_stats.meta.sizes + ); + assert_eq!(vec![0, 0, 0, 2], bucket_stats.stats.write_bytes); + + // step4: merge [50-100] to [0-50], + bucket_ranges = Some(vec![BucketRange(vec![50], vec![100])]); + let mut bucket = Bucket::default(); + bucket.keys = vec![]; + bucket.size = 0; + buckets = vec![bucket]; + change_bucket_version = bucket_stats_info.on_refresh_region_buckets( + &cfg, + next_bucket_version, + buckets, + region_epoch, + ®ion, + bucket_ranges, + ); + assert!(change_bucket_version); + + let bucket_stats = bucket_stats_info.bucket_stat().as_ref().unwrap(); + assert_eq!( + vec![vec![], vec![100], vec![200], vec![]], + bucket_stats.meta.keys + ); + assert_eq!( + vec![bucket_size / 2, bucket_size, bucket_size], + bucket_stats.meta.sizes + ); + assert_eq!(vec![0, 0, 2], bucket_stats.stats.write_bytes); + + // report buckets doesn't be affected by the split and merge. 
+ let report_bucket_stats = bucket_stats_info.report_bucket_stat(); + assert_eq!(vec![4, 0, 2], report_bucket_stats.stats.write_bytes); + } +} diff --git a/components/raftstore/src/store/worker/split_config.rs b/components/raftstore/src/store/worker/split_config.rs index 8fec853bb00..2d29bd21a89 100644 --- a/components/raftstore/src/store/worker/split_config.rs +++ b/components/raftstore/src/store/worker/split_config.rs @@ -68,18 +68,18 @@ pub fn get_sample_num() -> usize { #[serde(default)] #[serde(rename_all = "kebab-case")] pub struct SplitConfig { - pub qps_threshold: usize, + pub qps_threshold: Option, pub split_balance_score: f64, pub split_contained_score: f64, pub detect_times: u64, pub sample_num: usize, pub sample_threshold: u64, - pub byte_threshold: usize, + pub byte_threshold: Option, #[doc(hidden)] pub grpc_thread_cpu_overload_threshold_ratio: f64, #[doc(hidden)] pub unified_read_pool_thread_cpu_overload_threshold_ratio: f64, - pub region_cpu_overload_threshold_ratio: f64, + pub region_cpu_overload_threshold_ratio: Option, // deprecated. #[online_config(skip)] #[doc(hidden)] @@ -95,18 +95,18 @@ pub struct SplitConfig { impl Default for SplitConfig { fn default() -> SplitConfig { SplitConfig { - qps_threshold: DEFAULT_QPS_THRESHOLD, + qps_threshold: None, split_balance_score: DEFAULT_SPLIT_BALANCE_SCORE, split_contained_score: DEFAULT_SPLIT_CONTAINED_SCORE, detect_times: DEFAULT_DETECT_TIMES, sample_num: DEFAULT_SAMPLE_NUM, sample_threshold: DEFAULT_SAMPLE_THRESHOLD, - byte_threshold: DEFAULT_BYTE_THRESHOLD, + byte_threshold: None, grpc_thread_cpu_overload_threshold_ratio: DEFAULT_GRPC_THREAD_CPU_OVERLOAD_THRESHOLD_RATIO, unified_read_pool_thread_cpu_overload_threshold_ratio: DEFAULT_UNIFIED_READ_POOL_THREAD_CPU_OVERLOAD_THRESHOLD_RATIO, - region_cpu_overload_threshold_ratio: REGION_CPU_OVERLOAD_THRESHOLD_RATIO, + region_cpu_overload_threshold_ratio: None, size_threshold: None, // deprecated. key_threshold: None, // deprecated. 
} @@ -124,7 +124,7 @@ impl SplitConfig { ("split_balance_score or split_contained_score should be between 0 and 1.").into(), ); } - if self.sample_num >= self.qps_threshold { + if self.sample_num >= self.qps_threshold() { return Err( ("sample_num should be less than qps_threshold for load-base-split.").into(), ); @@ -133,20 +133,52 @@ impl SplitConfig { || self.grpc_thread_cpu_overload_threshold_ratio < 0.0 || self.unified_read_pool_thread_cpu_overload_threshold_ratio > 1.0 || self.unified_read_pool_thread_cpu_overload_threshold_ratio < 0.0 - || self.region_cpu_overload_threshold_ratio > 1.0 - || self.region_cpu_overload_threshold_ratio < 0.0 + || self.region_cpu_overload_threshold_ratio() > 1.0 + || self.region_cpu_overload_threshold_ratio() < 0.0 { return Err(("threshold ratio should be between 0 and 1.").into()); } Ok(()) } + pub fn qps_threshold(&self) -> usize { + self.qps_threshold.unwrap_or(DEFAULT_QPS_THRESHOLD) + } + + pub fn byte_threshold(&self) -> usize { + self.byte_threshold.unwrap_or(DEFAULT_BYTE_THRESHOLD) + } + + pub fn region_cpu_overload_threshold_ratio(&self) -> f64 { + self.region_cpu_overload_threshold_ratio + .unwrap_or(REGION_CPU_OVERLOAD_THRESHOLD_RATIO) + } + pub fn optimize_for(&mut self, region_size: ReadableSize) { const LARGE_REGION_SIZE_IN_MB: u64 = 4096; - if region_size.as_mb() >= LARGE_REGION_SIZE_IN_MB { - self.qps_threshold = DEFAULT_BIG_REGION_QPS_THRESHOLD; - self.region_cpu_overload_threshold_ratio = BIG_REGION_CPU_OVERLOAD_THRESHOLD_RATIO; - self.byte_threshold = DEFAULT_BIG_REGION_BYTE_THRESHOLD; + let big_size = region_size.as_mb() >= LARGE_REGION_SIZE_IN_MB; + if self.qps_threshold.is_none() { + self.qps_threshold = Some(if big_size { + DEFAULT_BIG_REGION_QPS_THRESHOLD + } else { + DEFAULT_QPS_THRESHOLD + }); + } + + if self.byte_threshold.is_none() { + self.byte_threshold = Some(if big_size { + DEFAULT_BIG_REGION_BYTE_THRESHOLD + } else { + DEFAULT_BYTE_THRESHOLD + }); + } + + if 
self.region_cpu_overload_threshold_ratio.is_none() { + self.region_cpu_overload_threshold_ratio = Some(if big_size { + BIG_REGION_CPU_OVERLOAD_THRESHOLD_RATIO + } else { + REGION_CPU_OVERLOAD_THRESHOLD_RATIO + }); } } } diff --git a/components/raftstore/src/store/worker/split_controller.rs b/components/raftstore/src/store/worker/split_controller.rs index d432f264e01..4bbcc773763 100644 --- a/components/raftstore/src/store/worker/split_controller.rs +++ b/components/raftstore/src/store/worker/split_controller.rs @@ -608,7 +608,7 @@ impl AutoSplitController { } fn should_check_region_cpu(&self) -> bool { - self.cfg.region_cpu_overload_threshold_ratio > 0.0 + self.cfg.region_cpu_overload_threshold_ratio() > 0.0 } fn is_grpc_poll_busy(&self, avg_grpc_thread_usage: f64) -> bool { @@ -643,7 +643,7 @@ impl AutoSplitController { return false; } region_cpu_usage / unified_read_pool_thread_usage - >= self.cfg.region_cpu_overload_threshold_ratio + >= self.cfg.region_cpu_overload_threshold_ratio() } // collect the read stats from read_stats_vec and dispatch them to a Region @@ -787,9 +787,9 @@ impl AutoSplitController { debug!("load base split params"; "region_id" => region_id, "qps" => qps, - "qps_threshold" => self.cfg.qps_threshold, + "qps_threshold" => self.cfg.qps_threshold(), "byte" => byte, - "byte_threshold" => self.cfg.byte_threshold, + "byte_threshold" => self.cfg.byte_threshold(), "cpu_usage" => cpu_usage, "is_region_busy" => is_region_busy, ); @@ -800,8 +800,8 @@ impl AutoSplitController { // 1. If the QPS or the byte does not meet the threshold, skip. // 2. If the Unified Read Pool or the region is not hot enough, skip. 
- if qps < self.cfg.qps_threshold - && byte < self.cfg.byte_threshold + if qps < self.cfg.qps_threshold() + && byte < self.cfg.byte_threshold() && (!is_unified_read_pool_busy || !is_region_busy) { self.recorders.remove_entry(®ion_id); @@ -917,13 +917,13 @@ impl AutoSplitController { pub fn refresh_and_check_cfg(&mut self) -> SplitConfigChange { let mut cfg_change = SplitConfigChange::Noop; if let Some(incoming) = self.cfg_tracker.any_new() { - if self.cfg.region_cpu_overload_threshold_ratio <= 0.0 - && incoming.region_cpu_overload_threshold_ratio > 0.0 + if self.cfg.region_cpu_overload_threshold_ratio() <= 0.0 + && incoming.region_cpu_overload_threshold_ratio() > 0.0 { cfg_change = SplitConfigChange::UpdateRegionCpuCollector(true); } - if self.cfg.region_cpu_overload_threshold_ratio > 0.0 - && incoming.region_cpu_overload_threshold_ratio <= 0.0 + if self.cfg.region_cpu_overload_threshold_ratio() > 0.0 + && incoming.region_cpu_overload_threshold_ratio() <= 0.0 { cfg_change = SplitConfigChange::UpdateRegionCpuCollector(false); } @@ -943,12 +943,12 @@ impl AutoSplitController { mod tests { use online_config::{ConfigChange, ConfigManager, ConfigValue}; use resource_metering::{RawRecord, TagInfos}; - use tikv_util::config::VersionTrack; + use tikv_util::config::{ReadableSize, VersionTrack}; use txn_types::Key; use super::*; use crate::store::worker::split_config::{ - DEFAULT_SAMPLE_NUM, REGION_CPU_OVERLOAD_THRESHOLD_RATIO, + BIG_REGION_CPU_OVERLOAD_THRESHOLD_RATIO, DEFAULT_SAMPLE_NUM, }; enum Position { @@ -1193,7 +1193,7 @@ mod tests { fn check_split_key(mode: &[u8], qps_stats: Vec, split_keys: Vec<&[u8]>) { let mode = String::from_utf8(Vec::from(mode)).unwrap(); let mut hub = AutoSplitController::default(); - hub.cfg.qps_threshold = 1; + hub.cfg.qps_threshold = Some(1); hub.cfg.sample_threshold = 0; for i in 0..10 { @@ -1226,7 +1226,7 @@ mod tests { ) { let mode = String::from_utf8(Vec::from(mode)).unwrap(); let mut hub = AutoSplitController::default(); - 
hub.cfg.qps_threshold = 1; + hub.cfg.qps_threshold = Some(1); hub.cfg.sample_threshold = 0; for i in 0..10 { @@ -1291,7 +1291,7 @@ mod tests { #[test] fn test_sample_key_num() { let mut hub = AutoSplitController::default(); - hub.cfg.qps_threshold = 2000; + hub.cfg.qps_threshold = Some(2000); hub.cfg.sample_num = 2000; hub.cfg.sample_threshold = 0; @@ -1608,7 +1608,8 @@ mod tests { #[test] fn test_refresh_and_check_cfg() { - let split_config = SplitConfig::default(); + let mut split_config = SplitConfig::default(); + split_config.optimize_for(ReadableSize::mb(5000)); let mut split_cfg_manager = SplitConfigManager::new(Arc::new(VersionTrack::new(split_config))); let mut auto_split_controller = @@ -1620,8 +1621,8 @@ mod tests { assert_eq!( auto_split_controller .cfg - .region_cpu_overload_threshold_ratio, - REGION_CPU_OVERLOAD_THRESHOLD_RATIO + .region_cpu_overload_threshold_ratio(), + BIG_REGION_CPU_OVERLOAD_THRESHOLD_RATIO ); // Set to zero. dispatch_split_cfg_change( @@ -1636,7 +1637,7 @@ mod tests { assert_eq!( auto_split_controller .cfg - .region_cpu_overload_threshold_ratio, + .region_cpu_overload_threshold_ratio(), 0.0 ); assert_eq!( @@ -1647,7 +1648,7 @@ mod tests { dispatch_split_cfg_change( &mut split_cfg_manager, "region_cpu_overload_threshold_ratio", - ConfigValue::F64(REGION_CPU_OVERLOAD_THRESHOLD_RATIO), + ConfigValue::F64(0.1), ); assert_eq!( auto_split_controller.refresh_and_check_cfg(), @@ -1656,8 +1657,8 @@ mod tests { assert_eq!( auto_split_controller .cfg - .region_cpu_overload_threshold_ratio, - REGION_CPU_OVERLOAD_THRESHOLD_RATIO + .region_cpu_overload_threshold_ratio(), + 0.1 ); assert_eq!( auto_split_controller.refresh_and_check_cfg(), diff --git a/components/region_cache_memory_engine/Cargo.toml b/components/region_cache_memory_engine/Cargo.toml new file mode 100644 index 00000000000..ec19cbbeac4 --- /dev/null +++ b/components/region_cache_memory_engine/Cargo.toml @@ -0,0 +1,15 @@ +[package] +name = "region_cache_memory_engine" +version = 
"0.0.1" +edition = "2021" +publish = false + +[features] +testexport = [] + +[dependencies] +bytes = "1.0" +collections = { workspace = true } +engine_traits = { workspace = true } +skiplist-rs = { git = "https://github.com/tikv/skiplist-rs.git", branch = "main" } +tikv_util = { workspace = true } diff --git a/components/region_cache_memory_engine/src/engine.rs b/components/region_cache_memory_engine/src/engine.rs new file mode 100644 index 00000000000..a8ee66a5b23 --- /dev/null +++ b/components/region_cache_memory_engine/src/engine.rs @@ -0,0 +1,787 @@ +// Copyright 2023 TiKV Project Authors. Licensed under Apache-2.0. + +use core::slice::SlicePattern; +use std::{ + collections::BTreeMap, + fmt::{self, Debug}, + ops::Deref, + sync::{Arc, Mutex}, +}; + +use bytes::Bytes; +use collections::HashMap; +use engine_traits::{ + CfNamesExt, DbVector, Error, IterOptions, Iterable, Iterator, Mutable, Peekable, ReadOptions, + RegionCacheEngine, Result, Snapshot, SnapshotMiscExt, WriteBatch, WriteBatchExt, WriteOptions, + CF_DEFAULT, CF_LOCK, CF_WRITE, +}; +use skiplist_rs::{ByteWiseComparator, IterRef, Skiplist}; +use tikv_util::config::ReadableSize; + +fn cf_to_id(cf: &str) -> usize { + match cf { + CF_DEFAULT => 0, + CF_LOCK => 1, + CF_WRITE => 2, + _ => panic!("unrecognized cf {}", cf), + } +} + +/// RegionMemoryEngine stores data for a specific cached region +/// +/// todo: The skiplist used here currently is for test purpose. Replace it +/// with a formal implementation. 
+#[derive(Clone)]
+pub struct RegionMemoryEngine {
+    data: [Arc<Skiplist<ByteWiseComparator>>; 3], // one skiplist per CF, indexed by `cf_to_id`
+}
+
+impl RegionMemoryEngine {
+    pub fn with_capacity(arena_size: usize) -> Self {
+        RegionMemoryEngine {
+            data: [
+                Arc::new(Skiplist::with_capacity(
+                    ByteWiseComparator::default(),
+                    arena_size,
+                    true,
+                )),
+                Arc::new(Skiplist::with_capacity(
+                    ByteWiseComparator::default(),
+                    arena_size,
+                    true,
+                )),
+                Arc::new(Skiplist::with_capacity(
+                    ByteWiseComparator::default(),
+                    arena_size,
+                    true,
+                )),
+            ],
+        }
+    }
+}
+
+impl Default for RegionMemoryEngine {
+    fn default() -> Self {
+        RegionMemoryEngine::with_capacity(ReadableSize::mb(1).0 as usize)
+    }
+}
+
+impl Debug for RegionMemoryEngine {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        write!(f, "Region Memory Engine")
+    }
+}
+
+// Maps read_ts -> ref_count of the snapshots currently granted at that ts.
+#[derive(Default)]
+struct SnapshotList(BTreeMap<u64, u64>);
+
+impl SnapshotList {
+    fn new_snapshot(&mut self, read_ts: u64) {
+        // A snapshot with this ts may have been granted before, so bump the
+        // existing ref count instead of overwriting it.
+        *self.0.entry(read_ts).or_insert(0) += 1;
+    }
+
+    fn remove_snapshot(&mut self, read_ts: u64) {
+        let count = self.0.get_mut(&read_ts).unwrap();
+        assert!(*count >= 1);
+        if *count == 1 {
+            self.0.remove(&read_ts).unwrap();
+        } else {
+            *count -= 1;
+        }
+    }
+}
+
+#[derive(Default)]
+pub struct RegionMemoryMeta {
+    // It records the snapshots that have been granted previously with specific snapshot_ts. We
+    // must guarantee that the data visible to any of these snapshots will not be removed.
+    snapshot_list: SnapshotList,
+    // It indicates whether the region is readable. False means the integrity of the data in
+    // this cached region is not satisfied, for instance because it has been evicted.
+    can_read: bool,
+    // Requests with read_ts at or below it are not eligible for being granted a snapshot.
+    // Note: different regions can have different safe_ts.
+    safe_ts: u64,
+}
+
+impl RegionMemoryMeta {
+    pub fn set_can_read(&mut self, can_read: bool) {
+        self.can_read = can_read;
+    }
+
+    pub fn set_safe_ts(&mut self, safe_ts: u64) {
+        self.safe_ts = safe_ts;
+    }
+}
+
+#[derive(Default)]
+pub struct RegionCacheMemoryEngineCore {
+    engine: HashMap<u64, RegionMemoryEngine>, // region_id -> cached data of that region
+    region_metas: HashMap<u64, RegionMemoryMeta>, // region_id -> snapshot/readability metadata
+}
+
+impl RegionCacheMemoryEngineCore {
+    pub fn mut_region_meta(&mut self, region_id: u64) -> Option<&mut RegionMemoryMeta> {
+        self.region_metas.get_mut(&region_id)
+    }
+}
+
+/// The RegionCacheMemoryEngine serves as a region cache, storing hot regions in
+/// the leaders' store. Incoming writes that are written to disk engine (now,
+/// RocksDB) are also written to the RegionCacheMemoryEngine, leading to a
+/// mirrored data set in the cached regions with the disk engine.
+///
+/// A load/evict unit manages the memory, deciding which regions should be
+/// evicted when the memory used by the RegionCacheMemoryEngine reaches a
+/// certain limit, and determining which regions should be loaded when there is
+/// spare memory capacity.
+///
+/// The safe point lifetime differs between RegionCacheMemoryEngine and the disk
+/// engine, often being much shorter in RegionCacheMemoryEngine. This means that
+/// RegionCacheMemoryEngine may filter out some keys that still exist in the
+/// disk engine, thereby improving read performance as fewer duplicated keys
+/// will be read. If there's a need to read keys that may have been filtered by
+/// RegionCacheMemoryEngine (as indicated by read_ts and safe_point of the
+/// cached region), we resort to using the disk engine's snapshot instead.
+#[derive(Clone, Default)] +pub struct RegionCacheMemoryEngine { + core: Arc>, +} + +impl RegionCacheMemoryEngine { + pub fn core(&self) -> &Arc> { + &self.core + } +} + +impl Debug for RegionCacheMemoryEngine { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "Region Cache Memory Engine") + } +} + +impl RegionCacheMemoryEngine { + pub fn new_region(&self, region_id: u64) { + let mut core = self.core.lock().unwrap(); + + assert!(core.engine.get(®ion_id).is_none()); + assert!(core.region_metas.get(®ion_id).is_none()); + core.engine.insert(region_id, RegionMemoryEngine::default()); + core.region_metas + .insert(region_id, RegionMemoryMeta::default()); + } +} + +impl RegionCacheEngine for RegionCacheMemoryEngine { + type Snapshot = RegionCacheSnapshot; + + // todo(SpadeA): add sequence number logic + fn snapshot(&self, region_id: u64, read_ts: u64, seq_num: u64) -> Option { + RegionCacheSnapshot::new(self.clone(), region_id, read_ts, seq_num) + } +} + +// todo: fill fields needed +pub struct RegionCacheWriteBatch; + +impl WriteBatchExt for RegionCacheMemoryEngine { + type WriteBatch = RegionCacheWriteBatch; + // todo: adjust it + const WRITE_BATCH_MAX_KEYS: usize = 256; + + fn write_batch(&self) -> Self::WriteBatch { + RegionCacheWriteBatch {} + } + + fn write_batch_with_cap(&self, _: usize) -> Self::WriteBatch { + RegionCacheWriteBatch {} + } +} + +pub struct RegionCacheIterator { + cf: String, + valid: bool, + prefix_same_as_start: bool, + prefix: Option>, + iter: IterRef, ByteWiseComparator>, + // The lower bound is inclusive while the upper bound is exclusive if set + lower_bound: Vec, + upper_bound: Vec, +} + +impl Iterable for RegionCacheMemoryEngine { + type Iterator = RegionCacheIterator; + + fn iterator_opt(&self, cf: &str, opts: IterOptions) -> Result { + unimplemented!() + } +} + +impl Iterator for RegionCacheIterator { + fn key(&self) -> &[u8] { + assert!(self.valid); + self.iter.key().as_slice() + } + + fn value(&self) -> &[u8] { + 
assert!(self.valid); + self.iter.value().as_slice() + } + + fn next(&mut self) -> Result { + assert!(self.valid); + self.iter.next(); + self.valid = self.iter.valid() && self.iter.key().as_slice() < self.upper_bound.as_slice(); + + if self.valid && self.prefix_same_as_start { + // todo(SpadeA): support prefix seek + unimplemented!() + } + Ok(self.valid) + } + + fn prev(&mut self) -> Result { + assert!(self.valid); + self.iter.prev(); + self.valid = self.iter.valid() && self.iter.key().as_slice() >= self.lower_bound.as_slice(); + if self.valid && self.prefix_same_as_start { + // todo(SpadeA): support prefix seek + unimplemented!() + } + Ok(self.valid) + } + + fn seek(&mut self, key: &[u8]) -> Result { + let seek_key = if key < self.lower_bound.as_slice() { + self.lower_bound.as_slice() + } else { + key + }; + self.iter.seek(seek_key); + self.valid = self.iter.valid() && self.iter.key().as_slice() < self.upper_bound.as_slice(); + + if self.valid && self.prefix_same_as_start { + // todo(SpadeA): support prefix seek + unimplemented!() + } + + Ok(self.valid) + } + + fn seek_for_prev(&mut self, key: &[u8]) -> Result { + let end = if key > self.upper_bound.as_slice() { + self.upper_bound.as_slice() + } else { + key + }; + self.iter.seek_for_prev(end); + self.valid = self.iter.valid() && self.iter.key().as_slice() >= self.lower_bound.as_slice(); + + if self.valid && self.prefix_same_as_start { + // todo(SpadeA): support prefix seek + unimplemented!() + } + + Ok(self.valid) + } + + fn seek_to_first(&mut self) -> Result { + let lower_bound = self.lower_bound.clone(); + self.seek(lower_bound.as_slice()) + } + + fn seek_to_last(&mut self) -> Result { + let upper_bound = self.upper_bound.clone(); + self.seek_for_prev(upper_bound.as_slice()) + } + + fn valid(&self) -> Result { + Ok(self.valid) + } +} + +impl WriteBatch for RegionCacheWriteBatch { + fn write_opt(&mut self, _: &WriteOptions) -> Result { + unimplemented!() + } + + fn data_size(&self) -> usize { + unimplemented!() + 
} + + fn count(&self) -> usize { + unimplemented!() + } + + fn is_empty(&self) -> bool { + unimplemented!() + } + + fn should_write_to_engine(&self) -> bool { + unimplemented!() + } + + fn clear(&mut self) { + unimplemented!() + } + + fn set_save_point(&mut self) { + unimplemented!() + } + + fn pop_save_point(&mut self) -> Result<()> { + unimplemented!() + } + + fn rollback_to_save_point(&mut self) -> Result<()> { + unimplemented!() + } + + fn merge(&mut self, _: Self) -> Result<()> { + unimplemented!() + } +} + +impl Mutable for RegionCacheWriteBatch { + fn put(&mut self, _: &[u8], _: &[u8]) -> Result<()> { + unimplemented!() + } + + fn put_cf(&mut self, _: &str, _: &[u8], _: &[u8]) -> Result<()> { + unimplemented!() + } + + fn delete(&mut self, _: &[u8]) -> Result<()> { + unimplemented!() + } + + fn delete_cf(&mut self, _: &str, _: &[u8]) -> Result<()> { + unimplemented!() + } + + fn delete_range(&mut self, _: &[u8], _: &[u8]) -> Result<()> { + unimplemented!() + } + + fn delete_range_cf(&mut self, _: &str, _: &[u8], _: &[u8]) -> Result<()> { + unimplemented!() + } +} + +#[derive(Clone, Debug)] +pub struct RegionCacheSnapshot { + region_id: u64, + snapshot_ts: u64, + // Sequence number is shared between RegionCacheEngine and disk KvEngine to + // provide atomic write + sequence_number: u64, + region_memory_engine: RegionMemoryEngine, + engine: RegionCacheMemoryEngine, +} + +impl RegionCacheSnapshot { + pub fn new( + engine: RegionCacheMemoryEngine, + region_id: u64, + read_ts: u64, + seq_num: u64, + ) -> Option { + let mut core = engine.core.lock().unwrap(); + let region_meta = core.region_metas.get_mut(&region_id)?; + if !region_meta.can_read { + return None; + } + + if read_ts <= region_meta.safe_ts { + // todo(SpadeA): add metrics for it + return None; + } + + region_meta.snapshot_list.new_snapshot(read_ts); + + Some(RegionCacheSnapshot { + region_id, + snapshot_ts: read_ts, + sequence_number: seq_num, + region_memory_engine: 
core.engine.get(&region_id).unwrap().clone(), + engine: engine.clone(), + }) + } +} + +impl Drop for RegionCacheSnapshot { + fn drop(&mut self) { + let mut core = self.engine.core.lock().unwrap(); + let meta = core.region_metas.get_mut(&self.region_id).unwrap(); + meta.snapshot_list.remove_snapshot(self.snapshot_ts); + } +} + +impl Snapshot for RegionCacheSnapshot {} + +impl Iterable for RegionCacheSnapshot { + type Iterator = RegionCacheIterator; + + fn iterator_opt(&self, cf: &str, opts: IterOptions) -> Result { + let iter = self.region_memory_engine.data[cf_to_id(cf)].iter(); + let prefix_same_as_start = opts.prefix_same_as_start(); + let (lower_bound, upper_bound) = opts.build_bounds(); + // only support with lower/upper bound set + if lower_bound.is_none() || upper_bound.is_none() { + return Err(Error::BoundaryNotSet); + } + Ok(RegionCacheIterator { + cf: String::from(cf), + valid: false, + prefix_same_as_start, + prefix: None, + lower_bound: lower_bound.unwrap(), + upper_bound: upper_bound.unwrap(), + iter, + }) + } +} + +impl Peekable for RegionCacheSnapshot { + type DbVector = RegionCacheDbVector; + + fn get_value_opt(&self, opts: &ReadOptions, key: &[u8]) -> Result> { + self.get_value_cf_opt(opts, CF_DEFAULT, key) + } + + fn get_value_cf_opt( + &self, + _: &ReadOptions, + cf: &str, + key: &[u8], + ) -> Result> { + Ok(self.region_memory_engine.data[cf_to_id(cf)] + .get(key) + .cloned() + .map(|v| RegionCacheDbVector(v))) + } +} + +impl CfNamesExt for RegionCacheSnapshot { + fn cf_names(&self) -> Vec<&str> { + unimplemented!() + } +} + +impl SnapshotMiscExt for RegionCacheSnapshot { + fn sequence_number(&self) -> u64 { + self.sequence_number + } +} + +#[derive(Debug)] +pub struct RegionCacheDbVector(Bytes); + +impl Deref for RegionCacheDbVector { + type Target = [u8]; + + fn deref(&self) -> &[u8] { + self.0.as_slice() + } +} + +impl DbVector for RegionCacheDbVector {} + +impl<'a> PartialEq<&'a [u8]> for RegionCacheDbVector { + fn eq(&self, rhs: &&[u8]) -> bool 
{ + self.0.as_slice() == *rhs + } +} + +#[cfg(test)] +mod tests { + use core::ops::Range; + use std::{iter::StepBy, sync::Arc}; + + use bytes::Bytes; + use engine_traits::{ + IterOptions, Iterable, Iterator, Peekable, ReadOptions, RegionCacheEngine, + }; + use skiplist_rs::{ByteWiseComparator, Skiplist}; + + use super::{cf_to_id, RegionCacheIterator}; + use crate::RegionCacheMemoryEngine; + + #[test] + fn test_snapshot() { + let engine = RegionCacheMemoryEngine::default(); + engine.new_region(1); + + let verify_snapshot_count = |snapshot_ts, count| { + let core = engine.core.lock().unwrap(); + if count > 0 { + assert_eq!( + *core + .region_metas + .get(&1) + .unwrap() + .snapshot_list + .0 + .get(&snapshot_ts) + .unwrap(), + count + ); + } else { + assert!( + core.region_metas + .get(&1) + .unwrap() + .snapshot_list + .0 + .get(&snapshot_ts) + .is_none() + ) + } + }; + + assert!(engine.snapshot(1, 5, u64::MAX).is_none()); + + { + let mut core = engine.core.lock().unwrap(); + core.region_metas.get_mut(&1).unwrap().can_read = true; + } + let s1 = engine.snapshot(1, 5, u64::MAX).unwrap(); + + { + let mut core = engine.core.lock().unwrap(); + core.region_metas.get_mut(&1).unwrap().safe_ts = 5; + } + assert!(engine.snapshot(1, 5, u64::MAX).is_none()); + let s2 = engine.snapshot(1, 10, u64::MAX).unwrap(); + + verify_snapshot_count(5, 1); + verify_snapshot_count(10, 1); + let s3 = engine.snapshot(1, 10, u64::MAX).unwrap(); + verify_snapshot_count(10, 2); + + drop(s1); + verify_snapshot_count(5, 0); + drop(s2); + verify_snapshot_count(10, 1); + let s4 = engine.snapshot(1, 10, u64::MAX).unwrap(); + verify_snapshot_count(10, 2); + drop(s4); + verify_snapshot_count(10, 1); + drop(s3); + verify_snapshot_count(10, 0); + } + + fn construct_key(i: i32) -> String { + format!("key-{:08}", i) + } + + fn construct_value(i: i32) -> String { + format!("value-{:08}", i) + } + + fn fill_data_in_skiplist(sl: Arc>, range: StepBy>) { + for i in range { + let key = construct_key(i); + let 
val = construct_value(i); + sl.put(Bytes::from(key), Bytes::from(val)); + } + } + + fn verify_key_value(k: &[u8], v: &[u8], i: i32) { + let key = construct_key(i); + let val = construct_value(i); + assert_eq!(k, key.as_bytes()); + assert_eq!(v, val.as_bytes()); + } + + fn verify_key_values( + iter: &mut RegionCacheIterator, + step: i32, + mut start_idx: i32, + end_idx: i32, + ) { + let forward = step > 0; + while iter.valid().unwrap() { + let k = iter.key(); + let val = iter.value(); + verify_key_value(k, val, start_idx); + if forward { + iter.next().unwrap(); + } else { + iter.prev().unwrap(); + } + start_idx += step; + } + + if forward { + assert!(start_idx - step < end_idx); + } else { + assert!(start_idx - step > end_idx); + } + } + + #[test] + fn test_get_value() { + let engine = RegionCacheMemoryEngine::default(); + engine.new_region(1); + + { + let mut core = engine.core.lock().unwrap(); + core.region_metas.get_mut(&1).unwrap().can_read = true; + core.region_metas.get_mut(&1).unwrap().safe_ts = 5; + let sl = core.engine.get_mut(&1).unwrap().data[cf_to_id("write")].clone(); + fill_data_in_skiplist(sl, (1..100).step_by(1)); + } + + let snapshot = engine.snapshot(1, 10, u64::MAX).unwrap(); + let opts = ReadOptions::default(); + for i in 1..100 { + let k = construct_key(i); + let v = snapshot + .get_value_cf_opt(&opts, "write", k.as_bytes()) + .unwrap() + .unwrap(); + verify_key_value(k.as_bytes(), &v, i); + } + + let k = construct_key(100); + assert!( + snapshot + .get_value_cf_opt(&opts, "write", k.as_bytes()) + .unwrap() + .is_none() + ); + } + + #[test] + fn test_iterator_forawrd() { + let engine = RegionCacheMemoryEngine::default(); + engine.new_region(1); + let step: i32 = 2; + + { + let mut core = engine.core.lock().unwrap(); + core.region_metas.get_mut(&1).unwrap().can_read = true; + core.region_metas.get_mut(&1).unwrap().safe_ts = 5; + let sl = core.engine.get_mut(&1).unwrap().data[cf_to_id("write")].clone(); + fill_data_in_skiplist(sl, 
(1..100).step_by(step as usize)); + } + + let mut iter_opt = IterOptions::default(); + let snapshot = engine.snapshot(1, 10, u64::MAX).unwrap(); + // boundaries are not set + assert!(snapshot.iterator_opt("lock", iter_opt.clone()).is_err()); + + let lower_bound = construct_key(1); + let upper_bound = construct_key(100); + iter_opt.set_upper_bound(upper_bound.as_bytes(), 0); + iter_opt.set_lower_bound(lower_bound.as_bytes(), 0); + + let mut iter = snapshot.iterator_opt("lock", iter_opt.clone()).unwrap(); + assert!(!iter.seek_to_first().unwrap()); + + let mut iter = snapshot.iterator_opt("default", iter_opt.clone()).unwrap(); + assert!(!iter.seek_to_first().unwrap()); + + let mut iter = snapshot.iterator_opt("write", iter_opt.clone()).unwrap(); + iter.seek_to_first().unwrap(); + verify_key_values(&mut iter, step, 1, i32::MAX); + + // seek key that is in the skiplist + let seek_key = construct_key(11); + iter.seek(seek_key.as_bytes()).unwrap(); + verify_key_values(&mut iter, step, 11, i32::MAX); + + // seek key that is not in the skiplist + let seek_key = construct_key(12); + iter.seek(seek_key.as_bytes()).unwrap(); + verify_key_values(&mut iter, step, 13, i32::MAX); + + // with bounds + let lower_bound = construct_key(20); + let upper_bound = construct_key(40); + iter_opt.set_upper_bound(upper_bound.as_bytes(), 0); + iter_opt.set_lower_bound(lower_bound.as_bytes(), 0); + let mut iter = snapshot.iterator_opt("write", iter_opt).unwrap(); + + assert!(iter.seek_to_first().unwrap()); + verify_key_values(&mut iter, step, 21, 40); + + // seek a key that is below the lower bound is the same with seek_to_first + let seek_key = construct_key(11); + assert!(iter.seek(seek_key.as_bytes()).unwrap()); + verify_key_values(&mut iter, step, 21, 40); + + // seek a key that is larger or equal to upper bound won't get any key + let seek_key = construct_key(40); + assert!(!iter.seek(seek_key.as_bytes()).unwrap()); + assert!(!iter.valid().unwrap()); + + let seek_key = construct_key(22); + 
assert!(iter.seek(seek_key.as_bytes()).unwrap()); + verify_key_values(&mut iter, step, 23, 40); + } + + #[test] + fn test_iterator_backward() { + let engine = RegionCacheMemoryEngine::default(); + engine.new_region(1); + let mut step: i32 = 2; + + { + let mut core = engine.core.lock().unwrap(); + core.region_metas.get_mut(&1).unwrap().can_read = true; + core.region_metas.get_mut(&1).unwrap().safe_ts = 5; + let sl = core.engine.get_mut(&1).unwrap().data[cf_to_id("write")].clone(); + fill_data_in_skiplist(sl, (1..100).step_by(step as usize)); + } + step = -step; + + let mut iter_opt = IterOptions::default(); + let lower_bound = construct_key(1); + let upper_bound = construct_key(100); + iter_opt.set_upper_bound(upper_bound.as_bytes(), 0); + iter_opt.set_lower_bound(lower_bound.as_bytes(), 0); + + let snapshot = engine.snapshot(1, 10, u64::MAX).unwrap(); + let mut iter = snapshot.iterator_opt("write", iter_opt.clone()).unwrap(); + assert!(iter.seek_to_last().unwrap()); + verify_key_values(&mut iter, step, 99, i32::MIN); + + // seek key that is in the skiplist + let seek_key = construct_key(81); + assert!(iter.seek_for_prev(seek_key.as_bytes()).unwrap()); + verify_key_values(&mut iter, step, 81, i32::MIN); + + // seek key that is not in the skiplist + let seek_key = construct_key(80); + assert!(iter.seek_for_prev(seek_key.as_bytes()).unwrap()); + verify_key_values(&mut iter, step, 79, i32::MIN); + + let lower_bound = construct_key(20); + let upper_bound = construct_key(40); + iter_opt.set_upper_bound(upper_bound.as_bytes(), 0); + iter_opt.set_lower_bound(lower_bound.as_bytes(), 0); + let mut iter = snapshot.iterator_opt("write", iter_opt).unwrap(); + + assert!(iter.seek_to_last().unwrap()); + verify_key_values(&mut iter, step, 39, 20); + + // seek a key that is above the upper bound is the same with seek_to_last + let seek_key = construct_key(45); + assert!(iter.seek_for_prev(seek_key.as_bytes()).unwrap()); + verify_key_values(&mut iter, step, 39, 20); + + // seek a key 
that is less than the lower bound won't get any key + let seek_key = construct_key(19); + assert!(!iter.seek_for_prev(seek_key.as_bytes()).unwrap()); + assert!(!iter.valid().unwrap()); + + let seek_key = construct_key(38); + assert!(iter.seek_for_prev(seek_key.as_bytes()).unwrap()); + verify_key_values(&mut iter, step, 37, 20); + } +} diff --git a/components/region_cache_memory_engine/src/lib.rs b/components/region_cache_memory_engine/src/lib.rs new file mode 100644 index 00000000000..fe15f4f936b --- /dev/null +++ b/components/region_cache_memory_engine/src/lib.rs @@ -0,0 +1,9 @@ +// Copyright 2023 TiKV Project Authors. Licensed under Apache-2.0. + +#![allow(dead_code)] +#![allow(unused_variables)] +#![feature(let_chains)] +#![feature(slice_pattern)] + +mod engine; +pub use engine::RegionCacheMemoryEngine; diff --git a/components/resolved_ts/src/advance.rs b/components/resolved_ts/src/advance.rs index 4428ed01a35..dd6e9c2002c 100644 --- a/components/resolved_ts/src/advance.rs +++ b/components/resolved_ts/src/advance.rs @@ -43,7 +43,7 @@ use tokio::{ }; use txn_types::TimeStamp; -use crate::{endpoint::Task, metrics::*}; +use crate::{endpoint::Task, metrics::*, TsSource}; pub(crate) const DEFAULT_CHECK_LEADER_TIMEOUT_DURATION: Duration = Duration::from_secs(5); // 5s const DEFAULT_GRPC_GZIP_COMPRESSION_LEVEL: usize = 2; @@ -57,7 +57,7 @@ pub struct AdvanceTsWorker { scheduler: Scheduler, /// The concurrency manager for transactions. It's needed for CDC to check /// locks when calculating resolved_ts. 
- concurrency_manager: ConcurrencyManager, + pub(crate) concurrency_manager: ConcurrencyManager, // cache the last pd tso, used to approximate the next timestamp w/o an actual TSO RPC pub(crate) last_pd_tso: Arc>>, @@ -114,15 +114,17 @@ impl AdvanceTsWorker { if let Ok(mut last_pd_tso) = last_pd_tso.try_lock() { *last_pd_tso = Some((min_ts, Instant::now())); } + let mut ts_source = TsSource::PdTso; // Sync with concurrency manager so that it can work correctly when // optimizations like async commit is enabled. // Note: This step must be done before scheduling `Task::MinTs` task, and the // resolver must be checked in or after `Task::MinTs`' execution. cm.update_max_ts(min_ts); - if let Some(min_mem_lock_ts) = cm.global_min_lock_ts() { + if let Some((min_mem_lock_ts, lock)) = cm.global_min_lock() { if min_mem_lock_ts < min_ts { min_ts = min_mem_lock_ts; + ts_source = TsSource::MemoryLock(lock); } } @@ -131,6 +133,7 @@ impl AdvanceTsWorker { if let Err(e) = scheduler.schedule(Task::ResolvedTsAdvanced { regions, ts: min_ts, + ts_source, }) { info!("failed to schedule advance event"; "err" => ?e); } @@ -164,10 +167,7 @@ pub struct LeadershipResolver { // store_id -> check leader request, record the request to each stores. store_req_map: HashMap, - // region_id -> region, cache the information of regions. - region_map: HashMap>, - // region_id -> peers id, record the responses. 
- resp_map: HashMap>, + progresses: HashMap, checking_regions: HashSet, valid_regions: HashSet, @@ -193,8 +193,7 @@ impl LeadershipResolver { region_read_progress, store_req_map: HashMap::default(), - region_map: HashMap::default(), - resp_map: HashMap::default(), + progresses: HashMap::default(), valid_regions: HashSet::default(), checking_regions: HashSet::default(), last_gc_time: Instant::now_coarse(), @@ -206,8 +205,7 @@ impl LeadershipResolver { let now = Instant::now_coarse(); if now - self.last_gc_time > self.gc_interval { self.store_req_map = HashMap::default(); - self.region_map = HashMap::default(); - self.resp_map = HashMap::default(); + self.progresses = HashMap::default(); self.valid_regions = HashSet::default(); self.checking_regions = HashSet::default(); self.last_gc_time = now; @@ -219,10 +217,7 @@ impl LeadershipResolver { v.regions.clear(); v.ts = 0; } - for v in self.region_map.values_mut() { - v.clear(); - } - for v in self.resp_map.values_mut() { + for v in self.progresses.values_mut() { v.clear(); } self.checking_regions.clear(); @@ -249,8 +244,7 @@ impl LeadershipResolver { let store_id = self.store_id; let valid_regions = &mut self.valid_regions; - let region_map = &mut self.region_map; - let resp_map = &mut self.resp_map; + let progresses = &mut self.progresses; let store_req_map = &mut self.store_req_map; let checking_regions = &mut self.checking_regions; for region_id in &regions { @@ -272,13 +266,13 @@ impl LeadershipResolver { } let leader_info = core.get_leader_info(); + let prog = progresses + .entry(*region_id) + .or_insert_with(|| RegionProgress::new(peer_list.len())); let mut unvotes = 0; for peer in peer_list { if peer.store_id == store_id && peer.id == leader_id { - resp_map - .entry(*region_id) - .or_insert_with(|| Vec::with_capacity(peer_list.len())) - .push(store_id); + prog.resps.push(store_id); } else { // It's still necessary to check leader on learners even if they don't vote // because performing stale read on learners 
require it. @@ -296,15 +290,14 @@ impl LeadershipResolver { } } } + // Check `region_has_quorum` here because `store_map` can be empty, // in which case `region_has_quorum` won't be called any more. - if unvotes == 0 && region_has_quorum(peer_list, &resp_map[region_id]) { + if unvotes == 0 && region_has_quorum(peer_list, &prog.resps) { + prog.resolved = true; valid_regions.insert(*region_id); } else { - region_map - .entry(*region_id) - .or_insert_with(|| Vec::with_capacity(peer_list.len())) - .extend_from_slice(peer_list); + prog.peers.extend_from_slice(peer_list); } } }); @@ -318,7 +311,6 @@ impl LeadershipResolver { .values() .find(|req| !req.regions.is_empty()) .map_or(0, |req| req.regions[0].compute_size()); - let store_count = store_req_map.len(); let mut check_leader_rpcs = Vec::with_capacity(store_req_map.len()); for (store_id, req) in store_req_map { if req.regions.is_empty() { @@ -384,6 +376,7 @@ impl LeadershipResolver { .with_label_values(&["all"]) .observe(start.saturating_elapsed_secs()); }); + let rpc_count = check_leader_rpcs.len(); for _ in 0..rpc_count { // Use `select_all` to avoid the process getting blocked when some @@ -393,10 +386,16 @@ impl LeadershipResolver { match res { Ok((to_store, resp)) => { for region_id in resp.regions { - resp_map - .entry(region_id) - .or_insert_with(|| Vec::with_capacity(store_count)) - .push(to_store); + if let Some(prog) = progresses.get_mut(&region_id) { + if prog.resolved { + continue; + } + prog.resps.push(to_store); + if region_has_quorum(&prog.peers, &prog.resps) { + prog.resolved = true; + valid_regions.insert(region_id); + } + } } } Err((to_store, reconnect, err)) => { @@ -406,24 +405,19 @@ impl LeadershipResolver { } } } - } - for (region_id, prs) in region_map { - if prs.is_empty() { - // The peer had the leadership before, but now it's no longer - // the case. Skip checking the region. 
- continue; - } - if let Some(resp) = resp_map.get(region_id) { - if resp.is_empty() { - // No response, maybe the peer lost leadership. - continue; - } - if region_has_quorum(prs, resp) { - valid_regions.insert(*region_id); - } + if valid_regions.len() >= progresses.len() { + break; } } - self.valid_regions.drain().collect() + let res: Vec = self.valid_regions.drain().collect(); + if res.len() != checking_regions.len() { + warn!( + "check leader returns valid regions different from checking regions"; + "valid_regions" => res.len(), + "checking_regions" => checking_regions.len(), + ); + } + res } } @@ -549,6 +543,27 @@ async fn get_tikv_client( Ok(cli) } +struct RegionProgress { + resolved: bool, + peers: Vec, + resps: Vec, +} + +impl RegionProgress { + fn new(len: usize) -> Self { + RegionProgress { + resolved: false, + peers: Vec::with_capacity(len), + resps: Vec::with_capacity(len), + } + } + fn clear(&mut self) { + self.resolved = false; + self.peers.clear(); + self.resps.clear(); + } +} + #[cfg(test)] mod tests { use std::{ diff --git a/components/resolved_ts/src/endpoint.rs b/components/resolved_ts/src/endpoint.rs index 5d0dbdcd689..9de21b27d9e 100644 --- a/components/resolved_ts/src/endpoint.rs +++ b/components/resolved_ts/src/endpoint.rs @@ -1,20 +1,19 @@ // Copyright 2021 TiKV Project Authors. Licensed under Apache-2.0. 
use std::{ + cmp::min, collections::HashMap, fmt, marker::PhantomData, - sync::{ - atomic::{AtomicBool, Ordering}, - Arc, Mutex, - }, + sync::{Arc, Mutex, MutexGuard}, time::Duration, }; use concurrency_manager::ConcurrencyManager; use engine_traits::KvEngine; +use futures::channel::oneshot::{channel, Receiver, Sender}; use grpcio::Environment; -use kvproto::{metapb::Region, raft_cmdpb::AdminCmdType}; +use kvproto::{kvrpcpb::LeaderInfo, metapb::Region, raft_cmdpb::AdminCmdType}; use online_config::{self, ConfigChange, ConfigManager, OnlineConfig}; use pd_client::PdClient; use raftstore::{ @@ -22,38 +21,135 @@ use raftstore::{ router::CdcHandle, store::{ fsm::store::StoreRegionMeta, - util::{self, RegionReadProgress, RegionReadProgressRegistry}, + util::{ + self, ReadState, RegionReadProgress, RegionReadProgressCore, RegionReadProgressRegistry, + }, }, }; use security::SecurityManager; use tikv::config::ResolvedTsConfig; use tikv_util::{ + memory::{HeapSize, MemoryQuota}, warn, worker::{Runnable, RunnableWithTimer, Scheduler}, }; -use tokio::sync::Notify; +use tokio::sync::{Notify, Semaphore}; use txn_types::{Key, TimeStamp}; use crate::{ advance::{AdvanceTsWorker, LeadershipResolver, DEFAULT_CHECK_LEADER_TIMEOUT_DURATION}, cmd::{ChangeLog, ChangeRow}, metrics::*, - resolver::Resolver, - scanner::{ScanEntry, ScanMode, ScanTask, ScannerPool}, + resolver::{LastAttempt, Resolver}, + scanner::{ScanEntries, ScanTask, ScannerPool}, + Error, Result, TsSource, TxnLocks, ON_DROP_WARN_HEAP_SIZE, }; -/// grace period for logging safe-ts and resolved-ts gap in slow log +/// grace period for identifying slow resolved-ts and safe-ts. 
const SLOW_LOG_GRACE_PERIOD_MS: u64 = 1000; +const MEMORY_QUOTA_EXCEEDED_BACKOFF: Duration = Duration::from_secs(30); enum ResolverStatus { Pending { tracked_index: u64, locks: Vec, - cancelled: Arc, + cancelled: Option>, + memory_quota: Arc, }, Ready, } +impl Drop for ResolverStatus { + fn drop(&mut self) { + let ResolverStatus::Pending { + locks, + memory_quota, + .. + } = self else { + return; + }; + if locks.is_empty() { + return; + } + + // Free memory quota used by pending locks and unlocks. + let mut bytes = 0; + let num_locks = locks.len(); + for lock in locks { + bytes += lock.heap_size(); + } + if bytes > ON_DROP_WARN_HEAP_SIZE { + warn!("drop huge ResolverStatus"; + "bytes" => bytes, + "num_locks" => num_locks, + "memory_quota_in_use" => memory_quota.in_use(), + "memory_quota_capacity" => memory_quota.capacity(), + ); + } + memory_quota.free(bytes); + } +} + +impl ResolverStatus { + fn push_pending_lock(&mut self, lock: PendingLock, region_id: u64) -> Result<()> { + let ResolverStatus::Pending { + locks, + memory_quota, + .. + } = self else { + panic!("region {:?} resolver has ready", region_id) + }; + // Check if adding a new lock or unlock will exceed the memory + // quota. + memory_quota.alloc(lock.heap_size()).map_err(|e| { + fail::fail_point!("resolved_ts_on_pending_locks_memory_quota_exceeded"); + Error::MemoryQuotaExceeded(e) + })?; + locks.push(lock); + Ok(()) + } + + fn update_tracked_index(&mut self, index: u64, region_id: u64) { + let ResolverStatus::Pending { + tracked_index, + .. + } = self else { + panic!("region {:?} resolver has ready", region_id) + }; + assert!( + *tracked_index < index, + "region {}, tracked_index: {}, incoming index: {}", + region_id, + *tracked_index, + index + ); + *tracked_index = index; + } + + fn drain_pending_locks( + &mut self, + region_id: u64, + ) -> (u64, impl Iterator + '_) { + let ResolverStatus::Pending { + locks, + memory_quota, + tracked_index, + .. 
+ } = self else { + panic!("region {:?} resolver has ready", region_id) + }; + // Must take locks, otherwise it may double free memory quota on drop. + let locks = std::mem::take(locks); + ( + *tracked_index, + locks.into_iter().map(|lock| { + memory_quota.free(lock.heap_size()); + lock + }), + ) + } +} + #[allow(dead_code)] enum PendingLock { Track { @@ -67,6 +163,16 @@ enum PendingLock { }, } +impl HeapSize for PendingLock { + fn heap_size(&self) -> usize { + match self { + PendingLock::Track { key, .. } | PendingLock::Untrack { key, .. } => { + key.as_encoded().heap_size() + } + } + } +} + // Records information related to observed region. // observe_id is used for avoiding ABA problems in incremental scan task, // advance resolved ts task, and command observing. @@ -80,122 +186,121 @@ struct ObserveRegion { } impl ObserveRegion { - fn new(meta: Region, rrp: Arc) -> Self { + fn new( + meta: Region, + rrp: Arc, + memory_quota: Arc, + cancelled: Sender<()>, + ) -> Self { ObserveRegion { - resolver: Resolver::with_read_progress(meta.id, Some(rrp)), + resolver: Resolver::with_read_progress(meta.id, Some(rrp), memory_quota.clone()), meta, handle: ObserveHandle::new(), resolver_status: ResolverStatus::Pending { tracked_index: 0, locks: vec![], - cancelled: Arc::new(AtomicBool::new(false)), + cancelled: Some(cancelled), + memory_quota, }, } } - fn read_progress(&self) -> &RegionReadProgress { - self.resolver.read_progress.as_ref().unwrap() + fn read_progress(&self) -> &Arc { + self.resolver.read_progress().unwrap() } - fn track_change_log(&mut self, change_logs: &[ChangeLog]) -> std::result::Result<(), String> { - match &mut self.resolver_status { - ResolverStatus::Pending { - locks, - tracked_index, - .. 
- } => { - for log in change_logs { - match log { - ChangeLog::Error(e) => { - debug!( - "skip change log error"; - "region" => self.meta.id, - "error" => ?e, - ); - continue; - } - ChangeLog::Admin(req_type) => { - // TODO: for admin cmd that won't change the region meta like peer list - // and key range (i.e. `CompactLog`, `ComputeHash`) we may not need to - // return error - return Err(format!( - "region met admin command {:?} while initializing resolver", - req_type - )); - } - ChangeLog::Rows { rows, index } => { - rows.iter().for_each(|row| match row { - ChangeRow::Prewrite { key, start_ts, .. } => { - locks.push(PendingLock::Track { - key: key.clone(), - start_ts: *start_ts, - }) - } + fn track_change_log(&mut self, change_logs: &[ChangeLog]) -> Result<()> { + if matches!(self.resolver_status, ResolverStatus::Pending { .. }) { + for log in change_logs { + match log { + ChangeLog::Error(e) => { + debug!( + "skip change log error"; + "region" => self.meta.id, + "error" => ?e, + ); + continue; + } + ChangeLog::Admin(req_type) => { + // TODO: for admin cmd that won't change the region meta like peer list + // and key range (i.e. `CompactLog`, `ComputeHash`) we may not need to + // return error + return Err(box_err!( + "region met admin command {:?} while initializing resolver", + req_type + )); + } + ChangeLog::Rows { rows, index } => { + for row in rows { + let lock = match row { + ChangeRow::Prewrite { key, start_ts, .. } => PendingLock::Track { + key: key.clone(), + start_ts: *start_ts, + }, ChangeRow::Commit { key, start_ts, commit_ts, .. - } => locks.push(PendingLock::Untrack { + } => PendingLock::Untrack { key: key.clone(), start_ts: *start_ts, commit_ts: *commit_ts, - }), + }, // One pc command do not contains any lock, so just skip it - ChangeRow::OnePc { .. 
} => {} - ChangeRow::IngestSsT => {} - }); - assert!( - *tracked_index < *index, - "region {}, tracked_index: {}, incoming index: {}", - self.meta.id, - *tracked_index, - *index - ); - *tracked_index = *index; + ChangeRow::OnePc { .. } | ChangeRow::IngestSsT => continue, + }; + self.resolver_status.push_pending_lock(lock, self.meta.id)?; } + self.resolver_status + .update_tracked_index(*index, self.meta.id); } } } - ResolverStatus::Ready => { - for log in change_logs { - match log { - ChangeLog::Error(e) => { + } else { + for log in change_logs { + match log { + ChangeLog::Error(e) => { + debug!( + "skip change log error"; + "region" => self.meta.id, + "error" => ?e, + ); + continue; + } + ChangeLog::Admin(req_type) => match req_type { + AdminCmdType::Split + | AdminCmdType::BatchSplit + | AdminCmdType::PrepareMerge + | AdminCmdType::RollbackMerge + | AdminCmdType::CommitMerge => { + info!( + "region met split/merge command, stop tracking since key range changed, wait for re-register"; + "req_type" => ?req_type, + ); + // Stop tracking so that `tracked_index` larger than the split/merge + // command index won't be published until `RegionUpdate` event + // trigger the region re-register and re-scan the new key range + self.resolver.stop_tracking(); + } + _ => { debug!( - "skip change log error"; + "skip change log admin"; "region" => self.meta.id, - "error" => ?e, + "req_type" => ?req_type, ); - continue; } - ChangeLog::Admin(req_type) => match req_type { - AdminCmdType::Split - | AdminCmdType::BatchSplit - | AdminCmdType::PrepareMerge - | AdminCmdType::RollbackMerge - | AdminCmdType::CommitMerge => { - info!( - "region met split/merge command, stop tracking since key range changed, wait for re-register"; - "req_type" => ?req_type, - ); - // Stop tracking so that `tracked_index` larger than the split/merge - // command index won't be published until `RegionUpdate` event - // trigger the region re-register and re-scan the new key range - 
self.resolver.stop_tracking(); - } - _ => { - debug!( - "skip change log admin"; - "region" => self.meta.id, - "req_type" => ?req_type, - ); - } - }, - ChangeLog::Rows { rows, index } => { - rows.iter().for_each(|row| match row { - ChangeRow::Prewrite { key, start_ts, .. } => self - .resolver - .track_lock(*start_ts, key.to_raw().unwrap(), Some(*index)), + }, + ChangeLog::Rows { rows, index } => { + for row in rows { + match row { + ChangeRow::Prewrite { key, start_ts, .. } => { + self.resolver.track_lock( + *start_ts, + key.to_raw().unwrap(), + Some(*index), + )?; + } ChangeRow::Commit { key, .. } => self .resolver .untrack_lock(&key.to_raw().unwrap(), Some(*index)), @@ -206,7 +311,7 @@ impl ObserveRegion { ChangeRow::IngestSsT => { self.resolver.update_tracked_index(*index); } - }); + } } } } @@ -215,73 +320,324 @@ impl ObserveRegion { Ok(()) } - fn track_scan_locks(&mut self, entries: Vec, apply_index: u64) { - for es in entries { - match es { - ScanEntry::Lock(locks) => { - if let ResolverStatus::Ready = self.resolver_status { - panic!("region {:?} resolver has ready", self.meta.id) - } - for (key, lock) in locks { - self.resolver - .track_lock(lock.ts, key.to_raw().unwrap(), Some(apply_index)); - } + /// Track locks in incoming scan entries. + fn track_scan_locks(&mut self, entries: ScanEntries, apply_index: u64) -> Result<()> { + match entries { + ScanEntries::Lock(locks) => { + if let ResolverStatus::Ready = self.resolver_status { + panic!("region {:?} resolver has ready", self.meta.id) } - ScanEntry::None => { - // Update the `tracked_index` to the snapshot's `apply_index` - self.resolver.update_tracked_index(apply_index); - let pending_tracked_index = - match std::mem::replace(&mut self.resolver_status, ResolverStatus::Ready) { - ResolverStatus::Pending { - locks, - tracked_index, - .. 
- } => { - locks.into_iter().for_each(|lock| match lock { - PendingLock::Track { key, start_ts } => { - self.resolver.track_lock( - start_ts, - key.to_raw().unwrap(), - Some(tracked_index), - ) - } - PendingLock::Untrack { key, .. } => self - .resolver - .untrack_lock(&key.to_raw().unwrap(), Some(tracked_index)), - }); - tracked_index - } - ResolverStatus::Ready => { - panic!("region {:?} resolver has ready", self.meta.id) - } - }; - info!( - "Resolver initialized"; - "region" => self.meta.id, - "observe_id" => ?self.handle.id, - "snapshot_index" => apply_index, - "pending_data_index" => pending_tracked_index, - ); + for (key, lock) in locks { + self.resolver + .track_lock(lock.ts, key.to_raw().unwrap(), Some(apply_index))?; } - ScanEntry::TxnEntry(_) => panic!("unexpected entry type"), + } + ScanEntries::None => { + // Update the `tracked_index` to the snapshot's `apply_index` + self.resolver.update_tracked_index(apply_index); + let mut resolver_status = + std::mem::replace(&mut self.resolver_status, ResolverStatus::Ready); + let (pending_tracked_index, pending_locks) = + resolver_status.drain_pending_locks(self.meta.id); + for lock in pending_locks { + match lock { + PendingLock::Track { key, start_ts } => { + self.resolver.track_lock( + start_ts, + key.to_raw().unwrap(), + Some(pending_tracked_index), + )?; + } + PendingLock::Untrack { key, .. 
} => self + .resolver + .untrack_lock(&key.to_raw().unwrap(), Some(pending_tracked_index)), + } + } + info!( + "Resolver initialized"; + "region" => self.meta.id, + "observe_id" => ?self.handle.id, + "snapshot_index" => apply_index, + "pending_data_index" => pending_tracked_index, + ); } } + Ok(()) } } pub struct Endpoint { store_id: Option, cfg: ResolvedTsConfig, + memory_quota: Arc, advance_notify: Arc, store_meta: Arc>, region_read_progress: RegionReadProgressRegistry, regions: HashMap, scanner_pool: ScannerPool, + scan_concurrency_semaphore: Arc, scheduler: Scheduler, advance_worker: AdvanceTsWorker, _phantom: PhantomData<(T, E)>, } +// methods that are used for metrics and logging +impl Endpoint +where + T: 'static + CdcHandle, + E: KvEngine, + S: StoreRegionMeta, +{ + fn collect_stats(&mut self) -> Stats { + fn is_leader(store_id: Option, leader_store_id: Option) -> bool { + store_id.is_some() && store_id == leader_store_id + } + + let store_id = self.get_or_init_store_id(); + let mut stats = Stats::default(); + self.region_read_progress.with(|registry| { + for (region_id, read_progress) in registry { + let (leader_info, leader_store_id) = read_progress.dump_leader_info(); + let core = read_progress.get_core(); + let resolved_ts = leader_info.get_read_state().get_safe_ts(); + let safe_ts = core.read_state().ts; + + if resolved_ts == 0 { + stats.zero_ts_count += 1; + continue; + } + + if is_leader(store_id, leader_store_id) { + // leader resolved-ts + if resolved_ts < stats.min_leader_resolved_ts.resolved_ts { + let resolver = self.regions.get_mut(region_id).map(|x| &mut x.resolver); + stats + .min_leader_resolved_ts + .set(*region_id, resolver, &core, &leader_info); + } + } else { + // follower safe-ts + if safe_ts > 0 && safe_ts < stats.min_follower_safe_ts.safe_ts { + stats.min_follower_safe_ts.set(*region_id, &core); + } + + // follower resolved-ts + if resolved_ts < stats.min_follower_resolved_ts.resolved_ts { + 
stats.min_follower_resolved_ts.set(*region_id, &core); + } + } + } + }); + + stats.resolver = self.collect_resolver_stats(); + stats.cm_min_lock = self.advance_worker.concurrency_manager.global_min_lock(); + stats + } + + fn collect_resolver_stats(&mut self) -> ResolverStats { + let mut stats = ResolverStats::default(); + for observed_region in self.regions.values() { + match &observed_region.resolver_status { + ResolverStatus::Pending { locks, .. } => { + for l in locks { + stats.heap_size += l.heap_size() as i64; + } + stats.unresolved_count += 1; + } + ResolverStatus::Ready { .. } => { + stats.heap_size += observed_region.resolver.approximate_heap_bytes() as i64; + stats.resolved_count += 1; + } + } + } + stats + } + + fn update_metrics(&self, stats: &Stats) { + let now = self.approximate_now_tso(); + // general + if stats.min_follower_resolved_ts.resolved_ts < stats.min_leader_resolved_ts.resolved_ts { + RTS_MIN_RESOLVED_TS.set(stats.min_follower_resolved_ts.resolved_ts as i64); + RTS_MIN_RESOLVED_TS_GAP.set(now.saturating_sub( + TimeStamp::from(stats.min_follower_resolved_ts.resolved_ts).physical(), + ) as i64); + RTS_MIN_RESOLVED_TS_REGION.set(stats.min_follower_resolved_ts.region_id as i64); + } else { + RTS_MIN_RESOLVED_TS.set(stats.min_leader_resolved_ts.resolved_ts as i64); + RTS_MIN_RESOLVED_TS_GAP.set(now.saturating_sub( + TimeStamp::from(stats.min_leader_resolved_ts.resolved_ts).physical(), + ) as i64); + RTS_MIN_RESOLVED_TS_REGION.set(stats.min_leader_resolved_ts.region_id as i64); + } + RTS_ZERO_RESOLVED_TS.set(stats.zero_ts_count); + + RTS_LOCK_HEAP_BYTES_GAUGE.set(stats.resolver.heap_size); + RTS_LOCK_QUOTA_IN_USE_BYTES_GAUGE.set(self.memory_quota.in_use() as i64); + RTS_REGION_RESOLVE_STATUS_GAUGE_VEC + .with_label_values(&["resolved"]) + .set(stats.resolver.resolved_count); + RTS_REGION_RESOLVE_STATUS_GAUGE_VEC + .with_label_values(&["unresolved"]) + .set(stats.resolver.unresolved_count); + + CONCURRENCY_MANAGER_MIN_LOCK_TS.set( + stats + 
.cm_min_lock + .clone() + .map(|(ts, _)| ts.into_inner()) + .unwrap_or_default() as i64, + ); + + // min follower safe ts + RTS_MIN_FOLLOWER_SAFE_TS_REGION.set(stats.min_follower_safe_ts.region_id as i64); + RTS_MIN_FOLLOWER_SAFE_TS.set(stats.min_follower_safe_ts.safe_ts as i64); + RTS_MIN_FOLLOWER_SAFE_TS_GAP.set( + now.saturating_sub(TimeStamp::from(stats.min_follower_safe_ts.safe_ts).physical()) + as i64, + ); + RTS_MIN_FOLLOWER_SAFE_TS_DURATION_TO_LAST_CONSUME_LEADER.set( + stats + .min_follower_safe_ts + .duration_to_last_consume_leader + .map(|x| x as i64) + .unwrap_or(-1), + ); + + // min leader resolved ts + RTS_MIN_LEADER_RESOLVED_TS.set(stats.min_leader_resolved_ts.resolved_ts as i64); + RTS_MIN_LEADER_RESOLVED_TS_REGION.set(stats.min_leader_resolved_ts.region_id as i64); + RTS_MIN_LEADER_RESOLVED_TS_REGION_MIN_LOCK_TS.set( + stats + .min_leader_resolved_ts + .min_lock + .as_ref() + .map(|(ts, _)| (*ts).into_inner() as i64) + .unwrap_or(-1), + ); + RTS_MIN_LEADER_RESOLVED_TS_GAP + .set(now.saturating_sub( + TimeStamp::from(stats.min_leader_resolved_ts.resolved_ts).physical(), + ) as i64); + RTS_MIN_LEADER_DUATION_TO_LAST_UPDATE_SAFE_TS.set( + stats + .min_leader_resolved_ts + .duration_to_last_update_ms + .map(|x| x as i64) + .unwrap_or(-1), + ); + + // min follower resolved ts + RTS_MIN_FOLLOWER_RESOLVED_TS.set(stats.min_follower_resolved_ts.resolved_ts as i64); + RTS_MIN_FOLLOWER_RESOLVED_TS_REGION.set(stats.min_follower_resolved_ts.region_id as i64); + RTS_MIN_FOLLOWER_RESOLVED_TS_GAP.set( + now.saturating_sub( + TimeStamp::from(stats.min_follower_resolved_ts.resolved_ts).physical(), + ) as i64, + ); + RTS_MIN_FOLLOWER_RESOLVED_TS_DURATION_TO_LAST_CONSUME_LEADER.set( + stats + .min_follower_resolved_ts + .duration_to_last_consume_leader + .map(|x| x as i64) + .unwrap_or(-1), + ); + } + + // Approximate a TSO from PD. It is better than local timestamp when clock skew + // exists. + // Returns the physical part. 
+ fn approximate_now_tso(&self) -> u64 { + self.advance_worker + .last_pd_tso + .try_lock() + .map(|opt| { + opt.map(|(pd_ts, instant)| { + pd_ts.physical() + instant.saturating_elapsed().as_millis() as u64 + }) + .unwrap_or_else(|| TimeStamp::physical_now()) + }) + .unwrap_or_else(|_| TimeStamp::physical_now()) + } + + fn log_slow_regions(&self, stats: &Stats) { + let expected_interval = min( + self.cfg.advance_ts_interval.as_millis(), + DEFAULT_CHECK_LEADER_TIMEOUT_DURATION.as_millis() as u64, + ) + self.cfg.advance_ts_interval.as_millis(); + let leader_threshold = expected_interval + SLOW_LOG_GRACE_PERIOD_MS; + let follower_threshold = 2 * expected_interval + SLOW_LOG_GRACE_PERIOD_MS; + let now = self.approximate_now_tso(); + + // min leader resolved ts + let min_leader_resolved_ts_gap = now + .saturating_sub(TimeStamp::from(stats.min_leader_resolved_ts.resolved_ts).physical()); + if min_leader_resolved_ts_gap > leader_threshold { + info!( + "the max gap of leader resolved-ts is large"; + "region_id" => stats.min_leader_resolved_ts.region_id, + "gap" => format!("{}ms", min_leader_resolved_ts_gap), + "read_state" => ?stats.min_leader_resolved_ts.read_state, + "applied_index" => stats.min_leader_resolved_ts.applied_index, + "min_lock" => ?stats.min_leader_resolved_ts.min_lock, + "lock_num" => stats.min_leader_resolved_ts.lock_num, + "txn_num" => stats.min_leader_resolved_ts.txn_num, + "min_memory_lock" => ?stats.cm_min_lock, + "duration_to_last_update_safe_ts" => match stats.min_leader_resolved_ts.duration_to_last_update_ms { + Some(d) => format!("{}ms", d), + None => "none".to_owned(), + }, + "last_resolve_attempt" => &stats.min_leader_resolved_ts.last_resolve_attempt, + ); + } + + // min follower safe ts + let min_follower_safe_ts_gap = + now.saturating_sub(TimeStamp::from(stats.min_follower_safe_ts.safe_ts).physical()); + if min_follower_safe_ts_gap > follower_threshold { + info!( + "the max gap of follower safe-ts is large"; + "region_id" => 
stats.min_follower_safe_ts.region_id, + "gap" => format!("{}ms", min_follower_safe_ts_gap), + "safe_ts" => stats.min_follower_safe_ts.safe_ts, + "resolved_ts" => stats.min_follower_safe_ts.resolved_ts, + "duration_to_last_consume_leader" => match stats.min_follower_safe_ts.duration_to_last_consume_leader { + Some(d) => format!("{}ms", d), + None => "none".to_owned(), + }, + "applied_index" => stats.min_follower_safe_ts.applied_index, + "latest_candidate" => ?stats.min_follower_safe_ts.latest_candidate, + "oldest_candidate" => ?stats.min_follower_safe_ts.oldest_candidate, + ); + } + + // min follower resolved ts + let min_follower_resolved_ts_gap = now + .saturating_sub(TimeStamp::from(stats.min_follower_resolved_ts.resolved_ts).physical()); + if min_follower_resolved_ts_gap > follower_threshold { + if stats.min_follower_resolved_ts.region_id == stats.min_follower_safe_ts.region_id { + info!( + "the max gap of follower resolved-ts is large; it's the same region that has the min safe-ts" + ); + } else { + info!( + "the max gap of follower resolved-ts is large"; + "region_id" => stats.min_follower_resolved_ts.region_id, + "gap" => format!("{}ms", min_follower_resolved_ts_gap), + "safe_ts" => stats.min_follower_resolved_ts.safe_ts, + "resolved_ts" => stats.min_follower_resolved_ts.resolved_ts, + "duration_to_last_consume_leader" => match stats.min_follower_resolved_ts.duration_to_last_consume_leader { + Some(d) => format!("{}ms", d), + None => "none".to_owned(), + }, + "applied_index" => stats.min_follower_resolved_ts.applied_index, + "latest_candidate" => ?stats.min_follower_resolved_ts.latest_candidate, + "oldest_candidate" => ?stats.min_follower_resolved_ts.oldest_candidate, + ); + } + } + } +} + impl Endpoint where T: 'static + CdcHandle, @@ -318,15 +674,18 @@ where region_read_progress.clone(), store_resolver_gc_interval, ); + let scan_concurrency_semaphore = Arc::new(Semaphore::new(cfg.incremental_scan_concurrency)); let ep = Self { store_id: Some(store_id), cfg: 
cfg.clone(), + memory_quota: Arc::new(MemoryQuota::new(cfg.memory_quota.0 as usize)), advance_notify: Arc::new(Notify::new()), scheduler, store_meta, region_read_progress, advance_worker, scanner_pool, + scan_concurrency_semaphore, regions: HashMap::default(), _phantom: PhantomData::default(), }; @@ -334,36 +693,31 @@ where ep } - fn register_region(&mut self, region: Region) { + fn register_region(&mut self, region: Region, backoff: Option) { let region_id = region.get_id(); assert!(self.regions.get(®ion_id).is_none()); - let observe_region = { - if let Some(read_progress) = self.region_read_progress.get(®ion_id) { - info!( - "register observe region"; - "region" => ?region - ); - ObserveRegion::new(region.clone(), read_progress) - } else { - warn!( - "try register unexit region"; - "region" => ?region, - ); - return; - } + let Some(read_progress) = self.region_read_progress.get(®ion_id) else { + warn!("try register nonexistent region"; "region" => ?region); + return; }; + info!("register observe region"; "region" => ?region); + let (cancelled_tx, cancelled_rx) = channel(); + let observe_region = ObserveRegion::new( + region.clone(), + read_progress, + self.memory_quota.clone(), + cancelled_tx, + ); let observe_handle = observe_region.handle.clone(); - let cancelled = match observe_region.resolver_status { - ResolverStatus::Pending { ref cancelled, .. 
} => cancelled.clone(), - ResolverStatus::Ready => panic!("resolved ts illeagal created observe region"), - }; observe_region .read_progress() .update_advance_resolved_ts_notify(self.advance_notify.clone()); self.regions.insert(region_id, observe_region); - let scan_task = self.build_scan_task(region, observe_handle, cancelled); - self.scanner_pool.spawn_task(scan_task); + let scan_task = self.build_scan_task(region, observe_handle, cancelled_rx, backoff); + let concurrency_semaphore = self.scan_concurrency_semaphore.clone(); + self.scanner_pool + .spawn_task(scan_task, concurrency_semaphore); RTS_SCAN_TASKS.with_label_values(&["total"]).inc(); } @@ -371,40 +725,17 @@ where &self, region: Region, observe_handle: ObserveHandle, - cancelled: Arc, + cancelled: Receiver<()>, + backoff: Option, ) -> ScanTask { let scheduler = self.scheduler.clone(); - let scheduler_error = self.scheduler.clone(); - let region_id = region.id; - let observe_id = observe_handle.id; ScanTask { handle: observe_handle, - tag: String::new(), - mode: ScanMode::LockOnly, region, checkpoint_ts: TimeStamp::zero(), - is_cancelled: Box::new(move || cancelled.load(Ordering::Acquire)), - send_entries: Box::new(move |entries, apply_index| { - scheduler - .schedule(Task::ScanLocks { - region_id, - observe_id, - entries, - apply_index, - }) - .unwrap_or_else(|e| warn!("schedule resolved ts task failed"; "err" => ?e)); - RTS_SCAN_TASKS.with_label_values(&["finish"]).inc(); - }), - on_error: Some(Box::new(move |observe_id, _region, e| { - scheduler_error - .schedule(Task::ReRegisterRegion { - region_id, - observe_id, - cause: format!("met error while handle scan task {:?}", e), - }) - .unwrap_or_else(|schedule_err| warn!("schedule re-register task failed"; "err" => ?schedule_err, "re_register_cause" => ?e)); - RTS_SCAN_TASKS.with_label_values(&["abort"]).inc(); - })), + backoff, + cancelled, + scheduler, } } @@ -412,7 +743,7 @@ where if let Some(observe_region) = self.regions.remove(®ion_id) { let 
ObserveRegion { handle, - resolver_status, + mut resolver_status, .. } = observe_region; @@ -425,8 +756,11 @@ where // Stop observing data handle.stop_observing(); // Stop scanning data - if let ResolverStatus::Pending { cancelled, .. } = resolver_status { - cancelled.store(true, Ordering::Release); + if let ResolverStatus::Pending { + ref mut cancelled, .. + } = resolver_status + { + let _ = cancelled.take(); } } else { debug!("deregister unregister region"; "region_id" => region_id); @@ -448,7 +782,7 @@ where // the `Resolver`'s lock heap // - `PrepareMerge` and `RollbackMerge`, the key range is unchanged self.deregister_region(region_id); - self.register_region(incoming_region); + self.register_region(incoming_region, None); } } @@ -479,7 +813,13 @@ where } // Deregister current observed region and try to register it again. - fn re_register_region(&mut self, region_id: u64, observe_id: ObserveId, cause: String) { + fn re_register_region( + &mut self, + region_id: u64, + observe_id: ObserveId, + cause: Error, + backoff: Option, + ) { if let Some(observe_region) = self.regions.get(®ion_id) { if observe_region.handle.id != observe_id { warn!("resolved ts deregister region failed due to observe_id not match"); @@ -490,7 +830,7 @@ where "register region again"; "region_id" => region_id, "observe_id" => ?observe_id, - "cause" => cause + "cause" => ?cause ); self.deregister_region(region_id); let region; @@ -501,13 +841,18 @@ where None => return, } } - self.register_region(region); + self.register_region(region, backoff); } } // Update advanced resolved ts. // Must ensure all regions are leaders at the point of ts. 
- fn handle_resolved_ts_advanced(&mut self, regions: Vec, ts: TimeStamp) { + fn handle_resolved_ts_advanced( + &mut self, + regions: Vec, + ts: TimeStamp, + ts_source: TsSource, + ) { if regions.is_empty() { return; } @@ -515,7 +860,9 @@ where for region_id in regions.iter() { if let Some(observe_region) = self.regions.get_mut(region_id) { if let ResolverStatus::Ready = observe_region.resolver_status { - let _ = observe_region.resolver.resolve(ts, Some(now)); + let _ = observe_region + .resolver + .resolve(ts, Some(now), ts_source.clone()); } } } @@ -538,7 +885,11 @@ where let logs = ChangeLog::encode_change_log(region_id, batch); if let Err(e) = observe_region.track_change_log(&logs) { drop(observe_region); - self.re_register_region(region_id, observe_id, e); + let backoff = match e { + Error::MemoryQuotaExceeded(_) => Some(MEMORY_QUOTA_EXCEEDED_BACKOFF), + Error::Other(_) => None, + }; + self.re_register_region(region_id, observe_id, e, backoff); } } else { debug!("resolved ts CmdBatch discarded"; @@ -555,18 +906,26 @@ where &mut self, region_id: u64, observe_id: ObserveId, - entries: Vec, + entries: ScanEntries, apply_index: u64, ) { - match self.regions.get_mut(®ion_id) { - Some(observe_region) => { - if observe_region.handle.id == observe_id { - observe_region.track_scan_locks(entries, apply_index); + let mut memory_quota_exceeded = None; + if let Some(observe_region) = self.regions.get_mut(®ion_id) { + if observe_region.handle.id == observe_id { + if let Err(Error::MemoryQuotaExceeded(e)) = + observe_region.track_scan_locks(entries, apply_index) + { + memory_quota_exceeded = Some(Error::MemoryQuotaExceeded(e)); } } - None => { - debug!("scan locks region not exist"; "region_id" => region_id, "observe_id" => ?observe_id); - } + } else { + debug!("scan locks region not exist"; + "region_id" => region_id, + "observe_id" => ?observe_id); + } + if let Some(e) = memory_quota_exceeded { + let backoff = Some(MEMORY_QUOTA_EXCEEDED_BACKOFF); + 
self.re_register_region(region_id, observe_id, e, backoff); } } @@ -586,6 +945,10 @@ where warn!("resolved-ts config fails"; "error" => ?e); } else { self.advance_notify.notify_waiters(); + self.memory_quota + .set_capacity(self.cfg.memory_quota.0 as usize); + self.scan_concurrency_semaphore = + Arc::new(Semaphore::new(self.cfg.incremental_scan_concurrency)); info!( "resolved-ts config changed"; "prev" => prev, @@ -638,7 +1001,7 @@ pub enum Task { ReRegisterRegion { region_id: u64, observe_id: ObserveId, - cause: String, + cause: Error, }, AdvanceResolvedTs { leader_resolver: LeadershipResolver, @@ -646,6 +1009,7 @@ pub enum Task { ResolvedTsAdvanced { regions: Vec, ts: TimeStamp, + ts_source: TsSource, }, ChangeLog { cmd_batch: Vec, @@ -653,7 +1017,7 @@ pub enum Task { ScanLocks { region_id: u64, observe_id: ObserveId, - entries: Vec, + entries: ScanEntries, apply_index: u64, }, ChangeConfig { @@ -700,10 +1064,12 @@ impl fmt::Debug for Task { Task::ResolvedTsAdvanced { ref regions, ref ts, + ref ts_source, } => de .field("name", &"advance_resolved_ts") .field("regions", ®ions) .field("ts", &ts) + .field("ts_source", &ts_source.label()) .finish(), Task::ChangeLog { .. 
} => de.field("name", &"change_log").finish(), Task::ScanLocks { @@ -750,19 +1116,21 @@ where match task { Task::RegionDestroyed(region) => self.region_destroyed(region), Task::RegionUpdated(region) => self.region_updated(region), - Task::RegisterRegion { region } => self.register_region(region), + Task::RegisterRegion { region } => self.register_region(region, None), Task::DeRegisterRegion { region_id } => self.deregister_region(region_id), Task::ReRegisterRegion { region_id, observe_id, cause, - } => self.re_register_region(region_id, observe_id, cause), + } => self.re_register_region(region_id, observe_id, cause, None), Task::AdvanceResolvedTs { leader_resolver } => { self.handle_advance_resolved_ts(leader_resolver) } - Task::ResolvedTsAdvanced { regions, ts } => { - self.handle_resolved_ts_advanced(regions, ts) - } + Task::ResolvedTsAdvanced { + regions, + ts, + ts_source, + } => self.handle_resolved_ts_advanced(regions, ts, ts_source), Task::ChangeLog { cmd_batch } => self.handle_change_log(cmd_batch), Task::ScanLocks { region_id, @@ -798,6 +1166,130 @@ impl ConfigManager for ResolvedTsConfigManager { } } +#[derive(Default)] +struct Stats { + // stats for metrics + zero_ts_count: i64, + min_leader_resolved_ts: LeaderStats, + min_follower_safe_ts: FollowerStats, + min_follower_resolved_ts: FollowerStats, + resolver: ResolverStats, + // we don't care about min_safe_ts_leader, because safe_ts should be equal to resolved_ts in + // leaders + // The min memory lock in concurrency manager. 
+ cm_min_lock: Option<(TimeStamp, Key)>, +} + +struct LeaderStats { + region_id: u64, + resolved_ts: u64, + read_state: ReadState, + duration_to_last_update_ms: Option, + last_resolve_attempt: Option, + applied_index: u64, + // min lock in LOCK CF + min_lock: Option<(TimeStamp, TxnLocks)>, + lock_num: Option, + txn_num: Option, +} + +impl Default for LeaderStats { + fn default() -> Self { + Self { + region_id: 0, + resolved_ts: u64::MAX, + read_state: ReadState::default(), + duration_to_last_update_ms: None, + applied_index: 0, + last_resolve_attempt: None, + min_lock: None, + lock_num: None, + txn_num: None, + } + } +} + +impl LeaderStats { + fn set( + &mut self, + region_id: u64, + mut resolver: Option<&mut Resolver>, + region_read_progress: &MutexGuard<'_, RegionReadProgressCore>, + leader_info: &LeaderInfo, + ) { + *self = LeaderStats { + region_id, + resolved_ts: leader_info.get_read_state().get_safe_ts(), + read_state: region_read_progress.read_state().clone(), + duration_to_last_update_ms: region_read_progress + .last_instant_of_update_ts() + .map(|i| i.saturating_elapsed().as_millis() as u64), + last_resolve_attempt: resolver.as_mut().and_then(|r| r.take_last_attempt()), + min_lock: resolver + .as_ref() + .and_then(|r| r.oldest_transaction().map(|(t, tk)| (*t, tk.clone()))), + applied_index: region_read_progress.applied_index(), + lock_num: resolver.as_ref().map(|r| r.num_locks()), + txn_num: resolver.as_ref().map(|r| r.num_transactions()), + }; + } +} + +struct FollowerStats { + region_id: u64, + resolved_ts: u64, + safe_ts: u64, + latest_candidate: Option, + oldest_candidate: Option, + applied_index: u64, + duration_to_last_consume_leader: Option, +} + +impl Default for FollowerStats { + fn default() -> Self { + Self { + region_id: 0, + safe_ts: u64::MAX, + resolved_ts: u64::MAX, + latest_candidate: None, + oldest_candidate: None, + applied_index: 0, + duration_to_last_consume_leader: None, + } + } +} + +impl FollowerStats { + fn set( + &mut self, + 
region_id: u64, + region_read_progress: &MutexGuard<'_, RegionReadProgressCore>, + ) { + let read_state = region_read_progress.read_state(); + *self = FollowerStats { + region_id, + resolved_ts: region_read_progress + .get_leader_info() + .get_read_state() + .get_safe_ts(), + safe_ts: read_state.ts, + applied_index: region_read_progress.applied_index(), + latest_candidate: region_read_progress.pending_items().back().cloned(), + oldest_candidate: region_read_progress.pending_items().front().cloned(), + duration_to_last_consume_leader: region_read_progress + .last_instant_of_consume_leader() + .map(|i| i.saturating_elapsed().as_millis() as u64), + }; + } +} + +#[derive(Default)] +struct ResolverStats { + resolved_count: i64, + unresolved_count: i64, + heap_size: i64, +} + const METRICS_FLUSH_INTERVAL: u64 = 10_000; // 10s impl RunnableWithTimer for Endpoint @@ -807,138 +1299,9 @@ where S: StoreRegionMeta, { fn on_timeout(&mut self) { - let store_id = self.get_or_init_store_id(); - let (mut oldest_ts, mut oldest_region, mut zero_ts_count) = (u64::MAX, 0, 0); - let (mut oldest_leader_ts, mut oldest_leader_region) = (u64::MAX, 0); - let (mut oldest_safe_ts, mut oldest_safe_ts_region) = (u64::MAX, 0); - let mut oldest_duration_to_last_update_ms = 0; - let mut oldest_duration_to_last_consume_leader_ms = 0; - self.region_read_progress.with(|registry| { - for (region_id, read_progress) in registry { - let safe_ts = read_progress.safe_ts(); - if safe_ts > 0 && safe_ts < oldest_safe_ts { - oldest_safe_ts = safe_ts; - oldest_safe_ts_region = *region_id; - } - - let (leader_info, leader_store_id) = read_progress.dump_leader_info(); - // this is maximum resolved-ts pushed to region_read_progress, namely candidates - // of safe_ts. It may not be the safe_ts yet - let ts = leader_info.get_read_state().get_safe_ts(); - if ts == 0 { - zero_ts_count += 1; - continue; - } - if ts < oldest_ts { - oldest_ts = ts; - oldest_region = *region_id; - // use -1 to denote none. 
- oldest_duration_to_last_update_ms = read_progress - .get_core() - .last_instant_of_consume_leader() - .map(|t| t.saturating_elapsed().as_millis() as i64) - .unwrap_or(-1); - oldest_duration_to_last_consume_leader_ms = read_progress - .get_core() - .last_instant_of_consume_leader() - .map(|t| t.saturating_elapsed().as_millis() as i64) - .unwrap_or(-1); - } - - if let (Some(store_id), Some(leader_store_id)) = (store_id, leader_store_id) { - if leader_store_id == store_id && ts < oldest_leader_ts { - oldest_leader_ts = ts; - oldest_leader_region = *region_id; - } - } - } - }); - let mut lock_heap_size = 0; - let (mut resolved_count, mut unresolved_count) = (0, 0); - for observe_region in self.regions.values() { - match &observe_region.resolver_status { - ResolverStatus::Pending { locks, .. } => { - for l in locks { - match l { - PendingLock::Track { key, .. } => lock_heap_size += key.len(), - PendingLock::Untrack { key, .. } => lock_heap_size += key.len(), - } - } - unresolved_count += 1; - } - ResolverStatus::Ready { .. } => { - lock_heap_size += observe_region.resolver.size(); - resolved_count += 1; - } - } - } - // approximate a TSO from PD. It is better than local timestamp when clock skew - // exists. 
- let now: u64 = self - .advance_worker - .last_pd_tso - .try_lock() - .map(|opt| { - opt.map(|(pd_ts, instant)| { - pd_ts.physical() + instant.saturating_elapsed().as_millis() as u64 - }) - .unwrap_or_else(|| TimeStamp::physical_now()) - }) - .unwrap_or_else(|_| TimeStamp::physical_now()); - - RTS_MIN_SAFE_TS.set(oldest_safe_ts as i64); - RTS_MIN_SAFE_TS_REGION.set(oldest_safe_ts_region as i64); - let safe_ts_gap = now.saturating_sub(TimeStamp::from(oldest_safe_ts).physical()); - if safe_ts_gap - > self.cfg.advance_ts_interval.as_millis() - + DEFAULT_CHECK_LEADER_TIMEOUT_DURATION.as_millis() as u64 - + SLOW_LOG_GRACE_PERIOD_MS - { - let mut lock_num = None; - let mut min_start_ts = None; - if let Some(ob) = self.regions.get(&oldest_safe_ts_region) { - min_start_ts = ob - .resolver - .locks() - .keys() - .next() - .cloned() - .map(TimeStamp::into_inner); - lock_num = Some(ob.resolver.locks_by_key.len()); - } - info!( - "the max gap of safe-ts is large"; - "gap" => safe_ts_gap, - "oldest_safe_ts" => ?oldest_safe_ts, - "region_id" => oldest_safe_ts_region, - "advance_ts_interval" => ?self.cfg.advance_ts_interval, - "lock_num" => lock_num, - "min_start_ts" => min_start_ts, - ); - } - RTS_MIN_SAFE_TS_GAP.set(safe_ts_gap as i64); - RTS_MIN_SAFE_TS_DUATION_TO_UPDATE_SAFE_TS.set(oldest_duration_to_last_update_ms); - RTS_MIN_SAFE_TS_DURATION_TO_LAST_CONSUME_LEADER - .set(oldest_duration_to_last_consume_leader_ms); - - RTS_MIN_RESOLVED_TS_REGION.set(oldest_region as i64); - RTS_MIN_RESOLVED_TS.set(oldest_ts as i64); - RTS_ZERO_RESOLVED_TS.set(zero_ts_count as i64); - RTS_MIN_RESOLVED_TS_GAP - .set(now.saturating_sub(TimeStamp::from(oldest_ts).physical()) as i64); - - RTS_MIN_LEADER_RESOLVED_TS_REGION.set(oldest_leader_region as i64); - RTS_MIN_LEADER_RESOLVED_TS.set(oldest_leader_ts as i64); - RTS_MIN_LEADER_RESOLVED_TS_GAP - .set(now.saturating_sub(TimeStamp::from(oldest_leader_ts).physical()) as i64); - - RTS_LOCK_HEAP_BYTES_GAUGE.set(lock_heap_size as i64); - 
RTS_REGION_RESOLVE_STATUS_GAUGE_VEC - .with_label_values(&["resolved"]) - .set(resolved_count as _); - RTS_REGION_RESOLVE_STATUS_GAUGE_VEC - .with_label_values(&["unresolved"]) - .set(unresolved_count as _); + let stats = self.collect_stats(); + self.update_metrics(&stats); + self.log_slow_regions(&stats); } fn get_interval(&self) -> Duration { diff --git a/components/resolved_ts/src/errors.rs b/components/resolved_ts/src/errors.rs index d9845440c07..4e14c1d78d9 100644 --- a/components/resolved_ts/src/errors.rs +++ b/components/resolved_ts/src/errors.rs @@ -1,62 +1,14 @@ // Copyright 2021 TiKV Project Authors. Licensed under Apache-2.0. -use std::io::Error as IoError; - -use engine_traits::Error as EngineTraitsError; -use kvproto::errorpb::Error as ErrorHeader; -use raftstore::Error as RaftstoreError; use thiserror::Error; -use tikv::storage::{ - kv::{Error as KvError, ErrorInner as EngineErrorInner}, - mvcc::{Error as MvccError, ErrorInner as MvccErrorInner}, - txn::{Error as TxnError, ErrorInner as TxnErrorInner}, -}; -use txn_types::Error as TxnTypesError; +use tikv_util::memory::MemoryQuotaExceeded; #[derive(Debug, Error)] pub enum Error { - #[error("IO error {0}")] - Io(#[from] IoError), - #[error("Engine error {0}")] - Kv(#[from] KvError), - #[error("Transaction error {0}")] - Txn(#[from] TxnError), - #[error("Mvcc error {0}")] - Mvcc(#[from] MvccError), - #[error("Request error {0:?}")] - Request(Box), - #[error("Engine traits error {0}")] - EngineTraits(#[from] EngineTraitsError), - #[error("Txn types error {0}")] - TxnTypes(#[from] TxnTypesError), - #[error("Raftstore error {0}")] - Raftstore(#[from] RaftstoreError), + #[error("Memory quota exceeded")] + MemoryQuotaExceeded(#[from] MemoryQuotaExceeded), #[error("Other error {0}")] Other(#[from] Box), } -impl Error { - pub fn request(err: ErrorHeader) -> Error { - Error::Request(Box::new(err)) - } - - pub fn extract_error_header(self) -> ErrorHeader { - match self { - Error::Kv(KvError(box 
EngineErrorInner::Request(e))) - | Error::Txn(TxnError(box TxnErrorInner::Engine(KvError( - box EngineErrorInner::Request(e), - )))) - | Error::Txn(TxnError(box TxnErrorInner::Mvcc(MvccError(box MvccErrorInner::Kv( - KvError(box EngineErrorInner::Request(e)), - ))))) - | Error::Request(box e) => e, - other => { - let mut e = ErrorHeader::default(); - e.set_message(format!("{:?}", other)); - e - } - } - } -} - pub type Result = std::result::Result; diff --git a/components/resolved_ts/src/lib.rs b/components/resolved_ts/src/lib.rs index eef1211a580..f9eeb7c8b70 100644 --- a/components/resolved_ts/src/lib.rs +++ b/components/resolved_ts/src/lib.rs @@ -14,6 +14,7 @@ #![feature(box_patterns)] #![feature(result_flattening)] +#![feature(let_chains)] #[macro_use] extern crate tikv_util; diff --git a/components/resolved_ts/src/metrics.rs b/components/resolved_ts/src/metrics.rs index 15b3463f70e..fb751491d10 100644 --- a/components/resolved_ts/src/metrics.rs +++ b/components/resolved_ts/src/metrics.rs @@ -38,7 +38,7 @@ lazy_static! { .unwrap(); pub static ref RTS_MIN_RESOLVED_TS_GAP: IntGauge = register_int_gauge!( "tikv_resolved_ts_min_resolved_ts_gap_millis", - "The minimal (non-zero) resolved ts gap for observed regions" + "The gap between now() and the minimal (non-zero) resolved ts" ) .unwrap(); pub static ref RTS_RESOLVED_FAIL_ADVANCE_VEC: IntCounterVec = register_int_counter_vec!( @@ -69,29 +69,29 @@ lazy_static! 
{ "The minimal (non-zero) resolved ts for observed regions" ) .unwrap(); - pub static ref RTS_MIN_SAFE_TS_REGION: IntGauge = register_int_gauge!( - "tikv_resolved_ts_min_safe_ts_region", - "The region which has minimal safe ts" + pub static ref RTS_MIN_FOLLOWER_SAFE_TS_REGION: IntGauge = register_int_gauge!( + "tikv_resolved_ts_min_follower_safe_ts_region", + "The region id of the follower that has minimal safe ts" ) .unwrap(); - pub static ref RTS_MIN_SAFE_TS: IntGauge = register_int_gauge!( - "tikv_resolved_ts_min_safe_ts", - "The minimal (non-zero) safe ts for observed regions" + pub static ref RTS_MIN_FOLLOWER_SAFE_TS: IntGauge = register_int_gauge!( + "tikv_resolved_ts_min_follower_safe_ts", + "The minimal (non-zero) safe ts for followers" ) .unwrap(); - pub static ref RTS_MIN_SAFE_TS_GAP: IntGauge = register_int_gauge!( - "tikv_resolved_ts_min_safe_ts_gap_millis", - "The minimal (non-zero) safe ts gap for observed regions" + pub static ref RTS_MIN_FOLLOWER_SAFE_TS_GAP: IntGauge = register_int_gauge!( + "tikv_resolved_ts_min_follower_safe_ts_gap_millis", + "The gap between now() and the minimal (non-zero) safe ts for followers" ) .unwrap(); - pub static ref RTS_MIN_SAFE_TS_DUATION_TO_UPDATE_SAFE_TS: IntGauge = register_int_gauge!( - "tikv_resolved_ts_min_safe_ts_duration_to_update_safe_ts", - "The duration since last update_safe_ts() called by resolved-ts routine. -1 denotes None." + pub static ref RTS_MIN_LEADER_DUATION_TO_LAST_UPDATE_SAFE_TS: IntGauge = register_int_gauge!( + "tikv_resolved_ts_leader_min_resolved_ts_duration_to_last_update_safe_ts", + "The duration since last update_safe_ts() called by resolved-ts routine in the leader with min resolved ts. -1 denotes None." ) .unwrap(); - pub static ref RTS_MIN_SAFE_TS_DURATION_TO_LAST_CONSUME_LEADER: IntGauge = register_int_gauge!( - "tikv_resolved_ts_min_safe_ts_duration_to_last_consume_leader", - "The duration since last check_leader(). -1 denotes None." 
+ pub static ref RTS_MIN_FOLLOWER_SAFE_TS_DURATION_TO_LAST_CONSUME_LEADER: IntGauge = register_int_gauge!( + "tikv_resolved_ts_min_follower_safe_ts_duration_to_last_consume_leader", + "The duration since last check_leader() in the follower region with min safe ts. -1 denotes None." ) .unwrap(); pub static ref RTS_ZERO_RESOLVED_TS: IntGauge = register_int_gauge!( @@ -104,6 +104,11 @@ lazy_static! { "Total bytes in memory of resolved-ts observed regions's lock heap" ) .unwrap(); + pub static ref RTS_LOCK_QUOTA_IN_USE_BYTES_GAUGE: IntGauge = register_int_gauge!( + "tikv_resolved_ts_memory_quota_in_use_bytes", + "Total bytes in memory of resolved-ts observed regions's lock heap" + ) + .unwrap(); pub static ref RTS_REGION_RESOLVE_STATUS_GAUGE_VEC: IntGaugeVec = register_int_gauge_vec!( "tikv_resolved_ts_region_resolve_status", "The status of resolved-ts observed regions", @@ -125,7 +130,17 @@ lazy_static! { .unwrap(); pub static ref RTS_MIN_LEADER_RESOLVED_TS_REGION: IntGauge = register_int_gauge!( "tikv_resolved_ts_min_leader_resolved_ts_region", - "The region which its leader peer has minimal resolved ts" + "The region whose leader peer has minimal resolved ts" + ) + .unwrap(); + pub static ref RTS_MIN_LEADER_RESOLVED_TS_REGION_MIN_LOCK_TS: IntGauge = register_int_gauge!( + "tikv_resolved_ts_min_leader_resolved_ts_region_min_lock_ts", + "The minimal lock ts for the region whose leader peer has minimal resolved ts. 0 means no lock. -1 means no region found." + ) + .unwrap(); + pub static ref CONCURRENCY_MANAGER_MIN_LOCK_TS: IntGauge = register_int_gauge!( + "tikv_concurrency_manager_min_lock_ts", + "The minimal lock ts in concurrency manager. 0 means no lock." ) .unwrap(); pub static ref RTS_MIN_LEADER_RESOLVED_TS: IntGauge = register_int_gauge!( @@ -135,7 +150,35 @@ lazy_static! 
{ .unwrap(); pub static ref RTS_MIN_LEADER_RESOLVED_TS_GAP: IntGauge = register_int_gauge!( "tikv_resolved_ts_min_leader_resolved_ts_gap_millis", - "The minimal (non-zero) resolved ts gap for observe leader peers" + "The gap between now() and the minimal (non-zero) resolved ts for leader peers" + ) + .unwrap(); + + // for min_follower_resolved_ts + pub static ref RTS_MIN_FOLLOWER_RESOLVED_TS_REGION: IntGauge = register_int_gauge!( + "tikv_resolved_ts_min_follower_resolved_ts_region", + "The region id of the follower has minimal resolved ts" + ) + .unwrap(); + pub static ref RTS_MIN_FOLLOWER_RESOLVED_TS: IntGauge = register_int_gauge!( + "tikv_resolved_ts_min_follower_resolved_ts", + "The minimal (non-zero) resolved ts for follower regions" + ) + .unwrap(); + pub static ref RTS_MIN_FOLLOWER_RESOLVED_TS_GAP: IntGauge = register_int_gauge!( + "tikv_resolved_ts_min_follower_resolved_ts_gap_millis", + "The max gap of now() and the minimal (non-zero) resolved ts for follower regions" + ) + .unwrap(); + pub static ref RTS_MIN_FOLLOWER_RESOLVED_TS_DURATION_TO_LAST_CONSUME_LEADER: IntGauge = register_int_gauge!( + "tikv_resolved_ts_min_follower_resolved_ts_duration_to_last_consume_leader", + "The duration since last check_leader() in the follower region with min resolved ts. -1 denotes None." + ) + .unwrap(); + pub static ref RTS_INITIAL_SCAN_BACKOFF_DURATION_HISTOGRAM: Histogram = register_histogram!( + "tikv_resolved_ts_initial_scan_backoff_duration_seconds", + "Bucketed histogram of resolved-ts initial scan backoff duration", + exponential_buckets(0.1, 2.0, 16).unwrap(), ) .unwrap(); } diff --git a/components/resolved_ts/src/resolver.rs b/components/resolved_ts/src/resolver.rs index 799c5584723..239ef566605 100644 --- a/components/resolved_ts/src/resolver.rs +++ b/components/resolved_ts/src/resolver.rs @@ -1,75 +1,195 @@ // Copyright 2020 TiKV Project Authors. Licensed under Apache-2.0. 
-use std::{cmp, collections::BTreeMap, sync::Arc}; +use std::{cmp, collections::BTreeMap, sync::Arc, time::Duration}; -use collections::{HashMap, HashSet}; +use collections::{HashMap, HashMapEntry}; use raftstore::store::RegionReadProgress; -use tikv_util::time::Instant; -use txn_types::TimeStamp; +use tikv_util::{ + memory::{HeapSize, MemoryQuota, MemoryQuotaExceeded}, + time::Instant, +}; +use txn_types::{Key, TimeStamp}; + +use crate::metrics::*; + +pub const ON_DROP_WARN_HEAP_SIZE: usize = 64 * 1024 * 1024; // 64MB + +#[derive(Clone)] +pub enum TsSource { + // A lock in LOCK CF + Lock(TxnLocks), + // A memory lock in concurrency manager + MemoryLock(Key), + PdTso, + // The following sources can also come from PD or memory lock, but we care more about sources + // in resolved-ts. + BackupStream, + Cdc, +} + +impl TsSource { + pub fn label(&self) -> &str { + match self { + TsSource::Lock(_) => "lock", + TsSource::MemoryLock(_) => "rts_cm_min_lock", + TsSource::PdTso => "pd_tso", + TsSource::BackupStream => "backup_stream", + TsSource::Cdc => "cdc", + } + } + + pub fn key(&self) -> Option { + match self { + TsSource::Lock(locks) => locks + .sample_lock + .as_ref() + .map(|k| Key::from_encoded_slice(k)), + TsSource::MemoryLock(k) => Some(k.clone()), + _ => None, + } + } +} -use crate::metrics::RTS_RESOLVED_FAIL_ADVANCE_VEC; +#[derive(Default, Clone, PartialEq, Eq)] +pub struct TxnLocks { + pub lock_count: usize, + // A sample key in a transaction. + pub sample_lock: Option>, +} -const MAX_NUMBER_OF_LOCKS_IN_LOG: usize = 10; +impl std::fmt::Debug for TxnLocks { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("TxnLocks") + .field("lock_count", &self.lock_count) + .field( + "sample_lock", + &self + .sample_lock + .as_ref() + .map(|k| log_wrappers::Value::key(k)), + ) + .finish() + } +} // Resolver resolves timestamps that guarantee no more commit will happen before // the timestamp. 
pub struct Resolver { region_id: u64, // key -> start_ts - pub(crate) locks_by_key: HashMap, TimeStamp>, + locks_by_key: HashMap, TimeStamp>, // start_ts -> locked keys. - lock_ts_heap: BTreeMap>>, + lock_ts_heap: BTreeMap, + // The last shrink time. + last_aggressive_shrink_time: Instant, // The timestamps that guarantees no more commit will happen before. resolved_ts: TimeStamp, // The highest index `Resolver` had been tracked tracked_index: u64, // The region read progress used to utilize `resolved_ts` to serve stale read request - pub(crate) read_progress: Option>, + read_progress: Option>, // The timestamps that advance the resolved_ts when there is no more write. min_ts: TimeStamp, // Whether the `Resolver` is stopped stopped: bool, + // The memory quota for the `Resolver` and its lock keys and timestamps. + memory_quota: Arc, + // The last attempt of resolve(), used for diagnosis. + last_attempt: Option, +} + +#[derive(Clone)] +pub(crate) struct LastAttempt { + success: bool, + ts: TimeStamp, + reason: TsSource, +} + +impl slog::Value for LastAttempt { + fn serialize( + &self, + _record: &slog::Record<'_>, + key: slog::Key, + serializer: &mut dyn slog::Serializer, + ) -> slog::Result { + serializer.emit_arguments( + key, + &format_args!( + "{{ success={}, ts={}, reason={}, key={:?} }}", + self.success, + self.ts, + self.reason.label(), + self.reason.key(), + ), + ) + } } impl std::fmt::Debug for Resolver { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - let far_lock = self.lock_ts_heap.iter().next(); + let far_lock = self.oldest_transaction(); let mut dt = f.debug_tuple("Resolver"); dt.field(&format_args!("region={}", self.region_id)); - if let Some((ts, keys)) = far_lock { + if let Some((ts, txn_locks)) = far_lock { + dt.field(&format_args!( + "oldest_lock_count={:?}", + txn_locks.lock_count + )); dt.field(&format_args!( - "far_lock={:?}", - keys.iter() - // We must use Display format here or the redact won't take effect. 
- .map(|k| format!("{}", log_wrappers::Value::key(k))) - .collect::>() + "oldest_lock_sample={:?}", + txn_locks.sample_lock )); - dt.field(&format_args!("far_lock_ts={:?}", ts)); + dt.field(&format_args!("oldest_lock_ts={:?}", ts)); } dt.finish() } } +impl Drop for Resolver { + fn drop(&mut self) { + // Free memory quota used by locks_by_key. + let mut bytes = 0; + let num_locks = self.num_locks(); + for key in self.locks_by_key.keys() { + bytes += self.lock_heap_size(key); + } + if bytes > ON_DROP_WARN_HEAP_SIZE { + warn!("drop huge resolver"; + "region_id" => self.region_id, + "bytes" => bytes, + "num_locks" => num_locks, + "memory_quota_in_use" => self.memory_quota.in_use(), + "memory_quota_capacity" => self.memory_quota.capacity(), + ); + } + self.memory_quota.free(bytes); + } +} + impl Resolver { - pub fn new(region_id: u64) -> Resolver { - Resolver::with_read_progress(region_id, None) + pub fn new(region_id: u64, memory_quota: Arc) -> Resolver { + Resolver::with_read_progress(region_id, None, memory_quota) } pub fn with_read_progress( region_id: u64, read_progress: Option>, + memory_quota: Arc, ) -> Resolver { Resolver { region_id, resolved_ts: TimeStamp::zero(), locks_by_key: HashMap::default(), lock_ts_heap: BTreeMap::new(), + last_aggressive_shrink_time: Instant::now_coarse(), read_progress, tracked_index: 0, min_ts: TimeStamp::zero(), stopped: false, + memory_quota, + last_attempt: None, } } @@ -85,16 +205,7 @@ impl Resolver { self.stopped } - pub fn size(&self) -> usize { - self.locks_by_key.keys().map(|k| k.len()).sum::() - + self - .lock_ts_heap - .values() - .map(|h| h.iter().map(|k| k.len()).sum::()) - .sum::() - } - - pub fn locks(&self) -> &BTreeMap>> { + pub fn locks(&self) -> &BTreeMap { &self.lock_ts_heap } @@ -115,19 +226,92 @@ impl Resolver { self.tracked_index = index; } - pub fn track_lock(&mut self, start_ts: TimeStamp, key: Vec, index: Option) { + // Return an approximate heap memory usage in bytes. 
+ pub fn approximate_heap_bytes(&self) -> usize { + if self.locks_by_key.is_empty() { + return 0; + } + + const SAMPLE_COUNT: usize = 8; + let mut key_count = 0; + let mut key_bytes = 0; + for key in self.locks_by_key.keys() { + key_count += 1; + key_bytes += key.len(); + if key_count >= SAMPLE_COUNT { + break; + } + } + self.locks_by_key.len() * (key_bytes / key_count + std::mem::size_of::()) + + self.lock_ts_heap.len() + * (std::mem::size_of::() + std::mem::size_of::()) + } + + fn lock_heap_size(&self, key: &[u8]) -> usize { + // A resolver has + // * locks_by_key: HashMap, TimeStamp> + // * lock_ts_heap: BTreeMap + // + // We only count memory used by locks_by_key. Because the majority of + // memory is consumed by keys, locks_by_key and lock_ts_heap shares + // the same Arc<[u8]>, so lock_ts_heap is negligible. Also, it's hard to + // track accurate memory usage of lock_ts_heap as a timestamp may have + // many keys. + key.heap_size() + std::mem::size_of::() + } + + fn shrink_ratio(&mut self, ratio: usize) { + // HashMap load factor is 87% approximately, leave some margin to avoid + // frequent rehash. 
+ // + // See https://github.com/rust-lang/hashbrown/blob/v0.14.0/src/raw/mod.rs#L208-L220 + const MIN_SHRINK_RATIO: usize = 2; + if self.locks_by_key.capacity() + > self.locks_by_key.len() * cmp::max(MIN_SHRINK_RATIO, ratio) + { + self.locks_by_key.shrink_to_fit(); + } + } + + pub fn track_lock( + &mut self, + start_ts: TimeStamp, + key: Vec, + index: Option, + ) -> Result<(), MemoryQuotaExceeded> { if let Some(index) = index { self.update_tracked_index(index); } + let bytes = self.lock_heap_size(&key); debug!( - "track lock {}@{}, region {}", + "track lock {}@{}", &log_wrappers::Value::key(&key), - start_ts, - self.region_id + start_ts; + "region_id" => self.region_id, + "memory_in_use" => self.memory_quota.in_use(), + "memory_capacity" => self.memory_quota.capacity(), + "key_heap_size" => bytes, ); + self.memory_quota.alloc(bytes)?; let key: Arc<[u8]> = key.into_boxed_slice().into(); - self.locks_by_key.insert(key.clone(), start_ts); - self.lock_ts_heap.entry(start_ts).or_default().insert(key); + match self.locks_by_key.entry(key) { + HashMapEntry::Occupied(_) => { + // Free memory quota because it's already in the map. + self.memory_quota.free(bytes); + } + HashMapEntry::Vacant(entry) => { + // Add lock count for the start ts. 
+ let txn_locks = self.lock_ts_heap.entry(start_ts).or_insert_with(|| { + let mut txn_locks = TxnLocks::default(); + txn_locks.sample_lock = Some(entry.key().clone()); + txn_locks + }); + txn_locks.lock_count += 1; + + entry.insert(start_ts); + } + } + Ok(()) } pub fn untrack_lock(&mut self, key: &[u8], index: Option) { @@ -135,49 +319,92 @@ impl Resolver { self.update_tracked_index(index); } let start_ts = if let Some(start_ts) = self.locks_by_key.remove(key) { + let bytes = self.lock_heap_size(key); + self.memory_quota.free(bytes); start_ts } else { - debug!("untrack a lock that was not tracked before"; "key" => &log_wrappers::Value::key(key)); + debug!("untrack a lock that was not tracked before"; + "key" => &log_wrappers::Value::key(key), + "region_id" => self.region_id, + ); return; }; debug!( - "untrack lock {}@{}, region {}", + "untrack lock {}@{}", &log_wrappers::Value::key(key), - start_ts, - self.region_id, + start_ts; + "region_id" => self.region_id, + "memory_in_use" => self.memory_quota.in_use(), ); - let entry = self.lock_ts_heap.get_mut(&start_ts); - if let Some(locked_keys) = entry { - locked_keys.remove(key); - if locked_keys.is_empty() { + if let Some(txn_locks) = self.lock_ts_heap.get_mut(&start_ts) { + if txn_locks.lock_count > 0 { + txn_locks.lock_count -= 1; + } + if txn_locks.lock_count == 0 { self.lock_ts_heap.remove(&start_ts); } - } + }; + // Use a large ratio to amortize the cost of rehash. + let shrink_ratio = 8; + self.shrink_ratio(shrink_ratio); } /// Try to advance resolved ts. /// /// `min_ts` advances the resolver even if there is no write. /// Return None means the resolver is not initialized. - pub fn resolve(&mut self, min_ts: TimeStamp, now: Option) -> TimeStamp { + pub fn resolve( + &mut self, + min_ts: TimeStamp, + now: Option, + source: TsSource, + ) -> TimeStamp { + // Use a small ratio to shrink the memory usage aggressively. 
+ const AGGRESSIVE_SHRINK_RATIO: usize = 2; + const AGGRESSIVE_SHRINK_INTERVAL: Duration = Duration::from_secs(10); + if self.last_aggressive_shrink_time.saturating_elapsed() > AGGRESSIVE_SHRINK_INTERVAL { + self.shrink_ratio(AGGRESSIVE_SHRINK_RATIO); + self.last_aggressive_shrink_time = Instant::now_coarse(); + } + // The `Resolver` is stopped, not need to advance, just return the current // `resolved_ts` if self.stopped { return self.resolved_ts; } + // Find the min start ts. - let min_lock = self.lock_ts_heap.keys().next().cloned(); + let min_lock = self.oldest_transaction(); let has_lock = min_lock.is_some(); - let min_start_ts = min_lock.unwrap_or(min_ts); + let min_start_ts = min_lock.as_ref().map(|(ts, _)| **ts).unwrap_or(min_ts); // No more commit happens before the ts. let new_resolved_ts = cmp::min(min_start_ts, min_ts); + // reason is the min source of the new resolved ts. + let reason = match (min_lock, min_ts) { + (Some((lock_ts, txn_locks)), min_ts) if *lock_ts < min_ts => { + TsSource::Lock(txn_locks.clone()) + } + (Some(_), _) => source, + (None, _) => source, + }; + if self.resolved_ts >= new_resolved_ts { - let label = if has_lock { "has_lock" } else { "stale_ts" }; RTS_RESOLVED_FAIL_ADVANCE_VEC - .with_label_values(&[label]) + .with_label_values(&[reason.label()]) .inc(); + self.last_attempt = Some(LastAttempt { + success: false, + ts: new_resolved_ts, + reason, + }); + } else { + self.last_attempt = Some(LastAttempt { + success: true, + ts: new_resolved_ts, + reason, + }) } // Resolved ts never decrease. @@ -204,21 +431,16 @@ impl Resolver { pub(crate) fn log_locks(&self, min_start_ts: u64) { // log lock with the minimum start_ts >= min_start_ts - if let Some((start_ts, keys)) = self + if let Some((start_ts, txn_locks)) = self .lock_ts_heap .range(TimeStamp::new(min_start_ts)..) 
.next() { - let keys_for_log = keys - .iter() - .map(|key| log_wrappers::Value::key(key)) - .take(MAX_NUMBER_OF_LOCKS_IN_LOG) - .collect::>(); info!( "locks with the minimum start_ts in resolver"; "region_id" => self.region_id, "start_ts" => start_ts, - "sampled_keys" => ?keys_for_log, + "txn_locks" => ?txn_locks, ); } } @@ -230,6 +452,18 @@ impl Resolver { pub(crate) fn num_transactions(&self) -> u64 { self.lock_ts_heap.len() as u64 } + + pub(crate) fn read_progress(&self) -> Option<&Arc> { + self.read_progress.as_ref() + } + + pub(crate) fn oldest_transaction(&self) -> Option<(&TimeStamp, &TxnLocks)> { + self.lock_ts_heap.iter().next() + } + + pub(crate) fn take_last_attempt(&mut self) -> Option { + self.last_attempt.take() + } } #[cfg(test)] @@ -300,16 +534,19 @@ mod tests { ]; for (i, case) in cases.into_iter().enumerate() { - let mut resolver = Resolver::new(1); + let memory_quota = Arc::new(MemoryQuota::new(std::usize::MAX)); + let mut resolver = Resolver::new(1, memory_quota); for e in case.clone() { match e { Event::Lock(start_ts, key) => { - resolver.track_lock(start_ts.into(), key.into_raw().unwrap(), None) + resolver + .track_lock(start_ts.into(), key.into_raw().unwrap(), None) + .unwrap(); } Event::Unlock(key) => resolver.untrack_lock(&key.into_raw().unwrap(), None), Event::Resolve(min_ts, expect) => { assert_eq!( - resolver.resolve(min_ts.into(), None), + resolver.resolve(min_ts.into(), None, TsSource::PdTso), expect.into(), "case {}", i @@ -319,4 +556,158 @@ mod tests { } } } + + #[test] + fn test_memory_quota() { + let memory_quota = Arc::new(MemoryQuota::new(1024)); + let mut resolver = Resolver::new(1, memory_quota.clone()); + let mut key = vec![0; 77]; + let lock_size = resolver.lock_heap_size(&key); + let mut ts = TimeStamp::default(); + while resolver.track_lock(ts, key.clone(), None).is_ok() { + ts.incr(); + key[0..8].copy_from_slice(&ts.into_inner().to_be_bytes()); + } + let remain = 1024 % lock_size; + assert_eq!(memory_quota.in_use(), 1024 - 
remain); + + let mut ts = TimeStamp::default(); + for _ in 0..5 { + ts.incr(); + key[0..8].copy_from_slice(&ts.into_inner().to_be_bytes()); + resolver.untrack_lock(&key, None); + } + assert_eq!(memory_quota.in_use(), 1024 - 5 * lock_size - remain); + drop(resolver); + assert_eq!(memory_quota.in_use(), 0); + } + + #[test] + fn test_untrack_lock_shrink_ratio() { + let memory_quota = Arc::new(MemoryQuota::new(std::usize::MAX)); + let mut resolver = Resolver::new(1, memory_quota); + let mut key = vec![0; 16]; + let mut ts = TimeStamp::default(); + for _ in 0..1000 { + ts.incr(); + key[0..8].copy_from_slice(&ts.into_inner().to_be_bytes()); + let _ = resolver.track_lock(ts, key.clone(), None); + } + assert!( + resolver.locks_by_key.capacity() >= 1000, + "{}", + resolver.locks_by_key.capacity() + ); + + let mut ts = TimeStamp::default(); + for _ in 0..901 { + ts.incr(); + key[0..8].copy_from_slice(&ts.into_inner().to_be_bytes()); + resolver.untrack_lock(&key, None); + } + // shrink_to_fit may reserve some space in accordance with the resize + // policy, but it is expected to be less than 500. + assert!( + resolver.locks_by_key.capacity() < 500, + "{}, {}", + resolver.locks_by_key.capacity(), + resolver.locks_by_key.len(), + ); + + for _ in 0..99 { + ts.incr(); + key[0..8].copy_from_slice(&ts.into_inner().to_be_bytes()); + resolver.untrack_lock(&key, None); + } + assert!( + resolver.locks_by_key.capacity() < 100, + "{}, {}", + resolver.locks_by_key.capacity(), + resolver.locks_by_key.len(), + ); + + // Trigger aggressive shrink. 
+ resolver.last_aggressive_shrink_time = Instant::now_coarse() - Duration::from_secs(600); + resolver.resolve(TimeStamp::new(0), None, TsSource::PdTso); + assert!( + resolver.locks_by_key.capacity() == 0, + "{}, {}", + resolver.locks_by_key.capacity(), + resolver.locks_by_key.len(), + ); + } + + #[test] + fn test_idempotent_track_and_untrack_lock() { + let memory_quota = Arc::new(MemoryQuota::new(std::usize::MAX)); + let mut resolver = Resolver::new(1, memory_quota); + let mut key = vec![0; 16]; + + // track_lock + let mut ts = TimeStamp::default(); + for c in 0..10 { + ts.incr(); + for k in 0..100u64 { + key[0..8].copy_from_slice(&k.to_be_bytes()); + key[8..16].copy_from_slice(&ts.into_inner().to_be_bytes()); + let _ = resolver.track_lock(ts, key.clone(), None); + } + let in_use1 = resolver.memory_quota.in_use(); + let key_count1 = resolver.locks_by_key.len(); + let txn_count1 = resolver.lock_ts_heap.len(); + let txn_lock_count1 = resolver.lock_ts_heap[&ts].lock_count; + assert!(in_use1 > 0); + assert_eq!(key_count1, (c + 1) * 100); + assert_eq!(txn_count1, c + 1); + + // Put same keys again, resolver internal state must be idempotent. 
+ for k in 0..100u64 { + key[0..8].copy_from_slice(&k.to_be_bytes()); + key[8..16].copy_from_slice(&ts.into_inner().to_be_bytes()); + let _ = resolver.track_lock(ts, key.clone(), None); + } + let in_use2 = resolver.memory_quota.in_use(); + let key_count2 = resolver.locks_by_key.len(); + let txn_count2 = resolver.lock_ts_heap.len(); + let txn_lock_count2 = resolver.lock_ts_heap[&ts].lock_count; + assert_eq!(in_use1, in_use2); + assert_eq!(key_count1, key_count2); + assert_eq!(txn_count1, txn_count2); + assert_eq!(txn_lock_count1, txn_lock_count2); + } + assert_eq!(resolver.resolve(ts, None, TsSource::PdTso), 1.into()); + + // untrack_lock + let mut ts = TimeStamp::default(); + for _ in 0..10 { + ts.incr(); + for k in 0..100u64 { + key[0..8].copy_from_slice(&k.to_be_bytes()); + key[8..16].copy_from_slice(&ts.into_inner().to_be_bytes()); + resolver.untrack_lock(&key, None); + } + let in_use1 = resolver.memory_quota.in_use(); + let key_count1 = resolver.locks_by_key.len(); + let txn_count1 = resolver.lock_ts_heap.len(); + + // Unlock same keys again, resolver internal state must be idempotent. 
+ for k in 0..100u64 { + key[0..8].copy_from_slice(&k.to_be_bytes()); + key[8..16].copy_from_slice(&ts.into_inner().to_be_bytes()); + resolver.untrack_lock(&key, None); + } + let in_use2 = resolver.memory_quota.in_use(); + let key_count2 = resolver.locks_by_key.len(); + let txn_count2 = resolver.lock_ts_heap.len(); + assert_eq!(in_use1, in_use2); + assert_eq!(key_count1, key_count2); + assert_eq!(txn_count1, txn_count2); + + assert_eq!(resolver.resolve(ts, None, TsSource::PdTso), ts); + } + + assert_eq!(resolver.memory_quota.in_use(), 0); + assert_eq!(resolver.locks_by_key.len(), 0); + assert_eq!(resolver.lock_ts_heap.len(), 0); + } } diff --git a/components/resolved_ts/src/scanner.rs b/components/resolved_ts/src/scanner.rs index 0ca74bda29d..6c8c90dc38f 100644 --- a/components/resolved_ts/src/scanner.rs +++ b/components/resolved_ts/src/scanner.rs @@ -3,56 +3,79 @@ use std::{marker::PhantomData, sync::Arc, time::Duration}; use engine_traits::KvEngine; -use futures::compat::Future01CompatExt; -use kvproto::{kvrpcpb::ExtraOp as TxnExtraOp, metapb::Region}; +use futures::{channel::oneshot::Receiver, compat::Future01CompatExt, FutureExt}; +use kvproto::metapb::Region; use raftstore::{ - coprocessor::{ObserveHandle, ObserveId}, + coprocessor::ObserveHandle, router::CdcHandle, store::{fsm::ChangeObserver, msg::Callback, RegionSnapshot}, }; use tikv::storage::{ kv::{ScanMode as MvccScanMode, Snapshot}, - mvcc::{DeltaScanner, MvccReader, ScannerBuilder}, - txn::{TxnEntry, TxnEntryScanner}, + mvcc::MvccReader, +}; +use tikv_util::{ + sys::thread::ThreadBuildWrapper, time::Instant, timer::GLOBAL_TIMER_HANDLE, worker::Scheduler, +}; +use tokio::{ + runtime::{Builder, Runtime}, + sync::Semaphore, }; -use tikv_util::{sys::thread::ThreadBuildWrapper, time::Instant, timer::GLOBAL_TIMER_HANDLE}; -use tokio::runtime::{Builder, Runtime}; use txn_types::{Key, Lock, LockType, TimeStamp}; use crate::{ errors::{Error, Result}, - metrics::RTS_SCAN_DURATION_HISTOGRAM, + metrics::*, + 
Task, }; -const DEFAULT_SCAN_BATCH_SIZE: usize = 1024; +const DEFAULT_SCAN_BATCH_SIZE: usize = 128; const GET_SNAPSHOT_RETRY_TIME: u32 = 3; const GET_SNAPSHOT_RETRY_BACKOFF_STEP: Duration = Duration::from_millis(100); -pub type BeforeStartCallback = Box; -pub type OnErrorCallback = Box; -pub type OnEntriesCallback = Box, u64) + Send>; -pub type IsCancelledCallback = Box bool + Send>; - -pub enum ScanMode { - LockOnly, - All, - AllWithOldValue, -} - pub struct ScanTask { pub handle: ObserveHandle, - pub tag: String, - pub mode: ScanMode, pub region: Region, pub checkpoint_ts: TimeStamp, - pub is_cancelled: IsCancelledCallback, - pub send_entries: OnEntriesCallback, - pub on_error: Option, + pub backoff: Option, + pub cancelled: Receiver<()>, + pub scheduler: Scheduler, +} + +impl ScanTask { + async fn send_entries(&self, entries: ScanEntries, apply_index: u64) { + let task = Task::ScanLocks { + region_id: self.region.get_id(), + observe_id: self.handle.id, + entries, + apply_index, + }; + if let Err(e) = self.scheduler.schedule(task) { + warn!("resolved_ts scheduler send entries failed"; "err" => ?e); + } + } + + fn is_cancelled(&mut self) -> bool { + matches!(self.cancelled.try_recv(), Err(_) | Ok(Some(_))) + } + + fn on_error(&self, err: Error) { + if let Err(e) = self.scheduler.schedule(Task::ReRegisterRegion { + region_id: self.region.get_id(), + observe_id: self.handle.id, + cause: err, + }) { + warn!("schedule re-register task failed"; + "region_id" => self.region.get_id(), + "observe_id" => ?self.handle.id, + "error" => ?e); + } + RTS_SCAN_TASKS.with_label_values(&["abort"]).inc(); + } } #[derive(Debug)] -pub enum ScanEntry { - TxnEntry(Vec), +pub enum ScanEntries { Lock(Vec<(Key, Lock)>), None, } @@ -81,95 +104,66 @@ impl, E: KvEngine> ScannerPool { } } - pub fn spawn_task(&self, mut task: ScanTask) { + pub fn spawn_task(&self, mut task: ScanTask, concurrency_semaphore: Arc) { let cdc_handle = self.cdc_handle.clone(); let fut = async move { + 
tikv_util::defer!({ + RTS_SCAN_TASKS.with_label_values(&["finish"]).inc(); + }); + if let Some(backoff) = task.backoff { + RTS_INITIAL_SCAN_BACKOFF_DURATION_HISTOGRAM.observe(backoff.as_secs_f64()); + let mut backoff = GLOBAL_TIMER_HANDLE + .delay(std::time::Instant::now() + backoff) + .compat() + .fuse(); + futures::select! { + res = backoff => if let Err(e) = res { + error!("failed to backoff"; "err" => ?e); + }, + _ = &mut task.cancelled => {} + } + if task.is_cancelled() { + return; + } + } + let _permit = concurrency_semaphore.acquire().await; + if task.is_cancelled() { + return; + } + fail::fail_point!("resolved_ts_before_scanner_get_snapshot"); let snap = match Self::get_snapshot(&mut task, cdc_handle).await { Ok(snap) => snap, Err(e) => { warn!("resolved_ts scan get snapshot failed"; "err" => ?e); - let ScanTask { - on_error, - region, - handle, - .. - } = task; - if let Some(on_error) = on_error { - on_error(handle.id, region, e); - } + task.on_error(e); return; } }; + fail::fail_point!("resolved_ts_after_scanner_get_snapshot"); let start = Instant::now(); let apply_index = snap.get_apply_index().unwrap(); - let mut entries = vec![]; - match task.mode { - ScanMode::All | ScanMode::AllWithOldValue => { - let txn_extra_op = if let ScanMode::AllWithOldValue = task.mode { - TxnExtraOp::ReadOldValue - } else { - TxnExtraOp::Noop - }; - let mut scanner = ScannerBuilder::new(snap, TimeStamp::max()) - .range(None, None) - .build_delta_scanner(task.checkpoint_ts, txn_extra_op) - .unwrap(); - let mut done = false; - while !done && !(task.is_cancelled)() { - let (es, has_remaining) = match Self::scan_delta(&mut scanner) { - Ok(rs) => rs, - Err(e) => { - warn!("resolved_ts scan delta failed"; "err" => ?e); - let ScanTask { - on_error, - region, - handle, - .. 
- } = task; - if let Some(on_error) = on_error { - on_error(handle.id, region, e); - } - return; - } - }; - done = !has_remaining; - entries.push(ScanEntry::TxnEntry(es)); - } - } - ScanMode::LockOnly => { - let mut reader = MvccReader::new(snap, Some(MvccScanMode::Forward), false); - let mut done = false; - let mut start = None; - while !done && !(task.is_cancelled)() { - let (locks, has_remaining) = - match Self::scan_locks(&mut reader, start.as_ref(), task.checkpoint_ts) - { - Ok(rs) => rs, - Err(e) => { - warn!("resolved_ts scan lock failed"; "err" => ?e); - let ScanTask { - on_error, - region, - handle, - .. - } = task; - if let Some(on_error) = on_error { - on_error(handle.id, region, e); - } - return; - } - }; - done = !has_remaining; - if has_remaining { - start = Some(locks.last().unwrap().0.clone()) + let mut reader = MvccReader::new(snap, Some(MvccScanMode::Forward), false); + let mut done = false; + let mut start_key = None; + while !done && !task.is_cancelled() { + let (locks, has_remaining) = + match Self::scan_locks(&mut reader, start_key.as_ref(), task.checkpoint_ts) { + Ok(rs) => rs, + Err(e) => { + warn!("resolved_ts scan lock failed"; "err" => ?e); + task.on_error(e); + return; } - entries.push(ScanEntry::Lock(locks)); - } + }; + done = !has_remaining; + if has_remaining { + start_key = Some(locks.last().unwrap().0.clone()) } + task.send_entries(ScanEntries::Lock(locks), apply_index) + .await; } - entries.push(ScanEntry::None); RTS_SCAN_DURATION_HISTOGRAM.observe(start.saturating_elapsed().as_secs_f64()); - (task.send_entries)(entries, apply_index); + task.send_entries(ScanEntries::None, apply_index).await; }; self.workers.spawn(fut); } @@ -181,49 +175,51 @@ impl, E: KvEngine> ScannerPool { let mut last_err = None; for retry_times in 0..=GET_SNAPSHOT_RETRY_TIME { if retry_times != 0 { - if let Err(e) = GLOBAL_TIMER_HANDLE + let mut backoff = GLOBAL_TIMER_HANDLE .delay( std::time::Instant::now() + GET_SNAPSHOT_RETRY_BACKOFF_STEP 
.mul_f64(10_f64.powi(retry_times as i32 - 1)), ) .compat() - .await - { - error!("failed to backoff"; "err" => ?e); + .fuse(); + futures::select! { + res = backoff => if let Err(e) = res { + error!("failed to backoff"; "err" => ?e); + }, + _ = &mut task.cancelled => {} } - if (task.is_cancelled)() { - return Err(Error::Other("scan task cancelled".into())); + if task.is_cancelled() { + return Err(box_err!("scan task cancelled")); } } let (cb, fut) = tikv_util::future::paired_future_callback(); let change_cmd = ChangeObserver::from_rts(task.region.id, task.handle.clone()); - cdc_handle.capture_change( - task.region.id, - task.region.get_region_epoch().clone(), - change_cmd, - Callback::read(Box::new(cb)), - )?; + cdc_handle + .capture_change( + task.region.id, + task.region.get_region_epoch().clone(), + change_cmd, + Callback::read(Box::new(cb)), + ) + .map_err(|e| Error::Other(box_err!("{:?}", e)))?; let mut resp = box_try!(fut.await); if resp.response.get_header().has_error() { let err = resp.response.take_header().take_error(); // These two errors can't handled by retrying since the epoch and observe id is // unchanged if err.has_epoch_not_match() || err.get_message().contains("stale observe id") { - return Err(Error::request(err)); + return Err(box_err!("get snapshot failed: {:?}", err)); } last_err = Some(err) } else { return Ok(resp.snapshot.unwrap()); } } - Err(Error::Other( - format!( - "backoff timeout after {} try, last error: {:?}", - GET_SNAPSHOT_RETRY_TIME, - last_err.unwrap() - ) - .into(), + Err(box_err!( + "backoff timeout after {} try, last error: {:?}", + GET_SNAPSHOT_RETRY_TIME, + last_err.unwrap() )) } @@ -232,29 +228,14 @@ impl, E: KvEngine> ScannerPool { start: Option<&Key>, _checkpoint_ts: TimeStamp, ) -> Result<(Vec<(Key, Lock)>, bool)> { - let (locks, has_remaining) = reader.scan_locks( - start, - None, - |lock| matches!(lock.lock_type, LockType::Put | LockType::Delete), - DEFAULT_SCAN_BATCH_SIZE, - )?; + let (locks, has_remaining) = reader + 
.scan_locks( + start, + None, + |lock| matches!(lock.lock_type, LockType::Put | LockType::Delete), + DEFAULT_SCAN_BATCH_SIZE, + ) + .map_err(|e| Error::Other(box_err!("{:?}", e)))?; Ok((locks, has_remaining)) } - - fn scan_delta(scanner: &mut DeltaScanner) -> Result<(Vec, bool)> { - let mut entries = Vec::with_capacity(DEFAULT_SCAN_BATCH_SIZE); - let mut has_remaining = true; - while entries.len() < entries.capacity() { - match scanner.next_entry()? { - Some(entry) => { - entries.push(entry); - } - None => { - has_remaining = false; - break; - } - } - } - Ok((entries, has_remaining)) - } } diff --git a/components/resolved_ts/tests/failpoints/mod.rs b/components/resolved_ts/tests/failpoints/mod.rs index 808f5ed62ff..64b58e0ed22 100644 --- a/components/resolved_ts/tests/failpoints/mod.rs +++ b/components/resolved_ts/tests/failpoints/mod.rs @@ -2,6 +2,11 @@ #[path = "../mod.rs"] mod testsuite; +use std::{ + sync::{mpsc::channel, Mutex}, + time::Duration, +}; + use futures::executor::block_on; use kvproto::kvrpcpb::*; use pd_client::PdClient; @@ -58,15 +63,14 @@ fn test_report_min_resolved_ts() { fail::cfg("mock_collect_tick_interval", "return(0)").unwrap(); fail::cfg("mock_min_resolved_ts_interval", "return(0)").unwrap(); let mut suite = TestSuite::new(1); - // default config is 1s assert_eq!( suite .cluster .cfg .tikv .raft_store - .report_min_resolved_ts_interval, - ReadableDuration::secs(1) + .pd_report_min_resolved_ts_interval, + ReadableDuration::millis(50) ); let region = suite.cluster.get_region(&[]); let ts1 = suite.cluster.pd_client.get_min_resolved_ts(); @@ -128,3 +132,43 @@ fn test_report_min_resolved_ts_disable() { fail::remove("mock_min_resolved_ts_interval_disable"); suite.stop(); } + +#[test] +fn test_pending_locks_memory_quota_exceeded() { + // Pause scan lock so that locks will be put in pending locks. + fail::cfg("resolved_ts_after_scanner_get_snapshot", "pause").unwrap(); + // Check if memory quota exceeded is triggered. 
+ let (tx, rx) = channel(); + let tx = Mutex::new(tx); + fail::cfg_callback( + "resolved_ts_on_pending_locks_memory_quota_exceeded", + move || { + let sender = tx.lock().unwrap(); + sender.send(()).unwrap(); + }, + ) + .unwrap(); + + let mut suite = TestSuite::new(1); + let region = suite.cluster.get_region(&[]); + + // Must not trigger memory quota exceeded. + rx.recv_timeout(Duration::from_millis(100)).unwrap_err(); + + // Set a small memory quota to trigger memory quota exceeded. + suite.must_change_memory_quota(1, 1); + let (k, v) = (b"k1", b"v"); + let start_ts = block_on(suite.cluster.pd_client.get_tso()).unwrap(); + let mut mutation = Mutation::default(); + mutation.set_op(Op::Put); + mutation.key = k.to_vec(); + mutation.value = v.to_vec(); + suite.must_kv_prewrite(region.id, vec![mutation], k.to_vec(), start_ts, false); + + // Must trigger memory quota exceeded. + rx.recv_timeout(Duration::from_secs(5)).unwrap(); + + fail::remove("resolved_ts_after_scanner_get_snapshot"); + fail::remove("resolved_ts_on_pending_locks_memory_quota_exceeded"); + suite.stop(); +} diff --git a/components/resolved_ts/tests/integrations/mod.rs b/components/resolved_ts/tests/integrations/mod.rs index 7802108b92b..881d0b299f1 100644 --- a/components/resolved_ts/tests/integrations/mod.rs +++ b/components/resolved_ts/tests/integrations/mod.rs @@ -2,15 +2,17 @@ #[path = "../mod.rs"] mod testsuite; -use std::time::Duration; +use std::{sync::mpsc::channel, time::Duration}; use futures::executor::block_on; use kvproto::{kvrpcpb::*, metapb::RegionEpoch}; use pd_client::PdClient; +use resolved_ts::Task; use tempfile::Builder; -use test_raftstore::sleep_ms; +use test_raftstore::{sleep_ms, IsolationFilterFactory}; use test_sst_importer::*; pub use testsuite::*; +use tikv_util::store::new_peer; #[test] fn test_resolved_ts_basic() { @@ -141,3 +143,120 @@ fn test_dynamic_change_advance_ts_interval() { suite.stop(); } + +#[test] +fn test_change_log_memory_quota_exceeded() { + let mut suite = 
TestSuite::new(1); + let region = suite.cluster.get_region(&[]); + + suite.must_get_rts_ge( + region.id, + block_on(suite.cluster.pd_client.get_tso()).unwrap(), + ); + + // Set a small memory quota to trigger memory quota exceeded. + suite.must_change_memory_quota(1, 1); + let (k, v) = (b"k1", b"v"); + let start_ts = block_on(suite.cluster.pd_client.get_tso()).unwrap(); + let mut mutation = Mutation::default(); + mutation.set_op(Op::Put); + mutation.key = k.to_vec(); + mutation.value = v.to_vec(); + suite.must_kv_prewrite(region.id, vec![mutation], k.to_vec(), start_ts, false); + + // Resolved ts should not advance. + let (tx, rx) = channel(); + suite.must_schedule_task( + 1, + Task::GetDiagnosisInfo { + region_id: 1, + log_locks: false, + min_start_ts: u64::MAX, + callback: Box::new(move |res| { + tx.send(res).unwrap(); + }), + }, + ); + let res = rx.recv_timeout(Duration::from_secs(5)).unwrap(); + assert_eq!(res.unwrap().1, 0, "{:?}", res); + + suite.stop(); +} + +#[test] +fn test_scan_log_memory_quota_exceeded() { + let mut suite = TestSuite::new(1); + let region = suite.cluster.get_region(&[]); + + suite.must_get_rts_ge( + region.id, + block_on(suite.cluster.pd_client.get_tso()).unwrap(), + ); + + let (k, v) = (b"k1", b"v"); + let start_ts = block_on(suite.cluster.pd_client.get_tso()).unwrap(); + let mut mutation = Mutation::default(); + mutation.set_op(Op::Put); + mutation.key = k.to_vec(); + mutation.value = v.to_vec(); + suite.must_kv_prewrite(region.id, vec![mutation], k.to_vec(), start_ts, false); + + // Set a small memory quota to trigger memory quota exceeded. + suite.must_change_memory_quota(1, 1); + // Split region + suite.cluster.must_split(®ion, k); + + let r1 = suite.cluster.get_region(&[]); + let r2 = suite.cluster.get_region(k); + let current_ts = block_on(suite.cluster.pd_client.get_tso()).unwrap(); + // Wait for scan log. 
+ sleep_ms(500); + // Resolved ts of region1 should be advanced + suite.must_get_rts_ge(r1.id, current_ts); + + // Resolved ts should not advance. + let (tx, rx) = channel(); + suite.must_schedule_task( + r2.id, + Task::GetDiagnosisInfo { + region_id: r2.id, + log_locks: false, + min_start_ts: u64::MAX, + callback: Box::new(move |res| { + tx.send(res).unwrap(); + }), + }, + ); + let res = rx.recv_timeout(Duration::from_secs(5)).unwrap(); + assert_eq!(res.unwrap().1, 0, "{:?}", res); + + suite.stop(); +} + +// This case checks resolved ts can still be advanced quickly even if some TiKV +// stores are partitioned. +#[test] +fn test_store_partitioned() { + let mut suite = TestSuite::new(3); + let r = suite.cluster.get_region(&[]); + suite.cluster.must_transfer_leader(r.id, new_peer(1, 1)); + suite.must_get_rts_ge(r.id, block_on(suite.cluster.pd_client.get_tso()).unwrap()); + + suite + .cluster + .add_send_filter(IsolationFilterFactory::new(3)); + let tso = block_on(suite.cluster.pd_client.get_tso()).unwrap(); + for _ in 0..50 { + let rts = suite.region_resolved_ts(r.id).unwrap(); + if rts > tso { + if rts.physical() - tso.physical() < 3000 { + break; + } else { + panic!("resolved ts doesn't advance in time") + } + } + sleep_ms(100); + } + + suite.stop(); +} diff --git a/components/resolved_ts/tests/mod.rs b/components/resolved_ts/tests/mod.rs index 4e6226f8935..fc3d5720929 100644 --- a/components/resolved_ts/tests/mod.rs +++ b/components/resolved_ts/tests/mod.rs @@ -4,6 +4,7 @@ use std::{sync::*, time::Duration}; use collections::HashMap; use concurrency_manager::ConcurrencyManager; +use engine_rocks::RocksEngine; use futures::{executor::block_on, stream, SinkExt}; use grpcio::{ChannelBuilder, ClientUnaryReceiver, Environment, Result, WriteFlags}; use kvproto::{ @@ -26,7 +27,7 @@ pub fn init() { } pub struct TestSuite { - pub cluster: Cluster, + pub cluster: Cluster>, pub endpoints: HashMap>, pub obs: HashMap, tikv_cli: HashMap, @@ -44,7 +45,10 @@ impl TestSuite { 
Self::with_cluster(count, cluster) } - pub fn with_cluster(count: usize, mut cluster: Cluster) -> Self { + pub fn with_cluster( + count: usize, + mut cluster: Cluster>, + ) -> Self { init(); let pd_cli = cluster.pd_client.clone(); let mut endpoints = HashMap::default(); @@ -122,8 +126,21 @@ impl TestSuite { ); c }; + self.must_schedule_task(store_id, Task::ChangeConfig { change }); + } + + pub fn must_change_memory_quota(&self, store_id: u64, bytes: u64) { + let change = { + let mut c = std::collections::HashMap::default(); + c.insert("memory_quota".to_owned(), ConfigValue::Size(bytes)); + c + }; + self.must_schedule_task(store_id, Task::ChangeConfig { change }); + } + + pub fn must_schedule_task(&self, store_id: u64, task: Task) { let scheduler = self.endpoints.get(&store_id).unwrap().scheduler(); - scheduler.schedule(Task::ChangeConfig { change }).unwrap(); + scheduler.schedule(task).unwrap(); } pub fn must_kv_prewrite( diff --git a/components/resource_control/src/future.rs b/components/resource_control/src/future.rs index fd98fc9a092..0750a21c574 100644 --- a/components/resource_control/src/future.rs +++ b/components/resource_control/src/future.rs @@ -16,7 +16,7 @@ use tokio_timer::Delay; use crate::{ resource_group::{ResourceConsumeType, ResourceController}, - resource_limiter::ResourceLimiter, + resource_limiter::{ResourceLimiter, ResourceType}, }; const MAX_WAIT_DURATION: Duration = Duration::from_secs(10); @@ -92,7 +92,9 @@ pub struct LimitedFuture { #[pin] post_delay: OptionalFuture>, resource_limiter: Arc, - res: Poll, + // if the future is first polled, we need to let it consume a 0 value + // to compensate the debt of previously finished tasks. 
+ is_first_poll: bool, } impl LimitedFuture { @@ -102,7 +104,7 @@ impl LimitedFuture { pre_delay: None.into(), post_delay: None.into(), resource_limiter, - res: Poll::Pending, + is_first_poll: true, } } } @@ -112,26 +114,50 @@ impl Future for LimitedFuture { fn poll(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll { let mut this = self.project(); - if !this.post_delay.is_done() { - assert!(this.pre_delay.is_done()); + if *this.is_first_poll { + debug_assert!(this.pre_delay.finished && this.post_delay.finished); + *this.is_first_poll = false; + let wait_dur = this + .resource_limiter + .consume(Duration::ZERO, IoBytes::default(), true) + .min(MAX_WAIT_DURATION); + if wait_dur > Duration::ZERO { + *this.pre_delay = Some( + GLOBAL_TIMER_HANDLE + .delay(std::time::Instant::now() + wait_dur) + .compat(), + ) + .into(); + } + } + if !this.post_delay.finished { + assert!(this.pre_delay.finished); std::mem::swap(&mut *this.pre_delay, &mut *this.post_delay); } - if !this.pre_delay.is_done() { + if !this.pre_delay.finished { let res = this.pre_delay.poll(cx); if res.is_pending() { return Poll::Pending; } } - if this.res.is_ready() { - return std::mem::replace(this.res, Poll::Pending); - } - let last_io_bytes = match get_thread_io_bytes_stats() { - Ok(b) => Some(b), - Err(e) => { - warn!("load thread io bytes failed"; "err" => e); - None + // get io stats is very expensive, so we only do so if only io control is + // enabled. 
+ let mut last_io_bytes = None; + if this + .resource_limiter + .get_limiter(ResourceType::Io) + .get_rate_limit() + .is_finite() + { + match get_thread_io_bytes_stats() { + Ok(b) => { + last_io_bytes = Some(b); + } + Err(e) => { + warn!("load thread io bytes failed"; "err" => e); + } } - }; + } let start = Instant::now(); let res = this.f.poll(cx); let dur = start.saturating_elapsed(); @@ -146,8 +172,10 @@ impl Future for LimitedFuture { } else { IoBytes::default() }; - let mut wait_dur = this.resource_limiter.consume(dur, io_bytes); - if wait_dur == Duration::ZERO { + let mut wait_dur = this + .resource_limiter + .consume(dur, io_bytes, res.is_pending()); + if wait_dur == Duration::ZERO || res.is_ready() { return res; } if wait_dur > MAX_WAIT_DURATION { @@ -160,31 +188,24 @@ impl Future for LimitedFuture { .compat(), ) .into(); - if this.post_delay.poll(cx).is_ready() { - return res; - } - *this.res = res; + _ = this.post_delay.poll(cx); Poll::Pending } } /// `OptionalFuture` is similar to futures::OptionFuture, but provide an extra -/// `is_done` method. +/// `finished` flag to determine if the future requires poll. 
#[pin_project] struct OptionalFuture { #[pin] f: Option, - done: bool, + finished: bool, } impl OptionalFuture { fn new(f: Option) -> Self { - let done = f.is_none(); - Self { f, done } - } - - fn is_done(&self) -> bool { - self.done + let finished = f.is_none(); + Self { f, finished } } } @@ -201,7 +222,7 @@ impl Future for OptionalFuture { let this = self.project(); match this.f.as_pin_mut() { Some(x) => x.poll(cx).map(|r| { - *this.done = true; + *this.finished = true; Some(r) }), None => Poll::Ready(None), @@ -263,7 +284,13 @@ mod tests { .name_prefix("test") .build_future_pool(); - let resource_limiter = Arc::new(ResourceLimiter::new("".into(), f64::INFINITY, 1000.0, 0)); + let resource_limiter = Arc::new(ResourceLimiter::new( + "".into(), + f64::INFINITY, + 1000.0, + 0, + true, + )); fn spawn_and_wait(pool: &FuturePool, f: F, limiter: Arc) where @@ -295,7 +322,7 @@ mod tests { let delta = new_stats - stats; let dur = start.saturating_elapsed(); assert_eq!(delta.total_consumed, 150); - assert_eq!(delta.total_wait_dur_us, 150_000); + assert!(delta.total_wait_dur_us >= 140_000 && delta.total_wait_dur_us <= 160_000); assert!(dur >= Duration::from_millis(150) && dur <= Duration::from_millis(160)); // fetch io bytes failed, consumed value is 0. @@ -303,7 +330,10 @@ mod tests { { fail::cfg("failed_to_get_thread_io_bytes_stats", "1*return").unwrap(); spawn_and_wait(&pool, empty(), resource_limiter.clone()); - assert_eq!(resource_limiter.get_limit_statistics(Io), new_stats); + assert_eq!( + resource_limiter.get_limit_statistics(Io).total_consumed, + new_stats.total_consumed + ); fail::remove("failed_to_get_thread_io_bytes_stats"); } } diff --git a/components/resource_control/src/lib.rs b/components/resource_control/src/lib.rs index 6cfd24914a1..917718e8409 100644 --- a/components/resource_control/src/lib.rs +++ b/components/resource_control/src/lib.rs @@ -1,6 +1,7 @@ // Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. 
#![feature(test)] #![feature(local_key_cell_methods)] +#![feature(array_zip)] use std::sync::Arc; @@ -10,9 +11,9 @@ use serde::{Deserialize, Serialize}; mod resource_group; pub use resource_group::{ - ResourceConsumeType, ResourceController, ResourceGroupManager, TaskMetadata, - MIN_PRIORITY_UPDATE_INTERVAL, + ResourceConsumeType, ResourceController, ResourceGroupManager, MIN_PRIORITY_UPDATE_INTERVAL, }; +pub use tikv_util::resource_control::*; mod future; pub use future::{with_resource_limiter, ControlledFuture}; @@ -29,7 +30,9 @@ pub use channel::ResourceMetered; mod resource_limiter; pub use resource_limiter::ResourceLimiter; use tikv_util::worker::Worker; -use worker::{GroupQuotaAdjustWorker, BACKGROUND_LIMIT_ADJUST_DURATION}; +use worker::{ + GroupQuotaAdjustWorker, PriorityLimiterAdjustWorker, BACKGROUND_LIMIT_ADJUST_DURATION, +}; mod metrics; pub mod worker; @@ -66,10 +69,13 @@ pub fn start_periodic_tasks( bg_worker.spawn_async_task(async move { resource_mgr_service_clone.watch_resource_groups().await; }); - // spawn a task to auto adjust background quota limiter. + // spawn a task to auto adjust background quota limiter and priority quota + // limiter. let mut worker = GroupQuotaAdjustWorker::new(mgr.clone(), io_bandwidth); + let mut priority_worker = PriorityLimiterAdjustWorker::new(mgr.clone()); bg_worker.spawn_interval_task(BACKGROUND_LIMIT_ADJUST_DURATION, move || { worker.adjust_quota(); + priority_worker.adjust(); }); // spawn a task to periodically upload resource usage statistics to PD. bg_worker.spawn_async_task(async move { diff --git a/components/resource_control/src/metrics.rs b/components/resource_control/src/metrics.rs index 16338f41c6c..45723063492 100644 --- a/components/resource_control/src/metrics.rs +++ b/components/resource_control/src/metrics.rs @@ -7,19 +7,25 @@ lazy_static! 
{ pub static ref BACKGROUND_QUOTA_LIMIT_VEC: IntGaugeVec = register_int_gauge_vec!( "tikv_resource_control_background_quota_limiter", "The quota limiter of background resource groups per resource type", - &["name", "type"] + &["resource_group", "type"] ) .unwrap(); pub static ref BACKGROUND_RESOURCE_CONSUMPTION: IntCounterVec = register_int_counter_vec!( "tikv_resource_control_background_resource_consumption", "Total resource consumed of background resource groups per resource type", - &["name", "type"] + &["resource_group", "type"] ) .unwrap(); pub static ref BACKGROUND_TASKS_WAIT_DURATION: IntCounterVec = register_int_counter_vec!( "tikv_resource_control_background_task_wait_duration", "Total wait duration of background tasks per resource group", - &["name"] + &["resource_group"] + ) + .unwrap(); + pub static ref PRIORITY_QUOTA_LIMIT_VEC: IntGaugeVec = register_int_gauge_vec!( + "tikv_resource_control_priority_quota_limit", + "The quota limiter for each priority in resource control", + &["priority"] ) .unwrap(); } diff --git a/components/resource_control/src/resource_group.rs b/components/resource_control/src/resource_group.rs index a356d30a7ac..d6933d0a383 100644 --- a/components/resource_control/src/resource_group.rs +++ b/components/resource_control/src/resource_group.rs @@ -1,7 +1,6 @@ // Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. 
use std::{ - borrow::Cow, cell::Cell, cmp::{max, min}, collections::HashSet, @@ -22,7 +21,11 @@ use kvproto::{ resource_manager::{GroupMode, ResourceGroup as PbResourceGroup}, }; use parking_lot::{MappedRwLockReadGuard, RwLock, RwLockReadGuard}; -use tikv_util::{info, time::Instant}; +use tikv_util::{ + info, + resource_control::{TaskMetadata, TaskPriority, DEFAULT_RESOURCE_GROUP_NAME}, + time::Instant, +}; use yatp::queue::priority::TaskPriorityProvider; use crate::{metrics::deregister_metrics, resource_limiter::ResourceLimiter}; @@ -33,8 +36,6 @@ const DEFAULT_PRIORITY_PER_READ_TASK: u64 = 50; const TASK_EXTRA_FACTOR_BY_LEVEL: [u64; 3] = [0, 20, 100]; /// duration to update the minimal priority value of each resource group. pub const MIN_PRIORITY_UPDATE_INTERVAL: Duration = Duration::from_secs(1); -/// default resource group name -const DEFAULT_RESOURCE_GROUP_NAME: &str = "default"; /// default value of max RU quota. const DEFAULT_MAX_RU_QUOTA: u64 = 10_000; /// The maximum RU quota that can be configured. @@ -59,18 +60,33 @@ pub enum ResourceConsumeType { /// ResourceGroupManager manages the metadata of each resource group. pub struct ResourceGroupManager { pub(crate) resource_groups: DashMap, + // the count of all groups, a fast path because call `DashMap::len` is a little slower. + group_count: AtomicU64, registry: RwLock>>, // auto incremental version generator used for mark the background // resource limiter has changed. 
version_generator: AtomicU64, + // the shared resource limiter of each priority + priority_limiters: [Arc; TaskPriority::PRIORITY_COUNT], } impl Default for ResourceGroupManager { fn default() -> Self { + let priority_limiters = TaskPriority::priorities().map(|p| { + Arc::new(ResourceLimiter::new( + p.as_str().to_owned(), + f64::INFINITY, + f64::INFINITY, + 0, + false, + )) + }); let manager = Self { resource_groups: Default::default(), + group_count: AtomicU64::new(0), registry: Default::default(), version_generator: AtomicU64::new(0), + priority_limiters, }; // init the default resource group by default. @@ -90,6 +106,11 @@ impl Default for ResourceGroupManager { } impl ResourceGroupManager { + #[inline] + pub fn get_group_count(&self) -> u64 { + self.group_count.load(Ordering::Relaxed) + } + fn get_ru_setting(rg: &PbResourceGroup, is_read: bool) -> u64 { match (rg.get_mode(), is_read) { // RU mode, read and write use the same setting. @@ -129,8 +150,13 @@ impl ResourceGroupManager { .and_then(|g| g.limiter.clone()); let limiter = self.build_resource_limiter(&rg, prev_limiter); - self.resource_groups - .insert(group_name, ResourceGroup::new(rg, limiter)); + if self + .resource_groups + .insert(group_name, ResourceGroup::new(rg, limiter)) + .is_none() + { + self.group_count.fetch_add(1, Ordering::Relaxed); + } } fn build_resource_limiter( @@ -146,6 +172,7 @@ impl ResourceGroupManager { f64::INFINITY, f64::INFINITY, version, + true, ))) }) } else { @@ -161,6 +188,7 @@ impl ResourceGroupManager { if self.resource_groups.remove(&group_name).is_some() { deregister_metrics(name); info!("remove resource group"; "name"=> name); + self.group_count.fetch_sub(1, Ordering::Relaxed); } } @@ -184,6 +212,8 @@ impl ResourceGroupManager { controller.remove_resource_group(name.as_bytes()); } }); + self.group_count + .fetch_sub(removed_names.len() as u64, Ordering::Relaxed); } } @@ -234,20 +264,86 @@ impl ResourceGroupManager { } } + // only enable priority quota limiter when there 
is at least 1 user-defined + // resource group. + #[inline] + fn enable_priority_limiter(&self) -> bool { + self.get_group_count() > 1 + } + + // Always return the background resource limiter if any; + // Only return the foregroup limiter when priority is enabled. pub fn get_resource_limiter( &self, rg: &str, request_source: &str, + override_priority: u64, + ) -> Option> { + let (limiter, group_priority) = + self.get_background_resource_limiter_with_priority(rg, request_source); + if limiter.is_some() { + return limiter; + } + + // if there is only 1 resource group, priority quota limiter is useless so just + // return None for better performance. + if !self.enable_priority_limiter() { + return None; + } + + // request priority has higher priority, 0 means priority is not set. + let mut task_priority = override_priority as u32; + if task_priority == 0 { + task_priority = group_priority; + } + Some(self.priority_limiters[TaskPriority::from(task_priority) as usize].clone()) + } + + // return a ResourceLimiter for background tasks only. 
+ pub fn get_background_resource_limiter( + &self, + rg: &str, + request_source: &str, ) -> Option> { + self.get_background_resource_limiter_with_priority(rg, request_source) + .0 + } + + fn get_background_resource_limiter_with_priority( + &self, + rg: &str, + request_source: &str, + ) -> (Option>, u32) { + fail_point!("only_check_source_task_name", |name| { + assert_eq!(&name.unwrap(), request_source); + (None, 8) + }); + let mut group_priority = None; if let Some(group) = self.resource_groups.get(rg) { + group_priority = Some(group.group.priority); if !group.fallback_default { - return group.get_resource_limiter(request_source); + return ( + group.get_background_resource_limiter(request_source), + group.group.priority, + ); } } - self.resource_groups + let default_group = self + .resource_groups .get(DEFAULT_RESOURCE_GROUP_NAME) - .and_then(|g| g.get_resource_limiter(request_source)) + .unwrap(); + ( + default_group.get_background_resource_limiter(request_source), + group_priority.unwrap_or(default_group.group.priority), + ) + } + + #[inline] + pub fn get_priority_resource_limiters( + &self, + ) -> [Arc; TaskPriority::PRIORITY_COUNT] { + self.priority_limiters.clone() } } @@ -282,7 +378,10 @@ impl ResourceGroup { .get_fill_rate() } - fn get_resource_limiter(&self, request_source: &str) -> Option> { + fn get_background_resource_limiter( + &self, + request_source: &str, + ) -> Option> { self.limiter.as_ref().and_then(|limiter| { // the source task name is the last part of `request_source` separated by "_" // the request_source is @@ -383,7 +482,9 @@ impl ResourceController { let mut max_ru_quota = self.max_ru_quota.lock().unwrap(); // skip to adjust max ru if it is the "default" group and the ru config eq // MAX_RU_QUOTA - if ru_quota > *max_ru_quota && (name != b"default" || ru_quota < MAX_RU_QUOTA) { + if ru_quota > *max_ru_quota + && (name != DEFAULT_RESOURCE_GROUP_NAME.as_bytes() || ru_quota < MAX_RU_QUOTA) + { *max_ru_quota = ru_quota; // adjust all group 
weight because the current value is too small. self.adjust_all_resource_group_factors(ru_quota); @@ -526,92 +627,9 @@ impl ResourceController { } } -const OVERRIDE_PRIORITY_MASK: u8 = 0b1000_0000; -const RESOURCE_GROUP_NAME_MASK: u8 = 0b0100_0000; - -#[derive(Clone, Default)] -pub struct TaskMetadata<'a> { - // The first byte is a bit map to indicate which field exists, - // then append override priority if nonzero, - // then append resource group name if not default - metadata: Cow<'a, [u8]>, -} - -impl<'a> TaskMetadata<'a> { - pub fn deep_clone(&self) -> TaskMetadata<'static> { - TaskMetadata { - metadata: Cow::Owned(self.metadata.to_vec()), - } - } - - pub fn from_ctx(ctx: &ResourceControlContext) -> Self { - let mut mask = 0; - let mut buf = vec![]; - if ctx.override_priority != 0 { - mask |= OVERRIDE_PRIORITY_MASK; - } - if !ctx.resource_group_name.is_empty() - && ctx.resource_group_name != DEFAULT_RESOURCE_GROUP_NAME - { - mask |= RESOURCE_GROUP_NAME_MASK; - } - if mask == 0 { - // if all are default value, no need to write anything to save copy cost - return Self { - metadata: Cow::Owned(buf), - }; - } - buf.push(mask); - if mask & OVERRIDE_PRIORITY_MASK != 0 { - buf.extend_from_slice(&(ctx.override_priority as u32).to_ne_bytes()); - } - if mask & RESOURCE_GROUP_NAME_MASK != 0 { - buf.extend_from_slice(ctx.resource_group_name.as_bytes()); - } - Self { - metadata: Cow::Owned(buf), - } - } - - fn from_bytes(bytes: &'a [u8]) -> Self { - Self { - metadata: Cow::Borrowed(bytes), - } - } - - pub fn to_vec(self) -> Vec { - self.metadata.into_owned() - } - - fn override_priority(&self) -> u32 { - if self.metadata.is_empty() { - return 0; - } - if self.metadata[0] & OVERRIDE_PRIORITY_MASK == 0 { - return 0; - } - u32::from_ne_bytes(self.metadata[1..5].try_into().unwrap()) - } - - pub fn group_name(&self) -> &[u8] { - if self.metadata.is_empty() { - return DEFAULT_RESOURCE_GROUP_NAME.as_bytes(); - } - if self.metadata[0] & RESOURCE_GROUP_NAME_MASK == 0 { - return 
DEFAULT_RESOURCE_GROUP_NAME.as_bytes(); - } - let start = if self.metadata[0] & OVERRIDE_PRIORITY_MASK != 0 { - 5 - } else { - 1 - }; - &self.metadata[start..] - } -} - impl TaskPriorityProvider for ResourceController { fn priority_of(&self, extras: &yatp::queue::Extras) -> u64 { - let metadata = TaskMetadata::from_bytes(extras.metadata()); + let metadata = TaskMetadata::from(extras.metadata()); self.resource_group(metadata.group_name()).get_priority( extras.current_level() as usize, if metadata.override_priority() == 0 { @@ -867,6 +885,35 @@ pub(crate) mod tests { ); } + #[test] + fn test_resource_group_crud() { + let resource_manager = ResourceGroupManager::default(); + assert_eq!(resource_manager.get_group_count(), 1); + + let group1 = new_resource_group_ru("test1".into(), 100, HIGH_PRIORITY); + resource_manager.add_resource_group(group1); + assert_eq!(resource_manager.get_group_count(), 2); + + let group2 = new_resource_group_ru("test2".into(), 200, LOW_PRIORITY); + resource_manager.add_resource_group(group2); + assert_eq!(resource_manager.get_group_count(), 3); + + let group1 = new_resource_group_ru("test1".into(), 150, HIGH_PRIORITY); + resource_manager.add_resource_group(group1.clone()); + assert_eq!(resource_manager.get_group_count(), 3); + assert_eq!( + resource_manager.get_resource_group("test1").unwrap().group, + group1 + ); + + resource_manager.remove_resource_group("test2"); + assert!(resource_manager.get_resource_group("test2").is_none()); + assert_eq!(resource_manager.get_group_count(), 2); + + resource_manager.remove_resource_group("test2"); + assert_eq!(resource_manager.get_group_count(), 2); + } + #[test] fn test_resource_group_priority() { let resource_manager = ResourceGroupManager::default(); @@ -1137,28 +1184,94 @@ pub(crate) mod tests { } #[test] - fn test_task_metadata() { - let cases = [ - ("default", 0u32), - ("default", 6u32), - ("test", 0u32), - ("test", 15u32), - ]; - - let metadata = 
TaskMetadata::from_ctx(&ResourceControlContext::default()); - assert_eq!(metadata.group_name(), b"default"); - for (group_name, priority) in cases { - let metadata = TaskMetadata::from_ctx(&ResourceControlContext { - resource_group_name: group_name.to_string(), - override_priority: priority as u64, - ..Default::default() - }); - assert_eq!(metadata.override_priority(), priority); - assert_eq!(metadata.group_name(), group_name.as_bytes()); - let vec = metadata.to_vec(); - let metadata1 = TaskMetadata::from_bytes(&vec); - assert_eq!(metadata1.override_priority(), priority); - assert_eq!(metadata1.group_name(), group_name.as_bytes()); - } + fn test_get_resource_limiter() { + let mgr = ResourceGroupManager::default(); + + let default_group = new_background_resource_group_ru( + "default".into(), + 200, + MEDIUM_PRIORITY, + vec!["br".into(), "stats".into()], + ); + mgr.add_resource_group(default_group); + let default_limiter = mgr + .get_resource_group("default") + .unwrap() + .limiter + .clone() + .unwrap(); + + assert!(mgr.get_resource_limiter("default", "query", 0).is_none()); + assert!( + mgr.get_resource_limiter("default", "query", HIGH_PRIORITY as u64) + .is_none() + ); + + let group1 = new_resource_group("test1".into(), true, 100, 100, HIGH_PRIORITY); + mgr.add_resource_group(group1); + + let bg_group = new_background_resource_group_ru( + "bg".into(), + 50, + LOW_PRIORITY, + vec!["ddl".into(), "stats".into()], + ); + mgr.add_resource_group(bg_group); + let bg_limiter = mgr + .get_resource_group("bg") + .unwrap() + .limiter + .clone() + .unwrap(); + + assert!( + mgr.get_background_resource_limiter("test1", "ddl") + .is_none() + ); + assert!(Arc::ptr_eq( + &mgr.get_background_resource_limiter("test1", "stats") + .unwrap(), + &default_limiter + )); + + assert!(Arc::ptr_eq( + &mgr.get_background_resource_limiter("bg", "stats").unwrap(), + &bg_limiter + )); + assert!(mgr.get_background_resource_limiter("bg", "br").is_none()); + assert!( + 
mgr.get_background_resource_limiter("bg", "invalid") + .is_none() + ); + + assert!(Arc::ptr_eq( + &mgr.get_background_resource_limiter("unknown", "stats") + .unwrap(), + &default_limiter + )); + + assert!(Arc::ptr_eq( + &mgr.get_resource_limiter("test1", "stats", 0).unwrap(), + &default_limiter + )); + assert!(Arc::ptr_eq( + &mgr.get_resource_limiter("test1", "query", 0).unwrap(), + &mgr.priority_limiters[0] + )); + assert!(Arc::ptr_eq( + &mgr.get_resource_limiter("test1", "query", LOW_PRIORITY as u64) + .unwrap(), + &mgr.priority_limiters[2] + )); + + assert!(Arc::ptr_eq( + &mgr.get_resource_limiter("default", "query", LOW_PRIORITY as u64) + .unwrap(), + &mgr.priority_limiters[2] + )); + assert!(Arc::ptr_eq( + &mgr.get_resource_limiter("unknown", "query", 0).unwrap(), + &mgr.priority_limiters[1] + )); } } diff --git a/components/resource_control/src/resource_limiter.rs b/components/resource_control/src/resource_limiter.rs index 8898b4eba23..ab2144f18cc 100644 --- a/components/resource_control/src/resource_limiter.rs +++ b/components/resource_control/src/resource_limiter.rs @@ -39,6 +39,8 @@ pub struct ResourceLimiter { name: String, version: u64, limiters: [QuotaLimiter; ResourceType::COUNT], + // whether the resource limiter is a background limiter or priority limiter. 
+ is_background: bool, } impl std::fmt::Debug for ResourceLimiter { @@ -48,29 +50,43 @@ impl std::fmt::Debug for ResourceLimiter { } impl ResourceLimiter { - pub fn new(name: String, cpu_limit: f64, io_limit: f64, version: u64) -> Self { + pub fn new( + name: String, + cpu_limit: f64, + io_limit: f64, + version: u64, + is_background: bool, + ) -> Self { let cpu_limiter = QuotaLimiter::new(cpu_limit); let io_limiter = QuotaLimiter::new(io_limit); Self { name, version, limiters: [cpu_limiter, io_limiter], + is_background, } } - pub fn consume(&self, cpu_time: Duration, io_bytes: IoBytes) -> Duration { + pub fn is_background(&self) -> bool { + self.is_background + } + + pub fn consume(&self, cpu_time: Duration, io_bytes: IoBytes, wait: bool) -> Duration { let cpu_dur = - self.limiters[ResourceType::Cpu as usize].consume(cpu_time.as_micros() as u64); - let io_dur = self.limiters[ResourceType::Io as usize].consume_io(io_bytes); + self.limiters[ResourceType::Cpu as usize].consume(cpu_time.as_micros() as u64, wait); + let io_dur = self.limiters[ResourceType::Io as usize].consume_io(io_bytes, wait); let wait_dur = cpu_dur.max(io_dur); - BACKGROUND_TASKS_WAIT_DURATION - .with_label_values(&[&self.name]) - .inc_by(wait_dur.as_micros() as u64); + if wait_dur > Duration::ZERO { + BACKGROUND_TASKS_WAIT_DURATION + .with_label_values(&[&self.name]) + .inc_by(wait_dur.as_micros() as u64); + } + wait_dur } pub async fn async_consume(&self, cpu_time: Duration, io_bytes: IoBytes) -> Duration { - let dur = self.consume(cpu_time, io_bytes); + let dur = self.consume(cpu_time, io_bytes, true); if !dur.is_zero() { _ = GLOBAL_TIMER_HANDLE .delay(Instant::now() + dur) @@ -86,7 +102,7 @@ impl ResourceLimiter { } pub(crate) fn get_limit_statistics(&self, ty: ResourceType) -> GroupStatistics { - let (total_consumed, total_wait_dur_us, read_consumed, write_consumed) = + let (total_consumed, total_wait_dur_us, read_consumed, write_consumed, request_count) = self.limiters[ty as 
usize].get_statistics(); GroupStatistics { version: self.version, @@ -94,6 +110,7 @@ impl ResourceLimiter { total_wait_dur_us, read_consumed, write_consumed, + request_count, } } } @@ -104,6 +121,7 @@ pub(crate) struct QuotaLimiter { total_wait_dur_us: AtomicU64, read_bytes: AtomicU64, write_bytes: AtomicU64, + req_count: AtomicU64, } impl QuotaLimiter { @@ -113,6 +131,7 @@ impl QuotaLimiter { total_wait_dur_us: AtomicU64::new(0), read_bytes: AtomicU64::new(0), write_bytes: AtomicU64::new(0), + req_count: AtomicU64::new(0), } } @@ -128,40 +147,47 @@ impl QuotaLimiter { self.limiter.set_speed_limit(limit); } - fn get_statistics(&self) -> (u64, u64, u64, u64) { + fn get_statistics(&self) -> (u64, u64, u64, u64, u64) { ( self.limiter.total_bytes_consumed() as u64, self.total_wait_dur_us.load(Ordering::Relaxed), self.read_bytes.load(Ordering::Relaxed), self.write_bytes.load(Ordering::Relaxed), + self.req_count.load(Ordering::Relaxed), ) } - fn consume(&self, value: u64) -> Duration { - if value == 0 { + fn consume(&self, value: u64, wait: bool) -> Duration { + if value == 0 && self.limiter.speed_limit().is_infinite() { return Duration::ZERO; } - let dur = self.limiter.consume_duration(value as usize); - if dur != Duration::ZERO { + let mut dur = self.limiter.consume_duration(value as usize); + if !wait { + dur = Duration::ZERO; + } else if dur != Duration::ZERO { self.total_wait_dur_us .fetch_add(dur.as_micros() as u64, Ordering::Relaxed); } + self.req_count.fetch_add(1, Ordering::Relaxed); dur } - fn consume_io(&self, value: IoBytes) -> Duration { + fn consume_io(&self, value: IoBytes, wait: bool) -> Duration { self.read_bytes.fetch_add(value.read, Ordering::Relaxed); self.write_bytes.fetch_add(value.write, Ordering::Relaxed); let value = value.read + value.write; - if value == 0 { + if value == 0 && self.limiter.speed_limit().is_infinite() { return Duration::ZERO; } - let dur = self.limiter.consume_duration(value as usize); - if dur != Duration::ZERO { + let mut dur 
= self.limiter.consume_duration(value as usize); + if !wait { + dur = Duration::ZERO; + } else if dur != Duration::ZERO { self.total_wait_dur_us .fetch_add(dur.as_micros() as u64, Ordering::Relaxed); } + self.req_count.fetch_add(1, Ordering::Relaxed); dur } } @@ -173,6 +199,7 @@ pub struct GroupStatistics { pub total_wait_dur_us: u64, pub read_consumed: u64, pub write_consumed: u64, + pub request_count: u64, } impl std::ops::Sub for GroupStatistics { @@ -184,6 +211,7 @@ impl std::ops::Sub for GroupStatistics { total_wait_dur_us: self.total_wait_dur_us.saturating_sub(rhs.total_wait_dur_us), read_consumed: self.read_consumed.saturating_sub(rhs.read_consumed), write_consumed: self.write_consumed.saturating_sub(rhs.write_consumed), + request_count: self.request_count.saturating_sub(rhs.request_count), } } } @@ -198,6 +226,7 @@ impl std::ops::Div for GroupStatistics { total_wait_dur_us: (self.total_wait_dur_us as f64 / rhs) as u64, read_consumed: (self.read_consumed as f64 / rhs) as u64, write_consumed: (self.write_consumed as f64 / rhs) as u64, + request_count: (self.request_count as f64 / rhs) as u64, } } } diff --git a/components/resource_control/src/service.rs b/components/resource_control/src/service.rs index 5ecac9d74c4..26652cda00e 100644 --- a/components/resource_control/src/service.rs +++ b/components/resource_control/src/service.rs @@ -565,13 +565,17 @@ pub mod tests { s_clone.report_ru_metrics().await; }); // Mock consume. - let bg_limiter = s.manager.get_resource_limiter("background", "br").unwrap(); + let bg_limiter = s + .manager + .get_background_resource_limiter("background", "br") + .unwrap(); bg_limiter.consume( Duration::from_secs(2), IoBytes { read: 1000, write: 1000, }, + true, ); // Wait for report ru metrics. 
std::thread::sleep(Duration::from_millis(100)); @@ -584,7 +588,7 @@ pub mod tests { s.manager.add_resource_group(background_group); let new_bg_limiter = s .manager - .get_resource_limiter("background", "lightning") + .get_background_resource_limiter("background", "lightning") .unwrap(); new_bg_limiter.consume( Duration::from_secs(5), @@ -592,6 +596,7 @@ pub mod tests { read: 2000, write: 2000, }, + true, ); // Wait for report ru metrics. std::thread::sleep(Duration::from_millis(100)); diff --git a/components/resource_control/src/worker.rs b/components/resource_control/src/worker.rs index deb1b2e44de..b90787914d6 100644 --- a/components/resource_control/src/worker.rs +++ b/components/resource_control/src/worker.rs @@ -9,11 +9,15 @@ use std::{ }; use file_system::{fetch_io_bytes, IoBytes, IoType}; +use prometheus::Histogram; use strum::EnumCount; use tikv_util::{ + debug, + resource_control::TaskPriority, sys::{cpu_time::ProcessStat, SysQuota}, time::Instant, warn, + yatp_pool::metrics::YATP_POOL_SCHEDULE_WAIT_DURATION_VEC, }; use crate::{ @@ -25,6 +29,10 @@ use crate::{ pub const BACKGROUND_LIMIT_ADJUST_DURATION: Duration = Duration::from_secs(10); const MICROS_PER_SEC: f64 = 1_000_000.0; +// the minimal schedule wait duration due to the overhead of queue. +// We should exclude this cause when calculate the estimated total wait +// duration. 
+const MINIMAL_SCHEDULE_WAIT_SECS: f64 = 0.000_005; //5us pub struct ResourceUsageStats { total_quota: f64, @@ -39,7 +47,7 @@ pub struct SysQuotaGetter { process_stat: ProcessStat, prev_io_stats: [IoBytes; IoType::COUNT], prev_io_ts: Instant, - io_bandwidth: u64, + io_bandwidth: f64, } impl ResourceStatsProvider for SysQuotaGetter { @@ -55,7 +63,7 @@ impl ResourceStatsProvider for SysQuotaGetter { } ResourceType::Io => { let mut stats = ResourceUsageStats { - total_quota: self.io_bandwidth as f64, + total_quota: self.io_bandwidth, current_used: 0.0, }; let now = Instant::now_coarse(); @@ -97,7 +105,7 @@ impl GroupQuotaAdjustWorker { process_stat: ProcessStat::cur_proc_stat().unwrap(), prev_io_stats: [IoBytes::default(); IoType::COUNT], prev_io_ts: Instant::now_coarse(), - io_bandwidth, + io_bandwidth: io_bandwidth as f64, }; Self::with_quota_getter(resource_ctl, resource_quota_getter) } @@ -295,6 +303,240 @@ struct GroupStats { expect_cost_rate: f64, } +/// PriorityLimiterAdjustWorker automically adjust the quota of each priority +/// limiter based on the statistics data during a certain period of time. +/// In general, caller should call this function in a fixed interval. 
+pub struct PriorityLimiterAdjustWorker { + resource_ctl: Arc, + trackers: [PriorityLimiterStatsTracker; TaskPriority::PRIORITY_COUNT], + resource_quota_getter: R, + last_adjust_time: Instant, + is_last_low_cpu: bool, + is_last_single_group: bool, +} + +impl PriorityLimiterAdjustWorker { + pub fn new(resource_ctl: Arc) -> Self { + let resource_quota_getter = SysQuotaGetter { + process_stat: ProcessStat::cur_proc_stat().unwrap(), + prev_io_stats: [IoBytes::default(); IoType::COUNT], + prev_io_ts: Instant::now_coarse(), + io_bandwidth: f64::INFINITY, + }; + Self::with_quota_getter(resource_ctl, resource_quota_getter) + } +} + +impl PriorityLimiterAdjustWorker { + fn with_quota_getter( + resource_ctl: Arc, + resource_quota_getter: R, + ) -> Self { + let trackers = resource_ctl + .get_priority_resource_limiters() + .zip(TaskPriority::priorities()) + .map(|(l, p)| PriorityLimiterStatsTracker::new(l, p.as_str())); + Self { + resource_ctl, + trackers, + resource_quota_getter, + last_adjust_time: Instant::now_coarse(), + is_last_low_cpu: true, + is_last_single_group: true, + } + } + pub fn adjust(&mut self) { + let now = Instant::now_coarse(); + let dur = now.saturating_duration_since(self.last_adjust_time); + if dur < Duration::from_secs(1) { + warn!("adjust duration too small, skip adjustment."; "dur" => ?dur); + return; + } + self.last_adjust_time = now; + + // fast path for only the default resource group which means resource + // control is not used at all. 
+ let group_count = self.resource_ctl.get_group_count(); + if group_count == 1 { + if self.is_last_single_group { + return; + } + self.is_last_single_group = true; + self.trackers.iter().skip(1).for_each(|t| { + t.limiter + .get_limiter(ResourceType::Cpu) + .set_rate_limit(f64::INFINITY) + }); + return; + } + self.is_last_single_group = false; + + let stats: [_; TaskPriority::PRIORITY_COUNT] = + array::from_fn(|i| self.trackers[i].get_and_update_last_stats(dur.as_secs_f64())); + + let process_cpu_stats = match self + .resource_quota_getter + .get_current_stats(ResourceType::Cpu) + { + Ok(s) => s, + Err(e) => { + warn!("get process total cpu failed; skip adjusment."; "err" => ?e); + return; + } + }; + + if process_cpu_stats.current_used < process_cpu_stats.total_quota * 0.5 { + if self.is_last_low_cpu { + return; + } + self.is_last_low_cpu = true; + self.trackers.iter().skip(1).for_each(|t| { + t.limiter + .get_limiter(ResourceType::Cpu) + .set_rate_limit(f64::INFINITY); + // 0 represent infinity + PRIORITY_QUOTA_LIMIT_VEC + .get_metric_with_label_values(&[t.priority]) + .unwrap() + .set(0); + }); + return; + } + self.is_last_low_cpu = false; + + let total_reqs: u64 = stats.iter().map(|s| s.req_count).sum(); + let max_reqs = stats.iter().map(|s| s.req_count).max().unwrap(); + // there is only 1 active priority, do not restrict. 
+ if total_reqs * 99 / 100 <= max_reqs { + self.trackers + .iter() + .skip(1) + .for_each(|t: &PriorityLimiterStatsTracker| { + t.limiter + .get_limiter(ResourceType::Cpu) + .set_rate_limit(f64::INFINITY) + }); + return; + } + + let cpu_duration: [_; TaskPriority::PRIORITY_COUNT] = array::from_fn(|i| stats[i].cpu_secs); + let real_cpu_total: f64 = cpu_duration.iter().sum(); + let expect_pool_cpu_total = real_cpu_total * (process_cpu_stats.total_quota * 0.95) + / process_cpu_stats.current_used; + let mut limits = [0.0; 2]; + let level_expected: [_; TaskPriority::PRIORITY_COUNT] = + array::from_fn(|i| stats[i].cpu_secs + stats[i].wait_secs); + // substract the cpu time usage for priority high. + let mut expect_cpu_time_total = expect_pool_cpu_total - level_expected[0]; + + // still reserve a minimal cpu quota + let minimal_quota = process_cpu_stats.total_quota / MICROS_PER_SEC * 0.05; + for i in 1..self.trackers.len() { + if expect_cpu_time_total < minimal_quota { + expect_cpu_time_total = minimal_quota; + } + let limit = expect_cpu_time_total * MICROS_PER_SEC; + self.trackers[i] + .limiter + .get_limiter(ResourceType::Cpu) + .set_rate_limit(limit); + PRIORITY_QUOTA_LIMIT_VEC + .get_metric_with_label_values(&[self.trackers[i].priority]) + .unwrap() + .set(limit as i64); + limits[i - 1] = limit; + expect_cpu_time_total -= level_expected[i]; + } + debug!("adjsut cpu limiter by priority"; "cpu_quota" => process_cpu_stats.total_quota, + "process_cpu" => process_cpu_stats.current_used, "expected_cpu" => ?level_expected, + "cpu_costs" => ?cpu_duration, "limits" => ?limits, + "limit_cpu_total" => expect_pool_cpu_total, "pool_cpu_cost" => real_cpu_total); + } +} + +#[derive(Debug)] +struct LimiterStats { + // QuotaLimiter consumed cpu secs in total + cpu_secs: f64, + // QuotaLimiter waited secs in total. + wait_secs: f64, + // the total number of tasks that are scheduled. 
+ req_count: u64, +} + +struct HistogramTracker { + metrics: Histogram, + last_sum: f64, + last_count: u64, +} + +impl HistogramTracker { + fn new(metrics: Histogram) -> Self { + let last_sum = metrics.get_sample_sum(); + let last_count = metrics.get_sample_count(); + Self { + metrics, + last_sum, + last_count, + } + } + + fn get_and_upate_statistics(&mut self) -> (f64, u64) { + let cur_sum = self.metrics.get_sample_sum(); + let cur_count = self.metrics.get_sample_count(); + let res = (cur_sum - self.last_sum, cur_count - self.last_count); + self.last_sum = cur_sum; + self.last_count = cur_count; + res + } +} + +struct PriorityLimiterStatsTracker { + priority: &'static str, + limiter: Arc, + last_stats: GroupStatistics, + // unified-read-pool and schedule-worker-pool wait duration metrics. + task_wait_dur_trakcers: [HistogramTracker; 2], +} + +impl PriorityLimiterStatsTracker { + fn new(limiter: Arc, priority: &'static str) -> Self { + let task_wait_dur_trakcers = + ["unified-read-pool", "sched-worker-priority"].map(|pool_name| { + HistogramTracker::new( + YATP_POOL_SCHEDULE_WAIT_DURATION_VEC + .get_metric_with_label_values(&[pool_name, priority]) + .unwrap(), + ) + }); + let last_stats = limiter.get_limit_statistics(ResourceType::Cpu); + Self { + priority, + limiter, + last_stats, + task_wait_dur_trakcers, + } + } + + fn get_and_update_last_stats(&mut self, dur_secs: f64) -> LimiterStats { + let cur_stats = self.limiter.get_limit_statistics(ResourceType::Cpu); + let stats_delta = (cur_stats - self.last_stats) / dur_secs; + self.last_stats = cur_stats; + let wait_stats: [_; 2] = + array::from_fn(|i| self.task_wait_dur_trakcers[i].get_and_upate_statistics()); + let schedule_wait_dur_secs = wait_stats.iter().map(|s| s.0).sum::() / dur_secs; + let expected_wait_dur_secs = stats_delta.request_count as f64 * MINIMAL_SCHEDULE_WAIT_SECS; + let normed_schedule_wait_dur_secs = + (schedule_wait_dur_secs - expected_wait_dur_secs).max(0.0); + LimiterStats { + cpu_secs: 
stats_delta.total_consumed as f64 / MICROS_PER_SEC, + wait_secs: stats_delta.total_wait_dur_us as f64 / MICROS_PER_SEC + + normed_schedule_wait_dur_secs, + req_count: stats_delta.request_count, + } + } +} + #[cfg(test)] mod tests { use std::time::Duration; @@ -340,7 +582,11 @@ mod tests { let resource_ctl = Arc::new(ResourceGroupManager::default()); let rg1 = new_resource_group_ru("test".into(), 1000, 14); resource_ctl.add_resource_group(rg1); - assert!(resource_ctl.get_resource_limiter("test", "br").is_none()); + assert!( + resource_ctl + .get_background_resource_limiter("test", "br") + .is_none() + ); let test_provider = TestResourceStatsProvider::new(8.0, 10000.0); let mut worker = @@ -351,10 +597,12 @@ mod tests { resource_ctl.add_resource_group(default_bg); assert!( resource_ctl - .get_resource_limiter("default", "lightning") + .get_background_resource_limiter("default", "lightning") .is_none() ); - let limiter = resource_ctl.get_resource_limiter("default", "br").unwrap(); + let limiter = resource_ctl + .get_background_resource_limiter("default", "br") + .unwrap(); assert!( limiter .get_limiter(ResourceType::Cpu) @@ -452,6 +700,7 @@ mod tests { read: 1000, write: 1000, }, + true, ); worker.adjust_quota(); check_limiter( @@ -481,6 +730,7 @@ mod tests { read: 1000, write: 1000, }, + true, ); worker.adjust_quota(); check_limiter( @@ -499,6 +749,7 @@ mod tests { read: 5000, write: 5000, }, + true, ); worker.adjust_quota(); check_limiter( @@ -513,13 +764,15 @@ mod tests { let default = new_background_resource_group_ru("default".into(), 2000, 8, vec!["br".into()]); resource_ctl.add_resource_group(default); - let new_limiter = resource_ctl.get_resource_limiter("default", "br").unwrap(); + let new_limiter = resource_ctl + .get_background_resource_limiter("default", "br") + .unwrap(); assert_eq!(&*new_limiter as *const _, &*limiter as *const _); let bg = new_background_resource_group_ru("background".into(), 1000, 15, vec!["br".into()]); 
resource_ctl.add_resource_group(bg); let bg_limiter = resource_ctl - .get_resource_limiter("background", "br") + .get_background_resource_limiter("background", "br") .unwrap(); reset_quota(&mut worker, 5.0, 7000.0, Duration::from_secs(1)); @@ -548,6 +801,7 @@ mod tests { read: 600, write: 600, }, + true, ); bg_limiter.consume( Duration::from_millis(1800), @@ -555,6 +809,7 @@ mod tests { read: 900, write: 900, }, + true, ); worker.adjust_quota(); check_limiter( @@ -581,7 +836,7 @@ mod tests { new_background_resource_group_ru("background".into(), 1000, 15, vec!["br".into()]); resource_ctl.add_resource_group(new_bg); let new_bg_limiter = resource_ctl - .get_resource_limiter("background", "br") + .get_background_resource_limiter("background", "br") .unwrap(); assert_ne!(&*bg_limiter as *const _, &*new_bg_limiter as *const _); assert!( @@ -623,6 +878,7 @@ mod tests { read: 600, write: 600, }, + true, ); new_bg_limiter.consume( Duration::from_millis(1800), @@ -630,6 +886,7 @@ mod tests { read: 900, write: 900, }, + true, ); worker.adjust_quota(); @@ -650,4 +907,118 @@ mod tests { }, ); } + + #[test] + fn test_adjust_priority_resource_limiter() { + let resource_ctl = Arc::new(ResourceGroupManager::default()); + let priority_limiters = resource_ctl.get_priority_resource_limiters(); + let test_provider = TestResourceStatsProvider::new(8.0, f64::INFINITY); + let mut worker = + PriorityLimiterAdjustWorker::with_quota_getter(resource_ctl.clone(), test_provider); + + let reset_quota = |worker: &mut PriorityLimiterAdjustWorker, + cpu: f64| { + worker.resource_quota_getter.cpu_used = cpu; + worker.last_adjust_time = Instant::now_coarse() - Duration::from_secs(10); + priority_limiters[1] + .get_limiter(ResourceType::Cpu) + .set_rate_limit(f64::INFINITY); + priority_limiters[2] + .get_limiter(ResourceType::Cpu) + .set_rate_limit(f64::INFINITY); + }; + + fn check(val: f64, expected: f64) { + assert!( + (val.is_infinite() && expected.is_infinite()) + || (expected * 0.99 < val && val 
< expected * 1.01), + "actual: {}, expected: {}", + val, + expected + ); + } + + let check_limiter = |high: f64, medium: f64, low: f64| { + check( + priority_limiters[0] + .get_limiter(ResourceType::Cpu) + .get_rate_limit(), + high * MICROS_PER_SEC, + ); + check( + priority_limiters[1] + .get_limiter(ResourceType::Cpu) + .get_rate_limit(), + medium * MICROS_PER_SEC, + ); + check( + priority_limiters[2] + .get_limiter(ResourceType::Cpu) + .get_rate_limit(), + low * MICROS_PER_SEC, + ); + }; + + // only default group, always return infinity. + reset_quota(&mut worker, 6.4); + priority_limiters[1].consume(Duration::from_secs(50), IoBytes::default(), true); + worker.adjust(); + check_limiter(f64::INFINITY, f64::INFINITY, f64::INFINITY); + + let rg1 = new_resource_group_ru("test_high".into(), 1000, 16); + resource_ctl.add_resource_group(rg1); + let rg2 = new_resource_group_ru("test_low".into(), 2000, 1); + resource_ctl.add_resource_group(rg2); + + reset_quota(&mut worker, 6.4); + priority_limiters[1].consume(Duration::from_secs(64), IoBytes::default(), true); + worker.adjust(); + check_limiter(f64::INFINITY, f64::INFINITY, f64::INFINITY); + + reset_quota(&mut worker, 6.4); + for _i in 0..100 { + priority_limiters[0].consume(Duration::from_millis(240), IoBytes::default(), true); + priority_limiters[1].consume(Duration::from_millis(400), IoBytes::default(), true); + } + worker.adjust(); + check_limiter(f64::INFINITY, 5.2, 1.2); + + reset_quota(&mut worker, 6.4); + for _i in 0..100 { + priority_limiters[0].consume(Duration::from_millis(120), IoBytes::default(), true); + priority_limiters[1].consume(Duration::from_millis(200), IoBytes::default(), true); + } + worker.adjust(); + check_limiter(f64::INFINITY, 2.6, 0.6); + + reset_quota(&mut worker, 6.4); + for _i in 0..100 { + priority_limiters[2].consume(Duration::from_millis(200), IoBytes::default(), true); + } + worker.adjust(); + check_limiter(f64::INFINITY, f64::INFINITY, f64::INFINITY); + + reset_quota(&mut worker, 8.0); 
+ for _i in 0..100 { + priority_limiters[0].consume(Duration::from_millis(240), IoBytes::default(), true); + priority_limiters[1].consume(Duration::from_millis(240), IoBytes::default(), true); + priority_limiters[2].consume(Duration::from_millis(320), IoBytes::default(), true); + } + worker.adjust(); + check_limiter(f64::INFINITY, 5.2, 2.8); + + reset_quota(&mut worker, 6.0); + for _i in 0..100 { + priority_limiters[0].consume(Duration::from_millis(240), IoBytes::default(), true); + priority_limiters[2].consume(Duration::from_millis(360), IoBytes::default(), true); + } + worker.adjust(); + check_limiter(f64::INFINITY, 5.2, 5.2); + + // duration too small, unchanged. + worker.resource_quota_getter.cpu_used = 6.0; + worker.last_adjust_time = Instant::now_coarse() - Duration::from_millis(500); + worker.adjust(); + check_limiter(f64::INFINITY, 5.2, 5.2); + } } diff --git a/components/server/Cargo.toml b/components/server/Cargo.toml index 55da894c6e8..9062a9f094e 100644 --- a/components/server/Cargo.toml +++ b/components/server/Cargo.toml @@ -56,6 +56,7 @@ futures = "0.3" grpcio = { workspace = true } grpcio-health = { workspace = true } hex = "0.4" +hybrid_engine = { workspace = true } keys = { workspace = true } kvproto = { workspace = true } libc = "0.2" @@ -68,7 +69,7 @@ raft = { workspace = true } raft_log_engine = { workspace = true } raftstore = { workspace = true, features = ["engine_rocks"] } raftstore-v2 = { workspace = true } -rand = "0.8" +region_cache_memory_engine = { workspace = true } resolved_ts = { workspace = true } resource_control = { workspace = true } resource_metering = { workspace = true } diff --git a/components/server/src/common.rs b/components/server/src/common.rs index 165a1c8509e..a2415facad1 100644 --- a/components/server/src/common.rs +++ b/components/server/src/common.rs @@ -28,8 +28,10 @@ use engine_traits::{ use error_code::ErrorCodeExt; use file_system::{get_io_rate_limiter, set_io_rate_limiter, BytesFetcher, File, IoBudgetAdjustor}; 
use grpcio::Environment; +use hybrid_engine::HybridEngine; use pd_client::{PdClient, RpcClient}; use raft_log_engine::RaftLogEngine; +use region_cache_memory_engine::RegionCacheMemoryEngine; use security::SecurityManager; use tikv::{ config::{ConfigController, DbConfigManger, DbType, TikvConfig}, @@ -695,6 +697,22 @@ impl Stop for LazyWorker { } } +pub trait KvEngineBuilder: KvEngine { + fn build(disk_engine: RocksEngine) -> Self; +} + +impl KvEngineBuilder for RocksEngine { + fn build(disk_engine: RocksEngine) -> Self { + disk_engine + } +} + +impl KvEngineBuilder for HybridEngine { + fn build(_disk_engine: RocksEngine) -> Self { + unimplemented!() + } +} + pub trait ConfiguredRaftEngine: RaftEngine { fn build( _: &TikvConfig, @@ -762,7 +780,11 @@ impl ConfiguredRaftEngine for RocksEngine { fn register_config(&self, cfg_controller: &mut ConfigController) { cfg_controller.register( tikv::config::Module::Raftdb, - Box::new(DbConfigManger::new(self.clone(), DbType::Raft)), + Box::new(DbConfigManger::new( + cfg_controller.get_current().rocksdb, + self.clone(), + DbType::Raft, + )), ); } } diff --git a/components/server/src/memory.rs b/components/server/src/memory.rs index 303ff257a78..fadf18f7534 100644 --- a/components/server/src/memory.rs +++ b/components/server/src/memory.rs @@ -19,9 +19,24 @@ impl MemoryTraceManager { for id in ids { let sub_trace = provider.sub_trace(id); let sub_trace_name = sub_trace.name(); - MEM_TRACE_SUM_GAUGE - .with_label_values(&[&format!("{}-{}", provider_name, sub_trace_name)]) - .set(sub_trace.sum() as i64) + let leaf_ids = sub_trace.get_children_ids(); + if leaf_ids.is_empty() { + MEM_TRACE_SUM_GAUGE + .with_label_values(&[&format!("{}-{}", provider_name, sub_trace_name)]) + .set(sub_trace.sum() as i64); + } else { + for leaf_id in leaf_ids { + let leaf = sub_trace.sub_trace(leaf_id); + MEM_TRACE_SUM_GAUGE + .with_label_values(&[&format!( + "{}-{}-{}", + provider_name, + sub_trace_name, + leaf.name(), + )]) + .set(leaf.sum() as i64); 
+ } + } } MEM_TRACE_SUM_GAUGE diff --git a/components/server/src/server.rs b/components/server/src/server.rs index 7ff51474d7d..594eac686fe 100644 --- a/components/server/src/server.rs +++ b/components/server/src/server.rs @@ -28,18 +28,21 @@ use backup_stream::{ BackupStreamResolver, }; use causal_ts::CausalTsProviderImpl; -use cdc::{CdcConfigManager, MemoryQuota}; +use cdc::CdcConfigManager; use concurrency_manager::ConcurrencyManager; -use engine_rocks::{from_rocks_compression_type, RocksEngine, RocksStatistics}; +use engine_rocks::{ + from_rocks_compression_type, RocksCompactedEvent, RocksEngine, RocksStatistics, +}; use engine_rocks_helper::sst_recovery::{RecoveryRunner, DEFAULT_CHECK_INTERVAL}; use engine_traits::{ - Engines, KvEngine, MiscExt, RaftEngine, SingletonFactory, TabletContext, TabletRegistry, - CF_DEFAULT, CF_WRITE, + Engines, KvEngine, RaftEngine, SingletonFactory, TabletContext, TabletRegistry, CF_DEFAULT, + CF_WRITE, }; use file_system::{get_io_rate_limiter, BytesFetcher, MetricsManager as IoMetricsManager}; use futures::executor::block_on; use grpcio::{EnvBuilder, Environment}; use grpcio_health::HealthService; +use hybrid_engine::HybridEngine; use kvproto::{ brpb::create_backup, cdcpb::create_change_data, deadlock::create_deadlock, debugpb::create_debug, diagnosticspb::create_diagnostics, import_sstpb::create_import_sst, @@ -69,13 +72,16 @@ use raftstore::{ }, RaftRouterCompactedEventSender, }; +use region_cache_memory_engine::RegionCacheMemoryEngine; use resolved_ts::{LeadershipResolver, Task}; use resource_control::ResourceGroupManager; use security::SecurityManager; use service::{service_event::ServiceEvent, service_manager::GrpcServiceManager}; use snap_recovery::RecoveryService; use tikv::{ - config::{ConfigController, DbConfigManger, DbType, LogConfigManager, TikvConfig}, + config::{ + ConfigController, DbConfigManger, DbType, LogConfigManager, MemoryConfigManager, TikvConfig, + }, coprocessor::{self, MEMTRACE_ROOT as 
MEMTRACE_COPROCESSOR}, coprocessor_v2, import::{ImportSstService, SstImporter}, @@ -105,9 +111,11 @@ use tikv::{ Engine, Storage, }, }; +use tikv_alloc::{add_thread_memory_accessor, remove_thread_memory_accessor}; use tikv_util::{ check_environment_variables, - config::VersionTrack, + config::{ReadableSize, VersionTrack}, + memory::MemoryQuota, mpsc as TikvMpsc, quota_limiter::{QuotaLimitConfigManager, QuotaLimiter}, sys::{disk, path_in_diff_mount_point, register_memory_usage_high_water, SysQuota}, @@ -120,7 +128,10 @@ use tikv_util::{ use tokio::runtime::Builder; use crate::{ - common::{ConfiguredRaftEngine, EngineMetricsManager, EnginesResourceInfo, TikvServerCore}, + common::{ + ConfiguredRaftEngine, EngineMetricsManager, EnginesResourceInfo, KvEngineBuilder, + TikvServerCore, + }, memory::*, setup::*, signal_handler, @@ -128,12 +139,16 @@ use crate::{ }; #[inline] -fn run_impl( +fn run_impl( config: TikvConfig, service_event_tx: TikvMpsc::Sender, service_event_rx: TikvMpsc::Receiver, -) { - let mut tikv = TikvServer::::init(config, service_event_tx.clone()); +) where + EK: KvEngine + KvEngineBuilder, + CER: ConfiguredRaftEngine, + F: KvFormat, +{ + let mut tikv = TikvServer::::init(config, service_event_tx.clone()); // Must be called after `TikvServer::init`. let memory_limit = tikv.core.config.memory_usage_limit.unwrap().0; let high_water = (tikv.core.config.memory_usage_high_water * memory_limit as f64) as u64; @@ -194,15 +209,6 @@ pub fn run_tikv( service_event_tx: TikvMpsc::Sender, service_event_rx: TikvMpsc::Receiver, ) { - // Sets the global logger ASAP. - // It is okay to use the config w/o `validate()`, - // because `initial_logger()` handles various conditions. - initial_logger(&config); - - // Print version information. - let build_timestamp = option_env!("TIKV_BUILD_TIME"); - tikv::log_tikv_info(build_timestamp); - // Print resource quota. 
SysQuota::log_quota(); CPU_CORES_QUOTA_GAUGE.set(SysQuota::cpu_cores_quota()); @@ -214,9 +220,33 @@ pub fn run_tikv( dispatch_api_version!(config.storage.api_version(), { if !config.raft_engine.enable { - run_impl::(config, service_event_tx, service_event_rx) + if config.region_cache_memory_limit == ReadableSize(0) { + run_impl::( + config, + service_event_tx, + service_event_rx, + ) + } else { + run_impl::, RocksEngine, API>( + config, + service_event_tx, + service_event_rx, + ) + } } else { - run_impl::(config, service_event_tx, service_event_rx) + if config.region_cache_memory_limit == ReadableSize(0) { + run_impl::( + config, + service_event_tx, + service_event_rx, + ) + } else { + run_impl::, RaftLogEngine, API>( + config, + service_event_tx, + service_event_rx, + ) + } } }) } @@ -226,21 +256,26 @@ const DEFAULT_MEMTRACE_FLUSH_INTERVAL: Duration = Duration::from_millis(1_000); const DEFAULT_STORAGE_STATS_INTERVAL: Duration = Duration::from_secs(1); /// A complete TiKV server. -struct TikvServer { +struct TikvServer +where + EK: KvEngine, + ER: RaftEngine, + F: KvFormat, +{ core: TikvServerCore, cfg_controller: Option, security_mgr: Arc, pd_client: Arc, - router: RaftRouter, - system: Option>, + router: RaftRouter, + system: Option>, resolver: Option, snap_mgr: Option, // Will be filled in `init_servers`. 
- engines: Option>, + engines: Option>, kv_statistics: Option>, raft_statistics: Option>, - servers: Option>, + servers: Option>, region_info_accessor: RegionInfoAccessor, - coprocessor_host: Option>, + coprocessor_host: Option>, concurrency_manager: ConcurrencyManager, env: Arc, check_leader_worker: Worker, @@ -264,9 +299,9 @@ struct Servers { lock_mgr: LockManager, server: LocalServer, node: Node, - importer: Arc, + importer: Arc>, cdc_scheduler: tikv_util::worker::Scheduler, - cdc_memory_quota: MemoryQuota, + cdc_memory_quota: Arc, rsmeter_pubsub_service: resource_metering::PubSubService, backup_stream_scheduler: Option>, debugger: DebuggerImpl>, LockManager, F>, @@ -275,12 +310,13 @@ struct Servers { type LocalServer = Server>; type LocalRaftKv = RaftKv>; -impl TikvServer +impl TikvServer where + EK: KvEngine, ER: RaftEngine, F: KvFormat, { - fn init(mut config: TikvConfig, tx: TikvMpsc::Sender) -> TikvServer { + fn init(mut config: TikvConfig, tx: TikvMpsc::Sender) -> TikvServer { tikv_util::thread_group::set_properties(Some(GroupProperties::default())); // It is okay use pd config and security config before `init_config`, // because these configs must be provided by command line, and only @@ -293,6 +329,13 @@ where EnvBuilder::new() .cq_count(config.server.grpc_concurrency) .name_prefix(thd_name!(GRPC_THREAD_PREFIX)) + .after_start(|| { + // SAFETY: we will call `remove_thread_memory_accessor` at before_stop. 
+ unsafe { add_thread_memory_accessor() }; + }) + .before_stop(|| { + remove_thread_memory_accessor(); + }) .build(), ); let pd_client = TikvServerCore::connect_to_pd_cluster( @@ -357,6 +400,7 @@ where router.clone(), config.coprocessor.clone(), )); + let region_info_accessor = RegionInfoAccessor::new(coprocessor_host.as_mut().unwrap()); // Initialize concurrency manager @@ -433,7 +477,7 @@ where } } - fn init_engines(&mut self, engines: Engines) { + fn init_engines(&mut self, engines: Engines) { let store_meta = Arc::new(Mutex::new(StoreMeta::new(PENDING_MSG_CAP))); let engine = RaftKv::new( ServerRaftStoreRouter::new( @@ -455,9 +499,7 @@ where }); } - fn init_gc_worker( - &mut self, - ) -> GcWorker>> { + fn init_gc_worker(&mut self) -> GcWorker>> { let engines = self.engines.as_ref().unwrap(); let gc_worker = GcWorker::new( engines.engine.clone(), @@ -496,6 +538,7 @@ where ); cfg_controller.register(tikv::config::Module::Log, Box::new(LogConfigManager)); + cfg_controller.register(tikv::config::Module::Memory, Box::new(MemoryConfigManager)); // Create cdc. let mut cdc_worker = Box::new(LazyWorker::new("cdc")); @@ -522,7 +565,7 @@ where if let Some(sst_worker) = &mut self.sst_worker { let sst_runner = RecoveryRunner::new( - engines.engines.kv.clone(), + engines.engines.kv.get_disk_engine().clone(), engines.store_meta.clone(), self.core .config @@ -545,6 +588,7 @@ where engines.engine.clone(), resource_ctl, CleanupMethod::Remote(self.core.background_worker.remote()), + true, )) } else { None @@ -986,7 +1030,9 @@ where } // Start CDC. - let cdc_memory_quota = MemoryQuota::new(self.core.config.cdc.sink_memory_quota.0 as _); + let cdc_memory_quota = Arc::new(MemoryQuota::new( + self.core.config.cdc.sink_memory_quota.0 as _, + )); let cdc_endpoint = cdc::Endpoint::new( self.core.config.server.cluster_id, &self.core.config.cdc, @@ -1034,7 +1080,10 @@ where // Create Debugger. 
let mut debugger = DebuggerImpl::new( - engines.engines.clone(), + Engines::new( + engines.engines.kv.get_disk_engine().clone(), + engines.engines.raft.clone(), + ), self.cfg_controller.as_ref().unwrap().clone(), Some(storage), ); @@ -1069,6 +1118,7 @@ where servers.importer.clone(), None, self.resource_manager.clone(), + Arc::new(self.region_info_accessor.clone()), ); let import_cfg_mgr = import_service.get_config_manager(); @@ -1155,7 +1205,7 @@ where let mut backup_worker = Box::new(self.core.background_worker.lazy_build("backup-endpoint")); let backup_scheduler = backup_worker.scheduler(); let backup_service = - backup::Service::::with_router(backup_scheduler, self.router.clone()); + backup::Service::::with_router(backup_scheduler, self.router.clone()); if servers .server .register_service(create_backup(backup_service)) @@ -1274,7 +1324,7 @@ where ); } - fn init_storage_stats_task(&self, engines: Engines) { + fn init_storage_stats_task(&self, engines: Engines) { let config_disk_capacity: u64 = self.core.config.raft_store.capacity.0; let data_dir = self.core.config.storage.data_dir.clone(); let store_path = self.core.store_path.clone(); @@ -1442,7 +1492,6 @@ where self.cfg_controller.take().unwrap(), Arc::new(self.core.config.security.clone()), self.engines.as_ref().unwrap().engine.raft_extension(), - self.core.store_path.clone(), self.resource_manager.clone(), self.grpc_service_mgr.clone(), ) { @@ -1502,11 +1551,16 @@ where } } -impl TikvServer { +impl TikvServer +where + EK: KvEngine + KvEngineBuilder, + CER: ConfiguredRaftEngine, + F: KvFormat, +{ fn init_raw_engines( &mut self, flow_listener: engine_rocks::FlowListener, - ) -> (Engines, Arc) { + ) -> (Engines, Arc) { let block_cache = self.core.config.storage.block_cache.build_shared_cache(); let env = self .core @@ -1540,19 +1594,24 @@ impl TikvServer { .sst_recovery_sender(self.init_sst_recovery_sender()) .flow_listener(flow_listener); let factory = Box::new(builder.build()); - let kv_engine = factory + 
let disk_engine = factory .create_shared_db(&self.core.store_path) .unwrap_or_else(|s| fatal!("failed to create kv engine: {}", s)); + let kv_engine: EK = KvEngineBuilder::build(disk_engine.clone()); self.kv_statistics = Some(factory.rocks_statistics()); - let engines = Engines::new(kv_engine.clone(), raft_engine); + let engines = Engines::new(kv_engine, raft_engine); let cfg_controller = self.cfg_controller.as_mut().unwrap(); cfg_controller.register( tikv::config::Module::Rocksdb, - Box::new(DbConfigManger::new(kv_engine.clone(), DbType::Kv)), + Box::new(DbConfigManger::new( + cfg_controller.get_current().rocksdb, + disk_engine.clone(), + DbType::Kv, + )), ); let reg = TabletRegistry::new( - Box::new(SingletonFactory::new(kv_engine)), + Box::new(SingletonFactory::new(disk_engine)), &self.core.store_path, ) .unwrap(); diff --git a/components/server/src/server2.rs b/components/server/src/server2.rs index fe2b685313e..38f5e94038f 100644 --- a/components/server/src/server2.rs +++ b/components/server/src/server2.rs @@ -32,7 +32,7 @@ use backup_stream::{ BackupStreamResolver, }; use causal_ts::CausalTsProviderImpl; -use cdc::{CdcConfigManager, MemoryQuota}; +use cdc::CdcConfigManager; use concurrency_manager::ConcurrencyManager; use engine_rocks::{from_rocks_compression_type, RocksEngine, RocksStatistics}; use engine_traits::{Engines, KvEngine, MiscExt, RaftEngine, TabletRegistry, CF_DEFAULT, CF_WRITE}; @@ -73,7 +73,7 @@ use service::{service_event::ServiceEvent, service_manager::GrpcServiceManager}; use tikv::{ config::{ loop_registry, ConfigController, ConfigurableDb, DbConfigManger, DbType, LogConfigManager, - TikvConfig, + MemoryConfigManager, TikvConfig, }, coprocessor::{self, MEMTRACE_ROOT as MEMTRACE_COPROCESSOR}, coprocessor_v2, @@ -103,9 +103,11 @@ use tikv::{ Engine, Storage, }, }; +use tikv_alloc::{add_thread_memory_accessor, remove_thread_memory_accessor}; use tikv_util::{ check_environment_variables, config::VersionTrack, + memory::MemoryQuota, mpsc as 
TikvMpsc, quota_limiter::{QuotaLimitConfigManager, QuotaLimiter}, sys::{disk, path_in_diff_mount_point, register_memory_usage_high_water, SysQuota}, @@ -192,15 +194,6 @@ pub fn run_tikv( service_event_tx: TikvMpsc::Sender, service_event_rx: TikvMpsc::Receiver, ) { - // Sets the global logger ASAP. - // It is okay to use the config w/o `validate()`, - // because `initial_logger()` handles various conditions. - initial_logger(&config); - - // Print version information. - let build_timestamp = option_env!("TIKV_BUILD_TIME"); - tikv::log_tikv_info(build_timestamp); - // Print resource quota. SysQuota::log_quota(); CPU_CORES_QUOTA_GAUGE.set(SysQuota::cpu_cores_quota()); @@ -243,7 +236,7 @@ struct TikvServer { env: Arc, cdc_worker: Option>>, cdc_scheduler: Option>, - cdc_memory_quota: Option, + cdc_memory_quota: Option>, backup_stream_scheduler: Option>, sst_worker: Option>>, quota_limiter: Arc, @@ -262,7 +255,7 @@ struct TikvEngines { struct Servers { lock_mgr: LockManager, server: LocalServer, - importer: Arc, + importer: Arc>, rsmeter_pubsub_service: resource_metering::PubSubService, } @@ -288,6 +281,13 @@ where EnvBuilder::new() .cq_count(config.server.grpc_concurrency) .name_prefix(thd_name!(GRPC_THREAD_PREFIX)) + .after_start(|| { + // SAFETY: we will call `remove_thread_memory_accessor` at before_stop. 
+ unsafe { add_thread_memory_accessor() }; + }) + .before_stop(|| { + remove_thread_memory_accessor(); + }) .build(), ); let pd_client = TikvServerCore::connect_to_pd_cluster( @@ -432,6 +432,7 @@ where ); cfg_controller.register(tikv::config::Module::Log, Box::new(LogConfigManager)); + cfg_controller.register(tikv::config::Module::Memory, Box::new(MemoryConfigManager)); let lock_mgr = LockManager::new(&self.core.config.pessimistic_txn); cfg_controller.register( @@ -459,6 +460,7 @@ where engines.engine.clone(), resource_ctl, CleanupMethod::Remote(self.core.background_worker.remote()), + true, )) } else { None @@ -637,7 +639,9 @@ where Box::new(CdcConfigManager(cdc_scheduler.clone())), ); // Start cdc endpoint. - let cdc_memory_quota = MemoryQuota::new(self.core.config.cdc.sink_memory_quota.0 as _); + let cdc_memory_quota = Arc::new(MemoryQuota::new( + self.core.config.cdc.sink_memory_quota.0 as _, + )); let cdc_endpoint = cdc::Endpoint::new( self.core.config.server.cluster_id, &self.core.config.cdc, @@ -937,6 +941,7 @@ where backup_worker.start(backup_endpoint); // Import SST service. 
+ let region_info_accessor = self.region_info_accessor.as_ref().unwrap().clone(); let import_service = ImportSstService::new( self.core.config.import.clone(), self.core.config.raft_store.raft_entry_max_size, @@ -945,6 +950,7 @@ where servers.importer.clone(), Some(self.router.as_ref().unwrap().store_meta().clone()), self.resource_manager.clone(), + Arc::new(region_info_accessor), ); let import_cfg_mgr = import_service.get_config_manager(); @@ -1290,7 +1296,6 @@ where self.cfg_controller.clone().unwrap(), Arc::new(self.core.config.security.clone()), self.engines.as_ref().unwrap().engine.raft_extension(), - self.core.store_path.clone(), self.resource_manager.clone(), self.grpc_service_mgr.clone(), ) { @@ -1482,7 +1487,11 @@ impl TikvServer { let cfg_controller = self.cfg_controller.as_mut().unwrap(); cfg_controller.register( tikv::config::Module::Rocksdb, - Box::new(DbConfigManger::new(registry.clone(), DbType::Kv)), + Box::new(DbConfigManger::new( + cfg_controller.get_current().rocksdb, + registry.clone(), + DbType::Kv, + )), ); self.tablet_registry = Some(registry.clone()); raft_engine.register_config(cfg_controller); diff --git a/components/server/src/setup.rs b/components/server/src/setup.rs index b758b9e39df..0228e0c7f28 100644 --- a/components/server/src/setup.rs +++ b/components/server/src/setup.rs @@ -74,7 +74,6 @@ fn make_engine_log_path(path: &str, sub_path: &str, filename: &str) -> String { }) } -#[allow(dead_code)] pub fn initial_logger(config: &TikvConfig) { fail::fail_point!("mock_force_uninitial_logger", |_| { LOG_INITIALIZED.store(false, Ordering::SeqCst); @@ -245,12 +244,10 @@ pub fn initial_metric(cfg: &MetricConfig) { pub fn overwrite_config_with_cmd_args(config: &mut TikvConfig, matches: &ArgMatches<'_>) { if let Some(level) = matches.value_of("log-level") { config.log.level = logger::get_level_by_string(level).unwrap().into(); - config.log_level = slog::Level::Info.into(); } if let Some(file) = matches.value_of("log-file") { 
config.log.file.filename = file.to_owned(); - config.log_file = "".to_owned(); } if let Some(addr) = matches.value_of("addr") { diff --git a/components/snap_recovery/Cargo.toml b/components/snap_recovery/Cargo.toml index 8b0b0ec4c3a..23cbdcfe098 100644 --- a/components/snap_recovery/Cargo.toml +++ b/components/snap_recovery/Cargo.toml @@ -5,6 +5,13 @@ edition = "2021" publish = false # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html +[features] +default = ["test-engine-kv-rocksdb", "test-engine-raft-raft-engine"] +test-engine-kv-rocksdb = ["tikv/test-engine-kv-rocksdb"] +test-engine-raft-raft-engine = ["tikv/test-engine-raft-raft-engine"] +test-engines-rocksdb = ["tikv/test-engines-rocksdb"] +test-engines-panic = ["tikv/test-engines-panic"] + [dependencies] chrono = "0.4" encryption = { workspace = true } diff --git a/components/snap_recovery/src/init_cluster.rs b/components/snap_recovery/src/init_cluster.rs index 7ece321d9dd..c6a14c1e0d3 100644 --- a/components/snap_recovery/src/init_cluster.rs +++ b/components/snap_recovery/src/init_cluster.rs @@ -3,8 +3,8 @@ use std::{cmp, error::Error as StdError, i32, result, sync::Arc, thread, time::Duration}; use encryption_export::data_key_manager_from_config; -use engine_rocks::{util::new_engine_opt, RocksEngine}; -use engine_traits::{Engines, Error as EngineError, Peekable, RaftEngine, SyncMutable}; +use engine_rocks::util::new_engine_opt; +use engine_traits::{Engines, Error as EngineError, KvEngine, RaftEngine}; use kvproto::{metapb, raft_serverpb::StoreIdent}; use pd_client::{Error as PdError, PdClient}; use raft_log_engine::RaftLogEngine; @@ -251,21 +251,21 @@ pub trait LocalEngineService { } // init engine and read local engine info -pub struct LocalEngines { - engines: Engines, +pub struct LocalEngines { + engines: Engines, } -impl LocalEngines { - pub fn new(engines: Engines) -> LocalEngines { +impl LocalEngines { + pub fn new(engines: Engines) -> LocalEngines { 
LocalEngines { engines } } - pub fn get_engine(&self) -> &Engines { + pub fn get_engine(&self) -> &Engines { &self.engines } } -impl LocalEngineService for LocalEngines { +impl LocalEngineService for LocalEngines { fn set_cluster_id(&self, cluster_id: u64) { let res = self .get_engine() diff --git a/components/snap_recovery/src/leader_keeper.rs b/components/snap_recovery/src/leader_keeper.rs index 417d5becca3..ca2623c82ca 100644 --- a/components/snap_recovery/src/leader_keeper.rs +++ b/components/snap_recovery/src/leader_keeper.rs @@ -9,18 +9,17 @@ use std::{ use engine_traits::KvEngine; use futures::compat::Future01CompatExt; -use itertools::Itertools; use raftstore::{ errors::{Error, Result}, store::{Callback, CasualMessage, CasualRouter, SignificantMsg, SignificantRouter}, }; use tikv_util::{future::paired_future_callback, timer::GLOBAL_TIMER_HANDLE}; -pub struct LeaderKeeper { +pub struct LeaderKeeper<'a, EK, Router: 'a> { router: Router, not_leader: HashSet, - _ek: PhantomData, + _ek: PhantomData<&'a EK>, } #[derive(Default)] @@ -51,10 +50,10 @@ impl std::fmt::Debug for StepResult { } } -impl LeaderKeeper +impl<'a, EK, Router> LeaderKeeper<'a, EK, Router> where EK: KvEngine, - Router: CasualRouter + SignificantRouter + 'static, + Router: CasualRouter + SignificantRouter + 'a, { pub fn new(router: Router, to_keep: impl IntoIterator) -> Self { Self { @@ -85,8 +84,9 @@ where const CONCURRENCY: usize = 256; let r = Mutex::new(StepResult::default()); let success = Mutex::new(HashSet::new()); - for batch in &self.not_leader.iter().chunks(CONCURRENCY) { - let tasks = batch.map(|region_id| async { + let regions = self.not_leader.iter().copied().collect::>(); + for batch in regions.as_slice().chunks(CONCURRENCY) { + let tasks = batch.iter().map(|region_id| async { match self.check_leader(*region_id).await { Ok(_) => { success.lock().unwrap().insert(*region_id); @@ -150,7 +150,7 @@ mod test { leaders: RefCell>, } - impl LeaderKeeper { + impl<'a, EK, Router> 
LeaderKeeper<'a, EK, Router> { fn mut_router(&mut self) -> &mut Router { &mut self.router } diff --git a/components/snap_recovery/src/region_meta_collector.rs b/components/snap_recovery/src/region_meta_collector.rs index e3542d6691b..3a88931fae4 100644 --- a/components/snap_recovery/src/region_meta_collector.rs +++ b/components/snap_recovery/src/region_meta_collector.rs @@ -2,8 +2,7 @@ use std::{cell::RefCell, error::Error as StdError, result, thread::JoinHandle}; -use engine_rocks::RocksEngine; -use engine_traits::{Engines, Iterable, Peekable, RaftEngine, CF_RAFT}; +use engine_traits::{Engines, KvEngine, RaftEngine, CF_RAFT}; use futures::channel::mpsc::UnboundedSender; use kvproto::{ raft_serverpb::{PeerState, RaftApplyState, RaftLocalState, RegionLocalState}, @@ -30,9 +29,13 @@ pub enum Error { } /// `RegionMetaCollector` is the collector that collector all region meta -pub struct RegionMetaCollector { +pub struct RegionMetaCollector +where + EK: KvEngine, + ER: RaftEngine, +{ /// The engine we are working on - engines: Engines, + engines: Engines, /// region meta report to br tx: UnboundedSender, /// Current working workers @@ -40,8 +43,12 @@ pub struct RegionMetaCollector { } #[allow(dead_code)] -impl RegionMetaCollector { - pub fn new(engines: Engines, tx: UnboundedSender) -> Self { +impl RegionMetaCollector +where + EK: KvEngine, + ER: RaftEngine, +{ + pub fn new(engines: Engines, tx: UnboundedSender) -> Self { RegionMetaCollector { engines, tx, @@ -74,14 +81,22 @@ impl RegionMetaCollector { } } -struct CollectWorker { +struct CollectWorker +where + EK: KvEngine, + ER: RaftEngine, +{ /// The engine we are working on - engines: Engines, + engines: Engines, tx: UnboundedSender, } -impl CollectWorker { - pub fn new(engines: Engines, tx: UnboundedSender) -> Self { +impl CollectWorker +where + EK: KvEngine, + ER: RaftEngine, +{ + pub fn new(engines: Engines, tx: UnboundedSender) -> Self { CollectWorker { engines, tx } } diff --git 
a/components/snap_recovery/src/services.rs b/components/snap_recovery/src/services.rs index 10f82d64917..6bf706e158f 100644 --- a/components/snap_recovery/src/services.rs +++ b/components/snap_recovery/src/services.rs @@ -2,8 +2,14 @@ use std::{ error::Error as StdError, + fmt::Display, + future::Future, result, - sync::mpsc::{sync_channel, SyncSender}, + sync::{ + atomic::{AtomicBool, Ordering}, + mpsc::{sync_channel, SyncSender}, + Arc, Mutex, + }, thread::Builder, time::Instant, }; @@ -13,14 +19,16 @@ use engine_rocks::{ util::get_cf_handle, RocksEngine, }; -use engine_traits::{CfNamesExt, CfOptionsExt, Engines, Peekable, RaftEngine}; +use engine_traits::{CfNamesExt, CfOptionsExt, Engines, KvEngine, RaftEngine}; use futures::{ channel::mpsc, executor::{ThreadPool, ThreadPoolBuilder}, + stream::{AbortHandle, Aborted}, FutureExt, SinkExt, StreamExt, }; use grpcio::{ - ClientStreamingSink, RequestStream, RpcContext, ServerStreamingSink, UnarySink, WriteFlags, + ClientStreamingSink, RequestStream, RpcContext, RpcStatus, RpcStatusCode, ServerStreamingSink, + UnarySink, WriteFlags, }; use kvproto::{raft_serverpb::StoreIdent, recoverdatapb::*}; use raftstore::{ @@ -59,21 +67,65 @@ pub enum Error { #[error("{0:?}")] Other(#[from] Box), } + /// Service handles the recovery messages from backup restore. #[derive(Clone)] -pub struct RecoveryService { - engines: Engines, - router: RaftRouter, +pub struct RecoveryService +where + EK: KvEngine, + ER: RaftEngine, +{ + engines: Engines, + router: RaftRouter, threads: ThreadPool, + + /// The handle to last call of recover region RPC. + /// + /// We need to make sure the execution of keeping leader exits before next + /// `RecoverRegion` rpc gets in. Or the previous call may stuck at keep + /// leader forever, once the second caller request the leader to be at + /// another store. 
+ // NOTE: Perhaps it would be better to abort the procedure as soon as the client + // stream has been closed, but yet it seems there isn't such hook like + // `on_client_go` for us, and the current implementation only start + // work AFTER the client closes their sender part(!) + last_recovery_region_rpc: Arc>>, +} + +struct RecoverRegionState { + start_at: Instant, + finished: Arc, + abort: AbortHandle, +} + +impl RecoverRegionState { + /// Create the state by wrapping a execution of recover region. + fn wrap_task, T>( + task: F, + ) -> (Self, impl Future>) { + let finished = Arc::new(AtomicBool::new(false)); + let (cancelable_task, abort) = futures::future::abortable(task); + let state = Self { + start_at: Instant::now(), + finished: Arc::clone(&finished), + abort, + }; + (state, async move { + let res = cancelable_task.await; + finished.store(true, Ordering::SeqCst); + res + }) + } } -impl RecoveryService { +impl RecoveryService +where + EK: KvEngine, + ER: RaftEngine, +{ /// Constructs a new `Service` with `Engines`, a `RaftStoreRouter` and a /// `thread pool`. 
- pub fn new( - engines: Engines, - router: RaftRouter, - ) -> RecoveryService { + pub fn new(engines: Engines, router: RaftRouter) -> RecoveryService { let props = tikv_util::thread_group::current_properties(); let threads = ThreadPoolBuilder::new() .pool_size(4) @@ -90,7 +142,7 @@ impl RecoveryService { // config rocksdb l0 to optimize the restore // also for massive data applied during the restore, it easy to reach the write // stop - let db = engines.kv.clone(); + let db: &RocksEngine = engines.kv.get_disk_engine(); for cf_name in db.cf_names() { Self::set_db_options(cf_name, db.clone()).expect("set db option failure"); } @@ -99,6 +151,7 @@ impl RecoveryService { engines, router, threads, + last_recovery_region_rpc: Arc::default(), } } @@ -140,10 +193,38 @@ impl RecoveryService { Ok(store_id) } + fn abort_last_recover_region(&self, place: impl Display) { + let mut last_state_lock = self.last_recovery_region_rpc.lock().unwrap(); + Self::abort_last_recover_region_of(place, &mut last_state_lock) + } + + fn replace_last_recover_region(&self, place: impl Display, new_state: RecoverRegionState) { + let mut last_state_lock = self.last_recovery_region_rpc.lock().unwrap(); + Self::abort_last_recover_region_of(place, &mut last_state_lock); + *last_state_lock = Some(new_state); + } + + fn abort_last_recover_region_of( + place: impl Display, + last_state_lock: &mut Option, + ) { + if let Some(last_state) = last_state_lock.take() { + info!("Another task enter, checking last task."; + "finished" => ?last_state.finished, + "start_before" => ?last_state.start_at.elapsed(), + "abort_by" => %place, + ); + if !last_state.finished.load(Ordering::SeqCst) { + last_state.abort.abort(); + warn!("Last task not finished, aborting it."); + } + } + } + // a new wait apply syncer share with all regions, // when all region reached the target index, share reference decreased to 0, // trigger closure to send finish info back. 
- pub fn wait_apply_last(router: RaftRouter, sender: SyncSender) { + pub fn wait_apply_last(router: RaftRouter, sender: SyncSender) { let wait_apply = SnapshotRecoveryWaitApplySyncer::new(0, sender); router.broadcast_normal(|| { PeerMsg::SignificantMsg(SignificantMsg::SnapshotRecoveryWaitApply( @@ -186,11 +267,15 @@ fn compact(engine: RocksEngine) -> Result<()> { Ok(()) } -impl RecoverData for RecoveryService { +impl RecoverData for RecoveryService +where + EK: KvEngine, + ER: RaftEngine, +{ // 1. br start to ready region meta fn read_region_meta( &mut self, - _ctx: RpcContext<'_>, + ctx: RpcContext<'_>, _req: ReadRegionMetaRequest, mut sink: ServerStreamingSink, ) { @@ -215,6 +300,11 @@ impl RecoverData for RecoveryService { } }); + // Hacking: Sometimes, the client may omit the RPC call to `recover_region` if + // no leader should be register to some (unfortunate) store. So we abort + // last recover region here too, anyway this RPC implies a consequent + // `recover_region` for now. + self.abort_last_recover_region(format_args!("read_region_meta by {}", ctx.peer())); self.threads.spawn_ok(send_task); } @@ -222,11 +312,11 @@ impl RecoverData for RecoveryService { // assign region leader and wait leader apply to last log fn recover_region( &mut self, - _ctx: RpcContext<'_>, + ctx: RpcContext<'_>, mut stream: RequestStream, sink: ClientStreamingSink, ) { - let raft_router = self.router.clone(); + let mut raft_router = Mutex::new(self.router.clone()); let store_id = self.get_store_id(); info!("start to recover the region"); let task = async move { @@ -241,17 +331,15 @@ impl RecoverData for RecoveryService { } } - let mut lk = LeaderKeeper::new(raft_router.clone(), leaders.clone()); + let mut lk = LeaderKeeper::new(&raft_router, leaders.clone()); // We must use the tokio runtime here because there isn't a `block_in_place` // like thing in the futures executor. It simply panics when block // on the block_on context. 
// It is also impossible to directly `await` here, because that will make // borrowing to the raft router crosses the await point. - tokio::runtime::Builder::new_current_thread() - .build() - .expect("failed to build temporary tokio runtime.") - .block_on(lk.elect_and_wait_all_ready()); + lk.elect_and_wait_all_ready().await; info!("all region leader assigned done"; "count" => %leaders.len()); + drop(lk); let now = Instant::now(); // wait apply to the last log @@ -260,7 +348,7 @@ impl RecoverData for RecoveryService { let (tx, rx) = sync_channel(1); REGION_EVENT_COUNTER.start_wait_leader_apply.inc(); let wait_apply = SnapshotRecoveryWaitApplySyncer::new(region_id, tx.clone()); - if let Err(e) = raft_router.significant_send( + if let Err(e) = raft_router.get_mut().unwrap().significant_send( region_id, SignificantMsg::SnapshotRecoveryWaitApply(wait_apply.clone()), ) { @@ -277,6 +365,10 @@ impl RecoverData for RecoveryService { for (rid, rx) in leaders.iter().zip(rx_apply) { if let Some(rx) = rx { CURRENT_WAIT_APPLY_LEADER.set(*rid as _); + // FIXME: we cannot the former RPC when we get stuck at here. + // Perhaps we need to make `SnapshotRecoveryWaitApplySyncer` be able to support + // asynchronous channels. But for now, waiting seems won't cause live lock, so + // we are keeping it unchanged. 
match rx.recv() { Ok(region_id) => { debug!("leader apply to last log"; "region_id" => region_id); @@ -301,10 +393,20 @@ impl RecoverData for RecoveryService { Err(e) => error!("failed to get store id"; "error" => ?e), }; - let _ = sink.success(resp).await; + resp }; - self.threads.spawn_ok(task); + let (state, task) = RecoverRegionState::wrap_task(task); + self.replace_last_recover_region(format!("recover_region by {}", ctx.peer()), state); + self.threads.spawn_ok(async move { + let res = match task.await { + Ok(resp) => sink.success(resp), + Err(Aborted) => sink.fail(RpcStatus::new(RpcStatusCode::ABORTED)), + }; + if let Err(err) = res.await { + warn!("failed to response recover region rpc"; "err" => %err); + } + }); } // 3. ensure all region peer/follower apply to last @@ -352,10 +454,14 @@ impl RecoverData for RecoveryService { // implement a resolve/delete data funciton let resolved_ts = req.get_resolved_ts(); let (tx, rx) = mpsc::unbounded(); - let resolver = DataResolverManager::new(self.engines.kv.clone(), tx, resolved_ts.into()); + let resolver = DataResolverManager::new( + self.engines.kv.get_disk_engine().clone(), + tx, + resolved_ts.into(), + ); info!("start to resolve kv data"); resolver.start(); - let db = self.engines.kv.clone(); + let db = self.engines.kv.get_disk_engine().clone(); let store_id = self.get_store_id(); let send_task = async move { let id = store_id?; @@ -381,3 +487,32 @@ impl RecoverData for RecoveryService { self.threads.spawn_ok(send_task); } } + +#[cfg(test)] +mod test { + use std::{sync::atomic::Ordering, time::Duration}; + + use futures::never::Never; + + use super::RecoverRegionState; + + #[test] + fn test_state() { + let rt = tokio::runtime::Builder::new_current_thread() + .enable_time() + .build() + .unwrap(); + let (state, task) = RecoverRegionState::wrap_task(futures::future::pending::()); + let hnd = rt.spawn(task); + state.abort.abort(); + rt.block_on(async { tokio::time::timeout(Duration::from_secs(10), hnd).await }) + 
.unwrap() + .unwrap() + .unwrap_err(); + + let (state, task) = RecoverRegionState::wrap_task(futures::future::ready(42)); + assert_eq!(state.finished.load(Ordering::SeqCst), false); + assert_eq!(rt.block_on(task), Ok(42)); + assert_eq!(state.finished.load(Ordering::SeqCst), true); + } +} diff --git a/components/sst_importer/Cargo.toml b/components/sst_importer/Cargo.toml index d292b44606e..b501e509a8a 100644 --- a/components/sst_importer/Cargo.toml +++ b/components/sst_importer/Cargo.toml @@ -5,12 +5,7 @@ edition = "2021" publish = false [features] -default = ["cloud-aws", "cloud-gcp", "cloud-azure", "test-engine-kv-rocksdb", "test-engine-raft-raft-engine"] -cloud-aws = ["external_storage_export/cloud-aws"] -cloud-gcp = ["external_storage_export/cloud-gcp"] -cloud-azure = ["external_storage_export/cloud-azure"] -cloud-storage-grpc = ["external_storage_export/cloud-storage-grpc"] -cloud-storage-dylib = ["external_storage_export/cloud-storage-dylib"] +default = ["test-engine-kv-rocksdb", "test-engine-raft-raft-engine"] test-engines-rocksdb = [ "engine_test/test-engines-rocksdb", @@ -34,7 +29,7 @@ encryption = { workspace = true } engine_rocks = { workspace = true } engine_traits = { workspace = true } error_code = { workspace = true } -external_storage_export = { workspace = true } +external_storage ={ workspace = true } file_system = { workspace = true } futures = { version = "0.3", features = ["thread-pool"] } futures-util = { version = "0.3", default-features = false, features = ["io"] } @@ -44,7 +39,7 @@ kvproto = { workspace = true } lazy_static = "1.3" log_wrappers = { workspace = true } online_config = { workspace = true } -openssl = "0.10" +openssl = { workspace = true } prometheus = { version = "0.13", default-features = false } protobuf = { version = "2.8", features = ["bytes"] } rand = "0.8" diff --git a/components/sst_importer/src/caching/storage_cache.rs b/components/sst_importer/src/caching/storage_cache.rs index 23732545b92..585772c2552 100644 --- 
a/components/sst_importer/src/caching/storage_cache.rs +++ b/components/sst_importer/src/caching/storage_cache.rs @@ -2,7 +2,7 @@ use std::sync::Arc; -use external_storage_export::ExternalStorage; +use external_storage::ExternalStorage; use kvproto::brpb::StorageBackend; use super::cache_map::{MakeCache, ShareOwned}; @@ -31,7 +31,7 @@ impl StoragePool { fn create(backend: &StorageBackend, size: usize) -> Result { let mut r = Vec::with_capacity(size); for _ in 0..size { - let s = external_storage_export::create_storage(backend, Default::default())?; + let s = external_storage::create_storage(backend, Default::default())?; r.push(Arc::from(s)); } Ok(Self(r.into_boxed_slice())) diff --git a/components/sst_importer/src/errors.rs b/components/sst_importer/src/errors.rs index 7ff940fff12..e03288bb3e1 100644 --- a/components/sst_importer/src/errors.rs +++ b/components/sst_importer/src/errors.rs @@ -2,6 +2,7 @@ use std::{ error::Error as StdError, io::Error as IoError, num::ParseIntError, path::PathBuf, result, + time::Duration, }; use encryption::Error as EncryptionError; @@ -31,6 +32,7 @@ pub fn error_inc(type_: &str, err: &Error) { Error::BadFormat(..) => "bad_format", Error::Encryption(..) => "encryption", Error::CodecError(..) => "codec", + Error::Suspended { .. 
} => "suspended", _ => return, }; IMPORTER_ERROR_VEC.with_label_values(&[type_, label]).inc(); @@ -116,6 +118,12 @@ pub enum Error { #[error("Importing a SST file with imcompatible api version")] IncompatibleApiVersion, + #[error("{0}, please retry write later")] + RequestTooNew(String), + + #[error("{0}, please rescan region later")] + RequestTooOld(String), + #[error("Key mode mismatched with the request mode, writer: {:?}, storage: {:?}, key: {}", .writer, .storage_api_version, .key)] InvalidKeyMode { writer: SstWriterType, @@ -125,6 +133,9 @@ pub enum Error { #[error("resource is not enough {0}")] ResourceNotEnough(String), + + #[error("imports are suspended for {time_to_lease_expire:?}")] + Suspended { time_to_lease_expire: Duration }, } impl Error { @@ -160,6 +171,16 @@ impl From for import_sstpb::Error { err.set_store_error(import_err); err.set_message(format!("{}", e)); } + Error::Suspended { + time_to_lease_expire, + } => { + let mut store_err = errorpb::Error::default(); + let mut server_is_busy = errorpb::ServerIsBusy::default(); + server_is_busy.set_backoff_ms(time_to_lease_expire.as_millis() as _); + store_err.set_server_is_busy(server_is_busy); + err.set_store_error(store_err); + err.set_message(format!("{}", e)); + } _ => { err.set_message(format!("{}", e)); } @@ -197,6 +218,9 @@ impl ErrorCodeExt for Error { Error::IncompatibleApiVersion => error_code::sst_importer::INCOMPATIBLE_API_VERSION, Error::InvalidKeyMode { .. } => error_code::sst_importer::INVALID_KEY_MODE, Error::ResourceNotEnough(_) => error_code::sst_importer::RESOURCE_NOT_ENOUTH, + Error::Suspended { .. 
} => error_code::sst_importer::SUSPENDED, + Error::RequestTooNew(_) => error_code::sst_importer::REQUEST_TOO_NEW, + Error::RequestTooOld(_) => error_code::sst_importer::REQUEST_TOO_OLD, } } } diff --git a/components/sst_importer/src/import_file.rs b/components/sst_importer/src/import_file.rs index b270d26a411..a8fdea6a564 100644 --- a/components/sst_importer/src/import_file.rs +++ b/components/sst_importer/src/import_file.rs @@ -4,18 +4,16 @@ use std::{ collections::HashMap, fmt, io::{self, Write}, + marker::PhantomData, path::{Path, PathBuf}, sync::Arc, + time::SystemTime, }; use api_version::api_v2::TIDB_RANGES_COMPLEMENT; use encryption::{DataKeyManager, EncrypterWriter}; -use engine_rocks::{get_env, RocksSstReader}; -use engine_traits::{ - iter_option, EncryptionKeyManager, IterOptions, Iterator, KvEngine, RefIterable, SstExt, - SstMetaInfo, SstReader, -}; -use file_system::{get_io_rate_limiter, sync_dir, File, OpenOptions}; +use engine_traits::{iter_option, Iterator, KvEngine, RefIterable, SstMetaInfo, SstReader}; +use file_system::{sync_dir, File, OpenOptions}; use keys::data_key; use kvproto::{import_sstpb::*, kvrpcpb::ApiVersion}; use tikv_util::time::Instant; @@ -215,17 +213,19 @@ impl Drop for ImportFile { /// The file being written is stored in `$root/.temp/$file_name`. After writing /// is completed, the file is moved to `$root/$file_name`. The file generated /// from the ingestion process will be placed in `$root/.clone/$file_name`. 
-pub struct ImportDir { +pub struct ImportDir { root_dir: PathBuf, temp_dir: PathBuf, clone_dir: PathBuf, + + _phantom: PhantomData, } -impl ImportDir { +impl ImportDir { const TEMP_DIR: &'static str = ".temp"; const CLONE_DIR: &'static str = ".clone"; - pub fn new>(root: P) -> Result { + pub fn new>(root: P) -> Result { let root_dir = root.as_ref().to_owned(); let temp_dir = root_dir.join(Self::TEMP_DIR); let clone_dir = root_dir.join(Self::CLONE_DIR); @@ -241,6 +241,7 @@ impl ImportDir { root_dir, temp_dir, clone_dir, + _phantom: PhantomData, }) } @@ -260,17 +261,36 @@ impl ImportDir { }) } - pub fn join(&self, meta: &SstMeta) -> Result { + pub fn join_for_write(&self, meta: &SstMeta) -> Result { let file_name = sst_meta_to_path(meta)?; self.get_import_path(file_name.to_str().unwrap()) } + /// Different with join_for_write, join_for_read will also handle the api + /// version 1 filenames which can be generated by old version TiKV. + pub fn join_for_read(&self, meta: &SstMeta) -> Result { + let file_name = sst_meta_to_path(meta)?; + let files_result = self.get_import_path(file_name.to_str().unwrap()); + // if files does not exists, it means the SstMeta is generated by old version + // TiKV, we try sst_meta_to_path_v1 + match files_result { + Ok(path) => { + if path.save.exists() { + return Ok(path); + } + let file_name = sst_meta_to_path_v1(meta)?; + self.get_import_path(file_name.to_str().unwrap()) + } + Err(e) => Err(e), + } + } + pub fn create( &self, meta: &SstMeta, key_manager: Option>, ) -> Result { - let path = self.join(meta)?; + let path = self.join_for_write(meta)?; if path.save.exists() { return Err(Error::FileExists(path.save, "create SST upload cache")); } @@ -289,7 +309,7 @@ impl ImportDir { } pub fn delete(&self, meta: &SstMeta, manager: Option<&DataKeyManager>) -> Result { - let path = self.join(meta)?; + let path = self.join_for_read(meta)?; self.delete_file(&path.save, manager)?; self.delete_file(&path.temp, manager)?; 
self.delete_file(&path.clone, manager)?; @@ -297,7 +317,7 @@ impl ImportDir { } pub fn exist(&self, meta: &SstMeta) -> Result { - let path = self.join(meta)?; + let path = self.join_for_read(meta)?; Ok(path.save.exists()) } @@ -306,12 +326,16 @@ impl ImportDir { meta: &SstMeta, key_manager: Option>, ) -> Result { - let path = self.join(meta)?; + let path = self.join_for_read(meta)?; let path_str = path.save.to_str().unwrap(); - let env = get_env(key_manager, get_io_rate_limiter())?; - let sst_reader = RocksSstReader::open_with_env(path_str, Some(env))?; + let sst_reader = E::SstReader::open(path_str, key_manager)?; // TODO: check the length and crc32 of ingested file. - let meta_info = sst_reader.sst_meta_info(meta.to_owned()); + let (count, size) = sst_reader.kv_count_and_size(); + let meta_info = SstMetaInfo { + total_kvs: count, + total_bytes: size, + meta: meta.to_owned(), + }; Ok(meta_info) } @@ -333,10 +357,9 @@ impl ImportDir { // otherwise we are upgrade/downgrade between V1 and V2 // this can be done if all keys are written by TiDB _ => { - let path = self.join(meta)?; + let path = self.join_for_read(meta)?; let path_str = path.save.to_str().unwrap(); - let env = get_env(key_manager.clone(), get_io_rate_limiter())?; - let sst_reader = RocksSstReader::open_with_env(path_str, Some(env))?; + let sst_reader = E::SstReader::open(path_str, key_manager.clone())?; for &(start, end) in TIDB_RANGES_COMPLEMENT { let opt = iter_option(&data_key(start), &data_key(end), false); @@ -358,7 +381,7 @@ impl ImportDir { Ok(true) } - pub fn ingest( + pub fn ingest( &self, metas: &[SstMetaInfo], engine: &E, @@ -381,7 +404,7 @@ impl ImportDir { let mut paths = HashMap::new(); let mut ingest_bytes = 0; for info in metas { - let path = self.join(&info.meta)?; + let path = self.join_for_read(&info.meta)?; let cf = info.meta.get_cf_name(); super::prepare_sst_for_ingestion(&path.save, &path.clone, key_manager.as_deref())?; ingest_bytes += info.total_bytes; @@ -406,41 +429,15 @@ impl 
ImportDir { key_manager: Option>, ) -> Result<()> { for meta in metas { - let path = self.join(meta)?; + let path = self.join_for_read(meta)?; let path_str = path.save.to_str().unwrap(); - let env = get_env(key_manager.clone(), get_io_rate_limiter())?; - let sst_reader = RocksSstReader::open_with_env(path_str, Some(env))?; + let sst_reader = E::SstReader::open(path_str, key_manager.clone())?; sst_reader.verify_checksum()?; } Ok(()) } - pub fn load_start_key_by_meta( - &self, - meta: &SstMeta, - km: Option>, - ) -> Result>> { - let path = self.join(meta)?; - let r = match km { - Some(km) => E::SstReader::open_encrypted(&path.save.to_string_lossy(), km)?, - None => E::SstReader::open(&path.save.to_string_lossy())?, - }; - let opts = IterOptions::new(None, None, false); - let mut i = r.iter(opts)?; - if !i.seek_to_first()? || !i.valid()? { - return Ok(None); - } - // Should we warn if the key doesn't start with the prefix key? (Is that - // possible?) - // Also note this brings implicit coupling between this and - // RocksEngine. Perhaps it is better to make the engine to provide - // decode functions. Anyway we have directly used the RocksSstReader - // somewhere... This won't make things worse. - let real_key = i.key().strip_prefix(keys::DATA_PREFIX_KEY); - Ok(real_key.map(ToOwned::to_owned)) - } - - pub fn list_ssts(&self) -> Result> { + pub fn list_ssts(&self) -> Result> { let mut ssts = Vec::new(); for e in file_system::read_dir(&self.root_dir)? 
{ let e = e?; @@ -449,7 +446,10 @@ impl ImportDir { } let path = e.path(); match parse_meta_from_path(&path) { - Ok(sst) => ssts.push(sst), + Ok(sst) => { + let last_modify = e.metadata()?.modified()?; + ssts.push((sst.0, sst.1, last_modify)) + } Err(e) => error!(%e; "path_to_sst_meta failed"; "path" => %path.display(),), } } @@ -458,8 +458,28 @@ impl ImportDir { } const SST_SUFFIX: &str = ".sst"; - +// version 2: compared to version 1 which is the default version, we will check +// epoch of request and local region in write API. +pub const API_VERSION_2: i32 = 2; + +/// sst_meta_to_path will encode the filepath with default api version (current +/// is 2). So when the SstMeta is created in old version of TiKV and filepath +/// will not correspond to the real file, in the deletion logic we can't remove +/// these files. pub fn sst_meta_to_path(meta: &SstMeta) -> Result { + Ok(PathBuf::from(format!( + "{}_{}_{}_{}_{}_{}{}", + UuidBuilder::from_slice(meta.get_uuid())?.build(), + meta.get_region_id(), + meta.get_region_epoch().get_conf_ver(), + meta.get_region_epoch().get_version(), + meta.get_cf_name(), + API_VERSION_2, + SST_SUFFIX, + ))) +} + +pub fn sst_meta_to_path_v1(meta: &SstMeta) -> Result { Ok(PathBuf::from(format!( "{}_{}_{}_{}_{}{}", UuidBuilder::from_slice(meta.get_uuid())?.build(), @@ -471,7 +491,7 @@ pub fn sst_meta_to_path(meta: &SstMeta) -> Result { ))) } -pub fn parse_meta_from_path>(path: P) -> Result { +pub fn parse_meta_from_path>(path: P) -> Result<(SstMeta, i32)> { let path = path.as_ref(); let file_name = match path.file_name().and_then(|n| n.to_str()) { Some(name) => name, @@ -500,11 +520,18 @@ pub fn parse_meta_from_path>(path: P) -> Result { // cf_name to path. 
meta.set_cf_name(elems[4].to_owned()); } - Ok(meta) + let mut api_version = 1; + if elems.len() > 5 { + api_version = elems[5].parse()?; + } + Ok((meta, api_version)) } #[cfg(test)] mod test { + use std::fs; + + use engine_rocks::RocksEngine; use engine_traits::CF_DEFAULT; use super::*; @@ -520,11 +547,12 @@ mod test { meta.mut_region_epoch().set_version(3); let path = sst_meta_to_path(&meta).unwrap(); - let expected_path = format!("{}_1_2_3_default.sst", uuid); + let expected_path = format!("{}_1_2_3_default_2.sst", uuid); assert_eq!(path.to_str().unwrap(), &expected_path); - let new_meta = parse_meta_from_path(path).unwrap(); - assert_eq!(meta, new_meta); + let meta_with_ver = parse_meta_from_path(path).unwrap(); + assert_eq!(meta, meta_with_ver.0); + assert_eq!(2, meta_with_ver.1); } #[test] @@ -543,8 +571,38 @@ mod test { meta.get_region_epoch().get_version(), SST_SUFFIX, )); - let new_meta = parse_meta_from_path(path).unwrap(); - assert_eq!(meta, new_meta); + let meta_with_ver = parse_meta_from_path(path).unwrap(); + assert_eq!(meta, meta_with_ver.0); + assert_eq!(1, meta_with_ver.1); + } + + #[test] + fn test_join_for_rw() { + use tempfile::TempDir; + use uuid::Uuid; + + let tmp = TempDir::new().unwrap(); + let dir = ImportDir::::new(tmp.path()).unwrap(); + let mut meta = SstMeta::default(); + meta.set_uuid(Uuid::new_v4().as_bytes().to_vec()); + let filename_v1 = sst_meta_to_path_v1(&meta).unwrap(); + let path_v1 = tmp.path().join(filename_v1); + + let got = dir + .join_for_read(&meta) + .expect("fallback to version 1 because version 2 file does not exist"); + assert_eq!(got.save, path_v1); + + let filename_v2 = sst_meta_to_path(&meta).unwrap(); + let path_v2 = tmp.path().join(filename_v2); + fs::File::create(&path_v2).expect("create empty file"); + let got = dir.join_for_read(&meta).expect("read should succeed"); + assert_eq!(got.save, path_v2); + fs::remove_file(path_v2).expect("delete file"); + + fs::File::create(&path_v1).expect("create empty file"); + 
let got = dir.join_for_read(&meta).expect("read should succeed"); + assert_eq!(got.save, path_v1); } #[cfg(feature = "test-engines-rocksdb")] @@ -595,15 +653,6 @@ mod test { .unwrap(); w.finish().unwrap(); dp.save(arcmgr.as_deref()).unwrap(); - let mut ssts = dir.list_ssts().unwrap(); - ssts.iter_mut().for_each(|meta| { - let start = dir - .load_start_key_by_meta::(meta, arcmgr.clone()) - .unwrap() - .unwrap(); - meta.mut_range().set_start(start) - }); - assert_eq!(ssts, vec![meta]); } #[test] diff --git a/components/sst_importer/src/lib.rs b/components/sst_importer/src/lib.rs index 0cfc3bab774..ff137005b09 100644 --- a/components/sst_importer/src/lib.rs +++ b/components/sst_importer/src/lib.rs @@ -27,7 +27,7 @@ pub mod sst_importer; pub use self::{ config::{Config, ConfigManager}, errors::{error_inc, Error, Result}, - import_file::sst_meta_to_path, + import_file::{sst_meta_to_path, API_VERSION_2}, import_mode2::range_overlaps, sst_importer::SstImporter, sst_writer::{RawSstWriter, TxnSstWriter}, diff --git a/components/sst_importer/src/sst_importer.rs b/components/sst_importer/src/sst_importer.rs index 33f3c691a26..6eef07b1ebc 100644 --- a/components/sst_importer/src/sst_importer.rs +++ b/components/sst_importer/src/sst_importer.rs @@ -4,29 +4,28 @@ use std::{ borrow::Cow, collections::HashMap, fs::File, - io::{self, BufReader, Read}, + io::{self, BufReader, ErrorKind, Read}, ops::Bound, path::{Path, PathBuf}, sync::{ atomic::{AtomicU64, Ordering}, Arc, }, - time::Duration, + time::{Duration, SystemTime}, }; use collections::HashSet; use dashmap::{mapref::entry::Entry, DashMap}; -use encryption::{to_engine_encryption_method, DataKeyManager}; -use engine_rocks::{get_env, RocksSstReader}; +use encryption::{DataKeyManager, FileEncryptionInfo}; use engine_traits::{ - name_to_cf, util::check_key_in_range, CfName, EncryptionKeyManager, FileEncryptionInfo, - IterOptions, Iterator, KvEngine, RefIterable, SstCompressionType, SstExt, SstMetaInfo, - SstReader, SstWriter, 
SstWriterBuilder, CF_DEFAULT, CF_WRITE, + name_to_cf, util::check_key_in_range, CfName, IterOptions, Iterator, KvEngine, RefIterable, + SstCompressionType, SstExt, SstMetaInfo, SstReader, SstWriter, SstWriterBuilder, CF_DEFAULT, + CF_WRITE, }; -use external_storage_export::{ +use external_storage::{ compression_reader_dispatcher, encrypt_wrap_reader, ExternalStorage, RestoreConfig, }; -use file_system::{get_io_rate_limiter, IoType, OpenOptions}; +use file_system::{IoType, OpenOptions}; use kvproto::{ brpb::{CipherInfo, StorageBackend}, import_sstpb::{Range, *}, @@ -153,8 +152,8 @@ impl CacheKvFile { } /// SstImporter manages SST files that are waiting for ingesting. -pub struct SstImporter { - dir: ImportDir, +pub struct SstImporter { + dir: ImportDir, key_manager: Option>, switcher: Either, // TODO: lift api_version as a type parameter. @@ -169,14 +168,14 @@ pub struct SstImporter { mem_limit: Arc, } -impl SstImporter { +impl SstImporter { pub fn new>( cfg: &Config, root: P, key_manager: Option>, api_version: ApiVersion, raft_kv_v2: bool, - ) -> Result { + ) -> Result { let switcher = if raft_kv_v2 { Either::Right(ImportModeSwitcherV2::new(cfg)) } else { @@ -281,7 +280,7 @@ impl SstImporter { } } - pub fn start_switch_mode_check(&self, executor: &Handle, db: Option) { + pub fn start_switch_mode_check(&self, executor: &Handle, db: Option) { match &self.switcher { Either::Left(switcher) => switcher.start(executor, db.unwrap()), Either::Right(switcher) => switcher.start(executor), @@ -289,10 +288,27 @@ impl SstImporter { } pub fn get_path(&self, meta: &SstMeta) -> PathBuf { - let path = self.dir.join(meta).unwrap(); + let path = self.dir.join_for_read(meta).unwrap(); path.save } + pub fn get_total_size(&self) -> Result { + let mut total_size = 0; + for entry in file_system::read_dir(self.dir.get_root_dir())? 
{ + match entry.and_then(|e| e.metadata().map(|m| (e, m))) { + Ok((_, m)) => { + if !m.is_file() { + continue; + } + total_size += m.len(); + } + Err(e) if e.kind() == ErrorKind::NotFound => continue, + Err(e) => return Err(Error::from(e)), + }; + } + Ok(total_size) + } + pub fn create(&self, meta: &SstMeta) -> Result { match self.dir.create(meta, self.key_manager.clone()) { Ok(f) => { @@ -338,7 +354,7 @@ impl SstImporter { .check_api_version(metas, self.key_manager.clone(), self.api_version) } - pub fn ingest(&self, metas: &[SstMetaInfo], engine: &E) -> Result<()> { + pub fn ingest(&self, metas: &[SstMetaInfo], engine: &E) -> Result<()> { match self .dir .ingest(metas, engine, self.key_manager.clone(), self.api_version) @@ -378,7 +394,7 @@ impl SstImporter { // // This method returns the *inclusive* key range (`[start, end]`) of SST // file created, or returns None if the SST is empty. - pub async fn download_ext( + pub async fn download_ext( &self, meta: &SstMeta, backend: &StorageBackend, @@ -396,7 +412,7 @@ impl SstImporter { "rewrite_rule" => ?rewrite_rule, "speed_limit" => speed_limiter.speed_limit(), ); - let r = self.do_download_ext::( + let r = self.do_download_ext( meta, backend, name, @@ -418,7 +434,7 @@ impl SstImporter { } } - pub fn enter_normal_mode(&self, db: E, mf: RocksDbMetricsFn) -> Result { + pub fn enter_normal_mode(&self, db: E, mf: RocksDbMetricsFn) -> Result { if let Either::Left(ref switcher) = self.switcher { switcher.enter_normal_mode(&db, mf) } else { @@ -426,7 +442,7 @@ impl SstImporter { } } - pub fn enter_import_mode(&self, db: E, mf: RocksDbMetricsFn) -> Result { + pub fn enter_import_mode(&self, db: E, mf: RocksDbMetricsFn) -> Result { if let Either::Left(ref switcher) = self.switcher { switcher.enter_import_mode(&db, mf) } else { @@ -453,7 +469,7 @@ impl SstImporter { backend: &StorageBackend, support_kms: bool, speed_limiter: &Limiter, - restore_config: external_storage_export::RestoreConfig, + restore_config: 
external_storage::RestoreConfig, ) -> Result<()> { self._download_rt .block_on(self.async_download_file_from_external_storage( @@ -479,7 +495,7 @@ impl SstImporter { // TODO: pass a config to support hdfs let ext_storage = if cache_id.is_empty() { EXT_STORAGE_CACHE_COUNT.with_label_values(&["skip"]).inc(); - let s = external_storage_export::create_storage(backend, Default::default())?; + let s = external_storage::create_storage(backend, Default::default())?; Arc::from(s) } else { self.cached_storage.cached_or_create(cache_id, backend)? @@ -496,7 +512,7 @@ impl SstImporter { support_kms: bool, speed_limiter: &Limiter, cache_key: &str, - restore_config: external_storage_export::RestoreConfig, + restore_config: external_storage::RestoreConfig, ) -> Result<()> { let start_read = Instant::now(); if let Some(p) = dst_file.parent() { @@ -642,8 +658,7 @@ impl SstImporter { async fn exec_download( &self, meta: &KvMeta, - rewrite_rule: &RewriteRule, - ext_storage: Arc, + ext_storage: Arc, speed_limiter: &Limiter, ) -> Result { let start = Instant::now(); @@ -668,7 +683,7 @@ impl SstImporter { Some((meta.get_range_offset(), range_length)) } }; - let restore_config = external_storage_export::RestoreConfig { + let restore_config = external_storage::RestoreConfig { range, compression_type: Some(meta.get_compression_type()), expected_sha256, @@ -690,9 +705,8 @@ impl SstImporter { .with_label_values(&["exec_download"]) .observe(start.saturating_elapsed().as_secs_f64()); - let rewrite_buff = self.rewrite_kv_file(buff, rewrite_rule)?; Ok(LoadedFile { - content: Arc::from(rewrite_buff.into_boxed_slice()), + content: Arc::from(buff.into_boxed_slice()), permit, }) } @@ -700,8 +714,7 @@ impl SstImporter { pub async fn do_read_kv_file( &self, meta: &KvMeta, - rewrite_rule: &RewriteRule, - ext_storage: Arc, + ext_storage: Arc, speed_limiter: &Limiter, ) -> Result { let start = Instant::now(); @@ -741,7 +754,7 @@ impl SstImporter { } cache - .get_or_try_init(|| self.exec_download(meta, 
rewrite_rule, ext_storage, speed_limiter)) + .get_or_try_init(|| self.exec_download(meta, ext_storage, speed_limiter)) .await?; Ok(CacheKvFile::Mem(cache)) } @@ -750,18 +763,16 @@ impl SstImporter { &self, ext_storage: Arc, support_kms: bool, - ) -> Arc { + ) -> Arc { // kv-files needn't are decrypted with KMS when download currently because these // files are not encrypted when log-backup. It is different from // sst-files because sst-files is encrypted when saved with rocksdb env // with KMS. to do: support KMS when log-backup and restore point. match (support_kms, self.key_manager.clone()) { - (true, Some(key_manager)) => { - Arc::new(external_storage_export::EncryptedExternalStorage { - key_manager, - storage: ext_storage, - }) - } + (true, Some(key_manager)) => Arc::new(external_storage::EncryptedExternalStorage { + key_manager, + storage: ext_storage, + }), _ => ext_storage, } } @@ -770,7 +781,7 @@ impl SstImporter { &self, file_length: u64, file_name: &str, - ext_storage: Arc, + ext_storage: Arc, speed_limiter: &Limiter, restore_config: RestoreConfig, ) -> Result> { @@ -792,12 +803,12 @@ impl SstImporter { encrypt_wrap_reader(file_crypter, inner)? }; - let r = external_storage_export::read_external_storage_info_buff( + let r = external_storage::read_external_storage_info_buff( &mut reader, speed_limiter, file_length, expected_sha256, - external_storage_export::MIN_READ_SPEED, + external_storage::MIN_READ_SPEED, ) .await; let url = ext_storage.url()?.to_string(); @@ -814,8 +825,7 @@ impl SstImporter { pub async fn read_from_kv_file( &self, meta: &KvMeta, - rewrite_rule: &RewriteRule, - ext_storage: Arc, + ext_storage: Arc, backend: &StorageBackend, speed_limiter: &Limiter, ) -> Result> { @@ -823,7 +833,7 @@ impl SstImporter { self.do_download_kv_file(meta, backend, speed_limiter) .await? } else { - self.do_read_kv_file(meta, rewrite_rule, ext_storage, speed_limiter) + self.do_read_kv_file(meta, ext_storage, speed_limiter) .await? 
}; match c { @@ -841,8 +851,7 @@ impl SstImporter { let mut buffer = Vec::new(); reader.read_to_end(&mut buffer)?; - let rewrite_buff = self.rewrite_kv_file(buffer, rewrite_rule)?; - Ok(Arc::from(rewrite_buff.into_boxed_slice())) + Ok(Arc::from(buffer.into_boxed_slice())) } } } @@ -881,7 +890,7 @@ impl SstImporter { } else { Some((offset, range_length)) }; - let restore_config = external_storage_export::RestoreConfig { + let restore_config = external_storage::RestoreConfig { range, compression_type: Some(meta.compression_type), expected_sha256, @@ -940,7 +949,11 @@ impl SstImporter { // perform iteration and key rewrite. let mut new_buff = Vec::with_capacity(file_buff.len()); - let mut event_iter = EventIterator::new(file_buff.as_slice()); + let mut event_iter = EventIterator::with_rewriting( + file_buff.as_slice(), + rewrite_rule.get_old_key_prefix(), + rewrite_rule.get_new_key_prefix(), + ); let mut key = new_prefix.to_vec(); let new_prefix_data_key_len = key.len(); @@ -983,9 +996,14 @@ impl SstImporter { start_ts: u64, restore_ts: u64, file_buff: Arc<[u8]>, + rewrite_rule: &RewriteRule, mut build_fn: impl FnMut(Vec, Vec), ) -> Result> { - let mut event_iter = EventIterator::new(file_buff.as_ref()); + let mut event_iter = EventIterator::with_rewriting( + file_buff.as_ref(), + rewrite_rule.get_old_key_prefix(), + rewrite_rule.get_new_key_prefix(), + ); let mut smallest_key = None; let mut largest_key = None; let mut total_key = 0; @@ -1001,6 +1019,16 @@ impl SstImporter { event_iter.next()?; INPORTER_APPLY_COUNT.with_label_values(&["key_meet"]).inc(); + if !event_iter + .key() + .starts_with(rewrite_rule.get_new_key_prefix()) + { + return Err(Error::WrongKeyPrefix { + what: "do_apply_kv_file", + key: event_iter.key().to_vec(), + prefix: rewrite_rule.get_old_key_prefix().to_vec(), + }); + } let key = event_iter.key().to_vec(); let value = event_iter.value().to_vec(); let ts = Key::decode_ts_from(&key)?; @@ -1028,7 +1056,7 @@ impl SstImporter { largest_key = 
largest_key .map_or_else(|| Some(key.clone()), |v: Vec| Some(v.max(key.clone()))); } - if total_key != not_in_range { + if not_in_range != 0 || ts_not_expected != 0 { info!("build download request file done"; "total_keys" => %total_key, "ts_filtered_keys" => %ts_not_expected, @@ -1052,7 +1080,7 @@ impl SstImporter { // raw download, without ext, compatibility to old tests. #[cfg(test)] - fn download( + fn download( &self, meta: &SstMeta, backend: &StorageBackend, @@ -1074,7 +1102,7 @@ impl SstImporter { )) } - async fn do_download_ext( + async fn do_download_ext( &self, meta: &SstMeta, backend: &StorageBackend, @@ -1085,15 +1113,15 @@ impl SstImporter { engine: E, ext: DownloadExt<'_>, ) -> Result> { - let path = self.dir.join(meta)?; + let path = self.dir.join_for_write(meta)?; let file_crypter = crypter.map(|c| FileEncryptionInfo { - method: to_engine_encryption_method(c.cipher_type), + method: c.cipher_type, key: c.cipher_key, iv: meta.cipher_iv.to_owned(), }); - let restore_config = external_storage_export::RestoreConfig { + let restore_config = external_storage::RestoreConfig { file_crypter, ..Default::default() }; @@ -1111,10 +1139,8 @@ impl SstImporter { .await?; // now validate the SST file. - let env = get_env(self.key_manager.clone(), get_io_rate_limiter())?; - // Use abstracted SstReader after Env is abstracted. let dst_file_name = path.temp.to_str().unwrap(); - let sst_reader = RocksSstReader::open_with_env(dst_file_name, Some(env))?; + let sst_reader = E::SstReader::open(dst_file_name, self.key_manager.clone())?; sst_reader.verify_checksum()?; // undo key rewrite so we could compare with the keys inside SST @@ -1354,26 +1380,16 @@ impl SstImporter { } /// List the basic information of the current SST files. - /// The information contains UUID, region ID, region Epoch. - /// Other fields may be left blank. - pub fn list_ssts(&self) -> Result> { + /// The information contains UUID, region ID, region Epoch, api version, + /// last modified time. 
Other fields may be left blank. + pub fn list_ssts(&self) -> Result> { self.dir.list_ssts() } - /// Load the start key by a metadata. - /// This will open the internal SST and try to load the first user key. - /// (For RocksEngine, that is the key without the 'z' prefix.) - /// When the SST is empty or the first key cannot be parsed as user key, - /// return None. - pub fn load_start_key_by_meta(&self, meta: &SstMeta) -> Result>> { - self.dir - .load_start_key_by_meta::(meta, self.key_manager.clone()) - } - - pub fn new_txn_writer(&self, db: &E, meta: SstMeta) -> Result> { + pub fn new_txn_writer(&self, db: &E, meta: SstMeta) -> Result> { let mut default_meta = meta.clone(); default_meta.set_cf_name(CF_DEFAULT.to_owned()); - let default_path = self.dir.join(&default_meta)?; + let default_path = self.dir.join_for_write(&default_meta)?; let default = E::SstWriterBuilder::new() .set_db(db) .set_cf(CF_DEFAULT) @@ -1383,7 +1399,7 @@ impl SstImporter { let mut write_meta = meta; write_meta.set_cf_name(CF_WRITE.to_owned()); - let write_path = self.dir.join(&write_meta)?; + let write_path = self.dir.join_for_write(&write_meta)?; let write = E::SstWriterBuilder::new() .set_db(db) .set_cf(CF_WRITE) @@ -1403,13 +1419,9 @@ impl SstImporter { )) } - pub fn new_raw_writer( - &self, - db: &E, - mut meta: SstMeta, - ) -> Result> { + pub fn new_raw_writer(&self, db: &E, mut meta: SstMeta) -> Result> { meta.set_cf_name(CF_DEFAULT.to_owned()); - let default_path = self.dir.join(&meta)?; + let default_path = self.dir.join_for_write(&meta)?; let default = E::SstWriterBuilder::new() .set_db(db) .set_cf(CF_DEFAULT) @@ -1465,12 +1477,14 @@ mod tests { usize, }; + use engine_rocks::get_env; use engine_traits::{ - collect, EncryptionMethod, Error as TraitError, ExternalSstFileInfo, Iterable, Iterator, - RefIterable, SstReader, SstWriter, CF_DEFAULT, DATA_CFS, + collect, Error as TraitError, ExternalSstFileInfo, Iterable, Iterator, RefIterable, + SstReader, SstWriter, CF_DEFAULT, DATA_CFS, 
}; - use external_storage_export::read_external_storage_info_buff; + use external_storage::read_external_storage_info_buff; use file_system::File; + use kvproto::encryptionpb::EncryptionMethod; use online_config::{ConfigManager, OnlineConfig}; use openssl::hash::{Hasher, MessageDigest}; use tempfile::Builder; @@ -1490,7 +1504,7 @@ mod tests { let mut meta = SstMeta::default(); meta.set_uuid(Uuid::new_v4().as_bytes().to_vec()); - let path = dir.join(&meta).unwrap(); + let path = dir.join_for_write(&meta).unwrap(); // Test ImportDir::create() { @@ -1556,9 +1570,9 @@ mod tests { for sst in &ssts { ingested .iter() - .find(|s| s.get_uuid() == sst.get_uuid()) + .find(|s| s.get_uuid() == sst.0.get_uuid()) .unwrap(); - dir.delete(sst, key_manager.as_deref()).unwrap(); + dir.delete(&sst.0, key_manager.as_deref()).unwrap(); } assert!(dir.list_ssts().unwrap().is_empty()); } @@ -1676,7 +1690,7 @@ mod tests { meta.mut_region_epoch().set_conf_ver(5); meta.mut_region_epoch().set_version(6); - let backend = external_storage_export::make_local_backend(ext_sst_dir.path()); + let backend = external_storage::make_local_backend(ext_sst_dir.path()); Ok((ext_sst_dir, backend, meta)) } @@ -1724,7 +1738,7 @@ mod tests { kv_meta.set_length(len as _); kv_meta.set_sha256(sha256.finish().unwrap().to_vec()); - let backend = external_storage_export::make_local_backend(ext_dir.path()); + let backend = external_storage::make_local_backend(ext_dir.path()); Ok((ext_dir, backend, kv_meta, buff.buffer().to_vec())) } @@ -1793,7 +1807,7 @@ mod tests { meta.mut_region_epoch().set_conf_ver(5); meta.mut_region_epoch().set_version(6); - let backend = external_storage_export::make_local_backend(ext_sst_dir.path()); + let backend = external_storage::make_local_backend(ext_sst_dir.path()); Ok((ext_sst_dir, backend, meta)) } @@ -1839,7 +1853,7 @@ mod tests { meta.mut_region_epoch().set_conf_ver(5); meta.mut_region_epoch().set_version(6); - let backend = 
external_storage_export::make_local_backend(ext_sst_dir.path()); + let backend = external_storage::make_local_backend(ext_sst_dir.path()); Ok((ext_sst_dir, backend, meta)) } @@ -1873,7 +1887,7 @@ mod tests { hasher.update(data).unwrap(); let hash256 = hasher.finish().unwrap().to_vec(); - block_on_external_io(external_storage_export::read_external_storage_into_file( + block_on_external_io(external_storage::read_external_storage_into_file( &mut input, &mut output, &Limiter::new(f64::INFINITY), @@ -1891,7 +1905,7 @@ mod tests { let mut input = pending::>().into_async_read(); let mut output = Vec::new(); - let err = block_on_external_io(external_storage_export::read_external_storage_into_file( + let err = block_on_external_io(external_storage::read_external_storage_into_file( &mut input, &mut output, &Limiter::new(f64::INFINITY), @@ -1986,7 +2000,8 @@ mod tests { ..Default::default() }; let import_dir = tempfile::tempdir().unwrap(); - let importer = SstImporter::new(&cfg, import_dir, None, ApiVersion::V1, false).unwrap(); + let importer = + SstImporter::::new(&cfg, import_dir, None, ApiVersion::V1, false).unwrap(); let mem_limit_old = importer.mem_limit.load(Ordering::SeqCst); // create new config and get the diff config. @@ -2033,7 +2048,7 @@ mod tests { // create importer object. let import_dir = tempfile::tempdir().unwrap(); let (_, key_manager) = new_key_manager_for_test(); - let importer = SstImporter::new( + let importer = SstImporter::::new( &Config::default(), import_dir, Some(key_manager), @@ -2050,10 +2065,8 @@ mod tests { }; // test do_read_kv_file() - let rewrite_rule = &new_rewrite_rule(b"", b"", 12345); let output = block_on_external_io(importer.do_read_kv_file( &kv_meta, - rewrite_rule, ext_storage, &Limiter::new(f64::INFINITY), )) @@ -2093,7 +2106,7 @@ mod tests { // create importer object. 
let import_dir = tempfile::tempdir().unwrap(); let (_, key_manager) = new_key_manager_for_test(); - let importer = SstImporter::new( + let importer = SstImporter::::new( &Config::default(), import_dir, Some(key_manager), @@ -2110,7 +2123,7 @@ mod tests { }; // test read all of the file. - let restore_config = external_storage_export::RestoreConfig { + let restore_config = external_storage::RestoreConfig { expected_sha256: Some(kv_meta.get_sha256().to_vec()), ..Default::default() }; @@ -2133,7 +2146,7 @@ mod tests { // test read range of the file. let (offset, len) = (5, 16); - let restore_config = external_storage_export::RestoreConfig { + let restore_config = external_storage::RestoreConfig { range: Some((offset, len)), ..Default::default() }; @@ -2161,9 +2174,14 @@ mod tests { memory_use_ratio: 0.0, ..Default::default() }; - let importer = - SstImporter::new(&cfg, import_dir, Some(key_manager), ApiVersion::V1, false).unwrap(); - let rewrite_rule = &new_rewrite_rule(b"", b"", 12345); + let importer = SstImporter::::new( + &cfg, + import_dir, + Some(key_manager), + ApiVersion::V1, + false, + ) + .unwrap(); let ext_storage = { importer.wrap_kms( importer.external_storage_or_cache(&backend, "").unwrap(), @@ -2181,7 +2199,6 @@ mod tests { assert!(importer.import_support_download()); let output = block_on_external_io(importer.read_from_kv_file( &kv_meta, - rewrite_rule, ext_storage, &backend, &Limiter::new(f64::INFINITY), @@ -2211,7 +2228,7 @@ mod tests { // create importer object. let import_dir = tempfile::tempdir().unwrap(); let (_, key_manager) = new_key_manager_for_test(); - let importer = SstImporter::new( + let importer = SstImporter::::new( &Config::default(), import_dir, Some(key_manager.clone()), @@ -2223,7 +2240,7 @@ mod tests { // perform download file into .temp dir. 
let file_name = "sample.sst"; let path = importer.dir.get_import_path(file_name).unwrap(); - let restore_config = external_storage_export::RestoreConfig::default(); + let restore_config = external_storage::RestoreConfig::default(); importer .download_file_from_external_storage( meta.get_length(), @@ -2248,7 +2265,7 @@ mod tests { let (_, key_manager) = new_key_manager_for_test(); let import_dir = tempfile::tempdir().unwrap(); - let importer = SstImporter::new( + let importer = SstImporter::::new( &Config::default(), import_dir, Some(key_manager), @@ -2258,7 +2275,7 @@ mod tests { .unwrap(); let path = importer.dir.get_import_path(kv_meta.get_name()).unwrap(); - let restore_config = external_storage_export::RestoreConfig { + let restore_config = external_storage::RestoreConfig { expected_sha256: Some(kv_meta.get_sha256().to_vec()), ..Default::default() }; @@ -2288,11 +2305,13 @@ mod tests { // performs the download. let importer_dir = tempfile::tempdir().unwrap(); let cfg = Config::default(); - let importer = SstImporter::new(&cfg, &importer_dir, None, ApiVersion::V1, false).unwrap(); + let importer = + SstImporter::::new(&cfg, &importer_dir, None, ApiVersion::V1, false) + .unwrap(); let db = create_sst_test_engine().unwrap(); let range = importer - .download::( + .download( &meta, &backend, "sample.sst", @@ -2308,7 +2327,7 @@ mod tests { assert_eq!(range.get_end(), b"t123_r13"); // verifies that the file is saved to the correct place. 
- let sst_file_path = importer.dir.join(&meta).unwrap().save; + let sst_file_path = importer.dir.join_for_read(&meta).unwrap().save; let sst_file_metadata = sst_file_path.metadata().unwrap(); assert!(sst_file_metadata.is_file()); assert_eq!(sst_file_metadata.len(), meta.get_length()); @@ -2338,7 +2357,7 @@ mod tests { let importer_dir = tempfile::tempdir().unwrap(); let cfg = Config::default(); let (temp_dir, key_manager) = new_key_manager_for_test(); - let importer = SstImporter::new( + let importer = SstImporter::::new( &cfg, &importer_dir, Some(key_manager.clone()), @@ -2352,7 +2371,7 @@ mod tests { let db = new_test_engine_with_env(db_path.to_str().unwrap(), DATA_CFS, env.clone()); let range = importer - .download::( + .download( &meta, &backend, "sample.sst", @@ -2368,7 +2387,7 @@ mod tests { assert_eq!(range.get_end(), b"t123_r13"); // verifies that the file is saved to the correct place. - let sst_file_path = importer.dir.join(&meta).unwrap().save; + let sst_file_path = importer.dir.join_for_read(&meta).unwrap().save; let sst_file_metadata = sst_file_path.metadata().unwrap(); assert!(sst_file_metadata.is_file()); assert_eq!(sst_file_metadata.len(), meta.get_length()); @@ -2397,11 +2416,13 @@ mod tests { // performs the download. let importer_dir = tempfile::tempdir().unwrap(); let cfg = Config::default(); - let importer = SstImporter::new(&cfg, &importer_dir, None, ApiVersion::V1, false).unwrap(); + let importer = + SstImporter::::new(&cfg, &importer_dir, None, ApiVersion::V1, false) + .unwrap(); let db = create_sst_test_engine().unwrap(); let range = importer - .download::( + .download( &meta, &backend, "sample.sst", @@ -2418,7 +2439,7 @@ mod tests { // verifies that the file is saved to the correct place. 
// (the file size may be changed, so not going to check the file size) - let sst_file_path = importer.dir.join(&meta).unwrap().save; + let sst_file_path = importer.dir.join_for_read(&meta).unwrap().save; assert!(sst_file_path.is_file()); // verifies the SST content is correct. @@ -2442,14 +2463,16 @@ mod tests { // performs the download. let importer_dir = tempfile::tempdir().unwrap(); let cfg = Config::default(); - let importer = SstImporter::new(&cfg, &importer_dir, None, ApiVersion::V1, false).unwrap(); + let importer = + SstImporter::::new(&cfg, &importer_dir, None, ApiVersion::V1, false) + .unwrap(); // creates a sample SST file. let (_ext_sst_dir, backend, meta) = create_sample_external_sst_file_txn_default().unwrap(); let db = create_sst_test_engine().unwrap(); let _ = importer - .download::( + .download( &meta, &backend, "sample_default.sst", @@ -2463,7 +2486,7 @@ mod tests { // verifies that the file is saved to the correct place. // (the file size may be changed, so not going to check the file size) - let sst_file_path = importer.dir.join(&meta).unwrap().save; + let sst_file_path = importer.dir.join_for_read(&meta).unwrap().save; assert!(sst_file_path.is_file()); // verifies the SST content is correct. @@ -2486,14 +2509,16 @@ mod tests { // performs the download. let importer_dir = tempfile::tempdir().unwrap(); let cfg = Config::default(); - let importer = SstImporter::new(&cfg, &importer_dir, None, ApiVersion::V1, false).unwrap(); + let importer = + SstImporter::::new(&cfg, &importer_dir, None, ApiVersion::V1, false) + .unwrap(); // creates a sample SST file. let (_ext_sst_dir, backend, meta) = create_sample_external_sst_file_txn_write().unwrap(); let db = create_sst_test_engine().unwrap(); let _ = importer - .download::( + .download( &meta, &backend, "sample_write.sst", @@ -2507,7 +2532,7 @@ mod tests { // verifies that the file is saved to the correct place. 
// (the file size may be changed, so not going to check the file size) - let sst_file_path = importer.dir.join(&meta).unwrap().save; + let sst_file_path = importer.dir.join_for_read(&meta).unwrap().save; assert!(sst_file_path.is_file()); // verifies the SST content is correct. @@ -2553,11 +2578,12 @@ mod tests { let importer_dir = tempfile::tempdir().unwrap(); let cfg = Config::default(); let importer = - SstImporter::new(&cfg, &importer_dir, None, ApiVersion::V1, false).unwrap(); + SstImporter::::new(&cfg, &importer_dir, None, ApiVersion::V1, false) + .unwrap(); let db = create_sst_test_engine().unwrap(); let range = importer - .download::( + .download( &meta, &backend, "sample.sst", @@ -2625,14 +2651,16 @@ mod tests { let (_ext_sst_dir, backend, mut meta) = create_sample_external_sst_file().unwrap(); let importer_dir = tempfile::tempdir().unwrap(); let cfg = Config::default(); - let importer = SstImporter::new(&cfg, &importer_dir, None, ApiVersion::V1, false).unwrap(); + let importer = + SstImporter::::new(&cfg, &importer_dir, None, ApiVersion::V1, false) + .unwrap(); let db = create_sst_test_engine().unwrap(); // note: the range doesn't contain the DATA_PREFIX 'z'. meta.mut_range().set_start(b"t123_r02".to_vec()); meta.mut_range().set_end(b"t123_r12".to_vec()); let range = importer - .download::( + .download( &meta, &backend, "sample.sst", @@ -2649,7 +2677,7 @@ mod tests { // verifies that the file is saved to the correct place. // (the file size is changed, so not going to check the file size) - let sst_file_path = importer.dir.join(&meta).unwrap().save; + let sst_file_path = importer.dir.join_for_read(&meta).unwrap().save; assert!(sst_file_path.is_file()); // verifies the SST content is correct. 
@@ -2671,13 +2699,15 @@ mod tests { let (_ext_sst_dir, backend, mut meta) = create_sample_external_sst_file().unwrap(); let importer_dir = tempfile::tempdir().unwrap(); let cfg = Config::default(); - let importer = SstImporter::new(&cfg, &importer_dir, None, ApiVersion::V1, false).unwrap(); + let importer = + SstImporter::::new(&cfg, &importer_dir, None, ApiVersion::V1, false) + .unwrap(); let db = create_sst_test_engine().unwrap(); meta.mut_range().set_start(b"t5_r02".to_vec()); meta.mut_range().set_end(b"t5_r12".to_vec()); let range = importer - .download::( + .download( &meta, &backend, "sample.sst", @@ -2693,7 +2723,7 @@ mod tests { assert_eq!(range.get_end(), b"t5_r07"); // verifies that the file is saved to the correct place. - let sst_file_path = importer.dir.join(&meta).unwrap().save; + let sst_file_path = importer.dir.join_for_read(&meta).unwrap().save; assert!(sst_file_path.is_file()); // verifies the SST content is correct. @@ -2718,11 +2748,13 @@ mod tests { meta.set_uuid(vec![0u8; 16]); let importer_dir = tempfile::tempdir().unwrap(); let cfg = Config::default(); - let importer = SstImporter::new(&cfg, &importer_dir, None, ApiVersion::V1, false).unwrap(); + let importer = + SstImporter::::new(&cfg, &importer_dir, None, ApiVersion::V1, false) + .unwrap(); let db = create_sst_test_engine().unwrap(); - let backend = external_storage_export::make_local_backend(ext_sst_dir.path()); + let backend = external_storage::make_local_backend(ext_sst_dir.path()); - let result = importer.download::( + let result = importer.download( &meta, &backend, "sample.sst", @@ -2743,12 +2775,14 @@ mod tests { let (_ext_sst_dir, backend, mut meta) = create_sample_external_sst_file().unwrap(); let importer_dir = tempfile::tempdir().unwrap(); let cfg = Config::default(); - let importer = SstImporter::new(&cfg, &importer_dir, None, ApiVersion::V1, false).unwrap(); + let importer = + SstImporter::::new(&cfg, &importer_dir, None, ApiVersion::V1, false) + .unwrap(); let db = 
create_sst_test_engine().unwrap(); meta.mut_range().set_start(vec![b'x']); meta.mut_range().set_end(vec![b'y']); - let result = importer.download::( + let result = importer.download( &meta, &backend, "sample.sst", @@ -2769,10 +2803,12 @@ mod tests { let (_ext_sst_dir, backend, meta) = create_sample_external_sst_file().unwrap(); let importer_dir = tempfile::tempdir().unwrap(); let cfg = Config::default(); - let importer = SstImporter::new(&cfg, &importer_dir, None, ApiVersion::V1, false).unwrap(); + let importer = + SstImporter::::new(&cfg, &importer_dir, None, ApiVersion::V1, false) + .unwrap(); let db = create_sst_test_engine().unwrap(); - let result = importer.download::( + let result = importer.download( &meta, &backend, "sample.sst", @@ -2806,11 +2842,12 @@ mod tests { // performs the download. let importer_dir = tempfile::tempdir().unwrap(); let cfg = Config::default(); - let importer = SstImporter::new(&cfg, &importer_dir, None, api_version, false).unwrap(); + let importer = + SstImporter::::new(&cfg, &importer_dir, None, api_version, false).unwrap(); let db = create_sst_test_engine().unwrap(); let range = importer - .download::( + .download( &meta, &backend, "sample.sst", @@ -2826,7 +2863,7 @@ mod tests { assert_eq!(range.get_end(), b"d"); // verifies that the file is saved to the correct place. - let sst_file_path = importer.dir.join(&meta).unwrap().save; + let sst_file_path = importer.dir.join_for_read(&meta).unwrap().save; let sst_file_metadata = sst_file_path.metadata().unwrap(); assert!(sst_file_metadata.is_file()); assert_eq!(sst_file_metadata.len(), meta.get_length()); @@ -2865,11 +2902,12 @@ mod tests { // performs the download. 
let importer_dir = tempfile::tempdir().unwrap(); let cfg = Config::default(); - let importer = SstImporter::new(&cfg, &importer_dir, None, api_version, false).unwrap(); + let importer = + SstImporter::::new(&cfg, &importer_dir, None, api_version, false).unwrap(); let db = create_sst_test_engine().unwrap(); let range = importer - .download::( + .download( &meta, &backend, "sample.sst", @@ -2885,7 +2923,7 @@ mod tests { assert_eq!(range.get_end(), b"c\x00"); // verifies that the file is saved to the correct place. - let sst_file_path = importer.dir.join(&meta).unwrap().save; + let sst_file_path = importer.dir.join_for_read(&meta).unwrap().save; let sst_file_metadata = sst_file_path.metadata().unwrap(); assert!(sst_file_metadata.is_file()); @@ -2920,11 +2958,12 @@ mod tests { // performs the download. let importer_dir = tempfile::tempdir().unwrap(); let cfg = Config::default(); - let importer = SstImporter::new(&cfg, &importer_dir, None, api_version, false).unwrap(); + let importer = + SstImporter::::new(&cfg, &importer_dir, None, api_version, false).unwrap(); let db = create_sst_test_engine().unwrap(); let range = importer - .download::( + .download( &meta, &backend, "sample.sst", @@ -2940,7 +2979,7 @@ mod tests { assert_eq!(range.get_end(), b"c"); // verifies that the file is saved to the correct place. 
- let sst_file_path = importer.dir.join(&meta).unwrap().save; + let sst_file_path = importer.dir.join_for_read(&meta).unwrap().save; let sst_file_metadata = sst_file_path.metadata().unwrap(); assert!(sst_file_metadata.is_file()); @@ -2968,12 +3007,13 @@ mod tests { let importer_dir = tempfile::tempdir().unwrap(); let cfg = Config::default(); let mut importer = - SstImporter::new(&cfg, &importer_dir, None, ApiVersion::V1, false).unwrap(); + SstImporter::::new(&cfg, &importer_dir, None, ApiVersion::V1, false) + .unwrap(); importer.set_compression_type(CF_DEFAULT, Some(SstCompressionType::Snappy)); let db = create_sst_test_engine().unwrap(); importer - .download::( + .download( &meta, &backend, "sample.sst", @@ -2986,7 +3026,7 @@ mod tests { .unwrap(); // verifies the SST is compressed using Snappy. - let sst_file_path = importer.dir.join(&meta).unwrap().save; + let sst_file_path = importer.dir.join_for_read(&meta).unwrap().save; assert!(sst_file_path.is_file()); let sst_reader = new_sst_reader(sst_file_path.to_str().unwrap(), None); @@ -3001,12 +3041,13 @@ mod tests { let importer_dir = tempfile::tempdir().unwrap(); let cfg = Config::default(); let mut importer = - SstImporter::new(&cfg, &importer_dir, None, ApiVersion::V1, false).unwrap(); + SstImporter::::new(&cfg, &importer_dir, None, ApiVersion::V1, false) + .unwrap(); importer.set_compression_type(CF_DEFAULT, Some(SstCompressionType::Zstd)); let db_path = importer_dir.path().join("db"); let db = new_test_engine(db_path.to_str().unwrap(), DATA_CFS); - let mut w = importer.new_txn_writer::(&db, meta).unwrap(); + let mut w = importer.new_txn_writer(&db, meta).unwrap(); let mut batch = WriteBatch::default(); let mut pairs = vec![]; @@ -3033,7 +3074,7 @@ mod tests { // verifies SST compression algorithm... 
for meta in metas { - let sst_file_path = importer.dir.join(&meta).unwrap().save; + let sst_file_path = importer.dir.join_for_read(&meta).unwrap().save; assert!(sst_file_path.is_file()); let sst_reader = new_sst_reader(sst_file_path.to_str().unwrap(), None); @@ -3049,12 +3090,18 @@ mod tests { #[test] fn test_import_support_download() { let import_dir = tempfile::tempdir().unwrap(); - let importer = - SstImporter::new(&Config::default(), import_dir, None, ApiVersion::V1, false).unwrap(); + let importer = SstImporter::::new( + &Config::default(), + import_dir, + None, + ApiVersion::V1, + false, + ) + .unwrap(); assert_eq!(importer.import_support_download(), false); let import_dir = tempfile::tempdir().unwrap(); - let importer = SstImporter::new( + let importer = SstImporter::::new( &Config { memory_use_ratio: 0.0, ..Default::default() @@ -3072,8 +3119,14 @@ mod tests { fn test_inc_mem_and_check() { // create importer object. let import_dir = tempfile::tempdir().unwrap(); - let importer = - SstImporter::new(&Config::default(), import_dir, None, ApiVersion::V1, false).unwrap(); + let importer = SstImporter::::new( + &Config::default(), + import_dir, + None, + ApiVersion::V1, + false, + ) + .unwrap(); assert_eq!(importer.mem_use.load(Ordering::SeqCst), 0); // test inc_mem_and_check() and dec_mem() successfully. 
@@ -3100,8 +3153,14 @@ mod tests { #[test] fn test_dashmap_lock() { let import_dir = tempfile::tempdir().unwrap(); - let importer = - SstImporter::new(&Config::default(), import_dir, None, ApiVersion::V1, false).unwrap(); + let importer = SstImporter::::new( + &Config::default(), + import_dir, + None, + ApiVersion::V1, + false, + ) + .unwrap(); let key = "file1"; let r = Arc::new(OnceCell::new()); diff --git a/components/sst_importer/src/sst_writer.rs b/components/sst_importer/src/sst_writer.rs index f6f896a0923..1c6b06902a4 100644 --- a/components/sst_importer/src/sst_writer.rs +++ b/components/sst_importer/src/sst_writer.rs @@ -301,7 +301,7 @@ mod tests { use crate::{Config, SstImporter}; // Return the temp dir path to avoid it drop out of the scope. - fn new_writer Result>( + fn new_writer, &RocksEngine, SstMeta) -> Result>( f: F, api_version: ApiVersion, ) -> (W, TempDir) { @@ -310,7 +310,8 @@ mod tests { let importer_dir = tempfile::tempdir().unwrap(); let cfg = Config::default(); - let importer = SstImporter::new(&cfg, &importer_dir, None, api_version, false).unwrap(); + let importer = + SstImporter::::new(&cfg, &importer_dir, None, api_version, false).unwrap(); let db_path = importer_dir.path().join("db"); let db = new_test_engine(db_path.to_str().unwrap(), DATA_CFS); (f(&importer, &db, meta).unwrap(), importer_dir) diff --git a/components/sst_importer/src/util.rs b/components/sst_importer/src/util.rs index ff7526172d5..121daf49ea8 100644 --- a/components/sst_importer/src/util.rs +++ b/components/sst_importer/src/util.rs @@ -3,8 +3,7 @@ use std::path::Path; use encryption::DataKeyManager; -use engine_traits::EncryptionKeyManager; -use external_storage_export::ExternalStorage; +use external_storage::ExternalStorage; use file_system::File; use super::Result; @@ -127,8 +126,8 @@ mod tests { RocksTitanDbOptions, }; use engine_traits::{ - CfName, CfOptions, DbOptions, EncryptionKeyManager, ImportExt, Peekable, SstWriter, - SstWriterBuilder, TitanCfOptions, 
CF_DEFAULT, + CfName, CfOptions, DbOptions, ImportExt, Peekable, SstWriter, SstWriterBuilder, + TitanCfOptions, CF_DEFAULT, }; use tempfile::Builder; use test_util::encryption::new_test_key_manager; diff --git a/components/test_backup/Cargo.toml b/components/test_backup/Cargo.toml index 59300f993e3..1dbe232fd9e 100644 --- a/components/test_backup/Cargo.toml +++ b/components/test_backup/Cargo.toml @@ -4,20 +4,15 @@ version = "0.0.1" edition = "2021" publish = false -[features] -default = ["cloud-aws", "cloud-gcp", "cloud-azure"] -cloud-aws = ["external_storage_export/cloud-aws"] -cloud-gcp = ["external_storage_export/cloud-gcp"] -cloud-azure = ["external_storage_export/cloud-azure"] - [dependencies] api_version = { workspace = true } backup = { workspace = true } collections = { workspace = true } concurrency_manager = { workspace = true } crc64fast = "0.1" +engine_rocks = { workspace = true } engine_traits = { workspace = true } -external_storage_export = { workspace = true } +external_storage ={ workspace = true } file_system = { workspace = true } futures = "0.3" futures-executor = "0.3" diff --git a/components/test_backup/src/lib.rs b/components/test_backup/src/lib.rs index 3a5800e989b..4331f072750 100644 --- a/components/test_backup/src/lib.rs +++ b/components/test_backup/src/lib.rs @@ -11,8 +11,9 @@ use std::{ use api_version::{dispatch_api_version, keyspace::KvPair, ApiV1, KvFormat, RawValue}; use backup::Task; use collections::HashMap; +use engine_rocks::RocksEngine; use engine_traits::{CfName, IterOptions, CF_DEFAULT, CF_WRITE, DATA_KEY_PREFIX_LEN}; -use external_storage_export::make_local_backend; +use external_storage::make_local_backend; use futures::{channel::mpsc as future_mpsc, executor::block_on}; use grpcio::{ChannelBuilder, Environment}; use kvproto::{brpb::*, kvrpcpb::*, tikvpb::TikvClient}; @@ -39,7 +40,7 @@ use tikv_util::{ use txn_types::TimeStamp; pub struct TestSuite { - pub cluster: Cluster, + pub cluster: Cluster>, pub endpoints: HashMap>, 
pub tikv_cli: TikvClient, pub context: Context, diff --git a/components/test_pd_client/src/pd.rs b/components/test_pd_client/src/pd.rs index c81230f6a16..341495cdb52 100644 --- a/components/test_pd_client/src/pd.rs +++ b/components/test_pd_client/src/pd.rs @@ -547,7 +547,9 @@ impl PdCluster { fn get_store(&self, store_id: u64) -> Result { match self.stores.get(&store_id) { Some(s) if s.store.get_id() != 0 => Ok(s.store.clone()), - _ => Err(box_err!("store {} not found", store_id)), + // Matches PD error message. + // See https://github.com/tikv/pd/blob/v7.3.0/server/grpc_service.go#L777-L780 + _ => Err(box_err!("invalid store ID {}, not found", store_id)), } } @@ -1435,12 +1437,23 @@ impl TestPdClient { cluster.replication_status = Some(status); } - pub fn switch_replication_mode(&self, state: DrAutoSyncState, available_stores: Vec) { + pub fn switch_replication_mode( + &self, + state: Option, + available_stores: Vec, + ) { let mut cluster = self.cluster.wl(); let status = cluster.replication_status.as_mut().unwrap(); + if state.is_none() { + status.set_mode(ReplicationMode::Majority); + let mut dr = status.mut_dr_auto_sync(); + dr.state_id += 1; + return; + } + status.set_mode(ReplicationMode::DrAutoSync); let mut dr = status.mut_dr_auto_sync(); dr.state_id += 1; - dr.set_state(state); + dr.set_state(state.unwrap()); dr.available_stores = available_stores; } diff --git a/components/test_raftstore-v2/src/cluster.rs b/components/test_raftstore-v2/src/cluster.rs index 08de4cc3aa1..53ff2c0f0b6 100644 --- a/components/test_raftstore-v2/src/cluster.rs +++ b/components/test_raftstore-v2/src/cluster.rs @@ -37,7 +37,7 @@ use pd_client::PdClient; use raftstore::{ store::{ cmd_resp, initial_region, region_meta::RegionMeta, util::check_key_in_region, Bucket, - BucketRange, Callback, RegionSnapshot, TabletSnapManager, WriteResponse, + BucketRange, Callback, RaftCmdExtraOpts, RegionSnapshot, TabletSnapManager, WriteResponse, INIT_EPOCH_CONF_VER, INIT_EPOCH_VER, }, Error, 
Result, @@ -51,7 +51,7 @@ use tempfile::TempDir; use test_pd_client::TestPdClient; use test_raftstore::{ check_raft_cmd_request, is_error_response, new_admin_request, new_delete_cmd, - new_delete_range_cmd, new_get_cf_cmd, new_peer, new_prepare_merge, new_put_cf_cmd, + new_delete_range_cmd, new_get_cf_cmd, new_peer, new_prepare_merge, new_put_cf_cmd, new_put_cmd, new_region_detail_cmd, new_region_leader_cmd, new_request, new_status_request, new_store, new_tikv_config_with_api_ver, new_transfer_leader_cmd, sleep_ms, Config, Filter, FilterFactory, PartitionFilterFactory, RawEngine, @@ -69,6 +69,9 @@ use tikv_util::{ }; use txn_types::WriteBatchFlags; +// MAX duration waiting for releasing store metas, default: 10s. +const MAX_WAIT_RELEASE_INTERVAL: u32 = 1000; + // We simulate 3 or 5 nodes, each has a store. // Sometimes, we use fixed id to test, which means the id // isn't allocated by pd, and node id, store id are same. @@ -283,9 +286,18 @@ pub trait Simulator { } fn async_command_on_node( + &mut self, + node_id: u64, + request: RaftCmdRequest, + ) -> BoxFuture<'static, RaftCmdResponse> { + self.async_command_on_node_with_opts(node_id, request, RaftCmdExtraOpts::default()) + } + + fn async_command_on_node_with_opts( &mut self, node_id: u64, mut request: RaftCmdRequest, + opts: RaftCmdExtraOpts, ) -> BoxFuture<'static, RaftCmdResponse> { let region_id = request.get_header().get_region_id(); @@ -316,7 +328,11 @@ pub trait Simulator { _ => unreachable!(), } } - PeerMsg::simple_write(Box::new(request.take_header()), write_encoder.encode()) + PeerMsg::simple_write_with_opt( + Box::new(request.take_header()), + write_encoder.encode(), + opts, + ) }; self.async_peer_msg_on_node(node_id, region_id, msg) @@ -1263,6 +1279,43 @@ impl, EK: KvEngine> Cluster { panic!("find no region for {}", log_wrappers::hex_encode_upper(key)); } + pub fn async_request( + &mut self, + mut req: RaftCmdRequest, + ) -> BoxFuture<'static, RaftCmdResponse> { + let region_id = 
req.get_header().get_region_id(); + let leader = self.leader_of_region(region_id).unwrap(); + req.mut_header().set_peer(leader.clone()); + self.sim + .wl() + .async_command_on_node(leader.get_store_id(), req) + } + + pub fn async_request_with_opts( + &mut self, + mut req: RaftCmdRequest, + opts: RaftCmdExtraOpts, + ) -> Result> { + let region_id = req.get_header().get_region_id(); + let leader = self.leader_of_region(region_id).unwrap(); + req.mut_header().set_peer(leader.clone()); + Ok(self + .sim + .wl() + .async_command_on_node_with_opts(leader.get_store_id(), req, opts)) + } + + pub fn async_put( + &mut self, + key: &[u8], + value: &[u8], + ) -> Result> { + let mut region = self.get_region(key); + let reqs = vec![new_put_cmd(key, value)]; + let put = new_request(region.get_id(), region.take_region_epoch(), reqs, false); + Ok(self.async_request(put)) + } + pub fn must_put(&mut self, key: &[u8], value: &[u8]) { self.must_put_cf(CF_DEFAULT, key, value); } @@ -1666,6 +1719,50 @@ impl, EK: KvEngine> Cluster { } } + pub fn must_empty_region_removed_records(&mut self, region_id: u64) { + let timer = Instant::now(); + loop { + thread::sleep(Duration::from_millis(100)); + + let leader = match self.leader_of_region(region_id) { + None => continue, + Some(l) => l, + }; + let region_state = self.region_local_state(region_id, leader.get_store_id()); + if region_state.get_removed_records().is_empty() { + return; + } + if timer.saturating_elapsed() > Duration::from_secs(5) { + panic!( + "merged records and removed records must be empty, {:?}", + region_state + ); + } + } + } + + pub fn must_empty_region_merged_records(&mut self, region_id: u64) { + let timer = Instant::now(); + loop { + thread::sleep(Duration::from_millis(100)); + + let leader = match self.leader_of_region(region_id) { + None => continue, + Some(l) => l, + }; + let region_state = self.region_local_state(region_id, leader.get_store_id()); + if region_state.get_merged_records().is_empty() { + return; + } + if 
timer.saturating_elapsed() > Duration::from_secs(5) { + panic!( + "merged records and removed records must be empty, {:?}", + region_state + ); + } + } + } + pub fn get_snap_dir(&self, node_id: u64) -> String { self.sim.rl().get_snap_dir(node_id) } @@ -1780,15 +1877,17 @@ impl, EK: KvEngine> Cluster { } self.leaders.clear(); for store_meta in self.store_metas.values() { - while Arc::strong_count(store_meta) != 1 { + // Limits the loop count of checking. + let mut idx = 0; + while Arc::strong_count(store_meta) != 1 && idx < MAX_WAIT_RELEASE_INTERVAL { std::thread::sleep(Duration::from_millis(10)); + idx += 1; } } self.store_metas.clear(); for sst_worker in self.sst_workers.drain(..) { sst_worker.stop_worker(); } - debug!("all nodes are shut down."); } diff --git a/components/test_raftstore-v2/src/server.rs b/components/test_raftstore-v2/src/server.rs index 7b5d501a59f..5073304e17a 100644 --- a/components/test_raftstore-v2/src/server.rs +++ b/components/test_raftstore-v2/src/server.rs @@ -222,6 +222,11 @@ impl RaftExtension for TestExtension { self.extension.report_store_unreachable(store_id) } + #[inline] + fn report_store_maybe_tombstone(&self, store_id: u64) { + self.extension.report_store_maybe_tombstone(store_id) + } + #[inline] fn report_snapshot_status( &self, @@ -556,6 +561,7 @@ impl ServerCluster { Arc::clone(&importer), Some(store_meta), resource_manager.clone(), + Arc::new(region_info_accessor.clone()), ); // Create deadlock service. diff --git a/components/test_raftstore-v2/src/util.rs b/components/test_raftstore-v2/src/util.rs index 805394b1ea0..315150e29c2 100644 --- a/components/test_raftstore-v2/src/util.rs +++ b/components/test_raftstore-v2/src/util.rs @@ -1,6 +1,12 @@ // Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. 
-use std::{fmt::Write, path::Path, sync::Arc, thread, time::Duration}; +use std::{ + fmt::Write, + path::Path, + sync::Arc, + thread, + time::{Duration, Instant}, +}; use encryption_export::{data_key_manager_from_config, DataKeyManager}; use engine_rocks::{RocksEngine, RocksStatistics}; @@ -8,17 +14,20 @@ use engine_test::raft::RaftTestEngine; use engine_traits::{CfName, KvEngine, TabletRegistry, CF_DEFAULT}; use file_system::IoRateLimiter; use futures::future::BoxFuture; +use grpcio::{ChannelBuilder, Environment}; use kvproto::{ encryptionpb::EncryptionMethod, - kvrpcpb::Context, + kvrpcpb::{Context, DiskFullOpt, GetResponse, Mutation, PrewriteResponse}, metapb, raft_cmdpb::{CmdType, RaftCmdRequest, RaftCmdResponse}, + tikvpb::TikvClient, }; use raftstore::{store::ReadResponse, Result}; use rand::{prelude::SliceRandom, RngCore}; use server::common::ConfiguredRaftEngine; use tempfile::TempDir; -use test_raftstore::{new_get_cmd, new_put_cf_cmd, new_request, new_snap_cmd, Config}; +use test_pd_client::TestPdClient; +use test_raftstore::{new_get_cmd, new_put_cf_cmd, new_request, new_snap_cmd, sleep_ms, Config}; use tikv::{ server::KvEngineFactoryBuilder, storage::{ @@ -27,7 +36,8 @@ use tikv::{ }, }; use tikv_util::{ - config::ReadableDuration, escape, future::block_on_timeout, worker::LazyWorker, HandyRwLock, + config::ReadableDuration, escape, future::block_on_timeout, time::InstantExt, + worker::LazyWorker, HandyRwLock, }; use txn_types::Key; @@ -447,3 +457,136 @@ pub fn wait_down_peers, EK: KvEngine>( peers, count, peer ); } + +pub fn wait_region_epoch_change, EK: KvEngine>( + cluster: &Cluster, + waited_region: &metapb::Region, + timeout: Duration, +) { + let timer = Instant::now(); + loop { + if waited_region.get_region_epoch().get_version() + == cluster + .get_region_epoch(waited_region.get_id()) + .get_version() + { + if timer.saturating_elapsed() > timeout { + panic!( + "region {:?}, region epoch is still not changed.", + waited_region + ); + } + } else { + 
break; + } + sleep_ms(10); + } +} + +pub struct PeerClient { + pub cli: TikvClient, + pub ctx: Context, +} + +impl PeerClient { + pub fn new( + cluster: &Cluster, EK>, + region_id: u64, + peer: metapb::Peer, + ) -> PeerClient { + let cli = { + let env = Arc::new(Environment::new(1)); + let channel = + ChannelBuilder::new(env).connect(&cluster.sim.rl().get_addr(peer.get_store_id())); + TikvClient::new(channel) + }; + let ctx = { + let epoch = cluster.get_region_epoch(region_id); + let mut ctx = Context::default(); + ctx.set_region_id(region_id); + ctx.set_peer(peer); + ctx.set_region_epoch(epoch); + ctx + }; + PeerClient { cli, ctx } + } + + pub fn kv_read(&self, key: Vec, ts: u64) -> GetResponse { + test_raftstore::kv_read(&self.cli, self.ctx.clone(), key, ts) + } + + pub fn must_kv_read_equal(&self, key: Vec, val: Vec, ts: u64) { + test_raftstore::must_kv_read_equal(&self.cli, self.ctx.clone(), key, val, ts) + } + + pub fn must_kv_write(&self, pd_client: &TestPdClient, kvs: Vec, pk: Vec) -> u64 { + test_raftstore::must_kv_write(pd_client, &self.cli, self.ctx.clone(), kvs, pk) + } + + pub fn must_kv_prewrite(&self, muts: Vec, pk: Vec, ts: u64) { + test_raftstore::must_kv_prewrite(&self.cli, self.ctx.clone(), muts, pk, ts) + } + + pub fn try_kv_prewrite( + &self, + muts: Vec, + pk: Vec, + ts: u64, + opt: DiskFullOpt, + ) -> PrewriteResponse { + let mut ctx = self.ctx.clone(); + ctx.disk_full_opt = opt; + test_raftstore::try_kv_prewrite(&self.cli, ctx, muts, pk, ts) + } + + pub fn must_kv_prewrite_async_commit(&self, muts: Vec, pk: Vec, ts: u64) { + test_raftstore::must_kv_prewrite_with( + &self.cli, + self.ctx.clone(), + muts, + vec![], + pk, + ts, + 0, + true, + false, + ) + } + + pub fn must_kv_prewrite_one_pc(&self, muts: Vec, pk: Vec, ts: u64) { + test_raftstore::must_kv_prewrite_with( + &self.cli, + self.ctx.clone(), + muts, + vec![], + pk, + ts, + 0, + false, + true, + ) + } + + pub fn must_kv_commit(&self, keys: Vec>, start_ts: u64, commit_ts: u64) { + 
test_raftstore::must_kv_commit( + &self.cli, + self.ctx.clone(), + keys, + start_ts, + commit_ts, + commit_ts, + ) + } + + pub fn must_kv_rollback(&self, keys: Vec>, start_ts: u64) { + test_raftstore::must_kv_rollback(&self.cli, self.ctx.clone(), keys, start_ts) + } + + pub fn must_kv_pessimistic_lock(&self, key: Vec, ts: u64) { + test_raftstore::must_kv_pessimistic_lock(&self.cli, self.ctx.clone(), key, ts) + } + + pub fn must_kv_pessimistic_rollback(&self, key: Vec, ts: u64) { + test_raftstore::must_kv_pessimistic_rollback(&self.cli, self.ctx.clone(), key, ts, ts) + } +} diff --git a/components/test_raftstore/Cargo.toml b/components/test_raftstore/Cargo.toml index d48acc4e92b..33430ba3fa8 100644 --- a/components/test_raftstore/Cargo.toml +++ b/components/test_raftstore/Cargo.toml @@ -39,6 +39,7 @@ file_system = { workspace = true } futures = "0.3" grpcio = { workspace = true } grpcio-health = { workspace = true } +hybrid_engine = { workspace = true } keys = { workspace = true } kvproto = { workspace = true } lazy_static = "1.3" @@ -48,6 +49,7 @@ protobuf = { version = "2.8", features = ["bytes"] } raft = { workspace = true } raftstore = { workspace = true, features = ["testexport"] } rand = "0.8" +region_cache_memory_engine = { workspace = true } resolved_ts = { workspace = true } resource_control = { workspace = true } resource_metering = { workspace = true } diff --git a/components/test_raftstore/src/cluster.rs b/components/test_raftstore/src/cluster.rs index 23edf0efab1..2521fccb694 100644 --- a/components/test_raftstore/src/cluster.rs +++ b/components/test_raftstore/src/cluster.rs @@ -4,22 +4,26 @@ use std::{ collections::hash_map::Entry as MapEntry, error::Error as StdError, result, - sync::{mpsc, Arc, Mutex, RwLock}, + sync::{ + mpsc::{self}, + Arc, Mutex, RwLock, + }, thread, time::Duration, }; +use ::server::common::KvEngineBuilder; use collections::{HashMap, HashSet}; use crossbeam::channel::TrySendError; use encryption_export::DataKeyManager; -use 
engine_rocks::{RocksEngine, RocksSnapshot, RocksStatistics}; +use engine_rocks::{RocksCompactedEvent, RocksEngine, RocksStatistics}; use engine_test::raft::RaftTestEngine; use engine_traits::{ - CompactExt, Engines, Iterable, MiscExt, Mutable, Peekable, RaftEngineReadOnly, SyncMutable, - WriteBatch, WriteBatchExt, CF_DEFAULT, CF_RAFT, + Engines, Iterable, KvEngine, Mutable, Peekable, RaftEngineReadOnly, SnapshotContext, + SyncMutable, WriteBatch, CF_DEFAULT, CF_RAFT, }; use file_system::IoRateLimiter; -use futures::{self, channel::oneshot, executor::block_on, future::BoxFuture}; +use futures::{self, channel::oneshot, executor::block_on, future::BoxFuture, StreamExt}; use kvproto::{ errorpb::Error as PbError, kvrpcpb::{ApiVersion, Context, DiskFullOpt}, @@ -51,7 +55,6 @@ use tempfile::TempDir; use test_pd_client::TestPdClient; use tikv::server::Result as ServerResult; use tikv_util::{ - mpsc::future, thread_group::GroupProperties, time::{Instant, ThreadReadId}, worker::LazyWorker, @@ -62,12 +65,15 @@ use txn_types::WriteBatchFlags; use super::*; use crate::Config; +pub trait KvEngineWithRocks = + KvEngine + KvEngineBuilder; + // We simulate 3 or 5 nodes, each has a store. // Sometimes, we use fixed id to test, which means the id // isn't allocated by pd, and node id, store id are same. // E,g, for node 1, the node id and store id are both 1. -pub trait Simulator { +pub trait Simulator { // Pass 0 to let pd allocate a node id if db is empty. // If node id > 0, the node must be created in db already, // and the node id must be the same as given argument. 
@@ -77,11 +83,11 @@ pub trait Simulator { &mut self, node_id: u64, cfg: Config, - engines: Engines, + engines: Engines, store_meta: Arc>, key_manager: Option>, - router: RaftRouter, - system: RaftBatchSystem, + router: RaftRouter, + system: RaftBatchSystem, resource_manager: &Option>, ) -> ServerResult; fn stop_node(&mut self, node_id: u64); @@ -90,7 +96,7 @@ pub trait Simulator { &self, node_id: u64, request: RaftCmdRequest, - cb: Callback, + cb: Callback, ) -> Result<()> { self.async_command_on_node_with_opts(node_id, request, cb, Default::default()) } @@ -98,13 +104,13 @@ pub trait Simulator { &self, node_id: u64, request: RaftCmdRequest, - cb: Callback, + cb: Callback, opts: RaftCmdExtraOpts, ) -> Result<()>; fn send_raft_msg(&mut self, msg: RaftMessage) -> Result<()>; fn get_snap_dir(&self, node_id: u64) -> String; fn get_snap_mgr(&self, node_id: u64) -> &SnapManager; - fn get_router(&self, node_id: u64) -> Option>; + fn get_router(&self, node_id: u64) -> Option>; fn add_send_filter(&mut self, node_id: u64, filter: Box); fn clear_send_filters(&mut self, node_id: u64); fn add_recv_filter(&mut self, node_id: u64, filter: Box); @@ -117,23 +123,25 @@ pub trait Simulator { fn read( &mut self, + snap_ctx: Option, batch_id: Option, request: RaftCmdRequest, timeout: Duration, ) -> Result { let node_id = request.get_header().get_peer().get_store_id(); - let (cb, mut rx) = make_cb(&request); - self.async_read(node_id, batch_id, request, cb); + let (cb, mut rx) = make_cb::(&request); + self.async_read(snap_ctx, node_id, batch_id, request, cb); rx.recv_timeout(timeout) .map_err(|_| Error::Timeout(format!("request timeout for {:?}", timeout))) } fn async_read( &mut self, + snap_ctx: Option, node_id: u64, batch_id: Option, request: RaftCmdRequest, - cb: Callback, + cb: Callback, ); fn call_command_on_node( @@ -142,7 +150,7 @@ pub trait Simulator { request: RaftCmdRequest, timeout: Duration, ) -> Result { - let (cb, mut rx) = make_cb(&request); + let (cb, mut rx) = 
make_cb::(&request); match self.async_command_on_node(node_id, request, cb) { Ok(()) => {} @@ -157,17 +165,17 @@ pub trait Simulator { } } -pub struct Cluster { +pub struct Cluster> { pub cfg: Config, leaders: HashMap, pub count: usize, pub paths: Vec, - pub dbs: Vec>, + pub dbs: Vec>, pub store_metas: HashMap>>, key_managers: Vec>>, pub io_rate_limiter: Option>, - pub engines: HashMap>, + pub engines: HashMap>, key_managers_map: HashMap>>, pub labels: HashMap>, group_props: HashMap, @@ -180,7 +188,11 @@ pub struct Cluster { resource_manager: Option>, } -impl Cluster { +impl Cluster +where + EK: KvEngineWithRocks, + T: Simulator, +{ // Create the default Store cluster. pub fn new( id: u64, @@ -188,7 +200,7 @@ impl Cluster { sim: Arc>, pd_client: Arc, api_version: ApiVersion, - ) -> Cluster { + ) -> Cluster { // TODO: In the future, maybe it's better to test both case where // `use_delete_range` is true and false Cluster { @@ -248,7 +260,7 @@ impl Cluster { assert!(self.sst_workers_map.insert(node_id, offset).is_none()); } - fn create_engine(&mut self, router: Option>) { + fn create_engine(&mut self, router: Option>) { let (engines, key_manager, dir, sst_worker, kv_statistics, raft_statistics) = create_test_engine(router, self.io_rate_limiter.clone(), &self.cfg); self.dbs.push(engines); @@ -403,7 +415,7 @@ impl Cluster { tikv_util::thread_group::set_properties(previous_prop); } - pub fn get_engine(&self, node_id: u64) -> RocksEngine { + pub fn get_engine(&self, node_id: u64) -> EK { self.engines[&node_id].kv.clone() } @@ -411,7 +423,7 @@ impl Cluster { self.engines[&node_id].raft.clone() } - pub fn get_all_engines(&self, node_id: u64) -> Engines { + pub fn get_all_engines(&self, node_id: u64) -> Engines { self.engines[&node_id].clone() } @@ -440,11 +452,16 @@ impl Cluster { pub fn read( &self, + snap_ctx: Option, batch_id: Option, request: RaftCmdRequest, timeout: Duration, ) -> Result { - match self.sim.wl().read(batch_id, request.clone(), timeout) { + match self + 
.sim + .wl() + .read(snap_ctx, batch_id, request.clone(), timeout) + { Err(e) => { warn!("failed to read {:?}: {:?}", request, e); Err(e) @@ -468,7 +485,7 @@ impl Cluster { } } let ret = if is_read { - self.sim.wl().read(None, request.clone(), timeout) + self.sim.wl().read(None, None, request.clone(), timeout) } else { self.sim.rl().call_command(request.clone(), timeout) }; @@ -768,7 +785,7 @@ impl Cluster { self.leaders.remove(®ion_id); } - pub fn assert_quorum bool>(&self, mut condition: F) { + pub fn assert_quorum bool>(&self, mut condition: F) { if self.engines.is_empty() { return; } @@ -969,7 +986,7 @@ impl Cluster { pub fn async_request( &mut self, req: RaftCmdRequest, - ) -> Result> { + ) -> Result> { self.async_request_with_opts(req, Default::default()) } @@ -977,21 +994,24 @@ impl Cluster { &mut self, mut req: RaftCmdRequest, opts: RaftCmdExtraOpts, - ) -> Result> { + ) -> Result> { let region_id = req.get_header().get_region_id(); let leader = self.leader_of_region(region_id).unwrap(); req.mut_header().set_peer(leader.clone()); - let (cb, rx) = make_cb(&req); + let (cb, mut rx) = make_cb::(&req); self.sim .rl() .async_command_on_node_with_opts(leader.get_store_id(), req, cb, opts)?; - Ok(rx) + Ok(Box::pin(async move { + let fut = rx.next(); + fut.await.unwrap() + })) } pub fn async_exit_joint( &mut self, region_id: u64, - ) -> Result> { + ) -> Result> { let region = block_on(self.pd_client.get_region_by_id(region_id)) .unwrap() .unwrap(); @@ -1007,7 +1027,7 @@ impl Cluster { &mut self, key: &[u8], value: &[u8], - ) -> Result> { + ) -> Result> { let mut region = self.get_region(key); let reqs = vec![new_put_cmd(key, value)]; let put = new_request(region.get_id(), region.take_region_epoch(), reqs, false); @@ -1018,7 +1038,7 @@ impl Cluster { &mut self, region_id: u64, peer: metapb::Peer, - ) -> Result> { + ) -> Result> { let region = block_on(self.pd_client.get_region_by_id(region_id)) .unwrap() .unwrap(); @@ -1031,7 +1051,7 @@ impl Cluster { &mut self, 
region_id: u64, peer: metapb::Peer, - ) -> Result> { + ) -> Result> { let region = block_on(self.pd_client.get_region_by_id(region_id)) .unwrap() .unwrap(); @@ -1320,7 +1340,7 @@ impl Cluster { } } - pub fn restore_kv_meta(&self, region_id: u64, store_id: u64, snap: &RocksSnapshot) { + pub fn restore_kv_meta(&self, region_id: u64, store_id: u64, snap: &EK::Snapshot) { let (meta_start, meta_end) = ( keys::region_meta_prefix(region_id), keys::region_meta_prefix(region_id + 1), @@ -1448,7 +1468,7 @@ impl Cluster { &mut self, region: &metapb::Region, split_key: &[u8], - cb: Callback, + cb: Callback, ) { let leader = self.leader_of_region(region.get_id()).unwrap(); let router = self.sim.rl().get_router(leader.get_store_id()).unwrap(); @@ -1461,6 +1481,7 @@ impl Cluster { split_keys: vec![split_key], callback: cb, source: "test".into(), + share_source_region_size: false, }, ) .unwrap(); @@ -1687,7 +1708,7 @@ impl Cluster { ) } - pub fn merge_region(&mut self, source: u64, target: u64, cb: Callback) { + pub fn merge_region(&mut self, source: u64, target: u64, cb: Callback) { let mut req = self.new_prepare_merge(source, target); let leader = self.leader_of_region(source).unwrap(); req.mut_header().set_peer(leader.clone()); @@ -1858,7 +1879,7 @@ impl Cluster { ctx } - pub fn get_router(&self, node_id: u64) -> Option> { + pub fn get_router(&self, node_id: u64) -> Option> { self.sim.rl().get_router(node_id) } @@ -1932,7 +1953,7 @@ impl Cluster { start_key: None, end_key: None, policy: CheckPolicy::Scan, - source: "test", + source: "bucket", cb, }, ) @@ -1960,7 +1981,7 @@ impl Cluster { } } -impl Drop for Cluster { +impl> Drop for Cluster { fn drop(&mut self) { test_util::clear_failpoints(); self.shutdown(); diff --git a/components/test_raftstore/src/lib.rs b/components/test_raftstore/src/lib.rs index 04dfbd24de1..be38155af6c 100644 --- a/components/test_raftstore/src/lib.rs +++ b/components/test_raftstore/src/lib.rs @@ -1,6 +1,7 @@ // Copyright 2018 TiKV Project Authors. 
Licensed under Apache-2.0. #![feature(let_chains)] +#![feature(trait_alias)] #[macro_use] extern crate lazy_static; diff --git a/components/test_raftstore/src/node.rs b/components/test_raftstore/src/node.rs index f429f27ff8b..5fdd4f24822 100644 --- a/components/test_raftstore/src/node.rs +++ b/components/test_raftstore/src/node.rs @@ -8,9 +8,9 @@ use std::{ use collections::{HashMap, HashSet}; use concurrency_manager::ConcurrencyManager; use encryption_export::DataKeyManager; -use engine_rocks::{RocksEngine, RocksSnapshot}; +use engine_rocks::RocksEngine; use engine_test::raft::RaftTestEngine; -use engine_traits::{Engines, MiscExt, Peekable}; +use engine_traits::{Engines, KvEngine, SnapshotContext}; use kvproto::{ kvrpcpb::ApiVersion, metapb, @@ -49,18 +49,18 @@ use tikv_util::{ use super::*; use crate::Config; -pub struct ChannelTransportCore { +pub struct ChannelTransportCore { snap_paths: HashMap, - routers: HashMap>>, + routers: HashMap, EK>>, } #[derive(Clone)] -pub struct ChannelTransport { - core: Arc>, +pub struct ChannelTransport { + core: Arc>>, } -impl ChannelTransport { - pub fn new() -> ChannelTransport { +impl ChannelTransport { + pub fn new() -> ChannelTransport { ChannelTransport { core: Arc::new(Mutex::new(ChannelTransportCore { snap_paths: HashMap::default(), @@ -70,13 +70,13 @@ impl ChannelTransport { } } -impl Default for ChannelTransport { +impl Default for ChannelTransport { fn default() -> Self { Self::new() } } -impl Transport for ChannelTransport { +impl Transport for ChannelTransport { fn send(&mut self, msg: RaftMessage) -> Result<()> { let from_store = msg.get_from_peer().get_store_id(); let to_store = msg.get_to_peer().get_store_id(); @@ -149,22 +149,22 @@ impl Transport for ChannelTransport { fn flush(&mut self) {} } -type SimulateChannelTransport = SimulateTransport; +type SimulateChannelTransport = SimulateTransport, EK>; -pub struct NodeCluster { - trans: ChannelTransport, +pub struct NodeCluster { + trans: ChannelTransport, 
pd_client: Arc, - nodes: HashMap>, + nodes: HashMap>, snap_mgrs: HashMap, cfg_controller: HashMap, - simulate_trans: HashMap, + simulate_trans: HashMap>, concurrency_managers: HashMap, #[allow(clippy::type_complexity)] - post_create_coprocessor_host: Option)>>, + post_create_coprocessor_host: Option)>>, } -impl NodeCluster { - pub fn new(pd_client: Arc) -> NodeCluster { +impl NodeCluster { + pub fn new(pd_client: Arc) -> NodeCluster { NodeCluster { trans: ChannelTransport::new(), pd_client, @@ -178,12 +178,12 @@ impl NodeCluster { } } -impl NodeCluster { +impl NodeCluster { #[allow(dead_code)] pub fn get_node_router( &self, node_id: u64, - ) -> SimulateTransport> { + ) -> SimulateTransport, EK> { self.trans .core .lock() @@ -198,17 +198,14 @@ impl NodeCluster { // first argument of `op` is the node_id. // Set this before invoking `run_node`. #[allow(clippy::type_complexity)] - pub fn post_create_coprocessor_host( - &mut self, - op: Box)>, - ) { + pub fn post_create_coprocessor_host(&mut self, op: Box)>) { self.post_create_coprocessor_host = Some(op) } pub fn get_node( &mut self, node_id: u64, - ) -> Option<&mut Node> { + ) -> Option<&mut Node> { self.nodes.get_mut(&node_id) } @@ -221,16 +218,16 @@ impl NodeCluster { } } -impl Simulator for NodeCluster { +impl Simulator for NodeCluster { fn run_node( &mut self, node_id: u64, cfg: Config, - engines: Engines, + engines: Engines, store_meta: Arc>, key_manager: Option>, - router: RaftRouter, - system: RaftBatchSystem, + router: RaftRouter, + system: RaftBatchSystem, _resource_manager: &Option>, ) -> ServerResult { assert!(node_id == 0 || !self.nodes.contains_key(&node_id)); @@ -434,7 +431,7 @@ impl Simulator for NodeCluster { &self, node_id: u64, request: RaftCmdRequest, - cb: Callback, + cb: Callback, opts: RaftCmdExtraOpts, ) -> Result<()> { if !self @@ -462,10 +459,11 @@ impl Simulator for NodeCluster { fn async_read( &mut self, + snap_ctx: Option, node_id: u64, batch_id: Option, request: RaftCmdRequest, - cb: 
Callback, + cb: Callback, ) { if !self .trans @@ -483,7 +481,7 @@ impl Simulator for NodeCluster { } let mut guard = self.trans.core.lock().unwrap(); let router = guard.routers.get_mut(&node_id).unwrap(); - router.read(batch_id, request, cb).unwrap(); + router.read(snap_ctx, batch_id, request, cb).unwrap(); } fn send_raft_msg(&mut self, msg: raft_serverpb::RaftMessage) -> Result<()> { @@ -514,14 +512,25 @@ impl Simulator for NodeCluster { trans.routers.get_mut(&node_id).unwrap().clear_filters(); } - fn get_router(&self, node_id: u64) -> Option> { + fn get_router(&self, node_id: u64) -> Option> { self.nodes.get(&node_id).map(|node| node.get_router()) } } // Compare to server cluster, node cluster does not have server layer and // storage layer. -pub fn new_node_cluster(id: u64, count: usize) -> Cluster { +pub fn new_node_cluster(id: u64, count: usize) -> Cluster> { + let pd_client = Arc::new(TestPdClient::new(id, false)); + let sim = Arc::new(RwLock::new(NodeCluster::new(Arc::clone(&pd_client)))); + Cluster::new(id, count, sim, pd_client, ApiVersion::V1) +} + +// the hybrid engine with disk engine "RocksEngine" and region cache engine +// "RegionCacheMemoryEngine" is used in the node cluster. 
+pub fn new_node_cluster_with_hybrid_engine( + id: u64, + count: usize, +) -> Cluster> { let pd_client = Arc::new(TestPdClient::new(id, false)); let sim = Arc::new(RwLock::new(NodeCluster::new(Arc::clone(&pd_client)))); Cluster::new(id, count, sim, pd_client, ApiVersion::V1) @@ -529,7 +538,10 @@ pub fn new_node_cluster(id: u64, count: usize) -> Cluster { // This cluster does not support batch split, we expect it to transfer the // `BatchSplit` request to `split` request -pub fn new_incompatible_node_cluster(id: u64, count: usize) -> Cluster { +pub fn new_incompatible_node_cluster( + id: u64, + count: usize, +) -> Cluster> { let pd_client = Arc::new(TestPdClient::new(id, true)); let sim = Arc::new(RwLock::new(NodeCluster::new(Arc::clone(&pd_client)))); Cluster::new(id, count, sim, pd_client, ApiVersion::V1) diff --git a/components/test_raftstore/src/server.rs b/components/test_raftstore/src/server.rs index 8d26bae968d..883a38edb23 100644 --- a/components/test_raftstore/src/server.rs +++ b/components/test_raftstore/src/server.rs @@ -13,9 +13,9 @@ use causal_ts::CausalTsProviderImpl; use collections::{HashMap, HashSet}; use concurrency_manager::ConcurrencyManager; use encryption_export::DataKeyManager; -use engine_rocks::{RocksEngine, RocksSnapshot}; +use engine_rocks::RocksEngine; use engine_test::raft::RaftTestEngine; -use engine_traits::{Engines, MiscExt}; +use engine_traits::{Engines, KvEngine, SnapshotContext}; use futures::executor::block_on; use grpcio::{ChannelBuilder, EnvBuilder, Environment, Error as GrpcError, Service}; use grpcio_health::HealthService; @@ -86,12 +86,12 @@ use txn_types::TxnExtraScheduler; use super::*; use crate::Config; -type SimulateStoreTransport = SimulateTransport>; +type SimulateStoreTransport = SimulateTransport, EK>; -pub type SimulateEngine = RaftKv; -type SimulateRaftExtension = ::RaftExtension; -type SimulateServerTransport = - SimulateTransport>; +pub type SimulateEngine = RaftKv>; +type SimulateRaftExtension = as 
Engine>::RaftExtension; +type SimulateServerTransport = + SimulateTransport, PdStoreAddrResolver>, EK>; #[derive(Default, Clone)] pub struct AddressMap { @@ -113,8 +113,8 @@ impl StoreAddrResolver for AddressMap { fn resolve( &self, store_id: u64, - cb: Box) + Send>, - ) -> ServerResult<()> { + cb: Box) + Send>, + ) -> resolve::Result<()> { let addr = self.get(store_id); match addr { Some(addr) => cb(Ok(addr)), @@ -127,29 +127,29 @@ impl StoreAddrResolver for AddressMap { } } -struct ServerMeta { - node: Node, - server: Server, - sim_router: SimulateStoreTransport, - sim_trans: SimulateServerTransport, - raw_router: RaftRouter, - raw_apply_router: ApplyRouter, - gc_worker: GcWorker>, +struct ServerMeta { + node: Node, + server: Server>, + sim_router: SimulateStoreTransport, + sim_trans: SimulateServerTransport, + raw_router: RaftRouter, + raw_apply_router: ApplyRouter, + gc_worker: GcWorker>>, rts_worker: Option>, rsmeter_cleanup: Box, } type PendingServices = Vec Service>>; -type CopHooks = Vec)>>; +type CopHooks = Vec)>>; -pub struct ServerCluster { - metas: HashMap, +pub struct ServerCluster { + metas: HashMap>, addrs: AddressMap, - pub storages: HashMap, + pub storages: HashMap>, pub region_info_accessors: HashMap, - pub importers: HashMap>, + pub importers: HashMap>>, pub pending_services: HashMap, - pub coprocessor_hooks: HashMap, + pub coprocessor_hooks: HashMap>, pub health_services: HashMap, pub security_mgr: Arc, pub txn_extra_schedulers: HashMap>, @@ -163,8 +163,8 @@ pub struct ServerCluster { pub causal_ts_providers: HashMap>, } -impl ServerCluster { - pub fn new(pd_client: Arc) -> ServerCluster { +impl ServerCluster { + pub fn new(pd_client: Arc) -> ServerCluster { let env = Arc::new( EnvBuilder::new() .cq_count(2) @@ -211,19 +211,16 @@ impl ServerCluster { self.addrs.get(node_id).unwrap() } - pub fn get_apply_router(&self, node_id: u64) -> ApplyRouter { + pub fn get_apply_router(&self, node_id: u64) -> ApplyRouter { 
self.metas.get(&node_id).unwrap().raw_apply_router.clone() } - pub fn get_server_router(&self, node_id: u64) -> SimulateStoreTransport { + pub fn get_server_router(&self, node_id: u64) -> SimulateStoreTransport { self.metas.get(&node_id).unwrap().sim_router.clone() } /// To trigger GC manually. - pub fn get_gc_worker( - &self, - node_id: u64, - ) -> &GcWorker> { + pub fn get_gc_worker(&self, node_id: u64) -> &GcWorker>> { &self.metas.get(&node_id).unwrap().gc_worker } @@ -264,11 +261,11 @@ impl ServerCluster { &mut self, node_id: u64, mut cfg: Config, - engines: Engines, + engines: Engines, store_meta: Arc>, key_manager: Option>, - router: RaftRouter, - system: RaftBatchSystem, + router: RaftRouter, + system: RaftBatchSystem, resource_manager: &Option>, ) -> ServerResult { let (tmp_str, tmp) = if node_id == 0 || !self.snap_paths.contains_key(&node_id) { @@ -451,6 +448,7 @@ impl ServerCluster { Arc::clone(&importer), None, resource_manager.clone(), + Arc::new(region_info_accessor.clone()), ); // Create deadlock service. 
@@ -495,7 +493,7 @@ impl ServerCluster { ); let debugger = DebuggerImpl::new( - engines.clone(), + Engines::new(engines.kv.get_disk_engine().clone(), engines.raft.clone()), ConfigController::new(cfg.tikv.clone()), Some(store.clone()), ); @@ -667,16 +665,16 @@ impl ServerCluster { } } -impl Simulator for ServerCluster { +impl Simulator for ServerCluster { fn run_node( &mut self, node_id: u64, cfg: Config, - engines: Engines, + engines: Engines, store_meta: Arc>, key_manager: Option>, - router: RaftRouter, - system: RaftBatchSystem, + router: RaftRouter, + system: RaftBatchSystem, resource_manager: &Option>, ) -> ServerResult { dispatch_api_version!( @@ -727,7 +725,7 @@ impl Simulator for ServerCluster { &self, node_id: u64, request: RaftCmdRequest, - cb: Callback, + cb: Callback, opts: RaftCmdExtraOpts, ) -> Result<()> { let router = match self.metas.get(&node_id) { @@ -739,10 +737,11 @@ impl Simulator for ServerCluster { fn async_read( &mut self, + snap_ctx: Option, node_id: u64, batch_id: Option, request: RaftCmdRequest, - cb: Callback, + cb: Callback, ) { match self.metas.get_mut(&node_id) { None => { @@ -752,7 +751,9 @@ impl Simulator for ServerCluster { cb.invoke_with_response(resp); } Some(meta) => { - meta.sim_router.read(batch_id, request, cb).unwrap(); + meta.sim_router + .read(snap_ctx, batch_id, request, cb) + .unwrap(); } }; } @@ -799,13 +800,13 @@ impl Simulator for ServerCluster { .clear_filters(); } - fn get_router(&self, node_id: u64) -> Option> { + fn get_router(&self, node_id: u64) -> Option> { self.metas.get(&node_id).map(|m| m.raw_router.clone()) } } -impl Cluster { - pub fn must_get_snapshot_of_region(&mut self, region_id: u64) -> RegionSnapshot { +impl Cluster> { + pub fn must_get_snapshot_of_region(&mut self, region_id: u64) -> RegionSnapshot { self.must_get_snapshot_of_region_with_ctx(region_id, Default::default()) } @@ -813,8 +814,8 @@ impl Cluster { &mut self, region_id: u64, snap_ctx: SnapContext<'_>, - ) -> RegionSnapshot { - let mut 
try_snapshot = || -> Option> { + ) -> RegionSnapshot { + let mut try_snapshot = || -> Option> { let leader = self.leader_of_region(region_id)?; let store_id = leader.store_id; let epoch = self.get_region_epoch(region_id); @@ -839,7 +840,7 @@ impl Cluster { panic!("failed to get snapshot of region {}", region_id); } - pub fn raft_extension(&self, node_id: u64) -> SimulateRaftExtension { + pub fn raft_extension(&self, node_id: u64) -> SimulateRaftExtension { self.sim.rl().storages[&node_id].raft_extension() } @@ -847,11 +848,7 @@ impl Cluster { self.sim.rl().get_addr(node_id) } - pub fn register_hook( - &self, - node_id: u64, - register: Box)>, - ) { + pub fn register_hook(&self, node_id: u64, register: Box)>) { self.sim .wl() .coprocessor_hooks @@ -861,7 +858,21 @@ impl Cluster { } } -pub fn new_server_cluster(id: u64, count: usize) -> Cluster { +pub fn new_server_cluster( + id: u64, + count: usize, +) -> Cluster> { + let pd_client = Arc::new(TestPdClient::new(id, false)); + let sim = Arc::new(RwLock::new(ServerCluster::new(Arc::clone(&pd_client)))); + Cluster::new(id, count, sim, pd_client, ApiVersion::V1) +} + +// the hybrid engine with disk engine "RocksEngine" and region cache engine +// "RegionCacheMemoryEngine" is used in the server cluster. 
+pub fn new_server_cluster_with_hybrid_engine( + id: u64, + count: usize, +) -> Cluster> { let pd_client = Arc::new(TestPdClient::new(id, false)); let sim = Arc::new(RwLock::new(ServerCluster::new(Arc::clone(&pd_client)))); Cluster::new(id, count, sim, pd_client, ApiVersion::V1) @@ -871,32 +882,49 @@ pub fn new_server_cluster_with_api_ver( id: u64, count: usize, api_ver: ApiVersion, -) -> Cluster { +) -> Cluster> { let pd_client = Arc::new(TestPdClient::new(id, false)); let sim = Arc::new(RwLock::new(ServerCluster::new(Arc::clone(&pd_client)))); Cluster::new(id, count, sim, pd_client, api_ver) } -pub fn new_incompatible_server_cluster(id: u64, count: usize) -> Cluster { +pub fn new_incompatible_server_cluster( + id: u64, + count: usize, +) -> Cluster> { let pd_client = Arc::new(TestPdClient::new(id, true)); let sim = Arc::new(RwLock::new(ServerCluster::new(Arc::clone(&pd_client)))); Cluster::new(id, count, sim, pd_client, ApiVersion::V1) } -pub fn must_new_cluster_mul(count: usize) -> (Cluster, metapb::Peer, Context) { +pub fn must_new_cluster_mul( + count: usize, +) -> ( + Cluster>, + metapb::Peer, + Context, +) { must_new_and_configure_cluster_mul(count, |_| ()) } pub fn must_new_and_configure_cluster( - configure: impl FnMut(&mut Cluster), -) -> (Cluster, metapb::Peer, Context) { + configure: impl FnMut(&mut Cluster>), +) -> ( + Cluster>, + metapb::Peer, + Context, +) { must_new_and_configure_cluster_mul(1, configure) } fn must_new_and_configure_cluster_mul( count: usize, - mut configure: impl FnMut(&mut Cluster), -) -> (Cluster, metapb::Peer, Context) { + mut configure: impl FnMut(&mut Cluster>), +) -> ( + Cluster>, + metapb::Peer, + Context, +) { let mut cluster = new_server_cluster(0, count); configure(&mut cluster); cluster.run(); @@ -911,13 +939,21 @@ fn must_new_and_configure_cluster_mul( (cluster, leader, ctx) } -pub fn must_new_cluster_and_kv_client() -> (Cluster, TikvClient, Context) { +pub fn must_new_cluster_and_kv_client() -> ( + Cluster>, + 
TikvClient, + Context, +) { must_new_cluster_and_kv_client_mul(1) } pub fn must_new_cluster_and_kv_client_mul( count: usize, -) -> (Cluster, TikvClient, Context) { +) -> ( + Cluster>, + TikvClient, + Context, +) { let (cluster, leader, ctx) = must_new_cluster_mul(count); let env = Arc::new(Environment::new(1)); @@ -928,7 +964,11 @@ pub fn must_new_cluster_and_kv_client_mul( (cluster, client, ctx) } -pub fn must_new_cluster_and_debug_client() -> (Cluster, DebugClient, u64) { +pub fn must_new_cluster_and_debug_client() -> ( + Cluster>, + DebugClient, + u64, +) { let (cluster, leader, _) = must_new_cluster_mul(1); let env = Arc::new(Environment::new(1)); @@ -939,8 +979,12 @@ pub fn must_new_cluster_and_debug_client() -> (Cluster, DebugClie (cluster, client, leader.get_store_id()) } -pub fn must_new_cluster_kv_client_and_debug_client() --> (Cluster, TikvClient, DebugClient, Context) { +pub fn must_new_cluster_kv_client_and_debug_client() -> ( + Cluster>, + TikvClient, + DebugClient, + Context, +) { let (cluster, leader, ctx) = must_new_cluster_mul(1); let env = Arc::new(Environment::new(1)); @@ -954,8 +998,12 @@ pub fn must_new_cluster_kv_client_and_debug_client() } pub fn must_new_and_configure_cluster_and_kv_client( - configure: impl FnMut(&mut Cluster), -) -> (Cluster, TikvClient, Context) { + configure: impl FnMut(&mut Cluster>), +) -> ( + Cluster>, + TikvClient, + Context, +) { let (cluster, leader, ctx) = must_new_and_configure_cluster(configure); let env = Arc::new(Environment::new(1)); @@ -966,7 +1014,12 @@ pub fn must_new_and_configure_cluster_and_kv_client( (cluster, client, ctx) } -pub fn setup_cluster() -> (Cluster, TikvClient, String, Context) { +pub fn setup_cluster() -> ( + Cluster>, + TikvClient, + String, + Context, +) { let mut cluster = new_server_cluster(0, 3); cluster.run(); diff --git a/components/test_raftstore/src/transport_simulate.rs b/components/test_raftstore/src/transport_simulate.rs index ef569e3987a..3824e0dbe75 100644 --- 
a/components/test_raftstore/src/transport_simulate.rs +++ b/components/test_raftstore/src/transport_simulate.rs @@ -11,7 +11,7 @@ use std::{ use collections::{HashMap, HashSet}; use crossbeam::channel::TrySendError; -use engine_rocks::{RocksEngine, RocksSnapshot}; +use engine_traits::{KvEngine, SnapshotContext}; use kvproto::{raft_cmdpb::RaftCmdRequest, raft_serverpb::RaftMessage}; use raft::eraftpb::MessageType; use raftstore::{ @@ -140,16 +140,19 @@ impl Filter for DelayFilter { } #[derive(Clone)] -pub struct SimulateTransport { +pub struct SimulateTransport { filters: Arc>>>, ch: C, + + _p: PhantomData, } -impl SimulateTransport { - pub fn new(ch: C) -> SimulateTransport { +impl SimulateTransport { + pub fn new(ch: C) -> SimulateTransport { SimulateTransport { filters: Arc::new(RwLock::new(vec![])), ch, + _p: PhantomData, } } @@ -195,7 +198,7 @@ where res } -impl Transport for SimulateTransport { +impl Transport for SimulateTransport { fn send(&mut self, m: RaftMessage) -> Result<()> { let ch = &mut self.ch; filter_send(&self.filters, m, |m| ch.send(m)) @@ -214,49 +217,52 @@ impl Transport for SimulateTransport { } } -impl> StoreRouter for SimulateTransport { - fn send(&self, msg: StoreMsg) -> Result<()> { +impl> StoreRouter for SimulateTransport { + fn send(&self, msg: StoreMsg) -> Result<()> { StoreRouter::send(&self.ch, msg) } } -impl> ProposalRouter for SimulateTransport { +impl> ProposalRouter<::Snapshot> + for SimulateTransport +{ fn send( &self, - cmd: RaftCommand, - ) -> std::result::Result<(), TrySendError>> { - ProposalRouter::::send(&self.ch, cmd) + cmd: RaftCommand<::Snapshot>, + ) -> std::result::Result<(), TrySendError::Snapshot>>> { + ProposalRouter::<::Snapshot>::send(&self.ch, cmd) } } -impl> CasualRouter for SimulateTransport { - fn send(&self, region_id: u64, msg: CasualMessage) -> Result<()> { - CasualRouter::::send(&self.ch, region_id, msg) +impl> CasualRouter for SimulateTransport { + fn send(&self, region_id: u64, msg: CasualMessage) -> 
Result<()> { + CasualRouter::::send(&self.ch, region_id, msg) } } -impl> SignificantRouter for SimulateTransport { - fn significant_send(&self, region_id: u64, msg: SignificantMsg) -> Result<()> { +impl> SignificantRouter for SimulateTransport { + fn significant_send(&self, region_id: u64, msg: SignificantMsg) -> Result<()> { self.ch.significant_send(region_id, msg) } } -impl> RaftStoreRouter for SimulateTransport { +impl> RaftStoreRouter for SimulateTransport { fn send_raft_msg(&self, msg: RaftMessage) -> Result<()> { filter_send(&self.filters, msg, |m| self.ch.send_raft_msg(m)) } - fn broadcast_normal(&self, _: impl FnMut() -> PeerMsg) {} + fn broadcast_normal(&self, _: impl FnMut() -> PeerMsg) {} } -impl> LocalReadRouter for SimulateTransport { +impl> LocalReadRouter for SimulateTransport { fn read( &mut self, + snap_ctx: Option, read_id: Option, req: RaftCmdRequest, - cb: Callback, + cb: Callback, ) -> RaftStoreResult<()> { - self.ch.read(read_id, req, cb) + self.ch.read(snap_ctx, read_id, req, cb) } fn release_snapshot_cache(&mut self) { diff --git a/components/test_raftstore/src/util.rs b/components/test_raftstore/src/util.rs index 02a74136bb6..019a7416a7a 100644 --- a/components/test_raftstore/src/util.rs +++ b/components/test_raftstore/src/util.rs @@ -13,15 +13,19 @@ use collections::HashMap; use encryption_export::{ data_key_manager_from_config, DataKeyManager, FileConfig, MasterKeyConfig, }; -use engine_rocks::{config::BlobRunMode, RocksEngine, RocksSnapshot, RocksStatistics}; +use engine_rocks::{ + config::BlobRunMode, RocksCompactedEvent, RocksEngine, RocksSnapshot, RocksStatistics, +}; use engine_test::raft::RaftTestEngine; use engine_traits::{ CfName, CfNamesExt, Engines, Iterable, KvEngine, Peekable, RaftEngineDebug, RaftEngineReadOnly, - CF_DEFAULT, CF_RAFT, + CF_DEFAULT, CF_RAFT, CF_WRITE, }; +use fail::fail_point; use file_system::IoRateLimiter; use futures::{executor::block_on, future::BoxFuture, StreamExt}; use grpcio::{ChannelBuilder, 
Environment}; +use hybrid_engine::HybridEngine; use kvproto::{ encryptionpb::EncryptionMethod, kvrpcpb::{PrewriteRequestPessimisticAction::*, *}, @@ -43,7 +47,8 @@ use raftstore::{ RaftRouterCompactedEventSender, Result, }; use rand::{seq::SliceRandom, RngCore}; -use server::common::ConfiguredRaftEngine; +use region_cache_memory_engine::RegionCacheMemoryEngine; +use server::common::{ConfiguredRaftEngine, KvEngineBuilder}; use tempfile::TempDir; use test_pd_client::TestPdClient; use tikv::{ @@ -60,7 +65,9 @@ use tikv_util::{ }; use txn_types::Key; -use crate::{Cluster, Config, RawEngine, ServerCluster, Simulator}; +use crate::{Cluster, Config, KvEngineWithRocks, RawEngine, ServerCluster, Simulator}; + +pub type HybridEngineImpl = HybridEngine; pub fn must_get( engine: &impl RawEngine, @@ -81,15 +88,14 @@ pub fn must_get( } debug!("last try to get {}", log_wrappers::hex_encode_upper(key)); let res = engine.get_value_cf(cf, &keys::data_key(key)).unwrap(); - if value.is_none() && res.is_none() - || value.is_some() && res.is_some() && value.unwrap() == &*res.unwrap() - { + if value == res.as_ref().map(|r| r.as_ref()) { return; } panic!( - "can't get value {:?} for key {}", + "can't get value {:?} for key {}, actual={:?}", value.map(escape), - log_wrappers::hex_encode_upper(key) + log_wrappers::hex_encode_upper(key), + res ) } @@ -174,9 +180,20 @@ pub fn new_tikv_config_with_api_ver(cluster_id: u64, api_ver: ApiVersion) -> Tik let mut cfg = TEST_CONFIG.clone(); cfg.server.cluster_id = cluster_id; cfg.storage.set_api_version(api_ver); + cfg.raft_store.pd_report_min_resolved_ts_interval = config(ReadableDuration::secs(1)); cfg } +fn config(interval: ReadableDuration) -> ReadableDuration { + fail_point!("mock_min_resolved_ts_interval", |_| { + ReadableDuration::millis(50) + }); + fail_point!("mock_min_resolved_ts_interval_disable", |_| { + ReadableDuration::millis(0) + }); + interval +} + // Create a base request. 
pub fn new_base_request(region_id: u64, epoch: RegionEpoch, read_quorum: bool) -> RaftCmdRequest { let mut req = RaftCmdRequest::default(); @@ -385,14 +402,20 @@ pub fn check_raft_cmd_request(cmd: &RaftCmdRequest) -> bool { is_read } -pub fn make_cb( +pub fn make_cb_rocks( cmd: &RaftCmdRequest, ) -> (Callback, future::Receiver) { + make_cb::(cmd) +} + +pub fn make_cb( + cmd: &RaftCmdRequest, +) -> (Callback, future::Receiver) { let is_read = check_raft_cmd_request(cmd); let (tx, rx) = future::bounded(1, future::WakePolicy::Immediately); let mut detector = CallbackLeakDetector::default(); let cb = if is_read { - Callback::read(Box::new(move |resp: ReadResponse| { + Callback::read(Box::new(move |resp: ReadResponse| { detector.called = true; // we don't care error actually. let _ = tx.send(resp.response); @@ -407,12 +430,12 @@ pub fn make_cb( (cb, rx) } -pub fn make_cb_ext( +pub fn make_cb_ext( cmd: &RaftCmdRequest, proposed: Option, committed: Option, -) -> (Callback, future::Receiver) { - let (cb, receiver) = make_cb(cmd); +) -> (Callback, future::Receiver) { + let (cb, receiver) = make_cb::(cmd); if let Callback::Write { cb, .. } = cb { (Callback::write_ext(cb, proposed, committed), receiver) } else { @@ -421,8 +444,8 @@ pub fn make_cb_ext( } // Issue a read request on the specified peer. 
-pub fn read_on_peer( - cluster: &mut Cluster, +pub fn read_on_peer>( + cluster: &mut Cluster, peer: metapb::Peer, region: metapb::Region, key: &[u8], @@ -436,11 +459,11 @@ pub fn read_on_peer( read_quorum, ); request.mut_header().set_peer(peer); - cluster.read(None, request, timeout) + cluster.read(None, None, request, timeout) } -pub fn async_read_on_peer( - cluster: &mut Cluster, +pub fn async_read_on_peer>( + cluster: &mut Cluster, peer: metapb::Peer, region: metapb::Region, key: &[u8], @@ -458,17 +481,20 @@ pub fn async_read_on_peer( request.mut_header().set_replica_read(replica_read); let (tx, mut rx) = future::bounded(1, future::WakePolicy::Immediately); let cb = Callback::read(Box::new(move |resp| drop(tx.send(resp.response)))); - cluster.sim.wl().async_read(node_id, None, request, cb); + cluster + .sim + .wl() + .async_read(None, node_id, None, request, cb); Box::pin(async move { let fut = rx.next(); fut.await.unwrap() }) } -pub fn batch_read_on_peer( - cluster: &mut Cluster, +pub fn batch_read_on_peer>( + cluster: &mut Cluster, requests: &[(metapb::Peer, metapb::Region)], -) -> Vec> { +) -> Vec> { let batch_id = Some(ThreadReadId::new()); let (tx, rx) = mpsc::sync_channel(3); let mut results = vec![]; @@ -489,7 +515,7 @@ pub fn batch_read_on_peer( cluster .sim .wl() - .async_read(node_id, batch_id.clone(), request, cb); + .async_read(None, node_id, batch_id.clone(), request, cb); len += 1; } while results.len() < len { @@ -499,8 +525,8 @@ pub fn batch_read_on_peer( results.into_iter().map(|resp| resp.1).collect() } -pub fn read_index_on_peer( - cluster: &mut Cluster, +pub fn read_index_on_peer>( + cluster: &mut Cluster, peer: metapb::Peer, region: metapb::Region, read_quorum: bool, @@ -513,11 +539,11 @@ pub fn read_index_on_peer( read_quorum, ); request.mut_header().set_peer(peer); - cluster.read(None, request, timeout) + cluster.read(None, None, request, timeout) } -pub fn async_read_index_on_peer( - cluster: &mut Cluster, +pub fn 
async_read_index_on_peer>( + cluster: &mut Cluster, peer: metapb::Peer, region: metapb::Region, key: &[u8], @@ -538,19 +564,22 @@ pub fn async_read_index_on_peer( request.mut_header().set_peer(peer); let (tx, mut rx) = future::bounded(1, future::WakePolicy::Immediately); let cb = Callback::read(Box::new(move |resp| drop(tx.send(resp.response)))); - cluster.sim.wl().async_read(node_id, None, request, cb); + cluster + .sim + .wl() + .async_read(None, node_id, None, request, cb); Box::pin(async move { let fut = rx.next(); fut.await.unwrap() }) } -pub fn async_command_on_node( - cluster: &mut Cluster, +pub fn async_command_on_node>( + cluster: &mut Cluster, node_id: u64, request: RaftCmdRequest, ) -> BoxFuture<'static, RaftCmdResponse> { - let (cb, mut rx) = make_cb(&request); + let (cb, mut rx) = make_cb::(&request); cluster .sim .rl() @@ -572,8 +601,8 @@ pub fn must_get_value(resp: &RaftCmdResponse) -> Vec { resp.get_responses()[0].get_get().get_value().to_vec() } -pub fn must_read_on_peer( - cluster: &mut Cluster, +pub fn must_read_on_peer>( + cluster: &mut Cluster, peer: metapb::Peer, region: metapb::Region, key: &[u8], @@ -591,8 +620,8 @@ pub fn must_read_on_peer( } } -pub fn must_error_read_on_peer( - cluster: &mut Cluster, +pub fn must_error_read_on_peer>( + cluster: &mut Cluster, peer: metapb::Peer, region: metapb::Region, key: &[u8], @@ -617,19 +646,22 @@ pub fn must_contains_error(resp: &RaftCmdResponse, msg: &str) { assert!(err_msg.contains(msg), "{:?}", resp); } -pub fn create_test_engine( +pub fn create_test_engine( // TODO: pass it in for all cases. 
- router: Option>, + router: Option>, limiter: Option>, cfg: &Config, ) -> ( - Engines, + Engines, Option>, TempDir, LazyWorker, Arc, Option>, -) { +) +where + EK: KvEngine + KvEngineBuilder, +{ let dir = test_util::temp_dir("test_cluster", cfg.prefer_mem); let mut cfg = cfg.clone(); cfg.storage.data_dir = dir.path().to_str().unwrap().to_string(); @@ -657,8 +689,9 @@ pub fn create_test_engine( })); } let factory = builder.build(); - let engine = factory.create_shared_db(dir.path()).unwrap(); - let engines = Engines::new(engine, raft_engine); + let disk_engine = factory.create_shared_db(dir.path()).unwrap(); + let kv_engine: EK = KvEngineBuilder::build(disk_engine); + let engines = Engines::new(kv_engine, raft_engine); ( engines, key_manager, @@ -669,11 +702,11 @@ pub fn create_test_engine( ) } -pub fn configure_for_request_snapshot(cluster: &mut Cluster) { +pub fn configure_for_request_snapshot(config: &mut Config) { // We don't want to generate snapshots due to compact log. - cluster.cfg.raft_store.raft_log_gc_threshold = 1000; - cluster.cfg.raft_store.raft_log_gc_count_limit = Some(1000); - cluster.cfg.raft_store.raft_log_gc_size_limit = Some(ReadableSize::mb(20)); + config.raft_store.raft_log_gc_threshold = 1000; + config.raft_store.raft_log_gc_count_limit = Some(1000); + config.raft_store.raft_log_gc_size_limit = Some(ReadableSize::mb(20)); } pub fn configure_for_hibernate(config: &mut Config) { @@ -735,8 +768,8 @@ pub fn configure_for_lease_read( election_timeout } -pub fn configure_for_enable_titan( - cluster: &mut Cluster, +pub fn configure_for_enable_titan>( + cluster: &mut Cluster, min_blob_size: ReadableSize, ) { cluster.cfg.rocksdb.titan.enabled = true; @@ -747,11 +780,15 @@ pub fn configure_for_enable_titan( cluster.cfg.rocksdb.defaultcf.titan.min_gc_batch_size = ReadableSize::kb(0); } -pub fn configure_for_disable_titan(cluster: &mut Cluster) { +pub fn configure_for_disable_titan>( + cluster: &mut Cluster, +) { cluster.cfg.rocksdb.titan.enabled = 
false; } -pub fn configure_for_encryption(cluster: &mut Cluster) { +pub fn configure_for_encryption>( + cluster: &mut Cluster, +) { let manifest_dir = Path::new(env!("CARGO_MANIFEST_DIR")); let master_key_file = manifest_dir.join("src/master-key.data"); @@ -765,8 +802,8 @@ pub fn configure_for_encryption(cluster: &mut Cluster) { } } -pub fn configure_for_causal_ts( - cluster: &mut Cluster, +pub fn configure_for_causal_ts>( + cluster: &mut Cluster, renew_interval: &str, renew_batch_min_size: u32, ) { @@ -776,16 +813,24 @@ pub fn configure_for_causal_ts( } /// Keep putting random kvs until specified size limit is reached. -pub fn put_till_size( - cluster: &mut Cluster, +pub fn put_till_size>( + cluster: &mut Cluster, limit: u64, range: &mut dyn Iterator, ) -> Vec { put_cf_till_size(cluster, CF_DEFAULT, limit, range) } -pub fn put_cf_till_size( - cluster: &mut Cluster, +pub fn put_till_count>( + cluster: &mut Cluster, + limit: u64, + range: &mut dyn Iterator, +) -> Vec { + put_cf_till_count(cluster, CF_WRITE, limit, range) +} + +pub fn put_cf_till_size>( + cluster: &mut Cluster, cf: &'static str, limit: u64, range: &mut dyn Iterator, @@ -816,6 +861,36 @@ pub fn put_cf_till_size( key.into_bytes() } +pub fn put_cf_till_count>( + cluster: &mut Cluster, + cf: &'static str, + limit: u64, + range: &mut dyn Iterator, +) -> Vec { + assert!(limit > 0); + let mut len = 0; + let mut rng = rand::thread_rng(); + let mut key = String::new(); + let mut value = vec![0; 64]; + while len < limit { + let batch_size = std::cmp::min(5, limit - len); + let mut reqs = vec![]; + for _ in 0..batch_size { + key.clear(); + let key_id = range.next().unwrap(); + write!(key, "{:09}", key_id).unwrap(); + rng.fill_bytes(&mut value); + reqs.push(new_put_cf_cmd(cf, key.as_bytes(), &value)); + } + len += batch_size; + cluster.batch_put(key.as_bytes(), reqs).unwrap(); + // Approximate size of memtable is inaccurate for small data, + // we flush it to SST so we can use the size properties instead. 
+ cluster.must_flush_cf(cf, true); + } + key.into_bytes() +} + pub fn new_mutation(op: Op, k: &[u8], v: &[u8]) -> Mutation { let mut mutation = Mutation::default(); mutation.set_op(op); @@ -959,6 +1034,7 @@ pub fn must_kv_prewrite_with( client: &TikvClient, ctx: Context, muts: Vec, + pessimistic_actions: Vec, pk: Vec, ts: u64, for_update_ts: u64, @@ -968,7 +1044,7 @@ pub fn must_kv_prewrite_with( let mut prewrite_req = PrewriteRequest::default(); prewrite_req.set_context(ctx); if for_update_ts != 0 { - prewrite_req.pessimistic_actions = vec![DoPessimisticCheck; muts.len()]; + prewrite_req.pessimistic_actions = pessimistic_actions; } prewrite_req.set_mutations(muts.into_iter().collect()); prewrite_req.primary_lock = pk; @@ -995,6 +1071,7 @@ pub fn try_kv_prewrite_with( client: &TikvClient, ctx: Context, muts: Vec, + pessimistic_actions: Vec, pk: Vec, ts: u64, for_update_ts: u64, @@ -1005,6 +1082,7 @@ pub fn try_kv_prewrite_with( client, ctx, muts, + pessimistic_actions, pk, ts, for_update_ts, @@ -1018,6 +1096,7 @@ pub fn try_kv_prewrite_with_impl( client: &TikvClient, ctx: Context, muts: Vec, + pessimistic_actions: Vec, pk: Vec, ts: u64, for_update_ts: u64, @@ -1027,7 +1106,7 @@ pub fn try_kv_prewrite_with_impl( let mut prewrite_req = PrewriteRequest::default(); prewrite_req.set_context(ctx); if for_update_ts != 0 { - prewrite_req.pessimistic_actions = vec![DoPessimisticCheck; muts.len()]; + prewrite_req.pessimistic_actions = pessimistic_actions; } prewrite_req.set_mutations(muts.into_iter().collect()); prewrite_req.primary_lock = pk; @@ -1047,7 +1126,7 @@ pub fn try_kv_prewrite( pk: Vec, ts: u64, ) -> PrewriteResponse { - try_kv_prewrite_with(client, ctx, muts, pk, ts, 0, false, false) + try_kv_prewrite_with(client, ctx, muts, vec![], pk, ts, 0, false, false) } pub fn try_kv_prewrite_pessimistic( @@ -1057,7 +1136,18 @@ pub fn try_kv_prewrite_pessimistic( pk: Vec, ts: u64, ) -> PrewriteResponse { - try_kv_prewrite_with(client, ctx, muts, pk, ts, ts, false, false) + 
let len = muts.len(); + try_kv_prewrite_with( + client, + ctx, + muts, + vec![DoPessimisticCheck; len], + pk, + ts, + ts, + false, + false, + ) } pub fn must_kv_prewrite( @@ -1067,7 +1157,7 @@ pub fn must_kv_prewrite( pk: Vec, ts: u64, ) { - must_kv_prewrite_with(client, ctx, muts, pk, ts, 0, false, false) + must_kv_prewrite_with(client, ctx, muts, vec![], pk, ts, 0, false, false) } pub fn must_kv_prewrite_pessimistic( @@ -1077,7 +1167,18 @@ pub fn must_kv_prewrite_pessimistic( pk: Vec, ts: u64, ) { - must_kv_prewrite_with(client, ctx, muts, pk, ts, ts, false, false) + let len = muts.len(); + must_kv_prewrite_with( + client, + ctx, + muts, + vec![DoPessimisticCheck; len], + pk, + ts, + ts, + false, + false, + ) } pub fn must_kv_commit( @@ -1233,6 +1334,50 @@ pub fn must_check_txn_status( resp } +pub fn must_kv_have_locks( + client: &TikvClient, + ctx: Context, + ts: u64, + start_key: &[u8], + end_key: &[u8], + expected_locks: &[( + // key + &[u8], + Op, + // start_ts + u64, + // for_update_ts + u64, + )], +) { + let mut req = ScanLockRequest::default(); + req.set_context(ctx); + req.set_limit(100); + req.set_start_key(start_key.to_vec()); + req.set_end_key(end_key.to_vec()); + req.set_max_version(ts); + let resp = client.kv_scan_lock(&req).unwrap(); + assert!(!resp.has_region_error(), "{:?}", resp.get_region_error()); + assert!(resp.error.is_none(), "{:?}", resp.get_error()); + + assert_eq!( + resp.locks.len(), + expected_locks.len(), + "lock count not match, expected: {:?}; got: {:?}", + expected_locks, + resp.locks + ); + + for (lock_info, (expected_key, expected_op, expected_start_ts, expected_for_update_ts)) in + resp.locks.into_iter().zip(expected_locks.iter()) + { + assert_eq!(lock_info.get_key(), *expected_key); + assert_eq!(lock_info.get_lock_type(), *expected_op); + assert_eq!(lock_info.get_lock_version(), *expected_start_ts); + assert_eq!(lock_info.get_lock_for_update_ts(), *expected_for_update_ts); + } +} + pub fn get_tso(pd_client: &TestPdClient) -> u64 
{ block_on(pd_client.get_tso()).unwrap().into_inner() } @@ -1394,7 +1539,11 @@ pub struct PeerClient { } impl PeerClient { - pub fn new(cluster: &Cluster, region_id: u64, peer: metapb::Peer) -> PeerClient { + pub fn new( + cluster: &Cluster>, + region_id: u64, + peer: metapb::Peer, + ) -> PeerClient { let cli = { let env = Arc::new(Environment::new(1)); let channel = @@ -1441,11 +1590,31 @@ impl PeerClient { } pub fn must_kv_prewrite_async_commit(&self, muts: Vec, pk: Vec, ts: u64) { - must_kv_prewrite_with(&self.cli, self.ctx.clone(), muts, pk, ts, 0, true, false) + must_kv_prewrite_with( + &self.cli, + self.ctx.clone(), + muts, + vec![], + pk, + ts, + 0, + true, + false, + ) } pub fn must_kv_prewrite_one_pc(&self, muts: Vec, pk: Vec, ts: u64) { - must_kv_prewrite_with(&self.cli, self.ctx.clone(), muts, pk, ts, 0, false, true) + must_kv_prewrite_with( + &self.cli, + self.ctx.clone(), + muts, + vec![], + pk, + ts, + 0, + false, + true, + ) } pub fn must_kv_commit(&self, keys: Vec>, start_ts: u64, commit_ts: u64) { @@ -1481,7 +1650,11 @@ pub fn peer_on_store(region: &metapb::Region, store_id: u64) -> metapb::Peer { .clone() } -pub fn wait_for_synced(cluster: &mut Cluster, node_id: u64, region_id: u64) { +pub fn wait_for_synced( + cluster: &mut Cluster>, + node_id: u64, + region_id: u64, +) { let mut storage = cluster .sim .read() @@ -1511,7 +1684,10 @@ pub fn wait_for_synced(cluster: &mut Cluster, node_id: u64, regio assert!(snapshot.ext().is_max_ts_synced()); } -pub fn test_delete_range(cluster: &mut Cluster, cf: CfName) { +pub fn test_delete_range>( + cluster: &mut Cluster, + cf: CfName, +) { let data_set: Vec<_> = (1..500) .map(|i| { ( @@ -1544,8 +1720,8 @@ pub fn test_delete_range(cluster: &mut Cluster, cf: CfName) { } } -pub fn put_with_timeout( - cluster: &mut Cluster, +pub fn put_with_timeout>( + cluster: &mut Cluster, node_id: u64, key: &[u8], value: &[u8], @@ -1562,7 +1738,11 @@ pub fn put_with_timeout( cluster.call_command_on_node(node_id, req, timeout) } 
-pub fn wait_down_peers(cluster: &Cluster, count: u64, peer: Option) { +pub fn wait_down_peers>( + cluster: &Cluster, + count: u64, + peer: Option, +) { let mut peers = cluster.get_down_peers(); for _ in 1..1000 { if peers.len() == count as usize && peer.as_ref().map_or(true, |p| peers.contains_key(p)) { diff --git a/components/test_storage/Cargo.toml b/components/test_storage/Cargo.toml index 17fa91f3005..97ea7bf0d24 100644 --- a/components/test_storage/Cargo.toml +++ b/components/test_storage/Cargo.toml @@ -23,6 +23,7 @@ test-engines-panic = [ [dependencies] api_version = { workspace = true } collections = { workspace = true } +engine_rocks = { workspace = true } futures = "0.3" kvproto = { workspace = true } pd_client = { workspace = true } diff --git a/components/test_storage/src/assert_storage.rs b/components/test_storage/src/assert_storage.rs index 3a641a322a2..d4cdbdb2698 100644 --- a/components/test_storage/src/assert_storage.rs +++ b/components/test_storage/src/assert_storage.rs @@ -1,6 +1,7 @@ // Copyright 2017 TiKV Project Authors. Licensed under Apache-2.0. 
use api_version::{ApiV1, KvFormat}; +use engine_rocks::RocksEngine as RocksDb; use kvproto::{ kvrpcpb::{Context, KeyRange, LockInfo}, metapb, @@ -44,11 +45,11 @@ impl AssertionStorage { } } -impl AssertionStorage { +impl AssertionStorage, F> { pub fn new_raft_storage_with_store_count( count: usize, key: &str, - ) -> (Cluster, Self) { + ) -> (Cluster>, Self) { let (cluster, store, ctx) = new_raft_storage_with_store_count::(count, key); let storage = Self { store, ctx }; (cluster, storage) @@ -56,7 +57,7 @@ impl AssertionStorage { pub fn update_with_key_byte( &mut self, - cluster: &mut Cluster, + cluster: &mut Cluster>, key: &[u8], ) -> metapb::Region { // ensure the leader of range which contains current key has been elected @@ -79,7 +80,7 @@ impl AssertionStorage { pub fn delete_ok_for_cluster( &mut self, - cluster: &mut Cluster, + cluster: &mut Cluster>, key: &[u8], start_ts: impl Into, commit_ts: impl Into, @@ -98,7 +99,7 @@ impl AssertionStorage { fn get_from_cluster( &mut self, - cluster: &mut Cluster, + cluster: &mut Cluster>, key: &[u8], ts: impl Into, ) -> Option { @@ -116,7 +117,7 @@ impl AssertionStorage { pub fn get_none_from_cluster( &mut self, - cluster: &mut Cluster, + cluster: &mut Cluster>, key: &[u8], ts: impl Into, ) { @@ -125,7 +126,7 @@ impl AssertionStorage { pub fn put_ok_for_cluster( &mut self, - cluster: &mut Cluster, + cluster: &mut Cluster>, key: &[u8], value: &[u8], start_ts: impl Into, @@ -138,7 +139,7 @@ impl AssertionStorage { pub fn batch_put_ok_for_cluster<'a>( &mut self, - cluster: &mut Cluster, + cluster: &mut Cluster>, keys: &[impl AsRef<[u8]>], vals: impl Iterator, start_ts: impl Into, @@ -162,7 +163,7 @@ impl AssertionStorage { fn two_pc_ok_for_cluster( &mut self, - cluster: &mut Cluster, + cluster: &mut Cluster>, prewrite_mutations: Vec, key: &[u8], commit_keys: Vec, @@ -206,7 +207,7 @@ impl AssertionStorage { pub fn gc_ok_for_cluster( &mut self, - cluster: &mut Cluster, + cluster: &mut Cluster>, region_key: &[u8], mut region: 
metapb::Region, safe_point: impl Into, @@ -225,7 +226,7 @@ impl AssertionStorage { pub fn test_txn_store_gc3_for_cluster( &mut self, - cluster: &mut Cluster, + cluster: &mut Cluster>, key_prefix: u8, ) { let key_len = 10_000; diff --git a/components/test_storage/src/util.rs b/components/test_storage/src/util.rs index e91125ba001..54f82375afe 100644 --- a/components/test_storage/src/util.rs +++ b/components/test_storage/src/util.rs @@ -1,6 +1,7 @@ // Copyright 2017 TiKV Project Authors. Licensed under Apache-2.0. use api_version::KvFormat; +use engine_rocks::RocksEngine; use kvproto::kvrpcpb::Context; use test_raftstore::{new_server_cluster, Cluster, ServerCluster, SimulateEngine}; use tikv_util::HandyRwLock; @@ -55,7 +56,11 @@ macro_rules! follower_raft_engine { pub fn new_raft_engine( count: usize, key: &str, -) -> (Cluster, SimulateEngine, Context) { +) -> ( + Cluster>, + SimulateEngine, + Context, +) { let mut cluster = new_server_cluster(0, count); let (engine, ctx) = prepare_raft_engine!(cluster, key); (cluster, engine, ctx) @@ -65,8 +70,8 @@ pub fn new_raft_storage_with_store_count( count: usize, key: &str, ) -> ( - Cluster, - SyncTestStorage, + Cluster>, + SyncTestStorage, F>, Context, ) { let (cluster, engine, ctx) = new_raft_engine(count, key); diff --git a/components/tidb_query_datatype/src/codec/collation/encoding/gbk.rs b/components/tidb_query_datatype/src/codec/collation/encoding/gbk.rs index 6f27475ff2c..137d9dd22c3 100644 --- a/components/tidb_query_datatype/src/codec/collation/encoding/gbk.rs +++ b/components/tidb_query_datatype/src/codec/collation/encoding/gbk.rs @@ -28,45 +28,39 @@ impl Encoding for EncodingGbk { #[inline] // GBK lower and upper follows https://dev.mysql.com/worklog/task/?id=4583. 
fn lower(s: &str, writer: BytesWriter) -> BytesGuard { - let res = s.chars().flat_map(|ch| { - let c = ch as u32; - match c { - 0x216A..=0x216B => char::from_u32(c), - _ => char::from_u32(c).unwrap().to_lowercase().next(), - } + let res = s.chars().flat_map(|ch| match ch as u32 { + 0x216A..=0x216B => Some(ch), + _ => unicode_to_lower(ch), }); writer.write_from_char_iter(res) } #[inline] fn upper(s: &str, writer: BytesWriter) -> BytesGuard { - let res = s.chars().flat_map(|ch| { - let c = ch as u32; - match c { - 0x00E0..=0x00E1 - | 0x00E8..=0x00EA - | 0x00EC..=0x00ED - | 0x00F2..=0x00F3 - | 0x00F9..=0x00FA - | 0x00FC - | 0x0101 - | 0x0113 - | 0x011B - | 0x012B - | 0x0144 - | 0x0148 - | 0x014D - | 0x016B - | 0x01CE - | 0x01D0 - | 0x01D2 - | 0x01D4 - | 0x01D6 - | 0x01D8 - | 0x01DA - | 0x01DC => char::from_u32(c), - _ => char::from_u32(c).unwrap().to_uppercase().next(), - } + let res = s.chars().flat_map(|ch| match ch as u32 { + 0x00E0..=0x00E1 + | 0x00E8..=0x00EA + | 0x00EC..=0x00ED + | 0x00F2..=0x00F3 + | 0x00F9..=0x00FA + | 0x00FC + | 0x0101 + | 0x0113 + | 0x011B + | 0x012B + | 0x0144 + | 0x0148 + | 0x014D + | 0x016B + | 0x01CE + | 0x01D0 + | 0x01D2 + | 0x01D4 + | 0x01D6 + | 0x01D8 + | 0x01DA + | 0x01DC => Some(ch), + _ => unicode_to_upper(ch), }); writer.write_from_char_iter(res) } diff --git a/components/tidb_query_datatype/src/codec/collation/encoding/mod.rs b/components/tidb_query_datatype/src/codec/collation/encoding/mod.rs index b2434105ce5..268b11aad41 100644 --- a/components/tidb_query_datatype/src/codec/collation/encoding/mod.rs +++ b/components/tidb_query_datatype/src/codec/collation/encoding/mod.rs @@ -2,12 +2,14 @@ mod ascii; mod gbk; +mod unicode_letter; mod utf8; use std::str; pub use ascii::*; pub use gbk::*; +pub use unicode_letter::*; pub use utf8::*; use super::Encoding; diff --git a/components/tidb_query_datatype/src/codec/collation/encoding/unicode_letter.rs b/components/tidb_query_datatype/src/codec/collation/encoding/unicode_letter.rs new file 
mode 100644 index 00000000000..e83af2723c5 --- /dev/null +++ b/components/tidb_query_datatype/src/codec/collation/encoding/unicode_letter.rs @@ -0,0 +1,550 @@ +// Copyright 2023 TiKV Project Authors. Licensed under Apache-2.0. + +/// In order to keep the same behavoir as TiDB that uses go standard library to +/// implement lower and upper functions. Below code is ported from https://github.com/golang/go/blob/go1.21.3/src/unicode/letter.go. +const UPPER_CASE: usize = 0; +const LOWER_CASE: usize = 1; +const TITLE_CASE: usize = 2; +const MAX_CASE: usize = 3; + +const MAX_ASCII: i32 = 0x7F; +const MAX_RUNE: i32 = 0x10FFFF; +const REPLACEMENT_CHAR: i32 = 0xFFFD; + +const UPPER_LOWER: i32 = MAX_RUNE + 1; + +static CASE_TABLE: &[(i32, i32, [i32; MAX_CASE])] = &[ + (0x0041, 0x005A, [0, 32, 0]), + (0x0061, 0x007A, [-32, 0, -32]), + (0x00B5, 0x00B5, [743, 0, 743]), + (0x00C0, 0x00D6, [0, 32, 0]), + (0x00D8, 0x00DE, [0, 32, 0]), + (0x00E0, 0x00F6, [-32, 0, -32]), + (0x00F8, 0x00FE, [-32, 0, -32]), + (0x00FF, 0x00FF, [121, 0, 121]), + (0x0100, 0x012F, [UPPER_LOWER, UPPER_LOWER, UPPER_LOWER]), + (0x0130, 0x0130, [0, -199, 0]), + (0x0131, 0x0131, [-232, 0, -232]), + (0x0132, 0x0137, [UPPER_LOWER, UPPER_LOWER, UPPER_LOWER]), + (0x0139, 0x0148, [UPPER_LOWER, UPPER_LOWER, UPPER_LOWER]), + (0x014A, 0x0177, [UPPER_LOWER, UPPER_LOWER, UPPER_LOWER]), + (0x0178, 0x0178, [0, -121, 0]), + (0x0179, 0x017E, [UPPER_LOWER, UPPER_LOWER, UPPER_LOWER]), + (0x017F, 0x017F, [-300, 0, -300]), + (0x0180, 0x0180, [195, 0, 195]), + (0x0181, 0x0181, [0, 210, 0]), + (0x0182, 0x0185, [UPPER_LOWER, UPPER_LOWER, UPPER_LOWER]), + (0x0186, 0x0186, [0, 206, 0]), + (0x0187, 0x0188, [UPPER_LOWER, UPPER_LOWER, UPPER_LOWER]), + (0x0189, 0x018A, [0, 205, 0]), + (0x018B, 0x018C, [UPPER_LOWER, UPPER_LOWER, UPPER_LOWER]), + (0x018E, 0x018E, [0, 79, 0]), + (0x018F, 0x018F, [0, 202, 0]), + (0x0190, 0x0190, [0, 203, 0]), + (0x0191, 0x0192, [UPPER_LOWER, UPPER_LOWER, UPPER_LOWER]), + (0x0193, 0x0193, [0, 205, 0]), + 
(0x0194, 0x0194, [0, 207, 0]), + (0x0195, 0x0195, [97, 0, 97]), + (0x0196, 0x0196, [0, 211, 0]), + (0x0197, 0x0197, [0, 209, 0]), + (0x0198, 0x0199, [UPPER_LOWER, UPPER_LOWER, UPPER_LOWER]), + (0x019A, 0x019A, [163, 0, 163]), + (0x019C, 0x019C, [0, 211, 0]), + (0x019D, 0x019D, [0, 213, 0]), + (0x019E, 0x019E, [130, 0, 130]), + (0x019F, 0x019F, [0, 214, 0]), + (0x01A0, 0x01A5, [UPPER_LOWER, UPPER_LOWER, UPPER_LOWER]), + (0x01A6, 0x01A6, [0, 218, 0]), + (0x01A7, 0x01A8, [UPPER_LOWER, UPPER_LOWER, UPPER_LOWER]), + (0x01A9, 0x01A9, [0, 218, 0]), + (0x01AC, 0x01AD, [UPPER_LOWER, UPPER_LOWER, UPPER_LOWER]), + (0x01AE, 0x01AE, [0, 218, 0]), + (0x01AF, 0x01B0, [UPPER_LOWER, UPPER_LOWER, UPPER_LOWER]), + (0x01B1, 0x01B2, [0, 217, 0]), + (0x01B3, 0x01B6, [UPPER_LOWER, UPPER_LOWER, UPPER_LOWER]), + (0x01B7, 0x01B7, [0, 219, 0]), + (0x01B8, 0x01B9, [UPPER_LOWER, UPPER_LOWER, UPPER_LOWER]), + (0x01BC, 0x01BD, [UPPER_LOWER, UPPER_LOWER, UPPER_LOWER]), + (0x01BF, 0x01BF, [56, 0, 56]), + (0x01C4, 0x01C4, [0, 2, 1]), + (0x01C5, 0x01C5, [-1, 1, 0]), + (0x01C6, 0x01C6, [-2, 0, -1]), + (0x01C7, 0x01C7, [0, 2, 1]), + (0x01C8, 0x01C8, [-1, 1, 0]), + (0x01C9, 0x01C9, [-2, 0, -1]), + (0x01CA, 0x01CA, [0, 2, 1]), + (0x01CB, 0x01CB, [-1, 1, 0]), + (0x01CC, 0x01CC, [-2, 0, -1]), + (0x01CD, 0x01DC, [UPPER_LOWER, UPPER_LOWER, UPPER_LOWER]), + (0x01DD, 0x01DD, [-79, 0, -79]), + (0x01DE, 0x01EF, [UPPER_LOWER, UPPER_LOWER, UPPER_LOWER]), + (0x01F1, 0x01F1, [0, 2, 1]), + (0x01F2, 0x01F2, [-1, 1, 0]), + (0x01F3, 0x01F3, [-2, 0, -1]), + (0x01F4, 0x01F5, [UPPER_LOWER, UPPER_LOWER, UPPER_LOWER]), + (0x01F6, 0x01F6, [0, -97, 0]), + (0x01F7, 0x01F7, [0, -56, 0]), + (0x01F8, 0x021F, [UPPER_LOWER, UPPER_LOWER, UPPER_LOWER]), + (0x0220, 0x0220, [0, -130, 0]), + (0x0222, 0x0233, [UPPER_LOWER, UPPER_LOWER, UPPER_LOWER]), + (0x023A, 0x023A, [0, 10795, 0]), + (0x023B, 0x023C, [UPPER_LOWER, UPPER_LOWER, UPPER_LOWER]), + (0x023D, 0x023D, [0, -163, 0]), + (0x023E, 0x023E, [0, 10792, 0]), + (0x023F, 0x0240, 
[10815, 0, 10815]), + (0x0241, 0x0242, [UPPER_LOWER, UPPER_LOWER, UPPER_LOWER]), + (0x0243, 0x0243, [0, -195, 0]), + (0x0244, 0x0244, [0, 69, 0]), + (0x0245, 0x0245, [0, 71, 0]), + (0x0246, 0x024F, [UPPER_LOWER, UPPER_LOWER, UPPER_LOWER]), + (0x0250, 0x0250, [10783, 0, 10783]), + (0x0251, 0x0251, [10780, 0, 10780]), + (0x0252, 0x0252, [10782, 0, 10782]), + (0x0253, 0x0253, [-210, 0, -210]), + (0x0254, 0x0254, [-206, 0, -206]), + (0x0256, 0x0257, [-205, 0, -205]), + (0x0259, 0x0259, [-202, 0, -202]), + (0x025B, 0x025B, [-203, 0, -203]), + (0x025C, 0x025C, [42319, 0, 42319]), + (0x0260, 0x0260, [-205, 0, -205]), + (0x0261, 0x0261, [42315, 0, 42315]), + (0x0263, 0x0263, [-207, 0, -207]), + (0x0265, 0x0265, [42280, 0, 42280]), + (0x0266, 0x0266, [42308, 0, 42308]), + (0x0268, 0x0268, [-209, 0, -209]), + (0x0269, 0x0269, [-211, 0, -211]), + (0x026A, 0x026A, [42308, 0, 42308]), + (0x026B, 0x026B, [10743, 0, 10743]), + (0x026C, 0x026C, [42305, 0, 42305]), + (0x026F, 0x026F, [-211, 0, -211]), + (0x0271, 0x0271, [10749, 0, 10749]), + (0x0272, 0x0272, [-213, 0, -213]), + (0x0275, 0x0275, [-214, 0, -214]), + (0x027D, 0x027D, [10727, 0, 10727]), + (0x0280, 0x0280, [-218, 0, -218]), + (0x0282, 0x0282, [42307, 0, 42307]), + (0x0283, 0x0283, [-218, 0, -218]), + (0x0287, 0x0287, [42282, 0, 42282]), + (0x0288, 0x0288, [-218, 0, -218]), + (0x0289, 0x0289, [-69, 0, -69]), + (0x028A, 0x028B, [-217, 0, -217]), + (0x028C, 0x028C, [-71, 0, -71]), + (0x0292, 0x0292, [-219, 0, -219]), + (0x029D, 0x029D, [42261, 0, 42261]), + (0x029E, 0x029E, [42258, 0, 42258]), + (0x0345, 0x0345, [84, 0, 84]), + (0x0370, 0x0373, [UPPER_LOWER, UPPER_LOWER, UPPER_LOWER]), + (0x0376, 0x0377, [UPPER_LOWER, UPPER_LOWER, UPPER_LOWER]), + (0x037B, 0x037D, [130, 0, 130]), + (0x037F, 0x037F, [0, 116, 0]), + (0x0386, 0x0386, [0, 38, 0]), + (0x0388, 0x038A, [0, 37, 0]), + (0x038C, 0x038C, [0, 64, 0]), + (0x038E, 0x038F, [0, 63, 0]), + (0x0391, 0x03A1, [0, 32, 0]), + (0x03A3, 0x03AB, [0, 32, 0]), + (0x03AC, 0x03AC, 
[-38, 0, -38]), + (0x03AD, 0x03AF, [-37, 0, -37]), + (0x03B1, 0x03C1, [-32, 0, -32]), + (0x03C2, 0x03C2, [-31, 0, -31]), + (0x03C3, 0x03CB, [-32, 0, -32]), + (0x03CC, 0x03CC, [-64, 0, -64]), + (0x03CD, 0x03CE, [-63, 0, -63]), + (0x03CF, 0x03CF, [0, 8, 0]), + (0x03D0, 0x03D0, [-62, 0, -62]), + (0x03D1, 0x03D1, [-57, 0, -57]), + (0x03D5, 0x03D5, [-47, 0, -47]), + (0x03D6, 0x03D6, [-54, 0, -54]), + (0x03D7, 0x03D7, [-8, 0, -8]), + (0x03D8, 0x03EF, [UPPER_LOWER, UPPER_LOWER, UPPER_LOWER]), + (0x03F0, 0x03F0, [-86, 0, -86]), + (0x03F1, 0x03F1, [-80, 0, -80]), + (0x03F2, 0x03F2, [7, 0, 7]), + (0x03F3, 0x03F3, [-116, 0, -116]), + (0x03F4, 0x03F4, [0, -60, 0]), + (0x03F5, 0x03F5, [-96, 0, -96]), + (0x03F7, 0x03F8, [UPPER_LOWER, UPPER_LOWER, UPPER_LOWER]), + (0x03F9, 0x03F9, [0, -7, 0]), + (0x03FA, 0x03FB, [UPPER_LOWER, UPPER_LOWER, UPPER_LOWER]), + (0x03FD, 0x03FF, [0, -130, 0]), + (0x0400, 0x040F, [0, 80, 0]), + (0x0410, 0x042F, [0, 32, 0]), + (0x0430, 0x044F, [-32, 0, -32]), + (0x0450, 0x045F, [-80, 0, -80]), + (0x0460, 0x0481, [UPPER_LOWER, UPPER_LOWER, UPPER_LOWER]), + (0x048A, 0x04BF, [UPPER_LOWER, UPPER_LOWER, UPPER_LOWER]), + (0x04C0, 0x04C0, [0, 15, 0]), + (0x04C1, 0x04CE, [UPPER_LOWER, UPPER_LOWER, UPPER_LOWER]), + (0x04CF, 0x04CF, [-15, 0, -15]), + (0x04D0, 0x052F, [UPPER_LOWER, UPPER_LOWER, UPPER_LOWER]), + (0x0531, 0x0556, [0, 48, 0]), + (0x0561, 0x0586, [-48, 0, -48]), + (0x10A0, 0x10C5, [0, 7264, 0]), + (0x10C7, 0x10C7, [0, 7264, 0]), + (0x10CD, 0x10CD, [0, 7264, 0]), + (0x10D0, 0x10FA, [3008, 0, 0]), + (0x10FD, 0x10FF, [3008, 0, 0]), + (0x13A0, 0x13EF, [0, 38864, 0]), + (0x13F0, 0x13F5, [0, 8, 0]), + (0x13F8, 0x13FD, [-8, 0, -8]), + (0x1C80, 0x1C80, [-6254, 0, -6254]), + (0x1C81, 0x1C81, [-6253, 0, -6253]), + (0x1C82, 0x1C82, [-6244, 0, -6244]), + (0x1C83, 0x1C84, [-6242, 0, -6242]), + (0x1C85, 0x1C85, [-6243, 0, -6243]), + (0x1C86, 0x1C86, [-6236, 0, -6236]), + (0x1C87, 0x1C87, [-6181, 0, -6181]), + (0x1C88, 0x1C88, [35266, 0, 35266]), + (0x1C90, 0x1CBA, 
[0, -3008, 0]), + (0x1CBD, 0x1CBF, [0, -3008, 0]), + (0x1D79, 0x1D79, [35332, 0, 35332]), + (0x1D7D, 0x1D7D, [3814, 0, 3814]), + (0x1D8E, 0x1D8E, [35384, 0, 35384]), + (0x1E00, 0x1E95, [UPPER_LOWER, UPPER_LOWER, UPPER_LOWER]), + (0x1E9B, 0x1E9B, [-59, 0, -59]), + (0x1E9E, 0x1E9E, [0, -7615, 0]), + (0x1EA0, 0x1EFF, [UPPER_LOWER, UPPER_LOWER, UPPER_LOWER]), + (0x1F00, 0x1F07, [8, 0, 8]), + (0x1F08, 0x1F0F, [0, -8, 0]), + (0x1F10, 0x1F15, [8, 0, 8]), + (0x1F18, 0x1F1D, [0, -8, 0]), + (0x1F20, 0x1F27, [8, 0, 8]), + (0x1F28, 0x1F2F, [0, -8, 0]), + (0x1F30, 0x1F37, [8, 0, 8]), + (0x1F38, 0x1F3F, [0, -8, 0]), + (0x1F40, 0x1F45, [8, 0, 8]), + (0x1F48, 0x1F4D, [0, -8, 0]), + (0x1F51, 0x1F51, [8, 0, 8]), + (0x1F53, 0x1F53, [8, 0, 8]), + (0x1F55, 0x1F55, [8, 0, 8]), + (0x1F57, 0x1F57, [8, 0, 8]), + (0x1F59, 0x1F59, [0, -8, 0]), + (0x1F5B, 0x1F5B, [0, -8, 0]), + (0x1F5D, 0x1F5D, [0, -8, 0]), + (0x1F5F, 0x1F5F, [0, -8, 0]), + (0x1F60, 0x1F67, [8, 0, 8]), + (0x1F68, 0x1F6F, [0, -8, 0]), + (0x1F70, 0x1F71, [74, 0, 74]), + (0x1F72, 0x1F75, [86, 0, 86]), + (0x1F76, 0x1F77, [100, 0, 100]), + (0x1F78, 0x1F79, [128, 0, 128]), + (0x1F7A, 0x1F7B, [112, 0, 112]), + (0x1F7C, 0x1F7D, [126, 0, 126]), + (0x1F80, 0x1F87, [8, 0, 8]), + (0x1F88, 0x1F8F, [0, -8, 0]), + (0x1F90, 0x1F97, [8, 0, 8]), + (0x1F98, 0x1F9F, [0, -8, 0]), + (0x1FA0, 0x1FA7, [8, 0, 8]), + (0x1FA8, 0x1FAF, [0, -8, 0]), + (0x1FB0, 0x1FB1, [8, 0, 8]), + (0x1FB3, 0x1FB3, [9, 0, 9]), + (0x1FB8, 0x1FB9, [0, -8, 0]), + (0x1FBA, 0x1FBB, [0, -74, 0]), + (0x1FBC, 0x1FBC, [0, -9, 0]), + (0x1FBE, 0x1FBE, [-7205, 0, -7205]), + (0x1FC3, 0x1FC3, [9, 0, 9]), + (0x1FC8, 0x1FCB, [0, -86, 0]), + (0x1FCC, 0x1FCC, [0, -9, 0]), + (0x1FD0, 0x1FD1, [8, 0, 8]), + (0x1FD8, 0x1FD9, [0, -8, 0]), + (0x1FDA, 0x1FDB, [0, -100, 0]), + (0x1FE0, 0x1FE1, [8, 0, 8]), + (0x1FE5, 0x1FE5, [7, 0, 7]), + (0x1FE8, 0x1FE9, [0, -8, 0]), + (0x1FEA, 0x1FEB, [0, -112, 0]), + (0x1FEC, 0x1FEC, [0, -7, 0]), + (0x1FF3, 0x1FF3, [9, 0, 9]), + (0x1FF8, 0x1FF9, [0, -128, 0]), 
+ (0x1FFA, 0x1FFB, [0, -126, 0]), + (0x1FFC, 0x1FFC, [0, -9, 0]), + (0x2126, 0x2126, [0, -7517, 0]), + (0x212A, 0x212A, [0, -8383, 0]), + (0x212B, 0x212B, [0, -8262, 0]), + (0x2132, 0x2132, [0, 28, 0]), + (0x214E, 0x214E, [-28, 0, -28]), + (0x2160, 0x216F, [0, 16, 0]), + (0x2170, 0x217F, [-16, 0, -16]), + (0x2183, 0x2184, [UPPER_LOWER, UPPER_LOWER, UPPER_LOWER]), + (0x24B6, 0x24CF, [0, 26, 0]), + (0x24D0, 0x24E9, [-26, 0, -26]), + (0x2C00, 0x2C2F, [0, 48, 0]), + (0x2C30, 0x2C5F, [-48, 0, -48]), + (0x2C60, 0x2C61, [UPPER_LOWER, UPPER_LOWER, UPPER_LOWER]), + (0x2C62, 0x2C62, [0, -10743, 0]), + (0x2C63, 0x2C63, [0, -3814, 0]), + (0x2C64, 0x2C64, [0, -10727, 0]), + (0x2C65, 0x2C65, [-10795, 0, -10795]), + (0x2C66, 0x2C66, [-10792, 0, -10792]), + (0x2C67, 0x2C6C, [UPPER_LOWER, UPPER_LOWER, UPPER_LOWER]), + (0x2C6D, 0x2C6D, [0, -10780, 0]), + (0x2C6E, 0x2C6E, [0, -10749, 0]), + (0x2C6F, 0x2C6F, [0, -10783, 0]), + (0x2C70, 0x2C70, [0, -10782, 0]), + (0x2C72, 0x2C73, [UPPER_LOWER, UPPER_LOWER, UPPER_LOWER]), + (0x2C75, 0x2C76, [UPPER_LOWER, UPPER_LOWER, UPPER_LOWER]), + (0x2C7E, 0x2C7F, [0, -10815, 0]), + (0x2C80, 0x2CE3, [UPPER_LOWER, UPPER_LOWER, UPPER_LOWER]), + (0x2CEB, 0x2CEE, [UPPER_LOWER, UPPER_LOWER, UPPER_LOWER]), + (0x2CF2, 0x2CF3, [UPPER_LOWER, UPPER_LOWER, UPPER_LOWER]), + (0x2D00, 0x2D25, [-7264, 0, -7264]), + (0x2D27, 0x2D27, [-7264, 0, -7264]), + (0x2D2D, 0x2D2D, [-7264, 0, -7264]), + (0xA640, 0xA66D, [UPPER_LOWER, UPPER_LOWER, UPPER_LOWER]), + (0xA680, 0xA69B, [UPPER_LOWER, UPPER_LOWER, UPPER_LOWER]), + (0xA722, 0xA72F, [UPPER_LOWER, UPPER_LOWER, UPPER_LOWER]), + (0xA732, 0xA76F, [UPPER_LOWER, UPPER_LOWER, UPPER_LOWER]), + (0xA779, 0xA77C, [UPPER_LOWER, UPPER_LOWER, UPPER_LOWER]), + (0xA77D, 0xA77D, [0, -35332, 0]), + (0xA77E, 0xA787, [UPPER_LOWER, UPPER_LOWER, UPPER_LOWER]), + (0xA78B, 0xA78C, [UPPER_LOWER, UPPER_LOWER, UPPER_LOWER]), + (0xA78D, 0xA78D, [0, -42280, 0]), + (0xA790, 0xA793, [UPPER_LOWER, UPPER_LOWER, UPPER_LOWER]), + (0xA794, 0xA794, [48, 0, 
48]), + (0xA796, 0xA7A9, [UPPER_LOWER, UPPER_LOWER, UPPER_LOWER]), + (0xA7AA, 0xA7AA, [0, -42308, 0]), + (0xA7AB, 0xA7AB, [0, -42319, 0]), + (0xA7AC, 0xA7AC, [0, -42315, 0]), + (0xA7AD, 0xA7AD, [0, -42305, 0]), + (0xA7AE, 0xA7AE, [0, -42308, 0]), + (0xA7B0, 0xA7B0, [0, -42258, 0]), + (0xA7B1, 0xA7B1, [0, -42282, 0]), + (0xA7B2, 0xA7B2, [0, -42261, 0]), + (0xA7B3, 0xA7B3, [0, 928, 0]), + (0xA7B4, 0xA7C3, [UPPER_LOWER, UPPER_LOWER, UPPER_LOWER]), + (0xA7C4, 0xA7C4, [0, -48, 0]), + (0xA7C5, 0xA7C5, [0, -42307, 0]), + (0xA7C6, 0xA7C6, [0, -35384, 0]), + (0xA7C7, 0xA7CA, [UPPER_LOWER, UPPER_LOWER, UPPER_LOWER]), + (0xA7D0, 0xA7D1, [UPPER_LOWER, UPPER_LOWER, UPPER_LOWER]), + (0xA7D6, 0xA7D9, [UPPER_LOWER, UPPER_LOWER, UPPER_LOWER]), + (0xA7F5, 0xA7F6, [UPPER_LOWER, UPPER_LOWER, UPPER_LOWER]), + (0xAB53, 0xAB53, [-928, 0, -928]), + (0xAB70, 0xABBF, [-38864, 0, -38864]), + (0xFF21, 0xFF3A, [0, 32, 0]), + (0xFF41, 0xFF5A, [-32, 0, -32]), + (0x10400, 0x10427, [0, 40, 0]), + (0x10428, 0x1044F, [-40, 0, -40]), + (0x104B0, 0x104D3, [0, 40, 0]), + (0x104D8, 0x104FB, [-40, 0, -40]), + (0x10570, 0x1057A, [0, 39, 0]), + (0x1057C, 0x1058A, [0, 39, 0]), + (0x1058C, 0x10592, [0, 39, 0]), + (0x10594, 0x10595, [0, 39, 0]), + (0x10597, 0x105A1, [-39, 0, -39]), + (0x105A3, 0x105B1, [-39, 0, -39]), + (0x105B3, 0x105B9, [-39, 0, -39]), + (0x105BB, 0x105BC, [-39, 0, -39]), + (0x10C80, 0x10CB2, [0, 64, 0]), + (0x10CC0, 0x10CF2, [-64, 0, -64]), + (0x118A0, 0x118BF, [0, 32, 0]), + (0x118C0, 0x118DF, [-32, 0, -32]), + (0x16E40, 0x16E5F, [0, 32, 0]), + (0x16E60, 0x16E7F, [-32, 0, -32]), + (0x1E900, 0x1E921, [0, 34, 0]), + (0x1E922, 0x1E943, [-34, 0, -34]), +]; + +fn to_case(case: usize, ch: i32) -> i32 { + if case >= MAX_CASE { + return REPLACEMENT_CHAR; + } + // binary search over ranges + let mut lo = 0; + let mut hi = CASE_TABLE.len(); + while lo < hi { + let m = lo + (hi - lo) / 2; + let cr = CASE_TABLE[m]; + if cr.0 <= ch && ch <= cr.1 { + let delta = cr.2[case]; + if delta > MAX_RUNE { + // 
In an Upper-Lower sequence, which always starts with + // an UpperCase letter, the real deltas always look like: + // {0, 1, 0} UpperCase (Lower is next) + // {-1, 0, -1} LowerCase (Upper, Title are previous) + // The characters at even offsets from the beginning of the + // sequence are upper case; the ones at odd offsets are lower. + // The correct mapping can be done by clearing or setting the low + // bit in the sequence offset. + // The constants UpperCase and TitleCase are even while LowerCase + // is odd so we take the low bit from case. + return cr.0 + (((ch - cr.0) & !1) | (case as i32 & 1)); + } + return ch + delta; + } + if ch < cr.0 { + hi = m; + } else { + lo = m + 1; + } + } + ch +} + +pub fn unicode_to_upper(ch: char) -> Option { + let mut r = ch as i32; + if r < MAX_ASCII { + if 'a' as i32 <= r && r <= 'z' as i32 { + r -= ('a' as i32) - ('A' as i32); + } + char::from_u32(r as u32) + } else { + char::from_u32(to_case(UPPER_CASE, r) as u32) + } +} + +pub fn unicode_to_lower(ch: char) -> Option { + let mut r = ch as i32; + if r < MAX_ASCII { + if 'A' as i32 <= r && r <= 'Z' as i32 { + r += ('a' as i32) - ('A' as i32); + } + char::from_u32(r as u32) + } else { + char::from_u32(to_case(LOWER_CASE, r) as u32) + } +} + +pub fn unicode_to_title(ch: char) -> Option { + let mut r = ch as i32; + if r < MAX_ASCII { + if 'a' as i32 <= r && r <= 'z' as i32 { + r -= ('a' as i32) - ('A' as i32); + } + char::from_u32(r as u32) + } else { + char::from_u32(to_case(TITLE_CASE, r) as u32) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + static CASE_TEST: &[(usize, u32, u32)] = &[ + // ASCII (special-cased so test carefully) + (UPPER_CASE, '\n' as u32, '\n' as u32), + (UPPER_CASE, 'a' as u32, 'A' as u32), + (UPPER_CASE, 'A' as u32, 'A' as u32), + (UPPER_CASE, '7' as u32, '7' as u32), + (LOWER_CASE, '\n' as u32, '\n' as u32), + (LOWER_CASE, 'a' as u32, 'a' as u32), + (LOWER_CASE, 'A' as u32, 'a' as u32), + (LOWER_CASE, '7' as u32, '7' as u32), + (TITLE_CASE, '\n' 
as u32, '\n' as u32), + (TITLE_CASE, 'a' as u32, 'A' as u32), + (TITLE_CASE, 'A' as u32, 'A' as u32), + (TITLE_CASE, '7' as u32, '7' as u32), + // Latin-1: easy to read the tests! + (UPPER_CASE, 0x80, 0x80), + (UPPER_CASE, 'Å' as u32, 'Å' as u32), + (UPPER_CASE, 'å' as u32, 'Å' as u32), + (LOWER_CASE, 0x80, 0x80), + (LOWER_CASE, 'Å' as u32, 'å' as u32), + (LOWER_CASE, 'å' as u32, 'å' as u32), + (TITLE_CASE, 0x80, 0x80), + (TITLE_CASE, 'Å' as u32, 'Å' as u32), + (TITLE_CASE, 'å' as u32, 'Å' as u32), + // 0131;LATIN SMALL LETTER DOTLESS I;Ll;0;L;;;;;N;;;0049;;0049 + (UPPER_CASE, 0x0130, 'İ' as u32), + (LOWER_CASE, 0x0130, 'i' as u32), + (UPPER_CASE, 0x0131, 'I' as u32), + (LOWER_CASE, 0x0131, 0x0131), + (TITLE_CASE, 0x0131, 'I' as u32), + // 0133;LATIN SMALL LIGATURE IJ;Ll;0;L; 0069 006A;;;;N;LATIN SMALL LETTER I + // J;;0132;;0132 + (UPPER_CASE, 0x0133, 0x0132), + (LOWER_CASE, 0x0133, 0x0133), + (TITLE_CASE, 0x0133, 0x0132), + // 212A;KELVIN SIGN;Lu;0;L;004B;;;;N;DEGREES KELVIN;;;006B; + (UPPER_CASE, 0x212A, 0x212A), + (LOWER_CASE, 0x212A, 'k' as u32), + (TITLE_CASE, 0x212A, 0x212A), + // From an UpperLower sequence + // A640;CYRILLIC CAPITAL LETTER ZEMLYA;Lu;0;L;;;;;N;;;;A641; + (UPPER_CASE, 0xA640, 0xA640), + (LOWER_CASE, 0xA640, 0xA641), + (TITLE_CASE, 0xA640, 0xA640), + // A641;CYRILLIC SMALL LETTER ZEMLYA;Ll;0;L;;;;;N;;;A640;;A640 + (UPPER_CASE, 0xA641, 0xA640), + (LOWER_CASE, 0xA641, 0xA641), + (TITLE_CASE, 0xA641, 0xA640), + // A64E;CYRILLIC CAPITAL LETTER NEUTRAL YER;Lu;0;L;;;;;N;;;;A64F; + (UPPER_CASE, 0xA64E, 0xA64E), + (LOWER_CASE, 0xA64E, 0xA64F), + (TITLE_CASE, 0xA64E, 0xA64E), + // A65F;CYRILLIC SMALL LETTER YN;Ll;0;L;;;;;N;;;A65E;;A65E + (UPPER_CASE, 0xA65F, 0xA65E), + (LOWER_CASE, 0xA65F, 0xA65F), + (TITLE_CASE, 0xA65F, 0xA65E), + // From another UpperLower sequence + // 0139;LATIN CAPITAL LETTER L WITH ACUTE;Lu;0;L;004C 0301;;;;N;LATIN CAPITAL LETTER L + // ACUTE;;;013A; + (UPPER_CASE, 0x0139, 0x0139), + (LOWER_CASE, 0x0139, 0x013A), + (TITLE_CASE, 
0x0139, 0x0139), + // 013F;LATIN CAPITAL LETTER L WITH MIDDLE DOT;Lu;0;L; 004C 00B7;;;;N;;;;0140; + (UPPER_CASE, 0x013f, 0x013f), + (LOWER_CASE, 0x013f, 0x0140), + (TITLE_CASE, 0x013f, 0x013f), + // 0148;LATIN SMALL LETTER N WITH CARON;Ll;0;L;006E 030C;;;;N;LATIN SMALL LETTER N + // HACEK;;0147;;0147 + (UPPER_CASE, 0x0148, 0x0147), + (LOWER_CASE, 0x0148, 0x0148), + (TITLE_CASE, 0x0148, 0x0147), + // Lowercase lower than uppercase. + // AB78;CHEROKEE SMALL LETTER GE;Ll;0;L;;;;;N;;;13A8;;13A8 + (UPPER_CASE, 0xab78, 0x13a8), + (LOWER_CASE, 0xab78, 0xab78), + (TITLE_CASE, 0xab78, 0x13a8), + (UPPER_CASE, 0x13a8, 0x13a8), + (LOWER_CASE, 0x13a8, 0xab78), + (TITLE_CASE, 0x13a8, 0x13a8), + // Last block in the 5.1.0 table + // 10400;DESERET CAPITAL LETTER LONG I;Lu;0;L;;;;;N;;;;10428; + (UPPER_CASE, 0x10400, 0x10400), + (LOWER_CASE, 0x10400, 0x10428), + (TITLE_CASE, 0x10400, 0x10400), + // 10427;DESERET CAPITAL LETTER EW;Lu;0;L;;;;;N;;;;1044F; + (UPPER_CASE, 0x10427, 0x10427), + (LOWER_CASE, 0x10427, 0x1044F), + (TITLE_CASE, 0x10427, 0x10427), + // 10428;DESERET SMALL LETTER LONG I;Ll;0;L;;;;;N;;;10400;;10400 + (UPPER_CASE, 0x10428, 0x10400), + (LOWER_CASE, 0x10428, 0x10428), + (TITLE_CASE, 0x10428, 0x10400), + // 1044F;DESERET SMALL LETTER EW;Ll;0;L;;;;;N;;;10427;;10427 + (UPPER_CASE, 0x1044F, 0x10427), + (LOWER_CASE, 0x1044F, 0x1044F), + (TITLE_CASE, 0x1044F, 0x10427), + // First one not in the 5.1.0 table + // 10450;SHAVIAN LETTER PEEP;Lo;0;L;;;;;N;;;;; + (UPPER_CASE, 0x10450, 0x10450), + (LOWER_CASE, 0x10450, 0x10450), + (TITLE_CASE, 0x10450, 0x10450), + // Non-letters with case. 
+ (LOWER_CASE, 0x2161, 0x2171), + (UPPER_CASE, 0x0345, 0x0399), + ]; + + #[test] + fn test_case() { + for &(case, input, output) in CASE_TEST { + if case == UPPER_CASE { + assert_eq!( + unicode_to_upper(char::from_u32(input).unwrap()).unwrap() as u32, + output + ); + } else if case == LOWER_CASE { + assert_eq!( + unicode_to_lower(char::from_u32(input).unwrap()).unwrap() as u32, + output + ); + } else { + assert_eq!( + unicode_to_title(char::from_u32(input).unwrap()).unwrap() as u32, + output + ); + } + } + } +} diff --git a/components/tidb_query_datatype/src/codec/collation/mod.rs b/components/tidb_query_datatype/src/codec/collation/mod.rs index 22127e62f49..93cf0c8ca55 100644 --- a/components/tidb_query_datatype/src/codec/collation/mod.rs +++ b/components/tidb_query_datatype/src/codec/collation/mod.rs @@ -141,13 +141,13 @@ pub trait Encoding { #[inline] fn lower(s: &str, writer: BytesWriter) -> BytesGuard { - let res = s.chars().flat_map(char::to_lowercase); + let res = s.chars().flat_map(|ch| encoding::unicode_to_lower(ch)); writer.write_from_char_iter(res) } #[inline] fn upper(s: &str, writer: BytesWriter) -> BytesGuard { - let res = s.chars().flat_map(char::to_uppercase); + let res = s.chars().flat_map(|ch| encoding::unicode_to_upper(ch)); writer.write_from_char_iter(res) } } diff --git a/components/tidb_query_datatype/src/codec/mysql/decimal.rs b/components/tidb_query_datatype/src/codec/mysql/decimal.rs index 143ec6c7760..bc18d7192f9 100644 --- a/components/tidb_query_datatype/src/codec/mysql/decimal.rs +++ b/components/tidb_query_datatype/src/codec/mysql/decimal.rs @@ -590,17 +590,24 @@ fn do_div_mod_impl( rhs: &Decimal, mut frac_incr: u8, do_mod: bool, + result_frac_cnt: Option, ) -> Option> { let r_frac_cnt = word_cnt!(rhs.frac_cnt) * DIGITS_PER_WORD; let (r_idx, r_prec) = rhs.remove_leading_zeroes(rhs.int_cnt + r_frac_cnt); if r_prec == 0 { + // short-circuit everything: rhs == 0 return None; } let l_frac_cnt = word_cnt!(lhs.frac_cnt) * DIGITS_PER_WORD; 
let (l_idx, l_prec) = lhs.remove_leading_zeroes(lhs.int_cnt + l_frac_cnt); if l_prec == 0 { - return Some(Res::Ok(Decimal::zero())); + // short-circuit everything: lhs == 0 + if let Some(result_frac) = result_frac_cnt { + return Some(Res::Ok(Decimal::new(0, result_frac, false))); + } else { + return Some(Res::Ok(Decimal::zero())); + } } frac_incr = frac_incr.saturating_sub(l_frac_cnt - lhs.frac_cnt + r_frac_cnt - rhs.frac_cnt); @@ -784,8 +791,9 @@ fn do_div_mod_impl( Some(res) } +#[allow(dead_code)] fn do_div_mod(lhs: &Decimal, rhs: &Decimal, frac_incr: u8, do_mod: bool) -> Option> { - do_div_mod_impl(lhs, rhs, frac_incr, do_mod) + do_div_mod_impl(lhs, rhs, frac_incr, do_mod, None) } /// `do_mul` multiplies two decimals. @@ -1704,7 +1712,7 @@ impl Decimal { fn div(&self, rhs: &Decimal, frac_incr: u8) -> Option> { let result_frac_cnt = cmp::min(self.result_frac_cnt.saturating_add(frac_incr), MAX_FRACTION); - let mut res = do_div_mod(self, rhs, frac_incr, false); + let mut res = do_div_mod_impl(self, rhs, frac_incr, false, Some(result_frac_cnt)); if let Some(ref mut dec) = res { dec.result_frac_cnt = result_frac_cnt; } @@ -2362,7 +2370,7 @@ impl<'a, 'b> Rem<&'a Decimal> for &'b Decimal { type Output = Option>; fn rem(self, rhs: &'a Decimal) -> Self::Output { let result_frac_cnt = cmp::max(self.result_frac_cnt, rhs.result_frac_cnt); - let mut res = do_div_mod_impl(self, rhs, 0, true); + let mut res = do_div_mod_impl(self, rhs, 0, true, Some(result_frac_cnt)); if let Some(ref mut dec) = res { dec.result_frac_cnt = result_frac_cnt; } @@ -3545,17 +3553,28 @@ mod tests { assert_eq!(res, rem_exp.map(|s| s.to_owned())); } - let div_cases = vec![( - "-43791957044243810000000000000000000000000000000000000000000000000000000000000", - "-0.0000000000000000000000000000000000000000000000000012867433602814482", - Res::Overflow( - "34033171179267041433424155279291553259014210153022524070386565694757521640", + let div_cases = vec![ + ( + 
"-43791957044243810000000000000000000000000000000000000000000000000000000000000", + "-0.0000000000000000000000000000000000000000000000000012867433602814482", + Res::Overflow( + "34033171179267041433424155279291553259014210153022524070386565694757521640", + ), ), - )]; - for (lhs_str, rhs_str, rem_exp) in div_cases { + ("0", "0.5", Res::Ok("0.0000")), + ]; + for (lhs_str, rhs_str, div_exp) in div_cases { let lhs: Decimal = lhs_str.parse().unwrap(); let rhs: Decimal = rhs_str.parse().unwrap(); let res = (&lhs / &rhs).unwrap().map(|d| d.to_string()); + assert_eq!(res, div_exp.map(|s| s.to_owned())) + } + + let rem_cases = vec![("0", "0.5", Res::Ok("0.0"))]; + for (lhs_str, rhs_str, rem_exp) in rem_cases { + let lhs: Decimal = lhs_str.parse().unwrap(); + let rhs: Decimal = rhs_str.parse().unwrap(); + let res = (lhs % rhs).unwrap().map(|d| d.to_string()); assert_eq!(res, rem_exp.map(|s| s.to_owned())) } } diff --git a/components/tidb_query_datatype/src/def/field_type.rs b/components/tidb_query_datatype/src/def/field_type.rs index 06f4454b36d..8a56ac5ac68 100644 --- a/components/tidb_query_datatype/src/def/field_type.rs +++ b/components/tidb_query_datatype/src/def/field_type.rs @@ -140,7 +140,10 @@ impl Collation { } pub fn is_bin_collation(&self) -> bool { - matches!(self, Collation::Utf8Mb4Bin | Collation::Latin1Bin) + matches!( + self, + Collation::Utf8Mb4Bin | Collation::Latin1Bin | Collation::Utf8Mb40900Bin + ) } } @@ -333,6 +336,10 @@ pub trait FieldTypeAccessor { .map(|col| col.is_bin_collation()) .unwrap_or(false) || self.is_varchar_like()) + && self + .collation() + .map(|col| col != Collation::Utf8Mb40900Bin) + .unwrap_or(false) } } @@ -455,6 +462,7 @@ mod tests { use std::i32; use super::*; + use crate::builder::FieldTypeBuilder; fn field_types() -> Vec { vec![ @@ -583,4 +591,31 @@ mod tests { } } } + + #[test] + fn test_need_restored_data() { + let cases = vec![ + (FieldTypeTp::String, Collation::Binary, false), + (FieldTypeTp::VarString, Collation::Binary, 
false), + (FieldTypeTp::String, Collation::Utf8Mb4Bin, false), + (FieldTypeTp::VarString, Collation::Utf8Mb4Bin, true), + (FieldTypeTp::String, Collation::Utf8Mb4GeneralCi, true), + (FieldTypeTp::VarString, Collation::Utf8Mb4GeneralCi, true), + (FieldTypeTp::String, Collation::Utf8Mb4UnicodeCi, true), + (FieldTypeTp::VarString, Collation::Utf8Mb4UnicodeCi, true), + (FieldTypeTp::String, Collation::Utf8Mb40900AiCi, true), + (FieldTypeTp::VarString, Collation::Utf8Mb40900AiCi, true), + (FieldTypeTp::String, Collation::Utf8Mb40900Bin, false), + (FieldTypeTp::VarString, Collation::Utf8Mb40900Bin, false), + (FieldTypeTp::String, Collation::GbkBin, true), + (FieldTypeTp::VarString, Collation::GbkBin, true), + (FieldTypeTp::String, Collation::GbkChineseCi, true), + (FieldTypeTp::VarString, Collation::GbkChineseCi, true), + ]; + + for (tp, collation, result) in cases { + let ft = FieldTypeBuilder::new().tp(tp).collation(collation).build(); + assert_eq!(ft.need_restored_data(), result) + } + } } diff --git a/components/tidb_query_expr/Cargo.toml b/components/tidb_query_expr/Cargo.toml index 8a178401905..60bbde91c31 100644 --- a/components/tidb_query_expr/Cargo.toml +++ b/components/tidb_query_expr/Cargo.toml @@ -10,6 +10,7 @@ base64 = "0.13" bstr = "0.2.8" byteorder = "1.2" codec = { workspace = true } +crypto = { workspace = true } file_system = { workspace = true } flate2 = { version = "=1.0.11", default-features = false, features = ["zlib"] } hex = "0.4" @@ -17,9 +18,8 @@ log_wrappers = { workspace = true } match-template = "0.0.1" num = { version = "0.3", default-features = false } num-traits = "0.2" -openssl = { version = "0.10" } +openssl = { workspace = true } protobuf = "2" -rand = "0.8.3" regex = "1.1" safemem = { version = "0.3", default-features = false } serde = "1.0" diff --git a/components/tidb_query_expr/src/impl_arithmetic.rs b/components/tidb_query_expr/src/impl_arithmetic.rs index 2f48fec4693..5960e69c2cd 100644 --- 
a/components/tidb_query_expr/src/impl_arithmetic.rs +++ b/components/tidb_query_expr/src/impl_arithmetic.rs @@ -4,7 +4,7 @@ use num_traits::identities::Zero; use tidb_query_codegen::rpn_fn; use tidb_query_common::Result; use tidb_query_datatype::{ - codec::{self, data_type::*, div_i64, div_i64_with_u64, div_u64_with_i64, Error}, + codec::{self, data_type::*, div_i64, div_i64_with_u64, div_u64_with_i64, mysql::Res, Error}, expr::EvalContext, }; @@ -452,21 +452,39 @@ fn int_divide_decimal(ctx: &mut EvalContext, lhs: &Decimal, rhs: &Decimal) -> Re let result = arithmetic_with_ctx::(ctx, lhs, rhs)?; if let Some(result) = result { let result = result.as_i64(); - Ok(if result.is_truncated() { - Some(result.unwrap()) - } else { - result - .into_result_with_overflow_err( - ctx, - Error::overflow("BIGINT", format!("({} / {})", lhs, rhs)), - ) - .map(Some)? - }) + match result { + Res::Ok(i) => Ok(Some(i)), + Res::Truncated(i) => Ok(Some(i)), + _ => Err(Error::overflow("BIGINT", format!("({} / {})", lhs, rhs)).into()), + } } else { Ok(None) } } +#[rpn_fn(capture = [ctx])] +#[inline] +fn int_divide_decimal_unsigned( + ctx: &mut EvalContext, + lhs: &Decimal, + rhs: &Decimal, +) -> Result> { + let result = arithmetic_with_ctx::(ctx, lhs, rhs)?; + if let Some(result) = result { + let unsigned_result = result.as_u64(); + if unsigned_result.is_overflow() { + let signed_result = result.as_i64(); + return if signed_result.unwrap() == 0 && signed_result.is_truncated() { + Ok(Some(0)) + } else { + Err(Error::overflow("BIGINT UNSIGNED", format!("({} / {})", lhs, rhs)).into()) + }; + } + return Ok(Some(unsigned_result.unwrap() as i64)); + } + Ok(None) +} + pub struct DecimalDivide; impl ArithmeticOpWithCtx for DecimalDivide { @@ -962,6 +980,7 @@ mod tests { // divide by zero (Some("0.0"), Some("0.0"), None), (None, None, None), + (Some("0"), Some("45584"), Some(0)), ]; for (lhs, rhs, expected) in test_cases { @@ -995,6 +1014,38 @@ mod tests { } } + #[test] + fn 
test_int_divide_decimal_unsigned_overflow() { + let lft = FieldTypeBuilder::new() + .tp(FieldTypeTp::NewDecimal) + .flag(FieldTypeFlag::UNSIGNED) + .build(); + let rft = FieldTypeBuilder::new() + .tp(FieldTypeTp::NewDecimal) + .flag(FieldTypeFlag::UNSIGNED) + .build(); + let output: Option = RpnFnScalarEvaluator::new() + .push_param_with_field_type(Decimal::from(1), lft) + .push_param_with_field_type(Decimal::from_f64(-2_f64).unwrap(), rft) + .evaluate(ScalarFuncSig::IntDivideDecimal) + .unwrap(); + assert_eq!(output, Some(0)); + + let lft = FieldTypeBuilder::new() + .tp(FieldTypeTp::NewDecimal) + .flag(FieldTypeFlag::UNSIGNED) + .build(); + let rft = FieldTypeBuilder::new() + .tp(FieldTypeTp::NewDecimal) + .flag(FieldTypeFlag::UNSIGNED) + .build(); + let output: Result> = RpnFnScalarEvaluator::new() + .push_param_with_field_type(Decimal::from(1), lft) + .push_param_with_field_type(Decimal::from_f64(-1_f64).unwrap(), rft) + .evaluate(ScalarFuncSig::IntDivideDecimal); + assert!(output.is_err(), "should be error"); + } + #[test] fn test_real_multiply() { let should_pass = vec![(1.01001, -0.01, Real::new(-0.0101001).ok())]; diff --git a/components/tidb_query_expr/src/impl_cast.rs b/components/tidb_query_expr/src/impl_cast.rs index 76e90f79c5b..16e33e71d13 100644 --- a/components/tidb_query_expr/src/impl_cast.rs +++ b/components/tidb_query_expr/src/impl_cast.rs @@ -1038,10 +1038,10 @@ fn cast_bytes_like_as_duration( val: &[u8], overflow_as_null: bool, ) -> Result> { - let val = std::str::from_utf8(val).map_err(Error::Encoding)?; + let val = String::from_utf8_lossy(val); let result = Duration::parse_consider_overflow( ctx, - val, + &val, extra.ret_field_type.get_decimal() as i8, overflow_as_null, ); @@ -6450,6 +6450,7 @@ mod tests { b"-17:51:04.78", b"17:51:04.78", b"-17:51:04.78", + b"\x92\x6b", ]; test_as_duration_helper( diff --git a/components/tidb_query_expr/src/impl_encryption.rs b/components/tidb_query_expr/src/impl_encryption.rs index 9c26826c03b..03686d3755e 
100644 --- a/components/tidb_query_expr/src/impl_encryption.rs +++ b/components/tidb_query_expr/src/impl_encryption.rs @@ -3,13 +3,14 @@ use std::io::Read; use byteorder::{ByteOrder, LittleEndian}; +use crypto::rand; use flate2::{ read::{ZlibDecoder, ZlibEncoder}, Compression, }; use openssl::hash::{self, MessageDigest}; use tidb_query_codegen::rpn_fn; -use tidb_query_common::Result; +use tidb_query_common::{error::EvaluateError, Result}; use tidb_query_datatype::{ codec::data_type::*, expr::{Error, EvalContext}, @@ -190,9 +191,12 @@ pub fn random_bytes(_ctx: &mut EvalContext, arg: Option<&Int>) -> Result MAX_RAND_BYTES_LENGTH { return Err(Error::overflow("length", "random_bytes").into()); } - Ok(Some( - (0..*arg as usize).map(|_| rand::random::()).collect(), - )) + let len = *arg as usize; + let mut rand_bytes = vec![0; len]; + rand::rand_bytes(&mut rand_bytes).map_err(|_| { + EvaluateError::Other("SSL library can't generate random bytes".to_owned()) + })?; + Ok(Some(rand_bytes)) } _ => Ok(None), } diff --git a/components/tidb_query_expr/src/impl_string.rs b/components/tidb_query_expr/src/impl_string.rs index f3b9b03c287..c86e8d22ccb 100644 --- a/components/tidb_query_expr/src/impl_string.rs +++ b/components/tidb_query_expr/src/impl_string.rs @@ -635,15 +635,22 @@ fn field(args: &[Option<&T>]) -> Result #[rpn_fn(nullable, varg, min_args = 1)] #[inline] -fn field_bytes(args: &[Option]) -> Result> { +fn field_bytes(args: &[Option]) -> Result> { Ok(Some(match args[0] { // As per the MySQL doc, if the first argument is NULL, this function always returns 0. 
None => 0, - Some(val) => args - .iter() - .skip(1) - .position(|&i| i == Some(val)) - .map_or(0, |pos| (pos + 1) as i64), + Some(val) => { + for (pos, arg) in args.iter().enumerate().skip(1) { + if arg.is_none() { + continue; + } + match C::sort_compare(val, arg.unwrap()) { + Ok(Ordering::Equal) => return Ok(Some(pos as i64)), + _ => continue, + } + } + 0 + } })) } @@ -2853,6 +2860,10 @@ mod tests { Some("قاعدة البيانات".as_bytes().to_vec()), Some("قاعدة البيانات".as_bytes().to_vec()), ), + ( + Some("ßßåı".as_bytes().to_vec()), + Some("ßßÅI".as_bytes().to_vec()), + ), (None, None), ]; @@ -2913,64 +2924,45 @@ mod tests { #[test] fn test_gbk_lower_upper() { // Test GBK string case - let sig = vec![ScalarFuncSig::Lower, ScalarFuncSig::Upper]; - for s in sig { - let output = RpnFnScalarEvaluator::new() - .push_param_with_field_type( - Some("àáèéêìíòóùúüāēěīńňōūǎǐǒǔǖǘǚǜⅪⅫ".as_bytes().to_vec()).clone(), - FieldTypeBuilder::new() - .tp(FieldTypeTp::VarString) - .charset(CHARSET_GBK) - .build(), - ) - .evaluate(s) - .unwrap(); - assert_eq!( - output, - Some("àáèéêìíòóùúüāēěīńňōūǎǐǒǔǖǘǚǜⅪⅫ".as_bytes().to_vec()) - ); - } - } - - #[test] - fn test_lower() { - // Test non-binary string case let cases = vec![ - (Some(b"HELLO".to_vec()), Some(b"hello".to_vec())), - (Some(b"123".to_vec()), Some(b"123".to_vec())), ( - Some("CAFÉ".as_bytes().to_vec()), - Some("café".as_bytes().to_vec()), + ScalarFuncSig::LowerUtf8, + "àáèéêìíòóùúüāēěīńňōūǎǐǒǔǖǘǚǜⅪⅫ".as_bytes().to_vec(), + "àáèéêìíòóùúüāēěīńňōūǎǐǒǔǖǘǚǜⅪⅫ".as_bytes().to_vec(), ), ( - Some("数据库".as_bytes().to_vec()), - Some("数据库".as_bytes().to_vec()), + ScalarFuncSig::UpperUtf8, + "àáèéêìíòóùúüāēěīńňōūǎǐǒǔǖǘǚǜⅪⅫ".as_bytes().to_vec(), + "àáèéêìíòóùúüāēěīńňōūǎǐǒǔǖǘǚǜⅪⅫ".as_bytes().to_vec(), ), ( - Some("НОЧЬ НА ОКРАИНЕ МОСКВЫ".as_bytes().to_vec()), - Some("ночь на окраине москвы".as_bytes().to_vec()), + ScalarFuncSig::LowerUtf8, + "İİIIÅI".as_bytes().to_vec(), + "iiiiåi".as_bytes().to_vec(), ), ( - Some("قاعدة 
البيانات".as_bytes().to_vec()), - Some("قاعدة البيانات".as_bytes().to_vec()), + ScalarFuncSig::UpperUtf8, + "ßßåı".as_bytes().to_vec(), + "ßßÅI".as_bytes().to_vec(), ), - (None, None), ]; - - for (arg, exp) in cases { - let output = RpnFnScalarEvaluator::new() + for (s, input, output) in cases { + let result = RpnFnScalarEvaluator::new() .push_param_with_field_type( - arg.clone(), + Some(input).clone(), FieldTypeBuilder::new() .tp(FieldTypeTp::VarString) - .charset(CHARSET_UTF8MB4) + .charset(CHARSET_GBK) .build(), ) - .evaluate(ScalarFuncSig::Lower) + .evaluate(s) .unwrap(); - assert_eq!(output, exp); + assert_eq!(result, Some(output),); } + } + #[test] + fn test_lower() { // Test binary string case let cases = vec![ (Some(b"hello".to_vec()), Some(b"hello".to_vec())), @@ -2990,6 +2982,10 @@ mod tests { Some("قاعدة البيانات".as_bytes().to_vec()), Some("قاعدة البيانات".as_bytes().to_vec()), ), + ( + Some("İİIIÅI".as_bytes().to_vec()), + Some("İİIIÅI".as_bytes().to_vec()), + ), (None, None), ]; @@ -3036,6 +3032,10 @@ mod tests { Some("قاعدة البيانات".as_bytes().to_vec()), Some("قاعدة البيانات".as_bytes().to_vec()), ), + ( + Some("İİIIÅI".as_bytes().to_vec()), + Some("iiiiåi".as_bytes().to_vec()), + ), (None, None), ]; @@ -3214,6 +3214,7 @@ mod tests { Some(b"baz".to_vec()), ], Some(1), + Collation::Utf8Mb4Bin, ), ( vec![ @@ -3223,6 +3224,7 @@ mod tests { Some(b"hello".to_vec()), ], Some(0), + Collation::Utf8Mb4Bin, ), ( vec![ @@ -3232,6 +3234,7 @@ mod tests { Some(b"hello".to_vec()), ], Some(3), + Collation::Utf8Mb4Bin, ), ( vec![ @@ -3244,6 +3247,7 @@ mod tests { Some(b"Hello".to_vec()), ], Some(6), + Collation::Utf8Mb4Bin, ), ( vec![ @@ -3252,14 +3256,37 @@ mod tests { Some(b"Hello World!".to_vec()), ], Some(0), + Collation::Utf8Mb4Bin, + ), + ( + vec![None, None, Some(b"Hello World!".to_vec())], + Some(0), + Collation::Utf8Mb4Bin, + ), + ( + vec![Some(b"Hello World!".to_vec())], + Some(0), + Collation::Utf8Mb4Bin, + ), + ( + vec![ + Some(b"a".to_vec()), + 
Some(b"A".to_vec()), + Some(b"a".to_vec()), + ], + Some(1), + Collation::Utf8Mb4GeneralCi, ), - (vec![None, None, Some(b"Hello World!".to_vec())], Some(0)), - (vec![Some(b"Hello World!".to_vec())], Some(0)), ]; - for (args, expect_output) in test_cases { + for (args, expect_output, collation) in test_cases { let output = RpnFnScalarEvaluator::new() .push_params(args) + .return_field_type( + FieldTypeBuilder::new() + .tp(FieldTypeTp::Long) + .collation(collation), + ) .evaluate(ScalarFuncSig::FieldString) .unwrap(); assert_eq!(output, expect_output); diff --git a/components/tidb_query_expr/src/lib.rs b/components/tidb_query_expr/src/lib.rs index c2ef6722148..50e10681587 100644 --- a/components/tidb_query_expr/src/lib.rs +++ b/components/tidb_query_expr/src/lib.rs @@ -284,6 +284,13 @@ fn divide_mapper(lhs_is_unsigned: bool, rhs_is_unsigned: bool) -> RpnFnMeta { } } +fn divide_decimal_mapper(lhs_is_unsigned: bool, rhs_is_unsigned: bool) -> RpnFnMeta { + match (lhs_is_unsigned, rhs_is_unsigned) { + (false, false) => int_divide_decimal_fn_meta(), + _ => int_divide_decimal_unsigned_fn_meta(), + } +} + fn map_rhs_int_sig(value: ScalarFuncSig, children: &[Expr], mapper: F) -> Result where F: Fn(bool) -> RpnFnMeta, @@ -357,27 +364,7 @@ pub fn map_unary_minus_int_func(value: ScalarFuncSig, children: &[Expr]) -> Resu } } -fn map_lower_sig(value: ScalarFuncSig, children: &[Expr]) -> Result { - if children.len() != 1 { - return Err(other_err!( - "ScalarFunction {:?} (params = {}) is not supported in batch mode", - value, - children.len() - )); - } - if children[0].get_field_type().is_binary_string_like() { - Ok(lower_fn_meta()) - } else { - let ret_field_type = children[0].get_field_type(); - Ok(match_template_charset! { - TT, match Charset::from_name(ret_field_type.get_charset()).map_err(tidb_query_datatype::codec::Error::from)? 
{ - Charset::TT => lower_utf8_fn_meta::(), - } - }) - } -} - -fn map_upper_sig(value: ScalarFuncSig, children: &[Expr]) -> Result { +fn map_upper_utf8_sig(value: ScalarFuncSig, children: &[Expr]) -> Result { if children.len() != 1 { return Err(other_err!( "ScalarFunction {:?} (params = {}) is not supported in batch mode", @@ -409,6 +396,14 @@ fn map_lower_utf8_sig(value: ScalarFuncSig, children: &[Expr]) -> Result Result { + Ok(match_template_collator! { + TT, match ret_field_type.as_accessor().collation().map_err(tidb_query_datatype::codec::Error::from)? { + Collation::TT => field_bytes_fn_meta::() + } + }) +} + #[rustfmt::skip] fn map_expr_node_to_rpn_func(expr: &Expr) -> Result { let value = expr.get_sig(); @@ -433,7 +428,7 @@ fn map_expr_node_to_rpn_func(expr: &Expr) -> Result { ScalarFuncSig::DivideDecimal => arithmetic_with_ctx_fn_meta::(), ScalarFuncSig::DivideReal => arithmetic_with_ctx_fn_meta::(), ScalarFuncSig::IntDivideInt => map_int_sig(value, children, divide_mapper)?, - ScalarFuncSig::IntDivideDecimal => int_divide_decimal_fn_meta(), + ScalarFuncSig::IntDivideDecimal => map_int_sig(value, children, divide_decimal_mapper)?, ScalarFuncSig::ModReal => arithmetic_fn_meta::(), ScalarFuncSig::ModDecimal => arithmetic_with_ctx_fn_meta::(), ScalarFuncSig::ModInt => map_int_sig(value, children, mod_mapper)?, @@ -779,15 +774,15 @@ fn map_expr_node_to_rpn_func(expr: &Expr) -> Result { ScalarFuncSig::Insert => insert_fn_meta(), ScalarFuncSig::InsertUtf8 => insert_utf8_fn_meta(), ScalarFuncSig::RightUtf8 => right_utf8_fn_meta(), - ScalarFuncSig::UpperUtf8 => map_upper_sig(value, children)?, + ScalarFuncSig::UpperUtf8 => map_upper_utf8_sig(value, children)?, ScalarFuncSig::Upper => upper_fn_meta(), - ScalarFuncSig::Lower => map_lower_sig(value, children)?, ScalarFuncSig::LowerUtf8 => map_lower_utf8_sig(value, children)?, + ScalarFuncSig::Lower => lower_fn_meta(), ScalarFuncSig::Locate2Args => locate_2_args_fn_meta(), ScalarFuncSig::Locate3Args => 
locate_3_args_fn_meta(), ScalarFuncSig::FieldInt => field_fn_meta::(), ScalarFuncSig::FieldReal => field_fn_meta::(), - ScalarFuncSig::FieldString => field_bytes_fn_meta(), + ScalarFuncSig::FieldString => map_field_string_sig(ft)?, ScalarFuncSig::Elt => elt_fn_meta(), ScalarFuncSig::MakeSet => make_set_fn_meta(), ScalarFuncSig::Space => space_fn_meta(), diff --git a/components/tikv_alloc/src/default.rs b/components/tikv_alloc/src/default.rs index 2674331c3cd..5133d76e172 100644 --- a/components/tikv_alloc/src/default.rs +++ b/components/tikv_alloc/src/default.rs @@ -8,6 +8,7 @@ use crate::AllocStats; pub fn dump_stats() -> String { String::new() } + pub fn dump_prof(_path: &str) -> ProfResult<()> { Err(ProfError::MemProfilingNotEnabled) } @@ -24,6 +25,14 @@ pub fn deactivate_prof() -> ProfResult<()> { Err(ProfError::MemProfilingNotEnabled) } +pub fn set_prof_sample(_rate: u64) -> ProfResult<()> { + Err(ProfError::MemProfilingNotEnabled) +} + +pub fn is_profiling_active() -> bool { + false +} + /// # Safety /// /// It is safe. The unsafe marker is just for matching the function signature. diff --git a/components/tikv_alloc/src/jemalloc.rs b/components/tikv_alloc/src/jemalloc.rs index 876afa9fcd5..245f6280b71 100644 --- a/components/tikv_alloc/src/jemalloc.rs +++ b/components/tikv_alloc/src/jemalloc.rs @@ -133,7 +133,7 @@ pub fn remove_thread_memory_accessor() { use std::thread::ThreadId; -pub use self::profiling::{activate_prof, deactivate_prof, dump_prof}; +pub use self::profiling::*; pub fn dump_stats() -> String { let mut buf = Vec::with_capacity(1024); @@ -311,6 +311,21 @@ mod profiling { // C string should end with a '\0'. 
const PROF_ACTIVE: &[u8] = b"prof.active\0"; const PROF_DUMP: &[u8] = b"prof.dump\0"; + const PROF_RESET: &[u8] = b"prof.reset\0"; + const OPT_PROF: &[u8] = b"opt.prof\0"; + + pub fn set_prof_sample(rate: u64) -> ProfResult<()> { + let rate = (rate as f64).log2().ceil() as usize; + unsafe { + if let Err(e) = tikv_jemalloc_ctl::raw::write(PROF_RESET, rate) { + return Err(ProfError::JemallocError(format!( + "failed to set prof sample: {}", + e + ))); + } + } + Ok(()) + } pub fn activate_prof() -> ProfResult<()> { unsafe { @@ -351,22 +366,44 @@ mod profiling { Ok(()) } + pub fn is_profiling_active() -> bool { + match unsafe { tikv_jemalloc_ctl::raw::read(PROF_ACTIVE) } { + Err(e) => { + panic!("is_profiling_active: {:?}", e); + } + Ok(prof) => prof, + } + } + + pub fn is_profiling_enabled() -> bool { + match unsafe { tikv_jemalloc_ctl::raw::read(OPT_PROF) } { + Err(e) => { + // Shouldn't be possible since mem-profiling is set + panic!("is_profiling_enabled: {:?}", e); + } + Ok(prof) => prof, + } + } + #[cfg(test)] mod tests { use std::fs; use tempfile::Builder; - const OPT_PROF: &[u8] = b"opt.prof\0"; + use super::*; - fn is_profiling_on() -> bool { - match unsafe { tikv_jemalloc_ctl::raw::read(OPT_PROF) } { - Err(e) => { - // Shouldn't be possible since mem-profiling is set - panic!("is_profiling_on: {:?}", e); - } - Ok(prof) => prof, - } + #[test] + #[ignore = "#ifdef MALLOC_CONF"] + fn test_profiling_active() { + // Make sure somebody has turned on profiling + assert!(is_profiling_enabled(), "set MALLOC_CONF=prof:true"); + activate_prof().unwrap(); + assert!(is_profiling_active()); + deactivate_prof().unwrap(); + assert!(!is_profiling_active()); + + super::set_prof_sample(512 * 1024 * 1024).unwrap(); } // Only trigger this test with jemallocs `opt.prof` set to @@ -382,7 +419,7 @@ mod profiling { #[ignore = "#ifdef MALLOC_CONF"] fn test_profiling_memory_ifdef_malloc_conf() { // Make sure somebody has turned on profiling - assert!(is_profiling_on(), "set 
MALLOC_CONF=prof:true"); + assert!(is_profiling_enabled(), "set MALLOC_CONF=prof:true"); let dir = Builder::new() .prefix("test_profiling_memory") @@ -391,11 +428,11 @@ mod profiling { let os_path = dir.path().to_path_buf().join("test1.dump").into_os_string(); let path = os_path.into_string().unwrap(); - super::dump_prof(&path).unwrap(); + dump_prof(&path).unwrap(); let os_path = dir.path().to_path_buf().join("test2.dump").into_os_string(); let path = os_path.into_string().unwrap(); - super::dump_prof(&path).unwrap(); + dump_prof(&path).unwrap(); let files = fs::read_dir(dir.path()).unwrap().count(); assert_eq!(files, 2); @@ -431,4 +468,10 @@ mod profiling { pub fn deactivate_prof() -> ProfResult<()> { Err(ProfError::MemProfilingNotEnabled) } + pub fn set_prof_sample(_rate: u64) -> ProfResult<()> { + Err(ProfError::MemProfilingNotEnabled) + } + pub fn is_profiling_active() -> bool { + false + } } diff --git a/components/tikv_kv/Cargo.toml b/components/tikv_kv/Cargo.toml index 7d517de2cba..6df829ad925 100644 --- a/components/tikv_kv/Cargo.toml +++ b/components/tikv_kv/Cargo.toml @@ -27,6 +27,7 @@ test-engines-panic = [ [dependencies] backtrace = "0.3" collections = { workspace = true } +encryption = { workspace = true } engine_panic = { workspace = true } engine_rocks = { workspace = true } engine_test = { workspace = true } diff --git a/components/tikv_kv/src/lib.rs b/components/tikv_kv/src/lib.rs index 25f58352750..1fe61b78633 100644 --- a/components/tikv_kv/src/lib.rs +++ b/components/tikv_kv/src/lib.rs @@ -553,7 +553,7 @@ pub enum ErrorInner { Request(ErrorHeader), #[error("timeout after {0:?}")] Timeout(Duration), - #[error("an empty requets")] + #[error("an empty request")] EmptyRequest, #[error("key is locked (backoff or cleanup) {0:?}")] KeyIsLocked(kvproto::kvrpcpb::LockInfo), diff --git a/components/tikv_kv/src/raft_extension.rs b/components/tikv_kv/src/raft_extension.rs index 26c9e687ef6..7ab4c1c030d 100644 --- a/components/tikv_kv/src/raft_extension.rs 
+++ b/components/tikv_kv/src/raft_extension.rs @@ -32,6 +32,9 @@ pub trait RaftExtension: Clone + Send { /// Report the target store is unreachable. fn report_store_unreachable(&self, _store_id: u64) {} + /// Report the target store may be tombstone. + fn report_store_maybe_tombstone(&self, _store_id: u64) {} + /// Report the status of snapshot. fn report_snapshot_status(&self, _region_id: u64, _to_peer_id: u64, _status: SnapshotStatus) {} diff --git a/components/tikv_kv/src/rocksdb_engine.rs b/components/tikv_kv/src/rocksdb_engine.rs index 21099974d2d..551b933faeb 100644 --- a/components/tikv_kv/src/rocksdb_engine.rs +++ b/components/tikv_kv/src/rocksdb_engine.rs @@ -64,7 +64,7 @@ impl Runnable for Runner { match t { Task::Write(modifies, cb) => cb(write_modifies(&self.0.kv, modifies)), Task::Snapshot(sender) => { - let _ = sender.send(Arc::new(self.0.kv.snapshot())); + let _ = sender.send(Arc::new(self.0.kv.snapshot(None))); } Task::Pause(dur) => std::thread::sleep(dur), } diff --git a/components/tikv_util/Cargo.toml b/components/tikv_util/Cargo.toml index 6de354fa259..9250dd03cb0 100644 --- a/components/tikv_util/Cargo.toml +++ b/components/tikv_util/Cargo.toml @@ -37,7 +37,7 @@ nix = "0.24" num-traits = "0.2" num_cpus = "1" online_config = { workspace = true } -openssl = "0.10" +openssl = { workspace = true } parking_lot_core = "0.9.1" pin-project = "1.0" prometheus = { version = "0.13", features = ["nightly"] } @@ -52,12 +52,13 @@ slog-async = "2.3" slog-global = { workspace = true } slog-json = "2.3" slog-term = "2.4" +strum = { version = "0.20", features = ["derive"] } sysinfo = "0.26" thiserror = "1.0" tikv_alloc = { workspace = true } time = "0.1" tokio = { version = "1.5", features = ["rt-multi-thread"] } -tokio-executor = "0.1" +tokio-executor = { workspace = true } tokio-timer = { workspace = true } tracker = { workspace = true } url = "2" diff --git a/components/tikv_util/src/codec/stream_event.rs b/components/tikv_util/src/codec/stream_event.rs index 
5b00cad6372..3c1a04f77e3 100644 --- a/components/tikv_util/src/codec/stream_event.rs +++ b/components/tikv_util/src/codec/stream_event.rs @@ -6,6 +6,13 @@ use bytes::{Buf, Bytes}; use crate::{codec::Result, Either}; +// Note: maybe allow them to be different lifetime. +// But not necessary for now, so keep it simple...? +pub struct Rewrite<'a> { + from: &'a [u8], + to: &'a [u8], +} + pub trait Iterator { fn next(&mut self) -> Result<()>; @@ -19,10 +26,12 @@ pub trait Iterator { pub struct EventIterator<'a> { buf: &'a [u8], offset: usize, - key_offset: usize, value_offset: usize, - key_len: usize, value_len: usize, + + key_buf: Vec, + + rewrite_rule: Option>, } impl EventIterator<'_> { @@ -30,10 +39,21 @@ impl EventIterator<'_> { EventIterator { buf, offset: 0, - key_offset: 0, - key_len: 0, + key_buf: vec![], value_offset: 0, value_len: 0, + rewrite_rule: None, + } + } + + pub fn with_rewriting<'a>(buf: &'a [u8], from: &'a [u8], to: &'a [u8]) -> EventIterator<'a> { + EventIterator { + buf, + offset: 0, + key_buf: vec![], + value_offset: 0, + value_len: 0, + rewrite_rule: Some(Rewrite { from, to }), } } @@ -42,14 +62,47 @@ impl EventIterator<'_> { self.offset += 4; result } + + fn consume_key_with_len(&mut self, key_len: usize) { + self.key_buf.clear(); + self.key_buf.reserve(key_len); + self.key_buf + .extend_from_slice(&self.buf[self.offset..self.offset + key_len]); + self.offset += key_len; + } + + fn move_to_next_key_with_rewrite(&mut self) { + let key_len = self.get_size() as usize; + let rewrite = self.rewrite_rule.as_ref().expect("rewrite rule not set"); + if key_len < rewrite.from.len() + || &self.buf[self.offset..self.offset + rewrite.from.len()] != rewrite.from + { + self.consume_key_with_len(key_len); + return; + } + self.key_buf.clear(); + self.key_buf + .reserve(rewrite.to.len() + key_len - rewrite.from.len()); + self.key_buf.extend_from_slice(rewrite.to); + self.key_buf + .extend_from_slice(&self.buf[self.offset + rewrite.from.len()..self.offset + 
key_len]); + self.offset += key_len; + } + + fn fetch_key_buffer_and_move_to_value(&mut self) { + if self.rewrite_rule.is_some() { + self.move_to_next_key_with_rewrite() + } else { + let key_len = self.get_size() as usize; + self.consume_key_with_len(key_len); + } + } } impl Iterator for EventIterator<'_> { fn next(&mut self) -> Result<()> { if self.valid() { - self.key_len = self.get_size() as usize; - self.key_offset = self.offset; - self.offset += self.key_len; + self.fetch_key_buffer_and_move_to_value(); self.value_len = self.get_size() as usize; self.value_offset = self.offset; @@ -63,7 +116,7 @@ impl Iterator for EventIterator<'_> { } fn key(&self) -> &[u8] { - &self.buf[self.key_offset..self.key_offset + self.key_len] + &self.key_buf[..] } fn value(&self) -> &[u8] { @@ -155,4 +208,44 @@ mod tests { } assert_eq!(count, index); } + + #[test] + fn test_rewrite() { + let mut rng = rand::thread_rng(); + let mut event = vec![]; + let mut keys = vec![]; + let mut vals = vec![]; + let count = 20; + + for _i in 0..count { + let should_rewrite = rng.gen::(); + let mut key: Vec = std::iter::once(if should_rewrite { b'k' } else { b'l' }) + .chain((0..100).map(|_| rng.gen_range(0..255))) + .collect(); + let val: Vec = (0..100).map(|_| rng.gen_range(0..255)).collect(); + let e = EventEncoder::encode_event(&key, &val); + for s in e { + event.extend_from_slice(s.as_ref()); + } + if should_rewrite { + key[0] = b'r'; + } + keys.push(key); + vals.push(val); + } + + let mut iter = EventIterator::with_rewriting(&event, b"k", b"r"); + + let mut index = 0_usize; + loop { + if !iter.valid() { + break; + } + iter.next().unwrap(); + assert_eq!(iter.key(), keys[index]); + assert_eq!(iter.value(), vals[index]); + index += 1; + } + assert_eq!(count, index); + } } diff --git a/components/tikv_util/src/config.rs b/components/tikv_util/src/config.rs index c3d240d3c4f..7b3e6cd2469 100644 --- a/components/tikv_util/src/config.rs +++ b/components/tikv_util/src/config.rs @@ -15,6 +15,10 @@ use 
std::{ time::Duration, }; +use chrono::{ + format::{self, Fixed, Item, Parsed}, + DateTime, FixedOffset, Local, NaiveTime, TimeZone, Timelike, +}; use online_config::ConfigValue; use serde::{ de::{self, Unexpected, Visitor}, @@ -522,6 +526,166 @@ impl<'de> Deserialize<'de> for ReadableDuration { } } +#[derive(Clone, Debug, Copy, PartialEq)] +pub struct ReadableOffsetTime(pub NaiveTime, pub FixedOffset); + +impl From for ConfigValue { + fn from(ot: ReadableOffsetTime) -> ConfigValue { + ConfigValue::OffsetTime((ot.0, ot.1)) + } +} + +impl From for ReadableOffsetTime { + fn from(c: ConfigValue) -> ReadableOffsetTime { + if let ConfigValue::OffsetTime(ot) = c { + ReadableOffsetTime(ot.0, ot.1) + } else { + panic!("expect: ConfigValue::OffsetTime, got: {:?}", c) + } + } +} + +#[derive(Clone, Debug, PartialEq, Serialize, Deserialize, Default)] +pub struct ReadableSchedule(pub Vec); + +impl From for ConfigValue { + fn from(otv: ReadableSchedule) -> ConfigValue { + ConfigValue::Schedule(otv.0.into_iter().map(|ot| (ot.0, ot.1)).collect::>()) + } +} + +impl From for ReadableSchedule { + fn from(c: ConfigValue) -> ReadableSchedule { + if let ConfigValue::Schedule(otv) = c { + ReadableSchedule( + otv.into_iter() + .map(|(o, t)| ReadableOffsetTime(o, t)) + .collect::>(), + ) + } else { + panic!("expect: ConfigValue::Schedule, got: {:?}", c) + } + } +} + +impl ReadableSchedule { + pub fn is_empty(&self) -> bool { + self.0.is_empty() + } + + pub fn is_scheduled_this_hour(&self, datetime: &DateTime) -> bool { + self.0.iter().any(|time| time.hour_matches(datetime)) + } + + pub fn is_scheduled_this_hour_minute(&self, datetime: &DateTime) -> bool { + self.0 + .iter() + .any(|time| time.hour_minutes_matches(datetime)) + } +} + +impl FromStr for ReadableOffsetTime { + type Err = String; + + fn from_str(ot_str: &str) -> Result { + let (time, offset) = if let Some((time_str, offset_str)) = ot_str.split_once(' ') { + let time = NaiveTime::parse_from_str(time_str, "%H:%M").map_err(|e| 
e.to_string())?; + let offset = parse_offset(offset_str)?; + (time, offset) + } else { + let time = NaiveTime::parse_from_str(ot_str, "%H:%M").map_err(|e| e.to_string())?; + (time, local_offset()) + }; + Ok(ReadableOffsetTime(time, offset)) + } +} + +/// Returns the `FixedOffset` for the timezone this `tikv` server has been +/// configured to use. +fn local_offset() -> FixedOffset { + let &offset = Local::now().offset(); + offset +} + +/// Parses the offset specified by `str`. +/// Note: `FixedOffset` in latest `chrono` implements `FromStr`. Once we are +/// able to upgrade to it (`components/tidb_query_datatype` requires a large +/// refactoring that is outside the scope of this PR), we can remove this +/// method. +fn parse_offset(offset_str: &str) -> Result { + let mut parsed = Parsed::new(); + format::parse( + &mut parsed, + offset_str, + [Item::Fixed(Fixed::TimezoneOffsetZ)].iter(), + ) + .map_err(|e| e.to_string())?; + parsed.to_fixed_offset().map_err(|e| e.to_string()) +} + +impl fmt::Display for ReadableOffsetTime { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "{} {}", self.0, self.1) + } +} + +impl ReadableOffsetTime { + /// Converts `datetime` from `Tz` to the same timezone as this instance and + /// returns `true` if the hour of the day is matches hour of this + /// instance. + pub fn hour_matches(&self, datetime: &DateTime) -> bool { + self.convert_to_this_offset(datetime).hour() == self.0.hour() + } + + /// Converts `datetime` from `Tz` to the same timezone as this instance and + /// returns `true` if hours and minutes match this instance. 
+ pub fn hour_minutes_matches(&self, datetime: &DateTime) -> bool { + let time = self.convert_to_this_offset(datetime); + time.hour() == self.0.hour() && time.minute() == self.0.minute() + } + + fn convert_to_this_offset(&self, datetime: &DateTime) -> NaiveTime { + datetime.with_timezone(&self.1).time() + } +} + +impl Serialize for ReadableOffsetTime { + fn serialize(&self, serializer: S) -> Result + where + S: Serializer, + { + let mut buffer = String::new(); + write!(buffer, "{}", self).unwrap(); + serializer.serialize_str(&buffer) + } +} + +impl<'de> Deserialize<'de> for ReadableOffsetTime { + fn deserialize(deserializer: D) -> Result + where + D: Deserializer<'de>, + { + struct OffTimeVisitor; + + impl<'de> Visitor<'de> for OffTimeVisitor { + type Value = ReadableOffsetTime; + + fn expecting(&self, formatter: &mut fmt::Formatter<'_>) -> fmt::Result { + formatter.write_str("valid duration") + } + + fn visit_str(self, off_time_str: &str) -> Result + where + E: de::Error, + { + off_time_str.parse().map_err(E::custom) + } + } + + deserializer.deserialize_str(OffTimeVisitor) + } +} + pub fn normalize_path>(path: P) -> PathBuf { use std::path::Component; let mut components = path.as_ref().components().peekable(); @@ -1424,9 +1588,10 @@ macro_rules! numeric_enum_serializing_mod { /// States: /// 1. Init - Only source directory contains Raft data. /// 2. Migrating - A marker file contains the path of source directory. The -/// source directory contains a complete copy of Raft data. Target -/// directory may exist. 3. Completed - Only target directory contains Raft -/// data. Marker file may exist. +/// source directory contains a complete copy of Raft data. Target +/// directory may exist. +/// 3. Completed - Only target directory contains Raft data. Marker file may +/// exist. 
pub struct RaftDataStateMachine { root: PathBuf, in_progress_marker: PathBuf, @@ -1517,7 +1682,7 @@ impl RaftDataStateMachine { pub fn after_dump_data(&mut self) { assert!(Self::data_exists(&self.source)); assert!(Self::data_exists(&self.target)); - Self::must_remove(&self.source); // Enters the `Completed` state. + Self::must_remove_except(&self.source, &self.target); // Enters the `Completed` state. Self::must_remove(&self.in_progress_marker); } @@ -1569,6 +1734,31 @@ impl RaftDataStateMachine { } } + // Remove all files and directories under `remove_path` except `retain_path`. + fn must_remove_except(remove_path: &Path, retain_path: &Path) { + if !remove_path.exists() { + info!("Path not exists"; "path" => %remove_path.display()); + return; + } + if !remove_path.is_dir() { + info!("Path is not a directory, so remove directly"; "path" => %remove_path.display()); + Self::must_remove(remove_path); + return; + } + if !retain_path.starts_with(remove_path) { + info!("Removing directory as retain path is not under remove path"; "retain path" => %retain_path.display(), "remove path" => %remove_path.display()); + Self::must_remove(remove_path); + return; + } + + for entry in fs::read_dir(remove_path).unwrap() { + let sub_path = entry.unwrap().path(); + if sub_path != retain_path { + Self::must_remove(&sub_path); + } + } + } + fn must_rename_dir(from: &Path, to: &Path) { fs::rename(from, to).unwrap(); let mut dir = to.to_path_buf(); @@ -1576,11 +1766,35 @@ impl RaftDataStateMachine { Self::sync_dir(&dir); } - fn data_exists(path: &Path) -> bool { - if !path.exists() || !path.is_dir() { + #[inline] + fn dir_exists(path: &Path) -> bool { + path.exists() && path.is_dir() + } + + pub fn raftengine_exists(path: &Path) -> bool { + if !Self::dir_exists(path) { + return false; + } + fs::read_dir(path).unwrap().any(|entry| { + if let Ok(e) = entry { + let p = e.path(); + p.is_file() && p.extension().map_or(false, |ext| ext == "raftlog") + } else { + false + } + }) + } + + pub fn 
raftdb_exists(path: &Path) -> bool { + if !Self::dir_exists(path) { return false; } - fs::read_dir(path).unwrap().next().is_some() + let current_file_path = path.join("CURRENT"); + current_file_path.exists() && current_file_path.is_file() + } + + pub fn data_exists(path: &Path) -> bool { + Self::raftengine_exists(path) || Self::raftdb_exists(path) } fn sync_dir(dir: &Path) { @@ -1771,6 +1985,90 @@ mod tests { assert!(toml::from_str::("d = 23").is_err()); } + #[test] + fn test_readable_offset_time() { + let decode_cases = vec![ + ( + "23:00 +0000", + ReadableOffsetTime( + NaiveTime::from_hms_opt(23, 00, 00).unwrap(), + FixedOffset::east_opt(0).unwrap(), + ), + ), + ( + "03:00", + ReadableOffsetTime(NaiveTime::from_hms_opt(3, 00, 00).unwrap(), local_offset()), + ), + ( + "13:23 +09:30", + ReadableOffsetTime( + NaiveTime::from_hms_opt(13, 23, 00).unwrap(), + FixedOffset::east_opt(3600 * 9 + 1800).unwrap(), + ), + ), + ( + "09:30 -08:00", + ReadableOffsetTime( + NaiveTime::from_hms_opt(9, 30, 00).unwrap(), + FixedOffset::west_opt(3600 * 8).unwrap(), + ), + ), + ]; + for (encoded, expected) in decode_cases { + let actual = encoded.parse::().unwrap_or_else(|e| { + panic!( + "error parsing encoded={} expected={} error={}", + encoded, expected, e + ) + }); + assert_eq!(actual, expected); + } + let time = ReadableOffsetTime( + NaiveTime::from_hms_opt(9, 30, 00).unwrap(), + FixedOffset::west_opt(0).unwrap(), + ); + assert_eq!(format!("{}", time), "09:30:00 +00:00"); + let dt = DateTime::parse_from_rfc3339("2023-10-27T09:39:57-00:00").unwrap(); + assert!(time.hour_matches(&dt)); + assert!(!time.hour_minutes_matches(&dt)); + let dt = DateTime::parse_from_rfc3339("2023-10-27T09:30:57-00:00").unwrap(); + assert!(time.hour_minutes_matches(&dt)); + } + + #[test] + fn test_readable_schedule() { + let schedule = ReadableSchedule( + vec!["09:30 +00:00", "23:00 +00:00"] + .into_iter() + .flat_map(ReadableOffsetTime::from_str) + .collect::>(), + ); + + let time_a = 
DateTime::parse_from_rfc3339("2023-10-27T09:30:57-00:00").unwrap(); + let time_b = DateTime::parse_from_rfc3339("2023-10-28T09:00:57-00:00").unwrap(); + let time_c = DateTime::parse_from_rfc3339("2023-10-27T23:15:00-00:00").unwrap(); + let time_d = DateTime::parse_from_rfc3339("2023-10-27T23:00:00-00:00").unwrap(); + let time_e = DateTime::parse_from_rfc3339("2023-10-27T20:00:00-00:00").unwrap(); + + // positives for schedule by hour + assert!(schedule.is_scheduled_this_hour(&time_a)); + assert!(schedule.is_scheduled_this_hour(&time_b)); + assert!(schedule.is_scheduled_this_hour(&time_c)); + assert!(schedule.is_scheduled_this_hour(&time_d)); + + // negatives for schedule by hour + assert!(!schedule.is_scheduled_this_hour(&time_e)); + + // positives for schedule by hour and minute + assert!(schedule.is_scheduled_this_hour_minute(&time_a)); + assert!(schedule.is_scheduled_this_hour_minute(&time_d)); + + // negatives for schedule by hour and minute + assert!(!schedule.is_scheduled_this_hour_minute(&time_b)); + assert!(!schedule.is_scheduled_this_hour_minute(&time_c)); + assert!(!schedule.is_scheduled_this_hour_minute(&time_e)); + } + #[test] fn test_canonicalize_path() { let tmp = Builder::new() @@ -2100,6 +2398,98 @@ yyy = 100 ); } + #[test] + fn test_raft_engine_switch() { + // default setting, raft-db and raft-engine are not in the same place, need + // dump raft data from raft-db + let dir = tempfile::Builder::new().tempdir().unwrap(); + let root = dir.path().join("root"); + let source = root.join("source"); + fs::create_dir_all(&source).unwrap(); + let raftdb_data = source.join("CURRENT"); + fs::File::create(raftdb_data).unwrap(); + let target = root.join("target"); + fs::create_dir_all(&target).unwrap(); + let mut state = RaftDataStateMachine::new( + root.to_str().unwrap(), + source.to_str().unwrap(), + target.to_str().unwrap(), + ); + state.validate(true).unwrap(); + let should_dump = state.before_open_target(); + assert!(should_dump); + 
fs::remove_dir_all(&root).unwrap(); + + // raft-db is eventually moved, can't dump from raft-db + let dir = tempfile::Builder::new().tempdir().unwrap(); + let root = dir.path().join("root"); + let source = root.join("source"); + fs::create_dir_all(&source).unwrap(); + let target = root.join("target"); + fs::create_dir_all(&target).unwrap(); + state = RaftDataStateMachine::new( + root.to_str().unwrap(), + source.to_str().unwrap(), + target.to_str().unwrap(), + ); + state.validate(true).unwrap_err(); + fs::remove_dir_all(&root).unwrap(); + + // when setting raft-db dir, raft-engine dir is not set, raft-engine dir + // inherit from raft-db dir, need to dump raft data from raft-db + let dir = tempfile::Builder::new().tempdir().unwrap(); + let root = dir.path().join("root"); + let source = root.join("source"); + fs::create_dir_all(&source).unwrap(); + let raftdb_data = source.join("CURRENT"); + fs::File::create(raftdb_data).unwrap(); + let target = source.join("target"); + fs::create_dir_all(&target).unwrap(); + state = RaftDataStateMachine::new( + root.to_str().unwrap(), + source.to_str().unwrap(), + target.to_str().unwrap(), + ); + state.validate(true).unwrap(); + let should_dump = state.before_open_target(); + assert!(should_dump); + fs::remove_dir_all(&root).unwrap(); + + // inherit scenario raft-db is eventually moved, can't dump from raft-db + let dir = tempfile::Builder::new().tempdir().unwrap(); + let root = dir.path().join("root"); + let source = root.join("source"); + fs::create_dir_all(&source).unwrap(); + let target = source.join("target"); + fs::create_dir_all(&target).unwrap(); + state = RaftDataStateMachine::new( + root.to_str().unwrap(), + source.to_str().unwrap(), + target.to_str().unwrap(), + ); + state.validate(true).unwrap_err(); + fs::remove_dir_all(&root).unwrap(); + + // raft-db dump from raft-engine + let dir = tempfile::Builder::new().tempdir().unwrap(); + let root = dir.path().join("root"); + let source = root.join("source"); + 
fs::create_dir_all(&source).unwrap(); + let raftdb_data = source.join("CURRENT"); + fs::File::create(raftdb_data).unwrap(); + let target = source.join("target"); + fs::create_dir_all(&target).unwrap(); + let mut state = RaftDataStateMachine::new( + root.to_str().unwrap(), + source.to_str().unwrap(), + target.to_str().unwrap(), + ); + state.validate(true).unwrap(); + let should_dump = state.before_open_target(); + assert!(should_dump); + fs::remove_dir_all(&root).unwrap(); + } + #[test] fn test_raft_data_migration() { fn run_migration(root: &Path, source: &Path, target: &Path, check: F) { @@ -2122,12 +2512,15 @@ yyy = 100 fs::write(&marker, backup_marker).unwrap(); } - let source_file = source.join("file"); - let target_file = target.join("file"); + let mut source_file = source.join("CURRENT"); + let target_file = target.join("0000000000000001.raftlog"); if !target.exists() { fs::create_dir_all(target).unwrap(); check(); } + if !source_file.exists() { + source_file = source.join("0000000000000001.raftlog"); + } fs::copy(source_file, target_file).unwrap(); check(); state.after_dump_data_with_check(&check); @@ -2159,7 +2552,7 @@ yyy = 100 let target = root.join("target"); fs::create_dir_all(&target).unwrap(); // Write some data into source. - let source_file = source.join("file"); + let source_file = source.join("CURRENT"); File::create(source_file).unwrap(); let backup = dir.path().join("backup"); @@ -2175,4 +2568,163 @@ yyy = 100 copy_dir(&backup, &root).unwrap(); }); } + + #[test] + fn test_must_remove_except() { + fn create_raftdb(path: &Path) { + fs::create_dir(path).unwrap(); + // CURRENT file as the marker of raftdb. 
+ let raftdb_data = path.join("CURRENT"); + fs::File::create(raftdb_data).unwrap(); + } + + fn create_raftengine(path: &Path) { + fs::create_dir(path).unwrap(); + let raftengine_data = path.join("raftengine_data"); + fs::File::create(raftengine_data).unwrap(); + } + + fn create_test_root(path: &Path) { + fs::create_dir(path).unwrap(); + } + + fn raftengine_must_exist(path: &Path) { + assert!(path.exists()); + let raftengine_data = path.join("raftengine_data"); + assert!(raftengine_data.exists()); + } + + fn raftdb_must_not_exist(path: &Path) { + assert!(!path.exists()); + let raftdb_data = path.join("raftdb_data"); + assert!(!raftdb_data.exists()); + } + let test_dir = tempfile::Builder::new() + .tempdir() + .unwrap() + .into_path() + .join("test_must_remove_except"); + + // before: + // test_must_remove_except + // ├── raftdb + // │ └── raftdb_data + // └── raftengine + // └── raftengine_data + // + // after: + // test_must_remove_except + // └── raftengine + // └── raftengine_data + create_test_root(&test_dir); + let raftdb_dir = test_dir.join("raftdb"); + let raftengine_dir = test_dir.join("raftengine"); + create_raftdb(&raftdb_dir); + create_raftengine(&raftengine_dir); + RaftDataStateMachine::must_remove_except(&raftdb_dir, &raftengine_dir); + raftengine_must_exist(&raftengine_dir); + raftdb_must_not_exist(&raftdb_dir); + fs::remove_dir_all(&test_dir).unwrap(); + + // before: + // test_must_remove_except/ + // └── raftdb + // ├── raftdb_data + // └── raftengine + // └── raftengine_data + // + // after: + // test_must_remove_except/ + // └── raftdb + // └── raftengine + // └── raftengine_data + create_test_root(&test_dir); + let raftdb_dir = test_dir.join("raftdb"); + let raftengine_dir = raftdb_dir.join("raftengine"); + create_raftdb(&raftdb_dir); + create_raftengine(&raftengine_dir); + RaftDataStateMachine::must_remove_except(&raftdb_dir, &raftengine_dir); + raftengine_must_exist(&raftengine_dir); + assert!(!test_dir.join("raftdb/raftdb_data").exists()); + 
fs::remove_dir_all(&test_dir).unwrap(); + + // before: + // test_must_remove_except/ + // └── raftengine + // ├── raftdb + // │ └── raftdb_data + // └── raftengine_data + // + // after: + // test_must_remove_except/ + // └── raftengine + // └── raftengine_data + create_test_root(&test_dir); + let raftengine_dir = test_dir.join("raftengine"); + let raftdb_dir = raftengine_dir.join("raftdb"); + create_raftengine(&raftengine_dir); + create_raftdb(&raftdb_dir); + RaftDataStateMachine::must_remove_except(&raftdb_dir, &raftengine_dir); + raftengine_must_exist(&raftengine_dir); + raftdb_must_not_exist(&raftdb_dir); + fs::remove_dir_all(&test_dir).unwrap(); + + // before: + // test_must_remove_except/ + // ├── raftdb_data + // └── raftengine + // └── raftengine_data + // + // after: + // test_must_remove_except/ + // └── raftengine + // └── raftengine_data + create_test_root(&test_dir); + let raftdb_data = test_dir.join("raftdb_data"); + fs::File::create(raftdb_data).unwrap(); + let raftengine_dir = test_dir.join("raftengine"); + create_raftengine(&raftengine_dir); + RaftDataStateMachine::must_remove_except(&test_dir, &raftengine_dir); + raftengine_must_exist(&raftengine_dir); + assert!(!test_dir.join("raftdb_data").exists()); + fs::remove_dir_all(&test_dir).unwrap(); + } + + #[test] + fn test_raft_data_exist() { + fn clear_dir(path: &PathBuf) { + if path.exists() { + fs::remove_dir_all(path).unwrap(); + } + fs::create_dir(path).unwrap(); + } + let test_dir = tempfile::Builder::new().tempdir().unwrap().into_path(); + + clear_dir(&test_dir); + fs::File::create(test_dir.join("0000000000000001.raftlog")).unwrap(); + assert!(RaftDataStateMachine::raftengine_exists(&test_dir)); + + clear_dir(&test_dir); + fs::File::create(test_dir.join("0000000000000001.raftlog")).unwrap(); + fs::File::create(test_dir.join("trash")).unwrap(); + assert!(RaftDataStateMachine::raftengine_exists(&test_dir)); + + clear_dir(&test_dir); + fs::File::create(test_dir.join("raftlog")).unwrap(); + 
assert!(!RaftDataStateMachine::raftengine_exists(&test_dir)); + + clear_dir(&test_dir); + assert!(!RaftDataStateMachine::raftengine_exists(&test_dir)); + + clear_dir(&test_dir); + fs::File::create(test_dir.join("CURRENT")).unwrap(); + assert!(RaftDataStateMachine::raftdb_exists(&test_dir)); + + clear_dir(&test_dir); + fs::File::create(test_dir.join("NOT_CURRENT")).unwrap(); + assert!(!RaftDataStateMachine::raftdb_exists(&test_dir)); + + clear_dir(&test_dir); + assert!(!RaftDataStateMachine::raftdb_exists(&test_dir)); + } } diff --git a/components/tikv_util/src/deadline.rs b/components/tikv_util/src/deadline.rs index 84463f507b9..64416999fe3 100644 --- a/components/tikv_util/src/deadline.rs +++ b/components/tikv_util/src/deadline.rs @@ -1,6 +1,7 @@ // Copyright 2019 TiKV Project Authors. Licensed under Apache-2.0. use fail::fail_point; +use kvproto::errorpb; use super::time::{Duration, Instant}; @@ -58,3 +59,11 @@ impl Deadline { std::time::Instant::now() + self.deadline.duration_since(Instant::now_coarse()) } } + +const DEADLINE_EXCEEDED: &str = "deadline is exceeded"; + +pub fn set_deadline_exceeded_busy_error(e: &mut errorpb::Error) { + let mut server_is_busy_err = errorpb::ServerIsBusy::default(); + server_is_busy_err.set_reason(DEADLINE_EXCEEDED.to_owned()); + e.set_server_is_busy(server_is_busy_err); +} diff --git a/components/tikv_util/src/lib.rs b/components/tikv_util/src/lib.rs index b8aa578a878..908f32db86f 100644 --- a/components/tikv_util/src/lib.rs +++ b/components/tikv_util/src/lib.rs @@ -32,7 +32,6 @@ use nix::{ sys::wait::{wait, WaitStatus}, unistd::{fork, ForkResult}, }; -use rand::rngs::ThreadRng; use crate::sys::thread::StdThreadBuildWrapper; @@ -54,6 +53,7 @@ pub mod memory; pub mod metrics; pub mod mpsc; pub mod quota_limiter; +pub mod resource_control; pub mod store; pub mod stream; pub mod sys; @@ -133,38 +133,6 @@ pub fn slices_in_range(entry: &VecDeque, low: usize, high: usize) -> (&[T] } } -pub struct DefaultRng { - rng: ThreadRng, -} - 
-impl DefaultRng { - fn new() -> DefaultRng { - DefaultRng { - rng: rand::thread_rng(), - } - } -} - -impl Default for DefaultRng { - fn default() -> DefaultRng { - DefaultRng::new() - } -} - -impl Deref for DefaultRng { - type Target = ThreadRng; - - fn deref(&self) -> &ThreadRng { - &self.rng - } -} - -impl DerefMut for DefaultRng { - fn deref_mut(&mut self) -> &mut ThreadRng { - &mut self.rng - } -} - /// A handy shortcut to replace `RwLock` write/read().unwrap() pattern to /// shortcut wl and rl. pub trait HandyRwLock { diff --git a/components/tikv_util/src/log.rs b/components/tikv_util/src/log.rs index fd351eecbd4..91bd5013c1e 100644 --- a/components/tikv_util/src/log.rs +++ b/components/tikv_util/src/log.rs @@ -83,6 +83,18 @@ macro_rules! trace(($($args:tt)+) => { ::slog_global::trace!($($args)+) };); +/// Logs a infor or debug level message using the slog global logger. +#[macro_export] +macro_rules! info_or_debug{ + ($cond:expr; $($args:tt)+) => { + if $cond { + info!($($args)+) + } else { + debug!($($args)+) + } + }; +} + use std::fmt::{self, Display, Write}; use slog::{BorrowedKV, OwnedKVList, Record, KV}; diff --git a/components/tikv_util/src/logger/mod.rs b/components/tikv_util/src/logger/mod.rs index 5ebe9468a50..c321f56a1b5 100644 --- a/components/tikv_util/src/logger/mod.rs +++ b/components/tikv_util/src/logger/mod.rs @@ -6,6 +6,7 @@ mod formatter; use std::{ env, fmt, io::{self, BufWriter}, + num::NonZeroU64, path::{Path, PathBuf}, sync::{ atomic::{AtomicUsize, Ordering}, @@ -15,7 +16,10 @@ use std::{ }; use log::{self, SetLoggerError}; -use slog::{self, slog_o, Drain, FnValue, Key, OwnedKVList, PushFnValue, Record, KV}; +use slog::{ + self, slog_o, Drain, FnValue, Key, OwnedKV, OwnedKVList, PushFnValue, Record, + SendSyncRefUnwindSafeKV, KV, +}; pub use slog::{FilterFn, Level}; use slog_async::{Async, AsyncGuard, OverflowStrategy}; use slog_term::{Decorator, PlainDecorator, RecordDecorator}; @@ -85,7 +89,7 @@ where }; let filtered = 
GlobalLevelFilter::new(drain.filter(filter).fuse()); - (slog::Logger::root(filtered, slog_o!()), Some(guard)) + (slog::Logger::root(filtered, get_values()), Some(guard)) } else { let drain = LogAndFuse(Mutex::new(drain)); let drain = SlowLogFilter { @@ -93,7 +97,7 @@ where inner: drain, }; let filtered = GlobalLevelFilter::new(drain.filter(filter).fuse()); - (slog::Logger::root(filtered, slog_o!()), None) + (slog::Logger::root(filtered, get_values()), None) }; set_global_logger(level, init_stdlog, logger, guard) @@ -628,6 +632,18 @@ fn write_log_fields( Ok(()) } +fn format_thread_id(thread_id: NonZeroU64) -> String { + format!("{:#0x}", thread_id) +} + +fn get_values() -> OwnedKV { + slog_o!( + "thread_id" => FnValue(|_| { + format_thread_id(std::thread::current().id().as_u64()) + }) + ) +} + struct Serializer<'a> { decorator: &'a mut dyn RecordDecorator, } @@ -679,7 +695,7 @@ impl<'a> slog::Serializer for Serializer<'a> { #[cfg(test)] mod tests { - use std::{cell::RefCell, io, io::Write, str::from_utf8}; + use std::{cell::RefCell, io, io::Write, str::from_utf8, sync::RwLock, time::Duration}; use chrono::DateTime; use regex::Regex; @@ -705,8 +721,6 @@ mod tests { } fn log_format_cases(logger: slog::Logger) { - use std::time::Duration; - // Empty message is not recommend, just for test purpose here. 
slog_info!(logger, ""); slog_info!(logger, "Welcome"); @@ -763,21 +777,25 @@ mod tests { fn test_log_format_text() { let decorator = PlainSyncDecorator::new(TestWriter); let drain = TikvFormat::new(decorator, true).fuse(); - let logger = slog::Logger::root_typed(drain, slog_o!()).into_erased(); + let logger = slog::Logger::root_typed(drain, get_values()).into_erased(); log_format_cases(logger); - let expect = r#"[2019/01/15 13:40:39.619 +08:00] [INFO] [mod.rs:469] [] -[2019/01/15 13:40:39.619 +08:00] [INFO] [mod.rs:469] [Welcome] -[2019/01/15 13:40:39.619 +08:00] [INFO] [mod.rs:470] ["Welcome TiKV"] -[2019/01/15 13:40:39.619 +08:00] [INFO] [mod.rs:471] [欢迎] -[2019/01/15 13:40:39.619 +08:00] [INFO] [mod.rs:472] ["欢迎 TiKV"] -[2019/01/15 13:40:39.615 +08:00] [INFO] [mod.rs:455] ["failed to fetch URL"] [backoff=3s] [attempt=3] [url=http://example.com] -[2019/01/15 13:40:39.619 +08:00] [INFO] [mod.rs:460] ["failed to \"fetch\" [URL]: http://example.com"] -[2019/01/15 13:40:39.619 +08:00] [DEBUG] [mod.rs:463] ["Slow query"] ["process keys"=1500] [duration=123ns] [sql="SELECT * FROM TABLE WHERE ID=\"abc\""] -[2019/01/15 13:40:39.619 +08:00] [WARN] [mod.rs:473] [Type] [Other=-inf] [Score=inf] [Counter=NaN] -[2019/01/16 16:56:04.854 +08:00] [INFO] [mod.rs:391] ["more type tests"] [str_array="[\"💖\", \"�\", \"☺☻☹\", \"日a本b語ç日ð本Ê語þ日¥本¼語i日©\", \"日a本b語ç日ð本Ê語þ日¥本¼語i日©日a本b語ç日ð本Ê語þ日¥本¼語i日©日a本b語ç日ð本Ê語þ日¥本¼語i日©\", \"\\\\x80\\\\x80\\\\x80\\\\x80\", \"XML\"]"] [u8=34] [is_None=None] [is_false=false] [is_true=true] ["store ids"="[1, 2, 3]"] [url-peers="[\"peer1\", \"peer 2\"]"] [urls="[\"http://xxx.com:2347\", \"http://xxx.com:2432\"]"] [field2="in quote"] [field1=no_quote] -"#; + let thread_id = format_thread_id(std::thread::current().id().as_u64()); + let expect = format!( + r#"[2019/01/15 13:40:39.619 +08:00] [INFO] [mod.rs:469] [] [thread_id={0}] +[2019/01/15 13:40:39.619 +08:00] [INFO] [mod.rs:469] [Welcome] [thread_id={0}] +[2019/01/15 13:40:39.619 +08:00] [INFO] [mod.rs:470] 
["Welcome TiKV"] [thread_id={0}] +[2019/01/15 13:40:39.619 +08:00] [INFO] [mod.rs:471] [欢迎] [thread_id={0}] +[2019/01/15 13:40:39.619 +08:00] [INFO] [mod.rs:472] ["欢迎 TiKV"] [thread_id={0}] +[2019/01/15 13:40:39.615 +08:00] [INFO] [mod.rs:455] ["failed to fetch URL"] [backoff=3s] [attempt=3] [url=http://example.com] [thread_id={0}] +[2019/01/15 13:40:39.619 +08:00] [INFO] [mod.rs:460] ["failed to \"fetch\" [URL]: http://example.com"] [thread_id={0}] +[2019/01/15 13:40:39.619 +08:00] [DEBUG] [mod.rs:463] ["Slow query"] ["process keys"=1500] [duration=123ns] [sql="SELECT * FROM TABLE WHERE ID=\"abc\""] [thread_id={0}] +[2019/01/15 13:40:39.619 +08:00] [WARN] [mod.rs:473] [Type] [Other=-inf] [Score=inf] [Counter=NaN] [thread_id={0}] +[2019/01/16 16:56:04.854 +08:00] [INFO] [mod.rs:391] ["more type tests"] [str_array="[\"💖\", \"�\", \"☺☻☹\", \"日a本b語ç日ð本Ê語þ日¥本¼語i日©\", \"日a本b語ç日ð本Ê語þ日¥本¼語i日©日a本b語ç日ð本Ê語þ日¥本¼語i日©日a本b語ç日ð本Ê語þ日¥本¼語i日©\", \"\\\\x80\\\\x80\\\\x80\\\\x80\", \"XML\"]"] [u8=34] [is_None=None] [is_false=false] [is_true=true] ["store ids"="[1, 2, 3]"] [url-peers="[\"peer1\", \"peer 2\"]"] [urls="[\"http://xxx.com:2347\", \"http://xxx.com:2432\"]"] [field2="in quote"] [field1=no_quote] [thread_id={0}] +"#, + thread_id + ); BUFFER.with(|buffer| { let mut buffer = buffer.borrow_mut(); @@ -811,21 +829,25 @@ mod tests { fn test_log_format_json() { use serde_json::{from_str, Value}; let drain = Mutex::new(json_format(TestWriter, true)).map(slog::Fuse); - let logger = slog::Logger::root_typed(drain, slog_o!()).into_erased(); + let logger = slog::Logger::root_typed(drain, get_values()).into_erased(); log_format_cases(logger); - let expect = r#"{"time":"2020/05/16 15:49:52.449 +08:00","level":"INFO","caller":"mod.rs:469","message":""} -{"time":"2020/05/16 15:49:52.450 +08:00","level":"INFO","caller":"mod.rs:469","message":"Welcome"} -{"time":"2020/05/16 15:49:52.450 +08:00","level":"INFO","caller":"mod.rs:470","message":"Welcome TiKV"} -{"time":"2020/05/16 15:49:52.450 
+08:00","level":"INFO","caller":"mod.rs:471","message":"欢迎"} -{"time":"2020/05/16 15:49:52.450 +08:00","level":"INFO","caller":"mod.rs:472","message":"欢迎 TiKV"} -{"time":"2020/05/16 15:49:52.450 +08:00","level":"INFO","caller":"mod.rs:455","message":"failed to fetch URL","backoff":"3s","attempt":3,"url":"http://example.com"} -{"time":"2020/05/16 15:49:52.450 +08:00","level":"INFO","caller":"mod.rs:460","message":"failed to \"fetch\" [URL]: http://example.com"} -{"time":"2020/05/16 15:49:52.450 +08:00","level":"DEBUG","caller":"mod.rs:463","message":"Slow query","process keys":1500,"duration":"123ns","sql":"SELECT * FROM TABLE WHERE ID=\"abc\""} -{"time":"2020/05/16 15:49:52.450 +08:00","level":"WARN","caller":"mod.rs:473","message":"Type","Other":null,"Score":null,"Counter":null} -{"time":"2020/05/16 15:49:52.451 +08:00","level":"INFO","caller":"mod.rs:391","message":"more type tests","str_array":"[\"💖\", \"�\", \"☺☻☹\", \"日a本b語ç日ð本Ê語þ日¥本¼語i日©\", \"日a本b語ç日ð本Ê語þ日¥本¼語i日©日a本b語ç日ð本Ê語þ日¥本¼語i日©日a本b語ç日ð本Ê語þ日¥本¼語i日©\", \"\\\\x80\\\\x80\\\\x80\\\\x80\", \"XML\"]","u8":34,"is_None":null,"is_false":false,"is_true":true,"store ids":"[1, 2, 3]","url-peers":"[\"peer1\", \"peer 2\"]","urls":"[\"http://xxx.com:2347\", \"http://xxx.com:2432\"]","field2":"in quote","field1":"no_quote"} -"#; + let thread_id = format_thread_id(std::thread::current().id().as_u64()); + let expect = format!( + r#"{{"time":"2020/05/16 15:49:52.449 +08:00","level":"INFO","caller":"mod.rs:469","message":"","thread_id":"{0}"}} +{{"time":"2020/05/16 15:49:52.450 +08:00","level":"INFO","caller":"mod.rs:469","message":"Welcome","thread_id":"{0}"}} +{{"time":"2020/05/16 15:49:52.450 +08:00","level":"INFO","caller":"mod.rs:470","message":"Welcome TiKV","thread_id":"{0}"}} +{{"time":"2020/05/16 15:49:52.450 +08:00","level":"INFO","caller":"mod.rs:471","message":"欢迎","thread_id":"{0}"}} +{{"time":"2020/05/16 15:49:52.450 +08:00","level":"INFO","caller":"mod.rs:472","message":"欢迎 TiKV","thread_id":"{0}"}} 
+{{"time":"2020/05/16 15:49:52.450 +08:00","level":"INFO","caller":"mod.rs:455","message":"failed to fetch URL","backoff":"3s","attempt":3,"url":"http://example.com","thread_id":"{0}"}} +{{"time":"2020/05/16 15:49:52.450 +08:00","level":"INFO","caller":"mod.rs:460","message":"failed to \"fetch\" [URL]: http://example.com","thread_id":"{0}"}} +{{"time":"2020/05/16 15:49:52.450 +08:00","level":"DEBUG","caller":"mod.rs:463","message":"Slow query","process keys":1500,"duration":"123ns","sql":"SELECT * FROM TABLE WHERE ID=\"abc\"","thread_id":"{0}"}} +{{"time":"2020/05/16 15:49:52.450 +08:00","level":"WARN","caller":"mod.rs:473","message":"Type","Other":null,"Score":null,"Counter":null,"thread_id":"{0}"}} +{{"time":"2020/05/16 15:49:52.451 +08:00","level":"INFO","caller":"mod.rs:391","message":"more type tests","str_array":"[\"💖\", \"�\", \"☺☻☹\", \"日a本b語ç日ð本Ê語þ日¥本¼語i日©\", \"日a本b語ç日ð本Ê語þ日¥本¼語i日©日a本b語ç日ð本Ê語þ日¥本¼語i日©日a本b語ç日ð本Ê語þ日¥本¼語i日©\", \"\\\\x80\\\\x80\\\\x80\\\\x80\", \"XML\"]","u8":34,"is_None":null,"is_false":false,"is_true":true,"store ids":"[1, 2, 3]","url-peers":"[\"peer1\", \"peer 2\"]","urls":"[\"http://xxx.com:2347\", \"http://xxx.com:2432\"]","field2":"in quote","field1":"no_quote","thread_id":"{0}"}} +"#, + thread_id + ); BUFFER.with(|buffer| { let mut buffer = buffer.borrow_mut(); @@ -1074,4 +1096,48 @@ mod tests { } }); } + + static THREAD_SAFE_BUFFER: RwLock> = RwLock::new(Vec::new()); + + struct ThreadSafeWriter; + impl Write for ThreadSafeWriter { + fn write(&mut self, data: &[u8]) -> io::Result { + let mut buffer = THREAD_SAFE_BUFFER.write().unwrap(); + buffer.write(data) + } + + fn flush(&mut self) -> io::Result<()> { + let mut buffer = THREAD_SAFE_BUFFER.write().unwrap(); + buffer.flush() + } + } + + #[test] + fn test_threadid() { + let drain = TikvFormat::new(PlainSyncDecorator::new(ThreadSafeWriter), true).fuse(); + let logger = slog::Logger::root_typed(drain, get_values()).into_erased(); + + slog_info!(logger, "Hello from the first thread"); + 
let this_threadid = thread::current().id().as_u64(); + let this_threadid = format_thread_id(this_threadid); + + let handle = thread::spawn(move || { + slog_info!(logger, "Hello from the second thread"); + }); + let other_threadid = handle.thread().id().as_u64(); + let other_threadid = format_thread_id(other_threadid); + handle.join().unwrap(); + + let expected = vec![this_threadid, other_threadid]; + + let re = Regex::new(r"\[thread_id=(.*?)\]").unwrap(); + let buffer = THREAD_SAFE_BUFFER.read().unwrap(); + let output = from_utf8(&buffer).unwrap(); + let actual: Vec<&str> = output + .lines() + .map(|line| re.captures(line).unwrap()) + .map(|captures| captures.get(1).unwrap().as_str()) + .collect(); + assert_eq!(expected, actual); + } } diff --git a/components/tikv_util/src/lru.rs b/components/tikv_util/src/lru.rs index 76fad6e8a34..302bfc9264b 100644 --- a/components/tikv_util/src/lru.rs +++ b/components/tikv_util/src/lru.rs @@ -135,6 +135,10 @@ impl Trace { r.key.as_ptr().read() } } + + fn get_tail(&self) -> &K { + unsafe { self.tail.as_ref().prev.as_ref().key.assume_init_ref() } + } } impl Drop for Trace { @@ -174,14 +178,52 @@ impl SizePolicy for CountTracker { } } -pub struct LruCache +/// Some [`EvictPolicy`] (e.g. the `TxnStatusCache` in +/// `tikv::storage::txn::txn_status_cache` module) may need to know what the +/// entry bing popped out is to determine if it really can be popped. But there +/// is performance cost to always get the tail entry. So we pass this interface +/// to the `should_evict` function. An implementation of `EvictPolicy` can read +/// the tail entry only when it really needs. +pub trait GetTailEntry { + fn get_tail_entry(&self) -> Option<(&K, &V)>; +} + +/// An [`EvictPolicy`] defines how the [`LruCache`] should determine an entry +/// at the tail should be popped out. 
+pub trait EvictPolicy { + fn should_evict( + &self, + current_size: usize, + capacity: usize, + get_tail_entry: &impl GetTailEntry, + ) -> bool; +} + +/// The default [`EvictPolicy`] of [`LruCache`], which pops out entries at the +/// tail when the limit specified by `capacity` is exceeded. +pub struct EvictOnFull; + +impl EvictPolicy for EvictOnFull { + fn should_evict( + &self, + current_size: usize, + capacity: usize, + _: &impl GetTailEntry, + ) -> bool { + capacity < current_size + } +} + +pub struct LruCache where T: SizePolicy, + E: EvictPolicy, { map: HashMap>, trace: Trace, capacity: usize, size_policy: T, + evict_policy: E, } impl LruCache @@ -189,18 +231,30 @@ where T: SizePolicy, { pub fn with_capacity_sample_and_trace( - mut capacity: usize, + capacity: usize, sample_mask: usize, size_policy: T, ) -> LruCache { + Self::new(capacity, sample_mask, size_policy, EvictOnFull) + } +} + +impl LruCache +where + T: SizePolicy, + E: EvictPolicy, +{ + pub fn new(mut capacity: usize, sample_mask: usize, size_policy: T, evict_policy: E) -> Self { + // The capacity is at least 1. if capacity == 0 { capacity = 1; } - LruCache { + Self { map: HashMap::default(), trace: Trace::new(sample_mask), capacity, size_policy, + evict_policy, } } @@ -215,10 +269,18 @@ where self.trace.clear(); self.size_policy.on_reset(0); } + + /// Get the capacity limited on the `LruCache`. #[inline] pub fn capacity(&self) -> usize { self.capacity } + + /// Get the capacity actually allocated by the internal data structure. 
+ #[inline] + pub fn internal_allocated_capacity(&self) -> usize { + self.map.capacity() + } } impl LruCache @@ -234,25 +296,36 @@ where } } -impl LruCache +impl LruCache where K: Eq + Hash + Clone + std::fmt::Debug, T: SizePolicy, + E: EvictPolicy, { #[inline] - pub fn insert(&mut self, key: K, value: V) { + fn insert_impl(&mut self, key: K, value: V, replace: bool) -> bool { + let mut inserted = true; let mut old_key = None; let current_size = SizePolicy::::current(&self.size_policy); + // In case the current size exactly equals to capacity, we also expect to reuse + // tail when inserting. Use `current_size + 1` to include the case. + let should_evict_on_insert = + self.evict_policy + .should_evict(current_size + 1, self.capacity, self); match self.map.entry(key) { HashMapEntry::Occupied(mut e) => { - self.size_policy.on_remove(e.key(), &e.get().value); - self.size_policy.on_insert(e.key(), &value); - let mut entry = e.get_mut(); - self.trace.promote(entry.record); - entry.value = value; + if replace { + self.size_policy.on_remove(e.key(), &e.get().value); + self.size_policy.on_insert(e.key(), &value); + let mut entry = e.get_mut(); + self.trace.promote(entry.record); + entry.value = value; + } else { + inserted = false; + } } HashMapEntry::Vacant(v) => { - let record = if self.capacity <= current_size { + let record = if should_evict_on_insert { let res = self.trace.reuse_tail(v.key().clone()); old_key = Some(res.0); res.1 @@ -274,7 +347,8 @@ where // Perhaps we can reject entries larger than capacity goes in the LRU cache, but // that is impossible for now: the `SizePolicy` trait doesn't provide the // interface of querying the actual size of an item. - self.evict_until_fit() + self.evict_until_fit(); + inserted } fn evict_until_fit(&mut self) { @@ -283,7 +357,7 @@ where let current_size = self.size_policy.current(); // Should we keep at least one entry? So our users won't lose their fresh record // once it exceeds the capacity. 
- if current_size <= cap || self.map.is_empty() { + if !self.evict_policy.should_evict(current_size, cap, self) || self.map.is_empty() { break; } let key = self.trace.remove_tail(); @@ -292,6 +366,18 @@ where } } + #[inline] + pub fn insert(&mut self, key: K, value: V) { + self.insert_impl(key, value, true); + } + + /// Insert an entry if the key doesn't exist before. The existing entry + /// won't be replaced and won't be promoted to the most-recent place. + #[inline] + pub fn insert_if_not_exist(&mut self, key: K, value: V) -> bool { + self.insert_impl(key, value, false) + } + #[inline] pub fn remove(&mut self, key: &K) -> Option { if let Some(v) = self.map.remove(key) { @@ -313,6 +399,12 @@ where } } + /// Get an item by key without promoting the item. + #[inline] + pub fn get_no_promote(&self, key: &K) -> Option<&V> { + self.map.get(key).map(|v| &v.value) + } + #[inline] pub fn get_mut(&mut self, key: &K) -> Option<&mut V> { match self.map.get_mut(key) { @@ -355,17 +447,37 @@ where } } -unsafe impl Send for LruCache +impl GetTailEntry for LruCache +where + K: Eq + Hash + Clone + std::fmt::Debug, + T: SizePolicy, + E: EvictPolicy, +{ + fn get_tail_entry(&self) -> Option<(&K, &V)> { + if self.is_empty() { + return None; + } + + let k = self.trace.get_tail(); + self.map + .get_key_value(k) + .map(|(k, entry)| (k, &entry.value)) + } +} + +unsafe impl Send for LruCache where K: Send, V: Send, T: Send + SizePolicy, + E: Send + EvictPolicy, { } -impl Drop for LruCache +impl Drop for LruCache where T: SizePolicy, + E: EvictPolicy, { fn drop(&mut self) { self.clear(); @@ -626,4 +738,61 @@ mod tests { assert!(cache.size() <= 42); } } + + #[test] + fn test_get_no_promote() { + let mut cache = LruCache::with_capacity_sample_and_trace(3, 0, CountTracker::default()); + cache.insert(1, 1); + cache.insert(2, 2); + cache.insert(3, 3); + assert_eq!(cache.size(), 3); + assert_eq!(*cache.get_no_promote(&1).unwrap(), 1); + cache.insert(4, 4); + assert_eq!(cache.size(), 3); + // Key 
1 is not promoted, so it's popped out first. + assert!(cache.get_no_promote(&1).is_none()); + // Other entries are not affected. + assert_eq!(*cache.get_no_promote(&2).unwrap(), 2); + assert_eq!(*cache.get_no_promote(&3).unwrap(), 3); + assert_eq!(*cache.get_no_promote(&4).unwrap(), 4); + } + + #[test] + fn test_insert_if_not_exist() { + let mut cache = LruCache::with_capacity_sample_and_trace(4, 0, CountTracker::default()); + assert!(cache.insert_if_not_exist(1, 1)); + assert!(cache.insert_if_not_exist(2, 2)); + assert!(cache.insert_if_not_exist(3, 3)); + assert_eq!(cache.size(), 3); + assert_eq!(*cache.get_no_promote(&1).unwrap(), 1); + assert_eq!(*cache.get_no_promote(&2).unwrap(), 2); + assert_eq!(*cache.get_no_promote(&3).unwrap(), 3); + + assert!(!cache.insert_if_not_exist(1, 11)); + // Not updated. + assert_eq!(*cache.get_no_promote(&1).unwrap(), 1); + + assert!(cache.insert_if_not_exist(4, 4)); + assert!(!cache.insert_if_not_exist(2, 22)); + // Not updated. + assert_eq!(*cache.get_no_promote(&2).unwrap(), 2); + + assert_eq!(cache.size(), 4); + assert!(cache.insert_if_not_exist(5, 5)); + assert_eq!(cache.size(), 4); + // key 1 is not promoted, so it's first popped out. + assert!(cache.get_no_promote(&1).is_none()); + assert_eq!(*cache.get_no_promote(&2).unwrap(), 2); + + assert!(cache.insert_if_not_exist(6, 6)); + assert_eq!(cache.size(), 4); + // key 2 is not promoted either, so it's first popped out. + assert!(cache.get_no_promote(&2).is_none()); + assert_eq!(*cache.get_no_promote(&3).unwrap(), 3); + + assert!(cache.insert_if_not_exist(7, 7)); + assert_eq!(cache.size(), 4); + assert!(cache.get_no_promote(&3).is_none()); + assert_eq!(*cache.get_no_promote(&4).unwrap(), 4); + } } diff --git a/components/tikv_util/src/memory.rs b/components/tikv_util/src/memory.rs index 0a2f49461c5..15ffece4425 100644 --- a/components/tikv_util/src/memory.rs +++ b/components/tikv_util/src/memory.rs @@ -1,6 +1,12 @@ // Copyright 2021 TiKV Project Authors. 
Licensed under Apache-2.0. -use std::mem; +use std::{ + mem, + sync::{ + atomic::{AtomicUsize, Ordering}, + Arc, + }, +}; use kvproto::{ encryptionpb::EncryptionMeta, @@ -28,6 +34,12 @@ pub trait HeapSize { } } +impl HeapSize for [u8] { + fn heap_size(&self) -> usize { + self.len() * mem::size_of::() + } +} + impl HeapSize for Region { fn heap_size(&self) -> usize { let mut size = self.start_key.capacity() + self.end_key.capacity(); @@ -65,3 +77,159 @@ impl HeapSize for RaftCmdRequest { + mem::size_of_val(&self.status_request) } } + +#[derive(Debug)] +pub struct MemoryQuotaExceeded; + +impl std::error::Error for MemoryQuotaExceeded {} + +impl_display_as_debug!(MemoryQuotaExceeded); + +pub struct MemoryQuota { + in_use: AtomicUsize, + capacity: AtomicUsize, +} + +pub struct OwnedAllocated { + allocated: usize, + from: Arc, +} + +impl OwnedAllocated { + pub fn new(target: Arc) -> Self { + Self { + allocated: 0, + from: target, + } + } + + pub fn alloc(&mut self, bytes: usize) -> Result<(), MemoryQuotaExceeded> { + self.from.alloc(bytes)?; + self.allocated += bytes; + Ok(()) + } +} + +impl Drop for OwnedAllocated { + fn drop(&mut self) { + self.from.free(self.allocated) + } +} + +impl MemoryQuota { + pub fn new(capacity: usize) -> MemoryQuota { + MemoryQuota { + in_use: AtomicUsize::new(0), + capacity: AtomicUsize::new(capacity), + } + } + + pub fn in_use(&self) -> usize { + self.in_use.load(Ordering::Relaxed) + } + + pub fn capacity(&self) -> usize { + self.capacity.load(Ordering::Relaxed) + } + + pub fn set_capacity(&self, capacity: usize) { + self.capacity.store(capacity, Ordering::Relaxed); + } + + pub fn alloc(&self, bytes: usize) -> Result<(), MemoryQuotaExceeded> { + let capacity = self.capacity.load(Ordering::Relaxed); + let mut in_use_bytes = self.in_use.load(Ordering::Relaxed); + loop { + if in_use_bytes + bytes > capacity { + return Err(MemoryQuotaExceeded); + } + let new_in_use_bytes = in_use_bytes + bytes; + match self.in_use.compare_exchange_weak( + 
in_use_bytes, + new_in_use_bytes, + Ordering::Relaxed, + Ordering::Relaxed, + ) { + Ok(_) => return Ok(()), + Err(current) => in_use_bytes = current, + } + } + } + + pub fn free(&self, bytes: usize) { + let mut in_use_bytes = self.in_use.load(Ordering::Relaxed); + loop { + // Saturating at the numeric bounds instead of overflowing. + let new_in_use_bytes = in_use_bytes - std::cmp::min(bytes, in_use_bytes); + match self.in_use.compare_exchange_weak( + in_use_bytes, + new_in_use_bytes, + Ordering::Relaxed, + Ordering::Relaxed, + ) { + Ok(_) => return, + Err(current) => in_use_bytes = current, + } + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_memory_quota() { + let quota = MemoryQuota::new(100); + quota.alloc(10).unwrap(); + assert_eq!(quota.in_use(), 10); + quota.alloc(100).unwrap_err(); + assert_eq!(quota.in_use(), 10); + quota.free(5); + assert_eq!(quota.in_use(), 5); + quota.alloc(95).unwrap(); + assert_eq!(quota.in_use(), 100); + quota.free(95); + assert_eq!(quota.in_use(), 5); + } + + #[test] + fn test_resize_memory_quota() { + let quota = MemoryQuota::new(100); + quota.alloc(10).unwrap(); + assert_eq!(quota.in_use(), 10); + quota.alloc(100).unwrap_err(); + assert_eq!(quota.in_use(), 10); + quota.set_capacity(200); + quota.alloc(100).unwrap(); + assert_eq!(quota.in_use(), 110); + quota.set_capacity(50); + quota.alloc(100).unwrap_err(); + assert_eq!(quota.in_use(), 110); + quota.free(100); + assert_eq!(quota.in_use(), 10); + quota.alloc(40).unwrap(); + assert_eq!(quota.in_use(), 50); + } + + #[test] + fn test_allocated() { + let quota = Arc::new(MemoryQuota::new(100)); + let mut allocated = OwnedAllocated::new(Arc::clone("a)); + allocated.alloc(42).unwrap(); + assert_eq!(quota.in_use(), 42); + quota.alloc(59).unwrap_err(); + allocated.alloc(16).unwrap(); + assert_eq!(quota.in_use(), 58); + let mut allocated2 = OwnedAllocated::new(Arc::clone("a)); + allocated2.alloc(8).unwrap(); + allocated2.alloc(40).unwrap_err(); + 
assert_eq!(quota.in_use(), 66); + quota.alloc(4).unwrap(); + assert_eq!(quota.in_use(), 70); + drop(allocated); + assert_eq!(quota.in_use(), 12); + drop(allocated2); + assert_eq!(quota.in_use(), 4); + } +} diff --git a/components/tikv_util/src/mpsc/mod.rs b/components/tikv_util/src/mpsc/mod.rs index 700691f1189..9a71dbc0c5e 100644 --- a/components/tikv_util/src/mpsc/mod.rs +++ b/components/tikv_util/src/mpsc/mod.rs @@ -8,9 +8,8 @@ pub mod future; pub mod priority_queue; use std::{ - cell::Cell, sync::{ - atomic::{AtomicBool, AtomicIsize, Ordering}, + atomic::{AtomicBool, AtomicIsize, AtomicUsize, Ordering}, Arc, }, time::Duration, @@ -208,7 +207,7 @@ const CHECK_INTERVAL: usize = 8; /// A sender of channel that limits the maximun pending messages count loosely. pub struct LooseBoundedSender { sender: Sender, - tried_cnt: Cell, + tried_cnt: AtomicUsize, limit: usize, } @@ -230,25 +229,23 @@ impl LooseBoundedSender { /// Send a message regardless its capacity limit. #[inline] pub fn force_send(&self, t: T) -> Result<(), SendError> { - let cnt = self.tried_cnt.get(); - self.tried_cnt.set(cnt + 1); + self.tried_cnt.fetch_add(1, Ordering::AcqRel); self.sender.send(t) } /// Attempts to send a message into the channel without blocking. 
#[inline] pub fn try_send(&self, t: T) -> Result<(), TrySendError> { - let cnt = self.tried_cnt.get(); let check_interval = || { fail_point!("loose_bounded_sender_check_interval", |_| 0); CHECK_INTERVAL }; - if cnt < check_interval() { - self.tried_cnt.set(cnt + 1); - } else if self.len() < self.limit { - self.tried_cnt.set(1); - } else { - return Err(TrySendError::Full(t)); + if self.tried_cnt.fetch_add(1, Ordering::AcqRel) >= check_interval() { + if self.len() < self.limit { + self.tried_cnt.store(1, Ordering::Release); + } else { + return Err(TrySendError::Full(t)); + } } match self.sender.send(t) { @@ -275,7 +272,7 @@ impl Clone for LooseBoundedSender { fn clone(&self) -> LooseBoundedSender { LooseBoundedSender { sender: self.sender.clone(), - tried_cnt: self.tried_cnt.clone(), + tried_cnt: AtomicUsize::new(0), limit: self.limit, } } @@ -287,7 +284,7 @@ pub fn loose_bounded(cap: usize) -> (LooseBoundedSender, Receiver) { ( LooseBoundedSender { sender, - tried_cnt: Cell::new(0), + tried_cnt: AtomicUsize::new(0), limit: cap, }, receiver, diff --git a/components/tikv_util/src/resource_control.rs b/components/tikv_util/src/resource_control.rs new file mode 100644 index 00000000000..c7b46c2ddab --- /dev/null +++ b/components/tikv_util/src/resource_control.rs @@ -0,0 +1,191 @@ +// Copyright 2023 TiKV Project Authors. Licensed under Apache-2.0. + +/// This mod provide some utility types and functions for resource control. 
+use std::borrow::Cow; + +use kvproto::kvrpcpb::ResourceControlContext; +use strum::{EnumCount, EnumIter}; + +/// default resource group name +pub const DEFAULT_RESOURCE_GROUP_NAME: &str = "default"; + +const OVERRIDE_PRIORITY_MASK: u8 = 0b1000_0000; +const RESOURCE_GROUP_NAME_MASK: u8 = 0b0100_0000; + +#[derive(Clone, Default)] +pub struct TaskMetadata<'a> { + // The first byte is a bit map to indicate which field exists, + // then append override priority if nonzero, + // then append resource group name if not default + metadata: Cow<'a, [u8]>, +} + +impl<'a> TaskMetadata<'a> { + pub fn deep_clone(&self) -> TaskMetadata<'static> { + TaskMetadata { + metadata: Cow::Owned(self.metadata.to_vec()), + } + } + + pub fn from_ctx(ctx: &ResourceControlContext) -> Self { + let mut mask = 0; + let mut buf = vec![]; + if ctx.override_priority != 0 { + mask |= OVERRIDE_PRIORITY_MASK; + } + if !ctx.resource_group_name.is_empty() + && ctx.resource_group_name != DEFAULT_RESOURCE_GROUP_NAME + { + mask |= RESOURCE_GROUP_NAME_MASK; + } + if mask == 0 { + // if all are default value, no need to write anything to save copy cost + return Self { + metadata: Cow::Owned(buf), + }; + } + buf.push(mask); + if mask & OVERRIDE_PRIORITY_MASK != 0 { + buf.extend_from_slice(&(ctx.override_priority as u32).to_ne_bytes()); + } + if mask & RESOURCE_GROUP_NAME_MASK != 0 { + buf.extend_from_slice(ctx.resource_group_name.as_bytes()); + } + Self { + metadata: Cow::Owned(buf), + } + } + + pub fn to_vec(self) -> Vec { + self.metadata.into_owned() + } + + pub fn override_priority(&self) -> u32 { + if self.metadata.is_empty() { + return 0; + } + if self.metadata[0] & OVERRIDE_PRIORITY_MASK == 0 { + return 0; + } + u32::from_ne_bytes(self.metadata[1..5].try_into().unwrap()) + } + + pub fn group_name(&self) -> &[u8] { + if self.metadata.is_empty() { + return DEFAULT_RESOURCE_GROUP_NAME.as_bytes(); + } + if self.metadata[0] & RESOURCE_GROUP_NAME_MASK == 0 { + return DEFAULT_RESOURCE_GROUP_NAME.as_bytes(); + 
} + let start = if self.metadata[0] & OVERRIDE_PRIORITY_MASK != 0 { + 5 + } else { + 1 + }; + &self.metadata[start..] + } +} + +impl<'a> From<&'a [u8]> for TaskMetadata<'a> { + fn from(bytes: &'a [u8]) -> Self { + Self { + metadata: Cow::Borrowed(bytes), + } + } +} + +// return the TaskPriority value from task metadata. +pub fn priority_from_task_meta(meta: &[u8]) -> TaskPriority { + let priority = TaskMetadata::from(meta).override_priority(); + // mapping (high(15), medium(8), low(1)) -> (0, 1, 2) + debug_assert!(priority <= 16); + TaskPriority::from(priority) +} + +#[derive(Copy, Clone, Eq, PartialEq, EnumCount, EnumIter, Debug)] +#[repr(usize)] +pub enum TaskPriority { + High = 0, + Medium = 1, + Low = 2, +} + +impl TaskPriority { + // reexport enum count, caller can use it without importing `EnumCount`. + pub const PRIORITY_COUNT: usize = Self::COUNT; + pub fn as_str(&self) -> &'static str { + match *self { + TaskPriority::High => "high", + TaskPriority::Medium => "medium", + TaskPriority::Low => "low", + } + } + + pub fn priorities() -> [Self; Self::COUNT] { + use TaskPriority::*; + [High, Medium, Low] + } +} + +impl From for TaskPriority { + fn from(value: u32) -> Self { + // map the resource group priority value (1,8,16) to (Low,Medium,High) + // 0 means the priority is not set, so map it to medium by default. 
+ if value == 0 { + Self::Medium + } else if value < 6 { + Self::Low + } else if value < 11 { + Self::Medium + } else { + Self::High + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_task_metadata() { + let cases = [ + ("default", 0u32), + ("default", 6u32), + ("test", 0u32), + ("test", 15u32), + ]; + + let metadata = TaskMetadata::from_ctx(&ResourceControlContext::default()); + assert_eq!(metadata.group_name(), b"default"); + for (group_name, priority) in cases { + let metadata = TaskMetadata::from_ctx(&ResourceControlContext { + resource_group_name: group_name.to_string(), + override_priority: priority as u64, + ..Default::default() + }); + assert_eq!(metadata.override_priority(), priority); + assert_eq!(metadata.group_name(), group_name.as_bytes()); + let vec = metadata.to_vec(); + let metadata1 = TaskMetadata::from(vec.as_slice()); + assert_eq!(metadata1.override_priority(), priority); + assert_eq!(metadata1.group_name(), group_name.as_bytes()); + } + } + + #[test] + fn test_task_priority() { + use TaskPriority::*; + let cases = [ + (0, Medium), + (1, Low), + (7, Medium), + (8, Medium), + (15, High), + (16, High), + ]; + for (value, priority) in cases { + assert_eq!(TaskPriority::from(value), priority); + } + } +} diff --git a/components/tikv_util/src/worker/pool.rs b/components/tikv_util/src/worker/pool.rs index c3919e42619..a22732a7aae 100644 --- a/components/tikv_util/src/worker/pool.rs +++ b/components/tikv_util/src/worker/pool.rs @@ -7,7 +7,7 @@ use std::{ future::Future, sync::{ atomic::{AtomicBool, AtomicUsize, Ordering}, - Arc, Mutex, + Arc, }, time::{Duration, Instant}, }; @@ -20,13 +20,13 @@ use futures::{ stream::StreamExt, }; use prometheus::IntGauge; -use yatp::{Remote, ThreadPool}; +use yatp::Remote; use super::metrics::*; use crate::{ future::{block_on_timeout, poll_future_notify}, timer::GLOBAL_TIMER_HANDLE, - yatp_pool::{DefaultTicker, YatpPoolBuilder}, + yatp_pool::{DefaultTicker, FuturePool, YatpPoolBuilder}, }; 
#[derive(PartialEq)] @@ -222,7 +222,15 @@ impl LazyWorker { } pub fn remote(&self) -> Remote { - self.worker.remote.clone() + self.worker.remote() + } + + pub fn pool_size(&self) -> usize { + self.worker.pool_size() + } + + pub fn pool(&self) -> FuturePool { + self.worker.pool() } } @@ -301,11 +309,8 @@ impl> Builder { let pool = YatpPoolBuilder::new(DefaultTicker::default()) .name_prefix(self.name) .thread_count(self.thread_count, self.thread_count, self.thread_count) - .build_single_level_pool(); - let remote = pool.remote().clone(); - let pool = Arc::new(Mutex::new(Some(pool))); + .build_future_pool(); Worker { - remote, stop: Arc::new(AtomicBool::new(false)), pool, counter: Arc::new(AtomicUsize::new(0)), @@ -318,8 +323,7 @@ impl> Builder { /// A worker that can schedule time consuming tasks. #[derive(Clone)] pub struct Worker { - pool: Arc>>>, - remote: Remote, + pool: FuturePool, pending_capacity: usize, counter: Arc, stop: Arc, @@ -371,7 +375,7 @@ impl Worker { .interval(std::time::Instant::now(), interval) .compat(); let stop = self.stop.clone(); - self.remote.spawn(async move { + let _ = self.pool.spawn(async move { while !stop.load(Ordering::Relaxed) && let Some(Ok(_)) = interval.next().await { @@ -389,7 +393,7 @@ impl Worker { .interval(std::time::Instant::now(), interval) .compat(); let stop = self.stop.clone(); - self.remote.spawn(async move { + let _ = self.pool.spawn(async move { while !stop.load(Ordering::Relaxed) && let Some(Ok(_)) = interval.next().await { @@ -403,7 +407,7 @@ impl Worker { where F: Future + Send + 'static, { - self.remote.spawn(f); + let _ = self.pool.spawn(f); } fn delay_notify(tx: UnboundedSender>, timeout: Duration) { @@ -438,10 +442,8 @@ impl Worker { /// Stops the worker thread. 
pub fn stop(&self) { - if let Some(pool) = self.pool.lock().unwrap().take() { - self.stop.store(true, Ordering::Release); - pool.shutdown(); - } + self.stop.store(true, Ordering::Release); + self.pool.shutdown(); } /// Checks if underlying worker can't handle task immediately. @@ -451,7 +453,15 @@ impl Worker { } pub fn remote(&self) -> Remote { - self.remote.clone() + self.pool.remote().clone() + } + + pub fn pool_size(&self) -> usize { + self.pool.get_pool_size() + } + + pub fn pool(&self) -> FuturePool { + self.pool.clone() } fn start_impl( @@ -461,7 +471,7 @@ impl Worker { metrics_pending_task_count: IntGauge, ) { let counter = self.counter.clone(); - self.remote.spawn(async move { + let _ = self.pool.spawn(async move { let mut handle = RunnableWrapper { inner: runner }; while let Some(msg) = receiver.next().await { match msg { @@ -488,7 +498,7 @@ impl Worker { let counter = self.counter.clone(); let timeout = runner.get_interval(); Self::delay_notify(tx.clone(), timeout); - self.remote.spawn(async move { + let _ = self.pool.spawn(async move { let mut handle = RunnableWrapper { inner: runner }; while let Some(msg) = receiver.next().await { match msg { diff --git a/components/tikv_util/src/yatp_pool/future_pool.rs b/components/tikv_util/src/yatp_pool/future_pool.rs index 827ffbbdce2..75d65fe4641 100644 --- a/components/tikv_util/src/yatp_pool/future_pool.rs +++ b/components/tikv_util/src/yatp_pool/future_pool.rs @@ -17,13 +17,15 @@ use prometheus::{IntCounter, IntGauge}; use tracker::TrackedFuture; use yatp::{queue::Extras, task::future}; +use crate::resource_control::{priority_from_task_meta, TaskPriority}; + pub type ThreadPool = yatp::ThreadPool; use super::metrics; #[derive(Clone)] struct Env { - metrics_running_task_count: IntGauge, + metrics_running_task_count_by_priority: [IntGauge; TaskPriority::PRIORITY_COUNT], metrics_handled_task_count: IntCounter, } @@ -46,8 +48,9 @@ impl crate::AssertSync for FuturePool {} impl FuturePool { pub fn from_pool(pool: 
ThreadPool, name: &str, pool_size: usize, max_tasks: usize) -> Self { let env = Env { - metrics_running_task_count: metrics::FUTUREPOOL_RUNNING_TASK_VEC - .with_label_values(&[name]), + metrics_running_task_count_by_priority: TaskPriority::priorities().map(|p| { + metrics::FUTUREPOOL_RUNNING_TASK_VEC.with_label_values(&[name, p.as_str()]) + }), metrics_handled_task_count: metrics::FUTUREPOOL_HANDLED_TASK_VEC .with_label_values(&[name]), }; @@ -56,7 +59,7 @@ impl FuturePool { pool, env, pool_size: AtomicUsize::new(pool_size), - max_tasks, + max_tasks: AtomicUsize::new(max_tasks), }), } } @@ -71,6 +74,16 @@ impl FuturePool { self.inner.scale_pool_size(thread_count) } + #[inline] + pub fn set_max_tasks_per_worker(&self, tasks_per_thread: usize) { + self.inner.set_max_tasks_per_worker(tasks_per_thread); + } + + #[inline] + pub fn get_max_tasks_count(&self) -> usize { + self.inner.max_tasks.load(Ordering::Relaxed) + } + /// Gets current running task count. #[inline] pub fn get_running_task_count(&self) -> usize { @@ -119,6 +132,11 @@ impl FuturePool { pub fn shutdown(&self) { self.inner.pool.shutdown(); } + + // Get a remote queue for spawning tasks without owning the thread pool. + pub fn remote(&self) -> &yatp::Remote { + self.inner.pool.remote() + } } struct PoolInner { @@ -126,37 +144,56 @@ struct PoolInner { env: Env, // for accessing pool_size config since yatp doesn't offer such getter. 
pool_size: AtomicUsize, - max_tasks: usize, + max_tasks: AtomicUsize, } impl PoolInner { #[inline] fn scale_pool_size(&self, thread_count: usize) { self.pool.scale_workers(thread_count); + let mut max_tasks = self.max_tasks.load(Ordering::Acquire); + if max_tasks != std::usize::MAX { + max_tasks = max_tasks + .saturating_div(self.pool_size.load(Ordering::Acquire)) + .saturating_mul(thread_count); + self.max_tasks.store(max_tasks, Ordering::Release); + } self.pool_size.store(thread_count, Ordering::Release); } + fn set_max_tasks_per_worker(&self, max_tasks_per_thread: usize) { + let max_tasks = self + .pool_size + .load(Ordering::Acquire) + .saturating_mul(max_tasks_per_thread); + self.max_tasks.store(max_tasks, Ordering::Release); + } + fn get_running_task_count(&self) -> usize { // As long as different future pool has different name prefix, we can safely use // the value in metrics. - self.env.metrics_running_task_count.get() as usize + self.env + .metrics_running_task_count_by_priority + .iter() + .map(|r| r.get()) + .sum::() as usize } - fn gate_spawn(&self) -> Result<(), Full> { + fn gate_spawn(&self, current_tasks: usize) -> Result<(), Full> { fail_point!("future_pool_spawn_full", |_| Err(Full { current_tasks: 100, max_tasks: 100, })); - if self.max_tasks == std::usize::MAX { + let max_tasks = self.max_tasks.load(Ordering::Acquire); + if max_tasks == std::usize::MAX { return Ok(()); } - let current_tasks = self.get_running_task_count(); - if current_tasks >= self.max_tasks { + if current_tasks >= max_tasks { Err(Full { current_tasks, - max_tasks: self.max_tasks, + max_tasks, }) } else { Ok(()) @@ -168,9 +205,14 @@ impl PoolInner { F: Future + Send + 'static, { let metrics_handled_task_count = self.env.metrics_handled_task_count.clone(); - let metrics_running_task_count = self.env.metrics_running_task_count.clone(); + let task_priority = extras + .as_ref() + .map(|m| priority_from_task_meta(m.metadata())) + .unwrap_or(TaskPriority::Medium); + let 
metrics_running_task_count = + self.env.metrics_running_task_count_by_priority[task_priority as usize].clone(); - self.gate_spawn()?; + self.gate_spawn(metrics_running_task_count.get() as usize)?; metrics_running_task_count.inc(); @@ -197,9 +239,10 @@ impl PoolInner { F::Output: Send, { let metrics_handled_task_count = self.env.metrics_handled_task_count.clone(); - let metrics_running_task_count = self.env.metrics_running_task_count.clone(); + let metrics_running_task_count = + self.env.metrics_running_task_count_by_priority[TaskPriority::Medium as usize].clone(); - self.gate_spawn()?; + self.gate_spawn(metrics_running_task_count.get() as usize)?; let (tx, rx) = oneshot::channel(); metrics_running_task_count.inc(); diff --git a/components/tikv_util/src/yatp_pool/metrics.rs b/components/tikv_util/src/yatp_pool/metrics.rs index 8ae1aa8910e..a3e68b260db 100644 --- a/components/tikv_util/src/yatp_pool/metrics.rs +++ b/components/tikv_util/src/yatp_pool/metrics.rs @@ -7,7 +7,7 @@ lazy_static! { pub static ref FUTUREPOOL_RUNNING_TASK_VEC: IntGaugeVec = register_int_gauge_vec!( "tikv_futurepool_pending_task_total", "Current future_pool pending + running tasks.", - &["name"] + &["name", "priority"] ) .unwrap(); pub static ref FUTUREPOOL_HANDLED_TASK_VEC: IntCounterVec = register_int_counter_vec!( @@ -19,8 +19,8 @@ lazy_static! { pub static ref YATP_POOL_SCHEDULE_WAIT_DURATION_VEC: HistogramVec = register_histogram_vec!( "tikv_yatp_pool_schedule_wait_duration", "Histogram of yatp pool schedule wait duration.", - &["name"], - exponential_buckets(1e-5, 4.0, 12).unwrap() // 10us ~ 41s + &["name", "priority"], + exponential_buckets(1e-5, 2.0, 18).unwrap() // 10us ~ 2.5s ) .unwrap(); } diff --git a/components/tikv_util/src/yatp_pool/mod.rs b/components/tikv_util/src/yatp_pool/mod.rs index fc80e69cd84..cfdfc540b30 100644 --- a/components/tikv_util/src/yatp_pool/mod.rs +++ b/components/tikv_util/src/yatp_pool/mod.rs @@ -1,14 +1,14 @@ // Copyright 2020 TiKV Project Authors. 
Licensed under Apache-2.0. mod future_pool; -mod metrics; +pub mod metrics; use std::sync::Arc; use fail::fail_point; pub use future_pool::{Full, FuturePool}; use futures::{compat::Stream01CompatExt, StreamExt}; -use prometheus::{local::LocalHistogram, Histogram}; +use prometheus::{local::LocalHistogram, Histogram, HistogramOpts}; use yatp::{ pool::{CloneRunnerBuilder, Local, Remote, Runner}, queue::{multilevel, priority, Extras, QueueType, TaskCell as _}, @@ -17,6 +17,7 @@ use yatp::{ }; use crate::{ + resource_control::{priority_from_task_meta, TaskPriority}, thread_group::GroupProperties, time::{Duration, Instant}, timer::GLOBAL_TIMER_HANDLE, @@ -165,7 +166,8 @@ pub struct YatpPoolRunner { before_pause: Option>, // Statistics about the schedule wait duration. - schedule_wait_duration: LocalHistogram, + // local histogram for high,medium,low priority tasks. + schedule_wait_durations: [LocalHistogram; TaskPriority::PRIORITY_COUNT], } impl Runner for YatpPoolRunner { @@ -190,12 +192,12 @@ impl Runner for YatpPoolRunner { fn handle(&mut self, local: &mut Local, mut task_cell: Self::TaskCell) -> bool { let extras = task_cell.mut_extras(); if let Some(schedule_time) = extras.schedule_time() { - self.schedule_wait_duration - .observe(schedule_time.elapsed().as_secs_f64()); + let idx = priority_from_task_meta(extras.metadata()) as usize; + self.schedule_wait_durations[idx].observe(schedule_time.elapsed().as_secs_f64()); } let finished = self.inner.handle(local, task_cell); if self.ticker.try_tick() { - self.schedule_wait_duration.flush(); + self.schedule_wait_durations.iter().for_each(|m| m.flush()); } finished } @@ -229,7 +231,7 @@ impl YatpPoolRunner { after_start: Option>, before_stop: Option>, before_pause: Option>, - schedule_wait_duration: Histogram, + schedule_wait_durations: [Histogram; TaskPriority::PRIORITY_COUNT], ) -> Self { YatpPoolRunner { inner, @@ -238,7 +240,7 @@ impl YatpPoolRunner { after_start, before_stop, before_pause, - schedule_wait_duration: 
schedule_wait_duration.local(), + schedule_wait_durations: schedule_wait_durations.map(|m| m.local()), } } } @@ -256,6 +258,10 @@ pub struct YatpPoolBuilder { max_tasks: usize, cleanup_method: CleanupMethod, + // whether to tracker task scheduling wait duration + enable_task_wait_metrics: bool, + metric_idx_from_task_meta: Option usize + Send + Sync>>, + #[cfg(test)] background_cleanup_hook: Option>, } @@ -275,6 +281,9 @@ impl YatpPoolBuilder { max_tasks: std::usize::MAX, cleanup_method: CleanupMethod::InPlace, + enable_task_wait_metrics: false, + metric_idx_from_task_meta: None, + #[cfg(test)] background_cleanup_hook: None, } @@ -344,6 +353,19 @@ impl YatpPoolBuilder { self } + pub fn enable_task_wait_metrics(mut self, enable: bool) -> Self { + self.enable_task_wait_metrics = enable; + self + } + + pub fn metric_idx_from_task_meta( + mut self, + f: Arc usize + Send + Sync>, + ) -> Self { + self.metric_idx_from_task_meta = Some(f); + self + } + pub fn build_future_pool(self) -> FuturePool { let name = self .name_prefix @@ -469,15 +491,21 @@ impl YatpPoolBuilder { let after_start = self.after_start.take(); let before_stop = self.before_stop.take(); let before_pause = self.before_pause.take(); - let schedule_wait_duration = - metrics::YATP_POOL_SCHEDULE_WAIT_DURATION_VEC.with_label_values(&[&name]); + let schedule_wait_durations = if self.enable_task_wait_metrics { + TaskPriority::priorities().map(|p| { + metrics::YATP_POOL_SCHEDULE_WAIT_DURATION_VEC + .with_label_values(&[&name, p.as_str()]) + }) + } else { + std::array::from_fn(|_| Histogram::with_opts(HistogramOpts::new("_", "_")).unwrap()) + }; let read_pool_runner = YatpPoolRunner::new( Default::default(), self.ticker.clone(), after_start, before_stop, before_pause, - schedule_wait_duration, + schedule_wait_durations, ); (builder, read_pool_runner) } @@ -500,6 +528,7 @@ mod tests { let name = "test_record_schedule_wait_duration"; let pool = YatpPoolBuilder::new(DefaultTicker::default()) .name_prefix(name) + 
.enable_task_wait_metrics(true) .build_single_level_pool(); let (tx, rx) = mpsc::channel(); for _ in 0..3 { @@ -518,7 +547,8 @@ mod tests { } // Drop the pool so the local metrics are flushed. drop(pool); - let histogram = metrics::YATP_POOL_SCHEDULE_WAIT_DURATION_VEC.with_label_values(&[name]); + let histogram = + metrics::YATP_POOL_SCHEDULE_WAIT_DURATION_VEC.with_label_values(&[name, "medium"]); assert_eq!(histogram.get_sample_count() as u32, 6, "{:?}", histogram); } diff --git a/doc/http.md b/doc/http.md new file mode 100644 index 00000000000..625af034091 --- /dev/null +++ b/doc/http.md @@ -0,0 +1,90 @@ +# HTTP API + +In the context of the following line: `TIKV_ADDRESS=$TIKV_IP:$TIKV_STATUS_PORT` + +By default: + +- `TIKV_IP` should be set to `127.0.0.1` +- `TIKV_STATUS_PORT` should be set to `20180` + +## CPU Profiling + +Collect and export CPU profiling data within a specified time range. + +```bash +curl -H 'Content-Type:' -X GET 'http://$TIKV_ADDRESS/debug/pprof/profile?seconds=&frequency=' +``` + +#### Parameters + +- **seconds** (optional): Specifies the number of seconds to collect CPU profiling data. + - Default: 10 + - Example: `?seconds=20` + +- **frequency** (optional): Specifies the sampling frequency for CPU profiling data. + - Default: 99 + - Example: `?frequency=100` + +- **type** (optional): Specifies the Content-Type of the response. + - Options: `application/protobuf` for raw profile data, any other types for flame graph. + - Default: `N/A` + - Example: `-H "Content-Type:application/protobuf"` + +#### Response + +The server will return CPU profiling data. The response format is determined by the Content-Type in the request header and can be either raw profile data in protobuf format or flame graph in SVG format. + +The raw profile data can be handled by `pprof` tool. For example, use `go tool pprof --http=0.0.0.0:1234 xxx.proto` to open an interactive web browser. + +## Heap Profiling + +Collect and export heap profiling data.
+ +Note that, heap profile is not like CPU profile which is collected within the specified time range right after the request. Instead, heap profile is just a snapshot of the accumulated memory usage at the time of request, as the memory usage is always being collected once activated. + +```bash +curl -X GET 'http://$TIKV_ADDRESS/debug/pprof/heap?jeprof=' +``` + +#### Parameters + +- **jeprof** (optional): Indicates whether to use Jeprof to process the heap profile to generate call graph. It needs `perl` being installed. + - Default: false + - Example: `?jeprof=true` + +#### Response + +The server will return heap profiling data. The response format is determined by the `jeprof` parameter. If true, the response will be a call graph in SVG format generated by `jeprof` needing `perl` installed in the TiKV environment. Otherwise, the response will be raw profile data in jemalloc dedicated format. + +## Heap Profile Symbolization + +The heap profile retrieved by `heap` API by default is a raw profile data in jemalloc dedicated format, which should be handled by `jeprof` to visualize. + +There are two ways to generate a call graph in SVG format from the raw profile data: + +- local: by provided profile and use TiKV binary to resolve symbols + +```bash +jeprof --svg +``` + +- remote: by latest heap profile retrieved by HTTP and use symbolization service provided by TiKV to resolve symbols + +```bash +jeprof --svg http://$TIKV_ADDRESS/debug/pprof/heap +``` + +To support the remote way, TiKV provides a symbolization service to resolve symbols from memory addresses. Jeprof would implicitly call the `.../debug/pprof/symbol` to map call stack's addresses to corresponding function names. For most of the cases, you don't need to +call it explicitly. But if you want to use it for other purposes, you can refer as follows.
+ +```bash +curl -X POST -d '' 'http://$TIKV_ADDRESS/debug/pprof/symbol' +``` + +#### Parameters + +- **address_list** (required): A list of memory addresses to be resolved. The addresses should be provided in hexadecimal format(whether or not start with '0x' is okay), separated by a '+' character. + +#### Response + +A list of resolved symbols in plain text. Each line represented as a hexadecimal address followed by the corresponding function name. If a memory address cannot be resolved, it will be marked with "??". diff --git a/etc/config-template.toml b/etc/config-template.toml index 36d8d25d883..75c7eab0c10 100644 --- a/etc/config-template.toml +++ b/etc/config-template.toml @@ -83,6 +83,18 @@ ## maximum number of old log files to retain # max-backups = 0 +[memory] +## Whether enable the heap profiling which may have a bit performance overhead about 2% for the +## default sample rate. +# enable-heap-profiling = true + +## Average interval between allocation samples, as measured in bytes of allocation activity. +## Increasing the sampling interval decreases profile fidelity, but also decreases the +## computational overhead. +## The default sample interval is 512 KB. It only accepts power of two, otherwise it will be +## rounded up to the next power of two. +# profiling-sample-per-bytes = "512KB" + ## Configurations for the single thread pool serving read requests. [readpool.unified] ## The minimal working thread count of the thread pool. @@ -382,6 +394,9 @@ ## Store heartbeat tick interval for reporting to PD. # pd-store-heartbeat-tick-interval = "10s" +## Store min resolved ts tick interval for reporting to PD. +# pd-report-min-resolved-ts-interval = "1s" + ## The threshold of triggering Region split check. ## When Region size change exceeds this config, TiKV will check whether the Region should be split ## or not. 
To reduce the cost of scanning data in the checking process, you can set the value to @@ -437,6 +452,15 @@ ## exceeds `region-compact-tombstones-percent`. # region-compact-tombstones-percent = 30 +## The minimum number of duplicated MVCC keys to trigger manual compaction. +# region-compact-min-redundant-rows = 50000 + +## The minimum percentage of duplicated MVCC keys to trigger manual compaction. +## It should be set between 1 and 100. Manual compaction is only triggered when the number of +## duplicated MVCC keys exceeds `region-compact-min-redundant-rows` and the percentage of duplicated MVCC keys +## exceeds `region-compact-redundant-rows-percent`. +# region-compact-redundant-rows-percent = 20 + ## Interval to check whether to start a manual compaction for Lock Column Family. ## If written bytes reach `lock-cf-compact-bytes-threshold` for Lock Column Family, TiKV will ## trigger a manual compaction for Lock Column Family. @@ -652,8 +676,8 @@ # enabled = false ## Maximum number of threads of `Titan` background gc jobs. -## default: 4 -# max-background-gc = 4 +## default: 1 +# max-background-gc = 1 ## Options for "Default" Column Family, which stores actual user data. [rocksdb.defaultcf] @@ -915,8 +939,15 @@ ## lz4: kLZ4Compression ## lz4hc: kLZ4HCCompression ## zstd: kZSTD -## default: lz4 -# blob-file-compression = "lz4" +## default: zstd +# blob-file-compression = "zstd" + +## Set blob file zstd dictionary compression, default(0) will use zstd compression. +## It is recommended to set the dictionary size to values such as 4k or 16k. Additionally, +## the sample data size to train dictionary is of size 100X dictionary size innerly. +## It has no effect when `blob-file-compression` is not `zstd`. 
+## default: 0 +# zstd-dict-size = 0 ## Specifics cache size for blob records ## default: 0 diff --git a/fuzz/fuzzer-afl/Cargo.toml b/fuzz/fuzzer-afl/Cargo.toml index 6c97305a253..5e9894fba3e 100644 --- a/fuzz/fuzzer-afl/Cargo.toml +++ b/fuzz/fuzzer-afl/Cargo.toml @@ -8,4 +8,4 @@ fuzz-targets = { path = "../targets" } # AFL only works for x86 targets [target.'cfg(all(not(target_os = "windows"), target_arch = "x86_64"))'.dependencies] -afl = "0.6" +afl = "0.14" diff --git a/metrics/alertmanager/tikv.accelerate.rules.yml b/metrics/alertmanager/tikv.accelerate.rules.yml index 4bc48336c60..e5ad2daa8cf 100644 --- a/metrics/alertmanager/tikv.accelerate.rules.yml +++ b/metrics/alertmanager/tikv.accelerate.rules.yml @@ -32,7 +32,7 @@ groups: - record: tikv_pd_request_duration_seconds:avg:1m expr: sum(rate(tikv_pd_request_duration_seconds_sum{instance=~".*"}[1m])) by (type) / sum(rate(tikv_pd_request_duration_seconds_count{instance=~".*"}[1m])) by (type) - record: tikv_coprocessor_request_wait_seconds:p95:1m - expr: histogram_quantile(0.95, sum(rate(tikv_coprocessor_request_wait_seconds_bucket{instance=~".*"}[1m])) by (le, instance,req)) + expr: histogram_quantile(0.95, sum(rate(tikv_coprocessor_request_wait_seconds_bucket{instance=~".*", type="all"}[1m])) by (le, instance,req)) - record: tikv_grpc_msg_duration_seconds:avg:1m expr: sum(rate(tikv_grpc_msg_duration_seconds_sum{instance=~".*"}[1m])) by (type) / sum(rate(tikv_grpc_msg_duration_seconds_count[1m])) by (type) - record: tikv_raftstore_apply_wait_time_duration_secs:p99:1m @@ -48,7 +48,7 @@ groups: - record: tikv_coprocessor_request_duration_seconds:1m expr: sum(rate(tikv_coprocessor_request_duration_seconds_bucket{instance=~".*"}[1m])) by (le) - record: tikv_futurepool_pending_task:1m - expr: sum(rate(tikv_futurepool_pending_task_total{instance=~".*"}[1m])) by (name) + expr: sum(avg_over_time(tikv_futurepool_pending_task_total{instance=~".*"}[1m])) by (name) - record: tikv_storage_engine_async_request:1m expr: 
sum(rate(tikv_storage_engine_async_request_total{instance=~".*", status!~"all|success"}[1m])) by (status) - record: tikv_thread_cpu_seconds_nogrpc:1m diff --git a/metrics/alertmanager/tikv.rules.yml b/metrics/alertmanager/tikv.rules.yml index e43ca401d42..1b460311e60 100644 --- a/metrics/alertmanager/tikv.rules.yml +++ b/metrics/alertmanager/tikv.rules.yml @@ -1,6 +1,18 @@ groups: - name: alert.rules rules: + - alert: TiKV_critical_error + expr: sum(rate(tikv_critical_error_total[1m])) BY (type, instance) > 0 + # without the for clause will become active on the first evaluation. + labels: + env: ENV_LABELS_ENV + level: critical + expr: sum(rate(tikv_critical_error_total[1m])) BY (type, instance) > 0 + annotations: + description: 'cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}, values:{{ $value }}' + value: '{{ $value }}' + summary: TiKV encounters critical error + - alert: TiKV_memory_used_too_fast expr: process_resident_memory_bytes{job=~"tikv",instance=~".*"} - (process_resident_memory_bytes{job=~"tikv",instance=~".*"} offset 5m) > 5*1024*1024*1024 for: 5m @@ -15,7 +27,7 @@ groups: - alert: TiKV_GC_can_not_work expr: sum(increase(tikv_gcworker_gc_tasks_vec{task="gc"}[1d])) < 1 and (sum(increase(tikv_gc_compaction_filter_perform[1d])) < 1 and sum(increase(tikv_engine_event_total{db="kv", cf="write", type="compaction"}[1d])) >= 1) - for: 1m + for: 5m labels: env: ENV_LABELS_ENV level: emergency @@ -98,12 +110,12 @@ groups: summary: TiKV async request write duration seconds more than 1s - alert: TiKV_coprocessor_request_wait_seconds - expr: histogram_quantile(0.9999, sum(rate(tikv_coprocessor_request_wait_seconds_bucket[1m])) by (le, instance, req)) > 10 + expr: histogram_quantile(0.9999, sum(rate(tikv_coprocessor_request_wait_seconds_bucket{type="all"}[1m])) by (le, instance, req)) > 10 for: 1m labels: env: ENV_LABELS_ENV level: critical - expr: histogram_quantile(0.9999, sum(rate(tikv_coprocessor_request_wait_seconds_bucket[1m])) by (le, instance, req)) 
> 10 + expr: histogram_quantile(0.9999, sum(rate(tikv_coprocessor_request_wait_seconds_bucket{type="all"}[1m])) by (le, instance, req)) > 10 annotations: description: 'cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}, values:{{ $value }}' value: '{{ $value }}' diff --git a/metrics/grafana/README.md b/metrics/grafana/README.md new file mode 100644 index 00000000000..dec76a67529 --- /dev/null +++ b/metrics/grafana/README.md @@ -0,0 +1,11 @@ +# TiKV Grafana Dashboard + +The "TiKV Details" dashboard is generated by the `tikv_details.dashboard.py` +Python script. + +## Updating the Dashboard + +To add or update panels on the dashboard, make your changes in +`tikv_details.dashboard.py` and then run `./scripts/gen-tikv-details-dashboard`. + +Please avoid manually modifying `tikv_details.json`. diff --git a/metrics/grafana/common.py b/metrics/grafana/common.py new file mode 100644 index 00000000000..7f15c06998f --- /dev/null +++ b/metrics/grafana/common.py @@ -0,0 +1,1093 @@ +from typing import Optional, Union + +import attr +from attr.validators import in_, instance_of +from grafanalib import formatunits as UNITS +from grafanalib.core import ( + NULL_AS_ZERO, + TIME_SERIES_TARGET_FORMAT, + DataSourceInput, + Graph, + GraphThreshold, + GridPos, + Heatmap, + HeatmapColor, + Legend, + Panel, + RowPanel, + SeriesOverride, + Stat, + StatValueMappings, + Target, + Template, + TimeSeries, + Tooltip, + YAxes, + YAxis, +) + +DATASOURCE_INPUT = DataSourceInput( + name="DS_TEST-CLUSTER", + label="test-cluster", + pluginId="prometheus", + pluginName="Prometheus", +) +DATASOURCE = f"${{{DATASOURCE_INPUT.name}}}" + + +@attr.s +class Expr(object): + """ + A prometheus expression that matches the following grammar: + + expr ::= ( + [aggr_param,] + [func]( + + [{,}] + [[]] + ) + ) [by (,)] [extra_expr] + """ + + metric: str = attr.ib(validator=instance_of(str)) + aggr_op: str = attr.ib( + default="", + validator=in_( + [ + "", + "sum", + "min", + "max", + "avg", + "group", + 
"stddev", + "stdvar", + "count", + "count_values", + "bottomk", + "topk", + "quantile", + ] + ), + ) + aggr_param: str = attr.ib(default="", validator=instance_of(str)) + func: str = attr.ib(default="", validator=instance_of(str)) + range_selector: str = attr.ib(default="", validator=instance_of(str)) + label_selectors: list[str] = attr.ib(default=[], validator=instance_of(list)) + by_labels: list[str] = attr.ib(default=[], validator=instance_of(list)) + default_label_selectors: list[str] = attr.ib( + default=[ + r'k8s_cluster="$k8s_cluster"', + r'tidb_cluster="$tidb_cluster"', + r'instance=~"$instance"', + ], + validator=instance_of(list), + ) + skip_default_instance: bool = attr.ib(default=False, validator=instance_of(bool)) + extra_expr: str = attr.ib(default="", validator=instance_of(str)) + + def __str__(self) -> str: + aggr_opeator = self.aggr_op if self.aggr_op else "" + aggr_param = self.aggr_param + "," if self.aggr_param else "" + by_clause = ( + "by ({})".format(", ".join(self.by_labels)) if self.by_labels else "" + ) + func = self.func if self.func else "" + label_selectors = self.default_label_selectors + self.label_selectors + if self.skip_default_instance: + # Remove instance=~"$instance" + label_selectors = [l for l in label_selectors if "$instance" not in l] + assert all( + ("=" in item or "~" in item) for item in label_selectors + ), f"Not all items contain '=' or '~', invalid {self.label_selectors}" + instant_selectors = ( + "{{{}}}".format(",".join(label_selectors)) if label_selectors else "" + ) + range_selector = f"[{self.range_selector}]" if self.range_selector else "" + extra_expr = self.extra_expr if self.extra_expr else "" + return f"""{aggr_opeator}({aggr_param}{func}( + {self.metric} + {instant_selectors} + {range_selector} +)) {by_clause} {extra_expr}""" + + def aggregate( + self, + aggr_op: str, + aggr_param: str = "", + by_labels: list[str] = [], + label_selectors: list[str] = [], + ) -> "Expr": + self.aggr_op = aggr_op + 
self.aggr_param = aggr_param + self.by_labels = by_labels + self.label_selectors = label_selectors + return self + + def function( + self, + func: str, + label_selectors: list[str] = [], + range_selector: str = "", + ) -> "Expr": + self.func = func + self.label_selectors = label_selectors + self.range_selector = range_selector + return self + + def extra( + self, + extra_expr: Optional[str] = None, + default_label_selectors: Optional[list[str]] = None, + ) -> "Expr": + if extra_expr is not None: + self.extra_expr = extra_expr + if default_label_selectors is not None: + self.default_label_selectors = default_label_selectors + return self + + def skip_default_instance_selector(self) -> "Expr": + self.skip_default_instance = True + return self + + +def expr_aggr( + metric: str, + aggr_op: str, + aggr_param: str = "", + label_selectors: list[str] = [], + by_labels: list[str] = ["instance"], +) -> Expr: + """ + Calculate the aggregation of a metric. + + Example: + + sum(( + tikv_store_size_bytes + {k8s_cluster="$k8s_cluster",tidb_cluster="$tidb_cluster",instance=~"$instance",type!="kv_gc"} + )) by (instance) + """ + expr = Expr(metric=metric) + expr.aggregate( + aggr_op, + aggr_param=aggr_param, + by_labels=by_labels, + label_selectors=label_selectors, + ) + return expr + + +def expr_sum( + metric: str, + label_selectors: list[str] = [], + by_labels: list[str] = ["instance"], +) -> Expr: + """ + Calculate the sum of a metric. + + Example: + + sum(( + tikv_store_size_bytes + {k8s_cluster="$k8s_cluster",tidb_cluster="$tidb_cluster",instance=~"$instance",type!="kv_gc"} + )) by (instance) + """ + return expr_aggr( + metric, "sum", label_selectors=label_selectors, by_labels=by_labels + ) + + +def expr_avg( + metric: str, + label_selectors: list[str] = [], + by_labels: list[str] = ["instance"], +) -> Expr: + """ + Calculate the avg of a metric. 
+ + Example: + + avg(( + tikv_store_size_bytes + {k8s_cluster="$k8s_cluster",tidb_cluster="$tidb_cluster",instance=~"$instance",type!="kv_gc"} + )) by (instance) + """ + return expr_aggr( + metric, "avg", label_selectors=label_selectors, by_labels=by_labels + ) + + +def expr_max( + metric: str, + label_selectors: list[str] = [], + by_labels: list[str] = ["instance"], +) -> Expr: + """ + Calculate the max of a metric. + + Example: + + max(( + tikv_store_size_bytes + {k8s_cluster="$k8s_cluster",tidb_cluster="$tidb_cluster",instance=~"$instance",type!="kv_gc"} + )) by (instance) + """ + return expr_aggr( + metric, "max", label_selectors=label_selectors, by_labels=by_labels + ) + + +def expr_min( + metric: str, + label_selectors: list[str] = [], + by_labels: list[str] = ["instance"], +) -> Expr: + """ + Calculate the min of a metric. + + Example: + + min(( + tikv_store_size_bytes + {k8s_cluster="$k8s_cluster",tidb_cluster="$tidb_cluster",instance=~"$instance",type!="kv_gc"} + )) by (instance) + """ + return expr_aggr( + metric, "min", label_selectors=label_selectors, by_labels=by_labels + ) + + +def expr_aggr_func( + metric: str, + aggr_op: str, + func: str, + aggr_param: str = "", + label_selectors: list[str] = [], + range_selector: str = "", + by_labels: list[str] = ["instance"], +) -> Expr: + """ + Calculate the aggregation of function of a metric. 
+ + Example: + + expr_aggr_func( + tikv_grpc_msg_duration_seconds_count, "sum", "rate", label_selectors=['type!="kv_gc"'] + ) + + sum(rate( + tikv_grpc_msg_duration_seconds_count + {k8s_cluster="$k8s_cluster",tidb_cluster="$tidb_cluster",instance=~"$instance",type!="kv_gc"} + [$__rate_interval] + )) by (instance) + """ + expr = Expr(metric=metric) + expr.aggregate( + aggr_op, + aggr_param=aggr_param, + by_labels=by_labels, + ) + expr.function( + func, + label_selectors=label_selectors, + range_selector=range_selector, + ) + return expr + + +def expr_sum_rate( + metric: str, + label_selectors: list[str] = [], + by_labels: list[str] = ["instance"], +) -> Expr: + """ + Calculate the sum of rate of a metric. + + Example: + + sum(rate( + tikv_grpc_msg_duration_seconds_count + {k8s_cluster="$k8s_cluster",tidb_cluster="$tidb_cluster",instance=~"$instance",type!="kv_gc"} + [$__rate_interval] + )) by (instance) + """ + # $__rate_interval is a Grafana variable that is specialized for Prometheus + # rate and increase function. + # See https://grafana.com/blog/2020/09/28/new-in-grafana-7.2-__rate_interval-for-prometheus-rate-queries-that-just-work/ + return expr_aggr_func( + metric=metric, + aggr_op="sum", + func="rate", + label_selectors=label_selectors, + range_selector="$__rate_interval", + by_labels=by_labels, + ) + + +def expr_sum_delta( + metric: str, + label_selectors: list[str] = [], + range_selector: str = "$__rate_interval", + by_labels: list[str] = ["instance"], +) -> Expr: + """ + Calculate the sum of delta of a metric.
+ + Example: + + sum(delta( + tikv_grpc_msg_duration_seconds_count + {k8s_cluster="$k8s_cluster",tidb_cluster="$tidb_cluster",instance=~"$instance",type!="kv_gc"} + [$__rate_interval] + )) by (instance) + """ + return expr_aggr_func( + metric=metric, + aggr_op="sum", + func="delta", + label_selectors=label_selectors, + range_selector=range_selector, + by_labels=by_labels, + ) + + +def expr_sum_increase( + metric: str, + label_selectors: list[str] = [], + range_selector: str = "$__rate_interval", + by_labels: list[str] = ["instance"], +) -> Expr: + """ + Calculate the sum of increase of a metric. + + Example: + + sum(increase( + tikv_grpc_msg_duration_seconds_count + {k8s_cluster="$k8s_cluster",tidb_cluster="$tidb_cluster",instance=~"$instance",type!="kv_gc"} + [$__rate_interval] + )) by (instance) + """ + return expr_aggr_func( + metric=metric, + aggr_op="sum", + func="increase", + label_selectors=label_selectors, + range_selector=range_selector, + by_labels=by_labels, + ) + + +def expr_sum_aggr_over_time( + metric: str, + aggr: str, + range_selector: str, + label_selectors: list[str] = [], + by_labels: list[str] = ["instance"], +) -> Expr: + """ + Calculate the sum of average value of all points in the specified interval of a metric. + + Example: + + sum(avg_over_time( + tikv_grpc_msg_duration_seconds_count + {k8s_cluster="$k8s_cluster",tidb_cluster="$tidb_cluster",instance=~"$instance",type!="kv_gc"} + [1m] + )) by (instance) + """ + return expr_aggr_func( + metric=metric, + aggr_op="sum", + func=f"{aggr}_over_time", + label_selectors=label_selectors, + range_selector=range_selector, + by_labels=by_labels, + ) + + +def expr_max_rate( + metric: str, + label_selectors: list[str] = [], + by_labels: list[str] = ["instance"], +) -> Expr: + """ + Calculate the max of rate of a metric. 
+ + Example: + + max(rate( + tikv_thread_voluntary_context_switches + {k8s_cluster="$k8s_cluster",tidb_cluster="$tidb_cluster",instance=~"$instance",type!="kv_gc"} + [$__rate_interval] + )) by (name) + """ + # $__rate_interval is a Grafana variable that is specialized for Prometheus + # rate and increase function. + # See https://grafana.com/blog/2020/09/28/new-in-grafana-7.2-__rate_interval-for-prometheus-rate-queries-that-just-work/ + return expr_aggr_func( + metric=metric, + aggr_op="max", + func="rate", + label_selectors=label_selectors, + range_selector="$__rate_interval", + by_labels=by_labels, + ) + + +def expr_count_rate( + metric: str, + label_selectors: list[str] = [], + by_labels: list[str] = ["instance"], +) -> Expr: + """ + Calculate the count of rate of a metric. + + Example: + + count(rate( + tikv_thread_cpu_seconds_total + {k8s_cluster="$k8s_cluster",tidb_cluster="$tidb_cluster",instance=~"$instance",name=~"sst_.*"} + [$__rate_interval] + )) by (instance) + """ + # $__rate_interval is a Grafana variable that is specialized for Prometheus + # rate and increase function. + # See https://grafana.com/blog/2020/09/28/new-in-grafana-7.2-__rate_interval-for-prometheus-rate-queries-that-just-work/ + return expr_aggr_func( + metric=metric, + aggr_op="count", + func="rate", + label_selectors=label_selectors, + range_selector="$__rate_interval", + by_labels=by_labels, + ) + + +def expr_simple( + metric: str, + label_selectors: list[str] = [], +) -> Expr: + """ + Query an instant vector of a metric. 
+ + Example: + + tikv_grpc_msg_duration_seconds_count + {k8s_cluster="$k8s_cluster",tidb_cluster="$tidb_cluster",instance=~"$instance",type!="kv_gc"} + """ + expr = Expr(metric=metric) + expr.function("", label_selectors=label_selectors) + return expr + + +def expr_operator(lhs: Union[Expr, str], operator: str, rhs: Union[Expr, str]) -> str: + return f"""({lhs} {operator} {rhs})""" + + +def expr_histogram_quantile( + quantile: float, + metrics: str, + label_selectors: list[str] = [], + by_labels: list[str] = [], +) -> Expr: + """ + Query a quantile of a histogram metric. + + Example: + + histogram_quantile(0.99, sum(rate( + tikv_grpc_msg_duration_seconds_bucket + {k8s_cluster="$k8s_cluster",tidb_cluster="$tidb_cluster",instance=~"$instance",type!="kv_gc"} + [$__rate_interval] + )) by (le)) + """ + # sum(rate(metrics_bucket{label_selectors}[$__rate_interval])) by (le) + assert not metrics.endswith( + "_bucket" + ), f"'{metrics}' should not specify '_bucket' suffix manually" + by_labels = list(filter(lambda label: label != "le", by_labels)) + sum_rate_of_buckets = expr_sum_rate( + metrics + "_bucket", + label_selectors=label_selectors, + by_labels=by_labels + ["le"], + ) + # histogram_quantile({quantile}, {sum_rate_of_buckets}) + return expr_aggr( + metric=f"{sum_rate_of_buckets}", + aggr_op="histogram_quantile", + aggr_param=f"{quantile}", + label_selectors=[], + by_labels=[], + ).extra( + # Do not attach default label selector again. + default_label_selectors=[] + ) + + +def expr_topk( + k: int, + metrics: str, +) -> Expr: + """ + Query topk of a metric. + + Example: + + topk(20, tikv_thread_voluntary_context_switches) + """ + # topk({k}, {metric}) + return expr_aggr( + metric=metrics, + aggr_op="topk", + aggr_param=f"{k}", + label_selectors=[], + by_labels=[], + ).extra( + # Do not attach default label selector again. 
+ default_label_selectors=[] + ) + + +def expr_histogram_avg( + metrics: str, + label_selectors: list[str] = [], + by_labels: list[str] = ["instance"], +) -> str: + """ + Query the avg of a histogram metric. + + Example: + + sum(rate( + tikv_grpc_msg_duration_seconds_sum + {k8s_cluster="$k8s_cluster",tidb_cluster="$tidb_cluster",instance=~"$instance"} + [$__rate_interval] + )) / sum(rate( + tikv_grpc_msg_duration_seconds_count + {k8s_cluster="$k8s_cluster",tidb_cluster="$tidb_cluster",instance=~"$instance"} + [$__rate_interval] + )) + """ + for suffix in ["_bucket", "_count", "_sum"]: + assert not metrics.endswith( + suffix + ), f"'{metrics}' should not specify '{suffix}' suffix manually" + + return expr_operator( + expr_sum_rate( + metrics + "_sum", + label_selectors=label_selectors, + by_labels=by_labels, + ), + "/", + expr_sum_rate( + metrics + "_count", + label_selectors=label_selectors, + by_labels=by_labels, + ), + ) + + +def target( + expr: Union[Expr, str], + legend_format: Optional[str] = None, + hide=False, + data_source=DATASOURCE, + interval_factor=1, # Prefer "high" resolution +) -> Target: + if legend_format is None and isinstance(expr, Expr) and expr.by_labels: + legend_format = "-".join(map(lambda x: "{{" + f"{x}" + "}}", expr.by_labels)) + return Target( + expr=f"{expr}", + hide=hide, + legendFormat=legend_format, + intervalFactor=interval_factor, + datasource=data_source, + ) + + +def template( + name, + query, + data_source, + hide, + regex=None, + multi=False, + include_all=False, + all_value=None, +) -> Template: + return Template( + dataSource=data_source, + hide=hide, + label=name, + multi=multi, + name=name, + query=query, + refresh=2, + sort=1, + type="query", + useTags=False, + regex=regex, + includeAll=include_all, + allValue=all_value, + ) + + +class Layout: + # Rows are always 24 "units" wide. 
+ ROW_WIDTH = 24 + PANEL_HEIGHT = 7 + row_panel: RowPanel + current_row_y_pos: int + current_row_x_pos: int + + def __init__(self, title, collapsed=True, repeat: Optional[str] = None) -> None: + extraJson = None + if repeat: + extraJson = {"repeat": repeat} + title = f"{title} - ${repeat}" + self.current_row_y_pos = 0 + self.current_row_x_pos = 0 + self.row_panel = RowPanel( + title=title, + gridPos=GridPos(h=self.PANEL_HEIGHT, w=self.ROW_WIDTH, x=0, y=0), + collapsed=collapsed, + extraJson=extraJson, + ) + + def row(self, panels: list[Panel], width: int = ROW_WIDTH): + """Start a new row and evenly scales panels width""" + count = len(panels) + if count == 0: + return panels + width = width // count + remain = self.ROW_WIDTH % count + x = self.current_row_x_pos % self.ROW_WIDTH + for panel in panels: + panel.gridPos = GridPos( + h=self.PANEL_HEIGHT, + w=width, + x=x, + y=self.current_row_y_pos, + ) + x += width + panels[-1].gridPos.w += remain + self.row_panel.panels.extend(panels) + self.current_row_y_pos += self.PANEL_HEIGHT + self.current_row_x_pos = x + + def half_row(self, panels: list[Panel]): + self.row(panels, self.ROW_WIDTH // 2) + + +def timeseries_panel( + title, + targets, + legend_calcs=["max", "last"], + unit="s", + draw_style="line", + line_width=1, + fill_opacity=10, + gradient_mode="opacity", + tooltip_mode="multi", + legend_display_mode="table", + legend_placement="right", + description=None, + data_source=DATASOURCE, +) -> TimeSeries: + return TimeSeries( + title=title, + dataSource=data_source, + description=description, + targets=targets, + legendCalcs=legend_calcs, + drawStyle=draw_style, + lineWidth=line_width, + fillOpacity=fill_opacity, + gradientMode=gradient_mode, + unit=unit, + tooltipMode=tooltip_mode, + legendDisplayMode=legend_display_mode, + legendPlacement=legend_placement, + ) + + +def yaxis(format: str, log_base=1) -> YAxis: + assert format not in [ + UNITS.BYTES, + UNITS.BITS, + UNITS.KILO_BYTES, + UNITS.MEGA_BYTES, + 
UNITS.GIGA_BYTES, + UNITS.TERA_BYTES, + UNITS.PETA_BYTES, + UNITS.BYTES_SEC, + UNITS.KILO_BYTES_SEC, + UNITS.MEGA_BYTES_SEC, + UNITS.GIGA_BYTES_SEC, + UNITS.TERA_BYTES_SEC, + UNITS.PETA_BYTES_SEC, + ], "Must not use SI bytes" + return YAxis(format=format, logBase=log_base) + + +def yaxes(left_format: str, right_format: Optional[str] = None, log_base=1) -> YAxes: + ya = YAxes(left=yaxis(left_format, log_base=log_base)) + if right_format is not None: + ya.right = yaxis(right_format, log_base=log_base) + return ya + + +def graph_legend( + avg=False, + current=True, + max=True, + min=False, + show=True, + total=False, + align_as_table=True, + hide_empty=True, + hide_zero=True, + right_side=True, + side_width=None, + sort_desc=True, +) -> Legend: + sort = "max" if max else "current" + return Legend( + avg=avg, + current=current, + max=max, + min=min, + show=show, + total=total, + alignAsTable=align_as_table, + hideEmpty=hide_empty, + hideZero=hide_zero, + rightSide=right_side, + sideWidth=side_width, + sort=sort, + sortDesc=sort_desc, + ) + + +def graph_panel( + title: str, + targets: list[Target], + description=None, + yaxes=yaxes(left_format=UNITS.NONE_FORMAT), + legend=None, + tooltip=Tooltip(shared=True, valueType="individual"), + lines=True, + line_width=1, + fill=1, + fill_gradient=1, + stack=False, + thresholds: list[GraphThreshold] = [], + series_overrides: list[SeriesOverride] = [], + data_source=DATASOURCE, + null_point_mode=NULL_AS_ZERO, +) -> Panel: + # extraJson add patches grafanalib result. + extraJson = {} + if fill_gradient != 0: + # fillGradient is only valid when fill is 1. + if fill == 0: + fill = 1 + # fillGradient is not set correctly in grafanalib(0.7.0), so we need to + # set it manually. + # TODO: remove it when grafanalib fix this. + extraJson["fillGradient"] = 1 + for target in targets: + # Make sure target is in time_series format. 
+ target.format = TIME_SERIES_TARGET_FORMAT + + return Graph( + title=title, + dataSource=data_source, + description=description, + targets=targets, + yAxes=yaxes, + legend=legend if legend else graph_legend(), + lines=lines, + bars=not lines, + lineWidth=line_width, + fill=fill, + fillGradient=fill_gradient, + stack=stack, + nullPointMode=null_point_mode, + thresholds=thresholds, + tooltip=tooltip, + seriesOverrides=series_overrides, + # Do not specify max max data points, let Grafana decide. + maxDataPoints=None, + extraJson=extraJson, + ) + + +def series_override( + alias: str, + bars: bool = False, + lines: bool = True, + yaxis: int = 1, + fill: int = 1, + zindex: int = 0, + dashes: Optional[bool] = None, + dash_length: Optional[int] = None, + space_length: Optional[int] = None, + transform_negative_y: bool = False, +) -> SeriesOverride: + class SeriesOverridePatch(SeriesOverride): + dashes_override: Optional[bool] + dash_length_override: Optional[int] + space_length_override: Optional[int] + transform_negative_y: bool + + def __init__(self, *args, **kwargs) -> None: + self.dashes_override = kwargs["dashes"] + if self.dashes_override is None: + del kwargs["dashes"] + self.dash_length_override = kwargs["dashLength"] + if self.dash_length_override is None: + del kwargs["dashLength"] + self.space_length_override = kwargs["spaceLength"] + if self.space_length_override is None: + del kwargs["spaceLength"] + self.transform_negative_y = kwargs["transform_negative_y"] + del kwargs["transform_negative_y"] + super().__init__(*args, **kwargs) + + def to_json_data(self): + data = super().to_json_data() + # The default 'null' color makes it transparent, remove it. + del data["color"] + # The default 'null' makes it a transparent line, remove it. + if self.dashes_override is None: + del data["dashes"] + if self.dash_length_override is None: + del data["dashLength"] + if self.space_length_override is None: + del data["spaceLength"] + # Add missing transform. 
+ if self.transform_negative_y: + data["transform"] = "negative-Y" + return data + + return SeriesOverridePatch( + alias=alias, + bars=bars, + lines=lines, + yaxis=yaxis, + fill=fill, + zindex=zindex, + dashes=dashes, + dashLength=dash_length, + spaceLength=space_length, + transform_negative_y=transform_negative_y, + ) + + +def heatmap_color() -> HeatmapColor: + return HeatmapColor( + cardColor="#b4ff00", + colorScale="sqrt", + colorScheme="interpolateSpectral", + exponent=0.5, + mode="spectrum", + max=None, + min=None, + ) + + +def heatmap_panel( + title: str, + metric: str, + description=None, + label_selectors: list[str] = [], + yaxis=yaxis(UNITS.NO_FORMAT), + tooltip=Tooltip(shared=True, valueType="individual"), + color=heatmap_color(), + decimals=1, + data_source=DATASOURCE, +) -> Panel: + assert metric.endswith( + "_bucket" + ), f"'{metric}' should be a histogram metric with '_bucket' suffix" + t = target( + expr=expr_sum_rate(metric, label_selectors=label_selectors, by_labels=["le"]), + ) + # Make sure targets are in heatmap format. + t.format = "heatmap" + # Heatmap target legendFormat should be "{{le}}" + t.legendFormat = "{{le}}" + # Overrides yaxis decimal places. + yaxis.decimals = decimals + return Heatmap( + title=title, + dataSource=data_source, + description=description, + targets=[t], + yAxis=yaxis, + color=color, + dataFormat="tsbuckets", + yBucketBound="upper", + tooltip=tooltip, + extraJson={"tooltip": {"showHistogram": True}}, + hideZeroBuckets=True, + # Limit data points, because too many data points slows browser when + # the resolution is too high. + # See: https://grafana.com/blog/2020/06/23/how-to-visualize-prometheus-histograms-in-grafana/ + maxDataPoints=512, + # Fix grafana heatmap migration panic if options is null. 
+ # See: https://github.com/grafana/grafana/blob/v9.5.14/public/app/plugins/panel/heatmap/migrations.ts#L17 + options={}, + ) + + +def stat_panel( + title: str, + targets: list[Target], + description=None, + format=UNITS.NONE_FORMAT, + graph_mode="none", + decimals: Optional[int] = None, + mappings: Optional[StatValueMappings] = None, + text_mode: str = "auto", + data_source=DATASOURCE, +) -> Panel: + for target in targets: + # Make sure target is in time_series format. + target.format = TIME_SERIES_TARGET_FORMAT + return Stat( + title=title, + dataSource=data_source, + description=description, + targets=targets, + format=format, + graphMode=graph_mode, + reduceCalc="lastNotNull", + decimals=decimals, + mappings=mappings, + textMode=text_mode, + ) + + +def graph_panel_histogram_quantiles( + title: str, + description: str, + yaxes: YAxes, + metric: str, + label_selectors: list[str] = [], + by_labels: list[str] = [], + hide_p9999=False, + hide_avg=False, + hide_count=False, +) -> Panel: + """ + Return a graph panel that shows histogram quantiles of a metric. 
+ + Targets: + - 99.99% quantile + - 99% quantile + - avg + - count + """ + + def legend(prefix, labels): + if not labels: + return prefix + else: + return "-".join([prefix] + ["{{%s}}" % lb for lb in labels]) + + return graph_panel( + title=title, + description=description, + yaxes=yaxes, + targets=[ + target( + expr=expr_histogram_quantile( + 0.9999, + f"{metric}", + label_selectors=label_selectors, + by_labels=by_labels, + ), + legend_format=legend("99.99%", by_labels), + hide=hide_p9999, + ), + target( + expr=expr_histogram_quantile( + 0.99, + f"{metric}", + label_selectors=label_selectors, + by_labels=by_labels, + ), + legend_format=legend("99%", by_labels), + ), + target( + expr=expr_histogram_avg( + metric, + label_selectors=label_selectors, + by_labels=by_labels, + ), + legend_format=legend("avg", by_labels), + hide=hide_avg, + ), + target( + expr=expr_sum_rate( + f"{metric}_count", + label_selectors=label_selectors, + by_labels=by_labels, + ), + legend_format=legend("count", by_labels), + hide=hide_count, + ), + ], + series_overrides=[ + series_override( + alias="count", + fill=2, + yaxis=2, + zindex=-3, + dashes=True, + dash_length=1, + space_length=1, + transform_negative_y=True, + ), + series_override( + alias="avg", + fill=7, + ), + ], + ) + + +def heatmap_panel_graph_panel_histogram_quantile_pairs( + heatmap_title: str, + heatmap_description: str, + graph_title: str, + graph_description: str, + yaxis_format: str, + metric: str, + label_selectors=[], + graph_by_labels=[], +) -> list[Panel]: + return [ + heatmap_panel( + title=heatmap_title, + description=heatmap_description, + yaxis=yaxis(format=yaxis_format), + metric=f"{metric}_bucket", + label_selectors=label_selectors, + ), + graph_panel_histogram_quantiles( + title=graph_title, + description=graph_description, + metric=f"{metric}", + yaxes=yaxes(left_format=yaxis_format), + label_selectors=label_selectors, + by_labels=graph_by_labels, + hide_count=True, + ), + ] diff --git 
a/metrics/grafana/performance_read.json b/metrics/grafana/performance_read.json index caa2635d34c..aaf24de396e 100644 --- a/metrics/grafana/performance_read.json +++ b/metrics/grafana/performance_read.json @@ -2686,14 +2686,14 @@ "steppedLine": false, "targets": [ { - "expr": "histogram_quantile(1, sum(rate(tikv_coprocessor_request_wait_seconds_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\"}[1m])) by (le,req))", + "expr": "histogram_quantile(1, sum(rate(tikv_coprocessor_request_wait_seconds_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", type=\"all\"}[1m])) by (le,req))", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{req}}-100%", "refId": "D" }, { - "expr": "histogram_quantile(0.99, sum(rate(tikv_coprocessor_request_wait_seconds_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\"}[1m])) by (le,req))", + "expr": "histogram_quantile(0.99, sum(rate(tikv_coprocessor_request_wait_seconds_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", type=\"all\"}[1m])) by (le,req))", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{req}}-99%", diff --git a/metrics/grafana/tikv_details.dashboard.py b/metrics/grafana/tikv_details.dashboard.py new file mode 100644 index 00000000000..1ed32eb6fe5 --- /dev/null +++ b/metrics/grafana/tikv_details.dashboard.py @@ -0,0 +1,8668 @@ +import os +import sys + +sys.path.append(os.path.dirname(__file__)) + +from common import ( + DATASOURCE, + DATASOURCE_INPUT, + Layout, + expr_avg, + expr_count_rate, + expr_histogram_avg, + expr_histogram_quantile, + expr_max, + expr_max_rate, + expr_min, + expr_operator, + expr_simple, + expr_sum, + expr_sum_aggr_over_time, + expr_sum_delta, + expr_sum_increase, + expr_sum_rate, + expr_topk, + graph_legend, + graph_panel, + graph_panel_histogram_quantiles, + heatmap_panel, + heatmap_panel_graph_panel_histogram_quantile_pairs, + series_override, + stat_panel, + target, + template, + yaxes, + yaxis, +) 
+from grafanalib import formatunits as UNITS +from grafanalib.core import ( + GRAPH_TOOLTIP_MODE_SHARED_CROSSHAIR, + HIDE_VARIABLE, + NULL_AS_NULL, + SHOW, + Dashboard, + GraphThreshold, + RowPanel, + StatValueMappingItem, + StatValueMappings, + Templating, +) + +#### Metrics Definition Start #### + + +def Templates() -> Templating: + return Templating( + list=[ + template( + name="k8s_cluster", + query="label_values(tikv_engine_block_cache_size_bytes, k8s_cluster)", + data_source=DATASOURCE, + hide=HIDE_VARIABLE, + ), + template( + name="tidb_cluster", + query='label_values(tikv_engine_block_cache_size_bytes{k8s_cluster ="$k8s_cluster"}, tidb_cluster)', + data_source=DATASOURCE, + hide=HIDE_VARIABLE, + ), + template( + name="db", + query='label_values(tikv_engine_block_cache_size_bytes{k8s_cluster="$k8s_cluster", tidb_cluster="$tidb_cluster"}, db)', + data_source=DATASOURCE, + hide=SHOW, + multi=True, + include_all=True, + ), + template( + name="command", + query='query_result(tikv_storage_command_total{k8s_cluster="$k8s_cluster", tidb_cluster="$tidb_cluster"} != 0)', + data_source=DATASOURCE, + hide=SHOW, + regex='/\\btype="([^"]+)"/', + multi=True, + include_all=True, + ), + template( + name="instance", + query='label_values(tikv_engine_size_bytes{k8s_cluster ="$k8s_cluster", tidb_cluster="$tidb_cluster"}, instance)', + data_source=DATASOURCE, + hide=SHOW, + include_all=True, + all_value=".*", + ), + template( + name="titan_db", + query='label_values(tikv_engine_titandb_num_live_blob_file{k8s_cluster="$k8s_cluster", tidb_cluster="$tidb_cluster"}, db)', + data_source=DATASOURCE, + hide=HIDE_VARIABLE, + multi=True, + include_all=True, + ), + ] + ) + + +def Duration() -> RowPanel: + layout = Layout(title="Duration") + layout.row( + [ + graph_panel( + title="Write Pipeline Duration", + description="Write Pipeline Composition", + yaxes=yaxes(left_format=UNITS.SECONDS), + lines=False, + stack=True, + targets=[ + target( + expr=expr_histogram_quantile( + 0.99, 
"tikv_raftstore_append_log_duration_seconds" + ), + legend_format="Write Raft Log .99", + ), + target( + expr=expr_histogram_quantile( + 0.99, + "tikv_raftstore_request_wait_time_duration_secs", + ), + legend_format="Propose Wait .99", + ), + target( + expr=expr_histogram_quantile( + 0.99, "tikv_raftstore_apply_wait_time_duration_secs" + ), + legend_format="Apply Wait .99", + ), + target( + expr=expr_histogram_quantile( + 0.99, "tikv_raftstore_commit_log_duration_seconds" + ), + legend_format="Replicate Raft Log .99", + ), + target( + expr=expr_histogram_quantile( + 0.99, "tikv_raftstore_apply_log_duration_seconds" + ), + legend_format="Apply Duration .99", + ), + ], + ), + graph_panel( + title="Cop Read Duration", + description="Read Duration Composition", + yaxes=yaxes(left_format=UNITS.SECONDS), + lines=False, + stack=True, + targets=[ + target( + expr=expr_histogram_quantile( + 0.99, + "tikv_storage_engine_async_request_duration_seconds", + ['type="snapshot"'], + ), + legend_format="Get Snapshot .99", + ), + target( + expr=expr_histogram_quantile( + 0.99, + "tikv_coprocessor_request_wait_seconds", + ['type="all"'], + ), + legend_format="Cop Wait .99", + ), + target( + expr=expr_histogram_quantile( + 0.95, "tikv_coprocessor_request_handle_seconds" + ), + legend_format="Cop Handle .99", + ), + ], + ), + ] + ) + return layout.row_panel + + +def Cluster() -> RowPanel: + layout = Layout(title="Cluster") + layout.row( + [ + graph_panel( + title="Store size", + description="The storage size per TiKV instance", + yaxes=yaxes(left_format=UNITS.BYTES_IEC), + fill=1, + stack=True, + legend=graph_legend(max=False), + targets=[ + target( + expr=expr_sum( + "tikv_store_size_bytes", + label_selectors=['type = "used"'], + ), + ), + ], + ), + graph_panel( + title="Available size", + description="The available capacity size of each TiKV instance", + yaxes=yaxes(left_format=UNITS.BYTES_IEC), + fill=1, + stack=True, + legend=graph_legend(max=False), + targets=[ + target( + 
expr=expr_sum( + "tikv_store_size_bytes", + label_selectors=['type="available"'], + ), + ), + ], + ), + graph_panel( + title="Capacity size", + description="The capacity size per TiKV instance", + yaxes=yaxes(left_format=UNITS.BYTES_IEC), + fill=1, + stack=True, + legend=graph_legend(max=False), + targets=[ + target( + expr=expr_sum( + "tikv_store_size_bytes", + label_selectors=['type="capacity"'], + ), + ), + ], + ), + ] + ) + layout.row( + [ + graph_panel( + title="CPU", + description="The CPU usage of each TiKV instance", + yaxes=yaxes(left_format=UNITS.PERCENT_UNIT), + targets=[ + target( + expr=expr_sum_rate( + "process_cpu_seconds_total", + label_selectors=['job=~".*tikv"'], + ), + ), + ], + ), + graph_panel( + title="Memory", + description="The memory usage per TiKV instance", + yaxes=yaxes(left_format=UNITS.BYTES_IEC), + targets=[ + target( + expr=expr_sum( + "process_resident_memory_bytes", + label_selectors=['job=~".*tikv"'], + ), + ), + ], + ), + ] + ) + layout.row( + [ + graph_panel( + title="IO utilization", + description="The I/O utilization per TiKV instance", + yaxes=yaxes(left_format=UNITS.PERCENT_UNIT), + targets=[ + target( + expr=expr_sum_rate( + "node_disk_io_time_seconds_total", + ), + legend_format=r"{{instance}}-{{device}}", + ), + ], + ), + graph_panel( + title="MBps", + description="The total bytes of read and write in each TiKV instance", + yaxes=yaxes(left_format=UNITS.BYTES_IEC), + targets=[ + target( + expr=expr_sum_rate( + "tikv_engine_flow_bytes", + label_selectors=['type="wal_file_bytes"'], + ), + legend_format=r"{{instance}}-write", + ), + target( + expr=expr_sum_rate( + "tikv_engine_flow_bytes", + label_selectors=['type=~"bytes_read|iter_bytes_read"'], + ), + legend_format=r"{{instance}}-read", + ), + ], + ), + ] + ) + layout.row( + [ + graph_panel( + title="QPS", + description="The number of leaders on each TiKV instance", + yaxes=yaxes(left_format=UNITS.OPS_PER_SEC), + targets=[ + target( + expr=expr_sum_rate( + 
"tikv_grpc_msg_duration_seconds_count", + label_selectors=['type!="kv_gc"'], + by_labels=["instance", "type"], + ), + legend_format=r"{{instance}}-{{type}}", + ), + ], + ), + graph_panel( + title="Errps", + description="The total number of the gRPC message failures", + yaxes=yaxes(left_format=UNITS.OPS_PER_SEC), + targets=[ + target( + expr=expr_sum_rate( + "tikv_grpc_msg_fail_total", + label_selectors=['type!="kv_gc"'], + ), + legend_format=r"{{instance}}-grpc-msg-fail", + ), + target( + expr=expr_sum_delta( + "tikv_pd_heartbeat_message_total", + label_selectors=['type="noop"'], + ).extra(extra_expr="< 1"), + legend_format=r"{{instance}}-pd-heartbeat", + ), + target( + expr=expr_sum_rate( + "tikv_critical_error_total", + by_labels=["instance", "type"], + ), + ), + ], + ), + ] + ) + layout.row( + [ + graph_panel( + title="Leader", + description="The number of leaders on each TiKV instance", + targets=[ + target( + expr=expr_sum( + "tikv_raftstore_region_count", + label_selectors=['type="leader"'], + ), + ), + ], + ), + graph_panel( + title="Region", + description="The number of Regions and Buckets on each TiKV instance", + targets=[ + target( + expr=expr_sum( + "tikv_raftstore_region_count", + label_selectors=['type="region"'], + ), + ), + target( + expr=expr_sum( + "tikv_raftstore_region_count", + label_selectors=['type="buckets"'], + ), + legend_format=r"{{instance}}-buckets", + hide=True, + ), + ], + ), + ] + ) + layout.row( + [ + graph_panel( + title="Uptime", + description="TiKV uptime since the last restart", + yaxes=yaxes(left_format=UNITS.SECONDS), + targets=[ + target( + expr=expr_operator( + "time()", + "-", + expr_simple( + "process_start_time_seconds", + label_selectors=['job=~".*tikv"'], + ), + ), + legend_format=r"{{instance}}", + ), + ], + ) + ] + ) + return layout.row_panel + + +def Errors() -> RowPanel: + layout = Layout(title="Errors") + layout.row( + [ + graph_panel( + title="Critical error", + targets=[ + target( + expr=expr_sum_rate( + 
"tikv_critical_error_total", + by_labels=["instance", "type"], + ), + ), + ], + thresholds=[GraphThreshold(value=0.0)], + ) + ] + ) + layout.row( + [ + graph_panel( + title="Server is busy", + description=""" +Indicates occurrences of events that make the TiKV instance unavailable +temporarily, such as Write Stall, Channel Full, Scheduler Busy, and Coprocessor +Full""", + targets=[ + target( + expr=expr_sum_rate( + "tikv_scheduler_too_busy_total", + ), + legend_format=r"scheduler-{{instance}}", + ), + target( + expr=expr_sum_rate( + "tikv_channel_full_total", + by_labels=["instance", "type"], + ), + legend_format=r"channelfull-{{instance}}-{{type}}", + ), + target( + expr=expr_sum_rate( + "tikv_coprocessor_request_error", + label_selectors=['type="full"'], + ), + legend_format=r"coprocessor-{{instance}}", + ), + target( + expr=expr_avg( + "tikv_engine_write_stall", + label_selectors=[ + 'type="write_stall_percentile99"', + 'db=~"$db"', + ], + by_labels=["instance", "db"], + ), + legend_format=r"stall-{{instance}}-{{db}}", + ), + target( + expr=expr_sum_rate( + "tikv_raftstore_store_write_msg_block_wait_duration_seconds_count", + ), + legend_format=r"store-write-channelfull-{{instance}}", + ), + ], + ), + graph_panel( + title="Server report failures", + description="The total number of reporting failure messages", + targets=[ + target( + expr=expr_sum_rate( + "tikv_server_report_failure_msg_total", + by_labels=["type", "instance", "store_id"], + ), + legend_format=r"{{instance}}-{{type}}-to-{{store_id}}", + ), + ], + ), + ] + ) + layout.row( + [ + graph_panel( + title="Raftstore error", + description="The number of different raftstore errors on each TiKV instance", + targets=[ + target( + expr=expr_sum_rate( + "tikv_storage_engine_async_request_total", + label_selectors=['status!~"success|all"'], + by_labels=["instance", "status"], + ), + ), + ], + ), + graph_panel( + title="Scheduler error", + description="The number of scheduler errors per type on each TiKV 
instance", + targets=[ + target( + expr=expr_sum_rate( + "tikv_scheduler_stage_total", + label_selectors=['stage=~"snapshot_err|prepare_write_err"'], + by_labels=["instance", "stage"], + ), + ), + ], + ), + ] + ) + layout.row( + [ + graph_panel( + title="Coprocessor error", + description="The number of different coprocessor errors on each TiKV instance", + targets=[ + target( + expr=expr_sum_rate( + "tikv_coprocessor_request_error", + by_labels=["instance", "reason"], + ), + ), + ], + ), + graph_panel( + title="gRPC message error", + description="The number of gRPC message errors per type on each TiKV instance", + targets=[ + target( + expr=expr_sum_rate( + "tikv_grpc_msg_fail_total", + by_labels=["instance", "type"], + ), + ), + ], + ), + ] + ) + layout.row( + [ + graph_panel( + title="Leader drop", + description="The count of dropped leaders per TiKV instance", + targets=[ + target( + expr=expr_sum_delta( + "tikv_raftstore_region_count", + label_selectors=['type="leader"'], + ), + ), + ], + ), + graph_panel( + title="Leader missing", + description="The count of missing leaders per TiKV instance", + targets=[ + target( + expr=expr_sum( + "tikv_raftstore_leader_missing", + ), + ), + ], + ), + ] + ) + layout.row( + [ + graph_panel( + title="Damaged files", + description="RocksDB damaged SST files", + targets=[ + target( + expr=expr_simple("tikv_rocksdb_damaged_files"), + legend_format=r"{{instance}}-existed", + ), + target( + expr=expr_simple("tikv_rocksdb_damaged_files_deleted"), + legend_format=r"{{instance}}-deleted", + ), + ], + ), + graph_panel( + title="Log Replication Rejected", + description="The count of Log Replication Reject caused by follower memory insufficient", + targets=[ + target( + expr=expr_sum_rate( + "tikv_server_raft_append_rejects", + ), + ), + ], + ), + ] + ) + return layout.row_panel + + +def Server() -> RowPanel: + layout = Layout(title="Server") + layout.row( + [ + graph_panel( + title="CF size", + description="The size of each column 
family", + yaxes=yaxes(left_format=UNITS.BYTES_IEC), + targets=[ + target( + expr=expr_sum("tikv_engine_size_bytes", by_labels=["type"]), + ), + ], + ), + graph_panel( + title="Channel full", + description="The total number of channel full errors on each TiKV instance", + targets=[ + target( + expr=expr_sum_rate( + "tikv_channel_full_total", by_labels=["instance", "type"] + ), + ), + ], + ), + ] + ) + layout.row( + [ + graph_panel( + title="Active written leaders", + description="The number of leaders being written on each TiKV instance", + targets=[ + target( + expr=expr_sum_rate( + "tikv_region_written_keys_count", + ), + ), + ], + ), + ] + ) + layout.row( + [ + heatmap_panel( + title="Approximate region size", + metric="tikv_raftstore_region_size_bucket", + yaxis=yaxis(format=UNITS.BYTES_IEC), + ), + graph_panel_histogram_quantiles( + title="Approximate region size", + description="The approximate Region size", + metric="tikv_raftstore_region_size", + yaxes=yaxes(left_format=UNITS.BYTES_IEC), + hide_count=True, + ), + ] + ) + layout.row( + [ + heatmap_panel( + title="Region written bytes", + metric="tikv_region_written_bytes_bucket", + yaxis=yaxis(format=UNITS.BYTES_IEC), + ), + graph_panel( + title="Region average written bytes", + description="The average rate of writing bytes to Regions per TiKV instance", + yaxes=yaxes(left_format=UNITS.BYTES_IEC), + targets=[ + target( + expr=expr_histogram_avg("tikv_region_written_bytes"), + legend_format="{{instance}}", + ), + ], + ), + ] + ) + layout.row( + [ + heatmap_panel( + title="Region written keys", + metric="tikv_region_written_keys_bucket", + ), + graph_panel( + title="Region average written keys", + description="The average rate of written keys to Regions per TiKV instance", + yaxes=yaxes(left_format=UNITS.BYTES_IEC), + targets=[ + target( + expr=expr_histogram_avg("tikv_region_written_keys"), + legend_format="{{instance}}", + ), + ], + ), + ] + ) + layout.row( + [ + graph_panel( + title="Hibernate Peers", + 
description="The number of peers in hibernated state", + targets=[ + target( + expr=expr_sum( + "tikv_raftstore_hibernated_peer_state", + by_labels=["instance", "state"], + ), + ), + ], + ), + graph_panel( + title="Memory trace", + yaxes=yaxes(left_format=UNITS.BYTES_IEC), + targets=[ + target( + expr=expr_simple( + "tikv_server_mem_trace_sum", + label_selectors=['name=~"raftstore-.*"'], + ), + legend_format="{{instance}}-{{name}}", + ), + target( + expr=expr_simple( + "raft_engine_memory_usage", + ), + legend_format="{{instance}}-raft-engine", + ), + ], + ), + ] + ) + layout.row( + [ + graph_panel( + title="Raft Entry Cache Evicts", + yaxes=yaxes(left_format=UNITS.BYTES_IEC), + targets=[ + target( + expr=expr_sum_rate( + "tikv_raft_entries_evict_bytes", + ), + ), + ], + ), + graph_panel( + title="Resolve address duration", + yaxes=yaxes(left_format=UNITS.SECONDS), + targets=[ + target( + expr=expr_histogram_quantile( + 0.99, + "tikv_server_address_resolve_duration_secs", + by_labels=["instance"], + ), + legend_format="{{instance}}", + ), + ], + ), + ] + ) + layout.row( + [ + graph_panel( + title="99% Thread Pool Schedule Wait Duration", + yaxes=yaxes(left_format=UNITS.SECONDS, log_base=2), + targets=[ + target( + expr=expr_histogram_quantile( + 0.99, + "tikv_yatp_pool_schedule_wait_duration", + by_labels=["name"], + ), + legend_format="{{name}}", + ), + ], + thresholds=[GraphThreshold(value=1.0)], + ), + graph_panel( + title="Average Thread Pool Schedule Wait Duration", + description="The average rate of written keys to Regions per TiKV instance", + yaxes=yaxes(left_format=UNITS.SECONDS, log_base=2), + targets=[ + target( + expr=expr_histogram_avg( + "tikv_yatp_pool_schedule_wait_duration", + by_labels=["name"], + ), + legend_format="{{name}}", + ), + ], + thresholds=[GraphThreshold(value=1.0)], + ), + ] + ) + layout.row( + [ + graph_panel( + title="Disk IO time per second", + yaxes=yaxes(left_format=UNITS.NANO_SECONDS), + lines=False, + stack=True, + targets=[ + 
target( + expr=expr_sum_rate( + "tikv_storage_rocksdb_perf", + label_selectors=['metric="block_read_time"'], + by_labels=["req"], + ), + ), + target( + expr=expr_sum_rate( + "tikv_coprocessor_rocksdb_perf", + label_selectors=['metric="block_read_time"'], + by_labels=["req"], + ), + legend_format="copr-{{req}}", + ), + ], + ), + graph_panel( + title="Disk IO bytes per second", + yaxes=yaxes(left_format=UNITS.NANO_SECONDS), + lines=False, + stack=True, + targets=[ + target( + expr=expr_sum_rate( + "tikv_storage_rocksdb_perf", + label_selectors=['metric="block_read_byte"'], + by_labels=["req"], + ), + ), + target( + expr=expr_sum_rate( + "tikv_coprocessor_rocksdb_perf", + label_selectors=['metric="block_read_byte"'], + by_labels=["req"], + ), + legend_format="copr-{{req}}", + ), + ], + ), + ] + ) + return layout.row_panel + + +def gRPC() -> RowPanel: + layout = Layout(title="gRPC") + layout.row( + [ + graph_panel( + title="gRPC message count", + description="The count of different kinds of gRPC message", + yaxes=yaxes(left_format=UNITS.REQUESTS_PER_SEC), + targets=[ + target( + expr=expr_sum_rate( + "tikv_grpc_msg_duration_seconds_count", + label_selectors=['type!="kv_gc"'], + by_labels=["type"], + ), + ), + target( + expr=expr_sum_rate( + "tikv_grpc_msg_duration_seconds_count", + label_selectors=['type!="kv_gc"'], + by_labels=["type", "priority"], + ), + hide=True, + ), + ], + ), + graph_panel( + title="gRPC message failed", + description="The count of different kinds of gRPC message which is failed", + yaxes=yaxes(left_format=UNITS.REQUESTS_PER_SEC), + targets=[ + target( + expr=expr_sum_rate( + "tikv_grpc_msg_fail_total", + label_selectors=['type!="kv_gc"'], + by_labels=["type"], + ), + ), + ], + ), + ] + ) + layout.row( + [ + graph_panel( + title=r"99% gRPC message duration", + description=r"The 99% percentile of execution time of gRPC message", + yaxes=yaxes(left_format=UNITS.SECONDS, log_base=2), + targets=[ + target( + expr=expr_histogram_quantile( + 0.99, + 
"tikv_grpc_msg_duration_seconds", + label_selectors=['type!="kv_gc"'], + by_labels=["type"], + ), + legend_format="{{type}}", + ), + target( + expr=expr_histogram_quantile( + 0.99, + "tikv_grpc_msg_duration_seconds", + label_selectors=['type!="kv_gc"'], + by_labels=["type", "priority"], + ), + legend_format="{{type}}-{{priority}}", + hide=True, + ), + ], + ), + graph_panel( + title="Average gRPC message duration", + description="The average execution time of gRPC message", + yaxes=yaxes(left_format=UNITS.SECONDS, log_base=2), + targets=[ + target( + expr=expr_histogram_avg( + "tikv_grpc_msg_duration_seconds", + by_labels=["type"], + ), + legend_format="{{type}}", + ), + target( + expr=expr_histogram_avg( + "tikv_grpc_msg_duration_seconds", + by_labels=["type", "priority"], + ), + legend_format="{{type}}-{{priority}}", + hide=True, + ), + ], + ), + ] + ) + layout.row( + [ + graph_panel( + title="gRPC batch size", + description=r"The 99% percentile of execution time of gRPC message", + targets=[ + target( + expr=expr_histogram_quantile( + 0.99, + "tikv_server_grpc_req_batch_size", + ), + legend_format=r"99% request", + ), + target( + expr=expr_histogram_quantile( + 0.99, + "tikv_server_grpc_resp_batch_size", + ), + legend_format=r"99% response", + ), + target( + expr=expr_histogram_avg( + "tikv_server_grpc_req_batch_size", + by_labels=[], # override default by instance. + ), + legend_format="avg request", + ), + target( + expr=expr_histogram_avg( + "tikv_server_grpc_resp_batch_size", + by_labels=[], # override default by instance. + ), + legend_format="avg response", + ), + target( + expr=expr_histogram_quantile( + 0.99, + "tikv_server_request_batch_size", + ), + legend_format=r"99% kv get batch", + ), + target( + expr=expr_histogram_avg( + "tikv_server_request_batch_size", + by_labels=[], # override default by instance. 
+ ), + legend_format="avg kv batch", + ), + ], + ), + graph_panel( + title="raft message batch size", + targets=[ + target( + expr=expr_histogram_quantile( + 0.99, + "tikv_server_raft_message_batch_size", + ), + legend_format=r"99%", + ), + target( + expr=expr_histogram_avg( + "tikv_server_raft_message_batch_size", + by_labels=[], # override default by instance. + ), + legend_format="avg", + ), + ], + ), + ] + ) + layout.row( + [ + graph_panel( + title="gRPC request sources QPS", + description="The QPS of different sources of gRPC request", + yaxes=yaxes(left_format=UNITS.OPS_PER_SEC), + targets=[ + target( + expr=expr_sum_rate( + "tikv_grpc_request_source_counter_vec", + by_labels=["source"], + ), + ), + ], + ), + graph_panel( + title="gRPC request sources duration", + description="The duration of different sources of gRPC request", + yaxes=yaxes(left_format=UNITS.SECONDS), + lines=False, + stack=True, + targets=[ + target( + expr=expr_sum_rate( + "tikv_grpc_request_source_duration_vec", + by_labels=["source"], + ), + ), + ], + ), + ] + ) + layout.row( + [ + graph_panel( + title="gRPC resource group QPS", + description="The QPS of different resource groups of gRPC request", + targets=[ + target( + expr=expr_sum_rate( + "tikv_grpc_resource_group_total", by_labels=["name"] + ), + ), + ], + ), + ] + ) + return layout.row_panel + + +def ThreadCPU() -> RowPanel: + layout = Layout(title="Thread CPU") + layout.row( + [ + graph_panel( + title="Raft store CPU", + description="The CPU utilization of raftstore thread", + yaxes=yaxes(left_format=UNITS.PERCENT_UNIT), + targets=[ + target( + expr=expr_sum_rate( + "tikv_thread_cpu_seconds_total", + label_selectors=['name=~"(raftstore|rs)_.*"'], + ), + ), + ], + ), + graph_panel( + title="Async apply CPU", + description="The CPU utilization of async apply", + yaxes=yaxes(left_format=UNITS.PERCENT_UNIT), + targets=[ + target( + expr=expr_sum_rate( + "tikv_thread_cpu_seconds_total", + label_selectors=['name=~"apply_[0-9]+"'], + ), 
+ ), + ], + ), + ] + ) + layout.row( + [ + graph_panel( + title="Store writer CPU", + description="The CPU utilization of store writer thread", + yaxes=yaxes(left_format=UNITS.PERCENT_UNIT), + targets=[ + target( + expr=expr_sum_rate( + "tikv_thread_cpu_seconds_total", + label_selectors=['name=~"store_write.*"'], + ), + ), + ], + thresholds=[GraphThreshold(value=0.8)], + ), + graph_panel( + title="gRPC poll CPU", + description="The CPU utilization of gRPC", + yaxes=yaxes(left_format=UNITS.PERCENT_UNIT), + targets=[ + target( + expr=expr_sum_rate( + "tikv_thread_cpu_seconds_total", + label_selectors=['name=~"grpc.*"'], + ), + ), + ], + ), + ] + ) + layout.row( + [ + graph_panel( + title="Scheduler worker CPU", + description="The CPU utilization of scheduler worker", + yaxes=yaxes(left_format=UNITS.PERCENT_UNIT), + targets=[ + target( + expr=expr_sum_rate( + "tikv_thread_cpu_seconds_total", + label_selectors=['name=~"sched_.*"'], + ), + ), + ], + thresholds=[GraphThreshold(value=3.6)], + ), + graph_panel( + title="Storage ReadPool CPU", + description="The CPU utilization of readpool", + yaxes=yaxes(left_format=UNITS.PERCENT_UNIT), + targets=[ + target( + expr=expr_sum_rate( + "tikv_thread_cpu_seconds_total", + label_selectors=['name=~"store_read_norm.*"'], + ), + legend_format="{{instance}}-normal", + ), + target( + expr=expr_sum_rate( + "tikv_thread_cpu_seconds_total", + label_selectors=['name=~"store_read_high.*"'], + ), + legend_format="{{instance}}-high", + ), + target( + expr=expr_sum_rate( + "tikv_thread_cpu_seconds_total", + label_selectors=['name=~"store_read_low.*"'], + ), + legend_format="{{instance}}-low", + ), + ], + thresholds=[GraphThreshold(value=3.6)], + ), + ] + ) + layout.row( + [ + graph_panel( + title="Unified read pool CPU", + description="The CPU utilization of the unified read pool", + yaxes=yaxes(left_format=UNITS.PERCENT_UNIT), + targets=[ + target( + expr=expr_sum_rate( + "tikv_thread_cpu_seconds_total", + 
label_selectors=['name=~"unified_read_po.*"'], + ), + ), + ], + thresholds=[GraphThreshold(value=7.2)], + ), + graph_panel( + title="RocksDB CPU", + description="The CPU utilization of RocksDB", + yaxes=yaxes(left_format=UNITS.PERCENT_UNIT), + targets=[ + target( + expr=expr_sum_rate( + "tikv_thread_cpu_seconds_total", + label_selectors=['name=~"rocksdb.*"'], + ), + ), + ], + ), + ] + ) + layout.row( + [ + graph_panel( + title="Coprocessor CPU", + description="The CPU utilization of coprocessor", + yaxes=yaxes(left_format=UNITS.PERCENT_UNIT), + targets=[ + target( + expr=expr_sum_rate( + "tikv_thread_cpu_seconds_total", + label_selectors=['name=~"cop_normal.*"'], + ), + legend_format="{{instance}}-normal", + ), + target( + expr=expr_sum_rate( + "tikv_thread_cpu_seconds_total", + label_selectors=['name=~"cop_high.*"'], + ), + legend_format="{{instance}}-high", + ), + target( + expr=expr_sum_rate( + "tikv_thread_cpu_seconds_total", + label_selectors=['name=~"cop_low.*"'], + ), + legend_format="{{instance}}-low", + ), + ], + thresholds=[GraphThreshold(value=7.2)], + ), + graph_panel( + title="GC worker CPU", + yaxes=yaxes(left_format=UNITS.PERCENT_UNIT), + targets=[ + target( + expr=expr_sum_rate( + "tikv_thread_cpu_seconds_total", + label_selectors=['name=~"gc_worker.*"'], + ), + ), + ], + ), + ] + ) + layout.row( + [ + graph_panel( + title="Background Worker CPU", + yaxes=yaxes(left_format=UNITS.PERCENT_UNIT), + targets=[ + target( + expr=expr_sum_rate( + "tikv_thread_cpu_seconds_total", + label_selectors=['name=~"background.*"'], + ), + ), + ], + ), + graph_panel( + title="Raftlog fetch Worker CPU", + yaxes=yaxes(left_format=UNITS.PERCENT_UNIT), + targets=[ + target( + expr=expr_sum_rate( + "tikv_thread_cpu_seconds_total", + label_selectors=['name=~"raftlog_fetch.*"'], + ), + ), + ], + ), + ] + ) + layout.row( + [ + graph_panel( + title="Import CPU", + yaxes=yaxes(left_format=UNITS.PERCENT_UNIT), + targets=[ + target( + expr=expr_sum_rate( + 
"tikv_thread_cpu_seconds_total", + label_selectors=['name=~"sst_.*"'], + ), + ), + ], + ), + graph_panel( + title="Backup CPU", + yaxes=yaxes(left_format=UNITS.PERCENT_UNIT), + targets=[ + target( + expr=expr_sum_rate( + "tikv_thread_cpu_seconds_total", + label_selectors=[ + 'name=~"(backup-worker|bkwkr|backup_endpoint).*"' + ], + ), + ), + ], + ), + ] + ) + layout.row( + [ + graph_panel( + title="CDC worker CPU", + yaxes=yaxes(left_format=UNITS.PERCENT_UNIT), + targets=[ + target( + expr=expr_sum_rate( + "tikv_thread_cpu_seconds_total", + label_selectors=['name=~"cdcwkr.*"'], + ), + legend_format="{{instance}}-worker", + ), + target( + expr=expr_sum_rate( + "tikv_thread_cpu_seconds_total", + label_selectors=['name=~"tso"'], + ), + legend_format="{{instance}}-tso", + ), + target( + expr=expr_sum_rate( + "tikv_thread_cpu_seconds_total", + label_selectors=['name=~"cdc_.*"'], + ), + legend_format="{{instance}}-endpoint", + ), + ], + ), + graph_panel( + title="TSO Worker CPU", + description="The CPU utilization of raftstore thread", + yaxes=yaxes(left_format=UNITS.PERCENT_UNIT), + targets=[ + target( + expr=expr_sum_rate( + "tikv_thread_cpu_seconds_total", + label_selectors=['name=~"tso_worker"'], + ), + ), + ], + ), + ] + ) + return layout.row_panel + + +def TTL() -> RowPanel: + layout = Layout(title="TTL") + layout.row( + [ + graph_panel( + title="TTL expire count", + targets=[ + target( + expr=expr_sum_rate( + "tikv_ttl_expire_kv_count_total", + ), + legend_format="{{instance}}", + ), + ], + ), + graph_panel( + title="TTL expire size", + yaxes=yaxes(left_format=UNITS.BYTES_IEC), + targets=[ + target( + expr=expr_sum_rate( + "tikv_ttl_expire_kv_size_total", + ) + ), + ], + ), + ] + ) + layout.row( + [ + graph_panel( + title="TTL check progress", + yaxes=yaxes(left_format=UNITS.PERCENT_UNIT), + targets=[ + target( + expr=expr_operator( + expr_sum_rate( + "tikv_ttl_checker_processed_regions", + ), + "/", + expr_sum_rate( + "tikv_raftstore_region_count", + 
label_selectors=['type="region"'], + ), + ), + legend_format="{{instance}}", + ), + ], + ), + graph_panel( + title="TTL checker actions", + yaxes=yaxes(left_format=UNITS.OPS_PER_SEC), + targets=[ + target( + expr=expr_sum_rate( + "tikv_ttl_checker_actions", by_labels=["type"] + ) + ), + ], + ), + ] + ) + layout.row( + [ + graph_panel_histogram_quantiles( + title="TTL checker compact duration", + description="The time consumed when executing GC tasks", + yaxes=yaxes(left_format=UNITS.SECONDS), + metric="tikv_ttl_checker_compact_duration", + ), + stat_panel( + title="TTL checker poll interval", + format=UNITS.MILLI_SECONDS, + targets=[ + target( + expr=expr_max( + "tikv_ttl_checker_poll_interval", + label_selectors=['type="tikv_gc_run_interval"'], + by_labels=[], # override default by instance. + ), + ), + ], + ), + ] + ) + return layout.row_panel + + +def PD() -> RowPanel: + layout = Layout(title="PD") + layout.row( + [ + graph_panel( + title="PD requests", + description="The count of requests that TiKV sends to PD", + yaxes=yaxes(left_format=UNITS.OPS_PER_SEC), + targets=[ + target( + expr=expr_sum_rate( + "tikv_pd_request_duration_seconds_count", + by_labels=["type"], + ), + ), + ], + ), + graph_panel( + title="PD request duration (average)", + description="The time consumed by requests that TiKV sends to PD", + yaxes=yaxes(left_format=UNITS.SECONDS), + targets=[ + target( + expr=expr_histogram_avg( + "tikv_pd_request_duration_seconds", + by_labels=["type"], + ), + legend_format="{{type}}", + ), + ], + ), + ] + ) + layout.row( + [ + graph_panel( + title="PD heartbeats", + description="The total number of PD heartbeat messages", + yaxes=yaxes(left_format=UNITS.OPS_PER_SEC), + targets=[ + target( + expr=expr_sum_rate( + "tikv_pd_heartbeat_message_total", + by_labels=["type"], + ), + ), + target( + expr=expr_sum( + "tikv_pd_pending_heartbeat_total", + ), + legend_format="{{instance}}-pending", + ), + ], + ), + graph_panel( + title="PD validate peers", + 
description="The total number of peers validated by the PD worker", + yaxes=yaxes(left_format=UNITS.OPS_PER_SEC), + targets=[ + target( + expr=expr_sum_rate( + "tikv_pd_validate_peer_total", + by_labels=["type"], + ), + ), + ], + ), + ] + ) + layout.row( + [ + graph_panel( + title="PD reconnection", + description="The count of reconnection between TiKV and PD", + yaxes=yaxes(left_format=UNITS.OPS_PER_MIN), + targets=[ + target( + expr=expr_sum_delta( + "tikv_pd_reconnect_total", + range_selector="$__rate_interval", + by_labels=["type"], + ), + ), + ], + ), + graph_panel( + title="PD forward status", + description="The forward status of PD client", + targets=[ + target( + expr=expr_simple( + "tikv_pd_request_forwarded", + ), + legend_format="{{instance}}-{{host}}", + ), + ], + ), + ] + ) + layout.row( + [ + graph_panel( + title="Pending TSO Requests", + description="The number of TSO requests waiting in the queue.", + yaxes=yaxes(left_format=UNITS.OPS_PER_MIN), + targets=[ + target( + expr=expr_sum( + "tikv_pd_pending_tso_request_total", + ), + ), + ], + ), + graph_panel( + title="Store Slow Score", + description="The slow score of stores", + targets=[ + target( + expr=expr_sum( + "tikv_raftstore_slow_score", + ), + ), + ], + ), + ] + ) + layout.row( + [ + graph_panel( + title="Inspected duration per server", + description="The duration that recorded by inspecting messages.", + yaxes=yaxes(left_format=UNITS.SECONDS), + targets=[ + target( + expr=expr_histogram_quantile( + 0.99, + "tikv_raftstore_inspect_duration_seconds", + by_labels=["instance", "type"], + ), + legend_format="{{instance}}-{{type}}", + ), + ], + ) + ] + ) + return layout.row_panel + + +def IOBreakdown() -> RowPanel: + layout = Layout(title="IO Breakdown") + layout.row( + [ + graph_panel( + title="Write IO bytes", + description="The throughput of disk write per IO type", + yaxes=yaxes(left_format=UNITS.BYTES_SEC_IEC), + targets=[ + target( + expr=expr_sum_rate( + "tikv_io_bytes", + 
label_selectors=['op="write"'], + by_labels=["type"], + ), + ), + target( + expr=expr_sum_rate( + "tikv_io_bytes", + label_selectors=['op="write"'], + by_labels=[], # override default by instance. + ), + legend_format="total", + ), + ], + ), + graph_panel( + title="Read IO bytes", + description="The throughput of disk read per IO type", + yaxes=yaxes(left_format=UNITS.BYTES_SEC_IEC), + targets=[ + target( + expr=expr_sum_rate( + "tikv_io_bytes", + label_selectors=['op="read"'], + by_labels=["type"], + ), + ), + target( + expr=expr_sum_rate( + "tikv_io_bytes", + label_selectors=['op="read"'], + by_labels=[], # override default by instance. + ), + legend_format="total", + ), + ], + ), + ] + ) + layout.row( + [ + graph_panel( + title="IO threshold", + description="The threshold of disk IOs per priority", + yaxes=yaxes(left_format=UNITS.BYTES_SEC_IEC), + targets=[ + target( + expr=expr_avg( + "tikv_rate_limiter_max_bytes_per_sec", + by_labels=["type"], + ), + ), + ], + ), + graph_panel( + title="Rate Limiter Request Wait Duration", + description="IO rate limiter request wait duration.", + yaxes=yaxes(left_format=UNITS.SECONDS), + targets=[ + target( + expr=expr_histogram_quantile( + 0.99, + "tikv_rate_limiter_request_wait_duration_seconds", + by_labels=["type"], + ), + legend_format=r"{{type}}-99%", + ), + target( + expr=expr_histogram_avg( + "tikv_rate_limiter_request_wait_duration_seconds", + by_labels=[], # override default by instance. 
+ ), + legend_format="avg", + ), + ], + ), + ] + ) + return layout.row_panel + + +def RaftWaterfall() -> RowPanel: + layout = Layout(title="Raft Waterfall") + layout.row( + [ + graph_panel_histogram_quantiles( + title="Storage async write duration", + description="The time consumed by processing asynchronous write requests", + yaxes=yaxes(left_format=UNITS.SECONDS, right_format=UNITS.NONE_FORMAT), + metric="tikv_storage_engine_async_request_duration_seconds", + label_selectors=['type="write"'], + ), + ] + ) + layout.row( + [ + graph_panel_histogram_quantiles( + title="Store duration", + description="The store time duration of each request", + yaxes=yaxes(left_format=UNITS.SECONDS), + metric="tikv_raftstore_store_duration_secs", + ), + graph_panel_histogram_quantiles( + title="Apply duration", + description="The apply time duration of each request", + yaxes=yaxes(left_format=UNITS.SECONDS), + metric="tikv_raftstore_apply_duration_secs", + ), + ] + ) + layout.row( + [ + graph_panel_histogram_quantiles( + title="Store propose wait duration", + description="The propose wait time duration of each request", + yaxes=yaxes(left_format=UNITS.SECONDS), + metric="tikv_raftstore_request_wait_time_duration_secs", + ), + graph_panel_histogram_quantiles( + title="Store batch wait duration", + description="The batch wait time duration of each request", + yaxes=yaxes(left_format=UNITS.SECONDS), + metric="tikv_raftstore_store_wf_batch_wait_duration_seconds", + ), + ] + ) + layout.row( + [ + graph_panel_histogram_quantiles( + title="Store send to write queue duration", + description="The send-to-write-queue time duration of each request", + yaxes=yaxes(left_format=UNITS.SECONDS), + metric="tikv_raftstore_store_wf_send_to_queue_duration_seconds", + ), + graph_panel_histogram_quantiles( + title="Store send proposal duration", + description="The send raft message of the proposal duration of each request", + yaxes=yaxes(left_format=UNITS.SECONDS), + 
metric="tikv_raftstore_store_wf_send_proposal_duration_seconds", + ), + ] + ) + layout.row( + [ + graph_panel_histogram_quantiles( + title="Store write kv db end duration", + description="The write kv db end duration of each request", + yaxes=yaxes(left_format=UNITS.SECONDS), + metric="tikv_raftstore_store_wf_write_kvdb_end_duration_seconds", + ), + graph_panel_histogram_quantiles( + title="Store before write duration", + description="The before write time duration of each request", + yaxes=yaxes(left_format=UNITS.SECONDS), + metric="tikv_raftstore_store_wf_before_write_duration_seconds", + ), + ] + ) + layout.row( + [ + graph_panel_histogram_quantiles( + title="Store persist duration", + description="The persist duration of each request", + yaxes=yaxes(left_format=UNITS.SECONDS), + metric="tikv_raftstore_store_wf_persist_duration_seconds", + ), + graph_panel_histogram_quantiles( + title="Store write end duration", + description="The write end duration of each request", + yaxes=yaxes(left_format=UNITS.SECONDS), + metric="tikv_raftstore_store_wf_write_end_duration_seconds", + ), + ] + ) + layout.row( + [ + graph_panel_histogram_quantiles( + title="Store commit but not persist duration", + description="The commit but not persist duration of each request", + yaxes=yaxes(left_format=UNITS.SECONDS), + metric="tikv_raftstore_store_wf_commit_not_persist_log_duration_seconds", + ), + graph_panel_histogram_quantiles( + title="Store commit and persist duration", + description="The commit and persist duration of each request", + yaxes=yaxes(left_format=UNITS.SECONDS), + metric="tikv_raftstore_store_wf_commit_log_duration_seconds", + ), + ] + ) + return layout.row_panel + + +def RaftIO() -> RowPanel: + layout = Layout(title="Raft IO") + layout.row( + heatmap_panel_graph_panel_histogram_quantile_pairs( + heatmap_title="Process ready duration", + heatmap_description="The time consumed for peer processes to be ready in Raft", + graph_title="99% Process ready duration per server", 
+ graph_description="The time consumed for peer processes to be ready in Raft", + yaxis_format=UNITS.SECONDS, + metric="tikv_raftstore_raft_process_duration_secs", + label_selectors=['type="ready"'], + ) + ) + layout.row( + heatmap_panel_graph_panel_histogram_quantile_pairs( + heatmap_title="Store write loop duration", + heatmap_description="The time duration of store write loop when store-io-pool-size is not zero.", + graph_title="99% Store write loop duration per server", + graph_description="The time duration of store write loop on each TiKV instance when store-io-pool-size is not zero.", + yaxis_format=UNITS.SECONDS, + metric="tikv_raftstore_store_write_loop_duration_seconds", + ) + ) + layout.row( + heatmap_panel_graph_panel_histogram_quantile_pairs( + heatmap_title="Append log duration", + heatmap_description="The time consumed when Raft appends log", + graph_title="99% Commit log duration per server", + graph_description="The time consumed when Raft commits log on each TiKV instance", + yaxis_format=UNITS.SECONDS, + metric="tikv_raftstore_append_log_duration_seconds", + ) + ) + layout.row( + heatmap_panel_graph_panel_histogram_quantile_pairs( + heatmap_title="Commit log duration", + heatmap_description="The time consumed when Raft commits log", + graph_title="99% Commit log duration per server", + graph_description="The time consumed when Raft commits log on each TiKV instance", + yaxis_format=UNITS.SECONDS, + metric="tikv_raftstore_commit_log_duration_seconds", + ) + ) + layout.row( + heatmap_panel_graph_panel_histogram_quantile_pairs( + heatmap_title="Apply log duration", + heatmap_description="The time consumed when Raft applies log", + graph_title="99% Apply log duration per server", + graph_description="The time consumed for Raft to apply logs per TiKV instance", + yaxis_format=UNITS.SECONDS, + metric="tikv_raftstore_apply_log_duration_seconds", + ) + ) + layout.row( + heatmap_panel_graph_panel_histogram_quantile_pairs( + heatmap_title="Raft Client Wait 
Connection Ready Duration", + heatmap_description="The time consumed for Raft Client wait connection ready", + graph_title="99% Raft Client Wait Connection Ready Duration", + graph_description="The time consumed for Raft Client wait connection ready per TiKV instance", + yaxis_format=UNITS.SECONDS, + metric="tikv_server_raft_client_wait_ready_duration", + graph_by_labels=["to"], + ) + ) + layout.row( + [ + graph_panel( + title="Store io task reschedule", + description="The throughput of disk write per IO type", + targets=[ + target( + expr=expr_sum( + "tikv_raftstore_io_reschedule_region_total", + ), + legend_format="rechedule-{{instance}}", + ), + target( + expr=expr_sum( + "tikv_raftstore_io_reschedule_pending_tasks_total", + ), + legend_format="pending-task-{{instance}}", + ), + ], + ), + graph_panel( + title="99% Write task block duration per server", + description="The time consumed when store write task block on each TiKV instance", + yaxes=yaxes(left_format=UNITS.SECONDS), + targets=[ + target( + expr=expr_histogram_quantile( + 0.99, + "tikv_raftstore_store_write_msg_block_wait_duration_seconds", + by_labels=["instance"], + ), + legend_format="{{instance}}", + ), + ], + ), + ] + ) + return layout.row_panel + + +def RaftPropose() -> RowPanel: + layout = Layout(title="Raft Propose") + layout.row( + [ + graph_panel( + title="Raft proposals per ready", + description="The proposal count of a Regions in a tick", + targets=[ + target( + expr=expr_histogram_quantile( + 0.99, + "tikv_raftstore_apply_proposal", + by_labels=["instance"], + ), + legend_format="{{instance}}", + ), + ], + ), + graph_panel( + title="Raft read/write proposals", + description="The number of proposals per type", + yaxes=yaxes(left_format=UNITS.OPS_PER_SEC), + targets=[ + target( + expr=expr_sum_rate( + "tikv_raftstore_proposal_total", + label_selectors=['type=~"local_read|normal|read_index"'], + by_labels=["type"], + ), + ), + ], + ), + ] + ) + layout.row( + [ + graph_panel( + title="Raft 
read proposals per server", + description="The number of read proposals which are made by each TiKV instance", + yaxes=yaxes(left_format=UNITS.OPS_PER_SEC), + targets=[ + target( + expr=expr_sum_rate( + "tikv_raftstore_proposal_total", + label_selectors=['type=~"local_read|read_index"'], + ), + ), + ], + ), + graph_panel( + title="Raft write proposals per server", + description="The number of write proposals which are made by each TiKV instance", + yaxes=yaxes(left_format=UNITS.OPS_PER_SEC), + targets=[ + target( + expr=expr_sum_rate( + "tikv_raftstore_proposal_total", + label_selectors=['type=~"normal"'], + ), + ), + ], + ), + ] + ) + layout.row( + heatmap_panel_graph_panel_histogram_quantile_pairs( + heatmap_title="Propose wait duration", + heatmap_description="The wait time of each proposal", + graph_title="99% Propose wait duration per server", + graph_description="The wait time of each proposal in each TiKV instance", + yaxis_format=UNITS.SECONDS, + metric="tikv_raftstore_request_wait_time_duration_secs", + ) + ) + layout.row( + heatmap_panel_graph_panel_histogram_quantile_pairs( + heatmap_title="Store write wait duration", + heatmap_description="The wait time of each store write task", + graph_title="99% Store write wait duration per server", + graph_description="The wait time of each store write task in each TiKV instance", + yaxis_format=UNITS.SECONDS, + metric="tikv_raftstore_store_write_task_wait_duration_secs", + ) + ) + layout.row( + heatmap_panel_graph_panel_histogram_quantile_pairs( + heatmap_title="Apply wait duration", + heatmap_description="The wait time of each apply task", + graph_title="99% Apply wait duration per server", + graph_description="The wait time of each apply task in each TiKV instance", + yaxis_format=UNITS.SECONDS, + metric="tikv_raftstore_apply_wait_time_duration_secs", + ) + ) + layout.row( + [ + heatmap_panel( + title="Store write handle msg duration", + description="The handle duration of each store write task msg", + 
yaxis=yaxis(format=UNITS.SECONDS), + metric="tikv_raftstore_store_write_handle_msg_duration_secs_bucket", + ), + heatmap_panel( + title="Store write trigger size", + description="The distribution of write trigger size", + yaxis=yaxis(format=UNITS.BYTES_IEC), + metric="tikv_raftstore_store_write_trigger_wb_bytes_bucket", + ), + ] + ) + layout.row( + [ + graph_panel( + title="Raft propose speed", + description="The rate at which peers propose logs", + yaxes=yaxes(left_format=UNITS.BYTES_SEC_IEC), + targets=[ + target( + expr=expr_sum_rate( + "tikv_raftstore_propose_log_size_sum", + ), + ), + ], + ), + graph_panel( + title="Perf Context duration", + description="The rate at which peers propose logs", + yaxes=yaxes(left_format=UNITS.SECONDS), + targets=[ + target( + expr=expr_histogram_quantile( + 0.99, + "tikv_raftstore_store_perf_context_time_duration_secs", + by_labels=["type"], + ), + legend_format="store-{{type}}", + ), + target( + expr=expr_histogram_quantile( + 0.99, + "tikv_raftstore_apply_perf_context_time_duration_secs", + by_labels=["type"], + ), + legend_format="apply-{{type}}", + ), + ], + ), + ] + ) + return layout.row_panel + + +def RaftProcess() -> RowPanel: + layout = Layout(title="Raft Process") + layout.row( + [ + graph_panel( + title="Ready handled", + description="The count of different ready type of Raft", + targets=[ + target( + expr=expr_sum_rate( + "tikv_raftstore_raft_ready_handled_total", + by_labels=["type"], + ), + ), + target( + expr=expr_sum_rate( + "tikv_raftstore_raft_process_duration_secs_count", + label_selectors=['type="ready"'], + by_labels=[], # overwrite default by instance. 
+ ), + legend_format="count", + ), + ], + ), + graph_panel( + title="Max duration of raft store events", + description="The max time consumed by raftstore events", + yaxes=yaxes(left_format=UNITS.SECONDS), + targets=[ + target( + expr=expr_histogram_quantile( + 0.999999, + "tikv_raftstore_event_duration", + by_labels=["type"], + ), + legend_format="{{type}}", + ), + target( + expr=expr_histogram_quantile( + 0.999999, + "tikv_broadcast_normal_duration_seconds", + ), + legend_format="broadcast_normal", + ), + ], + ), + ] + ) + layout.row( + [ + heatmap_panel( + title="Replica read lock checking duration", + description="Replica read lock checking duration", + yaxis=yaxis(format=UNITS.SECONDS), + metric="tikv_replica_read_lock_check_duration_seconds_bucket", + ), + heatmap_panel( + title="Peer msg length distribution", + description="The length of peer msgs for each round handling", + metric="tikv_raftstore_peer_msg_len_bucket", + ), + ] + ) + return layout.row_panel + + +def RaftMessage() -> RowPanel: + layout = Layout(title="Raft Message") + layout.row( + [ + graph_panel( + title="Sent messages per server", + description="The number of Raft messages sent by each TiKV instance", + yaxes=yaxes(left_format=UNITS.OPS_PER_SEC), + targets=[ + target( + expr=expr_sum_rate( + "tikv_raftstore_raft_sent_message_total", + ), + ), + ], + ), + graph_panel( + title="Flush messages per server", + description="The number of Raft messages flushed by each TiKV instance", + yaxes=yaxes(left_format=UNITS.OPS_PER_SEC), + targets=[ + target( + expr=expr_sum_rate( + "tikv_server_raft_message_flush_total", + by_labels=["instance", "reason"], + ), + ), + ], + ), + ] + ) + layout.row( + [ + graph_panel( + title="Receive messages per server", + description="The number of Raft messages received by each TiKV instance", + yaxes=yaxes(left_format=UNITS.OPS_PER_SEC), + targets=[ + target( + expr=expr_sum_rate( + "tikv_server_raft_message_recv_total", + ), + ), + ], + ), + graph_panel( + 
title="Messages", + description="The number of different types of Raft messages that are sent", + yaxes=yaxes(left_format=UNITS.OPS_PER_SEC), + targets=[ + target( + expr=expr_sum_rate( + "tikv_raftstore_raft_sent_message_total", + by_labels=["type"], + ), + ), + ], + ), + ] + ) + layout.row( + [ + graph_panel( + title="Vote", + description="The total number of vote messages that are sent in Raft", + yaxes=yaxes(left_format=UNITS.OPS_PER_SEC), + targets=[ + target( + expr=expr_sum_rate( + "tikv_raftstore_raft_sent_message_total", + label_selectors=['type="vote"'], + ), + ), + ], + ), + graph_panel( + title="Raft dropped messages", + description="The number of dropped Raft messages per type", + yaxes=yaxes(left_format=UNITS.OPS_PER_SEC), + targets=[ + target( + expr=expr_sum_rate( + "tikv_raftstore_raft_dropped_message_total", + by_labels=["type"], + ), + ), + ], + ), + ] + ) + return layout.row_panel + + +def RaftAdmin() -> RowPanel: + layout = Layout(title="Raft Admin") + layout.row( + [ + graph_panel( + title="Admin proposals", + description="The number of admin proposals", + yaxes=yaxes(left_format=UNITS.OPS_PER_SEC), + targets=[ + target( + expr=expr_sum_rate( + "tikv_raftstore_proposal_total", + label_selectors=['type=~"conf_change|transfer_leader"'], + by_labels=["type"], + ), + ), + ], + ), + graph_panel( + title="Admin apply", + description="The number of the processed apply command", + yaxes=yaxes(left_format=UNITS.OPS_PER_SEC), + targets=[ + target( + expr=expr_sum_rate( + "tikv_raftstore_admin_cmd_total", + label_selectors=['type!="compact"'], + by_labels=["type"], + ), + ), + ], + ), + ] + ) + layout.row( + [ + graph_panel( + title="Check split", + description="The number of raftstore split checks", + yaxes=yaxes(left_format=UNITS.OPS_PER_SEC), + targets=[ + target( + expr=expr_sum_rate( + "tikv_raftstore_check_split_total", + label_selectors=['type!="ignore"'], + by_labels=["type"], + ), + ), + ], + ), + graph_panel( + title="99.99% Check split 
def RaftLog() -> RowPanel:
    """Build the "Raft Log" row of the dashboard.

    Panels, in order: GC write / kv-sync durations, GC write / seek operation
    rates, log lag and skipped GC, failed GC and entry fetches, and finally
    the async fetch task duration panel.

    Returns:
        The ``row_panel`` of the populated ``Layout``.
    """
    layout = Layout(title="Raft Log")
    layout.row(
        [
            graph_panel(
                title="Raft log GC write duration",
                yaxes=yaxes(left_format=UNITS.SECONDS, log_base=10),
                targets=[
                    target(
                        expr=expr_histogram_quantile(
                            0.9999,
                            "tikv_raftstore_raft_log_gc_write_duration_secs",
                            by_labels=["instance"],
                        ),
                        legend_format="99.99%-{{instance}}",
                    ),
                    target(
                        expr=expr_histogram_avg(
                            "tikv_raftstore_raft_log_gc_write_duration_secs",
                            by_labels=["instance"],
                        ),
                        legend_format="avg-{{instance}}",
                    ),
                ],
            ),
            graph_panel(
                title="Raft log GC kv sync duration",
                yaxes=yaxes(left_format=UNITS.SECONDS, log_base=10),
                targets=[
                    target(
                        expr=expr_histogram_quantile(
                            0.9999,
                            "tikv_raftstore_raft_log_kv_sync_duration_secs",
                            by_labels=["instance"],
                        ),
                        legend_format="99.99%-{{instance}}",
                    ),
                    target(
                        expr=expr_histogram_avg(
                            "tikv_raftstore_raft_log_kv_sync_duration_secs",
                            by_labels=["instance"],
                        ),
                        legend_format="avg-{{instance}}",
                    ),
                ],
            ),
        ]
    )
    layout.row(
        [
            graph_panel(
                title="Raft log GC write operations",
                yaxes=yaxes(left_format=UNITS.OPS_PER_SEC),
                targets=[
                    target(
                        expr=expr_sum_rate(
                            "tikv_raftstore_raft_log_gc_write_duration_secs_count",
                        ),
                    ),
                ],
            ),
            graph_panel(
                title="Raft log GC seek operations ",
                yaxes=yaxes(left_format=UNITS.OPS_PER_SEC),
                targets=[
                    target(
                        expr=expr_sum_rate(
                            "tikv_raftstore_raft_log_gc_seek_operations_count",
                        ),
                    ),
                ],
            ),
        ]
    )
    layout.row(
        [
            graph_panel(
                title="Raft log lag",
                targets=[
                    target(
                        expr=expr_sum_rate(
                            "tikv_raftstore_log_lag_sum",
                        ),
                    ),
                ],
            ),
            graph_panel(
                title="Raft log gc skipped",
                targets=[
                    target(
                        expr=expr_sum_rate(
                            "tikv_raftstore_raft_log_gc_skipped",
                            by_labels=["instance", "reason"],
                        ),
                    ),
                ],
            ),
        ]
    )
    layout.row(
        [
            graph_panel(
                title="Raft log GC failed",
                yaxes=yaxes(left_format=UNITS.OPS_PER_SEC),
                targets=[
                    target(
                        expr=expr_sum_rate(
                            "tikv_raftstore_raft_log_gc_failed",
                        ),
                    ),
                ],
            ),
            graph_panel(
                title="Raft log fetch ",
                yaxes=yaxes(left_format=UNITS.OPS_PER_SEC),
                targets=[
                    target(
                        expr=expr_sum_rate(
                            "tikv_raftstore_entry_fetches",
                            by_labels=["type"],
                        ),
                    ),
                ],
            ),
        ]
    )
    layout.row(
        [
            graph_panel(
                title="Raft log async fetch task duration",
                yaxes=yaxes(left_format=UNITS.SECONDS, log_base=10),
                targets=[
                    target(
                        expr=expr_histogram_quantile(
                            0.9999,
                            "tikv_raftstore_entry_fetches_task_duration_seconds",
                        ),
                        legend_format="99.99%",
                    ),
                    target(
                        expr=expr_histogram_avg(
                            "tikv_raftstore_entry_fetches_task_duration_seconds",
                            by_labels=["instance"],
                        ),
                        legend_format="avg-{{instance}}",
                    ),
                    target(
                        expr=expr_sum(
                            "tikv_worker_pending_task_total",
                            label_selectors=['name=~"raftlog-fetch-worker"'],
                        ),
                        legend_format="pending-task",
                    ),
                ],
                series_overrides=[
                    # The pending-task series is moved to the second y-axis.
                    series_override(
                        alias="/pending-task/",
                        yaxis=2,
                        transform_negative_y=True,
                    ),
                ],
            ),
        ]
    )
    return layout.row_panel


def LocalReader() -> RowPanel:
    """Build the "Local Reader" row of the dashboard.

    The row contains one graph panel with three ``expr_sum_rate`` series:
    rejected local reads (by instance and reason), executed requests and
    executed stale-read requests.

    Returns:
        The ``row_panel`` of the populated ``Layout``.
    """
    layout = Layout(title="Local Reader")
    layout.row(
        [
            graph_panel(
                # This panel used to carry the title of the last RaftLog
                # panel ("Raft log async fetch task duration") although its
                # series are local-read request rates.
                title="Local reader requests",
                targets=[
                    target(
                        expr=expr_sum_rate(
                            "tikv_raftstore_local_read_reject_total",
                            by_labels=["instance", "reason"],
                        ),
                        legend_format="{{instance}}-reject-by-{{reason}}",
                    ),
                    target(
                        expr=expr_sum_rate(
                            "tikv_raftstore_local_read_executed_requests",
                        ),
                        legend_format="{{instance}}-total",
                    ),
                    target(
                        expr=expr_sum_rate(
                            "tikv_raftstore_local_read_executed_stale_read_requests",
                        ),
                        legend_format="{{instance}}-stale-read",
                    ),
                ],
                series_overrides=[
                    # Series whose legend ends in "-total" go to the second y-axis.
                    series_override(
                        alias="/.*-total/",
                        yaxis=2,
                    ),
                ],
            ),
        ]
    )
    return layout.row_panel
def Storage() -> RowPanel:
    """Build the "Storage" row of the dashboard.

    Layout, top to bottom:
      * command rate and async request error rate;
      * five heatmap/quantile pairs over
        ``tikv_storage_engine_async_request_duration_seconds``, one per
        ``type`` label value;
      * process CPU usage next to the full compaction duration;
      * full compaction pause and per-increment durations.

    Returns:
        The ``row_panel`` of the populated ``Layout``.
    """
    layout = Layout(title="Storage")

    command_total = graph_panel(
        title="Storage command total",
        description="The total count of different kinds of commands received",
        yaxes=yaxes(left_format=UNITS.OPS_PER_SEC, log_base=10),
        targets=[
            target(
                expr=expr_sum_rate(
                    "tikv_storage_command_total",
                    by_labels=["type"],
                ),
            ),
        ],
    )
    async_request_errors = graph_panel(
        title="Storage async request error",
        description="The total number of engine asynchronous request errors",
        targets=[
            target(
                expr=expr_sum_rate(
                    "tikv_storage_engine_async_request_total",
                    label_selectors=['status!~"all|success"'],
                    by_labels=["status"],
                ),
            ),
        ],
    )
    layout.row([command_total, async_request_errors])

    # (shared title, heatmap description, graph description, `type` label value)
    async_duration_rows = (
        (
            "Storage async write duration",
            "The time consumed by processing asynchronous write requests",
            "The storage async write duration",
            "write",
        ),
        (
            "Storage async snapshot duration",
            "The time consumed by processing asynchronous snapshot requests",
            "The storage async snapshot duration",
            "snapshot",
        ),
        (
            "Storage async snapshot duration (pure local read)",
            "The storage async snapshot duration without the involving of raftstore",
            "The storage async snapshot duration without the involving of raftstore",
            "snapshot_local_read",
        ),
        (
            "Read index propose wait duration",
            "Read index propose wait duration associated with async snapshot",
            "Read index propose wait duration associated with async snapshot",
            "snapshot_read_index_propose_wait",
        ),
        (
            "Read index confirm duration",
            "Read index confirm duration associated with async snapshot",
            "Read index confirm duration associated with async snapshot",
            "snapshot_read_index_confirm",
        ),
    )
    for shared_title, heatmap_text, graph_text, request_type in async_duration_rows:
        layout.row(
            heatmap_panel_graph_panel_histogram_quantile_pairs(
                heatmap_title=shared_title,
                heatmap_description=heatmap_text,
                graph_title=shared_title,
                graph_description=graph_text,
                yaxis_format=UNITS.SECONDS,
                metric="tikv_storage_engine_async_request_duration_seconds",
                label_selectors=['type="%s"' % request_type],
            ),
        )

    cpu_usage = graph_panel(
        title="Process Stat Cpu Usage",
        description="CPU usage measured over a 30 second window",
        yaxes=yaxes(left_format=UNITS.PERCENT_UNIT),
        targets=[
            target(
                expr=expr_sum(
                    "tikv_storage_process_stat_cpu_usage",
                ),
            ),
        ],
    )
    full_compact_duration = graph_panel_histogram_quantiles(
        title="Full compaction duration seconds",
        description="",
        yaxes=yaxes(left_format=UNITS.SECONDS),
        metric="tikv_storage_full_compact_duration_seconds",
        hide_count=True,
    )
    layout.row([cpu_usage, full_compact_duration])

    full_compact_pause = graph_panel_histogram_quantiles(
        title="Full compaction pause duration",
        description="",
        yaxes=yaxes(left_format=UNITS.SECONDS),
        metric="tikv_storage_full_compact_pause_duration_seconds",
        hide_count=True,
    )
    full_compact_increment = graph_panel_histogram_quantiles(
        title="Full compaction per-increment duration",
        description="",
        yaxes=yaxes(left_format=UNITS.SECONDS),
        metric="tikv_storage_full_compact_increment_duration_seconds",
        hide_count=True,
    )
    layout.row([full_compact_pause, full_compact_increment])

    return layout.row_panel
"tikv_scheduler_l0_flow", + ), + legend_format="total_l0_flow-{{instance}}", + ), + target( + expr=expr_sum( + "tikv_scheduler_flush_flow", + ), + legend_format="total_flush_flow-{{instance}}", + ), + ], + ), + ] + ) + layout.row( + [ + graph_panel( + title="Flow controller factors", + description="", + targets=[ + target( + expr=expr_max( + "tikv_scheduler_l0", + ), + legend_format="l0-{{instance}}", + ), + target( + expr=expr_max( + "tikv_scheduler_memtable", + ), + legend_format="memtable-{{instance}}", + ), + target( + expr=expr_max( + "tikv_scheduler_l0_avg", + ), + legend_format="avg_l0-{{instance}}", + ), + ], + ), + graph_panel( + title="Compaction pending bytes", + description="", + yaxes=yaxes(left_format=UNITS.BYTES_IEC), + targets=[ + target( + expr=expr_sum( + "tikv_engine_pending_compaction_bytes", + label_selectors=['db="kv"'], + by_labels=["cf"], + ), + ), + target( + expr=expr_sum( + "tikv_scheduler_pending_compaction_bytes", + by_labels=["cf"], + ).extra(" / 10000000"), + legend_format="pending-bytes-{{instance}}", + ), + ], + ), + ] + ) + layout.row( + [ + graph_panel( + title="Txn command throttled duration", + description="Throttle time for txn storage commands in 1 minute.", + yaxes=yaxes(left_format=UNITS.MICRO_SECONDS), + targets=[ + target( + expr=expr_sum_rate( + "tikv_txn_command_throttle_time_total", + by_labels=["type"], + ), + ), + ], + ), + graph_panel( + title="Non-txn command throttled duration", + description="Throttle time for non-txn related processing like analyze or dag in 1 minute.", + yaxes=yaxes(left_format=UNITS.MICRO_SECONDS), + targets=[ + target( + expr=expr_sum_rate( + "tikv_non_txn_command_throttle_time_total", + by_labels=["type"], + ), + ), + ], + ), + ] + ) + return layout.row_panel + + +def SchedulerCommands() -> RowPanel: + layout = Layout(title="Scheduler", repeat="command") + layout.row( + [ + graph_panel( + title="Scheduler stage total", + description="The total number of commands on each stage in commit 
command", + yaxes=yaxes(left_format=UNITS.OPS_PER_SEC), + targets=[ + target( + expr=expr_sum_rate( + "tikv_scheduler_too_busy_total", + label_selectors=['type="$command"'], + ), + legend_format="busy-{{instance}}", + ), + target( + expr=expr_sum_rate( + "tikv_scheduler_stage_total", + label_selectors=['type="$command"'], + by_labels=["stage"], + ), + ), + ], + ), + ] + ) + layout.row( + [ + graph_panel_histogram_quantiles( + title="Scheduler command duration", + description="The time consumed when executing commit command", + yaxes=yaxes(left_format=UNITS.SECONDS), + metric="tikv_scheduler_command_duration_seconds", + label_selectors=['type="$command"'], + hide_count=True, + ), + graph_panel_histogram_quantiles( + title="Scheduler latch wait duration", + description="The time which is caused by latch wait in commit command", + yaxes=yaxes(left_format=UNITS.SECONDS), + metric="tikv_scheduler_latch_wait_duration_seconds", + label_selectors=['type="$command"'], + hide_count=True, + ), + ] + ) + layout.row( + [ + graph_panel_histogram_quantiles( + title="Scheduler keys read", + description="The count of keys read by a commit command", + yaxes=yaxes(left_format=UNITS.NONE_FORMAT), + metric="tikv_scheduler_kv_command_key_read", + label_selectors=['type="$command"'], + hide_count=True, + ), + graph_panel_histogram_quantiles( + title="Scheduler keys written", + description="The count of keys written by a commit command", + yaxes=yaxes(left_format=UNITS.NONE_FORMAT), + metric="tikv_scheduler_kv_command_key_write", + label_selectors=['type="$command"'], + hide_count=True, + ), + ] + ) + layout.row( + [ + graph_panel( + title="Scheduler scan details", + description="The keys scan details of each CF when executing commit command", + yaxes=yaxes(left_format=UNITS.NONE_FORMAT), + targets=[ + target( + expr=expr_sum_rate( + "tikv_scheduler_kv_scan_details", + label_selectors=['req="$command"'], + by_labels=["tag"], + ), + ), + ], + ), + graph_panel( + title="Scheduler scan 
def Scheduler() -> RowPanel:
    """Build the "Scheduler" row of the dashboard.

    Contains stage totals and writing bytes, priority and pending command
    panels, and a heatmap of the scheduler worker pool wait duration.

    Returns:
        The ``row_panel`` of the populated ``Layout``.
    """
    layout = Layout(title="Scheduler")

    # Line 1: per-stage command rates and bytes being written.
    stage_total = graph_panel(
        title="Scheduler stage total",
        description="The total number of commands on each stage",
        yaxes=yaxes(left_format=UNITS.OPS_PER_SEC),
        targets=[
            target(
                expr=expr_sum_rate(
                    "tikv_scheduler_too_busy_total",
                    by_labels=["stage"],
                ),
            ),
            target(
                expr=expr_sum_rate(
                    "tikv_scheduler_stage_total",
                    by_labels=["stage"],
                ),
            ),
        ],
    )
    writing_bytes = graph_panel(
        title="Scheduler writing bytes",
        description="The total writing bytes of commands on each stage",
        yaxes=yaxes(left_format=UNITS.BYTES_IEC),
        targets=[
            target(
                expr=expr_sum(
                    "tikv_scheduler_writing_bytes",
                ),
            ),
        ],
    )
    layout.row([stage_total, writing_bytes])

    # Line 2: commands by priority and pending commands.
    priority_commands = graph_panel(
        title="Scheduler priority commands",
        description="The count of different priority commands",
        yaxes=yaxes(left_format=UNITS.OPS_PER_SEC),
        targets=[
            target(
                expr=expr_sum_rate(
                    "tikv_scheduler_commands_pri_total",
                    by_labels=["priority"],
                ),
            ),
        ],
    )
    pending_commands = graph_panel(
        title="Scheduler pending commands",
        description="The count of pending commands per TiKV instance",
        yaxes=yaxes(left_format=UNITS.OPS_PER_SEC),
        targets=[
            target(
                expr=expr_sum(
                    "tikv_scheduler_contex_total",
                ),
            ),
        ],
    )
    layout.row([priority_commands, pending_commands])

    # Line 3: wait-duration heatmap restricted to pools named "sched-worker*".
    pool_wait = heatmap_panel(
        title="Txn Scheduler Pool Wait Duration",
        yaxis=yaxis(format=UNITS.SECONDS),
        metric="tikv_yatp_pool_schedule_wait_duration_bucket",
        label_selectors=['name=~"sched-worker.*"'],
    )
    layout.row([pool_wait])

    return layout.row_panel
graph_panel_histogram_quantiles( + title="GC tasks duration", + description="The time consumed when executing GC tasks", + yaxes=yaxes(left_format=UNITS.SECONDS), + metric="tikv_gcworker_gc_task_duration_vec", + label_selectors=['type="$command"'], + hide_count=True, + ), + ] + ) + layout.row( + [ + graph_panel( + title="TiDB GC seconds", + description="The GC duration", + yaxes=yaxes(left_format=UNITS.SECONDS), + targets=[ + target( + expr=expr_histogram_quantile( + 1, "tidb_tikvclient_gc_seconds", by_labels=["instance"] + ).skip_default_instance_selector(), + legend_format="{{instance}}", + ), + ], + ), + graph_panel( + title="TiDB GC worker actions", + description="The count of TiDB GC worker actions", + targets=[ + target( + expr=expr_sum_rate( + "tidb_tikvclient_gc_worker_actions_total", + by_labels=["type"], + ).skip_default_instance_selector(), + ), + ], + ), + ] + ) + layout.row( + [ + graph_panel( + title="ResolveLocks Progress", + description="Progress of ResolveLocks, the first phase of GC", + targets=[ + target( + expr=expr_max( + "tidb_tikvclient_range_task_stats", + label_selectors=['type=~"resolve-locks.*"'], + by_labels=["result"], + ).skip_default_instance_selector(), + ), + ], + ), + graph_panel( + title="TiKV Auto GC Progress", + description="Progress of TiKV's GC", + yaxes=yaxes(left_format=UNITS.PERCENT_UNIT), + targets=[ + target( + expr=expr_operator( + expr_sum( + "tikv_gcworker_autogc_processed_regions", + label_selectors=['type="scan"'], + ), + "/", + expr_sum( + "tikv_raftstore_region_count", + label_selectors=['type="region"'], + ), + ), + legend_format="{{instance}}", + ), + ], + ), + ] + ) + layout.row( + [ + graph_panel( + title="GC speed", + description="keys / second", + targets=[ + target( + expr=expr_sum_rate( + "tikv_storage_mvcc_gc_delete_versions_sum", + by_labels=["key_mode"], + ), + legend_format="{{key_mode}}_keys/s", + ), + ], + ), + graph_panel( + title="TiKV Auto GC SafePoint", + description="SafePoint used for TiKV's 
Auto GC", + yaxes=yaxes(left_format=UNITS.DATE_TIME_ISO), + targets=[ + target( + expr=expr_max( + "tikv_gcworker_autogc_safe_point", + ) + .extra("/ (2^18)") + .skip_default_instance_selector(), + ), + ], + ), + ] + ) + layout.half_row( + [ + stat_panel( + title="GC lifetime", + description="The lifetime of TiDB GC", + format=UNITS.SECONDS, + targets=[ + target( + expr=expr_max( + "tidb_tikvclient_gc_config", + label_selectors=['type="tikv_gc_life_time"'], + by_labels=[], + ).skip_default_instance_selector(), + ), + ], + ), + stat_panel( + title="GC interval", + description="The interval of TiDB GC", + format=UNITS.SECONDS, + targets=[ + target( + expr=expr_max( + "tidb_tikvclient_gc_config", + label_selectors=['type="tikv_gc_run_interval"'], + by_labels=[], + ).skip_default_instance_selector(), + ), + ], + ), + ] + ) + layout.half_row( + [ + graph_panel( + title="GC in Compaction Filter", + description="Keys handled in GC compaction filter", + targets=[ + target( + expr=expr_sum_rate( + "tikv_gc_compaction_filtered", + by_labels=["key_mode"], + ), + legend_format="{{key_mode}}_filtered", + ), + target( + expr=expr_sum_rate( + "tikv_gc_compaction_filter_skip", + by_labels=["key_mode"], + ), + legend_format="{{key_mode}}_skipped", + ), + target( + expr=expr_sum_rate( + "tikv_gc_compaction_mvcc_rollback", + by_labels=["key_mode"], + ), + legend_format="{{key_mode}}_mvcc-rollback/mvcc-lock", + ), + target( + expr=expr_sum_rate( + "tikv_gc_compaction_filter_orphan_versions", + by_labels=["key_mode"], + ), + legend_format="{{key_mode}}_orphan-versions", + ), + target( + expr=expr_sum_rate( + "tikv_gc_compaction_filter_perform", + by_labels=["key_mode"], + ), + legend_format="{{key_mode}}_performed-times", + ), + target( + expr=expr_sum_rate( + "tikv_gc_compaction_failure", + by_labels=["key_mode", "type"], + ), + legend_format="{{key_mode}}_failure-{{type}}", + ), + target( + expr=expr_sum_rate( + "tikv_gc_compaction_filter_mvcc_deletion_met", + by_labels=["key_mode"], 
def Snapshot() -> RowPanel:
    """Build the "Snapshot" row of the dashboard.

    Panels, in order: snapshot message rate and state count, 99% generation
    wait and handle durations, 99.99% snapshot size and KV count, snapshot
    actions and transport speed.

    Returns:
        The ``row_panel`` of the populated ``Layout``.
    """
    layout = Layout(title="Snapshot")
    layout.row(
        [
            graph_panel(
                title="Rate snapshot message",
                description="The rate of Raft snapshot messages sent",
                yaxes=yaxes(left_format=UNITS.OPS_PER_MIN),
                targets=[
                    target(
                        expr=expr_sum_delta(
                            "tikv_raftstore_raft_sent_message_total",
                            range_selector="1m",
                            label_selectors=['type="snapshot"'],
                        ),
                    ),
                ],
            ),
            graph_panel(
                title="Snapshot state count",
                description="The number of snapshots in different states",
                targets=[
                    target(
                        expr=expr_sum(
                            "tikv_raftstore_snapshot_traffic_total",
                            by_labels=["type"],
                        ),
                    ),
                ],
            ),
        ]
    )
    layout.row(
        [
            graph_panel(
                title="99% Snapshot generation wait duration",
                description="The time snapshot generation tasks waited to be scheduled. ",
                yaxes=yaxes(left_format=UNITS.SECONDS),
                targets=[
                    target(
                        expr=expr_histogram_quantile(
                            0.99,
                            "tikv_raftstore_snapshot_generation_wait_duration_seconds",
                            by_labels=["instance"],
                        ),
                        legend_format="{{instance}}",
                    ),
                ],
            ),
            graph_panel(
                title="99% Handle snapshot duration",
                description="The time consumed when handling snapshots",
                yaxes=yaxes(left_format=UNITS.SECONDS),
                targets=[
                    target(
                        expr=expr_histogram_quantile(
                            0.99,
                            "tikv_server_send_snapshot_duration_seconds",
                        ),
                        legend_format="send",
                    ),
                    target(
                        expr=expr_histogram_quantile(
                            0.99,
                            "tikv_raftstore_snapshot_duration_seconds",
                            label_selectors=['type="apply"'],
                        ),
                        legend_format="apply",
                    ),
                    target(
                        expr=expr_histogram_quantile(
                            0.99,
                            "tikv_raftstore_snapshot_duration_seconds",
                            label_selectors=['type="generate"'],
                        ),
                        legend_format="generate",
                    ),
                ],
            ),
        ]
    )
    layout.row(
        [
            graph_panel(
                title="99.99% Snapshot size",
                # The description previously ended with a stray ".9999"
                # after "(P99.99)".
                description="The snapshot size (P99.99)",
                yaxes=yaxes(left_format=UNITS.BYTES_IEC),
                targets=[
                    target(
                        expr=expr_histogram_quantile(
                            0.9999,
                            "tikv_snapshot_size",
                        ),
                        legend_format="size",
                    ),
                ],
            ),
            graph_panel(
                title="99.99% Snapshot KV count",
                description="The number of KV within a snapshot in .9999",
                targets=[
                    target(
                        expr=expr_histogram_quantile(
                            0.9999,
                            "tikv_snapshot_kv_count",
                        ),
                        legend_format="count",
                    ),
                ],
            ),
        ]
    )
    layout.row(
        [
            graph_panel(
                title="Snapshot Actions",
                description="Action stats for snapshot generating and applying",
                yaxes=yaxes(left_format=UNITS.OPS_PER_MIN),
                targets=[
                    target(
                        expr=expr_sum_delta(
                            "tikv_raftstore_snapshot_total",
                            range_selector="1m",
                            by_labels=["type", "status"],
                        ),
                    ),
                    target(
                        expr=expr_sum_delta(
                            "tikv_raftstore_clean_region_count",
                            range_selector="1m",
                            by_labels=["type", "status"],
                        ),
                        legend_format="clean-region-by-{{type}}",
                    ),
                ],
            ),
            graph_panel(
                title="Snapshot transport speed",
                description="The speed of sending or receiving snapshot",
                yaxes=yaxes(left_format=UNITS.BYTES_SEC_IEC),
                targets=[
                    target(
                        expr=expr_sum_rate(
                            "tikv_snapshot_limit_transport_bytes",
                            by_labels=["instance", "type"],
                        ),
                    ),
                    target(
                        expr=expr_sum_rate(
                            "tikv_snapshot_limit_generate_bytes",
                        ),
                        legend_format="{{instance}}-generate",
                    ),
                ],
            ),
        ]
    )
    return layout.row_panel


def Task() -> RowPanel:
    """Build the "Task" row of the dashboard.

    Two lines of two graph panels: handled / pending tasks for workers, then
    handled / pending tasks for future pools, all grouped by ``name``.

    Returns:
        The ``row_panel`` of the populated ``Layout``.
    """
    layout = Layout(title="Task")
    layout.row(
        [
            graph_panel(
                title="Worker handled tasks",
                description="The number of tasks handled by worker",
                yaxes=yaxes(left_format=UNITS.OPS_PER_SEC),
                targets=[
                    target(
                        expr=expr_sum_rate(
                            "tikv_worker_handled_task_total",
                            by_labels=["name"],
                        ),
                    ),
                ],
            ),
            graph_panel(
                title="Worker pending tasks",
                description="Current pending and running tasks of worker",
                targets=[
                    target(
                        expr=expr_sum(
                            "tikv_worker_pending_task_total",
                            by_labels=["name"],
                        ),
                    ),
                ],
            ),
        ]
    )
    layout.row(
        [
            graph_panel(
                title="FuturePool handled tasks",
                description="The number of tasks handled by future_pool",
                yaxes=yaxes(left_format=UNITS.OPS_PER_SEC),
                targets=[
                    target(
                        expr=expr_sum_rate(
                            "tikv_futurepool_handled_task_total",
                            by_labels=["name"],
                        ),
                    ),
                ],
            ),
            graph_panel(
                title="FuturePool pending tasks",
                description="Current pending and running tasks of future_pool",
                targets=[
                    target(
                        expr=expr_sum_aggr_over_time(
                            "tikv_futurepool_pending_task_total",
                            "avg",
                            range_selector="1m",
                            by_labels=["name"],
                        ),
                    ),
                ],
            ),
        ]
    )
    return layout.row_panel
def CoprocessorDetail() -> RowPanel:
    """Build the "Coprocessor Detail" row of the dashboard.

    Panels, in order: handle duration (overall / by store), wait duration
    (overall / by store), DAG request and executor rates, scan-detail rates
    for table and index scans (by tag, then by cf and tag), and a
    heatmap/quantile pair for the memory lock checking duration.

    Returns:
        The ``row_panel`` of the populated ``Layout``.
    """
    layout = Layout(title="Coprocessor Detail")

    def scan_details_panel(title, req, by_labels):
        # All four "Total Ops Details" panels plot the same expr_sum_rate over
        # tikv_coprocessor_scan_details, so they share one axis unit. The
        # "by CF (Index Scan)" panel used UNITS.OPS_PER_MIN while its three
        # siblings used UNITS.OPS_PER_SEC; building them here keeps the unit
        # consistent.
        return graph_panel(
            title=title,
            yaxes=yaxes(left_format=UNITS.OPS_PER_SEC),
            targets=[
                target(
                    expr=expr_sum_rate(
                        "tikv_coprocessor_scan_details",
                        label_selectors=['req="%s"' % req],
                        by_labels=by_labels,
                    ),
                ),
            ],
        )

    layout.row(
        [
            graph_panel_histogram_quantiles(
                title="Handle duration",
                description="The time consumed when handling coprocessor requests",
                yaxes=yaxes(left_format=UNITS.SECONDS),
                metric="tikv_coprocessor_request_handle_seconds",
                by_labels=["req"],
                hide_avg=True,
                hide_count=True,
            ),
            graph_panel_histogram_quantiles(
                title="Handle duration by store",
                description="The time consumed to handle coprocessor requests per TiKV instance",
                yaxes=yaxes(left_format=UNITS.SECONDS),
                metric="tikv_coprocessor_request_handle_seconds",
                by_labels=["req", "instance"],
                hide_avg=True,
                hide_count=True,
            ),
        ]
    )
    layout.row(
        [
            graph_panel_histogram_quantiles(
                title="Wait duration",
                description="The time consumed when coprocessor requests are wait for being handled",
                yaxes=yaxes(left_format=UNITS.SECONDS),
                metric="tikv_coprocessor_request_wait_seconds",
                label_selectors=['type="all"'],
                by_labels=["req"],
                hide_avg=True,
                hide_count=True,
            ),
            graph_panel_histogram_quantiles(
                title="Wait duration by store",
                description="The time consumed when coprocessor requests are wait for being handled in each TiKV instance",
                yaxes=yaxes(left_format=UNITS.SECONDS),
                metric="tikv_coprocessor_request_wait_seconds",
                label_selectors=['type="all"'],
                by_labels=["req", "instance"],
                hide_avg=True,
                hide_count=True,
            ),
        ]
    )
    layout.row(
        [
            graph_panel(
                title="Total DAG Requests",
                targets=[
                    target(
                        expr=expr_sum_rate(
                            "tikv_coprocessor_dag_request_count",
                            by_labels=["vec_type"],
                        ),
                    ),
                ],
            ),
            graph_panel(
                title="Total DAG Executors",
                description="The total number of DAG executors",
                yaxes=yaxes(left_format=UNITS.OPS_PER_SEC),
                targets=[
                    target(
                        expr=expr_sum_rate(
                            "tikv_coprocessor_executor_count",
                            by_labels=["type"],
                        ),
                    ),
                ],
            ),
        ]
    )
    layout.row(
        [
            scan_details_panel("Total Ops Details (Table Scan)", "select", ["tag"]),
            scan_details_panel("Total Ops Details (Index Scan)", "index", ["tag"]),
        ]
    )
    layout.row(
        [
            scan_details_panel(
                "Total Ops Details by CF (Table Scan)", "select", ["cf", "tag"]
            ),
            scan_details_panel(
                "Total Ops Details by CF (Index Scan)", "index", ["cf", "tag"]
            ),
        ]
    )
    layout.row(
        heatmap_panel_graph_panel_histogram_quantile_pairs(
            heatmap_title="Memory lock checking duration",
            heatmap_description="The time consumed on checking memory locks for coprocessor requests",
            graph_title="Memory lock checking duration",
            graph_description="The time consumed on checking memory locks for coprocessor requests",
            yaxis_format=UNITS.SECONDS,
            metric="tikv_coprocessor_mem_lock_check_duration_seconds",
        ),
    )
    return layout.row_panel
def _rocksdb_rate(metric, type_selector=None):
    """Return ``expr_sum_rate`` of ``metric`` restricted to ``db="$db"``.

    Args:
        metric: Name of the metric to query.
        type_selector: Optional, fully formed label matcher (for example
            ``'type="memtable_hit"'`` or ``'type=~"a|b"'``) appended after the
            db matcher. ``None`` selects on the db label only.
    """
    selectors = ['db="$db"']
    if type_selector is not None:
        selectors.append(type_selector)
    return expr_sum_rate(
        metric,
        label_selectors=selectors,
        by_labels=[],  # override default by instance.
    )


def _rocksdb_rate_target(metric, type_selector, legend):
    """Return one target plotting ``_rocksdb_rate(metric, type_selector)``."""
    return target(
        expr=_rocksdb_rate(metric, type_selector),
        legend_format=legend,
    )


def _rocksdb_typed_rate_targets(metric, legend_by_type):
    """Return one rate target per ``(type, legend)`` pair, in the given order.

    Each ``type`` is matched exactly (``type="<type>"``).
    """
    return [
        _rocksdb_rate_target(metric, 'type="%s"' % type_name, legend)
        for type_name, legend in legend_by_type
    ]


def _rocksdb_hit_ratio_target(metric, hit_type, miss_type, legend):
    """Return a target for ``hit / (hit + miss)`` built from rates of ``metric``."""
    hit_selector = 'type="%s"' % hit_type
    miss_selector = 'type="%s"' % miss_type
    return target(
        expr=expr_operator(
            _rocksdb_rate(metric, hit_selector),
            "/",
            expr_operator(
                _rocksdb_rate(metric, hit_selector),
                "+",
                _rocksdb_rate(metric, miss_selector),
            ),
        ),
        legend_format=legend,
    )


def _rocksdb_stat_targets(metric, type_prefix):
    """Return the max / 99% / 95% / avg targets of a pre-aggregated statistic.

    The series are selected by ``type="<type_prefix>_<suffix>"``. The maximum
    is combined with ``expr_max``; the percentiles and the average are combined
    with ``expr_avg``.
    """

    def stat(aggregate, suffix):
        return aggregate(
            metric,
            label_selectors=[
                'db="$db"',
                'type="%s_%s"' % (type_prefix, suffix),
            ],
            by_labels=[],  # override default by instance.
        )

    return [
        target(expr=stat(expr_max, "max"), legend_format="max"),
        target(expr=stat(expr_avg, "percentile99"), legend_format="99%"),
        target(expr=stat(expr_avg, "percentile95"), legend_format="95%"),
        target(expr=stat(expr_avg, "average"), legend_format="avg"),
    ]


def RocksDB() -> RowPanel:
    """Build the "RocksDB" dashboard row, repeated over the ``db`` variable.

    Nearly every query is restricted to ``db="$db"``. The repetitive target
    shapes are produced by the ``_rocksdb_*`` helpers above; panels and targets
    are created in the same order as before the helpers were introduced.

    Returns:
        The ``row_panel`` of the populated layout.
    """
    layout = Layout(title="RocksDB", repeat="db")
    layout.row(
        [
            graph_panel(
                title="Get operations",
                description="The count of get operations",
                yaxes=yaxes(left_format=UNITS.OPS_PER_SEC),
                targets=[
                    _rocksdb_rate_target(
                        "tikv_engine_memtable_efficiency",
                        'type="memtable_hit"',
                        "memtable",
                    ),
                    _rocksdb_rate_target(
                        "tikv_engine_cache_efficiency",
                        'type=~"block_cache_data_hit|block_cache_filter_hit"',
                        "block_cache",
                    ),
                ]
                + _rocksdb_typed_rate_targets(
                    "tikv_engine_get_served",
                    [
                        ("get_hit_l0", "l0"),
                        ("get_hit_l1", "l1"),
                        ("get_hit_l2_and_up", "l2_and_up"),
                    ],
                ),
            ),
            graph_panel(
                title="Get duration",
                description="The time consumed when executing get operations",
                yaxes=yaxes(left_format=UNITS.MICRO_SECONDS, log_base=2),
                targets=_rocksdb_stat_targets("tikv_engine_get_micro_seconds", "get"),
            ),
        ]
    )
    layout.row(
        [
            graph_panel(
                title="Seek operations",
                description="The count of seek operations",
                yaxes=yaxes(left_format=UNITS.OPS_PER_SEC),
                targets=_rocksdb_typed_rate_targets(
                    "tikv_engine_locate",
                    [
                        ("number_db_seek", "seek"),
                        ("number_db_seek_found", "seek_found"),
                        ("number_db_next", "next"),
                        ("number_db_next_found", "next_found"),
                        ("number_db_prev", "prev"),
                        ("number_db_prev_found", "prev_found"),
                    ],
                ),
            ),
            graph_panel(
                title="Seek duration",
                description="The time consumed when executing seek operation",
                yaxes=yaxes(left_format=UNITS.MICRO_SECONDS, log_base=2),
                targets=_rocksdb_stat_targets("tikv_engine_seek_micro_seconds", "seek"),
            ),
        ]
    )
    layout.row(
        [
            graph_panel(
                title="Write operations",
                description="The count of write operations",
                yaxes=yaxes(left_format=UNITS.OPS_PER_SEC),
                targets=[
                    _rocksdb_rate_target(
                        "tikv_engine_write_served",
                        'type=~"write_done_by_self|write_done_by_other"',
                        "done",
                    ),
                ]
                + _rocksdb_typed_rate_targets(
                    "tikv_engine_write_served",
                    [
                        ("write_timeout", "timeout"),
                        ("write_with_wal", "with_wal"),
                    ],
                ),
            ),
            graph_panel(
                title="Write duration",
                description="The time consumed when executing write operation",
                yaxes=yaxes(left_format=UNITS.MICRO_SECONDS, log_base=2),
                targets=_rocksdb_stat_targets(
                    "tikv_engine_write_micro_seconds", "write"
                ),
            ),
        ]
    )
    layout.row(
        [
            graph_panel(
                title="WAL sync operations",
                description="The count of WAL sync operations",
                yaxes=yaxes(left_format=UNITS.OPS_PER_SEC),
                targets=[
                    _rocksdb_rate_target("tikv_engine_wal_file_synced", None, "sync"),
                ],
            ),
            graph_panel(
                title="Write WAL duration",
                description="The time consumed when executing write wal operation",
                yaxes=yaxes(left_format=UNITS.MICRO_SECONDS, log_base=2),
                targets=_rocksdb_stat_targets(
                    "tikv_engine_write_wal_time_micro_seconds", "write_wal_micros"
                ),
            ),
        ]
    )
    layout.row(
        [
            graph_panel(
                title="Compaction operations",
                description="The count of compaction and flush operations",
                yaxes=yaxes(left_format=UNITS.OPS_PER_SEC),
                targets=[
                    target(
                        expr=expr_sum_rate(
                            "tikv_engine_event_total",
                            label_selectors=[
                                'db="$db"',
                            ],
                            by_labels=["type"],
                        ),
                    ),
                ],
            ),
            graph_panel(
                title="WAL sync duration",
                description="The time consumed when executing WAL sync operation",
                yaxes=yaxes(left_format=UNITS.MICRO_SECONDS, log_base=10),
                targets=_rocksdb_stat_targets(
                    "tikv_engine_wal_file_sync_micro_seconds", "wal_file_sync"
                ),
            ),
        ]
    )
    layout.row(
        [
            graph_panel(
                title="Compaction guard actions",
                description="Compaction guard actions",
                yaxes=yaxes(left_format=UNITS.OPS_PER_SEC),
                targets=[
                    target(
                        expr=expr_sum_rate(
                            "tikv_raftstore_compaction_guard_action_total",
                            label_selectors=[
                                'cf=~"default|write"',
                            ],
                            # Was " type" with a stray leading space; the
                            # label is named "type".
                            by_labels=["cf", "type"],
                        ),
                    ),
                ],
            ),
            graph_panel(
                title="Compaction duration",
                description="The time consumed when executing the compaction and flush operations",
                yaxes=yaxes(left_format=UNITS.MICRO_SECONDS, log_base=2),
                targets=_rocksdb_stat_targets(
                    "tikv_engine_compaction_time", "compaction_time"
                ),
            ),
        ]
    )
    layout.row(
        [
            graph_panel(
                title="SST read duration",
                description="The time consumed when reading SST files",
                yaxes=yaxes(left_format=UNITS.MICRO_SECONDS, log_base=2),
                targets=_rocksdb_stat_targets(
                    "tikv_engine_sst_read_micros", "sst_read_micros"
                ),
            ),
            graph_panel(
                title="Compaction reason",
                description=None,
                yaxes=yaxes(left_format=UNITS.SHORT),
                targets=[
                    target(
                        expr=expr_sum_rate(
                            "tikv_engine_compaction_reason",
                            label_selectors=[
                                'db="$db"',
                            ],
                            by_labels=["cf", "reason"],
                        ),
                    ),
                ],
            ),
        ]
    )
    layout.row(
        [
            graph_panel(
                title="Block cache size",
                description="The block cache size. Broken down by column family if shared block cache is disabled.",
                yaxes=yaxes(left_format=UNITS.BYTES_IEC),
                targets=[
                    target(
                        expr=expr_topk(
                            20,
                            "%s"
                            % expr_avg(
                                "tikv_engine_block_cache_size_bytes",
                                label_selectors=[
                                    'db="$db"',
                                ],
                                by_labels=["cf", "instance"],
                            ),
                        ),
                        legend_format="{{instance}}-{{cf}}",
                    ),
                ],
            ),
            graph_panel(
                title="Memtable hit",
                description="The hit rate of memtable",
                yaxes=yaxes(left_format=UNITS.PERCENT_UNIT),
                targets=[
                    _rocksdb_hit_ratio_target(
                        "tikv_engine_memtable_efficiency",
                        "memtable_hit",
                        "memtable_miss",
                        "hit",
                    ),
                ],
            ),
        ]
    )
    layout.row(
        [
            graph_panel(
                title="Block cache flow",
                description="The flow of different kinds of block cache operations",
                yaxes=yaxes(left_format=UNITS.BYTES_SEC_IEC, log_base=10),
                targets=_rocksdb_typed_rate_targets(
                    "tikv_engine_flow_bytes",
                    [
                        ("block_cache_byte_read", "total_read"),
                        ("block_cache_byte_write", "total_written"),
                    ],
                )
                + _rocksdb_typed_rate_targets(
                    "tikv_engine_cache_efficiency",
                    [
                        ("block_cache_data_bytes_insert", "data_insert"),
                        ("block_cache_filter_bytes_insert", "filter_insert"),
                        ("block_cache_filter_bytes_evict", "filter_evict"),
                        ("block_cache_index_bytes_insert", "index_insert"),
                        ("block_cache_index_bytes_evict", "index_evict"),
                    ],
                ),
            ),
            graph_panel(
                title="Block cache hit",
                description="The hit rate of block cache",
                yaxes=yaxes(left_format=UNITS.PERCENT_UNIT),
                targets=[
                    _rocksdb_hit_ratio_target(
                        "tikv_engine_cache_efficiency",
                        "block_cache_hit",
                        "block_cache_miss",
                        "all",
                    ),
                    _rocksdb_hit_ratio_target(
                        "tikv_engine_cache_efficiency",
                        "block_cache_data_hit",
                        "block_cache_data_miss",
                        "data",
                    ),
                    _rocksdb_hit_ratio_target(
                        "tikv_engine_cache_efficiency",
                        "block_cache_filter_hit",
                        "block_cache_filter_miss",
                        "filter",
                    ),
                    _rocksdb_hit_ratio_target(
                        "tikv_engine_cache_efficiency",
                        "block_cache_index_hit",
                        "block_cache_index_miss",
                        "index",
                    ),
                    # The bloom prefix series is useful / checked, not
                    # hit / (hit + miss), so it is spelled out here.
                    target(
                        expr=expr_operator(
                            _rocksdb_rate(
                                "tikv_engine_bloom_efficiency",
                                'type="bloom_prefix_useful"',
                            ),
                            "/",
                            _rocksdb_rate(
                                "tikv_engine_bloom_efficiency",
                                'type="bloom_prefix_checked"',
                            ),
                        ),
                        legend_format="bloom prefix",
                    ),
                ],
            ),
        ]
    )
    layout.row(
        [
            graph_panel(
                title="Keys flow",
                description="The flow of different kinds of operations on keys",
                yaxes=yaxes(left_format=UNITS.OPS_PER_SEC),
                targets=_rocksdb_typed_rate_targets(
                    "tikv_engine_flow_bytes",
                    [
                        ("keys_read", "read"),
                        ("keys_written", "written"),
                    ],
                )
                + [
                    _rocksdb_rate_target(
                        "tikv_engine_compaction_num_corrupt_keys", None, "corrupt"
                    ),
                ],
            ),
            graph_panel(
                title="Block cache operations",
                description="The count of different kinds of block cache operations",
                yaxes=yaxes(left_format=UNITS.OPS_PER_SEC),
                targets=_rocksdb_typed_rate_targets(
                    "tikv_engine_cache_efficiency",
                    [
                        ("block_cache_add", "total_add"),
                        ("block_cache_data_add", "data_add"),
                        ("block_cache_filter_add", "filter_add"),
                        ("block_cache_index_add", "index_add"),
                        ("block_cache_add_failures", "add_failures"),
                    ],
                ),
            ),
        ]
    )
    layout.row(
        [
            graph_panel(
                title="Read flow",
                description="The flow rate of read operations per type",
                yaxes=yaxes(left_format=UNITS.BYTES_SEC_IEC),
                targets=_rocksdb_typed_rate_targets(
                    "tikv_engine_flow_bytes",
                    [
                        ("bytes_read", "get"),
                        ("iter_bytes_read", "scan"),
                    ],
                ),
            ),
            graph_panel(
                title="Total keys",
                description="The count of keys in each column family",
                yaxes=yaxes(left_format=UNITS.SHORT),
                targets=[
                    target(
                        expr=expr_sum(
                            "tikv_engine_estimate_num_keys",
                            label_selectors=[
                                'db="$db"',
                            ],
                            by_labels=["cf"],
                        ),
                    ),
                ],
            ),
        ]
    )
    layout.row(
        [
            graph_panel(
                title="Write flow",
                description="The flow of different kinds of write operations",
                yaxes=yaxes(left_format=UNITS.BYTES_SEC_IEC),
                targets=_rocksdb_typed_rate_targets(
                    "tikv_engine_flow_bytes",
                    [
                        ("wal_file_bytes", "wal"),
                        ("bytes_written", "write"),
                    ],
                ),
            ),
            graph_panel(
                title="Bytes / Read",
                description="The bytes per read",
                yaxes=yaxes(left_format=UNITS.BYTES_IEC, log_base=10),
                targets=_rocksdb_stat_targets(
                    "tikv_engine_bytes_per_read", "bytes_per_read"
                ),
            ),
        ]
    )
    layout.row(
        [
            graph_panel(
                title="Compaction flow",
                description="The flow rate of compaction operations per type",
                yaxes=yaxes(left_format=UNITS.BYTES_SEC_IEC),
                targets=_rocksdb_typed_rate_targets(
                    "tikv_engine_compaction_flow_bytes",
                    [
                        ("bytes_read", "read"),
                        ("bytes_written", "written"),
                    ],
                )
                + _rocksdb_typed_rate_targets(
                    "tikv_engine_flow_bytes",
                    [
                        ("flush_write_bytes", "flushed"),
                    ],
                ),
            ),
            graph_panel(
                title="Bytes / Write",
                description="The bytes per write",
                yaxes=yaxes(left_format=UNITS.BYTES_IEC),
                targets=_rocksdb_stat_targets(
                    "tikv_engine_bytes_per_write", "bytes_per_write"
                ),
            ),
        ]
    )
    layout.row(
        [
            graph_panel(
                title="Read amplification",
                description="The read amplification per TiKV instance",
                yaxes=yaxes(left_format=UNITS.SHORT),
                targets=[
                    target(
                        # by_labels is deliberately left at its default here
                        # (the legend is per instance), so the _rocksdb_rate
                        # helper, which forces by_labels=[], is not used.
                        expr=expr_operator(
                            expr_sum_rate(
                                "tikv_engine_read_amp_flow_bytes",
                                label_selectors=[
                                    'db="$db"',
                                    'type="read_amp_total_read_bytes"',
                                ],
                            ),
                            "/",
                            expr_sum_rate(
                                "tikv_engine_read_amp_flow_bytes",
                                label_selectors=[
                                    'db="$db"',
                                    'type="read_amp_estimate_useful_bytes"',
                                ],
                            ),
                        ),
                        legend_format="{{instance}}",
                    ),
                ],
            ),
            graph_panel(
                title="Compaction pending bytes",
                description="The pending bytes to be compacted",
                yaxes=yaxes(left_format=UNITS.BYTES_IEC),
                targets=[
                    target(
                        expr=expr_sum(
                            "tikv_engine_pending_compaction_bytes",
                            label_selectors=['db="$db"'],
                            by_labels=["cf"],
                        ),
                        legend_format="{{cf}}",
                    ),
                ],
            ),
        ]
    )
    layout.row(
        [
            graph_panel(
                title="Number of snapshots",
                description="The number of snapshot of each TiKV instance",
                yaxes=yaxes(left_format=UNITS.SHORT),
                targets=[
                    target(
                        expr=expr_simple(
                            "tikv_engine_num_snapshots",
                            label_selectors=['db="$db"'],
                        ),
                        legend_format="{{instance}}",
                    ),
                ],
            ),
            graph_panel(
                title="Compression ratio",
                description="The compression ratio of each level",
                yaxes=yaxes(left_format=UNITS.SHORT),
                targets=[
                    target(
                        expr=expr_avg(
                            "tikv_engine_compression_ratio",
                            label_selectors=['db="$db"'],
                            by_labels=["cf", "level"],
                        ),
                        legend_format="{{cf}}-L{{level}}",
                    ),
                ],
            ),
        ]
    )
    layout.row(
        [
            graph_panel(
                title="Number files at each level",
                description="The number of SST files for different column families in each level",
                yaxes=yaxes(left_format=UNITS.SHORT),
                targets=[
                    target(
                        expr=expr_avg(
                            "tikv_engine_num_files_at_level",
                            label_selectors=['db="$db"'],
                            by_labels=["cf", "level"],
                        ),
                        legend_format="{{cf}}-L{{level}}",
                    ),
                ],
            ),
            graph_panel(
                title="Oldest snapshots duration",
                description="The time that the oldest unreleased snapshot survivals",
                yaxes=yaxes(left_format=UNITS.SECONDS),
                targets=[
                    target(
                        expr=expr_simple(
                            "tikv_engine_oldest_snapshot_duration",
                            label_selectors=['db="$db"'],
                        ),
                        legend_format="{{instance}}",
                    ),
                ],
            ),
        ]
    )
    layout.row(
        [
            graph_panel(
                title="Stall conditions changed of each CF",
                description="Stall conditions changed of each column family",
                yaxes=yaxes(left_format=UNITS.SHORT),
                targets=[
                    target(
                        expr=expr_simple(
                            "tikv_engine_stall_conditions_changed",
                            label_selectors=['db="$db"'],
                        ),
                        legend_format="{{instance}}-{{cf}}-{{type}}",
                    ),
                ],
            ),
            graph_panel_histogram_quantiles(
                title="Ingest SST duration seconds",
                description="The time consumed when ingesting SST files",
                yaxes=yaxes(left_format=UNITS.SECONDS),
                metric="tikv_snapshot_ingest_sst_duration_seconds",
                label_selectors=['db="$db"'],
                hide_count=True,
            ),
        ]
    )
    layout.row(
        [
            graph_panel(
                title="Write Stall Reason",
                description=None,
                yaxes=yaxes(left_format=UNITS.SHORT),
                targets=[
                    target(
                        expr=expr_sum_rate(
                            "tikv_engine_write_stall_reason",
                            label_selectors=['db="$db"'],
                            by_labels=["type"],
                        ),
                    ),
                ],
            ),
            graph_panel(
                title="Write stall duration",
                description="The time which is caused by write stall",
                yaxes=yaxes(left_format=UNITS.MICRO_SECONDS),
                targets=_rocksdb_stat_targets("tikv_engine_write_stall", "write_stall"),
            ),
        ]
    )
    layout.row(
        [
            heatmap_panel(
                title="Ingestion picked level",
                description="The level that the external file ingests into",
                yaxis=yaxis(format=UNITS.SHORT),
                metric="tikv_engine_ingestion_picked_level_bucket",
                label_selectors=['db="$db"'],
            ),
            graph_panel(
                title="Memtable size",
                description="The memtable size of each column family",
                yaxes=yaxes(left_format=UNITS.BYTES_IEC),
                targets=[
                    target(
                        expr=expr_avg(
                            "tikv_engine_memory_bytes",
                            label_selectors=['db="$db"', 'type="mem-tables-all"'],
                            by_labels=["cf"],
                        ),
                    ),
                ],
            ),
        ]
    )
    return layout.row_panel
+ ), + legend_format="read_message", + ), + ], + ), + graph_panel_histogram_quantiles( + title="Write Duration", + description="The time used in write operation", + yaxes=yaxes(left_format=UNITS.SECONDS), + metric="raft_engine_write_duration_seconds", + hide_count=True, + ), + ] + ) + layout.row( + [ + graph_panel( + title="Flow", + description="The I/O flow rate", + yaxes=yaxes(left_format=UNITS.BYTES_SEC_IEC), + targets=[ + target( + expr=expr_sum_rate( + "raft_engine_write_size_sum", + by_labels=[], # override default by instance. + ), + legend_format="write", + ), + target( + expr=expr_sum_rate( + "raft_engine_background_rewrite_bytes_sum", + by_labels=["type"], + ), + legend_format="rewrite-{{type}}", + ), + ], + ), + graph_panel( + title="Write Duration Breakdown (99%)", + description="99% duration breakdown of write operation", + yaxes=yaxes(left_format=UNITS.SECONDS), + targets=[ + target( + expr=expr_histogram_quantile( + 0.99, "raft_engine_write_preprocess_duration_seconds" + ), + legend_format="wait", + ), + target( + expr=expr_histogram_quantile( + 0.99, "raft_engine_write_leader_duration_seconds" + ), + legend_format="wal", + ), + target( + expr=expr_histogram_quantile( + 0.99, "raft_engine_write_apply_duration_seconds" + ), + legend_format="apply", + ), + ], + ), + ] + ) + layout.row( + [ + graph_panel_histogram_quantiles( + title="Bytes / Written", + description="The bytes per write", + yaxes=yaxes(left_format=UNITS.BYTES_IEC), + metric="raft_engine_write_size", + hide_count=True, + ), + graph_panel( + title="WAL Duration Breakdown (999%)", + description="999% duration breakdown of WAL write operation", + yaxes=yaxes(left_format=UNITS.SECONDS), + targets=[ + target( + expr=expr_histogram_quantile( + 0.999, "raft_engine_write_leader_duration_seconds" + ), + legend_format="total", + ), + target( + expr=expr_histogram_quantile( + 0.999, "raft_engine_sync_log_duration_seconds" + ), + legend_format="sync", + ), + target( + expr=expr_histogram_quantile( + 
0.999, "raft_engine_allocate_log_duration_seconds" + ), + legend_format="allocate", + ), + target( + expr=expr_histogram_quantile( + 0.999, "raft_engine_rotate_log_duration_seconds" + ), + legend_format="rotate", + ), + ], + ), + ] + ) + layout.row( + [ + graph_panel( + title="File Count", + description="The average number of files", + yaxes=yaxes(left_format=UNITS.SHORT), + targets=[ + target( + expr=expr_avg( + "raft_engine_log_file_count", + by_labels=["type"], + ), + ), + target( + expr=expr_avg( + "raft_engine_swap_file_count", + by_labels=[], # override default by instance. + ), + legend_format="swap", + ), + target( + expr=expr_avg( + "raft_engine_recycled_file_count", + by_labels=["type"], + ), + legend_format="{{type}}-recycle", + ), + ], + ), + graph_panel( + title="Other Durations (99%)", + description="The 99% duration of operations other than write", + yaxes=yaxes(left_format=UNITS.SECONDS, log_base=2), + targets=[ + target( + expr=expr_histogram_quantile( + 0.999, "raft_engine_read_entry_duration_seconds" + ), + legend_format="read_entry", + ), + target( + expr=expr_histogram_quantile( + 0.999, "raft_engine_read_message_duration_seconds" + ), + legend_format="read_message", + ), + target( + expr=expr_histogram_quantile( + 0.999, "raft_engine_purge_duration_seconds" + ), + legend_format="purge", + ), + ], + ), + ] + ) + layout.row( + [ + graph_panel( + title="Entry Count", + description="The average number of log entries", + yaxes=yaxes(left_format=UNITS.SHORT), + targets=[ + target( + expr=expr_avg( + "raft_engine_log_entry_count", + by_labels=["type"], + ), + ), + ], + ), + ] + ) + return layout.row_panel + + +def Titan() -> RowPanel: + layout = Layout(title="Titan", repeat="titan_db") + layout.row( + [ + graph_panel( + title="Blob file count", + targets=[ + target( + expr=expr_sum( + "tikv_engine_titandb_num_live_blob_file", + label_selectors=['db="$titan_db"'], + by_labels=[], # override default by instance. 
+ ), + legend_format="live blob file num", + ), + target( + expr=expr_sum( + "tikv_engine_titandb_num_obsolete_blob_file", + label_selectors=['db="$titan_db"'], + by_labels=[], # override default by instance. + ), + legend_format="obsolete blob file num", + ), + ], + ), + graph_panel( + title="Blob file size", + yaxes=yaxes(left_format=UNITS.BYTES_IEC), + targets=[ + target( + expr=expr_sum( + "tikv_engine_titandb_live_blob_file_size", + label_selectors=['db="$titan_db"'], + by_labels=[], # override default by instance. + ), + legend_format="live blob file size", + ), + target( + expr=expr_sum( + "tikv_engine_titandb_obsolete_blob_file_size", + label_selectors=['db="$titan_db"'], + by_labels=[], # override default by instance. + ), + legend_format="obsolete blob file size", + ), + ], + ), + ] + ) + layout.row( + [ + graph_panel( + title="Live blob size", + yaxes=yaxes(left_format=UNITS.BYTES_IEC), + targets=[ + target( + expr=expr_sum( + "tikv_engine_titandb_live_blob_size", + label_selectors=['db="$titan_db"'], + ), + legend_format="live blob size", + ), + ], + ), + graph_panel( + title="Blob cache hit", + description="The hit rate of block cache", + yaxes=yaxes(left_format=UNITS.PERCENT_UNIT), + targets=[ + target( + expr=expr_operator( + expr_sum_rate( + "tikv_engine_blob_cache_efficiency", + label_selectors=[ + 'db="$titan_db"', + 'type="blob_cache_hit"', + ], + by_labels=[], # override default by instance. + ), + "/", + expr_operator( + expr_sum_rate( + "tikv_engine_blob_cache_efficiency", + label_selectors=[ + 'db="$titan_db"', + 'type="blob_cache_hit"', + ], + by_labels=[], # override default by instance. + ), + "+", + expr_sum_rate( + "tikv_engine_blob_cache_efficiency", + label_selectors=[ + 'db="$titan_db"', + 'type="blob_cache_miss"', + ], + by_labels=[], # override default by instance. 
+ ), + ), + ), + legend_format="all", + ), + ], + ), + ] + ) + layout.row( + [ + graph_panel( + title="Iter touched blob file count", + targets=[ + target( + expr=expr_avg( + "tikv_engine_blob_iter_touch_blob_file_count", + label_selectors=[ + 'db="$titan_db"', + 'type="blob_iter_touch_blob_file_count_average"', + ], + by_labels=[], # override default by instance. + ), + legend_format="avg", + ), + target( + expr=expr_avg( + "tikv_engine_blob_iter_touch_blob_file_count", + label_selectors=[ + 'db="$titan_db"', + 'type="blob_iter_touch_blob_file_count_percentile95"', + ], + by_labels=[], # override default by instance. + ), + legend_format="95%", + ), + target( + expr=expr_avg( + "tikv_engine_blob_iter_touch_blob_file_count", + label_selectors=[ + 'db="$titan_db"', + 'type="blob_iter_touch_blob_file_count_percentile99"', + ], + by_labels=[], # override default by instance. + ), + legend_format="99%", + ), + target( + expr=expr_max( + "tikv_engine_blob_iter_touch_blob_file_count", + label_selectors=[ + 'db="$titan_db"', + 'type="blob_iter_touch_blob_file_count_max"', + ], + by_labels=[], # override default by instance. + ), + legend_format="max", + ), + ], + ), + graph_panel( + title="Blob cache size", + description="The blob cache size.", + yaxes=yaxes(left_format=UNITS.BYTES_IEC), + targets=[ + target( + expr=expr_topk( + 20, + "%s" + % expr_avg( + "tikv_engine_blob_cache_size_bytes", + label_selectors=['db="$titan_db"'], + by_labels=["cf", "instance"], + ), + ), + legend_format="{{instance}}-{{cf}}", + ), + ], + ), + ] + ) + layout.row( + [ + graph_panel( + title="Blob key size", + yaxes=yaxes(left_format=UNITS.BYTES_IEC), + targets=[ + target( + expr=expr_avg( + "tikv_engine_blob_key_size", + label_selectors=[ + 'db="$titan_db"', + 'type="blob_key_size_average"', + ], + by_labels=[], # override default by instance. 
+ ), + legend_format="avg", + ), + target( + expr=expr_avg( + "tikv_engine_blob_key_size", + label_selectors=[ + 'db="$titan_db"', + 'type="blob_key_size_percentile95"', + ], + by_labels=[], # override default by instance. + ), + legend_format="95%", + ), + target( + expr=expr_avg( + "tikv_engine_blob_key_size", + label_selectors=[ + 'db="$titan_db"', + 'type="blob_key_size_percentile99"', + ], + by_labels=[], # override default by instance. + ), + legend_format="99%", + ), + target( + expr=expr_max( + "tikv_engine_blob_key_size", + label_selectors=[ + 'db="$titan_db"', + 'type="blob_key_size_max"', + ], + by_labels=[], # override default by instance. + ), + legend_format="max", + ), + ], + ), + graph_panel( + title="Blob value size", + yaxes=yaxes(left_format=UNITS.BYTES_IEC), + targets=[ + target( + expr=expr_avg( + "tikv_engine_blob_value_size", + label_selectors=[ + 'db="$titan_db"', + 'type="blob_value_size_average"', + ], + by_labels=[], # override default by instance. + ), + legend_format="avg", + ), + target( + expr=expr_avg( + "tikv_engine_blob_value_size", + label_selectors=[ + 'db="$titan_db"', + 'type="blob_value_size_percentile95"', + ], + by_labels=[], # override default by instance. + ), + legend_format="95%", + ), + target( + expr=expr_avg( + "tikv_engine_blob_value_size", + label_selectors=[ + 'db="$titan_db"', + 'type="blob_value_size_percentile99"', + ], + by_labels=[], # override default by instance. + ), + legend_format="99%", + ), + target( + expr=expr_max( + "tikv_engine_blob_value_size", + label_selectors=[ + 'db="$titan_db"', + 'type="blob_value_size_max"', + ], + by_labels=[], # override default by instance. 
+ ), + legend_format="max", + ), + ], + ), + ] + ) + layout.row( + [ + graph_panel( + title="Blob get operations", + yaxes=yaxes(left_format=UNITS.OPS_PER_SEC), + targets=[ + target( + expr=expr_sum_rate( + "tikv_engine_blob_locate", + label_selectors=[ + 'db="$titan_db"', + 'type="number_blob_get"', + ], + by_labels=[], # override default by instance. + ), + legend_format="get", + ), + ], + ), + graph_panel( + title="Blob get duration", + yaxes=yaxes(left_format=UNITS.MICRO_SECONDS), + targets=[ + target( + expr=expr_avg( + "tikv_engine_blob_get_micros_seconds", + label_selectors=['db="$titan_db"', 'type=~".*_average"'], + by_labels=["type"], + ), + legend_format="avg-{{type}}", + ), + target( + expr=expr_avg( + "tikv_engine_blob_get_micros_seconds", + label_selectors=[ + 'db="$titan_db"', + 'type=~".*_percentile95"', + ], + by_labels=["type"], + ), + legend_format="95%-{{type}}", + ), + target( + expr=expr_avg( + "tikv_engine_blob_get_micros_seconds", + label_selectors=[ + 'db="$titan_db"', + 'type=~".*_percentile99"', + ], + by_labels=["type"], + ), + legend_format="99%-{{type}}", + ), + target( + expr=expr_max( + "tikv_engine_blob_get_micros_seconds", + label_selectors=['db="$titan_db"', 'type=~".*_max"'], + by_labels=["type"], + ), + legend_format="max-{{type}}", + ), + ], + ), + ] + ) + layout.row( + [ + graph_panel( + title="Blob file discardable ratio distribution", + targets=[ + target( + expr=expr_sum( + "tikv_engine_titandb_blob_file_discardable_ratio", + label_selectors=['db="$titan_db"'], + by_labels=["ratio"], + ), + ), + ], + ), + graph_panel( + title="Blob iter operations", + yaxes=yaxes(left_format=UNITS.OPS_PER_SEC), + targets=[ + target( + expr=expr_sum_rate( + "tikv_engine_blob_locate", + label_selectors=[ + 'db="$titan_db"', + 'type="number_blob_seek"', + ], + by_labels=[], # override default by instance. 
+ ), + legend_format="seek", + ), + target( + expr=expr_sum_rate( + "tikv_engine_blob_locate", + label_selectors=[ + 'db="$titan_db"', + 'type="number_blob_prev"', + ], + by_labels=[], # override default by instance. + ), + legend_format="prev", + ), + target( + expr=expr_sum_rate( + "tikv_engine_blob_locate", + label_selectors=[ + 'db="$titan_db"', + 'type="number_blob_next"', + ], + by_labels=[], # override default by instance. + ), + legend_format="next", + ), + ], + ), + ] + ) + layout.row( + [ + graph_panel( + title="Blob seek duration", + yaxes=yaxes(left_format=UNITS.MICRO_SECONDS), + targets=[ + target( + expr=expr_avg( + "tikv_engine_blob_seek_micros_seconds", + label_selectors=['db="$titan_db"', 'type=~".*_average"'], + by_labels=[], # override default by instance. + ), + legend_format="avg", + ), + target( + expr=expr_avg( + "tikv_engine_blob_seek_micros_seconds", + label_selectors=[ + 'db="$titan_db"', + 'type=~".*_percentile95"', + ], + by_labels=[], # override default by instance. + ), + legend_format="95%", + ), + target( + expr=expr_avg( + "tikv_engine_blob_seek_micros_seconds", + label_selectors=[ + 'db="$titan_db"', + 'type=~".*_percentile99"', + ], + by_labels=[], # override default by instance. + ), + legend_format="99%", + ), + target( + expr=expr_max( + "tikv_engine_blob_seek_micros_seconds", + label_selectors=['db="$titan_db"', 'type=~".*_max"'], + by_labels=[], # override default by instance. + ), + legend_format="max", + ), + ], + ), + graph_panel( + title="Blob next duration", + yaxes=yaxes(left_format=UNITS.MICRO_SECONDS), + targets=[ + target( + expr=expr_avg( + "tikv_engine_blob_next_micros_seconds", + label_selectors=['db="$titan_db"', 'type=~".*_average"'], + by_labels=[], # override default by instance. + ), + legend_format="avg", + ), + target( + expr=expr_avg( + "tikv_engine_blob_next_micros_seconds", + label_selectors=[ + 'db="$titan_db"', + 'type=~".*_percentile95"', + ], + by_labels=[], # override default by instance. 
+ ), + legend_format="95%", + ), + target( + expr=expr_avg( + "tikv_engine_blob_next_micros_seconds", + label_selectors=[ + 'db="$titan_db"', + 'type=~".*_percentile99"', + ], + by_labels=[], # override default by instance. + ), + legend_format="99%", + ), + target( + expr=expr_max( + "tikv_engine_blob_next_micros_seconds", + label_selectors=['db="$titan_db"', 'type=~".*_max"'], + by_labels=[], # override default by instance. + ), + legend_format="max", + ), + ], + ), + ] + ) + layout.row( + [ + graph_panel( + title="Blob prev duration", + yaxes=yaxes(left_format=UNITS.MICRO_SECONDS), + targets=[ + target( + expr=expr_avg( + "tikv_engine_blob_prev_micros_seconds", + label_selectors=['db="$titan_db"', 'type=~".*_average"'], + by_labels=["type"], + ), + legend_format="avg-{{type}}", + ), + target( + expr=expr_avg( + "tikv_engine_blob_prev_micros_seconds", + label_selectors=[ + 'db="$titan_db"', + 'type=~".*_percentile95"', + ], + by_labels=["type"], + ), + legend_format="95%-{{type}}", + ), + target( + expr=expr_avg( + "tikv_engine_blob_prev_micros_seconds", + label_selectors=[ + 'db="$titan_db"', + 'type=~".*_percentile99"', + ], + by_labels=["type"], + ), + legend_format="99%-{{type}}", + ), + target( + expr=expr_max( + "tikv_engine_blob_prev_micros_seconds", + label_selectors=['db="$titan_db"', 'type=~".*_max"'], + by_labels=["type"], + ), + legend_format="max-{{type}}", + ), + ], + ), + graph_panel( + title="Blob keys flow", + yaxes=yaxes(left_format=UNITS.BYTES_SEC_IEC), + targets=[ + target( + expr=expr_sum_rate( + "tikv_engine_blob_flow_bytes", + label_selectors=['db="$titan_db"', 'type=~"keys.*"'], + by_labels=["type"], + ), + ), + ], + ), + ] + ) + layout.row( + [ + graph_panel( + title="Blob file read duration", + yaxes=yaxes(left_format=UNITS.MICRO_SECONDS), + targets=[ + target( + expr=expr_avg( + "tikv_engine_blob_file_read_micros_seconds", + label_selectors=[ + 'db="$titan_db"', + 'type="blob_file_read_micros_average"', + ], + by_labels=["type"], + ), + 
legend_format="avg", + ), + target( + expr=expr_avg( + "tikv_engine_blob_file_read_micros_seconds", + label_selectors=[ + 'db="$titan_db"', + 'type="blob_file_read_micros_percentile99"', + ], + by_labels=["type"], + ), + legend_format="95%", + ), + target( + expr=expr_avg( + "tikv_engine_blob_file_read_micros_seconds", + label_selectors=[ + 'db="$titan_db"', + 'type="blob_file_read_micros_percentile95"', + ], + by_labels=["type"], + ), + legend_format="99%", + ), + target( + expr=expr_max( + "tikv_engine_blob_file_read_micros_seconds", + label_selectors=[ + 'db="$titan_db"', + 'type="blob_file_read_micros_max"', + ], + by_labels=["type"], + ), + legend_format="max", + ), + ], + ), + graph_panel( + title="Blob bytes flow", + yaxes=yaxes(left_format=UNITS.BYTES_SEC_IEC), + targets=[ + target( + expr=expr_sum_rate( + "tikv_engine_blob_flow_bytes", + label_selectors=['db="$titan_db"', 'type=~"bytes.*"'], + by_labels=["type"], + ), + ), + ], + ), + ] + ) + layout.row( + [ + graph_panel( + title="Blob file write duration", + yaxes=yaxes(left_format=UNITS.MICRO_SECONDS), + targets=[ + target( + expr=expr_avg( + "tikv_engine_blob_file_write_micros_seconds", + label_selectors=[ + 'db="$titan_db"', + 'type="blob_file_write_micros_average"', + ], + by_labels=["type"], + ), + legend_format="avg", + ), + target( + expr=expr_avg( + "tikv_engine_blob_file_write_micros_seconds", + label_selectors=[ + 'db="$titan_db"', + 'type="blob_file_write_micros_percentile99"', + ], + by_labels=["type"], + ), + legend_format="95%", + ), + target( + expr=expr_avg( + "tikv_engine_blob_file_write_micros_seconds", + label_selectors=[ + 'db="$titan_db"', + 'type="blob_file_write_micros_percentile95"', + ], + by_labels=["type"], + ), + legend_format="99%", + ), + target( + expr=expr_max( + "tikv_engine_blob_file_write_micros_seconds", + label_selectors=[ + 'db="$titan_db"', + 'type="blob_file_write_micros_max"', + ], + by_labels=["type"], + ), + legend_format="max", + ), + ], + ), + graph_panel( + 
title="Blob file sync operations", + yaxes=yaxes(left_format=UNITS.OPS_PER_SEC), + targets=[ + target( + expr=expr_sum_rate( + "tikv_engine_blob_file_synced", + label_selectors=['db="$titan_db"'], + by_labels=[], # override default by instance. + ), + legend_format="sync", + ), + ], + ), + ] + ) + layout.row( + [ + graph_panel( + title="Blob GC action", + targets=[ + target( + expr=expr_sum_rate( + "tikv_engine_blob_gc_action_count", + label_selectors=['db="$titan_db"'], + by_labels=["type"], + ), + ), + ], + ), + graph_panel( + title="Blob file sync duration", + yaxes=yaxes(left_format=UNITS.MICRO_SECONDS), + targets=[ + target( + expr=expr_avg( + "tikv_engine_blob_file_sync_micros_seconds", + label_selectors=[ + 'db="$titan_db"', + 'type="blob_file_sync_micros_average"', + ], + by_labels=["type"], + ), + legend_format="avg", + ), + target( + expr=expr_avg( + "tikv_engine_blob_file_sync_micros_seconds", + label_selectors=[ + 'db="$titan_db"', + 'type="blob_file_sync_micros_percentile95"', + ], + by_labels=["type"], + ), + legend_format="95%", + ), + target( + expr=expr_avg( + "tikv_engine_blob_file_sync_micros_seconds", + label_selectors=[ + 'db="$titan_db"', + 'type="blob_file_sync_micros_percentile99"', + ], + by_labels=["type"], + ), + legend_format="99%", + ), + target( + expr=expr_max( + "tikv_engine_blob_file_sync_micros_seconds", + label_selectors=[ + 'db="$titan_db"', + 'type="blob_file_sync_micros_max"', + ], + by_labels=["type"], + ), + legend_format="max", + ), + ], + ), + ] + ) + layout.row( + [ + graph_panel( + title="Blob GC duration", + yaxes=yaxes(left_format=UNITS.MICRO_SECONDS), + targets=[ + target( + expr=expr_avg( + "tikv_engine_blob_gc_micros_seconds", + label_selectors=[ + 'db="$titan_db"', + 'type="blob_gc_micros_average"', + ], + by_labels=["type"], + ), + legend_format="avg", + ), + target( + expr=expr_avg( + "tikv_engine_blob_gc_micros_seconds", + label_selectors=[ + 'db="$titan_db"', + 'type="blob_gc_micros_percentile95"', + ], + 
by_labels=["type"], + ), + legend_format="95%", + ), + target( + expr=expr_avg( + "tikv_engine_blob_gc_micros_seconds", + label_selectors=[ + 'db="$titan_db"', + 'type="blob_gc_micros_percentile99"', + ], + by_labels=["type"], + ), + legend_format="99%", + ), + target( + expr=expr_max( + "tikv_engine_blob_gc_micros_seconds", + label_selectors=[ + 'db="$titan_db"', + 'type="blob_gc_micros_max"', + ], + by_labels=["type"], + ), + legend_format="max", + ), + ], + ), + graph_panel( + title="Blob GC keys flow", + yaxes=yaxes(left_format=UNITS.BYTES_SEC_IEC), + targets=[ + target( + expr=expr_sum_rate( + "tikv_engine_blob_gc_flow_bytes", + label_selectors=['db="$titan_db"', 'type=~"keys.*"'], + by_labels=["type"], + ), + ), + ], + ), + ] + ) + layout.row( + [ + graph_panel( + title="Blob GC input file size", + yaxes=yaxes(left_format=UNITS.BYTES_IEC), + targets=[ + target( + expr=expr_avg( + "tikv_engine_blob_gc_input_file", + label_selectors=[ + 'db="$titan_db"', + 'type="blob_gc_input_file_average"', + ], + by_labels=[], # override default by instance. + ), + legend_format="avg", + ), + target( + expr=expr_avg( + "tikv_engine_blob_gc_input_file", + label_selectors=[ + 'db="$titan_db"', + 'type="blob_gc_input_file_percentile95"', + ], + by_labels=[], # override default by instance. + ), + legend_format="95%", + ), + target( + expr=expr_avg( + "tikv_engine_blob_gc_input_file", + label_selectors=[ + 'db="$titan_db"', + 'type="blob_gc_input_file_percentile99"', + ], + by_labels=[], # override default by instance. + ), + legend_format="99%", + ), + target( + expr=expr_max( + "tikv_engine_blob_gc_input_file", + label_selectors=[ + 'db="$titan_db"', + 'type="blob_gc_input_file_max"', + ], + by_labels=[], # override default by instance. 
+ ), + legend_format="max", + ), + ], + ), + graph_panel( + title="Blob GC bytes flow", + yaxes=yaxes(left_format=UNITS.BYTES_SEC_IEC), + targets=[ + target( + expr=expr_sum_rate( + "tikv_engine_blob_gc_flow_bytes", + label_selectors=['db="$titan_db"', 'type=~"bytes.*"'], + by_labels=["type"], + ), + ), + ], + ), + ] + ) + layout.row( + [ + graph_panel( + title="Blob GC output file size", + yaxes=yaxes(left_format=UNITS.BYTES_IEC), + targets=[ + target( + expr=expr_avg( + "tikv_engine_blob_gc_output_file", + label_selectors=[ + 'db="$titan_db"', + 'type="blob_gc_output_file_average"', + ], + by_labels=[], # override default by instance. + ), + legend_format="avg", + ), + target( + expr=expr_avg( + "tikv_engine_blob_gc_output_file", + label_selectors=[ + 'db="$titan_db"', + 'type="blob_gc_output_file_percentile95"', + ], + by_labels=[], # override default by instance. + ), + legend_format="95%", + ), + target( + expr=expr_avg( + "tikv_engine_blob_gc_output_file", + label_selectors=[ + 'db="$titan_db"', + 'type="blob_gc_output_file_percentile99"', + ], + by_labels=[], # override default by instance. + ), + legend_format="99%", + ), + target( + expr=expr_max( + "tikv_engine_blob_gc_output_file", + label_selectors=[ + 'db="$titan_db"', + 'type="blob_gc_output_file_max"', + ], + by_labels=[], # override default by instance. 
+ ), + legend_format="max", + ), + ], + ), + graph_panel( + title="Blob GC file count", + yaxes=yaxes(left_format=UNITS.OPS_PER_SEC), + targets=[ + target( + expr=expr_sum_rate( + "tikv_engine_blob_gc_file_count", + label_selectors=['db="$titan_db"'], + by_labels=["type"], + ), + ), + ], + ), + ] + ) + return layout.row_panel + + +def PessimisticLocking() -> RowPanel: + layout = Layout(title="Pessimistic Locking") + layout.row( + [ + graph_panel( + title="Lock Manager Thread CPU", + yaxes=yaxes(left_format=UNITS.PERCENT_UNIT), + targets=[ + target( + expr=expr_sum_rate( + "tikv_thread_cpu_seconds_total", + label_selectors=['name=~"waiter_manager.*"'], + by_labels=["instance", "name"], + ), + ), + target( + expr=expr_sum_rate( + "tikv_thread_cpu_seconds_total", + label_selectors=['name=~"deadlock_detect.*"'], + by_labels=["instance", "name"], + ), + ), + ], + ), + graph_panel( + title="Lock Manager Handled tasks", + yaxes=yaxes(left_format=UNITS.OPS_PER_SEC), + targets=[ + target( + expr=expr_sum_rate( + "tikv_lock_manager_task_counter", + by_labels=["type"], + ), + ) + ], + ), + ] + ) + layout.row( + [ + graph_panel_histogram_quantiles( + title="Waiter lifetime duration", + description="", + yaxes=yaxes(left_format=UNITS.SECONDS, log_base=2), + metric="tikv_lock_manager_waiter_lifetime_duration", + hide_count=True, + ), + graph_panel( + title="Lock Waiting Queue", + yaxes=yaxes(left_format=UNITS.SHORT), + targets=[ + target( + expr=expr_sum_aggr_over_time( + "tikv_lock_manager_wait_table_status", + "max", + "30s", + by_labels=["type"], + ), + ), + target( + expr=expr_sum_aggr_over_time( + "tikv_lock_wait_queue_entries_gauge_vec", + "max", + "30s", + by_labels=["type"], + ), + ), + ], + ), + ] + ) + layout.row( + [ + graph_panel_histogram_quantiles( + title="Deadlock detect duration", + description="", + yaxes=yaxes(left_format=UNITS.SECONDS, log_base=2), + metric="tikv_lock_manager_detect_duration", + hide_count=True, + ), + graph_panel( + title="Detect error", + 
yaxes=yaxes(left_format=UNITS.OPS_PER_SEC), + targets=[ + target( + expr=expr_sum_rate( + "tikv_lock_manager_error_counter", by_labels=["type"] + ), + ) + ], + ), + ] + ) + layout.row( + [ + graph_panel( + title="Deadlock detector leader", + targets=[ + target( + expr=expr_sum_aggr_over_time( + "tikv_lock_manager_detector_leader_heartbeat", + "max", + "30s", + ), + ) + ], + ), + graph_panel( + title="Total pessimistic locks memory size", + yaxes=yaxes(left_format=UNITS.BYTES_IEC), + targets=[ + target( + expr=expr_simple("tikv_pessimistic_lock_memory_size"), + legend_format="{{instance}}", + ) + ], + ), + ] + ) + layout.row( + [ + graph_panel( + title="In-memory pessimistic locking result", + yaxes=yaxes(left_format=UNITS.OPS_PER_SEC), + targets=[ + target( + expr=expr_sum_rate( + "tikv_in_memory_pessimistic_locking", by_labels=["result"] + ), + ) + ], + ), + graph_panel( + title="Pessimistic lock activities", + description="The number of active keys and waiters.", + targets=[ + target( + expr=expr_sum( + "tikv_lock_wait_queue_entries_gauge_vec", by_labels=["type"] + ), + ) + ], + ), + ] + ) + layout.row( + [ + heatmap_panel( + title="Lengths of lock wait queues when transaction enqueues", + description="The length includes the entering transaction itself", + yaxis=yaxis(format=UNITS.SHORT), + metric="tikv_lock_wait_queue_length_bucket", + ), + graph_panel_histogram_quantiles( + title="In-memory scan lock read duration", + description="The duration scan in-memory pessimistic locks with read lock", + yaxes=yaxes(left_format=UNITS.SECONDS, log_base=2), + metric="tikv_storage_mvcc_scan_lock_read_duration_seconds", + by_labels=["type"], + hide_count=True, + hide_avg=True, + ), + ] + ) + return layout.row_panel + + +def PointInTimeRestore() -> RowPanel: + layout = Layout(title="Point In Time Restore") + layout.row( + [ + graph_panel( + title="CPU Usage", + description=None, + yaxes=yaxes(left_format=UNITS.PERCENT_UNIT), + targets=[ + target( + expr=expr_sum_rate( + 
"tikv_thread_cpu_seconds_total", + label_selectors=[ + 'name=~"sst_.*"', + ], + ), + ), + ], + ), + graph_panel( + title="P99 RPC Duration", + description=None, + yaxes=yaxes(left_format=UNITS.SECONDS, log_base=1), + targets=[ + target( + expr=expr_histogram_quantile( + 0.99, + "tikv_import_rpc_duration", + label_selectors=[ + 'request="apply"', + ], + ), + legend_format="total-99", + ), + target( + expr=expr_histogram_quantile( + 0.99, + "tikv_import_apply_duration", + label_selectors=[ + 'type=~"queue|exec_download"', + ], + by_labels=["le", "type"], + ), + legend_format="(DL){{type}}-99", + ), + target( + expr=expr_histogram_quantile( + 0.99, + "tikv_import_engine_request", + by_labels=["le", "type"], + ), + legend_format="(AP){{type}}-99", + ), + ], + ), + ] + ) + layout.row( + [ + graph_panel( + title="Import RPC Ops", + description="", + yaxes=yaxes(left_format=UNITS.OPS_PER_SEC), + targets=[ + target( + expr=expr_sum_rate( + "tikv_import_rpc_duration_count", + label_selectors=[ + 'request="apply"', + ], + by_labels=["instance", "request"], + ), + ), + target( + expr=expr_sum_rate( + "tikv_import_rpc_duration_count", + label_selectors=[ + 'request!="switch_mode"', + ], + by_labels=["request"], + ), + legend_format="total-{{request}}", + ), + ], + ), + graph_panel( + title="Cache Events", + description=None, + yaxes=yaxes(left_format=UNITS.COUNTS_PER_SEC), + targets=[ + target( + expr=expr_sum_rate( + "tikv_import_apply_cache_event", + label_selectors=[], + by_labels=["type", "instance"], + ), + ), + ], + ), + ] + ) + layout.row( + [ + heatmap_panel( + title="Overall RPC Duration", + description=None, + yaxis=yaxis(format=UNITS.SECONDS, log_base=1), + metric="tikv_import_rpc_duration_bucket", + label_selectors=[ + 'request="apply"', + ], + ), + heatmap_panel( + title="Read File into Memory Duration", + description=None, + yaxis=yaxis(format=UNITS.SECONDS, log_base=1), + metric="tikv_import_apply_duration_bucket", + label_selectors=[ + 'type="exec_download"', + 
], + ), + ] + ) + layout.row( + [ + heatmap_panel( + title="Queuing Time", + description=None, + yaxis=yaxis(format=UNITS.SECONDS, log_base=1), + metric="tikv_import_engine_request_bucket", + label_selectors=[ + 'type="queuing"', + ], + ), + graph_panel( + title="Apply Request Throughput", + description=None, + yaxes=yaxes(left_format=UNITS.BYTES_IEC), + targets=[ + target( + expr=expr_sum_rate( + "tikv_import_apply_bytes_sum", + ), + ), + ], + ), + ] + ) + layout.row( + [ + heatmap_panel( + title="Downloaded File Size", + description=None, + yaxis=yaxis(format=UNITS.BYTES_IEC), + metric="tikv_import_download_bytes_bucket", + ), + heatmap_panel( + title="Apply Batch Size", + description=None, + yaxis=yaxis(format=UNITS.BYTES_IEC), + metric="tikv_import_apply_bytes_bucket", + ), + ] + ) + layout.row( + [ + heatmap_panel( + title="Blocked by Concurrency Time", + description=None, + yaxis=yaxis(format=UNITS.SECONDS, log_base=1), + metric="tikv_import_engine_request_bucket", + label_selectors=[ + 'type="get_permit"', + ], + ), + graph_panel( + title="Apply Request Speed", + description=None, + yaxes=yaxes( + left_format=UNITS.OPS_PER_SEC, + log_base=1, + ), + targets=[ + target( + expr=expr_sum_rate( + "tikv_import_applier_event", + label_selectors=[ + 'type="begin_req"', + ], + by_labels=["instance", "type"], + ), + ), + ], + ), + ] + ) + layout.row( + [ + graph_panel( + title="Cached File in Memory", + description=None, + yaxes=yaxes(left_format=UNITS.BYTES_IEC, log_base=1), + targets=[ + target( + expr=expr_sum("tikv_import_apply_cached_bytes"), + ), + ], + ), + graph_panel( + title="Engine Requests Unfinished", + description=None, + yaxes=yaxes( + left_format=UNITS.SHORT, + log_base=1, + ), + targets=[ + target( + expr=expr_sum_rate( + "tikv_import_applier_event", + label_selectors=[ + 'type!="begin_req"', + ], + by_labels=["instance", "type"], + ), + ), + ], + ), + ] + ) + layout.row( + [ + heatmap_panel( + title="Apply Time", + description=None, + 
yaxis=yaxis(format=UNITS.SECONDS, log_base=1), + metric="tikv_import_engine_request_bucket", + label_selectors=[ + 'type="apply"', + ], + ), + graph_panel( + title="Raft Store Memory Usage", + description="", + yaxes=yaxes(left_format=UNITS.BYTES_IEC, log_base=1), + targets=[ + target( + expr=expr_sum( + "tikv_server_mem_trace_sum", + label_selectors=[ + 'name=~"raftstore-.*"', + ], + ), + ), + ], + ), + ] + ) + return layout.row_panel + + +def ResolvedTS() -> RowPanel: + layout = Layout(title="Resolved TS") + layout.row( + [ + graph_panel( + title="Resolved TS Worker CPU", + description="The CPU utilization of resolved ts worker", + yaxes=yaxes(left_format=UNITS.PERCENT_UNIT), + targets=[ + target( + expr=expr_sum_rate( + "tikv_thread_cpu_seconds_total", + label_selectors=[ + 'name=~"resolved_ts.*"', + ], + ), + ) + ], + ), + graph_panel( + title="Advance ts Worker CPU", + description="The CPU utilization of advance ts worker", + yaxes=yaxes(left_format=UNITS.PERCENT_UNIT), + targets=[ + target( + expr=expr_sum_rate( + "tikv_thread_cpu_seconds_total", + label_selectors=[ + 'name=~"advance_ts.*"', + ], + ), + ) + ], + ), + graph_panel( + title="Scan lock Worker CPU", + description="The CPU utilization of scan lock worker", + yaxes=yaxes(left_format=UNITS.PERCENT_UNIT), + targets=[ + target( + expr=expr_sum_rate( + "tikv_thread_cpu_seconds_total", + label_selectors=[ + 'name=~"inc_scan.*"', + ], + ), + ) + ], + ), + ] + ) + layout.row( + [ + graph_panel( + title="Max gap of resolved-ts", + description="The gap between resolved ts (the maximum candidate of safe-ts) and current time.", + yaxes=yaxes(left_format=UNITS.MILLI_SECONDS), + targets=[ + target( + expr=expr_sum( + "tikv_resolved_ts_min_resolved_ts_gap_millis", + ), + ) + ], + ), + graph_panel( + title="Max gap of follower safe-ts", + description="The gap between now() and the minimal (non-zero) safe ts for followers", + yaxes=yaxes(left_format=UNITS.MILLI_SECONDS), + targets=[ + target( + expr=expr_sum( + 
"tikv_resolved_ts_min_follower_safe_ts_gap_millis", + ), + ) + ], + ), + ] + ) + layout.row( + [ + graph_panel( + title="Min Resolved TS Region", + description="The region that has minimal resolved ts", + targets=[ + target( + expr=expr_sum( + "tikv_resolved_ts_min_resolved_ts_region", + ), + ) + ], + ), + graph_panel( + title="Min Safe TS Follower Region", + description="The region id of the follower that has minimal safe ts", + targets=[ + target( + expr=expr_sum( + "tikv_resolved_ts_min_follower_safe_ts_region", + ), + ) + ], + ), + ] + ) + layout.row( + [ + heatmap_panel( + title="Check leader duration", + description="The time consumed when handle a check leader request", + yaxis=yaxis(format=UNITS.SECONDS), + metric="tikv_resolved_ts_check_leader_duration_seconds_bucket", + ), + graph_panel( + title="Max gap of resolved-ts in region leaders", + description="The gap between resolved ts of leaders and current time", + yaxes=yaxes(left_format=UNITS.MILLI_SECONDS), + targets=[ + target( + expr=expr_sum( + "tikv_resolved_ts_min_leader_resolved_ts_gap_millis", + ), + ) + ], + ), + ] + ) + layout.row( + [ + graph_panel( + title="99% CheckLeader request region count", + description="Bucketed histogram of region count in a check leader request", + targets=[ + target( + expr=expr_histogram_quantile( + 0.99, + "tikv_check_leader_request_item_count", + by_labels=["instance"], + ), + legend_format="{{instance}}", + ) + ], + ), + heatmap_panel( + title="Initial scan backoff duration", + description="The backoff duration before starting initial scan", + yaxis=yaxis(format=UNITS.SECONDS), + metric="tikv_resolved_ts_initial_scan_backoff_duration_seconds_bucket", + ), + ] + ) + layout.row( + [ + graph_panel( + title="Lock heap size", + description="Total bytes in memory of resolved-ts observe regions's lock heap", + yaxes=yaxes(left_format=UNITS.BYTES_IEC), + targets=[ + target( + expr=expr_avg( + "tikv_resolved_ts_lock_heap_bytes", + ), + ) + ], + ), + graph_panel( + 
title="Min Leader Resolved TS Region", + description="The region that its leader has minimal resolved ts.", + targets=[ + target( + expr=expr_sum( + "tikv_resolved_ts_min_leader_resolved_ts_region", + ), + ) + ], + ), + ] + ) + layout.row( + [ + graph_panel( + title="Observe region status", + description="The status of resolved-ts observe regions", + targets=[ + target( + expr=expr_sum( + "tikv_resolved_ts_region_resolve_status", + by_labels=["type"], + ), + ) + ], + ), + graph_panel( + title="Fail advance ts count", + description="The count of fail to advance resolved-ts", + targets=[ + target( + expr=expr_sum_delta( + "tikv_resolved_ts_fail_advance_count", + by_labels=["instance", "reason"], + ), + ), + target( + expr=expr_sum_delta( + "tikv_raftstore_check_stale_peer", + by_labels=["instance"], + ), + legend_format="{{instance}}-stale-peer", + ), + ], + ), + ] + ) + layout.row( + [ + graph_panel( + title="99% CheckLeader request size", + description="Bucketed histogram of the check leader request size", + yaxes=yaxes(left_format=UNITS.BYTES_IEC), + targets=[ + target( + expr=expr_histogram_quantile( + 0.99, + "tikv_check_leader_request_size_bytes", + by_labels=["instance"], + ), + legend_format="{{instance}}", + ), + target( + expr=expr_histogram_quantile( + 0.99, + "tikv_check_leader_request_item_count", + by_labels=["instance"], + ), + legend_format="{{instance}}-check-num", + ), + ], + ), + graph_panel( + title="Pending command size", + description="Total bytes of pending commands in the channel", + yaxes=yaxes(left_format=UNITS.BYTES_IEC), + targets=[ + target( + expr=expr_avg( + "tikv_resolved_ts_channel_penging_cmd_bytes_total", + ), + ) + ], + ), + ] + ) + return layout.row_panel + + +def Memory() -> RowPanel: + layout = Layout(title="Memory") + layout.row( + [ + graph_panel( + title="Allocator Stats", + description=None, + yaxes=yaxes(left_format=UNITS.BYTES_IEC), + targets=[ + target( + expr=expr_sum( + "tikv_allocator_stats", by_labels=["instance", 
"type"] + ) + ) + ], + ), + graph_panel( + title="Send Allocated(+) / Release Received(-) Bytes Rate", + description=None, + yaxes=yaxes(left_format=UNITS.BYTES_SEC_IEC), + targets=[ + target( + expr=expr_operator( + expr_sum_rate( + "tikv_allocator_thread_allocation", + label_selectors=['type="alloc"'], + by_labels=["thread_name"], + ), + "-", + expr_sum_rate( + "tikv_allocator_thread_allocation", + label_selectors=['type="dealloc"'], + by_labels=["thread_name"], + ), + ), + legend_format="{{thread_name}}", + ) + ], + ), + ] + ) + layout.row( + [ + graph_panel( + title="Newly Allocated Bytes by Thread", + description=None, + yaxes=yaxes(left_format=UNITS.BYTES_IEC), + targets=[ + target( + expr=expr_sum_rate( + "tikv_allocator_thread_allocation", + label_selectors=['type="alloc"'], + by_labels=["thread_name"], + ), + ) + ], + ), + graph_panel( + title="Recently Released Bytes by Thread", + description=None, + yaxes=yaxes(left_format=UNITS.BYTES_IEC), + targets=[ + target( + expr=expr_sum_rate( + "tikv_allocator_thread_allocation", + label_selectors=['type="dealloc"'], + by_labels=["thread_name"], + ), + ) + ], + ), + ] + ) + return layout.row_panel + + +def BackupImport() -> RowPanel: + layout = Layout(title="Backup & Import") + layout.row( + [ + graph_panel( + title="Backup CPU Utilization", + yaxes=yaxes(left_format=UNITS.PERCENT_UNIT), + targets=[ + target( + expr=expr_sum_rate( + "tikv_thread_cpu_seconds_total", + label_selectors=[ + 'name=~"b.*k.*w.*k.*"', + ], + ), + legend_format="backup-{{instance}}", + ), + target( + expr=expr_sum_rate( + "tikv_thread_cpu_seconds_total", + label_selectors=[ + 'name=~"backup_io"', + ], + ), + legend_format="backup-io-{{instance}}", + ), + target( + expr=expr_simple( + "tikv_backup_softlimit", + ), + legend_format="backup-auto-throttle-{{instance}}", + ), + ], + ), + graph_panel( + title="Backup Thread Count", + targets=[ + target( + expr=expr_sum( + "tikv_backup_thread_pool_size", + ), + ), + ], + ), + graph_panel( + 
title="Backup Errors", + description="", + targets=[ + target( + expr=expr_sum_delta( + "tikv_backup_error_counter", + by_labels=["instance", "error"], + ), + ), + ], + ), + ] + ) + layout.row( + [ + heatmap_panel( + title="Backup Write CF SST Size", + yaxis=yaxis(format=UNITS.BYTES_IEC), + metric="tikv_backup_range_size_bytes_bucket", + label_selectors=['cf="write"'], + ), + heatmap_panel( + title="Backup Default CF SST Size", + yaxis=yaxis(format=UNITS.BYTES_IEC), + metric="tikv_backup_range_size_bytes_bucket", + label_selectors=['cf="default"'], + ), + graph_panel( + title="Backup SST Generation Throughput", + yaxes=yaxes(left_format=UNITS.BYTES_SEC_IEC), + targets=[ + target( + expr=expr_sum_rate( + "tikv_backup_range_size_bytes_sum", + by_labels=[], # override default by instance. + ), + legend_format="total", + ), + target( + expr=expr_sum_rate( + "tikv_backup_range_size_bytes_sum", + by_labels=["instance", "cf"], + ), + ), + ], + ), + ] + ) + layout.row( + [ + heatmap_panel( + title="Backup Scan SST Duration", + yaxis=yaxis(format=UNITS.SECONDS), + metric="tikv_backup_range_duration_seconds_bucket", + label_selectors=['type="snapshot"'], + ), + heatmap_panel( + title="Backup Scan SST Duration", + yaxis=yaxis(format=UNITS.SECONDS), + metric="tikv_backup_range_duration_seconds_bucket", + label_selectors=['type="scan"'], + ), + heatmap_panel( + title="Backup Save SST Duration", + yaxis=yaxis(format=UNITS.SECONDS), + metric="tikv_backup_range_duration_seconds_bucket", + label_selectors=['type=~"save.*"'], + ), + graph_panel( + title="Backup SST Duration", + yaxes=yaxes(left_format=UNITS.SECONDS), + targets=[ + target( + expr=expr_histogram_quantile( + 0.999, + "tikv_backup_range_duration_seconds", + by_labels=["type"], + ), + legend_format="{{type}}-99.9%", + ), + target( + expr=expr_histogram_quantile( + 0.99, + "tikv_backup_range_duration_seconds", + by_labels=["type"], + ), + legend_format="{{type}}-99%", + ), + target( + expr=expr_operator( + expr_sum( + 
"tikv_backup_range_duration_seconds_sum", + by_labels=["type"], + ), + "/", + expr_sum( + "tikv_backup_range_duration_seconds_count", + by_labels=["type"], + ), + ), + legend_format="{{type}}-avg", + ), + ], + ), + ] + ) + layout.row( + [ + heatmap_panel( + title="External Storage Create Duration", + yaxis=yaxis(format=UNITS.SECONDS), + metric="tikv_external_storage_create_seconds_bucket", + ), + graph_panel_histogram_quantiles( + title="External Storage Create Duration", + description="", + yaxes=yaxes(left_format=UNITS.SECONDS), + metric="tikv_external_storage_create_seconds", + hide_avg=True, + hide_count=True, + ), + ] + ) + layout.row( + [ + graph_panel_histogram_quantiles( + title="Checksum Request Duration", + description="", + yaxes=yaxes(left_format=UNITS.SECONDS), + metric="tikv_coprocessor_request_duration_seconds", + label_selectors=['req=~"analyze.*|checksum.*"'], + by_labels=["req"], + hide_avg=True, + hide_count=True, + ), + graph_panel( + title="IO Utilization", + yaxes=yaxes(left_format=UNITS.PERCENT_UNIT), + targets=[ + target( + expr=expr_sum_rate( + "node_disk_io_time_seconds_total", + by_labels=["instance", "device"], + ), + ), + ], + ), + ] + ) + layout.row( + [ + graph_panel( + title="Import CPU Utilization", + yaxes=yaxes(left_format=UNITS.PERCENT_UNIT), + targets=[ + target( + expr=expr_sum_rate( + "tikv_thread_cpu_seconds_total", + label_selectors=['name=~"sst_.*"'], + by_labels=["instance"], + ), + legend_format="import-{{instance}}", + ), + target( + expr=expr_sum_rate( + "tikv_thread_cpu_seconds_total", + label_selectors=['name=~"sst_.*"'], + by_labels=["instance", "tid"], + ).extra("> 0"), + legend_format="import-{{instance}}-{{tid}}", + hide=True, + ), + target( + expr=expr_count_rate( + "tikv_thread_cpu_seconds_total", + label_selectors=['name=~"sst_.*"'], + ), + legend_format="import-count-{{instance}}", + hide=True, + ), + ], + ), + graph_panel( + title="Import Thread Count", + targets=[ + target( + expr=expr_count_rate( + 
"tikv_thread_cpu_seconds_total", + label_selectors=['name=~"sst_.*"'], + by_labels=["instance"], + ), + legend_format="{{instance}}", + ), + ], + ), + graph_panel( + title="Import Errors", + targets=[ + target( + expr=expr_sum_delta( + "tikv_import_error_counter", + by_labels=["type", "error", "instance"], + ), + ), + ], + ), + ] + ) + layout.row( + [ + graph_panel_histogram_quantiles( + title="Import RPC Duration", + description="", + yaxes=yaxes(left_format=UNITS.SECONDS), + metric="tikv_import_rpc_duration", + by_labels=["request"], + hide_count=True, + ), + graph_panel( + title="Import RPC Ops", + yaxes=yaxes(left_format=UNITS.OPS_PER_SEC), + targets=[ + target( + expr=expr_sum_rate( + "tikv_import_rpc_duration_count", + label_selectors=['request!="switch_mode"'], + by_labels=["request"], + ), + ), + ], + ), + ] + ) + layout.row( + [ + heatmap_panel( + title="Import Write/Download RPC Duration", + yaxis=yaxis(format=UNITS.SECONDS), + metric="tikv_import_rpc_duration_bucket", + label_selectors=['request=~"download|write"'], + ), + heatmap_panel( + title="Import Wait Duration", + yaxis=yaxis(format=UNITS.SECONDS), + metric="tikv_import_download_duration_bucket", + label_selectors=['type="queue"'], + ), + heatmap_panel( + title="Import Read SST Duration", + yaxis=yaxis(format=UNITS.SECONDS), + metric="tikv_import_download_duration_bucket", + label_selectors=['type="read"'], + ), + heatmap_panel( + title="Import Rewrite SST Duration", + yaxis=yaxis(format=UNITS.SECONDS), + metric="tikv_import_download_duration_bucket", + label_selectors=['type="rewrite"'], + ), + ] + ) + layout.row( + [ + heatmap_panel( + title="Import Ingest RPC Duration", + yaxis=yaxis(format=UNITS.SECONDS), + metric="tikv_import_rpc_duration_bucket", + label_selectors=['request=~"ingest"'], + ), + heatmap_panel( + title="Import Ingest SST Duration", + yaxis=yaxis(format=UNITS.SECONDS), + metric="tikv_import_ingest_duration_bucket", + label_selectors=['type=~"ingest"'], + ), + heatmap_panel( + 
title="Import Ingest SST Bytes", + yaxis=yaxis(format=UNITS.SECONDS), + metric="tikv_import_ingest_byte_bucket", + ), + graph_panel( + title="Import Download SST Throughput", + yaxes=yaxes(left_format=UNITS.BYTES_SEC_IEC), + targets=[ + target( + expr=expr_sum_rate( + "tikv_import_download_bytes_sum", + ), + ), + target( + expr=expr_sum_rate( + "tikv_import_download_bytes_sum", + by_labels=[], + ), + legend_format="total", + ), + ], + ), + ] + ) + layout.row( + [ + graph_panel( + title="Import Local Write keys", + targets=[ + target( + expr=expr_sum_delta( + "tikv_import_local_write_keys", + by_labels=["type", "instance"], + ), + ), + ], + ), + graph_panel( + title="Import Local Write bytes", + yaxes=yaxes(left_format=UNITS.BYTES_SEC_IEC), + targets=[ + target( + expr=expr_sum_rate( + "tikv_import_local_write_bytes", + by_labels=["type", "instance"], + ), + ), + ], + ), + ] + ) + layout.row( + [ + graph_panel( + title="TTL Expired", + targets=[ + target( + expr=expr_sum( + "tikv_backup_raw_expired_count", + ), + ), + target( + expr=expr_sum( + "tikv_backup_raw_expired_count", + by_labels=[], + ), + legend_format="sum", + ), + ], + ), + graph_panel( + title="cloud request", + description="", + yaxes=yaxes(left_format=UNITS.SHORT), + targets=[ + target( + expr=expr_sum_rate( + "tikv_cloud_request_duration_seconds_count", + by_labels=["cloud", "req"], + ), + ), + ], + ), + ] + ) + return layout.row_panel + + +def Encryption() -> RowPanel: + layout = Layout(title="Encryption") + layout.row( + [ + graph_panel( + title="Encryption data keys", + description="Total number of encryption data keys in use", + targets=[ + target( + expr=expr_sum( + "tikv_encryption_data_key_storage_total", + ), + legend_format="{{instance}}", + ), + ], + ), + graph_panel( + title="Encrypted files", + description="Number of files being encrypted", + targets=[ + target( + expr=expr_sum( + "tikv_encryption_file_num", + ), + legend_format="{{instance}}", + ), + ], + ), + ] + ) + layout.row( + [ + 
graph_panel( + title="Encryption initialized", + description="Flag to indicate if encryption is initialized", + targets=[ + target( + expr=expr_simple( + "tikv_encryption_is_initialized", + ), + legend_format="{{instance}}", + ), + ], + ), + graph_panel( + title="Encryption meta files size", + description="Total size of encryption meta files", + yaxes=yaxes(left_format=UNITS.BYTES_IEC), + targets=[ + target( + expr=expr_simple( + "tikv_encryption_meta_file_size_bytes", + ), + legend_format="{{name}}-{{instance}}", + ), + ], + ), + ] + ) + layout.row( + [ + graph_panel( + title="Encrypt/decrypt data nanos", + description="", + targets=[ + target( + expr=expr_sum_rate( + "tikv_coprocessor_rocksdb_perf", + label_selectors=[ + 'metric="encrypt_data_nanos"', + ], + by_labels=["req"], + ), + legend_format="encrypt-{{req}}", + ), + target( + expr=expr_sum_rate( + "tikv_coprocessor_rocksdb_perf", + label_selectors=[ + 'metric="decrypt_data_nanos"', + ], + by_labels=["req"], + ), + legend_format="decrypt-{{req}}", + ), + ], + ), + graph_panel_histogram_quantiles( + title="Read/write encryption meta duration", + description="Writing or reading file duration (second)", + yaxes=yaxes(left_format=UNITS.SECONDS), + metric="tikv_encryption_write_read_file_duration_seconds", + hide_count=True, + ), + ] + ) + return layout.row_panel + + +def BackupLog() -> RowPanel: + layout = Layout(title="Backup Log") + layout.row( + [ + stat_panel( + title="Endpoint Status", + targets=[ + target( + expr=expr_simple("tikv_log_backup_enabled"), + legend_format="{{ instance }}", + ), + ], + mappings=[ + StatValueMappings( + StatValueMappingItem("Disabled", "0", "red"), + StatValueMappingItem("Enabled", "1", "green"), + ), + ], + ), + stat_panel( + title="Task Status", + targets=[ + target( + expr=expr_min("tikv_log_backup_task_status"), + ), + ], + mappings=[ + StatValueMappings( + StatValueMappingItem("Running", "0", "green"), + StatValueMappingItem("Paused", "1", "yellow"), + 
StatValueMappingItem("Error", "2", "red"), + ), + ], + ), + stat_panel( + title="Advancer Owner", + text_mode="name", + targets=[ + target( + expr="tidb_log_backup_advancer_owner > 0", + legend_format="{{ instance }}", + ), + ], + ), + stat_panel( + title="Average Flush Size", + description="The average flush size of last 30mins.", + format=UNITS.BYTES_IEC, + targets=[ + target( + expr=expr_operator( + expr_sum_increase( + "tikv_log_backup_flush_file_size_sum", + range_selector="30m", + ), + "/", + expr_sum_increase( + "tikv_log_backup_flush_duration_sec_count", + label_selectors=['stage=~"save_files"'], + range_selector="30m", + ), + ), + legend_format="{{ instance }}", + ), + ], + ), + ] + ) + layout.row( + [ + stat_panel( + title="Flushed Files (Last 30m) Per Host", + description="The current total flushed file number of this run.", + decimals=0, + targets=[ + target( + expr=expr_sum_delta( + "tikv_log_backup_flush_file_size_count", + range_selector="30m", + ).extra("> 0"), + ), + ], + ), + stat_panel( + title="Flush Times (Last 30m)", + description="This is the summary of the file count has been flushed, summered by the data each TiKV has flushed since last boot.\n**NOTE: The size may get reduced if some of TiKVs reboot.**", + decimals=0, + targets=[ + target( + expr=expr_sum_delta( + "tikv_log_backup_flush_duration_sec_count", + range_selector="30m", + label_selectors=['stage=~"save_files"'], + ), + ), + ], + ), + stat_panel( + title="Total Flushed Size (Last 30m)", + description="This is the summary of the size has been flushed, summered by the data each TiKV has flushed since last boot.\n**NOTE: The size may get reduced if some of TiKVs reboot.**", + format=UNITS.BYTES_IEC, + targets=[ + target( + expr=expr_sum_delta( + "tikv_log_backup_flush_file_size_sum", + range_selector="30m", + ), + ), + ], + ), + stat_panel( + title="Flush Files (Last 30m)", + description="This is the summary of the file count has been flushed, summered by the data each TiKV has 
flushed since last boot.\n**NOTE: The size may get reduced if some of TiKVs reboot.**", + decimals=0, + targets=[ + target( + expr=expr_sum_delta( + "tikv_log_backup_flush_file_size_count", + range_selector="30m", + ), + ), + ], + ), + ] + ) + layout.row( + [ + graph_panel( + title="CPU Usage", + description="The CPU utilization of log backup threads. \n**(Note this is the average usage for a period of time, some peak of CPU usage may be lost.)**", + yaxes=yaxes(left_format=UNITS.PERCENT_UNIT), + targets=[ + target( + expr=expr_sum_rate( + "tikv_thread_cpu_seconds_total", + label_selectors=[ + 'name=~"backup_stream|log-backup-scan(-[0-9]+)?"' + ], + ), + ) + ], + ), + graph_panel( + title="Handle Event Rate", + description="", + yaxes=yaxes(left_format=UNITS.OPS_PER_SEC), + targets=[ + target( + expr=expr_sum_rate( + "tikv_log_backup_handle_kv_batch_sum", + ), + ) + ], + ), + ] + ) + layout.row( + [ + graph_panel( + title="Initial Scan Generate Event Throughput", + description="The data rate of initial scanning emitting events.", + yaxes=yaxes(left_format=UNITS.BYTES_SEC_IEC), + targets=[ + target( + expr=expr_sum_rate( + "tikv_log_backup_incremental_scan_bytes_sum", + ), + ) + ], + ), + graph_panel( + title="Abnormal Checkpoint TS Lag", + description=None, + yaxes=yaxes(left_format=UNITS.MILLI_SECONDS), + targets=[ + target( + expr=expr_operator( + "time() * 1000", + "-", + expr_max( + "tidb_log_backup_last_checkpoint", by_labels=["task"] + ).extra("/ 262144 > 0"), + ), + legend_format="{{ task }}", + ), + ], + ), + ] + ) + layout.row( + [ + graph_panel( + title="Memory Of Events", + description="The estimated memory usage by the streaming backup module.", + yaxes=yaxes(left_format=UNITS.BYTES_IEC), + targets=[ + target( + expr=expr_sum("tikv_log_backup_heap_memory"), + ) + ], + ), + graph_panel( + title="Observed Region Count", + description="", + targets=[ + target( + expr=expr_sum("tikv_log_backup_observed_region"), + ), + target( + expr=expr_sum( + 
"tikv_log_backup_observed_region", + ), + legend_format="{{instance}}-total", + ), + ], + ), + ] + ) + layout.row( + [ + graph_panel( + title="Errors", + description="The errors met when backing up.\n**They are retryable, don't worry.**", + yaxes=yaxes(left_format=UNITS.OPS_PER_MIN), + targets=[ + target( + expr=expr_sum_delta( + "tikv_log_backup_errors", + range_selector="1m", + by_labels=["type", "instance"], + ), + ), + ], + ), + graph_panel( + title="Fatal Errors", + description="The errors met when backing up.", + yaxes=yaxes(left_format=UNITS.OPS_PER_MIN), + targets=[ + target( + expr=expr_sum_delta( + "tikv_log_backup_fatal_errors", + range_selector="1m", + by_labels=["type", "instance"], + ), + ), + ], + ), + graph_panel( + title="Checkpoint TS of Tasks", + description=None, + yaxes=yaxes(left_format=UNITS.DATE_TIME_ISO_TODAY), + null_point_mode=NULL_AS_NULL, + targets=[ + target( + expr=expr_max( + "tidb_log_backup_last_checkpoint", by_labels=["task"] + ).extra("/ 262144 > 0"), + ), + target(expr="time() * 1000", legend_format="Current Time"), + ], + series_overrides=[ + series_override( + alias="Current Time", + fill=0, + dashes=True, + ), + ], + ), + ] + ) + layout.row( + [ + heatmap_panel( + title="Flush Duration", + description="The duration of flushing a batch of file.", + yaxis=yaxis(format=UNITS.SECONDS), + metric="tikv_log_backup_flush_duration_sec_bucket", + label_selectors=['stage=~"save_files"'], + ), + heatmap_panel( + title="Initial scanning duration", + description="The duration of scanning the initial data from local DB and transform them into apply events.", + yaxis=yaxis(format=UNITS.SECONDS), + metric="tikv_log_backup_initial_scan_duration_sec_bucket", + ), + heatmap_panel( + title="Convert Raft Event duration", + description="The duration of converting a raft request into a apply event.", + yaxis=yaxis(format=UNITS.SECONDS), + metric="tikv_log_backup_event_handle_duration_sec_bucket", + label_selectors=['stage=~"to_stream_event"'], + ), 
+ heatmap_panel( + title="Wait for Lock Duration", + description="The duration of waiting the mutex of the controller.", + yaxis=yaxis(format=UNITS.SECONDS), + metric="tikv_log_backup_event_handle_duration_sec_bucket", + label_selectors=['stage=~"get_router_lock"'], + ), + ] + ) + layout.row( + [ + heatmap_panel( + title="Command Batch Size", + description="The number of KV-modify of each raft command observed.", + yaxis=yaxis(format=UNITS.SHORT), + metric="tikv_log_backup_handle_kv_batch_bucket", + ), + heatmap_panel( + title="Save to Temp File Duration", + description="The total cost of saving an event into temporary file.", + yaxis=yaxis(format=UNITS.SECONDS), + metric="tikv_log_backup_event_handle_duration_sec_bucket", + label_selectors=['stage=~"save_to_temp_file"'], + ), + heatmap_panel( + title="Write to Temp File Duration", + description="The total cost of writing a event into temporary file.\nComparing to the ***Save*** duration, it doesn't contain the time cost of routing the task by range / task.", + yaxis=yaxis(format=UNITS.SECONDS), + metric="tikv_log_backup_on_event_duration_seconds_bucket", + label_selectors=['stage="write_to_tempfile"'], + ), + heatmap_panel( + title="System Write Call Duration", + description="The duration of collecting metadata and call the UNIX system call *write* for each event.", + yaxis=yaxis(format=UNITS.SECONDS), + metric="tikv_log_backup_on_event_duration_seconds_bucket", + label_selectors=['stage="syscall_write"'], + ), + ] + ) + layout.row( + [ + graph_panel( + title="Internal Message Type", + description="The internal message type count.", + yaxes=yaxes(left_format=UNITS.OPS_PER_SEC, log_base=2), + targets=[ + target( + expr=expr_sum_rate( + "tikv_log_backup_interal_actor_acting_duration_sec_count", + by_labels=["message"], + ), + ) + ], + ), + graph_panel( + title="Internal Message Handling Duration (P99)", + description="The internal handling message duration.", + yaxes=yaxes(left_format=UNITS.SECONDS), + targets=[ + 
target( + expr=expr_histogram_quantile( + 0.99, + "tikv_log_backup_interal_actor_acting_duration_sec", + by_labels=["message"], + ), + legend_format="{{message}}", + ) + ], + ), + graph_panel( + title="Internal Message Handling Duration (P90)", + description="The internal handling message duration.", + yaxes=yaxes(left_format=UNITS.SECONDS), + targets=[ + target( + expr=expr_histogram_quantile( + 0.9, + "tikv_log_backup_interal_actor_acting_duration_sec", + by_labels=["message"], + ), + legend_format="{{message}}", + ) + ], + ), + ] + ) + layout.row( + [ + graph_panel( + title="Initial Scan RocksDB Throughput", + description="The internal read throughput of RocksDB during initial scanning. This panel can roughly present the read through to the hard disk of initial scanning.", + yaxes=yaxes(left_format=UNITS.BYTES_SEC_IEC), + targets=[ + target( + expr=expr_sum_rate( + "tikv_log_backup_initial_scan_operations", + label_selectors=['op=~"read_bytes"'], + by_labels=["cf"], + ), + ) + ], + ), + graph_panel( + title="Initial Scan RocksDB Operation", + description="Misc statistics of RocksDB during initial scanning.", + yaxes=yaxes(left_format=UNITS.OPS_PER_SEC), + targets=[ + target( + expr=expr_sum_rate( + "tikv_log_backup_initial_scan_operations", + label_selectors=['op!~"read_bytes"'], + by_labels=["cf", "op"], + ).extra("> 0"), + ) + ], + ), + ] + ) + layout.row( + [ + graph_panel( + title="Initial Scanning Trigger Reason", + description="The reason of triggering initial scanning.", + targets=[ + target( + expr=expr_sum_rate( + "tikv_log_backup_initial_scan_reason", + by_labels=["reason"], + ), + ) + ], + ), + graph_panel( + title="Region Checkpoint Key Putting", + description="", + yaxes=yaxes(left_format=UNITS.COUNTS_PER_SEC), + targets=[ + target( + expr=expr_sum_rate( + "tikv_log_backup_metadata_key_operation", + by_labels=["type"], + ), + ) + ], + ), + ] + ) + layout.row( + [ + heatmap_panel( + title="Request Checkpoint Batch Size", + 
metric="tidb_log_backup_advancer_batch_size_bucket", + label_selectors=['type="checkpoint"'], + ), + heatmap_panel( + title="Tick Duration", + yaxis=yaxis(format=UNITS.SECONDS), + metric="tidb_log_backup_advancer_tick_duration_sec_bucket", + label_selectors=['step="tick"'], + ), + ] + ) + layout.row( + [ + graph_panel( + title="Region Checkpoint Failure Reason", + description="The reason of advancer failed to be advanced.", + targets=[ + target( + expr=expr_sum_rate( + "tidb_log_backup_region_request_failure", + label_selectors=['reason!="retryable-scan-region"'], + by_labels=["reason"], + ), + ), + ], + ), + graph_panel( + title="Request Result", + description="The result of getting region checkpoints.", + targets=[ + target( + expr=expr_sum_rate( + "tidb_log_backup_region_request", + by_labels=["result"], + ), + ), + ], + ), + ] + ) + layout.row( + [ + graph_panel( + title="Tick Duration (P99)", + description="The internal handling message duration.", + yaxes=yaxes(left_format=UNITS.SECONDS), + targets=[ + target( + expr=expr_histogram_quantile( + 0.99, + "tidb_log_backup_advancer_tick_duration_sec", + by_labels=["step"], + ), + legend_format="{{ step }}", + ) + ], + ), + graph_panel( + title="Tick Duration (P90)", + description="The internal handling message duration.", + yaxes=yaxes(left_format=UNITS.SECONDS), + targets=[ + target( + expr=expr_histogram_quantile( + 0.9, + "tidb_log_backup_advancer_tick_duration_sec", + by_labels=["step"], + ), + legend_format="{{ step }}", + ) + ], + ), + ] + ) + layout.row( + [ + graph_panel( + title="Get Region Operation Count", + description="The frequent of getting region level checkpoint.", + targets=[ + target( + expr=expr_sum_rate( + "tidb_log_backup_advancer_tick_duration_sec_count", + label_selectors=['step="get-regions-in-range"'], + by_labels=["step", "instance"], + ), + ) + ], + ), + graph_panel( + title="Try Advance Trigger Time", + description="The variant of checkpoint group.", + targets=[ + target( + 
expr=expr_sum_rate( + "tidb_log_backup_advancer_tick_duration_sec_count", + label_selectors=['step="try-advance"'], + by_labels=["step", "instance"], + ), + ) + ], + ), + ] + ) + return layout.row_panel + + +def SlowTrendStatistics() -> RowPanel: + layout = Layout(title="Slow Trend Statistics") + layout.row( + [ + graph_panel( + title="Slow Trend", + description="The changing trend of the slowness on I/O operations. 'value > 0' means the related store might have a slow trend.", + targets=[ + target( + expr=expr_sum( + "tikv_raftstore_slow_trend", + ), + ), + ], + ), + graph_panel( + title="QPS Changing Trend", + description="The changing trend of QPS on each store. 'value < 0' means the QPS has a dropping trend.", + targets=[ + target( + expr=expr_sum( + "tikv_raftstore_slow_trend_result", + ), + ), + ], + ), + ] + ) + layout.row( + [ + graph_panel( + title="AVG Sampling Latency", + description="The sampling latency of recent queries. A larger value indicates that the store is more likely to be the slowest store.", + yaxes=yaxes(left_format=UNITS.MICRO_SECONDS), + targets=[ + target( + expr=expr_sum( + "tikv_raftstore_slow_trend_l0", + ), + ), + ], + ), + graph_panel( + title="QPS of each store", + description="The QPS of each store.", + yaxes=yaxes(left_format=UNITS.OPS_PER_SEC), + targets=[ + target( + expr=expr_sum( + "tikv_raftstore_slow_trend_result_value", + ), + ), + ], + ), + ] + ) + return layout.row_panel + + +def StatusServer() -> RowPanel: + layout = Layout(title="Status Server") + layout.row( + [ + graph_panel_histogram_quantiles( + title="Status API Request Duration", + description="The 99 quantile durtion of status server API requests", + metric="tikv_status_server_request_duration_seconds", + yaxes=yaxes(left_format=UNITS.SECONDS), + by_labels=["path"], + hide_p9999=True, + hide_count=True, + hide_avg=True, + ), + graph_panel( + title="Status API Request (op/s)", + yaxes=yaxes(left_format=UNITS.OPS_PER_SEC), + targets=[ + target( + 
expr=expr_sum_rate( + "tikv_status_server_request_duration_seconds_count", + by_labels=["path"], + ), + ), + ], + ), + ] + ) + return layout.row_panel + + +#### Metrics Definition End #### + + +dashboard = Dashboard( + title="Test-Cluster-TiKV-Details", + uid="RDVQiEzZz", + timezone="browser", + refresh="1m", + inputs=[DATASOURCE_INPUT], + editable=True, + templating=Templates(), + panels=[ + Duration(), + Cluster(), + Errors(), + Server(), + gRPC(), + ThreadCPU(), + TTL(), + PD(), + IOBreakdown(), + RaftWaterfall(), + RaftIO(), + RaftPropose(), + RaftProcess(), + RaftMessage(), + RaftAdmin(), + RaftLog(), + LocalReader(), + UnifiedReadPool(), + Storage(), + FlowControl(), + SchedulerCommands(), + Scheduler(), + GC(), + Snapshot(), + Task(), + CoprocessorOverview(), + CoprocessorDetail(), + Threads(), + RocksDB(), + RaftEngine(), + Titan(), + PessimisticLocking(), + PointInTimeRestore(), + ResolvedTS(), + Memory(), + BackupImport(), + Encryption(), + BackupLog(), + SlowTrendStatistics(), + StatusServer(), + ], + # Set 14 or larger to support shared crosshair or shared tooltip. 
+ # See https://github.com/grafana/grafana/blob/v10.2.2/public/app/features/dashboard/state/DashboardMigrator.ts#L443-L445 + schemaVersion=14, + graphTooltip=GRAPH_TOOLTIP_MODE_SHARED_CROSSHAIR, +).auto_panel_ids() diff --git a/metrics/grafana/tikv_details.json b/metrics/grafana/tikv_details.json index d327041cd8a..5dd36b73dfb 100644 --- a/metrics/grafana/tikv_details.json +++ b/metrics/grafana/tikv_details.json @@ -1,122 +1,102 @@ { "__inputs": [ { - "name": "DS_TEST-CLUSTER", - "label": "test-cluster", "description": "", - "type": "datasource", + "label": "test-cluster", + "name": "DS_TEST-CLUSTER", "pluginId": "prometheus", - "pluginName": "Prometheus" - } - ], - "__requires": [ - { - "type": "grafana", - "id": "grafana", - "name": "Grafana", - "version": "7.5.11" - }, - { - "type": "panel", - "id": "graph", - "name": "Graph", - "version": "" - }, - { - "type": "panel", - "id": "heatmap", - "name": "Heatmap", - "version": "" - }, - { - "type": "datasource", - "id": "prometheus", - "name": "Prometheus", - "version": "1.0.0" - }, - { - "type": "panel", - "id": "singlestat", - "name": "Singlestat", - "version": "" - }, - { - "type": "panel", - "id": "stat", - "name": "Stat", - "version": "" - }, - { - "type": "panel", - "id": "table", - "name": "Table", - "version": "" + "pluginName": "Prometheus", + "type": "datasource" } ], "annotations": { - "list": [ - { - "builtIn": 1, - "datasource": "${DS_TEST-CLUSTER}", - "enable": true, - "hide": true, - "iconColor": "rgba(0, 211, 255, 1)", - "name": "Annotations & Alerts", - "type": "dashboard" - } - ] + "list": [] }, + "description": "", "editable": true, "gnetId": null, "graphTooltip": 1, + "hideControls": false, "id": null, - "iteration": 1689914850671, "links": [], "panels": [ { + "cacheTimeout": null, "collapsed": true, "datasource": null, + "description": null, + "editable": true, + "error": false, + "fieldConfig": { + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } + }, "gridPos": { - 
"h": 1, + "h": 7, "w": 24, "x": 0, "y": 0 }, - "id": 13620, + "height": null, + "hideTimeOverride": false, + "id": 1, + "interval": null, + "links": [], + "maxDataPoints": 100, + "maxPerRow": null, + "minSpan": null, "panels": [ { "aliasColors": {}, "bars": true, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", - "description": "Write Duration Composition", + "description": "Write Pipeline Composition", "editable": true, "error": false, "fieldConfig": { - "defaults": {}, - "overrides": [] + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } }, "fill": 1, - "fillGradient": 0, - "grid": {}, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, "gridPos": { "h": 7, "w": 12, "x": 0, - "y": 1 + "y": 0 }, - "hiddenSeries": false, - "id": 12842, + "height": null, + "hideTimeOverride": false, + "id": 2, + "interval": null, + "isNew": true, "legend": { "alignAsTable": true, "avg": false, "current": true, + "hideEmpty": true, + "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, - "sort": "current", + "sideWidth": null, + "sort": "max", "sortDesc": true, "total": false, "values": true @@ -124,77 +104,115 @@ "lines": false, "linewidth": 1, "links": [], - "nullPointMode": "null", + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", "options": { - "alertThreshold": true + "alertThreshold": true, + "dataLinks": [] }, "percentage": false, - "pluginVersion": "7.5.10", "pointradius": 5, "points": false, "renderer": "flot", + "repeat": null, + "repeatDirection": null, "seriesOverrides": [], - "spaceLength": 10, + "span": null, "stack": true, "steppedLine": false, "targets": [ { - "exemplar": true, - "expr": "histogram_quantile(0.99, 
sum(rate(tikv_raftstore_append_log_duration_seconds_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])) by (le))", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "histogram_quantile(0.99,(\n sum(rate(\n tikv_raftstore_append_log_duration_seconds_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (le) \n \n \n)) ", "format": "time_series", + "hide": false, + "instant": false, "interval": "", - "intervalFactor": 2, + "intervalFactor": 1, "legendFormat": "Write Raft Log .99", - "refId": "A", - "step": 4 + "metric": "", + "query": "histogram_quantile(0.99,(\n sum(rate(\n tikv_raftstore_append_log_duration_seconds_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (le) \n \n \n)) ", + "refId": "", + "step": 10, + "target": "" }, { - "exemplar": true, - "expr": "histogram_quantile(0.99, sum(rate(tikv_raftstore_request_wait_time_duration_secs_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])) by (le))", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "histogram_quantile(0.99,(\n sum(rate(\n tikv_raftstore_request_wait_time_duration_secs_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (le) \n \n \n)) ", + "format": "time_series", "hide": false, + "instant": false, "interval": "", + "intervalFactor": 1, "legendFormat": "Propose Wait .99", - "refId": "B" + "metric": "", + "query": "histogram_quantile(0.99,(\n sum(rate(\n tikv_raftstore_request_wait_time_duration_secs_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (le) \n \n \n)) ", + "refId": "", + "step": 10, + "target": "" }, { - "exemplar": true, - "expr": "histogram_quantile(0.99, 
sum(rate(tikv_raftstore_apply_wait_time_duration_secs_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])) by (le))", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "histogram_quantile(0.99,(\n sum(rate(\n tikv_raftstore_apply_wait_time_duration_secs_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (le) \n \n \n)) ", + "format": "time_series", "hide": false, + "instant": false, "interval": "", + "intervalFactor": 1, "legendFormat": "Apply Wait .99", - "refId": "C" + "metric": "", + "query": "histogram_quantile(0.99,(\n sum(rate(\n tikv_raftstore_apply_wait_time_duration_secs_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (le) \n \n \n)) ", + "refId": "", + "step": 10, + "target": "" }, { - "exemplar": true, - "expr": "histogram_quantile(0.99, sum(rate(tikv_raftstore_commit_log_duration_seconds_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])) by (le))", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "histogram_quantile(0.99,(\n sum(rate(\n tikv_raftstore_commit_log_duration_seconds_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (le) \n \n \n)) ", + "format": "time_series", "hide": false, + "instant": false, "interval": "", + "intervalFactor": 1, "legendFormat": "Replicate Raft Log .99", - "refId": "D" + "metric": "", + "query": "histogram_quantile(0.99,(\n sum(rate(\n tikv_raftstore_commit_log_duration_seconds_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (le) \n \n \n)) ", + "refId": "", + "step": 10, + "target": "" }, { - "exemplar": true, - "expr": "histogram_quantile(0.99, sum(rate(tikv_raftstore_apply_log_duration_seconds_bucket{k8s_cluster=\"$k8s_cluster\", 
tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])) by (le))", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "histogram_quantile(0.99,(\n sum(rate(\n tikv_raftstore_apply_log_duration_seconds_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (le) \n \n \n)) ", + "format": "time_series", "hide": false, + "instant": false, "interval": "", + "intervalFactor": 1, "legendFormat": "Apply Duration .99", - "refId": "E" + "metric": "", + "query": "histogram_quantile(0.99,(\n sum(rate(\n tikv_raftstore_apply_log_duration_seconds_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (le) \n \n \n)) ", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, "title": "Write Pipeline Duration", "tooltip": { - "msResolution": false, + "msResolution": true, "shared": true, "sort": 0, "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -202,6 +220,7 @@ }, "yaxes": [ { + "decimals": null, "format": "s", "label": null, "logBase": 1, @@ -210,6 +229,7 @@ "show": true }, { + "decimals": null, "format": "short", "label": null, "logBase": 1, @@ -220,42 +240,56 @@ ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } }, { "aliasColors": {}, "bars": true, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", "description": "Read Duration Composition", "editable": true, "error": false, "fieldConfig": { - "defaults": {}, - "overrides": [] + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } }, "fill": 1, - "fillGradient": 0, - "grid": {}, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + 
"threshold2Color": "rgba(234, 112, 112, 0.22)" + }, "gridPos": { "h": 7, "w": 12, "x": 12, - "y": 1 + "y": 0 }, - "hiddenSeries": false, - "id": 12970, + "height": null, + "hideTimeOverride": false, + "id": 3, + "interval": null, + "isNew": true, "legend": { "alignAsTable": true, "avg": false, "current": true, + "hideEmpty": true, + "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, - "sort": "current", + "sideWidth": null, + "sort": "max", "sortDesc": true, "total": false, "values": true @@ -263,61 +297,85 @@ "lines": false, "linewidth": 1, "links": [], - "nullPointMode": "null", + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", "options": { - "alertThreshold": true + "alertThreshold": true, + "dataLinks": [] }, "percentage": false, - "pluginVersion": "7.5.10", "pointradius": 5, "points": false, "renderer": "flot", + "repeat": null, + "repeatDirection": null, "seriesOverrides": [], - "spaceLength": 10, + "span": null, "stack": true, "steppedLine": false, "targets": [ { - "exemplar": true, - "expr": "histogram_quantile(0.99, sum(delta(tikv_storage_engine_async_request_duration_seconds_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", type=\"snapshot\"}[1m])) by (le))", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "histogram_quantile(0.99,(\n sum(rate(\n tikv_storage_engine_async_request_duration_seconds_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",type=\"snapshot\"}\n [$__rate_interval]\n)) by (le) \n \n \n)) ", "format": "time_series", + "hide": false, + "instant": false, "interval": "", - "intervalFactor": 2, + "intervalFactor": 1, "legendFormat": "Get Snapshot .99", - "refId": "A", - "step": 4 + "metric": "", + "query": "histogram_quantile(0.99,(\n sum(rate(\n tikv_storage_engine_async_request_duration_seconds_bucket\n 
{k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",type=\"snapshot\"}\n [$__rate_interval]\n)) by (le) \n \n \n)) ", + "refId": "", + "step": 10, + "target": "" }, { - "exemplar": true, - "expr": "histogram_quantile(0.99, sum(rate(tikv_coprocessor_request_wait_seconds_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])) by (le))", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "histogram_quantile(0.99,(\n sum(rate(\n tikv_coprocessor_request_wait_seconds_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",type=\"all\"}\n [$__rate_interval]\n)) by (le) \n \n \n)) ", + "format": "time_series", "hide": false, + "instant": false, "interval": "", + "intervalFactor": 1, "legendFormat": "Cop Wait .99", - "refId": "B" + "metric": "", + "query": "histogram_quantile(0.99,(\n sum(rate(\n tikv_coprocessor_request_wait_seconds_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",type=\"all\"}\n [$__rate_interval]\n)) by (le) \n \n \n)) ", + "refId": "", + "step": 10, + "target": "" }, { - "exemplar": true, - "expr": "histogram_quantile(0.95, sum(rate(tikv_coprocessor_request_handle_seconds_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])) by (le))", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "histogram_quantile(0.95,(\n sum(rate(\n tikv_coprocessor_request_handle_seconds_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (le) \n \n \n)) ", + "format": "time_series", "hide": false, + "instant": false, "interval": "", + "intervalFactor": 1, "legendFormat": "Cop Handle .99", - "refId": "C" + "metric": "", + "query": "histogram_quantile(0.95,(\n sum(rate(\n tikv_coprocessor_request_handle_seconds_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n 
[$__rate_interval]\n)) by (le) \n \n \n)) ", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, "title": "Cop Read Duration", "tooltip": { - "msResolution": false, + "msResolution": true, "shared": true, "sort": 0, "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -325,6 +383,7 @@ }, "yaxes": [ { + "decimals": null, "format": "s", "label": null, "logBase": 1, @@ -333,6 +392,7 @@ "show": true }, { + "decimals": null, "format": "short", "label": null, "logBase": 1, @@ -343,54 +403,92 @@ ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } } ], "repeat": null, + "repeatDirection": null, + "span": null, + "targets": [], + "timeFrom": null, + "timeShift": null, "title": "Duration", + "transformations": [], + "transparent": false, "type": "row" }, { + "cacheTimeout": null, "collapsed": true, "datasource": null, + "description": null, + "editable": true, + "error": false, + "fieldConfig": { + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } + }, "gridPos": { - "h": 1, + "h": 7, "w": 24, "x": 0, - "y": 1 + "y": 0 }, - "id": 2742, + "height": null, + "hideTimeOverride": false, + "id": 4, + "interval": null, + "links": [], + "maxDataPoints": 100, + "maxPerRow": null, + "minSpan": null, "panels": [ { "aliasColors": {}, "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", - "decimals": 1, "description": "The storage size per TiKV instance", "editable": true, "error": false, "fieldConfig": { - "defaults": {}, - "overrides": [] + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } + }, + "fill": 1, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 
112, 0.22)" }, - "fill": 5, - "fillGradient": 0, - "grid": {}, "gridPos": { - "h": 8, + "h": 7, "w": 8, "x": 0, - "y": 1 + "y": 0 }, - "hiddenSeries": false, - "id": 56, + "height": null, + "hideTimeOverride": false, + "id": 5, + "interval": null, + "isNew": true, "legend": { "alignAsTable": true, "avg": false, "current": true, + "hideEmpty": true, + "hideZero": true, "max": false, "min": false, "rightSide": true, @@ -402,45 +500,57 @@ "values": true }, "lines": true, - "linewidth": 0, + "linewidth": 1, "links": [], - "nullPointMode": "null", + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", "options": { - "alertThreshold": true + "alertThreshold": true, + "dataLinks": [] }, "percentage": false, - "pluginVersion": "7.5.7", "pointradius": 5, "points": false, "renderer": "flot", + "repeat": null, + "repeatDirection": null, "seriesOverrides": [], - "spaceLength": 10, + "span": null, "stack": true, "steppedLine": false, "targets": [ { - "expr": "sum(tikv_store_size_bytes{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", type=\"used\"}) by (instance)", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum((\n tikv_store_size_bytes\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",type = \"used\"}\n \n)) by (instance) ", "format": "time_series", - "intervalFactor": 2, + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, "legendFormat": "{{instance}}", - "refId": "A", - "step": 10 + "metric": "", + "query": "sum((\n tikv_store_size_bytes\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",type = \"used\"}\n \n)) by (instance) ", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, "title": "Store size", "tooltip": { - "msResolution": false, + "msResolution": true, "shared": true, "sort": 0, "value_type": "individual" }, + 
"transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -448,14 +558,16 @@ }, "yaxes": [ { - "format": "decbytes", + "decimals": null, + "format": "bytes", "label": null, "logBase": 1, "max": null, - "min": "0", + "min": null, "show": true }, { + "decimals": null, "format": "short", "label": null, "logBase": 1, @@ -466,38 +578,50 @@ ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } }, { "aliasColors": {}, "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", - "decimals": 1, "description": "The available capacity size of each TiKV instance", "editable": true, "error": false, "fieldConfig": { - "defaults": {}, - "overrides": [] + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } + }, + "fill": 1, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" }, - "fill": 5, - "fillGradient": 0, - "grid": {}, "gridPos": { - "h": 8, + "h": 7, "w": 8, "x": 8, - "y": 1 + "y": 0 }, - "hiddenSeries": false, - "id": 1706, + "height": null, + "hideTimeOverride": false, + "id": 6, + "interval": null, + "isNew": true, "legend": { "alignAsTable": true, "avg": false, "current": true, + "hideEmpty": true, + "hideZero": true, "max": false, "min": false, "rightSide": true, @@ -509,45 +633,57 @@ "values": true }, "lines": true, - "linewidth": 0, + "linewidth": 1, "links": [], - "nullPointMode": "null", + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", "options": { - "alertThreshold": true + "alertThreshold": true, + "dataLinks": [] }, "percentage": false, - "pluginVersion": "7.5.7", "pointradius": 5, "points": false, "renderer": "flot", + "repeat": null, + "repeatDirection": null, "seriesOverrides": [], - "spaceLength": 10, + "span": 
null, "stack": true, "steppedLine": false, "targets": [ { - "expr": "sum(tikv_store_size_bytes{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", type=\"available\"}) by (instance)", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum((\n tikv_store_size_bytes\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",type=\"available\"}\n \n)) by (instance) ", "format": "time_series", - "intervalFactor": 2, + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, "legendFormat": "{{instance}}", - "refId": "A", - "step": 10 + "metric": "", + "query": "sum((\n tikv_store_size_bytes\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",type=\"available\"}\n \n)) by (instance) ", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, "title": "Available size", "tooltip": { - "msResolution": false, + "msResolution": true, "shared": true, "sort": 0, "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -555,14 +691,16 @@ }, "yaxes": [ { - "format": "decbytes", + "decimals": null, + "format": "bytes", "label": null, "logBase": 1, "max": null, - "min": "0", + "min": null, "show": true }, { + "decimals": null, "format": "short", "label": null, "logBase": 1, @@ -573,38 +711,50 @@ ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } }, { "aliasColors": {}, "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", - "decimals": 1, "description": "The capacity size per TiKV instance", "editable": true, "error": false, "fieldConfig": { - "defaults": {}, - "overrides": [] + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } + }, + "fill": 1, + "fillGradient": 1, + "grid": { + 
"threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" }, - "fill": 5, - "fillGradient": 0, - "grid": {}, "gridPos": { - "h": 8, + "h": 7, "w": 8, "x": 16, - "y": 1 + "y": 0 }, - "hiddenSeries": false, - "id": 1707, + "height": null, + "hideTimeOverride": false, + "id": 7, + "interval": null, + "isNew": true, "legend": { "alignAsTable": true, "avg": false, "current": true, + "hideEmpty": true, + "hideZero": true, "max": false, "min": false, "rightSide": true, @@ -616,45 +766,57 @@ "values": true }, "lines": true, - "linewidth": 0, + "linewidth": 1, "links": [], - "nullPointMode": "null", + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", "options": { - "alertThreshold": true + "alertThreshold": true, + "dataLinks": [] }, "percentage": false, - "pluginVersion": "7.5.7", "pointradius": 5, "points": false, "renderer": "flot", + "repeat": null, + "repeatDirection": null, "seriesOverrides": [], - "spaceLength": 10, + "span": null, "stack": true, "steppedLine": false, "targets": [ { - "expr": "sum(tikv_store_size_bytes{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", type=\"capacity\"}) by (instance)", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum((\n tikv_store_size_bytes\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",type=\"capacity\"}\n \n)) by (instance) ", "format": "time_series", - "intervalFactor": 2, + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, "legendFormat": "{{instance}}", - "refId": "A", - "step": 10 + "metric": "", + "query": "sum((\n tikv_store_size_bytes\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",type=\"capacity\"}\n \n)) by (instance) ", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, 
"title": "Capacity size", "tooltip": { - "msResolution": false, + "msResolution": true, "shared": true, "sort": 0, "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -662,14 +824,16 @@ }, "yaxes": [ { - "format": "decbytes", + "decimals": null, + "format": "bytes", "label": null, "logBase": 1, "max": null, - "min": "0", + "min": null, "show": true }, { + "decimals": null, "format": "short", "label": null, "logBase": 1, @@ -680,38 +844,50 @@ ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } }, { "aliasColors": {}, "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", - "decimals": 1, "description": "The CPU usage of each TiKV instance", "editable": true, "error": false, "fieldConfig": { - "defaults": {}, - "overrides": [] + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } + }, + "fill": 1, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" }, - "fill": 0, - "fillGradient": 0, - "grid": {}, "gridPos": { - "h": 8, + "h": 7, "w": 12, "x": 0, - "y": 9 + "y": 7 }, - "hiddenSeries": false, - "id": 1708, + "height": null, + "hideTimeOverride": false, + "id": 8, + "interval": null, + "isNew": true, "legend": { "alignAsTable": true, "avg": false, "current": true, + "hideEmpty": true, + "hideZero": true, "max": true, "min": false, "rightSide": true, @@ -725,43 +901,55 @@ "lines": true, "linewidth": 1, "links": [], - "nullPointMode": "null", + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", "options": { - "alertThreshold": true + "alertThreshold": true, + "dataLinks": [] }, "percentage": false, - "pluginVersion": "7.5.7", "pointradius": 5, "points": false, "renderer": "flot", + 
"repeat": null, + "repeatDirection": null, "seriesOverrides": [], - "spaceLength": 10, + "span": null, "stack": false, "steppedLine": false, "targets": [ { - "expr": "sum(rate(process_cpu_seconds_total{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", job=~\".*tikv\"}[1m])) by (instance)", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n process_cpu_seconds_total\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",job=~\".*tikv\"}\n [$__rate_interval]\n)) by (instance) ", "format": "time_series", - "intervalFactor": 2, + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, "legendFormat": "{{instance}}", - "refId": "A", - "step": 10 + "metric": "", + "query": "sum(rate(\n process_cpu_seconds_total\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",job=~\".*tikv\"}\n [$__rate_interval]\n)) by (instance) ", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, "title": "CPU", "tooltip": { - "msResolution": false, + "msResolution": true, "shared": true, "sort": 0, "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -769,14 +957,16 @@ }, "yaxes": [ { + "decimals": null, "format": "percentunit", "label": null, "logBase": 1, "max": null, - "min": "0", + "min": null, "show": true }, { + "decimals": null, "format": "short", "label": null, "logBase": 1, @@ -787,44 +977,56 @@ ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } }, { "aliasColors": {}, "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", - "decimals": 1, "description": "The memory usage per TiKV instance", "editable": true, "error": false, "fieldConfig": { - "defaults": {}, - "overrides": [] + 
"defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } + }, + "fill": 1, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" }, - "fill": 0, - "fillGradient": 0, - "grid": {}, "gridPos": { - "h": 8, + "h": 7, "w": 12, "x": 12, - "y": 9 + "y": 7 }, - "hiddenSeries": false, - "id": 1709, + "height": null, + "hideTimeOverride": false, + "id": 9, + "interval": null, + "isNew": true, "legend": { "alignAsTable": true, "avg": false, "current": true, + "hideEmpty": true, + "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, - "sort": "current", + "sort": "max", "sortDesc": true, "total": false, "values": true @@ -832,43 +1034,55 @@ "lines": true, "linewidth": 1, "links": [], - "nullPointMode": "null", + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", "options": { - "alertThreshold": true + "alertThreshold": true, + "dataLinks": [] }, "percentage": false, - "pluginVersion": "7.5.7", "pointradius": 5, "points": false, "renderer": "flot", + "repeat": null, + "repeatDirection": null, "seriesOverrides": [], - "spaceLength": 10, + "span": null, "stack": false, "steppedLine": false, "targets": [ { - "expr": "avg(process_resident_memory_bytes{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", job=~\".*tikv\"}) by (instance)", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum((\n process_resident_memory_bytes\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",job=~\".*tikv\"}\n \n)) by (instance) ", "format": "time_series", - "intervalFactor": 2, + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, "legendFormat": "{{instance}}", - "refId": "A", - "step": 10 + "metric": "", + "query": "sum((\n process_resident_memory_bytes\n 
{k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",job=~\".*tikv\"}\n \n)) by (instance) ", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, "title": "Memory", "tooltip": { - "msResolution": false, + "msResolution": true, "shared": true, "sort": 0, "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -876,14 +1090,16 @@ }, "yaxes": [ { + "decimals": null, "format": "bytes", "label": null, "logBase": 1, "max": null, - "min": "0", + "min": null, "show": true }, { + "decimals": null, "format": "short", "label": null, "logBase": 1, @@ -894,38 +1110,50 @@ ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } }, { "aliasColors": {}, "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", - "decimals": 1, "description": "The I/O utilization per TiKV instance", "editable": true, "error": false, "fieldConfig": { - "defaults": {}, - "overrides": [] + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } + }, + "fill": 1, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" }, - "fill": 0, - "fillGradient": 0, - "grid": {}, "gridPos": { - "h": 8, + "h": 7, "w": 12, "x": 0, - "y": 17 + "y": 14 }, - "hiddenSeries": false, - "id": 1710, + "height": null, + "hideTimeOverride": false, + "id": 10, + "interval": null, + "isNew": true, "legend": { "alignAsTable": true, "avg": false, "current": true, + "hideEmpty": true, + "hideZero": true, "max": true, "min": false, "rightSide": true, @@ -939,43 +1167,55 @@ "lines": true, "linewidth": 1, "links": [], - "nullPointMode": "null", + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + 
"nullPointMode": "null as zero", "options": { - "alertThreshold": true + "alertThreshold": true, + "dataLinks": [] }, "percentage": false, - "pluginVersion": "7.5.7", "pointradius": 5, "points": false, "renderer": "flot", + "repeat": null, + "repeatDirection": null, "seriesOverrides": [], - "spaceLength": 10, + "span": null, "stack": false, "steppedLine": false, "targets": [ { - "expr": "rate(node_disk_io_time_seconds_total{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\"}[1m])", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n node_disk_io_time_seconds_total\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (instance) ", "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{instance}} - {{device}}", - "refId": "A", - "step": 10 + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{instance}}-{{device}}", + "metric": "", + "query": "sum(rate(\n node_disk_io_time_seconds_total\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (instance) ", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, "title": "IO utilization", "tooltip": { - "msResolution": false, + "msResolution": true, "shared": true, "sort": 0, "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -983,14 +1223,16 @@ }, "yaxes": [ { + "decimals": null, "format": "percentunit", "label": null, "logBase": 1, "max": null, - "min": "0", + "min": null, "show": true }, { + "decimals": null, "format": "short", "label": null, "logBase": 1, @@ -1001,44 +1243,56 @@ ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } }, { "aliasColors": {}, "bars": false, - "dashLength": 10, - "dashes": false, + 
"cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", - "decimals": 1, "description": "The total bytes of read and write in each TiKV instance", "editable": true, "error": false, "fieldConfig": { - "defaults": {}, - "overrides": [] + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } + }, + "fill": 1, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" }, - "fill": 0, - "fillGradient": 0, - "grid": {}, "gridPos": { - "h": 8, + "h": 7, "w": 12, "x": 12, - "y": 17 + "y": 14 }, - "hiddenSeries": false, - "id": 1711, + "height": null, + "hideTimeOverride": false, + "id": 11, + "interval": null, + "isNew": true, "legend": { "alignAsTable": true, "avg": false, "current": true, + "hideEmpty": true, + "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, - "sort": "current", + "sort": "max", "sortDesc": true, "total": false, "values": true @@ -1046,52 +1300,70 @@ "lines": true, "linewidth": 1, "links": [], - "nullPointMode": "null", + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", "options": { - "alertThreshold": true + "alertThreshold": true, + "dataLinks": [] }, "percentage": false, - "pluginVersion": "7.5.7", "pointradius": 5, "points": false, "renderer": "flot", + "repeat": null, + "repeatDirection": null, "seriesOverrides": [], - "spaceLength": 10, + "span": null, "stack": false, "steppedLine": false, "targets": [ { - "expr": "sum(rate(tikv_engine_flow_bytes{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", db=\"kv\", type=\"wal_file_bytes\"}[1m])) by (instance)", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_engine_flow_bytes\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",type=\"wal_file_bytes\"}\n [$__rate_interval]\n)) by 
(instance) ", "format": "time_series", "hide": false, - "intervalFactor": 2, + "instant": false, + "interval": "", + "intervalFactor": 1, "legendFormat": "{{instance}}-write", - "refId": "A", - "step": 10 + "metric": "", + "query": "sum(rate(\n tikv_engine_flow_bytes\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",type=\"wal_file_bytes\"}\n [$__rate_interval]\n)) by (instance) ", + "refId": "", + "step": 10, + "target": "" }, { - "expr": "sum(rate(tikv_engine_flow_bytes{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", db=\"kv\", type=~\"bytes_read|iter_bytes_read\"}[1m])) by (instance)", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_engine_flow_bytes\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",type=~\"bytes_read|iter_bytes_read\"}\n [$__rate_interval]\n)) by (instance) ", "format": "time_series", "hide": false, - "intervalFactor": 2, + "instant": false, + "interval": "", + "intervalFactor": 1, "legendFormat": "{{instance}}-read", - "refId": "B" + "metric": "", + "query": "sum(rate(\n tikv_engine_flow_bytes\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",type=~\"bytes_read|iter_bytes_read\"}\n [$__rate_interval]\n)) by (instance) ", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, "title": "MBps", "tooltip": { - "msResolution": false, + "msResolution": true, "shared": true, "sort": 0, "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -1099,14 +1371,16 @@ }, "yaxes": [ { - "format": "Bps", + "decimals": null, + "format": "bytes", "label": null, "logBase": 1, "max": null, - "min": "0", + "min": null, "show": true }, { + "decimals": null, "format": "short", "label": null, "logBase": 1, @@ -1117,46 
+1391,56 @@ ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } }, { "aliasColors": {}, "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", - "decimals": 1, - "description": "The QPS per command in each TiKV instance", + "description": "The number of leaders on each TiKV instance", "editable": true, "error": false, "fieldConfig": { - "defaults": {}, - "overrides": [] + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } + }, + "fill": 1, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" }, - "fill": 0, - "fillGradient": 0, - "grid": {}, "gridPos": { - "h": 8, + "h": 7, "w": 12, "x": 0, - "y": 25 + "y": 21 }, - "hiddenSeries": false, - "id": 1713, + "height": null, + "hideTimeOverride": false, + "id": 12, + "interval": null, + "isNew": true, "legend": { "alignAsTable": true, "avg": false, "current": true, - "hideEmpty": false, - "hideZero": false, + "hideEmpty": true, + "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, - "sort": "current", + "sort": "max", "sortDesc": true, "total": false, "values": true @@ -1164,44 +1448,55 @@ "lines": true, "linewidth": 1, "links": [], - "nullPointMode": "null", + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", "options": { - "alertThreshold": true + "alertThreshold": true, + "dataLinks": [] }, "percentage": false, - "pluginVersion": "7.5.7", "pointradius": 5, "points": false, "renderer": "flot", + "repeat": null, + "repeatDirection": null, "seriesOverrides": [], - "spaceLength": 10, + "span": null, "stack": false, "steppedLine": false, "targets": [ { - "expr": "sum(rate(tikv_grpc_msg_duration_seconds_count{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", 
type!=\"kv_gc\"}[1m])) by (instance,type)", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_grpc_msg_duration_seconds_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",type!=\"kv_gc\"}\n [$__rate_interval]\n)) by (instance, type) ", "format": "time_series", "hide": false, - "intervalFactor": 2, - "legendFormat": "{{instance}} - {{type}}", - "refId": "A", - "step": 10 + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{instance}}-{{type}}", + "metric": "", + "query": "sum(rate(\n tikv_grpc_msg_duration_seconds_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",type!=\"kv_gc\"}\n [$__rate_interval]\n)) by (instance, type) ", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, "title": "QPS", "tooltip": { - "msResolution": false, + "msResolution": true, "shared": true, "sort": 0, "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -1209,14 +1504,16 @@ }, "yaxes": [ { + "decimals": null, "format": "ops", "label": null, "logBase": 1, "max": null, - "min": "0", + "min": null, "show": true }, { + "decimals": null, "format": "short", "label": null, "logBase": 1, @@ -1227,34 +1524,44 @@ ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } }, { "aliasColors": {}, "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", - "decimals": 1, "description": "The total number of the gRPC message failures", "editable": true, "error": false, "fieldConfig": { - "defaults": {}, - "overrides": [] + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } + }, + "fill": 1, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", 
+ "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" }, - "fill": 0, - "fillGradient": 0, - "grid": {}, "gridPos": { - "h": 8, + "h": 7, "w": 12, "x": 12, - "y": 25 + "y": 21 }, - "hiddenSeries": false, - "id": 1712, + "height": null, + "hideTimeOverride": false, + "id": 13, + "interval": null, + "isNew": true, "legend": { "alignAsTable": true, "avg": false, @@ -1266,7 +1573,7 @@ "rightSide": true, "show": true, "sideWidth": null, - "sort": "current", + "sort": "max", "sortDesc": true, "total": false, "values": true @@ -1274,58 +1581,85 @@ "lines": true, "linewidth": 1, "links": [], - "nullPointMode": "null", + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", "options": { - "alertThreshold": true + "alertThreshold": true, + "dataLinks": [] }, "percentage": false, - "pluginVersion": "7.5.7", "pointradius": 5, "points": false, "renderer": "flot", + "repeat": null, + "repeatDirection": null, "seriesOverrides": [], - "spaceLength": 10, + "span": null, "stack": false, "steppedLine": false, "targets": [ { - "expr": "sum(rate(tikv_grpc_msg_fail_total{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", type!=\"kv_gc\"}[1m])) by (instance)", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_grpc_msg_fail_total\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",type!=\"kv_gc\"}\n [$__rate_interval]\n)) by (instance) ", "format": "time_series", "hide": false, - "intervalFactor": 2, + "instant": false, + "interval": "", + "intervalFactor": 1, "legendFormat": "{{instance}}-grpc-msg-fail", - "refId": "A", - "step": 10 + "metric": "", + "query": "sum(rate(\n tikv_grpc_msg_fail_total\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",type!=\"kv_gc\"}\n [$__rate_interval]\n)) by (instance) ", + "refId": "", + "step": 10, + "target": "" }, { - "expr": 
"sum(delta(tikv_pd_heartbeat_message_total{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", type=\"noop\"}[1m])) by (instance) < 1", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(delta(\n tikv_pd_heartbeat_message_total\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",type=\"noop\"}\n [$__rate_interval]\n)) by (instance) < 1", "format": "time_series", - "intervalFactor": 2, + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, "legendFormat": "{{instance}}-pd-heartbeat", - "refId": "B" + "metric": "", + "query": "sum(delta(\n tikv_pd_heartbeat_message_total\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",type=\"noop\"}\n [$__rate_interval]\n)) by (instance) < 1", + "refId": "", + "step": 10, + "target": "" }, { - "expr": "sum(rate(tikv_critical_error_total{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])) by (instance, type)", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_critical_error_total\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (instance, type) ", "format": "time_series", - "intervalFactor": 2, + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, "legendFormat": "{{instance}}-{{type}}", - "refId": "C" + "metric": "", + "query": "sum(rate(\n tikv_critical_error_total\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (instance, type) ", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, "title": "Errps", "tooltip": { - "msResolution": false, + "msResolution": true, "shared": true, "sort": 0, "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, 
"mode": "time", "name": null, "show": true, @@ -1333,14 +1667,16 @@ }, "yaxes": [ { + "decimals": null, "format": "ops", "label": null, "logBase": 1, "max": null, - "min": "0", + "min": null, "show": true }, { + "decimals": null, "format": "short", "label": null, "logBase": 1, @@ -1351,44 +1687,56 @@ ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } }, { "aliasColors": {}, "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", - "decimals": 1, - "description": " \tThe number of leaders on each TiKV instance", + "description": "The number of leaders on each TiKV instance", "editable": true, "error": false, "fieldConfig": { - "defaults": {}, - "overrides": [] + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } + }, + "fill": 1, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" }, - "fill": 0, - "fillGradient": 0, - "grid": {}, "gridPos": { - "h": 8, + "h": 7, "w": 12, "x": 0, - "y": 33 + "y": 28 }, - "hiddenSeries": false, - "id": 1715, + "height": null, + "hideTimeOverride": false, + "id": 14, + "interval": null, + "isNew": true, "legend": { "alignAsTable": true, "avg": false, "current": true, + "hideEmpty": true, + "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, - "sort": "current", + "sort": "max", "sortDesc": true, "total": false, "values": true @@ -1396,57 +1744,55 @@ "lines": true, "linewidth": 1, "links": [], - "nullPointMode": "null", + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", "options": { - "alertThreshold": true + "alertThreshold": true, + "dataLinks": [] }, "percentage": false, - "pluginVersion": "7.5.7", "pointradius": 5, "points": false, "renderer": "flot", - "seriesOverrides": [ - { - "alias": "total", - "lines": false - } 
- ], - "spaceLength": 10, + "repeat": null, + "repeatDirection": null, + "seriesOverrides": [], + "span": null, "stack": false, "steppedLine": false, "targets": [ { - "expr": "sum(tikv_raftstore_region_count{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", type=\"leader\"}) by (instance)", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum((\n tikv_raftstore_region_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",type=\"leader\"}\n \n)) by (instance) ", "format": "time_series", "hide": false, - "intervalFactor": 2, + "instant": false, + "interval": "", + "intervalFactor": 1, "legendFormat": "{{instance}}", - "refId": "A", - "step": 10 - }, - { - "expr": "delta(tikv_raftstore_region_count{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", type=\"leader\"}[30s]) < -10", - "format": "time_series", - "hide": true, - "intervalFactor": 2, - "legendFormat": "", - "refId": "B" + "metric": "", + "query": "sum((\n tikv_raftstore_region_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",type=\"leader\"}\n \n)) by (instance) ", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, "title": "Leader", "tooltip": { - "msResolution": false, + "msResolution": true, "shared": true, "sort": 0, - "value_type": "cumulative" + "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -1454,14 +1800,16 @@ }, "yaxes": [ { - "format": "short", + "decimals": null, + "format": "none", "label": null, "logBase": 1, "max": null, - "min": "0", + "min": null, "show": true }, { + "decimals": null, "format": "short", "label": null, "logBase": 1, @@ -1472,44 +1820,56 @@ ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } }, { 
"aliasColors": {}, "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", - "decimals": 1, "description": "The number of Regions and Buckets on each TiKV instance", "editable": true, "error": false, "fieldConfig": { - "defaults": {}, - "overrides": [] + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } + }, + "fill": 1, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" }, - "fill": 0, - "fillGradient": 0, - "grid": {}, "gridPos": { - "h": 8, + "h": 7, "w": 12, "x": 12, - "y": 33 + "y": 28 }, - "hiddenSeries": false, - "id": 1714, + "height": null, + "hideTimeOverride": false, + "id": 15, + "interval": null, + "isNew": true, "legend": { "alignAsTable": true, "avg": false, "current": true, + "hideEmpty": true, + "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, - "sort": "current", + "sort": "max", "sortDesc": true, "total": false, "values": true @@ -1517,51 +1877,70 @@ "lines": true, "linewidth": 1, "links": [], - "nullPointMode": "null", + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", "options": { - "alertThreshold": true + "alertThreshold": true, + "dataLinks": [] }, "percentage": false, - "pluginVersion": "7.5.7", "pointradius": 5, "points": false, "renderer": "flot", + "repeat": null, + "repeatDirection": null, "seriesOverrides": [], - "spaceLength": 10, + "span": null, "stack": false, "steppedLine": false, "targets": [ { - "expr": "sum(tikv_raftstore_region_count{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", type=\"region\"}) by (instance)", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum((\n tikv_raftstore_region_count\n 
{k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",type=\"region\"}\n \n)) by (instance) ", "format": "time_series", - "intervalFactor": 2, + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, "legendFormat": "{{instance}}", - "refId": "A", - "step": 10 + "metric": "", + "query": "sum((\n tikv_raftstore_region_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",type=\"region\"}\n \n)) by (instance) ", + "refId": "", + "step": 10, + "target": "" }, { - "expr": "sum(tikv_raftstore_region_count{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", type=\"buckets\"}) by (instance)", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum((\n tikv_raftstore_region_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",type=\"buckets\"}\n \n)) by (instance) ", "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{instance}} - buckets", - "refId": "B", - "step": 10 + "hide": true, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{instance}}-buckets", + "metric": "", + "query": "sum((\n tikv_raftstore_region_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",type=\"buckets\"}\n \n)) by (instance) ", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, "title": "Region", "tooltip": { - "msResolution": false, + "msResolution": true, "shared": true, "sort": 0, - "value_type": "cumulative" + "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -1569,62 +1948,76 @@ }, "yaxes": [ { - "format": "short", - "label": "", + "decimals": null, + "format": "none", + "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { + 
"decimals": null, "format": "short", "label": null, "logBase": 1, "max": null, "min": null, - "show": false + "show": true } ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } }, { "aliasColors": {}, "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", - "decimals": null, "description": "TiKV uptime since the last restart", "editable": true, "error": false, "fieldConfig": { - "defaults": {}, - "overrides": [] + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } }, "fill": 1, - "fillGradient": 0, - "grid": {}, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, "gridPos": { - "h": 8, - "w": 12, + "h": 7, + "w": 24, "x": 0, - "y": 41 + "y": 35 }, - "hiddenSeries": false, - "id": 4106, + "height": null, + "hideTimeOverride": false, + "id": 16, + "interval": null, + "isNew": true, "legend": { "alignAsTable": true, "avg": false, "current": true, - "max": false, + "hideEmpty": true, + "hideZero": true, + "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, - "sort": "current", + "sort": "max", "sortDesc": true, "total": false, "values": true @@ -1632,43 +2025,55 @@ "lines": true, "linewidth": 1, "links": [], + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, "nullPointMode": "null as zero", "options": { - "alertThreshold": true + "alertThreshold": true, + "dataLinks": [] }, "percentage": false, - "pluginVersion": "7.5.7", "pointradius": 5, "points": false, "renderer": "flot", + "repeat": null, + "repeatDirection": null, "seriesOverrides": [], - "spaceLength": 10, + "span": null, "stack": false, "steppedLine": false, "targets": [ { - "expr": "(time() - process_start_time_seconds{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", job=~\".*tikv\"})", + "datasource": "${DS_TEST-CLUSTER}", + 
"expr": "(time() - ((\n process_start_time_seconds\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",job=~\".*tikv\"}\n \n)) )", "format": "time_series", - "intervalFactor": 2, + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, "legendFormat": "{{instance}}", - "refId": "A", - "step": 10 + "metric": "", + "query": "(time() - ((\n process_start_time_seconds\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",job=~\".*tikv\"}\n \n)) )", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, "title": "Uptime", "tooltip": { - "msResolution": false, + "msResolution": true, "shared": true, "sort": 0, - "value_type": "cumulative" + "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -1676,14 +2081,16 @@ }, "yaxes": [ { - "format": "dtdurations", - "label": "", + "decimals": null, + "format": "s", + "label": null, "logBase": 1, "max": null, - "min": "0", + "min": null, "show": true }, { + "decimals": null, "format": "short", "label": null, "logBase": 1, @@ -1694,87 +2101,107 @@ ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } } ], "repeat": null, + "repeatDirection": null, + "span": null, + "targets": [], + "timeFrom": null, + "timeShift": null, "title": "Cluster", + "transformations": [], + "transparent": false, "type": "row" }, { + "cacheTimeout": null, "collapsed": true, "datasource": null, + "description": null, + "editable": true, + "error": false, + "fieldConfig": { + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } + }, "gridPos": { - "h": 1, + "h": 7, "w": 24, "x": 0, - "y": 2 + "y": 0 }, - "id": 2743, + "height": null, + "hideTimeOverride": false, + "id": 17, + "interval": null, + "links": [], + "maxDataPoints": 100, + "maxPerRow": 
null, + "minSpan": null, "panels": [ { - "alert": { - "conditions": [ - { - "evaluator": { - "params": [ - 0 - ], - "type": "gt" - }, - "operator": { - "type": "and" - }, - "query": { - "params": [ - "A", - "5m", - "now" - ] - }, - "reducer": { - "params": [], - "type": "avg" - }, - "type": "query" - } - ], - "executionErrorState": "alerting", - "frequency": "60s", - "handler": 1, - "name": "Critical error alert", - "noDataState": "no_data", - "notifications": [] - }, - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${DS_TEST-CLUSTER}", - "fieldConfig": { - "defaults": {}, - "overrides": [] - }, - "fill": 1, - "fillGradient": 0, - "gridPos": { + "aliasColors": {}, + "bars": false, + "cacheTimeout": null, + "datasource": "${DS_TEST-CLUSTER}", + "description": null, + "editable": true, + "error": false, + "fieldConfig": { + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [ + { + "colorMode": "critical", + "fill": true, + "line": true, + "op": "gt", + "value": 0.0, + "yaxis": "left" + } + ] + } + } + }, + "fill": 1, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, + "gridPos": { "h": 7, "w": 24, "x": 0, - "y": 3 + "y": 0 }, - "hiddenSeries": false, - "id": 2741, + "height": null, + "hideTimeOverride": false, + "id": 18, + "interval": null, + "isNew": true, "legend": { "alignAsTable": true, "avg": false, "current": true, + "hideEmpty": true, + "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, - "sort": "current", + "sideWidth": null, + "sort": "max", "sortDesc": true, "total": false, "values": true @@ -1782,28 +2209,39 @@ "lines": true, "linewidth": 1, "links": [], - "nullPointMode": "null", + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", "options": { - "alertThreshold": true + "alertThreshold": true, 
+ "dataLinks": [] }, "percentage": false, - "pluginVersion": "7.5.11", "pointradius": 5, "points": false, "renderer": "flot", + "repeat": null, + "repeatDirection": null, "seriesOverrides": [], - "spaceLength": 10, + "span": null, "stack": false, "steppedLine": false, "targets": [ { - "exemplar": true, - "expr": "sum(rate(tikv_critical_error_total{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])) by (instance, type)", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_critical_error_total\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (instance, type) ", "format": "time_series", + "hide": false, + "instant": false, "interval": "", - "intervalFactor": 2, + "intervalFactor": 1, "legendFormat": "{{instance}}-{{type}}", - "refId": "A" + "metric": "", + "query": "sum(rate(\n tikv_critical_error_total\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (instance, type) ", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [ @@ -1812,22 +2250,23 @@ "fill": true, "line": true, "op": "gt", - "value": 0, - "visible": true + "value": 0.0, + "yaxis": "left" } ], "timeFrom": null, - "timeRegions": [], "timeShift": null, "title": "Critical error", "tooltip": { + "msResolution": true, "shared": true, "sort": 0, "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -1835,14 +2274,16 @@ }, "yaxes": [ { - "format": "short", + "decimals": null, + "format": "none", "label": null, "logBase": 1, "max": null, - "min": "0", + "min": null, "show": true }, { + "decimals": null, "format": "short", "label": null, "logBase": 1, @@ -1853,34 +2294,44 @@ ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } }, { "aliasColors": {}, "bars": false, - "dashLength": 
10, - "dashes": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", - "decimals": 1, - "description": "Indicates occurrences of events that make the TiKV instance unavailable temporarily, such as Write Stall, Channel Full, Scheduler Busy, and Coprocessor Full", + "description": "\nIndicates occurrences of events that make the TiKV instance unavailable\ntemporarily, such as Write Stall, Channel Full, Scheduler Busy, and Coprocessor\nFull", "editable": true, "error": false, "fieldConfig": { - "defaults": {}, - "overrides": [] + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } + }, + "fill": 1, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" }, - "fill": 0, - "fillGradient": 0, - "grid": {}, "gridPos": { "h": 7, "w": 12, "x": 0, - "y": 10 + "y": 7 }, - "hiddenSeries": false, - "id": 1584, + "height": null, + "hideTimeOverride": false, + "id": 19, + "interval": null, + "isNew": true, "legend": { "alignAsTable": true, "avg": false, @@ -1892,7 +2343,7 @@ "rightSide": true, "show": true, "sideWidth": null, - "sort": "current", + "sort": "max", "sortDesc": true, "total": false, "values": true @@ -1900,77 +2351,115 @@ "lines": true, "linewidth": 1, "links": [], + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, "nullPointMode": "null as zero", "options": { - "alertThreshold": true + "alertThreshold": true, + "dataLinks": [] }, "percentage": false, - "pluginVersion": "7.5.11", "pointradius": 5, "points": false, "renderer": "flot", + "repeat": null, + "repeatDirection": null, "seriesOverrides": [], - "spaceLength": 10, + "span": null, "stack": false, "steppedLine": false, "targets": [ { - "expr": "sum(rate(tikv_scheduler_too_busy_total{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])) by (instance)", + "datasource": "${DS_TEST-CLUSTER}", + "expr": 
"sum(rate(\n tikv_scheduler_too_busy_total\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (instance) ", "format": "time_series", - "intervalFactor": 2, + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, "legendFormat": "scheduler-{{instance}}", "metric": "", - "refId": "A", - "step": 4 + "query": "sum(rate(\n tikv_scheduler_too_busy_total\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (instance) ", + "refId": "", + "step": 10, + "target": "" }, { - "expr": "sum(rate(tikv_channel_full_total{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])) by (instance, type)", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_channel_full_total\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (instance, type) ", "format": "time_series", - "intervalFactor": 2, + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, "legendFormat": "channelfull-{{instance}}-{{type}}", "metric": "", - "refId": "B", - "step": 4 + "query": "sum(rate(\n tikv_channel_full_total\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (instance, type) ", + "refId": "", + "step": 10, + "target": "" }, { - "expr": "sum(rate(tikv_coprocessor_request_error{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", type='full'}[1m])) by (instance)", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_coprocessor_request_error\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",type=\"full\"}\n [$__rate_interval]\n)) by (instance) ", "format": "time_series", - "intervalFactor": 2, + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, 
"legendFormat": "coprocessor-{{instance}}", "metric": "", - "refId": "C", - "step": 4 + "query": "sum(rate(\n tikv_coprocessor_request_error\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",type=\"full\"}\n [$__rate_interval]\n)) by (instance) ", + "refId": "", + "step": 10, + "target": "" }, { - "expr": "avg(tikv_engine_write_stall{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", type=\"write_stall_percentile99\", db=~\"$db\"}) by (instance, db)", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "avg((\n tikv_engine_write_stall\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",type=\"write_stall_percentile99\",db=~\"$db\"}\n \n)) by (instance, db) ", "format": "time_series", - "intervalFactor": 2, + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, "legendFormat": "stall-{{instance}}-{{db}}", - "refId": "D" + "metric": "", + "query": "avg((\n tikv_engine_write_stall\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",type=\"write_stall_percentile99\",db=~\"$db\"}\n \n)) by (instance, db) ", + "refId": "", + "step": 10, + "target": "" }, { - "exemplar": true, - "expr": "sum(rate(tikv_raftstore_store_write_msg_block_wait_duration_seconds_count{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])) by (instance)", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_raftstore_store_write_msg_block_wait_duration_seconds_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (instance) ", + "format": "time_series", "hide": false, + "instant": false, "interval": "", + "intervalFactor": 1, "legendFormat": "store-write-channelfull-{{instance}}", - "refId": "E" + "metric": "", + "query": "sum(rate(\n tikv_raftstore_store_write_msg_block_wait_duration_seconds_count\n 
{k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (instance) ", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, "title": "Server is busy", "tooltip": { - "msResolution": false, + "msResolution": true, "shared": true, - "sort": 2, + "sort": 0, "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -1978,86 +2467,64 @@ }, "yaxes": [ { + "decimals": null, "format": "none", "label": null, "logBase": 1, "max": null, - "min": "0", + "min": null, "show": true }, { + "decimals": null, "format": "short", "label": null, "logBase": 1, "max": null, "min": null, - "show": false + "show": true } ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } }, { - "alert": { - "conditions": [ - { - "evaluator": { - "params": [ - 0 - ], - "type": "gt" - }, - "operator": { - "type": "and" - }, - "query": { - "params": [ - "A", - "10s", - "now" - ] - }, - "reducer": { - "params": [], - "type": "max" - }, - "type": "query" - } - ], - "executionErrorState": "alerting", - "frequency": "10s", - "handler": 1, - "message": "TiKV server report failures", - "name": "server report failures alert", - "noDataState": "ok", - "notifications": [] - }, "aliasColors": {}, "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", - "decimals": 1, "description": "The total number of reporting failure messages", "editable": true, "error": false, "fieldConfig": { - "defaults": {}, - "overrides": [] + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } + }, + "fill": 1, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" }, - "fill": 0, - 
"fillGradient": 0, - "grid": {}, "gridPos": { "h": 7, "w": 12, "x": 12, - "y": 10 + "y": 7 }, - "hiddenSeries": false, - "id": 18, + "height": null, + "hideTimeOverride": false, + "id": 20, + "interval": null, + "isNew": true, "legend": { "alignAsTable": true, "avg": false, @@ -2077,53 +2544,55 @@ "lines": true, "linewidth": 1, "links": [], + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, "nullPointMode": "null as zero", "options": { - "alertThreshold": true + "alertThreshold": true, + "dataLinks": [] }, "percentage": false, - "pluginVersion": "7.5.11", "pointradius": 5, "points": false, "renderer": "flot", + "repeat": null, + "repeatDirection": null, "seriesOverrides": [], - "spaceLength": 10, + "span": null, "stack": false, "steppedLine": false, "targets": [ { - "expr": "sum(rate(tikv_server_report_failure_msg_total{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])) by (type,instance,store_id)", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_server_report_failure_msg_total\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (type, instance, store_id) ", "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{instance}} - {{type}} - to - {{store_id}}", - "metric": "tikv_server_raft_store_msg_total", - "refId": "A", - "step": 10 - } - ], - "thresholds": [ - { - "colorMode": "critical", - "fill": true, - "line": true, - "op": "gt", - "value": 0, - "visible": true + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{instance}}-{{type}}-to-{{store_id}}", + "metric": "", + "query": "sum(rate(\n tikv_server_report_failure_msg_total\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (type, instance, store_id) ", + "refId": "", + "step": 10, + "target": "" } ], + "thresholds": [], "timeFrom": null, - 
"timeRegions": [], "timeShift": null, "title": "Server report failures", "tooltip": { - "msResolution": false, + "msResolution": true, "shared": true, "sort": 0, - "value_type": "cumulative" + "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -2131,14 +2600,16 @@ }, "yaxes": [ { - "format": "short", + "decimals": null, + "format": "none", "label": null, "logBase": 1, "max": null, - "min": "0", + "min": null, "show": true }, { + "decimals": null, "format": "short", "label": null, "logBase": 1, @@ -2149,34 +2620,44 @@ ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } }, { "aliasColors": {}, "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", - "decimals": 1, "description": "The number of different raftstore errors on each TiKV instance", "editable": true, "error": false, "fieldConfig": { - "defaults": {}, - "overrides": [] + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } + }, + "fill": 1, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" }, - "fill": 0, - "fillGradient": 0, - "grid": {}, "gridPos": { "h": 7, "w": 12, "x": 0, - "y": 17 + "y": 14 }, - "hiddenSeries": false, - "id": 1718, + "height": null, + "hideTimeOverride": false, + "id": 21, + "interval": null, + "isNew": true, "legend": { "alignAsTable": true, "avg": false, @@ -2196,44 +2677,55 @@ "lines": true, "linewidth": 1, "links": [], + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, "nullPointMode": "null as zero", "options": { - "alertThreshold": true + "alertThreshold": true, + "dataLinks": [] }, "percentage": false, - "pluginVersion": "7.5.11", "pointradius": 5, "points": false, "renderer": "flot", + "repeat": null, + "repeatDirection": 
null, "seriesOverrides": [], - "spaceLength": 10, + "span": null, "stack": false, "steppedLine": false, "targets": [ { - "expr": "sum(rate(tikv_storage_engine_async_request_total{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", status!~\"success|all\"}[1m])) by (instance, status)", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_storage_engine_async_request_total\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",status!~\"success|all\"}\n [$__rate_interval]\n)) by (instance, status) ", "format": "time_series", - "intervalFactor": 2, + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, "legendFormat": "{{instance}}-{{status}}", "metric": "", - "refId": "A", - "step": 4 + "query": "sum(rate(\n tikv_storage_engine_async_request_total\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",status!~\"success|all\"}\n [$__rate_interval]\n)) by (instance, status) ", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, "title": "Raftstore error", "tooltip": { - "msResolution": false, + "msResolution": true, "shared": true, - "sort": 2, + "sort": 0, "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -2241,52 +2733,64 @@ }, "yaxes": [ { + "decimals": null, "format": "none", "label": null, "logBase": 1, "max": null, - "min": "0", + "min": null, "show": true }, { + "decimals": null, "format": "short", "label": null, "logBase": 1, "max": null, "min": null, - "show": false + "show": true } ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } }, { "aliasColors": {}, "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", - "decimals": 1, "description": "The number of 
scheduler errors per type on each TiKV instance", "editable": true, "error": false, "fieldConfig": { - "defaults": {}, - "overrides": [] + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } + }, + "fill": 1, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" }, - "fill": 0, - "fillGradient": 0, - "grid": {}, "gridPos": { "h": 7, "w": 12, "x": 12, - "y": 17 + "y": 14 }, - "hiddenSeries": false, - "id": 1719, + "height": null, + "hideTimeOverride": false, + "id": 22, + "interval": null, + "isNew": true, "legend": { "alignAsTable": true, "avg": false, @@ -2306,44 +2810,55 @@ "lines": true, "linewidth": 1, "links": [], + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, "nullPointMode": "null as zero", "options": { - "alertThreshold": true + "alertThreshold": true, + "dataLinks": [] }, "percentage": false, - "pluginVersion": "7.5.11", "pointradius": 5, "points": false, "renderer": "flot", + "repeat": null, + "repeatDirection": null, "seriesOverrides": [], - "spaceLength": 10, + "span": null, "stack": false, "steppedLine": false, "targets": [ { - "expr": "sum(rate(tikv_scheduler_stage_total{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", stage=~\"snapshot_err|prepare_write_err\"}[1m])) by (instance, stage)", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_scheduler_stage_total\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",stage=~\"snapshot_err|prepare_write_err\"}\n [$__rate_interval]\n)) by (instance, stage) ", "format": "time_series", - "intervalFactor": 2, + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, "legendFormat": "{{instance}}-{{stage}}", "metric": "", - "refId": "A", - "step": 4 + "query": "sum(rate(\n tikv_scheduler_stage_total\n 
{k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",stage=~\"snapshot_err|prepare_write_err\"}\n [$__rate_interval]\n)) by (instance, stage) ", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, "title": "Scheduler error", "tooltip": { - "msResolution": false, + "msResolution": true, "shared": true, - "sort": 2, + "sort": 0, "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -2351,162 +2866,64 @@ }, "yaxes": [ { + "decimals": null, "format": "none", "label": null, "logBase": 1, "max": null, - "min": "0", + "min": null, "show": true }, { + "decimals": null, "format": "short", "label": null, "logBase": 1, "max": null, "min": null, - "show": false + "show": true } ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } }, { "aliasColors": {}, "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", - "decimals": 1, "description": "The number of different coprocessor errors on each TiKV instance", "editable": true, "error": false, "fieldConfig": { - "defaults": {}, - "overrides": [] - }, - "fill": 0, - "fillGradient": 0, - "grid": {}, - "gridPos": { - "h": 7, - "w": 12, - "x": 0, - "y": 24 - }, - "hiddenSeries": false, - "id": 1720, - "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "hideEmpty": true, - "hideZero": true, - "max": true, - "min": false, - "rightSide": true, - "show": true, - "sideWidth": null, - "sort": "max", - "sortDesc": true, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "links": [], - "nullPointMode": "null as zero", - "options": { - "alertThreshold": true - }, - "percentage": false, - "pluginVersion": "7.5.11", - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - 
"spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "sum(rate(tikv_coprocessor_request_error{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])) by (instance, reason)", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{instance}}-{{reason}}", - "metric": "", - "refId": "A", - "step": 4 + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "Coprocessor error", - "tooltip": { - "msResolution": false, - "shared": true, - "sort": 2, - "value_type": "individual" }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "none", - "label": null, - "logBase": 1, - "max": null, - "min": "0", - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": false - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${DS_TEST-CLUSTER}", - "decimals": 1, - "description": "The number of gRPC message errors per type on each TiKV instance", - "editable": true, - "error": false, - "fieldConfig": { - "defaults": {}, - "overrides": [] + "fill": 1, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" }, - "fill": 0, - "fillGradient": 0, - "grid": {}, "gridPos": { "h": 7, "w": 12, - "x": 12, - "y": 24 + "x": 0, + "y": 21 }, - "hiddenSeries": false, - "id": 1721, + "height": null, + "hideTimeOverride": false, + "id": 23, + "interval": null, + "isNew": true, "legend": { "alignAsTable": true, "avg": false, @@ -2526,44 +2943,55 @@ "lines": true, "linewidth": 1, "links": [], + "maxDataPoints": 
null, + "maxPerRow": null, + "minSpan": null, "nullPointMode": "null as zero", "options": { - "alertThreshold": true + "alertThreshold": true, + "dataLinks": [] }, "percentage": false, - "pluginVersion": "7.5.11", "pointradius": 5, "points": false, "renderer": "flot", + "repeat": null, + "repeatDirection": null, "seriesOverrides": [], - "spaceLength": 10, + "span": null, "stack": false, "steppedLine": false, "targets": [ { - "expr": "sum(rate(tikv_grpc_msg_fail_total{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])) by (instance, type)", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_coprocessor_request_error\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (instance, reason) ", "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{instance}}-{{type}}", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{instance}}-{{reason}}", "metric": "", - "refId": "A", - "step": 4 + "query": "sum(rate(\n tikv_coprocessor_request_error\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (instance, reason) ", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, - "title": "gRPC message error", + "title": "Coprocessor error", "tooltip": { - "msResolution": false, + "msResolution": true, "shared": true, - "sort": 2, + "sort": 0, "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -2571,52 +2999,64 @@ }, "yaxes": [ { + "decimals": null, "format": "none", "label": null, "logBase": 1, "max": null, - "min": "0", + "min": null, "show": true }, { + "decimals": null, "format": "short", "label": null, "logBase": 1, "max": null, "min": null, - 
"show": false + "show": true } ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } }, { "aliasColors": {}, "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", - "decimals": 1, - "description": "The count of dropped leaders per TiKV instance", + "description": "The number of gRPC message errors per type on each TiKV instance", "editable": true, "error": false, "fieldConfig": { - "defaults": {}, - "overrides": [] + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } + }, + "fill": 1, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" }, - "fill": 0, - "fillGradient": 0, - "grid": {}, "gridPos": { "h": 7, "w": 12, - "x": 0, - "y": 31 + "x": 12, + "y": 21 }, - "hiddenSeries": false, - "id": 1722, + "height": null, + "hideTimeOverride": false, + "id": 24, + "interval": null, + "isNew": true, "legend": { "alignAsTable": true, "avg": false, @@ -2628,7 +3068,7 @@ "rightSide": true, "show": true, "sideWidth": null, - "sort": "current", + "sort": "max", "sortDesc": true, "total": false, "values": true @@ -2636,48 +3076,55 @@ "lines": true, "linewidth": 1, "links": [], - "nullPointMode": "null", + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", "options": { - "alertThreshold": true + "alertThreshold": true, + "dataLinks": [] }, "percentage": false, - "pluginVersion": "7.5.11", "pointradius": 5, "points": false, "renderer": "flot", - "seriesOverrides": [ - { - "alias": "total", - "lines": false - } - ], - "spaceLength": 10, + "repeat": null, + "repeatDirection": null, + "seriesOverrides": [], + "span": null, "stack": false, "steppedLine": false, "targets": [ { - "expr": "sum(delta(tikv_raftstore_region_count{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", 
type=\"leader\"}[1m])) by (instance)", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_grpc_msg_fail_total\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (instance, type) ", "format": "time_series", "hide": false, - "intervalFactor": 2, - "legendFormat": "{{instance}}", - "refId": "B" + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{instance}}-{{type}}", + "metric": "", + "query": "sum(rate(\n tikv_grpc_msg_fail_total\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (instance, type) ", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, - "title": "Leader drop", + "title": "gRPC message error", "tooltip": { - "msResolution": false, + "msResolution": true, "shared": true, "sort": 0, - "value_type": "cumulative" + "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -2685,14 +3132,16 @@ }, "yaxes": [ { - "format": "short", + "decimals": null, + "format": "none", "label": null, "logBase": 1, "max": null, - "min": "0", + "min": null, "show": true }, { + "decimals": null, "format": "short", "label": null, "logBase": 1, @@ -2703,34 +3152,44 @@ ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } }, { "aliasColors": {}, "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", - "decimals": 1, - "description": "The count of missing leaders per TiKV instance", + "description": "The count of dropped leaders per TiKV instance", "editable": true, "error": false, "fieldConfig": { - "defaults": {}, - "overrides": [] + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } + }, + "fill": 1, + "fillGradient": 1, + 
"grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" }, - "fill": 0, - "fillGradient": 0, - "grid": {}, "gridPos": { "h": 7, "w": 12, - "x": 12, - "y": 31 + "x": 0, + "y": 28 }, - "hiddenSeries": false, - "id": 1723, + "height": null, + "hideTimeOverride": false, + "id": 25, + "interval": null, + "isNew": true, "legend": { "alignAsTable": true, "avg": false, @@ -2742,7 +3201,7 @@ "rightSide": true, "show": true, "sideWidth": null, - "sort": "current", + "sort": "max", "sortDesc": true, "total": false, "values": true @@ -2750,48 +3209,55 @@ "lines": true, "linewidth": 1, "links": [], - "nullPointMode": "null", + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", "options": { - "alertThreshold": true + "alertThreshold": true, + "dataLinks": [] }, "percentage": false, - "pluginVersion": "7.5.11", "pointradius": 5, "points": false, "renderer": "flot", - "seriesOverrides": [ - { - "alias": "total", - "lines": false - } - ], - "spaceLength": 10, + "repeat": null, + "repeatDirection": null, + "seriesOverrides": [], + "span": null, "stack": false, "steppedLine": false, "targets": [ { - "expr": "sum(tikv_raftstore_leader_missing{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}) by (instance)", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(delta(\n tikv_raftstore_region_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",type=\"leader\"}\n [$__rate_interval]\n)) by (instance) ", "format": "time_series", "hide": false, - "intervalFactor": 2, + "instant": false, + "interval": "", + "intervalFactor": 1, "legendFormat": "{{instance}}", - "refId": "B" + "metric": "", + "query": "sum(delta(\n tikv_raftstore_region_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",type=\"leader\"}\n [$__rate_interval]\n)) by 
(instance) ", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, - "title": "Leader missing", + "title": "Leader drop", "tooltip": { - "msResolution": false, + "msResolution": true, "shared": true, "sort": 0, - "value_type": "cumulative" + "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -2799,14 +3265,16 @@ }, "yaxes": [ { - "format": "short", + "decimals": null, + "format": "none", "label": null, "logBase": 1, "max": null, - "min": "0", + "min": null, "show": true }, { + "decimals": null, "format": "short", "label": null, "logBase": 1, @@ -2817,88 +3285,112 @@ ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } }, { "aliasColors": {}, "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", - "description": "RocksDB damaged SST files", + "description": "The count of missing leaders per TiKV instance", + "editable": true, + "error": false, "fieldConfig": { - "defaults": {}, - "overrides": [] + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } }, "fill": 1, - "fillGradient": 0, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, "gridPos": { - "h": 8, + "h": 7, "w": 12, - "x": 0, - "y": 38 + "x": 12, + "y": 28 }, - "hiddenSeries": false, - "id": 23763572510, + "height": null, + "hideTimeOverride": false, + "id": 26, + "interval": null, + "isNew": true, "legend": { "alignAsTable": true, "avg": false, "current": true, + "hideEmpty": true, + "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, + "sideWidth": null, + "sort": "max", + "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, - 
"nullPointMode": "null", + "links": [], + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", "options": { - "alertThreshold": true + "alertThreshold": true, + "dataLinks": [] }, "percentage": false, - "pluginVersion": "7.5.11", - "pointradius": 2, + "pointradius": 5, "points": false, "renderer": "flot", + "repeat": null, + "repeatDirection": null, "seriesOverrides": [], - "spaceLength": 10, + "span": null, "stack": false, "steppedLine": false, "targets": [ { - "exemplar": true, - "expr": "tikv_rocksdb_damaged_files{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\"}", - "interval": "", - "intervalFactor": 2, - "legendFormat": "{{instance}}-existed", - "queryType": "randomWalk", - "refId": "A" - }, - { - "exemplar": true, - "expr": "tikv_rocksdb_damaged_files_deleted{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\"}", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum((\n tikv_raftstore_leader_missing\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n \n)) by (instance) ", + "format": "time_series", "hide": false, + "instant": false, "interval": "", - "legendFormat": "{{instance}}-deleted", - "refId": "B" + "intervalFactor": 1, + "legendFormat": "{{instance}}", + "metric": "", + "query": "sum((\n tikv_raftstore_leader_missing\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n \n)) by (instance) ", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, - "title": "Damaged files", + "title": "Leader missing", "tooltip": { + "msResolution": true, "shared": true, "sort": 0, "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -2906,7 +3398,8 @@ }, "yaxes": [ { - "format": "short", + "decimals": null, + "format": "none", "label": null, 
"logBase": 1, "max": null, @@ -2914,6 +3407,7 @@ "show": true }, { + "decimals": null, "format": "short", "label": null, "logBase": 1, @@ -2924,80 +3418,127 @@ ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } }, { "aliasColors": {}, "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", - "description": "the count of Log Replication Reject caused by follower memory insufficient", + "description": "RocksDB damaged SST files", + "editable": true, + "error": false, "fieldConfig": { - "defaults": {}, - "overrides": [] + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } }, "fill": 1, - "fillGradient": 0, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, "gridPos": { - "h": 8, + "h": 7, "w": 12, - "x": 12, - "y": 38 + "x": 0, + "y": 35 }, - "hiddenSeries": false, - "id": 23763572588, + "height": null, + "hideTimeOverride": false, + "id": 27, + "interval": null, + "isNew": true, "legend": { "alignAsTable": true, "avg": false, "current": true, + "hideEmpty": true, + "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, + "sideWidth": null, + "sort": "max", + "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, - "nullPointMode": "null", + "links": [], + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", "options": { - "alertThreshold": true + "alertThreshold": true, + "dataLinks": [] }, "percentage": false, - "pluginVersion": "7.5.11", - "pointradius": 2, + "pointradius": 5, "points": false, "renderer": "flot", + "repeat": null, + "repeatDirection": null, "seriesOverrides": [], - "spaceLength": 10, + "span": null, "stack": false, "steppedLine": false, "targets": [ { - "exemplar": true, - "expr": 
"sum(rate(tikv_server_raft_append_rejects{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", type!=\"kv_gc\"}[1m])) by (instance)", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "((\n tikv_rocksdb_damaged_files\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n \n)) ", + "format": "time_series", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{instance}}-existed", + "metric": "", + "query": "((\n tikv_rocksdb_damaged_files\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n \n)) ", + "refId": "", + "step": 10, + "target": "" + }, + { + "datasource": "${DS_TEST-CLUSTER}", + "expr": "((\n tikv_rocksdb_damaged_files_deleted\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n \n)) ", + "format": "time_series", + "hide": false, + "instant": false, "interval": "", - "intervalFactor": 2, - "legendFormat": "{{instance}}-memory", - "queryType": "randomWalk", - "refId": "A" + "intervalFactor": 1, + "legendFormat": "{{instance}}-deleted", + "metric": "", + "query": "((\n tikv_rocksdb_damaged_files_deleted\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n \n)) ", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, - "title": "Log Replication Rejected", + "title": "Damaged files", "tooltip": { + "msResolution": true, "shared": true, "sort": 0, "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -3005,7 +3546,8 @@ }, "yaxes": [ { - "format": "short", + "decimals": null, + "format": "none", "label": null, "logBase": 1, "max": null, @@ -3013,6 +3555,7 @@ "show": true }, { + "decimals": null, "format": "short", "label": null, "logBase": 1, @@ -3023,55 
+3566,56 @@ ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } - } - ], - "repeat": null, - "title": "Errors", - "type": "row" - }, - { - "collapsed": true, - "datasource": null, - "gridPos": { - "h": 1, - "w": 24, - "x": 0, - "y": 3 - }, - "id": 2744, - "panels": [ + }, { "aliasColors": {}, "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", - "decimals": 1, - "description": "The size of each column family", + "description": "The count of Log Replication Reject caused by follower memory insufficient", "editable": true, "error": false, - "fill": 3, - "grid": {}, + "fieldConfig": { + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } + }, + "fill": 1, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, "gridPos": { - "h": 8, + "h": 7, "w": 12, - "x": 0, - "y": 3 + "x": 12, + "y": 35 }, - "id": 33, + "height": null, + "hideTimeOverride": false, + "id": 28, + "interval": null, + "isNew": true, "legend": { "alignAsTable": true, "avg": false, "current": true, - "hideZero": false, + "hideEmpty": true, + "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, - "sort": "current", + "sort": "max", "sortDesc": true, "total": false, "values": true @@ -3079,39 +3623,55 @@ "lines": true, "linewidth": 1, "links": [], - "nullPointMode": "null", + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", + "options": { + "alertThreshold": true, + "dataLinks": [] + }, "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", + "repeat": null, + "repeatDirection": null, "seriesOverrides": [], - "spaceLength": 10, - "stack": true, + "span": null, + "stack": false, "steppedLine": false, "targets": [ { - "expr": 
"sum(tikv_engine_size_bytes{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}) by (type)", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_server_raft_append_rejects\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (instance) ", "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{type}}", - "refId": "A", - "step": 10 + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{instance}}", + "metric": "", + "query": "sum(rate(\n tikv_server_raft_append_rejects\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (instance) ", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, - "title": "CF size", + "title": "Log Replication Rejected", "tooltip": { - "msResolution": false, + "msResolution": true, "shared": true, - "sort": 2, + "sort": 0, "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -3119,14 +3679,16 @@ }, "yaxes": [ { - "format": "decbytes", + "decimals": null, + "format": "none", "label": null, "logBase": 1, "max": null, - "min": "0", + "min": null, "show": true }, { + "decimals": null, "format": "short", "label": null, "logBase": 1, @@ -3137,71 +3699,86 @@ ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } - }, + } + ], + "repeat": null, + "repeatDirection": null, + "span": null, + "targets": [], + "timeFrom": null, + "timeShift": null, + "title": "Errors", + "transformations": [], + "transparent": false, + "type": "row" + }, + { + "cacheTimeout": null, + "collapsed": true, + "datasource": null, + "description": null, + "editable": true, + "error": false, + "fieldConfig": { + "defaults": { + "thresholds": { + 
"mode": "absolute", + "steps": [] + } + } + }, + "gridPos": { + "h": 7, + "w": 24, + "x": 0, + "y": 0 + }, + "height": null, + "hideTimeOverride": false, + "id": 29, + "interval": null, + "links": [], + "maxDataPoints": 100, + "maxPerRow": null, + "minSpan": null, + "panels": [ { - "alert": { - "conditions": [ - { - "evaluator": { - "params": [ - 0 - ], - "type": "gt" - }, - "operator": { - "type": "and" - }, - "query": { - "datasourceId": 1, - "model": { - "expr": "sum(rate(tikv_channel_full_total{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])) by (instance, type)", - "intervalFactor": 2, - "legendFormat": "{{instance}} - {{type}}", - "metric": "", - "refId": "A", - "step": 10 - }, - "params": [ - "A", - "10s", - "now" - ] - }, - "reducer": { - "params": [], - "type": "avg" - }, - "type": "query" - } - ], - "executionErrorState": "alerting", - "frequency": "10s", - "handler": 1, - "message": "TiKV channel full", - "name": "TiKV channel full alert", - "noDataState": "ok", - "notifications": [] - }, - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${DS_TEST-CLUSTER}", - "decimals": 1, - "description": "The total number of channel full errors on each TiKV instance", + "aliasColors": {}, + "bars": false, + "cacheTimeout": null, + "datasource": "${DS_TEST-CLUSTER}", + "description": "The size of each column family", "editable": true, "error": false, - "fill": 3, - "grid": {}, + "fieldConfig": { + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } + }, + "fill": 1, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, "gridPos": { - "h": 8, + "h": 7, "w": 12, - "x": 12, - "y": 3 + "x": 0, + "y": 0 }, - "id": 22, + "height": null, + "hideTimeOverride": false, + "id": 30, + "interval": null, + "isNew": true, "legend": { "alignAsTable": 
true, "avg": false, @@ -3213,7 +3790,7 @@ "rightSide": true, "show": true, "sideWidth": null, - "sort": "current", + "sort": "max", "sortDesc": true, "total": false, "values": true @@ -3221,48 +3798,55 @@ "lines": true, "linewidth": 1, "links": [], - "nullPointMode": "null", + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", + "options": { + "alertThreshold": true, + "dataLinks": [] + }, "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", + "repeat": null, + "repeatDirection": null, "seriesOverrides": [], - "spaceLength": 10, + "span": null, "stack": false, "steppedLine": false, "targets": [ { - "expr": "sum(rate(tikv_channel_full_total{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])) by (instance, type)", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum((\n tikv_engine_size_bytes\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n \n)) by (type) ", "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{instance}} - {{type}}", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{type}}", "metric": "", - "refId": "A", - "step": 10 - } - ], - "thresholds": [ - { - "colorMode": "critical", - "fill": true, - "line": true, - "op": "gt", - "value": 0 + "query": "sum((\n tikv_engine_size_bytes\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n \n)) by (type) ", + "refId": "", + "step": 10, + "target": "" } ], + "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, - "title": "Channel full", + "title": "CF size", "tooltip": { - "msResolution": false, + "msResolution": true, "shared": true, "sort": 0, - "value_type": "cumulative" + "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, 
@@ -3270,14 +3854,16 @@ }, "yaxes": [ { - "format": "short", + "decimals": null, + "format": "bytes", "label": null, "logBase": 1, "max": null, - "min": "0", + "min": null, "show": true }, { + "decimals": null, "format": "short", "label": null, "logBase": 1, @@ -3288,39 +3874,56 @@ ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } }, { "aliasColors": {}, "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", - "decimals": 1, - "description": "The number of leaders being written on each TiKV instance", + "description": "The total number of channel full errors on each TiKV instance", "editable": true, "error": false, - "fill": 0, - "grid": {}, + "fieldConfig": { + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } + }, + "fill": 1, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, "gridPos": { - "h": 8, + "h": 7, "w": 12, - "x": 0, - "y": 11 + "x": 12, + "y": 0 }, - "id": 75, + "height": null, + "hideTimeOverride": false, + "id": 31, + "interval": null, + "isNew": true, "legend": { "alignAsTable": true, "avg": false, "current": true, "hideEmpty": true, + "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, - "sort": "current", + "sort": "max", "sortDesc": true, "total": false, "values": true @@ -3328,40 +3931,55 @@ "lines": true, "linewidth": 1, "links": [], + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, "nullPointMode": "null as zero", + "options": { + "alertThreshold": true, + "dataLinks": [] + }, "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", + "repeat": null, + "repeatDirection": null, "seriesOverrides": [], - "spaceLength": 10, + "span": null, "stack": false, "steppedLine": false, "targets": [ { - "expr": 
"sum(rate(tikv_region_written_keys_count{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])) by (instance)", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_channel_full_total\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (instance, type) ", "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{instance}}", - "metric": "tikv_region_written_keys_bucket", - "refId": "A", - "step": 10 + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{instance}}-{{type}}", + "metric": "", + "query": "sum(rate(\n tikv_channel_full_total\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (instance, type) ", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, - "title": "Active written leaders", + "title": "Channel full", "tooltip": { - "msResolution": false, + "msResolution": true, "shared": true, - "sort": 2, + "sort": 0, "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -3369,90 +3987,76 @@ }, "yaxes": [ { - "format": "short", + "decimals": null, + "format": "none", "label": null, "logBase": 1, "max": null, - "min": "0", + "min": null, "show": true }, { + "decimals": null, "format": "short", "label": null, "logBase": 1, "max": null, "min": null, - "show": false + "show": true } ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } }, { - "alert": { - "conditions": [ - { - "evaluator": { - "params": [ - 1073741824 - ], - "type": "gt" - }, - "operator": { - "type": "and" - }, - "query": { - "params": [ - "B", - "1m", - "now" - ] - }, - "reducer": { - "params": [], - "type": "avg" - }, - "type": "query" - } - ], - 
"executionErrorState": "alerting", - "frequency": "60s", - "handler": 1, - "name": "approximate region size alert", - "noDataState": "no_data", - "notifications": [] - }, "aliasColors": {}, "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", - "decimals": 1, - "description": "The approximate Region size", + "description": "The number of leaders being written on each TiKV instance", "editable": true, "error": false, - "fill": 0, - "grid": {}, + "fieldConfig": { + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } + }, + "fill": 1, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 11 + "h": 7, + "w": 24, + "x": 0, + "y": 7 }, - "id": 1481, + "height": null, + "hideTimeOverride": false, + "id": 32, + "interval": null, + "isNew": true, "legend": { "alignAsTable": true, "avg": false, "current": true, "hideEmpty": true, + "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, - "sort": "current", + "sort": "max", "sortDesc": true, "total": false, "values": true @@ -3460,58 +4064,55 @@ "lines": true, "linewidth": 1, "links": [], + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, "nullPointMode": "null as zero", + "options": { + "alertThreshold": true, + "dataLinks": [] + }, "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", + "repeat": null, + "repeatDirection": null, "seriesOverrides": [], - "spaceLength": 10, + "span": null, "stack": false, "steppedLine": false, "targets": [ { - "expr": "histogram_quantile(0.99, sum(rate(tikv_raftstore_region_size_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])) by (le))", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": 
"99%", - "metric": "", - "refId": "B", - "step": 10 - }, - { - "expr": "histogram_quantile(0.95, sum(rate(tikv_raftstore_region_size_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])) by (le))", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "95%", - "metric": "", - "refId": "C", - "step": 10 - }, - { - "expr": "sum(rate(tikv_raftstore_region_size_sum{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])) / sum(rate(tikv_raftstore_region_size_count{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])) ", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_region_written_keys_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (instance) ", "format": "time_series", - "intervalFactor": 2, - "legendFormat": "avg", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{instance}}", "metric": "", - "refId": "D", - "step": 10 + "query": "sum(rate(\n tikv_region_written_keys_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (instance) ", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, - "title": "Approximate Region size", + "title": "Active written leaders", "tooltip": { - "msResolution": false, + "msResolution": true, "shared": true, - "sort": 2, + "sort": 0, "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -3519,156 +4120,180 @@ }, "yaxes": [ { - "format": "bytes", + "decimals": null, + "format": "none", "label": null, "logBase": 1, "max": null, - "min": "0", + "min": null, "show": true }, { + "decimals": null, "format": "short", 
"label": null, "logBase": 1, "max": null, "min": null, - "show": false + "show": true } ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } }, { - "aliasColors": {}, - "bars": true, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, + "cards": { + "cardPadding": null, + "cardRound": null + }, + "color": { + "cardColor": "#b4ff00", + "colorScale": "sqrt", + "colorScheme": "interpolateSpectral", + "exponent": 0.5, + "max": null, + "min": null, + "mode": "spectrum" + }, + "dataFormat": "tsbuckets", "datasource": "${DS_TEST-CLUSTER}", - "decimals": 1, + "description": null, "editable": true, "error": false, - "fill": 0, - "grid": {}, + "fieldConfig": { + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } + }, "gridPos": { - "h": 8, + "h": 7, "w": 12, "x": 0, - "y": 19 + "y": 14 }, - "id": 3638, + "heatmap": {}, + "height": null, + "hideTimeOverride": false, + "hideZeroBuckets": true, + "highlightCards": true, + "id": 33, + "interval": null, "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "hideEmpty": true, - "max": true, - "min": false, - "rightSide": true, - "show": false, - "sideWidth": null, - "sort": "current", - "sortDesc": true, - "total": false, - "values": true + "show": false }, - "lines": false, - "linewidth": 1, "links": [], - "nullPointMode": "null as zero", - "paceLength": 10, - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "spaceLength": 10, - "stack": false, - "steppedLine": false, + "maxDataPoints": 512, + "maxPerRow": null, + "minSpan": null, + "options": {}, + "repeat": null, + "repeatDirection": null, + "reverseYBuckets": false, + "span": null, "targets": [ { - "expr": "sum(rate(tikv_raftstore_region_size_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m]))", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "99%", + "datasource": 
"${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_raftstore_region_size_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (le) ", + "format": "heatmap", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{le}}", "metric": "", - "refId": "B", - "step": 10 + "query": "sum(rate(\n tikv_raftstore_region_size_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (le) ", + "refId": "", + "step": 10, + "target": "" } ], - "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, - "title": "Approximate Region size Histogram", + "title": "Approximate region size", "tooltip": { - "msResolution": false, - "shared": false, - "sort": 2, + "msResolution": true, + "shared": true, + "showHistogram": true, + "sort": 0, "value_type": "individual" }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "histogram", + "transformations": [], + "transparent": false, + "type": "heatmap", + "xAxis": { + "mode": "time", "name": null, "show": true, "values": [] }, - "yaxes": [ - { - "format": "bytes", - "label": null, - "logBase": 1, - "max": null, - "min": "0", - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": false - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } + "xBucketNumber": null, + "xBucketSize": null, + "yAxis": { + "decimals": 1, + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + "yBucketBound": "upper", + "yBucketNumber": null, + "yBucketSize": null }, { "aliasColors": {}, "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", - "decimals": 1, - "description": "The average rate of writing bytes to Regions per TiKV instance", + "description": "The approximate Region size", 
"editable": true, "error": false, + "fieldConfig": { + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } + }, "fill": 1, - "grid": {}, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, "gridPos": { - "h": 8, + "h": 7, "w": 12, "x": 12, - "y": 19 + "y": 14 }, - "id": 58, + "height": null, + "hideTimeOverride": false, + "id": 34, + "interval": null, + "isNew": true, "legend": { "alignAsTable": true, "avg": false, "current": true, "hideEmpty": true, + "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, - "sort": "current", + "sort": "max", "sortDesc": true, "total": false, "values": true @@ -3676,40 +4301,123 @@ "lines": true, "linewidth": 1, "links": [], - "nullPointMode": "connected", + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", + "options": { + "alertThreshold": true, + "dataLinks": [] + }, "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", - "seriesOverrides": [], - "spaceLength": 10, + "repeat": null, + "repeatDirection": null, + "seriesOverrides": [ + { + "alias": "count", + "bars": false, + "dashLength": 1, + "dashes": true, + "fill": 2, + "fillBelowTo": null, + "lines": true, + "spaceLength": 1, + "transform": "negative-Y", + "yaxis": 2, + "zindex": -3 + }, + { + "alias": "avg", + "bars": false, + "fill": 7, + "fillBelowTo": null, + "lines": true, + "yaxis": 1, + "zindex": 0 + } + ], + "span": null, "stack": false, "steppedLine": false, "targets": [ { - "expr": "sum(rate(tikv_region_written_bytes_sum{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\"}[1m])) by (instance) / sum(rate(tikv_region_written_bytes_count{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\"}[1m])) by (instance)", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "histogram_quantile(0.9999,(\n 
sum(rate(\n tikv_raftstore_region_size_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (le) \n \n \n)) ", "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{instance}}", - "metric": "tikv_regi", - "refId": "A", - "step": 10 + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "99.99%", + "metric": "", + "query": "histogram_quantile(0.9999,(\n sum(rate(\n tikv_raftstore_region_size_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (le) \n \n \n)) ", + "refId": "", + "step": 10, + "target": "" + }, + { + "datasource": "${DS_TEST-CLUSTER}", + "expr": "histogram_quantile(0.99,(\n sum(rate(\n tikv_raftstore_region_size_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (le) \n \n \n)) ", + "format": "time_series", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "99%", + "metric": "", + "query": "histogram_quantile(0.99,(\n sum(rate(\n tikv_raftstore_region_size_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (le) \n \n \n)) ", + "refId": "", + "step": 10, + "target": "" + }, + { + "datasource": "${DS_TEST-CLUSTER}", + "expr": "(sum(rate(\n tikv_raftstore_region_size_sum\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) / sum(rate(\n tikv_raftstore_region_size_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) )", + "format": "time_series", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "avg", + "metric": "", + "query": "(sum(rate(\n tikv_raftstore_region_size_sum\n 
{k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) / sum(rate(\n tikv_raftstore_region_size_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) )", + "refId": "", + "step": 10, + "target": "" + }, + { + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_raftstore_region_size_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) ", + "format": "time_series", + "hide": true, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "count", + "metric": "", + "query": "sum(rate(\n tikv_raftstore_region_size_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) ", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, - "title": "Region average written bytes", + "title": "Approximate region size", "tooltip": { - "msResolution": false, + "msResolution": true, "shared": true, "sort": 0, "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -3717,6 +4425,7 @@ }, "yaxes": [ { + "decimals": null, "format": "bytes", "label": null, "logBase": 1, @@ -3725,6 +4434,7 @@ "show": true }, { + "decimals": null, "format": "short", "label": null, "logBase": 1, @@ -3735,10 +4445,11 @@ ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } }, { + "cacheTimeout": null, "cards": { "cardPadding": null, "cardRound": null @@ -3746,93 +4457,148 @@ "color": { "cardColor": "#b4ff00", "colorScale": "sqrt", - "colorScheme": "interpolateOranges", + "colorScheme": "interpolateSpectral", "exponent": 0.5, + "max": null, + "min": null, "mode": "spectrum" }, - "dataFormat": "timeseries", + "dataFormat": "tsbuckets", 
"datasource": "${DS_TEST-CLUSTER}", + "description": null, + "editable": true, + "error": false, + "fieldConfig": { + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } + }, "gridPos": { - "h": 8, + "h": 7, "w": 12, "x": 0, - "y": 27 + "y": 21 }, "heatmap": {}, - "hideZeroBuckets": false, + "height": null, + "hideTimeOverride": false, + "hideZeroBuckets": true, "highlightCards": true, - "id": 3646, + "id": 35, + "interval": null, "legend": { "show": false }, "links": [], + "maxDataPoints": 512, + "maxPerRow": null, + "minSpan": null, + "options": {}, + "repeat": null, + "repeatDirection": null, "reverseYBuckets": false, + "span": null, "targets": [ { - "expr": "sum(rate(tikv_region_written_bytes_bucket[1m]))", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{instance}}", - "metric": "tikv_regi", - "refId": "A", - "step": 10 + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_region_written_bytes_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (le) ", + "format": "heatmap", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{le}}", + "metric": "", + "query": "sum(rate(\n tikv_region_written_bytes_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (le) ", + "refId": "", + "step": 10, + "target": "" } ], "timeFrom": null, "timeShift": null, "title": "Region written bytes", "tooltip": { - "show": true, - "showHistogram": false + "msResolution": true, + "shared": true, + "showHistogram": true, + "sort": 0, + "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "heatmap", "xAxis": { - "show": true + "mode": "time", + "name": null, + "show": true, + "values": [] }, "xBucketNumber": null, "xBucketSize": null, "yAxis": { - "decimals": null, - "format": "decbytes", + "decimals": 1, + 
"format": "bytes", + "label": null, "logBase": 1, "max": null, "min": null, - "show": true, - "splitFactor": null + "show": true }, - "yBucketBound": "auto", + "yBucketBound": "upper", "yBucketNumber": null, "yBucketSize": null }, { "aliasColors": {}, "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", - "decimals": 1, - "description": "The average rate of written keys to Regions per TiKV instance", + "description": "The average rate of writing bytes to Regions per TiKV instance", "editable": true, "error": false, + "fieldConfig": { + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } + }, "fill": 1, - "grid": {}, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, "gridPos": { - "h": 8, + "h": 7, "w": 12, "x": 12, - "y": 27 + "y": 21 }, - "id": 57, + "height": null, + "hideTimeOverride": false, + "id": 36, + "interval": null, + "isNew": true, "legend": { "alignAsTable": true, "avg": false, "current": true, + "hideEmpty": true, + "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, - "sort": "current", + "sort": "max", "sortDesc": true, "total": false, "values": true @@ -3840,40 +4606,55 @@ "lines": true, "linewidth": 1, "links": [], - "nullPointMode": "connected", + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", + "options": { + "alertThreshold": true, + "dataLinks": [] + }, "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", + "repeat": null, + "repeatDirection": null, "seriesOverrides": [], - "spaceLength": 10, + "span": null, "stack": false, "steppedLine": false, "targets": [ { - "expr": "sum(rate(tikv_region_written_keys_sum{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])) by (instance) / 
sum(rate(tikv_region_written_keys_count{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])) by (instance)", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "(sum(rate(\n tikv_region_written_bytes_sum\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (instance) / sum(rate(\n tikv_region_written_bytes_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (instance) )", "format": "time_series", - "intervalFactor": 2, + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, "legendFormat": "{{instance}}", - "metric": "tikv_region_written_keys_bucket", - "refId": "A", - "step": 10 + "metric": "", + "query": "(sum(rate(\n tikv_region_written_bytes_sum\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (instance) / sum(rate(\n tikv_region_written_bytes_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (instance) )", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, - "title": "Region average written keys", + "title": "Region average written bytes", "tooltip": { - "msResolution": false, + "msResolution": true, "shared": true, "sort": 0, "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -3881,7 +4662,8 @@ }, "yaxes": [ { - "format": "short", + "decimals": null, + "format": "bytes", "label": null, "logBase": 1, "max": null, @@ -3889,6 +4671,7 @@ "show": true }, { + "decimals": null, "format": "short", "label": null, "logBase": 1, @@ -3899,10 +4682,11 @@ ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } }, { + "cacheTimeout": null, 
"cards": { "cardPadding": null, "cardRound": null @@ -3910,135 +4694,204 @@ "color": { "cardColor": "#b4ff00", "colorScale": "sqrt", - "colorScheme": "interpolateOranges", + "colorScheme": "interpolateSpectral", "exponent": 0.5, + "max": null, + "min": null, "mode": "spectrum" }, - "dataFormat": "timeseries", + "dataFormat": "tsbuckets", "datasource": "${DS_TEST-CLUSTER}", + "description": null, + "editable": true, + "error": false, + "fieldConfig": { + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } + }, "gridPos": { - "h": 8, + "h": 7, "w": 12, "x": 0, - "y": 35 + "y": 28 }, "heatmap": {}, - "hideZeroBuckets": false, + "height": null, + "hideTimeOverride": false, + "hideZeroBuckets": true, "highlightCards": true, - "id": 3647, + "id": 37, + "interval": null, "legend": { "show": false }, "links": [], + "maxDataPoints": 512, + "maxPerRow": null, + "minSpan": null, + "options": {}, + "repeat": null, + "repeatDirection": null, "reverseYBuckets": false, + "span": null, "targets": [ { - "expr": "sum(rate(tikv_region_written_keys_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m]))", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{instance}}", - "metric": "tikv_region_written_keys_bucket", - "refId": "A", - "step": 10 + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_region_written_keys_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (le) ", + "format": "heatmap", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{le}}", + "metric": "", + "query": "sum(rate(\n tikv_region_written_keys_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (le) ", + "refId": "", + "step": 10, + "target": "" } ], "timeFrom": null, "timeShift": null, "title": "Region written keys", 
"tooltip": { - "show": true, - "showHistogram": false + "msResolution": true, + "shared": true, + "showHistogram": true, + "sort": 0, + "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "heatmap", "xAxis": { - "show": true + "mode": "time", + "name": null, + "show": true, + "values": [] }, "xBucketNumber": null, "xBucketSize": null, "yAxis": { - "decimals": null, - "format": "short", + "decimals": 1, + "format": "none", + "label": null, "logBase": 1, "max": null, "min": null, - "show": true, - "splitFactor": null + "show": true }, - "yBucketBound": "auto", + "yBucketBound": "upper", "yBucketNumber": null, "yBucketSize": null }, { "aliasColors": {}, "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", - "decimals": 1, - "description": "The size of requests into request batch per TiKV instance", + "description": "The average rate of written keys to Regions per TiKV instance", "editable": true, "error": false, + "fieldConfig": { + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } + }, "fill": 1, - "grid": {}, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, "gridPos": { - "h": 8, + "h": 7, "w": 12, "x": 12, - "y": 35 + "y": 28 }, - "id": 3720, + "height": null, + "hideTimeOverride": false, + "id": 38, + "interval": null, + "isNew": true, "legend": { "alignAsTable": true, "avg": false, "current": true, + "hideEmpty": true, + "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, + "sideWidth": null, + "sort": "max", + "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], - "nullPointMode": "null", + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", + "options": { + "alertThreshold": true, + "dataLinks": [] + 
}, "percentage": false, - "pointradius": 2, + "pointradius": 5, "points": false, "renderer": "flot", + "repeat": null, + "repeatDirection": null, "seriesOverrides": [], - "spaceLength": 10, + "span": null, "stack": false, "steppedLine": false, "targets": [ { - "expr": "sum(rate(tikv_server_request_batch_size_sum{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])) by (type) / sum(rate(tikv_server_request_batch_size_count{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])) by (type)", - "format": "time_series", - "intervalFactor": 1, - "legendFormat": "{{type}} avg", - "refId": "A" - }, - { - "expr": "histogram_quantile(0.99, sum(rate(tikv_server_request_batch_size_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])) by (le, type))", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "(sum(rate(\n tikv_region_written_keys_sum\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (instance) / sum(rate(\n tikv_region_written_keys_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (instance) )", "format": "time_series", + "hide": false, + "instant": false, + "interval": "", "intervalFactor": 1, - "legendFormat": "{{type}} 99", - "refId": "B" + "legendFormat": "{{instance}}", + "metric": "", + "query": "(sum(rate(\n tikv_region_written_keys_sum\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (instance) / sum(rate(\n tikv_region_written_keys_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (instance) )", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, - "title": "Request batch input", + "title": "Region 
average written keys", "tooltip": { + "msResolution": true, "shared": true, "sort": 0, "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -4046,14 +4899,16 @@ }, "yaxes": [ { - "format": "short", + "decimals": null, + "format": "bytes", "label": null, - "logBase": 10, + "logBase": 1, "max": null, - "min": "0", + "min": null, "show": true }, { + "decimals": null, "format": "short", "label": null, "logBase": 1, @@ -4064,38 +4919,56 @@ ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } }, { "aliasColors": {}, "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", - "decimals": 1, - "description": " \tThe number of peers in hibernated state", + "description": "The number of peers in hibernated state", "editable": true, "error": false, - "fill": 0, - "grid": {}, + "fieldConfig": { + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } + }, + "fill": 1, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, "gridPos": { - "h": 8, + "h": 7, "w": 12, "x": 0, - "y": 43 + "y": 35 }, - "id": 3730, + "height": null, + "hideTimeOverride": false, + "id": 39, + "interval": null, + "isNew": true, "legend": { "alignAsTable": true, "avg": false, "current": true, + "hideEmpty": true, + "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, - "sort": "current", + "sort": "max", "sortDesc": true, "total": false, "values": true @@ -4103,45 +4976,55 @@ "lines": true, "linewidth": 1, "links": [], - "nullPointMode": "null", + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", + "options": { + "alertThreshold": true, + "dataLinks": [] + }, "percentage": false, 
"pointradius": 5, "points": false, "renderer": "flot", - "seriesOverrides": [ - { - "alias": "total", - "lines": false - } - ], - "spaceLength": 10, + "repeat": null, + "repeatDirection": null, + "seriesOverrides": [], + "span": null, "stack": false, "steppedLine": false, "targets": [ { - "expr": "sum(tikv_raftstore_hibernated_peer_state{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}) by (instance, state)", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum((\n tikv_raftstore_hibernated_peer_state\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n \n)) by (instance, state) ", "format": "time_series", "hide": false, - "intervalFactor": 2, + "instant": false, + "interval": "", + "intervalFactor": 1, "legendFormat": "{{instance}}-{{state}}", - "refId": "A", - "step": 10 + "metric": "", + "query": "sum((\n tikv_raftstore_hibernated_peer_state\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n \n)) by (instance, state) ", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, "title": "Hibernate Peers", "tooltip": { - "msResolution": false, + "msResolution": true, "shared": true, "sort": 0, - "value_type": "cumulative" + "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -4149,14 +5032,16 @@ }, "yaxes": [ { - "format": "short", + "decimals": null, + "format": "none", "label": null, "logBase": 1, "max": null, - "min": "0", + "min": null, "show": true }, { + "decimals": null, "format": "short", "label": null, "logBase": 1, @@ -4167,24 +5052,44 @@ ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } }, { "aliasColors": {}, "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", - "decimals": 
null, + "description": null, + "editable": true, + "error": false, + "fieldConfig": { + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } + }, "fill": 1, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, "gridPos": { - "h": 8, + "h": 7, "w": 12, "x": 12, - "y": 43 + "y": 35 }, - "id": 7266, + "height": null, + "hideTimeOverride": false, + "id": 40, + "interval": null, + "isNew": true, "legend": { "alignAsTable": true, "avg": false, @@ -4195,50 +5100,79 @@ "min": false, "rightSide": true, "show": true, + "sideWidth": null, + "sort": "max", + "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], - "nullPointMode": "null", + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", + "options": { + "alertThreshold": true, + "dataLinks": [] + }, "percentage": false, - "pointradius": 2, + "pointradius": 5, "points": false, "renderer": "flot", + "repeat": null, + "repeatDirection": null, "seriesOverrides": [], - "spaceLength": 10, + "span": null, "stack": false, "steppedLine": false, "targets": [ { - "expr": "tikv_server_mem_trace_sum{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", name=~\"raftstore-.*\"}", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "((\n tikv_server_mem_trace_sum\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",name=~\"raftstore-.*\"}\n \n)) ", "format": "time_series", + "hide": false, + "instant": false, + "interval": "", "intervalFactor": 1, "legendFormat": "{{instance}}-{{name}}", - "refId": "A" + "metric": "", + "query": "((\n tikv_server_mem_trace_sum\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",name=~\"raftstore-.*\"}\n \n)) ", + "refId": "", + "step": 10, + "target": "" }, { - "expr": 
"raft_engine_memory_usage{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "((\n raft_engine_memory_usage\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n \n)) ", "format": "time_series", + "hide": false, + "instant": false, + "interval": "", "intervalFactor": 1, "legendFormat": "{{instance}}-raft-engine", - "refId": "B" + "metric": "", + "query": "((\n raft_engine_memory_usage\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n \n)) ", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, "title": "Memory trace", "tooltip": { + "msResolution": true, "shared": true, "sort": 0, "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -4246,6 +5180,7 @@ }, "yaxes": [ { + "decimals": null, "format": "bytes", "label": null, "logBase": 1, @@ -4254,78 +5189,123 @@ "show": true }, { + "decimals": null, "format": "short", "label": null, "logBase": 1, "max": null, "min": null, - "show": false + "show": true } ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } }, { "aliasColors": {}, "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", + "description": null, + "editable": true, + "error": false, + "fieldConfig": { + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } + }, "fill": 1, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, "gridPos": { - "h": 8, + "h": 7, "w": 12, "x": 0, - "y": 51 + "y": 42 }, - "id": 9560, + "height": null, + "hideTimeOverride": false, + "id": 41, + "interval": null, + 
"isNew": true, "legend": { "alignAsTable": true, "avg": false, - "current": false, - "max": false, + "current": true, + "hideEmpty": true, + "hideZero": true, + "max": true, "min": false, "rightSide": true, "show": true, + "sideWidth": null, + "sort": "max", + "sortDesc": true, "total": false, - "values": false + "values": true }, "lines": true, "linewidth": 1, "links": [], - "nullPointMode": "null", + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", + "options": { + "alertThreshold": true, + "dataLinks": [] + }, "percentage": false, - "pointradius": 2, + "pointradius": 5, "points": false, "renderer": "flot", + "repeat": null, + "repeatDirection": null, "seriesOverrides": [], - "spaceLength": 10, + "span": null, "stack": false, "steppedLine": false, "targets": [ { - "expr": "rate(tikv_raft_entries_evict_bytes{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}[1m])", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_raft_entries_evict_bytes\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (instance) ", "format": "time_series", + "hide": false, + "instant": false, + "interval": "", "intervalFactor": 1, "legendFormat": "{{instance}}", - "refId": "A" + "metric": "", + "query": "sum(rate(\n tikv_raft_entries_evict_bytes\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (instance) ", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, "title": "Raft Entry Cache Evicts", "tooltip": { + "msResolution": true, "shared": true, "sort": 0, "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -4333,7 +5313,8 @@ }, "yaxes": [ { - "format": "Bps", + "decimals": 
null, + "format": "bytes", "label": null, "logBase": 1, "max": null, @@ -4341,53 +5322,67 @@ "show": true }, { + "decimals": null, "format": "short", "label": null, "logBase": 1, "max": null, "min": null, - "show": false + "show": true } ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } }, { "aliasColors": {}, "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", - "decimals": 1, - "description": "", + "description": null, + "editable": true, + "error": false, "fieldConfig": { - "defaults": {}, - "overrides": [] + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } }, "fill": 1, - "fillGradient": 0, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, "gridPos": { - "h": 8, + "h": 7, "w": 12, "x": 12, - "y": 51 + "y": 42 }, - "hiddenSeries": false, - "id": 12971, + "height": null, + "hideTimeOverride": false, + "id": 42, + "interval": null, + "isNew": true, "legend": { "alignAsTable": true, "avg": false, "current": true, - "hideEmpty": false, + "hideEmpty": true, "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, - "sort": "current", + "sort": "max", "sortDesc": true, "total": false, "values": true @@ -4395,44 +5390,55 @@ "lines": true, "linewidth": 1, "links": [], - "nullPointMode": "null", + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", "options": { - "alertThreshold": true + "alertThreshold": true, + "dataLinks": [] }, "percentage": false, - "pluginVersion": "7.5.7", "pointradius": 5, "points": false, "renderer": "flot", + "repeat": null, + "repeatDirection": null, "seriesOverrides": [], - "spaceLength": 10, + "span": null, "stack": false, "steppedLine": false, "targets": [ { - "exemplar": true, - "expr": "histogram_quantile(0.99, 
sum(rate(tikv_server_address_resolve_duration_secs_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])) by (le, instance))", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "histogram_quantile(0.99,(\n sum(rate(\n tikv_server_address_resolve_duration_secs_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (instance, le) \n \n \n)) ", "format": "time_series", + "hide": false, + "instant": false, "interval": "", - "intervalFactor": 2, + "intervalFactor": 1, "legendFormat": "{{instance}}", - "refId": "A", - "step": 10 + "metric": "", + "query": "histogram_quantile(0.99,(\n sum(rate(\n tikv_server_address_resolve_duration_secs_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (instance, le) \n \n \n)) ", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, "title": "Resolve address duration", "tooltip": { + "msResolution": true, "shared": true, "sort": 0, "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -4440,6 +5446,7 @@ }, "yaxes": [ { + "decimals": null, "format": "s", "label": null, "logBase": 1, @@ -4448,6 +5455,7 @@ "show": true }, { + "decimals": null, "format": "short", "label": null, "logBase": 1, @@ -4458,79 +5466,130 @@ ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } }, { "aliasColors": {}, "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", + "description": null, + "editable": true, + "error": false, "fieldConfig": { - "defaults": {}, - "overrides": [] + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [ + { + "colorMode": "critical", + "fill": true, + "line": true, + "op": 
"gt", + "value": 1.0, + "yaxis": "left" + } + ] + } + } }, "fill": 1, - "fillGradient": 0, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, "gridPos": { - "h": 8, + "h": 7, "w": 12, "x": 0, - "y": 60 + "y": 49 }, - "hiddenSeries": false, - "id": 23763572581, + "height": null, + "hideTimeOverride": false, + "id": 43, + "interval": null, + "isNew": true, "legend": { + "alignAsTable": true, "avg": false, - "current": false, - "hideEmpty": false, + "current": true, + "hideEmpty": true, "hideZero": true, - "max": false, + "max": true, "min": false, + "rightSide": true, "show": true, + "sideWidth": null, + "sort": "max", + "sortDesc": true, "total": false, - "values": false + "values": true }, "lines": true, "linewidth": 1, - "nullPointMode": "null", + "links": [], + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", "options": { - "alertThreshold": true + "alertThreshold": true, + "dataLinks": [] }, "percentage": false, - "pluginVersion": "7.5.11", - "pointradius": 2, + "pointradius": 5, "points": false, "renderer": "flot", + "repeat": null, + "repeatDirection": null, "seriesOverrides": [], - "spaceLength": 10, + "span": null, "stack": false, "steppedLine": false, "targets": [ { - "exemplar": true, - "expr": "histogram_quantile(0.99, sum(rate(tikv_yatp_pool_schedule_wait_duration_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])) by (le, name))", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "histogram_quantile(0.99,(\n sum(rate(\n tikv_yatp_pool_schedule_wait_duration_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (name, le) \n \n \n)) ", + "format": "time_series", "hide": false, + "instant": false, "interval": "", + "intervalFactor": 1, "legendFormat": "{{name}}", - 
"queryType": "randomWalk", - "refId": "A" + "metric": "", + "query": "histogram_quantile(0.99,(\n sum(rate(\n tikv_yatp_pool_schedule_wait_duration_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (name, le) \n \n \n)) ", + "refId": "", + "step": 10, + "target": "" + } + ], + "thresholds": [ + { + "colorMode": "critical", + "fill": true, + "line": true, + "op": "gt", + "value": 1.0, + "yaxis": "left" } ], - "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, "title": "99% Thread Pool Schedule Wait Duration", "tooltip": { + "msResolution": true, "shared": true, "sort": 0, "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -4538,17 +5597,19 @@ }, "yaxes": [ { + "decimals": null, "format": "s", "label": null, "logBase": 2, - "max": "30", + "max": null, "min": null, "show": true }, { + "decimals": null, "format": "short", "label": null, - "logBase": 2, + "logBase": 1, "max": null, "min": null, "show": true @@ -4556,79 +5617,130 @@ ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } }, { "aliasColors": {}, "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", + "description": "The average schedule wait duration of tasks in each thread pool", + "editable": true, + "error": false, "fieldConfig": { - "defaults": {}, - "overrides": [] + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [ + { + "colorMode": "critical", + "fill": true, + "line": true, + "op": "gt", + "value": 1.0, + "yaxis": "left" + } + ] + } + } }, "fill": 1, - "fillGradient": 0, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, "gridPos": { - "h": 8, + "h": 7, "w": 12,
"x": 12, - "y": 60 + "y": 49 }, - "hiddenSeries": false, - "id": 23763572692, + "height": null, + "hideTimeOverride": false, + "id": 44, + "interval": null, + "isNew": true, "legend": { + "alignAsTable": true, "avg": false, - "current": false, - "hideEmpty": false, + "current": true, + "hideEmpty": true, "hideZero": true, - "max": false, + "max": true, "min": false, + "rightSide": true, "show": true, + "sideWidth": null, + "sort": "max", + "sortDesc": true, "total": false, - "values": false + "values": true }, "lines": true, "linewidth": 1, - "nullPointMode": "null", + "links": [], + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", "options": { - "alertThreshold": true + "alertThreshold": true, + "dataLinks": [] }, "percentage": false, - "pluginVersion": "7.5.11", - "pointradius": 2, + "pointradius": 5, "points": false, "renderer": "flot", + "repeat": null, + "repeatDirection": null, "seriesOverrides": [], - "spaceLength": 10, + "span": null, "stack": false, "steppedLine": false, "targets": [ { - "exemplar": true, - "expr": "sum(rate(tikv_yatp_pool_schedule_wait_duration_sum{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])) by (name) / sum(rate(tikv_yatp_pool_schedule_wait_duration_count{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])) by (name)", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "(sum(rate(\n tikv_yatp_pool_schedule_wait_duration_sum\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (name) / sum(rate(\n tikv_yatp_pool_schedule_wait_duration_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (name) )", + "format": "time_series", "hide": false, + "instant": false, "interval": "", + "intervalFactor": 1, "legendFormat": "{{name}}", - "queryType": "randomWalk", - "refId": "A" + 
"metric": "", + "query": "(sum(rate(\n tikv_yatp_pool_schedule_wait_duration_sum\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (name) / sum(rate(\n tikv_yatp_pool_schedule_wait_duration_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (name) )", + "refId": "", + "step": 10, + "target": "" + } + ], + "thresholds": [ + { + "colorMode": "critical", + "fill": true, + "line": true, + "op": "gt", + "value": 1.0, + "yaxis": "left" } ], - "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, "title": "Average Thread Pool Schedule Wait Duration", "tooltip": { + "msResolution": true, "shared": true, "sort": 0, "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -4636,17 +5748,19 @@ }, "yaxes": [ { + "decimals": null, "format": "s", "label": null, "logBase": 2, - "max": "30", + "max": null, "min": null, "show": true }, { + "decimals": null, "format": "short", "label": null, - "logBase": 2, + "logBase": 1, "max": null, "min": null, "show": true @@ -4654,92 +5768,127 @@ ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } }, { "aliasColors": {}, "bars": true, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", + "description": null, + "editable": true, + "error": false, "fieldConfig": { - "defaults": {}, - "overrides": [] + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } }, "fill": 1, - "fillGradient": 0, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, "gridPos": { - "h": 8, + "h": 7, "w": 12, "x": 0, - "y": 68 + "y": 56 }, - "hiddenSeries": false, - "id": 23763572784, + 
"height": null, + "hideTimeOverride": false, + "id": 45, + "interval": null, + "isNew": true, "legend": { + "alignAsTable": true, "avg": false, - "current": false, - "max": false, + "current": true, + "hideEmpty": true, + "hideZero": true, + "max": true, "min": false, + "rightSide": true, "show": true, + "sideWidth": null, + "sort": "max", + "sortDesc": true, "total": false, - "values": false + "values": true }, "lines": false, "linewidth": 1, - "nullPointMode": "null", + "links": [], + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", "options": { - "alertThreshold": true + "alertThreshold": true, + "dataLinks": [] }, "percentage": false, - "pluginVersion": "7.5.11", - "pointradius": 2, + "pointradius": 5, "points": false, "renderer": "flot", - "seriesOverrides": [ - { - "alias": "/.*/", - "stack": "A" - } - ], - "spaceLength": 10, - "stack": false, + "repeat": null, + "repeatDirection": null, + "seriesOverrides": [], + "span": null, + "stack": true, "steppedLine": false, "targets": [ { - "exemplar": true, - "expr": "sum(rate(tikv_storage_rocksdb_perf{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", metric=\"block_read_time\"}[1m])) by (req)", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_storage_rocksdb_perf\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",metric=\"block_read_time\"}\n [$__rate_interval]\n)) by (req) ", + "format": "time_series", "hide": false, + "instant": false, "interval": "", + "intervalFactor": 1, "legendFormat": "{{req}}", - "queryType": "randomWalk", - "refId": "A" + "metric": "", + "query": "sum(rate(\n tikv_storage_rocksdb_perf\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",metric=\"block_read_time\"}\n [$__rate_interval]\n)) by (req) ", + "refId": "", + "step": 10, + "target": "" }, { - "exemplar": true, - "expr": 
"sum(rate(tikv_coprocessor_rocksdb_perf{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", metric=\"block_read_time\"}[1m])) by (req)", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_coprocessor_rocksdb_perf\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",metric=\"block_read_time\"}\n [$__rate_interval]\n)) by (req) ", + "format": "time_series", "hide": false, "instant": false, "interval": "", + "intervalFactor": 1, "legendFormat": "copr-{{req}}", - "queryType": "randomWalk", - "refId": "B" + "metric": "", + "query": "sum(rate(\n tikv_coprocessor_rocksdb_perf\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",metric=\"block_read_time\"}\n [$__rate_interval]\n)) by (req) ", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, "title": "Disk IO time per second", "tooltip": { + "msResolution": true, "shared": true, "sort": 0, "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -4747,6 +5896,7 @@ }, "yaxes": [ { + "decimals": null, "format": "ns", "label": null, "logBase": 1, @@ -4755,6 +5905,7 @@ "show": true }, { + "decimals": null, "format": "short", "label": null, "logBase": 1, @@ -4765,90 +5916,127 @@ ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } }, { "aliasColors": {}, "bars": true, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", + "description": null, + "editable": true, + "error": false, "fieldConfig": { - "defaults": {}, - "overrides": [] + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } }, "fill": 1, - "fillGradient": 0, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": 
null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, "gridPos": { - "h": 8, + "h": 7, "w": 12, "x": 12, - "y": 68 + "y": 56 }, - "hiddenSeries": false, - "id": 23763572785, + "height": null, + "hideTimeOverride": false, + "id": 46, + "interval": null, + "isNew": true, "legend": { + "alignAsTable": true, "avg": false, - "current": false, - "max": false, + "current": true, + "hideEmpty": true, + "hideZero": true, + "max": true, "min": false, + "rightSide": true, "show": true, + "sideWidth": null, + "sort": "max", + "sortDesc": true, "total": false, - "values": false + "values": true }, "lines": false, "linewidth": 1, - "nullPointMode": "null", + "links": [], + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", "options": { - "alertThreshold": true + "alertThreshold": true, + "dataLinks": [] }, "percentage": false, - "pluginVersion": "7.5.11", - "pointradius": 2, + "pointradius": 5, "points": false, "renderer": "flot", - "seriesOverrides": [ - { - "alias": "/.*/", - "stack": "A" - } - ], - "spaceLength": 10, - "stack": false, + "repeat": null, + "repeatDirection": null, + "seriesOverrides": [], + "span": null, + "stack": true, "steppedLine": false, "targets": [ { - "exemplar": true, - "expr": "sum(rate(tikv_storage_rocksdb_perf{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", metric=\"block_read_byte\"}[1m])) by (req)", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_storage_rocksdb_perf\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",metric=\"block_read_byte\"}\n [$__rate_interval]\n)) by (req) ", + "format": "time_series", + "hide": false, + "instant": false, "interval": "", + "intervalFactor": 1, "legendFormat": "{{req}}", - "queryType": "randomWalk", - "refId": "A" + "metric": "", + "query": "sum(rate(\n tikv_storage_rocksdb_perf\n 
{k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",metric=\"block_read_byte\"}\n [$__rate_interval]\n)) by (req) ", + "refId": "", + "step": 10, + "target": "" }, { - "exemplar": true, - "expr": "sum(rate(tikv_coprocessor_rocksdb_perf{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", metric=\"block_read_byte\"}[1m])) by (req)", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_coprocessor_rocksdb_perf\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",metric=\"block_read_byte\"}\n [$__rate_interval]\n)) by (req) ", + "format": "time_series", "hide": false, + "instant": false, "interval": "", + "intervalFactor": 1, "legendFormat": "copr-{{req}}", - "queryType": "randomWalk", - "refId": "B" + "metric": "", + "query": "sum(rate(\n tikv_coprocessor_rocksdb_perf\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",metric=\"block_read_byte\"}\n [$__rate_interval]\n)) by (req) ", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, "title": "Disk IO bytes per second", "tooltip": { + "msResolution": true, "shared": true, "sort": 0, "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -4856,7 +6044,8 @@ }, "yaxes": [ { - "format": "binBps", + "decimals": null, + "format": "ns", "label": null, "logBase": 1, "max": null, @@ -4864,6 +6053,7 @@ "show": true }, { + "decimals": null, "format": "short", "label": null, "logBase": 1, @@ -4874,57 +6064,98 @@ ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } } ], "repeat": null, + "repeatDirection": null, + "span": null, + "targets": [], + "timeFrom": null, + "timeShift": null, "title": "Server", + "transformations": [], + "transparent": false, "type": "row" }, { + 
"cacheTimeout": null, "collapsed": true, "datasource": null, + "description": null, + "editable": true, + "error": false, + "fieldConfig": { + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } + }, "gridPos": { - "h": 1, + "h": 7, "w": 24, "x": 0, - "y": 4 + "y": 0 }, - "id": 2745, + "height": null, + "hideTimeOverride": false, + "id": 47, + "interval": null, + "links": [], + "maxDataPoints": 100, + "maxPerRow": null, + "minSpan": null, "panels": [ { "aliasColors": {}, "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", - "decimals": 1, "description": "The count of different kinds of gRPC message", + "editable": true, + "error": false, "fieldConfig": { - "defaults": {}, - "overrides": [] + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } }, "fill": 1, - "fillGradient": 0, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, "gridPos": { - "h": 8, + "h": 7, "w": 12, "x": 0, - "y": 5 + "y": 0 }, - "hiddenSeries": false, - "id": 95, + "height": null, + "hideTimeOverride": false, + "id": 48, + "interval": null, + "isNew": true, "legend": { "alignAsTable": true, "avg": false, "current": true, + "hideEmpty": true, + "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, - "sort": "current", + "sort": "max", "sortDesc": true, "total": false, "values": true @@ -4932,43 +6163,70 @@ "lines": true, "linewidth": 1, "links": [], - "nullPointMode": "null", + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", "options": { - "alertThreshold": true + "alertThreshold": true, + "dataLinks": [] }, "percentage": false, - "pluginVersion": "7.5.11", "pointradius": 5, "points": false, "renderer": "flot", + "repeat": null, + "repeatDirection": null, 
"seriesOverrides": [], - "spaceLength": 10, + "span": null, "stack": false, "steppedLine": false, "targets": [ { - "expr": "sum(rate(tikv_grpc_msg_duration_seconds_count{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", type!=\"kv_gc\"}[1m])) by (type)", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_grpc_msg_duration_seconds_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",type!=\"kv_gc\"}\n [$__rate_interval]\n)) by (type) ", "format": "time_series", - "intervalFactor": 2, + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, "legendFormat": "{{type}}", - "metric": "tikv_grpc_msg_duration_seconds_bucket", - "refId": "A", - "step": 10 + "metric": "", + "query": "sum(rate(\n tikv_grpc_msg_duration_seconds_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",type!=\"kv_gc\"}\n [$__rate_interval]\n)) by (type) ", + "refId": "", + "step": 10, + "target": "" + }, + { + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_grpc_msg_duration_seconds_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",type!=\"kv_gc\"}\n [$__rate_interval]\n)) by (type, priority) ", + "format": "time_series", + "hide": true, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{type}}-{{priority}}", + "metric": "", + "query": "sum(rate(\n tikv_grpc_msg_duration_seconds_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",type!=\"kv_gc\"}\n [$__rate_interval]\n)) by (type, priority) ", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, "title": "gRPC message count", "tooltip": { + "msResolution": true, "shared": true, "sort": 0, "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - 
"buckets": null, "mode": "time", "name": null, "show": true, @@ -4976,7 +6234,8 @@ }, "yaxes": [ { - "format": "ops", + "decimals": null, + "format": "reqps", "label": null, "logBase": 1, "max": null, @@ -4984,6 +6243,7 @@ "show": true }, { + "decimals": null, "format": "short", "label": null, "logBase": 1, @@ -4994,41 +6254,56 @@ ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } }, { "aliasColors": {}, "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", - "decimals": 1, "description": "The count of different kinds of gRPC message which is failed", + "editable": true, + "error": false, "fieldConfig": { - "defaults": {}, - "overrides": [] + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } }, "fill": 1, - "fillGradient": 0, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, "gridPos": { - "h": 8, + "h": 7, "w": 12, "x": 12, - "y": 5 + "y": 0 }, - "hiddenSeries": false, - "id": 107, + "height": null, + "hideTimeOverride": false, + "id": 49, + "interval": null, + "isNew": true, "legend": { "alignAsTable": true, "avg": false, "current": true, + "hideEmpty": true, + "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, - "sort": "current", + "sort": "max", "sortDesc": true, "total": false, "values": true @@ -5036,43 +6311,55 @@ "lines": true, "linewidth": 1, "links": [], - "nullPointMode": "null", + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", "options": { - "alertThreshold": true + "alertThreshold": true, + "dataLinks": [] }, "percentage": false, - "pluginVersion": "7.5.11", "pointradius": 5, "points": false, "renderer": "flot", + "repeat": null, + "repeatDirection": null, "seriesOverrides": [], - "spaceLength": 10, + "span": null, "stack": 
false, "steppedLine": false, "targets": [ { - "expr": "sum(rate(tikv_grpc_msg_fail_total{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", type!=\"kv_gc\"}[1m])) by (type)", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_grpc_msg_fail_total\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",type!=\"kv_gc\"}\n [$__rate_interval]\n)) by (type) ", "format": "time_series", - "intervalFactor": 2, + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, "legendFormat": "{{type}}", - "metric": "tikv_grpc_msg_fail_total", - "refId": "A", - "step": 10 + "metric": "", + "query": "sum(rate(\n tikv_grpc_msg_fail_total\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",type!=\"kv_gc\"}\n [$__rate_interval]\n)) by (type) ", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, "title": "gRPC message failed", "tooltip": { + "msResolution": true, "shared": true, "sort": 0, "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -5080,7 +6367,8 @@ }, "yaxes": [ { - "format": "ops", + "decimals": null, + "format": "reqps", "label": null, "logBase": 1, "max": null, @@ -5088,6 +6376,7 @@ "show": true }, { + "decimals": null, "format": "short", "label": null, "logBase": 1, @@ -5098,43 +6387,56 @@ ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } }, { "aliasColors": {}, "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", - "decimals": 1, - "description": "The execution time of gRPC message", + "description": "The 99th percentile of the execution time of gRPC messages", + "editable": true, + "error": false, "fieldConfig": { - "defaults": {}, - "overrides": [] + "defaults": { +
"thresholds": { + "mode": "absolute", + "steps": [] + } + } }, "fill": 1, - "fillGradient": 0, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, "gridPos": { - "h": 8, + "h": 7, "w": 12, "x": 0, - "y": 13 + "y": 7 }, - "hiddenSeries": false, - "id": 98, + "height": null, + "hideTimeOverride": false, + "id": 50, + "interval": null, + "isNew": true, "legend": { "alignAsTable": true, "avg": false, "current": true, - "hideEmpty": false, + "hideEmpty": true, "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, - "sort": "current", + "sort": "max", "sortDesc": true, "total": false, "values": true @@ -5142,43 +6444,70 @@ "lines": true, "linewidth": 1, "links": [], - "nullPointMode": "null", + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", "options": { - "alertThreshold": true + "alertThreshold": true, + "dataLinks": [] }, "percentage": false, - "pluginVersion": "7.5.11", "pointradius": 5, "points": false, "renderer": "flot", + "repeat": null, + "repeatDirection": null, "seriesOverrides": [], - "spaceLength": 10, + "span": null, "stack": false, "steppedLine": false, "targets": [ { - "expr": "histogram_quantile(0.99, sum(rate(tikv_grpc_msg_duration_seconds_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", type!=\"kv_gc\"}[1m])) by (le, type))", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "histogram_quantile(0.99,(\n sum(rate(\n tikv_grpc_msg_duration_seconds_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",type!=\"kv_gc\"}\n [$__rate_interval]\n)) by (type, le) \n \n \n)) ", "format": "time_series", + "hide": false, + "instant": false, "interval": "", - "intervalFactor": 2, + "intervalFactor": 1, "legendFormat": "{{type}}", - "refId": "A", - "step": 10 + 
"metric": "", + "query": "histogram_quantile(0.99,(\n sum(rate(\n tikv_grpc_msg_duration_seconds_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",type!=\"kv_gc\"}\n [$__rate_interval]\n)) by (type, le) \n \n \n)) ", + "refId": "", + "step": 10, + "target": "" + }, + { + "datasource": "${DS_TEST-CLUSTER}", + "expr": "histogram_quantile(0.99,(\n sum(rate(\n tikv_grpc_msg_duration_seconds_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",type!=\"kv_gc\"}\n [$__rate_interval]\n)) by (type, priority, le) \n \n \n)) ", + "format": "time_series", + "hide": true, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{type}}-{{priority}}", + "metric": "", + "query": "histogram_quantile(0.99,(\n sum(rate(\n tikv_grpc_msg_duration_seconds_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",type!=\"kv_gc\"}\n [$__rate_interval]\n)) by (type, priority, le) \n \n \n)) ", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, "title": "99% gRPC message duration", "tooltip": { + "msResolution": true, "shared": true, "sort": 0, "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -5186,14 +6515,16 @@ }, "yaxes": [ { + "decimals": null, "format": "s", "label": null, - "logBase": 10, + "logBase": 2, "max": null, "min": null, "show": true }, { + "decimals": null, "format": "short", "label": null, "logBase": 1, @@ -5204,42 +6535,56 @@ ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } }, { "aliasColors": {}, "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", - "decimals": 1, + "description": "The average execution time of gRPC message", + "editable": true, + 
"error": false, "fieldConfig": { - "defaults": {}, - "overrides": [] + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } }, "fill": 1, - "fillGradient": 0, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, "gridPos": { - "h": 8, + "h": 7, "w": 12, "x": 12, - "y": 13 + "y": 7 }, - "hiddenSeries": false, - "id": 2532, + "height": null, + "hideTimeOverride": false, + "id": 51, + "interval": null, + "isNew": true, "legend": { "alignAsTable": true, "avg": false, "current": true, - "hideEmpty": false, + "hideEmpty": true, "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, - "sort": "current", + "sort": "max", "sortDesc": true, "total": false, "values": true @@ -5247,43 +6592,70 @@ "lines": true, "linewidth": 1, "links": [], - "nullPointMode": "null", + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", "options": { - "alertThreshold": true + "alertThreshold": true, + "dataLinks": [] }, "percentage": false, - "pluginVersion": "7.5.11", "pointradius": 5, "points": false, "renderer": "flot", + "repeat": null, + "repeatDirection": null, "seriesOverrides": [], - "spaceLength": 10, + "span": null, "stack": false, "steppedLine": false, "targets": [ { - "expr": "sum(rate(tikv_grpc_msg_duration_seconds_sum{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])) by (type) / sum(rate(tikv_grpc_msg_duration_seconds_count{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])) by (type)", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "(sum(rate(\n tikv_grpc_msg_duration_seconds_sum\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (type) / sum(rate(\n tikv_grpc_msg_duration_seconds_count\n 
{k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (type) )", "format": "time_series", + "hide": false, + "instant": false, "interval": "", - "intervalFactor": 2, + "intervalFactor": 1, "legendFormat": "{{type}}", - "refId": "A", - "step": 10 + "metric": "", + "query": "(sum(rate(\n tikv_grpc_msg_duration_seconds_sum\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (type) / sum(rate(\n tikv_grpc_msg_duration_seconds_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (type) )", + "refId": "", + "step": 10, + "target": "" + }, + { + "datasource": "${DS_TEST-CLUSTER}", + "expr": "(sum(rate(\n tikv_grpc_msg_duration_seconds_sum\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (type, priority) / sum(rate(\n tikv_grpc_msg_duration_seconds_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (type, priority) )", + "format": "time_series", + "hide": true, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{type}}-{{priority}}", + "metric": "", + "query": "(sum(rate(\n tikv_grpc_msg_duration_seconds_sum\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (type, priority) / sum(rate(\n tikv_grpc_msg_duration_seconds_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (type, priority) )", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, "title": "Average gRPC message duration", "tooltip": { + "msResolution": true, "shared": true, "sort": 0, "value_type": "individual" }, + "transformations": [], + 
"transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -5291,6 +6663,7 @@ }, "yaxes": [ { + "decimals": null, "format": "s", "label": null, "logBase": 2, @@ -5299,6 +6672,7 @@ "show": true }, { + "decimals": null, "format": "short", "label": null, "logBase": 1, @@ -5309,42 +6683,56 @@ ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } }, { "aliasColors": {}, "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", - "decimals": 1, + "description": "The 99% percentile of execution time of gRPC message", + "editable": true, + "error": false, "fieldConfig": { - "defaults": {}, - "overrides": [] + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } }, "fill": 1, - "fillGradient": 0, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, "gridPos": { - "h": 8, + "h": 7, "w": 12, "x": 0, - "y": 21 + "y": 14 }, - "hiddenSeries": false, - "id": 2533, + "height": null, + "hideTimeOverride": false, + "id": 52, + "interval": null, + "isNew": true, "legend": { "alignAsTable": true, "avg": false, "current": true, - "hideEmpty": false, + "hideEmpty": true, "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, - "sort": "current", + "sort": "max", "sortDesc": true, "total": false, "values": true @@ -5352,78 +6740,130 @@ "lines": true, "linewidth": 1, "links": [], - "nullPointMode": "null", + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", "options": { - "alertThreshold": true + "alertThreshold": true, + "dataLinks": [] }, "percentage": false, - "pluginVersion": "7.5.11", "pointradius": 5, "points": false, "renderer": "flot", + "repeat": null, + "repeatDirection": null, "seriesOverrides": [], - 
"spaceLength": 10, + "span": null, "stack": false, "steppedLine": false, "targets": [ { - "expr": "histogram_quantile(0.99, sum(rate(tikv_server_grpc_req_batch_size_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])) by (le))", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "histogram_quantile(0.99,(\n sum(rate(\n tikv_server_grpc_req_batch_size_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (le) \n \n \n)) ", "format": "time_series", + "hide": false, + "instant": false, "interval": "", - "intervalFactor": 2, + "intervalFactor": 1, "legendFormat": "99% request", - "refId": "A", - "step": 10 + "metric": "", + "query": "histogram_quantile(0.99,(\n sum(rate(\n tikv_server_grpc_req_batch_size_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (le) \n \n \n)) ", + "refId": "", + "step": 10, + "target": "" }, { - "expr": "histogram_quantile(0.99, sum(rate(tikv_server_grpc_resp_batch_size_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])) by (le))", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "histogram_quantile(0.99,(\n sum(rate(\n tikv_server_grpc_resp_batch_size_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (le) \n \n \n)) ", "format": "time_series", - "intervalFactor": 2, + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, "legendFormat": "99% response", - "refId": "B" + "metric": "", + "query": "histogram_quantile(0.99,(\n sum(rate(\n tikv_server_grpc_resp_batch_size_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (le) \n \n \n)) ", + "refId": "", + "step": 10, + "target": "" }, { - "expr": 
"sum(rate(tikv_server_grpc_req_batch_size_sum{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])) / sum(rate(tikv_server_grpc_req_batch_size_count{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m]))", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "(sum(rate(\n tikv_server_grpc_req_batch_size_sum\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) / sum(rate(\n tikv_server_grpc_req_batch_size_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) )", "format": "time_series", - "intervalFactor": 2, + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, "legendFormat": "avg request", - "refId": "C" + "metric": "", + "query": "(sum(rate(\n tikv_server_grpc_req_batch_size_sum\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) / sum(rate(\n tikv_server_grpc_req_batch_size_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) )", + "refId": "", + "step": 10, + "target": "" }, { - "expr": "sum(rate(tikv_server_grpc_resp_batch_size_sum{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])) / sum(rate(tikv_server_grpc_resp_batch_size_count{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m]))", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "(sum(rate(\n tikv_server_grpc_resp_batch_size_sum\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) / sum(rate(\n tikv_server_grpc_resp_batch_size_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) )", "format": "time_series", - "intervalFactor": 2, + "hide": false, + "instant": 
false, + "interval": "", + "intervalFactor": 1, "legendFormat": "avg response", - "refId": "D" + "metric": "", + "query": "(sum(rate(\n tikv_server_grpc_resp_batch_size_sum\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) / sum(rate(\n tikv_server_grpc_resp_batch_size_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) )", + "refId": "", + "step": 10, + "target": "" }, { - "expr": "histogram_quantile(0.99, sum(rate(tikv_server_request_batch_size_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])) by (le))", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "histogram_quantile(0.99,(\n sum(rate(\n tikv_server_request_batch_size_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (le) \n \n \n)) ", "format": "time_series", - "intervalFactor": 2, + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, "legendFormat": "99% kv get batch", - "refId": "E" + "metric": "", + "query": "histogram_quantile(0.99,(\n sum(rate(\n tikv_server_request_batch_size_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (le) \n \n \n)) ", + "refId": "", + "step": 10, + "target": "" }, { - "expr": "sum(rate(tikv_server_request_batch_size_sum{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])) / sum(rate(tikv_server_request_batch_size_count{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m]))", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "(sum(rate(\n tikv_server_request_batch_size_sum\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) / sum(rate(\n tikv_server_request_batch_size_count\n 
{k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) )", "format": "time_series", - "intervalFactor": 2, + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, "legendFormat": "avg kv batch", - "refId": "F" + "metric": "", + "query": "(sum(rate(\n tikv_server_request_batch_size_sum\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) / sum(rate(\n tikv_server_request_batch_size_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) )", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, "title": "gRPC batch size", "tooltip": { + "msResolution": true, "shared": true, "sort": 0, "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -5431,7 +6871,8 @@ }, "yaxes": [ { - "format": "short", + "decimals": null, + "format": "none", "label": null, "logBase": 1, "max": null, @@ -5439,6 +6880,7 @@ "show": true }, { + "decimals": null, "format": "short", "label": null, "logBase": 1, @@ -5449,42 +6891,56 @@ ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } }, { "aliasColors": {}, "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", - "decimals": 1, + "description": null, + "editable": true, + "error": false, "fieldConfig": { - "defaults": {}, - "overrides": [] + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } }, "fill": 1, - "fillGradient": 0, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, "gridPos": { - "h": 8, + "h": 7, "w": 12, "x": 12, - "y": 21 
+ "y": 14 }, - "hiddenSeries": false, - "id": 2534, + "height": null, + "hideTimeOverride": false, + "id": 53, + "interval": null, + "isNew": true, "legend": { "alignAsTable": true, "avg": false, "current": true, - "hideEmpty": false, + "hideEmpty": true, "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, - "sort": "current", + "sort": "max", "sortDesc": true, "total": false, "values": true @@ -5492,50 +6948,70 @@ "lines": true, "linewidth": 1, "links": [], - "nullPointMode": "null", + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", "options": { - "alertThreshold": true + "alertThreshold": true, + "dataLinks": [] }, "percentage": false, - "pluginVersion": "7.5.11", "pointradius": 5, "points": false, "renderer": "flot", + "repeat": null, + "repeatDirection": null, "seriesOverrides": [], - "spaceLength": 10, + "span": null, "stack": false, "steppedLine": false, "targets": [ { - "expr": "histogram_quantile(0.99, sum(rate(tikv_server_raft_message_batch_size_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])) by (le))", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "histogram_quantile(0.99,(\n sum(rate(\n tikv_server_raft_message_batch_size_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (le) \n \n \n)) ", "format": "time_series", + "hide": false, + "instant": false, "interval": "", - "intervalFactor": 2, + "intervalFactor": 1, "legendFormat": "99%", - "refId": "A", - "step": 10 + "metric": "", + "query": "histogram_quantile(0.99,(\n sum(rate(\n tikv_server_raft_message_batch_size_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (le) \n \n \n)) ", + "refId": "", + "step": 10, + "target": "" }, { - "expr": 
"sum(rate(tikv_server_raft_message_batch_size_sum{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])) / sum(rate(tikv_server_raft_message_batch_size_count{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m]))", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "(sum(rate(\n tikv_server_raft_message_batch_size_sum\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) / sum(rate(\n tikv_server_raft_message_batch_size_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) )", "format": "time_series", - "intervalFactor": 2, + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, "legendFormat": "avg", - "refId": "C" + "metric": "", + "query": "(sum(rate(\n tikv_server_raft_message_batch_size_sum\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) / sum(rate(\n tikv_server_raft_message_batch_size_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) )", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, "title": "raft message batch size", "tooltip": { + "msResolution": true, "shared": true, "sort": 0, "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -5543,7 +7019,8 @@ }, "yaxes": [ { - "format": "short", + "decimals": null, + "format": "none", "label": null, "logBase": 1, "max": null, @@ -5551,6 +7028,7 @@ "show": true }, { + "decimals": null, "format": "short", "label": null, "logBase": 1, @@ -5561,41 +7039,56 @@ ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } }, { "aliasColors": {}, "bars": false, - 
"dashLength": 10, - "dashes": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", - "decimals": 1, "description": "The QPS of different sources of gRPC request", + "editable": true, + "error": false, "fieldConfig": { - "defaults": {}, - "overrides": [] + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } }, "fill": 1, - "fillGradient": 0, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, "gridPos": { - "h": 8, + "h": 7, "w": 12, "x": 0, - "y": 29 + "y": 21 }, - "hiddenSeries": false, - "id": 23763572858, + "height": null, + "hideTimeOverride": false, + "id": 54, + "interval": null, + "isNew": true, "legend": { "alignAsTable": true, "avg": false, "current": true, + "hideEmpty": true, + "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, - "sort": "current", + "sort": "max", "sortDesc": true, "total": false, "values": true @@ -5603,45 +7096,55 @@ "lines": true, "linewidth": 1, "links": [], - "nullPointMode": "null", + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", "options": { - "alertThreshold": true + "alertThreshold": true, + "dataLinks": [] }, "percentage": false, - "pluginVersion": "7.5.11", "pointradius": 5, "points": false, "renderer": "flot", + "repeat": null, + "repeatDirection": null, "seriesOverrides": [], - "spaceLength": 10, + "span": null, "stack": false, "steppedLine": false, "targets": [ { - "exemplar": true, - "expr": "sum(rate(tikv_grpc_request_source_counter_vec{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])) by (source)", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_grpc_request_source_counter_vec\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (source) 
", "format": "time_series", + "hide": false, + "instant": false, "interval": "", - "intervalFactor": 2, - "legendFormat": "{{type}}", - "metric": "tikv_grpc_msg_duration_seconds_bucket", - "refId": "A", - "step": 10 + "intervalFactor": 1, + "legendFormat": "{{source}}", + "metric": "", + "query": "sum(rate(\n tikv_grpc_request_source_counter_vec\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (source) ", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, "title": "gRPC request sources QPS", "tooltip": { + "msResolution": true, "shared": true, "sort": 0, "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -5649,6 +7152,7 @@ }, "yaxes": [ { + "decimals": null, "format": "ops", "label": null, "logBase": 1, @@ -5657,6 +7161,7 @@ "show": true }, { + "decimals": null, "format": "short", "label": null, "logBase": 1, @@ -5667,41 +7172,56 @@ ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } }, { "aliasColors": {}, "bars": true, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", - "decimals": 1, "description": "The duration of different sources of gRPC request", + "editable": true, + "error": false, "fieldConfig": { - "defaults": {}, - "overrides": [] + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } }, "fill": 1, - "fillGradient": 0, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, "gridPos": { - "h": 8, + "h": 7, "w": 12, "x": 12, - "y": 29 + "y": 21 }, - "hiddenSeries": false, - "id": 23763572859, + "height": null, + "hideTimeOverride": false, + "id": 55, + "interval": null, + "isNew": 
true, "legend": { "alignAsTable": true, "avg": false, "current": true, + "hideEmpty": true, + "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, - "sort": "current", + "sort": "max", "sortDesc": true, "total": false, "values": true @@ -5709,45 +7229,55 @@ "lines": false, "linewidth": 1, "links": [], - "nullPointMode": "null", + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", "options": { - "alertThreshold": true + "alertThreshold": true, + "dataLinks": [] }, "percentage": false, - "pluginVersion": "7.5.11", "pointradius": 5, "points": false, "renderer": "flot", + "repeat": null, + "repeatDirection": null, "seriesOverrides": [], - "spaceLength": 10, + "span": null, "stack": true, "steppedLine": false, "targets": [ { - "exemplar": true, - "expr": "sum(rate(tikv_grpc_request_source_duration_vec{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])) by (source)", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_grpc_request_source_duration_vec\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (source) ", "format": "time_series", + "hide": false, + "instant": false, "interval": "", - "intervalFactor": 2, - "legendFormat": "{{type}}", - "metric": "tikv_grpc_msg_duration_seconds_bucket", - "refId": "A", - "step": 10 + "intervalFactor": 1, + "legendFormat": "{{source}}", + "metric": "", + "query": "sum(rate(\n tikv_grpc_request_source_duration_vec\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (source) ", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, "title": "gRPC request sources duration", "tooltip": { + "msResolution": true, "shared": true, "sort": 0, "value_type": "individual" }, + "transformations": [], + 
"transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -5755,7 +7285,8 @@ }, "yaxes": [ { - "format": "µs", + "decimals": null, + "format": "s", "label": null, "logBase": 1, "max": null, @@ -5763,6 +7294,7 @@ "show": true }, { + "decimals": null, "format": "short", "label": null, "logBase": 1, @@ -5773,41 +7305,56 @@ ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } }, { "aliasColors": {}, "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", - "decimals": 1, "description": "The QPS of different resource groups of gRPC request", + "editable": true, + "error": false, "fieldConfig": { - "defaults": {}, - "overrides": [] + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } }, "fill": 1, - "fillGradient": 0, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, "gridPos": { - "h": 9, + "h": 7, "w": 24, "x": 0, - "y": 37 + "y": 28 }, - "hiddenSeries": false, - "id": 23763573090, + "height": null, + "hideTimeOverride": false, + "id": 56, + "interval": null, + "isNew": true, "legend": { "alignAsTable": true, "avg": false, "current": true, + "hideEmpty": true, + "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, - "sort": "current", + "sort": "max", "sortDesc": true, "total": false, "values": true @@ -5815,45 +7362,55 @@ "lines": true, "linewidth": 1, "links": [], - "nullPointMode": "null", + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", "options": { - "alertThreshold": true + "alertThreshold": true, + "dataLinks": [] }, "percentage": false, - "pluginVersion": "7.5.11", "pointradius": 5, "points": false, "renderer": "flot", + "repeat": null, + "repeatDirection": null, "seriesOverrides": 
[], - "spaceLength": 10, + "span": null, "stack": false, "steppedLine": false, "targets": [ { - "exemplar": true, - "expr": "sum(rate(tikv_grpc_resource_group_total{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])) by (name)", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_grpc_resource_group_total\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (name) ", "format": "time_series", + "hide": false, + "instant": false, "interval": "", - "intervalFactor": 2, - "legendFormat": "{{type}}", - "metric": "tikv_grpc_msg_duration_seconds_bucket", - "refId": "A", - "step": 10 + "intervalFactor": 1, + "legendFormat": "{{name}}", + "metric": "", + "query": "sum(rate(\n tikv_grpc_resource_group_total\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (name) ", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, "title": "gRPC resource group QPS", "tooltip": { + "msResolution": true, "shared": true, "sort": 0, "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -5861,7 +7418,8 @@ }, "yaxes": [ { - "format": "ops", + "decimals": null, + "format": "none", "label": null, "logBase": 1, "max": null, @@ -5869,6 +7427,7 @@ "show": true }, { + "decimals": null, "format": "short", "label": null, "logBase": 1, @@ -5879,105 +7438,98 @@ ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } } ], "repeat": null, + "repeatDirection": null, + "span": null, + "targets": [], + "timeFrom": null, + "timeShift": null, "title": "gRPC", + "transformations": [], + "transparent": false, "type": "row" }, { + "cacheTimeout": null, "collapsed": true, "datasource": null, + "description": null, + "editable": 
true, + "error": false, + "fieldConfig": { + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } + }, "gridPos": { - "h": 1, + "h": 7, "w": 24, "x": 0, - "y": 5 + "y": 0 }, - "id": 2746, + "height": null, + "hideTimeOverride": false, + "id": 57, + "interval": null, + "links": [], + "maxDataPoints": 100, + "maxPerRow": null, + "minSpan": null, "panels": [ { - "alert": { - "conditions": [ - { - "evaluator": { - "params": [ - 1.7 - ], - "type": "gt" - }, - "operator": { - "type": "and" - }, - "query": { - "datasourceId": 1, - "model": { - "expr": "sum(rate(tikv_thread_cpu_seconds_total{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", name=~\"(raftstore|rs)_.*\"}[1m])) by (instance)", - "intervalFactor": 2, - "legendFormat": "{{instance}}", - "metric": "tikv_thread_cpu_seconds_total", - "refId": "A", - "step": 20 - }, - "params": [ - "A", - "1m", - "now" - ] - }, - "reducer": { - "params": [], - "type": "max" - }, - "type": "query" - } - ], - "executionErrorState": "alerting", - "for": "0m", - "frequency": "60s", - "handler": 1, - "message": "TiKV raftstore thread CPU usage is high", - "name": "TiKV raft store CPU alert", - "noDataState": "ok", - "notifications": [] - }, - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${DS_TEST-CLUSTER}", - "decimals": 1, + "aliasColors": {}, + "bars": false, + "cacheTimeout": null, + "datasource": "${DS_TEST-CLUSTER}", "description": "The CPU utilization of raftstore thread", "editable": true, "error": false, "fieldConfig": { - "defaults": {}, - "overrides": [] + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } + }, + "fill": 1, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" }, - "fill": 0, - "fillGradient": 0, - "grid": {}, "gridPos": { "h": 7, "w": 12, "x": 0, - "y": 5 + 
"y": 0 }, - "hiddenSeries": false, - "id": 61, + "height": null, + "hideTimeOverride": false, + "id": 58, + "interval": null, + "isNew": true, "legend": { "alignAsTable": true, "avg": false, "current": true, "hideEmpty": true, + "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, - "sort": "current", + "sort": "max", "sortDesc": true, "total": false, "values": true @@ -5985,53 +7537,55 @@ "lines": true, "linewidth": 1, "links": [], - "nullPointMode": "null", + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", "options": { - "alertThreshold": true + "alertThreshold": true, + "dataLinks": [] }, "percentage": false, - "pluginVersion": "7.5.7", "pointradius": 5, "points": false, "renderer": "flot", + "repeat": null, + "repeatDirection": null, "seriesOverrides": [], - "spaceLength": 10, + "span": null, "stack": false, "steppedLine": false, "targets": [ { - "expr": "sum(rate(tikv_thread_cpu_seconds_total{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", name=~\"(raftstore|rs)_.*\"}[1m])) by (instance)", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_thread_cpu_seconds_total\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",name=~\"(raftstore|rs)_.*\"}\n [$__rate_interval]\n)) by (instance) ", "format": "time_series", - "intervalFactor": 2, + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, "legendFormat": "{{instance}}", - "metric": "tikv_thread_cpu_seconds_total", - "refId": "A", - "step": 4 - } - ], - "thresholds": [ - { - "colorMode": "critical", - "fill": true, - "line": true, - "op": "gt", - "value": 1.7, - "visible": true + "metric": "", + "query": "sum(rate(\n tikv_thread_cpu_seconds_total\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",name=~\"(raftstore|rs)_.*\"}\n [$__rate_interval]\n)) by (instance) ", + "refId": 
"", + "step": 10, + "target": "" } ], + "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, "title": "Raft store CPU", "tooltip": { - "msResolution": false, + "msResolution": true, "shared": true, "sort": 0, "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -6039,6 +7593,7 @@ }, "yaxes": [ { + "decimals": null, "format": "percentunit", "label": null, "logBase": 1, @@ -6047,6 +7602,7 @@ "show": true }, { + "decimals": null, "format": "short", "label": null, "logBase": 1, @@ -6057,79 +7613,56 @@ ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } }, { - "alert": { - "conditions": [ - { - "evaluator": { - "params": [ - 1.8 - ], - "type": "gt" - }, - "operator": { - "type": "and" - }, - "query": { - "params": [ - "A", - "1m", - "now" - ] - }, - "reducer": { - "params": [], - "type": "max" - }, - "type": "query" - } - ], - "executionErrorState": "alerting", - "for": "0m", - "frequency": "1m", - "handler": 1, - "message": "TiKV async apply thread CPU usage is high", - "name": "TiKV async apply CPU alert", - "noDataState": "ok", - "notifications": [] - }, "aliasColors": {}, "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", - "decimals": 1, "description": "The CPU utilization of async apply", "editable": true, "error": false, "fieldConfig": { - "defaults": {}, - "overrides": [] + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } }, "fill": 1, - "fillGradient": 0, - "grid": {}, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, "gridPos": { "h": 7, "w": 12, "x": 12, - "y": 5 + "y": 0 }, - "hiddenSeries": false, - "id": 79, + "height": null, + "hideTimeOverride": false, + "id": 59, + "interval": 
null, + "isNew": true, "legend": { "alignAsTable": true, "avg": false, "current": true, + "hideEmpty": true, + "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, - "sort": "current", + "sort": "max", "sortDesc": true, "total": false, "values": true @@ -6137,53 +7670,55 @@ "lines": true, "linewidth": 1, "links": [], - "nullPointMode": "null", + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", "options": { - "alertThreshold": true + "alertThreshold": true, + "dataLinks": [] }, "percentage": false, - "pluginVersion": "7.5.7", "pointradius": 5, "points": false, "renderer": "flot", + "repeat": null, + "repeatDirection": null, "seriesOverrides": [], - "spaceLength": 10, + "span": null, "stack": false, "steppedLine": false, "targets": [ { - "expr": "sum(rate(tikv_thread_cpu_seconds_total{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", name=~\"apply_[0-9]+\"}[1m])) by (instance)", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_thread_cpu_seconds_total\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",name=~\"apply_[0-9]+\"}\n [$__rate_interval]\n)) by (instance) ", "format": "time_series", - "intervalFactor": 2, + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, "legendFormat": "{{instance}}", - "metric": "tikv_thread_cpu_seconds_total", - "refId": "A", - "step": 4 - } - ], - "thresholds": [ - { - "colorMode": "critical", - "fill": true, - "line": true, - "op": "gt", - "value": 1.8, - "visible": true + "metric": "", + "query": "sum(rate(\n tikv_thread_cpu_seconds_total\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",name=~\"apply_[0-9]+\"}\n [$__rate_interval]\n)) by (instance) ", + "refId": "", + "step": 10, + "target": "" } ], + "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, "title": "Async 
apply CPU", "tooltip": { - "msResolution": false, + "msResolution": true, "shared": true, "sort": 0, "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -6191,14 +7726,16 @@ }, "yaxes": [ { + "decimals": null, "format": "percentunit", "label": null, "logBase": 1, "max": null, - "min": "0", + "min": null, "show": true }, { + "decimals": null, "format": "short", "label": null, "logBase": 1, @@ -6209,81 +7746,65 @@ ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } }, { - "alert": { - "alertRuleTags": {}, - "conditions": [ - { - "evaluator": { - "params": [ - 0.8 - ], - "type": "gt" - }, - "operator": { - "type": "and" - }, - "query": { - "params": [ - "A", - "1m", - "now" - ] - }, - "reducer": { - "params": [], - "type": "max" - }, - "type": "query" - } - ], - "executionErrorState": "alerting", - "for": "0m", - "frequency": "60s", - "handler": 1, - "message": "TiKV store writer thread CPU usage is high", - "name": "Store writer CPU alert", - "noDataState": "ok", - "notifications": [] - }, "aliasColors": {}, "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", - "decimals": 1, "description": "The CPU utilization of store writer thread", "editable": true, "error": false, "fieldConfig": { - "defaults": {}, - "overrides": [] + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [ + { + "colorMode": "critical", + "fill": true, + "line": true, + "op": "gt", + "value": 0.8, + "yaxis": "left" + } + ] + } + } + }, + "fill": 1, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" }, - "fill": 0, - "fillGradient": 0, - "grid": {}, "gridPos": { "h": 7, "w": 12, "x": 0, - "y": 12 + "y": 7 }, - "hiddenSeries": false, - "id": 13115, + "height": null, 
+ "hideTimeOverride": false, + "id": 60, + "interval": null, + "isNew": true, "legend": { "alignAsTable": true, "avg": false, "current": true, "hideEmpty": true, + "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, - "sort": "current", + "sort": "max", "sortDesc": true, "total": false, "values": true @@ -6291,30 +7812,39 @@ "lines": true, "linewidth": 1, "links": [], - "nullPointMode": "null", + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", "options": { - "alertThreshold": true + "alertThreshold": true, + "dataLinks": [] }, "percentage": false, - "pluginVersion": "7.5.7", "pointradius": 5, "points": false, "renderer": "flot", + "repeat": null, + "repeatDirection": null, "seriesOverrides": [], - "spaceLength": 10, + "span": null, "stack": false, "steppedLine": false, "targets": [ { - "exemplar": true, - "expr": "sum(rate(tikv_thread_cpu_seconds_total{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", name=~\"store_write.*\"}[1m])) by (instance)", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_thread_cpu_seconds_total\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",name=~\"store_write.*\"}\n [$__rate_interval]\n)) by (instance) ", "format": "time_series", + "hide": false, + "instant": false, "interval": "", - "intervalFactor": 2, + "intervalFactor": 1, "legendFormat": "{{instance}}", - "metric": "tikv_thread_cpu_seconds_total", - "refId": "A", - "step": 4 + "metric": "", + "query": "sum(rate(\n tikv_thread_cpu_seconds_total\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",name=~\"store_write.*\"}\n [$__rate_interval]\n)) by (instance) ", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [ @@ -6324,22 +7854,22 @@ "line": true, "op": "gt", "value": 0.8, - "visible": true + "yaxis": "left" } ], "timeFrom": null, - "timeRegions": 
[], "timeShift": null, "title": "Store writer CPU", "tooltip": { - "msResolution": false, + "msResolution": true, "shared": true, "sort": 0, "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -6347,6 +7877,7 @@ }, "yaxes": [ { + "decimals": null, "format": "percentunit", "label": null, "logBase": 1, @@ -6355,6 +7886,7 @@ "show": true }, { + "decimals": null, "format": "short", "label": null, "logBase": 1, @@ -6365,76 +7897,56 @@ ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } }, { - "alert": { - "conditions": [ - { - "evaluator": { - "params": [ - 3.6 - ], - "type": "gt" - }, - "operator": { - "type": "and" - }, - "query": { - "params": [ - "A", - "1m", - "now" - ] - }, - "reducer": { - "params": [], - "type": "max" - }, - "type": "query" - } - ], - "executionErrorState": "alerting", - "for": "0m", - "frequency": "1m", - "handler": 1, - "message": "TiKV gRPC poll thread CPU usage is high", - "name": "TiKV gRPC poll CPU alert", - "noDataState": "ok", - "notifications": [] - }, "aliasColors": {}, "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", - "decimals": 1, "description": "The CPU utilization of gRPC", + "editable": true, + "error": false, "fieldConfig": { - "defaults": {}, - "overrides": [] + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } }, "fill": 1, - "fillGradient": 0, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, "gridPos": { "h": 7, "w": 12, "x": 12, - "y": 12 + "y": 7 }, - "hiddenSeries": false, - "id": 105, + "height": null, + "hideTimeOverride": false, + "id": 61, + "interval": null, + "isNew": true, "legend": { "alignAsTable": true, "avg": false, "current": true, + "hideEmpty": true, + 
"hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, - "sort": "current", + "sort": "max", "sortDesc": true, "total": false, "values": true @@ -6442,51 +7954,55 @@ "lines": true, "linewidth": 1, "links": [], - "nullPointMode": "null", + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", "options": { - "alertThreshold": true + "alertThreshold": true, + "dataLinks": [] }, "percentage": false, - "pluginVersion": "7.5.7", "pointradius": 5, "points": false, "renderer": "flot", + "repeat": null, + "repeatDirection": null, "seriesOverrides": [], - "spaceLength": 10, + "span": null, "stack": false, "steppedLine": false, "targets": [ { - "expr": "sum(rate(tikv_thread_cpu_seconds_total{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", name=~\"grpc.*\"}[1m])) by (instance)", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_thread_cpu_seconds_total\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",name=~\"grpc.*\"}\n [$__rate_interval]\n)) by (instance) ", "format": "time_series", - "intervalFactor": 2, + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, "legendFormat": "{{instance}}", - "refId": "A", - "step": 4 - } - ], - "thresholds": [ - { - "colorMode": "critical", - "fill": true, - "line": true, - "op": "gt", - "value": 3.6, - "visible": true + "metric": "", + "query": "sum(rate(\n tikv_thread_cpu_seconds_total\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",name=~\"grpc.*\"}\n [$__rate_interval]\n)) by (instance) ", + "refId": "", + "step": 10, + "target": "" } ], + "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, "title": "gRPC poll CPU", "tooltip": { + "msResolution": true, "shared": true, "sort": 0, "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", 
"xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -6494,6 +8010,7 @@ }, "yaxes": [ { + "decimals": null, "format": "percentunit", "label": null, "logBase": 1, @@ -6502,6 +8019,7 @@ "show": true }, { + "decimals": null, "format": "short", "label": null, "logBase": 1, @@ -6512,69 +8030,53 @@ ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } }, { - "alert": { - "conditions": [ - { - "evaluator": { - "params": [ - 3.6 - ], - "type": "gt" - }, - "operator": { - "type": "and" - }, - "query": { - "params": [ - "A", - "1m", - "now" - ] - }, - "reducer": { - "params": [], - "type": "max" - }, - "type": "query" - } - ], - "executionErrorState": "alerting", - "for": "0m", - "frequency": "1m", - "handler": 1, - "message": "TiKV scheduler worker thread CPU usage is high", - "name": "TiKV scheduler worker CPU alert", - "noDataState": "ok", - "notifications": [] - }, "aliasColors": {}, "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", - "decimals": 1, "description": "The CPU utilization of scheduler worker", "editable": true, "error": false, "fieldConfig": { - "defaults": {}, - "overrides": [] + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [ + { + "colorMode": "critical", + "fill": true, + "line": true, + "op": "gt", + "value": 3.6, + "yaxis": "left" + } + ] + } + } + }, + "fill": 1, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" }, - "fill": 0, - "fillGradient": 0, - "grid": {}, "gridPos": { "h": 7, "w": 12, "x": 0, - "y": 19 + "y": 14 }, - "hiddenSeries": false, - "id": 64, + "height": null, + "hideTimeOverride": false, + "id": 62, + "interval": null, + "isNew": true, "legend": { "alignAsTable": true, "avg": false, @@ -6586,7 +8088,7 @@ "rightSide": true, "show": true, "sideWidth": null, - "sort": "current", + "sort": 
"max", "sortDesc": true, "total": false, "values": true @@ -6594,28 +8096,39 @@ "lines": true, "linewidth": 1, "links": [], - "nullPointMode": "null", + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", "options": { - "alertThreshold": true + "alertThreshold": true, + "dataLinks": [] }, "percentage": false, - "pluginVersion": "7.5.7", "pointradius": 5, "points": false, "renderer": "flot", + "repeat": null, + "repeatDirection": null, "seriesOverrides": [], - "spaceLength": 10, + "span": null, "stack": false, "steppedLine": false, "targets": [ { - "expr": "sum(rate(tikv_thread_cpu_seconds_total{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", name=~\"sched_.*\"}[1m])) by (instance)", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_thread_cpu_seconds_total\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",name=~\"sched_.*\"}\n [$__rate_interval]\n)) by (instance) ", "format": "time_series", - "intervalFactor": 2, + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, "legendFormat": "{{instance}}", - "metric": "tikv_thread_cpu_seconds_total", - "refId": "A", - "step": 4 + "metric": "", + "query": "sum(rate(\n tikv_thread_cpu_seconds_total\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",name=~\"sched_.*\"}\n [$__rate_interval]\n)) by (instance) ", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [ @@ -6625,22 +8138,22 @@ "line": true, "op": "gt", "value": 3.6, - "visible": true + "yaxis": "left" } ], "timeFrom": null, - "timeRegions": [], "timeShift": null, "title": "Scheduler worker CPU", "tooltip": { - "msResolution": false, + "msResolution": true, "shared": true, "sort": 0, "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -6648,6 
+8161,7 @@ }, "yaxes": [ { + "decimals": null, "format": "percentunit", "label": null, "logBase": 1, @@ -6656,6 +8170,7 @@ "show": true }, { + "decimals": null, "format": "short", "label": null, "logBase": 1, @@ -6666,69 +8181,53 @@ ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } }, { - "alert": { - "conditions": [ - { - "evaluator": { - "params": [ - 3.6 - ], - "type": "gt" - }, - "operator": { - "type": "and" - }, - "query": { - "params": [ - "A", - "1m", - "now" - ] - }, - "reducer": { - "params": [], - "type": "max" - }, - "type": "query" - } - ], - "executionErrorState": "alerting", - "for": "0m", - "frequency": "1m", - "handler": 1, - "message": "TiKV Storage ReadPool thread CPU usage is high", - "name": "TiKV Storage ReadPool CPU alert", - "noDataState": "ok", - "notifications": [] - }, "aliasColors": {}, "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", - "decimals": 1, "description": "The CPU utilization of readpool", "editable": true, "error": false, "fieldConfig": { - "defaults": {}, - "overrides": [] + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [ + { + "colorMode": "critical", + "fill": true, + "line": true, + "op": "gt", + "value": 3.6, + "yaxis": "left" + } + ] + } + } + }, + "fill": 1, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" }, - "fill": 0, - "fillGradient": 0, - "grid": {}, "gridPos": { "h": 7, "w": 12, "x": 12, - "y": 19 + "y": 14 }, - "hiddenSeries": false, - "id": 1908, + "height": null, + "hideTimeOverride": false, + "id": 63, + "interval": null, + "isNew": true, "legend": { "alignAsTable": true, "avg": false, @@ -6740,7 +8239,7 @@ "rightSide": true, "show": true, "sideWidth": null, - "sort": "current", + "sort": "max", "sortDesc": true, "total": false, "values": true @@ -6748,46 +8247,69 @@ "lines": true, 
"linewidth": 1, "links": [], - "nullPointMode": "null", + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", "options": { - "alertThreshold": true + "alertThreshold": true, + "dataLinks": [] }, "percentage": false, - "pluginVersion": "7.5.7", "pointradius": 5, "points": false, "renderer": "flot", + "repeat": null, + "repeatDirection": null, "seriesOverrides": [], - "spaceLength": 10, + "span": null, "stack": false, "steppedLine": false, "targets": [ { - "expr": "sum(rate(tikv_thread_cpu_seconds_total{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", name=~\"store_read_norm.*\"}[1m])) by (instance)", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_thread_cpu_seconds_total\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",name=~\"store_read_norm.*\"}\n [$__rate_interval]\n)) by (instance) ", "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{instance}} - normal", - "metric": "tikv_thread_cpu_seconds_total", - "refId": "A", - "step": 4 + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{instance}}-normal", + "metric": "", + "query": "sum(rate(\n tikv_thread_cpu_seconds_total\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",name=~\"store_read_norm.*\"}\n [$__rate_interval]\n)) by (instance) ", + "refId": "", + "step": 10, + "target": "" }, { - "expr": "sum(rate(tikv_thread_cpu_seconds_total{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", name=~\"store_read_high.*\"}[1m])) by (instance)", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_thread_cpu_seconds_total\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",name=~\"store_read_high.*\"}\n [$__rate_interval]\n)) by (instance) ", "format": "time_series", - "intervalFactor": 2, - 
"legendFormat": "{{instance}} - high", - "metric": "tikv_thread_cpu_seconds_total", - "refId": "B", - "step": 4 + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{instance}}-high", + "metric": "", + "query": "sum(rate(\n tikv_thread_cpu_seconds_total\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",name=~\"store_read_high.*\"}\n [$__rate_interval]\n)) by (instance) ", + "refId": "", + "step": 10, + "target": "" }, { - "expr": "sum(rate(tikv_thread_cpu_seconds_total{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", name=~\"store_read_low.*\"}[1m])) by (instance)", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_thread_cpu_seconds_total\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",name=~\"store_read_low.*\"}\n [$__rate_interval]\n)) by (instance) ", "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{instance}} - low", - "metric": "tikv_thread_cpu_seconds_total", - "refId": "C", - "step": 4 + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{instance}}-low", + "metric": "", + "query": "sum(rate(\n tikv_thread_cpu_seconds_total\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",name=~\"store_read_low.*\"}\n [$__rate_interval]\n)) by (instance) ", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [ @@ -6797,22 +8319,22 @@ "line": true, "op": "gt", "value": 3.6, - "visible": true + "yaxis": "left" } ], "timeFrom": null, - "timeRegions": [], "timeShift": null, "title": "Storage ReadPool CPU", "tooltip": { - "msResolution": false, + "msResolution": true, "shared": true, "sort": 0, "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -6820,6 +8342,7 @@ }, "yaxes": 
[ { + "decimals": null, "format": "percentunit", "label": null, "logBase": 1, @@ -6828,6 +8351,7 @@ "show": true }, { + "decimals": null, "format": "short", "label": null, "logBase": 1, @@ -6838,80 +8362,65 @@ ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } }, { - "alert": { - "conditions": [ - { - "evaluator": { - "params": [ - 7.2 - ], - "type": "gt" - }, - "operator": { - "type": "and" - }, - "query": { - "params": [ - "A", - "1m", - "now" - ] - }, - "reducer": { - "params": [], - "type": "max" - }, - "type": "query" - } - ], - "executionErrorState": "alerting", - "for": "0m", - "frequency": "1m", - "handler": 1, - "message": "TiKV unified read pool thread CPU usage is high", - "name": "Unified read pool CPU alert", - "noDataState": "ok", - "notifications": [] - }, "aliasColors": {}, "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", - "decimals": 1, "description": "The CPU utilization of the unified read pool", "editable": true, "error": false, "fieldConfig": { - "defaults": {}, - "overrides": [] + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [ + { + "colorMode": "critical", + "fill": true, + "line": true, + "op": "gt", + "value": 7.2, + "yaxis": "left" + } + ] + } + } + }, + "fill": 1, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" }, - "fill": 0, - "fillGradient": 0, - "grid": {}, "gridPos": { "h": 7, "w": 12, "x": 0, - "y": 26 + "y": 21 }, - "hiddenSeries": false, - "id": 4287, + "height": null, + "hideTimeOverride": false, + "id": 64, + "interval": null, + "isNew": true, "legend": { "alignAsTable": true, "avg": false, "current": true, "hideEmpty": true, + "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, - "sort": "current", + "sort": "max", "sortDesc": true, "total": false, 
"values": true @@ -6919,28 +8428,39 @@ "lines": true, "linewidth": 1, "links": [], + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, "nullPointMode": "null as zero", "options": { - "alertThreshold": true + "alertThreshold": true, + "dataLinks": [] }, "percentage": false, - "pluginVersion": "7.5.7", "pointradius": 5, "points": false, "renderer": "flot", + "repeat": null, + "repeatDirection": null, "seriesOverrides": [], - "spaceLength": 10, + "span": null, "stack": false, "steppedLine": false, "targets": [ { - "expr": "sum(rate(tikv_thread_cpu_seconds_total{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", name=~\"unified_read_po.*\"}[1m])) by (instance)", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_thread_cpu_seconds_total\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",name=~\"unified_read_po.*\"}\n [$__rate_interval]\n)) by (instance) ", "format": "time_series", + "hide": false, + "instant": false, "interval": "", - "intervalFactor": 2, + "intervalFactor": 1, "legendFormat": "{{instance}}", - "refId": "A", - "step": 4 + "metric": "", + "query": "sum(rate(\n tikv_thread_cpu_seconds_total\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",name=~\"unified_read_po.*\"}\n [$__rate_interval]\n)) by (instance) ", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [ @@ -6950,22 +8470,22 @@ "line": true, "op": "gt", "value": 7.2, - "visible": true + "yaxis": "left" } ], "timeFrom": null, - "timeRegions": [], "timeShift": null, "title": "Unified read pool CPU", "tooltip": { - "msResolution": false, + "msResolution": true, "shared": true, "sort": 0, - "value_type": "cumulative" + "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -6973,6 +8493,7 @@ }, "yaxes": [ { + "decimals": null, "format": 
"percentunit", "label": null, "logBase": 1, @@ -6981,6 +8502,7 @@ "show": true }, { + "decimals": null, "format": "short", "label": null, "logBase": 1, @@ -6991,44 +8513,56 @@ ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } }, { "aliasColors": {}, "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", - "decimals": 1, "description": "The CPU utilization of RocksDB", "editable": true, "error": false, "fieldConfig": { - "defaults": {}, - "overrides": [] + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } + }, + "fill": 1, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" }, - "fill": 0, - "fillGradient": 0, - "grid": {}, "gridPos": { "h": 7, "w": 12, "x": 12, - "y": 26 + "y": 21 }, - "hiddenSeries": false, - "id": 69, + "height": null, + "hideTimeOverride": false, + "id": 65, + "interval": null, + "isNew": true, "legend": { "alignAsTable": true, "avg": false, "current": true, + "hideEmpty": true, + "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, - "sort": "current", + "sort": "max", "sortDesc": true, "total": false, "values": true @@ -7036,59 +8570,55 @@ "lines": true, "linewidth": 1, "links": [], - "nullPointMode": "null", + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", "options": { - "alertThreshold": true + "alertThreshold": true, + "dataLinks": [] }, "percentage": false, - "pluginVersion": "7.5.7", "pointradius": 5, "points": false, "renderer": "flot", + "repeat": null, + "repeatDirection": null, "seriesOverrides": [], - "spaceLength": 10, + "span": null, "stack": false, "steppedLine": false, "targets": [ { - "expr": "sum(rate(tikv_thread_cpu_seconds_total{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", 
instance=~\"$instance\", name=~\"rocksdb.*\"}[1m])) by (instance)", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_thread_cpu_seconds_total\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",name=~\"rocksdb.*\"}\n [$__rate_interval]\n)) by (instance) ", "format": "time_series", - "intervalFactor": 2, + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, "legendFormat": "{{instance}}", - "metric": "tikv_thread_cpu_seconds_total", - "refId": "A", - "step": 4 - } - ], - "thresholds": [ - { - "colorMode": "warning", - "fill": true, - "line": true, - "op": "gt", - "value": 1 - }, - { - "colorMode": "critical", - "fill": true, - "line": true, - "op": "gt", - "value": 4 + "metric": "", + "query": "sum(rate(\n tikv_thread_cpu_seconds_total\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",name=~\"rocksdb.*\"}\n [$__rate_interval]\n)) by (instance) ", + "refId": "", + "step": 10, + "target": "" } ], + "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, "title": "RocksDB CPU", "tooltip": { - "msResolution": false, + "msResolution": true, "shared": true, "sort": 0, "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -7096,6 +8626,7 @@ }, "yaxes": [ { + "decimals": null, "format": "percentunit", "label": null, "logBase": 1, @@ -7104,6 +8635,7 @@ "show": true }, { + "decimals": null, "format": "short", "label": null, "logBase": 1, @@ -7114,80 +8646,65 @@ ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } }, { - "alert": { - "conditions": [ - { - "evaluator": { - "params": [ - 7.2 - ], - "type": "gt" - }, - "operator": { - "type": "and" - }, - "query": { - "params": [ - "A", - "1m", - "now" - ] - }, - "reducer": { - "params": [], - "type": "max" - }, - "type": "query" - } - ], - "executionErrorState": 
"alerting", - "for": "0m", - "frequency": "1m", - "handler": 1, - "message": "TiKV Coprocessor thread CPU alert", - "name": "TiKV Coprocessor CPU alert", - "noDataState": "ok", - "notifications": [] - }, "aliasColors": {}, "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", - "decimals": 1, "description": "The CPU utilization of coprocessor", "editable": true, "error": false, "fieldConfig": { - "defaults": {}, - "overrides": [] + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [ + { + "colorMode": "critical", + "fill": true, + "line": true, + "op": "gt", + "value": 7.2, + "yaxis": "left" + } + ] + } + } + }, + "fill": 1, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" }, - "fill": 0, - "fillGradient": 0, - "grid": {}, "gridPos": { "h": 7, "w": 12, "x": 0, - "y": 33 + "y": 28 }, - "hiddenSeries": false, - "id": 78, + "height": null, + "hideTimeOverride": false, + "id": 66, + "interval": null, + "isNew": true, "legend": { "alignAsTable": true, "avg": false, "current": true, "hideEmpty": true, + "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, - "sort": "current", + "sort": "max", "sortDesc": true, "total": false, "values": true @@ -7195,46 +8712,69 @@ "lines": true, "linewidth": 1, "links": [], + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, "nullPointMode": "null as zero", "options": { - "alertThreshold": true + "alertThreshold": true, + "dataLinks": [] }, "percentage": false, - "pluginVersion": "7.5.7", "pointradius": 5, "points": false, "renderer": "flot", + "repeat": null, + "repeatDirection": null, "seriesOverrides": [], - "spaceLength": 10, + "span": null, "stack": false, "steppedLine": false, "targets": [ { - "expr": "sum(rate(tikv_thread_cpu_seconds_total{k8s_cluster=\"$k8s_cluster\", 
tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", name=~\"cop_normal.*\"}[1m])) by (instance)", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_thread_cpu_seconds_total\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",name=~\"cop_normal.*\"}\n [$__rate_interval]\n)) by (instance) ", "format": "time_series", + "hide": false, + "instant": false, "interval": "", - "intervalFactor": 2, - "legendFormat": "{{instance}} - normal", - "refId": "A", - "step": 4 + "intervalFactor": 1, + "legendFormat": "{{instance}}-normal", + "metric": "", + "query": "sum(rate(\n tikv_thread_cpu_seconds_total\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",name=~\"cop_normal.*\"}\n [$__rate_interval]\n)) by (instance) ", + "refId": "", + "step": 10, + "target": "" }, { - "expr": "sum(rate(tikv_thread_cpu_seconds_total{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", name=~\"cop_high.*\"}[1m])) by (instance)", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_thread_cpu_seconds_total\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",name=~\"cop_high.*\"}\n [$__rate_interval]\n)) by (instance) ", "format": "time_series", + "hide": false, + "instant": false, "interval": "", - "intervalFactor": 2, - "legendFormat": "{{instance}} - high", - "refId": "B", - "step": 4 + "intervalFactor": 1, + "legendFormat": "{{instance}}-high", + "metric": "", + "query": "sum(rate(\n tikv_thread_cpu_seconds_total\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",name=~\"cop_high.*\"}\n [$__rate_interval]\n)) by (instance) ", + "refId": "", + "step": 10, + "target": "" }, { - "expr": "sum(rate(tikv_thread_cpu_seconds_total{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", name=~\"cop_low.*\"}[1m])) by (instance)", + "datasource": "${DS_TEST-CLUSTER}", + 
"expr": "sum(rate(\n tikv_thread_cpu_seconds_total\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",name=~\"cop_low.*\"}\n [$__rate_interval]\n)) by (instance) ", "format": "time_series", + "hide": false, + "instant": false, "interval": "", - "intervalFactor": 2, - "legendFormat": "{{instance}} - low", - "refId": "C", - "step": 4 + "intervalFactor": 1, + "legendFormat": "{{instance}}-low", + "metric": "", + "query": "sum(rate(\n tikv_thread_cpu_seconds_total\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",name=~\"cop_low.*\"}\n [$__rate_interval]\n)) by (instance) ", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [ @@ -7244,22 +8784,22 @@ "line": true, "op": "gt", "value": 7.2, - "visible": true + "yaxis": "left" } ], "timeFrom": null, - "timeRegions": [], "timeShift": null, "title": "Coprocessor CPU", "tooltip": { - "msResolution": false, + "msResolution": true, "shared": true, "sort": 0, - "value_type": "cumulative" + "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -7267,6 +8807,7 @@ }, "yaxes": [ { + "decimals": null, "format": "percentunit", "label": null, "logBase": 1, @@ -7275,6 +8816,7 @@ "show": true }, { + "decimals": null, "format": "short", "label": null, "logBase": 1, @@ -7285,40 +8827,56 @@ ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } }, { "aliasColors": {}, "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", - "decimals": 1, + "description": null, + "editable": true, + "error": false, "fieldConfig": { - "defaults": {}, - "overrides": [] + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } }, "fill": 1, - "fillGradient": 0, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + 
"threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, "gridPos": { "h": 7, "w": 12, "x": 12, - "y": 33 + "y": 28 }, - "hiddenSeries": false, - "id": 2531, + "height": null, + "hideTimeOverride": false, + "id": 67, + "interval": null, + "isNew": true, "legend": { "alignAsTable": true, "avg": false, "current": true, + "hideEmpty": true, + "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, - "sort": "current", + "sort": "max", "sortDesc": true, "total": false, "values": true @@ -7326,42 +8884,55 @@ "lines": true, "linewidth": 1, "links": [], - "nullPointMode": "null", + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", "options": { - "alertThreshold": true + "alertThreshold": true, + "dataLinks": [] }, "percentage": false, - "pluginVersion": "7.5.7", "pointradius": 5, "points": false, "renderer": "flot", + "repeat": null, + "repeatDirection": null, "seriesOverrides": [], - "spaceLength": 10, + "span": null, "stack": false, "steppedLine": false, "targets": [ { - "expr": "sum(rate(tikv_thread_cpu_seconds_total{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", name=~\"gc_worker.*\"}[1m])) by (instance)", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_thread_cpu_seconds_total\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",name=~\"gc_worker.*\"}\n [$__rate_interval]\n)) by (instance) ", "format": "time_series", - "intervalFactor": 2, + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, "legendFormat": "{{instance}}", - "refId": "A", - "step": 4 + "metric": "", + "query": "sum(rate(\n tikv_thread_cpu_seconds_total\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",name=~\"gc_worker.*\"}\n [$__rate_interval]\n)) by (instance) ", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": 
null, - "timeRegions": [], "timeShift": null, "title": "GC worker CPU", "tooltip": { + "msResolution": true, "shared": true, "sort": 0, "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -7369,6 +8940,7 @@ }, "yaxes": [ { + "decimals": null, "format": "percentunit", "label": null, "logBase": 1, @@ -7377,6 +8949,7 @@ "show": true }, { + "decimals": null, "format": "short", "label": null, "logBase": 1, @@ -7387,42 +8960,56 @@ ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } }, { "aliasColors": {}, "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", - "decimals": 1, - "description": " \tThe CPU utilization of split check", + "description": null, "editable": true, "error": false, "fieldConfig": { - "defaults": {}, - "overrides": [] + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } + }, + "fill": 1, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" }, - "fill": 0, - "grid": {}, "gridPos": { "h": 7, "w": 12, "x": 0, - "y": 40 + "y": 35 }, + "height": null, + "hideTimeOverride": false, "id": 68, + "interval": null, + "isNew": true, "legend": { "alignAsTable": true, "avg": false, "current": true, + "hideEmpty": true, + "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, - "sort": "current", + "sort": "max", "sortDesc": true, "total": false, "values": true @@ -7430,41 +9017,55 @@ "lines": true, "linewidth": 1, "links": [], - "nullPointMode": "null", + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", + "options": { + "alertThreshold": true, + "dataLinks": [] + }, "percentage": false, - "pluginVersion": "7.5.7", "pointradius": 5, 
"points": false, "renderer": "flot", + "repeat": null, + "repeatDirection": null, "seriesOverrides": [], - "spaceLength": 10, + "span": null, "stack": false, "steppedLine": false, "targets": [ { - "expr": "sum(rate(tikv_thread_cpu_seconds_total{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", name=~\"background.*\"}[1m])) by (instance)", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_thread_cpu_seconds_total\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",name=~\"background.*\"}\n [$__rate_interval]\n)) by (instance) ", "format": "time_series", - "intervalFactor": 2, + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, "legendFormat": "{{instance}}", - "metric": "tikv_thread_cpu_seconds_total", - "refId": "A", - "step": 4 + "metric": "", + "query": "sum(rate(\n tikv_thread_cpu_seconds_total\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",name=~\"background.*\"}\n [$__rate_interval]\n)) by (instance) ", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, - "title": "BackGround Worker CPU", + "title": "Background Worker CPU", "tooltip": { - "msResolution": false, + "msResolution": true, "shared": true, "sort": 0, "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -7472,14 +9073,16 @@ }, "yaxes": [ { + "decimals": null, "format": "percentunit", "label": null, "logBase": 1, "max": null, - "min": "0", + "min": null, "show": true }, { + "decimals": null, "format": "short", "label": null, "logBase": 1, @@ -7490,42 +9093,56 @@ ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } }, { "aliasColors": {}, "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, "datasource": 
"${DS_TEST-CLUSTER}", - "decimals": 1, + "description": null, "editable": true, "error": false, "fieldConfig": { - "defaults": {}, - "overrides": [] + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } + }, + "fill": 1, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" }, - "fill": 0, - "grid": {}, "gridPos": { "h": 7, "w": 12, "x": 12, - "y": 40 + "y": 35 }, - "id": 692, + "height": null, + "hideTimeOverride": false, + "id": 69, + "interval": null, + "isNew": true, "legend": { "alignAsTable": true, "avg": false, "current": true, "hideEmpty": true, + "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, - "sort": "current", + "sort": "max", "sortDesc": true, "total": false, "values": true @@ -7533,67 +9150,55 @@ "lines": true, "linewidth": 1, "links": [], - "nullPointMode": "null", + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", + "options": { + "alertThreshold": true, + "dataLinks": [] + }, "percentage": false, - "pluginVersion": "7.5.7", "pointradius": 5, "points": false, "renderer": "flot", - "seriesOverrides": [ - { - "alias": "/import-count.*/", - "yaxis": 2 - } - ], - "spaceLength": 10, + "repeat": null, + "repeatDirection": null, + "seriesOverrides": [], + "span": null, "stack": false, "steppedLine": false, "targets": [ { - "expr": "sum(rate(tikv_thread_cpu_seconds_total{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", name=~\"sst_.*\"}[1m])) by (instance)", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_thread_cpu_seconds_total\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",name=~\"raftlog_fetch.*\"}\n [$__rate_interval]\n)) by (instance) ", "format": "time_series", "hide": false, - "intervalFactor": 2, - "legendFormat": 
"import-{{instance}}", - "metric": "tikv_thread_cpu_seconds_total", - "refId": "A", - "step": 4 - }, - { - "expr": "sum(rate(tikv_thread_cpu_seconds_total{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", name=~\"sst_.*\"}[1m])) by (instance, tid) > 0", - "format": "time_series", - "hide": true, - "intervalFactor": 2, - "legendFormat": "import-{{instance}}-{{tid}}", - "metric": "tikv_thread_cpu_seconds_total", - "refId": "C", - "step": 4 - }, - { - "expr": "count(rate(tikv_thread_cpu_seconds_total{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", name=~\"sst_.*\"}[1m])) by (instance)", - "format": "time_series", - "hide": true, - "intervalFactor": 2, - "legendFormat": "import-count-{{instance}}", - "metric": "tikv_thread_cpu_seconds_total", - "refId": "D", - "step": 4 + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{instance}}", + "metric": "", + "query": "sum(rate(\n tikv_thread_cpu_seconds_total\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",name=~\"raftlog_fetch.*\"}\n [$__rate_interval]\n)) by (instance) ", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, - "title": "Import CPU", + "title": "Raftlog fetch Worker CPU", "tooltip": { - "msResolution": false, + "msResolution": true, "shared": true, "sort": 0, "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -7601,6 +9206,7 @@ }, "yaxes": [ { + "decimals": null, "format": "percentunit", "label": null, "logBase": 1, @@ -7609,6 +9215,7 @@ "show": true }, { + "decimals": null, "format": "short", "label": null, "logBase": 1, @@ -7619,42 +9226,56 @@ ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } }, { "aliasColors": {}, "bars": false, - "dashLength": 10, - 
"dashes": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", - "decimals": 1, + "description": null, "editable": true, "error": false, "fieldConfig": { - "defaults": {}, - "overrides": [] + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } + }, + "fill": 1, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" }, - "fill": 0, - "grid": {}, "gridPos": { "h": 7, "w": 12, "x": 0, - "y": 47 + "y": 42 }, - "id": 691, + "height": null, + "hideTimeOverride": false, + "id": 70, + "interval": null, + "isNew": true, "legend": { "alignAsTable": true, "avg": false, "current": true, "hideEmpty": true, + "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, - "sort": "current", + "sort": "max", "sortDesc": true, "total": false, "values": true @@ -7662,77 +9283,55 @@ "lines": true, "linewidth": 1, "links": [], - "nullPointMode": "null", + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", + "options": { + "alertThreshold": true, + "dataLinks": [] + }, "percentage": false, - "pluginVersion": "7.5.7", "pointradius": 5, "points": false, "renderer": "flot", - "seriesOverrides": [ - { - "alias": "/backup-count.*/", - "yaxis": 2 - } - ], - "spaceLength": 10, + "repeat": null, + "repeatDirection": null, + "seriesOverrides": [], + "span": null, "stack": false, "steppedLine": false, "targets": [ { - "expr": "sum(rate(tikv_thread_cpu_seconds_total{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", name=~\"(backup-worker|bkwkr).*\"}[1m])) by (instance)", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_thread_cpu_seconds_total\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",name=~\"sst_.*\"}\n [$__rate_interval]\n)) by (instance) ", "format": 
"time_series", "hide": false, - "intervalFactor": 2, - "legendFormat": "backup-{{instance}}", - "metric": "tikv_thread_cpu_seconds_total", - "refId": "A", - "step": 4 - }, - { - "expr": "sum(rate(tikv_thread_cpu_seconds_total{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", name=~\"backup_endpoint\"}[1m])) by (instance)", - "format": "time_series", - "hide": true, - "intervalFactor": 2, - "legendFormat": "backup-endpoint", - "metric": "tikv_thread_cpu_seconds_total", - "refId": "B", - "step": 4 - }, - { - "expr": "sum(rate(tikv_thread_cpu_seconds_total{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", name=~\"(backup-worker|bkwkr).*\"}[1m])) by (instance, tid) > 0", - "format": "time_series", - "hide": true, - "intervalFactor": 2, - "legendFormat": "backup-{{instance}}-{{tid}}", - "metric": "tikv_thread_cpu_seconds_total", - "refId": "C", - "step": 4 - }, - { - "expr": "sum(tikv_backup_thread_pool_size{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}) by(instance)", - "format": "time_series", - "hide": true, - "intervalFactor": 2, - "legendFormat": "backup-count-{{instance}}", - "metric": "tikv_thread_cpu_seconds_total", - "refId": "D", - "step": 4 + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{instance}}", + "metric": "", + "query": "sum(rate(\n tikv_thread_cpu_seconds_total\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",name=~\"sst_.*\"}\n [$__rate_interval]\n)) by (instance) ", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, - "title": "Backup Worker CPU", + "title": "Import CPU", "tooltip": { - "msResolution": false, + "msResolution": true, "shared": true, "sort": 0, "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": 
"time", "name": null, "show": true, @@ -7740,6 +9339,7 @@ }, "yaxes": [ { + "decimals": null, "format": "percentunit", "label": null, "logBase": 1, @@ -7748,6 +9348,7 @@ "show": true }, { + "decimals": null, "format": "short", "label": null, "logBase": 1, @@ -7758,38 +9359,56 @@ ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } }, { "aliasColors": {}, "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", - "decimals": 1, + "description": null, + "editable": true, + "error": false, "fieldConfig": { - "defaults": {}, - "overrides": [] + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } }, "fill": 1, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, "gridPos": { "h": 7, "w": 12, "x": 12, - "y": 47 + "y": 42 }, - "id": 62, + "height": null, + "hideTimeOverride": false, + "id": 71, + "interval": null, + "isNew": true, "legend": { "alignAsTable": true, "avg": false, "current": true, + "hideEmpty": true, + "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, - "sort": "current", + "sort": "max", "sortDesc": true, "total": false, "values": true @@ -7797,53 +9416,55 @@ "lines": true, "linewidth": 1, "links": [], - "nullPointMode": "null", - "paceLength": 10, + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", + "options": { + "alertThreshold": true, + "dataLinks": [] + }, "percentage": false, - "pluginVersion": "7.5.7", "pointradius": 5, "points": false, "renderer": "flot", - "seriesOverrides": [ - { - "alias": "/.*tso/", - "yaxis": 2 - } - ], - "spaceLength": 10, + "repeat": null, + "repeatDirection": null, + "seriesOverrides": [], + "span": null, "stack": false, "steppedLine": false, "targets": [ { - "expr": 
"sum(rate(tikv_thread_cpu_seconds_total{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", name=~\"cdcwkr.*\"}[1m])) by (instance)", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{instance}} - worker", - "refId": "A", - "step": 4 - }, - { - "expr": "sum(rate(tikv_thread_cpu_seconds_total{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", name=~\"tso\"}[1m])) by (instance)", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_thread_cpu_seconds_total\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",name=~\"(backup-worker|bkwkr|backup_endpoint).*\"}\n [$__rate_interval]\n)) by (instance) ", "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{instance}} - tso", - "refId": "B", - "step": 4 + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{instance}}", + "metric": "", + "query": "sum(rate(\n tikv_thread_cpu_seconds_total\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",name=~\"(backup-worker|bkwkr|backup_endpoint).*\"}\n [$__rate_interval]\n)) by (instance) ", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, - "title": "CDC worker CPU", + "title": "Backup CPU", "tooltip": { + "msResolution": true, "shared": true, "sort": 0, "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -7851,6 +9472,7 @@ }, "yaxes": [ { + "decimals": null, "format": "percentunit", "label": null, "logBase": 1, @@ -7859,7 +9481,8 @@ "show": true }, { - "format": "percentunit", + "decimals": null, + "format": "short", "label": null, "logBase": 1, "max": null, @@ -7869,38 +9492,56 @@ ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } }, { 
"aliasColors": {}, "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", - "decimals": 1, + "description": null, + "editable": true, + "error": false, "fieldConfig": { - "defaults": {}, - "overrides": [] + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } }, "fill": 1, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, "gridPos": { "h": 7, "w": 12, "x": 0, - "y": 54 + "y": 49 }, - "id": 60, + "height": null, + "hideTimeOverride": false, + "id": 72, + "interval": null, + "isNew": true, "legend": { "alignAsTable": true, "avg": false, "current": true, + "hideEmpty": true, + "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, - "sort": "current", + "sort": "max", "sortDesc": true, "total": false, "values": true @@ -7908,40 +9549,85 @@ "lines": true, "linewidth": 1, "links": [], - "nullPointMode": "null", - "paceLength": 10, + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", + "options": { + "alertThreshold": true, + "dataLinks": [] + }, "percentage": false, - "pluginVersion": "7.5.7", "pointradius": 5, "points": false, "renderer": "flot", + "repeat": null, + "repeatDirection": null, "seriesOverrides": [], - "spaceLength": 10, + "span": null, "stack": false, "steppedLine": false, "targets": [ { - "expr": "sum(rate(tikv_thread_cpu_seconds_total{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", name=~\"cdc_.*\"}[1m])) by (instance)", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_thread_cpu_seconds_total\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",name=~\"cdcwkr.*\"}\n [$__rate_interval]\n)) by (instance) ", "format": "time_series", - "intervalFactor": 2, - 
"legendFormat": "{{instance}}", - "refId": "A", - "step": 4 + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{instance}}-worker", + "metric": "", + "query": "sum(rate(\n tikv_thread_cpu_seconds_total\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",name=~\"cdcwkr.*\"}\n [$__rate_interval]\n)) by (instance) ", + "refId": "", + "step": 10, + "target": "" + }, + { + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_thread_cpu_seconds_total\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",name=~\"tso\"}\n [$__rate_interval]\n)) by (instance) ", + "format": "time_series", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{instance}}-tso", + "metric": "", + "query": "sum(rate(\n tikv_thread_cpu_seconds_total\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",name=~\"tso\"}\n [$__rate_interval]\n)) by (instance) ", + "refId": "", + "step": 10, + "target": "" + }, + { + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_thread_cpu_seconds_total\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",name=~\"cdc_.*\"}\n [$__rate_interval]\n)) by (instance) ", + "format": "time_series", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{instance}}-endpoint", + "metric": "", + "query": "sum(rate(\n tikv_thread_cpu_seconds_total\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",name=~\"cdc_.*\"}\n [$__rate_interval]\n)) by (instance) ", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, - "title": "CDC endpoint CPU", + "title": "CDC worker CPU", "tooltip": { + "msResolution": true, "shared": true, "sort": 0, "value_type": "individual" }, + "transformations": 
[], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -7949,6 +9635,7 @@ }, "yaxes": [ { + "decimals": null, "format": "percentunit", "label": null, "logBase": 1, @@ -7957,6 +9644,7 @@ "show": true }, { + "decimals": null, "format": "short", "label": null, "logBase": 1, @@ -7967,40 +9655,56 @@ ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } }, { "aliasColors": {}, "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", - "decimals": 1, + "description": "The CPU utilization of raftstore thread", + "editable": true, + "error": false, "fieldConfig": { - "defaults": {}, - "overrides": [] + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } }, "fill": 1, - "fillGradient": 0, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, "gridPos": { "h": 7, "w": 12, "x": 12, - "y": 55 + "y": 49 }, - "hiddenSeries": false, - "id": 23763572511, + "height": null, + "hideTimeOverride": false, + "id": 73, + "interval": null, + "isNew": true, "legend": { "alignAsTable": true, "avg": false, "current": true, + "hideEmpty": true, + "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, - "sort": "current", + "sort": "max", "sortDesc": true, "total": false, "values": true @@ -8008,45 +9712,55 @@ "lines": true, "linewidth": 1, "links": [], - "nullPointMode": "null", + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", "options": { - "alertThreshold": true + "alertThreshold": true, + "dataLinks": [] }, - "paceLength": 10, "percentage": false, - "pluginVersion": "7.5.11", "pointradius": 5, "points": false, "renderer": "flot", + "repeat": null, + "repeatDirection": null, "seriesOverrides": [], - "spaceLength": 
10, + "span": null, "stack": false, "steppedLine": false, "targets": [ { - "exemplar": true, - "expr": "sum(rate(tikv_thread_cpu_seconds_total{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", name=~\"raftlog_fetch.*\"}[1m])) by (instance)", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_thread_cpu_seconds_total\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",name=~\"tso_worker\"}\n [$__rate_interval]\n)) by (instance) ", "format": "time_series", + "hide": false, + "instant": false, "interval": "", - "intervalFactor": 2, + "intervalFactor": 1, "legendFormat": "{{instance}}", - "refId": "A", - "step": 4 + "metric": "", + "query": "sum(rate(\n tikv_thread_cpu_seconds_total\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",name=~\"tso_worker\"}\n [$__rate_interval]\n)) by (instance) ", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, - "title": "Raftlog fetch Worker CPU", + "title": "TSO Worker CPU", "tooltip": { + "msResolution": true, "shared": true, "sort": 0, "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -8054,6 +9768,7 @@ }, "yaxes": [ { + "decimals": null, "format": "percentunit", "label": null, "logBase": 1, @@ -8062,6 +9777,7 @@ "show": true }, { + "decimals": null, "format": "short", "label": null, "logBase": 1, @@ -8072,44 +9788,98 @@ ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } - }, + } + ], + "repeat": null, + "repeatDirection": null, + "span": null, + "targets": [], + "timeFrom": null, + "timeShift": null, + "title": "Thread CPU", + "transformations": [], + "transparent": false, + "type": "row" + }, + { + "cacheTimeout": null, + "collapsed": true, + "datasource": null, + "description": null, + 
"editable": true, + "error": false, + "fieldConfig": { + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } + }, + "gridPos": { + "h": 7, + "w": 24, + "x": 0, + "y": 0 + }, + "height": null, + "hideTimeOverride": false, + "id": 74, + "interval": null, + "links": [], + "maxDataPoints": 100, + "maxPerRow": null, + "minSpan": null, + "panels": [ { "aliasColors": {}, "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", - "decimals": 1, - "description": "The CPU utilization of TSO worker", + "description": null, "editable": true, "error": false, "fieldConfig": { - "defaults": {}, - "overrides": [] + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } + }, + "fill": 1, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" }, - "fill": 0, - "fillGradient": 0, - "grid": {}, "gridPos": { "h": 7, "w": 12, "x": 0, - "y": 61 + "y": 0 }, - "hiddenSeries": false, - "id": 9962, + "height": null, + "hideTimeOverride": false, + "id": 75, + "interval": null, + "isNew": true, "legend": { "alignAsTable": true, "avg": false, "current": true, + "hideEmpty": true, + "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, - "sort": "current", + "sort": "max", "sortDesc": true, "total": false, "values": true @@ -8117,63 +9887,188 @@ "lines": true, "linewidth": 1, "links": [], - "nullPointMode": "null", + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", "options": { - "alertThreshold": true + "alertThreshold": true, + "dataLinks": [] }, "percentage": false, - "pluginVersion": "7.5.7", "pointradius": 5, "points": false, "renderer": "flot", + "repeat": null, + "repeatDirection": null, "seriesOverrides": [], - "spaceLength": 10, + "span": null, "stack": false, "steppedLine": 
false, "targets": [ { - "exemplar": true, - "expr": "sum(rate(tikv_thread_cpu_seconds_total{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", name=~\"tso_worker\"}[1m])) by (instance)", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_ttl_expire_kv_count_total\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (instance) ", "format": "time_series", + "hide": false, + "instant": false, "interval": "", - "intervalFactor": 2, + "intervalFactor": 1, "legendFormat": "{{instance}}", - "metric": "tikv_thread_cpu_seconds_total", - "refId": "A", - "step": 4 + "metric": "", + "query": "sum(rate(\n tikv_ttl_expire_kv_count_total\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (instance) ", + "refId": "", + "step": 10, + "target": "" } ], - "thresholds": [ + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "TTL expire count", + "tooltip": { + "msResolution": true, + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "transformations": [], + "transparent": false, + "type": "graph", + "xaxis": { + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ { - "colorMode": "warning", - "fill": true, - "line": true, - "op": "gt", - "value": 0.3, - "yaxis": "left" + "decimals": null, + "format": "none", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true }, { - "colorMode": "critical", - "fill": true, - "line": true, - "op": "gt", - "value": 0.8, - "yaxis": "left" + "decimals": null, + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": 0 + } + }, + { + "aliasColors": {}, + "bars": false, + "cacheTimeout": null, + "datasource": "${DS_TEST-CLUSTER}", + "description": null, + "editable": true, + "error": false, + 
"fieldConfig": { + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } + }, + "fill": 1, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, + "gridPos": { + "h": 7, + "w": 12, + "x": 12, + "y": 0 + }, + "height": null, + "hideTimeOverride": false, + "id": 76, + "interval": null, + "isNew": true, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": true, + "hideZero": true, + "max": true, + "min": false, + "rightSide": true, + "show": true, + "sideWidth": null, + "sort": "max", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", + "options": { + "alertThreshold": true, + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "repeatDirection": null, + "seriesOverrides": [], + "span": null, + "stack": false, + "steppedLine": false, + "targets": [ + { + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_ttl_expire_kv_size_total\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (instance) ", + "format": "time_series", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{instance}}", + "metric": "", + "query": "sum(rate(\n tikv_ttl_expire_kv_size_total\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (instance) ", + "refId": "", + "step": 10, + "target": "" } ], + "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, - "title": "TSO Worker CPU", + "title": "TTL expire size", "tooltip": { - "msResolution": false, + "msResolution": true, "shared": true, 
"sort": 0, "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -8181,7 +10076,8 @@ }, "yaxes": [ { - "format": "percentunit", + "decimals": null, + "format": "bytes", "label": null, "logBase": 1, "max": null, @@ -8189,6 +10085,7 @@ "show": true }, { + "decimals": null, "format": "short", "label": null, "logBase": 1, @@ -8199,93 +10096,112 @@ ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } - } - ], - "repeat": null, - "title": "Thread CPU", - "type": "row" - }, - { - "collapsed": true, - "datasource": null, - "gridPos": { - "h": 1, - "w": 24, - "x": 0, - "y": 6 - }, - "id": 6946, - "panels": [ + }, { "aliasColors": {}, "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", - "decimals": 1, - "description": "", + "description": null, "editable": true, "error": false, + "fieldConfig": { + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } + }, "fill": 1, - "grid": {}, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, "gridPos": { "h": 7, "w": 12, "x": 0, - "y": 6 + "y": 7 }, - "id": 6985, + "height": null, + "hideTimeOverride": false, + "id": 77, + "interval": null, + "isNew": true, "legend": { "alignAsTable": true, "avg": false, "current": true, + "hideEmpty": true, + "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, - "sort": "current", + "sort": "max", "sortDesc": true, "total": false, "values": true }, "lines": true, - "linewidth": 2, + "linewidth": 1, "links": [], + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, "nullPointMode": "null as zero", + "options": { + "alertThreshold": true, + "dataLinks": [] + }, "percentage": false, "pointradius": 5, 
"points": false, "renderer": "flot", + "repeat": null, + "repeatDirection": null, "seriesOverrides": [], - "spaceLength": 10, + "span": null, "stack": false, "steppedLine": false, "targets": [ { - "expr": "sum(tikv_ttl_checker_processed_regions{instance=~\"$instance\"}) by (instance) / sum(tikv_raftstore_region_count{instance=~\"$instance\", type=\"region\"}) by (instance)", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "(sum(rate(\n tikv_ttl_checker_processed_regions\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (instance) / sum(rate(\n tikv_raftstore_region_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",type=\"region\"}\n [$__rate_interval]\n)) by (instance) )", "format": "time_series", - "intervalFactor": 10, + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, "legendFormat": "{{instance}}", - "refId": "E" + "metric": "", + "query": "(sum(rate(\n tikv_ttl_checker_processed_regions\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (instance) / sum(rate(\n tikv_raftstore_region_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",type=\"region\"}\n [$__rate_interval]\n)) by (instance) )", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, "title": "TTL check progress", "tooltip": { - "msResolution": false, + "msResolution": true, "shared": true, - "sort": 2, - "value_type": "cumulative" + "sort": 0, + "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -8293,14 +10209,16 @@ }, "yaxes": [ { + "decimals": null, "format": "percentunit", - "label": "", + "label": null, "logBase": 1, "max": null, - "min": "0", + "min": null, "show": 
true }, { + "decimals": null, "format": "short", "label": null, "logBase": 1, @@ -8311,77 +10229,112 @@ ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } }, { "aliasColors": {}, "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", - "decimals": 1, - "description": "", + "description": null, "editable": true, "error": false, + "fieldConfig": { + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } + }, "fill": 1, - "grid": {}, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, "gridPos": { "h": 7, "w": 12, "x": 12, - "y": 6 + "y": 7 }, - "id": 6987, + "height": null, + "hideTimeOverride": false, + "id": 78, + "interval": null, + "isNew": true, "legend": { "alignAsTable": true, "avg": false, "current": true, + "hideEmpty": true, + "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, - "sort": "current", + "sort": "max", "sortDesc": true, "total": false, "values": true }, "lines": true, - "linewidth": 2, + "linewidth": 1, "links": [], + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, "nullPointMode": "null as zero", + "options": { + "alertThreshold": true, + "dataLinks": [] + }, "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", + "repeat": null, + "repeatDirection": null, "seriesOverrides": [], - "spaceLength": 10, + "span": null, "stack": false, "steppedLine": false, "targets": [ { - "expr": "sum(rate(tikv_ttl_checker_actions{instance=~\"$instance\"}[30s])) by (type)", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_ttl_checker_actions\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (type) ", "format": "time_series", - "intervalFactor": 2, + "hide": false, + "instant": 
false, + "interval": "", + "intervalFactor": 1, "legendFormat": "{{type}}", - "refId": "E" + "metric": "", + "query": "sum(rate(\n tikv_ttl_checker_actions\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (type) ", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, "title": "TTL checker actions", "tooltip": { - "msResolution": false, + "msResolution": true, "shared": true, - "sort": 2, - "value_type": "cumulative" + "sort": 0, + "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -8389,14 +10342,16 @@ }, "yaxes": [ { + "decimals": null, "format": "ops", - "label": "", + "label": null, "logBase": 1, "max": null, - "min": "0", + "min": null, "show": true }, { + "decimals": null, "format": "short", "label": null, "logBase": 1, @@ -8407,35 +10362,56 @@ ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } }, { "aliasColors": {}, "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", - "decimals": 1, "description": "The time consumed when executing GC tasks", + "editable": true, + "error": false, + "fieldConfig": { + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } + }, "fill": 1, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, "gridPos": { "h": 7, "w": 12, "x": 0, - "y": 13 + "y": 14 }, - "id": 6986, + "height": null, + "hideTimeOverride": false, + "id": 79, + "interval": null, + "isNew": true, "legend": { "alignAsTable": true, "avg": false, "current": true, + "hideEmpty": true, + "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, - 
"sort": "current", + "sort": "max", "sortDesc": true, "total": false, "values": true @@ -8443,62 +10419,123 @@ "lines": true, "linewidth": 1, "links": [], + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, "nullPointMode": "null as zero", + "options": { + "alertThreshold": true, + "dataLinks": [] + }, "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", - "seriesOverrides": [], - "spaceLength": 10, + "repeat": null, + "repeatDirection": null, + "seriesOverrides": [ + { + "alias": "count", + "bars": false, + "dashLength": 1, + "dashes": true, + "fill": 2, + "fillBelowTo": null, + "lines": true, + "spaceLength": 1, + "transform": "negative-Y", + "yaxis": 2, + "zindex": -3 + }, + { + "alias": "avg", + "bars": false, + "fill": 7, + "fillBelowTo": null, + "lines": true, + "yaxis": 1, + "zindex": 0 + } + ], + "span": null, "stack": false, "steppedLine": false, "targets": [ { - "expr": "histogram_quantile(1, sum(rate(tikv_ttl_checker_compact_duration_bucket{instance=~\"$instance\"}[1m])) by (le))", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "histogram_quantile(0.9999,(\n sum(rate(\n tikv_ttl_checker_compact_duration_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (le) \n \n \n)) ", "format": "time_series", - "intervalFactor": 2, - "legendFormat": "max", - "metric": "tikv_storage_command_total", - "refId": "A", - "step": 4 + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "99.99%", + "metric": "", + "query": "histogram_quantile(0.9999,(\n sum(rate(\n tikv_ttl_checker_compact_duration_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (le) \n \n \n)) ", + "refId": "", + "step": 10, + "target": "" }, { - "expr": "histogram_quantile(0.99, sum(rate(tikv_ttl_checker_compact_duration_bucket{instance=~\"$instance\"}[1m])) by (le))", + "datasource": 
"${DS_TEST-CLUSTER}", + "expr": "histogram_quantile(0.99,(\n sum(rate(\n tikv_ttl_checker_compact_duration_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (le) \n \n \n)) ", "format": "time_series", - "intervalFactor": 2, + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, "legendFormat": "99%", - "metric": "tikv_storage_gc_skipped_counter", - "refId": "B", - "step": 4 + "metric": "", + "query": "histogram_quantile(0.99,(\n sum(rate(\n tikv_ttl_checker_compact_duration_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (le) \n \n \n)) ", + "refId": "", + "step": 10, + "target": "" }, { - "expr": "histogram_quantile(0.95, sum(rate(tikv_ttl_checker_compact_duration_bucket{instance=~\"$instance\"}[1m])) by (le))", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "(sum(rate(\n tikv_ttl_checker_compact_duration_sum\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) / sum(rate(\n tikv_ttl_checker_compact_duration_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) )", "format": "time_series", - "intervalFactor": 2, - "legendFormat": "95%", - "refId": "C" + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "avg", + "metric": "", + "query": "(sum(rate(\n tikv_ttl_checker_compact_duration_sum\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) / sum(rate(\n tikv_ttl_checker_compact_duration_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) )", + "refId": "", + "step": 10, + "target": "" }, { - "expr": "sum(rate(tikv_ttl_checker_compact_duration_sum{instance=~\"$instance\"}[1m])) / 
sum(rate(tikv_ttl_checker_compact_duration_count{instance=~\"$instance\"}[1m]))", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_ttl_checker_compact_duration_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) ", "format": "time_series", - "intervalFactor": 2, - "legendFormat": "average", - "refId": "D" + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "count", + "metric": "", + "query": "sum(rate(\n tikv_ttl_checker_compact_duration_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) ", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, "title": "TTL checker compact duration", "tooltip": { + "msResolution": true, "shared": true, "sort": 0, "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -8506,6 +10543,7 @@ }, "yaxes": [ { + "decimals": null, "format": "s", "label": null, "logBase": 1, @@ -8514,6 +10552,7 @@ "show": true }, { + "decimals": null, "format": "short", "label": null, "logBase": 1, @@ -8524,144 +10563,173 @@ ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } }, { "cacheTimeout": null, - "colorBackground": false, - "colorValue": false, - "colors": [ - "#299c46", - "rgba(237, 129, 40, 0.89)", - "#d44a3a" - ], "datasource": "${DS_TEST-CLUSTER}", - "description": "", - "format": "ms", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": false, - "thresholdLabels": false, - "thresholdMarkers": true + "description": null, + "editable": true, + "error": false, + "fieldConfig": { + "defaults": { + "custom": {}, + "decimals": null, + "mappings": null, + "noValue": "none", + "thresholds": { + "mode": "absolute", + "steps": "" + }, + "unit": "ms" + 
}, + "overrides": [] }, "gridPos": { "h": 7, "w": 12, "x": 12, - "y": 13 + "y": 14 }, - "id": 7326, + "height": null, + "hideTimeOverride": false, + "id": 80, "interval": null, "links": [], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], "maxDataPoints": 100, - "nullPointMode": "connected", - "nullText": null, - "pluginVersion": "6.1.6", - "postfix": "", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": false, - "lineColor": "rgb(31, 120, 193)", - "show": true + "maxPerRow": null, + "minSpan": null, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto" }, - "tableColumn": "", + "repeat": null, + "repeatDirection": null, + "span": null, "targets": [ { - "expr": "max(tikv_ttl_checker_poll_interval{instance=~\"$instance\"})", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "max((\n tikv_ttl_checker_poll_interval\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",type=\"tikv_gc_run_interval\"}\n \n)) ", "format": "time_series", - "intervalFactor": 2, - "legendFormat": "", - "metric": "tikv_storage_command_total", - "refId": "A", - "step": 4 + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": null, + "metric": "", + "query": "max((\n tikv_ttl_checker_poll_interval\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",type=\"tikv_gc_run_interval\"}\n \n)) ", + "refId": "", + "step": 10, + "target": "" } ], - "thresholds": "", "timeFrom": null, "timeShift": null, "title": "TTL checker poll interval", - "type": "singlestat", - 
"valueFontSize": "80%", - "valueMaps": [ - { - "op": "=", - "text": "N/A", - "value": "null" - } - ], - "valueName": "current" + "transformations": [], + "transparent": false, + "type": "stat" } ], "repeat": null, + "repeatDirection": null, + "span": null, + "targets": [], + "timeFrom": null, + "timeShift": null, "title": "TTL", + "transformations": [], + "transparent": false, "type": "row" }, { + "cacheTimeout": null, "collapsed": true, "datasource": null, + "description": null, + "editable": true, + "error": false, + "fieldConfig": { + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } + }, "gridPos": { - "h": 1, + "h": 7, "w": 24, "x": 0, - "y": 7 + "y": 0 }, - "id": 2747, + "height": null, + "hideTimeOverride": false, + "id": 81, + "interval": null, + "links": [], + "maxDataPoints": 100, + "maxPerRow": null, + "minSpan": null, "panels": [ { "aliasColors": {}, "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", - "decimals": 1, "description": "The count of requests that TiKV sends to PD", + "editable": true, + "error": false, "fieldConfig": { - "defaults": {}, - "overrides": [] + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } }, "fill": 1, - "fillGradient": 0, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, "gridPos": { - "h": 8, + "h": 7, "w": 12, "x": 0, - "y": 7 + "y": 0 }, - "hiddenSeries": false, - "id": 1069, + "height": null, + "hideTimeOverride": false, + "id": 82, + "interval": null, + "isNew": true, "legend": { "alignAsTable": true, "avg": false, "current": true, + "hideEmpty": true, + "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, - "sort": "current", + "sort": "max", "sortDesc": true, "total": false, "values": true @@ -8669,42 +10737,55 @@ "lines": true, 
"linewidth": 1, "links": [], - "nullPointMode": "null", + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", "options": { - "alertThreshold": true + "alertThreshold": true, + "dataLinks": [] }, "percentage": false, - "pluginVersion": "7.5.7", "pointradius": 5, "points": false, "renderer": "flot", + "repeat": null, + "repeatDirection": null, "seriesOverrides": [], - "spaceLength": 10, + "span": null, "stack": false, "steppedLine": false, "targets": [ { - "expr": "sum(rate(tikv_pd_request_duration_seconds_count{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])) by (type)", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_pd_request_duration_seconds_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (type) ", "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{ type }}", - "refId": "A", - "step": 4 + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{type}}", + "metric": "", + "query": "sum(rate(\n tikv_pd_request_duration_seconds_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (type) ", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, "title": "PD requests", "tooltip": { + "msResolution": true, "shared": true, "sort": 0, "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -8712,7 +10793,8 @@ }, "yaxes": [ { - "format": "short", + "decimals": null, + "format": "ops", "label": null, "logBase": 1, "max": null, @@ -8720,6 +10802,7 @@ "show": true }, { + "decimals": null, "format": "short", "label": null, "logBase": 1, @@ -8730,41 +10813,56 @@ ], "yaxis": { "align": 
false, - "alignLevel": null + "alignLevel": 0 } }, { "aliasColors": {}, "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", - "decimals": 1, "description": "The time consumed by requests that TiKV sends to PD", + "editable": true, + "error": false, "fieldConfig": { - "defaults": {}, - "overrides": [] + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } }, "fill": 1, - "fillGradient": 0, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, "gridPos": { - "h": 8, + "h": 7, "w": 12, "x": 12, - "y": 7 + "y": 0 }, - "hiddenSeries": false, - "id": 1070, + "height": null, + "hideTimeOverride": false, + "id": 83, + "interval": null, + "isNew": true, "legend": { "alignAsTable": true, "avg": false, "current": true, + "hideEmpty": true, + "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, - "sort": "current", + "sort": "max", "sortDesc": true, "total": false, "values": true @@ -8772,42 +10870,55 @@ "lines": true, "linewidth": 1, "links": [], - "nullPointMode": "null", + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", "options": { - "alertThreshold": true + "alertThreshold": true, + "dataLinks": [] }, "percentage": false, - "pluginVersion": "7.5.7", "pointradius": 5, "points": false, "renderer": "flot", + "repeat": null, + "repeatDirection": null, "seriesOverrides": [], - "spaceLength": 10, + "span": null, "stack": false, "steppedLine": false, "targets": [ { - "expr": "sum(rate(tikv_pd_request_duration_seconds_sum{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])) by (type) / sum(rate(tikv_pd_request_duration_seconds_count{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])) by (type)", + 
"datasource": "${DS_TEST-CLUSTER}", + "expr": "(sum(rate(\n tikv_pd_request_duration_seconds_sum\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (type) / sum(rate(\n tikv_pd_request_duration_seconds_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (type) )", "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{ type }}", - "refId": "A", - "step": 4 + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{type}}", + "metric": "", + "query": "(sum(rate(\n tikv_pd_request_duration_seconds_sum\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (type) / sum(rate(\n tikv_pd_request_duration_seconds_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (type) )", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, "title": "PD request duration (average)", "tooltip": { + "msResolution": true, "shared": true, "sort": 0, "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -8815,6 +10926,7 @@ }, "yaxes": [ { + "decimals": null, "format": "s", "label": null, "logBase": 1, @@ -8823,6 +10935,7 @@ "show": true }, { + "decimals": null, "format": "short", "label": null, "logBase": 1, @@ -8833,41 +10946,56 @@ ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } }, { "aliasColors": {}, "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", - "decimals": 1, - "description": " \tThe total number of PD heartbeat messages", + "description": "The total number of PD heartbeat messages", + 
"editable": true, + "error": false, "fieldConfig": { - "defaults": {}, - "overrides": [] + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } }, "fill": 1, - "fillGradient": 0, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, "gridPos": { - "h": 8, + "h": 7, "w": 12, "x": 0, - "y": 15 + "y": 7 }, - "hiddenSeries": false, - "id": 1215, + "height": null, + "hideTimeOverride": false, + "id": 84, + "interval": null, + "isNew": true, "legend": { "alignAsTable": true, "avg": false, "current": true, + "hideEmpty": true, + "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, - "sort": "current", + "sort": "max", "sortDesc": true, "total": false, "values": true @@ -8875,54 +11003,70 @@ "lines": true, "linewidth": 1, "links": [], - "nullPointMode": "null", + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", "options": { - "alertThreshold": true + "alertThreshold": true, + "dataLinks": [] }, "percentage": false, - "pluginVersion": "7.5.7", "pointradius": 5, "points": false, "renderer": "flot", - "seriesOverrides": [ - { - "alias": "pending", - "yaxis": 2 - } - ], - "spaceLength": 10, + "repeat": null, + "repeatDirection": null, + "seriesOverrides": [], + "span": null, "stack": false, "steppedLine": false, "targets": [ { - "expr": "sum(rate(tikv_pd_heartbeat_message_total{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])) by (type)", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_pd_heartbeat_message_total\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (type) ", "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{ type }}", - "refId": "A", - "step": 4 + "hide": false, + "instant": false, + 
"interval": "", + "intervalFactor": 1, + "legendFormat": "{{type}}", + "metric": "", + "query": "sum(rate(\n tikv_pd_heartbeat_message_total\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (type) ", + "refId": "", + "step": 10, + "target": "" }, { - "expr": "tikv_pd_pending_heartbeat_total{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum((\n tikv_pd_pending_heartbeat_total\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n \n)) by (instance) ", "format": "time_series", + "hide": false, + "instant": false, + "interval": "", "intervalFactor": 1, - "legendFormat": "pending", - "refId": "B" + "legendFormat": "{{instance}}-pending", + "metric": "", + "query": "sum((\n tikv_pd_pending_heartbeat_total\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n \n)) by (instance) ", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, "title": "PD heartbeats", "tooltip": { + "msResolution": true, "shared": true, "sort": 0, "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -8930,6 +11074,7 @@ }, "yaxes": [ { + "decimals": null, "format": "ops", "label": null, "logBase": 1, @@ -8938,51 +11083,67 @@ "show": true }, { + "decimals": null, "format": "short", "label": null, "logBase": 1, "max": null, - "min": "0", + "min": null, "show": true } ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } }, { "aliasColors": {}, "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", - "decimals": 1, "description": "The total number of peers validated by the PD worker", + "editable": true, + 
"error": false, "fieldConfig": { - "defaults": {}, - "overrides": [] + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } }, "fill": 1, - "fillGradient": 0, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, "gridPos": { - "h": 8, + "h": 7, "w": 12, "x": 12, - "y": 15 + "y": 7 }, - "hiddenSeries": false, - "id": 1396, + "height": null, + "hideTimeOverride": false, + "id": 85, + "interval": null, + "isNew": true, "legend": { "alignAsTable": true, "avg": false, "current": true, + "hideEmpty": true, + "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, - "sort": "current", + "sort": "max", "sortDesc": true, "total": false, "values": true @@ -8990,43 +11151,55 @@ "lines": true, "linewidth": 1, "links": [], - "nullPointMode": "null", + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", "options": { - "alertThreshold": true + "alertThreshold": true, + "dataLinks": [] }, "percentage": false, - "pluginVersion": "7.5.7", "pointradius": 5, "points": false, "renderer": "flot", + "repeat": null, + "repeatDirection": null, "seriesOverrides": [], - "spaceLength": 10, + "span": null, "stack": false, "steppedLine": false, "targets": [ { - "expr": "sum(rate(tikv_pd_validate_peer_total{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])) by (type)", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_pd_validate_peer_total\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (type) ", "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{ type }}", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{type}}", "metric": "", - "refId": "A", - "step": 4 + "query": 
"sum(rate(\n tikv_pd_validate_peer_total\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (type) ", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, "title": "PD validate peers", "tooltip": { + "msResolution": true, "shared": true, "sort": 0, "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -9034,6 +11207,7 @@ }, "yaxes": [ { + "decimals": null, "format": "ops", "label": null, "logBase": 1, @@ -9042,6 +11216,7 @@ "show": true }, { + "decimals": null, "format": "short", "label": null, "logBase": 1, @@ -9052,41 +11227,56 @@ ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } }, { "aliasColors": {}, "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", - "decimals": 1, - "description": " \tThe count of reconnections between TiKV and PD", + "description": "The count of reconnection between TiKV and PD", + "editable": true, + "error": false, "fieldConfig": { - "defaults": {}, - "overrides": [] + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } }, "fill": 1, - "fillGradient": 0, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, "gridPos": { - "h": 8, + "h": 7, "w": 12, "x": 0, - "y": 23 + "y": 14 }, - "hiddenSeries": false, - "id": 7985, + "height": null, + "hideTimeOverride": false, + "id": 86, + "interval": null, + "isNew": true, "legend": { "alignAsTable": true, "avg": false, "current": true, + "hideEmpty": true, + "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, - "sort": "current", + "sort": "max", "sortDesc": true, "total": 
false, "values": true @@ -9094,42 +11284,55 @@ "lines": true, "linewidth": 1, "links": [], - "nullPointMode": "null", + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", "options": { - "alertThreshold": true + "alertThreshold": true, + "dataLinks": [] }, "percentage": false, - "pluginVersion": "7.5.7", "pointradius": 5, "points": false, "renderer": "flot", + "repeat": null, + "repeatDirection": null, "seriesOverrides": [], - "spaceLength": 10, + "span": null, "stack": false, "steppedLine": false, "targets": [ { - "expr": "sum(delta(tikv_pd_reconnect_total{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])) by (type)", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(delta(\n tikv_pd_reconnect_total\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (type) ", "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{ type }}", - "refId": "A", - "step": 4 + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{type}}", + "metric": "", + "query": "sum(delta(\n tikv_pd_reconnect_total\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (type) ", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, - "title": "PD reconnections", + "title": "PD reconnection", "tooltip": { + "msResolution": true, "shared": true, "sort": 0, "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -9137,6 +11340,7 @@ }, "yaxes": [ { + "decimals": null, "format": "opm", "label": null, "logBase": 1, @@ -9145,7 +11349,8 @@ "show": true }, { - "format": "opm", + "decimals": null, + "format": "short", "label": null, "logBase": 1, "max": 
null, @@ -9155,42 +11360,56 @@ ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } }, { "aliasColors": {}, "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", - "decimals": 1, - "description": " \tThe forward status of PD client", + "description": "The forward status of PD client", + "editable": true, + "error": false, "fieldConfig": { - "defaults": {}, - "overrides": [] + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } }, "fill": 1, - "fillGradient": 0, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, "gridPos": { - "h": 8, + "h": 7, "w": 12, "x": 12, - "y": 23 + "y": 14 }, - "hiddenSeries": false, - "id": 8376, + "height": null, + "hideTimeOverride": false, + "id": 87, + "interval": null, + "isNew": true, "legend": { "alignAsTable": true, "avg": false, "current": true, + "hideEmpty": true, "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, - "sort": "current", + "sort": "max", "sortDesc": true, "total": false, "values": true @@ -9198,42 +11417,55 @@ "lines": true, "linewidth": 1, "links": [], - "nullPointMode": "null", + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", "options": { - "alertThreshold": true + "alertThreshold": true, + "dataLinks": [] }, "percentage": false, - "pluginVersion": "7.5.7", "pointradius": 5, "points": false, "renderer": "flot", + "repeat": null, + "repeatDirection": null, "seriesOverrides": [], - "spaceLength": 10, + "span": null, "stack": false, "steppedLine": false, "targets": [ { - "expr": "tikv_pd_request_forwarded{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "((\n tikv_pd_request_forwarded\n 
{k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n \n)) ", "format": "time_series", - "intervalFactor": 2, + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, "legendFormat": "{{instance}}-{{host}}", - "refId": "A", - "step": 4 + "metric": "", + "query": "((\n tikv_pd_request_forwarded\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n \n)) ", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, "title": "PD forward status", "tooltip": { + "msResolution": true, "shared": true, "sort": 0, "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -9241,7 +11473,8 @@ }, "yaxes": [ { - "format": "short", + "decimals": null, + "format": "none", "label": null, "logBase": 1, "max": null, @@ -9249,7 +11482,8 @@ "show": true }, { - "format": "opm", + "decimals": null, + "format": "short", "label": null, "logBase": 1, "max": null, @@ -9259,44 +11493,56 @@ ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } }, { "aliasColors": {}, "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", - "decimals": 1, "description": "The number of TSO requests waiting in the queue.", "editable": true, "error": false, "fieldConfig": { - "defaults": {}, - "overrides": [] + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } + }, + "fill": 1, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" }, - "fill": 0, - "fillGradient": 0, - "grid": {}, "gridPos": { "h": 7, "w": 12, "x": 0, - "y": 31 + "y": 21 }, - "hiddenSeries": false, - "id": 9963, + "height": null, + "hideTimeOverride": false, + 
"id": 88, + "interval": null, + "isNew": true, "legend": { "alignAsTable": true, "avg": false, "current": true, + "hideEmpty": true, + "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, - "sort": "current", + "sort": "max", "sortDesc": true, "total": false, "values": true @@ -9304,46 +11550,55 @@ "lines": true, "linewidth": 1, "links": [], - "nullPointMode": "null", + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", "options": { - "alertThreshold": true + "alertThreshold": true, + "dataLinks": [] }, "percentage": false, - "pluginVersion": "7.5.7", "pointradius": 5, "points": false, "renderer": "flot", + "repeat": null, + "repeatDirection": null, "seriesOverrides": [], - "spaceLength": 10, + "span": null, "stack": false, "steppedLine": false, "targets": [ { - "exemplar": true, - "expr": "tikv_pd_pending_tso_request_total{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum((\n tikv_pd_pending_tso_request_total\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n \n)) by (instance) ", "format": "time_series", + "hide": false, + "instant": false, "interval": "", - "intervalFactor": 2, + "intervalFactor": 1, "legendFormat": "{{instance}}", - "metric": "tikv_thread_cpu_seconds_total", - "refId": "A", - "step": 4 + "metric": "", + "query": "sum((\n tikv_pd_pending_tso_request_total\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n \n)) by (instance) ", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, "title": "Pending TSO Requests", "tooltip": { - "msResolution": false, + "msResolution": true, "shared": true, "sort": 0, "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": 
null, "mode": "time", "name": null, "show": true, @@ -9351,7 +11606,8 @@ }, "yaxes": [ { - "format": "none", + "decimals": null, + "format": "opm", "label": null, "logBase": 1, "max": null, @@ -9359,6 +11615,7 @@ "show": true }, { + "decimals": null, "format": "short", "label": null, "logBase": 1, @@ -9369,42 +11626,56 @@ ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } }, { "aliasColors": {}, "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", - "decimals": 1, "description": "The slow score of stores", + "editable": true, + "error": false, "fieldConfig": { - "defaults": {}, - "overrides": [] + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } }, "fill": 1, - "fillGradient": 0, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, "gridPos": { "h": 7, "w": 12, "x": 12, - "y": 31 + "y": 21 }, - "hiddenSeries": false, - "id": 10365, + "height": null, + "hideTimeOverride": false, + "id": 89, + "interval": null, + "isNew": true, "legend": { "alignAsTable": true, "avg": false, "current": true, + "hideEmpty": true, "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, - "sort": "current", + "sort": "max", "sortDesc": true, "total": false, "values": true @@ -9412,44 +11683,55 @@ "lines": true, "linewidth": 1, "links": [], - "nullPointMode": "null", + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", "options": { - "alertThreshold": true + "alertThreshold": true, + "dataLinks": [] }, "percentage": false, - "pluginVersion": "7.5.7", "pointradius": 5, "points": false, "renderer": "flot", + "repeat": null, + "repeatDirection": null, "seriesOverrides": [], - "spaceLength": 10, + "span": null, "stack": false, "steppedLine": false, "targets": [ { - 
"exemplar": true, - "expr": "tikv_raftstore_slow_score{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum((\n tikv_raftstore_slow_score\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n \n)) by (instance) ", "format": "time_series", + "hide": false, + "instant": false, "interval": "", - "intervalFactor": 2, + "intervalFactor": 1, "legendFormat": "{{instance}}", - "refId": "A", - "step": 4 + "metric": "", + "query": "sum((\n tikv_raftstore_slow_score\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n \n)) by (instance) ", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, "title": "Store Slow Score", "tooltip": { + "msResolution": true, "shared": true, "sort": 0, "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -9457,7 +11739,8 @@ }, "yaxes": [ { - "format": "short", + "decimals": null, + "format": "none", "label": null, "logBase": 1, "max": null, @@ -9465,7 +11748,8 @@ "show": true }, { - "format": "opm", + "decimals": null, + "format": "short", "label": null, "logBase": 1, "max": null, @@ -9475,88 +11759,112 @@ ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } }, { "aliasColors": {}, "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", "description": "The duration that recorded by inspecting messages.", "editable": true, "error": false, "fieldConfig": { - "defaults": {}, - "overrides": [] + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } }, "fill": 1, - "fillGradient": 0, - "grid": {}, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + 
"threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, "gridPos": { "h": 7, - "w": 12, + "w": 24, "x": 0, - "y": 38 + "y": 28 }, - "hiddenSeries": false, - "id": 10366, + "height": null, + "hideTimeOverride": false, + "id": 90, + "interval": null, + "isNew": true, "legend": { "alignAsTable": true, "avg": false, "current": true, + "hideEmpty": true, + "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, - "sort": "current", + "sideWidth": null, + "sort": "max", "sortDesc": true, "total": false, "values": true }, "lines": true, - "linewidth": 2, + "linewidth": 1, "links": [], - "nullPointMode": "null", + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", "options": { - "alertThreshold": true + "alertThreshold": true, + "dataLinks": [] }, "percentage": false, - "pluginVersion": "7.5.7", "pointradius": 5, "points": false, "renderer": "flot", + "repeat": null, + "repeatDirection": null, "seriesOverrides": [], - "spaceLength": 10, + "span": null, "stack": false, "steppedLine": false, "targets": [ { - "exemplar": true, - "expr": "histogram_quantile(0.99, sum(rate(tikv_raftstore_inspect_duration_seconds_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])) by (le, instance, type))", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "histogram_quantile(0.99,(\n sum(rate(\n tikv_raftstore_inspect_duration_seconds_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (instance, type, le) \n \n \n)) ", "format": "time_series", + "hide": false, + "instant": false, "interval": "", - "intervalFactor": 2, - "legendFormat": "{{instance}} - {{type}}", - "refId": "A", - "step": 4 + "intervalFactor": 1, + "legendFormat": "{{instance}}-{{type}}", + "metric": "", + "query": "histogram_quantile(0.99,(\n sum(rate(\n tikv_raftstore_inspect_duration_seconds_bucket\n 
{k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (instance, type, le) \n \n \n)) ", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, "title": "Inspected duration per server", "tooltip": { - "msResolution": false, + "msResolution": true, "shared": true, "sort": 0, - "value_type": "cumulative" + "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -9564,6 +11872,7 @@ }, "yaxes": [ { + "decimals": null, "format": "s", "label": null, "logBase": 1, @@ -9572,6 +11881,7 @@ "show": true }, { + "decimals": null, "format": "short", "label": null, "logBase": 1, @@ -9582,58 +11892,98 @@ ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } } ], "repeat": null, + "repeatDirection": null, + "span": null, + "targets": [], + "timeFrom": null, + "timeShift": null, "title": "PD", + "transformations": [], + "transparent": false, "type": "row" }, { + "cacheTimeout": null, "collapsed": true, "datasource": null, + "description": null, + "editable": true, + "error": false, + "fieldConfig": { + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } + }, "gridPos": { - "h": 1, + "h": 7, "w": 24, "x": 0, - "y": 8 + "y": 0 }, - "id": 5265, + "height": null, + "hideTimeOverride": false, + "id": 91, + "interval": null, + "links": [], + "maxDataPoints": 100, + "maxPerRow": null, + "minSpan": null, "panels": [ { "aliasColors": {}, "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", "description": "The throughput of disk write per IO type", "editable": true, "error": false, "fieldConfig": { - "defaults": {}, - "overrides": [] + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } }, "fill": 1, - "fillGradient": 0, - 
"grid": {}, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, "gridPos": { "h": 7, "w": 12, "x": 0, - "y": 32 + "y": 0 }, - "hiddenSeries": false, - "id": 5993, + "height": null, + "hideTimeOverride": false, + "id": 92, + "interval": null, + "isNew": true, "legend": { "alignAsTable": true, "avg": false, "current": true, + "hideEmpty": true, + "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, - "sort": "current", + "sideWidth": null, + "sort": "max", "sortDesc": true, "total": false, "values": true @@ -9641,51 +11991,70 @@ "lines": true, "linewidth": 1, "links": [], - "nullPointMode": "null", + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", "options": { - "alertThreshold": true + "alertThreshold": true, + "dataLinks": [] }, "percentage": false, - "pluginVersion": "7.5.7", "pointradius": 5, "points": false, "renderer": "flot", + "repeat": null, + "repeatDirection": null, "seriesOverrides": [], - "spaceLength": 10, + "span": null, "stack": false, "steppedLine": false, "targets": [ { - "expr": "sum(rate(tikv_io_bytes{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", op=\"write\"}[45s])) by (type)", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_io_bytes\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",op=\"write\"}\n [$__rate_interval]\n)) by (type) ", "format": "time_series", + "hide": false, + "instant": false, + "interval": "", "intervalFactor": 1, "legendFormat": "{{type}}", - "refId": "A", - "step": 4 + "metric": "", + "query": "sum(rate(\n tikv_io_bytes\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",op=\"write\"}\n [$__rate_interval]\n)) by (type) ", + "refId": "", + "step": 10, + "target": "" }, { - "expr": 
"sum(rate(tikv_io_bytes{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", op=\"write\"}[45s]))", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_io_bytes\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",op=\"write\"}\n [$__rate_interval]\n)) ", "format": "time_series", + "hide": false, + "instant": false, + "interval": "", "intervalFactor": 1, "legendFormat": "total", - "refId": "B", - "step": 4 + "metric": "", + "query": "sum(rate(\n tikv_io_bytes\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",op=\"write\"}\n [$__rate_interval]\n)) ", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, "title": "Write IO bytes", "tooltip": { - "msResolution": false, + "msResolution": true, "shared": true, "sort": 0, - "value_type": "cumulative" + "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -9693,14 +12062,16 @@ }, "yaxes": [ { - "format": "Bps", - "label": "", + "decimals": null, + "format": "binBps", + "label": null, "logBase": 1, "max": null, - "min": 0, + "min": null, "show": true }, { + "decimals": null, "format": "short", "label": null, "logBase": 1, @@ -9711,42 +12082,56 @@ ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } }, { "aliasColors": {}, "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", "description": "The throughput of disk read per IO type", "editable": true, "error": false, "fieldConfig": { - "defaults": {}, - "overrides": [] + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } }, "fill": 1, - "fillGradient": 0, - "grid": {}, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 
0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, "gridPos": { "h": 7, "w": 12, "x": 12, - "y": 32 + "y": 0 }, - "hiddenSeries": false, - "id": 5994, + "height": null, + "hideTimeOverride": false, + "id": 93, + "interval": null, + "isNew": true, "legend": { "alignAsTable": true, "avg": false, "current": true, + "hideEmpty": true, + "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, - "sort": "current", + "sideWidth": null, + "sort": "max", "sortDesc": true, "total": false, "values": true @@ -9754,51 +12139,70 @@ "lines": true, "linewidth": 1, "links": [], - "nullPointMode": "null", + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", "options": { - "alertThreshold": true + "alertThreshold": true, + "dataLinks": [] }, "percentage": false, - "pluginVersion": "7.5.7", "pointradius": 5, "points": false, "renderer": "flot", + "repeat": null, + "repeatDirection": null, "seriesOverrides": [], - "spaceLength": 10, + "span": null, "stack": false, "steppedLine": false, "targets": [ { - "expr": "sum(rate(tikv_io_bytes{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", op=\"read\"}[45s])) by (type)", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_io_bytes\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",op=\"read\"}\n [$__rate_interval]\n)) by (type) ", "format": "time_series", + "hide": false, + "instant": false, + "interval": "", "intervalFactor": 1, "legendFormat": "{{type}}", - "refId": "A", - "step": 4 + "metric": "", + "query": "sum(rate(\n tikv_io_bytes\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",op=\"read\"}\n [$__rate_interval]\n)) by (type) ", + "refId": "", + "step": 10, + "target": "" }, { - "expr": "sum(rate(tikv_io_bytes{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", 
op=\"read\"}[45s]))", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_io_bytes\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",op=\"read\"}\n [$__rate_interval]\n)) ", "format": "time_series", + "hide": false, + "instant": false, + "interval": "", "intervalFactor": 1, "legendFormat": "total", - "refId": "B", - "step": 4 + "metric": "", + "query": "sum(rate(\n tikv_io_bytes\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",op=\"read\"}\n [$__rate_interval]\n)) ", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, "title": "Read IO bytes", "tooltip": { - "msResolution": false, + "msResolution": true, "shared": true, "sort": 0, - "value_type": "cumulative" + "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -9806,14 +12210,16 @@ }, "yaxes": [ { - "format": "Bps", - "label": "", + "decimals": null, + "format": "binBps", + "label": null, "logBase": 1, "max": null, - "min": 0, + "min": null, "show": true }, { + "decimals": null, "format": "short", "label": null, "logBase": 1, @@ -9824,42 +12230,56 @@ ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } }, { "aliasColors": {}, "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", "description": "The threshold of disk IOs per priority", "editable": true, "error": false, "fieldConfig": { - "defaults": {}, - "overrides": [] + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } }, "fill": 1, - "fillGradient": 0, - "grid": {}, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, "gridPos": { "h": 7, "w": 12, "x": 0, 
- "y": 39 + "y": 7 }, - "hiddenSeries": false, - "id": 5995, + "height": null, + "hideTimeOverride": false, + "id": 94, + "interval": null, + "isNew": true, "legend": { "alignAsTable": true, "avg": false, "current": true, + "hideEmpty": true, + "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, - "sort": "current", + "sideWidth": null, + "sort": "max", "sortDesc": true, "total": false, "values": true @@ -9867,43 +12287,55 @@ "lines": true, "linewidth": 1, "links": [], - "nullPointMode": "null", + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", "options": { - "alertThreshold": true + "alertThreshold": true, + "dataLinks": [] }, "percentage": false, - "pluginVersion": "7.5.7", "pointradius": 5, "points": false, "renderer": "flot", + "repeat": null, + "repeatDirection": null, "seriesOverrides": [], - "spaceLength": 10, + "span": null, "stack": false, "steppedLine": false, "targets": [ { - "expr": "avg(tikv_rate_limiter_max_bytes_per_sec{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}) by (type)", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "avg((\n tikv_rate_limiter_max_bytes_per_sec\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n \n)) by (type) ", "format": "time_series", + "hide": false, + "instant": false, + "interval": "", "intervalFactor": 1, "legendFormat": "{{type}}", - "refId": "A", - "step": 4 + "metric": "", + "query": "avg((\n tikv_rate_limiter_max_bytes_per_sec\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n \n)) by (type) ", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, "title": "IO threshold", "tooltip": { - "msResolution": false, + "msResolution": true, "shared": true, "sort": 0, - "value_type": "cumulative" + "value_type": "individual" }, + "transformations": [], + 
"transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -9911,14 +12343,16 @@ }, "yaxes": [ { - "format": "Bps", - "label": "", + "decimals": null, + "format": "binBps", + "label": null, "logBase": 1, "max": null, - "min": 0, + "min": null, "show": true }, { + "decimals": null, "format": "short", "label": null, "logBase": 1, @@ -9929,41 +12363,55 @@ ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } }, { "aliasColors": {}, "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", "description": "IO rate limiter request wait duration.", "editable": true, "error": false, "fieldConfig": { - "defaults": {}, - "overrides": [] + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } }, "fill": 1, - "fillGradient": 0, - "grid": {}, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, "gridPos": { "h": 7, "w": 12, "x": 12, - "y": 39 + "y": 7 }, - "hiddenSeries": false, - "id": 7225, + "height": null, + "hideTimeOverride": false, + "id": 95, + "interval": null, + "isNew": true, "legend": { "alignAsTable": true, "avg": false, "current": true, + "hideEmpty": true, + "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, + "sideWidth": null, "sort": "max", "sortDesc": true, "total": false, @@ -9972,50 +12420,70 @@ "lines": true, "linewidth": 1, "links": [], - "nullPointMode": "null", + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", "options": { - "alertThreshold": true + "alertThreshold": true, + "dataLinks": [] }, "percentage": false, - "pluginVersion": "7.5.7", "pointradius": 5, "points": false, "renderer": "flot", + "repeat": null, + "repeatDirection": null, "seriesOverrides": [], - "spaceLength": 10, + "span": null, 
"stack": false, "steppedLine": false, "targets": [ { - "expr": "histogram_quantile(0.99, sum(rate(tikv_rate_limiter_request_wait_duration_seconds_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[30s])) by (le, type))", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "histogram_quantile(0.99,(\n sum(rate(\n tikv_rate_limiter_request_wait_duration_seconds_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (type, le) \n \n \n)) ", "format": "time_series", + "hide": false, + "instant": false, + "interval": "", "intervalFactor": 1, - "legendFormat": " {{type}}-99%", - "refId": "A", - "step": 4 + "legendFormat": "{{type}}-99%", + "metric": "", + "query": "histogram_quantile(0.99,(\n sum(rate(\n tikv_rate_limiter_request_wait_duration_seconds_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (type, le) \n \n \n)) ", + "refId": "", + "step": 10, + "target": "" }, { - "expr": "rate(tikv_rate_limiter_request_wait_duration_seconds_sum{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[30s]) / rate(tikv_rate_limiter_request_wait_duration_seconds_count{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[30s])", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "(sum(rate(\n tikv_rate_limiter_request_wait_duration_seconds_sum\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) / sum(rate(\n tikv_rate_limiter_request_wait_duration_seconds_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) )", "format": "time_series", + "hide": false, + "instant": false, + "interval": "", "intervalFactor": 1, - "legendFormat": " {{type}}-avg", - "refId": "B" + "legendFormat": "avg", + "metric": "", + "query": 
"(sum(rate(\n tikv_rate_limiter_request_wait_duration_seconds_sum\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) / sum(rate(\n tikv_rate_limiter_request_wait_duration_seconds_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) )", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, "title": "Rate Limiter Request Wait Duration", "tooltip": { - "msResolution": false, + "msResolution": true, "shared": true, "sort": 0, - "value_type": "cumulative" + "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -10023,6 +12491,7 @@ }, "yaxes": [ { + "decimals": null, "format": "s", "label": null, "logBase": 1, @@ -10031,6 +12500,7 @@ "show": true }, { + "decimals": null, "format": "short", "label": null, "logBase": 1, @@ -10041,75 +12511,128 @@ ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } } ], + "repeat": null, + "repeatDirection": null, + "span": null, + "targets": [], + "timeFrom": null, + "timeShift": null, "title": "IO Breakdown", + "transformations": [], + "transparent": false, "type": "row" }, { + "cacheTimeout": null, "collapsed": true, "datasource": null, + "description": null, + "editable": true, + "error": false, + "fieldConfig": { + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } + }, "gridPos": { - "h": 1, + "h": 7, "w": 24, "x": 0, - "y": 9 + "y": 0 }, - "id": 13117, + "height": null, + "hideTimeOverride": false, + "id": 96, + "interval": null, + "links": [], + "maxDataPoints": 100, + "maxPerRow": null, + "minSpan": null, "panels": [ { - "aliasColors": { - "99%": "#eab839", - "999%": "dark-red", - "count": "rgb(33, 250, 2)" - }, + "aliasColors": {}, "bars": false, - "dashLength": 10, - 
"dashes": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", "description": "The time consumed by processing asynchronous write requests", + "editable": true, + "error": false, "fieldConfig": { - "defaults": {}, - "overrides": [] + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } + }, + "fill": 1, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" }, - "fill": 2, - "fillGradient": 0, "gridPos": { - "h": 9, + "h": 7, "w": 24, "x": 0, - "y": 10 + "y": 0 }, - "hiddenSeries": false, - "id": 13132, + "height": null, + "hideTimeOverride": false, + "id": 97, + "interval": null, + "isNew": true, "legend": { + "alignAsTable": true, "avg": false, - "current": false, - "max": false, + "current": true, + "hideEmpty": true, + "hideZero": true, + "max": true, "min": false, + "rightSide": true, "show": true, + "sideWidth": null, + "sort": "max", + "sortDesc": true, "total": false, - "values": false + "values": true }, "lines": true, "linewidth": 1, - "nullPointMode": "null", + "links": [], + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", "options": { - "alertThreshold": true + "alertThreshold": true, + "dataLinks": [] }, "percentage": false, - "pluginVersion": "7.5.11", - "pointradius": 2, + "pointradius": 5, "points": false, "renderer": "flot", + "repeat": null, + "repeatDirection": null, "seriesOverrides": [ { "alias": "count", + "bars": false, "dashLength": 1, "dashes": true, "fill": 2, + "fillBelowTo": null, + "lines": true, "spaceLength": 1, "transform": "negative-Y", "yaxis": 2, @@ -10117,60 +12640,93 @@ }, { "alias": "avg", - "fill": 7 + "bars": false, + "fill": 7, + "fillBelowTo": null, + "lines": true, + "yaxis": 1, + "zindex": 0 } ], - "spaceLength": 10, + "span": null, "stack": false, "steppedLine": false, "targets": [ { - "exemplar": true, - "expr": 
"histogram_quantile(0.999, sum(rate(tikv_storage_engine_async_request_duration_seconds_bucket{instance=~\"$instance\", type=\"write\"}[30s])) by (le))", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "histogram_quantile(0.9999,(\n sum(rate(\n tikv_storage_engine_async_request_duration_seconds_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",type=\"write\"}\n [$__rate_interval]\n)) by (le) \n \n \n)) ", + "format": "time_series", "hide": false, + "instant": false, "interval": "", - "legendFormat": "999%", - "refId": "A" + "intervalFactor": 1, + "legendFormat": "99.99%", + "metric": "", + "query": "histogram_quantile(0.9999,(\n sum(rate(\n tikv_storage_engine_async_request_duration_seconds_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",type=\"write\"}\n [$__rate_interval]\n)) by (le) \n \n \n)) ", + "refId": "", + "step": 10, + "target": "" }, { - "exemplar": true, - "expr": "histogram_quantile(0.99, sum(rate(tikv_storage_engine_async_request_duration_seconds_bucket{instance=~\"$instance\", type=\"write\"}[30s])) by (le))", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "histogram_quantile(0.99,(\n sum(rate(\n tikv_storage_engine_async_request_duration_seconds_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",type=\"write\"}\n [$__rate_interval]\n)) by (le) \n \n \n)) ", + "format": "time_series", "hide": false, + "instant": false, "interval": "", + "intervalFactor": 1, "legendFormat": "99%", - "refId": "B" + "metric": "", + "query": "histogram_quantile(0.99,(\n sum(rate(\n tikv_storage_engine_async_request_duration_seconds_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",type=\"write\"}\n [$__rate_interval]\n)) by (le) \n \n \n)) ", + "refId": "", + "step": 10, + "target": "" }, { - "exemplar": true, - "expr": 
"sum(rate(tikv_storage_engine_async_request_duration_seconds_sum{instance=~\"$instance\", type=\"write\"}[30s])) / sum(rate(tikv_storage_engine_async_request_duration_seconds_count{instance=~\"$instance\", type=\"write\"}[30s]))", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "(sum(rate(\n tikv_storage_engine_async_request_duration_seconds_sum\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",type=\"write\"}\n [$__rate_interval]\n)) / sum(rate(\n tikv_storage_engine_async_request_duration_seconds_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",type=\"write\"}\n [$__rate_interval]\n)) )", + "format": "time_series", "hide": false, + "instant": false, "interval": "", + "intervalFactor": 1, "legendFormat": "avg", - "refId": "C" + "metric": "", + "query": "(sum(rate(\n tikv_storage_engine_async_request_duration_seconds_sum\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",type=\"write\"}\n [$__rate_interval]\n)) / sum(rate(\n tikv_storage_engine_async_request_duration_seconds_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",type=\"write\"}\n [$__rate_interval]\n)) )", + "refId": "", + "step": 10, + "target": "" }, { - "exemplar": true, - "expr": "sum(rate(tikv_storage_engine_async_request_duration_seconds_count{instance=~\"$instance\", type=\"write\"}[30s]))", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_storage_engine_async_request_duration_seconds_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",type=\"write\"}\n [$__rate_interval]\n)) ", + "format": "time_series", "hide": false, "instant": false, "interval": "", + "intervalFactor": 1, "legendFormat": "count", - "refId": "D" + "metric": "", + "query": "sum(rate(\n tikv_storage_engine_async_request_duration_seconds_count\n 
{k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",type=\"write\"}\n [$__rate_interval]\n)) ", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, "title": "Storage async write duration", "tooltip": { + "msResolution": true, "shared": true, "sort": 0, "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -10178,6 +12734,7 @@ }, "yaxes": [ { + "decimals": null, "format": "s", "label": null, "logBase": 1, @@ -10186,7 +12743,8 @@ "show": true }, { - "format": "short", + "decimals": null, + "format": "none", "label": null, "logBase": 1, "max": null, @@ -10196,60 +12754,86 @@ ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } }, { - "aliasColors": { - "99%": "#eab839", - "999%": "dark-red", - "count": "rgb(33, 250, 2)" - }, + "aliasColors": {}, "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", "description": "The store time duration of each request", + "editable": true, + "error": false, "fieldConfig": { - "defaults": {}, - "overrides": [] + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } + }, + "fill": 1, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" }, - "fill": 2, - "fillGradient": 0, "gridPos": { - "h": 8, + "h": 7, "w": 12, "x": 0, - "y": 19 + "y": 7 }, - "hiddenSeries": false, - "id": 13257, + "height": null, + "hideTimeOverride": false, + "id": 98, + "interval": null, + "isNew": true, "legend": { + "alignAsTable": true, "avg": false, - "current": false, - "max": false, + "current": true, + "hideEmpty": true, + "hideZero": true, + "max": true, "min": false, + "rightSide": true, "show": true, + "sideWidth": 
null, + "sort": "max", + "sortDesc": true, "total": false, - "values": false + "values": true }, "lines": true, "linewidth": 1, - "nullPointMode": "null", + "links": [], + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", "options": { - "alertThreshold": true + "alertThreshold": true, + "dataLinks": [] }, "percentage": false, - "pluginVersion": "7.5.11", - "pointradius": 2, + "pointradius": 5, "points": false, "renderer": "flot", + "repeat": null, + "repeatDirection": null, "seriesOverrides": [ { "alias": "count", + "bars": false, "dashLength": 1, "dashes": true, "fill": 2, + "fillBelowTo": null, + "lines": true, "spaceLength": 1, "transform": "negative-Y", "yaxis": 2, @@ -10257,60 +12841,93 @@ }, { "alias": "avg", - "fill": 7 + "bars": false, + "fill": 7, + "fillBelowTo": null, + "lines": true, + "yaxis": 1, + "zindex": 0 } ], - "spaceLength": 10, + "span": null, "stack": false, "steppedLine": false, "targets": [ { - "exemplar": true, - "expr": "histogram_quantile(0.999, sum(rate(tikv_raftstore_store_duration_secs_bucket{instance=~\"$instance\"}[30s])) by (le))", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "histogram_quantile(0.9999,(\n sum(rate(\n tikv_raftstore_store_duration_secs_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (le) \n \n \n)) ", + "format": "time_series", "hide": false, + "instant": false, "interval": "", - "legendFormat": "999%", - "refId": "A" + "intervalFactor": 1, + "legendFormat": "99.99%", + "metric": "", + "query": "histogram_quantile(0.9999,(\n sum(rate(\n tikv_raftstore_store_duration_secs_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (le) \n \n \n)) ", + "refId": "", + "step": 10, + "target": "" }, { - "exemplar": true, - "expr": "histogram_quantile(0.99, 
sum(rate(tikv_raftstore_store_duration_secs_bucket{instance=~\"$instance\"}[30s])) by (le))", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "histogram_quantile(0.99,(\n sum(rate(\n tikv_raftstore_store_duration_secs_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (le) \n \n \n)) ", + "format": "time_series", "hide": false, + "instant": false, "interval": "", + "intervalFactor": 1, "legendFormat": "99%", - "refId": "B" + "metric": "", + "query": "histogram_quantile(0.99,(\n sum(rate(\n tikv_raftstore_store_duration_secs_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (le) \n \n \n)) ", + "refId": "", + "step": 10, + "target": "" }, { - "exemplar": true, - "expr": "sum(rate(tikv_raftstore_store_duration_secs_sum{instance=~\"$instance\"}[30s])) / sum(rate(tikv_raftstore_store_duration_secs_count{instance=~\"$instance\"}[30s]))", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "(sum(rate(\n tikv_raftstore_store_duration_secs_sum\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) / sum(rate(\n tikv_raftstore_store_duration_secs_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) )", + "format": "time_series", "hide": false, + "instant": false, "interval": "", + "intervalFactor": 1, "legendFormat": "avg", - "refId": "C" + "metric": "", + "query": "(sum(rate(\n tikv_raftstore_store_duration_secs_sum\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) / sum(rate(\n tikv_raftstore_store_duration_secs_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) )", + "refId": "", + "step": 10, + "target": "" }, { - "exemplar": true, - "expr": 
"sum(rate(tikv_raftstore_store_duration_secs_count{instance=~\"$instance\"}[30s]))", - "hide": true, + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_raftstore_store_duration_secs_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) ", + "format": "time_series", + "hide": false, "instant": false, "interval": "", + "intervalFactor": 1, "legendFormat": "count", - "refId": "D" + "metric": "", + "query": "sum(rate(\n tikv_raftstore_store_duration_secs_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) ", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, "title": "Store duration", "tooltip": { + "msResolution": true, "shared": true, "sort": 0, "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -10318,6 +12935,7 @@ }, "yaxes": [ { + "decimals": null, "format": "s", "label": null, "logBase": 1, @@ -10326,6 +12944,7 @@ "show": true }, { + "decimals": null, "format": "short", "label": null, "logBase": 1, @@ -10336,60 +12955,86 @@ ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } }, { - "aliasColors": { - "99%": "#eab839", - "999%": "dark-red", - "count": "rgb(33, 250, 2)" - }, + "aliasColors": {}, "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", "description": "The apply time duration of each request", + "editable": true, + "error": false, "fieldConfig": { - "defaults": {}, - "overrides": [] + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } + }, + "fill": 1, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 
112, 0.22)" }, - "fill": 2, - "fillGradient": 0, "gridPos": { - "h": 8, + "h": 7, "w": 12, "x": 12, - "y": 19 + "y": 7 }, - "hiddenSeries": false, - "id": 13259, + "height": null, + "hideTimeOverride": false, + "id": 99, + "interval": null, + "isNew": true, "legend": { + "alignAsTable": true, "avg": false, - "current": false, - "max": false, + "current": true, + "hideEmpty": true, + "hideZero": true, + "max": true, "min": false, + "rightSide": true, "show": true, + "sideWidth": null, + "sort": "max", + "sortDesc": true, "total": false, - "values": false + "values": true }, "lines": true, "linewidth": 1, - "nullPointMode": "null", + "links": [], + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", "options": { - "alertThreshold": true + "alertThreshold": true, + "dataLinks": [] }, "percentage": false, - "pluginVersion": "7.5.11", - "pointradius": 2, + "pointradius": 5, "points": false, "renderer": "flot", + "repeat": null, + "repeatDirection": null, "seriesOverrides": [ { "alias": "count", + "bars": false, "dashLength": 1, "dashes": true, "fill": 2, + "fillBelowTo": null, + "lines": true, "spaceLength": 1, "transform": "negative-Y", "yaxis": 2, @@ -10397,60 +13042,93 @@ }, { "alias": "avg", - "fill": 7 + "bars": false, + "fill": 7, + "fillBelowTo": null, + "lines": true, + "yaxis": 1, + "zindex": 0 } ], - "spaceLength": 10, + "span": null, "stack": false, "steppedLine": false, "targets": [ { - "exemplar": true, - "expr": "histogram_quantile(0.999, sum(rate(tikv_raftstore_apply_duration_secs_bucket{instance=~\"$instance\"}[30s])) by (le))", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "histogram_quantile(0.9999,(\n sum(rate(\n tikv_raftstore_apply_duration_secs_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (le) \n \n \n)) ", + "format": "time_series", "hide": false, + "instant": false, "interval": "", - "legendFormat": "999%", - "refId": "A" 
+ "intervalFactor": 1, + "legendFormat": "99.99%", + "metric": "", + "query": "histogram_quantile(0.9999,(\n sum(rate(\n tikv_raftstore_apply_duration_secs_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (le) \n \n \n)) ", + "refId": "", + "step": 10, + "target": "" }, { - "exemplar": true, - "expr": "histogram_quantile(0.99, sum(rate(tikv_raftstore_apply_duration_secs_bucket{instance=~\"$instance\"}[30s])) by (le))", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "histogram_quantile(0.99,(\n sum(rate(\n tikv_raftstore_apply_duration_secs_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (le) \n \n \n)) ", + "format": "time_series", "hide": false, + "instant": false, "interval": "", + "intervalFactor": 1, "legendFormat": "99%", - "refId": "B" + "metric": "", + "query": "histogram_quantile(0.99,(\n sum(rate(\n tikv_raftstore_apply_duration_secs_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (le) \n \n \n)) ", + "refId": "", + "step": 10, + "target": "" }, { - "exemplar": true, - "expr": "sum(rate(tikv_raftstore_apply_duration_secs_sum{instance=~\"$instance\"}[30s])) / sum(rate(tikv_raftstore_apply_duration_secs_count{instance=~\"$instance\"}[30s]))", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "(sum(rate(\n tikv_raftstore_apply_duration_secs_sum\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) / sum(rate(\n tikv_raftstore_apply_duration_secs_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) )", + "format": "time_series", "hide": false, + "instant": false, "interval": "", + "intervalFactor": 1, "legendFormat": "avg", - "refId": "C" + "metric": "", + "query": "(sum(rate(\n tikv_raftstore_apply_duration_secs_sum\n 
{k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) / sum(rate(\n tikv_raftstore_apply_duration_secs_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) )", + "refId": "", + "step": 10, + "target": "" }, { - "exemplar": true, - "expr": "sum(rate(tikv_raftstore_apply_duration_secs_count{instance=~\"$instance\"}[30s]))", - "hide": true, + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_raftstore_apply_duration_secs_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) ", + "format": "time_series", + "hide": false, "instant": false, "interval": "", + "intervalFactor": 1, "legendFormat": "count", - "refId": "D" + "metric": "", + "query": "sum(rate(\n tikv_raftstore_apply_duration_secs_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) ", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, "title": "Apply duration", "tooltip": { + "msResolution": true, "shared": true, "sort": 0, "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -10458,6 +13136,7 @@ }, "yaxes": [ { + "decimals": null, "format": "s", "label": null, "logBase": 1, @@ -10466,6 +13145,7 @@ "show": true }, { + "decimals": null, "format": "short", "label": null, "logBase": 1, @@ -10476,60 +13156,86 @@ ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } }, { - "aliasColors": { - "99%": "#eab839", - "999%": "dark-red", - "count": "rgb(33, 250, 2)" - }, + "aliasColors": {}, "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", "description": "The propose wait time duration 
of each request", + "editable": true, + "error": false, "fieldConfig": { - "defaults": {}, - "overrides": [] + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } + }, + "fill": 1, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" }, - "fill": 2, - "fillGradient": 0, "gridPos": { - "h": 8, + "h": 7, "w": 12, "x": 0, - "y": 27 + "y": 14 }, - "hiddenSeries": false, - "id": 13261, + "height": null, + "hideTimeOverride": false, + "id": 100, + "interval": null, + "isNew": true, "legend": { + "alignAsTable": true, "avg": false, - "current": false, - "max": false, + "current": true, + "hideEmpty": true, + "hideZero": true, + "max": true, "min": false, + "rightSide": true, "show": true, + "sideWidth": null, + "sort": "max", + "sortDesc": true, "total": false, - "values": false + "values": true }, "lines": true, "linewidth": 1, - "nullPointMode": "null", + "links": [], + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", "options": { - "alertThreshold": true + "alertThreshold": true, + "dataLinks": [] }, "percentage": false, - "pluginVersion": "7.5.11", - "pointradius": 2, + "pointradius": 5, "points": false, "renderer": "flot", + "repeat": null, + "repeatDirection": null, "seriesOverrides": [ { "alias": "count", + "bars": false, "dashLength": 1, "dashes": true, "fill": 2, + "fillBelowTo": null, + "lines": true, "spaceLength": 1, "transform": "negative-Y", "yaxis": 2, @@ -10537,60 +13243,93 @@ }, { "alias": "avg", - "fill": 7 + "bars": false, + "fill": 7, + "fillBelowTo": null, + "lines": true, + "yaxis": 1, + "zindex": 0 } ], - "spaceLength": 10, + "span": null, "stack": false, "steppedLine": false, "targets": [ { - "exemplar": true, - "expr": "histogram_quantile(0.999, sum(rate(tikv_raftstore_request_wait_time_duration_secs_bucket{instance=~\"$instance\"}[30s])) by (le))", + 
"datasource": "${DS_TEST-CLUSTER}", + "expr": "histogram_quantile(0.9999,(\n sum(rate(\n tikv_raftstore_request_wait_time_duration_secs_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (le) \n \n \n)) ", + "format": "time_series", "hide": false, + "instant": false, "interval": "", - "legendFormat": "999%", - "refId": "A" + "intervalFactor": 1, + "legendFormat": "99.99%", + "metric": "", + "query": "histogram_quantile(0.9999,(\n sum(rate(\n tikv_raftstore_request_wait_time_duration_secs_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (le) \n \n \n)) ", + "refId": "", + "step": 10, + "target": "" }, { - "exemplar": true, - "expr": "histogram_quantile(0.99, sum(rate(tikv_raftstore_request_wait_time_duration_secs_bucket{instance=~\"$instance\"}[30s])) by (le))", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "histogram_quantile(0.99,(\n sum(rate(\n tikv_raftstore_request_wait_time_duration_secs_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (le) \n \n \n)) ", + "format": "time_series", "hide": false, + "instant": false, "interval": "", + "intervalFactor": 1, "legendFormat": "99%", - "refId": "B" + "metric": "", + "query": "histogram_quantile(0.99,(\n sum(rate(\n tikv_raftstore_request_wait_time_duration_secs_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (le) \n \n \n)) ", + "refId": "", + "step": 10, + "target": "" }, { - "exemplar": true, - "expr": "sum(rate(tikv_raftstore_request_wait_time_duration_secs_sum{instance=~\"$instance\"}[30s])) / sum(rate(tikv_raftstore_request_wait_time_duration_secs_count{instance=~\"$instance\"}[30s]))", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "(sum(rate(\n tikv_raftstore_request_wait_time_duration_secs_sum\n 
{k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) / sum(rate(\n tikv_raftstore_request_wait_time_duration_secs_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) )", + "format": "time_series", "hide": false, + "instant": false, "interval": "", + "intervalFactor": 1, "legendFormat": "avg", - "refId": "C" + "metric": "", + "query": "(sum(rate(\n tikv_raftstore_request_wait_time_duration_secs_sum\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) / sum(rate(\n tikv_raftstore_request_wait_time_duration_secs_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) )", + "refId": "", + "step": 10, + "target": "" }, { - "exemplar": true, - "expr": "sum(rate(tikv_raftstore_request_wait_time_duration_secs_count{instance=~\"$instance\"}[30s]))", - "hide": true, + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_raftstore_request_wait_time_duration_secs_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) ", + "format": "time_series", + "hide": false, "instant": false, "interval": "", + "intervalFactor": 1, "legendFormat": "count", - "refId": "D" + "metric": "", + "query": "sum(rate(\n tikv_raftstore_request_wait_time_duration_secs_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) ", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, "title": "Store propose wait duration", "tooltip": { + "msResolution": true, "shared": true, "sort": 0, "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ 
-10598,6 +13337,7 @@ }, "yaxes": [ { + "decimals": null, "format": "s", "label": null, "logBase": 1, @@ -10606,6 +13346,7 @@ "show": true }, { + "decimals": null, "format": "short", "label": null, "logBase": 1, @@ -10616,60 +13357,86 @@ ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } }, { - "aliasColors": { - "99%": "#eab839", - "999%": "dark-red", - "count": "rgb(33, 250, 2)" - }, + "aliasColors": {}, "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", "description": "The batch wait time duration of each request", + "editable": true, + "error": false, "fieldConfig": { - "defaults": {}, - "overrides": [] + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } + }, + "fill": 1, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" }, - "fill": 2, - "fillGradient": 0, "gridPos": { - "h": 8, + "h": 7, "w": 12, "x": 12, - "y": 27 + "y": 14 }, - "hiddenSeries": false, - "id": 13263, + "height": null, + "hideTimeOverride": false, + "id": 101, + "interval": null, + "isNew": true, "legend": { + "alignAsTable": true, "avg": false, - "current": false, - "max": false, + "current": true, + "hideEmpty": true, + "hideZero": true, + "max": true, "min": false, + "rightSide": true, "show": true, + "sideWidth": null, + "sort": "max", + "sortDesc": true, "total": false, - "values": false + "values": true }, "lines": true, "linewidth": 1, - "nullPointMode": "null", + "links": [], + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", "options": { - "alertThreshold": true + "alertThreshold": true, + "dataLinks": [] }, "percentage": false, - "pluginVersion": "7.5.11", - "pointradius": 2, + "pointradius": 5, "points": false, "renderer": "flot", + "repeat": null, + "repeatDirection": null, "seriesOverrides": [ { "alias": 
"count", + "bars": false, "dashLength": 1, "dashes": true, "fill": 2, + "fillBelowTo": null, + "lines": true, "spaceLength": 1, "transform": "negative-Y", "yaxis": 2, @@ -10677,60 +13444,93 @@ }, { "alias": "avg", - "fill": 7 + "bars": false, + "fill": 7, + "fillBelowTo": null, + "lines": true, + "yaxis": 1, + "zindex": 0 } ], - "spaceLength": 10, + "span": null, "stack": false, "steppedLine": false, "targets": [ { - "exemplar": true, - "expr": "histogram_quantile(0.999, sum(rate(tikv_raftstore_store_wf_batch_wait_duration_seconds_bucket{instance=~\"$instance\"}[30s])) by (le))", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "histogram_quantile(0.9999,(\n sum(rate(\n tikv_raftstore_store_wf_batch_wait_duration_seconds_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (le) \n \n \n)) ", + "format": "time_series", "hide": false, + "instant": false, "interval": "", - "legendFormat": "999%", - "refId": "A" + "intervalFactor": 1, + "legendFormat": "99.99%", + "metric": "", + "query": "histogram_quantile(0.9999,(\n sum(rate(\n tikv_raftstore_store_wf_batch_wait_duration_seconds_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (le) \n \n \n)) ", + "refId": "", + "step": 10, + "target": "" }, { - "exemplar": true, - "expr": "histogram_quantile(0.99, sum(rate(tikv_raftstore_store_wf_batch_wait_duration_seconds_bucket{instance=~\"$instance\"}[30s])) by (le))", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "histogram_quantile(0.99,(\n sum(rate(\n tikv_raftstore_store_wf_batch_wait_duration_seconds_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (le) \n \n \n)) ", + "format": "time_series", "hide": false, + "instant": false, "interval": "", + "intervalFactor": 1, "legendFormat": "99%", - "refId": "B" + "metric": "", + "query": "histogram_quantile(0.99,(\n 
sum(rate(\n tikv_raftstore_store_wf_batch_wait_duration_seconds_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (le) \n \n \n)) ", + "refId": "", + "step": 10, + "target": "" }, { - "exemplar": true, - "expr": "sum(rate(tikv_raftstore_store_wf_batch_wait_duration_seconds_sum{instance=~\"$instance\"}[30s])) / sum(rate(tikv_raftstore_store_wf_batch_wait_duration_seconds_count{instance=~\"$instance\"}[30s]))", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "(sum(rate(\n tikv_raftstore_store_wf_batch_wait_duration_seconds_sum\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) / sum(rate(\n tikv_raftstore_store_wf_batch_wait_duration_seconds_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) )", + "format": "time_series", "hide": false, + "instant": false, "interval": "", + "intervalFactor": 1, "legendFormat": "avg", - "refId": "C" + "metric": "", + "query": "(sum(rate(\n tikv_raftstore_store_wf_batch_wait_duration_seconds_sum\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) / sum(rate(\n tikv_raftstore_store_wf_batch_wait_duration_seconds_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) )", + "refId": "", + "step": 10, + "target": "" }, { - "exemplar": true, - "expr": "sum(rate(tikv_raftstore_store_wf_batch_wait_duration_seconds_count{instance=~\"$instance\"}[30s]))", - "hide": true, + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_raftstore_store_wf_batch_wait_duration_seconds_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) ", + "format": "time_series", + "hide": false, "instant": false, "interval": "", + "intervalFactor": 1, "legendFormat": 
"count", - "refId": "D" + "metric": "", + "query": "sum(rate(\n tikv_raftstore_store_wf_batch_wait_duration_seconds_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) ", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, "title": "Store batch wait duration", "tooltip": { + "msResolution": true, "shared": true, "sort": 0, "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -10738,6 +13538,7 @@ }, "yaxes": [ { + "decimals": null, "format": "s", "label": null, "logBase": 1, @@ -10746,6 +13547,7 @@ "show": true }, { + "decimals": null, "format": "short", "label": null, "logBase": 1, @@ -10756,60 +13558,86 @@ ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } }, { - "aliasColors": { - "99%": "#eab839", - "999%": "dark-red", - "count": "rgb(33, 250, 2)" - }, + "aliasColors": {}, "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", "description": "The send-to-write-queue time duration of each request", + "editable": true, + "error": false, "fieldConfig": { - "defaults": {}, - "overrides": [] + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } + }, + "fill": 1, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" }, - "fill": 2, - "fillGradient": 0, "gridPos": { - "h": 8, + "h": 7, "w": 12, "x": 0, - "y": 35 + "y": 21 }, - "hiddenSeries": false, - "id": 13265, + "height": null, + "hideTimeOverride": false, + "id": 102, + "interval": null, + "isNew": true, "legend": { + "alignAsTable": true, "avg": false, - "current": false, - "max": false, + "current": true, + "hideEmpty": true, + 
"hideZero": true, + "max": true, "min": false, + "rightSide": true, "show": true, + "sideWidth": null, + "sort": "max", + "sortDesc": true, "total": false, - "values": false + "values": true }, "lines": true, "linewidth": 1, - "nullPointMode": "null", + "links": [], + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", "options": { - "alertThreshold": true + "alertThreshold": true, + "dataLinks": [] }, "percentage": false, - "pluginVersion": "7.5.11", - "pointradius": 2, + "pointradius": 5, "points": false, "renderer": "flot", + "repeat": null, + "repeatDirection": null, "seriesOverrides": [ { "alias": "count", + "bars": false, "dashLength": 1, "dashes": true, "fill": 2, + "fillBelowTo": null, + "lines": true, "spaceLength": 1, "transform": "negative-Y", "yaxis": 2, @@ -10817,60 +13645,93 @@ }, { "alias": "avg", - "fill": 7 + "bars": false, + "fill": 7, + "fillBelowTo": null, + "lines": true, + "yaxis": 1, + "zindex": 0 } ], - "spaceLength": 10, + "span": null, "stack": false, "steppedLine": false, "targets": [ { - "exemplar": true, - "expr": "histogram_quantile(0.999, sum(rate(tikv_raftstore_store_wf_send_to_queue_duration_seconds_bucket{instance=~\"$instance\"}[30s])) by (le))", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "histogram_quantile(0.9999,(\n sum(rate(\n tikv_raftstore_store_wf_send_to_queue_duration_seconds_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (le) \n \n \n)) ", + "format": "time_series", "hide": false, + "instant": false, "interval": "", - "legendFormat": "999%", - "refId": "A" + "intervalFactor": 1, + "legendFormat": "99.99%", + "metric": "", + "query": "histogram_quantile(0.9999,(\n sum(rate(\n tikv_raftstore_store_wf_send_to_queue_duration_seconds_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (le) \n \n \n)) ", + "refId": "", + "step": 10, 
+ "target": "" }, { - "exemplar": true, - "expr": "histogram_quantile(0.99, sum(rate(tikv_raftstore_store_wf_send_to_queue_duration_seconds_bucket{instance=~\"$instance\"}[30s])) by (le))", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "histogram_quantile(0.99,(\n sum(rate(\n tikv_raftstore_store_wf_send_to_queue_duration_seconds_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (le) \n \n \n)) ", + "format": "time_series", "hide": false, + "instant": false, "interval": "", + "intervalFactor": 1, "legendFormat": "99%", - "refId": "B" + "metric": "", + "query": "histogram_quantile(0.99,(\n sum(rate(\n tikv_raftstore_store_wf_send_to_queue_duration_seconds_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (le) \n \n \n)) ", + "refId": "", + "step": 10, + "target": "" }, { - "exemplar": true, - "expr": "sum(rate(tikv_raftstore_store_wf_send_to_queue_duration_seconds_sum{instance=~\"$instance\"}[30s])) / sum(rate(tikv_raftstore_store_wf_send_to_queue_duration_seconds_count{instance=~\"$instance\"}[30s]))", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "(sum(rate(\n tikv_raftstore_store_wf_send_to_queue_duration_seconds_sum\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) / sum(rate(\n tikv_raftstore_store_wf_send_to_queue_duration_seconds_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) )", + "format": "time_series", "hide": false, + "instant": false, "interval": "", + "intervalFactor": 1, "legendFormat": "avg", - "refId": "C" + "metric": "", + "query": "(sum(rate(\n tikv_raftstore_store_wf_send_to_queue_duration_seconds_sum\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) / sum(rate(\n 
tikv_raftstore_store_wf_send_to_queue_duration_seconds_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) )", + "refId": "", + "step": 10, + "target": "" }, { - "exemplar": true, - "expr": "sum(rate(tikv_raftstore_store_wf_send_to_queue_duration_seconds_count{instance=~\"$instance\"}[30s]))", - "hide": true, + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_raftstore_store_wf_send_to_queue_duration_seconds_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) ", + "format": "time_series", + "hide": false, "instant": false, "interval": "", + "intervalFactor": 1, "legendFormat": "count", - "refId": "D" + "metric": "", + "query": "sum(rate(\n tikv_raftstore_store_wf_send_to_queue_duration_seconds_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) ", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, "title": "Store send to write queue duration", "tooltip": { + "msResolution": true, "shared": true, "sort": 0, "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -10878,6 +13739,7 @@ }, "yaxes": [ { + "decimals": null, "format": "s", "label": null, "logBase": 1, @@ -10886,6 +13748,7 @@ "show": true }, { + "decimals": null, "format": "short", "label": null, "logBase": 1, @@ -10896,60 +13759,86 @@ ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } }, { - "aliasColors": { - "99%": "#eab839", - "999%": "dark-red", - "count": "rgb(33, 250, 2)" - }, + "aliasColors": {}, "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", "description": "The send raft message of the proposal duration of each 
request", + "editable": true, + "error": false, "fieldConfig": { - "defaults": {}, - "overrides": [] + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } + }, + "fill": 1, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" }, - "fill": 2, - "fillGradient": 0, "gridPos": { - "h": 8, + "h": 7, "w": 12, "x": 12, - "y": 35 + "y": 21 }, - "hiddenSeries": false, - "id": 23763572857, + "height": null, + "hideTimeOverride": false, + "id": 103, + "interval": null, + "isNew": true, "legend": { + "alignAsTable": true, "avg": false, - "current": false, - "max": false, + "current": true, + "hideEmpty": true, + "hideZero": true, + "max": true, "min": false, + "rightSide": true, "show": true, + "sideWidth": null, + "sort": "max", + "sortDesc": true, "total": false, - "values": false + "values": true }, "lines": true, "linewidth": 1, - "nullPointMode": "null", + "links": [], + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", "options": { - "alertThreshold": true + "alertThreshold": true, + "dataLinks": [] }, "percentage": false, - "pluginVersion": "7.5.11", - "pointradius": 2, + "pointradius": 5, "points": false, "renderer": "flot", + "repeat": null, + "repeatDirection": null, "seriesOverrides": [ { "alias": "count", + "bars": false, "dashLength": 1, "dashes": true, "fill": 2, + "fillBelowTo": null, + "lines": true, "spaceLength": 1, "transform": "negative-Y", "yaxis": 2, @@ -10957,60 +13846,93 @@ }, { "alias": "avg", - "fill": 7 + "bars": false, + "fill": 7, + "fillBelowTo": null, + "lines": true, + "yaxis": 1, + "zindex": 0 } ], - "spaceLength": 10, + "span": null, "stack": false, "steppedLine": false, "targets": [ { - "exemplar": true, - "expr": "histogram_quantile(0.999, sum(rate(tikv_raftstore_store_wf_send_proposal_duration_seconds_bucket{instance=~\"$instance\"}[30s])) by (le))", 
+ "datasource": "${DS_TEST-CLUSTER}", + "expr": "histogram_quantile(0.9999,(\n sum(rate(\n tikv_raftstore_store_wf_send_proposal_duration_seconds_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (le) \n \n \n)) ", + "format": "time_series", "hide": false, + "instant": false, "interval": "", - "legendFormat": "999%", - "refId": "A" + "intervalFactor": 1, + "legendFormat": "99.99%", + "metric": "", + "query": "histogram_quantile(0.9999,(\n sum(rate(\n tikv_raftstore_store_wf_send_proposal_duration_seconds_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (le) \n \n \n)) ", + "refId": "", + "step": 10, + "target": "" }, { - "exemplar": true, - "expr": "histogram_quantile(0.99, sum(rate(tikv_raftstore_store_wf_send_proposal_duration_seconds_bucket{instance=~\"$instance\"}[30s])) by (le))", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "histogram_quantile(0.99,(\n sum(rate(\n tikv_raftstore_store_wf_send_proposal_duration_seconds_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (le) \n \n \n)) ", + "format": "time_series", "hide": false, + "instant": false, "interval": "", + "intervalFactor": 1, "legendFormat": "99%", - "refId": "B" + "metric": "", + "query": "histogram_quantile(0.99,(\n sum(rate(\n tikv_raftstore_store_wf_send_proposal_duration_seconds_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (le) \n \n \n)) ", + "refId": "", + "step": 10, + "target": "" }, { - "exemplar": true, - "expr": "sum(rate(tikv_raftstore_store_wf_send_proposal_duration_seconds_sum{instance=~\"$instance\"}[30s])) / sum(rate(tikv_raftstore_store_wf_send_proposal_duration_seconds_count{instance=~\"$instance\"}[30s]))", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "(sum(rate(\n 
tikv_raftstore_store_wf_send_proposal_duration_seconds_sum\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) / sum(rate(\n tikv_raftstore_store_wf_send_proposal_duration_seconds_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) )", + "format": "time_series", "hide": false, + "instant": false, "interval": "", + "intervalFactor": 1, "legendFormat": "avg", - "refId": "C" + "metric": "", + "query": "(sum(rate(\n tikv_raftstore_store_wf_send_proposal_duration_seconds_sum\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) / sum(rate(\n tikv_raftstore_store_wf_send_proposal_duration_seconds_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) )", + "refId": "", + "step": 10, + "target": "" }, { - "exemplar": true, - "expr": "sum(rate(tikv_raftstore_store_wf_send_proposal_duration_seconds_count{instance=~\"$instance\"}[30s]))", - "hide": true, + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_raftstore_store_wf_send_proposal_duration_seconds_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) ", + "format": "time_series", + "hide": false, "instant": false, "interval": "", + "intervalFactor": 1, "legendFormat": "count", - "refId": "D" + "metric": "", + "query": "sum(rate(\n tikv_raftstore_store_wf_send_proposal_duration_seconds_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) ", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, "title": "Store send proposal duration", "tooltip": { + "msResolution": true, "shared": true, "sort": 0, "value_type": "individual" }, + "transformations": [], + 
"transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -11018,6 +13940,7 @@ }, "yaxes": [ { + "decimals": null, "format": "s", "label": null, "logBase": 1, @@ -11026,6 +13949,7 @@ "show": true }, { + "decimals": null, "format": "short", "label": null, "logBase": 1, @@ -11036,60 +13960,86 @@ ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } }, { - "aliasColors": { - "99%": "#eab839", - "999%": "dark-red", - "count": "rgb(33, 250, 2)" - }, + "aliasColors": {}, "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", "description": "The write kv db end duration of each request", + "editable": true, + "error": false, "fieldConfig": { - "defaults": {}, - "overrides": [] + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } + }, + "fill": 1, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" }, - "fill": 2, - "fillGradient": 0, "gridPos": { - "h": 8, + "h": 7, "w": 12, "x": 0, - "y": 43 + "y": 28 }, - "hiddenSeries": false, - "id": 13269, + "height": null, + "hideTimeOverride": false, + "id": 104, + "interval": null, + "isNew": true, "legend": { + "alignAsTable": true, "avg": false, - "current": false, - "max": false, + "current": true, + "hideEmpty": true, + "hideZero": true, + "max": true, "min": false, + "rightSide": true, "show": true, + "sideWidth": null, + "sort": "max", + "sortDesc": true, "total": false, - "values": false + "values": true }, "lines": true, "linewidth": 1, - "nullPointMode": "null", + "links": [], + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", "options": { - "alertThreshold": true + "alertThreshold": true, + "dataLinks": [] }, "percentage": false, - "pluginVersion": "7.5.11", - "pointradius": 2, + "pointradius": 
5, "points": false, "renderer": "flot", + "repeat": null, + "repeatDirection": null, "seriesOverrides": [ { "alias": "count", + "bars": false, "dashLength": 1, "dashes": true, "fill": 2, + "fillBelowTo": null, + "lines": true, "spaceLength": 1, "transform": "negative-Y", "yaxis": 2, @@ -11097,60 +14047,93 @@ }, { "alias": "avg", - "fill": 7 + "bars": false, + "fill": 7, + "fillBelowTo": null, + "lines": true, + "yaxis": 1, + "zindex": 0 } ], - "spaceLength": 10, + "span": null, "stack": false, "steppedLine": false, "targets": [ { - "exemplar": true, - "expr": "histogram_quantile(0.999, sum(rate(tikv_raftstore_store_wf_write_kvdb_end_duration_seconds_bucket{instance=~\"$instance\"}[30s])) by (le))", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "histogram_quantile(0.9999,(\n sum(rate(\n tikv_raftstore_store_wf_write_kvdb_end_duration_seconds_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (le) \n \n \n)) ", + "format": "time_series", "hide": false, + "instant": false, "interval": "", - "legendFormat": "999%", - "refId": "A" + "intervalFactor": 1, + "legendFormat": "99.99%", + "metric": "", + "query": "histogram_quantile(0.9999,(\n sum(rate(\n tikv_raftstore_store_wf_write_kvdb_end_duration_seconds_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (le) \n \n \n)) ", + "refId": "", + "step": 10, + "target": "" }, { - "exemplar": true, - "expr": "histogram_quantile(0.99, sum(rate(tikv_raftstore_store_wf_write_kvdb_end_duration_seconds_bucket{instance=~\"$instance\"}[30s])) by (le))", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "histogram_quantile(0.99,(\n sum(rate(\n tikv_raftstore_store_wf_write_kvdb_end_duration_seconds_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (le) \n \n \n)) ", + "format": "time_series", "hide": false, + "instant": 
false, "interval": "", + "intervalFactor": 1, "legendFormat": "99%", - "refId": "B" + "metric": "", + "query": "histogram_quantile(0.99,(\n sum(rate(\n tikv_raftstore_store_wf_write_kvdb_end_duration_seconds_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (le) \n \n \n)) ", + "refId": "", + "step": 10, + "target": "" }, { - "exemplar": true, - "expr": "sum(rate(tikv_raftstore_store_wf_write_kvdb_end_duration_seconds_sum{instance=~\"$instance\"}[30s])) / sum(rate(tikv_raftstore_store_wf_write_kvdb_end_duration_seconds_count{instance=~\"$instance\"}[30s]))", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "(sum(rate(\n tikv_raftstore_store_wf_write_kvdb_end_duration_seconds_sum\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) / sum(rate(\n tikv_raftstore_store_wf_write_kvdb_end_duration_seconds_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) )", + "format": "time_series", "hide": false, + "instant": false, "interval": "", + "intervalFactor": 1, "legendFormat": "avg", - "refId": "C" + "metric": "", + "query": "(sum(rate(\n tikv_raftstore_store_wf_write_kvdb_end_duration_seconds_sum\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) / sum(rate(\n tikv_raftstore_store_wf_write_kvdb_end_duration_seconds_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) )", + "refId": "", + "step": 10, + "target": "" }, { - "exemplar": true, - "expr": "sum(rate(tikv_raftstore_store_wf_write_kvdb_end_duration_seconds_count{instance=~\"$instance\"}[30s]))", - "hide": true, + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_raftstore_store_wf_write_kvdb_end_duration_seconds_count\n 
{k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) ", + "format": "time_series", + "hide": false, "instant": false, "interval": "", + "intervalFactor": 1, "legendFormat": "count", - "refId": "D" + "metric": "", + "query": "sum(rate(\n tikv_raftstore_store_wf_write_kvdb_end_duration_seconds_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) ", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, "title": "Store write kv db end duration", "tooltip": { + "msResolution": true, "shared": true, "sort": 0, "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -11158,6 +14141,7 @@ }, "yaxes": [ { + "decimals": null, "format": "s", "label": null, "logBase": 1, @@ -11166,6 +14150,7 @@ "show": true }, { + "decimals": null, "format": "short", "label": null, "logBase": 1, @@ -11176,60 +14161,86 @@ ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } }, { - "aliasColors": { - "99%": "#eab839", - "999%": "dark-red", - "count": "rgb(33, 250, 2)" - }, + "aliasColors": {}, "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", "description": "The before write time duration of each request", + "editable": true, + "error": false, "fieldConfig": { - "defaults": {}, - "overrides": [] + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } + }, + "fill": 1, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" }, - "fill": 2, - "fillGradient": 0, "gridPos": { - "h": 8, + "h": 7, "w": 12, "x": 12, - "y": 43 + "y": 28 }, - "hiddenSeries": false, - "id": 13267, 
+ "height": null, + "hideTimeOverride": false, + "id": 105, + "interval": null, + "isNew": true, "legend": { + "alignAsTable": true, "avg": false, - "current": false, - "max": false, + "current": true, + "hideEmpty": true, + "hideZero": true, + "max": true, "min": false, + "rightSide": true, "show": true, + "sideWidth": null, + "sort": "max", + "sortDesc": true, "total": false, - "values": false + "values": true }, "lines": true, "linewidth": 1, - "nullPointMode": "null", + "links": [], + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", "options": { - "alertThreshold": true + "alertThreshold": true, + "dataLinks": [] }, "percentage": false, - "pluginVersion": "7.5.11", - "pointradius": 2, + "pointradius": 5, "points": false, "renderer": "flot", + "repeat": null, + "repeatDirection": null, "seriesOverrides": [ { "alias": "count", + "bars": false, "dashLength": 1, "dashes": true, "fill": 2, + "fillBelowTo": null, + "lines": true, "spaceLength": 1, "transform": "negative-Y", "yaxis": 2, @@ -11237,60 +14248,93 @@ }, { "alias": "avg", - "fill": 7 + "bars": false, + "fill": 7, + "fillBelowTo": null, + "lines": true, + "yaxis": 1, + "zindex": 0 } ], - "spaceLength": 10, + "span": null, "stack": false, "steppedLine": false, "targets": [ { - "exemplar": true, - "expr": "histogram_quantile(0.999, sum(rate(tikv_raftstore_store_wf_before_write_duration_seconds_bucket{instance=~\"$instance\"}[30s])) by (le))", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "histogram_quantile(0.9999,(\n sum(rate(\n tikv_raftstore_store_wf_before_write_duration_seconds_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (le) \n \n \n)) ", + "format": "time_series", "hide": false, + "instant": false, "interval": "", - "legendFormat": "999%", - "refId": "A" + "intervalFactor": 1, + "legendFormat": "99.99%", + "metric": "", + "query": "histogram_quantile(0.9999,(\n sum(rate(\n 
tikv_raftstore_store_wf_before_write_duration_seconds_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (le) \n \n \n)) ", + "refId": "", + "step": 10, + "target": "" }, { - "exemplar": true, - "expr": "histogram_quantile(0.99, sum(rate(tikv_raftstore_store_wf_before_write_duration_seconds_bucket{instance=~\"$instance\"}[30s])) by (le))", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "histogram_quantile(0.99,(\n sum(rate(\n tikv_raftstore_store_wf_before_write_duration_seconds_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (le) \n \n \n)) ", + "format": "time_series", "hide": false, + "instant": false, "interval": "", + "intervalFactor": 1, "legendFormat": "99%", - "refId": "B" + "metric": "", + "query": "histogram_quantile(0.99,(\n sum(rate(\n tikv_raftstore_store_wf_before_write_duration_seconds_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (le) \n \n \n)) ", + "refId": "", + "step": 10, + "target": "" }, { - "exemplar": true, - "expr": "sum(rate(tikv_raftstore_store_wf_before_write_duration_seconds_sum{instance=~\"$instance\"}[30s])) / sum(rate(tikv_raftstore_store_wf_before_write_duration_seconds_count{instance=~\"$instance\"}[30s]))", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "(sum(rate(\n tikv_raftstore_store_wf_before_write_duration_seconds_sum\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) / sum(rate(\n tikv_raftstore_store_wf_before_write_duration_seconds_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) )", + "format": "time_series", "hide": false, + "instant": false, "interval": "", + "intervalFactor": 1, "legendFormat": "avg", - "refId": "C" + "metric": "", + "query": "(sum(rate(\n 
tikv_raftstore_store_wf_before_write_duration_seconds_sum\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) / sum(rate(\n tikv_raftstore_store_wf_before_write_duration_seconds_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) )", + "refId": "", + "step": 10, + "target": "" }, { - "exemplar": true, - "expr": "sum(rate(tikv_raftstore_store_wf_before_write_duration_seconds_count{instance=~\"$instance\"}[30s]))", - "hide": true, + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_raftstore_store_wf_before_write_duration_seconds_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) ", + "format": "time_series", + "hide": false, "instant": false, "interval": "", + "intervalFactor": 1, "legendFormat": "count", - "refId": "D" + "metric": "", + "query": "sum(rate(\n tikv_raftstore_store_wf_before_write_duration_seconds_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) ", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, "title": "Store before write duration", "tooltip": { + "msResolution": true, "shared": true, "sort": 0, "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -11298,6 +14342,7 @@ }, "yaxes": [ { + "decimals": null, "format": "s", "label": null, "logBase": 1, @@ -11306,6 +14351,7 @@ "show": true }, { + "decimals": null, "format": "short", "label": null, "logBase": 1, @@ -11316,60 +14362,86 @@ ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } }, { - "aliasColors": { - "99%": "#eab839", - "999%": "dark-red", - "count": "rgb(33, 250, 2)" - }, + "aliasColors": {}, "bars": 
false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", "description": "The persist duration of each request", + "editable": true, + "error": false, "fieldConfig": { - "defaults": {}, - "overrides": [] + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } + }, + "fill": 1, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" }, - "fill": 2, - "fillGradient": 0, "gridPos": { - "h": 8, + "h": 7, "w": 12, "x": 0, - "y": 51 + "y": 35 }, - "hiddenSeries": false, - "id": 13273, + "height": null, + "hideTimeOverride": false, + "id": 106, + "interval": null, + "isNew": true, "legend": { + "alignAsTable": true, "avg": false, - "current": false, - "max": false, + "current": true, + "hideEmpty": true, + "hideZero": true, + "max": true, "min": false, + "rightSide": true, "show": true, + "sideWidth": null, + "sort": "max", + "sortDesc": true, "total": false, - "values": false + "values": true }, "lines": true, "linewidth": 1, - "nullPointMode": "null", + "links": [], + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", "options": { - "alertThreshold": true + "alertThreshold": true, + "dataLinks": [] }, "percentage": false, - "pluginVersion": "7.5.11", - "pointradius": 2, + "pointradius": 5, "points": false, "renderer": "flot", + "repeat": null, + "repeatDirection": null, "seriesOverrides": [ { "alias": "count", + "bars": false, "dashLength": 1, "dashes": true, "fill": 2, + "fillBelowTo": null, + "lines": true, "spaceLength": 1, "transform": "negative-Y", "yaxis": 2, @@ -11377,60 +14449,93 @@ }, { "alias": "avg", - "fill": 7 + "bars": false, + "fill": 7, + "fillBelowTo": null, + "lines": true, + "yaxis": 1, + "zindex": 0 } ], - "spaceLength": 10, + "span": null, "stack": false, "steppedLine": false, "targets": [ { - "exemplar": true, - 
"expr": "histogram_quantile(0.999, sum(rate(tikv_raftstore_store_wf_persist_duration_seconds_bucket{instance=~\"$instance\"}[30s])) by (le))", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "histogram_quantile(0.9999,(\n sum(rate(\n tikv_raftstore_store_wf_persist_duration_seconds_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (le) \n \n \n)) ", + "format": "time_series", "hide": false, + "instant": false, "interval": "", - "legendFormat": "999%", - "refId": "A" + "intervalFactor": 1, + "legendFormat": "99.99%", + "metric": "", + "query": "histogram_quantile(0.9999,(\n sum(rate(\n tikv_raftstore_store_wf_persist_duration_seconds_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (le) \n \n \n)) ", + "refId": "", + "step": 10, + "target": "" }, { - "exemplar": true, - "expr": "histogram_quantile(0.99, sum(rate(tikv_raftstore_store_wf_persist_duration_seconds_bucket{instance=~\"$instance\"}[30s])) by (le))", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "histogram_quantile(0.99,(\n sum(rate(\n tikv_raftstore_store_wf_persist_duration_seconds_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (le) \n \n \n)) ", + "format": "time_series", "hide": false, + "instant": false, "interval": "", + "intervalFactor": 1, "legendFormat": "99%", - "refId": "B" + "metric": "", + "query": "histogram_quantile(0.99,(\n sum(rate(\n tikv_raftstore_store_wf_persist_duration_seconds_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (le) \n \n \n)) ", + "refId": "", + "step": 10, + "target": "" }, { - "exemplar": true, - "expr": "sum(rate(tikv_raftstore_store_wf_persist_duration_seconds_sum{instance=~\"$instance\"}[30s])) / 
sum(rate(tikv_raftstore_store_wf_persist_duration_seconds_count{instance=~\"$instance\"}[30s]))", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "(sum(rate(\n tikv_raftstore_store_wf_persist_duration_seconds_sum\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) / sum(rate(\n tikv_raftstore_store_wf_persist_duration_seconds_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) )", + "format": "time_series", "hide": false, + "instant": false, "interval": "", + "intervalFactor": 1, "legendFormat": "avg", - "refId": "C" + "metric": "", + "query": "(sum(rate(\n tikv_raftstore_store_wf_persist_duration_seconds_sum\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) / sum(rate(\n tikv_raftstore_store_wf_persist_duration_seconds_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) )", + "refId": "", + "step": 10, + "target": "" }, { - "exemplar": true, - "expr": "sum(rate(tikv_raftstore_store_wf_persist_duration_seconds_count{instance=~\"$instance\"}[30s]))", - "hide": true, + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_raftstore_store_wf_persist_duration_seconds_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) ", + "format": "time_series", + "hide": false, "instant": false, "interval": "", + "intervalFactor": 1, "legendFormat": "count", - "refId": "D" + "metric": "", + "query": "sum(rate(\n tikv_raftstore_store_wf_persist_duration_seconds_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) ", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, "title": "Store persist duration", "tooltip": { + 
"msResolution": true, "shared": true, "sort": 0, "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -11438,6 +14543,7 @@ }, "yaxes": [ { + "decimals": null, "format": "s", "label": null, "logBase": 1, @@ -11446,6 +14552,7 @@ "show": true }, { + "decimals": null, "format": "short", "label": null, "logBase": 1, @@ -11456,60 +14563,86 @@ ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } }, { - "aliasColors": { - "99%": "#eab839", - "999%": "dark-red", - "count": "rgb(33, 250, 2)" - }, + "aliasColors": {}, "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", "description": "The write end duration of each request", + "editable": true, + "error": false, "fieldConfig": { - "defaults": {}, - "overrides": [] + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } + }, + "fill": 1, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" }, - "fill": 2, - "fillGradient": 0, "gridPos": { - "h": 8, + "h": 7, "w": 12, "x": 12, - "y": 51 + "y": 35 }, - "hiddenSeries": false, - "id": 13271, + "height": null, + "hideTimeOverride": false, + "id": 107, + "interval": null, + "isNew": true, "legend": { + "alignAsTable": true, "avg": false, - "current": false, - "max": false, + "current": true, + "hideEmpty": true, + "hideZero": true, + "max": true, "min": false, + "rightSide": true, "show": true, + "sideWidth": null, + "sort": "max", + "sortDesc": true, "total": false, - "values": false + "values": true }, "lines": true, "linewidth": 1, - "nullPointMode": "null", + "links": [], + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", "options": { - "alertThreshold": true + "alertThreshold": true, + 
"dataLinks": [] }, "percentage": false, - "pluginVersion": "7.5.11", - "pointradius": 2, + "pointradius": 5, "points": false, "renderer": "flot", + "repeat": null, + "repeatDirection": null, "seriesOverrides": [ { "alias": "count", + "bars": false, "dashLength": 1, "dashes": true, "fill": 2, + "fillBelowTo": null, + "lines": true, "spaceLength": 1, "transform": "negative-Y", "yaxis": 2, @@ -11517,60 +14650,93 @@ }, { "alias": "avg", - "fill": 7 + "bars": false, + "fill": 7, + "fillBelowTo": null, + "lines": true, + "yaxis": 1, + "zindex": 0 } ], - "spaceLength": 10, + "span": null, "stack": false, "steppedLine": false, "targets": [ { - "exemplar": true, - "expr": "histogram_quantile(0.999, sum(rate(tikv_raftstore_store_wf_write_end_duration_seconds_bucket{instance=~\"$instance\"}[30s])) by (le))", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "histogram_quantile(0.9999,(\n sum(rate(\n tikv_raftstore_store_wf_write_end_duration_seconds_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (le) \n \n \n)) ", + "format": "time_series", "hide": false, + "instant": false, "interval": "", - "legendFormat": "999%", - "refId": "A" + "intervalFactor": 1, + "legendFormat": "99.99%", + "metric": "", + "query": "histogram_quantile(0.9999,(\n sum(rate(\n tikv_raftstore_store_wf_write_end_duration_seconds_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (le) \n \n \n)) ", + "refId": "", + "step": 10, + "target": "" }, { - "exemplar": true, - "expr": "histogram_quantile(0.99, sum(rate(tikv_raftstore_store_wf_write_end_duration_seconds_bucket{instance=~\"$instance\"}[30s])) by (le))", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "histogram_quantile(0.99,(\n sum(rate(\n tikv_raftstore_store_wf_write_end_duration_seconds_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n 
[$__rate_interval]\n)) by (le) \n \n \n)) ", + "format": "time_series", "hide": false, + "instant": false, "interval": "", + "intervalFactor": 1, "legendFormat": "99%", - "refId": "B" + "metric": "", + "query": "histogram_quantile(0.99,(\n sum(rate(\n tikv_raftstore_store_wf_write_end_duration_seconds_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (le) \n \n \n)) ", + "refId": "", + "step": 10, + "target": "" }, { - "exemplar": true, - "expr": "sum(rate(tikv_raftstore_store_wf_write_end_duration_seconds_sum{instance=~\"$instance\"}[30s])) / sum(rate(tikv_raftstore_store_wf_write_end_duration_seconds_count{instance=~\"$instance\"}[30s]))", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "(sum(rate(\n tikv_raftstore_store_wf_write_end_duration_seconds_sum\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) / sum(rate(\n tikv_raftstore_store_wf_write_end_duration_seconds_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) )", + "format": "time_series", "hide": false, + "instant": false, "interval": "", + "intervalFactor": 1, "legendFormat": "avg", - "refId": "C" + "metric": "", + "query": "(sum(rate(\n tikv_raftstore_store_wf_write_end_duration_seconds_sum\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) / sum(rate(\n tikv_raftstore_store_wf_write_end_duration_seconds_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) )", + "refId": "", + "step": 10, + "target": "" }, { - "exemplar": true, - "expr": "sum(rate(tikv_raftstore_store_wf_write_end_duration_seconds_count{instance=~\"$instance\"}[30s]))", - "hide": true, + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_raftstore_store_wf_write_end_duration_seconds_count\n 
{k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) ", + "format": "time_series", + "hide": false, "instant": false, "interval": "", + "intervalFactor": 1, "legendFormat": "count", - "refId": "D" + "metric": "", + "query": "sum(rate(\n tikv_raftstore_store_wf_write_end_duration_seconds_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) ", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, "title": "Store write end duration", "tooltip": { + "msResolution": true, "shared": true, "sort": 0, "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -11578,6 +14744,7 @@ }, "yaxes": [ { + "decimals": null, "format": "s", "label": null, "logBase": 1, @@ -11586,6 +14753,7 @@ "show": true }, { + "decimals": null, "format": "short", "label": null, "logBase": 1, @@ -11596,60 +14764,86 @@ ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } }, { - "aliasColors": { - "99%": "#eab839", - "999%": "dark-red", - "count": "rgb(33, 250, 2)" - }, + "aliasColors": {}, "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", "description": "The commit but not persist duration of each request", + "editable": true, + "error": false, "fieldConfig": { - "defaults": {}, - "overrides": [] + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } + }, + "fill": 1, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" }, - "fill": 2, - "fillGradient": 0, "gridPos": { - "h": 8, + "h": 7, "w": 12, "x": 0, - "y": 59 + "y": 42 }, - "hiddenSeries": false, - "id": 13277, + 
"height": null, + "hideTimeOverride": false, + "id": 108, + "interval": null, + "isNew": true, "legend": { + "alignAsTable": true, "avg": false, - "current": false, - "max": false, + "current": true, + "hideEmpty": true, + "hideZero": true, + "max": true, "min": false, + "rightSide": true, "show": true, + "sideWidth": null, + "sort": "max", + "sortDesc": true, "total": false, - "values": false + "values": true }, "lines": true, "linewidth": 1, - "nullPointMode": "null", + "links": [], + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", "options": { - "alertThreshold": true + "alertThreshold": true, + "dataLinks": [] }, "percentage": false, - "pluginVersion": "7.5.11", - "pointradius": 2, + "pointradius": 5, "points": false, "renderer": "flot", + "repeat": null, + "repeatDirection": null, "seriesOverrides": [ { "alias": "count", + "bars": false, "dashLength": 1, "dashes": true, "fill": 2, + "fillBelowTo": null, + "lines": true, "spaceLength": 1, "transform": "negative-Y", "yaxis": 2, @@ -11657,60 +14851,93 @@ }, { "alias": "avg", - "fill": 7 + "bars": false, + "fill": 7, + "fillBelowTo": null, + "lines": true, + "yaxis": 1, + "zindex": 0 } ], - "spaceLength": 10, + "span": null, "stack": false, "steppedLine": false, "targets": [ { - "exemplar": true, - "expr": "histogram_quantile(0.999, sum(rate(tikv_raftstore_store_wf_commit_not_persist_log_duration_seconds_bucket{instance=~\"$instance\"}[30s])) by (le))", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "histogram_quantile(0.9999,(\n sum(rate(\n tikv_raftstore_store_wf_commit_not_persist_log_duration_seconds_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (le) \n \n \n)) ", + "format": "time_series", "hide": false, + "instant": false, "interval": "", - "legendFormat": "999%", - "refId": "A" + "intervalFactor": 1, + "legendFormat": "99.99%", + "metric": "", + "query": 
"histogram_quantile(0.9999,(\n sum(rate(\n tikv_raftstore_store_wf_commit_not_persist_log_duration_seconds_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (le) \n \n \n)) ", + "refId": "", + "step": 10, + "target": "" }, { - "exemplar": true, - "expr": "histogram_quantile(0.99, sum(rate(tikv_raftstore_store_wf_commit_not_persist_log_duration_seconds_bucket{instance=~\"$instance\"}[30s])) by (le))", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "histogram_quantile(0.99,(\n sum(rate(\n tikv_raftstore_store_wf_commit_not_persist_log_duration_seconds_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (le) \n \n \n)) ", + "format": "time_series", "hide": false, + "instant": false, "interval": "", + "intervalFactor": 1, "legendFormat": "99%", - "refId": "B" + "metric": "", + "query": "histogram_quantile(0.99,(\n sum(rate(\n tikv_raftstore_store_wf_commit_not_persist_log_duration_seconds_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (le) \n \n \n)) ", + "refId": "", + "step": 10, + "target": "" }, { - "exemplar": true, - "expr": "sum(rate(tikv_raftstore_store_wf_commit_not_persist_log_duration_seconds_sum{instance=~\"$instance\"}[30s])) / sum(rate(tikv_raftstore_store_wf_commit_not_persist_log_duration_seconds_count{instance=~\"$instance\"}[30s]))", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "(sum(rate(\n tikv_raftstore_store_wf_commit_not_persist_log_duration_seconds_sum\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) / sum(rate(\n tikv_raftstore_store_wf_commit_not_persist_log_duration_seconds_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) )", + "format": "time_series", "hide": false, + "instant": false, 
"interval": "", + "intervalFactor": 1, "legendFormat": "avg", - "refId": "C" + "metric": "", + "query": "(sum(rate(\n tikv_raftstore_store_wf_commit_not_persist_log_duration_seconds_sum\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) / sum(rate(\n tikv_raftstore_store_wf_commit_not_persist_log_duration_seconds_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) )", + "refId": "", + "step": 10, + "target": "" }, { - "exemplar": true, - "expr": "sum(rate(tikv_raftstore_store_wf_commit_not_persist_log_duration_seconds_count{instance=~\"$instance\"}[30s]))", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_raftstore_store_wf_commit_not_persist_log_duration_seconds_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) ", + "format": "time_series", "hide": false, "instant": false, "interval": "", + "intervalFactor": 1, "legendFormat": "count", - "refId": "D" + "metric": "", + "query": "sum(rate(\n tikv_raftstore_store_wf_commit_not_persist_log_duration_seconds_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) ", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, "title": "Store commit but not persist duration", "tooltip": { + "msResolution": true, "shared": true, "sort": 0, "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -11718,6 +14945,7 @@ }, "yaxes": [ { + "decimals": null, "format": "s", "label": null, "logBase": 1, @@ -11726,6 +14954,7 @@ "show": true }, { + "decimals": null, "format": "short", "label": null, "logBase": 1, @@ -11736,60 +14965,86 @@ ], "yaxis": { "align": false, - "alignLevel": 
null + "alignLevel": 0 } }, { - "aliasColors": { - "99%": "#eab839", - "999%": "dark-red", - "count": "rgb(33, 250, 2)" - }, + "aliasColors": {}, "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", "description": "The commit and persist duration of each request", + "editable": true, + "error": false, "fieldConfig": { - "defaults": {}, - "overrides": [] + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } + }, + "fill": 1, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" }, - "fill": 2, - "fillGradient": 0, "gridPos": { - "h": 8, + "h": 7, "w": 12, "x": 12, - "y": 59 + "y": 42 }, - "hiddenSeries": false, - "id": 13275, + "height": null, + "hideTimeOverride": false, + "id": 109, + "interval": null, + "isNew": true, "legend": { + "alignAsTable": true, "avg": false, - "current": false, - "max": false, + "current": true, + "hideEmpty": true, + "hideZero": true, + "max": true, "min": false, + "rightSide": true, "show": true, + "sideWidth": null, + "sort": "max", + "sortDesc": true, "total": false, - "values": false + "values": true }, "lines": true, "linewidth": 1, - "nullPointMode": "null", + "links": [], + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", "options": { - "alertThreshold": true + "alertThreshold": true, + "dataLinks": [] }, "percentage": false, - "pluginVersion": "7.5.11", - "pointradius": 2, + "pointradius": 5, "points": false, "renderer": "flot", + "repeat": null, + "repeatDirection": null, "seriesOverrides": [ { "alias": "count", + "bars": false, "dashLength": 1, "dashes": true, "fill": 2, + "fillBelowTo": null, + "lines": true, "spaceLength": 1, "transform": "negative-Y", "yaxis": 2, @@ -11797,60 +15052,93 @@ }, { "alias": "avg", - "fill": 7 + "bars": false, + "fill": 7, + "fillBelowTo": null, 
+ "lines": true, + "yaxis": 1, + "zindex": 0 } ], - "spaceLength": 10, + "span": null, "stack": false, "steppedLine": false, "targets": [ { - "exemplar": true, - "expr": "histogram_quantile(0.999, sum(rate(tikv_raftstore_store_wf_commit_log_duration_seconds_bucket{instance=~\"$instance\"}[30s])) by (le))", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "histogram_quantile(0.9999,(\n sum(rate(\n tikv_raftstore_store_wf_commit_log_duration_seconds_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (le) \n \n \n)) ", + "format": "time_series", "hide": false, + "instant": false, "interval": "", - "legendFormat": "999%", - "refId": "A" + "intervalFactor": 1, + "legendFormat": "99.99%", + "metric": "", + "query": "histogram_quantile(0.9999,(\n sum(rate(\n tikv_raftstore_store_wf_commit_log_duration_seconds_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (le) \n \n \n)) ", + "refId": "", + "step": 10, + "target": "" }, { - "exemplar": true, - "expr": "histogram_quantile(0.99, sum(rate(tikv_raftstore_store_wf_commit_log_duration_seconds_bucket{instance=~\"$instance\"}[30s])) by (le))", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "histogram_quantile(0.99,(\n sum(rate(\n tikv_raftstore_store_wf_commit_log_duration_seconds_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (le) \n \n \n)) ", + "format": "time_series", "hide": false, + "instant": false, "interval": "", + "intervalFactor": 1, "legendFormat": "99%", - "refId": "B" + "metric": "", + "query": "histogram_quantile(0.99,(\n sum(rate(\n tikv_raftstore_store_wf_commit_log_duration_seconds_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (le) \n \n \n)) ", + "refId": "", + "step": 10, + "target": "" }, { - "exemplar": true, - 
"expr": "sum(rate(tikv_raftstore_store_wf_commit_log_duration_seconds_sum{instance=~\"$instance\"}[30s])) / sum(rate(tikv_raftstore_store_wf_commit_log_duration_seconds_count{instance=~\"$instance\"}[30s]))", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "(sum(rate(\n tikv_raftstore_store_wf_commit_log_duration_seconds_sum\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) / sum(rate(\n tikv_raftstore_store_wf_commit_log_duration_seconds_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) )", + "format": "time_series", "hide": false, + "instant": false, "interval": "", + "intervalFactor": 1, "legendFormat": "avg", - "refId": "C" + "metric": "", + "query": "(sum(rate(\n tikv_raftstore_store_wf_commit_log_duration_seconds_sum\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) / sum(rate(\n tikv_raftstore_store_wf_commit_log_duration_seconds_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) )", + "refId": "", + "step": 10, + "target": "" }, { - "exemplar": true, - "expr": "sum(rate(tikv_raftstore_store_wf_commit_log_duration_seconds_count{instance=~\"$instance\"}[30s]))", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_raftstore_store_wf_commit_log_duration_seconds_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) ", + "format": "time_series", "hide": false, "instant": false, "interval": "", + "intervalFactor": 1, "legendFormat": "count", - "refId": "D" + "metric": "", + "query": "sum(rate(\n tikv_raftstore_store_wf_commit_log_duration_seconds_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) ", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": 
[], "timeFrom": null, - "timeRegions": [], "timeShift": null, "title": "Store commit and persist duration", "tooltip": { + "msResolution": true, "shared": true, "sort": 0, "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -11858,6 +15146,7 @@ }, "yaxes": [ { + "decimals": null, "format": "s", "label": null, "logBase": 1, @@ -11866,6 +15155,7 @@ "show": true }, { + "decimals": null, "format": "short", "label": null, "logBase": 1, @@ -11876,25 +15166,53 @@ ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } } ], + "repeat": null, + "repeatDirection": null, + "span": null, + "targets": [], + "timeFrom": null, + "timeShift": null, "title": "Raft Waterfall", + "transformations": [], + "transparent": false, "type": "row" }, { + "cacheTimeout": null, "collapsed": true, "datasource": null, + "description": null, + "editable": true, + "error": false, + "fieldConfig": { + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } + }, "gridPos": { - "h": 1, + "h": 7, "w": 24, "x": 0, - "y": 10 + "y": 0 }, - "id": 2748, + "height": null, + "hideTimeOverride": false, + "id": 110, + "interval": null, + "links": [], + "maxDataPoints": 100, + "maxPerRow": null, + "minSpan": null, "panels": [ { + "cacheTimeout": null, "cards": { "cardPadding": null, "cardRound": null @@ -11904,61 +15222,94 @@ "colorScale": "sqrt", "colorScheme": "interpolateSpectral", "exponent": 0.5, + "max": null, + "min": null, "mode": "spectrum" }, "dataFormat": "tsbuckets", "datasource": "${DS_TEST-CLUSTER}", "description": "The time consumed for peer processes to be ready in Raft", + "editable": true, + "error": false, "fieldConfig": { - "defaults": {}, - "overrides": [] + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } }, "gridPos": { "h": 7, "w": 12, "x": 0, - "y": 10 + "y": 0 }, "heatmap": {}, + "height": null, + 
"hideTimeOverride": false, "hideZeroBuckets": true, "highlightCards": true, - "id": 13279, + "id": 111, + "interval": null, "legend": { "show": false }, "links": [], + "maxDataPoints": 512, + "maxPerRow": null, + "minSpan": null, + "options": {}, + "repeat": null, + "repeatDirection": null, "reverseYBuckets": false, + "span": null, "targets": [ { - "expr": "sum(delta(tikv_raftstore_raft_process_duration_secs_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", type='ready'}[1m])) by (le)", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_raftstore_raft_process_duration_secs_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",type=\"ready\"}\n [$__rate_interval]\n)) by (le) ", "format": "heatmap", - "intervalFactor": 2, + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, "legendFormat": "{{le}}", - "refId": "C", - "step": 4 + "metric": "", + "query": "sum(rate(\n tikv_raftstore_raft_process_duration_secs_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",type=\"ready\"}\n [$__rate_interval]\n)) by (le) ", + "refId": "", + "step": 10, + "target": "" } ], "timeFrom": null, "timeShift": null, "title": "Process ready duration", "tooltip": { - "show": true, - "showHistogram": false + "msResolution": true, + "shared": true, + "showHistogram": true, + "sort": 0, + "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "heatmap", "xAxis": { - "show": true + "mode": "time", + "name": null, + "show": true, + "values": [] }, "xBucketNumber": null, "xBucketSize": null, "yAxis": { - "decimals": 0, + "decimals": 1, "format": "s", + "label": null, "logBase": 1, "max": null, "min": null, - "show": true, - "splitFactor": null + "show": true }, "yBucketBound": "upper", "yBucketNumber": null, @@ -11967,32 +15318,44 @@ { "aliasColors": {}, "bars": false, - "dashLength": 10, - "dashes": 
false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", - "decimals": 1, "description": "The time consumed for peer processes to be ready in Raft", "editable": true, "error": false, "fieldConfig": { - "defaults": {}, - "overrides": [] + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } + }, + "fill": 1, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" }, - "fill": 0, - "fillGradient": 0, - "grid": {}, "gridPos": { "h": 7, "w": 12, "x": 12, - "y": 10 + "y": 0 }, - "hiddenSeries": false, - "id": 13281, + "height": null, + "hideTimeOverride": false, + "id": 112, + "interval": null, + "isNew": true, "legend": { "alignAsTable": true, "avg": false, "current": true, + "hideEmpty": true, + "hideZero": true, "max": true, "min": false, "rightSide": true, @@ -12006,54 +15369,123 @@ "lines": true, "linewidth": 1, "links": [], - "nullPointMode": "null", + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", "options": { - "alertThreshold": true + "alertThreshold": true, + "dataLinks": [] }, "percentage": false, - "pluginVersion": "7.5.7", "pointradius": 5, "points": false, "renderer": "flot", - "seriesOverrides": [], - "spaceLength": 10, + "repeat": null, + "repeatDirection": null, + "seriesOverrides": [ + { + "alias": "count", + "bars": false, + "dashLength": 1, + "dashes": true, + "fill": 2, + "fillBelowTo": null, + "lines": true, + "spaceLength": 1, + "transform": "negative-Y", + "yaxis": 2, + "zindex": -3 + }, + { + "alias": "avg", + "bars": false, + "fill": 7, + "fillBelowTo": null, + "lines": true, + "yaxis": 1, + "zindex": 0 + } + ], + "span": null, "stack": false, "steppedLine": false, "targets": [ { - "exemplar": true, - "expr": "histogram_quantile(0.99, sum(rate(tikv_raftstore_raft_process_duration_secs_bucket{k8s_cluster=\"$k8s_cluster\", 
tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", type='ready'}[1m])) by (le, instance))", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "histogram_quantile(0.9999,(\n sum(rate(\n tikv_raftstore_raft_process_duration_secs_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",type=\"ready\"}\n [$__rate_interval]\n)) by (le) \n \n \n)) ", "format": "time_series", + "hide": false, + "instant": false, "interval": "", - "intervalFactor": 2, - "legendFormat": "{{instance}}", - "refId": "C", - "step": 4 - } - ], - "thresholds": [ + "intervalFactor": 1, + "legendFormat": "99.99%", + "metric": "", + "query": "histogram_quantile(0.9999,(\n sum(rate(\n tikv_raftstore_raft_process_duration_secs_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",type=\"ready\"}\n [$__rate_interval]\n)) by (le) \n \n \n)) ", + "refId": "", + "step": 10, + "target": "" + }, { - "colorMode": "critical", - "fill": true, - "line": true, - "op": "gt", - "value": 1, - "yaxis": "left" + "datasource": "${DS_TEST-CLUSTER}", + "expr": "histogram_quantile(0.99,(\n sum(rate(\n tikv_raftstore_raft_process_duration_secs_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",type=\"ready\"}\n [$__rate_interval]\n)) by (le) \n \n \n)) ", + "format": "time_series", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "99%", + "metric": "", + "query": "histogram_quantile(0.99,(\n sum(rate(\n tikv_raftstore_raft_process_duration_secs_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",type=\"ready\"}\n [$__rate_interval]\n)) by (le) \n \n \n)) ", + "refId": "", + "step": 10, + "target": "" + }, + { + "datasource": "${DS_TEST-CLUSTER}", + "expr": "(sum(rate(\n tikv_raftstore_raft_process_duration_secs_sum\n 
{k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",type=\"ready\"}\n [$__rate_interval]\n)) / sum(rate(\n tikv_raftstore_raft_process_duration_secs_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",type=\"ready\"}\n [$__rate_interval]\n)) )", + "format": "time_series", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "avg", + "metric": "", + "query": "(sum(rate(\n tikv_raftstore_raft_process_duration_secs_sum\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",type=\"ready\"}\n [$__rate_interval]\n)) / sum(rate(\n tikv_raftstore_raft_process_duration_secs_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",type=\"ready\"}\n [$__rate_interval]\n)) )", + "refId": "", + "step": 10, + "target": "" + }, + { + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_raftstore_raft_process_duration_secs_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",type=\"ready\"}\n [$__rate_interval]\n)) ", + "format": "time_series", + "hide": true, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "count", + "metric": "", + "query": "sum(rate(\n tikv_raftstore_raft_process_duration_secs_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",type=\"ready\"}\n [$__rate_interval]\n)) ", + "refId": "", + "step": 10, + "target": "" } ], + "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, "title": "99% Process ready duration per server", "tooltip": { - "msResolution": false, + "msResolution": true, "shared": true, "sort": 0, - "value_type": "cumulative" + "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -12061,14 +15493,16 @@ }, 
"yaxes": [ { + "decimals": null, "format": "s", "label": null, "logBase": 1, "max": null, - "min": "0", + "min": null, "show": true }, { + "decimals": null, "format": "short", "label": null, "logBase": 1, @@ -12079,10 +15513,11 @@ ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } }, { + "cacheTimeout": null, "cards": { "cardPadding": null, "cardRound": null @@ -12092,65 +15527,94 @@ "colorScale": "sqrt", "colorScheme": "interpolateSpectral", "exponent": 0.5, + "max": null, + "min": null, "mode": "spectrum" }, - "dashes": false, "dataFormat": "tsbuckets", "datasource": "${DS_TEST-CLUSTER}", "description": "The time duration of store write loop when store-io-pool-size is not zero.", + "editable": true, + "error": false, "fieldConfig": { - "defaults": {}, - "overrides": [] + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } }, "gridPos": { "h": 7, "w": 12, "x": 0, - "y": 17 + "y": 7 }, "heatmap": {}, + "height": null, + "hideTimeOverride": false, "hideZeroBuckets": true, "highlightCards": true, - "id": 13283, + "id": 113, + "interval": null, "legend": { "show": false }, "links": [], + "maxDataPoints": 512, + "maxPerRow": null, + "minSpan": null, + "options": {}, + "repeat": null, + "repeatDirection": null, "reverseYBuckets": false, + "span": null, "targets": [ { - "exemplar": true, - "expr": "sum(delta(tikv_raftstore_store_write_loop_duration_seconds_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])) by (le)", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_raftstore_store_write_loop_duration_seconds_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (le) ", "format": "heatmap", + "hide": false, + "instant": false, "interval": "", - "intervalFactor": 2, + "intervalFactor": 1, "legendFormat": "{{le}}", "metric": "", - "refId": "A", - "step": 4 + "query": "sum(rate(\n 
tikv_raftstore_store_write_loop_duration_seconds_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (le) ", + "refId": "", + "step": 10, + "target": "" } ], "timeFrom": null, "timeShift": null, "title": "Store write loop duration", "tooltip": { - "show": true, - "showHistogram": false + "msResolution": true, + "shared": true, + "showHistogram": true, + "sort": 0, + "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "heatmap", "xAxis": { - "show": true + "mode": "time", + "name": null, + "show": true, + "values": [] }, "xBucketNumber": null, "xBucketSize": null, "yAxis": { - "decimals": 0, + "decimals": 1, "format": "s", + "label": null, "logBase": 1, "max": null, "min": null, - "show": true, - "splitFactor": null + "show": true }, "yBucketBound": "upper", "yBucketNumber": null, @@ -12159,36 +15623,50 @@ { "aliasColors": {}, "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", "description": "The time duration of store write loop on each TiKV instance when store-io-pool-size is not zero.", "editable": true, "error": false, "fieldConfig": { - "defaults": {}, - "overrides": [] + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } }, "fill": 1, - "fillGradient": 0, - "grid": {}, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, "gridPos": { "h": 7, "w": 12, "x": 12, - "y": 17 + "y": 7 }, - "hiddenSeries": false, - "id": 13285, + "height": null, + "hideTimeOverride": false, + "id": 114, + "interval": null, + "isNew": true, "legend": { "alignAsTable": true, "avg": false, "current": true, + "hideEmpty": true, + "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, - "sort": "current", + "sideWidth": null, + "sort": "max", 
"sortDesc": true, "total": false, "values": true @@ -12196,45 +15674,123 @@ "lines": true, "linewidth": 1, "links": [], - "nullPointMode": "null", + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", "options": { - "alertThreshold": true + "alertThreshold": true, + "dataLinks": [] }, "percentage": false, - "pluginVersion": "7.5.7", "pointradius": 5, "points": false, "renderer": "flot", - "seriesOverrides": [], - "spaceLength": 10, + "repeat": null, + "repeatDirection": null, + "seriesOverrides": [ + { + "alias": "count", + "bars": false, + "dashLength": 1, + "dashes": true, + "fill": 2, + "fillBelowTo": null, + "lines": true, + "spaceLength": 1, + "transform": "negative-Y", + "yaxis": 2, + "zindex": -3 + }, + { + "alias": "avg", + "bars": false, + "fill": 7, + "fillBelowTo": null, + "lines": true, + "yaxis": 1, + "zindex": 0 + } + ], + "span": null, "stack": false, "steppedLine": false, "targets": [ { - "exemplar": true, - "expr": "histogram_quantile(0.99, sum(rate(tikv_raftstore_store_write_loop_duration_seconds_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])) by (le, instance))", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "histogram_quantile(0.9999,(\n sum(rate(\n tikv_raftstore_store_write_loop_duration_seconds_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (le) \n \n \n)) ", + "format": "time_series", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "99.99%", + "metric": "", + "query": "histogram_quantile(0.9999,(\n sum(rate(\n tikv_raftstore_store_write_loop_duration_seconds_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (le) \n \n \n)) ", + "refId": "", + "step": 10, + "target": "" + }, + { + "datasource": "${DS_TEST-CLUSTER}", + "expr": 
"histogram_quantile(0.99,(\n sum(rate(\n tikv_raftstore_store_write_loop_duration_seconds_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (le) \n \n \n)) ", + "format": "time_series", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "99%", + "metric": "", + "query": "histogram_quantile(0.99,(\n sum(rate(\n tikv_raftstore_store_write_loop_duration_seconds_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (le) \n \n \n)) ", + "refId": "", + "step": 10, + "target": "" + }, + { + "datasource": "${DS_TEST-CLUSTER}", + "expr": "(sum(rate(\n tikv_raftstore_store_write_loop_duration_seconds_sum\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) / sum(rate(\n tikv_raftstore_store_write_loop_duration_seconds_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) )", + "format": "time_series", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "avg", + "metric": "", + "query": "(sum(rate(\n tikv_raftstore_store_write_loop_duration_seconds_sum\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) / sum(rate(\n tikv_raftstore_store_write_loop_duration_seconds_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) )", + "refId": "", + "step": 10, + "target": "" + }, + { + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_raftstore_store_write_loop_duration_seconds_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) ", "format": "time_series", + "hide": true, + "instant": false, "interval": "", - "intervalFactor": 2, - 
"legendFormat": "{{instance}} ", - "refId": "A", - "step": 4 + "intervalFactor": 1, + "legendFormat": "count", + "metric": "", + "query": "sum(rate(\n tikv_raftstore_store_write_loop_duration_seconds_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) ", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, "title": "99% Store write loop duration per server", "tooltip": { - "msResolution": false, + "msResolution": true, "shared": true, "sort": 0, - "value_type": "cumulative" + "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -12242,6 +15798,7 @@ }, "yaxes": [ { + "decimals": null, "format": "s", "label": null, "logBase": 1, @@ -12250,6 +15807,7 @@ "show": true }, { + "decimals": null, "format": "short", "label": null, "logBase": 1, @@ -12260,10 +15818,11 @@ ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } }, { + "cacheTimeout": null, "cards": { "cardPadding": null, "cardRound": null @@ -12273,62 +15832,94 @@ "colorScale": "sqrt", "colorScheme": "interpolateSpectral", "exponent": 0.5, + "max": null, + "min": null, "mode": "spectrum" }, "dataFormat": "tsbuckets", "datasource": "${DS_TEST-CLUSTER}", "description": "The time consumed when Raft appends log", + "editable": true, + "error": false, "fieldConfig": { - "defaults": {}, - "overrides": [] + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } }, "gridPos": { "h": 7, "w": 12, "x": 0, - "y": 24 + "y": 14 }, "heatmap": {}, + "height": null, + "hideTimeOverride": false, "hideZeroBuckets": true, "highlightCards": true, - "id": 39, + "id": 115, + "interval": null, "legend": { "show": false }, "links": [], + "maxDataPoints": 512, + "maxPerRow": null, + "minSpan": null, + "options": {}, + "repeat": null, + 
"repeatDirection": null, "reverseYBuckets": false, + "span": null, "targets": [ { - "expr": "sum(delta(tikv_raftstore_append_log_duration_seconds_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])) by (le)", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_raftstore_append_log_duration_seconds_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (le) ", "format": "heatmap", - "intervalFactor": 2, + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, "legendFormat": "{{le}}", "metric": "", - "refId": "A", - "step": 4 + "query": "sum(rate(\n tikv_raftstore_append_log_duration_seconds_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (le) ", + "refId": "", + "step": 10, + "target": "" } ], "timeFrom": null, "timeShift": null, "title": "Append log duration", "tooltip": { - "show": true, - "showHistogram": false + "msResolution": true, + "shared": true, + "showHistogram": true, + "sort": 0, + "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "heatmap", "xAxis": { - "show": true + "mode": "time", + "name": null, + "show": true, + "values": [] }, "xBucketNumber": null, "xBucketSize": null, "yAxis": { - "decimals": 0, + "decimals": 1, "format": "s", + "label": null, "logBase": 1, "max": null, "min": null, - "show": true, - "splitFactor": null + "show": true }, "yBucketBound": "upper", "yBucketNumber": null, @@ -12337,36 +15928,50 @@ { "aliasColors": {}, "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", - "description": "The time consumed when Raft appends log on each TiKV instance", + "description": "The time consumed when Raft commits log on each TiKV instance", "editable": true, "error": false, "fieldConfig": { - "defaults": {}, - "overrides": 
[] + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } }, "fill": 1, - "fillGradient": 0, - "grid": {}, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, "gridPos": { "h": 7, "w": 12, "x": 12, - "y": 24 + "y": 14 }, - "hiddenSeries": false, - "id": 13376, + "height": null, + "hideTimeOverride": false, + "id": 116, + "interval": null, + "isNew": true, "legend": { "alignAsTable": true, "avg": false, "current": true, + "hideEmpty": true, + "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, - "sort": "current", + "sideWidth": null, + "sort": "max", "sortDesc": true, "total": false, "values": true @@ -12374,78 +15979,123 @@ "lines": true, "linewidth": 1, "links": [], - "nullPointMode": "null", + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", "options": { - "alertThreshold": true + "alertThreshold": true, + "dataLinks": [] }, "percentage": false, - "pluginVersion": "7.5.7", "pointradius": 5, "points": false, "renderer": "flot", - "seriesOverrides": [], - "spaceLength": 10, + "repeat": null, + "repeatDirection": null, + "seriesOverrides": [ + { + "alias": "count", + "bars": false, + "dashLength": 1, + "dashes": true, + "fill": 2, + "fillBelowTo": null, + "lines": true, + "spaceLength": 1, + "transform": "negative-Y", + "yaxis": 2, + "zindex": -3 + }, + { + "alias": "avg", + "bars": false, + "fill": 7, + "fillBelowTo": null, + "lines": true, + "yaxis": 1, + "zindex": 0 + } + ], + "span": null, "stack": false, "steppedLine": false, "targets": [ { - "exemplar": true, - "expr": "histogram_quantile(0.99, sum(rate(tikv_raftstore_append_log_duration_seconds_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])) by (le, instance))", + "datasource": "${DS_TEST-CLUSTER}", + "expr": 
"histogram_quantile(0.9999,(\n sum(rate(\n tikv_raftstore_append_log_duration_seconds_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (le) \n \n \n)) ", "format": "time_series", "hide": false, + "instant": false, "interval": "", - "intervalFactor": 2, - "legendFormat": "{{instance}} ", - "refId": "A", - "step": 4 - }, - { - "exemplar": true, - "expr": "histogram_quantile(0.99, sum(rate(tikv_raftstore_store_write_kvdb_duration_seconds_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])) by (le, instance))", - "hide": true, - "interval": "", - "legendFormat": "kvdb-{{instance}}", - "refId": "B" + "intervalFactor": 1, + "legendFormat": "99.99%", + "metric": "", + "query": "histogram_quantile(0.9999,(\n sum(rate(\n tikv_raftstore_append_log_duration_seconds_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (le) \n \n \n)) ", + "refId": "", + "step": 10, + "target": "" }, { - "exemplar": true, - "expr": "histogram_quantile(0.99, sum(rate(tikv_raftstore_store_write_raftdb_duration_seconds_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])) by (le, instance))", - "hide": true, + "datasource": "${DS_TEST-CLUSTER}", + "expr": "histogram_quantile(0.99,(\n sum(rate(\n tikv_raftstore_append_log_duration_seconds_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (le) \n \n \n)) ", + "format": "time_series", + "hide": false, + "instant": false, "interval": "", - "legendFormat": "raftdb-{{instance}}", - "refId": "C" + "intervalFactor": 1, + "legendFormat": "99%", + "metric": "", + "query": "histogram_quantile(0.99,(\n sum(rate(\n tikv_raftstore_append_log_duration_seconds_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n 
[$__rate_interval]\n)) by (le) \n \n \n)) ", + "refId": "", + "step": 10, + "target": "" }, { - "exemplar": true, - "expr": "histogram_quantile(0.99, sum(rate(tikv_raftstore_store_write_send_duration_seconds_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])) by (le, instance))", - "hide": true, + "datasource": "${DS_TEST-CLUSTER}", + "expr": "(sum(rate(\n tikv_raftstore_append_log_duration_seconds_sum\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) / sum(rate(\n tikv_raftstore_append_log_duration_seconds_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) )", + "format": "time_series", + "hide": false, + "instant": false, "interval": "", - "legendFormat": "sendmsg-{{instance}}", - "refId": "D" + "intervalFactor": 1, + "legendFormat": "avg", + "metric": "", + "query": "(sum(rate(\n tikv_raftstore_append_log_duration_seconds_sum\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) / sum(rate(\n tikv_raftstore_append_log_duration_seconds_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) )", + "refId": "", + "step": 10, + "target": "" }, { - "exemplar": true, - "expr": "histogram_quantile(0.99, sum(rate(tikv_raftstore_store_write_callback_duration_seconds_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])) by (le, instance))", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_raftstore_append_log_duration_seconds_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) ", + "format": "time_series", "hide": true, + "instant": false, "interval": "", - "legendFormat": "callback-{{instance}}", - "refId": "E" + "intervalFactor": 1, + 
"legendFormat": "count", + "metric": "", + "query": "sum(rate(\n tikv_raftstore_append_log_duration_seconds_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) ", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, - "title": "99% Append log duration per server", + "title": "99% Commit log duration per server", "tooltip": { - "msResolution": false, + "msResolution": true, "shared": true, "sort": 0, - "value_type": "cumulative" + "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -12453,6 +16103,7 @@ }, "yaxes": [ { + "decimals": null, "format": "s", "label": null, "logBase": 1, @@ -12461,6 +16112,7 @@ "show": true }, { + "decimals": null, "format": "short", "label": null, "logBase": 1, @@ -12471,10 +16123,11 @@ ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } }, { + "cacheTimeout": null, "cards": { "cardPadding": null, "cardRound": null @@ -12484,135 +16137,270 @@ "colorScale": "sqrt", "colorScheme": "interpolateSpectral", "exponent": 0.5, + "max": null, + "min": null, "mode": "spectrum" }, "dataFormat": "tsbuckets", "datasource": "${DS_TEST-CLUSTER}", "description": "The time consumed when Raft commits log", + "editable": true, + "error": false, "fieldConfig": { - "defaults": {}, - "overrides": [] + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } }, "gridPos": { "h": 7, "w": 12, "x": 0, - "y": 31 + "y": 21 }, "heatmap": {}, + "height": null, + "hideTimeOverride": false, "hideZeroBuckets": true, "highlightCards": true, - "id": 3690, + "id": 117, + "interval": null, "legend": { "show": false }, "links": [], + "maxDataPoints": 512, + "maxPerRow": null, + "minSpan": null, + "options": {}, + "repeat": null, + "repeatDirection": null, "reverseYBuckets": false, + 
"span": null, "targets": [ { - "expr": "sum(delta(tikv_raftstore_commit_log_duration_seconds_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])) by (le)", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_raftstore_commit_log_duration_seconds_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (le) ", "format": "heatmap", - "intervalFactor": 2, + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, "legendFormat": "{{le}}", - "refId": "A" + "metric": "", + "query": "sum(rate(\n tikv_raftstore_commit_log_duration_seconds_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (le) ", + "refId": "", + "step": 10, + "target": "" } ], "timeFrom": null, "timeShift": null, "title": "Commit log duration", "tooltip": { - "show": true, - "showHistogram": false + "msResolution": true, + "shared": true, + "showHistogram": true, + "sort": 0, + "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "heatmap", "xAxis": { - "show": true + "mode": "time", + "name": null, + "show": true, + "values": [] }, "xBucketNumber": null, "xBucketSize": null, "yAxis": { - "decimals": 0, + "decimals": 1, "format": "s", + "label": null, "logBase": 1, "max": null, "min": null, - "show": true, - "splitFactor": null + "show": true }, - "yBucketBound": "auto", + "yBucketBound": "upper", "yBucketNumber": null, "yBucketSize": null }, { "aliasColors": {}, "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", "description": "The time consumed when Raft commits log on each TiKV instance", + "editable": true, + "error": false, "fieldConfig": { - "defaults": {}, - "overrides": [] - }, - "fill": 1, - "fillGradient": 0, + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } 
+ }, + "fill": 1, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, "gridPos": { "h": 7, "w": 12, "x": 12, - "y": 31 + "y": 21 }, - "hiddenSeries": false, - "id": 3688, + "height": null, + "hideTimeOverride": false, + "id": 118, + "interval": null, + "isNew": true, "legend": { "alignAsTable": true, "avg": false, "current": true, + "hideEmpty": true, + "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, + "sideWidth": null, + "sort": "max", + "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], - "nullPointMode": "null", + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", "options": { - "alertThreshold": true + "alertThreshold": true, + "dataLinks": [] }, "percentage": false, - "pluginVersion": "7.5.7", - "pointradius": 2, + "pointradius": 5, "points": false, "renderer": "flot", - "seriesOverrides": [], - "spaceLength": 10, + "repeat": null, + "repeatDirection": null, + "seriesOverrides": [ + { + "alias": "count", + "bars": false, + "dashLength": 1, + "dashes": true, + "fill": 2, + "fillBelowTo": null, + "lines": true, + "spaceLength": 1, + "transform": "negative-Y", + "yaxis": 2, + "zindex": -3 + }, + { + "alias": "avg", + "bars": false, + "fill": 7, + "fillBelowTo": null, + "lines": true, + "yaxis": 1, + "zindex": 0 + } + ], + "span": null, "stack": false, "steppedLine": false, "targets": [ { - "expr": "histogram_quantile(0.99, sum(rate(tikv_raftstore_commit_log_duration_seconds_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])) by (le, instance))", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "histogram_quantile(0.9999,(\n sum(rate(\n tikv_raftstore_commit_log_duration_seconds_bucket\n 
{k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (le) \n \n \n)) ", "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{instance}}", - "refId": "A" + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "99.99%", + "metric": "", + "query": "histogram_quantile(0.9999,(\n sum(rate(\n tikv_raftstore_commit_log_duration_seconds_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (le) \n \n \n)) ", + "refId": "", + "step": 10, + "target": "" + }, + { + "datasource": "${DS_TEST-CLUSTER}", + "expr": "histogram_quantile(0.99,(\n sum(rate(\n tikv_raftstore_commit_log_duration_seconds_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (le) \n \n \n)) ", + "format": "time_series", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "99%", + "metric": "", + "query": "histogram_quantile(0.99,(\n sum(rate(\n tikv_raftstore_commit_log_duration_seconds_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (le) \n \n \n)) ", + "refId": "", + "step": 10, + "target": "" + }, + { + "datasource": "${DS_TEST-CLUSTER}", + "expr": "(sum(rate(\n tikv_raftstore_commit_log_duration_seconds_sum\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) / sum(rate(\n tikv_raftstore_commit_log_duration_seconds_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) )", + "format": "time_series", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "avg", + "metric": "", + "query": "(sum(rate(\n tikv_raftstore_commit_log_duration_seconds_sum\n 
{k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) / sum(rate(\n tikv_raftstore_commit_log_duration_seconds_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) )", + "refId": "", + "step": 10, + "target": "" + }, + { + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_raftstore_commit_log_duration_seconds_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) ", + "format": "time_series", + "hide": true, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "count", + "metric": "", + "query": "sum(rate(\n tikv_raftstore_commit_log_duration_seconds_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) ", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, "title": "99% Commit log duration per server", "tooltip": { + "msResolution": true, "shared": true, "sort": 0, "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -12620,6 +16408,7 @@ }, "yaxes": [ { + "decimals": null, "format": "s", "label": null, "logBase": 1, @@ -12628,6 +16417,7 @@ "show": true }, { + "decimals": null, "format": "short", "label": null, "logBase": 1, @@ -12638,10 +16428,11 @@ ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } }, { + "cacheTimeout": null, "cards": { "cardPadding": null, "cardRound": null @@ -12651,62 +16442,94 @@ "colorScale": "sqrt", "colorScheme": "interpolateSpectral", "exponent": 0.5, + "max": null, + "min": null, "mode": "spectrum" }, "dataFormat": "tsbuckets", "datasource": "${DS_TEST-CLUSTER}", "description": "The time consumed when Raft applies log", + "editable": true, + 
"error": false, "fieldConfig": { - "defaults": {}, - "overrides": [] + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } }, "gridPos": { "h": 7, "w": 12, "x": 0, - "y": 38 + "y": 28 }, "heatmap": {}, + "height": null, + "hideTimeOverride": false, "hideZeroBuckets": true, "highlightCards": true, - "id": 31, + "id": 119, + "interval": null, "legend": { "show": false }, "links": [], + "maxDataPoints": 512, + "maxPerRow": null, + "minSpan": null, + "options": {}, + "repeat": null, + "repeatDirection": null, "reverseYBuckets": false, + "span": null, "targets": [ { - "expr": "sum(delta(tikv_raftstore_apply_log_duration_seconds_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])) by (le)", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_raftstore_apply_log_duration_seconds_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (le) ", "format": "heatmap", - "intervalFactor": 2, + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, "legendFormat": "{{le}}", "metric": "", - "refId": "A", - "step": 4 + "query": "sum(rate(\n tikv_raftstore_apply_log_duration_seconds_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (le) ", + "refId": "", + "step": 10, + "target": "" } ], "timeFrom": null, "timeShift": null, "title": "Apply log duration", "tooltip": { - "show": true, - "showHistogram": false + "msResolution": true, + "shared": true, + "showHistogram": true, + "sort": 0, + "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "heatmap", "xAxis": { - "show": true + "mode": "time", + "name": null, + "show": true, + "values": [] }, "xBucketNumber": null, "xBucketSize": null, "yAxis": { - "decimals": 0, + "decimals": 1, "format": "s", + "label": null, "logBase": 1, "max": null, "min": null, - 
"show": true, - "splitFactor": null + "show": true }, "yBucketBound": "upper", "yBucketNumber": null, @@ -12715,36 +16538,50 @@ { "aliasColors": {}, "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", "description": "The time consumed for Raft to apply logs per TiKV instance", "editable": true, "error": false, "fieldConfig": { - "defaults": {}, - "overrides": [] + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } }, "fill": 1, - "fillGradient": 0, - "grid": {}, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, "gridPos": { "h": 7, "w": 12, "x": 12, - "y": 38 + "y": 28 }, - "hiddenSeries": false, - "id": 32, + "height": null, + "hideTimeOverride": false, + "id": 120, + "interval": null, + "isNew": true, "legend": { "alignAsTable": true, "avg": false, "current": true, + "hideEmpty": true, + "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, - "sort": "current", + "sideWidth": null, + "sort": "max", "sortDesc": true, "total": false, "values": true @@ -12752,43 +16589,123 @@ "lines": true, "linewidth": 1, "links": [], - "nullPointMode": "null", + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", "options": { - "alertThreshold": true + "alertThreshold": true, + "dataLinks": [] }, "percentage": false, - "pluginVersion": "7.5.7", "pointradius": 5, "points": false, "renderer": "flot", - "seriesOverrides": [], - "spaceLength": 10, + "repeat": null, + "repeatDirection": null, + "seriesOverrides": [ + { + "alias": "count", + "bars": false, + "dashLength": 1, + "dashes": true, + "fill": 2, + "fillBelowTo": null, + "lines": true, + "spaceLength": 1, + "transform": "negative-Y", + "yaxis": 2, + "zindex": -3 + }, + { + "alias": "avg", + "bars": false, + "fill": 7, + "fillBelowTo": null, + 
"lines": true, + "yaxis": 1, + "zindex": 0 + } + ], + "span": null, "stack": false, "steppedLine": false, "targets": [ { - "expr": "histogram_quantile(0.99, sum(rate(tikv_raftstore_apply_log_duration_seconds_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])) by (le, instance))", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "histogram_quantile(0.9999,(\n sum(rate(\n tikv_raftstore_apply_log_duration_seconds_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (le) \n \n \n)) ", + "format": "time_series", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "99.99%", + "metric": "", + "query": "histogram_quantile(0.9999,(\n sum(rate(\n tikv_raftstore_apply_log_duration_seconds_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (le) \n \n \n)) ", + "refId": "", + "step": 10, + "target": "" + }, + { + "datasource": "${DS_TEST-CLUSTER}", + "expr": "histogram_quantile(0.99,(\n sum(rate(\n tikv_raftstore_apply_log_duration_seconds_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (le) \n \n \n)) ", + "format": "time_series", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "99%", + "metric": "", + "query": "histogram_quantile(0.99,(\n sum(rate(\n tikv_raftstore_apply_log_duration_seconds_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (le) \n \n \n)) ", + "refId": "", + "step": 10, + "target": "" + }, + { + "datasource": "${DS_TEST-CLUSTER}", + "expr": "(sum(rate(\n tikv_raftstore_apply_log_duration_seconds_sum\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) / sum(rate(\n 
tikv_raftstore_apply_log_duration_seconds_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) )", + "format": "time_series", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "avg", + "metric": "", + "query": "(sum(rate(\n tikv_raftstore_apply_log_duration_seconds_sum\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) / sum(rate(\n tikv_raftstore_apply_log_duration_seconds_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) )", + "refId": "", + "step": 10, + "target": "" + }, + { + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_raftstore_apply_log_duration_seconds_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) ", "format": "time_series", - "intervalFactor": 2, - "legendFormat": " {{instance}}", - "refId": "A", - "step": 4 + "hide": true, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "count", + "metric": "", + "query": "sum(rate(\n tikv_raftstore_apply_log_duration_seconds_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) ", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, "title": "99% Apply log duration per server", "tooltip": { - "msResolution": false, + "msResolution": true, "shared": true, "sort": 0, - "value_type": "cumulative" + "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -12796,6 +16713,7 @@ }, "yaxes": [ { + "decimals": null, "format": "s", "label": null, "logBase": 1, @@ -12804,6 +16722,7 @@ "show": true }, { + "decimals": null, 
"format": "short", "label": null, "logBase": 1, @@ -12814,89 +16733,284 @@ ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } }, + { + "cacheTimeout": null, + "cards": { + "cardPadding": null, + "cardRound": null + }, + "color": { + "cardColor": "#b4ff00", + "colorScale": "sqrt", + "colorScheme": "interpolateSpectral", + "exponent": 0.5, + "max": null, + "min": null, + "mode": "spectrum" + }, + "dataFormat": "tsbuckets", + "datasource": "${DS_TEST-CLUSTER}", + "description": "The time consumed for Raft Client wait connection ready", + "editable": true, + "error": false, + "fieldConfig": { + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } + }, + "gridPos": { + "h": 7, + "w": 12, + "x": 0, + "y": 35 + }, + "heatmap": {}, + "height": null, + "hideTimeOverride": false, + "hideZeroBuckets": true, + "highlightCards": true, + "id": 121, + "interval": null, + "legend": { + "show": false + }, + "links": [], + "maxDataPoints": 512, + "maxPerRow": null, + "minSpan": null, + "options": {}, + "repeat": null, + "repeatDirection": null, + "reverseYBuckets": false, + "span": null, + "targets": [ + { + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_server_raft_client_wait_ready_duration_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (le) ", + "format": "heatmap", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{le}}", + "metric": "", + "query": "sum(rate(\n tikv_server_raft_client_wait_ready_duration_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (le) ", + "refId": "", + "step": 10, + "target": "" + } + ], + "timeFrom": null, + "timeShift": null, + "title": "Raft Client Wait Connection Ready Duration", + "tooltip": { + "msResolution": true, + "shared": true, + "showHistogram": true, + "sort": 0, + "value_type": 
"individual" + }, + "transformations": [], + "transparent": false, + "type": "heatmap", + "xAxis": { + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "xBucketNumber": null, + "xBucketSize": null, + "yAxis": { + "decimals": 1, + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + "yBucketBound": "upper", + "yBucketNumber": null, + "yBucketSize": null + }, { "aliasColors": {}, "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", - "description": "", + "description": "The time consumed for Raft Client wait connection ready per TiKV instance", + "editable": true, + "error": false, "fieldConfig": { - "defaults": {}, - "overrides": [] + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } }, "fill": 1, - "fillGradient": 0, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, "gridPos": { "h": 7, "w": 12, - "x": 0, - "y": 45 + "x": 12, + "y": 35 }, - "hiddenSeries": false, - "id": 13382, + "height": null, + "hideTimeOverride": false, + "id": 122, + "interval": null, + "isNew": true, "legend": { "alignAsTable": true, "avg": false, "current": true, + "hideEmpty": true, + "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, + "sideWidth": null, + "sort": "max", + "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], - "nullPointMode": "null", + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", "options": { - "alertThreshold": true + "alertThreshold": true, + "dataLinks": [] }, "percentage": false, - "pluginVersion": "7.5.7", - "pointradius": 2, + "pointradius": 5, "points": false, "renderer": "flot", - "seriesOverrides": [], - "spaceLength": 10, + "repeat": null, + "repeatDirection": 
null, + "seriesOverrides": [ + { + "alias": "count", + "bars": false, + "dashLength": 1, + "dashes": true, + "fill": 2, + "fillBelowTo": null, + "lines": true, + "spaceLength": 1, + "transform": "negative-Y", + "yaxis": 2, + "zindex": -3 + }, + { + "alias": "avg", + "bars": false, + "fill": 7, + "fillBelowTo": null, + "lines": true, + "yaxis": 1, + "zindex": 0 + } + ], + "span": null, "stack": false, "steppedLine": false, "targets": [ { - "exemplar": true, - "expr": "sum(tikv_raftstore_io_reschedule_region_total{instance=~\"$instance\"}) by (instance)", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "histogram_quantile(0.9999,(\n sum(rate(\n tikv_server_raft_client_wait_ready_duration_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (to, le) \n \n \n)) ", "format": "time_series", + "hide": false, + "instant": false, "interval": "", "intervalFactor": 1, - "legendFormat": "rechedule-{{instance}}", - "refId": "A" + "legendFormat": "99.99%-{{to}}", + "metric": "", + "query": "histogram_quantile(0.9999,(\n sum(rate(\n tikv_server_raft_client_wait_ready_duration_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (to, le) \n \n \n)) ", + "refId": "", + "step": 10, + "target": "" }, { - "exemplar": true, - "expr": "sum(tikv_raftstore_io_reschedule_pending_tasks_total{instance=~\"$instance\"}) by (instance)", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "histogram_quantile(0.99,(\n sum(rate(\n tikv_server_raft_client_wait_ready_duration_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (to, le) \n \n \n)) ", + "format": "time_series", "hide": false, + "instant": false, "interval": "", - "legendFormat": "pending-task-{{instance}}", - "refId": "B" + "intervalFactor": 1, + "legendFormat": "99%-{{to}}", + "metric": "", + "query": "histogram_quantile(0.99,(\n 
sum(rate(\n tikv_server_raft_client_wait_ready_duration_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (to, le) \n \n \n)) ", + "refId": "", + "step": 10, + "target": "" + }, + { + "datasource": "${DS_TEST-CLUSTER}", + "expr": "(sum(rate(\n tikv_server_raft_client_wait_ready_duration_sum\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (to) / sum(rate(\n tikv_server_raft_client_wait_ready_duration_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (to) )", + "format": "time_series", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "avg-{{to}}", + "metric": "", + "query": "(sum(rate(\n tikv_server_raft_client_wait_ready_duration_sum\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (to) / sum(rate(\n tikv_server_raft_client_wait_ready_duration_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (to) )", + "refId": "", + "step": 10, + "target": "" + }, + { + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_server_raft_client_wait_ready_duration_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (to) ", + "format": "time_series", + "hide": true, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "count-{{to}}", + "metric": "", + "query": "sum(rate(\n tikv_server_raft_client_wait_ready_duration_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (to) ", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, - "title": "Store io 
task reschedule", + "title": "99% Raft Client Wait Connection Ready Duration", "tooltip": { + "msResolution": true, "shared": true, "sort": 0, "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -12904,7 +17018,8 @@ }, "yaxes": [ { - "format": "short", + "decimals": null, + "format": "s", "label": null, "logBase": 1, "max": null, @@ -12912,6 +17027,7 @@ "show": true }, { + "decimals": null, "format": "short", "label": null, "logBase": 1, @@ -12922,81 +17038,127 @@ ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } }, { "aliasColors": {}, "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", - "description": "The time consumed when store write task block on each TiKV instance", + "description": "The throughput of disk write per IO type", + "editable": true, + "error": false, "fieldConfig": { - "defaults": {}, - "overrides": [] + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } }, "fill": 1, - "fillGradient": 0, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, "gridPos": { "h": 7, "w": 12, - "x": 12, - "y": 45 + "x": 0, + "y": 42 }, - "hiddenSeries": false, - "id": 13380, + "height": null, + "hideTimeOverride": false, + "id": 123, + "interval": null, + "isNew": true, "legend": { "alignAsTable": true, "avg": false, "current": true, + "hideEmpty": true, + "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, + "sideWidth": null, + "sort": "max", + "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], - "nullPointMode": "null", + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", "options": { - 
"alertThreshold": true + "alertThreshold": true, + "dataLinks": [] }, "percentage": false, - "pluginVersion": "7.5.7", - "pointradius": 2, + "pointradius": 5, "points": false, "renderer": "flot", + "repeat": null, + "repeatDirection": null, "seriesOverrides": [], - "spaceLength": 10, + "span": null, "stack": false, "steppedLine": false, "targets": [ { - "exemplar": true, - "expr": "histogram_quantile(0.99, sum(rate(tikv_raftstore_store_write_msg_block_wait_duration_seconds_bucket{instance=~\"$instance\"}[1m])) by (le, instance))", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum((\n tikv_raftstore_io_reschedule_region_total\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n \n)) by (instance) ", "format": "time_series", + "hide": false, + "instant": false, "interval": "", - "intervalFactor": 2, - "legendFormat": "{{instance}}", - "refId": "A" + "intervalFactor": 1, + "legendFormat": "rechedule-{{instance}}", + "metric": "", + "query": "sum((\n tikv_raftstore_io_reschedule_region_total\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n \n)) by (instance) ", + "refId": "", + "step": 10, + "target": "" + }, + { + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum((\n tikv_raftstore_io_reschedule_pending_tasks_total\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n \n)) by (instance) ", + "format": "time_series", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "pending-task-{{instance}}", + "metric": "", + "query": "sum((\n tikv_raftstore_io_reschedule_pending_tasks_total\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n \n)) by (instance) ", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, - "title": "99% Write task block duration per server", + "title": "Store io task reschedule", 
"tooltip": { + "msResolution": true, "shared": true, "sort": 0, "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -13004,7 +17166,8 @@ }, "yaxes": [ { - "format": "s", + "decimals": null, + "format": "none", "label": null, "logBase": 1, "max": null, @@ -13012,6 +17175,7 @@ "show": true }, { + "decimals": null, "format": "short", "label": null, "logBase": 1, @@ -13022,57 +17186,56 @@ ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } - } - ], - "repeat": null, - "title": "Raft IO", - "type": "row" - }, - { - "collapsed": true, - "datasource": null, - "gridPos": { - "h": 1, - "w": 24, - "x": 0, - "y": 11 - }, - "id": 2751, - "panels": [ + }, { "aliasColors": {}, "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", - "decimals": 1, - "description": "The proposal count of all Regions in a mio tick", + "description": "The time consumed when store write task block on each TiKV instance", + "editable": true, + "error": false, "fieldConfig": { - "defaults": {}, - "overrides": [] + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } }, "fill": 1, - "fillGradient": 0, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, "gridPos": { "h": 7, "w": 12, - "x": 0, - "y": 11 + "x": 12, + "y": 42 }, - "hiddenSeries": false, - "id": 108, + "height": null, + "hideTimeOverride": false, + "id": 124, + "interval": null, + "isNew": true, "legend": { "alignAsTable": true, "avg": false, "current": true, + "hideEmpty": true, + "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, - "sort": "current", + "sort": "max", "sortDesc": true, "total": false, "values": true @@ -13080,43 +17243,55 @@ "lines": 
true, "linewidth": 1, "links": [], - "nullPointMode": "null", + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", "options": { - "alertThreshold": true + "alertThreshold": true, + "dataLinks": [] }, "percentage": false, - "pluginVersion": "7.5.7", "pointradius": 5, "points": false, "renderer": "flot", + "repeat": null, + "repeatDirection": null, "seriesOverrides": [], - "spaceLength": 10, + "span": null, "stack": false, "steppedLine": false, "targets": [ { - "expr": "histogram_quantile(0.99, sum(rate(tikv_raftstore_apply_proposal_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])) by (le, instance))", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "histogram_quantile(0.99,(\n sum(rate(\n tikv_raftstore_store_write_msg_block_wait_duration_seconds_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (instance, le) \n \n \n)) ", "format": "time_series", - "intervalFactor": 2, + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, "legendFormat": "{{instance}}", "metric": "", - "refId": "A", - "step": 4 + "query": "histogram_quantile(0.99,(\n sum(rate(\n tikv_raftstore_store_write_msg_block_wait_duration_seconds_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (instance, le) \n \n \n)) ", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, - "title": "Raft proposals per ready", + "title": "99% Write task block duration per server", "tooltip": { + "msResolution": true, "shared": true, "sort": 0, "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -13124,7 +17299,8 @@ }, "yaxes": [ { - "format": "short", + "decimals": 
null, + "format": "s", "label": null, "logBase": 1, "max": null, @@ -13132,6 +17308,7 @@ "show": true }, { + "decimals": null, "format": "short", "label": null, "logBase": 1, @@ -13142,89 +17319,154 @@ ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } - }, + } + ], + "repeat": null, + "repeatDirection": null, + "span": null, + "targets": [], + "timeFrom": null, + "timeShift": null, + "title": "Raft IO", + "transformations": [], + "transparent": false, + "type": "row" + }, + { + "cacheTimeout": null, + "collapsed": true, + "datasource": null, + "description": null, + "editable": true, + "error": false, + "fieldConfig": { + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } + }, + "gridPos": { + "h": 7, + "w": 24, + "x": 0, + "y": 0 + }, + "height": null, + "hideTimeOverride": false, + "id": 125, + "interval": null, + "links": [], + "maxDataPoints": 100, + "maxPerRow": null, + "minSpan": null, + "panels": [ { "aliasColors": {}, "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", - "decimals": 1, - "description": "The number of proposals per type", + "description": "The proposal count of a Regions in a tick", "editable": true, "error": false, "fieldConfig": { - "defaults": {}, - "overrides": [] + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } }, "fill": 1, - "fillGradient": 0, - "grid": {}, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, "gridPos": { "h": 7, "w": 12, - "x": 12, - "y": 11 + "x": 0, + "y": 0 }, - "hiddenSeries": false, - "id": 7, + "height": null, + "hideTimeOverride": false, + "id": 126, + "interval": null, + "isNew": true, "legend": { "alignAsTable": true, "avg": false, "current": true, + "hideEmpty": true, + "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, 
"sideWidth": null, - "sort": "current", + "sort": "max", "sortDesc": true, "total": false, "values": true }, "lines": true, - "linewidth": 2, + "linewidth": 1, "links": [], - "nullPointMode": "null", + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", "options": { - "alertThreshold": true + "alertThreshold": true, + "dataLinks": [] }, "percentage": false, - "pluginVersion": "7.5.7", "pointradius": 5, "points": false, "renderer": "flot", + "repeat": null, + "repeatDirection": null, "seriesOverrides": [], - "spaceLength": 10, + "span": null, "stack": false, "steppedLine": false, "targets": [ { - "expr": "sum(rate(tikv_raftstore_proposal_total{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", type=~\"local_read|normal|read_index\"}[1m])) by (type)", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "histogram_quantile(0.99,(\n sum(rate(\n tikv_raftstore_apply_proposal_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (instance, le) \n \n \n)) ", "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{type}}", - "metric": "tikv_raftstore_proposal_total", - "refId": "A", - "step": 4 + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{instance}}", + "metric": "", + "query": "histogram_quantile(0.99,(\n sum(rate(\n tikv_raftstore_apply_proposal_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (instance, le) \n \n \n)) ", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, - "title": "Raft read/write proposals", + "title": "Raft proposals per ready", "tooltip": { - "msResolution": false, + "msResolution": true, "shared": true, "sort": 0, - "value_type": "cumulative" + "value_type": "individual" }, + 
"transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -13232,7 +17474,8 @@ }, "yaxes": [ { - "format": "ops", + "decimals": null, + "format": "none", "label": null, "logBase": 1, "max": null, @@ -13240,6 +17483,7 @@ "show": true }, { + "decimals": null, "format": "short", "label": null, "logBase": 1, @@ -13250,89 +17494,112 @@ ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } }, { "aliasColors": {}, "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", - "decimals": 1, - "description": "The number of read proposals which are made by each TiKV instance", + "description": "The number of proposals per type", "editable": true, "error": false, "fieldConfig": { - "defaults": {}, - "overrides": [] + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } }, "fill": 1, - "fillGradient": 0, - "grid": {}, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, "gridPos": { "h": 7, "w": 12, - "x": 0, - "y": 18 + "x": 12, + "y": 0 }, - "hiddenSeries": false, - "id": 119, + "height": null, + "hideTimeOverride": false, + "id": 127, + "interval": null, + "isNew": true, "legend": { "alignAsTable": true, "avg": false, "current": true, + "hideEmpty": true, + "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, - "sort": "current", + "sort": "max", "sortDesc": true, "total": false, "values": true }, "lines": true, - "linewidth": 2, + "linewidth": 1, "links": [], - "nullPointMode": "null", + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", "options": { - "alertThreshold": true + "alertThreshold": true, + "dataLinks": [] }, "percentage": false, - "pluginVersion": "7.5.7", "pointradius": 
5, "points": false, "renderer": "flot", + "repeat": null, + "repeatDirection": null, "seriesOverrides": [], - "spaceLength": 10, + "span": null, "stack": false, "steppedLine": false, "targets": [ { - "expr": "sum(rate(tikv_raftstore_proposal_total{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", type=~\"local_read|read_index\"}[1m])) by (instance)", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_raftstore_proposal_total\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",type=~\"local_read|normal|read_index\"}\n [$__rate_interval]\n)) by (type) ", "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{instance}}", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{type}}", "metric": "", - "refId": "A", - "step": 4 + "query": "sum(rate(\n tikv_raftstore_proposal_total\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",type=~\"local_read|normal|read_index\"}\n [$__rate_interval]\n)) by (type) ", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, - "title": "Raft read proposals per server", + "title": "Raft read/write proposals", "tooltip": { - "msResolution": false, + "msResolution": true, "shared": true, "sort": 0, - "value_type": "cumulative" + "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -13340,6 +17607,7 @@ }, "yaxes": [ { + "decimals": null, "format": "ops", "label": null, "logBase": 1, @@ -13348,6 +17616,7 @@ "show": true }, { + "decimals": null, "format": "short", "label": null, "logBase": 1, @@ -13358,89 +17627,112 @@ ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } }, { "aliasColors": {}, "bars": false, - "dashLength": 10, - "dashes": false, + 
"cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", - "decimals": 1, - "description": "The number of write proposals which are made by each TiKV instance", + "description": "The number of read proposals which are made by each TiKV instance", "editable": true, "error": false, "fieldConfig": { - "defaults": {}, - "overrides": [] + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } }, "fill": 1, - "fillGradient": 0, - "grid": {}, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, "gridPos": { "h": 7, "w": 12, - "x": 12, - "y": 18 + "x": 0, + "y": 7 }, - "hiddenSeries": false, - "id": 120, + "height": null, + "hideTimeOverride": false, + "id": 128, + "interval": null, + "isNew": true, "legend": { "alignAsTable": true, "avg": false, "current": true, + "hideEmpty": true, + "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, - "sort": "current", + "sort": "max", "sortDesc": true, "total": false, "values": true }, "lines": true, - "linewidth": 2, + "linewidth": 1, "links": [], - "nullPointMode": "null", + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", "options": { - "alertThreshold": true + "alertThreshold": true, + "dataLinks": [] }, "percentage": false, - "pluginVersion": "7.5.7", "pointradius": 5, "points": false, "renderer": "flot", + "repeat": null, + "repeatDirection": null, "seriesOverrides": [], - "spaceLength": 10, + "span": null, "stack": false, "steppedLine": false, "targets": [ { - "expr": "sum(rate(tikv_raftstore_proposal_total{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", type=\"normal\"}[1m])) by (instance)", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_raftstore_proposal_total\n 
{k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",type=~\"local_read|read_index\"}\n [$__rate_interval]\n)) by (instance) ", "format": "time_series", - "intervalFactor": 2, + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, "legendFormat": "{{instance}}", - "metric": "tikv_raftstore_proposal_total", - "refId": "A", - "step": 4 + "metric": "", + "query": "sum(rate(\n tikv_raftstore_proposal_total\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",type=~\"local_read|read_index\"}\n [$__rate_interval]\n)) by (instance) ", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, - "title": "Raft write proposals per server", + "title": "Raft read proposals per server", "tooltip": { - "msResolution": false, + "msResolution": true, "shared": true, "sort": 0, - "value_type": "cumulative" + "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -13448,6 +17740,7 @@ }, "yaxes": [ { + "decimals": null, "format": "ops", "label": null, "logBase": 1, @@ -13456,6 +17749,7 @@ "show": true }, { + "decimals": null, "format": "short", "label": null, "logBase": 1, @@ -13466,157 +17760,112 @@ ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } }, - { - "cards": { - "cardPadding": null, - "cardRound": null - }, - "color": { - "cardColor": "#b4ff00", - "colorScale": "sqrt", - "colorScheme": "interpolateSpectral", - "exponent": 0.5, - "mode": "spectrum" - }, - "dataFormat": "tsbuckets", - "datasource": "${DS_TEST-CLUSTER}", - "description": "The wait time of each proposal", - "fieldConfig": { - "defaults": {}, - "overrides": [] - }, - "gridPos": { - "h": 7, - "w": 12, - "x": 0, - "y": 25 - }, - "heatmap": {}, - "hideZeroBuckets": true, - "highlightCards": true, - "id": 41, - "legend": { - 
"show": false - }, - "links": [], - "reverseYBuckets": false, - "targets": [ - { - "expr": "sum(delta(tikv_raftstore_request_wait_time_duration_secs_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])) by (le)", - "format": "heatmap", - "intervalFactor": 2, - "legendFormat": "{{le}}", - "metric": "tikv_raftstore_request_wait_time_duration_secs_bucket", - "refId": "A", - "step": 4 - } - ], - "timeFrom": null, - "timeShift": null, - "title": "Propose wait duration", - "tooltip": { - "show": true, - "showHistogram": false - }, - "type": "heatmap", - "xAxis": { - "show": true - }, - "xBucketNumber": null, - "xBucketSize": null, - "yAxis": { - "decimals": 0, - "format": "s", - "logBase": 1, - "max": null, - "min": null, - "show": true, - "splitFactor": null - }, - "yBucketBound": "upper", - "yBucketNumber": null, - "yBucketSize": null - }, { "aliasColors": {}, "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", - "description": "The wait time of each proposal in each TiKV instance", + "description": "The number of write proposals which are made by each TiKV instance", "editable": true, "error": false, "fieldConfig": { - "defaults": {}, - "overrides": [] + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } }, "fill": 1, - "fillGradient": 0, - "grid": {}, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, "gridPos": { "h": 7, "w": 12, "x": 12, - "y": 25 + "y": 7 }, - "hiddenSeries": false, - "id": 42, + "height": null, + "hideTimeOverride": false, + "id": 129, + "interval": null, + "isNew": true, "legend": { "alignAsTable": true, "avg": false, "current": true, + "hideEmpty": true, + "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, - "sort": "current", + "sideWidth": null, + "sort": 
"max", "sortDesc": true, "total": false, "values": true }, "lines": true, - "linewidth": 2, + "linewidth": 1, "links": [], - "nullPointMode": "null", + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", "options": { - "alertThreshold": true + "alertThreshold": true, + "dataLinks": [] }, "percentage": false, - "pluginVersion": "7.5.7", "pointradius": 5, "points": false, "renderer": "flot", + "repeat": null, + "repeatDirection": null, "seriesOverrides": [], - "spaceLength": 10, + "span": null, "stack": false, "steppedLine": false, "targets": [ { - "expr": "histogram_quantile(0.99, sum(rate(tikv_raftstore_request_wait_time_duration_secs_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])) by (le, instance))", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_raftstore_proposal_total\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",type=~\"normal\"}\n [$__rate_interval]\n)) by (instance) ", "format": "time_series", - "intervalFactor": 2, + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, "legendFormat": "{{instance}}", - "refId": "A", - "step": 4 + "metric": "", + "query": "sum(rate(\n tikv_raftstore_proposal_total\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",type=~\"normal\"}\n [$__rate_interval]\n)) by (instance) ", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, - "title": "99% Propose wait duration per server", + "title": "Raft write proposals per server", "tooltip": { - "msResolution": false, + "msResolution": true, "shared": true, "sort": 0, - "value_type": "cumulative" + "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -13624,7 +17873,8 @@ }, "yaxes": 
[ { - "format": "s", + "decimals": null, + "format": "ops", "label": null, "logBase": 1, "max": null, @@ -13632,6 +17882,7 @@ "show": true }, { + "decimals": null, "format": "short", "label": null, "logBase": 1, @@ -13642,10 +17893,11 @@ ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } }, { + "cacheTimeout": null, "cards": { "cardPadding": null, "cardRound": null @@ -13655,65 +17907,94 @@ "colorScale": "sqrt", "colorScheme": "interpolateSpectral", "exponent": 0.5, + "max": null, + "min": null, "mode": "spectrum" }, - "dashes": false, "dataFormat": "tsbuckets", "datasource": "${DS_TEST-CLUSTER}", - "description": "The wait time of each store write task", + "description": "The wait time of each proposal", + "editable": true, + "error": false, "fieldConfig": { - "defaults": {}, - "overrides": [] + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } }, "gridPos": { "h": 7, "w": 12, "x": 0, - "y": 32 + "y": 14 }, "heatmap": {}, + "height": null, + "hideTimeOverride": false, "hideZeroBuckets": true, "highlightCards": true, - "id": 13524, + "id": 130, + "interval": null, "legend": { "show": false }, "links": [], + "maxDataPoints": 512, + "maxPerRow": null, + "minSpan": null, + "options": {}, + "repeat": null, + "repeatDirection": null, "reverseYBuckets": false, + "span": null, "targets": [ { - "exemplar": true, - "expr": "sum(delta(tikv_raftstore_store_write_task_wait_duration_secs_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])) by (le)", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_raftstore_request_wait_time_duration_secs_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (le) ", "format": "heatmap", + "hide": false, + "instant": false, "interval": "", - "intervalFactor": 2, + "intervalFactor": 1, "legendFormat": "{{le}}", - "metric": 
"tikv_raftstore_request_wait_time_duration_secs_bucket", - "refId": "A", - "step": 4 + "metric": "", + "query": "sum(rate(\n tikv_raftstore_request_wait_time_duration_secs_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (le) ", + "refId": "", + "step": 10, + "target": "" } ], "timeFrom": null, "timeShift": null, - "title": "Store write wait duration", + "title": "Propose wait duration", "tooltip": { - "show": true, - "showHistogram": false + "msResolution": true, + "shared": true, + "showHistogram": true, + "sort": 0, + "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "heatmap", "xAxis": { - "show": true + "mode": "time", + "name": null, + "show": true, + "values": [] }, "xBucketNumber": null, "xBucketSize": null, "yAxis": { - "decimals": 0, + "decimals": 1, "format": "s", + "label": null, "logBase": 1, "max": null, "min": null, - "show": true, - "splitFactor": null + "show": true }, "yBucketBound": "upper", "yBucketNumber": null, @@ -13722,82 +18003,174 @@ { "aliasColors": {}, "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", - "description": "The wait time of each store write task in each TiKV instance", + "description": "The wait time of each proposal in each TiKV instance", "editable": true, "error": false, "fieldConfig": { - "defaults": {}, - "overrides": [] + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } }, "fill": 1, - "fillGradient": 0, - "grid": {}, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, "gridPos": { "h": 7, "w": 12, "x": 12, - "y": 32 + "y": 14 }, - "hiddenSeries": false, - "id": 13522, + "height": null, + "hideTimeOverride": false, + "id": 131, + "interval": null, + "isNew": true, "legend": { "alignAsTable": 
true, "avg": false, "current": true, + "hideEmpty": true, + "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, - "sort": "current", + "sideWidth": null, + "sort": "max", "sortDesc": true, "total": false, "values": true }, "lines": true, - "linewidth": 2, + "linewidth": 1, "links": [], - "nullPointMode": "null", + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", "options": { - "alertThreshold": true + "alertThreshold": true, + "dataLinks": [] }, "percentage": false, - "pluginVersion": "7.5.7", "pointradius": 5, "points": false, "renderer": "flot", - "seriesOverrides": [], - "spaceLength": 10, + "repeat": null, + "repeatDirection": null, + "seriesOverrides": [ + { + "alias": "count", + "bars": false, + "dashLength": 1, + "dashes": true, + "fill": 2, + "fillBelowTo": null, + "lines": true, + "spaceLength": 1, + "transform": "negative-Y", + "yaxis": 2, + "zindex": -3 + }, + { + "alias": "avg", + "bars": false, + "fill": 7, + "fillBelowTo": null, + "lines": true, + "yaxis": 1, + "zindex": 0 + } + ], + "span": null, "stack": false, "steppedLine": false, "targets": [ { - "exemplar": true, - "expr": "histogram_quantile(0.99, sum(rate(tikv_raftstore_store_write_task_wait_duration_secs_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])) by (le, instance))", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "histogram_quantile(0.9999,(\n sum(rate(\n tikv_raftstore_request_wait_time_duration_secs_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (le) \n \n \n)) ", "format": "time_series", + "hide": false, + "instant": false, "interval": "", - "intervalFactor": 2, - "legendFormat": "{{instance}}", - "refId": "A", - "step": 4 + "intervalFactor": 1, + "legendFormat": "99.99%", + "metric": "", + "query": "histogram_quantile(0.9999,(\n sum(rate(\n 
tikv_raftstore_request_wait_time_duration_secs_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (le) \n \n \n)) ", + "refId": "", + "step": 10, + "target": "" + }, + { + "datasource": "${DS_TEST-CLUSTER}", + "expr": "histogram_quantile(0.99,(\n sum(rate(\n tikv_raftstore_request_wait_time_duration_secs_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (le) \n \n \n)) ", + "format": "time_series", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "99%", + "metric": "", + "query": "histogram_quantile(0.99,(\n sum(rate(\n tikv_raftstore_request_wait_time_duration_secs_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (le) \n \n \n)) ", + "refId": "", + "step": 10, + "target": "" + }, + { + "datasource": "${DS_TEST-CLUSTER}", + "expr": "(sum(rate(\n tikv_raftstore_request_wait_time_duration_secs_sum\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) / sum(rate(\n tikv_raftstore_request_wait_time_duration_secs_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) )", + "format": "time_series", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "avg", + "metric": "", + "query": "(sum(rate(\n tikv_raftstore_request_wait_time_duration_secs_sum\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) / sum(rate(\n tikv_raftstore_request_wait_time_duration_secs_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) )", + "refId": "", + "step": 10, + "target": "" + }, + { + "datasource": "${DS_TEST-CLUSTER}", + "expr": 
"sum(rate(\n tikv_raftstore_request_wait_time_duration_secs_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) ", + "format": "time_series", + "hide": true, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "count", + "metric": "", + "query": "sum(rate(\n tikv_raftstore_request_wait_time_duration_secs_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) ", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, - "title": "99% Store write wait duration per server", + "title": "99% Propose wait duration per server", "tooltip": { - "msResolution": false, + "msResolution": true, "shared": true, "sort": 0, - "value_type": "cumulative" + "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -13805,6 +18178,7 @@ }, "yaxes": [ { + "decimals": null, "format": "s", "label": null, "logBase": 1, @@ -13813,6 +18187,7 @@ "show": true }, { + "decimals": null, "format": "short", "label": null, "logBase": 1, @@ -13823,10 +18198,11 @@ ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } }, { + "cacheTimeout": null, "cards": { "cardPadding": null, "cardRound": null @@ -13836,61 +18212,94 @@ "colorScale": "sqrt", "colorScheme": "interpolateSpectral", "exponent": 0.5, + "max": null, + "min": null, "mode": "spectrum" }, "dataFormat": "tsbuckets", "datasource": "${DS_TEST-CLUSTER}", + "description": "The wait time of each store write task", + "editable": true, + "error": false, "fieldConfig": { - "defaults": {}, - "overrides": [] + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } }, "gridPos": { "h": 7, "w": 12, "x": 0, - "y": 39 + "y": 21 }, "heatmap": {}, + "height": null, + "hideTimeOverride": 
false, "hideZeroBuckets": true, "highlightCards": true, - "id": 2535, + "id": 132, + "interval": null, "legend": { "show": false }, "links": [], + "maxDataPoints": 512, + "maxPerRow": null, + "minSpan": null, + "options": {}, + "repeat": null, + "repeatDirection": null, "reverseYBuckets": false, + "span": null, "targets": [ { - "expr": "sum(delta(tikv_raftstore_apply_wait_time_duration_secs_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])) by (le)", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_raftstore_store_write_task_wait_duration_secs_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (le) ", "format": "heatmap", - "intervalFactor": 2, + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, "legendFormat": "{{le}}", - "metric": "tikv_raftstore_request_wait_time_duration_secs_bucket", - "refId": "A", - "step": 4 + "metric": "", + "query": "sum(rate(\n tikv_raftstore_store_write_task_wait_duration_secs_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (le) ", + "refId": "", + "step": 10, + "target": "" } ], "timeFrom": null, "timeShift": null, - "title": "Apply wait duration", + "title": "Store write wait duration", "tooltip": { - "show": true, - "showHistogram": false + "msResolution": true, + "shared": true, + "showHistogram": true, + "sort": 0, + "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "heatmap", "xAxis": { - "show": true + "mode": "time", + "name": null, + "show": true, + "values": [] }, "xBucketNumber": null, "xBucketSize": null, "yAxis": { - "decimals": 0, + "decimals": 1, "format": "s", + "label": null, "logBase": 1, "max": null, "min": null, - "show": true, - "splitFactor": null + "show": true }, "yBucketBound": "upper", "yBucketNumber": null, @@ -13899,79 +18308,174 @@ { 
"aliasColors": {}, "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", + "description": "The wait time of each store write task in each TiKV instance", "editable": true, "error": false, "fieldConfig": { - "defaults": {}, - "overrides": [] + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } }, "fill": 1, - "fillGradient": 0, - "grid": {}, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, "gridPos": { "h": 7, "w": 12, "x": 12, - "y": 39 + "y": 21 }, - "hiddenSeries": false, - "id": 2536, + "height": null, + "hideTimeOverride": false, + "id": 133, + "interval": null, + "isNew": true, "legend": { "alignAsTable": true, "avg": false, "current": true, + "hideEmpty": true, + "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, - "sort": "current", + "sideWidth": null, + "sort": "max", "sortDesc": true, "total": false, "values": true }, "lines": true, - "linewidth": 2, + "linewidth": 1, "links": [], - "nullPointMode": "null", + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", "options": { - "alertThreshold": true + "alertThreshold": true, + "dataLinks": [] }, "percentage": false, - "pluginVersion": "7.5.7", "pointradius": 5, "points": false, "renderer": "flot", - "seriesOverrides": [], - "spaceLength": 10, + "repeat": null, + "repeatDirection": null, + "seriesOverrides": [ + { + "alias": "count", + "bars": false, + "dashLength": 1, + "dashes": true, + "fill": 2, + "fillBelowTo": null, + "lines": true, + "spaceLength": 1, + "transform": "negative-Y", + "yaxis": 2, + "zindex": -3 + }, + { + "alias": "avg", + "bars": false, + "fill": 7, + "fillBelowTo": null, + "lines": true, + "yaxis": 1, + "zindex": 0 + } + ], + "span": null, "stack": false, "steppedLine": false, "targets": [ { - "expr": 
"histogram_quantile(0.99, sum(rate(tikv_raftstore_apply_wait_time_duration_secs_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])) by (le, instance))", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "histogram_quantile(0.9999,(\n sum(rate(\n tikv_raftstore_store_write_task_wait_duration_secs_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (le) \n \n \n)) ", "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{instance}}", - "refId": "A", - "step": 4 + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "99.99%", + "metric": "", + "query": "histogram_quantile(0.9999,(\n sum(rate(\n tikv_raftstore_store_write_task_wait_duration_secs_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (le) \n \n \n)) ", + "refId": "", + "step": 10, + "target": "" + }, + { + "datasource": "${DS_TEST-CLUSTER}", + "expr": "histogram_quantile(0.99,(\n sum(rate(\n tikv_raftstore_store_write_task_wait_duration_secs_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (le) \n \n \n)) ", + "format": "time_series", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "99%", + "metric": "", + "query": "histogram_quantile(0.99,(\n sum(rate(\n tikv_raftstore_store_write_task_wait_duration_secs_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (le) \n \n \n)) ", + "refId": "", + "step": 10, + "target": "" + }, + { + "datasource": "${DS_TEST-CLUSTER}", + "expr": "(sum(rate(\n tikv_raftstore_store_write_task_wait_duration_secs_sum\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) / sum(rate(\n 
tikv_raftstore_store_write_task_wait_duration_secs_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) )", + "format": "time_series", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "avg", + "metric": "", + "query": "(sum(rate(\n tikv_raftstore_store_write_task_wait_duration_secs_sum\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) / sum(rate(\n tikv_raftstore_store_write_task_wait_duration_secs_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) )", + "refId": "", + "step": 10, + "target": "" + }, + { + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_raftstore_store_write_task_wait_duration_secs_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) ", + "format": "time_series", + "hide": true, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "count", + "metric": "", + "query": "sum(rate(\n tikv_raftstore_store_write_task_wait_duration_secs_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) ", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, - "title": "99% Apply wait duration per server", + "title": "99% Store write wait duration per server", "tooltip": { - "msResolution": false, + "msResolution": true, "shared": true, "sort": 0, - "value_type": "cumulative" + "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -13979,6 +18483,7 @@ }, "yaxes": [ { + "decimals": null, "format": "s", "label": null, "logBase": 1, @@ -13987,6 +18492,7 @@ "show": true }, { + 
"decimals": null, "format": "short", "label": null, "logBase": 1, @@ -13997,10 +18503,11 @@ ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } }, { + "cacheTimeout": null, "cards": { "cardPadding": null, "cardRound": null @@ -14010,132 +18517,497 @@ "colorScale": "sqrt", "colorScheme": "interpolateSpectral", "exponent": 0.5, + "max": null, + "min": null, "mode": "spectrum" }, - "dashes": false, "dataFormat": "tsbuckets", "datasource": "${DS_TEST-CLUSTER}", - "description": "The handle duration of each store write task msg", + "description": "The wait time of each apply task", + "editable": true, + "error": false, "fieldConfig": { - "defaults": {}, - "overrides": [] + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } }, "gridPos": { "h": 7, "w": 12, "x": 0, - "y": 46 + "y": 28 }, "heatmap": {}, + "height": null, + "hideTimeOverride": false, "hideZeroBuckets": true, "highlightCards": true, - "id": 23763572700, + "id": 134, + "interval": null, "legend": { "show": false }, "links": [], + "maxDataPoints": 512, + "maxPerRow": null, + "minSpan": null, + "options": {}, + "repeat": null, + "repeatDirection": null, "reverseYBuckets": false, + "span": null, "targets": [ { - "exemplar": true, - "expr": "sum(delta(tikv_raftstore_store_write_handle_msg_duration_secs_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])) by (le)", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_raftstore_apply_wait_time_duration_secs_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (le) ", "format": "heatmap", + "hide": false, + "instant": false, "interval": "", - "intervalFactor": 2, + "intervalFactor": 1, "legendFormat": "{{le}}", - "metric": "tikv_raftstore_request_wait_time_duration_secs_bucket", - "refId": "A", - "step": 4 + "metric": "", + "query": "sum(rate(\n 
tikv_raftstore_apply_wait_time_duration_secs_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (le) ", + "refId": "", + "step": 10, + "target": "" } ], "timeFrom": null, "timeShift": null, - "title": "Store write handle msg duration", + "title": "Apply wait duration", "tooltip": { - "show": true, - "showHistogram": false + "msResolution": true, + "shared": true, + "showHistogram": true, + "sort": 0, + "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "heatmap", "xAxis": { - "show": true + "mode": "time", + "name": null, + "show": true, + "values": [] }, "xBucketNumber": null, "xBucketSize": null, "yAxis": { - "decimals": 0, + "decimals": 1, "format": "s", + "label": null, "logBase": 1, "max": null, "min": null, - "show": true, - "splitFactor": null + "show": true }, "yBucketBound": "upper", "yBucketNumber": null, "yBucketSize": null }, { - "cards": { - "cardPadding": null, - "cardRound": null - }, - "color": { - "cardColor": "#b4ff00", - "colorScale": "sqrt", - "colorScheme": "interpolateSpectral", - "exponent": 0.5, - "mode": "spectrum" - }, - "dashes": false, - "dataFormat": "tsbuckets", + "aliasColors": {}, + "bars": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", - "description": "The distribution of write trigger size", + "description": "The wait time of each apply task in each TiKV instance", + "editable": true, + "error": false, "fieldConfig": { - "defaults": {}, - "overrides": [] + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } + }, + "fill": 1, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" }, "gridPos": { "h": 7, "w": 12, "x": 12, - "y": 46 + "y": 28 }, - "heatmap": {}, - "hideZeroBuckets": true, - "highlightCards": true, - "id": 23763572701, + "height": null, + 
"hideTimeOverride": false, + "id": 135, + "interval": null, + "isNew": true, "legend": { - "show": false + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": true, + "hideZero": true, + "max": true, + "min": false, + "rightSide": true, + "show": true, + "sideWidth": null, + "sort": "max", + "sortDesc": true, + "total": false, + "values": true }, + "lines": true, + "linewidth": 1, "links": [], - "reverseYBuckets": false, - "targets": [ - { - "exemplar": true, - "expr": "sum(delta(tikv_raftstore_store_write_trigger_wb_bytes_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])) by (le)", - "format": "heatmap", + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", + "options": { + "alertThreshold": true, + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "repeatDirection": null, + "seriesOverrides": [ + { + "alias": "count", + "bars": false, + "dashLength": 1, + "dashes": true, + "fill": 2, + "fillBelowTo": null, + "lines": true, + "spaceLength": 1, + "transform": "negative-Y", + "yaxis": 2, + "zindex": -3 + }, + { + "alias": "avg", + "bars": false, + "fill": 7, + "fillBelowTo": null, + "lines": true, + "yaxis": 1, + "zindex": 0 + } + ], + "span": null, + "stack": false, + "steppedLine": false, + "targets": [ + { + "datasource": "${DS_TEST-CLUSTER}", + "expr": "histogram_quantile(0.9999,(\n sum(rate(\n tikv_raftstore_apply_wait_time_duration_secs_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (le) \n \n \n)) ", + "format": "time_series", + "hide": false, + "instant": false, "interval": "", - "intervalFactor": 2, - "legendFormat": "{{le}}", - "metric": "tikv_raftstore_request_wait_time_duration_secs_bucket", - "refId": "A", - "step": 4 + "intervalFactor": 1, + "legendFormat": "99.99%", + "metric": "", + 
"query": "histogram_quantile(0.9999,(\n sum(rate(\n tikv_raftstore_apply_wait_time_duration_secs_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (le) \n \n \n)) ", + "refId": "", + "step": 10, + "target": "" + }, + { + "datasource": "${DS_TEST-CLUSTER}", + "expr": "histogram_quantile(0.99,(\n sum(rate(\n tikv_raftstore_apply_wait_time_duration_secs_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (le) \n \n \n)) ", + "format": "time_series", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "99%", + "metric": "", + "query": "histogram_quantile(0.99,(\n sum(rate(\n tikv_raftstore_apply_wait_time_duration_secs_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (le) \n \n \n)) ", + "refId": "", + "step": 10, + "target": "" + }, + { + "datasource": "${DS_TEST-CLUSTER}", + "expr": "(sum(rate(\n tikv_raftstore_apply_wait_time_duration_secs_sum\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) / sum(rate(\n tikv_raftstore_apply_wait_time_duration_secs_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) )", + "format": "time_series", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "avg", + "metric": "", + "query": "(sum(rate(\n tikv_raftstore_apply_wait_time_duration_secs_sum\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) / sum(rate(\n tikv_raftstore_apply_wait_time_duration_secs_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) )", + "refId": "", + "step": 10, + "target": "" + }, + { + "datasource": 
"${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_raftstore_apply_wait_time_duration_secs_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) ", + "format": "time_series", + "hide": true, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "count", + "metric": "", + "query": "sum(rate(\n tikv_raftstore_apply_wait_time_duration_secs_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) ", + "refId": "", + "step": 10, + "target": "" } ], + "thresholds": [], "timeFrom": null, "timeShift": null, - "title": "Store write trigger size", + "title": "99% Apply wait duration per server", "tooltip": { + "msResolution": true, + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "transformations": [], + "transparent": false, + "type": "graph", + "xaxis": { + "mode": "time", + "name": null, "show": true, - "showHistogram": false + "values": [] + }, + "yaxes": [ + { + "decimals": null, + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "decimals": null, + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": 0 + } + }, + { + "cacheTimeout": null, + "cards": { + "cardPadding": null, + "cardRound": null + }, + "color": { + "cardColor": "#b4ff00", + "colorScale": "sqrt", + "colorScheme": "interpolateSpectral", + "exponent": 0.5, + "max": null, + "min": null, + "mode": "spectrum" + }, + "dataFormat": "tsbuckets", + "datasource": "${DS_TEST-CLUSTER}", + "description": "The handle duration of each store write task msg", + "editable": true, + "error": false, + "fieldConfig": { + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } + }, + "gridPos": { + "h": 7, + "w": 12, + "x": 0, + "y": 35 + }, + "heatmap": {}, + "height": null, + 
"hideTimeOverride": false, + "hideZeroBuckets": true, + "highlightCards": true, + "id": 136, + "interval": null, + "legend": { + "show": false + }, + "links": [], + "maxDataPoints": 512, + "maxPerRow": null, + "minSpan": null, + "options": {}, + "repeat": null, + "repeatDirection": null, + "reverseYBuckets": false, + "span": null, + "targets": [ + { + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_raftstore_store_write_handle_msg_duration_secs_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (le) ", + "format": "heatmap", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{le}}", + "metric": "", + "query": "sum(rate(\n tikv_raftstore_store_write_handle_msg_duration_secs_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (le) ", + "refId": "", + "step": 10, + "target": "" + } + ], + "timeFrom": null, + "timeShift": null, + "title": "Store write handle msg duration", + "tooltip": { + "msResolution": true, + "shared": true, + "showHistogram": true, + "sort": 0, + "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "heatmap", "xAxis": { + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "xBucketNumber": null, + "xBucketSize": null, + "yAxis": { + "decimals": 1, + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": null, "show": true }, + "yBucketBound": "upper", + "yBucketNumber": null, + "yBucketSize": null + }, + { + "cacheTimeout": null, + "cards": { + "cardPadding": null, + "cardRound": null + }, + "color": { + "cardColor": "#b4ff00", + "colorScale": "sqrt", + "colorScheme": "interpolateSpectral", + "exponent": 0.5, + "max": null, + "min": null, + "mode": "spectrum" + }, + "dataFormat": "tsbuckets", + "datasource": "${DS_TEST-CLUSTER}", + "description": "The distribution of write 
trigger size", + "editable": true, + "error": false, + "fieldConfig": { + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } + }, + "gridPos": { + "h": 7, + "w": 12, + "x": 12, + "y": 35 + }, + "heatmap": {}, + "height": null, + "hideTimeOverride": false, + "hideZeroBuckets": true, + "highlightCards": true, + "id": 137, + "interval": null, + "legend": { + "show": false + }, + "links": [], + "maxDataPoints": 512, + "maxPerRow": null, + "minSpan": null, + "options": {}, + "repeat": null, + "repeatDirection": null, + "reverseYBuckets": false, + "span": null, + "targets": [ + { + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_raftstore_store_write_trigger_wb_bytes_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (le) ", + "format": "heatmap", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{le}}", + "metric": "", + "query": "sum(rate(\n tikv_raftstore_store_write_trigger_wb_bytes_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (le) ", + "refId": "", + "step": 10, + "target": "" + } + ], + "timeFrom": null, + "timeShift": null, + "title": "Store write trigger size", + "tooltip": { + "msResolution": true, + "shared": true, + "showHistogram": true, + "sort": 0, + "value_type": "individual" + }, + "transformations": [], + "transparent": false, + "type": "heatmap", + "xAxis": { + "mode": "time", + "name": null, + "show": true, + "values": [] + }, "xBucketNumber": null, "xBucketSize": null, "yAxis": { + "decimals": 1, "format": "bytes", "label": null, "logBase": 1, @@ -14150,32 +19022,49 @@ { "aliasColors": {}, "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", "description": "The rate at which peers propose logs", + "editable": true, + "error": false, "fieldConfig": { - 
"defaults": {}, - "overrides": [] + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } }, "fill": 1, - "fillGradient": 0, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, "gridPos": { "h": 7, "w": 12, "x": 0, - "y": 54 + "y": 42 }, - "hiddenSeries": false, - "id": 1975, + "height": null, + "hideTimeOverride": false, + "id": 138, + "interval": null, + "isNew": true, "legend": { "alignAsTable": true, "avg": false, - "current": false, + "current": true, + "hideEmpty": true, + "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, + "sideWidth": null, "sort": "max", "sortDesc": true, "total": false, @@ -14184,41 +19073,55 @@ "lines": true, "linewidth": 1, "links": [], - "nullPointMode": "null", + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", "options": { - "alertThreshold": true + "alertThreshold": true, + "dataLinks": [] }, "percentage": false, - "pluginVersion": "7.5.7", "pointradius": 5, "points": false, "renderer": "flot", + "repeat": null, + "repeatDirection": null, "seriesOverrides": [], - "spaceLength": 10, + "span": null, "stack": false, "steppedLine": false, "targets": [ { - "expr": "avg(rate(tikv_raftstore_propose_log_size_sum{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])) by (instance)", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_raftstore_propose_log_size_sum\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (instance) ", "format": "time_series", - "intervalFactor": 2, + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, "legendFormat": "{{instance}}", - "refId": "B" + "metric": "", + "query": "sum(rate(\n tikv_raftstore_propose_log_size_sum\n 
{k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (instance) ", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, "title": "Raft propose speed", "tooltip": { + "msResolution": true, "shared": true, "sort": 0, "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -14227,112 +19130,146 @@ "yaxes": [ { "decimals": null, - "format": "short", - "label": "bytes/s", + "format": "binBps", + "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { + "decimals": null, "format": "short", "label": null, "logBase": 1, "max": null, "min": null, - "show": false + "show": true } ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } }, { "aliasColors": {}, "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", + "description": "The rate at which peers propose logs", "editable": true, "error": false, "fieldConfig": { - "defaults": {}, - "overrides": [] + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } }, "fill": 1, - "fillGradient": 0, - "grid": {}, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, "gridPos": { "h": 7, "w": 12, "x": 12, - "y": 54 + "y": 42 }, - "hiddenSeries": false, - "id": 1976, + "height": null, + "hideTimeOverride": false, + "id": 139, + "interval": null, + "isNew": true, "legend": { "alignAsTable": true, "avg": false, "current": true, + "hideEmpty": true, + "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, - "sort": "current", + "sideWidth": null, + "sort": "max", "sortDesc": true, "total": false, "values": true }, "lines": true, - 
"linewidth": 2, + "linewidth": 1, "links": [], - "nullPointMode": "null", + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", "options": { - "alertThreshold": true + "alertThreshold": true, + "dataLinks": [] }, "percentage": false, - "pluginVersion": "7.5.7", "pointradius": 5, "points": false, "renderer": "flot", + "repeat": null, + "repeatDirection": null, "seriesOverrides": [], - "spaceLength": 10, + "span": null, "stack": false, "steppedLine": false, "targets": [ { - "expr": "histogram_quantile(0.99, sum(rate(tikv_raftstore_store_perf_context_time_duration_secs_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])) by (le, type))", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "histogram_quantile(0.99,(\n sum(rate(\n tikv_raftstore_store_perf_context_time_duration_secs_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (type, le) \n \n \n)) ", "format": "time_series", - "intervalFactor": 2, + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, "legendFormat": "store-{{type}}", - "metric": "tikv_raftstore_request_wait_time_duration_secs_bucket", - "refId": "A", - "step": 4 + "metric": "", + "query": "histogram_quantile(0.99,(\n sum(rate(\n tikv_raftstore_store_perf_context_time_duration_secs_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (type, le) \n \n \n)) ", + "refId": "", + "step": 10, + "target": "" }, { - "expr": "histogram_quantile(0.99, sum(rate(tikv_raftstore_apply_perf_context_time_duration_secs_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])) by (le, type))", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "histogram_quantile(0.99,(\n sum(rate(\n tikv_raftstore_apply_perf_context_time_duration_secs_bucket\n 
{k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (type, le) \n \n \n)) ", "format": "time_series", - "intervalFactor": 2, + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, "legendFormat": "apply-{{type}}", - "refId": "B", - "step": 4 + "metric": "", + "query": "histogram_quantile(0.99,(\n sum(rate(\n tikv_raftstore_apply_perf_context_time_duration_secs_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (type, le) \n \n \n)) ", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, "title": "Perf Context duration", "tooltip": { - "msResolution": false, + "msResolution": true, "shared": true, "sort": 0, - "value_type": "cumulative" + "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -14340,6 +19277,7 @@ }, "yaxes": [ { + "decimals": null, "format": "s", "label": null, "logBase": 1, @@ -14348,6 +19286,7 @@ "show": true }, { + "decimals": null, "format": "short", "label": null, "logBase": 1, @@ -14358,113 +19297,169 @@ ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } } ], "repeat": null, + "repeatDirection": null, + "span": null, + "targets": [], + "timeFrom": null, + "timeShift": null, "title": "Raft Propose", + "transformations": [], + "transparent": false, "type": "row" }, { + "cacheTimeout": null, "collapsed": true, "datasource": null, + "description": null, + "editable": true, + "error": false, + "fieldConfig": { + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } + }, "gridPos": { - "h": 1, + "h": 7, "w": 24, "x": 0, - "y": 12 + "y": 0 }, - "id": 2749, + "height": null, + "hideTimeOverride": false, + "id": 140, + "interval": null, + "links": [], + "maxDataPoints": 
100, + "maxPerRow": null, + "minSpan": null, "panels": [ { "aliasColors": {}, "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", - "decimals": 1, "description": "The count of different ready type of Raft", "editable": true, "error": false, "fieldConfig": { - "defaults": {}, - "overrides": [] + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } }, "fill": 1, - "fillGradient": 0, - "grid": {}, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, "gridPos": { - "h": 8, + "h": 7, "w": 12, "x": 0, - "y": 12 + "y": 0 }, - "hiddenSeries": false, - "id": 5, + "height": null, + "hideTimeOverride": false, + "id": 141, + "interval": null, + "isNew": true, "legend": { "alignAsTable": true, "avg": false, "current": true, + "hideEmpty": true, + "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, - "sort": "current", + "sort": "max", "sortDesc": true, "total": false, "values": true }, "lines": true, - "linewidth": 2, + "linewidth": 1, "links": [], - "nullPointMode": "null", + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", "options": { - "alertThreshold": true + "alertThreshold": true, + "dataLinks": [] }, "percentage": false, - "pluginVersion": "7.5.7", "pointradius": 5, "points": false, "renderer": "flot", + "repeat": null, + "repeatDirection": null, "seriesOverrides": [], - "spaceLength": 10, + "span": null, "stack": false, "steppedLine": false, "targets": [ { - "expr": "sum(rate(tikv_raftstore_raft_ready_handled_total{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])) by (type)", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_raftstore_raft_ready_handled_total\n 
{k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (type) ", "format": "time_series", - "intervalFactor": 2, + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, "legendFormat": "{{type}}", - "metric": "tikv_raftstore_raft_ready_handled_total", - "refId": "A", - "step": 4 + "metric": "", + "query": "sum(rate(\n tikv_raftstore_raft_ready_handled_total\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (type) ", + "refId": "", + "step": 10, + "target": "" }, { - "expr": "sum(rate(tikv_raftstore_raft_process_duration_secs_count{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", type=\"ready\"}[1m]))", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_raftstore_raft_process_duration_secs_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",type=\"ready\"}\n [$__rate_interval]\n)) ", "format": "time_series", - "intervalFactor": 2, + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, "legendFormat": "count", - "refId": "B", - "step": 4 + "metric": "", + "query": "sum(rate(\n tikv_raftstore_raft_process_duration_secs_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",type=\"ready\"}\n [$__rate_interval]\n)) ", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, "title": "Ready handled", "tooltip": { - "msResolution": false, + "msResolution": true, "shared": true, "sort": 0, - "value_type": "cumulative" + "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -14472,7 +19467,8 @@ }, "yaxes": [ { - "format": "ops", + "decimals": null, + "format": "none", "label": null, "logBase": 
1, "max": null, @@ -14480,6 +19476,7 @@ "show": true }, { + "decimals": null, "format": "short", "label": null, "logBase": 1, @@ -14490,38 +19487,50 @@ ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } }, { "aliasColors": {}, "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", - "decimals": 1, "description": "The max time consumed by raftstore events", "editable": true, "error": false, "fieldConfig": { - "defaults": {}, - "overrides": [] + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } + }, + "fill": 1, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" }, - "fill": 0, - "fillGradient": 0, - "grid": {}, "gridPos": { - "h": 8, + "h": 7, "w": 12, "x": 12, - "y": 12 + "y": 0 }, - "hiddenSeries": false, - "id": 123, + "height": null, + "hideTimeOverride": false, + "id": 142, + "interval": null, + "isNew": true, "legend": { "alignAsTable": true, "avg": false, "current": true, + "hideEmpty": true, + "hideZero": true, "max": true, "min": false, "rightSide": true, @@ -14535,64 +19544,70 @@ "lines": true, "linewidth": 1, "links": [], - "nullPointMode": "null", + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", "options": { - "alertThreshold": true + "alertThreshold": true, + "dataLinks": [] }, "percentage": false, - "pluginVersion": "7.5.11", "pointradius": 5, "points": false, "renderer": "flot", + "repeat": null, + "repeatDirection": null, "seriesOverrides": [], - "spaceLength": 10, + "span": null, "stack": false, "steppedLine": false, "targets": [ { - "exemplar": true, - "expr": "histogram_quantile(1.0, sum(rate(tikv_raftstore_event_duration_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])) by (le, type))", + "datasource": 
"${DS_TEST-CLUSTER}", + "expr": "histogram_quantile(0.999999,(\n sum(rate(\n tikv_raftstore_event_duration_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (type, le) \n \n \n)) ", "format": "time_series", + "hide": false, + "instant": false, "interval": "", - "intervalFactor": 2, + "intervalFactor": 1, "legendFormat": "{{type}}", - "refId": "C", - "step": 4 + "metric": "", + "query": "histogram_quantile(0.999999,(\n sum(rate(\n tikv_raftstore_event_duration_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (type, le) \n \n \n)) ", + "refId": "", + "step": 10, + "target": "" }, { - "exemplar": true, - "expr": "histogram_quantile(1.0, sum(rate(tikv_broadcast_normal_duration_seconds_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])) by (le))", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "histogram_quantile(0.999999,(\n sum(rate(\n tikv_broadcast_normal_duration_seconds_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (le) \n \n \n)) ", "format": "time_series", "hide": false, + "instant": false, "interval": "", - "intervalFactor": 2, + "intervalFactor": 1, "legendFormat": "broadcast_normal", - "refId": "A", - "step": 4 - } - ], - "thresholds": [ - { - "colorMode": "critical", - "fill": true, - "line": true, - "op": "gt", - "value": 1 + "metric": "", + "query": "histogram_quantile(0.999999,(\n sum(rate(\n tikv_broadcast_normal_duration_seconds_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (le) \n \n \n)) ", + "refId": "", + "step": 10, + "target": "" } ], + "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, "title": "Max duration of raft store events", "tooltip": { - "msResolution": false, + 
"msResolution": true, "shared": true, "sort": 0, - "value_type": "cumulative" + "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -14600,14 +19615,16 @@ }, "yaxes": [ { + "decimals": null, "format": "s", "label": null, "logBase": 1, "max": null, - "min": "0", + "min": null, "show": true }, { + "decimals": null, "format": "short", "label": null, "logBase": 1, @@ -14618,10 +19635,11 @@ ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } }, { + "cacheTimeout": null, "cards": { "cardPadding": null, "cardRound": null @@ -14631,67 +19649,101 @@ "colorScale": "sqrt", "colorScheme": "interpolateSpectral", "exponent": 0.5, + "max": null, + "min": null, "mode": "spectrum" }, "dataFormat": "tsbuckets", "datasource": "${DS_TEST-CLUSTER}", - "description": "The time consumed for checking memory locks for replica reads", + "description": "Replica read lock checking duration", + "editable": true, + "error": false, "fieldConfig": { - "defaults": {}, - "overrides": [] + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } }, "gridPos": { - "h": 8, + "h": 7, "w": 12, "x": 0, - "y": 21 + "y": 7 }, "heatmap": {}, + "height": null, + "hideTimeOverride": false, "hideZeroBuckets": true, "highlightCards": true, - "id": 7235, + "id": 143, + "interval": null, "legend": { "show": false }, "links": [], + "maxDataPoints": 512, + "maxPerRow": null, + "minSpan": null, + "options": {}, + "repeat": null, + "repeatDirection": null, "reverseYBuckets": false, + "span": null, "targets": [ { - "expr": "sum(delta(tikv_replica_read_lock_check_duration_seconds_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])) by (le)", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_replica_read_lock_check_duration_seconds_bucket\n 
{k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (le) ", "format": "heatmap", - "intervalFactor": 2, + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, "legendFormat": "{{le}}", - "refId": "C", - "step": 4 + "metric": "", + "query": "sum(rate(\n tikv_replica_read_lock_check_duration_seconds_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (le) ", + "refId": "", + "step": 10, + "target": "" } ], "timeFrom": null, "timeShift": null, "title": "Replica read lock checking duration", "tooltip": { - "show": true, - "showHistogram": false + "msResolution": true, + "shared": true, + "showHistogram": true, + "sort": 0, + "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "heatmap", "xAxis": { - "show": true + "mode": "time", + "name": null, + "show": true, + "values": [] }, "xBucketNumber": null, "xBucketSize": null, "yAxis": { - "decimals": 0, + "decimals": 1, "format": "s", + "label": null, "logBase": 1, "max": null, "min": null, - "show": true, - "splitFactor": null + "show": true }, "yBucketBound": "upper", "yBucketNumber": null, "yBucketSize": null }, { + "cacheTimeout": null, "cards": { "cardPadding": null, "cardRound": null @@ -14701,163 +19753,244 @@ "colorScale": "sqrt", "colorScheme": "interpolateSpectral", "exponent": 0.5, + "max": null, + "min": null, "mode": "spectrum" }, "dataFormat": "tsbuckets", "datasource": "${DS_TEST-CLUSTER}", "description": "The length of peer msgs for each round handling", + "editable": true, + "error": false, "fieldConfig": { - "defaults": {}, - "overrides": [] + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } }, "gridPos": { - "h": 8, + "h": 7, "w": 12, "x": 12, - "y": 21 + "y": 7 }, "heatmap": {}, + "height": null, + "hideTimeOverride": false, "hideZeroBuckets": true, "highlightCards": true, - "id": 
23763572958, + "id": 144, + "interval": null, "legend": { "show": false }, "links": [], + "maxDataPoints": 512, + "maxPerRow": null, + "minSpan": null, + "options": {}, + "repeat": null, + "repeatDirection": null, "reverseYBuckets": false, + "span": null, "targets": [ { - "exemplar": true, - "expr": "sum(delta(tikv_raftstore_peer_msg_len_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])) by (le)", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_raftstore_peer_msg_len_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (le) ", "format": "heatmap", + "hide": false, + "instant": false, "interval": "", - "intervalFactor": 2, + "intervalFactor": 1, "legendFormat": "{{le}}", - "refId": "C", - "step": 4 + "metric": "", + "query": "sum(rate(\n tikv_raftstore_peer_msg_len_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (le) ", + "refId": "", + "step": 10, + "target": "" } ], "timeFrom": null, "timeShift": null, "title": "Peer msg length distribution", "tooltip": { - "show": true, - "showHistogram": false + "msResolution": true, + "shared": true, + "showHistogram": true, + "sort": 0, + "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "heatmap", "xAxis": { - "show": true + "mode": "time", + "name": null, + "show": true, + "values": [] }, "xBucketNumber": null, "xBucketSize": null, "yAxis": { - "decimals": 0, + "decimals": 1, "format": "none", + "label": null, "logBase": 1, "max": null, "min": null, - "show": true, - "splitFactor": null + "show": true }, - "yBucketBound": "auto", + "yBucketBound": "upper", "yBucketNumber": null, "yBucketSize": null } ], "repeat": null, + "repeatDirection": null, + "span": null, + "targets": [], + "timeFrom": null, + "timeShift": null, "title": "Raft Process", + "transformations": [], + 
"transparent": false, "type": "row" }, { + "cacheTimeout": null, "collapsed": true, "datasource": null, + "description": null, + "editable": true, + "error": false, + "fieldConfig": { + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } + }, "gridPos": { - "h": 1, + "h": 7, "w": 24, "x": 0, - "y": 13 + "y": 0 }, - "id": 2750, + "height": null, + "hideTimeOverride": false, + "id": 145, + "interval": null, + "links": [], + "maxDataPoints": 100, + "maxPerRow": null, + "minSpan": null, "panels": [ { "aliasColors": {}, "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", - "decimals": 1, "description": "The number of Raft messages sent by each TiKV instance", "editable": true, "error": false, "fieldConfig": { - "defaults": {}, - "overrides": [] + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } }, "fill": 1, - "fillGradient": 0, - "grid": {}, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, "gridPos": { "h": 7, "w": 12, "x": 0, - "y": 29 + "y": 0 }, - "hiddenSeries": false, - "id": 1615, + "height": null, + "hideTimeOverride": false, + "id": 146, + "interval": null, + "isNew": true, "legend": { "alignAsTable": true, "avg": false, "current": true, + "hideEmpty": true, + "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, - "sort": "current", + "sort": "max", "sortDesc": true, "total": false, "values": true }, "lines": true, - "linewidth": 2, + "linewidth": 1, "links": [], - "nullPointMode": "null", + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", "options": { - "alertThreshold": true + "alertThreshold": true, + "dataLinks": [] }, "percentage": false, - "pluginVersion": "7.5.7", "pointradius": 5, "points": false, "renderer": "flot", + 
"repeat": null, + "repeatDirection": null, "seriesOverrides": [], - "spaceLength": 10, + "span": null, "stack": false, "steppedLine": false, "targets": [ { - "expr": "sum(rate(tikv_raftstore_raft_sent_message_total{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])) by (instance)", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_raftstore_raft_sent_message_total\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (instance) ", "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{instance}}", - "refId": "A", - "step": 4 - } - ], + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{instance}}", + "metric": "", + "query": "sum(rate(\n tikv_raftstore_raft_sent_message_total\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (instance) ", + "refId": "", + "step": 10, + "target": "" + } + ], "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, "title": "Sent messages per server", "tooltip": { - "msResolution": false, + "msResolution": true, "shared": true, "sort": 0, - "value_type": "cumulative" + "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -14865,6 +19998,7 @@ }, "yaxes": [ { + "decimals": null, "format": "ops", "label": null, "logBase": 1, @@ -14873,6 +20007,7 @@ "show": true }, { + "decimals": null, "format": "short", "label": null, "logBase": 1, @@ -14883,89 +20018,112 @@ ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } }, { "aliasColors": {}, "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", - "decimals": 1, "description": "The number of Raft messages flushed by each TiKV instance", 
"editable": true, "error": false, "fieldConfig": { - "defaults": {}, - "overrides": [] + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } }, "fill": 1, - "fillGradient": 0, - "grid": {}, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, "gridPos": { "h": 7, "w": 12, "x": 12, - "y": 29 + "y": 0 }, - "hiddenSeries": false, - "id": 1616, + "height": null, + "hideTimeOverride": false, + "id": 147, + "interval": null, + "isNew": true, "legend": { "alignAsTable": true, "avg": false, "current": true, + "hideEmpty": true, + "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, - "sort": "current", + "sort": "max", "sortDesc": true, "total": false, "values": true }, "lines": true, - "linewidth": 2, + "linewidth": 1, "links": [], - "nullPointMode": "null", + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", "options": { - "alertThreshold": true + "alertThreshold": true, + "dataLinks": [] }, "percentage": false, - "pluginVersion": "7.5.7", "pointradius": 5, "points": false, "renderer": "flot", + "repeat": null, + "repeatDirection": null, "seriesOverrides": [], - "spaceLength": 10, + "span": null, "stack": false, "steppedLine": false, "targets": [ { - "expr": "sum(rate(tikv_server_raft_message_flush_total{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])) by (instance, reason)", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_server_raft_message_flush_total\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (instance, reason) ", "format": "time_series", - "intervalFactor": 2, + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, "legendFormat": "{{instance}}-{{reason}}", - "metric": 
"tikv_server_raft_message_flush_total", - "refId": "A", - "step": 4 + "metric": "", + "query": "sum(rate(\n tikv_server_raft_message_flush_total\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (instance, reason) ", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, "title": "Flush messages per server", "tooltip": { - "msResolution": false, + "msResolution": true, "shared": true, "sort": 0, - "value_type": "cumulative" + "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -14973,6 +20131,7 @@ }, "yaxes": [ { + "decimals": null, "format": "ops", "label": null, "logBase": 1, @@ -14981,6 +20140,7 @@ "show": true }, { + "decimals": null, "format": "short", "label": null, "logBase": 1, @@ -14991,41 +20151,56 @@ ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } }, { "aliasColors": {}, "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", - "decimals": 1, "description": "The number of Raft messages received by each TiKV instance", + "editable": true, + "error": false, "fieldConfig": { - "defaults": {}, - "overrides": [] + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } }, "fill": 1, - "fillGradient": 0, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, "gridPos": { "h": 7, "w": 12, "x": 0, - "y": 36 + "y": 7 }, - "hiddenSeries": false, - "id": 106, + "height": null, + "hideTimeOverride": false, + "id": 148, + "interval": null, + "isNew": true, "legend": { "alignAsTable": true, "avg": false, "current": true, + "hideEmpty": true, + "hideZero": true, "max": true, "min": false, 
"rightSide": true, "show": true, "sideWidth": null, - "sort": "current", + "sort": "max", "sortDesc": true, "total": false, "values": true @@ -15033,42 +20208,55 @@ "lines": true, "linewidth": 1, "links": [], - "nullPointMode": "null", + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", "options": { - "alertThreshold": true + "alertThreshold": true, + "dataLinks": [] }, "percentage": false, - "pluginVersion": "7.5.7", "pointradius": 5, "points": false, "renderer": "flot", + "repeat": null, + "repeatDirection": null, "seriesOverrides": [], - "spaceLength": 10, + "span": null, "stack": false, "steppedLine": false, "targets": [ { - "expr": "sum(rate(tikv_server_raft_message_recv_total{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])) by (instance)", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_server_raft_message_recv_total\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (instance) ", "format": "time_series", - "intervalFactor": 2, + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, "legendFormat": "{{instance}}", - "refId": "A", - "step": 4 + "metric": "", + "query": "sum(rate(\n tikv_server_raft_message_recv_total\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (instance) ", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, "title": "Receive messages per server", "tooltip": { + "msResolution": true, "shared": true, "sort": 0, "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -15076,6 +20264,7 @@ }, "yaxes": [ { + "decimals": null, "format": "ops", "label": null, "logBase": 1, @@ -15084,6 +20273,7 @@ 
"show": true }, { + "decimals": null, "format": "short", "label": null, "logBase": 1, @@ -15094,88 +20284,112 @@ ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } }, { "aliasColors": {}, "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", - "decimals": 1, "description": "The number of different types of Raft messages that are sent", "editable": true, "error": false, "fieldConfig": { - "defaults": {}, - "overrides": [] + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } }, "fill": 1, - "fillGradient": 0, - "grid": {}, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, "gridPos": { "h": 7, "w": 12, "x": 12, - "y": 36 + "y": 7 }, - "hiddenSeries": false, - "id": 11, + "height": null, + "hideTimeOverride": false, + "id": 149, + "interval": null, + "isNew": true, "legend": { "alignAsTable": true, "avg": false, "current": true, + "hideEmpty": true, + "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, - "sort": "current", + "sort": "max", "sortDesc": true, "total": false, "values": true }, "lines": true, - "linewidth": 2, + "linewidth": 1, "links": [], - "nullPointMode": "null", + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", "options": { - "alertThreshold": true + "alertThreshold": true, + "dataLinks": [] }, "percentage": false, - "pluginVersion": "7.5.7", "pointradius": 5, "points": false, "renderer": "flot", + "repeat": null, + "repeatDirection": null, "seriesOverrides": [], - "spaceLength": 10, + "span": null, "stack": false, "steppedLine": false, "targets": [ { - "expr": "sum(rate(tikv_raftstore_raft_sent_message_total{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])) by (type)", + "datasource": 
"${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_raftstore_raft_sent_message_total\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (type) ", "format": "time_series", - "intervalFactor": 2, + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, "legendFormat": "{{type}}", - "refId": "A", - "step": 4 + "metric": "", + "query": "sum(rate(\n tikv_raftstore_raft_sent_message_total\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (type) ", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, "title": "Messages", "tooltip": { - "msResolution": false, + "msResolution": true, "shared": true, "sort": 0, - "value_type": "cumulative" + "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -15183,6 +20397,7 @@ }, "yaxes": [ { + "decimals": null, "format": "ops", "label": null, "logBase": 1, @@ -15191,6 +20406,7 @@ "show": true }, { + "decimals": null, "format": "short", "label": null, "logBase": 1, @@ -15201,88 +20417,112 @@ ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } }, { "aliasColors": {}, "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", - "decimals": 1, "description": "The total number of vote messages that are sent in Raft", "editable": true, "error": false, "fieldConfig": { - "defaults": {}, - "overrides": [] + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } }, "fill": 1, - "fillGradient": 0, - "grid": {}, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, "gridPos": { "h": 7, "w": 12, 
"x": 0, - "y": 43 + "y": 14 }, - "hiddenSeries": false, - "id": 25, + "height": null, + "hideTimeOverride": false, + "id": 150, + "interval": null, + "isNew": true, "legend": { "alignAsTable": true, "avg": false, "current": true, + "hideEmpty": true, + "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, - "sort": "current", + "sort": "max", "sortDesc": true, "total": false, "values": true }, "lines": true, - "linewidth": 2, + "linewidth": 1, "links": [], - "nullPointMode": "null", + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", "options": { - "alertThreshold": true + "alertThreshold": true, + "dataLinks": [] }, "percentage": false, - "pluginVersion": "7.5.7", "pointradius": 5, "points": false, "renderer": "flot", + "repeat": null, + "repeatDirection": null, "seriesOverrides": [], - "spaceLength": 10, + "span": null, "stack": false, "steppedLine": false, "targets": [ { - "expr": "sum(rate(tikv_raftstore_raft_sent_message_total{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", type=\"vote\"}[1m])) by (instance)", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_raftstore_raft_sent_message_total\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",type=\"vote\"}\n [$__rate_interval]\n)) by (instance) ", "format": "time_series", - "intervalFactor": 2, + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, "legendFormat": "{{instance}}", - "refId": "A", - "step": 4 + "metric": "", + "query": "sum(rate(\n tikv_raftstore_raft_sent_message_total\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",type=\"vote\"}\n [$__rate_interval]\n)) by (instance) ", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, "title": "Vote", "tooltip": { - "msResolution": false, + 
"msResolution": true, "shared": true, "sort": 0, - "value_type": "cumulative" + "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -15290,14 +20530,16 @@ }, "yaxes": [ { + "decimals": null, "format": "ops", "label": null, "logBase": 1, "max": null, - "min": "0", + "min": null, "show": true }, { + "decimals": null, "format": "short", "label": null, "logBase": 1, @@ -15308,88 +20550,112 @@ ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } }, { "aliasColors": {}, "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", - "decimals": 1, "description": "The number of dropped Raft messages per type", "editable": true, "error": false, "fieldConfig": { - "defaults": {}, - "overrides": [] + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } }, "fill": 1, - "fillGradient": 0, - "grid": {}, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, "gridPos": { "h": 7, "w": 12, "x": 12, - "y": 43 + "y": 14 }, - "hiddenSeries": false, - "id": 1309, + "height": null, + "hideTimeOverride": false, + "id": 151, + "interval": null, + "isNew": true, "legend": { "alignAsTable": true, "avg": false, "current": true, + "hideEmpty": true, + "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, - "sort": "current", + "sort": "max", "sortDesc": true, "total": false, "values": true }, "lines": true, - "linewidth": 2, + "linewidth": 1, "links": [], - "nullPointMode": "null", + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", "options": { - "alertThreshold": true + "alertThreshold": true, + "dataLinks": [] }, "percentage": false, - "pluginVersion": "7.5.7", 
"pointradius": 5, "points": false, "renderer": "flot", + "repeat": null, + "repeatDirection": null, "seriesOverrides": [], - "spaceLength": 10, + "span": null, "stack": false, "steppedLine": false, "targets": [ { - "expr": "sum(rate(tikv_raftstore_raft_dropped_message_total{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])) by (type)", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_raftstore_raft_dropped_message_total\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (type) ", "format": "time_series", - "intervalFactor": 2, + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, "legendFormat": "{{type}}", - "refId": "A", - "step": 4 + "metric": "", + "query": "sum(rate(\n tikv_raftstore_raft_dropped_message_total\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (type) ", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, "title": "Raft dropped messages", "tooltip": { - "msResolution": false, + "msResolution": true, "shared": true, "sort": 0, - "value_type": "cumulative" + "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -15397,6 +20663,7 @@ }, "yaxes": [ { + "decimals": null, "format": "ops", "label": null, "logBase": 1, @@ -15405,6 +20672,7 @@ "show": true }, { + "decimals": null, "format": "short", "label": null, "logBase": 1, @@ -15415,95 +20683,154 @@ ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } } ], "repeat": null, + "repeatDirection": null, + "span": null, + "targets": [], + "timeFrom": null, + "timeShift": null, "title": "Raft Message", + "transformations": [], + "transparent": false, "type": "row" }, { + "cacheTimeout": 
null, "collapsed": true, "datasource": null, + "description": null, + "editable": true, + "error": false, + "fieldConfig": { + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } + }, "gridPos": { - "h": 1, + "h": 7, "w": 24, "x": 0, - "y": 14 + "y": 0 }, - "id": 2752, + "height": null, + "hideTimeOverride": false, + "id": 152, + "interval": null, + "links": [], + "maxDataPoints": 100, + "maxPerRow": null, + "minSpan": null, "panels": [ { "aliasColors": {}, "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", - "decimals": 1, "description": "The number of admin proposals", "editable": true, "error": false, + "fieldConfig": { + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } + }, "fill": 1, - "grid": {}, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, "gridPos": { "h": 7, "w": 12, "x": 0, - "y": 11 + "y": 0 }, - "id": 76, + "height": null, + "hideTimeOverride": false, + "id": 153, + "interval": null, + "isNew": true, "legend": { "alignAsTable": true, "avg": false, "current": true, + "hideEmpty": true, + "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, - "sort": "current", + "sort": "max", "sortDesc": true, "total": false, "values": true }, "lines": true, - "linewidth": 2, + "linewidth": 1, "links": [], - "nullPointMode": "null", + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", + "options": { + "alertThreshold": true, + "dataLinks": [] + }, "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", + "repeat": null, + "repeatDirection": null, "seriesOverrides": [], - "spaceLength": 10, + "span": null, "stack": false, "steppedLine": false, "targets": [ { - "expr": 
"sum(rate(tikv_raftstore_proposal_total{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", type=~\"conf_change|transfer_leader\"}[1m])) by (type)", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_raftstore_proposal_total\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",type=~\"conf_change|transfer_leader\"}\n [$__rate_interval]\n)) by (type) ", "format": "time_series", - "intervalFactor": 2, + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, "legendFormat": "{{type}}", - "metric": "tikv_raftstore_proposal_total", - "refId": "A", - "step": 4 + "metric": "", + "query": "sum(rate(\n tikv_raftstore_proposal_total\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",type=~\"conf_change|transfer_leader\"}\n [$__rate_interval]\n)) by (type) ", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, "title": "Admin proposals", "tooltip": { - "msResolution": false, + "msResolution": true, "shared": true, "sort": 0, - "value_type": "cumulative" + "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -15511,6 +20838,7 @@ }, "yaxes": [ { + "decimals": null, "format": "ops", "label": null, "logBase": 1, @@ -15519,6 +20847,7 @@ "show": true }, { + "decimals": null, "format": "short", "label": null, "logBase": 1, @@ -15529,79 +20858,112 @@ ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } }, { "aliasColors": {}, "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", - "decimals": 1, "description": "The number of the processed apply command", "editable": true, "error": false, + "fieldConfig": { + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } 
+ } + }, "fill": 1, - "grid": {}, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, "gridPos": { "h": 7, "w": 12, "x": 12, - "y": 11 + "y": 0 }, - "id": 77, + "height": null, + "hideTimeOverride": false, + "id": 154, + "interval": null, + "isNew": true, "legend": { "alignAsTable": true, "avg": false, "current": true, + "hideEmpty": true, + "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, - "sort": "current", + "sort": "max", "sortDesc": true, "total": false, "values": true }, "lines": true, - "linewidth": 2, + "linewidth": 1, "links": [], - "nullPointMode": "null", + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", + "options": { + "alertThreshold": true, + "dataLinks": [] + }, "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", + "repeat": null, + "repeatDirection": null, "seriesOverrides": [], - "spaceLength": 10, + "span": null, "stack": false, "steppedLine": false, "targets": [ { - "expr": "sum(rate(tikv_raftstore_admin_cmd_total{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", status=\"success\", type!=\"compact\"}[1m])) by (type)", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_raftstore_admin_cmd_total\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",type!=\"compact\"}\n [$__rate_interval]\n)) by (type) ", "format": "time_series", - "intervalFactor": 2, + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, "legendFormat": "{{type}}", - "metric": "tikv_raftstore_admin_cmd_total", - "refId": "A", - "step": 4 + "metric": "", + "query": "sum(rate(\n tikv_raftstore_admin_cmd_total\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",type!=\"compact\"}\n 
[$__rate_interval]\n)) by (type) ", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, "title": "Admin apply", "tooltip": { - "msResolution": false, + "msResolution": true, "shared": true, "sort": 0, - "value_type": "cumulative" + "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -15609,14 +20971,16 @@ }, "yaxes": [ { + "decimals": null, "format": "ops", "label": null, "logBase": 1, "max": null, - "min": "0", + "min": null, "show": true }, { + "decimals": null, "format": "short", "label": null, "logBase": 1, @@ -15627,79 +20991,112 @@ ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } }, { "aliasColors": {}, "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", - "decimals": 1, - "description": "The number of raftstore split checksss", + "description": "The number of raftstore split checks", "editable": true, "error": false, + "fieldConfig": { + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } + }, "fill": 1, - "grid": {}, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, "gridPos": { "h": 7, "w": 12, "x": 0, - "y": 18 + "y": 7 }, - "id": 70, + "height": null, + "hideTimeOverride": false, + "id": 155, + "interval": null, + "isNew": true, "legend": { "alignAsTable": true, "avg": false, "current": true, + "hideEmpty": true, + "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, - "sort": "current", + "sort": "max", "sortDesc": true, "total": false, "values": true }, "lines": true, - "linewidth": 2, + "linewidth": 1, "links": [], - "nullPointMode": "null", + "maxDataPoints": null, + "maxPerRow": 
null, + "minSpan": null, + "nullPointMode": "null as zero", + "options": { + "alertThreshold": true, + "dataLinks": [] + }, "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", + "repeat": null, + "repeatDirection": null, "seriesOverrides": [], - "spaceLength": 10, + "span": null, "stack": false, "steppedLine": false, "targets": [ { - "expr": "sum(rate(tikv_raftstore_check_split_total{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", type!=\"ignore\"}[1m])) by (type)", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_raftstore_check_split_total\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",type!=\"ignore\"}\n [$__rate_interval]\n)) by (type) ", "format": "time_series", - "intervalFactor": 2, + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, "legendFormat": "{{type}}", - "metric": "tikv_raftstore_check_split_total", - "refId": "A", - "step": 4 + "metric": "", + "query": "sum(rate(\n tikv_raftstore_check_split_total\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",type!=\"ignore\"}\n [$__rate_interval]\n)) by (type) ", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, "title": "Check split", "tooltip": { - "msResolution": false, + "msResolution": true, "shared": true, "sort": 0, - "value_type": "cumulative" + "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -15707,6 +21104,7 @@ }, "yaxes": [ { + "decimals": null, "format": "ops", "label": null, "logBase": 1, @@ -15715,6 +21113,7 @@ "show": true }, { + "decimals": null, "format": "short", "label": null, "logBase": 1, @@ -15725,80 +21124,112 @@ ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } }, { "aliasColors": {}, 
"bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", - "decimals": 1, "description": "The time consumed when running split check in .9999", "editable": true, "error": false, + "fieldConfig": { + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } + }, "fill": 1, - "grid": {}, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, "gridPos": { "h": 7, "w": 12, "x": 12, - "y": 18 + "y": 7 }, - "id": 71, + "height": null, + "hideTimeOverride": false, + "id": 156, + "interval": null, + "isNew": true, "legend": { "alignAsTable": true, "avg": false, "current": true, "hideEmpty": true, + "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, - "sort": "current", + "sort": "max", "sortDesc": true, "total": false, "values": true }, "lines": true, - "linewidth": 2, + "linewidth": 1, "links": [], + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, "nullPointMode": "null as zero", + "options": { + "alertThreshold": true, + "dataLinks": [] + }, "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", + "repeat": null, + "repeatDirection": null, "seriesOverrides": [], - "spaceLength": 10, + "span": null, "stack": false, "steppedLine": false, "targets": [ { - "expr": "histogram_quantile(0.9999, sum(rate(tikv_raftstore_check_split_duration_seconds_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])) by (le, instance))", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "histogram_quantile(0.9999,(\n sum(rate(\n tikv_raftstore_check_split_duration_seconds_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (instance, le) \n \n \n)) ", "format": "time_series", - "intervalFactor": 2, + 
"hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, "legendFormat": "{{instance}}", - "metric": "tikv_raftstore_check_split_duration_seconds_bucket", - "refId": "A", - "step": 4 + "metric": "", + "query": "histogram_quantile(0.9999,(\n sum(rate(\n tikv_raftstore_check_split_duration_seconds_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (instance, le) \n \n \n)) ", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, "title": "99.99% Check split duration", "tooltip": { - "msResolution": false, + "msResolution": true, "shared": true, "sort": 0, - "value_type": "cumulative" + "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -15806,14 +21237,16 @@ }, "yaxes": [ { + "decimals": null, "format": "s", "label": null, "logBase": 1, "max": null, - "min": "0", + "min": null, "show": true }, { + "decimals": null, "format": "short", "label": null, "logBase": 1, @@ -15824,94 +21257,112 @@ ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } }, { "aliasColors": {}, "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", - "decimals": 1, + "description": null, "editable": true, "error": false, "fieldConfig": { "defaults": { - "custom": {} - }, - "overrides": [] + "thresholds": { + "mode": "absolute", + "steps": [] + } + } + }, + "fill": 1, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" }, - "fill": 0, - "fillGradient": 0, - "grid": {}, "gridPos": { "h": 7, "w": 12, "x": 0, - "y": 25 + "y": 14 }, - "hiddenSeries": false, - "id": 3636, + "height": null, + "hideTimeOverride": false, + "id": 
157, + "interval": null, + "isNew": true, "legend": { "alignAsTable": true, "avg": false, "current": true, + "hideEmpty": true, + "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, - "sideWidth": 400, + "sideWidth": null, + "sort": "max", + "sortDesc": true, "total": false, "values": true }, "lines": true, - "linewidth": 2, + "linewidth": 1, "links": [], - "nullPointMode": "null", + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", "options": { - "alertThreshold": true + "alertThreshold": true, + "dataLinks": [] }, "percentage": false, - "pluginVersion": "7.3.7", "pointradius": 5, "points": false, "renderer": "flot", + "repeat": null, + "repeatDirection": null, "seriesOverrides": [], - "spaceLength": 10, + "span": null, "stack": false, "steppedLine": false, "targets": [ { - "expr": "sum(delta(tikv_load_base_split_event[1m])) by (type)", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(delta(\n tikv_load_base_split_event\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [1m]\n)) by (type) ", + "format": "time_series", + "hide": false, + "instant": false, "interval": "", - "intervalFactor": 2, + "intervalFactor": 1, "legendFormat": "{{type}}", - "metric": "tikv_raftstore_check_split_total", - "refId": "A", - "step": 4 - }, - { - "expr": "", - "interval": "", - "legendFormat": "", - "refId": "B" + "metric": "", + "query": "sum(delta(\n tikv_load_base_split_event\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [1m]\n)) by (type) ", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, "title": "Load base split event", "tooltip": { - "msResolution": false, + "msResolution": true, "shared": true, "sort": 0, - "value_type": "cumulative" + "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { 
- "buckets": null, "mode": "time", "name": null, "show": true, @@ -15919,6 +21370,7 @@ }, "yaxes": [ { + "decimals": null, "format": "opm", "label": null, "logBase": 1, @@ -15927,6 +21379,7 @@ "show": true }, { + "decimals": null, "format": "short", "label": null, "logBase": 1, @@ -15937,105 +21390,142 @@ ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } }, { "aliasColors": {}, "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", + "description": null, "editable": true, "error": false, "fieldConfig": { - "defaults": {}, - "overrides": [] + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } }, "fill": 1, - "fillGradient": 0, - "grid": {}, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, "gridPos": { "h": 7, "w": 12, - "x": 0, - "y": 36 + "x": 12, + "y": 14 }, - "hiddenSeries": false, - "id": 23763572060, + "height": null, + "hideTimeOverride": false, + "id": 158, + "interval": null, + "isNew": true, "legend": { "alignAsTable": true, "avg": false, "current": true, + "hideEmpty": true, + "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, - "sort": "current", + "sideWidth": null, + "sort": "max", "sortDesc": true, "total": false, "values": true }, "lines": true, - "linewidth": 2, + "linewidth": 1, "links": [], - "nullPointMode": "null", + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", "options": { - "alertThreshold": true + "alertThreshold": true, + "dataLinks": [] }, "percentage": false, - "pluginVersion": "7.5.7", "pointradius": 5, "points": false, "renderer": "flot", + "repeat": null, + "repeatDirection": null, "seriesOverrides": [], - "spaceLength": 10, + "span": null, "stack": false, "steppedLine": false, "targets": [ { - "exemplar": true, - "expr": 
"histogram_quantile(0.80, sum(rate(tikv_load_base_split_duration_seconds_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=~\"$tidb_cluster.*\", instance=~\"$instance\"}[1m])) by (le, instance))", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "histogram_quantile(0.8,(\n sum(rate(\n tikv_load_base_split_duration_seconds_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (instance, le) \n \n \n)) ", "format": "time_series", + "hide": false, + "instant": false, "interval": "", - "intervalFactor": 2, + "intervalFactor": 1, "legendFormat": "80%-{{instance}}", - "refId": "A", - "step": 4 + "metric": "", + "query": "histogram_quantile(0.8,(\n sum(rate(\n tikv_load_base_split_duration_seconds_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (instance, le) \n \n \n)) ", + "refId": "", + "step": 10, + "target": "" }, { - "exemplar": true, - "expr": "histogram_quantile(0.90, sum(rate(tikv_load_base_split_duration_seconds_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=~\"$tidb_cluster.*\", instance=~\"$instance\"}[1m])) by (le, instance))", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "histogram_quantile(0.99,(\n sum(rate(\n tikv_load_base_split_duration_seconds_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (instance, le) \n \n \n)) ", + "format": "time_series", "hide": false, + "instant": false, "interval": "", - "intervalFactor": 2, + "intervalFactor": 1, "legendFormat": "99%-{{instance}}", - "refId": "B" + "metric": "", + "query": "histogram_quantile(0.99,(\n sum(rate(\n tikv_load_base_split_duration_seconds_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (instance, le) \n \n \n)) ", + "refId": "", + "step": 10, + "target": "" }, { - "exemplar": true, - "expr": 
"sum(rate(tikv_load_base_split_duration_seconds_sum{k8s_cluster=\"$k8s_cluster\", tidb_cluster=~\"$tidb_cluster.*\", instance=~\"$instance\"}[1m])) by (instance)", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "(sum(rate(\n tikv_load_base_split_duration_seconds_sum\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (instance) / sum(rate(\n tikv_load_base_split_duration_seconds_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (instance) )", + "format": "time_series", "hide": false, + "instant": false, "interval": "", - "intervalFactor": 2, + "intervalFactor": 1, "legendFormat": "avg-{{instance}}", - "refId": "C" + "metric": "", + "query": "(sum(rate(\n tikv_load_base_split_duration_seconds_sum\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (instance) / sum(rate(\n tikv_load_base_split_duration_seconds_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (instance) )", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, "title": "Load base split duration", "tooltip": { - "msResolution": false, + "msResolution": true, "shared": true, "sort": 0, - "value_type": "cumulative" + "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -16043,6 +21533,7 @@ }, "yaxes": [ { + "decimals": null, "format": "s", "label": null, "logBase": 1, @@ -16051,6 +21542,7 @@ "show": true }, { + "decimals": null, "format": "short", "label": null, "logBase": 1, @@ -16061,87 +21553,112 @@ ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } }, { "aliasColors": {}, "bars": false, - "dashLength": 10, - "dashes": false, 
+ "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", - "decimals": 1, + "description": null, "editable": true, "error": false, "fieldConfig": { - "defaults": {}, - "overrides": [] + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } + }, + "fill": 1, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" }, - "fill": 0, - "fillGradient": 0, - "grid": {}, "gridPos": { "h": 7, - "w": 12, - "x": 12, - "y": 36 + "w": 24, + "x": 0, + "y": 21 }, - "hiddenSeries": false, - "id": 23763573619, + "height": null, + "hideTimeOverride": false, + "id": 159, + "interval": null, + "isNew": true, "legend": { "alignAsTable": true, "avg": false, "current": true, + "hideEmpty": true, + "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, - "sideWidth": 300, + "sideWidth": null, + "sort": "max", + "sortDesc": true, "total": false, "values": true }, "lines": true, - "linewidth": 2, + "linewidth": 1, "links": [], + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, "nullPointMode": "null as zero", "options": { - "alertThreshold": true + "alertThreshold": true, + "dataLinks": [] }, "percentage": false, - "pluginVersion": "7.5.10", "pointradius": 5, "points": false, "renderer": "flot", + "repeat": null, + "repeatDirection": null, "seriesOverrides": [], - "spaceLength": 10, + "span": null, "stack": false, "steppedLine": false, "targets": [ { - "exemplar": true, - "expr": "sum(tikv_raftstore_peer_in_flashback_state{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}) by (instance)", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum((\n tikv_raftstore_peer_in_flashback_state\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n \n)) by (instance) ", + "format": "time_series", + "hide": false, + "instant": false, "interval": "", - 
"intervalFactor": 2, + "intervalFactor": 1, "legendFormat": "{{instance}}", - "metric": "tikv_raftstore_peer_in_flashback_state", - "refId": "A", - "step": 4 + "metric": "", + "query": "sum((\n tikv_raftstore_peer_in_flashback_state\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n \n)) by (instance) ", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, "title": "Peer in Flashback State", "tooltip": { - "msResolution": false, + "msResolution": true, "shared": true, "sort": 0, - "value_type": "cumulative" + "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -16149,7 +21666,8 @@ }, "yaxes": [ { - "format": "short", + "decimals": null, + "format": "none", "label": null, "logBase": 1, "max": null, @@ -16157,6 +21675,7 @@ "show": true }, { + "decimals": null, "format": "short", "label": null, "logBase": 1, @@ -16167,88 +21686,169 @@ ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } - }, + } + ], + "repeat": null, + "repeatDirection": null, + "span": null, + "targets": [], + "timeFrom": null, + "timeShift": null, + "title": "Raft Admin", + "transformations": [], + "transparent": false, + "type": "row" + }, + { + "cacheTimeout": null, + "collapsed": true, + "datasource": null, + "description": null, + "editable": true, + "error": false, + "fieldConfig": { + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } + }, + "gridPos": { + "h": 7, + "w": 24, + "x": 0, + "y": 0 + }, + "height": null, + "hideTimeOverride": false, + "id": 160, + "interval": null, + "links": [], + "maxDataPoints": 100, + "maxPerRow": null, + "minSpan": null, + "panels": [ { "aliasColors": {}, "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", - "decimals": 1, + 
"description": null, "editable": true, "error": false, "fieldConfig": { "defaults": { - "custom": {} - }, - "overrides": [] + "thresholds": { + "mode": "absolute", + "steps": [] + } + } + }, + "fill": 1, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" }, - "fill": 0, - "fillGradient": 0, - "grid": {}, "gridPos": { "h": 7, "w": 12, - "x": 12, - "y": 25 + "x": 0, + "y": 0 }, - "hiddenSeries": false, - "id": 3637, + "height": null, + "hideTimeOverride": false, + "id": 161, + "interval": null, + "isNew": true, "legend": { "alignAsTable": true, "avg": false, "current": true, + "hideEmpty": true, + "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, - "sideWidth": 300, + "sideWidth": null, + "sort": "max", + "sortDesc": true, "total": false, "values": true }, "lines": true, - "linewidth": 2, + "linewidth": 1, "links": [], - "nullPointMode": "null", + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", "options": { - "alertThreshold": true + "alertThreshold": true, + "dataLinks": [] }, "percentage": false, - "pluginVersion": "7.3.7", "pointradius": 5, "points": false, "renderer": "flot", + "repeat": null, + "repeatDirection": null, "seriesOverrides": [], - "spaceLength": 10, + "span": null, "stack": false, "steppedLine": false, "targets": [ { - "expr": "tikv_read_qps_topn{order=\"0\"}", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "histogram_quantile(0.9999,(\n sum(rate(\n tikv_raftstore_raft_log_gc_write_duration_secs_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (instance, le) \n \n \n)) ", + "format": "time_series", + "hide": false, + "instant": false, "interval": "", - "intervalFactor": 2, - "legendFormat": "{{instance}}", - "metric": "tikv_raftstore_check_split_total", - "refId": "A", - 
"step": 4 + "intervalFactor": 1, + "legendFormat": "99.99%-{{instance}}", + "metric": "", + "query": "histogram_quantile(0.9999,(\n sum(rate(\n tikv_raftstore_raft_log_gc_write_duration_secs_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (instance, le) \n \n \n)) ", + "refId": "", + "step": 10, + "target": "" + }, + { + "datasource": "${DS_TEST-CLUSTER}", + "expr": "(sum(rate(\n tikv_raftstore_raft_log_gc_write_duration_secs_sum\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (instance) / sum(rate(\n tikv_raftstore_raft_log_gc_write_duration_secs_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (instance) )", + "format": "time_series", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "avg-{{instance}}", + "metric": "", + "query": "(sum(rate(\n tikv_raftstore_raft_log_gc_write_duration_secs_sum\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (instance) / sum(rate(\n tikv_raftstore_raft_log_gc_write_duration_secs_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (instance) )", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, - "title": "TopN QPS exceeds threshold", + "title": "Raft log GC write duration", "tooltip": { - "msResolution": false, + "msResolution": true, "shared": true, "sort": 0, - "value_type": "cumulative" + "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -16256,14 +21856,16 @@ }, "yaxes": [ { - "format": "ops", + "decimals": null, + "format": "s", "label": 
null, - "logBase": 1, + "logBase": 10, "max": null, "min": null, "show": true }, { + "decimals": null, "format": "short", "label": null, "logBase": 1, @@ -16274,59 +21876,56 @@ ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } - } - ], - "repeat": null, - "title": "Raft Admin", - "type": "row" - }, - { - "collapsed": true, - "datasource": null, - "gridPos": { - "h": 1, - "w": 24, - "x": 0, - "y": 15 - }, - "id": 12797, - "panels": [ + }, { "aliasColors": {}, "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", - "decimals": 1, - "description": "", + "description": null, + "editable": true, + "error": false, "fieldConfig": { - "defaults": {}, - "overrides": [] + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } }, "fill": 1, - "fillGradient": 0, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, "gridPos": { - "h": 8, + "h": 7, "w": 12, - "x": 0, - "y": 16 + "x": 12, + "y": 0 }, - "hiddenSeries": false, - "id": 12882, + "height": null, + "hideTimeOverride": false, + "id": 162, + "interval": null, + "isNew": true, "legend": { "alignAsTable": true, "avg": false, "current": true, - "hideEmpty": false, + "hideEmpty": true, "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, - "sort": "current", + "sort": "max", "sortDesc": true, "total": false, "values": true @@ -16334,195 +21933,70 @@ "lines": true, "linewidth": 1, "links": [], - "nullPointMode": "null", + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", "options": { - "alertThreshold": true + "alertThreshold": true, + "dataLinks": [] }, "percentage": false, - "pluginVersion": "7.5.7", "pointradius": 5, "points": false, "renderer": "flot", + "repeat": null, + "repeatDirection": null, 
"seriesOverrides": [], - "spaceLength": 10, + "span": null, "stack": false, "steppedLine": false, "targets": [ { - "exemplar": true, - "expr": "histogram_quantile(0.99, sum(rate(tikv_raftstore_raft_log_gc_write_duration_secs_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])) by (le, instance))", - "format": "time_series", - "interval": "", - "intervalFactor": 2, - "legendFormat": "99% - {{instance}}", - "refId": "A", - "step": 10 - }, - { - "exemplar": true, - "expr": "histogram_quantile(0.95, sum(rate(tikv_raftstore_raft_log_gc_write_duration_secs_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])) by (le, instance))", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "histogram_quantile(0.9999,(\n sum(rate(\n tikv_raftstore_raft_log_kv_sync_duration_secs_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (instance, le) \n \n \n)) ", "format": "time_series", "hide": false, + "instant": false, "interval": "", - "intervalFactor": 2, - "legendFormat": "95% - {{instance}}", - "refId": "B", - "step": 10 + "intervalFactor": 1, + "legendFormat": "99.99%-{{instance}}", + "metric": "", + "query": "histogram_quantile(0.9999,(\n sum(rate(\n tikv_raftstore_raft_log_kv_sync_duration_secs_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (instance, le) \n \n \n)) ", + "refId": "", + "step": 10, + "target": "" }, { - "exemplar": true, - "expr": "sum(rate(tikv_raftstore_raft_log_gc_write_duration_secs_sum{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])) by (instance) / sum(rate(tikv_raftstore_raft_log_gc_write_duration_secs_count{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])) by (instance)", - "format": "time_series", - "hide": false, - "interval": "", - 
"intervalFactor": 2, - "legendFormat": "avg - {{instance}}", - "refId": "C", - "step": 10 - } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "Raft log GC write duration", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "s", - "label": null, - "logBase": 10, - "max": null, - "min": null, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${DS_TEST-CLUSTER}", - "decimals": 1, - "description": "", - "fieldConfig": { - "defaults": {}, - "overrides": [] - }, - "fill": 1, - "fillGradient": 0, - "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 16 - }, - "hiddenSeries": false, - "id": 12886, - "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "hideEmpty": false, - "hideZero": true, - "max": true, - "min": false, - "rightSide": true, - "show": true, - "sideWidth": null, - "sort": "current", - "sortDesc": true, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "links": [], - "nullPointMode": "null", - "options": { - "alertThreshold": true - }, - "percentage": false, - "pluginVersion": "7.5.7", - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "exemplar": true, - "expr": "histogram_quantile(0.99, sum(rate(tikv_raftstore_raft_log_kv_sync_duration_secs_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])) by (le, instance))", - "format": "time_series", - "interval": "", - "intervalFactor": 2, - 
"legendFormat": "99% - {{instance}}", - "refId": "A", - "step": 10 - }, - { - "exemplar": true, - "expr": "histogram_quantile(0.95, sum(rate(tikv_raftstore_raft_log_kv_sync_duration_secs_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])) by (le, instance))", - "format": "time_series", - "hide": false, - "interval": "", - "intervalFactor": 2, - "legendFormat": "95% - {{instance}}", - "refId": "B", - "step": 10 - }, - { - "exemplar": true, - "expr": "sum(rate(tikv_raftstore_raft_log_kv_sync_duration_secs_sum{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])) by (instance) / sum(rate(tikv_raftstore_raft_log_kv_sync_duration_secs_count{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])) by (instance)", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "(sum(rate(\n tikv_raftstore_raft_log_kv_sync_duration_secs_sum\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (instance) / sum(rate(\n tikv_raftstore_raft_log_kv_sync_duration_secs_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (instance) )", "format": "time_series", "hide": false, + "instant": false, "interval": "", - "intervalFactor": 2, - "legendFormat": "avg - {{instance}}", - "refId": "C", - "step": 10 + "intervalFactor": 1, + "legendFormat": "avg-{{instance}}", + "metric": "", + "query": "(sum(rate(\n tikv_raftstore_raft_log_kv_sync_duration_secs_sum\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (instance) / sum(rate(\n tikv_raftstore_raft_log_kv_sync_duration_secs_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (instance) )", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": 
null, - "timeRegions": [], "timeShift": null, "title": "Raft log GC kv sync duration", "tooltip": { + "msResolution": true, "shared": true, "sort": 0, "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -16530,6 +22004,7 @@ }, "yaxes": [ { + "decimals": null, "format": "s", "label": null, "logBase": 10, @@ -16538,6 +22013,7 @@ "show": true }, { + "decimals": null, "format": "short", "label": null, "logBase": 1, @@ -16548,91 +22024,112 @@ ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } }, { "aliasColors": {}, "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", - "decimals": 1, - "description": "", + "description": null, "editable": true, "error": false, "fieldConfig": { - "defaults": {}, - "overrides": [] + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } }, "fill": 1, - "fillGradient": 0, - "grid": {}, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, "gridPos": { - "h": 8, + "h": 7, "w": 12, "x": 0, - "y": 24 + "y": 7 }, - "hiddenSeries": false, - "id": 12881, + "height": null, + "hideTimeOverride": false, + "id": 163, + "interval": null, + "isNew": true, "legend": { "alignAsTable": true, "avg": false, "current": true, + "hideEmpty": true, + "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, - "sort": "current", + "sort": "max", "sortDesc": true, "total": false, "values": true }, "lines": true, - "linewidth": 2, + "linewidth": 1, "links": [], - "nullPointMode": "null", + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", "options": { - "alertThreshold": true + "alertThreshold": true, + "dataLinks": [] }, 
"percentage": false, - "pluginVersion": "7.5.7", "pointradius": 5, "points": false, "renderer": "flot", + "repeat": null, + "repeatDirection": null, "seriesOverrides": [], - "spaceLength": 10, + "span": null, "stack": false, "steppedLine": false, "targets": [ { - "exemplar": true, - "expr": "sum(rate(tikv_raftstore_raft_log_gc_write_duration_secs_count{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])) by (instance)", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_raftstore_raft_log_gc_write_duration_secs_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (instance) ", "format": "time_series", + "hide": false, + "instant": false, "interval": "", - "intervalFactor": 2, + "intervalFactor": 1, "legendFormat": "{{instance}}", - "metric": "tikv_raftstore_check_split_total", - "refId": "A", - "step": 4 + "metric": "", + "query": "sum(rate(\n tikv_raftstore_raft_log_gc_write_duration_secs_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (instance) ", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, - "title": "Raft log GC write operations ", + "title": "Raft log GC write operations", "tooltip": { - "msResolution": false, + "msResolution": true, "shared": true, "sort": 0, - "value_type": "cumulative" + "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -16640,6 +22137,7 @@ }, "yaxes": [ { + "decimals": null, "format": "ops", "label": null, "logBase": 1, @@ -16648,6 +22146,7 @@ "show": true }, { + "decimals": null, "format": "short", "label": null, "logBase": 1, @@ -16658,91 +22157,112 @@ ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } }, { "aliasColors": 
{}, "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", - "decimals": 1, - "description": "", + "description": null, "editable": true, "error": false, "fieldConfig": { - "defaults": {}, - "overrides": [] + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } }, "fill": 1, - "fillGradient": 0, - "grid": {}, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, "gridPos": { - "h": 8, + "h": 7, "w": 12, "x": 12, - "y": 24 + "y": 7 }, - "hiddenSeries": false, - "id": 12884, + "height": null, + "hideTimeOverride": false, + "id": 164, + "interval": null, + "isNew": true, "legend": { "alignAsTable": true, "avg": false, "current": true, + "hideEmpty": true, + "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, - "sort": "current", + "sort": "max", "sortDesc": true, "total": false, "values": true }, "lines": true, - "linewidth": 2, + "linewidth": 1, "links": [], - "nullPointMode": "null", + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", "options": { - "alertThreshold": true + "alertThreshold": true, + "dataLinks": [] }, "percentage": false, - "pluginVersion": "7.5.7", "pointradius": 5, "points": false, "renderer": "flot", + "repeat": null, + "repeatDirection": null, "seriesOverrides": [], - "spaceLength": 10, + "span": null, "stack": false, "steppedLine": false, "targets": [ { - "exemplar": true, - "expr": "sum(rate(tikv_raftstore_raft_log_gc_seek_operations_count{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])) by (instance)", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_raftstore_raft_log_gc_seek_operations_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n 
[$__rate_interval]\n)) by (instance) ", "format": "time_series", + "hide": false, + "instant": false, "interval": "", - "intervalFactor": 2, + "intervalFactor": 1, "legendFormat": "{{instance}}", - "metric": "tikv_raftstore_check_split_total", - "refId": "A", - "step": 4 + "metric": "", + "query": "sum(rate(\n tikv_raftstore_raft_log_gc_seek_operations_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (instance) ", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, "title": "Raft log GC seek operations ", "tooltip": { - "msResolution": false, + "msResolution": true, "shared": true, "sort": 0, - "value_type": "cumulative" + "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -16750,6 +22270,7 @@ }, "yaxes": [ { + "decimals": null, "format": "ops", "label": null, "logBase": 1, @@ -16758,6 +22279,7 @@ "show": true }, { + "decimals": null, "format": "short", "label": null, "logBase": 1, @@ -16768,91 +22290,112 @@ ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } }, { "aliasColors": {}, "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", - "decimals": 1, - "description": "", + "description": null, "editable": true, "error": false, "fieldConfig": { - "defaults": {}, - "overrides": [] + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } }, "fill": 1, - "fillGradient": 0, - "grid": {}, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, "gridPos": { - "h": 8, + "h": 7, "w": 12, "x": 0, - "y": 32 + "y": 14 }, - "hiddenSeries": false, - "id": 12887, + "height": null, + 
"hideTimeOverride": false, + "id": 165, + "interval": null, + "isNew": true, "legend": { "alignAsTable": true, "avg": false, "current": true, + "hideEmpty": true, + "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, - "sort": "current", + "sort": "max", "sortDesc": true, "total": false, "values": true }, "lines": true, - "linewidth": 2, + "linewidth": 1, "links": [], - "nullPointMode": "null", + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", "options": { - "alertThreshold": true + "alertThreshold": true, + "dataLinks": [] }, "percentage": false, - "pluginVersion": "7.5.7", "pointradius": 5, "points": false, "renderer": "flot", + "repeat": null, + "repeatDirection": null, "seriesOverrides": [], - "spaceLength": 10, + "span": null, "stack": false, "steppedLine": false, "targets": [ { - "exemplar": true, - "expr": "sum(rate(tikv_raftstore_log_lag_sum{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])) by (instance)", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_raftstore_log_lag_sum\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (instance) ", "format": "time_series", + "hide": false, + "instant": false, "interval": "", - "intervalFactor": 2, + "intervalFactor": 1, "legendFormat": "{{instance}}", - "metric": "tikv_raftstore_check_split_total", - "refId": "A", - "step": 4 + "metric": "", + "query": "sum(rate(\n tikv_raftstore_log_lag_sum\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (instance) ", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, "title": "Raft log lag", "tooltip": { - "msResolution": false, + "msResolution": true, "shared": true, "sort": 0, - "value_type": "cumulative" + "value_type": 
"individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -16860,6 +22403,7 @@ }, "yaxes": [ { + "decimals": null, "format": "none", "label": null, "logBase": 1, @@ -16868,6 +22412,7 @@ "show": true }, { + "decimals": null, "format": "short", "label": null, "logBase": 1, @@ -16878,91 +22423,112 @@ ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } }, { "aliasColors": {}, "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", - "decimals": 1, - "description": "", + "description": null, "editable": true, "error": false, "fieldConfig": { - "defaults": {}, - "overrides": [] + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } }, "fill": 1, - "fillGradient": 0, - "grid": {}, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, "gridPos": { - "h": 8, + "h": 7, "w": 12, "x": 12, - "y": 32 + "y": 14 }, - "hiddenSeries": false, - "id": 12975, + "height": null, + "hideTimeOverride": false, + "id": 166, + "interval": null, + "isNew": true, "legend": { "alignAsTable": true, "avg": false, "current": true, + "hideEmpty": true, + "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, - "sort": "current", + "sort": "max", "sortDesc": true, "total": false, "values": true }, "lines": true, - "linewidth": 2, + "linewidth": 1, "links": [], - "nullPointMode": "null", + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", "options": { - "alertThreshold": true + "alertThreshold": true, + "dataLinks": [] }, "percentage": false, - "pluginVersion": "7.5.7", "pointradius": 5, "points": false, "renderer": "flot", + "repeat": null, + "repeatDirection": null, "seriesOverrides": [], - 
"spaceLength": 10, + "span": null, "stack": false, "steppedLine": false, "targets": [ { - "exemplar": true, - "expr": "sum(rate(tikv_raftstore_raft_log_gc_skipped{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])) by (instance, reason)", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_raftstore_raft_log_gc_skipped\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (instance, reason) ", "format": "time_series", + "hide": false, + "instant": false, "interval": "", - "intervalFactor": 2, - "legendFormat": "{{instance}} - {{reason}}", - "metric": "tikv_raftstore_check_split_total", - "refId": "A", - "step": 4 + "intervalFactor": 1, + "legendFormat": "{{instance}}-{{reason}}", + "metric": "", + "query": "sum(rate(\n tikv_raftstore_raft_log_gc_skipped\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (instance, reason) ", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, "title": "Raft log gc skipped", "tooltip": { - "msResolution": false, + "msResolution": true, "shared": true, "sort": 0, - "value_type": "cumulative" + "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -16970,6 +22536,7 @@ }, "yaxes": [ { + "decimals": null, "format": "none", "label": null, "logBase": 1, @@ -16978,6 +22545,7 @@ "show": true }, { + "decimals": null, "format": "short", "label": null, "logBase": 1, @@ -16988,91 +22556,112 @@ ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } }, { "aliasColors": {}, "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", - "decimals": 1, - "description": "", + "description": null, "editable": true, 
"error": false, "fieldConfig": { - "defaults": {}, - "overrides": [] + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } }, "fill": 1, - "fillGradient": 0, - "grid": {}, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, "gridPos": { - "h": 8, + "h": 7, "w": 12, "x": 0, - "y": 40 + "y": 21 }, - "hiddenSeries": false, - "id": 12974, + "height": null, + "hideTimeOverride": false, + "id": 167, + "interval": null, + "isNew": true, "legend": { "alignAsTable": true, "avg": false, "current": true, + "hideEmpty": true, + "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, - "sort": "current", + "sort": "max", "sortDesc": true, "total": false, "values": true }, "lines": true, - "linewidth": 2, + "linewidth": 1, "links": [], - "nullPointMode": "null", + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", "options": { - "alertThreshold": true + "alertThreshold": true, + "dataLinks": [] }, "percentage": false, - "pluginVersion": "7.5.7", "pointradius": 5, "points": false, "renderer": "flot", + "repeat": null, + "repeatDirection": null, "seriesOverrides": [], - "spaceLength": 10, + "span": null, "stack": false, "steppedLine": false, "targets": [ { - "exemplar": true, - "expr": "sum(rate(tikv_raftstore_raft_log_gc_failed{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])) by (instance)", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_raftstore_raft_log_gc_failed\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (instance) ", "format": "time_series", + "hide": false, + "instant": false, "interval": "", - "intervalFactor": 2, + "intervalFactor": 1, "legendFormat": "{{instance}}", - "metric": 
"tikv_raftstore_check_split_total", - "refId": "A", - "step": 4 + "metric": "", + "query": "sum(rate(\n tikv_raftstore_raft_log_gc_failed\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (instance) ", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, "title": "Raft log GC failed", "tooltip": { - "msResolution": false, + "msResolution": true, "shared": true, "sort": 0, - "value_type": "cumulative" + "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -17080,6 +22669,7 @@ }, "yaxes": [ { + "decimals": null, "format": "ops", "label": null, "logBase": 1, @@ -17088,6 +22678,7 @@ "show": true }, { + "decimals": null, "format": "short", "label": null, "logBase": 1, @@ -17098,91 +22689,112 @@ ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } }, { "aliasColors": {}, "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", - "decimals": 1, - "description": "", + "description": null, "editable": true, "error": false, "fieldConfig": { - "defaults": {}, - "overrides": [] + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } }, "fill": 1, - "fillGradient": 0, - "grid": {}, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, "gridPos": { - "h": 8, + "h": 7, "w": 12, "x": 12, - "y": 40 + "y": 21 }, - "hiddenSeries": false, - "id": 23763572229, + "height": null, + "hideTimeOverride": false, + "id": 168, + "interval": null, + "isNew": true, "legend": { "alignAsTable": true, "avg": false, "current": true, + "hideEmpty": true, + "hideZero": true, "max": true, "min": false, "rightSide": true, "show": 
true, "sideWidth": null, - "sort": "current", + "sort": "max", "sortDesc": true, "total": false, "values": true }, "lines": true, - "linewidth": 2, + "linewidth": 1, "links": [], - "nullPointMode": "null", + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", "options": { - "alertThreshold": true + "alertThreshold": true, + "dataLinks": [] }, "percentage": false, - "pluginVersion": "7.5.7", "pointradius": 5, "points": false, "renderer": "flot", + "repeat": null, + "repeatDirection": null, "seriesOverrides": [], - "spaceLength": 10, + "span": null, "stack": false, "steppedLine": false, "targets": [ { - "exemplar": true, - "expr": "sum(rate(tikv_raftstore_entry_fetches{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])) by (type)", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_raftstore_entry_fetches\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (type) ", "format": "time_series", + "hide": false, + "instant": false, "interval": "", - "intervalFactor": 2, + "intervalFactor": 1, "legendFormat": "{{type}}", - "metric": "tikv_raftstore_check_split_total", - "refId": "A", - "step": 4 + "metric": "", + "query": "sum(rate(\n tikv_raftstore_entry_fetches\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (type) ", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, - "title": "Raft log fetch", + "title": "Raft log fetch ", "tooltip": { - "msResolution": false, + "msResolution": true, "shared": true, "sort": 0, - "value_type": "cumulative" + "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -17190,6 +22802,7 @@ }, "yaxes": [ { + "decimals": null, 
"format": "ops", "label": null, "logBase": 1, @@ -17198,6 +22811,7 @@ "show": true }, { + "decimals": null, "format": "short", "label": null, "logBase": 1, @@ -17208,124 +22822,153 @@ ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } }, { "aliasColors": {}, "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", - "decimals": 1, - "description": "", + "description": null, + "editable": true, + "error": false, "fieldConfig": { - "defaults": {}, - "overrides": [] + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } }, "fill": 1, - "fillGradient": 0, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, "gridPos": { - "h": 8, - "w": 12, + "h": 7, + "w": 24, "x": 0, - "y": 48 + "y": 28 }, - "hiddenSeries": false, - "id": 23763572555, + "height": null, + "hideTimeOverride": false, + "id": 169, + "interval": null, + "isNew": true, "legend": { - "alignAsTable": false, + "alignAsTable": true, "avg": false, - "current": false, - "hideEmpty": false, + "current": true, + "hideEmpty": true, "hideZero": true, - "max": false, + "max": true, "min": false, - "rightSide": false, + "rightSide": true, "show": true, "sideWidth": null, - "sort": "current", + "sort": "max", "sortDesc": true, "total": false, - "values": false + "values": true }, "lines": true, "linewidth": 1, "links": [], - "nullPointMode": "null", + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", "options": { - "alertThreshold": false + "alertThreshold": true, + "dataLinks": [] }, "percentage": false, - "pluginVersion": "7.5.11", "pointradius": 5, "points": false, "renderer": "flot", + "repeat": null, + "repeatDirection": null, "seriesOverrides": [ { "alias": "/pending-task/", + "bars": false, + "fill": 1, + "fillBelowTo": null, + "lines": true, 
"transform": "negative-Y", - "yaxis": 2 + "yaxis": 2, + "zindex": 0 } ], - "spaceLength": 10, + "span": null, "stack": false, "steppedLine": false, "targets": [ { - "exemplar": true, - "expr": "histogram_quantile(0.99, sum(rate(tikv_raftstore_entry_fetches_task_duration_seconds_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])) by (le))", - "format": "time_series", - "interval": "", - "intervalFactor": 2, - "legendFormat": "99%", - "refId": "A", - "step": 10 - }, - { - "exemplar": true, - "expr": "histogram_quantile(0.95, sum(rate(tikv_raftstore_entry_fetches_task_duration_seconds_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])) by (le))", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "histogram_quantile(0.9999,(\n sum(rate(\n tikv_raftstore_entry_fetches_task_duration_seconds_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (le) \n \n \n)) ", "format": "time_series", "hide": false, + "instant": false, "interval": "", - "intervalFactor": 2, - "legendFormat": "95%", - "refId": "B", - "step": 10 + "intervalFactor": 1, + "legendFormat": "99.99%", + "metric": "", + "query": "histogram_quantile(0.9999,(\n sum(rate(\n tikv_raftstore_entry_fetches_task_duration_seconds_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (le) \n \n \n)) ", + "refId": "", + "step": 10, + "target": "" }, { - "exemplar": true, - "expr": "sum(rate(tikv_raftstore_entry_fetches_task_duration_seconds_sum{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])) / sum(rate(tikv_raftstore_entry_fetches_task_duration_seconds_count{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m]))", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "(sum(rate(\n 
tikv_raftstore_entry_fetches_task_duration_seconds_sum\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (instance) / sum(rate(\n tikv_raftstore_entry_fetches_task_duration_seconds_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (instance) )", "format": "time_series", "hide": false, + "instant": false, "interval": "", - "intervalFactor": 2, - "legendFormat": "avg", - "refId": "C", - "step": 10 + "intervalFactor": 1, + "legendFormat": "avg-{{instance}}", + "metric": "", + "query": "(sum(rate(\n tikv_raftstore_entry_fetches_task_duration_seconds_sum\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (instance) / sum(rate(\n tikv_raftstore_entry_fetches_task_duration_seconds_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (instance) )", + "refId": "", + "step": 10, + "target": "" }, { - "exemplar": true, - "expr": "sum(tikv_worker_pending_task_total{k8s_cluster=\"$k8s_cluster\", tidb_cluster=~\"$tidb_cluster.*\", instance=~\"$instance\", name=~\"raftlog-fetch-worker\"})", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum((\n tikv_worker_pending_task_total\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",name=~\"raftlog-fetch-worker\"}\n \n)) by (instance) ", + "format": "time_series", "hide": false, + "instant": false, "interval": "", + "intervalFactor": 1, "legendFormat": "pending-task", - "refId": "D" + "metric": "", + "query": "sum((\n tikv_worker_pending_task_total\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",name=~\"raftlog-fetch-worker\"}\n \n)) by (instance) ", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, "title": "Raft log 
async fetch task duration", "tooltip": { + "msResolution": true, "shared": true, "sort": 0, "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -17342,6 +22985,7 @@ "show": true }, { + "decimals": null, "format": "short", "label": null, "logBase": 1, @@ -17352,48 +22996,98 @@ ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } } ], + "repeat": null, + "repeatDirection": null, + "span": null, + "targets": [], + "timeFrom": null, + "timeShift": null, "title": "Raft Log", + "transformations": [], + "transparent": false, "type": "row" }, { + "cacheTimeout": null, "collapsed": true, "datasource": null, + "description": null, + "editable": true, + "error": false, + "fieldConfig": { + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } + }, "gridPos": { - "h": 1, + "h": 7, "w": 24, "x": 0, - "y": 16 + "y": 0 }, - "id": 2753, + "height": null, + "hideTimeOverride": false, + "id": 170, + "interval": null, + "links": [], + "maxDataPoints": 100, + "maxPerRow": null, + "minSpan": null, "panels": [ { "aliasColors": {}, "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", - "description": "The number of rejections from the local read thread and The number of total requests", + "description": null, + "editable": true, + "error": false, + "fieldConfig": { + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } + }, "fill": 1, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, "gridPos": { "h": 7, - "w": 12, + "w": 24, "x": 0, - "y": 12 + "y": 0 }, - "id": 2292, + "height": null, + "hideTimeOverride": false, + "id": 171, + "interval": null, + "isNew": true, "legend": { "alignAsTable": true, "avg": false, 
"current": true, + "hideEmpty": true, + "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, - "sort": "current", + "sideWidth": null, + "sort": "max", "sortDesc": true, "total": false, "values": true @@ -17401,56 +23095,95 @@ "lines": true, "linewidth": 1, "links": [], - "nullPointMode": "null", + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", + "options": { + "alertThreshold": true, + "dataLinks": [] + }, "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", + "repeat": null, + "repeatDirection": null, "seriesOverrides": [ { - "alias": "/.*-total/i", - "yaxis": 2 + "alias": "/.*-total/", + "bars": false, + "fill": 1, + "fillBelowTo": null, + "lines": true, + "yaxis": 2, + "zindex": 0 } ], - "spaceLength": 10, + "span": null, "stack": false, "steppedLine": false, "targets": [ { - "expr": "sum(rate(tikv_raftstore_local_read_reject_total{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])) by (instance, reason)", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_raftstore_local_read_reject_total\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (instance, reason) ", "format": "time_series", - "intervalFactor": 2, + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, "legendFormat": "{{instance}}-reject-by-{{reason}}", - "refId": "A" + "metric": "", + "query": "sum(rate(\n tikv_raftstore_local_read_reject_total\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (instance, reason) ", + "refId": "", + "step": 10, + "target": "" }, { - "expr": "sum(rate(tikv_raftstore_local_read_executed_requests{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])) by (instance)", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n 
tikv_raftstore_local_read_executed_requests\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (instance) ", "format": "time_series", - "intervalFactor": 2, + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, "legendFormat": "{{instance}}-total", - "refId": "B" + "metric": "", + "query": "sum(rate(\n tikv_raftstore_local_read_executed_requests\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (instance) ", + "refId": "", + "step": 10, + "target": "" }, { - "expr": "sum(rate(tikv_raftstore_local_read_executed_stale_read_requests{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])) by (instance)", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_raftstore_local_read_executed_stale_read_requests\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (instance) ", "format": "time_series", - "intervalFactor": 2, + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, "legendFormat": "{{instance}}-stale-read", - "refId": "C" + "metric": "", + "query": "sum(rate(\n tikv_raftstore_local_read_executed_stale_read_requests\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (instance) ", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, - "title": "Local reader requests", + "title": "Raft log async fetch task duration", "tooltip": { + "msResolution": true, "shared": true, "sort": 0, "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -17458,7 +23191,8 @@ }, "yaxes": [ { - "format": "short", + "decimals": null, + "format": "none", 
"label": null, "logBase": 1, "max": null, @@ -17466,6 +23200,7 @@ "show": true }, { + "decimals": null, "format": "short", "label": null, "logBase": 1, @@ -17476,86 +23211,154 @@ ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } } ], "repeat": null, - "title": "Local reader", + "repeatDirection": null, + "span": null, + "targets": [], + "timeFrom": null, + "timeShift": null, + "title": "Local Reader", + "transformations": [], + "transparent": false, "type": "row" }, { + "cacheTimeout": null, "collapsed": true, "datasource": null, + "description": null, + "editable": true, + "error": false, + "fieldConfig": { + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } + }, "gridPos": { - "h": 1, + "h": 7, "w": 24, "x": 0, - "y": 17 + "y": 0 }, - "id": 4200, + "height": null, + "hideTimeOverride": false, + "id": 172, + "interval": null, + "links": [], + "maxDataPoints": 100, + "maxPerRow": null, + "minSpan": null, "panels": [ { "aliasColors": {}, "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", "description": "The time used by each level in the unified read pool per second. 
Level 0 refers to small queries.", + "editable": true, + "error": false, + "fieldConfig": { + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } + }, "fill": 1, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, "gridPos": { - "h": 8, + "h": 7, "w": 12, "x": 0, - "y": 13 + "y": 0 }, - "id": 4194, + "height": null, + "hideTimeOverride": false, + "id": 173, + "interval": null, + "isNew": true, "legend": { "alignAsTable": true, "avg": false, "current": true, + "hideEmpty": true, + "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, - "sideWidth": 250, + "sideWidth": null, + "sort": "max", + "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], - "nullPointMode": "null", + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", + "options": { + "alertThreshold": true, + "dataLinks": [] + }, "percentage": false, - "pointradius": 2, + "pointradius": 5, "points": false, "renderer": "flot", + "repeat": null, + "repeatDirection": null, "seriesOverrides": [], - "spaceLength": 10, + "span": null, "stack": false, "steppedLine": false, "targets": [ { - "expr": "sum(rate(tikv_multilevel_level_elapsed{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", name=\"unified-read-pool\"}[1m])) by (level)", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_multilevel_level_elapsed\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",name=\"unified-read-pool\"}\n [$__rate_interval]\n)) by (level) ", "format": "time_series", + "hide": false, + "instant": false, + "interval": "", "intervalFactor": 1, "legendFormat": "{{level}}", - "refId": "A" + "metric": "", + "query": "sum(rate(\n tikv_multilevel_level_elapsed\n 
{k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",name=\"unified-read-pool\"}\n [$__rate_interval]\n)) by (level) ", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, "title": "Time used by level", "tooltip": { + "msResolution": true, "shared": true, "sort": 0, "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -17563,7 +23366,8 @@ }, "yaxes": [ { - "format": "µs", + "decimals": null, + "format": "\u00b5s", "label": null, "logBase": 1, "max": null, @@ -17571,6 +23375,7 @@ "show": true }, { + "decimals": null, "format": "short", "label": null, "logBase": 1, @@ -17581,69 +23386,112 @@ ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } }, { "aliasColors": {}, "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", "description": "The chance that level 0 (small) tasks are scheduled in the unified read pool.", + "editable": true, + "error": false, + "fieldConfig": { + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } + }, "fill": 1, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, "gridPos": { - "h": 8, + "h": 7, "w": 12, "x": 12, - "y": 13 + "y": 0 }, - "id": 4196, + "height": null, + "hideTimeOverride": false, + "id": 174, + "interval": null, + "isNew": true, "legend": { "alignAsTable": true, "avg": false, "current": true, + "hideEmpty": true, + "hideZero": true, "max": true, - "min": true, + "min": false, "rightSide": true, "show": true, + "sideWidth": null, + "sort": "max", + "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], - "nullPointMode": "null", + 
"maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", + "options": { + "alertThreshold": true, + "dataLinks": [] + }, "percentage": false, - "pointradius": 2, + "pointradius": 5, "points": false, "renderer": "flot", + "repeat": null, + "repeatDirection": null, "seriesOverrides": [], - "spaceLength": 10, + "span": null, "stack": false, "steppedLine": false, "targets": [ { - "expr": "tikv_multilevel_level0_chance{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", name=\"unified-read-pool\"}", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "((\n tikv_multilevel_level0_chance\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",name=\"unified-read-pool\"}\n \n)) ", "format": "time_series", + "hide": false, + "instant": false, + "interval": "", "intervalFactor": 1, - "legendFormat": "{{instance}}", - "refId": "A" + "legendFormat": "{{type}}", + "metric": "", + "query": "((\n tikv_multilevel_level0_chance\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",name=\"unified-read-pool\"}\n \n)) ", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, "title": "Level 0 chance", "tooltip": { + "msResolution": true, "shared": true, "sort": 0, "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -17651,6 +23499,7 @@ }, "yaxes": [ { + "decimals": null, "format": "percentunit", "label": null, "logBase": 1, @@ -17659,6 +23508,7 @@ "show": true }, { + "decimals": null, "format": "short", "label": null, "logBase": 1, @@ -17669,70 +23519,112 @@ ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } }, { "aliasColors": {}, "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", 
"description": "The number of concurrently running tasks in the unified read pool.", + "editable": true, + "error": false, + "fieldConfig": { + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } + }, "fill": 1, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, "gridPos": { - "h": 8, + "h": 7, "w": 12, "x": 0, - "y": 21 + "y": 7 }, - "id": 4198, + "height": null, + "hideTimeOverride": false, + "id": 175, + "interval": null, + "isNew": true, "legend": { "alignAsTable": true, "avg": false, "current": true, + "hideEmpty": true, + "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, + "sideWidth": null, + "sort": "max", + "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], - "nullPointMode": "null", + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", + "options": { + "alertThreshold": true, + "dataLinks": [] + }, "percentage": false, - "pointradius": 2, + "pointradius": 5, "points": false, "renderer": "flot", + "repeat": null, + "repeatDirection": null, "seriesOverrides": [], - "spaceLength": 10, + "span": null, "stack": false, "steppedLine": false, "targets": [ { - "expr": "sum(avg_over_time(tikv_unified_read_pool_running_tasks{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\"}[1m])) by (instance)", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(avg_over_time(\n tikv_unified_read_pool_running_tasks\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [1m]\n)) by (instance) ", "format": "time_series", + "hide": false, + "instant": false, "interval": "", "intervalFactor": 1, "legendFormat": "{{instance}}", - "refId": "A" + "metric": "", + "query": "sum(avg_over_time(\n tikv_unified_read_pool_running_tasks\n 
{k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [1m]\n)) by (instance) ", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, "title": "Running tasks", "tooltip": { + "msResolution": true, "shared": true, "sort": 0, "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -17740,6 +23632,7 @@ }, "yaxes": [ { + "decimals": null, "format": "none", "label": null, "logBase": 1, @@ -17748,6 +23641,7 @@ "show": true }, { + "decimals": null, "format": "short", "label": null, "logBase": 1, @@ -17758,10 +23652,11 @@ ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } }, { + "cacheTimeout": null, "cards": { "cardPadding": null, "cardRound": null @@ -17771,158 +23666,270 @@ "colorScale": "sqrt", "colorScheme": "interpolateSpectral", "exponent": 0.5, + "max": null, + "min": null, "mode": "spectrum" }, "dataFormat": "tsbuckets", "datasource": "${DS_TEST-CLUSTER}", + "description": null, + "editable": true, + "error": false, "fieldConfig": { - "defaults": {}, - "overrides": [] - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 26 + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } + }, + "gridPos": { + "h": 7, + "w": 12, + "x": 12, + "y": 7 }, "heatmap": {}, + "height": null, + "hideTimeOverride": false, "hideZeroBuckets": true, "highlightCards": true, - "id": 23763572469, + "id": 176, + "interval": null, "legend": { "show": false }, - "pluginVersion": "7.5.11", + "links": [], + "maxDataPoints": 512, + "maxPerRow": null, + "minSpan": null, + "options": {}, + "repeat": null, + "repeatDirection": null, "reverseYBuckets": false, + "span": null, "targets": [ { - "exemplar": true, - "expr": "sum(rate(tikv_yatp_pool_schedule_wait_duration_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", 
instance=~\"$instance\", name=~\"unified-read.*\"}[1m])) by (le)", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_yatp_pool_schedule_wait_duration_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",name=~\"unified-read.*\"}\n [$__rate_interval]\n)) by (le) ", "format": "heatmap", + "hide": false, + "instant": false, "interval": "", + "intervalFactor": 1, "legendFormat": "{{le}}", - "queryType": "randomWalk", - "refId": "A" + "metric": "", + "query": "sum(rate(\n tikv_yatp_pool_schedule_wait_duration_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",name=~\"unified-read.*\"}\n [$__rate_interval]\n)) by (le) ", + "refId": "", + "step": 10, + "target": "" } ], "timeFrom": null, "timeShift": null, "title": "Unified Read Pool Wait Duration", "tooltip": { - "show": true, - "showHistogram": false + "msResolution": true, + "shared": true, + "showHistogram": true, + "sort": 0, + "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "heatmap", "xAxis": { - "show": true + "mode": "time", + "name": null, + "show": true, + "values": [] }, "xBucketNumber": null, "xBucketSize": null, "yAxis": { "decimals": 1, "format": "s", + "label": null, "logBase": 1, "max": null, "min": null, - "show": true, - "splitFactor": null + "show": true }, - "yBucketBound": "auto", + "yBucketBound": "upper", "yBucketNumber": null, "yBucketSize": null }, { "aliasColors": {}, "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", "description": "Unified read pool task execution time during one schedule.", + "editable": true, + "error": false, + "fieldConfig": { + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } + }, "fill": 1, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 
112, 112, 0.22)" + }, "gridPos": { - "h": 8, + "h": 7, "w": 12, "x": 0, - "y": 34 + "y": 14 }, - "id": 4199, + "height": null, + "hideTimeOverride": false, + "id": 177, + "interval": null, + "isNew": true, "legend": { "alignAsTable": true, "avg": false, "current": true, + "hideEmpty": true, + "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, + "sideWidth": null, + "sort": "max", + "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], - "nullPointMode": "null", + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", + "options": { + "alertThreshold": true, + "dataLinks": [] + }, "percentage": false, - "pointradius": 2, + "pointradius": 5, "points": false, "renderer": "flot", - "seriesOverrides": [], - "spaceLength": 10, + "repeat": null, + "repeatDirection": null, + "seriesOverrides": [ + { + "alias": "count", + "bars": false, + "dashLength": 1, + "dashes": true, + "fill": 2, + "fillBelowTo": null, + "lines": true, + "spaceLength": 1, + "transform": "negative-Y", + "yaxis": 2, + "zindex": -3 + }, + { + "alias": "avg", + "bars": false, + "fill": 7, + "fillBelowTo": null, + "lines": true, + "yaxis": 1, + "zindex": 0 + } + ], + "span": null, "stack": false, "steppedLine": false, "targets": [ { - "exemplar": true, - "expr": "histogram_quantile(0.50, sum(rate(tikv_yatp_task_poll_duration_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", name=\"unified-read-pool\"}[1m])) by (le))", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "histogram_quantile(0.9999,(\n sum(rate(\n tikv_yatp_task_poll_duration_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (le) \n \n \n)) ", "format": "time_series", + "hide": false, + "instant": false, "interval": "", "intervalFactor": 1, - "legendFormat": "50%", - "refId": "A" + "legendFormat": "99.99%", + "metric": 
"", + "query": "histogram_quantile(0.9999,(\n sum(rate(\n tikv_yatp_task_poll_duration_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (le) \n \n \n)) ", + "refId": "", + "step": 10, + "target": "" }, { - "exemplar": true, - "expr": "histogram_quantile(0.95, sum(rate(tikv_yatp_task_poll_duration_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", name=\"unified-read-pool\"}[1m])) by (le))", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "histogram_quantile(0.99,(\n sum(rate(\n tikv_yatp_task_poll_duration_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (le) \n \n \n)) ", "format": "time_series", "hide": false, + "instant": false, "interval": "", "intervalFactor": 1, - "legendFormat": "95%", - "refId": "B" + "legendFormat": "99%", + "metric": "", + "query": "histogram_quantile(0.99,(\n sum(rate(\n tikv_yatp_task_poll_duration_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (le) \n \n \n)) ", + "refId": "", + "step": 10, + "target": "" }, { - "exemplar": true, - "expr": "histogram_quantile(0.99, sum(rate(tikv_yatp_task_poll_duration_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", name=\"unified-read-pool\"}[1m])) by (le))", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "(sum(rate(\n tikv_yatp_task_poll_duration_sum\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) / sum(rate(\n tikv_yatp_task_poll_duration_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) )", "format": "time_series", "hide": false, + "instant": false, "interval": "", "intervalFactor": 1, - "legendFormat": "99%", - "refId": "C" + "legendFormat": "avg", + 
"metric": "", + "query": "(sum(rate(\n tikv_yatp_task_poll_duration_sum\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) / sum(rate(\n tikv_yatp_task_poll_duration_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) )", + "refId": "", + "step": 10, + "target": "" }, { - "exemplar": true, - "expr": "histogram_quantile(0.999, sum(rate(tikv_yatp_task_poll_duration_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", name=\"unified-read-pool\"}[1m])) by (le))", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_yatp_task_poll_duration_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) ", "format": "time_series", - "hide": false, + "hide": true, + "instant": false, "interval": "", "intervalFactor": 1, - "legendFormat": "999%", - "refId": "D" + "legendFormat": "count", + "metric": "", + "query": "sum(rate(\n tikv_yatp_task_poll_duration_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) ", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, "title": "Duration of One Time Slice", "tooltip": { + "msResolution": true, "shared": true, "sort": 0, "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -17930,6 +23937,7 @@ }, "yaxes": [ { + "decimals": null, "format": "s", "label": null, "logBase": 2, @@ -17938,6 +23946,7 @@ "show": true }, { + "decimals": null, "format": "short", "label": null, "logBase": 1, @@ -17948,101 +23957,180 @@ ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } }, { "aliasColors": {}, "bars": false, - "dashLength": 10, - 
"dashes": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", "description": "Unified read pool task total execution duration.", + "editable": true, + "error": false, + "fieldConfig": { + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } + }, "fill": 1, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, "gridPos": { - "h": 8, + "h": 7, "w": 12, "x": 12, - "y": 34 + "y": 14 }, - "id": 4202, + "height": null, + "hideTimeOverride": false, + "id": 178, + "interval": null, + "isNew": true, "legend": { "alignAsTable": true, "avg": false, "current": true, + "hideEmpty": true, + "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, + "sideWidth": null, + "sort": "max", + "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], - "nullPointMode": "null", + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", + "options": { + "alertThreshold": true, + "dataLinks": [] + }, "percentage": false, - "pointradius": 2, + "pointradius": 5, "points": false, "renderer": "flot", - "seriesOverrides": [], - "spaceLength": 10, + "repeat": null, + "repeatDirection": null, + "seriesOverrides": [ + { + "alias": "count", + "bars": false, + "dashLength": 1, + "dashes": true, + "fill": 2, + "fillBelowTo": null, + "lines": true, + "spaceLength": 1, + "transform": "negative-Y", + "yaxis": 2, + "zindex": -3 + }, + { + "alias": "avg", + "bars": false, + "fill": 7, + "fillBelowTo": null, + "lines": true, + "yaxis": 1, + "zindex": 0 + } + ], + "span": null, "stack": false, "steppedLine": false, "targets": [ { - "exemplar": true, - "expr": "histogram_quantile(0.50, sum(rate(tikv_yatp_task_exec_duration_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", name=\"unified-read-pool\"}[1m])) 
by (le))", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "histogram_quantile(0.9999,(\n sum(rate(\n tikv_yatp_task_exec_duration_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (le) \n \n \n)) ", "format": "time_series", + "hide": false, + "instant": false, "interval": "", "intervalFactor": 1, - "legendFormat": "50%", - "refId": "A" + "legendFormat": "99.99%", + "metric": "", + "query": "histogram_quantile(0.9999,(\n sum(rate(\n tikv_yatp_task_exec_duration_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (le) \n \n \n)) ", + "refId": "", + "step": 10, + "target": "" }, { - "exemplar": true, - "expr": "histogram_quantile(0.95, sum(rate(tikv_yatp_task_exec_duration_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", name=\"unified-read-pool\"}[1m])) by (le))", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "histogram_quantile(0.99,(\n sum(rate(\n tikv_yatp_task_exec_duration_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (le) \n \n \n)) ", "format": "time_series", "hide": false, + "instant": false, "interval": "", "intervalFactor": 1, - "legendFormat": "95%", - "refId": "B" + "legendFormat": "99%", + "metric": "", + "query": "histogram_quantile(0.99,(\n sum(rate(\n tikv_yatp_task_exec_duration_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (le) \n \n \n)) ", + "refId": "", + "step": 10, + "target": "" }, { - "exemplar": true, - "expr": "histogram_quantile(0.99, sum(rate(tikv_yatp_task_exec_duration_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", name=\"unified-read-pool\"}[1m])) by (le))", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "(sum(rate(\n 
tikv_yatp_task_exec_duration_sum\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) / sum(rate(\n tikv_yatp_task_exec_duration_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) )", "format": "time_series", "hide": false, + "instant": false, "interval": "", "intervalFactor": 1, - "legendFormat": "99%", - "refId": "C" + "legendFormat": "avg", + "metric": "", + "query": "(sum(rate(\n tikv_yatp_task_exec_duration_sum\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) / sum(rate(\n tikv_yatp_task_exec_duration_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) )", + "refId": "", + "step": 10, + "target": "" }, { - "exemplar": true, - "expr": "histogram_quantile(0.999, sum(rate(tikv_yatp_task_exec_duration_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", name=\"unified-read-pool\"}[1m])) by (le))", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_yatp_task_exec_duration_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) ", "format": "time_series", - "hide": false, + "hide": true, + "instant": false, "interval": "", "intervalFactor": 1, - "legendFormat": "999%", - "refId": "D" + "legendFormat": "count", + "metric": "", + "query": "sum(rate(\n tikv_yatp_task_exec_duration_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) ", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, "title": "Task Execute Duration", "tooltip": { + "msResolution": true, "shared": true, "sort": 0, "value_type": "individual" }, + "transformations": [], + "transparent": false, 
"type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -18050,6 +24138,7 @@ }, "yaxes": [ { + "decimals": null, "format": "s", "label": null, "logBase": 2, @@ -18058,6 +24147,7 @@ "show": true }, { + "decimals": null, "format": "short", "label": null, "logBase": 1, @@ -18068,101 +24158,180 @@ ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } }, { "aliasColors": {}, "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", "description": "Task schedule number of times.", + "editable": true, + "error": false, + "fieldConfig": { + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } + }, "fill": 1, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, "gridPos": { - "h": 8, - "w": 12, + "h": 7, + "w": 24, "x": 0, - "y": 42 + "y": 21 }, - "id": 4204, + "height": null, + "hideTimeOverride": false, + "id": 179, + "interval": null, + "isNew": true, "legend": { "alignAsTable": true, "avg": false, "current": true, + "hideEmpty": true, + "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, + "sideWidth": null, + "sort": "max", + "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], - "nullPointMode": "null", + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", + "options": { + "alertThreshold": true, + "dataLinks": [] + }, "percentage": false, - "pointradius": 2, + "pointradius": 5, "points": false, "renderer": "flot", - "seriesOverrides": [], - "spaceLength": 10, + "repeat": null, + "repeatDirection": null, + "seriesOverrides": [ + { + "alias": "count", + "bars": false, + "dashLength": 1, + "dashes": true, + "fill": 2, + "fillBelowTo": null, + "lines": true, + "spaceLength": 1, + "transform": 
"negative-Y", + "yaxis": 2, + "zindex": -3 + }, + { + "alias": "avg", + "bars": false, + "fill": 7, + "fillBelowTo": null, + "lines": true, + "yaxis": 1, + "zindex": 0 + } + ], + "span": null, "stack": false, "steppedLine": false, "targets": [ { - "exemplar": true, - "expr": "histogram_quantile(0.50, sum(rate(tikv_yatp_task_execute_times_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", name=\"unified-read-pool\"}[1m])) by (le))", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "histogram_quantile(0.9999,(\n sum(rate(\n tikv_yatp_task_execute_times_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (le) \n \n \n)) ", "format": "time_series", + "hide": false, + "instant": false, "interval": "", "intervalFactor": 1, - "legendFormat": "50%", - "refId": "A" + "legendFormat": "99.99%", + "metric": "", + "query": "histogram_quantile(0.9999,(\n sum(rate(\n tikv_yatp_task_execute_times_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (le) \n \n \n)) ", + "refId": "", + "step": 10, + "target": "" }, { - "exemplar": true, - "expr": "histogram_quantile(0.95, sum(rate(tikv_yatp_task_execute_times_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", name=\"unified-read-pool\"}[1m])) by (le))", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "histogram_quantile(0.99,(\n sum(rate(\n tikv_yatp_task_execute_times_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (le) \n \n \n)) ", "format": "time_series", "hide": false, + "instant": false, "interval": "", "intervalFactor": 1, - "legendFormat": "95%", - "refId": "B" + "legendFormat": "99%", + "metric": "", + "query": "histogram_quantile(0.99,(\n sum(rate(\n tikv_yatp_task_execute_times_bucket\n 
{k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (le) \n \n \n)) ", + "refId": "", + "step": 10, + "target": "" }, { - "exemplar": true, - "expr": "histogram_quantile(0.99, sum(rate(tikv_yatp_task_execute_times_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", name=\"unified-read-pool\"}[1m])) by (le))", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "(sum(rate(\n tikv_yatp_task_execute_times_sum\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) / sum(rate(\n tikv_yatp_task_execute_times_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) )", "format": "time_series", "hide": false, + "instant": false, "interval": "", "intervalFactor": 1, - "legendFormat": "99%", - "refId": "C" + "legendFormat": "avg", + "metric": "", + "query": "(sum(rate(\n tikv_yatp_task_execute_times_sum\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) / sum(rate(\n tikv_yatp_task_execute_times_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) )", + "refId": "", + "step": 10, + "target": "" }, { - "exemplar": true, - "expr": "histogram_quantile(0.999, sum(rate(tikv_yatp_task_execute_times_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", name=\"unified-read-pool\"}[1m])) by (le))", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_yatp_task_execute_times_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) ", "format": "time_series", - "hide": false, + "hide": true, + "instant": false, "interval": "", "intervalFactor": 1, - "legendFormat": "999%", - "refId": "D" + "legendFormat": "count", + "metric": "", + 
"query": "sum(rate(\n tikv_yatp_task_execute_times_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) ", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, "title": "Task Schedule Times", "tooltip": { + "msResolution": true, "shared": true, "sort": 0, "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -18170,6 +24339,7 @@ }, "yaxes": [ { + "decimals": null, "format": "none", "label": null, "logBase": 2, @@ -18178,6 +24348,7 @@ "show": true }, { + "decimals": null, "format": "short", "label": null, "logBase": 1, @@ -18188,43 +24359,86 @@ ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } } ], + "repeat": null, + "repeatDirection": null, + "span": null, + "targets": [], + "timeFrom": null, + "timeShift": null, "title": "Unified Read Pool", + "transformations": [], + "transparent": false, "type": "row" }, { + "cacheTimeout": null, "collapsed": true, "datasource": null, + "description": null, + "editable": true, + "error": false, + "fieldConfig": { + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } + }, "gridPos": { - "h": 1, + "h": 7, "w": 24, "x": 0, - "y": 18 + "y": 0 }, - "id": 2754, + "height": null, + "hideTimeOverride": false, + "id": 180, + "interval": null, + "links": [], + "maxDataPoints": 100, + "maxPerRow": null, + "minSpan": null, "panels": [ { "aliasColors": {}, "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", - "decimals": 1, "description": "The total count of different kinds of commands received", "editable": true, "error": false, + "fieldConfig": { + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } + }, "fill": 1, - "grid": {}, + "fillGradient": 1, + 
"grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, "gridPos": { - "h": 8, + "h": 7, "w": 12, "x": 0, - "y": 14 + "y": 0 }, - "id": 2, + "height": null, + "hideTimeOverride": false, + "id": 181, + "interval": null, + "isNew": true, "legend": { "alignAsTable": true, "avg": false, @@ -18236,7 +24450,7 @@ "rightSide": true, "show": true, "sideWidth": null, - "sort": "current", + "sort": "max", "sortDesc": true, "total": false, "values": true @@ -18244,39 +24458,55 @@ "lines": true, "linewidth": 1, "links": [], + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, "nullPointMode": "null as zero", + "options": { + "alertThreshold": true, + "dataLinks": [] + }, "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", + "repeat": null, + "repeatDirection": null, "seriesOverrides": [], - "spaceLength": 10, + "span": null, "stack": false, "steppedLine": false, "targets": [ { - "expr": "sum(rate(tikv_storage_command_total{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])) by (type)", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_storage_command_total\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (type) ", "format": "time_series", - "intervalFactor": 2, + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, "legendFormat": "{{type}}", - "refId": "A", - "step": 4 + "metric": "", + "query": "sum(rate(\n tikv_storage_command_total\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (type) ", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, "title": "Storage command total", "tooltip": { - "msResolution": false, + "msResolution": true, "shared": true, "sort": 
0, - "value_type": "cumulative" + "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -18284,14 +24514,16 @@ }, "yaxes": [ { + "decimals": null, "format": "ops", "label": null, "logBase": 10, "max": null, - "min": "0", + "min": null, "show": true }, { + "decimals": null, "format": "short", "label": null, "logBase": 1, @@ -18302,28 +24534,44 @@ ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } }, { "aliasColors": {}, "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", - "decimals": 1, "description": "The total number of engine asynchronous request errors", "editable": true, "error": false, + "fieldConfig": { + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } + }, "fill": 1, - "grid": {}, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, "gridPos": { - "h": 8, + "h": 7, "w": 12, "x": 12, - "y": 14 + "y": 0 }, - "id": 8, + "height": null, + "hideTimeOverride": false, + "id": 182, + "interval": null, + "isNew": true, "legend": { "alignAsTable": true, "avg": false, @@ -18335,7 +24583,7 @@ "rightSide": true, "show": true, "sideWidth": null, - "sort": "current", + "sort": "max", "sortDesc": true, "total": false, "values": true @@ -18343,40 +24591,55 @@ "lines": true, "linewidth": 1, "links": [], + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, "nullPointMode": "null as zero", + "options": { + "alertThreshold": true, + "dataLinks": [] + }, "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", + "repeat": null, + "repeatDirection": null, "seriesOverrides": [], - "spaceLength": 10, + "span": null, "stack": false, "steppedLine": false, "targets": [ { - "expr": 
"sum(rate(tikv_storage_engine_async_request_total{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", status!~\"all|success\"}[1m])) by (status)", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_storage_engine_async_request_total\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",status!~\"all|success\"}\n [$__rate_interval]\n)) by (status) ", "format": "time_series", - "intervalFactor": 2, + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, "legendFormat": "{{status}}", - "metric": "tikv_raftstore_raft_process_duration_secs_bucket", - "refId": "A", - "step": 4 + "metric": "", + "query": "sum(rate(\n tikv_storage_engine_async_request_total\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",status!~\"all|success\"}\n [$__rate_interval]\n)) by (status) ", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, "title": "Storage async request error", "tooltip": { "msResolution": true, "shared": true, "sort": 0, - "value_type": "cumulative" + "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -18384,7 +24647,8 @@ }, "yaxes": [ { - "format": "short", + "decimals": null, + "format": "none", "label": null, "logBase": 1, "max": null, @@ -18392,6 +24656,7 @@ "show": true }, { + "decimals": null, "format": "short", "label": null, "logBase": 1, @@ -18402,10 +24667,11 @@ ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } }, { + "cacheTimeout": null, "cards": { "cardPadding": null, "cardRound": null @@ -18415,215 +24681,270 @@ "colorScale": "sqrt", "colorScheme": "interpolateSpectral", "exponent": 0.5, - "mode": "spectrum" - }, - "dataFormat": "tsbuckets", - "datasource": "${DS_TEST-CLUSTER}", - "description": "The time 
consumed by processing asynchronous snapshot requests", - "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 35 - }, - "heatmap": {}, - "hideZeroBuckets": true, - "highlightCards": true, - "id": 15, - "legend": { - "show": false - }, - "links": [], - "reverseYBuckets": false, - "targets": [ - { - "expr": "sum(delta(tikv_storage_engine_async_request_duration_seconds_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", type=\"snapshot\"}[1m])) by (le)", - "format": "heatmap", - "intervalFactor": 2, - "legendFormat": "{{le}}", - "refId": "A", - "step": 4 - } - ], - "timeFrom": null, - "timeShift": null, - "title": "Storage async snapshot duration", - "tooltip": { - "show": true, - "showHistogram": false - }, - "type": "heatmap", - "xAxis": { - "show": true - }, - "xBucketNumber": null, - "xBucketSize": null, - "yAxis": { - "decimals": 0, - "format": "s", - "logBase": 1, "max": null, "min": null, - "show": true, - "splitFactor": null - }, - "yBucketBound": "upper", - "yBucketNumber": null, - "yBucketSize": null - }, - { - "cards": { - "cardPadding": null, - "cardRound": null - }, - "color": { - "cardColor": "#b4ff00", - "colorScale": "sqrt", - "colorScheme": "interpolateSpectral", - "exponent": 0.5, "mode": "spectrum" }, "dataFormat": "tsbuckets", "datasource": "${DS_TEST-CLUSTER}", "description": "The time consumed by processing asynchronous write requests", + "editable": true, + "error": false, + "fieldConfig": { + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } + }, "gridPos": { - "h": 8, + "h": 7, "w": 12, "x": 0, - "y": 22 + "y": 7 }, "heatmap": {}, + "height": null, + "hideTimeOverride": false, "hideZeroBuckets": true, "highlightCards": true, - "id": 109, + "id": 183, + "interval": null, "legend": { "show": false }, "links": [], + "maxDataPoints": 512, + "maxPerRow": null, + "minSpan": null, + "options": {}, + "repeat": null, + "repeatDirection": null, "reverseYBuckets": false, + "span": null, 
"targets": [ { - "expr": "sum(delta(tikv_storage_engine_async_request_duration_seconds_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", type=\"write\"}[1m])) by (le)", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_storage_engine_async_request_duration_seconds_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",type=\"write\"}\n [$__rate_interval]\n)) by (le) ", "format": "heatmap", - "intervalFactor": 2, + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, "legendFormat": "{{le}}", - "refId": "A", - "step": 4 + "metric": "", + "query": "sum(rate(\n tikv_storage_engine_async_request_duration_seconds_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",type=\"write\"}\n [$__rate_interval]\n)) by (le) ", + "refId": "", + "step": 10, + "target": "" } ], "timeFrom": null, "timeShift": null, "title": "Storage async write duration", "tooltip": { - "show": true, - "showHistogram": false + "msResolution": true, + "shared": true, + "showHistogram": true, + "sort": 0, + "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "heatmap", "xAxis": { - "show": true + "mode": "time", + "name": null, + "show": true, + "values": [] }, "xBucketNumber": null, "xBucketSize": null, "yAxis": { - "decimals": 0, + "decimals": 1, "format": "s", + "label": null, "logBase": 1, "max": null, "min": null, - "show": true, - "splitFactor": null + "show": true }, - "yBucketBound": "auto", + "yBucketBound": "upper", "yBucketNumber": null, "yBucketSize": null }, { "aliasColors": {}, "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", - "description": "The storage async snapshot duration", + "description": "The storage async write duration", + "editable": true, + "error": false, "fieldConfig": { - "defaults": {}, - "overrides": [] + 
"defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } }, "fill": 1, - "fillGradient": 0, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, "gridPos": { - "h": 8, + "h": 7, "w": 12, "x": 12, - "y": 35 + "y": 7 }, - "hiddenSeries": false, - "id": 20000, + "height": null, + "hideTimeOverride": false, + "id": 184, + "interval": null, + "isNew": true, "legend": { + "alignAsTable": true, "avg": false, - "current": false, - "max": false, + "current": true, + "hideEmpty": true, + "hideZero": true, + "max": true, "min": false, + "rightSide": true, "show": true, + "sideWidth": null, + "sort": "max", + "sortDesc": true, "total": false, - "values": false + "values": true }, "lines": true, "linewidth": 1, - "nullPointMode": "null", + "links": [], + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", "options": { - "alertThreshold": true + "alertThreshold": true, + "dataLinks": [] }, "percentage": false, - "pluginVersion": "7.5.10", - "pointradius": 2, + "pointradius": 5, "points": false, "renderer": "flot", - "seriesOverrides": [], - "spaceLength": 10, + "repeat": null, + "repeatDirection": null, + "seriesOverrides": [ + { + "alias": "count", + "bars": false, + "dashLength": 1, + "dashes": true, + "fill": 2, + "fillBelowTo": null, + "lines": true, + "spaceLength": 1, + "transform": "negative-Y", + "yaxis": 2, + "zindex": -3 + }, + { + "alias": "avg", + "bars": false, + "fill": 7, + "fillBelowTo": null, + "lines": true, + "yaxis": 1, + "zindex": 0 + } + ], + "span": null, "stack": false, "steppedLine": false, "targets": [ { - "exemplar": true, - "expr": "histogram_quantile(0.99, sum(rate(tikv_storage_engine_async_request_duration_seconds_bucket{tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", type=\"snapshot\"}[1m])) by (le))", + "datasource": "${DS_TEST-CLUSTER}", + "expr": 
"histogram_quantile(0.9999,(\n sum(rate(\n tikv_storage_engine_async_request_duration_seconds_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",type=\"write\"}\n [$__rate_interval]\n)) by (le) \n \n \n)) ", + "format": "time_series", + "hide": false, + "instant": false, "interval": "", - "intervalFactor": 2, - "legendFormat": "99%", - "queryType": "randomWalk", - "refId": "A" + "intervalFactor": 1, + "legendFormat": "99.99%", + "metric": "", + "query": "histogram_quantile(0.9999,(\n sum(rate(\n tikv_storage_engine_async_request_duration_seconds_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",type=\"write\"}\n [$__rate_interval]\n)) by (le) \n \n \n)) ", + "refId": "", + "step": 10, + "target": "" }, { - "exemplar": true, - "expr": "histogram_quantile(0.95, sum(rate(tikv_storage_engine_async_request_duration_seconds_bucket{tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", type=\"snapshot\"}[1m])) by (le))", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "histogram_quantile(0.99,(\n sum(rate(\n tikv_storage_engine_async_request_duration_seconds_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",type=\"write\"}\n [$__rate_interval]\n)) by (le) \n \n \n)) ", + "format": "time_series", "hide": false, + "instant": false, "interval": "", - "intervalFactor": 2, - "legendFormat": "95%", - "refId": "B" + "intervalFactor": 1, + "legendFormat": "99%", + "metric": "", + "query": "histogram_quantile(0.99,(\n sum(rate(\n tikv_storage_engine_async_request_duration_seconds_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",type=\"write\"}\n [$__rate_interval]\n)) by (le) \n \n \n)) ", + "refId": "", + "step": 10, + "target": "" }, { - "exemplar": true, - "expr": "sum(rate(tikv_storage_engine_async_request_duration_seconds_sum{tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", 
type=\"snapshot\"}[1m])) / sum(rate(tikv_storage_engine_async_request_duration_seconds_count{tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", type=\"snapshot\"}[1m]))", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "(sum(rate(\n tikv_storage_engine_async_request_duration_seconds_sum\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",type=\"write\"}\n [$__rate_interval]\n)) / sum(rate(\n tikv_storage_engine_async_request_duration_seconds_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",type=\"write\"}\n [$__rate_interval]\n)) )", + "format": "time_series", "hide": false, + "instant": false, "interval": "", - "intervalFactor": 2, + "intervalFactor": 1, "legendFormat": "avg", - "refId": "C" + "metric": "", + "query": "(sum(rate(\n tikv_storage_engine_async_request_duration_seconds_sum\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",type=\"write\"}\n [$__rate_interval]\n)) / sum(rate(\n tikv_storage_engine_async_request_duration_seconds_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",type=\"write\"}\n [$__rate_interval]\n)) )", + "refId": "", + "step": 10, + "target": "" + }, + { + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_storage_engine_async_request_duration_seconds_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",type=\"write\"}\n [$__rate_interval]\n)) ", + "format": "time_series", + "hide": true, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "count", + "metric": "", + "query": "sum(rate(\n tikv_storage_engine_async_request_duration_seconds_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",type=\"write\"}\n [$__rate_interval]\n)) ", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, 
- "title": "Storage async snapshot duration", + "title": "Storage async write duration", "tooltip": { + "msResolution": true, "shared": true, "sort": 0, "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -18631,6 +24952,7 @@ }, "yaxes": [ { + "decimals": null, "format": "s", "label": null, "logBase": 1, @@ -18639,6 +24961,7 @@ "show": true }, { + "decimals": null, "format": "short", "label": null, "logBase": 1, @@ -18649,10 +24972,11 @@ ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } }, { + "cacheTimeout": null, "cards": { "cardPadding": null, "cardRound": null @@ -18662,129 +24986,302 @@ "colorScale": "sqrt", "colorScheme": "interpolateSpectral", "exponent": 0.5, + "max": null, + "min": null, "mode": "spectrum" }, "dataFormat": "tsbuckets", "datasource": "${DS_TEST-CLUSTER}", - "description": "The storage async snapshot duration without the involving of raftstore", + "description": "The time consumed by processing asynchronous snapshot requests", + "editable": true, + "error": false, + "fieldConfig": { + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } + }, "gridPos": { - "h": 8, + "h": 7, "w": 12, "x": 0, - "y": 48 + "y": 14 }, "heatmap": {}, + "height": null, + "hideTimeOverride": false, "hideZeroBuckets": true, "highlightCards": true, - "id": 31111, + "id": 185, + "interval": null, "legend": { "show": false }, "links": [], + "maxDataPoints": 512, + "maxPerRow": null, + "minSpan": null, + "options": {}, + "repeat": null, + "repeatDirection": null, "reverseYBuckets": false, + "span": null, "targets": [ { - "expr": "sum(delta(tikv_storage_engine_async_request_duration_seconds_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", type=\"snapshot_local_read\"}[1m])) by (le)", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n 
tikv_storage_engine_async_request_duration_seconds_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",type=\"snapshot\"}\n [$__rate_interval]\n)) by (le) ", "format": "heatmap", - "intervalFactor": 2, + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, "legendFormat": "{{le}}", - "refId": "A", - "step": 4 + "metric": "", + "query": "sum(rate(\n tikv_storage_engine_async_request_duration_seconds_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",type=\"snapshot\"}\n [$__rate_interval]\n)) by (le) ", + "refId": "", + "step": 10, + "target": "" } ], "timeFrom": null, "timeShift": null, - "title": "Storage async snapshot duration (pure local read)", + "title": "Storage async snapshot duration", "tooltip": { - "show": true, - "showHistogram": false + "msResolution": true, + "shared": true, + "showHistogram": true, + "sort": 0, + "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "heatmap", "xAxis": { - "show": true + "mode": "time", + "name": null, + "show": true, + "values": [] }, "xBucketNumber": null, "xBucketSize": null, "yAxis": { - "decimals": 0, + "decimals": 1, "format": "s", + "label": null, "logBase": 1, "max": null, "min": null, - "show": true, - "splitFactor": null + "show": true }, - "yBucketBound": "auto", + "yBucketBound": "upper", "yBucketNumber": null, "yBucketSize": null }, { - "cards": { - "cardPadding": null, - "cardRound": null + "aliasColors": {}, + "bars": false, + "cacheTimeout": null, + "datasource": "${DS_TEST-CLUSTER}", + "description": "The storage async snapshot duration", + "editable": true, + "error": false, + "fieldConfig": { + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } }, - "color": { - "cardColor": "#b4ff00", - "colorScale": "sqrt", - "colorScheme": "interpolateSpectral", - "exponent": 0.5, - "mode": "spectrum" + "fill": 1, + "fillGradient": 1, + "grid": { + 
"threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" }, - "dataFormat": "tsbuckets", - "datasource": "${DS_TEST-CLUSTER}", - "description": "Read index propose wait duration associated with async snapshot", "gridPos": { - "h": 8, + "h": 7, "w": 12, - "x": 0, - "y": 60 + "x": 12, + "y": 14 }, - "heatmap": {}, - "hideZeroBuckets": true, - "highlightCards": true, - "id": 31112, + "height": null, + "hideTimeOverride": false, + "id": 186, + "interval": null, + "isNew": true, "legend": { - "show": false + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": true, + "hideZero": true, + "max": true, + "min": false, + "rightSide": true, + "show": true, + "sideWidth": null, + "sort": "max", + "sortDesc": true, + "total": false, + "values": true }, + "lines": true, + "linewidth": 1, "links": [], - "reverseYBuckets": false, + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", + "options": { + "alertThreshold": true, + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "repeatDirection": null, + "seriesOverrides": [ + { + "alias": "count", + "bars": false, + "dashLength": 1, + "dashes": true, + "fill": 2, + "fillBelowTo": null, + "lines": true, + "spaceLength": 1, + "transform": "negative-Y", + "yaxis": 2, + "zindex": -3 + }, + { + "alias": "avg", + "bars": false, + "fill": 7, + "fillBelowTo": null, + "lines": true, + "yaxis": 1, + "zindex": 0 + } + ], + "span": null, + "stack": false, + "steppedLine": false, "targets": [ { - "expr": "sum(delta(tikv_storage_engine_async_request_duration_seconds_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", type=\"snapshot_read_index_propose_wait\"}[1m])) by (le)", - "format": "heatmap", - "intervalFactor": 2, - "legendFormat": "{{le}}", - "refId": "A", - "step": 4 + 
"datasource": "${DS_TEST-CLUSTER}", + "expr": "histogram_quantile(0.9999,(\n sum(rate(\n tikv_storage_engine_async_request_duration_seconds_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",type=\"snapshot\"}\n [$__rate_interval]\n)) by (le) \n \n \n)) ", + "format": "time_series", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "99.99%", + "metric": "", + "query": "histogram_quantile(0.9999,(\n sum(rate(\n tikv_storage_engine_async_request_duration_seconds_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",type=\"snapshot\"}\n [$__rate_interval]\n)) by (le) \n \n \n)) ", + "refId": "", + "step": 10, + "target": "" + }, + { + "datasource": "${DS_TEST-CLUSTER}", + "expr": "histogram_quantile(0.99,(\n sum(rate(\n tikv_storage_engine_async_request_duration_seconds_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",type=\"snapshot\"}\n [$__rate_interval]\n)) by (le) \n \n \n)) ", + "format": "time_series", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "99%", + "metric": "", + "query": "histogram_quantile(0.99,(\n sum(rate(\n tikv_storage_engine_async_request_duration_seconds_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",type=\"snapshot\"}\n [$__rate_interval]\n)) by (le) \n \n \n)) ", + "refId": "", + "step": 10, + "target": "" + }, + { + "datasource": "${DS_TEST-CLUSTER}", + "expr": "(sum(rate(\n tikv_storage_engine_async_request_duration_seconds_sum\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",type=\"snapshot\"}\n [$__rate_interval]\n)) / sum(rate(\n tikv_storage_engine_async_request_duration_seconds_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",type=\"snapshot\"}\n [$__rate_interval]\n)) )", + "format": 
"time_series", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "avg", + "metric": "", + "query": "(sum(rate(\n tikv_storage_engine_async_request_duration_seconds_sum\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",type=\"snapshot\"}\n [$__rate_interval]\n)) / sum(rate(\n tikv_storage_engine_async_request_duration_seconds_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",type=\"snapshot\"}\n [$__rate_interval]\n)) )", + "refId": "", + "step": 10, + "target": "" + }, + { + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_storage_engine_async_request_duration_seconds_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",type=\"snapshot\"}\n [$__rate_interval]\n)) ", + "format": "time_series", + "hide": true, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "count", + "metric": "", + "query": "sum(rate(\n tikv_storage_engine_async_request_duration_seconds_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",type=\"snapshot\"}\n [$__rate_interval]\n)) ", + "refId": "", + "step": 10, + "target": "" } ], + "thresholds": [], "timeFrom": null, "timeShift": null, - "title": "Read index propose wait duration", + "title": "Storage async snapshot duration", "tooltip": { - "show": true, - "showHistogram": false - }, - "type": "heatmap", - "xAxis": { - "show": true + "msResolution": true, + "shared": true, + "sort": 0, + "value_type": "individual" }, - "xBucketNumber": null, - "xBucketSize": null, - "yAxis": { - "decimals": 0, - "format": "s", - "logBase": 1, - "max": null, - "min": null, + "transformations": [], + "transparent": false, + "type": "graph", + "xaxis": { + "mode": "time", + "name": null, "show": true, - "splitFactor": null + "values": [] }, - "yBucketBound": "auto", - "yBucketNumber": null, - "yBucketSize": null + 
"yaxes": [ + { + "decimals": null, + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "decimals": null, + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": 0 + } }, { + "cacheTimeout": null, "cards": { "cardPadding": null, "cardRound": null @@ -18794,148 +25291,270 @@ "colorScale": "sqrt", "colorScheme": "interpolateSpectral", "exponent": 0.5, + "max": null, + "min": null, "mode": "spectrum" }, "dataFormat": "tsbuckets", "datasource": "${DS_TEST-CLUSTER}", - "description": "Read index confirm duration associated with async snapshot", + "description": "The storage async snapshot duration without the involving of raftstore", + "editable": true, + "error": false, + "fieldConfig": { + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } + }, "gridPos": { - "h": 8, + "h": 7, "w": 12, "x": 0, - "y": 72 + "y": 21 }, "heatmap": {}, + "height": null, + "hideTimeOverride": false, "hideZeroBuckets": true, "highlightCards": true, - "id": 31113, + "id": 187, + "interval": null, "legend": { "show": false }, "links": [], + "maxDataPoints": 512, + "maxPerRow": null, + "minSpan": null, + "options": {}, + "repeat": null, + "repeatDirection": null, "reverseYBuckets": false, + "span": null, "targets": [ { - "expr": "sum(delta(tikv_storage_engine_async_request_duration_seconds_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", type=\"snapshot_read_index_confirm\"}[1m])) by (le)", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_storage_engine_async_request_duration_seconds_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",type=\"snapshot_local_read\"}\n [$__rate_interval]\n)) by (le) ", "format": "heatmap", - "intervalFactor": 2, + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, 
"legendFormat": "{{le}}", - "refId": "A", - "step": 4 + "metric": "", + "query": "sum(rate(\n tikv_storage_engine_async_request_duration_seconds_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",type=\"snapshot_local_read\"}\n [$__rate_interval]\n)) by (le) ", + "refId": "", + "step": 10, + "target": "" } ], "timeFrom": null, "timeShift": null, - "title": "Read index confirm duration", + "title": "Storage async snapshot duration (pure local read)", "tooltip": { - "show": true, - "showHistogram": false + "msResolution": true, + "shared": true, + "showHistogram": true, + "sort": 0, + "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "heatmap", "xAxis": { - "show": true + "mode": "time", + "name": null, + "show": true, + "values": [] }, "xBucketNumber": null, "xBucketSize": null, "yAxis": { - "decimals": 0, + "decimals": 1, "format": "s", + "label": null, "logBase": 1, "max": null, "min": null, - "show": true, - "splitFactor": null + "show": true }, - "yBucketBound": "auto", + "yBucketBound": "upper", "yBucketNumber": null, "yBucketSize": null }, { "aliasColors": {}, "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", - "description": "The storage async write duration", + "description": "The storage async snapshot duration without the involving of raftstore", + "editable": true, + "error": false, "fieldConfig": { - "defaults": {}, - "overrides": [] + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } + }, + "fill": 1, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" }, - "fill": 2, - "fillGradient": 0, "gridPos": { - "h": 8, + "h": 7, "w": 12, "x": 12, - "y": 22 + "y": 21 }, - "hiddenSeries": false, - "id": 20001, + "height": null, + "hideTimeOverride": false, + "id": 188, + 
"interval": null, + "isNew": true, "legend": { + "alignAsTable": true, "avg": false, - "current": false, - "max": false, + "current": true, + "hideEmpty": true, + "hideZero": true, + "max": true, "min": false, + "rightSide": true, "show": true, + "sideWidth": null, + "sort": "max", + "sortDesc": true, "total": false, - "values": false + "values": true }, "lines": true, "linewidth": 1, - "nullPointMode": "null", + "links": [], + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", "options": { - "alertThreshold": true + "alertThreshold": true, + "dataLinks": [] }, "percentage": false, - "pluginVersion": "7.5.10", - "pointradius": 2, + "pointradius": 5, "points": false, "renderer": "flot", - "seriesOverrides": [], - "spaceLength": 10, + "repeat": null, + "repeatDirection": null, + "seriesOverrides": [ + { + "alias": "count", + "bars": false, + "dashLength": 1, + "dashes": true, + "fill": 2, + "fillBelowTo": null, + "lines": true, + "spaceLength": 1, + "transform": "negative-Y", + "yaxis": 2, + "zindex": -3 + }, + { + "alias": "avg", + "bars": false, + "fill": 7, + "fillBelowTo": null, + "lines": true, + "yaxis": 1, + "zindex": 0 + } + ], + "span": null, "stack": false, "steppedLine": false, "targets": [ { - "exemplar": true, - "expr": "histogram_quantile(0.99, sum(rate(tikv_storage_engine_async_request_duration_seconds_bucket{tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", type=\"write\"}[1m])) by (le))", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "histogram_quantile(0.9999,(\n sum(rate(\n tikv_storage_engine_async_request_duration_seconds_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",type=\"snapshot_local_read\"}\n [$__rate_interval]\n)) by (le) \n \n \n)) ", + "format": "time_series", + "hide": false, + "instant": false, "interval": "", "intervalFactor": 1, - "legendFormat": "99%", - "refId": "A" + "legendFormat": "99.99%", + "metric": "", + "query": 
"histogram_quantile(0.9999,(\n sum(rate(\n tikv_storage_engine_async_request_duration_seconds_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",type=\"snapshot_local_read\"}\n [$__rate_interval]\n)) by (le) \n \n \n)) ", + "refId": "", + "step": 10, + "target": "" }, { - "exemplar": true, - "expr": "histogram_quantile(0.95, sum(rate(tikv_storage_engine_async_request_duration_seconds_bucket{tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", type=\"write\"}[1m])) by (le))", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "histogram_quantile(0.99,(\n sum(rate(\n tikv_storage_engine_async_request_duration_seconds_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",type=\"snapshot_local_read\"}\n [$__rate_interval]\n)) by (le) \n \n \n)) ", + "format": "time_series", "hide": false, + "instant": false, "interval": "", "intervalFactor": 1, - "legendFormat": "95%", - "refId": "B" + "legendFormat": "99%", + "metric": "", + "query": "histogram_quantile(0.99,(\n sum(rate(\n tikv_storage_engine_async_request_duration_seconds_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",type=\"snapshot_local_read\"}\n [$__rate_interval]\n)) by (le) \n \n \n)) ", + "refId": "", + "step": 10, + "target": "" }, { - "exemplar": true, - "expr": "sum(rate(tikv_storage_engine_async_request_duration_seconds_sum{tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", type=\"write\"}[1m])) / sum(rate(tikv_storage_engine_async_request_duration_seconds_count{tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", type=\"write\"}[1m]))", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "(sum(rate(\n tikv_storage_engine_async_request_duration_seconds_sum\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",type=\"snapshot_local_read\"}\n [$__rate_interval]\n)) / sum(rate(\n tikv_storage_engine_async_request_duration_seconds_count\n 
{k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",type=\"snapshot_local_read\"}\n [$__rate_interval]\n)) )", + "format": "time_series", "hide": false, + "instant": false, "interval": "", "intervalFactor": 1, "legendFormat": "avg", - "refId": "C" + "metric": "", + "query": "(sum(rate(\n tikv_storage_engine_async_request_duration_seconds_sum\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",type=\"snapshot_local_read\"}\n [$__rate_interval]\n)) / sum(rate(\n tikv_storage_engine_async_request_duration_seconds_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",type=\"snapshot_local_read\"}\n [$__rate_interval]\n)) )", + "refId": "", + "step": 10, + "target": "" + }, + { + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_storage_engine_async_request_duration_seconds_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",type=\"snapshot_local_read\"}\n [$__rate_interval]\n)) ", + "format": "time_series", + "hide": true, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "count", + "metric": "", + "query": "sum(rate(\n tikv_storage_engine_async_request_duration_seconds_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",type=\"snapshot_local_read\"}\n [$__rate_interval]\n)) ", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, - "title": "Storage async write duration", + "title": "Storage async snapshot duration (pure local read)", "tooltip": { + "msResolution": true, "shared": true, "sort": 0, "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -18943,6 +25562,7 @@ }, "yaxes": [ { + "decimals": null, "format": "s", "label": null, "logBase": 1, @@ 
-18951,6 +25571,7 @@ "show": true }, { + "decimals": null, "format": "short", "label": null, "logBase": 1, @@ -18961,209 +25582,284 @@ ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } }, { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, + "cards": { + "cardPadding": null, + "cardRound": null + }, + "color": { + "cardColor": "#b4ff00", + "colorScale": "sqrt", + "colorScheme": "interpolateSpectral", + "exponent": 0.5, + "max": null, + "min": null, + "mode": "spectrum" + }, + "dataFormat": "tsbuckets", "datasource": "${DS_TEST-CLUSTER}", - "description": "The storage async snapshot duration without the involving of raftstore", + "description": "Read index propose wait duration associated with async snapshot", + "editable": true, + "error": false, "fieldConfig": { - "defaults": {}, - "overrides": [] + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } }, - "fill": 2, - "fillGradient": 0, "gridPos": { - "h": 8, + "h": 7, "w": 12, - "x": 12, - "y": 48 + "x": 0, + "y": 28 }, - "hiddenSeries": false, - "id": 31114, + "heatmap": {}, + "height": null, + "hideTimeOverride": false, + "hideZeroBuckets": true, + "highlightCards": true, + "id": 189, + "interval": null, "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 1, - "nullPointMode": "null", - "options": { - "alertThreshold": true + "show": false }, - "percentage": false, - "pluginVersion": "7.5.10", - "pointradius": 2, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "spaceLength": 10, - "stack": false, - "steppedLine": false, + "links": [], + "maxDataPoints": 512, + "maxPerRow": null, + "minSpan": null, + "options": {}, + "repeat": null, + "repeatDirection": null, + "reverseYBuckets": false, + "span": null, "targets": [ { - "exemplar": true, - "expr": "histogram_quantile(0.99, 
sum(rate(tikv_storage_engine_async_request_duration_seconds_bucket{tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", type=\"snapshot_local_read\"}[1m])) by (le))", - "interval": "", - "intervalFactor": 1, - "legendFormat": "99%", - "refId": "A" - }, - { - "exemplar": true, - "expr": "histogram_quantile(0.95, sum(rate(tikv_storage_engine_async_request_duration_seconds_bucket{tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", type=\"snapshot_local_read\"}[1m])) by (le))", - "hide": false, - "interval": "", - "intervalFactor": 1, - "legendFormat": "95%", - "refId": "B" - }, - { - "exemplar": true, - "expr": "sum(rate(tikv_storage_engine_async_request_duration_seconds_sum{tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", type=\"snapshot_local_read\"}[1m])) / sum(rate(tikv_storage_engine_async_request_duration_seconds_count{tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", type=\"snapshot_local_read\"}[1m]))", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_storage_engine_async_request_duration_seconds_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",type=\"snapshot_read_index_propose_wait\"}\n [$__rate_interval]\n)) by (le) ", + "format": "heatmap", "hide": false, + "instant": false, "interval": "", "intervalFactor": 1, - "legendFormat": "avg", - "refId": "C" + "legendFormat": "{{le}}", + "metric": "", + "query": "sum(rate(\n tikv_storage_engine_async_request_duration_seconds_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",type=\"snapshot_read_index_propose_wait\"}\n [$__rate_interval]\n)) by (le) ", + "refId": "", + "step": 10, + "target": "" } ], - "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, - "title": "Storage async snapshot duration (pure local read)", + "title": "Read index propose wait duration", "tooltip": { + "msResolution": true, "shared": true, + "showHistogram": true, "sort": 0, "value_type": 
"individual" }, - "type": "graph", - "xaxis": { - "buckets": null, + "transformations": [], + "transparent": false, + "type": "heatmap", + "xAxis": { "mode": "time", "name": null, "show": true, "values": [] }, - "yaxes": [ - { - "format": "s", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } + "xBucketNumber": null, + "xBucketSize": null, + "yAxis": { + "decimals": 1, + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + "yBucketBound": "upper", + "yBucketNumber": null, + "yBucketSize": null }, { "aliasColors": {}, "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", "description": "Read index propose wait duration associated with async snapshot", + "editable": true, + "error": false, "fieldConfig": { - "defaults": {}, - "overrides": [] + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } + }, + "fill": 1, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" }, - "fill": 2, - "fillGradient": 0, "gridPos": { - "h": 8, + "h": 7, "w": 12, "x": 12, - "y": 60 + "y": 28 }, - "hiddenSeries": false, - "id": 31115, + "height": null, + "hideTimeOverride": false, + "id": 190, + "interval": null, + "isNew": true, "legend": { + "alignAsTable": true, "avg": false, - "current": false, - "max": false, + "current": true, + "hideEmpty": true, + "hideZero": true, + "max": true, "min": false, + "rightSide": true, "show": true, + "sideWidth": null, + "sort": "max", + "sortDesc": true, "total": false, - "values": false + "values": true }, "lines": true, "linewidth": 1, - "nullPointMode": "null", + "links": [], + "maxDataPoints": null, + 
"maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", "options": { - "alertThreshold": true + "alertThreshold": true, + "dataLinks": [] }, "percentage": false, - "pluginVersion": "7.5.10", - "pointradius": 2, + "pointradius": 5, "points": false, "renderer": "flot", - "seriesOverrides": [], - "spaceLength": 10, + "repeat": null, + "repeatDirection": null, + "seriesOverrides": [ + { + "alias": "count", + "bars": false, + "dashLength": 1, + "dashes": true, + "fill": 2, + "fillBelowTo": null, + "lines": true, + "spaceLength": 1, + "transform": "negative-Y", + "yaxis": 2, + "zindex": -3 + }, + { + "alias": "avg", + "bars": false, + "fill": 7, + "fillBelowTo": null, + "lines": true, + "yaxis": 1, + "zindex": 0 + } + ], + "span": null, "stack": false, "steppedLine": false, "targets": [ { - "exemplar": true, - "expr": "histogram_quantile(0.99, sum(rate(tikv_storage_engine_async_request_duration_seconds_bucket{tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", type=\"snapshot_read_index_propose_wait\"}[1m])) by (le))", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "histogram_quantile(0.9999,(\n sum(rate(\n tikv_storage_engine_async_request_duration_seconds_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",type=\"snapshot_read_index_propose_wait\"}\n [$__rate_interval]\n)) by (le) \n \n \n)) ", + "format": "time_series", + "hide": false, + "instant": false, "interval": "", "intervalFactor": 1, - "legendFormat": "99%", - "refId": "A" + "legendFormat": "99.99%", + "metric": "", + "query": "histogram_quantile(0.9999,(\n sum(rate(\n tikv_storage_engine_async_request_duration_seconds_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",type=\"snapshot_read_index_propose_wait\"}\n [$__rate_interval]\n)) by (le) \n \n \n)) ", + "refId": "", + "step": 10, + "target": "" }, { - "exemplar": true, - "expr": "histogram_quantile(0.95, 
sum(rate(tikv_storage_engine_async_request_duration_seconds_bucket{tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", type=\"snapshot_read_index_propose_wait\"}[1m])) by (le))", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "histogram_quantile(0.99,(\n sum(rate(\n tikv_storage_engine_async_request_duration_seconds_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",type=\"snapshot_read_index_propose_wait\"}\n [$__rate_interval]\n)) by (le) \n \n \n)) ", + "format": "time_series", "hide": false, + "instant": false, "interval": "", "intervalFactor": 1, - "legendFormat": "95%", - "refId": "B" + "legendFormat": "99%", + "metric": "", + "query": "histogram_quantile(0.99,(\n sum(rate(\n tikv_storage_engine_async_request_duration_seconds_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",type=\"snapshot_read_index_propose_wait\"}\n [$__rate_interval]\n)) by (le) \n \n \n)) ", + "refId": "", + "step": 10, + "target": "" }, { - "exemplar": true, - "expr": "sum(rate(tikv_storage_engine_async_request_duration_seconds_sum{tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", type=\"snapshot_read_index_propose_wait\"}[1m])) / sum(rate(tikv_storage_engine_async_request_duration_seconds_count{tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", type=\"snapshot_read_index_propose_wait\"}[1m]))", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "(sum(rate(\n tikv_storage_engine_async_request_duration_seconds_sum\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",type=\"snapshot_read_index_propose_wait\"}\n [$__rate_interval]\n)) / sum(rate(\n tikv_storage_engine_async_request_duration_seconds_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",type=\"snapshot_read_index_propose_wait\"}\n [$__rate_interval]\n)) )", + "format": "time_series", "hide": false, + "instant": false, "interval": "", "intervalFactor": 
1, "legendFormat": "avg", - "refId": "C" + "metric": "", + "query": "(sum(rate(\n tikv_storage_engine_async_request_duration_seconds_sum\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",type=\"snapshot_read_index_propose_wait\"}\n [$__rate_interval]\n)) / sum(rate(\n tikv_storage_engine_async_request_duration_seconds_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",type=\"snapshot_read_index_propose_wait\"}\n [$__rate_interval]\n)) )", + "refId": "", + "step": 10, + "target": "" + }, + { + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_storage_engine_async_request_duration_seconds_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",type=\"snapshot_read_index_propose_wait\"}\n [$__rate_interval]\n)) ", + "format": "time_series", + "hide": true, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "count", + "metric": "", + "query": "sum(rate(\n tikv_storage_engine_async_request_duration_seconds_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",type=\"snapshot_read_index_propose_wait\"}\n [$__rate_interval]\n)) ", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, "title": "Read index propose wait duration", "tooltip": { + "msResolution": true, "shared": true, "sort": 0, "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -19171,6 +25867,7 @@ }, "yaxes": [ { + "decimals": null, "format": "s", "label": null, "logBase": 1, @@ -19179,6 +25876,7 @@ "show": true }, { + "decimals": null, "format": "short", "label": null, "logBase": 1, @@ -19189,230 +25887,284 @@ ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } }, { - "aliasColors": {}, - "bars": false, - 
"dashLength": 10, - "dashes": false, - "datasource": "${DS_TEST-CLUSTER}", - "description": "Read index confirm duration associated with async snapshot", - "fieldConfig": { - "defaults": {}, - "overrides": [] - }, - "fill": 2, - "fillGradient": 0, - "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 72 + "cacheTimeout": null, + "cards": { + "cardPadding": null, + "cardRound": null }, - "hiddenSeries": false, - "id": 31116, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false + "color": { + "cardColor": "#b4ff00", + "colorScale": "sqrt", + "colorScheme": "interpolateSpectral", + "exponent": 0.5, + "max": null, + "min": null, + "mode": "spectrum" }, - "lines": true, - "linewidth": 1, - "nullPointMode": "null", - "options": { - "alertThreshold": true + "dataFormat": "tsbuckets", + "datasource": "${DS_TEST-CLUSTER}", + "description": "Read index confirm duration associated with async snapshot", + "editable": true, + "error": false, + "fieldConfig": { + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } }, - "percentage": false, - "pluginVersion": "7.5.10", - "pointradius": 2, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "spaceLength": 10, - "stack": false, - "steppedLine": false, + "gridPos": { + "h": 7, + "w": 12, + "x": 0, + "y": 35 + }, + "heatmap": {}, + "height": null, + "hideTimeOverride": false, + "hideZeroBuckets": true, + "highlightCards": true, + "id": 191, + "interval": null, + "legend": { + "show": false + }, + "links": [], + "maxDataPoints": 512, + "maxPerRow": null, + "minSpan": null, + "options": {}, + "repeat": null, + "repeatDirection": null, + "reverseYBuckets": false, + "span": null, "targets": [ { - "exemplar": true, - "expr": "histogram_quantile(0.99, sum(rate(tikv_storage_engine_async_request_duration_seconds_bucket{tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", type=\"snapshot_read_index_confirm\"}[1m])) by 
(le))", - "interval": "", - "intervalFactor": 1, - "legendFormat": "99%", - "refId": "A" - }, - { - "exemplar": true, - "expr": "histogram_quantile(0.95, sum(rate(tikv_storage_engine_async_request_duration_seconds_bucket{tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", type=\"snapshot_read_index_confirm\"}[1m])) by (le))", - "hide": false, - "interval": "", - "intervalFactor": 1, - "legendFormat": "95%", - "refId": "B" - }, - { - "exemplar": true, - "expr": "sum(rate(tikv_storage_engine_async_request_duration_seconds_sum{tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", type=\"snapshot_read_index_confirm\"}[1m])) / sum(rate(tikv_storage_engine_async_request_duration_seconds_count{tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", type=\"snapshot_read_index_confirm\"}[1m]))", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_storage_engine_async_request_duration_seconds_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",type=\"snapshot_read_index_confirm\"}\n [$__rate_interval]\n)) by (le) ", + "format": "heatmap", "hide": false, + "instant": false, "interval": "", "intervalFactor": 1, - "legendFormat": "avg", - "refId": "C" + "legendFormat": "{{le}}", + "metric": "", + "query": "sum(rate(\n tikv_storage_engine_async_request_duration_seconds_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",type=\"snapshot_read_index_confirm\"}\n [$__rate_interval]\n)) by (le) ", + "refId": "", + "step": 10, + "target": "" } ], - "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, "title": "Read index confirm duration", "tooltip": { + "msResolution": true, "shared": true, + "showHistogram": true, "sort": 0, "value_type": "individual" }, - "type": "graph", - "xaxis": { - "buckets": null, + "transformations": [], + "transparent": false, + "type": "heatmap", + "xAxis": { "mode": "time", "name": null, "show": true, "values": [] }, - "yaxes": 
[ - { - "format": "s", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } - } - ], - "repeat": null, - "title": "Storage", - "type": "row" - }, - { - "collapsed": true, - "datasource": null, - "gridPos": { - "h": 1, - "w": 24, - "x": 0, - "y": 19 - }, - "id": 9160, - "panels": [ + "xBucketNumber": null, + "xBucketSize": null, + "yAxis": { + "decimals": 1, + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + "yBucketBound": "upper", + "yBucketNumber": null, + "yBucketSize": null + }, { "aliasColors": {}, "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", - "decimals": 1, - "description": "", + "description": "Read index confirm duration associated with async snapshot", "editable": true, "error": false, "fieldConfig": { - "defaults": {}, - "overrides": [] + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } }, "fill": 1, - "fillGradient": 0, - "grid": {}, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, "gridPos": { - "h": 8, + "h": 7, "w": 12, - "x": 0, - "y": 20 + "x": 12, + "y": 35 }, - "height": "", - "hiddenSeries": false, - "id": 9552, + "height": null, + "hideTimeOverride": false, + "id": 192, + "interval": null, + "isNew": true, "legend": { "alignAsTable": true, "avg": false, "current": true, + "hideEmpty": true, + "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, - "sort": "current", + "sort": "max", "sortDesc": true, "total": false, "values": true }, "lines": true, - "linewidth": 2, + "linewidth": 1, "links": [], - "maxPerRow": 2, - "nullPointMode": 
"null", + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", "options": { - "alertThreshold": true + "alertThreshold": true, + "dataLinks": [] }, "percentage": false, - "pluginVersion": "7.5.11", "pointradius": 5, "points": false, "renderer": "flot", - "seriesOverrides": [], - "spaceLength": 10, + "repeat": null, + "repeatDirection": null, + "seriesOverrides": [ + { + "alias": "count", + "bars": false, + "dashLength": 1, + "dashes": true, + "fill": 2, + "fillBelowTo": null, + "lines": true, + "spaceLength": 1, + "transform": "negative-Y", + "yaxis": 2, + "zindex": -3 + }, + { + "alias": "avg", + "bars": false, + "fill": 7, + "fillBelowTo": null, + "lines": true, + "yaxis": 1, + "zindex": 0 + } + ], + "span": null, "stack": false, "steppedLine": false, "targets": [ { - "expr": "sum(tikv_scheduler_write_flow{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}) by (instance)", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "histogram_quantile(0.9999,(\n sum(rate(\n tikv_storage_engine_async_request_duration_seconds_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",type=\"snapshot_read_index_confirm\"}\n [$__rate_interval]\n)) by (le) \n \n \n)) ", "format": "time_series", - "intervalFactor": 2, - "legendFormat": "write-{{instance}}", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "99.99%", "metric": "", - "refId": "A", - "step": 40 + "query": "histogram_quantile(0.9999,(\n sum(rate(\n tikv_storage_engine_async_request_duration_seconds_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",type=\"snapshot_read_index_confirm\"}\n [$__rate_interval]\n)) by (le) \n \n \n)) ", + "refId": "", + "step": 10, + "target": "" }, { - "expr": "sum(tikv_scheduler_throttle_flow{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}) by 
(instance) != 0", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "histogram_quantile(0.99,(\n sum(rate(\n tikv_storage_engine_async_request_duration_seconds_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",type=\"snapshot_read_index_confirm\"}\n [$__rate_interval]\n)) by (le) \n \n \n)) ", "format": "time_series", - "intervalFactor": 2, - "legendFormat": "throttle-{{instance}}", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "99%", + "metric": "", + "query": "histogram_quantile(0.99,(\n sum(rate(\n tikv_storage_engine_async_request_duration_seconds_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",type=\"snapshot_read_index_confirm\"}\n [$__rate_interval]\n)) by (le) \n \n \n)) ", + "refId": "", + "step": 10, + "target": "" + }, + { + "datasource": "${DS_TEST-CLUSTER}", + "expr": "(sum(rate(\n tikv_storage_engine_async_request_duration_seconds_sum\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",type=\"snapshot_read_index_confirm\"}\n [$__rate_interval]\n)) / sum(rate(\n tikv_storage_engine_async_request_duration_seconds_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",type=\"snapshot_read_index_confirm\"}\n [$__rate_interval]\n)) )", + "format": "time_series", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "avg", + "metric": "", + "query": "(sum(rate(\n tikv_storage_engine_async_request_duration_seconds_sum\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",type=\"snapshot_read_index_confirm\"}\n [$__rate_interval]\n)) / sum(rate(\n tikv_storage_engine_async_request_duration_seconds_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",type=\"snapshot_read_index_confirm\"}\n [$__rate_interval]\n)) )", + "refId": "", + "step": 
10, + "target": "" + }, + { + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_storage_engine_async_request_duration_seconds_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",type=\"snapshot_read_index_confirm\"}\n [$__rate_interval]\n)) ", + "format": "time_series", + "hide": true, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "count", "metric": "", - "refId": "B", - "step": 40 + "query": "sum(rate(\n tikv_storage_engine_async_request_duration_seconds_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",type=\"snapshot_read_index_confirm\"}\n [$__rate_interval]\n)) ", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, - "title": "Scheduler flow", + "title": "Read index confirm duration", "tooltip": { - "msResolution": false, + "msResolution": true, "shared": true, "sort": 0, - "value_type": "cumulative" + "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -19420,7 +26172,8 @@ }, "yaxes": [ { - "format": "bytes", + "decimals": null, + "format": "s", "label": null, "logBase": 1, "max": null, @@ -19428,6 +26181,7 @@ "show": true }, { + "decimals": null, "format": "short", "label": null, "logBase": 1, @@ -19438,92 +26192,112 @@ ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } }, { "aliasColors": {}, "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", - "decimals": 1, - "description": "", + "description": "CPU usage measured over a 30 second window", "editable": true, "error": false, "fieldConfig": { - "defaults": {}, - "overrides": [] + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } }, "fill": 1, - "fillGradient": 0, - "grid": {}, + 
"fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, "gridPos": { - "h": 8, + "h": 7, "w": 12, - "x": 12, - "y": 20 + "x": 0, + "y": 42 }, - "height": "", - "hiddenSeries": false, - "id": 9553, + "height": null, + "hideTimeOverride": false, + "id": 193, + "interval": null, + "isNew": true, "legend": { "alignAsTable": true, "avg": false, "current": true, + "hideEmpty": true, + "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, - "sort": "current", + "sort": "max", "sortDesc": true, "total": false, "values": true }, "lines": true, - "linewidth": 2, + "linewidth": 1, "links": [], - "maxPerRow": 2, - "nullPointMode": "null", + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", "options": { - "alertThreshold": true + "alertThreshold": true, + "dataLinks": [] }, "percentage": false, - "pluginVersion": "7.5.11", "pointradius": 5, "points": false, "renderer": "flot", + "repeat": null, + "repeatDirection": null, "seriesOverrides": [], - "spaceLength": 10, + "span": null, "stack": false, "steppedLine": false, "targets": [ { - "expr": "sum(tikv_scheduler_discard_ratio{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}) by (instance) / 10000000", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum((\n tikv_storage_process_stat_cpu_usage\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n \n)) by (instance) ", "format": "time_series", "hide": false, - "intervalFactor": 2, + "instant": false, + "interval": "", + "intervalFactor": 1, "legendFormat": "{{instance}}", "metric": "", - "refId": "A", - "step": 40 + "query": "sum((\n tikv_storage_process_stat_cpu_usage\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n \n)) by (instance) ", + "refId": "", + 
"step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, - "title": "Scheduler discard ratio", + "title": "Process Stat Cpu Usage", "tooltip": { - "msResolution": false, + "msResolution": true, "shared": true, "sort": 0, - "value_type": "cumulative" + "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -19531,14 +26305,16 @@ }, "yaxes": [ { + "decimals": null, "format": "percentunit", "label": null, "logBase": 1, "max": null, - "min": "0", + "min": null, "show": true }, { + "decimals": null, "format": "short", "label": null, "logBase": 1, @@ -19549,163 +26325,180 @@ ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } }, - { - "cards": { - "cardPadding": null, - "cardRound": null - }, - "color": { - "cardColor": "#b4ff00", - "colorScale": "sqrt", - "colorScheme": "interpolateSpectral", - "exponent": 0.5, - "min": 0, - "mode": "spectrum" - }, - "dataFormat": "tsbuckets", - "datasource": "${DS_TEST-CLUSTER}", - "description": "", - "fieldConfig": { - "defaults": {}, - "overrides": [] - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 28 - }, - "heatmap": {}, - "hideZeroBuckets": true, - "highlightCards": true, - "id": 11512, - "legend": { - "show": true - }, - "links": [], - "reverseYBuckets": false, - "targets": [ - { - "expr": "sum(delta(tikv_scheduler_throttle_duration_seconds_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])) by (le)", - "format": "heatmap", - "intervalFactor": 2, - "legendFormat": "{{le}}", - "refId": "A" - } - ], - "timeFrom": null, - "timeShift": null, - "title": "Throttle duration", - "tooltip": { - "show": true, - "showHistogram": false - }, - "type": "heatmap", - "xAxis": { - "show": true - }, - "xBucketNumber": null, - "xBucketSize": null, - "yAxis": { - "decimals": 0, - "format": "s", - "logBase": 
1, - "max": null, - "min": null, - "show": true, - "splitFactor": null - }, - "yBucketBound": "auto", - "yBucketNumber": null, - "yBucketSize": null - }, { "aliasColors": {}, - "bars": true, + "bars": false, "cacheTimeout": null, - "dashLength": 10, - "dashes": false, "datasource": "${DS_TEST-CLUSTER}", - "decimals": 1, - "description": "The count of pending commands per TiKV instance", + "description": "", "editable": true, "error": false, "fieldConfig": { - "defaults": {}, - "overrides": [] + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } }, "fill": 1, - "fillGradient": 0, - "grid": {}, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, "gridPos": { - "h": 8, + "h": 7, "w": 12, "x": 12, - "y": 28 + "y": 42 }, - "height": "", - "hiddenSeries": false, - "id": 11906, + "height": null, + "hideTimeOverride": false, + "id": 194, + "interval": null, + "isNew": true, "legend": { "alignAsTable": true, "avg": false, "current": true, + "hideEmpty": true, + "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, - "sort": "current", + "sort": "max", "sortDesc": true, "total": false, "values": true }, - "lines": false, - "linewidth": 2, + "lines": true, + "linewidth": 1, "links": [], - "maxPerRow": 2, - "nullPointMode": "null", + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", "options": { - "alertThreshold": true + "alertThreshold": true, + "dataLinks": [] }, "percentage": false, - "pluginVersion": "7.5.11", "pointradius": 5, "points": false, "renderer": "flot", - "seriesOverrides": [], - "spaceLength": 10, - "stack": true, + "repeat": null, + "repeatDirection": null, + "seriesOverrides": [ + { + "alias": "count", + "bars": false, + "dashLength": 1, + "dashes": true, + "fill": 2, + "fillBelowTo": null, + "lines": true, + 
"spaceLength": 1, + "transform": "negative-Y", + "yaxis": 2, + "zindex": -3 + }, + { + "alias": "avg", + "bars": false, + "fill": 7, + "fillBelowTo": null, + "lines": true, + "yaxis": 1, + "zindex": 0 + } + ], + "span": null, + "stack": false, "steppedLine": false, "targets": [ { - "expr": "tikv_scheduler_throttle_cf{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"} != 0", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "histogram_quantile(0.9999,(\n sum(rate(\n tikv_storage_full_compact_duration_seconds_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (le) \n \n \n)) ", "format": "time_series", "hide": false, - "intervalFactor": 2, - "legendFormat": "{{instance}}-{{cf}}", + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "99.99%", + "metric": "", + "query": "histogram_quantile(0.9999,(\n sum(rate(\n tikv_storage_full_compact_duration_seconds_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (le) \n \n \n)) ", + "refId": "", + "step": 10, + "target": "" + }, + { + "datasource": "${DS_TEST-CLUSTER}", + "expr": "histogram_quantile(0.99,(\n sum(rate(\n tikv_storage_full_compact_duration_seconds_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (le) \n \n \n)) ", + "format": "time_series", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "99%", + "metric": "", + "query": "histogram_quantile(0.99,(\n sum(rate(\n tikv_storage_full_compact_duration_seconds_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (le) \n \n \n)) ", + "refId": "", + "step": 10, + "target": "" + }, + { + "datasource": "${DS_TEST-CLUSTER}", + "expr": "(sum(rate(\n 
tikv_storage_full_compact_duration_seconds_sum\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) / sum(rate(\n tikv_storage_full_compact_duration_seconds_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) )", + "format": "time_series", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "avg", + "metric": "", + "query": "(sum(rate(\n tikv_storage_full_compact_duration_seconds_sum\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) / sum(rate(\n tikv_storage_full_compact_duration_seconds_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) )", + "refId": "", + "step": 10, + "target": "" + }, + { + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_storage_full_compact_duration_seconds_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) ", + "format": "time_series", + "hide": true, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "count", "metric": "", - "refId": "A", - "step": 40 + "query": "sum(rate(\n tikv_storage_full_compact_duration_seconds_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) ", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, - "title": "Scheduler throttled CF", + "title": "Full compaction duration seconds", "tooltip": { - "msResolution": false, + "msResolution": true, "shared": true, "sort": 0, - "value_type": "cumulative" + "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ 
-19713,14 +26506,16 @@ }, "yaxes": [ { - "format": "none", + "decimals": null, + "format": "s", "label": null, "logBase": 1, "max": null, - "min": "0", + "min": null, "show": true }, { + "decimals": null, "format": "short", "label": null, "logBase": 1, @@ -19731,35 +26526,50 @@ ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } }, { "aliasColors": {}, "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", - "decimals": 1, "description": "", + "editable": true, + "error": false, "fieldConfig": { - "defaults": {}, - "overrides": [] + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } }, "fill": 1, - "fillGradient": 0, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, "gridPos": { - "h": 8, + "h": 7, "w": 12, "x": 0, - "y": 36 + "y": 49 }, - "hiddenSeries": false, - "id": 9947, + "height": null, + "hideTimeOverride": false, + "id": 195, + "interval": null, + "isNew": true, "legend": { "alignAsTable": true, "avg": false, "current": true, + "hideEmpty": true, + "hideZero": true, "max": true, "min": false, "rightSide": true, @@ -19773,43 +26583,123 @@ "lines": true, "linewidth": 1, "links": [], - "nullPointMode": "null", + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", "options": { - "alertThreshold": true + "alertThreshold": true, + "dataLinks": [] }, "percentage": false, - "pluginVersion": "7.5.11", "pointradius": 5, "points": false, "renderer": "flot", - "seriesOverrides": [], - "spaceLength": 10, + "repeat": null, + "repeatDirection": null, + "seriesOverrides": [ + { + "alias": "count", + "bars": false, + "dashLength": 1, + "dashes": true, + "fill": 2, + "fillBelowTo": null, + "lines": true, + "spaceLength": 1, + "transform": "negative-Y", + "yaxis": 2, + "zindex": -3 + }, + { + "alias": 
"avg", + "bars": false, + "fill": 7, + "fillBelowTo": null, + "lines": true, + "yaxis": 1, + "zindex": 0 + } + ], + "span": null, "stack": false, "steppedLine": false, "targets": [ { - "expr": "sum(rate(tikv_scheduler_throttle_action_total{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])) by (cf, type)", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "histogram_quantile(0.9999,(\n sum(rate(\n tikv_storage_full_compact_pause_duration_seconds_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (le) \n \n \n)) ", "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{type}}-{{cf}}", - "metric": "tikv_grpc_msg_duration_seconds_bucket", - "refId": "A", - "step": 10 + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "99.99%", + "metric": "", + "query": "histogram_quantile(0.9999,(\n sum(rate(\n tikv_storage_full_compact_pause_duration_seconds_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (le) \n \n \n)) ", + "refId": "", + "step": 10, + "target": "" + }, + { + "datasource": "${DS_TEST-CLUSTER}", + "expr": "histogram_quantile(0.99,(\n sum(rate(\n tikv_storage_full_compact_pause_duration_seconds_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (le) \n \n \n)) ", + "format": "time_series", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "99%", + "metric": "", + "query": "histogram_quantile(0.99,(\n sum(rate(\n tikv_storage_full_compact_pause_duration_seconds_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (le) \n \n \n)) ", + "refId": "", + "step": 10, + "target": "" + }, + { + "datasource": "${DS_TEST-CLUSTER}", + "expr": 
"(sum(rate(\n tikv_storage_full_compact_pause_duration_seconds_sum\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) / sum(rate(\n tikv_storage_full_compact_pause_duration_seconds_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) )", + "format": "time_series", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "avg", + "metric": "", + "query": "(sum(rate(\n tikv_storage_full_compact_pause_duration_seconds_sum\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) / sum(rate(\n tikv_storage_full_compact_pause_duration_seconds_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) )", + "refId": "", + "step": 10, + "target": "" + }, + { + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_storage_full_compact_pause_duration_seconds_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) ", + "format": "time_series", + "hide": true, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "count", + "metric": "", + "query": "sum(rate(\n tikv_storage_full_compact_pause_duration_seconds_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) ", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, - "title": "Flow controller actions", + "title": "Full compaction pause duration", "tooltip": { + "msResolution": true, "shared": true, "sort": 0, "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -19817,7 +26707,8 @@ }, "yaxes": [ { - 
"format": "ops", + "decimals": null, + "format": "s", "label": null, "logBase": 1, "max": null, @@ -19825,6 +26716,7 @@ "show": true }, { + "decimals": null, "format": "short", "label": null, "logBase": 1, @@ -19835,120 +26727,180 @@ ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } }, { "aliasColors": {}, "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", - "decimals": 1, "description": "", "editable": true, "error": false, "fieldConfig": { - "defaults": {}, - "overrides": [] + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } }, "fill": 1, - "fillGradient": 0, - "grid": {}, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, "gridPos": { - "h": 8, + "h": 7, "w": 12, "x": 12, - "y": 36 + "y": 49 }, - "height": "", - "hiddenSeries": false, - "id": 10338, + "height": null, + "hideTimeOverride": false, + "id": 196, + "interval": null, + "isNew": true, "legend": { "alignAsTable": true, "avg": false, "current": true, + "hideEmpty": true, + "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, - "sort": "current", + "sort": "max", "sortDesc": true, "total": false, "values": true }, "lines": true, - "linewidth": 2, + "linewidth": 1, "links": [], - "maxPerRow": 2, - "nullPointMode": "null", + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", "options": { - "alertThreshold": true + "alertThreshold": true, + "dataLinks": [] }, "percentage": false, - "pluginVersion": "7.5.11", "pointradius": 5, "points": false, "renderer": "flot", - "seriesOverrides": [], - "spaceLength": 10, + "repeat": null, + "repeatDirection": null, + "seriesOverrides": [ + { + "alias": "count", + "bars": false, + "dashLength": 1, + "dashes": true, + "fill": 2, + "fillBelowTo": 
null, + "lines": true, + "spaceLength": 1, + "transform": "negative-Y", + "yaxis": 2, + "zindex": -3 + }, + { + "alias": "avg", + "bars": false, + "fill": 7, + "fillBelowTo": null, + "lines": true, + "yaxis": 1, + "zindex": 0 + } + ], + "span": null, "stack": false, "steppedLine": false, "targets": [ { - "expr": "sum(tikv_scheduler_l0_flow{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}) by (instance, cf)", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "histogram_quantile(0.9999,(\n sum(rate(\n tikv_storage_full_compact_increment_duration_seconds_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (le) \n \n \n)) ", "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{cf}}_l0_flow-{{instance}}", - "metric": "", - "refId": "D", - "step": 40 - }, + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "99.99%", + "metric": "", + "query": "histogram_quantile(0.9999,(\n sum(rate(\n tikv_storage_full_compact_increment_duration_seconds_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (le) \n \n \n)) ", + "refId": "", + "step": 10, + "target": "" + }, { - "expr": "sum(tikv_scheduler_flush_flow{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}) by (instance, cf)", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "histogram_quantile(0.99,(\n sum(rate(\n tikv_storage_full_compact_increment_duration_seconds_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (le) \n \n \n)) ", "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{cf}}_flush_flow-{{instance}}", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "99%", "metric": "", - "refId": "E", - "step": 40 + "query": 
"histogram_quantile(0.99,(\n sum(rate(\n tikv_storage_full_compact_increment_duration_seconds_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (le) \n \n \n)) ", + "refId": "", + "step": 10, + "target": "" }, { - "expr": "sum(tikv_scheduler_l0_flow{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}) by (instance)", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "(sum(rate(\n tikv_storage_full_compact_increment_duration_seconds_sum\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) / sum(rate(\n tikv_storage_full_compact_increment_duration_seconds_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) )", "format": "time_series", - "hide": true, - "intervalFactor": 2, - "legendFormat": "total_l0_flow-{{instance}}", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "avg", "metric": "", - "refId": "B", - "step": 40 + "query": "(sum(rate(\n tikv_storage_full_compact_increment_duration_seconds_sum\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) / sum(rate(\n tikv_storage_full_compact_increment_duration_seconds_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) )", + "refId": "", + "step": 10, + "target": "" }, { - "expr": "sum(tikv_scheduler_flush_flow{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}) by (instance)", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_storage_full_compact_increment_duration_seconds_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) ", "format": "time_series", "hide": true, - "intervalFactor": 2, - 
"legendFormat": "total_flush_flow-{{instance}}", + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "count", "metric": "", - "refId": "C", - "step": 40 + "query": "sum(rate(\n tikv_storage_full_compact_increment_duration_seconds_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) ", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, - "title": "Flush/L0 flow", + "title": "Full compaction per-increment duration", "tooltip": { - "msResolution": false, + "msResolution": true, "shared": true, "sort": 0, - "value_type": "cumulative" + "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -19956,7 +26908,8 @@ }, "yaxes": [ { - "format": "bytes", + "decimals": null, + "format": "s", "label": null, "logBase": 1, "max": null, @@ -19964,6 +26917,7 @@ "show": true }, { + "decimals": null, "format": "short", "label": null, "logBase": 1, @@ -19974,109 +26928,169 @@ ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } - }, + } + ], + "repeat": null, + "repeatDirection": null, + "span": null, + "targets": [], + "timeFrom": null, + "timeShift": null, + "title": "Storage", + "transformations": [], + "transparent": false, + "type": "row" + }, + { + "cacheTimeout": null, + "collapsed": true, + "datasource": null, + "description": null, + "editable": true, + "error": false, + "fieldConfig": { + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } + }, + "gridPos": { + "h": 7, + "w": 24, + "x": 0, + "y": 0 + }, + "height": null, + "hideTimeOverride": false, + "id": 197, + "interval": null, + "links": [], + "maxDataPoints": 100, + "maxPerRow": null, + "minSpan": null, + "panels": [ { "aliasColors": {}, "bars": false, - "dashLength": 10, - "dashes": false, + 
"cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", - "decimals": 1, "description": "", "editable": true, "error": false, "fieldConfig": { - "defaults": {}, - "overrides": [] + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } }, "fill": 1, - "fillGradient": 0, - "grid": {}, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, "gridPos": { - "h": 8, + "h": 7, "w": 12, "x": 0, - "y": 44 + "y": 0 }, - "height": "", - "hiddenSeries": false, - "id": 9944, + "height": null, + "hideTimeOverride": false, + "id": 198, + "interval": null, + "isNew": true, "legend": { "alignAsTable": true, "avg": false, "current": true, + "hideEmpty": true, + "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, - "sort": "current", + "sort": "max", "sortDesc": true, "total": false, "values": true }, "lines": true, - "linewidth": 2, + "linewidth": 1, "links": [], - "maxPerRow": 2, - "nullPointMode": "null", + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", "options": { - "alertThreshold": true + "alertThreshold": true, + "dataLinks": [] }, "percentage": false, - "pluginVersion": "7.5.11", "pointradius": 5, "points": false, "renderer": "flot", + "repeat": null, + "repeatDirection": null, "seriesOverrides": [], - "spaceLength": 10, + "span": null, "stack": false, "steppedLine": false, "targets": [ { - "expr": "max(tikv_scheduler_l0{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}) by (instance)", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "l0-{{instance}}", - "metric": "", - "refId": "A", - "step": 40 - }, - { - "expr": "max(tikv_scheduler_memtable{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}) by (instance)", + "datasource": 
"${DS_TEST-CLUSTER}", + "expr": "sum((\n tikv_scheduler_write_flow\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n \n)) by (instance) ", "format": "time_series", - "intervalFactor": 2, - "legendFormat": "memtable-{{instance}}", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "write-{{instance}}", "metric": "", - "refId": "B", - "step": 40 + "query": "sum((\n tikv_scheduler_write_flow\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n \n)) by (instance) ", + "refId": "", + "step": 10, + "target": "" }, { - "expr": "max(tikv_scheduler_l0_avg{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}) by (instance)", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum((\n tikv_scheduler_throttle_flow\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n \n)) by (instance) != 0", "format": "time_series", - "intervalFactor": 2, - "legendFormat": "avg_l0-{{instance}}", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "throttle-{{instance}}", "metric": "", - "refId": "C", - "step": 40 + "query": "sum((\n tikv_scheduler_throttle_flow\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n \n)) by (instance) != 0", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, - "title": "Flow controller factors", + "title": "Scheduler flow", "tooltip": { - "msResolution": false, + "msResolution": true, "shared": true, "sort": 0, - "value_type": "cumulative" + "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -20084,7 +27098,8 @@ }, "yaxes": [ { - "format": "short", + "decimals": null, + "format": "bytes", "label": null, 
"logBase": 1, "max": null, @@ -20092,6 +27107,7 @@ "show": true }, { + "decimals": null, "format": "short", "label": null, "logBase": 1, @@ -20102,41 +27118,56 @@ ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } }, { "aliasColors": {}, "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", - "decimals": 1, "description": "", + "editable": true, + "error": false, "fieldConfig": { - "defaults": {}, - "overrides": [] + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } }, "fill": 1, - "fillGradient": 0, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, "gridPos": { - "h": 8, + "h": 7, "w": 12, "x": 12, - "y": 44 + "y": 0 }, - "hiddenSeries": false, - "id": 9946, + "height": null, + "hideTimeOverride": false, + "id": 199, + "interval": null, + "isNew": true, "legend": { "alignAsTable": true, "avg": false, "current": true, + "hideEmpty": true, + "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, - "sort": "current", + "sort": "max", "sortDesc": true, "total": false, "values": true @@ -20144,65 +27175,55 @@ "lines": true, "linewidth": 1, "links": [], - "nullPointMode": "null", + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", "options": { - "alertThreshold": true + "alertThreshold": true, + "dataLinks": [] }, "percentage": false, - "pluginVersion": "7.5.11", "pointradius": 5, "points": false, "renderer": "flot", - "seriesOverrides": [ - { - "alias": "cf", - "yaxis": 2 - }, - { - "alias": "pending-bytes", - "yaxis": 2 - } - ], - "spaceLength": 10, + "repeat": null, + "repeatDirection": null, + "seriesOverrides": [], + "span": null, "stack": false, "steppedLine": false, "targets": [ { - "expr": 
"sum(tikv_engine_pending_compaction_bytes{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", db=\"kv\"}) by (cf)", - "format": "time_series", - "hide": false, - "intervalFactor": 2, - "legendFormat": "{{cf}}", - "metric": "tikv_engine_pending_compaction_bytes", - "refId": "A", - "step": 10 - }, - { - "exemplar": true, - "expr": "max(tikv_scheduler_pending_compaction_bytes{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}) by (instance) / 10000000", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum((\n tikv_scheduler_discard_ratio\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n \n)) by (type) / 10000000", "format": "time_series", "hide": false, + "instant": false, "interval": "", - "intervalFactor": 2, - "legendFormat": "pending-bytes-{{instance}}", - "metric": "tikv_engine_pending_compaction_bytes", - "refId": "B", - "step": 10 + "intervalFactor": 1, + "legendFormat": "{{type}}", + "metric": "", + "query": "sum((\n tikv_scheduler_discard_ratio\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n \n)) by (type) / 10000000", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, - "title": "Compaction pending bytes", + "title": "Scheduler discard ratio", "tooltip": { + "msResolution": true, "shared": true, "sort": 0, "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -20210,7 +27231,8 @@ }, "yaxes": [ { - "format": "bytes", + "decimals": null, + "format": "percentunit", "label": null, "logBase": 1, "max": null, @@ -20218,189 +27240,227 @@ "show": true }, { + "decimals": null, "format": "short", "label": null, "logBase": 1, "max": null, - "min": "30", + "min": null, "show": true } ], "yaxis": { "align": false, - "alignLevel": 
null + "alignLevel": 0 } }, { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, + "cards": { + "cardPadding": null, + "cardRound": null + }, + "color": { + "cardColor": "#b4ff00", + "colorScale": "sqrt", + "colorScheme": "interpolateSpectral", + "exponent": 0.5, + "max": null, + "min": null, + "mode": "spectrum" + }, + "dataFormat": "tsbuckets", "datasource": "${DS_TEST-CLUSTER}", - "description": "Throttle time for txn storage commands in 1 minute.", + "description": null, + "editable": true, + "error": false, "fieldConfig": { - "defaults": {}, - "overrides": [] + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } }, - "fill": 1, - "fillGradient": 0, "gridPos": { - "h": 8, + "h": 7, "w": 12, "x": 0, - "y": 52 + "y": 7 }, - "hiddenSeries": false, - "id": 23763572363, + "heatmap": {}, + "height": null, + "hideTimeOverride": false, + "hideZeroBuckets": true, + "highlightCards": true, + "id": 200, + "interval": null, "legend": { - "alignAsTable": true, - "avg": false, - "current": false, - "max": true, - "min": false, - "rightSide": true, - "show": true, - "total": true, - "values": true - }, - "lines": true, - "linewidth": 1, - "nullPointMode": "null", - "options": { - "alertThreshold": true + "show": false }, - "percentage": false, - "pluginVersion": "7.5.11", - "pointradius": 2, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "spaceLength": 10, - "stack": false, - "steppedLine": false, + "links": [], + "maxDataPoints": 512, + "maxPerRow": null, + "minSpan": null, + "options": {}, + "repeat": null, + "repeatDirection": null, + "reverseYBuckets": false, + "span": null, "targets": [ { - "exemplar": true, - "expr": "sum(rate(tikv_txn_command_throttle_time_total{tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])) by (type)", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_scheduler_throttle_duration_seconds_bucket\n 
{k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (le) ", + "format": "heatmap", + "hide": false, + "instant": false, "interval": "", - "intervalFactor": 2, - "legendFormat": "{{type}}", - "queryType": "randomWalk", - "refId": "A" + "intervalFactor": 1, + "legendFormat": "{{le}}", + "metric": "", + "query": "sum(rate(\n tikv_scheduler_throttle_duration_seconds_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (le) ", + "refId": "", + "step": 10, + "target": "" } ], - "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, - "title": "Txn command throttled duration", + "title": "Throttle duration", "tooltip": { + "msResolution": true, "shared": true, + "showHistogram": true, "sort": 0, "value_type": "individual" }, - "type": "graph", - "xaxis": { - "buckets": null, + "transformations": [], + "transparent": false, + "type": "heatmap", + "xAxis": { "mode": "time", "name": null, "show": true, "values": [] }, - "yaxes": [ - { - "format": "µs", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } + "xBucketNumber": null, + "xBucketSize": null, + "yAxis": { + "decimals": 1, + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + "yBucketBound": "upper", + "yBucketNumber": null, + "yBucketSize": null }, { "aliasColors": {}, "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", - "description": "Throttle time for non-txn related processing like analyze or dag in 1 minute.", + "description": null, + "editable": true, + "error": false, "fieldConfig": { - "defaults": {}, - "overrides": [] + "defaults": { + "thresholds": { + 
"mode": "absolute", + "steps": [] + } + } }, "fill": 1, - "fillGradient": 0, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, "gridPos": { - "h": 8, + "h": 7, "w": 12, "x": 12, - "y": 52 + "y": 7 }, - "hiddenSeries": false, - "id": 23763572365, + "height": null, + "hideTimeOverride": false, + "id": 201, + "interval": null, + "isNew": true, "legend": { "alignAsTable": true, "avg": false, - "current": false, + "current": true, + "hideEmpty": true, + "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, - "total": true, + "sideWidth": null, + "sort": "max", + "sortDesc": true, + "total": false, "values": true }, "lines": true, "linewidth": 1, - "nullPointMode": "null", + "links": [], + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", "options": { - "alertThreshold": true + "alertThreshold": true, + "dataLinks": [] }, "percentage": false, - "pluginVersion": "7.5.11", - "pointradius": 2, + "pointradius": 5, "points": false, "renderer": "flot", + "repeat": null, + "repeatDirection": null, "seriesOverrides": [], - "spaceLength": 10, + "span": null, "stack": false, "steppedLine": false, "targets": [ { - "exemplar": true, - "expr": "sum(rate(tikv_non_txn_command_throttle_time_total{tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])) by (type) ", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "((\n tikv_scheduler_throttle_cf\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n \n)) != 0", + "format": "time_series", + "hide": false, + "instant": false, "interval": "", - "intervalFactor": 2, - "legendFormat": "{{type}}", - "queryType": "randomWalk", - "refId": "A" + "intervalFactor": 1, + "legendFormat": "{{instance}}-{{cf}}", + "metric": "", + "query": "((\n tikv_scheduler_throttle_cf\n 
{k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n \n)) != 0", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, - "title": "Non-txn command throttled duration", + "title": "Scheduler throttled CF", "tooltip": { + "msResolution": true, "shared": true, "sort": 0, "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -20408,7 +27468,8 @@ }, "yaxes": [ { - "format": "µs", + "decimals": null, + "format": "ops", "label": null, "logBase": 1, "max": null, @@ -20416,6 +27477,7 @@ "show": true }, { + "decimals": null, "format": "short", "label": null, "logBase": 1, @@ -20426,57 +27488,56 @@ ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } - } - ], - "title": "Flow Control", - "type": "row" - }, - { - "collapsed": true, - "datasource": null, - "gridPos": { - "h": 1, - "w": 24, - "x": 0, - "y": 20 - }, - "id": 2756, - "panels": [ + }, { "aliasColors": {}, "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", - "decimals": 1, - "description": "The total number of commands on each stage in commit command", + "description": "", + "editable": true, + "error": false, "fieldConfig": { - "defaults": {}, - "overrides": [] + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } }, "fill": 1, - "fillGradient": 0, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, "gridPos": { - "h": 10, - "w": 24, + "h": 7, + "w": 12, "x": 0, - "y": 18 + "y": 14 }, - "height": "400", - "hiddenSeries": false, - "id": 168, + "height": null, + "hideTimeOverride": false, + "id": 202, + "interval": null, + "isNew": true, "legend": { "alignAsTable": 
true, "avg": false, "current": true, + "hideEmpty": true, + "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, - "sort": "current", + "sort": "max", "sortDesc": true, "total": false, "values": true @@ -20484,58 +27545,55 @@ "lines": true, "linewidth": 1, "links": [], - "maxPerRow": 1, - "nullPointMode": "null", + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", "options": { - "alertThreshold": true + "alertThreshold": true, + "dataLinks": [] }, "percentage": false, - "pluginVersion": "7.5.7", "pointradius": 5, "points": false, "renderer": "flot", - "scopedVars": { - "command": { - "selected": false, - "text": "acquire_pessimistic_lock", - "value": "acquire_pessimistic_lock" - } - }, + "repeat": null, + "repeatDirection": null, "seriesOverrides": [], - "spaceLength": 10, + "span": null, "stack": false, "steppedLine": false, "targets": [ { - "expr": "sum(rate(tikv_scheduler_too_busy_total{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", type=\"$command\"}[1m]))", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_scheduler_throttle_action_total\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (type, cf) ", "format": "time_series", - "intervalFactor": 2, - "legendFormat": "busy", - "refId": "A", - "step": 4 - }, - { - "expr": "sum(rate(tikv_scheduler_stage_total{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", type=\"$command\"}[1m])) by (stage)", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{stage}}", - "refId": "B", - "step": 4 + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{type}}-{{cf}}", + "metric": "", + "query": "sum(rate(\n tikv_scheduler_throttle_action_total\n 
{k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (type, cf) ", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, - "title": "Scheduler stage total", + "title": "Flow controller actions", "tooltip": { + "msResolution": true, "shared": true, "sort": 0, "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -20543,6 +27601,7 @@ }, "yaxes": [ { + "decimals": null, "format": "ops", "label": null, "logBase": 1, @@ -20551,6 +27610,7 @@ "show": true }, { + "decimals": null, "format": "short", "label": null, "logBase": 1, @@ -20561,116 +27621,157 @@ ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } }, { "aliasColors": {}, "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", - "decimals": 1, - "description": "The time consumed when executing commit command", + "description": "", "editable": true, "error": false, "fieldConfig": { - "defaults": {}, - "overrides": [] + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } }, "fill": 1, - "fillGradient": 0, - "grid": {}, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, "gridPos": { - "h": 8, + "h": 7, "w": 12, - "x": 0, - "y": 28 + "x": 12, + "y": 14 }, - "hiddenSeries": false, - "id": 3, + "height": null, + "hideTimeOverride": false, + "id": 203, + "interval": null, + "isNew": true, "legend": { "alignAsTable": true, "avg": false, "current": true, "hideEmpty": true, - "hideZero": false, + "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, - "sort": "current", + "sort": "max", "sortDesc": true, 
"total": false, "values": true }, "lines": true, - "linewidth": 2, + "linewidth": 1, "links": [], - "nullPointMode": "null", + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", "options": { - "alertThreshold": true + "alertThreshold": true, + "dataLinks": [] }, "percentage": false, - "pluginVersion": "7.5.7", "pointradius": 5, "points": false, "renderer": "flot", - "scopedVars": { - "command": { - "selected": false, - "text": "acquire_pessimistic_lock", - "value": "acquire_pessimistic_lock" - } - }, + "repeat": null, + "repeatDirection": null, "seriesOverrides": [], - "spaceLength": 10, + "span": null, "stack": false, "steppedLine": false, "targets": [ { - "expr": "histogram_quantile(0.99, sum(rate(tikv_scheduler_command_duration_seconds_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", type=\"$command\"}[1m])) by (le))", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum((\n tikv_scheduler_l0_flow\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n \n)) by (instance, cf) ", "format": "time_series", - "intervalFactor": 2, - "legendFormat": "99%", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{cf}}_l0_flow-{{instance}}", "metric": "", - "refId": "A", - "step": 10 + "query": "sum((\n tikv_scheduler_l0_flow\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n \n)) by (instance, cf) ", + "refId": "", + "step": 10, + "target": "" }, { - "expr": "histogram_quantile(0.95, sum(rate(tikv_scheduler_command_duration_seconds_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", type=\"$command\"}[1m])) by (le))", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum((\n tikv_scheduler_flush_flow\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n \n)) by (instance, cf) ", "format": 
"time_series", - "intervalFactor": 2, - "legendFormat": "95%", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{cf}}_flush_flow-{{instance}}", "metric": "", - "refId": "B", - "step": 10 + "query": "sum((\n tikv_scheduler_flush_flow\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n \n)) by (instance, cf) ", + "refId": "", + "step": 10, + "target": "" }, { - "expr": "sum(rate(tikv_scheduler_command_duration_seconds_sum{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", type=\"$command\"}[1m])) / sum(rate(tikv_scheduler_command_duration_seconds_count{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", type=\"$command\"}[1m])) ", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum((\n tikv_scheduler_l0_flow\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n \n)) by (instance) ", "format": "time_series", - "intervalFactor": 2, - "legendFormat": "avg", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "total_l0_flow-{{instance}}", + "metric": "", + "query": "sum((\n tikv_scheduler_l0_flow\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n \n)) by (instance) ", + "refId": "", + "step": 10, + "target": "" + }, + { + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum((\n tikv_scheduler_flush_flow\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n \n)) by (instance) ", + "format": "time_series", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "total_flush_flow-{{instance}}", "metric": "", - "refId": "C", - "step": 10 + "query": "sum((\n tikv_scheduler_flush_flow\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n \n)) by (instance) ", + "refId": "", + "step": 10, + 
"target": "" } ], "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, - "title": "Scheduler command duration", + "title": "Flush/L0 flow", "tooltip": { - "msResolution": false, + "msResolution": true, "shared": true, "sort": 0, - "value_type": "cumulative" + "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -20678,7 +27779,8 @@ }, "yaxes": [ { - "format": "s", + "decimals": null, + "format": "bytes", "label": null, "logBase": 1, "max": null, @@ -20686,6 +27788,7 @@ "show": true }, { + "decimals": null, "format": "short", "label": null, "logBase": 1, @@ -20696,116 +27799,142 @@ ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } }, { "aliasColors": {}, "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", - "decimals": 1, - "description": "The time which is caused by latch wait in commit command", + "description": "", "editable": true, "error": false, "fieldConfig": { - "defaults": {}, - "overrides": [] + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } }, "fill": 1, - "fillGradient": 0, - "grid": {}, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, "gridPos": { - "h": 8, + "h": 7, "w": 12, - "x": 12, - "y": 28 + "x": 0, + "y": 21 }, - "hiddenSeries": false, - "id": 194, + "height": null, + "hideTimeOverride": false, + "id": 204, + "interval": null, + "isNew": true, "legend": { "alignAsTable": true, "avg": false, "current": true, "hideEmpty": true, - "hideZero": false, + "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, - "sort": "current", + "sort": "max", "sortDesc": true, "total": false, "values": true }, "lines": true, - "linewidth": 2, + 
"linewidth": 1, "links": [], - "nullPointMode": "null", + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", "options": { - "alertThreshold": true + "alertThreshold": true, + "dataLinks": [] }, "percentage": false, - "pluginVersion": "7.5.7", "pointradius": 5, "points": false, "renderer": "flot", - "scopedVars": { - "command": { - "selected": false, - "text": "acquire_pessimistic_lock", - "value": "acquire_pessimistic_lock" - } - }, + "repeat": null, + "repeatDirection": null, "seriesOverrides": [], - "spaceLength": 10, + "span": null, "stack": false, "steppedLine": false, "targets": [ { - "expr": "histogram_quantile(0.99, sum(rate(tikv_scheduler_latch_wait_duration_seconds_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", type=\"$command\"}[1m])) by (le))", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "max((\n tikv_scheduler_l0\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n \n)) by (instance) ", "format": "time_series", - "intervalFactor": 2, - "legendFormat": "99%", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "l0-{{instance}}", "metric": "", - "refId": "A", - "step": 10 + "query": "max((\n tikv_scheduler_l0\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n \n)) by (instance) ", + "refId": "", + "step": 10, + "target": "" }, { - "expr": "histogram_quantile(0.95, sum(rate(tikv_scheduler_latch_wait_duration_seconds_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", type=\"$command\"}[1m])) by (le))", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "max((\n tikv_scheduler_memtable\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n \n)) by (instance) ", "format": "time_series", - "intervalFactor": 2, - "legendFormat": "95%", + "hide": false, + "instant": false, + 
"interval": "", + "intervalFactor": 1, + "legendFormat": "memtable-{{instance}}", "metric": "", - "refId": "B", - "step": 10 + "query": "max((\n tikv_scheduler_memtable\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n \n)) by (instance) ", + "refId": "", + "step": 10, + "target": "" }, { - "expr": "sum(rate(tikv_scheduler_latch_wait_duration_seconds_sum{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", type=\"$command\"}[1m])) / sum(rate(tikv_scheduler_latch_wait_duration_seconds_count{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", type=\"$command\"}[1m])) ", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "max((\n tikv_scheduler_l0_avg\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n \n)) by (instance) ", "format": "time_series", - "intervalFactor": 2, - "legendFormat": "avg", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "avg_l0-{{instance}}", "metric": "", - "refId": "C", - "step": 10 + "query": "max((\n tikv_scheduler_l0_avg\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n \n)) by (instance) ", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, - "title": "Scheduler latch wait duration", + "title": "Flow controller factors", "tooltip": { - "msResolution": false, + "msResolution": true, "shared": true, "sort": 0, - "value_type": "cumulative" + "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -20813,7 +27942,8 @@ }, "yaxes": [ { - "format": "s", + "decimals": null, + "format": "none", "label": null, "logBase": 1, "max": null, @@ -20821,6 +27951,7 @@ "show": true }, { + "decimals": null, "format": "short", "label": null, 
"logBase": 1, @@ -20831,116 +27962,127 @@ ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } }, { "aliasColors": {}, "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", - "decimals": 1, - "description": "The count of keys read by a commit command", + "description": "", "editable": true, "error": false, "fieldConfig": { - "defaults": {}, - "overrides": [] + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } }, "fill": 1, - "fillGradient": 0, - "grid": {}, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, "gridPos": { - "h": 8, + "h": 7, "w": 12, - "x": 0, - "y": 36 + "x": 12, + "y": 21 }, - "hiddenSeries": false, - "id": 195, + "height": null, + "hideTimeOverride": false, + "id": 205, + "interval": null, + "isNew": true, "legend": { "alignAsTable": true, "avg": false, "current": true, "hideEmpty": true, - "hideZero": false, + "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, - "sort": "current", + "sort": "max", "sortDesc": true, "total": false, "values": true }, "lines": true, - "linewidth": 2, + "linewidth": 1, "links": [], - "nullPointMode": "null", + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", "options": { - "alertThreshold": true + "alertThreshold": true, + "dataLinks": [] }, "percentage": false, - "pluginVersion": "7.5.7", "pointradius": 5, "points": false, "renderer": "flot", - "scopedVars": { - "command": { - "selected": false, - "text": "acquire_pessimistic_lock", - "value": "acquire_pessimistic_lock" - } - }, + "repeat": null, + "repeatDirection": null, "seriesOverrides": [], - "spaceLength": 10, + "span": null, "stack": false, "steppedLine": false, "targets": [ { - "expr": "histogram_quantile(0.99, 
sum(rate(tikv_scheduler_kv_command_key_read_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", type=\"$command\"}[1m])) by (le))", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "99%", - "metric": "kv_command_key", - "refId": "A", - "step": 10 - }, - { - "expr": "histogram_quantile(0.95, sum(rate(tikv_scheduler_kv_command_key_read_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", type=\"$command\"}[1m])) by (le))", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum((\n tikv_engine_pending_compaction_bytes\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"kv\"}\n \n)) by (cf) ", "format": "time_series", - "intervalFactor": 2, - "legendFormat": "95%", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{cf}}", "metric": "", - "refId": "B", - "step": 10 + "query": "sum((\n tikv_engine_pending_compaction_bytes\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"kv\"}\n \n)) by (cf) ", + "refId": "", + "step": 10, + "target": "" }, { - "expr": "sum(rate(tikv_scheduler_kv_command_key_read_sum{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", type=\"$command\"}[1m])) / sum(rate(tikv_scheduler_kv_command_key_read_count{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", type=\"$command\"}[1m])) ", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum((\n tikv_scheduler_pending_compaction_bytes\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n \n)) by (cf) / 10000000", "format": "time_series", - "intervalFactor": 2, - "legendFormat": "avg", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "pending-bytes-{{instance}}", "metric": "", - "refId": "C", - "step": 10 + "query": "sum((\n 
tikv_scheduler_pending_compaction_bytes\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n \n)) by (cf) / 10000000", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, - "title": "Scheduler keys read", + "title": "Compaction pending bytes", "tooltip": { - "msResolution": false, + "msResolution": true, "shared": true, "sort": 0, - "value_type": "cumulative" + "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -20948,7 +28090,8 @@ }, "yaxes": [ { - "format": "short", + "decimals": null, + "format": "bytes", "label": null, "logBase": 1, "max": null, @@ -20956,6 +28099,7 @@ "show": true }, { + "decimals": null, "format": "short", "label": null, "logBase": 1, @@ -20966,116 +28110,112 @@ ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } }, { "aliasColors": {}, "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", - "decimals": 1, - "description": "The count of keys written by a commit command", + "description": "Throttle time for txn storage commands in 1 minute.", "editable": true, "error": false, "fieldConfig": { - "defaults": {}, - "overrides": [] + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } }, "fill": 1, - "fillGradient": 0, - "grid": {}, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, "gridPos": { - "h": 8, + "h": 7, "w": 12, - "x": 12, - "y": 36 + "x": 0, + "y": 28 }, - "hiddenSeries": false, - "id": 373, + "height": null, + "hideTimeOverride": false, + "id": 206, + "interval": null, + "isNew": true, "legend": { "alignAsTable": true, "avg": false, "current": true, "hideEmpty": true, - 
"hideZero": false, + "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, - "sort": "current", + "sort": "max", "sortDesc": true, "total": false, "values": true }, "lines": true, - "linewidth": 2, + "linewidth": 1, "links": [], - "nullPointMode": "null", + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", "options": { - "alertThreshold": true + "alertThreshold": true, + "dataLinks": [] }, "percentage": false, - "pluginVersion": "7.5.7", "pointradius": 5, "points": false, "renderer": "flot", - "scopedVars": { - "command": { - "selected": false, - "text": "acquire_pessimistic_lock", - "value": "acquire_pessimistic_lock" - } - }, + "repeat": null, + "repeatDirection": null, "seriesOverrides": [], - "spaceLength": 10, + "span": null, "stack": false, "steppedLine": false, "targets": [ { - "expr": "histogram_quantile(0.99, sum(rate(tikv_scheduler_kv_command_key_write_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", type=\"$command\"}[1m])) by (le))", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "99%", - "metric": "kv_command_key", - "refId": "A", - "step": 10 - }, - { - "expr": "histogram_quantile(0.95, sum(rate(tikv_scheduler_kv_command_key_write_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", type=\"$command\"}[1m])) by (le))", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "95%", - "metric": "", - "refId": "B", - "step": 10 - }, - { - "expr": "sum(rate(tikv_scheduler_kv_command_key_write_sum{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", type=\"$command\"}[1m])) / sum(rate(tikv_scheduler_kv_command_key_write_count{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", type=\"$command\"}[1m])) ", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n 
tikv_txn_command_throttle_time_total\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (type) ", "format": "time_series", - "intervalFactor": 2, - "legendFormat": "avg", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{type}}", "metric": "", - "refId": "C", - "step": 10 + "query": "sum(rate(\n tikv_txn_command_throttle_time_total\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (type) ", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, - "title": "Scheduler keys written", + "title": "Txn command throttled duration", "tooltip": { - "msResolution": false, + "msResolution": true, "shared": true, "sort": 0, - "value_type": "cumulative" + "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -21083,7 +28223,8 @@ }, "yaxes": [ { - "format": "short", + "decimals": null, + "format": "\u00b5s", "label": null, "logBase": 1, "max": null, @@ -21091,6 +28232,7 @@ "show": true }, { + "decimals": null, "format": "short", "label": null, "logBase": 1, @@ -21101,93 +28243,112 @@ ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } }, { "aliasColors": {}, "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", - "decimals": 1, - "description": "The keys scan details of each CF when executing commit command", + "description": "Throttle time for non-txn related processing like analyze or dag in 1 minute.", "editable": true, "error": false, "fieldConfig": { - "defaults": {}, - "overrides": [] + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } }, "fill": 1, - "grid": {}, + "fillGradient": 1, + "grid": { + 
"threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, "gridPos": { - "h": 8, + "h": 7, "w": 12, - "x": 0, - "y": 44 + "x": 12, + "y": 28 }, - "id": 560, + "height": null, + "hideTimeOverride": false, + "id": 207, + "interval": null, + "isNew": true, "legend": { "alignAsTable": true, "avg": false, "current": true, "hideEmpty": true, - "hideZero": false, + "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, - "sort": "current", + "sort": "max", "sortDesc": true, "total": false, "values": true }, "lines": true, - "linewidth": 2, + "linewidth": 1, "links": [], - "nullPointMode": "null", + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", + "options": { + "alertThreshold": true, + "dataLinks": [] + }, "percentage": false, - "pluginVersion": "7.5.7", "pointradius": 5, "points": false, "renderer": "flot", - "scopedVars": { - "command": { - "selected": false, - "text": "acquire_pessimistic_lock", - "value": "acquire_pessimistic_lock" - } - }, + "repeat": null, + "repeatDirection": null, "seriesOverrides": [], - "spaceLength": 10, + "span": null, "stack": false, "steppedLine": false, "targets": [ { - "expr": "sum(rate(tikv_scheduler_kv_scan_details{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", req=\"$command\"}[1m])) by (tag)", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_non_txn_command_throttle_time_total\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (type) ", "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{tag}}", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{type}}", "metric": "", - "refId": "A", - "step": 10 + "query": "sum(rate(\n tikv_non_txn_command_throttle_time_total\n 
{k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (type) ", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, - "title": "Scheduler scan details", + "title": "Non-txn command throttled duration", "tooltip": { - "msResolution": false, + "msResolution": true, "shared": true, "sort": 0, - "value_type": "cumulative" + "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -21195,7 +28356,8 @@ }, "yaxes": [ { - "format": "short", + "decimals": null, + "format": "\u00b5s", "label": null, "logBase": 1, "max": null, @@ -21203,6 +28365,7 @@ "show": true }, { + "decimals": null, "format": "short", "label": null, "logBase": 1, @@ -21213,93 +28376,169 @@ ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } - }, + } + ], + "repeat": null, + "repeatDirection": null, + "span": null, + "targets": [], + "timeFrom": null, + "timeShift": null, + "title": "Flow Control", + "transformations": [], + "transparent": false, + "type": "row" + }, + { + "cacheTimeout": null, + "collapsed": true, + "datasource": null, + "description": null, + "editable": true, + "error": false, + "fieldConfig": { + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } + }, + "gridPos": { + "h": 7, + "w": 24, + "x": 0, + "y": 0 + }, + "height": null, + "hideTimeOverride": false, + "id": 208, + "interval": null, + "links": [], + "maxDataPoints": 100, + "maxPerRow": null, + "minSpan": null, + "panels": [ { "aliasColors": {}, "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", - "decimals": 1, - "description": "The keys scan details of lock CF when executing commit command", + "description": "The total number of commands on each stage in commit command", 
"editable": true, "error": false, "fieldConfig": { - "defaults": {}, - "overrides": [] + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } }, "fill": 1, - "grid": {}, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 44 + "h": 7, + "w": 24, + "x": 0, + "y": 0 }, - "id": 675, + "height": null, + "hideTimeOverride": false, + "id": 209, + "interval": null, + "isNew": true, "legend": { "alignAsTable": true, "avg": false, "current": true, "hideEmpty": true, - "hideZero": false, + "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, - "sort": "current", + "sort": "max", "sortDesc": true, "total": false, "values": true }, "lines": true, - "linewidth": 2, + "linewidth": 1, "links": [], - "nullPointMode": "null", + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", + "options": { + "alertThreshold": true, + "dataLinks": [] + }, "percentage": false, - "pluginVersion": "7.5.7", "pointradius": 5, "points": false, "renderer": "flot", - "scopedVars": { - "command": { - "selected": false, - "text": "acquire_pessimistic_lock", - "value": "acquire_pessimistic_lock" - } - }, + "repeat": null, + "repeatDirection": null, "seriesOverrides": [], - "spaceLength": 10, + "span": null, "stack": false, "steppedLine": false, "targets": [ { - "expr": "sum(rate(tikv_scheduler_kv_scan_details{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", req=\"$command\", cf=\"lock\"}[1m])) by (tag)", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_scheduler_too_busy_total\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",type=\"$command\"}\n [$__rate_interval]\n)) by (instance) ", "format": "time_series", - 
"intervalFactor": 2, - "legendFormat": "{{tag}}", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "busy-{{instance}}", + "metric": "", + "query": "sum(rate(\n tikv_scheduler_too_busy_total\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",type=\"$command\"}\n [$__rate_interval]\n)) by (instance) ", + "refId": "", + "step": 10, + "target": "" + }, + { + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_scheduler_stage_total\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",type=\"$command\"}\n [$__rate_interval]\n)) by (stage) ", + "format": "time_series", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{stage}}", "metric": "", - "refId": "A", - "step": 10 + "query": "sum(rate(\n tikv_scheduler_stage_total\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",type=\"$command\"}\n [$__rate_interval]\n)) by (stage) ", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, - "title": "Scheduler scan details [lock]", + "title": "Scheduler stage total", "tooltip": { - "msResolution": false, + "msResolution": true, "shared": true, "sort": 0, - "value_type": "cumulative" + "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -21307,7 +28546,8 @@ }, "yaxes": [ { - "format": "short", + "decimals": null, + "format": "ops", "label": null, "logBase": 1, "max": null, @@ -21315,6 +28555,7 @@ "show": true }, { + "decimals": null, "format": "short", "label": null, "logBase": 1, @@ -21325,93 +28566,180 @@ ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } }, { "aliasColors": {}, "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, 
"datasource": "${DS_TEST-CLUSTER}", - "decimals": 1, - "description": "The keys scan details of write CF when executing commit command", + "description": "The time consumed when executing commit command", "editable": true, "error": false, "fieldConfig": { - "defaults": {}, - "overrides": [] + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } }, "fill": 1, - "grid": {}, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, "gridPos": { - "h": 8, + "h": 7, "w": 12, "x": 0, - "y": 52 + "y": 7 }, - "id": 829, + "height": null, + "hideTimeOverride": false, + "id": 210, + "interval": null, + "isNew": true, "legend": { "alignAsTable": true, "avg": false, "current": true, "hideEmpty": true, - "hideZero": false, + "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, - "sort": "current", + "sort": "max", "sortDesc": true, "total": false, "values": true }, "lines": true, - "linewidth": 2, + "linewidth": 1, "links": [], - "nullPointMode": "null", + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", + "options": { + "alertThreshold": true, + "dataLinks": [] + }, "percentage": false, - "pluginVersion": "7.5.7", "pointradius": 5, "points": false, "renderer": "flot", - "scopedVars": { - "command": { - "selected": false, - "text": "acquire_pessimistic_lock", - "value": "acquire_pessimistic_lock" + "repeat": null, + "repeatDirection": null, + "seriesOverrides": [ + { + "alias": "count", + "bars": false, + "dashLength": 1, + "dashes": true, + "fill": 2, + "fillBelowTo": null, + "lines": true, + "spaceLength": 1, + "transform": "negative-Y", + "yaxis": 2, + "zindex": -3 + }, + { + "alias": "avg", + "bars": false, + "fill": 7, + "fillBelowTo": null, + "lines": true, + "yaxis": 1, + "zindex": 0 } - }, - "seriesOverrides": [], - "spaceLength": 10, + 
], + "span": null, "stack": false, "steppedLine": false, "targets": [ { - "expr": "sum(rate(tikv_scheduler_kv_scan_details{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", req=\"$command\", cf=\"write\"}[1m])) by (tag)", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "histogram_quantile(0.9999,(\n sum(rate(\n tikv_scheduler_command_duration_seconds_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",type=\"$command\"}\n [$__rate_interval]\n)) by (le) \n \n \n)) ", "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{tag}}", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "99.99%", + "metric": "", + "query": "histogram_quantile(0.9999,(\n sum(rate(\n tikv_scheduler_command_duration_seconds_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",type=\"$command\"}\n [$__rate_interval]\n)) by (le) \n \n \n)) ", + "refId": "", + "step": 10, + "target": "" + }, + { + "datasource": "${DS_TEST-CLUSTER}", + "expr": "histogram_quantile(0.99,(\n sum(rate(\n tikv_scheduler_command_duration_seconds_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",type=\"$command\"}\n [$__rate_interval]\n)) by (le) \n \n \n)) ", + "format": "time_series", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "99%", + "metric": "", + "query": "histogram_quantile(0.99,(\n sum(rate(\n tikv_scheduler_command_duration_seconds_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",type=\"$command\"}\n [$__rate_interval]\n)) by (le) \n \n \n)) ", + "refId": "", + "step": 10, + "target": "" + }, + { + "datasource": "${DS_TEST-CLUSTER}", + "expr": "(sum(rate(\n tikv_scheduler_command_duration_seconds_sum\n 
{k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",type=\"$command\"}\n [$__rate_interval]\n)) / sum(rate(\n tikv_scheduler_command_duration_seconds_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",type=\"$command\"}\n [$__rate_interval]\n)) )", + "format": "time_series", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "avg", + "metric": "", + "query": "(sum(rate(\n tikv_scheduler_command_duration_seconds_sum\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",type=\"$command\"}\n [$__rate_interval]\n)) / sum(rate(\n tikv_scheduler_command_duration_seconds_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",type=\"$command\"}\n [$__rate_interval]\n)) )", + "refId": "", + "step": 10, + "target": "" + }, + { + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_scheduler_command_duration_seconds_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",type=\"$command\"}\n [$__rate_interval]\n)) ", + "format": "time_series", + "hide": true, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "count", "metric": "", - "refId": "A", - "step": 10 + "query": "sum(rate(\n tikv_scheduler_command_duration_seconds_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",type=\"$command\"}\n [$__rate_interval]\n)) ", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, - "title": "Scheduler scan details [write]", + "title": "Scheduler command duration", "tooltip": { - "msResolution": false, + "msResolution": true, "shared": true, "sort": 0, - "value_type": "cumulative" + "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, 
"mode": "time", "name": null, "show": true, @@ -21419,7 +28747,8 @@ }, "yaxes": [ { - "format": "short", + "decimals": null, + "format": "s", "label": null, "logBase": 1, "max": null, @@ -21427,6 +28756,7 @@ "show": true }, { + "decimals": null, "format": "short", "label": null, "logBase": 1, @@ -21437,93 +28767,180 @@ ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } }, { "aliasColors": {}, "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", - "decimals": 1, - "description": "The keys scan details of default CF when executing commit command", + "description": "The time which is caused by latch wait in commit command", "editable": true, "error": false, "fieldConfig": { - "defaults": {}, - "overrides": [] + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } }, "fill": 1, - "grid": {}, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, "gridPos": { - "h": 8, + "h": 7, "w": 12, "x": 12, - "y": 52 + "y": 7 }, - "id": 830, + "height": null, + "hideTimeOverride": false, + "id": 211, + "interval": null, + "isNew": true, "legend": { "alignAsTable": true, "avg": false, "current": true, "hideEmpty": true, - "hideZero": false, + "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, - "sort": "current", + "sort": "max", "sortDesc": true, "total": false, "values": true }, "lines": true, - "linewidth": 2, + "linewidth": 1, "links": [], - "nullPointMode": "null", + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", + "options": { + "alertThreshold": true, + "dataLinks": [] + }, "percentage": false, - "pluginVersion": "7.5.7", "pointradius": 5, "points": false, "renderer": "flot", - "scopedVars": { - "command": { - "selected": false, - "text": 
"acquire_pessimistic_lock", - "value": "acquire_pessimistic_lock" + "repeat": null, + "repeatDirection": null, + "seriesOverrides": [ + { + "alias": "count", + "bars": false, + "dashLength": 1, + "dashes": true, + "fill": 2, + "fillBelowTo": null, + "lines": true, + "spaceLength": 1, + "transform": "negative-Y", + "yaxis": 2, + "zindex": -3 + }, + { + "alias": "avg", + "bars": false, + "fill": 7, + "fillBelowTo": null, + "lines": true, + "yaxis": 1, + "zindex": 0 } - }, - "seriesOverrides": [], - "spaceLength": 10, + ], + "span": null, "stack": false, "steppedLine": false, "targets": [ { - "expr": "sum(rate(tikv_scheduler_kv_scan_details{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", req=\"$command\", cf=\"default\"}[1m])) by (tag)", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "histogram_quantile(0.9999,(\n sum(rate(\n tikv_scheduler_latch_wait_duration_seconds_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",type=\"$command\"}\n [$__rate_interval]\n)) by (le) \n \n \n)) ", "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{tag}}", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "99.99%", + "metric": "", + "query": "histogram_quantile(0.9999,(\n sum(rate(\n tikv_scheduler_latch_wait_duration_seconds_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",type=\"$command\"}\n [$__rate_interval]\n)) by (le) \n \n \n)) ", + "refId": "", + "step": 10, + "target": "" + }, + { + "datasource": "${DS_TEST-CLUSTER}", + "expr": "histogram_quantile(0.99,(\n sum(rate(\n tikv_scheduler_latch_wait_duration_seconds_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",type=\"$command\"}\n [$__rate_interval]\n)) by (le) \n \n \n)) ", + "format": "time_series", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + 
"legendFormat": "99%", + "metric": "", + "query": "histogram_quantile(0.99,(\n sum(rate(\n tikv_scheduler_latch_wait_duration_seconds_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",type=\"$command\"}\n [$__rate_interval]\n)) by (le) \n \n \n)) ", + "refId": "", + "step": 10, + "target": "" + }, + { + "datasource": "${DS_TEST-CLUSTER}", + "expr": "(sum(rate(\n tikv_scheduler_latch_wait_duration_seconds_sum\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",type=\"$command\"}\n [$__rate_interval]\n)) / sum(rate(\n tikv_scheduler_latch_wait_duration_seconds_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",type=\"$command\"}\n [$__rate_interval]\n)) )", + "format": "time_series", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "avg", + "metric": "", + "query": "(sum(rate(\n tikv_scheduler_latch_wait_duration_seconds_sum\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",type=\"$command\"}\n [$__rate_interval]\n)) / sum(rate(\n tikv_scheduler_latch_wait_duration_seconds_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",type=\"$command\"}\n [$__rate_interval]\n)) )", + "refId": "", + "step": 10, + "target": "" + }, + { + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_scheduler_latch_wait_duration_seconds_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",type=\"$command\"}\n [$__rate_interval]\n)) ", + "format": "time_series", + "hide": true, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "count", "metric": "", - "refId": "A", - "step": 10 + "query": "sum(rate(\n tikv_scheduler_latch_wait_duration_seconds_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",type=\"$command\"}\n 
[$__rate_interval]\n)) ", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, - "title": "Scheduler scan details [default]", + "title": "Scheduler latch wait duration", "tooltip": { - "msResolution": false, + "msResolution": true, "shared": true, "sort": 0, - "value_type": "cumulative" + "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -21531,7 +28948,8 @@ }, "yaxes": [ { - "format": "short", + "decimals": null, + "format": "s", "label": null, "logBase": 1, "max": null, @@ -21539,6 +28957,7 @@ "show": true }, { + "decimals": null, "format": "short", "label": null, "logBase": 1, @@ -21549,125 +28968,180 @@ ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } }, { "aliasColors": {}, "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", - "decimals": 1, - "description": "The time consumed on reading when executing commit command", + "description": "The count of keys read by a commit command", "editable": true, "error": false, "fieldConfig": { - "defaults": {}, - "overrides": [] + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } }, "fill": 1, - "fillGradient": 0, - "grid": {}, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, "gridPos": { - "h": 8, + "h": 7, "w": 12, "x": 0, - "y": 63 + "y": 14 }, - "hiddenSeries": false, - "id": 23763572710, + "height": null, + "hideTimeOverride": false, + "id": 212, + "interval": null, + "isNew": true, "legend": { "alignAsTable": true, "avg": false, "current": true, "hideEmpty": true, - "hideZero": false, + "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, - 
"sort": "current", + "sort": "max", "sortDesc": true, "total": false, "values": true }, "lines": true, - "linewidth": 2, + "linewidth": 1, "links": [], - "nullPointMode": "null", + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", "options": { - "alertThreshold": true + "alertThreshold": true, + "dataLinks": [] }, "percentage": false, - "pluginVersion": "7.5.11", "pointradius": 5, "points": false, "renderer": "flot", - "scopedVars": { - "command": { - "selected": false, - "text": "acquire_pessimistic_lock", - "value": "acquire_pessimistic_lock" + "repeat": null, + "repeatDirection": null, + "seriesOverrides": [ + { + "alias": "count", + "bars": false, + "dashLength": 1, + "dashes": true, + "fill": 2, + "fillBelowTo": null, + "lines": true, + "spaceLength": 1, + "transform": "negative-Y", + "yaxis": 2, + "zindex": -3 + }, + { + "alias": "avg", + "bars": false, + "fill": 7, + "fillBelowTo": null, + "lines": true, + "yaxis": 1, + "zindex": 0 } - }, - "seriesOverrides": [], - "spaceLength": 10, + ], + "span": null, "stack": false, "steppedLine": false, "targets": [ { - "exemplar": true, - "expr": "histogram_quantile(0.99, sum(rate(tikv_scheduler_processing_read_duration_seconds_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", type=\"$command\"}[1m])) by (le))", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "histogram_quantile(0.9999,(\n sum(rate(\n tikv_scheduler_kv_command_key_read_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",type=\"$command\"}\n [$__rate_interval]\n)) by (le) \n \n \n)) ", "format": "time_series", "hide": false, + "instant": false, "interval": "", - "intervalFactor": 2, - "legendFormat": "99%", + "intervalFactor": 1, + "legendFormat": "99.99%", "metric": "", - "refId": "A", - "step": 10 + "query": "histogram_quantile(0.9999,(\n sum(rate(\n tikv_scheduler_kv_command_key_read_bucket\n 
{k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",type=\"$command\"}\n [$__rate_interval]\n)) by (le) \n \n \n)) ", + "refId": "", + "step": 10, + "target": "" }, { - "exemplar": true, - "expr": "histogram_quantile(0.95, sum(rate(tikv_scheduler_processing_read_duration_seconds_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", type=\"$command\"}[1m])) by (le))", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "histogram_quantile(0.99,(\n sum(rate(\n tikv_scheduler_kv_command_key_read_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",type=\"$command\"}\n [$__rate_interval]\n)) by (le) \n \n \n)) ", "format": "time_series", "hide": false, + "instant": false, "interval": "", - "intervalFactor": 2, - "legendFormat": "95%", + "intervalFactor": 1, + "legendFormat": "99%", "metric": "", - "refId": "B", - "step": 10 + "query": "histogram_quantile(0.99,(\n sum(rate(\n tikv_scheduler_kv_command_key_read_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",type=\"$command\"}\n [$__rate_interval]\n)) by (le) \n \n \n)) ", + "refId": "", + "step": 10, + "target": "" }, { - "exemplar": true, - "expr": "sum(rate(tikv_scheduler_processing_read_duration_seconds_sum{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", type=\"$command\"}[1m])) / sum(rate(tikv_scheduler_processing_read_duration_seconds_count{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", type=\"$command\"}[1m])) ", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "(sum(rate(\n tikv_scheduler_kv_command_key_read_sum\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",type=\"$command\"}\n [$__rate_interval]\n)) / sum(rate(\n tikv_scheduler_kv_command_key_read_count\n 
{k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",type=\"$command\"}\n [$__rate_interval]\n)) )", "format": "time_series", "hide": false, + "instant": false, "interval": "", - "intervalFactor": 2, + "intervalFactor": 1, "legendFormat": "avg", "metric": "", - "refId": "C", - "step": 10 + "query": "(sum(rate(\n tikv_scheduler_kv_command_key_read_sum\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",type=\"$command\"}\n [$__rate_interval]\n)) / sum(rate(\n tikv_scheduler_kv_command_key_read_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",type=\"$command\"}\n [$__rate_interval]\n)) )", + "refId": "", + "step": 10, + "target": "" + }, + { + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_scheduler_kv_command_key_read_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",type=\"$command\"}\n [$__rate_interval]\n)) ", + "format": "time_series", + "hide": true, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "count", + "metric": "", + "query": "sum(rate(\n tikv_scheduler_kv_command_key_read_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",type=\"$command\"}\n [$__rate_interval]\n)) ", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, - "title": "Scheduler command read duration", + "title": "Scheduler keys read", "tooltip": { - "msResolution": false, + "msResolution": true, "shared": true, "sort": 0, - "value_type": "cumulative" + "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -21675,7 +29149,8 @@ }, "yaxes": [ { - "format": "s", + "decimals": null, + "format": "none", "label": null, "logBase": 1, "max": null, @@ -21683,6 +29158,7 @@ 
"show": true }, { + "decimals": null, "format": "short", "label": null, "logBase": 1, @@ -21693,133 +29169,56 @@ ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } }, { - "cards": { - "cardPadding": null, - "cardRound": null - }, - "color": { - "cardColor": "#b4ff00", - "colorScale": "sqrt", - "colorScheme": "interpolateSpectral", - "exponent": 0.5, - "mode": "spectrum" - }, - "dataFormat": "tsbuckets", + "aliasColors": {}, + "bars": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", - "description": "The time consumed on checking memory locks", + "description": "The count of keys written by a commit command", + "editable": true, + "error": false, "fieldConfig": { - "defaults": {}, - "overrides": [] - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 63 - }, - "heatmap": {}, - "hideZeroBuckets": true, - "highlightCards": true, - "id": 7236, - "legend": { - "show": false - }, - "links": [], - "repeat": null, - "repeatDirection": null, - "repeatedByRow": true, - "reverseYBuckets": false, - "scopedVars": { - "command": { - "selected": false, - "text": "acquire_pessimistic_lock", - "value": "acquire_pessimistic_lock" - } - }, - "targets": [ - { - "expr": "sum(delta(tikv_storage_check_mem_lock_duration_seconds_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", type=\"$command\"}[1m])) by (le)", - "format": "heatmap", - "intervalFactor": 2, - "legendFormat": "{{le}}", - "metric": "", - "refId": "A", - "step": 10 + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } } - ], - "timeFrom": null, - "timeShift": null, - "title": "Check memory locks duration", - "tooltip": { - "show": true, - "showHistogram": false - }, - "type": "heatmap", - "xAxis": { - "show": true - }, - "xBucketNumber": null, - "xBucketSize": null, - "yAxis": { - "decimals": 0, - "format": "s", - "logBase": 1, - "max": null, - "min": null, - "show": true, - "splitFactor": null }, - "yBucketBound": 
"upper", - "yBucketNumber": null, - "yBucketSize": null - } - ], - "repeat": "command", - "title": "Scheduler - $command", - "type": "row" - }, - { - "collapsed": true, - "datasource": null, - "gridPos": { - "h": 1, - "w": 24, - "x": 0, - "y": 26 - }, - "id": 2755, - "panels": [ - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${DS_TEST-CLUSTER}", - "decimals": 1, - "description": "The total number of commands on each stage", "fill": 1, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, "gridPos": { - "h": 10, + "h": 7, "w": 12, - "x": 0, - "y": 15 + "x": 12, + "y": 14 }, - "height": "400", - "id": 167, + "height": null, + "hideTimeOverride": false, + "id": 213, + "interval": null, + "isNew": true, "legend": { "alignAsTable": true, "avg": false, "current": true, + "hideEmpty": true, + "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, - "sort": "current", + "sort": "max", "sortDesc": true, "total": false, "values": true @@ -21827,47 +29226,123 @@ "lines": true, "linewidth": 1, "links": [], - "maxPerRow": 1, - "nullPointMode": "null", + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", + "options": { + "alertThreshold": true, + "dataLinks": [] + }, "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", - "seriesOverrides": [], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "sum(rate(tikv_scheduler_too_busy_total{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])) by (stage)", + "repeat": null, + "repeatDirection": null, + "seriesOverrides": [ + { + "alias": "count", + "bars": false, + "dashLength": 1, + "dashes": true, + "fill": 2, + "fillBelowTo": null, + "lines": true, + "spaceLength": 1, + 
"transform": "negative-Y", + "yaxis": 2, + "zindex": -3 + }, + { + "alias": "avg", + "bars": false, + "fill": 7, + "fillBelowTo": null, + "lines": true, + "yaxis": 1, + "zindex": 0 + } + ], + "span": null, + "stack": false, + "steppedLine": false, + "targets": [ + { + "datasource": "${DS_TEST-CLUSTER}", + "expr": "histogram_quantile(0.9999,(\n sum(rate(\n tikv_scheduler_kv_command_key_write_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",type=\"$command\"}\n [$__rate_interval]\n)) by (le) \n \n \n)) ", "format": "time_series", - "intervalFactor": 2, - "legendFormat": "busy", - "refId": "A", - "step": 20 + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "99.99%", + "metric": "", + "query": "histogram_quantile(0.9999,(\n sum(rate(\n tikv_scheduler_kv_command_key_write_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",type=\"$command\"}\n [$__rate_interval]\n)) by (le) \n \n \n)) ", + "refId": "", + "step": 10, + "target": "" }, { - "expr": "sum(rate(tikv_scheduler_stage_total{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])) by (stage)", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "histogram_quantile(0.99,(\n sum(rate(\n tikv_scheduler_kv_command_key_write_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",type=\"$command\"}\n [$__rate_interval]\n)) by (le) \n \n \n)) ", "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{stage}}", - "refId": "B", - "step": 20 + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "99%", + "metric": "", + "query": "histogram_quantile(0.99,(\n sum(rate(\n tikv_scheduler_kv_command_key_write_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",type=\"$command\"}\n [$__rate_interval]\n)) by (le) \n \n \n)) ", + 
"refId": "", + "step": 10, + "target": "" + }, + { + "datasource": "${DS_TEST-CLUSTER}", + "expr": "(sum(rate(\n tikv_scheduler_kv_command_key_write_sum\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",type=\"$command\"}\n [$__rate_interval]\n)) / sum(rate(\n tikv_scheduler_kv_command_key_write_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",type=\"$command\"}\n [$__rate_interval]\n)) )", + "format": "time_series", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "avg", + "metric": "", + "query": "(sum(rate(\n tikv_scheduler_kv_command_key_write_sum\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",type=\"$command\"}\n [$__rate_interval]\n)) / sum(rate(\n tikv_scheduler_kv_command_key_write_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",type=\"$command\"}\n [$__rate_interval]\n)) )", + "refId": "", + "step": 10, + "target": "" + }, + { + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_scheduler_kv_command_key_write_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",type=\"$command\"}\n [$__rate_interval]\n)) ", + "format": "time_series", + "hide": true, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "count", + "metric": "", + "query": "sum(rate(\n tikv_scheduler_kv_command_key_write_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",type=\"$command\"}\n [$__rate_interval]\n)) ", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, - "title": "Scheduler stage total", + "title": "Scheduler keys written", "tooltip": { + "msResolution": true, "shared": true, "sort": 0, "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": 
"graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -21875,7 +29350,8 @@ }, "yaxes": [ { - "format": "ops", + "decimals": null, + "format": "none", "label": null, "logBase": 1, "max": null, @@ -21883,6 +29359,7 @@ "show": true }, { + "decimals": null, "format": "short", "label": null, "logBase": 1, @@ -21893,36 +29370,56 @@ ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } }, { "aliasColors": {}, "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", - "decimals": 1, - "description": "The total writing bytes of commands on each stage", + "description": "The keys scan details of each CF when executing commit command", + "editable": true, + "error": false, + "fieldConfig": { + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } + }, "fill": 1, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, "gridPos": { - "h": 10, + "h": 7, "w": 12, - "x": 12, - "y": 15 + "x": 0, + "y": 21 }, - "height": "400", - "id": 3834, + "height": null, + "hideTimeOverride": false, + "id": 214, + "interval": null, + "isNew": true, "legend": { "alignAsTable": true, "avg": false, "current": true, + "hideEmpty": true, + "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, - "sort": "current", + "sort": "max", "sortDesc": true, "total": false, "values": true @@ -21930,39 +29427,55 @@ "lines": true, "linewidth": 1, "links": [], - "maxPerRow": 1, - "nullPointMode": "null", + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", + "options": { + "alertThreshold": true, + "dataLinks": [] + }, "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", + "repeat": null, + "repeatDirection": null, "seriesOverrides": [], - "spaceLength": 
10, + "span": null, "stack": false, "steppedLine": false, "targets": [ { - "expr": "sum(tikv_scheduler_writing_bytes{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}) by (instance)", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_scheduler_kv_scan_details\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",req=\"$command\"}\n [$__rate_interval]\n)) by (tag) ", "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{instance}}", - "refId": "A", - "step": 20 + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{tag}}", + "metric": "", + "query": "sum(rate(\n tikv_scheduler_kv_scan_details\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",req=\"$command\"}\n [$__rate_interval]\n)) by (tag) ", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, - "title": "Scheduler writing bytes", + "title": "Scheduler scan details", "tooltip": { + "msResolution": true, "shared": true, "sort": 0, "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -21970,7 +29483,8 @@ }, "yaxes": [ { - "format": "bytes", + "decimals": null, + "format": "none", "label": null, "logBase": 1, "max": null, @@ -21978,7 +29492,8 @@ "show": true }, { - "format": "bytes", + "decimals": null, + "format": "short", "label": null, "logBase": 1, "max": null, @@ -21988,81 +29503,112 @@ ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } }, { "aliasColors": {}, "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", - "decimals": 1, - "description": "The count of different priority commands", + "description": "The keys scan details of lock CF when executing commit 
command", "editable": true, "error": false, + "fieldConfig": { + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } + }, "fill": 1, - "grid": {}, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, "gridPos": { - "h": 8, + "h": 7, "w": 12, - "x": 0, - "y": 25 + "x": 12, + "y": 21 }, - "height": "", - "id": 1, + "height": null, + "hideTimeOverride": false, + "id": 215, + "interval": null, + "isNew": true, "legend": { "alignAsTable": true, "avg": false, "current": true, + "hideEmpty": true, + "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, - "sort": "current", + "sort": "max", "sortDesc": true, "total": false, "values": true }, "lines": true, - "linewidth": 2, + "linewidth": 1, "links": [], - "maxPerRow": 2, - "nullPointMode": "null", + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", + "options": { + "alertThreshold": true, + "dataLinks": [] + }, "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", + "repeat": null, + "repeatDirection": null, "seriesOverrides": [], - "spaceLength": 10, + "span": null, "stack": false, "steppedLine": false, "targets": [ { - "expr": "sum(rate(tikv_scheduler_commands_pri_total{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])) by (priority)", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_scheduler_kv_scan_details\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",req=\"$command\", cf=\"lock\"}\n [$__rate_interval]\n)) by (tag) ", "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{priority}}", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{tag}}", "metric": "", - "refId": "A", - "step": 40 + "query": 
"sum(rate(\n tikv_scheduler_kv_scan_details\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",req=\"$command\", cf=\"lock\"}\n [$__rate_interval]\n)) by (tag) ", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, - "title": "Scheduler priority commands", + "title": "Scheduler scan details [lock]", "tooltip": { - "msResolution": false, + "msResolution": true, "shared": true, "sort": 0, - "value_type": "cumulative" + "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -22070,7 +29616,8 @@ }, "yaxes": [ { - "format": "ops", + "decimals": null, + "format": "none", "label": null, "logBase": 1, "max": null, @@ -22078,6 +29625,7 @@ "show": true }, { + "decimals": null, "format": "short", "label": null, "logBase": 1, @@ -22088,123 +29636,112 @@ ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } }, { - "alert": { - "conditions": [ - { - "evaluator": { - "params": [ - 300 - ], - "type": "gt" - }, - "operator": { - "type": "and" - }, - "query": { - "params": [ - "A", - "5m", - "now" - ] - }, - "reducer": { - "params": [], - "type": "avg" - }, - "type": "query" - } - ], - "executionErrorState": "alerting", - "frequency": "120s", - "handler": 1, - "message": "TiKV scheduler context total", - "name": "scheduler pending commands alert", - "noDataState": "ok", - "notifications": [] - }, "aliasColors": {}, "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", - "decimals": 1, - "description": "The count of pending commands per TiKV instance", + "description": "The keys scan details of write CF when executing commit command", "editable": true, "error": false, + "fieldConfig": { + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } + }, "fill": 1, - 
"grid": {}, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, "gridPos": { - "h": 8, + "h": 7, "w": 12, - "x": 12, - "y": 25 + "x": 0, + "y": 28 }, - "height": "", - "id": 193, + "height": null, + "hideTimeOverride": false, + "id": 216, + "interval": null, + "isNew": true, "legend": { "alignAsTable": true, "avg": false, "current": true, + "hideEmpty": true, + "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, - "sort": "current", + "sort": "max", "sortDesc": true, "total": false, "values": true }, "lines": true, - "linewidth": 2, + "linewidth": 1, "links": [], - "maxPerRow": 2, - "nullPointMode": "null", + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", + "options": { + "alertThreshold": true, + "dataLinks": [] + }, "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", + "repeat": null, + "repeatDirection": null, "seriesOverrides": [], - "spaceLength": 10, + "span": null, "stack": false, "steppedLine": false, "targets": [ { - "expr": "sum(tikv_scheduler_contex_total{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}) by (instance)", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_scheduler_kv_scan_details\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",req=\"$command\", cf=\"write\"}\n [$__rate_interval]\n)) by (tag) ", "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{instance}}", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{tag}}", "metric": "", - "refId": "A", - "step": 40 - } - ], - "thresholds": [ - { - "colorMode": "critical", - "fill": true, - "line": true, - "op": "gt", - "value": 300 + "query": "sum(rate(\n tikv_scheduler_kv_scan_details\n 
{k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",req=\"$command\", cf=\"write\"}\n [$__rate_interval]\n)) by (tag) ", + "refId": "", + "step": 10, + "target": "" } ], + "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, - "title": "Scheduler pending commands", + "title": "Scheduler scan details [write]", "tooltip": { - "msResolution": false, + "msResolution": true, "shared": true, "sort": 0, - "value_type": "cumulative" + "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -22212,7 +29749,8 @@ }, "yaxes": [ { - "format": "ops", + "decimals": null, + "format": "none", "label": null, "logBase": 1, "max": null, @@ -22220,6 +29758,7 @@ "show": true }, { + "decimals": null, "format": "short", "label": null, "logBase": 1, @@ -22230,121 +29769,56 @@ ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } }, { - "cards": { - "cardPadding": null, - "cardRound": null - }, - "color": { - "cardColor": "#b4ff00", - "colorScale": "sqrt", - "colorScheme": "interpolateSpectral", - "exponent": 0.5, - "mode": "spectrum" - }, - "dataFormat": "tsbuckets", + "aliasColors": {}, + "bars": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", + "description": "The keys scan details of default CF when executing commit command", + "editable": true, + "error": false, "fieldConfig": { - "defaults": {}, - "overrides": [] - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 45 - }, - "heatmap": {}, - "hideZeroBuckets": true, - "highlightCards": true, - "id": 23763572468, - "legend": { - "show": false - }, - "pluginVersion": "7.5.11", - "reverseYBuckets": false, - "targets": [ - { - "exemplar": true, - "expr": "sum(rate(tikv_yatp_pool_schedule_wait_duration_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", name=~\"sched-worker.*\"}[1m])) by 
(le)", - "format": "heatmap", - "interval": "", - "legendFormat": "{{le}}", - "queryType": "randomWalk", - "refId": "A" + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } } - ], - "timeFrom": null, - "timeShift": null, - "title": "Txn Scheduler Pool Wait Duration", - "tooltip": { - "show": true, - "showHistogram": false - }, - "type": "heatmap", - "xAxis": { - "show": true - }, - "xBucketNumber": null, - "xBucketSize": null, - "yAxis": { - "decimals": 1, - "format": "s", - "logBase": 1, - "max": null, - "min": null, - "show": true, - "splitFactor": null }, - "yBucketBound": "auto", - "yBucketNumber": null, - "yBucketSize": null - } - ], - "repeat": null, - "title": "Scheduler", - "type": "row" - }, - { - "collapsed": true, - "datasource": null, - "gridPos": { - "h": 1, - "w": 24, - "x": 0, - "y": 27 - }, - "id": 2758, - "panels": [ - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${DS_TEST-CLUSTER}", - "decimals": 1, - "description": "The count of GC tasks processed by gc_worker", "fill": 1, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, "gridPos": { "h": 7, "w": 12, - "x": 0, - "y": 46 + "x": 12, + "y": 28 }, - "id": 121, + "height": null, + "hideTimeOverride": false, + "id": 217, + "interval": null, + "isNew": true, "legend": { "alignAsTable": true, "avg": false, "current": true, + "hideEmpty": true, + "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, - "sort": "current", + "sort": "max", "sortDesc": true, "total": false, "values": true @@ -22352,62 +29826,55 @@ "lines": true, "linewidth": 1, "links": [], + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, "nullPointMode": "null as zero", + "options": { + "alertThreshold": true, + "dataLinks": [] + }, "percentage": false, "pointradius": 5, "points": 
false, "renderer": "flot", + "repeat": null, + "repeatDirection": null, "seriesOverrides": [], - "spaceLength": 10, + "span": null, "stack": false, "steppedLine": false, "targets": [ { - "expr": "sum(rate(tikv_gcworker_gc_tasks_vec{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])) by (task)", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "total-{{task}}", - "metric": "tikv_storage_command_total", - "refId": "A", - "step": 4 - }, - { - "expr": "sum(rate(tikv_storage_gc_skipped_counter{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])) by (task)", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_scheduler_kv_scan_details\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",req=\"$command\", cf=\"default\"}\n [$__rate_interval]\n)) by (tag) ", "format": "time_series", - "intervalFactor": 2, - "legendFormat": "skipped-{{task}}", - "metric": "tikv_storage_gc_skipped_counter", - "refId": "B", - "step": 4 - }, - { - "expr": "sum(rate(tikv_gcworker_gc_task_fail_vec{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])) by (task)", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "failed-{{task}}", - "refId": "C" - }, - { - "expr": "sum(rate(tikv_gc_worker_too_busy{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m]))", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "gcworker-too-busy", - "refId": "D" + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{tag}}", + "metric": "", + "query": "sum(rate(\n tikv_scheduler_kv_scan_details\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",req=\"$command\", cf=\"default\"}\n [$__rate_interval]\n)) by (tag) ", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], 
"timeFrom": null, - "timeRegions": [], "timeShift": null, - "title": "GC tasks", + "title": "Scheduler scan details [default]", "tooltip": { + "msResolution": true, "shared": true, "sort": 0, "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -22415,7 +29882,8 @@ }, "yaxes": [ { - "format": "short", + "decimals": null, + "format": "none", "label": null, "logBase": 1, "max": null, @@ -22423,6 +29891,7 @@ "show": true }, { + "decimals": null, "format": "short", "label": null, "logBase": 1, @@ -22433,35 +29902,56 @@ ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } }, { "aliasColors": {}, "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", - "decimals": 1, - "description": "The time consumed when executing GC tasks", + "description": "The time consumed on reading when executing commit command", + "editable": true, + "error": false, + "fieldConfig": { + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } + }, "fill": 1, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, "gridPos": { "h": 7, "w": 12, - "x": 12, - "y": 46 + "x": 0, + "y": 35 }, - "id": 2224, + "height": null, + "hideTimeOverride": false, + "id": 218, + "interval": null, + "isNew": true, "legend": { "alignAsTable": true, "avg": false, "current": true, + "hideEmpty": true, + "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, - "sort": "current", + "sort": "max", "sortDesc": true, "total": false, "values": true @@ -22469,62 +29959,123 @@ "lines": true, "linewidth": 1, "links": [], + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, "nullPointMode": "null as zero", + "options": { + "alertThreshold": 
true, + "dataLinks": [] + }, "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", - "seriesOverrides": [], - "spaceLength": 10, + "repeat": null, + "repeatDirection": null, + "seriesOverrides": [ + { + "alias": "count", + "bars": false, + "dashLength": 1, + "dashes": true, + "fill": 2, + "fillBelowTo": null, + "lines": true, + "spaceLength": 1, + "transform": "negative-Y", + "yaxis": 2, + "zindex": -3 + }, + { + "alias": "avg", + "bars": false, + "fill": 7, + "fillBelowTo": null, + "lines": true, + "yaxis": 1, + "zindex": 0 + } + ], + "span": null, "stack": false, "steppedLine": false, "targets": [ { - "expr": "histogram_quantile(1, sum(rate(tikv_gcworker_gc_task_duration_vec_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])) by (le, task))", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "histogram_quantile(0.9999,(\n sum(rate(\n tikv_scheduler_processing_read_duration_seconds_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",type=\"$command\"}\n [$__rate_interval]\n)) by (le) \n \n \n)) ", "format": "time_series", - "intervalFactor": 2, - "legendFormat": "max-{{task}}", - "metric": "tikv_storage_command_total", - "refId": "A", - "step": 4 + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "99.99%", + "metric": "", + "query": "histogram_quantile(0.9999,(\n sum(rate(\n tikv_scheduler_processing_read_duration_seconds_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",type=\"$command\"}\n [$__rate_interval]\n)) by (le) \n \n \n)) ", + "refId": "", + "step": 10, + "target": "" }, { - "expr": "histogram_quantile(0.99, sum(rate(tikv_gcworker_gc_task_duration_vec_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])) by (le, task))", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "histogram_quantile(0.99,(\n sum(rate(\n 
tikv_scheduler_processing_read_duration_seconds_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",type=\"$command\"}\n [$__rate_interval]\n)) by (le) \n \n \n)) ", "format": "time_series", - "intervalFactor": 2, - "legendFormat": "99%-{{task}}", - "metric": "tikv_storage_gc_skipped_counter", - "refId": "B", - "step": 4 + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "99%", + "metric": "", + "query": "histogram_quantile(0.99,(\n sum(rate(\n tikv_scheduler_processing_read_duration_seconds_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",type=\"$command\"}\n [$__rate_interval]\n)) by (le) \n \n \n)) ", + "refId": "", + "step": 10, + "target": "" }, { - "expr": "histogram_quantile(0.95, sum(rate(tikv_gcworker_gc_task_duration_vec_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])) by (le, task))", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "(sum(rate(\n tikv_scheduler_processing_read_duration_seconds_sum\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",type=\"$command\"}\n [$__rate_interval]\n)) / sum(rate(\n tikv_scheduler_processing_read_duration_seconds_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",type=\"$command\"}\n [$__rate_interval]\n)) )", "format": "time_series", - "intervalFactor": 2, - "legendFormat": "95%-{{task}}", - "refId": "C" + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "avg", + "metric": "", + "query": "(sum(rate(\n tikv_scheduler_processing_read_duration_seconds_sum\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",type=\"$command\"}\n [$__rate_interval]\n)) / sum(rate(\n tikv_scheduler_processing_read_duration_seconds_count\n 
{k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",type=\"$command\"}\n [$__rate_interval]\n)) )", + "refId": "", + "step": 10, + "target": "" }, { - "expr": "sum(rate(tikv_gcworker_gc_task_duration_vec_sum{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])) by (task) / sum(rate(tikv_gcworker_gc_task_duration_vec_count{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])) by (task)", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_scheduler_processing_read_duration_seconds_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",type=\"$command\"}\n [$__rate_interval]\n)) ", "format": "time_series", - "intervalFactor": 2, - "legendFormat": "average-{{task}}", - "refId": "D" + "hide": true, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "count", + "metric": "", + "query": "sum(rate(\n tikv_scheduler_processing_read_duration_seconds_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",type=\"$command\"}\n [$__rate_interval]\n)) ", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, - "title": "GC tasks duration", + "title": "Scheduler command read duration", "tooltip": { + "msResolution": true, "shared": true, "sort": 0, "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -22532,6 +30083,7 @@ }, "yaxes": [ { + "decimals": null, "format": "s", "label": null, "logBase": 1, @@ -22540,6 +30092,7 @@ "show": true }, { + "decimals": null, "format": "short", "label": null, "logBase": 1, @@ -22550,228 +30103,202 @@ ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } }, { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - 
"dashes": false, + "cacheTimeout": null, + "cards": { + "cardPadding": null, + "cardRound": null + }, + "color": { + "cardColor": "#b4ff00", + "colorScale": "sqrt", + "colorScheme": "interpolateSpectral", + "exponent": 0.5, + "max": null, + "min": null, + "mode": "spectrum" + }, + "dataFormat": "tsbuckets", "datasource": "${DS_TEST-CLUSTER}", - "description": "The GC duration", + "description": "The time consumed on checking memory locks", "editable": true, "error": false, - "fill": 1, - "grid": {}, + "fieldConfig": { + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } + }, "gridPos": { "h": 7, "w": 12, - "x": 0, - "y": 53 + "x": 12, + "y": 35 }, - "id": 969, + "heatmap": {}, + "height": null, + "hideTimeOverride": false, + "hideZeroBuckets": true, + "highlightCards": true, + "id": 219, + "interval": null, "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "max": true, - "min": false, - "rightSide": true, - "show": true, - "sort": "current", - "sortDesc": true, - "total": false, - "values": true + "show": false }, - "lines": true, - "linewidth": 2, "links": [], - "nullPointMode": "null as zero", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "spaceLength": 10, - "stack": false, - "steppedLine": false, + "maxDataPoints": 512, + "maxPerRow": null, + "minSpan": null, + "options": {}, + "repeat": null, + "repeatDirection": null, + "reverseYBuckets": false, + "span": null, "targets": [ { - "expr": "histogram_quantile(1.0, sum(rate(tidb_tikvclient_gc_seconds_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\"}[1m])) by (instance, le))", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{instance}}", - "refId": "A", - "step": 40 + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_storage_check_mem_lock_duration_seconds_bucket\n 
{k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",type=\"$command\"}\n [$__rate_interval]\n)) by (le) ", + "format": "heatmap", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{le}}", + "metric": "", + "query": "sum(rate(\n tikv_storage_check_mem_lock_duration_seconds_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",type=\"$command\"}\n [$__rate_interval]\n)) by (le) ", + "refId": "", + "step": 10, + "target": "" } ], - "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, - "title": "TiDB GC seconds", + "title": "Check memory locks duration", "tooltip": { - "msResolution": false, + "msResolution": true, "shared": true, + "showHistogram": true, "sort": 0, - "value_type": "cumulative" + "value_type": "individual" }, - "type": "graph", - "xaxis": { - "buckets": null, + "transformations": [], + "transparent": false, + "type": "heatmap", + "xAxis": { "mode": "time", "name": null, "show": true, "values": [] }, - "yaxes": [ - { - "format": "s", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${DS_TEST-CLUSTER}", - "decimals": 2, - "description": "The count of TiDB GC worker actions", - "editable": true, - "error": false, - "fill": 1, - "grid": {}, - "gridPos": { - "h": 7, - "w": 12, - "x": 12, - "y": 53 - }, - "id": 966, - "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "max": true, - "min": false, - "rightSide": true, - "show": true, - "sideWidth": null, - "sort": "current", - "sortDesc": true, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 2, - "links": [], - 
"nullPointMode": "null as zero", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "sum(rate(tidb_tikvclient_gc_worker_actions_total{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\"}[1m])) by (type)", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{type}}", - "metric": "", - "refId": "A", - "step": 4 - } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "TiDB GC worker actions", - "tooltip": { - "msResolution": false, - "shared": true, - "sort": 2, - "value_type": "cumulative" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] + "xBucketNumber": null, + "xBucketSize": null, + "yAxis": { + "decimals": 1, + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true }, - "yaxes": [ - { - "format": "short", - "label": "", - "logBase": 1, - "max": null, - "min": "0", - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null + "yBucketBound": "upper", + "yBucketNumber": null, + "yBucketSize": null + } + ], + "repeat": "command", + "repeatDirection": null, + "span": null, + "targets": [], + "timeFrom": null, + "timeShift": null, + "title": "Scheduler - $command", + "transformations": [], + "transparent": false, + "type": "row" + }, + { + "cacheTimeout": null, + "collapsed": true, + "datasource": null, + "description": null, + "editable": true, + "error": false, + "fieldConfig": { + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] } - }, + } + }, + "gridPos": { + "h": 7, + "w": 24, + "x": 0, + "y": 0 + }, + "height": null, + "hideTimeOverride": false, + "id": 220, + "interval": null, + "links": [], 
+ "maxDataPoints": 100, + "maxPerRow": null, + "minSpan": null, + "panels": [ { "aliasColors": {}, "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", - "decimals": 0, - "description": "Progress of ResolveLocks, the first phase of GC", + "description": "The total number of commands on each stage", + "editable": true, + "error": false, + "fieldConfig": { + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } + }, "fill": 1, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, "gridPos": { "h": 7, "w": 12, "x": 0, - "y": 60 + "y": 0 }, - "id": 2823, + "height": null, + "hideTimeOverride": false, + "id": 221, + "interval": null, + "isNew": true, "legend": { "alignAsTable": true, "avg": false, "current": true, + "hideEmpty": true, + "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, - "sort": "current", + "sort": "max", "sortDesc": true, "total": false, "values": true @@ -22779,41 +30306,70 @@ "lines": true, "linewidth": 1, "links": [], - "nullPointMode": "null", + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", + "options": { + "alertThreshold": true, + "dataLinks": [] + }, "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", + "repeat": null, + "repeatDirection": null, "seriesOverrides": [], - "spaceLength": 10, - "stack": true, + "span": null, + "stack": false, "steppedLine": false, "targets": [ { - "expr": "max(tidb_tikvclient_range_task_stats{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", type=~\"resolve-locks.*\"}) by (result)", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_scheduler_too_busy_total\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n 
[$__rate_interval]\n)) by (stage) ", "format": "time_series", + "hide": false, "instant": false, "interval": "", - "intervalFactor": 2, - "legendFormat": "{{result}}", - "metric": "tikv_storage_command_total", - "refId": "A", - "step": 4 + "intervalFactor": 1, + "legendFormat": "{{stage}}", + "metric": "", + "query": "sum(rate(\n tikv_scheduler_too_busy_total\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (stage) ", + "refId": "", + "step": 10, + "target": "" + }, + { + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_scheduler_stage_total\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (stage) ", + "format": "time_series", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{stage}}", + "metric": "", + "query": "sum(rate(\n tikv_scheduler_stage_total\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (stage) ", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, - "title": "ResolveLocks Progress", + "title": "Scheduler stage total", "tooltip": { + "msResolution": true, "shared": true, "sort": 0, "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -22821,8 +30377,8 @@ }, "yaxes": [ { - "decimals": 0, - "format": "none", + "decimals": null, + "format": "ops", "label": null, "logBase": 1, "max": null, @@ -22830,45 +30386,67 @@ "show": true }, { + "decimals": null, "format": "short", "label": null, "logBase": 1, "max": null, "min": null, - "show": false + "show": true } ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } }, { "aliasColors": {}, "bars": false, - "dashLength": 10, - "dashes": false, + 
"cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", - "decimals": 0, - "description": "Progress of TiKV's GC", + "description": "The total writing bytes of commands on each stage", + "editable": true, + "error": false, + "fieldConfig": { + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } + }, "fill": 1, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, "gridPos": { "h": 7, "w": 12, "x": 12, - "y": 60 + "y": 0 }, - "id": 2821, + "height": null, + "hideTimeOverride": false, + "id": 222, + "interval": null, + "isNew": true, "legend": { "alignAsTable": true, "avg": false, "current": true, + "hideEmpty": true, + "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, - "sort": "current", + "sort": "max", "sortDesc": true, "total": false, "values": true @@ -22876,41 +30454,55 @@ "lines": true, "linewidth": 1, "links": [], - "nullPointMode": "null", + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", + "options": { + "alertThreshold": true, + "dataLinks": [] + }, "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", + "repeat": null, + "repeatDirection": null, "seriesOverrides": [], - "spaceLength": 10, + "span": null, "stack": false, "steppedLine": false, "targets": [ { - "expr": "sum(tikv_gcworker_autogc_processed_regions{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", type=\"scan\"}) by (instance) / sum(tikv_raftstore_region_count{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", type=\"region\"}) by (instance)", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum((\n tikv_scheduler_writing_bytes\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n \n)) by (instance) ", "format": 
"time_series", + "hide": false, "instant": false, "interval": "", - "intervalFactor": 2, + "intervalFactor": 1, "legendFormat": "{{instance}}", - "metric": "tikv_storage_command_total", - "refId": "A", - "step": 4 + "metric": "", + "query": "sum((\n tikv_scheduler_writing_bytes\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n \n)) by (instance) ", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, - "title": "TiKV Auto GC Progress", + "title": "Scheduler writing bytes", "tooltip": { + "msResolution": true, "shared": true, "sort": 0, "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -22919,95 +30511,131 @@ "yaxes": [ { "decimals": null, - "format": "percentunit", + "format": "bytes", "label": null, "logBase": 1, - "max": "1.1", + "max": null, "min": null, "show": true }, { + "decimals": null, "format": "short", "label": null, "logBase": 1, "max": null, "min": null, - "show": false + "show": true } ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } }, { "aliasColors": {}, "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", - "decimals": 1, - "description": "keys / second", + "description": "The count of different priority commands", "editable": true, "error": false, + "fieldConfig": { + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } + }, "fill": 1, - "grid": {}, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, "gridPos": { "h": 7, "w": 12, "x": 0, - "y": 67 + "y": 7 }, - "id": 2589, + "height": null, + "hideTimeOverride": false, + "id": 223, + "interval": null, + "isNew": true, "legend": { 
"alignAsTable": true, "avg": false, "current": true, + "hideEmpty": true, + "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, - "sort": "current", + "sort": "max", "sortDesc": true, "total": false, "values": true }, "lines": true, - "linewidth": 2, + "linewidth": 1, "links": [], + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, "nullPointMode": "null as zero", + "options": { + "alertThreshold": true, + "dataLinks": [] + }, "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", + "repeat": null, + "repeatDirection": null, "seriesOverrides": [], - "spaceLength": 10, + "span": null, "stack": false, "steppedLine": false, "targets": [ { - "expr": "sum(rate(tikv_storage_mvcc_gc_delete_versions_sum{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])) by (key_mode)", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_scheduler_commands_pri_total\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (priority) ", "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{key_mode}}_keys/s", - "refId": "E" + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{priority}}", + "metric": "", + "query": "sum(rate(\n tikv_scheduler_commands_pri_total\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (priority) ", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, - "title": "GC speed", + "title": "Scheduler priority commands", "tooltip": { - "msResolution": false, + "msResolution": true, "shared": true, - "sort": 2, - "value_type": "cumulative" + "sort": 0, + "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": 
"time", "name": null, "show": true, @@ -23015,14 +30643,16 @@ }, "yaxes": [ { - "format": "short", - "label": "", + "decimals": null, + "format": "ops", + "label": null, "logBase": 1, "max": null, - "min": "0", + "min": null, "show": true }, { + "decimals": null, "format": "short", "label": null, "logBase": 1, @@ -23033,35 +30663,56 @@ ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } }, { "aliasColors": {}, "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", - "decimals": 0, - "description": "SafePoint used for TiKV's Auto GC", - "fill": 0, + "description": "The count of pending commands per TiKV instance", + "editable": true, + "error": false, + "fieldConfig": { + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } + }, + "fill": 1, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, "gridPos": { "h": 7, "w": 12, "x": 12, - "y": 67 + "y": 7 }, - "id": 2822, + "height": null, + "hideTimeOverride": false, + "id": 224, + "interval": null, + "isNew": true, "legend": { "alignAsTable": true, "avg": false, "current": true, + "hideEmpty": true, + "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, - "sort": "current", + "sort": "max", "sortDesc": true, "total": false, "values": true @@ -23069,41 +30720,55 @@ "lines": true, "linewidth": 1, "links": [], - "nullPointMode": "null", + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", + "options": { + "alertThreshold": true, + "dataLinks": [] + }, "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", + "repeat": null, + "repeatDirection": null, "seriesOverrides": [], - "spaceLength": 10, + "span": null, "stack": false, "steppedLine": false, "targets": [ { - "expr": 
"max(tikv_gcworker_autogc_safe_point{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\"}) by (instance) / (2^18)", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum((\n tikv_scheduler_contex_total\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n \n)) by (instance) ", "format": "time_series", + "hide": false, "instant": false, "interval": "", - "intervalFactor": 2, + "intervalFactor": 1, "legendFormat": "{{instance}}", - "metric": "tikv_storage_command_total", - "refId": "A", - "step": 4 + "metric": "", + "query": "sum((\n tikv_scheduler_contex_total\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n \n)) by (instance) ", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, - "title": "TiKV Auto GC SafePoint", + "title": "Scheduler pending commands", "tooltip": { + "msResolution": true, "shared": true, "sort": 0, "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -23112,7 +30777,7 @@ "yaxes": [ { "decimals": null, - "format": "dateTimeAsIso", + "format": "ops", "label": null, "logBase": 1, "max": null, @@ -23120,216 +30785,213 @@ "show": true }, { + "decimals": null, "format": "short", "label": null, "logBase": 1, "max": null, "min": null, - "show": false + "show": true } ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } }, { "cacheTimeout": null, - "colorBackground": false, - "colorValue": false, - "colors": [ - "rgba(245, 54, 54, 0.9)", - "rgba(237, 129, 40, 0.89)", - "rgba(50, 172, 45, 0.97)" - ], + "cards": { + "cardPadding": null, + "cardRound": null + }, + "color": { + "cardColor": "#b4ff00", + "colorScale": "sqrt", + "colorScheme": "interpolateSpectral", + "exponent": 0.5, + "max": null, + "min": null, + "mode": "spectrum" + }, + "dataFormat": "tsbuckets", 
"datasource": "${DS_TEST-CLUSTER}", - "decimals": 0, - "description": " \tThe lifetime of TiDB GC", + "description": null, "editable": true, "error": false, - "format": "s", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": false, - "thresholdLabels": false, - "thresholdMarkers": true + "fieldConfig": { + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } }, "gridPos": { "h": 7, - "w": 6, + "w": 24, "x": 0, - "y": 74 + "y": 14 }, - "id": 27, + "heatmap": {}, + "height": null, + "hideTimeOverride": false, + "hideZeroBuckets": true, + "highlightCards": true, + "id": 225, "interval": null, - "links": [], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - "nullPointMode": "null", - "nullText": null, - "postfix": "", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": false, - "lineColor": "rgb(31, 120, 193)", + "legend": { "show": false }, - "tableColumn": "", + "links": [], + "maxDataPoints": 512, + "maxPerRow": null, + "minSpan": null, + "options": {}, + "repeat": null, + "repeatDirection": null, + "reverseYBuckets": false, + "span": null, "targets": [ { - "expr": "max(tidb_tikvclient_gc_config{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", type=\"tikv_gc_life_time\"})", - "format": "time_series", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_yatp_pool_schedule_wait_duration_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",name=~\"sched-worker.*\"}\n [$__rate_interval]\n)) by (le) ", + "format": "heatmap", + "hide": false, + "instant": false, "interval": "", - "intervalFactor": 2, - "refId": "A", - "step": 60 - } - ], - "thresholds": "", - "title": "GC lifetime", - "type": 
"singlestat", - "valueFontSize": "80%", - "valueMaps": [ - { - "op": "=", - "text": "N/A", - "value": "null" + "intervalFactor": 1, + "legendFormat": "{{le}}", + "metric": "", + "query": "sum(rate(\n tikv_yatp_pool_schedule_wait_duration_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",name=~\"sched-worker.*\"}\n [$__rate_interval]\n)) by (le) ", + "refId": "", + "step": 10, + "target": "" } ], - "valueName": "current" - }, - { - "cacheTimeout": null, - "colorBackground": false, - "colorValue": false, - "colors": [ - "rgba(245, 54, 54, 0.9)", - "rgba(237, 129, 40, 0.89)", - "rgba(50, 172, 45, 0.97)" - ], - "datasource": "${DS_TEST-CLUSTER}", - "decimals": 0, - "description": "The interval of TiDB GC", - "editable": true, - "error": false, - "format": "s", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": false, - "thresholdLabels": false, - "thresholdMarkers": true + "timeFrom": null, + "timeShift": null, + "title": "Txn Scheduler Pool Wait Duration", + "tooltip": { + "msResolution": true, + "shared": true, + "showHistogram": true, + "sort": 0, + "value_type": "individual" }, - "gridPos": { - "h": 7, - "w": 6, - "x": 6, - "y": 74 + "transformations": [], + "transparent": false, + "type": "heatmap", + "xAxis": { + "mode": "time", + "name": null, + "show": true, + "values": [] }, - "id": 28, - "interval": null, - "links": [], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - "nullPointMode": "null", - "nullText": null, - "postfix": "", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": false, - "lineColor": "rgb(31, 120, 193)", - "show": false + "xBucketNumber": null, + "xBucketSize": null, + "yAxis": { + "decimals": 1, + "format": 
"s", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true }, - "tableColumn": "", - "targets": [ - { - "expr": "max(tidb_tikvclient_gc_config{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", type=\"tikv_gc_run_interval\"})", - "format": "time_series", - "intervalFactor": 2, - "refId": "A", - "step": 60 - } - ], - "thresholds": "", - "title": "GC interval", - "type": "singlestat", - "valueFontSize": "80%", - "valueMaps": [ - { - "op": "=", - "text": "N/A", - "value": "null" - } - ], - "valueName": "current" - }, + "yBucketBound": "upper", + "yBucketNumber": null, + "yBucketSize": null + } + ], + "repeat": null, + "repeatDirection": null, + "span": null, + "targets": [], + "timeFrom": null, + "timeShift": null, + "title": "Scheduler", + "transformations": [], + "transparent": false, + "type": "row" + }, + { + "cacheTimeout": null, + "collapsed": true, + "datasource": null, + "description": null, + "editable": true, + "error": false, + "fieldConfig": { + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } + }, + "gridPos": { + "h": 7, + "w": 24, + "x": 0, + "y": 0 + }, + "height": null, + "hideTimeOverride": false, + "id": 226, + "interval": null, + "links": [], + "maxDataPoints": 100, + "maxPerRow": null, + "minSpan": null, + "panels": [ { "aliasColors": {}, "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", - "decimals": 0, - "description": "Keys handled in GC compaction filter", - "fill": 0, + "description": "The count of GC tasks processed by gc_worker", + "editable": true, + "error": false, + "fieldConfig": { + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } + }, + "fill": 1, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, "gridPos": { "h": 7, "w": 12, - "x": 12, - "y": 74 + "x": 0, 
+ "y": 0 }, - "id": 6596, + "height": null, + "hideTimeOverride": false, + "id": 227, + "interval": null, + "isNew": true, "legend": { "alignAsTable": true, "avg": false, "current": true, + "hideEmpty": true, + "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, - "sort": "current", + "sort": "max", "sortDesc": true, "total": false, "values": true @@ -23337,97 +30999,100 @@ "lines": true, "linewidth": 1, "links": [], - "nullPointMode": "null", + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", + "options": { + "alertThreshold": true, + "dataLinks": [] + }, "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", + "repeat": null, + "repeatDirection": null, "seriesOverrides": [], - "spaceLength": 10, + "span": null, "stack": false, "steppedLine": false, "targets": [ { - "expr": "sum(rate(tikv_gc_compaction_filtered{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])) by (key_mode)", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_gcworker_gc_tasks_vec\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (task) ", "format": "time_series", + "hide": false, "instant": false, "interval": "", - "intervalFactor": 2, - "legendFormat": "{{key_mode}}_filtered", - "metric": "tikv_storage_command_total", - "refId": "A", - "step": 4 - }, - { - "expr": "sum(rate(tikv_gc_compaction_filter_skip{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])) by (key_mode)", - "format": "time_series", - "intervalFactor": 1, - "legendFormat": "{{key_mode}}_skipped", - "refId": "B" - }, - { - "expr": "sum(rate(tikv_gc_compaction_mvcc_rollback{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])) by (key_mode)", - "format": "time_series", - "intervalFactor": 1, - "legendFormat": 
"{{key_mode}}_mvcc-rollback/mvcc-lock", - "refId": "C" - }, - { - "expr": "sum(rate(tikv_gc_compaction_filter_orphan_versions{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])) by (key_mode)", - "format": "time_series", "intervalFactor": 1, - "legendFormat": "{{key_mode}}_orphan-versions", - "refId": "D" - }, - { - "expr": "sum(rate(tikv_gc_compaction_filter_perform{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])) by (key_mode)", - "format": "time_series", - "intervalFactor": 1, - "legendFormat": "{{key_mode}}_performed-times", - "refId": "E" - }, - { - "expr": "sum(rate(tikv_gc_compaction_failure{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])) by (key_mode,type)", - "format": "time_series", - "intervalFactor": 1, - "legendFormat": "{{key_mode}}_failure-{{type}}", - "refId": "F" + "legendFormat": "total-{{task}}", + "metric": "", + "query": "sum(rate(\n tikv_gcworker_gc_tasks_vec\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (task) ", + "refId": "", + "step": 10, + "target": "" }, { - "expr": "sum(rate(tikv_gc_compaction_filter_mvcc_deletion_met{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])) by (key_mode)", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_storage_gc_skipped_counter\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (task) ", "format": "time_series", + "hide": false, + "instant": false, + "interval": "", "intervalFactor": 1, - "legendFormat": "{{key_mode}}_mvcc-deletion-met", - "refId": "G" + "legendFormat": "skipped-{{task}}", + "metric": "", + "query": "sum(rate(\n tikv_storage_gc_skipped_counter\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (task) ", + 
"refId": "", + "step": 10, + "target": "" }, { - "expr": "sum(rate(tikv_gc_compaction_filter_mvcc_deletion_handled{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])) by (key_mode)", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_gcworker_gc_task_fail_vec\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (task) ", "format": "time_series", + "hide": false, + "instant": false, + "interval": "", "intervalFactor": 1, - "legendFormat": "{{key_mode}}_mvcc-deletion-handled", - "refId": "H" + "legendFormat": "failed-{{task}}", + "metric": "", + "query": "sum(rate(\n tikv_gcworker_gc_task_fail_vec\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (task) ", + "refId": "", + "step": 10, + "target": "" }, { - "expr": "sum(rate(tikv_gc_compaction_filter_mvcc_deletion_wasted{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])) by (key_mode)", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_gc_worker_too_busy\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) ", "format": "time_series", + "hide": false, + "instant": false, + "interval": "", "intervalFactor": 1, - "legendFormat": "{{key_mode}}_mvcc-deletion-wasted", - "refId": "I" + "legendFormat": "gcworker-too-busy", + "metric": "", + "query": "sum(rate(\n tikv_gc_worker_too_busy\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) ", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, - "title": "GC in Compaction Filter", + "title": "GC tasks", "tooltip": { + "msResolution": true, "shared": true, "sort": 0, "value_type": "individual" }, + "transformations": [], + "transparent": false, 
"type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -23436,7 +31101,7 @@ "yaxes": [ { "decimals": null, - "format": "short", + "format": "none", "label": null, "logBase": 1, "max": null, @@ -23444,45 +31109,67 @@ "show": true }, { + "decimals": null, "format": "short", "label": null, "logBase": 1, "max": null, "min": null, - "show": false + "show": true } ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } }, { "aliasColors": {}, "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", - "decimals": 0, - "description": "GC scan write details", - "fill": 0, + "description": "The time consumed when executing GC tasks", + "editable": true, + "error": false, + "fieldConfig": { + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } + }, + "fill": 1, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, "gridPos": { "h": 7, "w": 12, - "x": 0, - "y": 86 + "x": 12, + "y": 0 }, - "id": 8767, + "height": null, + "hideTimeOverride": false, + "id": 228, + "interval": null, + "isNew": true, "legend": { "alignAsTable": true, "avg": false, "current": true, + "hideEmpty": true, + "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, - "sort": "current", + "sort": "max", "sortDesc": true, "total": false, "values": true @@ -23490,130 +31177,123 @@ "lines": true, "linewidth": 1, "links": [], - "nullPointMode": "null", + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", + "options": { + "alertThreshold": true, + "dataLinks": [] + }, "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", - "seriesOverrides": [], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ + "repeat": null, + 
"repeatDirection": null, + "seriesOverrides": [ { - "expr": "sum(rate(tikv_gcworker_gc_keys{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\", cf=\"write\"}[1m])) by (key_mode,tag)", - "format": "time_series", - "intervalFactor": 1, - "legendFormat": "{{key_mode}}_{{tag}}", - "refId": "A" - } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "GC scan write details", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "decimals": null, - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true + "alias": "count", + "bars": false, + "dashLength": 1, + "dashes": true, + "fill": 2, + "fillBelowTo": null, + "lines": true, + "spaceLength": 1, + "transform": "negative-Y", + "yaxis": 2, + "zindex": -3 }, { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": false + "alias": "avg", + "bars": false, + "fill": 7, + "fillBelowTo": null, + "lines": true, + "yaxis": 1, + "zindex": 0 } ], - "yaxis": { - "align": false, - "alignLevel": null - } - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${DS_TEST-CLUSTER}", - "decimals": 0, - "description": "GC scan default details", - "fill": 0, - "gridPos": { - "h": 7, - "w": 12, - "x": 12, - "y": 86 - }, - "id": 8768, - "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "max": true, - "min": false, - "rightSide": true, - "show": true, - "sideWidth": null, - "sort": "current", - "sortDesc": true, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "links": [], - "nullPointMode": "null", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "spaceLength": 10, 
+ "span": null, "stack": false, "steppedLine": false, "targets": [ { - "expr": "sum(rate(tikv_gcworker_gc_keys{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\", cf=\"default\"}[1m])) by (key_mode,tag)", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "histogram_quantile(0.9999,(\n sum(rate(\n tikv_gcworker_gc_task_duration_vec_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",type=\"$command\"}\n [$__rate_interval]\n)) by (le) \n \n \n)) ", + "format": "time_series", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "99.99%", + "metric": "", + "query": "histogram_quantile(0.9999,(\n sum(rate(\n tikv_gcworker_gc_task_duration_vec_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",type=\"$command\"}\n [$__rate_interval]\n)) by (le) \n \n \n)) ", + "refId": "", + "step": 10, + "target": "" + }, + { + "datasource": "${DS_TEST-CLUSTER}", + "expr": "histogram_quantile(0.99,(\n sum(rate(\n tikv_gcworker_gc_task_duration_vec_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",type=\"$command\"}\n [$__rate_interval]\n)) by (le) \n \n \n)) ", + "format": "time_series", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "99%", + "metric": "", + "query": "histogram_quantile(0.99,(\n sum(rate(\n tikv_gcworker_gc_task_duration_vec_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",type=\"$command\"}\n [$__rate_interval]\n)) by (le) \n \n \n)) ", + "refId": "", + "step": 10, + "target": "" + }, + { + "datasource": "${DS_TEST-CLUSTER}", + "expr": "(sum(rate(\n tikv_gcworker_gc_task_duration_vec_sum\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",type=\"$command\"}\n [$__rate_interval]\n)) / sum(rate(\n tikv_gcworker_gc_task_duration_vec_count\n 
{k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",type=\"$command\"}\n [$__rate_interval]\n)) )", + "format": "time_series", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "avg", + "metric": "", + "query": "(sum(rate(\n tikv_gcworker_gc_task_duration_vec_sum\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",type=\"$command\"}\n [$__rate_interval]\n)) / sum(rate(\n tikv_gcworker_gc_task_duration_vec_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",type=\"$command\"}\n [$__rate_interval]\n)) )", + "refId": "", + "step": 10, + "target": "" + }, + { + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_gcworker_gc_task_duration_vec_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",type=\"$command\"}\n [$__rate_interval]\n)) ", "format": "time_series", + "hide": true, + "instant": false, + "interval": "", "intervalFactor": 1, - "legendFormat": "{{key_mode}}_{{tag}}", - "refId": "A" + "legendFormat": "count", + "metric": "", + "query": "sum(rate(\n tikv_gcworker_gc_task_duration_vec_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",type=\"$command\"}\n [$__rate_interval]\n)) ", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, - "title": "GC scan default details", + "title": "GC tasks duration", "tooltip": { + "msResolution": true, "shared": true, "sort": 0, "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -23622,7 +31302,7 @@ "yaxes": [ { "decimals": null, - "format": "short", + "format": "s", "label": null, "logBase": 1, "max": null, @@ -23630,69 +31310,67 @@ "show": true }, { + "decimals": null, "format": "short", 
"label": null, "logBase": 1, "max": null, "min": null, - "show": false + "show": true } ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } - } - ], - "repeat": null, - "title": "GC", - "type": "row" - }, - { - "collapsed": true, - "datasource": null, - "gridPos": { - "h": 1, - "w": 24, - "x": 0, - "y": 28 - }, - "id": 2759, - "panels": [ + }, { "aliasColors": {}, "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", - "description": "The rate of Raft snapshot messages sent", + "description": "The GC duration", "editable": true, "error": false, "fieldConfig": { - "defaults": {}, - "overrides": [] + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } }, "fill": 1, - "fillGradient": 0, - "grid": {}, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, "gridPos": { "h": 7, "w": 12, "x": 0, - "y": 31 + "y": 7 }, - "hiddenSeries": false, - "id": 35, + "height": null, + "hideTimeOverride": false, + "id": 229, + "interval": null, + "isNew": true, "legend": { "alignAsTable": true, "avg": false, "current": true, - "hideEmpty": false, + "hideEmpty": true, + "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, - "sort": "current", + "sideWidth": null, + "sort": "max", "sortDesc": true, "total": false, "values": true @@ -23700,43 +31378,55 @@ "lines": true, "linewidth": 1, "links": [], - "nullPointMode": "null", + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", "options": { - "alertThreshold": true + "alertThreshold": true, + "dataLinks": [] }, "percentage": false, - "pluginVersion": "7.5.10", "pointradius": 5, "points": false, "renderer": "flot", + "repeat": null, + "repeatDirection": null, "seriesOverrides": [], - "spaceLength": 10, + "span": null, "stack": false, 
"steppedLine": false, "targets": [ { - "expr": "sum(delta(tikv_raftstore_raft_sent_message_total{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", type=\"snapshot\"}[1m]))", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "histogram_quantile(1,(\n sum(rate(\n tidb_tikvclient_gc_seconds_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (instance, le) \n \n \n)) ", "format": "time_series", - "intervalFactor": 2, - "legendFormat": " ", - "refId": "A", - "step": 60 + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{instance}}", + "metric": "", + "query": "histogram_quantile(1,(\n sum(rate(\n tidb_tikvclient_gc_seconds_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (instance, le) \n \n \n)) ", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, - "title": "Rate snapshot message", + "title": "TiDB GC seconds", "tooltip": { - "msResolution": false, + "msResolution": true, "shared": true, "sort": 0, - "value_type": "cumulative" + "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -23744,7 +31434,8 @@ }, "yaxes": [ { - "format": "opm", + "decimals": null, + "format": "s", "label": null, "logBase": 1, "max": null, @@ -23752,6 +31443,7 @@ "show": true }, { + "decimals": null, "format": "short", "label": null, "logBase": 1, @@ -23762,43 +31454,56 @@ ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } }, { "aliasColors": {}, "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", - "description": "The number of snapshots in different states", + "description": "The count of TiDB GC 
worker actions", "editable": true, "error": false, "fieldConfig": { - "defaults": {}, - "overrides": [] + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } }, "fill": 1, - "fillGradient": 0, - "grid": {}, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, "gridPos": { "h": 7, "w": 12, "x": 12, - "y": 31 + "y": 7 }, - "hiddenSeries": false, - "id": 38, + "height": null, + "hideTimeOverride": false, + "id": 230, + "interval": null, + "isNew": true, "legend": { "alignAsTable": true, "avg": false, "current": true, - "hideEmpty": false, + "hideEmpty": true, + "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, - "sort": "current", + "sideWidth": null, + "sort": "max", "sortDesc": true, "total": false, "values": true @@ -23806,44 +31511,55 @@ "lines": true, "linewidth": 1, "links": [], - "nullPointMode": "null", + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", "options": { - "alertThreshold": true + "alertThreshold": true, + "dataLinks": [] }, "percentage": false, - "pluginVersion": "7.5.10", "pointradius": 5, "points": false, "renderer": "flot", + "repeat": null, + "repeatDirection": null, "seriesOverrides": [], - "spaceLength": 10, + "span": null, "stack": false, - "steppedLine": true, + "steppedLine": false, "targets": [ { - "expr": "sum(tikv_raftstore_snapshot_traffic_total{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}) by (type)", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tidb_tikvclient_gc_worker_actions_total\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\"}\n [$__rate_interval]\n)) by (type) ", "format": "time_series", - "intervalFactor": 2, + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, "legendFormat": "{{type}}", "metric": "", - 
"refId": "A", - "step": 60 + "query": "sum(rate(\n tidb_tikvclient_gc_worker_actions_total\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\"}\n [$__rate_interval]\n)) by (type) ", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, - "title": "Snapshot state count", + "title": "TiDB GC worker actions", "tooltip": { - "msResolution": false, + "msResolution": true, "shared": true, "sort": 0, - "value_type": "cumulative" + "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -23851,7 +31567,8 @@ }, "yaxes": [ { - "format": "short", + "decimals": null, + "format": "none", "label": null, "logBase": 1, "max": null, @@ -23859,6 +31576,7 @@ "show": true }, { + "decimals": null, "format": "short", "label": null, "logBase": 1, @@ -23869,43 +31587,56 @@ ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } }, { "aliasColors": {}, "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", - "description": "The time snapshot generation tasks waited to be scheduled. 
", + "description": "Progress of ResolveLocks, the first phase of GC", "editable": true, "error": false, "fieldConfig": { - "defaults": {}, - "overrides": [] + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } }, "fill": 1, - "fillGradient": 0, - "grid": {}, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, "gridPos": { "h": 7, "w": 12, "x": 0, - "y": 38 + "y": 14 }, - "hiddenSeries": false, - "id": 37, + "height": null, + "hideTimeOverride": false, + "id": 231, + "interval": null, + "isNew": true, "legend": { "alignAsTable": true, "avg": false, "current": true, "hideEmpty": true, + "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, - "sort": "current", + "sideWidth": null, + "sort": "max", "sortDesc": true, "total": false, "values": true @@ -23913,45 +31644,55 @@ "lines": true, "linewidth": 1, "links": [], + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, "nullPointMode": "null as zero", "options": { - "alertThreshold": true + "alertThreshold": true, + "dataLinks": [] }, "percentage": false, - "pluginVersion": "7.5.10", "pointradius": 5, "points": false, "renderer": "flot", + "repeat": null, + "repeatDirection": null, "seriesOverrides": [], - "spaceLength": 10, + "span": null, "stack": false, "steppedLine": false, "targets": [ { - "exemplar": true, - "expr": "histogram_quantile(0.99, sum(rate(tikv_raftstore_snapshot_generation_wait_duration_seconds_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])) by (le))", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "max((\n tidb_tikvclient_range_task_stats\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",type=~\"resolve-locks.*\"}\n \n)) by (result) ", "format": "time_series", + "hide": false, + "instant": false, "interval": "", - "intervalFactor": 2, - 
"legendFormat": "", - "refId": "A", - "step": 60 + "intervalFactor": 1, + "legendFormat": "{{result}}", + "metric": "", + "query": "max((\n tidb_tikvclient_range_task_stats\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",type=~\"resolve-locks.*\"}\n \n)) by (result) ", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, - "title": "99% Snapshot generation wait duration", + "title": "ResolveLocks Progress", "tooltip": { - "msResolution": false, + "msResolution": true, "shared": true, "sort": 0, - "value_type": "cumulative" + "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -23959,7 +31700,8 @@ }, "yaxes": [ { - "format": "s", + "decimals": null, + "format": "none", "label": null, "logBase": 1, "max": null, @@ -23967,6 +31709,7 @@ "show": true }, { + "decimals": null, "format": "short", "label": null, "logBase": 1, @@ -23977,43 +31720,56 @@ ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } }, { "aliasColors": {}, "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", - "description": "The time consumed when handling snapshots", + "description": "Progress of TiKV's GC", "editable": true, "error": false, "fieldConfig": { - "defaults": {}, - "overrides": [] + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } }, "fill": 1, - "fillGradient": 0, - "grid": {}, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, "gridPos": { "h": 7, "w": 12, "x": 12, - "y": 38 + "y": 14 }, - "hiddenSeries": false, - "id": 23763573704, + "height": null, + "hideTimeOverride": false, + "id": 232, + "interval": null, + "isNew": true, "legend": { 
"alignAsTable": true, "avg": false, "current": true, "hideEmpty": true, + "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, - "sort": "current", + "sideWidth": null, + "sort": "max", "sortDesc": true, "total": false, "values": true @@ -24021,59 +31777,55 @@ "lines": true, "linewidth": 1, "links": [], + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, "nullPointMode": "null as zero", "options": { - "alertThreshold": true + "alertThreshold": true, + "dataLinks": [] }, "percentage": false, - "pluginVersion": "7.5.10", "pointradius": 5, "points": false, "renderer": "flot", + "repeat": null, + "repeatDirection": null, "seriesOverrides": [], - "spaceLength": 10, + "span": null, "stack": false, "steppedLine": false, "targets": [ { - "expr": "histogram_quantile(0.99, sum(rate(tikv_server_send_snapshot_duration_seconds_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])) by (le))", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "send", - "refId": "A", - "step": 60 - }, - { - "expr": "histogram_quantile(0.99, sum(rate(tikv_raftstore_snapshot_duration_seconds_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", type=\"apply\"}[1m])) by (le))", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "apply", - "refId": "B", - "step": 60 - }, - { - "expr": "histogram_quantile(0.99, sum(rate(tikv_raftstore_snapshot_duration_seconds_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", type=\"generate\"}[1m])) by (le))", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "(sum((\n tikv_gcworker_autogc_processed_regions\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",type=\"scan\"}\n \n)) by (instance) / sum((\n tikv_raftstore_region_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",type=\"region\"}\n 
\n)) by (instance) )", "format": "time_series", - "intervalFactor": 2, - "legendFormat": "generate", - "refId": "C", - "step": 60 + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{instance}}", + "metric": "", + "query": "(sum((\n tikv_gcworker_autogc_processed_regions\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",type=\"scan\"}\n \n)) by (instance) / sum((\n tikv_raftstore_region_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",type=\"region\"}\n \n)) by (instance) )", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, - "title": "99% Handle snapshot duration", + "title": "TiKV Auto GC Progress", "tooltip": { - "msResolution": false, + "msResolution": true, "shared": true, "sort": 0, - "value_type": "cumulative" + "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -24081,7 +31833,8 @@ }, "yaxes": [ { - "format": "s", + "decimals": null, + "format": "percentunit", "label": null, "logBase": 1, "max": null, @@ -24089,6 +31842,7 @@ "show": true }, { + "decimals": null, "format": "short", "label": null, "logBase": 1, @@ -24099,43 +31853,56 @@ ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } }, { "aliasColors": {}, "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", - "description": "The snapshot size (P99.99).9999", + "description": "keys / second", "editable": true, "error": false, "fieldConfig": { - "defaults": {}, - "overrides": [] + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } }, "fill": 1, - "fillGradient": 0, - "grid": {}, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + 
"threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, "gridPos": { "h": 7, "w": 12, "x": 0, - "y": 45 + "y": 21 }, - "hiddenSeries": false, - "id": 44, + "height": null, + "hideTimeOverride": false, + "id": 233, + "interval": null, + "isNew": true, "legend": { "alignAsTable": true, "avg": false, "current": true, "hideEmpty": true, + "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, - "sort": "current", + "sideWidth": null, + "sort": "max", "sortDesc": true, "total": false, "values": true @@ -24143,44 +31910,55 @@ "lines": true, "linewidth": 1, "links": [], + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, "nullPointMode": "null as zero", "options": { - "alertThreshold": true + "alertThreshold": true, + "dataLinks": [] }, "percentage": false, - "pluginVersion": "7.5.10", "pointradius": 5, "points": false, "renderer": "flot", + "repeat": null, + "repeatDirection": null, "seriesOverrides": [], - "spaceLength": 10, + "span": null, "stack": false, "steppedLine": false, "targets": [ { - "expr": "histogram_quantile(0.9999, sum(rate(tikv_snapshot_size_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])) by (le))", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_storage_mvcc_gc_delete_versions_sum\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (key_mode) ", "format": "time_series", - "intervalFactor": 2, - "legendFormat": "size", - "metric": "tikv_snapshot_size_bucket", - "refId": "A", - "step": 40 + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{key_mode}}_keys/s", + "metric": "", + "query": "sum(rate(\n tikv_storage_mvcc_gc_delete_versions_sum\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (key_mode) ", + "refId": "", + "step": 10, + "target": "" } ], 
"thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, - "title": "99.99% Snapshot size", + "title": "GC speed", "tooltip": { - "msResolution": false, + "msResolution": true, "shared": true, "sort": 0, "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -24188,7 +31966,8 @@ }, "yaxes": [ { - "format": "bytes", + "decimals": null, + "format": "none", "label": null, "logBase": 1, "max": null, @@ -24196,6 +31975,7 @@ "show": true }, { + "decimals": null, "format": "short", "label": null, "logBase": 1, @@ -24206,43 +31986,56 @@ ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } }, { "aliasColors": {}, "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", - "description": "The number of KV within a snapshot in .9999", + "description": "SafePoint used for TiKV's Auto GC", "editable": true, "error": false, "fieldConfig": { - "defaults": {}, - "overrides": [] + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } }, "fill": 1, - "fillGradient": 0, - "grid": {}, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, "gridPos": { "h": 7, "w": 12, "x": 12, - "y": 45 + "y": 21 }, - "hiddenSeries": false, - "id": 43, + "height": null, + "hideTimeOverride": false, + "id": 234, + "interval": null, + "isNew": true, "legend": { "alignAsTable": true, "avg": false, "current": true, "hideEmpty": true, + "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, - "sort": "current", + "sideWidth": null, + "sort": "max", "sortDesc": true, "total": false, "values": true @@ -24250,44 +32043,55 @@ "lines": true, "linewidth": 1, "links": [], + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, 
"nullPointMode": "null as zero", "options": { - "alertThreshold": true + "alertThreshold": true, + "dataLinks": [] }, "percentage": false, - "pluginVersion": "7.5.10", "pointradius": 5, "points": false, "renderer": "flot", + "repeat": null, + "repeatDirection": null, "seriesOverrides": [], - "spaceLength": 10, + "span": null, "stack": false, "steppedLine": false, "targets": [ { - "expr": "histogram_quantile(0.9999, sum(rate(tikv_snapshot_kv_count_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])) by (le))", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "max((\n tikv_gcworker_autogc_safe_point\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\"}\n \n)) by (instance) / (2^18)", "format": "time_series", - "intervalFactor": 2, - "legendFormat": "count", - "metric": "tikv_snapshot_kv_count_bucket", - "refId": "A", - "step": 40 + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{instance}}", + "metric": "", + "query": "max((\n tikv_gcworker_autogc_safe_point\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\"}\n \n)) by (instance) / (2^18)", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, - "title": "99.99% Snapshot KV count", + "title": "TiKV Auto GC SafePoint", "tooltip": { - "msResolution": false, + "msResolution": true, "shared": true, "sort": 0, "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -24295,7 +32099,8 @@ }, "yaxes": [ { - "format": "short", + "decimals": null, + "format": "dateTimeAsIso", "label": null, "logBase": 1, "max": null, @@ -24303,6 +32108,7 @@ "show": true }, { + "decimals": null, "format": "short", "label": null, "logBase": 1, @@ -24313,155 +32119,206 @@ ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } }, { - 
"aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", - "description": "Action stats for snapshot generating and applying", + "description": "The lifetime of TiDB GC", "editable": true, "error": false, "fieldConfig": { - "defaults": {}, + "defaults": { + "custom": {}, + "decimals": null, + "mappings": null, + "noValue": "none", + "thresholds": { + "mode": "absolute", + "steps": "" + }, + "unit": "s" + }, "overrides": [] }, - "fill": 1, - "fillGradient": 0, - "grid": {}, "gridPos": { "h": 7, - "w": 12, + "w": 6, "x": 0, - "y": 52 - }, - "hiddenSeries": false, - "id": 36, - "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "hideEmpty": true, - "max": true, - "min": false, - "rightSide": true, - "show": true, - "sort": "current", - "sortDesc": true, - "total": false, - "values": true + "y": 28 }, - "lines": true, - "linewidth": 1, + "height": null, + "hideTimeOverride": false, + "id": 235, + "interval": null, "links": [], - "nullPointMode": "null as zero", + "maxDataPoints": 100, + "maxPerRow": null, + "minSpan": null, "options": { - "alertThreshold": true + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto" }, - "percentage": false, - "pluginVersion": "7.5.10", - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "spaceLength": 10, - "stack": false, - "steppedLine": false, + "repeat": null, + "repeatDirection": null, + "span": null, "targets": [ { - "expr": "sum(delta(tikv_raftstore_snapshot_total{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])) by (type, status)", - "format": "time_series", - "intervalFactor": 1, - "legendFormat": "{{type}}-{{status}}", - "refId": "A" - }, - { - "expr": 
"sum(delta(tikv_raftstore_clean_region_count{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])) by (type)", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "max((\n tidb_tikvclient_gc_config\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",type=\"tikv_gc_life_time\"}\n \n)) ", "format": "time_series", + "hide": false, + "instant": false, + "interval": "", "intervalFactor": 1, - "legendFormat": "clean-region-by-{{type}}", - "refId": "B" + "legendFormat": null, + "metric": "", + "query": "max((\n tidb_tikvclient_gc_config\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",type=\"tikv_gc_life_time\"}\n \n)) ", + "refId": "", + "step": 10, + "target": "" } ], - "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, - "title": "Snapshot Actions", - "tooltip": { - "msResolution": false, - "shared": true, - "sort": 0, - "value_type": "cumulative" + "title": "GC lifetime", + "transformations": [], + "transparent": false, + "type": "stat" + }, + { + "cacheTimeout": null, + "datasource": "${DS_TEST-CLUSTER}", + "description": "The interval of TiDB GC", + "editable": true, + "error": false, + "fieldConfig": { + "defaults": { + "custom": {}, + "decimals": null, + "mappings": null, + "noValue": "none", + "thresholds": { + "mode": "absolute", + "steps": "" + }, + "unit": "s" + }, + "overrides": [] }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] + "gridPos": { + "h": 7, + "w": 6, + "x": 6, + "y": 28 }, - "yaxes": [ - { - "format": "opm", - "label": null, - "logBase": 1, - "max": null, - "min": "0", - "show": true + "height": null, + "hideTimeOverride": false, + "id": 236, + "interval": null, + "links": [], + "maxDataPoints": 100, + "maxPerRow": null, + "minSpan": null, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + 
"lastNotNull" + ], + "fields": "", + "values": false }, + "textMode": "auto" + }, + "repeat": null, + "repeatDirection": null, + "span": null, + "targets": [ { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": false + "datasource": "${DS_TEST-CLUSTER}", + "expr": "max((\n tidb_tikvclient_gc_config\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",type=\"tikv_gc_run_interval\"}\n \n)) ", + "format": "time_series", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": null, + "metric": "", + "query": "max((\n tidb_tikvclient_gc_config\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",type=\"tikv_gc_run_interval\"}\n \n)) ", + "refId": "", + "step": 10, + "target": "" } ], - "yaxis": { - "align": false, - "alignLevel": null - } + "timeFrom": null, + "timeShift": null, + "title": "GC interval", + "transformations": [], + "transparent": false, + "type": "stat" }, { "aliasColors": {}, "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", - "description": "The speed of sending or receiving snapshot", + "description": "Keys handled in GC compaction filter", "editable": true, "error": false, "fieldConfig": { - "defaults": {}, - "overrides": [] + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } }, "fill": 1, - "fillGradient": 0, - "grid": {}, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, "gridPos": { "h": 7, "w": 12, "x": 12, - "y": 52 + "y": 35 }, - "hiddenSeries": false, - "id": 4201, + "height": null, + "hideTimeOverride": false, + "id": 237, + "interval": null, + "isNew": true, "legend": { "alignAsTable": true, "avg": false, "current": true, "hideEmpty": true, + "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, - 
"sort": "current", + "sideWidth": null, + "sort": "max", "sortDesc": true, "total": false, "values": true @@ -24469,52 +32326,175 @@ "lines": true, "linewidth": 1, "links": [], + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, "nullPointMode": "null as zero", "options": { - "alertThreshold": true + "alertThreshold": true, + "dataLinks": [] }, "percentage": false, - "pluginVersion": "7.5.10", "pointradius": 5, "points": false, "renderer": "flot", + "repeat": null, + "repeatDirection": null, "seriesOverrides": [], - "spaceLength": 10, + "span": null, "stack": false, "steppedLine": false, "targets": [ { - "expr": "rate(tikv_snapshot_limit_transport_bytes{tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_gc_compaction_filtered\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (key_mode) ", "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{instance}}--{{type}}", - "metric": "tikv_snapshot_limit_transport_bytes", - "refId": "A", - "step": 40 - }, - { - "exemplar": true, - "expr": "rate(tikv_snapshot_limit_generate_bytes{tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])", - "hide": true, + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{key_mode}}_filtered", + "metric": "", + "query": "sum(rate(\n tikv_gc_compaction_filtered\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (key_mode) ", + "refId": "", + "step": 10, + "target": "" + }, + { + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_gc_compaction_filter_skip\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (key_mode) ", + "format": "time_series", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + 
"legendFormat": "{{key_mode}}_skipped", + "metric": "", + "query": "sum(rate(\n tikv_gc_compaction_filter_skip\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (key_mode) ", + "refId": "", + "step": 10, + "target": "" + }, + { + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_gc_compaction_mvcc_rollback\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (key_mode) ", + "format": "time_series", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{key_mode}}_mvcc-rollback/mvcc-lock", + "metric": "", + "query": "sum(rate(\n tikv_gc_compaction_mvcc_rollback\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (key_mode) ", + "refId": "", + "step": 10, + "target": "" + }, + { + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_gc_compaction_filter_orphan_versions\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (key_mode) ", + "format": "time_series", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{key_mode}}_orphan-versions", + "metric": "", + "query": "sum(rate(\n tikv_gc_compaction_filter_orphan_versions\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (key_mode) ", + "refId": "", + "step": 10, + "target": "" + }, + { + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_gc_compaction_filter_perform\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (key_mode) ", + "format": "time_series", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{key_mode}}_performed-times", + "metric": "", + "query": 
"sum(rate(\n tikv_gc_compaction_filter_perform\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (key_mode) ", + "refId": "", + "step": 10, + "target": "" + }, + { + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_gc_compaction_failure\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (key_mode, type) ", + "format": "time_series", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{key_mode}}_failure-{{type}}", + "metric": "", + "query": "sum(rate(\n tikv_gc_compaction_failure\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (key_mode, type) ", + "refId": "", + "step": 10, + "target": "" + }, + { + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_gc_compaction_filter_mvcc_deletion_met\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (key_mode) ", + "format": "time_series", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{key_mode}}_mvcc-deletion-met", + "metric": "", + "query": "sum(rate(\n tikv_gc_compaction_filter_mvcc_deletion_met\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (key_mode) ", + "refId": "", + "step": 10, + "target": "" + }, + { + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_gc_compaction_filter_mvcc_deletion_handled\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (key_mode) ", + "format": "time_series", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{key_mode}}_mvcc-deletion-handled", + "metric": "", + "query": "sum(rate(\n 
tikv_gc_compaction_filter_mvcc_deletion_handled\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (key_mode) ", + "refId": "", + "step": 10, + "target": "" + }, + { + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_gc_compaction_filter_mvcc_deletion_wasted\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (key_mode) ", + "format": "time_series", + "hide": false, + "instant": false, "interval": "", - "legendFormat": "{{instance}}--generate", - "refId": "B" + "intervalFactor": 1, + "legendFormat": "{{key_mode}}_mvcc-deletion-wasted", + "metric": "", + "query": "sum(rate(\n tikv_gc_compaction_filter_mvcc_deletion_wasted\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (key_mode) ", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, - "title": "Snapshot transport speed", + "title": "GC in Compaction Filter", "tooltip": { - "msResolution": false, + "msResolution": true, "shared": true, "sort": 0, - "value_type": "cumulative" + "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -24522,74 +32502,76 @@ }, "yaxes": [ { - "format": "Bps", + "decimals": null, + "format": "none", "label": null, "logBase": 1, "max": null, - "min": "0", + "min": null, "show": true }, { + "decimals": null, "format": "short", "label": null, "logBase": 1, "max": null, "min": null, - "show": false + "show": true } ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } - } - ], - "repeat": null, - "title": "Snapshot", - "type": "row" - }, - { - "collapsed": true, - "datasource": null, - "gridPos": { - "h": 1, - "w": 24, - "x": 0, - "y": 29 - }, - "id": 2760, - "panels": [ + }, { 
"aliasColors": {}, "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", - "decimals": 1, - "description": "The number of tasks handled by worker", + "description": "GC scan write details", "editable": true, "error": false, + "fieldConfig": { + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } + }, "fill": 1, - "grid": {}, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, "gridPos": { - "h": 8, + "h": 7, "w": 12, "x": 0, - "y": 21 + "y": 42 }, - "id": 59, + "height": null, + "hideTimeOverride": false, + "id": 238, + "interval": null, + "isNew": true, "legend": { "alignAsTable": true, "avg": false, "current": true, "hideEmpty": true, - "hideZero": false, + "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, - "sort": "current", + "sort": "max", "sortDesc": true, "total": false, "values": true @@ -24597,39 +32579,55 @@ "lines": true, "linewidth": 1, "links": [], - "nullPointMode": "null", + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", + "options": { + "alertThreshold": true, + "dataLinks": [] + }, "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", + "repeat": null, + "repeatDirection": null, "seriesOverrides": [], - "spaceLength": 10, + "span": null, "stack": false, "steppedLine": false, "targets": [ { - "expr": "sum(rate(tikv_worker_handled_task_total{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])) by (name)", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_gcworker_gc_keys\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",cf=\"write\"}\n [$__rate_interval]\n)) by (key_mode, tag) ", "format": "time_series", - "intervalFactor": 2, - 
"legendFormat": "{{name}}", - "refId": "A", - "step": 4 + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{key_mode}}-{{tag}}", + "metric": "", + "query": "sum(rate(\n tikv_gcworker_gc_keys\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",cf=\"write\"}\n [$__rate_interval]\n)) by (key_mode, tag) ", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, - "title": "Worker handled tasks", + "title": "GC scan write details", "tooltip": { - "msResolution": false, + "msResolution": true, "shared": true, "sort": 0, "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -24637,14 +32635,16 @@ }, "yaxes": [ { - "format": "ops", + "decimals": null, + "format": "none", "label": null, "logBase": 1, "max": null, - "min": "0", + "min": null, "show": true }, { + "decimals": null, "format": "short", "label": null, "logBase": 1, @@ -24655,40 +32655,56 @@ ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } }, { "aliasColors": {}, "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", - "decimals": 1, - "description": " \tCurrent pending and running tasks of worker", + "description": "GC scan default details", "editable": true, "error": false, + "fieldConfig": { + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } + }, "fill": 1, - "grid": {}, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, "gridPos": { - "h": 8, + "h": 7, "w": 12, "x": 12, - "y": 21 + "y": 42 }, - "id": 1395, + "height": null, + "hideTimeOverride": false, + "id": 239, + "interval": null, + "isNew": true, "legend": { 
"alignAsTable": true, "avg": false, "current": true, "hideEmpty": true, - "hideZero": false, + "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, - "sort": "current", + "sort": "max", "sortDesc": true, "total": false, "values": true @@ -24696,39 +32712,55 @@ "lines": true, "linewidth": 1, "links": [], - "nullPointMode": "null", + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", + "options": { + "alertThreshold": true, + "dataLinks": [] + }, "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", + "repeat": null, + "repeatDirection": null, "seriesOverrides": [], - "spaceLength": 10, + "span": null, "stack": false, "steppedLine": false, "targets": [ { - "expr": "sum(tikv_worker_pending_task_total{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}) by (name)", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_gcworker_gc_keys\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",cf=\"default\"}\n [$__rate_interval]\n)) by (key_mode, tag) ", "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{name}}", - "refId": "A", - "step": 4 + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{key_mode}}-{{tag}}", + "metric": "", + "query": "sum(rate(\n tikv_gcworker_gc_keys\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",cf=\"default\"}\n [$__rate_interval]\n)) by (key_mode, tag) ", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, - "title": "Worker pending tasks", + "title": "GC scan default details", "tooltip": { - "msResolution": false, + "msResolution": true, "shared": true, "sort": 0, "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": 
null, "mode": "time", "name": null, "show": true, @@ -24736,14 +32768,16 @@ }, "yaxes": [ { - "format": "short", + "decimals": null, + "format": "none", "label": null, "logBase": 1, "max": null, - "min": "0", + "min": null, "show": true }, { + "decimals": null, "format": "short", "label": null, "logBase": 1, @@ -24754,40 +32788,98 @@ ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } - }, + } + ], + "repeat": null, + "repeatDirection": null, + "span": null, + "targets": [], + "timeFrom": null, + "timeShift": null, + "title": "GC", + "transformations": [], + "transparent": false, + "type": "row" + }, + { + "cacheTimeout": null, + "collapsed": true, + "datasource": null, + "description": null, + "editable": true, + "error": false, + "fieldConfig": { + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } + }, + "gridPos": { + "h": 7, + "w": 24, + "x": 0, + "y": 0 + }, + "height": null, + "hideTimeOverride": false, + "id": 240, + "interval": null, + "links": [], + "maxDataPoints": 100, + "maxPerRow": null, + "minSpan": null, + "panels": [ { "aliasColors": {}, "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", - "decimals": 1, - "description": "The number of tasks handled by future_pool", + "description": "The rate of Raft snapshot messages sent", "editable": true, "error": false, + "fieldConfig": { + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } + }, "fill": 1, - "grid": {}, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, "gridPos": { - "h": 8, + "h": 7, "w": 12, "x": 0, - "y": 29 + "y": 0 }, - "id": 1876, + "height": null, + "hideTimeOverride": false, + "id": 241, + "interval": null, + "isNew": true, "legend": { "alignAsTable": true, "avg": false, "current": true, "hideEmpty": true, - "hideZero": false, + 
"hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, - "sort": "current", + "sort": "max", "sortDesc": true, "total": false, "values": true @@ -24795,39 +32887,55 @@ "lines": true, "linewidth": 1, "links": [], - "nullPointMode": "null", + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", + "options": { + "alertThreshold": true, + "dataLinks": [] + }, "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", + "repeat": null, + "repeatDirection": null, "seriesOverrides": [], - "spaceLength": 10, + "span": null, "stack": false, "steppedLine": false, "targets": [ { - "expr": "sum(rate(tikv_futurepool_handled_task_total{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])) by (name)", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(delta(\n tikv_raftstore_raft_sent_message_total\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",type=\"snapshot\"}\n [1m]\n)) by (instance) ", "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{name}}", - "refId": "A", - "step": 4 + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{instance}}", + "metric": "", + "query": "sum(delta(\n tikv_raftstore_raft_sent_message_total\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",type=\"snapshot\"}\n [1m]\n)) by (instance) ", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, - "title": "FuturePool handled tasks", + "title": "Rate snapshot message", "tooltip": { - "msResolution": false, + "msResolution": true, "shared": true, "sort": 0, "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -24835,14 +32943,16 @@ }, 
"yaxes": [ { - "format": "ops", + "decimals": null, + "format": "opm", "label": null, "logBase": 1, "max": null, - "min": "0", + "min": null, "show": true }, { + "decimals": null, "format": "short", "label": null, "logBase": 1, @@ -24853,40 +32963,56 @@ ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } }, { "aliasColors": {}, "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", - "decimals": 1, - "description": "Current pending and running tasks of future_pool", + "description": "The number of snapshots in different states", "editable": true, "error": false, + "fieldConfig": { + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } + }, "fill": 1, - "grid": {}, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, "gridPos": { - "h": 8, + "h": 7, "w": 12, "x": 12, - "y": 29 + "y": 0 }, - "id": 1877, + "height": null, + "hideTimeOverride": false, + "id": 242, + "interval": null, + "isNew": true, "legend": { "alignAsTable": true, "avg": false, "current": true, "hideEmpty": true, - "hideZero": false, + "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, - "sort": "current", + "sort": "max", "sortDesc": true, "total": false, "values": true @@ -24894,39 +33020,55 @@ "lines": true, "linewidth": 1, "links": [], - "nullPointMode": "null", + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", + "options": { + "alertThreshold": true, + "dataLinks": [] + }, "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", + "repeat": null, + "repeatDirection": null, "seriesOverrides": [], - "spaceLength": 10, + "span": null, "stack": false, "steppedLine": false, "targets": [ { - "expr": 
"sum(rate(tikv_futurepool_pending_task_total{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])) by (name)", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum((\n tikv_raftstore_snapshot_traffic_total\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n \n)) by (type) ", "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{name}}", - "refId": "A", - "step": 4 + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{type}}", + "metric": "", + "query": "sum((\n tikv_raftstore_snapshot_traffic_total\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n \n)) by (type) ", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, - "title": "FuturePool pending tasks", + "title": "Snapshot state count", "tooltip": { - "msResolution": false, + "msResolution": true, "shared": true, "sort": 0, "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -24934,14 +33076,16 @@ }, "yaxes": [ { - "format": "short", + "decimals": null, + "format": "none", "label": null, "logBase": 1, "max": null, - "min": "0", + "min": null, "show": true }, { + "decimals": null, "format": "short", "label": null, "logBase": 1, @@ -24952,135 +33096,56 @@ ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } - } - ], - "repeat": null, - "title": "Task", - "type": "row" - }, - { - "collapsed": true, - "datasource": null, - "gridPos": { - "h": 1, - "w": 24, - "x": 0, - "y": 30 - }, - "id": 2757, - "panels": [ - { - "cards": { - "cardPadding": 0, - "cardRound": 0 - }, - "color": { - "cardColor": "#5195ce", - "colorScale": "linear", - "colorScheme": "interpolateSpectral", - "exponent": 0.5, - "min": null, - "mode": "spectrum" - }, - 
"dataFormat": "tsbuckets", - "datasource": "${DS_TEST-CLUSTER}", - "description": "The time consumed to handle coprocessor read requests", - "fieldConfig": { - "defaults": {}, - "overrides": [] - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 22 - }, - "heatmap": {}, - "hideZeroBuckets": true, - "highlightCards": true, - "id": 3062, - "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "max": true, - "min": false, - "rightSide": true, - "show": false, - "sort": "current", - "sortDesc": true, - "total": false, - "values": true - }, - "links": [], - "reverseYBuckets": false, - "targets": [ - { - "expr": "sum(rate(tikv_coprocessor_request_duration_seconds_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])) by (le)", - "format": "heatmap", - "instant": false, - "intervalFactor": 2, - "legendFormat": "{{le}}", - "refId": "A" - } - ], - "title": "Request duration", - "tooltip": { - "show": true, - "showHistogram": false - }, - "tooltipDecimals": 1, - "type": "heatmap", - "xAxis": { - "show": true - }, - "xBucketNumber": null, - "xBucketSize": null, - "yAxis": { - "decimals": 1, - "format": "s", - "logBase": 1, - "max": null, - "min": null, - "show": true, - "splitFactor": null - }, - "yBucketBound": "upper", - "yBucketNumber": null, - "yBucketSize": null }, { "aliasColors": {}, "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", - "decimals": 1, - "description": "", + "description": "The time snapshot generation tasks waited to be scheduled. 
", "editable": true, "error": false, + "fieldConfig": { + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } + }, "fill": 1, - "grid": {}, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, "gridPos": { - "h": 8, + "h": 7, "w": 12, - "x": 12, - "y": 22 + "x": 0, + "y": 7 }, - "id": 16, + "height": null, + "hideTimeOverride": false, + "id": 243, + "interval": null, + "isNew": true, "legend": { "alignAsTable": true, "avg": false, "current": true, - "hideEmpty": false, + "hideEmpty": true, + "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, - "sort": "current", + "sort": "max", "sortDesc": true, "total": false, "values": true @@ -25088,46 +33153,55 @@ "lines": true, "linewidth": 1, "links": [], + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, "nullPointMode": "null as zero", + "options": { + "alertThreshold": true, + "dataLinks": [] + }, "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", + "repeat": null, + "repeatDirection": null, "seriesOverrides": [], - "spaceLength": 10, + "span": null, "stack": false, "steppedLine": false, "targets": [ { - "expr": "histogram_quantile(1, sum(rate(tikv_coprocessor_request_duration_seconds_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])) by (le,req))", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{req}}-100%", - "refId": "E" - }, - { - "expr": "histogram_quantile(0.99, sum(rate(tikv_coprocessor_request_duration_seconds_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])) by (le,req))", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "histogram_quantile(0.99,(\n sum(rate(\n tikv_raftstore_snapshot_generation_wait_duration_seconds_bucket\n 
{k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (instance, le) \n \n \n)) ", "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{req}}-99%", - "refId": "A", - "step": 4 + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{instance}}", + "metric": "", + "query": "histogram_quantile(0.99,(\n sum(rate(\n tikv_raftstore_snapshot_generation_wait_duration_seconds_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (instance, le) \n \n \n)) ", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, - "title": "Request duration", + "title": "99% Snapshot generation wait duration", "tooltip": { - "msResolution": false, + "msResolution": true, "shared": true, - "sort": 1, - "value_type": "cumulative" + "sort": 0, + "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -25135,56 +33209,76 @@ }, "yaxes": [ { - "decimals": 1, + "decimals": null, "format": "s", "label": null, "logBase": 1, "max": null, - "min": "0", + "min": null, "show": true }, { + "decimals": null, "format": "short", "label": null, "logBase": 1, "max": null, "min": null, - "show": false + "show": true } ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } }, { "aliasColors": {}, "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", - "decimals": 1, + "description": "The time consumed when handling snapshots", "editable": true, "error": false, + "fieldConfig": { + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } + }, "fill": 1, - "grid": {}, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 
200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, "gridPos": { "h": 7, "w": 12, - "x": 0, - "y": 30 + "x": 12, + "y": 7 }, - "id": 74, + "height": null, + "hideTimeOverride": false, + "id": 244, + "interval": null, + "isNew": true, "legend": { "alignAsTable": true, "avg": false, "current": true, + "hideEmpty": true, + "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, - "sort": "current", + "sort": "max", "sortDesc": true, "total": false, "values": true @@ -25192,41 +33286,85 @@ "lines": true, "linewidth": 1, "links": [], + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, "nullPointMode": "null as zero", + "options": { + "alertThreshold": true, + "dataLinks": [] + }, "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", + "repeat": null, + "repeatDirection": null, "seriesOverrides": [], - "spaceLength": 10, + "span": null, "stack": false, "steppedLine": false, "targets": [ { - "expr": "sum(rate(tikv_coprocessor_request_duration_seconds_count{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])) by (req)", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "histogram_quantile(0.99,(\n sum(rate(\n tikv_server_send_snapshot_duration_seconds_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (le) \n \n \n)) ", "format": "time_series", + "hide": false, + "instant": false, "interval": "", - "intervalFactor": 2, - "legendFormat": "{{req}}", - "metric": "tikv_coprocessor_request_error", - "refId": "A", - "step": 4 + "intervalFactor": 1, + "legendFormat": "send", + "metric": "", + "query": "histogram_quantile(0.99,(\n sum(rate(\n tikv_server_send_snapshot_duration_seconds_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (le) \n \n \n)) ", + "refId": "", + "step": 10, + 
"target": "" + }, + { + "datasource": "${DS_TEST-CLUSTER}", + "expr": "histogram_quantile(0.99,(\n sum(rate(\n tikv_raftstore_snapshot_duration_seconds_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",type=\"apply\"}\n [$__rate_interval]\n)) by (le) \n \n \n)) ", + "format": "time_series", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "apply", + "metric": "", + "query": "histogram_quantile(0.99,(\n sum(rate(\n tikv_raftstore_snapshot_duration_seconds_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",type=\"apply\"}\n [$__rate_interval]\n)) by (le) \n \n \n)) ", + "refId": "", + "step": 10, + "target": "" + }, + { + "datasource": "${DS_TEST-CLUSTER}", + "expr": "histogram_quantile(0.99,(\n sum(rate(\n tikv_raftstore_snapshot_duration_seconds_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",type=\"generate\"}\n [$__rate_interval]\n)) by (le) \n \n \n)) ", + "format": "time_series", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "generate", + "metric": "", + "query": "histogram_quantile(0.99,(\n sum(rate(\n tikv_raftstore_snapshot_duration_seconds_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",type=\"generate\"}\n [$__rate_interval]\n)) by (le) \n \n \n)) ", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, - "title": "Total Requests", + "title": "99% Handle snapshot duration", "tooltip": { - "msResolution": false, + "msResolution": true, "shared": true, "sort": 0, "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -25234,56 +33372,76 @@ }, "yaxes": [ { - "decimals": 1, - "format": "ops", - "label": 
"", + "decimals": null, + "format": "s", + "label": null, "logBase": 1, "max": null, - "min": "0", + "min": null, "show": true }, { + "decimals": null, "format": "short", "label": null, "logBase": 1, "max": null, "min": null, - "show": false + "show": true } ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } }, { "aliasColors": {}, "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", - "decimals": 1, + "description": "The snapshot size (P99.99).9999", "editable": true, "error": false, + "fieldConfig": { + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } + }, "fill": 1, - "grid": {}, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, "gridPos": { "h": 7, "w": 12, - "x": 12, - "y": 30 + "x": 0, + "y": 14 }, - "id": 3128, + "height": null, + "hideTimeOverride": false, + "id": 245, + "interval": null, + "isNew": true, "legend": { "alignAsTable": true, "avg": false, "current": true, + "hideEmpty": true, + "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, - "sort": "current", + "sort": "max", "sortDesc": true, "total": false, "values": true @@ -25291,41 +33449,55 @@ "lines": true, "linewidth": 1, "links": [], + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, "nullPointMode": "null as zero", + "options": { + "alertThreshold": true, + "dataLinks": [] + }, "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", + "repeat": null, + "repeatDirection": null, "seriesOverrides": [], - "spaceLength": 10, + "span": null, "stack": false, "steppedLine": false, "targets": [ { - "expr": "sum(rate(tikv_coprocessor_request_error{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])) by (reason)", + "datasource": "${DS_TEST-CLUSTER}", + 
"expr": "histogram_quantile(0.9999,(\n sum(rate(\n tikv_snapshot_size_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (le) \n \n \n)) ", "format": "time_series", + "hide": false, + "instant": false, "interval": "", - "intervalFactor": 2, - "legendFormat": "{{reason}}", - "metric": "tikv_coprocessor_request_error", - "refId": "A", - "step": 4 + "intervalFactor": 1, + "legendFormat": "size", + "metric": "", + "query": "histogram_quantile(0.9999,(\n sum(rate(\n tikv_snapshot_size_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (le) \n \n \n)) ", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, - "title": "Total Request Errors", + "title": "99.99% Snapshot size", "tooltip": { - "msResolution": false, + "msResolution": true, "shared": true, "sort": 0, "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -25333,56 +33505,76 @@ }, "yaxes": [ { - "decimals": 1, - "format": "ops", + "decimals": null, + "format": "bytes", "label": null, "logBase": 1, "max": null, - "min": "0", + "min": null, "show": true }, { + "decimals": null, "format": "short", "label": null, "logBase": 1, "max": null, "min": null, - "show": false + "show": true } ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } }, { "aliasColors": {}, "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", - "decimals": 1, + "description": "The number of KV within a snapshot in .9999", "editable": true, "error": false, + "fieldConfig": { + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } + }, "fill": 1, - "grid": {}, + "fillGradient": 1, + "grid": { + "threshold1": null, + 
"threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, "gridPos": { "h": 7, "w": 12, - "x": 0, - "y": 37 + "x": 12, + "y": 14 }, - "id": 52, + "height": null, + "hideTimeOverride": false, + "id": 246, + "interval": null, + "isNew": true, "legend": { "alignAsTable": true, "avg": false, "current": true, + "hideEmpty": true, + "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, - "sort": "current", + "sort": "max", "sortDesc": true, "total": false, "values": true @@ -25390,38 +33582,55 @@ "lines": true, "linewidth": 1, "links": [], + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, "nullPointMode": "null as zero", + "options": { + "alertThreshold": true, + "dataLinks": [] + }, "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", + "repeat": null, + "repeatDirection": null, "seriesOverrides": [], - "spaceLength": 10, + "span": null, "stack": false, "steppedLine": false, "targets": [ { - "expr": "sum(rate(tikv_coprocessor_scan_keys_sum{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])) by (req)", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "histogram_quantile(0.9999,(\n sum(rate(\n tikv_snapshot_kv_count_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (le) \n \n \n)) ", "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{req}}", - "refId": "D" + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "count", + "metric": "", + "query": "histogram_quantile(0.9999,(\n sum(rate(\n tikv_snapshot_kv_count_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (le) \n \n \n)) ", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, - "timeRegions": [], 
"timeShift": null, - "title": "Total KV Cursor Operations", + "title": "99.99% Snapshot KV count", "tooltip": { - "msResolution": false, + "msResolution": true, "shared": true, "sort": 0, "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -25429,56 +33638,76 @@ }, "yaxes": [ { - "decimals": 0, - "format": "short", + "decimals": null, + "format": "none", "label": null, "logBase": 1, "max": null, - "min": "0", + "min": null, "show": true }, { + "decimals": null, "format": "short", "label": null, "logBase": 1, "max": null, "min": null, - "show": false + "show": true } ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } }, { "aliasColors": {}, "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", - "decimals": 1, + "description": "Action stats for snapshot generating and applying", "editable": true, "error": false, + "fieldConfig": { + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } + }, "fill": 1, - "grid": {}, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, "gridPos": { "h": 7, "w": 12, - "x": 12, - "y": 37 + "x": 0, + "y": 21 }, - "id": 3129, + "height": null, + "hideTimeOverride": false, + "id": 247, + "interval": null, + "isNew": true, "legend": { "alignAsTable": true, "avg": false, "current": true, + "hideEmpty": true, + "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, - "sort": "current", + "sort": "max", "sortDesc": true, "total": false, "values": true @@ -25486,45 +33715,70 @@ "lines": true, "linewidth": 1, "links": [], + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, "nullPointMode": "null as zero", + "options": { + "alertThreshold": true, + 
"dataLinks": [] + }, "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", + "repeat": null, + "repeatDirection": null, "seriesOverrides": [], - "spaceLength": 10, + "span": null, "stack": false, "steppedLine": false, "targets": [ { - "expr": "histogram_quantile(1, avg(rate(tikv_coprocessor_scan_keys_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])) by (le, req)) ", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(delta(\n tikv_raftstore_snapshot_total\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [1m]\n)) by (type, status) ", "format": "time_series", - "intervalFactor": 2, - "legendFormat": "100%-{{req}}", - "refId": "D" + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{type}}-{{status}}", + "metric": "", + "query": "sum(delta(\n tikv_raftstore_snapshot_total\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [1m]\n)) by (type, status) ", + "refId": "", + "step": 10, + "target": "" }, { - "expr": "histogram_quantile(0.99, avg(rate(tikv_coprocessor_scan_keys_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])) by (le, req)) ", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(delta(\n tikv_raftstore_clean_region_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [1m]\n)) by (type, status) ", "format": "time_series", - "intervalFactor": 2, - "legendFormat": "99%-{{req}}", - "refId": "A" + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "clean-region-by-{{type}}", + "metric": "", + "query": "sum(delta(\n tikv_raftstore_clean_region_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [1m]\n)) by (type, status) ", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], 
"timeFrom": null, - "timeRegions": [], "timeShift": null, - "title": "KV Cursor Operations", + "title": "Snapshot Actions", "tooltip": { - "msResolution": false, + "msResolution": true, "shared": true, "sort": 0, "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -25532,162 +33786,76 @@ }, "yaxes": [ { - "decimals": 0, - "format": "short", + "decimals": null, + "format": "opm", "label": null, "logBase": 1, "max": null, - "min": "0", + "min": null, "show": true }, { + "decimals": null, "format": "short", "label": null, "logBase": 1, "max": null, "min": null, - "show": false + "show": true } ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } }, { "aliasColors": {}, "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", - "decimals": 1, - "description": "", + "description": "The speed of sending or receiving snapshot", "editable": true, "error": false, - "fill": 1, - "grid": {}, - "gridPos": { - "h": 7, - "w": 12, - "x": 0, - "y": 44 - }, - "id": 2118, - "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "max": true, - "min": false, - "rightSide": true, - "show": true, - "sideWidth": null, - "sort": "current", - "sortDesc": true, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "links": [], - "nullPointMode": "null as zero", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - { - "alias": "key_skipped", - "yaxis": 2 - } - ], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "sum(rate(tikv_coprocessor_rocksdb_perf{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\" ,metric=\"internal_delete_skipped_count\"}[1m])) by (req)", - "format": "time_series", - "interval": "", - "intervalFactor": 2, - 
"legendFormat": "delete_skipped-{{req}}", - "metric": "scan_details", - "refId": "B", - "step": 4 + "fieldConfig": { + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "Total RocksDB Perf Statistics", - "tooltip": { - "msResolution": false, - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] }, - "yaxes": [ - { - "decimals": 1, - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": "0", - "show": true - }, - { - "decimals": null, - "format": "short", - "label": "", - "logBase": 1, - "max": null, - "min": null, - "show": false - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${DS_TEST-CLUSTER}", - "decimals": 1, - "editable": true, - "error": false, "fill": 1, - "grid": {}, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, "gridPos": { "h": 7, "w": 12, "x": 12, - "y": 44 + "y": 21 }, - "id": 551, + "height": null, + "hideTimeOverride": false, + "id": 248, + "interval": null, + "isNew": true, "legend": { "alignAsTable": true, "avg": false, "current": true, + "hideEmpty": true, + "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, - "sort": "current", + "sort": "max", "sortDesc": true, "total": false, "values": true @@ -25695,41 +33863,70 @@ "lines": true, "linewidth": 1, "links": [], + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, "nullPointMode": "null as zero", + "options": { + "alertThreshold": true, + "dataLinks": [] + }, "percentage": false, "pointradius": 5, "points": false, "renderer": 
"flot", + "repeat": null, + "repeatDirection": null, "seriesOverrides": [], - "spaceLength": 10, + "span": null, "stack": false, "steppedLine": false, "targets": [ { - "expr": "sum(rate(tikv_coprocessor_response_bytes{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m]))", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_snapshot_limit_transport_bytes\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (instance, type) ", "format": "time_series", + "hide": false, + "instant": false, "interval": "", - "intervalFactor": 2, - "legendFormat": "size", + "intervalFactor": 1, + "legendFormat": "{{instance}}-{{type}}", + "metric": "", + "query": "sum(rate(\n tikv_snapshot_limit_transport_bytes\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (instance, type) ", + "refId": "", + "step": 10, + "target": "" + }, + { + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_snapshot_limit_generate_bytes\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (instance) ", + "format": "time_series", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{instance}}-generate", "metric": "", - "refId": "A", - "step": 4 + "query": "sum(rate(\n tikv_snapshot_limit_generate_bytes\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (instance) ", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, - "title": "Total Response Size", + "title": "Snapshot transport speed", "tooltip": { - "msResolution": false, + "msResolution": true, "shared": true, "sort": 0, "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", 
"xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -25737,72 +33934,118 @@ }, "yaxes": [ { - "decimals": 0, - "format": "decbytes", + "decimals": null, + "format": "binBps", "label": null, "logBase": 1, "max": null, - "min": "0", + "min": null, "show": true }, { + "decimals": null, "format": "short", "label": null, "logBase": 1, "max": null, "min": null, - "show": false + "show": true } ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } } ], "repeat": null, - "title": "Coprocessor Overview", + "repeatDirection": null, + "span": null, + "targets": [], + "timeFrom": null, + "timeShift": null, + "title": "Snapshot", + "transformations": [], + "transparent": false, "type": "row" }, { + "cacheTimeout": null, "collapsed": true, "datasource": null, + "description": null, + "editable": true, + "error": false, + "fieldConfig": { + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } + }, "gridPos": { - "h": 1, + "h": 7, "w": 24, "x": 0, - "y": 31 + "y": 0 }, - "id": 3197, + "height": null, + "hideTimeOverride": false, + "id": 249, + "interval": null, + "links": [], + "maxDataPoints": 100, + "maxPerRow": null, + "minSpan": null, "panels": [ { "aliasColors": {}, "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", - "decimals": 1, - "description": "The time consumed when handling coprocessor requests", + "description": "The number of tasks handled by worker", "editable": true, "error": false, + "fieldConfig": { + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } + }, "fill": 1, - "grid": {}, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, "gridPos": { "h": 7, "w": 12, "x": 0, - "y": 23 + "y": 0 }, - "id": 113, + "height": null, + "hideTimeOverride": false, + "id": 250, + "interval": null, + 
"isNew": true, "legend": { "alignAsTable": true, "avg": false, "current": true, + "hideEmpty": true, + "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, - "sort": "current", + "sideWidth": null, + "sort": "max", "sortDesc": true, "total": false, "values": true @@ -25810,46 +34053,55 @@ "lines": true, "linewidth": 1, "links": [], + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, "nullPointMode": "null as zero", + "options": { + "alertThreshold": true, + "dataLinks": [] + }, "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", + "repeat": null, + "repeatDirection": null, "seriesOverrides": [], - "spaceLength": 10, + "span": null, "stack": false, "steppedLine": false, "targets": [ { - "expr": "histogram_quantile(1, sum(rate(tikv_coprocessor_request_handle_seconds_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])) by (le,req))", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{req}}-100%", - "refId": "E" - }, - { - "expr": "histogram_quantile(0.99, sum(rate(tikv_coprocessor_request_handle_seconds_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])) by (le,req))", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_worker_handled_task_total\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (name) ", "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{req}}-99%", - "refId": "A", - "step": 4 + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{name}}", + "metric": "", + "query": "sum(rate(\n tikv_worker_handled_task_total\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (name) ", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, - "timeRegions": [], 
"timeShift": null, - "title": "Handle duration", + "title": "Worker handled tasks", "tooltip": { - "msResolution": false, + "msResolution": true, "shared": true, - "sort": 1, - "value_type": "cumulative" + "sort": 0, + "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -25857,58 +34109,76 @@ }, "yaxes": [ { - "decimals": 1, - "format": "s", - "label": "", + "decimals": null, + "format": "ops", + "label": null, "logBase": 1, "max": null, - "min": "0", + "min": null, "show": true }, { - "decimals": 1, + "decimals": null, "format": "short", "label": null, "logBase": 1, "max": null, "min": null, - "show": false + "show": true } ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } }, { "aliasColors": {}, "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", - "decimals": 1, - "description": "The time consumed to handle coprocessor requests per TiKV instance (P95)", + "description": "Current pending and running tasks of worker", "editable": true, "error": false, + "fieldConfig": { + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } + }, "fill": 1, - "grid": {}, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, "gridPos": { "h": 7, "w": 12, "x": 12, - "y": 23 + "y": 0 }, - "id": 117, + "height": null, + "hideTimeOverride": false, + "id": 251, + "interval": null, + "isNew": true, "legend": { "alignAsTable": true, "avg": false, "current": true, + "hideEmpty": true, + "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, - "sort": "current", + "sort": "max", "sortDesc": true, "total": false, "values": true @@ -25916,39 +34186,55 @@ "lines": true, "linewidth": 1, "links": [], - 
"nullPointMode": "null", + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", + "options": { + "alertThreshold": true, + "dataLinks": [] + }, "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", + "repeat": null, + "repeatDirection": null, "seriesOverrides": [], - "spaceLength": 10, + "span": null, "stack": false, "steppedLine": false, "targets": [ { - "expr": "histogram_quantile(0.95, sum(rate(tikv_coprocessor_request_handle_seconds_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])) by (le, instance,req))", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum((\n tikv_worker_pending_task_total\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n \n)) by (name) ", "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{instance}}-{{req}}", - "refId": "B", - "step": 4 + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{name}}", + "metric": "", + "query": "sum((\n tikv_worker_pending_task_total\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n \n)) by (name) ", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, - "title": "95% Handle duration by store", + "title": "Worker pending tasks", "tooltip": { - "msResolution": false, + "msResolution": true, "shared": true, "sort": 0, - "value_type": "cumulative" + "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -25956,56 +34242,76 @@ }, "yaxes": [ { - "decimals": 1, - "format": "s", + "decimals": null, + "format": "none", "label": null, "logBase": 1, "max": null, - "min": "0", + "min": null, "show": true }, { + "decimals": null, "format": "short", "label": null, "logBase": 1, 
"max": null, "min": null, - "show": false + "show": true } ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } }, { "aliasColors": {}, "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", - "decimals": 1, - "description": "The time consumed when coprocessor requests are wait for being handled", + "description": "The number of tasks handled by future_pool", "editable": true, "error": false, + "fieldConfig": { + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } + }, "fill": 1, - "grid": {}, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, "gridPos": { "h": 7, "w": 12, "x": 0, - "y": 30 + "y": 7 }, - "id": 111, + "height": null, + "hideTimeOverride": false, + "id": 252, + "interval": null, + "isNew": true, "legend": { "alignAsTable": true, "avg": false, "current": true, + "hideEmpty": true, + "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, - "sort": "current", + "sideWidth": null, + "sort": "max", "sortDesc": true, "total": false, "values": true @@ -26013,46 +34319,55 @@ "lines": true, "linewidth": 1, "links": [], + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, "nullPointMode": "null as zero", + "options": { + "alertThreshold": true, + "dataLinks": [] + }, "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", + "repeat": null, + "repeatDirection": null, "seriesOverrides": [], - "spaceLength": 10, + "span": null, "stack": false, "steppedLine": false, "targets": [ { - "expr": "histogram_quantile(1, sum(rate(tikv_coprocessor_request_wait_seconds_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])) by (le,req))", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_futurepool_handled_task_total\n 
{k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (name) ", "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{req}}-100%", - "refId": "D" - }, - { - "expr": "histogram_quantile(0.99, sum(rate(tikv_coprocessor_request_wait_seconds_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])) by (le,req))", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{req}}-99%", - "refId": "A", - "step": 4 + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{name}}", + "metric": "", + "query": "sum(rate(\n tikv_futurepool_handled_task_total\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (name) ", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, - "title": "Wait duration", + "title": "FuturePool handled tasks", "tooltip": { - "msResolution": false, + "msResolution": true, "shared": true, "sort": 0, - "value_type": "cumulative" + "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -26060,56 +34375,76 @@ }, "yaxes": [ { - "decimals": 1, - "format": "s", + "decimals": null, + "format": "ops", "label": null, "logBase": 1, "max": null, - "min": "0", + "min": null, "show": true }, { + "decimals": null, "format": "short", "label": null, "logBase": 1, "max": null, "min": null, - "show": false + "show": true } ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } }, { "aliasColors": {}, "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", - "decimals": 1, - "description": "The time consumed when coprocessor requests are wait for being handled in each TiKV instance", 
+ "description": "Current pending and running tasks of future_pool", "editable": true, "error": false, + "fieldConfig": { + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } + }, "fill": 1, - "grid": {}, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, "gridPos": { "h": 7, "w": 12, "x": 12, - "y": 30 + "y": 7 }, - "id": 116, + "height": null, + "hideTimeOverride": false, + "id": 253, + "interval": null, + "isNew": true, "legend": { "alignAsTable": true, "avg": false, "current": true, + "hideEmpty": true, + "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, - "sort": "current", + "sideWidth": null, + "sort": "max", "sortDesc": true, "total": false, "values": true @@ -26117,39 +34452,55 @@ "lines": true, "linewidth": 1, "links": [], - "nullPointMode": "null", + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", + "options": { + "alertThreshold": true, + "dataLinks": [] + }, "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", + "repeat": null, + "repeatDirection": null, "seriesOverrides": [], - "spaceLength": 10, + "span": null, "stack": false, "steppedLine": false, "targets": [ { - "expr": "histogram_quantile(0.95, sum(rate(tikv_coprocessor_request_wait_seconds_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])) by (le, instance,req))", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(avg_over_time(\n tikv_futurepool_pending_task_total\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [1m]\n)) by (name) ", "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{instance}}-{{req}}", - "refId": "B", - "step": 4 + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": 
"{{name}}", + "metric": "", + "query": "sum(avg_over_time(\n tikv_futurepool_pending_task_total\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [1m]\n)) by (name) ", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, - "title": "95% Wait duration by store", + "title": "FuturePool pending tasks", "tooltip": { - "msResolution": false, + "msResolution": true, "shared": true, "sort": 0, - "value_type": "cumulative" + "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -26157,157 +34508,222 @@ }, "yaxes": [ { - "decimals": 1, - "format": "s", + "decimals": null, + "format": "none", "label": null, "logBase": 1, "max": null, - "min": "0", + "min": null, "show": true }, { - "decimals": 1, + "decimals": null, "format": "short", "label": null, "logBase": 1, "max": null, "min": null, - "show": false + "show": true } ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } - }, + } + ], + "repeat": null, + "repeatDirection": null, + "span": null, + "targets": [], + "timeFrom": null, + "timeShift": null, + "title": "Task", + "transformations": [], + "transparent": false, + "type": "row" + }, + { + "cacheTimeout": null, + "collapsed": true, + "datasource": null, + "description": null, + "editable": true, + "error": false, + "fieldConfig": { + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } + }, + "gridPos": { + "h": 7, + "w": 24, + "x": 0, + "y": 0 + }, + "height": null, + "hideTimeOverride": false, + "id": 254, + "interval": null, + "links": [], + "maxDataPoints": 100, + "maxPerRow": null, + "minSpan": null, + "panels": [ { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, + "cards": { + "cardPadding": null, + "cardRound": null + }, + "color": { + 
"cardColor": "#b4ff00", + "colorScale": "sqrt", + "colorScheme": "interpolateSpectral", + "exponent": 0.5, + "max": null, + "min": null, + "mode": "spectrum" + }, + "dataFormat": "tsbuckets", "datasource": "${DS_TEST-CLUSTER}", - "decimals": 1, + "description": "The time consumed to handle coprocessor read requests", "editable": true, "error": false, - "fill": 1, - "grid": {}, + "fieldConfig": { + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } + }, "gridPos": { "h": 7, "w": 12, "x": 0, - "y": 37 + "y": 0 }, - "id": 3195, + "heatmap": {}, + "height": null, + "hideTimeOverride": false, + "hideZeroBuckets": true, + "highlightCards": true, + "id": 255, + "interval": null, "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "max": true, - "min": false, - "rightSide": true, - "show": true, - "sideWidth": null, - "sort": "current", - "sortDesc": true, - "total": false, - "values": true + "show": false }, - "lines": true, - "linewidth": 1, "links": [], - "nullPointMode": "null as zero", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "spaceLength": 10, - "stack": false, - "steppedLine": false, + "maxDataPoints": 512, + "maxPerRow": null, + "minSpan": null, + "options": {}, + "repeat": null, + "repeatDirection": null, + "reverseYBuckets": false, + "span": null, "targets": [ { - "expr": "sum(rate(tikv_coprocessor_dag_request_count{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])) by (vec_type)", - "format": "time_series", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_coprocessor_request_duration_seconds_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (le) ", + "format": "heatmap", + "hide": false, + "instant": false, "interval": "", - "intervalFactor": 2, - "legendFormat": "{{vec_type}}", + "intervalFactor": 1, + "legendFormat": 
"{{le}}", "metric": "", - "refId": "A", - "step": 4 + "query": "sum(rate(\n tikv_coprocessor_request_duration_seconds_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (le) ", + "refId": "", + "step": 10, + "target": "" } ], - "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, - "title": "Total DAG Requests", + "title": "Request duration", "tooltip": { - "msResolution": false, + "msResolution": true, "shared": true, + "showHistogram": true, "sort": 0, "value_type": "individual" }, - "type": "graph", - "xaxis": { - "buckets": null, + "transformations": [], + "transparent": false, + "type": "heatmap", + "xAxis": { "mode": "time", "name": null, "show": true, "values": [] }, - "yaxes": [ - { - "decimals": 1, - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": "0", - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": false - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } + "xBucketNumber": null, + "xBucketSize": null, + "yAxis": { + "decimals": 1, + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + "yBucketBound": "upper", + "yBucketNumber": null, + "yBucketSize": null }, { "aliasColors": {}, "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", - "decimals": 1, - "description": "The total number of DAG executors", + "description": "The time consumed to handle coprocessor read requests", "editable": true, "error": false, + "fieldConfig": { + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } + }, "fill": 1, - "grid": {}, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, "gridPos": { "h": 7, "w": 12, "x": 12, 
- "y": 37 + "y": 0 }, - "id": 3264, + "height": null, + "hideTimeOverride": false, + "id": 256, + "interval": null, + "isNew": true, "legend": { "alignAsTable": true, "avg": false, "current": true, + "hideEmpty": true, + "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, - "sort": "current", + "sort": "max", "sortDesc": true, "total": false, "values": true @@ -26315,41 +34731,123 @@ "lines": true, "linewidth": 1, "links": [], - "nullPointMode": "null", + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", + "options": { + "alertThreshold": true, + "dataLinks": [] + }, "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", - "seriesOverrides": [], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ + "repeat": null, + "repeatDirection": null, + "seriesOverrides": [ { - "expr": "sum(rate(tikv_coprocessor_executor_count{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])) by (type)", + "alias": "count", + "bars": false, + "dashLength": 1, + "dashes": true, + "fill": 2, + "fillBelowTo": null, + "lines": true, + "spaceLength": 1, + "transform": "negative-Y", + "yaxis": 2, + "zindex": -3 + }, + { + "alias": "avg", + "bars": false, + "fill": 7, + "fillBelowTo": null, + "lines": true, + "yaxis": 1, + "zindex": 0 + } + ], + "span": null, + "stack": false, + "steppedLine": false, + "targets": [ + { + "datasource": "${DS_TEST-CLUSTER}", + "expr": "histogram_quantile(0.9999,(\n sum(rate(\n tikv_coprocessor_request_duration_seconds_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (req, le) \n \n \n)) ", "format": "time_series", + "hide": false, + "instant": false, "interval": "", - "intervalFactor": 2, - "legendFormat": "{{type}}", + "intervalFactor": 1, + "legendFormat": "99.99%-{{req}}", + "metric": "", + "query": 
"histogram_quantile(0.9999,(\n sum(rate(\n tikv_coprocessor_request_duration_seconds_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (req, le) \n \n \n)) ", + "refId": "", + "step": 10, + "target": "" + }, + { + "datasource": "${DS_TEST-CLUSTER}", + "expr": "histogram_quantile(0.99,(\n sum(rate(\n tikv_coprocessor_request_duration_seconds_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (req, le) \n \n \n)) ", + "format": "time_series", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "99%-{{req}}", + "metric": "", + "query": "histogram_quantile(0.99,(\n sum(rate(\n tikv_coprocessor_request_duration_seconds_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (req, le) \n \n \n)) ", + "refId": "", + "step": 10, + "target": "" + }, + { + "datasource": "${DS_TEST-CLUSTER}", + "expr": "(sum(rate(\n tikv_coprocessor_request_duration_seconds_sum\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (req) / sum(rate(\n tikv_coprocessor_request_duration_seconds_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (req) )", + "format": "time_series", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "avg-{{req}}", + "metric": "", + "query": "(sum(rate(\n tikv_coprocessor_request_duration_seconds_sum\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (req) / sum(rate(\n tikv_coprocessor_request_duration_seconds_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (req) )", + "refId": "", + "step": 10, + 
"target": "" + }, + { + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_coprocessor_request_duration_seconds_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (req) ", + "format": "time_series", + "hide": true, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "count-{{req}}", "metric": "", - "refId": "A", - "step": 4 + "query": "sum(rate(\n tikv_coprocessor_request_duration_seconds_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (req) ", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, - "title": "Total DAG Executors", + "title": "Request duration", "tooltip": { - "msResolution": false, + "msResolution": true, "shared": true, "sort": 0, "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -26357,56 +34855,76 @@ }, "yaxes": [ { - "decimals": 1, - "format": "short", + "decimals": null, + "format": "s", "label": null, "logBase": 1, "max": null, - "min": "0", + "min": null, "show": true }, { + "decimals": null, "format": "short", "label": null, "logBase": 1, "max": null, "min": null, - "show": false + "show": true } ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } }, { "aliasColors": {}, "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", - "decimals": 1, + "description": null, "editable": true, "error": false, + "fieldConfig": { + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } + }, "fill": 1, - "grid": {}, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 
0.22)" + }, "gridPos": { - "h": 9, + "h": 7, "w": 12, "x": 0, - "y": 44 + "y": 7 }, - "id": 552, + "height": null, + "hideTimeOverride": false, + "id": 257, + "interval": null, + "isNew": true, "legend": { "alignAsTable": true, "avg": false, "current": true, + "hideEmpty": true, + "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, - "sort": "current", + "sort": "max", "sortDesc": true, "total": false, "values": true @@ -26414,41 +34932,55 @@ "lines": true, "linewidth": 1, "links": [], + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, "nullPointMode": "null as zero", + "options": { + "alertThreshold": true, + "dataLinks": [] + }, "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", + "repeat": null, + "repeatDirection": null, "seriesOverrides": [], - "spaceLength": 10, + "span": null, "stack": false, "steppedLine": false, "targets": [ { - "expr": "sum(rate(tikv_coprocessor_scan_details{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", req=\"select\"}[1m])) by (tag)", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_coprocessor_request_duration_seconds_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (req) ", "format": "time_series", + "hide": false, + "instant": false, "interval": "", - "intervalFactor": 2, - "legendFormat": "{{tag}}", - "metric": "scan_details", - "refId": "B", - "step": 4 + "intervalFactor": 1, + "legendFormat": "{{req}}", + "metric": "", + "query": "sum(rate(\n tikv_coprocessor_request_duration_seconds_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (req) ", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, - "title": "Total Ops Details (Table Scan)", + "title": "Total Requests", "tooltip": { 
- "msResolution": false, + "msResolution": true, "shared": true, "sort": 0, "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -26456,56 +34988,76 @@ }, "yaxes": [ { - "decimals": 1, - "format": "short", + "decimals": null, + "format": "ops", "label": null, "logBase": 1, "max": null, - "min": "0", + "min": null, "show": true }, { + "decimals": null, "format": "short", "label": null, "logBase": 1, "max": null, "min": null, - "show": false + "show": true } ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } }, { "aliasColors": {}, "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", - "decimals": 1, + "description": null, "editable": true, "error": false, + "fieldConfig": { + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } + }, "fill": 1, - "grid": {}, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, "gridPos": { - "h": 9, + "h": 7, "w": 12, "x": 12, - "y": 44 + "y": 7 }, - "id": 3263, + "height": null, + "hideTimeOverride": false, + "id": 258, + "interval": null, + "isNew": true, "legend": { "alignAsTable": true, "avg": false, "current": true, + "hideEmpty": true, + "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, - "sort": "current", + "sort": "max", "sortDesc": true, "total": false, "values": true @@ -26513,41 +35065,55 @@ "lines": true, "linewidth": 1, "links": [], + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, "nullPointMode": "null as zero", + "options": { + "alertThreshold": true, + "dataLinks": [] + }, "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", + "repeat": null, + "repeatDirection": null, "seriesOverrides": 
[], - "spaceLength": 10, + "span": null, "stack": false, "steppedLine": false, "targets": [ { - "expr": "sum(rate(tikv_coprocessor_scan_details{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", req=\"index\"}[1m])) by (tag)", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_coprocessor_request_error\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (reason) ", "format": "time_series", + "hide": false, + "instant": false, "interval": "", - "intervalFactor": 2, - "legendFormat": "{{tag}}", - "metric": "scan_details", - "refId": "B", - "step": 4 + "intervalFactor": 1, + "legendFormat": "{{reason}}", + "metric": "", + "query": "sum(rate(\n tikv_coprocessor_request_error\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (reason) ", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, - "title": "Total Ops Details (Index Scan)", + "title": "Total Request Errors", "tooltip": { - "msResolution": false, + "msResolution": true, "shared": true, "sort": 0, "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -26555,56 +35121,76 @@ }, "yaxes": [ { - "decimals": 1, - "format": "short", + "decimals": null, + "format": "ops", "label": null, "logBase": 1, "max": null, - "min": "0", + "min": null, "show": true }, { + "decimals": null, "format": "short", "label": null, "logBase": 1, "max": null, "min": null, - "show": false + "show": true } ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } }, { "aliasColors": {}, "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", - "decimals": 1, + "description": null, "editable": true, "error": 
false, + "fieldConfig": { + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } + }, "fill": 1, - "grid": {}, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, "gridPos": { - "h": 9, + "h": 7, "w": 12, "x": 0, - "y": 53 + "y": 14 }, - "id": 122, + "height": null, + "hideTimeOverride": false, + "id": 259, + "interval": null, + "isNew": true, "legend": { "alignAsTable": true, "avg": false, "current": true, + "hideEmpty": true, + "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, - "sort": "current", + "sort": "max", "sortDesc": true, "total": false, "values": true @@ -26612,42 +35198,55 @@ "lines": true, "linewidth": 1, "links": [], - "nullPointMode": "null", + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", + "options": { + "alertThreshold": true, + "dataLinks": [] + }, "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "repeat": null, + "repeatDirection": null, "seriesOverrides": [], - "spaceLength": 10, + "span": null, "stack": false, "steppedLine": false, "targets": [ { - "expr": "sum(rate(tikv_coprocessor_scan_details{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", req=\"select\"}[1m])) by (tag,cf)", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_coprocessor_scan_keys_sum\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (req) ", "format": "time_series", + "hide": false, + "instant": false, "interval": "", - "intervalFactor": 2, - "legendFormat": "{{cf}}-{{tag}}", - "metric": "scan_details", - "refId": "B", - "step": 4 + "intervalFactor": 1, + "legendFormat": "{{req}}", + "metric": "", + "query": "sum(rate(\n tikv_coprocessor_scan_keys_sum\n 
{k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (req) ", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, - "title": "Total Ops Details by CF (Table Scan)", + "title": "KV Cursor Operations", "tooltip": { - "msResolution": false, + "msResolution": true, "shared": true, "sort": 0, "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -26655,56 +35254,76 @@ }, "yaxes": [ { - "decimals": 1, - "format": "short", + "decimals": null, + "format": "none", "label": null, "logBase": 1, "max": null, - "min": "0", + "min": null, "show": true }, { + "decimals": null, "format": "short", "label": null, "logBase": 1, "max": null, "min": null, - "show": false + "show": true } ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } }, { "aliasColors": {}, "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", - "decimals": 1, + "description": "", "editable": true, "error": false, + "fieldConfig": { + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } + }, "fill": 1, - "grid": {}, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, "gridPos": { - "h": 9, + "h": 7, "w": 12, "x": 12, - "y": 53 + "y": 14 }, - "id": 554, + "height": null, + "hideTimeOverride": false, + "id": 260, + "interval": null, + "isNew": true, "legend": { "alignAsTable": true, "avg": false, "current": true, + "hideEmpty": true, + "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, - "sort": "current", + "sort": "max", "sortDesc": true, "total": false, "values": true @@ -26712,43 +35331,123 @@ 
"lines": true, "linewidth": 1, "links": [], - "nullPointMode": "null", + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", + "options": { + "alertThreshold": true, + "dataLinks": [] + }, "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", - "repeat": "cf", - "repeatDirection": "h", - "seriesOverrides": [], - "spaceLength": 10, + "repeat": null, + "repeatDirection": null, + "seriesOverrides": [ + { + "alias": "count", + "bars": false, + "dashLength": 1, + "dashes": true, + "fill": 2, + "fillBelowTo": null, + "lines": true, + "spaceLength": 1, + "transform": "negative-Y", + "yaxis": 2, + "zindex": -3 + }, + { + "alias": "avg", + "bars": false, + "fill": 7, + "fillBelowTo": null, + "lines": true, + "yaxis": 1, + "zindex": 0 + } + ], + "span": null, "stack": false, "steppedLine": false, "targets": [ { - "expr": "sum(rate(tikv_coprocessor_scan_details{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", req=\"index\"}[1m])) by (tag,cf)", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "histogram_quantile(0.9999,(\n sum(rate(\n tikv_coprocessor_scan_keys_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (req, le) \n \n \n)) ", "format": "time_series", + "hide": false, + "instant": false, "interval": "", - "intervalFactor": 2, - "legendFormat": "{{cf}}-{{tag}}", - "metric": "scan_details", - "refId": "B", - "step": 4 + "intervalFactor": 1, + "legendFormat": "99.99%-{{req}}", + "metric": "", + "query": "histogram_quantile(0.9999,(\n sum(rate(\n tikv_coprocessor_scan_keys_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (req, le) \n \n \n)) ", + "refId": "", + "step": 10, + "target": "" + }, + { + "datasource": "${DS_TEST-CLUSTER}", + "expr": "histogram_quantile(0.99,(\n sum(rate(\n tikv_coprocessor_scan_keys_bucket\n 
{k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (req, le) \n \n \n)) ", + "format": "time_series", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "99%-{{req}}", + "metric": "", + "query": "histogram_quantile(0.99,(\n sum(rate(\n tikv_coprocessor_scan_keys_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (req, le) \n \n \n)) ", + "refId": "", + "step": 10, + "target": "" + }, + { + "datasource": "${DS_TEST-CLUSTER}", + "expr": "(sum(rate(\n tikv_coprocessor_scan_keys_sum\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (req) / sum(rate(\n tikv_coprocessor_scan_keys_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (req) )", + "format": "time_series", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "avg-{{req}}", + "metric": "", + "query": "(sum(rate(\n tikv_coprocessor_scan_keys_sum\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (req) / sum(rate(\n tikv_coprocessor_scan_keys_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (req) )", + "refId": "", + "step": 10, + "target": "" + }, + { + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_coprocessor_scan_keys_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (req) ", + "format": "time_series", + "hide": true, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "count-{{req}}", + "metric": "", + "query": "sum(rate(\n tikv_coprocessor_scan_keys_count\n 
{k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (req) ", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, - "title": "Total Ops Details by CF (Index Scan)", + "title": "KV Cursor Operations", "tooltip": { - "msResolution": false, + "msResolution": true, "shared": true, "sort": 0, "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -26756,144 +35455,76 @@ }, "yaxes": [ { - "decimals": 1, + "decimals": null, "format": "short", "label": null, "logBase": 1, "max": null, - "min": "0", + "min": null, "show": true }, { + "decimals": null, "format": "short", "label": null, "logBase": 1, "max": null, "min": null, - "show": false + "show": true } ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } }, - { - "cards": { - "cardPadding": 0, - "cardRound": 0 - }, - "color": { - "cardColor": "#5195ce", - "colorScale": "linear", - "colorScheme": "interpolateSpectral", - "exponent": 0.5, - "min": 0, - "mode": "spectrum" - }, - "dataFormat": "tsbuckets", - "datasource": "${DS_TEST-CLUSTER}", - "description": "The time consumed on checking memory locks for coprocessor requests", - "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 119 - }, - "heatmap": {}, - "hideZeroBuckets": true, - "highlightCards": true, - "id": 7594, - "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "max": true, - "min": false, - "rightSide": true, - "show": true, - "sort": "current", - "sortDesc": true, - "total": false, - "values": true - }, - "links": [], - "reverseYBuckets": false, - "targets": [ - { - "expr": "sum(rate(tikv_coprocessor_mem_lock_check_duration_seconds_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])) by (le)", - "format": "heatmap", - 
"instant": false, - "intervalFactor": 2, - "legendFormat": "{{le}}", - "refId": "A" - } - ], - "title": "Memory lock checking duration", - "tooltip": { - "show": true, - "showHistogram": true - }, - "tooltipDecimals": 1, - "type": "heatmap", - "xAxis": { - "show": true - }, - "xBucketNumber": null, - "xBucketSize": null, - "yAxis": { - "decimals": 0, - "format": "s", - "logBase": 1, - "max": null, - "min": null, - "show": true, - "splitFactor": null - }, - "yBucketBound": "upper", - "yBucketNumber": null, - "yBucketSize": null - } - ], - "title": "Coprocessor Detail", - "type": "row" - }, - { - "collapsed": true, - "datasource": null, - "gridPos": { - "h": 1, - "w": 24, - "x": 0, - "y": 32 - }, - "id": 2761, - "panels": [ { "aliasColors": {}, "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", - "decimals": 1, + "description": null, + "editable": true, + "error": false, + "fieldConfig": { + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } + }, "fill": 1, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, "gridPos": { "h": 7, "w": 12, "x": 0, - "y": 24 + "y": 21 }, - "id": 2108, + "height": null, + "hideTimeOverride": false, + "id": 261, + "interval": null, + "isNew": true, "legend": { "alignAsTable": true, "avg": false, "current": true, + "hideEmpty": true, + "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, - "sort": "current", + "sort": "max", "sortDesc": true, "total": false, "values": true @@ -26901,45 +35532,55 @@ "lines": true, "linewidth": 1, "links": [], - "nullPointMode": "null", + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", + "options": { + "alertThreshold": true, + "dataLinks": [] + }, "percentage": false, - "pointradius": 2, - "points": 
true, + "pointradius": 5, + "points": false, "renderer": "flot", + "repeat": null, + "repeatDirection": null, "seriesOverrides": [], - "spaceLength": 10, + "span": null, "stack": false, "steppedLine": false, "targets": [ { - "expr": "sum(tikv_threads_state{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}) by (instance, state)", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_coprocessor_rocksdb_perf\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",metric=\"internal_delete_skipped_count\"}\n [$__rate_interval]\n)) by (req) ", "format": "time_series", + "hide": false, + "instant": false, + "interval": "", "intervalFactor": 1, - "legendFormat": "{{instance}}-{{state}}", - "refId": "A", - "step": 4 - }, - { - "expr": "sum(tikv_threads_state{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}) by (instance)", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{instance}}-total", - "refId": "B" + "legendFormat": "delete_skipped-{{req}}", + "metric": "", + "query": "sum(rate(\n tikv_coprocessor_rocksdb_perf\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",metric=\"internal_delete_skipped_count\"}\n [$__rate_interval]\n)) by (req) ", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, - "title": "Threads state", + "title": "Total RocksDB Perf Statistics", "tooltip": { + "msResolution": true, "shared": true, "sort": 0, "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -26947,6 +35588,7 @@ }, "yaxes": [ { + "decimals": null, "format": "none", "label": null, "logBase": 1, @@ -26955,6 +35597,7 @@ "show": true }, { + "decimals": null, "format": "short", "label": null, "logBase": 1, @@ -26965,28 
+35608,50 @@ ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } }, { "aliasColors": {}, "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", - "decimals": 1, + "description": null, + "editable": true, + "error": false, + "fieldConfig": { + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } + }, "fill": 1, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, "gridPos": { "h": 7, "w": 12, "x": 12, - "y": 24 + "y": 21 }, - "id": 2258, + "height": null, + "hideTimeOverride": false, + "id": 262, + "interval": null, + "isNew": true, "legend": { "alignAsTable": true, "avg": false, "current": true, + "hideEmpty": true, + "hideZero": true, "max": true, "min": false, "rightSide": true, @@ -27000,40 +35665,55 @@ "lines": true, "linewidth": 1, "links": [], - "nullPointMode": "null", + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", + "options": { + "alertThreshold": true, + "dataLinks": [] + }, "percentage": false, - "pointradius": 2, - "points": true, + "pointradius": 5, + "points": false, "renderer": "flot", + "repeat": null, + "repeatDirection": null, "seriesOverrides": [], - "spaceLength": 10, + "span": null, "stack": false, "steppedLine": false, "targets": [ { - "expr": "topk(20, sum(rate(tikv_threads_io_bytes_total{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[30s])) by (name, io) > 1024)", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_coprocessor_response_bytes\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (instance) ", "format": "time_series", "hide": false, + "instant": false, "interval": "", "intervalFactor": 1, - "legendFormat": "{{name}}-{{io}}", - "refId": 
"A", - "step": 4 + "legendFormat": "{{instance}}", + "metric": "", + "query": "sum(rate(\n tikv_coprocessor_response_bytes\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (instance) ", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, - "title": "Threads IO", + "title": "Total Response Size", "tooltip": { + "msResolution": true, "shared": true, "sort": 0, "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -27041,7 +35721,8 @@ }, "yaxes": [ { - "format": "Bps", + "decimals": null, + "format": "bytes", "label": null, "logBase": 1, "max": null, @@ -27049,6 +35730,7 @@ "show": true }, { + "decimals": null, "format": "short", "label": null, "logBase": 1, @@ -27059,28 +35741,92 @@ ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } - }, + } + ], + "repeat": null, + "repeatDirection": null, + "span": null, + "targets": [], + "timeFrom": null, + "timeShift": null, + "title": "Coprocessor Overview", + "transformations": [], + "transparent": false, + "type": "row" + }, + { + "cacheTimeout": null, + "collapsed": true, + "datasource": null, + "description": null, + "editable": true, + "error": false, + "fieldConfig": { + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } + }, + "gridPos": { + "h": 7, + "w": 24, + "x": 0, + "y": 0 + }, + "height": null, + "hideTimeOverride": false, + "id": 263, + "interval": null, + "links": [], + "maxDataPoints": 100, + "maxPerRow": null, + "minSpan": null, + "panels": [ { "aliasColors": {}, "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", - "decimals": 1, + "description": "The time consumed when handling coprocessor requests", + "editable": true, + "error": false, + 
"fieldConfig": { + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } + }, "fill": 1, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, "gridPos": { "h": 7, "w": 12, "x": 0, - "y": 31 + "y": 0 }, - "id": 2660, + "height": null, + "hideTimeOverride": false, + "id": 264, + "interval": null, + "isNew": true, "legend": { "alignAsTable": true, "avg": false, "current": true, + "hideEmpty": true, + "hideZero": true, "max": true, "min": false, "rightSide": true, @@ -27094,40 +35840,123 @@ "lines": true, "linewidth": 1, "links": [], - "nullPointMode": "null", + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", + "options": { + "alertThreshold": true, + "dataLinks": [] + }, "percentage": false, - "pointradius": 2, - "points": true, + "pointradius": 5, + "points": false, "renderer": "flot", - "seriesOverrides": [], - "spaceLength": 10, + "repeat": null, + "repeatDirection": null, + "seriesOverrides": [ + { + "alias": "count", + "bars": false, + "dashLength": 1, + "dashes": true, + "fill": 2, + "fillBelowTo": null, + "lines": true, + "spaceLength": 1, + "transform": "negative-Y", + "yaxis": 2, + "zindex": -3 + }, + { + "alias": "avg", + "bars": false, + "fill": 7, + "fillBelowTo": null, + "lines": true, + "yaxis": 1, + "zindex": 0 + } + ], + "span": null, "stack": false, "steppedLine": false, "targets": [ { - "expr": "topk(20, max(rate(tikv_thread_voluntary_context_switches{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[30s])) by (name) > 200)", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "histogram_quantile(0.9999,(\n sum(rate(\n tikv_coprocessor_request_handle_seconds_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (req, le) \n \n \n)) ", "format": 
"time_series", "hide": false, + "instant": false, "interval": "", "intervalFactor": 1, - "legendFormat": "{{name}}", - "refId": "A", - "step": 4 + "legendFormat": "99.99%-{{req}}", + "metric": "", + "query": "histogram_quantile(0.9999,(\n sum(rate(\n tikv_coprocessor_request_handle_seconds_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (req, le) \n \n \n)) ", + "refId": "", + "step": 10, + "target": "" + }, + { + "datasource": "${DS_TEST-CLUSTER}", + "expr": "histogram_quantile(0.99,(\n sum(rate(\n tikv_coprocessor_request_handle_seconds_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (req, le) \n \n \n)) ", + "format": "time_series", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "99%-{{req}}", + "metric": "", + "query": "histogram_quantile(0.99,(\n sum(rate(\n tikv_coprocessor_request_handle_seconds_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (req, le) \n \n \n)) ", + "refId": "", + "step": 10, + "target": "" + }, + { + "datasource": "${DS_TEST-CLUSTER}", + "expr": "(sum(rate(\n tikv_coprocessor_request_handle_seconds_sum\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (req) / sum(rate(\n tikv_coprocessor_request_handle_seconds_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (req) )", + "format": "time_series", + "hide": true, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "avg-{{req}}", + "metric": "", + "query": "(sum(rate(\n tikv_coprocessor_request_handle_seconds_sum\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (req) / sum(rate(\n 
tikv_coprocessor_request_handle_seconds_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (req) )", + "refId": "", + "step": 10, + "target": "" + }, + { + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_coprocessor_request_handle_seconds_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (req) ", + "format": "time_series", + "hide": true, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "count-{{req}}", + "metric": "", + "query": "sum(rate(\n tikv_coprocessor_request_handle_seconds_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (req) ", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, - "title": "Thread Voluntary Context Switches", + "title": "Handle duration", "tooltip": { + "msResolution": true, "shared": true, "sort": 0, "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -27135,7 +35964,8 @@ }, "yaxes": [ { - "format": "none", + "decimals": null, + "format": "s", "label": null, "logBase": 1, "max": null, @@ -27143,6 +35973,7 @@ "show": true }, { + "decimals": null, "format": "short", "label": null, "logBase": 1, @@ -27153,28 +35984,50 @@ ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } }, { "aliasColors": {}, "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", - "decimals": 1, + "description": "The time consumed to handle coprocessor requests per TiKV instance", + "editable": true, + "error": false, + "fieldConfig": { + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } + }, "fill": 1, + 
"fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, "gridPos": { "h": 7, "w": 12, "x": 12, - "y": 31 + "y": 0 }, - "id": 2661, + "height": null, + "hideTimeOverride": false, + "id": 265, + "interval": null, + "isNew": true, "legend": { "alignAsTable": true, "avg": false, "current": true, + "hideEmpty": true, + "hideZero": true, "max": true, "min": false, "rightSide": true, @@ -27188,40 +36041,123 @@ "lines": true, "linewidth": 1, "links": [], - "nullPointMode": "null", + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", + "options": { + "alertThreshold": true, + "dataLinks": [] + }, "percentage": false, - "pointradius": 2, - "points": true, + "pointradius": 5, + "points": false, "renderer": "flot", - "seriesOverrides": [], - "spaceLength": 10, + "repeat": null, + "repeatDirection": null, + "seriesOverrides": [ + { + "alias": "count", + "bars": false, + "dashLength": 1, + "dashes": true, + "fill": 2, + "fillBelowTo": null, + "lines": true, + "spaceLength": 1, + "transform": "negative-Y", + "yaxis": 2, + "zindex": -3 + }, + { + "alias": "avg", + "bars": false, + "fill": 7, + "fillBelowTo": null, + "lines": true, + "yaxis": 1, + "zindex": 0 + } + ], + "span": null, "stack": false, "steppedLine": false, "targets": [ { - "expr": "topk(20, max(rate(tikv_thread_nonvoluntary_context_switches{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[30s])) by (name) > 100)", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "histogram_quantile(0.9999,(\n sum(rate(\n tikv_coprocessor_request_handle_seconds_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (req, instance, le) \n \n \n)) ", "format": "time_series", "hide": false, + "instant": false, "interval": "", "intervalFactor": 1, - "legendFormat": 
"{{name}}", - "refId": "A", - "step": 4 + "legendFormat": "99.99%-{{req}}-{{instance}}", + "metric": "", + "query": "histogram_quantile(0.9999,(\n sum(rate(\n tikv_coprocessor_request_handle_seconds_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (req, instance, le) \n \n \n)) ", + "refId": "", + "step": 10, + "target": "" + }, + { + "datasource": "${DS_TEST-CLUSTER}", + "expr": "histogram_quantile(0.99,(\n sum(rate(\n tikv_coprocessor_request_handle_seconds_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (req, instance, le) \n \n \n)) ", + "format": "time_series", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "99%-{{req}}-{{instance}}", + "metric": "", + "query": "histogram_quantile(0.99,(\n sum(rate(\n tikv_coprocessor_request_handle_seconds_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (req, instance, le) \n \n \n)) ", + "refId": "", + "step": 10, + "target": "" + }, + { + "datasource": "${DS_TEST-CLUSTER}", + "expr": "(sum(rate(\n tikv_coprocessor_request_handle_seconds_sum\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (req, instance) / sum(rate(\n tikv_coprocessor_request_handle_seconds_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (req, instance) )", + "format": "time_series", + "hide": true, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "avg-{{req}}-{{instance}}", + "metric": "", + "query": "(sum(rate(\n tikv_coprocessor_request_handle_seconds_sum\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (req, instance) / sum(rate(\n 
tikv_coprocessor_request_handle_seconds_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (req, instance) )", + "refId": "", + "step": 10, + "target": "" + }, + { + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_coprocessor_request_handle_seconds_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (req, instance) ", + "format": "time_series", + "hide": true, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "count-{{req}}-{{instance}}", + "metric": "", + "query": "sum(rate(\n tikv_coprocessor_request_handle_seconds_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (req, instance) ", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, - "title": "Thread Nonvoluntary Context Switches", + "title": "Handle duration by store", "tooltip": { + "msResolution": true, "shared": true, "sort": 0, "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -27229,7 +36165,8 @@ }, "yaxes": [ { - "format": "none", + "decimals": null, + "format": "s", "label": null, "logBase": 1, "max": null, @@ -27237,6 +36174,7 @@ "show": true }, { + "decimals": null, "format": "short", "label": null, "logBase": 1, @@ -27247,57 +36185,56 @@ ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } - } - ], - "repeat": null, - "title": "Threads", - "type": "row" - }, - { - "collapsed": true, - "datasource": null, - "gridPos": { - "h": 1, - "w": 24, - "x": 0, - "y": 33 - }, - "id": 2762, - "panels": [ + }, { "aliasColors": {}, "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", - 
"decimals": 1, - "description": "The count of get operations", + "description": "The time consumed when coprocessor requests are wait for being handled", + "editable": true, + "error": false, "fieldConfig": { - "defaults": {}, - "overrides": [] + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } }, "fill": 1, - "fillGradient": 0, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, "gridPos": { - "h": 8, + "h": 7, "w": 12, "x": 0, - "y": 59 + "y": 7 }, - "hiddenSeries": false, - "id": 138, + "height": null, + "hideTimeOverride": false, + "id": 266, + "interval": null, + "isNew": true, "legend": { "alignAsTable": true, "avg": false, "current": true, + "hideEmpty": true, + "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, - "sort": "current", + "sort": "max", "sortDesc": true, "total": false, "values": true @@ -27305,83 +36242,123 @@ "lines": true, "linewidth": 1, "links": [], - "nullPointMode": "null", + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", "options": { - "alertThreshold": true + "alertThreshold": true, + "dataLinks": [] }, "percentage": false, - "pluginVersion": "7.5.7", "pointradius": 5, "points": false, "renderer": "flot", - "scopedVars": { - "db": { - "selected": false, - "text": "kv", - "value": "kv" + "repeat": null, + "repeatDirection": null, + "seriesOverrides": [ + { + "alias": "count", + "bars": false, + "dashLength": 1, + "dashes": true, + "fill": 2, + "fillBelowTo": null, + "lines": true, + "spaceLength": 1, + "transform": "negative-Y", + "yaxis": 2, + "zindex": -3 + }, + { + "alias": "avg", + "bars": false, + "fill": 7, + "fillBelowTo": null, + "lines": true, + "yaxis": 1, + "zindex": 0 } - }, - "seriesOverrides": [], - "spaceLength": 10, + ], + "span": null, "stack": false, "steppedLine": false, 
"targets": [ { - "expr": "sum(rate(tikv_engine_memtable_efficiency{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", db=\"$db\", type=\"memtable_hit\"}[1m]))", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "histogram_quantile(0.9999,(\n sum(rate(\n tikv_coprocessor_request_wait_seconds_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",type=\"all\"}\n [$__rate_interval]\n)) by (req, le) \n \n \n)) ", "format": "time_series", - "intervalFactor": 2, - "legendFormat": "memtable", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "99.99%-{{req}}", "metric": "", - "refId": "B", - "step": 10 + "query": "histogram_quantile(0.9999,(\n sum(rate(\n tikv_coprocessor_request_wait_seconds_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",type=\"all\"}\n [$__rate_interval]\n)) by (req, le) \n \n \n)) ", + "refId": "", + "step": 10, + "target": "" }, { - "expr": "sum(rate(tikv_engine_cache_efficiency{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", db=\"$db\", type=~\"block_cache_data_hit|block_cache_filter_hit\"}[1m]))", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "histogram_quantile(0.99,(\n sum(rate(\n tikv_coprocessor_request_wait_seconds_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",type=\"all\"}\n [$__rate_interval]\n)) by (req, le) \n \n \n)) ", "format": "time_series", - "intervalFactor": 2, - "legendFormat": "block_cache", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "99%-{{req}}", "metric": "", - "refId": "E", - "step": 10 - }, - { - "expr": "sum(rate(tikv_engine_get_served{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", db=\"$db\", type=\"get_hit_l0\"}[1m]))", - "format": "time_series", - "intervalFactor": 2, - 
"legendFormat": "l0", - "refId": "A", - "step": 10 + "query": "histogram_quantile(0.99,(\n sum(rate(\n tikv_coprocessor_request_wait_seconds_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",type=\"all\"}\n [$__rate_interval]\n)) by (req, le) \n \n \n)) ", + "refId": "", + "step": 10, + "target": "" }, { - "expr": "sum(rate(tikv_engine_get_served{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", db=\"$db\", type=\"get_hit_l1\"}[1m]))", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "(sum(rate(\n tikv_coprocessor_request_wait_seconds_sum\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",type=\"all\"}\n [$__rate_interval]\n)) by (req) / sum(rate(\n tikv_coprocessor_request_wait_seconds_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",type=\"all\"}\n [$__rate_interval]\n)) by (req) )", "format": "time_series", - "intervalFactor": 2, - "legendFormat": "l1", - "refId": "C", - "step": 10 + "hide": true, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "avg-{{req}}", + "metric": "", + "query": "(sum(rate(\n tikv_coprocessor_request_wait_seconds_sum\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",type=\"all\"}\n [$__rate_interval]\n)) by (req) / sum(rate(\n tikv_coprocessor_request_wait_seconds_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",type=\"all\"}\n [$__rate_interval]\n)) by (req) )", + "refId": "", + "step": 10, + "target": "" }, { - "expr": "sum(rate(tikv_engine_get_served{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", db=\"$db\", type=\"get_hit_l2_and_up\"}[1m]))", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_coprocessor_request_wait_seconds_count\n 
{k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",type=\"all\"}\n [$__rate_interval]\n)) by (req) ", "format": "time_series", - "intervalFactor": 2, - "legendFormat": "l2_and_up", - "refId": "F", - "step": 10 + "hide": true, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "count-{{req}}", + "metric": "", + "query": "sum(rate(\n tikv_coprocessor_request_wait_seconds_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",type=\"all\"}\n [$__rate_interval]\n)) by (req) ", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, - "title": "Get operations", + "title": "Wait duration", "tooltip": { + "msResolution": true, "shared": true, "sort": 0, "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -27389,7 +36366,8 @@ }, "yaxes": [ { - "format": "ops", + "decimals": null, + "format": "s", "label": null, "logBase": 1, "max": null, @@ -27397,6 +36375,7 @@ "show": true }, { + "decimals": null, "format": "short", "label": null, "logBase": 1, @@ -27407,41 +36386,56 @@ ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } }, { "aliasColors": {}, "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", - "decimals": 1, - "description": "The time consumed when executing get operations", + "description": "The time consumed when coprocessor requests are wait for being handled in each TiKV instance", + "editable": true, + "error": false, "fieldConfig": { - "defaults": {}, - "overrides": [] + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } + }, + "fill": 1, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + 
"threshold2Color": "rgba(234, 112, 112, 0.22)" }, - "fill": 0, - "fillGradient": 0, "gridPos": { - "h": 8, + "h": 7, "w": 12, "x": 12, - "y": 59 + "y": 7 }, - "hiddenSeries": false, - "id": 82, + "height": null, + "hideTimeOverride": false, + "id": 267, + "interval": null, + "isNew": true, "legend": { "alignAsTable": true, "avg": false, "current": true, + "hideEmpty": true, + "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, - "sort": "current", + "sort": "max", "sortDesc": true, "total": false, "values": true @@ -27449,73 +36443,123 @@ "lines": true, "linewidth": 1, "links": [], - "nullPointMode": "null", + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", "options": { - "alertThreshold": true + "alertThreshold": true, + "dataLinks": [] }, "percentage": false, - "pluginVersion": "7.5.7", "pointradius": 5, "points": false, "renderer": "flot", - "scopedVars": { - "db": { - "selected": false, - "text": "kv", - "value": "kv" + "repeat": null, + "repeatDirection": null, + "seriesOverrides": [ + { + "alias": "count", + "bars": false, + "dashLength": 1, + "dashes": true, + "fill": 2, + "fillBelowTo": null, + "lines": true, + "spaceLength": 1, + "transform": "negative-Y", + "yaxis": 2, + "zindex": -3 + }, + { + "alias": "avg", + "bars": false, + "fill": 7, + "fillBelowTo": null, + "lines": true, + "yaxis": 1, + "zindex": 0 } - }, - "seriesOverrides": [], - "spaceLength": 10, + ], + "span": null, "stack": false, "steppedLine": false, "targets": [ { - "expr": "max(tikv_engine_get_micro_seconds{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", db=\"$db\",type=\"get_max\"})", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "histogram_quantile(0.9999,(\n sum(rate(\n tikv_coprocessor_request_wait_seconds_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",type=\"all\"}\n [$__rate_interval]\n)) by (req, 
instance, le) \n \n \n)) ", "format": "time_series", - "intervalFactor": 2, - "legendFormat": "max", - "refId": "A", - "step": 10 + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "99.99%-{{req}}-{{instance}}", + "metric": "", + "query": "histogram_quantile(0.9999,(\n sum(rate(\n tikv_coprocessor_request_wait_seconds_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",type=\"all\"}\n [$__rate_interval]\n)) by (req, instance, le) \n \n \n)) ", + "refId": "", + "step": 10, + "target": "" }, { - "expr": "avg(tikv_engine_get_micro_seconds{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", db=\"$db\",type=\"get_percentile99\"})", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "histogram_quantile(0.99,(\n sum(rate(\n tikv_coprocessor_request_wait_seconds_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",type=\"all\"}\n [$__rate_interval]\n)) by (req, instance, le) \n \n \n)) ", "format": "time_series", - "intervalFactor": 2, - "legendFormat": "99%", - "refId": "B", - "step": 10 + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "99%-{{req}}-{{instance}}", + "metric": "", + "query": "histogram_quantile(0.99,(\n sum(rate(\n tikv_coprocessor_request_wait_seconds_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",type=\"all\"}\n [$__rate_interval]\n)) by (req, instance, le) \n \n \n)) ", + "refId": "", + "step": 10, + "target": "" }, { - "expr": "avg(tikv_engine_get_micro_seconds{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", db=\"$db\",type=\"get_percentile95\"})", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "(sum(rate(\n tikv_coprocessor_request_wait_seconds_sum\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",type=\"all\"}\n 
[$__rate_interval]\n)) by (req, instance) / sum(rate(\n tikv_coprocessor_request_wait_seconds_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",type=\"all\"}\n [$__rate_interval]\n)) by (req, instance) )", "format": "time_series", - "intervalFactor": 2, - "legendFormat": "95%", - "refId": "C", - "step": 10 + "hide": true, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "avg-{{req}}-{{instance}}", + "metric": "", + "query": "(sum(rate(\n tikv_coprocessor_request_wait_seconds_sum\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",type=\"all\"}\n [$__rate_interval]\n)) by (req, instance) / sum(rate(\n tikv_coprocessor_request_wait_seconds_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",type=\"all\"}\n [$__rate_interval]\n)) by (req, instance) )", + "refId": "", + "step": 10, + "target": "" }, { - "expr": "avg(tikv_engine_get_micro_seconds{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", db=\"$db\",type=\"get_average\"})", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_coprocessor_request_wait_seconds_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",type=\"all\"}\n [$__rate_interval]\n)) by (req, instance) ", "format": "time_series", - "intervalFactor": 2, - "legendFormat": "avg", - "refId": "D", - "step": 10 + "hide": true, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "count-{{req}}-{{instance}}", + "metric": "", + "query": "sum(rate(\n tikv_coprocessor_request_wait_seconds_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",type=\"all\"}\n [$__rate_interval]\n)) by (req, instance) ", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, - "title": "Get duration", + 
"title": "Wait duration by store", "tooltip": { + "msResolution": true, "shared": true, "sort": 0, "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -27523,14 +36567,16 @@ }, "yaxes": [ { - "format": "µs", + "decimals": null, + "format": "s", "label": null, - "logBase": 2, + "logBase": 1, "max": null, "min": null, "show": true }, { + "decimals": null, "format": "short", "label": null, "logBase": 1, @@ -27541,41 +36587,56 @@ ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } }, { "aliasColors": {}, "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", - "decimals": 1, - "description": "The count of seek operations", + "description": null, + "editable": true, + "error": false, "fieldConfig": { - "defaults": {}, - "overrides": [] + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } }, "fill": 1, - "fillGradient": 0, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, "gridPos": { - "h": 8, + "h": 7, "w": 12, "x": 0, - "y": 67 + "y": 14 }, - "hiddenSeries": false, - "id": 129, + "height": null, + "hideTimeOverride": false, + "id": 268, + "interval": null, + "isNew": true, "legend": { "alignAsTable": true, "avg": false, "current": true, + "hideEmpty": true, + "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, - "sort": "current", + "sort": "max", "sortDesc": true, "total": false, "values": true @@ -27583,95 +36644,55 @@ "lines": true, "linewidth": 1, "links": [], - "nullPointMode": "null", + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", "options": { - "alertThreshold": true + "alertThreshold": true, + "dataLinks": [] }, 
"percentage": false, - "pluginVersion": "7.5.7", "pointradius": 5, "points": false, "renderer": "flot", - "scopedVars": { - "db": { - "selected": false, - "text": "kv", - "value": "kv" - } - }, + "repeat": null, + "repeatDirection": null, "seriesOverrides": [], - "spaceLength": 10, + "span": null, "stack": false, "steppedLine": false, "targets": [ { - "expr": "sum(rate(tikv_engine_locate{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", db=\"$db\", type=\"number_db_seek\"}[1m]))", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "seek", - "metric": "", - "refId": "A", - "step": 10 - }, - { - "expr": "sum(rate(tikv_engine_locate{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", db=\"$db\", type=\"number_db_seek_found\"}[1m]))", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "seek_found", - "metric": "", - "refId": "B", - "step": 10 - }, - { - "expr": "sum(rate(tikv_engine_locate{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", db=\"$db\", type=\"number_db_next\"}[1m]))", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "next", - "metric": "", - "refId": "C", - "step": 10 - }, - { - "expr": "sum(rate(tikv_engine_locate{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", db=\"$db\", type=\"number_db_next_found\"}[1m]))", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_coprocessor_dag_request_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (vec_type) ", "format": "time_series", - "intervalFactor": 2, - "legendFormat": "next_found", - "metric": "", - "refId": "D", - "step": 10 - }, - { - "expr": "sum(rate(tikv_engine_locate{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", db=\"$db\", type=\"number_db_prev\"}[1m]))", - "format": 
"time_series", - "intervalFactor": 2, - "legendFormat": "prev", - "metric": "", - "refId": "E", - "step": 10 - }, - { - "expr": "sum(rate(tikv_engine_locate{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", db=\"$db\", type=\"number_db_prev_found\"}[1m]))", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "prev_found", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{vec_type}}", "metric": "", - "refId": "F", - "step": 10 + "query": "sum(rate(\n tikv_coprocessor_dag_request_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (vec_type) ", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, - "title": "Seek operations", + "title": "Total DAG Requests", "tooltip": { + "msResolution": true, "shared": true, "sort": 0, "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -27679,7 +36700,8 @@ }, "yaxes": [ { - "format": "ops", + "decimals": null, + "format": "none", "label": null, "logBase": 1, "max": null, @@ -27687,6 +36709,7 @@ "show": true }, { + "decimals": null, "format": "short", "label": null, "logBase": 1, @@ -27697,41 +36720,56 @@ ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } }, { "aliasColors": {}, "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", - "decimals": 1, - "description": "The time consumed when executing seek operation", + "description": "The total number of DAG executors", + "editable": true, + "error": false, "fieldConfig": { - "defaults": {}, - "overrides": [] + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } + }, + "fill": 1, + "fillGradient": 1, + "grid": { + 
"threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" }, - "fill": 0, - "fillGradient": 0, "gridPos": { - "h": 8, + "h": 7, "w": 12, "x": 12, - "y": 67 + "y": 14 }, - "hiddenSeries": false, - "id": 125, + "height": null, + "hideTimeOverride": false, + "id": 269, + "interval": null, + "isNew": true, "legend": { "alignAsTable": true, "avg": false, "current": true, + "hideEmpty": true, + "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, - "sort": "current", + "sort": "max", "sortDesc": true, "total": false, "values": true @@ -27739,73 +36777,55 @@ "lines": true, "linewidth": 1, "links": [], - "nullPointMode": "null", + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", "options": { - "alertThreshold": true + "alertThreshold": true, + "dataLinks": [] }, "percentage": false, - "pluginVersion": "7.5.7", "pointradius": 5, "points": false, "renderer": "flot", - "scopedVars": { - "db": { - "selected": false, - "text": "kv", - "value": "kv" - } - }, + "repeat": null, + "repeatDirection": null, "seriesOverrides": [], - "spaceLength": 10, + "span": null, "stack": false, "steppedLine": false, "targets": [ { - "expr": "max(tikv_engine_seek_micro_seconds{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", db=\"$db\",type=\"seek_max\"})", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "max", - "refId": "A", - "step": 10 - }, - { - "expr": "avg(tikv_engine_seek_micro_seconds{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", db=\"$db\",type=\"seek_percentile99\"})", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "99%", - "refId": "B", - "step": 10 - }, - { - "expr": "avg(tikv_engine_seek_micro_seconds{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", 
db=\"$db\",type=\"seek_percentile95\"})", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "95%", - "refId": "C", - "step": 10 - }, - { - "expr": "avg(tikv_engine_seek_micro_seconds{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", db=\"$db\",type=\"seek_average\"})", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_coprocessor_executor_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (type) ", "format": "time_series", - "intervalFactor": 2, - "legendFormat": "avg", - "refId": "D", - "step": 10 + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{type}}", + "metric": "", + "query": "sum(rate(\n tikv_coprocessor_executor_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (type) ", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, - "title": "Seek duration", + "title": "Total DAG Executors", "tooltip": { + "msResolution": true, "shared": true, "sort": 0, "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -27813,14 +36833,16 @@ }, "yaxes": [ { - "format": "µs", + "decimals": null, + "format": "ops", "label": null, - "logBase": 2, + "logBase": 1, "max": null, "min": null, "show": true }, { + "decimals": null, "format": "short", "label": null, "logBase": 1, @@ -27831,41 +36853,56 @@ ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } }, { "aliasColors": {}, "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", - "decimals": 1, - "description": "The count of write operations", + "description": null, + "editable": true, + "error": false, 
"fieldConfig": { - "defaults": {}, - "overrides": [] + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } }, "fill": 1, - "fillGradient": 0, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, "gridPos": { - "h": 8, + "h": 7, "w": 12, "x": 0, - "y": 75 + "y": 21 }, - "hiddenSeries": false, - "id": 139, + "height": null, + "hideTimeOverride": false, + "id": 270, + "interval": null, + "isNew": true, "legend": { "alignAsTable": true, "avg": false, "current": true, + "hideEmpty": true, + "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, - "sort": "current", + "sort": "max", "sortDesc": true, "total": false, "values": true @@ -27873,65 +36910,55 @@ "lines": true, "linewidth": 1, "links": [], - "nullPointMode": "null", + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", "options": { - "alertThreshold": true + "alertThreshold": true, + "dataLinks": [] }, "percentage": false, - "pluginVersion": "7.5.7", "pointradius": 5, "points": false, "renderer": "flot", - "scopedVars": { - "db": { - "selected": false, - "text": "kv", - "value": "kv" - } - }, + "repeat": null, + "repeatDirection": null, "seriesOverrides": [], - "spaceLength": 10, + "span": null, "stack": false, "steppedLine": false, "targets": [ { - "expr": "sum(rate(tikv_engine_write_served{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", db=\"$db\", type=~\"write_done_by_self|write_done_by_other\"}[1m]))", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "done", - "refId": "A", - "step": 10 - }, - { - "expr": "sum(rate(tikv_engine_write_served{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", db=\"$db\", type=\"write_timeout\"}[1m]))", - "format": "time_series", - "intervalFactor": 
2, - "legendFormat": "timeout", - "refId": "B", - "step": 10 - }, - { - "expr": "sum(rate(tikv_engine_write_served{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", db=\"$db\", type=\"write_with_wal\"}[1m]))", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_coprocessor_scan_details\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",req=\"select\"}\n [$__rate_interval]\n)) by (tag) ", "format": "time_series", - "intervalFactor": 2, - "legendFormat": "with_wal", - "refId": "C", - "step": 10 + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{tag}}", + "metric": "", + "query": "sum(rate(\n tikv_coprocessor_scan_details\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",req=\"select\"}\n [$__rate_interval]\n)) by (tag) ", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, - "title": "Write operations", + "title": "Total Ops Details (Table Scan)", "tooltip": { + "msResolution": true, "shared": true, "sort": 0, "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -27939,6 +36966,7 @@ }, "yaxes": [ { + "decimals": null, "format": "ops", "label": null, "logBase": 1, @@ -27947,6 +36975,7 @@ "show": true }, { + "decimals": null, "format": "short", "label": null, "logBase": 1, @@ -27957,41 +36986,56 @@ ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } }, { "aliasColors": {}, "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", - "decimals": 1, - "description": "The time consumed when executing write operation", + "description": null, + "editable": true, + "error": false, "fieldConfig": { - "defaults": {}, - "overrides": [] + 
"defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } + }, + "fill": 1, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" }, - "fill": 0, - "fillGradient": 0, "gridPos": { - "h": 8, + "h": 7, "w": 12, "x": 12, - "y": 75 + "y": 21 }, - "hiddenSeries": false, - "id": 126, + "height": null, + "hideTimeOverride": false, + "id": 271, + "interval": null, + "isNew": true, "legend": { "alignAsTable": true, "avg": false, "current": true, + "hideEmpty": true, + "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, - "sort": "current", + "sort": "max", "sortDesc": true, "total": false, "values": true @@ -27999,73 +37043,55 @@ "lines": true, "linewidth": 1, "links": [], - "nullPointMode": "null", + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", "options": { - "alertThreshold": true + "alertThreshold": true, + "dataLinks": [] }, "percentage": false, - "pluginVersion": "7.5.7", "pointradius": 5, "points": false, "renderer": "flot", - "scopedVars": { - "db": { - "selected": false, - "text": "kv", - "value": "kv" - } - }, + "repeat": null, + "repeatDirection": null, "seriesOverrides": [], - "spaceLength": 10, + "span": null, "stack": false, "steppedLine": false, "targets": [ { - "expr": "max(tikv_engine_write_micro_seconds{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", db=\"$db\",type=\"write_max\"})", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "max", - "refId": "A", - "step": 10 - }, - { - "expr": "avg(tikv_engine_write_micro_seconds{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", db=\"$db\",type=\"write_percentile99\"})", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "99%", - "refId": "B", - "step": 10 - }, - { - 
"expr": "avg(tikv_engine_write_micro_seconds{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", db=\"$db\",type=\"write_percentile95\"})", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "95%", - "refId": "C", - "step": 10 - }, - { - "expr": "avg(tikv_engine_write_micro_seconds{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", db=\"$db\",type=\"write_average\"})", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_coprocessor_scan_details\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",req=\"index\"}\n [$__rate_interval]\n)) by (tag) ", "format": "time_series", - "intervalFactor": 2, - "legendFormat": "avg", - "refId": "D", - "step": 10 + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{tag}}", + "metric": "", + "query": "sum(rate(\n tikv_coprocessor_scan_details\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",req=\"index\"}\n [$__rate_interval]\n)) by (tag) ", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, - "title": "Write duration", + "title": "Total Ops Details (Index Scan)", "tooltip": { + "msResolution": true, "shared": true, "sort": 0, "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -28073,14 +37099,16 @@ }, "yaxes": [ { - "format": "µs", + "decimals": null, + "format": "ops", "label": null, - "logBase": 2, + "logBase": 1, "max": null, "min": null, "show": true }, { + "decimals": null, "format": "short", "label": null, "logBase": 1, @@ -28091,41 +37119,56 @@ ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } }, { "aliasColors": {}, "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, 
"datasource": "${DS_TEST-CLUSTER}", - "decimals": 1, - "description": " \tThe count of WAL sync operations", + "description": null, + "editable": true, + "error": false, "fieldConfig": { - "defaults": {}, - "overrides": [] + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } }, "fill": 1, - "fillGradient": 0, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, "gridPos": { - "h": 8, + "h": 7, "w": 12, "x": 0, - "y": 83 + "y": 28 }, - "hiddenSeries": false, - "id": 137, + "height": null, + "hideTimeOverride": false, + "id": 272, + "interval": null, + "isNew": true, "legend": { "alignAsTable": true, "avg": false, "current": true, + "hideEmpty": true, + "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, - "sort": "current", + "sort": "max", "sortDesc": true, "total": false, "values": true @@ -28133,50 +37176,55 @@ "lines": true, "linewidth": 1, "links": [], - "nullPointMode": "null", + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", "options": { - "alertThreshold": true + "alertThreshold": true, + "dataLinks": [] }, "percentage": false, - "pluginVersion": "7.5.7", "pointradius": 5, "points": false, "renderer": "flot", - "scopedVars": { - "db": { - "selected": false, - "text": "kv", - "value": "kv" - } - }, + "repeat": null, + "repeatDirection": null, "seriesOverrides": [], - "spaceLength": 10, + "span": null, "stack": false, "steppedLine": false, "targets": [ { - "expr": "sum(rate(tikv_engine_wal_file_synced{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", db=\"$db\"}[1m]))", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_coprocessor_scan_details\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",req=\"select\"}\n 
[$__rate_interval]\n)) by (cf, tag) ", "format": "time_series", - "intervalFactor": 2, - "legendFormat": "sync", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{cf}}-{{tag}}", "metric": "", - "refId": "A", - "step": 10 + "query": "sum(rate(\n tikv_coprocessor_scan_details\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",req=\"select\"}\n [$__rate_interval]\n)) by (cf, tag) ", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, - "title": "WAL sync operations", + "title": "Total Ops Details by CF (Table Scan)", "tooltip": { + "msResolution": true, "shared": true, "sort": 0, "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -28184,6 +37232,7 @@ }, "yaxes": [ { + "decimals": null, "format": "ops", "label": null, "logBase": 1, @@ -28192,6 +37241,7 @@ "show": true }, { + "decimals": null, "format": "short", "label": null, "logBase": 1, @@ -28202,41 +37252,56 @@ ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } }, { "aliasColors": {}, "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", - "decimals": 1, - "description": "The time consumed when executing write wal operation", + "description": null, + "editable": true, + "error": false, "fieldConfig": { - "defaults": {}, - "overrides": [] + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } + }, + "fill": 1, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" }, - "fill": 0, - "fillGradient": 0, "gridPos": { - "h": 8, + "h": 7, "w": 12, "x": 12, - "y": 83 + "y": 28 }, - "hiddenSeries": false, - "id": 130, + "height": 
null, + "hideTimeOverride": false, + "id": 273, + "interval": null, + "isNew": true, "legend": { "alignAsTable": true, "avg": false, "current": true, + "hideEmpty": true, + "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, - "sort": "current", + "sort": "max", "sortDesc": true, "total": false, "values": true @@ -28244,73 +37309,55 @@ "lines": true, "linewidth": 1, "links": [], - "nullPointMode": "null", + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", "options": { - "alertThreshold": true + "alertThreshold": true, + "dataLinks": [] }, "percentage": false, - "pluginVersion": "7.5.7", "pointradius": 5, "points": false, "renderer": "flot", - "scopedVars": { - "db": { - "selected": false, - "text": "kv", - "value": "kv" - } - }, + "repeat": null, + "repeatDirection": null, "seriesOverrides": [], - "spaceLength": 10, + "span": null, "stack": false, "steppedLine": false, "targets": [ { - "expr": "max(tikv_engine_write_wal_time_micro_seconds{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", db=\"$db\",type=\"write_wal_micros_max\"})", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_coprocessor_scan_details\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",req=\"index\"}\n [$__rate_interval]\n)) by (cf, tag) ", "format": "time_series", - "intervalFactor": 2, - "legendFormat": "max", - "refId": "A", - "step": 10 - }, - { - "expr": "avg(tikv_engine_write_wal_time_micro_seconds{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", db=\"$db\",type=\"write_wal_micros_percentile99\"})", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "99%", - "refId": "B", - "step": 10 - }, - { - "expr": "avg(tikv_engine_write_wal_time_micro_seconds{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", 
db=\"$db\",type=\"write_wal_micros_percentile95\"})", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "95%", - "refId": "C", - "step": 10 - }, - { - "expr": "avg(tikv_engine_write_wal_time_micro_seconds{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", db=\"$db\",type=\"write_wal_micros_average\"})", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "avg", - "refId": "D", - "step": 10 + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{cf}}-{{tag}}", + "metric": "", + "query": "sum(rate(\n tikv_coprocessor_scan_details\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",req=\"index\"}\n [$__rate_interval]\n)) by (cf, tag) ", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, - "title": "Write WAL duration", + "title": "Total Ops Details by CF (Index Scan)", "tooltip": { + "msResolution": true, "shared": true, "sort": 0, "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -28318,14 +37365,16 @@ }, "yaxes": [ { - "format": "µs", + "decimals": null, + "format": "opm", "label": null, - "logBase": 2, + "logBase": 1, "max": null, "min": null, "show": true }, { + "decimals": null, "format": "short", "label": null, "logBase": 1, @@ -28336,152 +37385,160 @@ ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } }, { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, + "cards": { + "cardPadding": null, + "cardRound": null + }, + "color": { + "cardColor": "#b4ff00", + "colorScale": "sqrt", + "colorScheme": "interpolateSpectral", + "exponent": 0.5, + "max": null, + "min": null, + "mode": "spectrum" + }, + "dataFormat": "tsbuckets", "datasource": "${DS_TEST-CLUSTER}", 
- "decimals": 1, - "description": "The count of compaction and flush operations", + "description": "The time consumed on checking memory locks for coprocessor requests", + "editable": true, + "error": false, "fieldConfig": { - "defaults": {}, - "overrides": [] + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } }, - "fill": 1, - "fillGradient": 0, "gridPos": { - "h": 8, + "h": 7, "w": 12, "x": 0, - "y": 91 + "y": 35 }, - "hiddenSeries": false, - "id": 128, + "heatmap": {}, + "height": null, + "hideTimeOverride": false, + "hideZeroBuckets": true, + "highlightCards": true, + "id": 274, + "interval": null, "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "max": true, - "min": false, - "rightSide": true, - "show": true, - "sideWidth": null, - "sort": "current", - "sortDesc": true, - "total": false, - "values": true + "show": false }, - "lines": true, - "linewidth": 1, "links": [], - "nullPointMode": "null", - "options": { - "alertThreshold": true - }, - "percentage": false, - "pluginVersion": "7.5.7", - "pointradius": 5, - "points": false, - "renderer": "flot", - "scopedVars": { - "db": { - "selected": false, - "text": "kv", - "value": "kv" - } - }, - "seriesOverrides": [], - "spaceLength": 10, - "stack": false, - "steppedLine": false, + "maxDataPoints": 512, + "maxPerRow": null, + "minSpan": null, + "options": {}, + "repeat": null, + "repeatDirection": null, + "reverseYBuckets": false, + "span": null, "targets": [ { - "expr": "sum(rate(tikv_engine_event_total{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", db=\"$db\"}[1m])) by (type)", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{type}}", - "metric": "tikv_engine_event_total", - "refId": "B", - "step": 10 + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_coprocessor_mem_lock_check_duration_seconds_bucket\n 
{k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (le) ", + "format": "heatmap", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{le}}", + "metric": "", + "query": "sum(rate(\n tikv_coprocessor_mem_lock_check_duration_seconds_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (le) ", + "refId": "", + "step": 10, + "target": "" } ], - "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, - "title": "Compaction operations", + "title": "Memory lock checking duration", "tooltip": { + "msResolution": true, "shared": true, + "showHistogram": true, "sort": 0, "value_type": "individual" }, - "type": "graph", - "xaxis": { - "buckets": null, + "transformations": [], + "transparent": false, + "type": "heatmap", + "xAxis": { "mode": "time", "name": null, "show": true, "values": [] }, - "yaxes": [ - { - "format": "ops", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } + "xBucketNumber": null, + "xBucketSize": null, + "yAxis": { + "decimals": 1, + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + "yBucketBound": "upper", + "yBucketNumber": null, + "yBucketSize": null }, { "aliasColors": {}, "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", - "decimals": 1, - "description": "The time consumed when executing WAL sync operation", + "description": "The time consumed on checking memory locks for coprocessor requests", + "editable": true, + "error": false, "fieldConfig": { - "defaults": {}, - "overrides": [] + "defaults": { + "thresholds": { + "mode": "absolute", + 
"steps": [] + } + } + }, + "fill": 1, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" }, - "fill": 0, - "fillGradient": 0, "gridPos": { - "h": 8, + "h": 7, "w": 12, "x": 12, - "y": 91 + "y": 35 }, - "hiddenSeries": false, - "id": 135, + "height": null, + "hideTimeOverride": false, + "id": 275, + "interval": null, + "isNew": true, "legend": { "alignAsTable": true, "avg": false, "current": true, + "hideEmpty": true, + "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, - "sort": "current", + "sort": "max", "sortDesc": true, "total": false, "values": true @@ -28489,74 +37546,123 @@ "lines": true, "linewidth": 1, "links": [], - "maxPerRow": 2, - "nullPointMode": "null", + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", "options": { - "alertThreshold": true + "alertThreshold": true, + "dataLinks": [] }, "percentage": false, - "pluginVersion": "7.5.7", "pointradius": 5, "points": false, "renderer": "flot", - "scopedVars": { - "db": { - "selected": false, - "text": "kv", - "value": "kv" + "repeat": null, + "repeatDirection": null, + "seriesOverrides": [ + { + "alias": "count", + "bars": false, + "dashLength": 1, + "dashes": true, + "fill": 2, + "fillBelowTo": null, + "lines": true, + "spaceLength": 1, + "transform": "negative-Y", + "yaxis": 2, + "zindex": -3 + }, + { + "alias": "avg", + "bars": false, + "fill": 7, + "fillBelowTo": null, + "lines": true, + "yaxis": 1, + "zindex": 0 } - }, - "seriesOverrides": [], - "spaceLength": 10, + ], + "span": null, "stack": false, "steppedLine": false, "targets": [ { - "expr": "max(tikv_engine_wal_file_sync_micro_seconds{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", db=\"$db\",type=\"wal_file_sync_max\"})", + "datasource": "${DS_TEST-CLUSTER}", + "expr": 
"histogram_quantile(0.9999,(\n sum(rate(\n tikv_coprocessor_mem_lock_check_duration_seconds_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (le) \n \n \n)) ", "format": "time_series", - "intervalFactor": 2, - "legendFormat": "max", - "refId": "A", - "step": 10 + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "99.99%", + "metric": "", + "query": "histogram_quantile(0.9999,(\n sum(rate(\n tikv_coprocessor_mem_lock_check_duration_seconds_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (le) \n \n \n)) ", + "refId": "", + "step": 10, + "target": "" }, { - "expr": "avg(tikv_engine_wal_file_sync_micro_seconds{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", db=\"$db\",type=\"wal_file_sync_percentile99\"})", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "histogram_quantile(0.99,(\n sum(rate(\n tikv_coprocessor_mem_lock_check_duration_seconds_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (le) \n \n \n)) ", "format": "time_series", - "intervalFactor": 2, + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, "legendFormat": "99%", - "refId": "B", - "step": 10 + "metric": "", + "query": "histogram_quantile(0.99,(\n sum(rate(\n tikv_coprocessor_mem_lock_check_duration_seconds_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (le) \n \n \n)) ", + "refId": "", + "step": 10, + "target": "" }, { - "expr": "avg(tikv_engine_wal_file_sync_micro_seconds{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", db=\"$db\",type=\"wal_file_sync_percentile95\"})", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "(sum(rate(\n 
tikv_coprocessor_mem_lock_check_duration_seconds_sum\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) / sum(rate(\n tikv_coprocessor_mem_lock_check_duration_seconds_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) )", "format": "time_series", - "intervalFactor": 2, - "legendFormat": "95%", - "refId": "C", - "step": 10 + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "avg", + "metric": "", + "query": "(sum(rate(\n tikv_coprocessor_mem_lock_check_duration_seconds_sum\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) / sum(rate(\n tikv_coprocessor_mem_lock_check_duration_seconds_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) )", + "refId": "", + "step": 10, + "target": "" }, { - "expr": "avg(tikv_engine_wal_file_sync_micro_seconds{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", db=\"$db\",type=\"wal_file_sync_average\"})", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_coprocessor_mem_lock_check_duration_seconds_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) ", "format": "time_series", - "intervalFactor": 2, - "legendFormat": "avg", - "refId": "D", - "step": 10 + "hide": true, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "count", + "metric": "", + "query": "sum(rate(\n tikv_coprocessor_mem_lock_check_duration_seconds_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) ", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, - "title": "WAL sync duration", + 
"title": "Memory lock checking duration", "tooltip": { + "msResolution": true, "shared": true, "sort": 0, "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -28564,14 +37670,16 @@ }, "yaxes": [ { - "format": "µs", + "decimals": null, + "format": "s", "label": null, - "logBase": 10, + "logBase": 1, "max": null, - "min": "0", + "min": null, "show": true }, { + "decimals": null, "format": "short", "label": null, "logBase": 1, @@ -28582,41 +37690,98 @@ ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } - }, + } + ], + "repeat": null, + "repeatDirection": null, + "span": null, + "targets": [], + "timeFrom": null, + "timeShift": null, + "title": "Coprocessor Detail", + "transformations": [], + "transparent": false, + "type": "row" + }, + { + "cacheTimeout": null, + "collapsed": true, + "datasource": null, + "description": null, + "editable": true, + "error": false, + "fieldConfig": { + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } + }, + "gridPos": { + "h": 7, + "w": 24, + "x": 0, + "y": 0 + }, + "height": null, + "hideTimeOverride": false, + "id": 276, + "interval": null, + "links": [], + "maxDataPoints": 100, + "maxPerRow": null, + "minSpan": null, + "panels": [ { "aliasColors": {}, "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", - "decimals": 1, - "description": "Compaction guard actions", + "description": null, + "editable": true, + "error": false, "fieldConfig": { - "defaults": {}, - "overrides": [] + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } }, "fill": 1, - "fillGradient": 0, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, "gridPos": { - "h": 8, + "h": 7, "w": 12, "x": 
0, - "y": 99 + "y": 0 }, - "hiddenSeries": false, - "id": 2453, + "height": null, + "hideTimeOverride": false, + "id": 277, + "interval": null, + "isNew": true, "legend": { "alignAsTable": true, "avg": false, "current": true, + "hideEmpty": true, + "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, - "sort": "current", + "sort": "max", "sortDesc": true, "total": false, "values": true @@ -28624,50 +37789,70 @@ "lines": true, "linewidth": 1, "links": [], - "nullPointMode": "null", + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", "options": { - "alertThreshold": true + "alertThreshold": true, + "dataLinks": [] }, "percentage": false, - "pluginVersion": "7.5.7", "pointradius": 5, "points": false, "renderer": "flot", - "scopedVars": { - "db": { - "selected": false, - "text": "kv", - "value": "kv" - } - }, + "repeat": null, + "repeatDirection": null, "seriesOverrides": [], - "spaceLength": 10, + "span": null, "stack": false, "steppedLine": false, "targets": [ { - "expr": "sum(rate(tikv_raftstore_compaction_guard_action_total{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", cf=~\"default|write\"}[1m])) by (cf, type)", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum((\n tikv_threads_state\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n \n)) by (instance, state) ", + "format": "time_series", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{instance}}-{{state}}", + "metric": "", + "query": "sum((\n tikv_threads_state\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n \n)) by (instance, state) ", + "refId": "", + "step": 10, + "target": "" + }, + { + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum((\n tikv_threads_state\n 
{k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n \n)) by (instance) ", "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{cf}}-{{type}}", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{instance}}-total", "metric": "", - "refId": "B", - "step": 10 + "query": "sum((\n tikv_threads_state\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n \n)) by (instance) ", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, - "title": "Compaction guard actions", + "title": "Threads state", "tooltip": { + "msResolution": true, "shared": true, "sort": 0, "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -28675,7 +37860,8 @@ }, "yaxes": [ { - "format": "ops", + "decimals": null, + "format": "none", "label": null, "logBase": 1, "max": null, @@ -28683,6 +37869,7 @@ "show": true }, { + "decimals": null, "format": "short", "label": null, "logBase": 1, @@ -28693,41 +37880,56 @@ ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } }, { "aliasColors": {}, "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", - "decimals": 1, - "description": "The time consumed when executing the compaction and flush operations", + "description": null, + "editable": true, + "error": false, "fieldConfig": { - "defaults": {}, - "overrides": [] + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } + }, + "fill": 1, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" }, - "fill": 0, - "fillGradient": 0, "gridPos": { - "h": 8, + "h": 7, "w": 12, "x": 12, - 
"y": 99 + "y": 0 }, - "hiddenSeries": false, - "id": 136, + "height": null, + "hideTimeOverride": false, + "id": 278, + "interval": null, + "isNew": true, "legend": { "alignAsTable": true, "avg": false, "current": true, + "hideEmpty": true, + "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, - "sort": "current", + "sort": "max", "sortDesc": true, "total": false, "values": true @@ -28735,74 +37937,55 @@ "lines": true, "linewidth": 1, "links": [], - "nullPointMode": "null", + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", "options": { - "alertThreshold": true + "alertThreshold": true, + "dataLinks": [] }, "percentage": false, - "pluginVersion": "7.5.7", "pointradius": 5, "points": false, "renderer": "flot", - "scopedVars": { - "db": { - "selected": false, - "text": "kv", - "value": "kv" - } - }, + "repeat": null, + "repeatDirection": null, "seriesOverrides": [], - "spaceLength": 10, + "span": null, "stack": false, "steppedLine": false, "targets": [ { - "expr": "max(tikv_engine_compaction_time{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", db=\"$db\",type=\"compaction_time_max\"})", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "topk(20,(\n sum(rate(\n tikv_threads_io_bytes_total\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (name, io) > 1024\n \n \n)) ", "format": "time_series", - "intervalFactor": 2, - "legendFormat": "max", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{name}}", "metric": "", - "refId": "A", - "step": 10 - }, - { - "expr": "avg(tikv_engine_compaction_time{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", db=\"$db\",type=\"compaction_time_percentile99\"})", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "99%", - "refId": "B", - "step": 
10 - }, - { - "expr": "avg(tikv_engine_compaction_time{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", db=\"$db\",type=\"compaction_time_percentile95\"})", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "95%", - "refId": "C", - "step": 10 - }, - { - "expr": "avg(tikv_engine_compaction_time{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", db=\"$db\",type=\"compaction_time_average\"})", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "avg", - "refId": "D", - "step": 10 + "query": "topk(20,(\n sum(rate(\n tikv_threads_io_bytes_total\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (name, io) > 1024\n \n \n)) ", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, - "title": "Compaction duration", + "title": "Threads IO", "tooltip": { + "msResolution": true, "shared": true, "sort": 0, "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -28810,14 +37993,16 @@ }, "yaxes": [ { - "format": "µs", + "decimals": null, + "format": "binBps", "label": null, - "logBase": 2, + "logBase": 1, "max": null, "min": null, "show": true }, { + "decimals": null, "format": "short", "label": null, "logBase": 1, @@ -28828,41 +38013,56 @@ ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } }, { "aliasColors": {}, "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", - "decimals": 1, - "description": "The time consumed when reading SST files", + "description": null, + "editable": true, + "error": false, "fieldConfig": { - "defaults": {}, - "overrides": [] + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } }, "fill": 1, 
- "fillGradient": 0, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, "gridPos": { - "h": 8, + "h": 7, "w": 12, "x": 0, - "y": 107 + "y": 7 }, - "hiddenSeries": false, - "id": 140, + "height": null, + "hideTimeOverride": false, + "id": 279, + "interval": null, + "isNew": true, "legend": { "alignAsTable": true, "avg": false, "current": true, + "hideEmpty": true, + "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, - "sort": "current", + "sort": "max", "sortDesc": true, "total": false, "values": true @@ -28870,77 +38070,55 @@ "lines": true, "linewidth": 1, "links": [], - "nullPointMode": "null", + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", "options": { - "alertThreshold": true + "alertThreshold": true, + "dataLinks": [] }, "percentage": false, - "pluginVersion": "7.5.7", "pointradius": 5, "points": false, "renderer": "flot", - "scopedVars": { - "db": { - "selected": false, - "text": "kv", - "value": "kv" - } - }, + "repeat": null, + "repeatDirection": null, "seriesOverrides": [], - "spaceLength": 10, + "span": null, "stack": false, "steppedLine": false, "targets": [ { - "expr": "max(tikv_engine_sst_read_micros{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", db=\"$db\", type=\"sst_read_micros_max\"})", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "max", - "metric": "", - "refId": "A", - "step": 10 - }, - { - "expr": "avg(tikv_engine_sst_read_micros{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", db=\"$db\", type=\"sst_read_micros_percentile99\"})", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "99%", - "metric": "", - "refId": "B", - "step": 10 - }, - { - "expr": 
"avg(tikv_engine_sst_read_micros{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", db=\"$db\", type=\"sst_read_micros_percentile95\"})", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "95%", - "metric": "", - "refId": "C", - "step": 10 - }, - { - "expr": "avg(tikv_engine_sst_read_micros{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", db=\"$db\", type=\"sst_read_micros_average\"})", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "topk(20,(\n max(rate(\n tikv_thread_voluntary_context_switches\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (name) > 100\n \n \n)) ", "format": "time_series", - "intervalFactor": 2, - "legendFormat": "avg", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{name}}", "metric": "", - "refId": "D", - "step": 10 + "query": "topk(20,(\n max(rate(\n tikv_thread_voluntary_context_switches\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (name) > 100\n \n \n)) ", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, - "title": "SST read duration", + "title": "Thread Voluntary Context Switches", "tooltip": { + "msResolution": true, "shared": true, "sort": 0, "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -28948,14 +38126,16 @@ }, "yaxes": [ { - "format": "µs", + "decimals": null, + "format": "none", "label": null, - "logBase": 10, + "logBase": 1, "max": null, "min": null, "show": true }, { + "decimals": null, "format": "short", "label": null, "logBase": 1, @@ -28966,40 +38146,56 @@ ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } }, { "aliasColors": {}, 
"bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", - "decimals": 1, + "description": null, + "editable": true, + "error": false, "fieldConfig": { - "defaults": {}, - "overrides": [] + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } }, "fill": 1, - "fillGradient": 0, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, "gridPos": { - "h": 8, + "h": 7, "w": 12, "x": 12, - "y": 107 + "y": 7 }, - "hiddenSeries": false, - "id": 2451, + "height": null, + "hideTimeOverride": false, + "id": 280, + "interval": null, + "isNew": true, "legend": { "alignAsTable": true, "avg": false, "current": true, + "hideEmpty": true, + "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, - "sort": "current", + "sort": "max", "sortDesc": true, "total": false, "values": true @@ -29007,51 +38203,55 @@ "lines": true, "linewidth": 1, "links": [], - "nullPointMode": "null", + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", "options": { - "alertThreshold": true + "alertThreshold": true, + "dataLinks": [] }, "percentage": false, - "pluginVersion": "7.5.7", "pointradius": 5, "points": false, "renderer": "flot", - "scopedVars": { - "db": { - "selected": false, - "text": "kv", - "value": "kv" - } - }, + "repeat": null, + "repeatDirection": null, "seriesOverrides": [], - "spaceLength": 10, + "span": null, "stack": false, "steppedLine": false, "targets": [ { - "expr": "sum(rate(tikv_engine_compaction_reason{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", db=\"$db\"}[1m])) by (cf, reason)", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "topk(20,(\n max(rate(\n tikv_thread_nonvoluntary_context_switches\n 
{k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (name) > 100\n \n \n)) ", "format": "time_series", "hide": false, - "intervalFactor": 2, - "legendFormat": "{{cf}} - {{reason}}", + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{name}}", "metric": "", - "refId": "A", - "step": 10 + "query": "topk(20,(\n max(rate(\n tikv_thread_nonvoluntary_context_switches\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (name) > 100\n \n \n)) ", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, - "title": "Compaction reason", + "title": "Thread Nonvoluntary Context Switches", "tooltip": { + "msResolution": true, "shared": true, "sort": 0, "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -29059,59 +38259,118 @@ }, "yaxes": [ { - "format": "short", + "decimals": null, + "format": "none", "label": null, "logBase": 1, "max": null, - "min": "0", + "min": null, "show": true }, { + "decimals": null, "format": "short", "label": null, "logBase": 1, "max": null, - "min": "0", + "min": null, "show": true } ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } - }, + } + ], + "repeat": null, + "repeatDirection": null, + "span": null, + "targets": [], + "timeFrom": null, + "timeShift": null, + "title": "Threads", + "transformations": [], + "transparent": false, + "type": "row" + }, + { + "cacheTimeout": null, + "collapsed": true, + "datasource": null, + "description": null, + "editable": true, + "error": false, + "fieldConfig": { + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } + }, + "gridPos": { + "h": 7, + "w": 24, + "x": 0, + "y": 0 + }, + "height": null, + "hideTimeOverride": false, + 
"id": 281, + "interval": null, + "links": [], + "maxDataPoints": 100, + "maxPerRow": null, + "minSpan": null, + "panels": [ { "aliasColors": {}, "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", - "decimals": 1, - "description": "The block cache size. Broken down by column family if shared block cache is disabled.", + "description": "The count of get operations", + "editable": true, + "error": false, "fieldConfig": { - "defaults": {}, - "overrides": [] + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } }, "fill": 1, - "fillGradient": 0, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, "gridPos": { - "h": 8, + "h": 7, "w": 12, "x": 0, - "y": 115 + "y": 0 }, - "hiddenSeries": false, - "id": 102, + "height": null, + "hideTimeOverride": false, + "id": 282, + "interval": null, + "isNew": true, "legend": { "alignAsTable": true, "avg": false, "current": true, + "hideEmpty": true, + "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, - "sort": "current", + "sort": "max", "sortDesc": true, "total": false, "values": true @@ -29119,64 +38378,132 @@ "lines": true, "linewidth": 1, "links": [], - "nullPointMode": "null", + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", "options": { - "alertThreshold": true + "alertThreshold": true, + "dataLinks": [] }, "percentage": false, - "pluginVersion": "7.5.7", "pointradius": 5, "points": false, "renderer": "flot", - "scopedVars": { - "db": { - "selected": false, - "text": "kv", - "value": "kv" - } - }, + "repeat": null, + "repeatDirection": null, "seriesOverrides": [], - "spaceLength": 10, + "span": null, "stack": false, "steppedLine": false, "targets": [ { - "expr": "topk(20, 
avg(tikv_engine_block_cache_size_bytes{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", db=\"$db\"}) by(cf, instance))", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_engine_memtable_efficiency\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$db\",type=\"memtable_hit\"}\n [$__rate_interval]\n)) ", "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{instance}}-{{cf}}", - "refId": "A", - "step": 10 - } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "Block cache size", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "memtable", + "metric": "", + "query": "sum(rate(\n tikv_engine_memtable_efficiency\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$db\",type=\"memtable_hit\"}\n [$__rate_interval]\n)) ", + "refId": "", + "step": 10, + "target": "" + }, { - "format": "bytes", - "label": null, - "logBase": 1, - "max": null, - "min": null, + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_engine_cache_efficiency\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$db\",type=~\"block_cache_data_hit|block_cache_filter_hit\"}\n [$__rate_interval]\n)) ", + "format": "time_series", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "block_cache", + "metric": "", + "query": "sum(rate(\n tikv_engine_cache_efficiency\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$db\",type=~\"block_cache_data_hit|block_cache_filter_hit\"}\n [$__rate_interval]\n)) ", + "refId": "", + 
"step": 10, + "target": "" + }, + { + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_engine_get_served\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$db\",type=\"get_hit_l0\"}\n [$__rate_interval]\n)) ", + "format": "time_series", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "l0", + "metric": "", + "query": "sum(rate(\n tikv_engine_get_served\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$db\",type=\"get_hit_l0\"}\n [$__rate_interval]\n)) ", + "refId": "", + "step": 10, + "target": "" + }, + { + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_engine_get_served\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$db\",type=\"get_hit_l1\"}\n [$__rate_interval]\n)) ", + "format": "time_series", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "l1", + "metric": "", + "query": "sum(rate(\n tikv_engine_get_served\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$db\",type=\"get_hit_l1\"}\n [$__rate_interval]\n)) ", + "refId": "", + "step": 10, + "target": "" + }, + { + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_engine_get_served\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$db\",type=\"get_hit_l2_and_up\"}\n [$__rate_interval]\n)) ", + "format": "time_series", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "l2_and_up", + "metric": "", + "query": "sum(rate(\n tikv_engine_get_served\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$db\",type=\"get_hit_l2_and_up\"}\n [$__rate_interval]\n)) ", + "refId": "", + "step": 10, + "target": "" + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": 
"Get operations", + "tooltip": { + "msResolution": true, + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "transformations": [], + "transparent": false, + "type": "graph", + "xaxis": { + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "decimals": null, + "format": "ops", + "label": null, + "logBase": 1, + "max": null, + "min": null, "show": true }, { + "decimals": null, "format": "short", "label": null, "logBase": 1, @@ -29187,41 +38514,56 @@ ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } }, { "aliasColors": {}, "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", - "decimals": 1, - "description": "The hit rate of memtable", + "description": "The time consumed when executing get operations", + "editable": true, + "error": false, "fieldConfig": { - "defaults": {}, - "overrides": [] + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } + }, + "fill": 1, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" }, - "fill": 0, - "fillGradient": 0, "gridPos": { - "h": 8, + "h": 7, "w": 12, "x": 12, - "y": 115 + "y": 0 }, - "hiddenSeries": false, - "id": 88, + "height": null, + "hideTimeOverride": false, + "id": 283, + "interval": null, + "isNew": true, "legend": { "alignAsTable": true, "avg": false, "current": true, + "hideEmpty": true, + "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, - "sort": "current", + "sort": "max", "sortDesc": true, "total": false, "values": true @@ -29229,49 +38571,100 @@ "lines": true, "linewidth": 1, "links": [], - "nullPointMode": "connected", + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", "options": { - "alertThreshold": true + "alertThreshold": true, + 
"dataLinks": [] }, "percentage": false, - "pluginVersion": "7.5.7", "pointradius": 5, "points": false, "renderer": "flot", - "scopedVars": { - "db": { - "selected": false, - "text": "kv", - "value": "kv" - } - }, + "repeat": null, + "repeatDirection": null, "seriesOverrides": [], - "spaceLength": 10, + "span": null, "stack": false, "steppedLine": false, "targets": [ { - "expr": "sum(rate(tikv_engine_memtable_efficiency{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", db=\"$db\", type=\"memtable_hit\"}[1m])) / (sum(rate(tikv_engine_memtable_efficiency{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", db=\"$db\", type=\"memtable_hit\"}[1m])) + sum(rate(tikv_engine_memtable_efficiency{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", db=\"$db\", type=\"memtable_miss\"}[1m])))", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "max((\n tikv_engine_get_micro_seconds\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$db\",type=\"get_max\"}\n \n)) ", "format": "time_series", - "intervalFactor": 2, - "legendFormat": "hit", - "refId": "A", - "step": 10 + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "max", + "metric": "", + "query": "max((\n tikv_engine_get_micro_seconds\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$db\",type=\"get_max\"}\n \n)) ", + "refId": "", + "step": 10, + "target": "" + }, + { + "datasource": "${DS_TEST-CLUSTER}", + "expr": "avg((\n tikv_engine_get_micro_seconds\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$db\",type=\"get_percentile99\"}\n \n)) ", + "format": "time_series", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "99%", + "metric": "", + "query": "avg((\n tikv_engine_get_micro_seconds\n 
{k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$db\",type=\"get_percentile99\"}\n \n)) ", + "refId": "", + "step": 10, + "target": "" + }, + { + "datasource": "${DS_TEST-CLUSTER}", + "expr": "avg((\n tikv_engine_get_micro_seconds\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$db\",type=\"get_percentile95\"}\n \n)) ", + "format": "time_series", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "95%", + "metric": "", + "query": "avg((\n tikv_engine_get_micro_seconds\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$db\",type=\"get_percentile95\"}\n \n)) ", + "refId": "", + "step": 10, + "target": "" + }, + { + "datasource": "${DS_TEST-CLUSTER}", + "expr": "avg((\n tikv_engine_get_micro_seconds\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$db\",type=\"get_average\"}\n \n)) ", + "format": "time_series", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "avg", + "metric": "", + "query": "avg((\n tikv_engine_get_micro_seconds\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$db\",type=\"get_average\"}\n \n)) ", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, - "title": "Memtable hit", + "title": "Get duration", "tooltip": { + "msResolution": true, "shared": true, "sort": 0, "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -29279,60 +38672,76 @@ }, "yaxes": [ { - "format": "percentunit", + "decimals": null, + "format": "\u00b5s", "label": null, - "logBase": 1, + "logBase": 2, "max": null, - "min": "0", + "min": null, "show": true }, { - "format": "ops", + "decimals": 
null, + "format": "short", "label": null, "logBase": 1, "max": null, "min": null, - "show": false + "show": true } ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } }, { "aliasColors": {}, "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", - "decimals": 1, - "description": "The flow of different kinds of block cache operations", + "description": "The count of seek operations", + "editable": true, + "error": false, "fieldConfig": { - "defaults": {}, - "overrides": [] + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } + }, + "fill": 1, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" }, - "fill": 0, - "fillGradient": 0, "gridPos": { - "h": 8, + "h": 7, "w": 12, "x": 0, - "y": 123 + "y": 7 }, - "height": "", - "hiddenSeries": false, - "id": 467, + "height": null, + "hideTimeOverride": false, + "id": 284, + "interval": null, + "isNew": true, "legend": { "alignAsTable": true, "avg": false, "current": true, + "hideEmpty": true, + "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, - "sort": "current", + "sort": "max", "sortDesc": true, "total": false, "values": true @@ -29340,116 +38749,130 @@ "lines": true, "linewidth": 1, "links": [], - "nullPointMode": "null", + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", "options": { - "alertThreshold": true + "alertThreshold": true, + "dataLinks": [] }, "percentage": false, - "pluginVersion": "7.5.7", "pointradius": 5, "points": false, "renderer": "flot", - "scopedVars": { - "db": { - "selected": false, - "text": "kv", - "value": "kv" - } - }, + "repeat": null, + "repeatDirection": null, "seriesOverrides": [], - "spaceLength": 10, + "span": null, "stack": false, "steppedLine": false, "targets": [ { - 
"expr": "sum(rate(tikv_engine_flow_bytes{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", db=\"$db\", type=\"block_cache_byte_read\"}[1m]))", - "format": "time_series", - "hide": false, - "interval": "", - "intervalFactor": 2, - "legendFormat": "total_read", - "refId": "A", - "step": 10 - }, - { - "expr": "sum(rate(tikv_engine_flow_bytes{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", db=\"$db\", type=\"block_cache_byte_write\"}[1m]))", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_engine_locate\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$db\",type=\"number_db_seek\"}\n [$__rate_interval]\n)) ", "format": "time_series", "hide": false, + "instant": false, "interval": "", - "intervalFactor": 2, - "legendFormat": "total_written", - "refId": "C", - "step": 10 + "intervalFactor": 1, + "legendFormat": "seek", + "metric": "", + "query": "sum(rate(\n tikv_engine_locate\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$db\",type=\"number_db_seek\"}\n [$__rate_interval]\n)) ", + "refId": "", + "step": 10, + "target": "" }, { - "expr": "sum(rate(tikv_engine_cache_efficiency{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", db=\"$db\", type=\"block_cache_data_bytes_insert\"}[1m]))", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_engine_locate\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$db\",type=\"number_db_seek_found\"}\n [$__rate_interval]\n)) ", "format": "time_series", "hide": false, + "instant": false, "interval": "", - "intervalFactor": 2, - "legendFormat": "data_insert", + "intervalFactor": 1, + "legendFormat": "seek_found", "metric": "", - "refId": "D", - "step": 10 + "query": "sum(rate(\n tikv_engine_locate\n 
{k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$db\",type=\"number_db_seek_found\"}\n [$__rate_interval]\n)) ", + "refId": "", + "step": 10, + "target": "" }, { - "expr": "sum(rate(tikv_engine_cache_efficiency{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", db=\"$db\", type=\"block_cache_filter_bytes_insert\"}[1m]))", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_engine_locate\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$db\",type=\"number_db_next\"}\n [$__rate_interval]\n)) ", "format": "time_series", "hide": false, + "instant": false, "interval": "", - "intervalFactor": 2, - "legendFormat": "filter_insert", + "intervalFactor": 1, + "legendFormat": "next", "metric": "", - "refId": "B", - "step": 10 + "query": "sum(rate(\n tikv_engine_locate\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$db\",type=\"number_db_next\"}\n [$__rate_interval]\n)) ", + "refId": "", + "step": 10, + "target": "" }, { - "expr": "sum(rate(tikv_engine_cache_efficiency{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", db=\"$db\", type=\"block_cache_filter_bytes_evict\"}[1m]))", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_engine_locate\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$db\",type=\"number_db_next_found\"}\n [$__rate_interval]\n)) ", "format": "time_series", "hide": false, + "instant": false, "interval": "", - "intervalFactor": 2, - "legendFormat": "filter_evict", + "intervalFactor": 1, + "legendFormat": "next_found", "metric": "", - "refId": "E", - "step": 10 + "query": "sum(rate(\n tikv_engine_locate\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$db\",type=\"number_db_next_found\"}\n [$__rate_interval]\n)) ", + "refId": "", + "step": 10, + 
"target": "" }, { - "expr": "sum(rate(tikv_engine_cache_efficiency{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", db=\"$db\", type=\"block_cache_index_bytes_insert\"}[1m]))", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_engine_locate\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$db\",type=\"number_db_prev\"}\n [$__rate_interval]\n)) ", "format": "time_series", "hide": false, + "instant": false, "interval": "", - "intervalFactor": 2, - "legendFormat": "index_insert", + "intervalFactor": 1, + "legendFormat": "prev", "metric": "", - "refId": "F", - "step": 10 + "query": "sum(rate(\n tikv_engine_locate\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$db\",type=\"number_db_prev\"}\n [$__rate_interval]\n)) ", + "refId": "", + "step": 10, + "target": "" }, { - "expr": "sum(rate(tikv_engine_cache_efficiency{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", db=\"$db\", type=\"block_cache_index_bytes_evict\"}[1m]))", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_engine_locate\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$db\",type=\"number_db_prev_found\"}\n [$__rate_interval]\n)) ", "format": "time_series", "hide": false, + "instant": false, "interval": "", - "intervalFactor": 2, - "legendFormat": "index_evict", + "intervalFactor": 1, + "legendFormat": "prev_found", "metric": "", - "refId": "G", - "step": 10 + "query": "sum(rate(\n tikv_engine_locate\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$db\",type=\"number_db_prev_found\"}\n [$__rate_interval]\n)) ", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, - "title": "Block cache flow", + "title": "Seek operations", "tooltip": { + "msResolution": true, 
"shared": true, "sort": 0, "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -29457,59 +38880,76 @@ }, "yaxes": [ { - "format": "Bps", + "decimals": null, + "format": "ops", "label": null, - "logBase": 10, + "logBase": 1, "max": null, - "min": "0", + "min": null, "show": true }, { - "format": "none", + "decimals": null, + "format": "short", "label": null, "logBase": 1, "max": null, - "min": "0", + "min": null, "show": true } ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } }, { "aliasColors": {}, "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", - "decimals": 1, - "description": "The hit rate of block cache", + "description": "The time consumed when executing seek operation", + "editable": true, + "error": false, "fieldConfig": { - "defaults": {}, - "overrides": [] + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } + }, + "fill": 1, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" }, - "fill": 0, - "fillGradient": 0, "gridPos": { - "h": 8, + "h": 7, "w": 12, "x": 12, - "y": 123 + "y": 7 }, - "hiddenSeries": false, - "id": 80, + "height": null, + "hideTimeOverride": false, + "id": 285, + "interval": null, + "isNew": true, "legend": { "alignAsTable": true, "avg": false, "current": true, + "hideEmpty": true, + "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, - "sort": "current", + "sort": "max", "sortDesc": true, "total": false, "values": true @@ -29517,87 +38957,100 @@ "lines": true, "linewidth": 1, "links": [], - "maxPerRow": 2, - "nullPointMode": "connected", + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", 
"options": { - "alertThreshold": true + "alertThreshold": true, + "dataLinks": [] }, "percentage": false, - "pluginVersion": "7.5.7", "pointradius": 5, "points": false, "renderer": "flot", - "scopedVars": { - "db": { - "selected": false, - "text": "kv", - "value": "kv" - } - }, + "repeat": null, + "repeatDirection": null, "seriesOverrides": [], - "spaceLength": 10, + "span": null, "stack": false, "steppedLine": false, "targets": [ { - "expr": "sum(rate(tikv_engine_cache_efficiency{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", db=\"$db\", type=\"block_cache_hit\"}[1m])) / (sum(rate(tikv_engine_cache_efficiency{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", db=\"$db\", type=\"block_cache_hit\"}[1m])) + sum(rate(tikv_engine_cache_efficiency{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", db=\"$db\", type=\"block_cache_miss\"}[1m])))", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "max((\n tikv_engine_seek_micro_seconds\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$db\",type=\"seek_max\"}\n \n)) ", "format": "time_series", - "intervalFactor": 2, - "legendFormat": "all", - "metric": "", - "refId": "A", - "step": 10 - }, - { - "expr": "sum(rate(tikv_engine_cache_efficiency{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", db=\"$db\", type=\"block_cache_data_hit\"}[1m])) / (sum(rate(tikv_engine_cache_efficiency{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", db=\"$db\", type=\"block_cache_data_hit\"}[1m])) + sum(rate(tikv_engine_cache_efficiency{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", db=\"$db\", type=\"block_cache_data_miss\"}[1m])))", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "data", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 
1, + "legendFormat": "max", "metric": "", - "refId": "D", - "step": 10 + "query": "max((\n tikv_engine_seek_micro_seconds\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$db\",type=\"seek_max\"}\n \n)) ", + "refId": "", + "step": 10, + "target": "" }, { - "expr": "sum(rate(tikv_engine_cache_efficiency{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", db=\"$db\", type=\"block_cache_filter_hit\"}[1m])) / (sum(rate(tikv_engine_cache_efficiency{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", db=\"$db\", type=\"block_cache_filter_hit\"}[1m])) + sum(rate(tikv_engine_cache_efficiency{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", db=\"$db\", type=\"block_cache_filter_miss\"}[1m])))", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "avg((\n tikv_engine_seek_micro_seconds\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$db\",type=\"seek_percentile99\"}\n \n)) ", "format": "time_series", - "intervalFactor": 2, - "legendFormat": "filter", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "99%", "metric": "", - "refId": "B", - "step": 10 + "query": "avg((\n tikv_engine_seek_micro_seconds\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$db\",type=\"seek_percentile99\"}\n \n)) ", + "refId": "", + "step": 10, + "target": "" }, { - "expr": "sum(rate(tikv_engine_cache_efficiency{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", db=\"$db\", type=\"block_cache_index_hit\"}[1m])) / (sum(rate(tikv_engine_cache_efficiency{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", db=\"$db\", type=\"block_cache_index_hit\"}[1m])) + sum(rate(tikv_engine_cache_efficiency{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", 
instance=~\"$instance\", db=\"$db\", type=\"block_cache_index_miss\"}[1m])))", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "avg((\n tikv_engine_seek_micro_seconds\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$db\",type=\"seek_percentile95\"}\n \n)) ", "format": "time_series", - "intervalFactor": 2, - "legendFormat": "index", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "95%", "metric": "", - "refId": "C", - "step": 10 + "query": "avg((\n tikv_engine_seek_micro_seconds\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$db\",type=\"seek_percentile95\"}\n \n)) ", + "refId": "", + "step": 10, + "target": "" }, { - "expr": "sum(rate(tikv_engine_bloom_efficiency{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", db=\"$db\", type=\"bloom_prefix_useful\"}[1m])) / sum(rate(tikv_engine_bloom_efficiency{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", db=\"$db\", type=\"bloom_prefix_checked\"}[1m]))", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "avg((\n tikv_engine_seek_micro_seconds\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$db\",type=\"seek_average\"}\n \n)) ", "format": "time_series", - "intervalFactor": 2, - "legendFormat": "bloom prefix", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "avg", "metric": "", - "refId": "E", - "step": 10 + "query": "avg((\n tikv_engine_seek_micro_seconds\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$db\",type=\"seek_average\"}\n \n)) ", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, - "title": "Block cache hit", + "title": "Seek duration", "tooltip": { + "msResolution": true, "shared": true, "sort": 0, 
"value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -29605,60 +39058,76 @@ }, "yaxes": [ { - "format": "percentunit", + "decimals": null, + "format": "\u00b5s", "label": null, - "logBase": 1, + "logBase": 2, "max": null, - "min": "0", + "min": null, "show": true }, { - "format": "ops", + "decimals": null, + "format": "short", "label": null, "logBase": 1, "max": null, "min": null, - "show": false + "show": true } ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } }, { "aliasColors": {}, "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", - "decimals": 1, - "description": "The flow of different kinds of operations on keys", + "description": "The count of write operations", + "editable": true, + "error": false, "fieldConfig": { - "defaults": {}, - "overrides": [] + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } + }, + "fill": 1, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" }, - "fill": 0, - "fillGradient": 0, "gridPos": { - "h": 8, + "h": 7, "w": 12, "x": 0, - "y": 131 + "y": 14 }, - "height": "", - "hiddenSeries": false, - "id": 132, + "height": null, + "hideTimeOverride": false, + "id": 286, + "interval": null, + "isNew": true, "legend": { "alignAsTable": true, "avg": false, "current": true, + "hideEmpty": true, + "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, - "sort": "current", + "sort": "max", "sortDesc": true, "total": false, "values": true @@ -29666,72 +39135,85 @@ "lines": true, "linewidth": 1, "links": [], - "nullPointMode": "null", + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", "options": { - 
"alertThreshold": true + "alertThreshold": true, + "dataLinks": [] }, "percentage": false, - "pluginVersion": "7.5.7", "pointradius": 5, "points": false, "renderer": "flot", - "scopedVars": { - "db": { - "selected": false, - "text": "kv", - "value": "kv" - } - }, + "repeat": null, + "repeatDirection": null, "seriesOverrides": [], - "spaceLength": 10, + "span": null, "stack": false, "steppedLine": false, "targets": [ { - "expr": "sum(rate(tikv_engine_flow_bytes{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", db=\"$db\", type=\"keys_read\"}[1m]))", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_engine_write_served\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$db\",type=~\"write_done_by_self|write_done_by_other\"}\n [$__rate_interval]\n)) ", "format": "time_series", "hide": false, + "instant": false, "interval": "", - "intervalFactor": 2, - "legendFormat": "read", - "refId": "B", - "step": 10 + "intervalFactor": 1, + "legendFormat": "done", + "metric": "", + "query": "sum(rate(\n tikv_engine_write_served\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$db\",type=~\"write_done_by_self|write_done_by_other\"}\n [$__rate_interval]\n)) ", + "refId": "", + "step": 10, + "target": "" }, { - "expr": "sum(rate(tikv_engine_flow_bytes{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", db=\"$db\", type=\"keys_written\"}[1m]))", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_engine_write_served\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$db\",type=\"write_timeout\"}\n [$__rate_interval]\n)) ", "format": "time_series", "hide": false, + "instant": false, "interval": "", - "intervalFactor": 2, - "legendFormat": "written", - "refId": "C", - "step": 10 + "intervalFactor": 1, + "legendFormat": "timeout", + "metric": "", + "query": 
"sum(rate(\n tikv_engine_write_served\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$db\",type=\"write_timeout\"}\n [$__rate_interval]\n)) ", + "refId": "", + "step": 10, + "target": "" }, { - "expr": "sum(rate(tikv_engine_compaction_num_corrupt_keys{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", db=\"$db\"}[1m]))", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_engine_write_served\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$db\",type=\"write_with_wal\"}\n [$__rate_interval]\n)) ", "format": "time_series", "hide": false, + "instant": false, "interval": "", - "intervalFactor": 2, - "legendFormat": "corrupt", + "intervalFactor": 1, + "legendFormat": "with_wal", "metric": "", - "refId": "A", - "step": 10 + "query": "sum(rate(\n tikv_engine_write_served\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$db\",type=\"write_with_wal\"}\n [$__rate_interval]\n)) ", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, - "title": "Keys flow", + "title": "Write operations", "tooltip": { + "msResolution": true, "shared": true, "sort": 0, "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -29739,59 +39221,76 @@ }, "yaxes": [ { + "decimals": null, "format": "ops", "label": null, "logBase": 1, "max": null, - "min": "0", + "min": null, "show": true }, { + "decimals": null, "format": "short", "label": null, "logBase": 1, "max": null, - "min": "0", + "min": null, "show": true } ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } }, { "aliasColors": {}, "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", - "decimals": 1, - 
"description": "The count of different kinds of block cache operations", + "description": "The time consumed when executing write operation", + "editable": true, + "error": false, "fieldConfig": { - "defaults": {}, - "overrides": [] + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } }, "fill": 1, - "fillGradient": 0, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, "gridPos": { - "h": 8, + "h": 7, "w": 12, "x": 12, - "y": 131 + "y": 14 }, - "hiddenSeries": false, - "id": 468, + "height": null, + "hideTimeOverride": false, + "id": 287, + "interval": null, + "isNew": true, "legend": { "alignAsTable": true, "avg": false, "current": true, + "hideEmpty": true, + "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, - "sort": "current", + "sort": "max", "sortDesc": true, "total": false, "values": true @@ -29799,86 +39298,100 @@ "lines": true, "linewidth": 1, "links": [], - "nullPointMode": "null", + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", "options": { - "alertThreshold": true + "alertThreshold": true, + "dataLinks": [] }, "percentage": false, - "pluginVersion": "7.5.7", "pointradius": 5, "points": false, "renderer": "flot", - "scopedVars": { - "db": { - "selected": false, - "text": "kv", - "value": "kv" - } - }, + "repeat": null, + "repeatDirection": null, "seriesOverrides": [], - "spaceLength": 10, + "span": null, "stack": false, "steppedLine": false, "targets": [ { - "expr": "sum(rate(tikv_engine_cache_efficiency{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", db=\"$db\", type=\"block_cache_add\"}[1m]))", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "total_add", - "metric": "", - "refId": "A", - "step": 10 - }, - { - "expr": 
"sum(rate(tikv_engine_cache_efficiency{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", db=\"$db\", type=\"block_cache_data_add\"}[1m]))", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "max((\n tikv_engine_write_micro_seconds\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$db\",type=\"write_max\"}\n \n)) ", "format": "time_series", - "intervalFactor": 2, - "legendFormat": "data_add", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "max", "metric": "", - "refId": "C", - "step": 10 + "query": "max((\n tikv_engine_write_micro_seconds\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$db\",type=\"write_max\"}\n \n)) ", + "refId": "", + "step": 10, + "target": "" }, { - "expr": "sum(rate(tikv_engine_cache_efficiency{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", db=\"$db\", type=\"block_cache_filter_add\"}[1m]))", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "avg((\n tikv_engine_write_micro_seconds\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$db\",type=\"write_percentile99\"}\n \n)) ", "format": "time_series", - "intervalFactor": 2, - "legendFormat": "filter_add", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "99%", "metric": "", - "refId": "D", - "step": 10 + "query": "avg((\n tikv_engine_write_micro_seconds\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$db\",type=\"write_percentile99\"}\n \n)) ", + "refId": "", + "step": 10, + "target": "" }, { - "expr": "sum(rate(tikv_engine_cache_efficiency{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", db=\"$db\", type=\"block_cache_index_add\"}[1m]))", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "avg((\n 
tikv_engine_write_micro_seconds\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$db\",type=\"write_percentile95\"}\n \n)) ", "format": "time_series", - "intervalFactor": 2, - "legendFormat": "index_add", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "95%", "metric": "", - "refId": "E", - "step": 10 + "query": "avg((\n tikv_engine_write_micro_seconds\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$db\",type=\"write_percentile95\"}\n \n)) ", + "refId": "", + "step": 10, + "target": "" }, { - "expr": "sum(rate(tikv_engine_cache_efficiency{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", db=\"$db\", type=\"block_cache_add_failures\"}[1m]))", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "avg((\n tikv_engine_write_micro_seconds\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$db\",type=\"write_average\"}\n \n)) ", "format": "time_series", - "intervalFactor": 2, - "legendFormat": "add_failures", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "avg", "metric": "", - "refId": "B", - "step": 10 + "query": "avg((\n tikv_engine_write_micro_seconds\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$db\",type=\"write_average\"}\n \n)) ", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, - "title": "Block cache operations", + "title": "Write duration", "tooltip": { + "msResolution": true, "shared": true, "sort": 0, "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -29886,14 +39399,16 @@ }, "yaxes": [ { - "format": "ops", + "decimals": null, + "format": "\u00b5s", "label": null, 
- "logBase": 1, + "logBase": 2, "max": null, "min": null, "show": true }, { + "decimals": null, "format": "short", "label": null, "logBase": 1, @@ -29904,42 +39419,56 @@ ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } }, { "aliasColors": {}, "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", - "decimals": 1, - "description": "The flow rate of read operations per type", + "description": "The count of WAL sync operations", + "editable": true, + "error": false, "fieldConfig": { - "defaults": {}, - "overrides": [] + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } + }, + "fill": 1, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" }, - "fill": 0, - "fillGradient": 0, "gridPos": { - "h": 8, + "h": 7, "w": 12, "x": 0, - "y": 139 + "y": 21 }, - "height": "", - "hiddenSeries": false, - "id": 85, + "height": null, + "hideTimeOverride": false, + "id": 288, + "interval": null, + "isNew": true, "legend": { "alignAsTable": true, "avg": false, "current": true, + "hideEmpty": true, + "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, - "sort": "current", + "sort": "max", "sortDesc": true, "total": false, "values": true @@ -29947,61 +39476,55 @@ "lines": true, "linewidth": 1, "links": [], - "nullPointMode": "null", + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", "options": { - "alertThreshold": true + "alertThreshold": true, + "dataLinks": [] }, "percentage": false, - "pluginVersion": "7.5.7", "pointradius": 5, "points": false, "renderer": "flot", - "scopedVars": { - "db": { - "selected": false, - "text": "kv", - "value": "kv" - } - }, + "repeat": null, + "repeatDirection": null, "seriesOverrides": [], - "spaceLength": 10, + "span": null, "stack": 
false, "steppedLine": false, "targets": [ { - "expr": "sum(rate(tikv_engine_flow_bytes{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", db=\"$db\", type=\"bytes_read\"}[1m]))", - "format": "time_series", - "hide": false, - "interval": "", - "intervalFactor": 2, - "legendFormat": "get", - "refId": "A", - "step": 10 - }, - { - "expr": "sum(rate(tikv_engine_flow_bytes{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", db=\"$db\", type=\"iter_bytes_read\"}[1m]))", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_engine_wal_file_synced\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$db\"}\n [$__rate_interval]\n)) ", "format": "time_series", "hide": false, + "instant": false, "interval": "", - "intervalFactor": 2, - "legendFormat": "scan", - "refId": "C", - "step": 10 + "intervalFactor": 1, + "legendFormat": "sync", + "metric": "", + "query": "sum(rate(\n tikv_engine_wal_file_synced\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$db\"}\n [$__rate_interval]\n)) ", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, - "title": "Read flow", + "title": "WAL sync operations", "tooltip": { + "msResolution": true, "shared": true, "sort": 0, "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -30009,59 +39532,76 @@ }, "yaxes": [ { - "format": "Bps", + "decimals": null, + "format": "ops", "label": null, "logBase": 1, "max": null, - "min": "0", + "min": null, "show": true }, { + "decimals": null, "format": "short", "label": null, "logBase": 1, "max": null, - "min": "0", + "min": null, "show": true } ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } }, { "aliasColors": {}, "bars": false, - 
"dashLength": 10, - "dashes": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", - "decimals": 1, - "description": "The count of keys in each column family", + "description": "The time consumed when executing write wal operation", + "editable": true, + "error": false, "fieldConfig": { - "defaults": {}, - "overrides": [] + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } }, "fill": 1, - "fillGradient": 0, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, "gridPos": { - "h": 8, + "h": 7, "w": 12, "x": 12, - "y": 139 + "y": 21 }, - "hiddenSeries": false, - "id": 131, + "height": null, + "hideTimeOverride": false, + "id": 289, + "interval": null, + "isNew": true, "legend": { "alignAsTable": true, "avg": false, "current": true, + "hideEmpty": true, + "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, - "sort": "current", + "sort": "max", "sortDesc": true, "total": false, "values": true @@ -30069,51 +39609,100 @@ "lines": true, "linewidth": 1, "links": [], - "nullPointMode": "null", + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", "options": { - "alertThreshold": true + "alertThreshold": true, + "dataLinks": [] }, "percentage": false, - "pluginVersion": "7.5.7", "pointradius": 5, "points": false, "renderer": "flot", - "scopedVars": { - "db": { - "selected": false, - "text": "kv", - "value": "kv" - } - }, + "repeat": null, + "repeatDirection": null, "seriesOverrides": [], - "spaceLength": 10, + "span": null, "stack": false, "steppedLine": false, "targets": [ { - "expr": "sum(tikv_engine_estimate_num_keys{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", db=\"$db\"}) by (cf)", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "max((\n 
tikv_engine_write_wal_time_micro_seconds\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$db\",type=\"write_wal_micros_max\"}\n \n)) ", "format": "time_series", "hide": false, - "intervalFactor": 2, - "legendFormat": "{{cf}}", - "metric": "tikv_engine_estimate_num_keys", - "refId": "A", - "step": 10 + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "max", + "metric": "", + "query": "max((\n tikv_engine_write_wal_time_micro_seconds\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$db\",type=\"write_wal_micros_max\"}\n \n)) ", + "refId": "", + "step": 10, + "target": "" + }, + { + "datasource": "${DS_TEST-CLUSTER}", + "expr": "avg((\n tikv_engine_write_wal_time_micro_seconds\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$db\",type=\"write_wal_micros_percentile99\"}\n \n)) ", + "format": "time_series", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "99%", + "metric": "", + "query": "avg((\n tikv_engine_write_wal_time_micro_seconds\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$db\",type=\"write_wal_micros_percentile99\"}\n \n)) ", + "refId": "", + "step": 10, + "target": "" + }, + { + "datasource": "${DS_TEST-CLUSTER}", + "expr": "avg((\n tikv_engine_write_wal_time_micro_seconds\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$db\",type=\"write_wal_micros_percentile95\"}\n \n)) ", + "format": "time_series", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "95%", + "metric": "", + "query": "avg((\n tikv_engine_write_wal_time_micro_seconds\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$db\",type=\"write_wal_micros_percentile95\"}\n \n)) ", + "refId": "", + "step": 10, + "target": "" 
+ }, + { + "datasource": "${DS_TEST-CLUSTER}", + "expr": "avg((\n tikv_engine_write_wal_time_micro_seconds\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$db\",type=\"write_wal_micros_average\"}\n \n)) ", + "format": "time_series", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "avg", + "metric": "", + "query": "avg((\n tikv_engine_write_wal_time_micro_seconds\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$db\",type=\"write_wal_micros_average\"}\n \n)) ", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, - "title": "Total keys", + "title": "Write WAL duration", "tooltip": { + "msResolution": true, "shared": true, "sort": 0, "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -30121,60 +39710,76 @@ }, "yaxes": [ { - "format": "short", + "decimals": null, + "format": "\u00b5s", "label": null, - "logBase": 1, + "logBase": 2, "max": null, - "min": "0", + "min": null, "show": true }, { + "decimals": null, "format": "short", "label": null, "logBase": 1, "max": null, - "min": "0", + "min": null, "show": true } ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } }, { "aliasColors": {}, "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", - "decimals": 1, - "description": "The flow of different kinds of write operations", + "description": "The count of compaction and flush operations", + "editable": true, + "error": false, "fieldConfig": { - "defaults": {}, - "overrides": [] - }, - "fill": 1, - "fillGradient": 0, - "gridPos": { - "h": 8, + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } + }, + "fill": 1, + "fillGradient": 1, + "grid": { + 
"threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, + "gridPos": { + "h": 7, "w": 12, "x": 0, - "y": 147 + "y": 28 }, - "height": "", - "hiddenSeries": false, - "id": 86, + "height": null, + "hideTimeOverride": false, + "id": 290, + "interval": null, + "isNew": true, "legend": { "alignAsTable": true, "avg": false, "current": true, + "hideEmpty": true, + "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, - "sort": "current", + "sort": "max", "sortDesc": true, "total": false, "values": true @@ -30182,59 +39787,55 @@ "lines": true, "linewidth": 1, "links": [], - "nullPointMode": "null", + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", "options": { - "alertThreshold": true + "alertThreshold": true, + "dataLinks": [] }, "percentage": false, - "pluginVersion": "7.5.7", "pointradius": 5, "points": false, "renderer": "flot", - "scopedVars": { - "db": { - "selected": false, - "text": "kv", - "value": "kv" - } - }, + "repeat": null, + "repeatDirection": null, "seriesOverrides": [], - "spaceLength": 10, + "span": null, "stack": false, "steppedLine": false, "targets": [ { - "expr": "sum(rate(tikv_engine_flow_bytes{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", db=\"$db\", type=\"wal_file_bytes\"}[1m]))", - "format": "time_series", - "hide": false, - "intervalFactor": 2, - "legendFormat": "wal", - "refId": "C", - "step": 10 - }, - { - "expr": "sum(rate(tikv_engine_flow_bytes{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", db=\"$db\", type=\"bytes_written\"}[1m]))", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_engine_event_total\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$db\"}\n [$__rate_interval]\n)) by (type) ", "format": "time_series", 
"hide": false, - "intervalFactor": 2, - "legendFormat": "write", - "refId": "A", - "step": 10 + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{type}}", + "metric": "", + "query": "sum(rate(\n tikv_engine_event_total\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$db\"}\n [$__rate_interval]\n)) by (type) ", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, - "title": "Write flow", + "title": "Compaction operations", "tooltip": { + "msResolution": true, "shared": true, "sort": 0, "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -30242,59 +39843,76 @@ }, "yaxes": [ { - "format": "Bps", + "decimals": null, + "format": "ops", "label": null, "logBase": 1, "max": null, - "min": "0", + "min": null, "show": true }, { + "decimals": null, "format": "short", "label": null, "logBase": 1, "max": null, - "min": "0", + "min": null, "show": true } ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } }, { "aliasColors": {}, "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", - "decimals": 1, - "description": "The bytes per read", + "description": "The time consumed when executing WAL sync operation", + "editable": true, + "error": false, "fieldConfig": { - "defaults": {}, - "overrides": [] + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } + }, + "fill": 1, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" }, - "fill": 0, - "fillGradient": 0, "gridPos": { - "h": 8, + "h": 7, "w": 12, "x": 12, - "y": 147 + "y": 28 }, - "hiddenSeries": false, - "id": 133, + "height": null, + 
"hideTimeOverride": false, + "id": 291, + "interval": null, + "isNew": true, "legend": { "alignAsTable": true, "avg": false, "current": true, + "hideEmpty": true, + "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, - "sort": "current", + "sort": "max", "sortDesc": true, "total": false, "values": true @@ -30302,74 +39920,100 @@ "lines": true, "linewidth": 1, "links": [], - "maxPerRow": 2, - "nullPointMode": "null", + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", "options": { - "alertThreshold": true + "alertThreshold": true, + "dataLinks": [] }, "percentage": false, - "pluginVersion": "7.5.7", "pointradius": 5, "points": false, "renderer": "flot", - "scopedVars": { - "db": { - "selected": false, - "text": "kv", - "value": "kv" - } - }, + "repeat": null, + "repeatDirection": null, "seriesOverrides": [], - "spaceLength": 10, + "span": null, "stack": false, "steppedLine": false, "targets": [ { - "expr": "max(tikv_engine_bytes_per_read{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", db=\"$db\",type=\"bytes_per_read_max\"})", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "max((\n tikv_engine_wal_file_sync_micro_seconds\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$db\",type=\"wal_file_sync_max\"}\n \n)) ", "format": "time_series", - "intervalFactor": 2, + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, "legendFormat": "max", - "refId": "A", - "step": 10 + "metric": "", + "query": "max((\n tikv_engine_wal_file_sync_micro_seconds\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$db\",type=\"wal_file_sync_max\"}\n \n)) ", + "refId": "", + "step": 10, + "target": "" }, { - "expr": "avg(tikv_engine_bytes_per_read{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", 
db=\"$db\",type=\"bytes_per_read_percentile99\"})", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "avg((\n tikv_engine_wal_file_sync_micro_seconds\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$db\",type=\"wal_file_sync_percentile99\"}\n \n)) ", "format": "time_series", - "intervalFactor": 2, + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, "legendFormat": "99%", - "refId": "B", - "step": 10 + "metric": "", + "query": "avg((\n tikv_engine_wal_file_sync_micro_seconds\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$db\",type=\"wal_file_sync_percentile99\"}\n \n)) ", + "refId": "", + "step": 10, + "target": "" }, { - "expr": "avg(tikv_engine_bytes_per_read{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", db=\"$db\",type=\"bytes_per_read_percentile95\"})", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "avg((\n tikv_engine_wal_file_sync_micro_seconds\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$db\",type=\"wal_file_sync_percentile95\"}\n \n)) ", "format": "time_series", - "intervalFactor": 2, + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, "legendFormat": "95%", - "refId": "C", - "step": 10 + "metric": "", + "query": "avg((\n tikv_engine_wal_file_sync_micro_seconds\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$db\",type=\"wal_file_sync_percentile95\"}\n \n)) ", + "refId": "", + "step": 10, + "target": "" }, { - "expr": "avg(tikv_engine_bytes_per_read{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", db=\"$db\",type=\"bytes_per_read_average\"})", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "avg((\n tikv_engine_wal_file_sync_micro_seconds\n 
{k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$db\",type=\"wal_file_sync_average\"}\n \n)) ", "format": "time_series", - "intervalFactor": 2, + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, "legendFormat": "avg", - "refId": "D", - "step": 10 + "metric": "", + "query": "avg((\n tikv_engine_wal_file_sync_micro_seconds\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$db\",type=\"wal_file_sync_average\"}\n \n)) ", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, - "title": "Bytes / Read", + "title": "WAL sync duration", "tooltip": { + "msResolution": true, "shared": true, "sort": 0, "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -30377,14 +40021,16 @@ }, "yaxes": [ { - "format": "decbytes", + "decimals": null, + "format": "\u00b5s", "label": null, "logBase": 10, "max": null, - "min": "0", + "min": null, "show": true }, { + "decimals": null, "format": "short", "label": null, "logBase": 1, @@ -30395,41 +40041,56 @@ ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } }, { "aliasColors": {}, "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", - "decimals": 1, - "description": "The flow rate of compaction operations per type", + "description": "Compaction guard actions", + "editable": true, + "error": false, "fieldConfig": { - "defaults": {}, - "overrides": [] + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } }, "fill": 1, - "fillGradient": 0, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, "gridPos": { - "h": 8, + "h": 7, 
"w": 12, "x": 0, - "y": 155 + "y": 35 }, - "hiddenSeries": false, - "id": 90, + "height": null, + "hideTimeOverride": false, + "id": 292, + "interval": null, + "isNew": true, "legend": { "alignAsTable": true, "avg": false, "current": true, + "hideEmpty": true, + "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, - "sort": "current", + "sort": "max", "sortDesc": true, "total": false, "values": true @@ -30437,68 +40098,55 @@ "lines": true, "linewidth": 1, "links": [], - "nullPointMode": "null", + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", "options": { - "alertThreshold": true + "alertThreshold": true, + "dataLinks": [] }, "percentage": false, - "pluginVersion": "7.5.7", "pointradius": 5, "points": false, "renderer": "flot", - "scopedVars": { - "db": { - "selected": false, - "text": "kv", - "value": "kv" - } - }, + "repeat": null, + "repeatDirection": null, "seriesOverrides": [], - "spaceLength": 10, + "span": null, "stack": false, "steppedLine": false, "targets": [ { - "expr": "sum(rate(tikv_engine_compaction_flow_bytes{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", db=\"$db\", type=\"bytes_read\"}[1m]))", - "format": "time_series", - "hide": false, - "intervalFactor": 2, - "legendFormat": "read", - "refId": "A", - "step": 10 - }, - { - "expr": "sum(rate(tikv_engine_compaction_flow_bytes{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", db=\"$db\", type=\"bytes_written\"}[1m]))", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_raftstore_compaction_guard_action_total\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",cf=~\"default|write\"}\n [$__rate_interval]\n)) by (cf, type) ", "format": "time_series", "hide": false, - "intervalFactor": 2, - "legendFormat": "written", - "refId": "C", - "step": 10 - }, - { - "expr": 
"sum(rate(tikv_engine_flow_bytes{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", db=\"$db\", type=\"flush_write_bytes\"}[1m]))", - "format": "time_series", - "hide": false, - "intervalFactor": 2, - "legendFormat": "flushed", - "refId": "B", - "step": 10 + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{cf}}-{{ type}}", + "metric": "", + "query": "sum(rate(\n tikv_raftstore_compaction_guard_action_total\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",cf=~\"default|write\"}\n [$__rate_interval]\n)) by (cf, type) ", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, - "title": "Compaction flow", + "title": "Compaction guard actions", "tooltip": { + "msResolution": true, "shared": true, "sort": 0, "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -30506,59 +40154,76 @@ }, "yaxes": [ { - "format": "Bps", + "decimals": null, + "format": "ops", "label": null, "logBase": 1, "max": null, - "min": "0", + "min": null, "show": true }, { - "format": "Bps", + "decimals": null, + "format": "short", "label": null, "logBase": 1, "max": null, - "min": "0", + "min": null, "show": true } ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } }, { "aliasColors": {}, "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", - "decimals": 1, - "description": "The bytes per write", + "description": "The time consumed when executing the compaction and flush operations", + "editable": true, + "error": false, "fieldConfig": { - "defaults": {}, - "overrides": [] + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } + }, + "fill": 1, + "fillGradient": 1, + "grid": { + "threshold1": null, + 
"threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" }, - "fill": 0, - "fillGradient": 0, "gridPos": { - "h": 8, + "h": 7, "w": 12, "x": 12, - "y": 155 + "y": 35 }, - "hiddenSeries": false, - "id": 134, + "height": null, + "hideTimeOverride": false, + "id": 293, + "interval": null, + "isNew": true, "legend": { "alignAsTable": true, "avg": false, "current": true, + "hideEmpty": true, + "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, - "sort": "current", + "sort": "max", "sortDesc": true, "total": false, "values": true @@ -30566,74 +40231,100 @@ "lines": true, "linewidth": 1, "links": [], - "maxPerRow": 2, - "nullPointMode": "null", + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", "options": { - "alertThreshold": true + "alertThreshold": true, + "dataLinks": [] }, "percentage": false, - "pluginVersion": "7.5.7", "pointradius": 5, "points": false, "renderer": "flot", - "scopedVars": { - "db": { - "selected": false, - "text": "kv", - "value": "kv" - } - }, + "repeat": null, + "repeatDirection": null, "seriesOverrides": [], - "spaceLength": 10, + "span": null, "stack": false, "steppedLine": false, "targets": [ { - "expr": "max(tikv_engine_bytes_per_write{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", db=\"$db\",type=\"bytes_per_write_max\"})", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "max((\n tikv_engine_compaction_time\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$db\",type=\"compaction_time_max\"}\n \n)) ", "format": "time_series", - "intervalFactor": 2, + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, "legendFormat": "max", - "refId": "A", - "step": 10 + "metric": "", + "query": "max((\n tikv_engine_compaction_time\n 
{k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$db\",type=\"compaction_time_max\"}\n \n)) ", + "refId": "", + "step": 10, + "target": "" }, { - "expr": "avg(tikv_engine_bytes_per_write{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", db=\"$db\",type=\"bytes_per_write_percentile99\"})", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "avg((\n tikv_engine_compaction_time\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$db\",type=\"compaction_time_percentile99\"}\n \n)) ", "format": "time_series", - "intervalFactor": 2, + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, "legendFormat": "99%", - "refId": "B", - "step": 10 + "metric": "", + "query": "avg((\n tikv_engine_compaction_time\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$db\",type=\"compaction_time_percentile99\"}\n \n)) ", + "refId": "", + "step": 10, + "target": "" }, { - "expr": "avg(tikv_engine_bytes_per_write{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", db=\"$db\",type=\"bytes_per_write_percentile95\"})", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "avg((\n tikv_engine_compaction_time\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$db\",type=\"compaction_time_percentile95\"}\n \n)) ", "format": "time_series", - "intervalFactor": 2, + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, "legendFormat": "95%", - "refId": "C", - "step": 10 + "metric": "", + "query": "avg((\n tikv_engine_compaction_time\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$db\",type=\"compaction_time_percentile95\"}\n \n)) ", + "refId": "", + "step": 10, + "target": "" }, { - "expr": "avg(tikv_engine_bytes_per_write{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", 
instance=~\"$instance\", db=\"$db\",type=\"bytes_per_write_average\"})", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "avg((\n tikv_engine_compaction_time\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$db\",type=\"compaction_time_average\"}\n \n)) ", "format": "time_series", - "intervalFactor": 2, + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, "legendFormat": "avg", - "refId": "D", - "step": 10 + "metric": "", + "query": "avg((\n tikv_engine_compaction_time\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$db\",type=\"compaction_time_average\"}\n \n)) ", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, - "title": "Bytes / Write", + "title": "Compaction duration", "tooltip": { + "msResolution": true, "shared": true, "sort": 0, "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -30641,14 +40332,16 @@ }, "yaxes": [ { - "format": "decbytes", + "decimals": null, + "format": "\u00b5s", "label": null, - "logBase": 10, + "logBase": 2, "max": null, - "min": "0", + "min": null, "show": true }, { + "decimals": null, "format": "short", "label": null, "logBase": 1, @@ -30659,41 +40352,56 @@ ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } }, { "aliasColors": {}, "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", - "decimals": 1, - "description": "The read amplification per TiKV instance \t", + "description": "The time consumed when reading SST files", + "editable": true, + "error": false, "fieldConfig": { - "defaults": {}, - "overrides": [] + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } }, "fill": 1, - "fillGradient": 0, + "fillGradient": 1, + 
"grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, "gridPos": { - "h": 8, + "h": 7, "w": 12, "x": 0, - "y": 163 + "y": 42 }, - "hiddenSeries": false, - "id": 518, + "height": null, + "hideTimeOverride": false, + "id": 294, + "interval": null, + "isNew": true, "legend": { "alignAsTable": true, "avg": false, "current": true, + "hideEmpty": true, + "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, - "sort": "current", + "sort": "max", "sortDesc": true, "total": false, "values": true @@ -30701,51 +40409,100 @@ "lines": true, "linewidth": 1, "links": [], - "nullPointMode": "null", + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", "options": { - "alertThreshold": true + "alertThreshold": true, + "dataLinks": [] }, "percentage": false, - "pluginVersion": "7.5.7", "pointradius": 5, "points": false, "renderer": "flot", - "scopedVars": { - "db": { - "selected": false, - "text": "kv", - "value": "kv" - } - }, + "repeat": null, + "repeatDirection": null, "seriesOverrides": [], - "spaceLength": 10, + "span": null, "stack": false, "steppedLine": false, "targets": [ { - "expr": "sum(rate(tikv_engine_read_amp_flow_bytes{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", db=\"$db\", type=\"read_amp_total_read_bytes\"}[1m])) by (instance) / sum(rate(tikv_engine_read_amp_flow_bytes{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", db=\"$db\", type=\"read_amp_estimate_useful_bytes\"}[1m])) by (instance)", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "max((\n tikv_engine_sst_read_micros\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$db\",type=\"sst_read_micros_max\"}\n \n)) ", "format": "time_series", "hide": false, - "intervalFactor": 2, - "legendFormat": "{{instance}}", + "instant": false, 
+ "interval": "", + "intervalFactor": 1, + "legendFormat": "max", + "metric": "", + "query": "max((\n tikv_engine_sst_read_micros\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$db\",type=\"sst_read_micros_max\"}\n \n)) ", + "refId": "", + "step": 10, + "target": "" + }, + { + "datasource": "${DS_TEST-CLUSTER}", + "expr": "avg((\n tikv_engine_sst_read_micros\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$db\",type=\"sst_read_micros_percentile99\"}\n \n)) ", + "format": "time_series", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "99%", + "metric": "", + "query": "avg((\n tikv_engine_sst_read_micros\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$db\",type=\"sst_read_micros_percentile99\"}\n \n)) ", + "refId": "", + "step": 10, + "target": "" + }, + { + "datasource": "${DS_TEST-CLUSTER}", + "expr": "avg((\n tikv_engine_sst_read_micros\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$db\",type=\"sst_read_micros_percentile95\"}\n \n)) ", + "format": "time_series", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "95%", + "metric": "", + "query": "avg((\n tikv_engine_sst_read_micros\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$db\",type=\"sst_read_micros_percentile95\"}\n \n)) ", + "refId": "", + "step": 10, + "target": "" + }, + { + "datasource": "${DS_TEST-CLUSTER}", + "expr": "avg((\n tikv_engine_sst_read_micros\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$db\",type=\"sst_read_micros_average\"}\n \n)) ", + "format": "time_series", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "avg", "metric": "", - "refId": "A", - "step": 10 + "query": "avg((\n 
tikv_engine_sst_read_micros\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$db\",type=\"sst_read_micros_average\"}\n \n)) ", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, - "title": "Read amplication", + "title": "SST read duration", "tooltip": { + "msResolution": true, "shared": true, "sort": 0, "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -30753,59 +40510,76 @@ }, "yaxes": [ { - "format": "short", + "decimals": null, + "format": "\u00b5s", "label": null, - "logBase": 1, + "logBase": 2, "max": null, - "min": "0", + "min": null, "show": true }, { + "decimals": null, "format": "short", "label": null, "logBase": 1, "max": null, - "min": "0", + "min": null, "show": true } ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } }, { "aliasColors": {}, "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", - "decimals": 1, - "description": "The pending bytes to be compacted", + "description": null, + "editable": true, + "error": false, "fieldConfig": { - "defaults": {}, - "overrides": [] + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } }, "fill": 1, - "fillGradient": 0, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, "gridPos": { - "h": 8, + "h": 7, "w": 12, "x": 12, - "y": 163 + "y": 42 }, - "hiddenSeries": false, - "id": 127, + "height": null, + "hideTimeOverride": false, + "id": 295, + "interval": null, + "isNew": true, "legend": { "alignAsTable": true, "avg": false, "current": true, + "hideEmpty": true, + "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, 
"sideWidth": null, - "sort": "current", + "sort": "max", "sortDesc": true, "total": false, "values": true @@ -30813,51 +40587,55 @@ "lines": true, "linewidth": 1, "links": [], - "nullPointMode": "null", + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", "options": { - "alertThreshold": true + "alertThreshold": true, + "dataLinks": [] }, "percentage": false, - "pluginVersion": "7.5.7", "pointradius": 5, "points": false, "renderer": "flot", - "scopedVars": { - "db": { - "selected": false, - "text": "kv", - "value": "kv" - } - }, + "repeat": null, + "repeatDirection": null, "seriesOverrides": [], - "spaceLength": 10, + "span": null, "stack": false, "steppedLine": false, "targets": [ { - "expr": "sum(tikv_engine_pending_compaction_bytes{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", db=\"$db\"}) by (cf)", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_engine_compaction_reason\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$db\"}\n [$__rate_interval]\n)) by (cf, reason) ", "format": "time_series", "hide": false, - "intervalFactor": 2, - "legendFormat": "{{cf}}", - "metric": "tikv_engine_pending_compaction_bytes", - "refId": "A", - "step": 10 + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{cf}}-{{reason}}", + "metric": "", + "query": "sum(rate(\n tikv_engine_compaction_reason\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$db\"}\n [$__rate_interval]\n)) by (cf, reason) ", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, - "title": "Compaction pending bytes", + "title": "Compaction reason", "tooltip": { + "msResolution": true, "shared": true, "sort": 0, "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - 
"buckets": null, "mode": "time", "name": null, "show": true, @@ -30865,59 +40643,76 @@ }, "yaxes": [ { - "format": "bytes", + "decimals": null, + "format": "short", "label": null, "logBase": 1, "max": null, - "min": "0", + "min": null, "show": true }, { - "format": "Bps", + "decimals": null, + "format": "short", "label": null, "logBase": 1, "max": null, - "min": "0", + "min": null, "show": true } ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } }, { "aliasColors": {}, "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", - "decimals": 1, - "description": "The number of snapshot of each TiKV instance", + "description": "The block cache size. Broken down by column family if shared block cache is disabled.", + "editable": true, + "error": false, "fieldConfig": { - "defaults": {}, - "overrides": [] + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } }, "fill": 1, - "fillGradient": 0, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, "gridPos": { - "h": 8, + "h": 7, "w": 12, "x": 0, - "y": 171 + "y": 49 }, - "hiddenSeries": false, - "id": 516, + "height": null, + "hideTimeOverride": false, + "id": 296, + "interval": null, + "isNew": true, "legend": { "alignAsTable": true, "avg": false, "current": true, + "hideEmpty": true, + "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, - "sort": "current", + "sort": "max", "sortDesc": true, "total": false, "values": true @@ -30925,51 +40720,55 @@ "lines": true, "linewidth": 1, "links": [], - "nullPointMode": "null", + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", "options": { - "alertThreshold": true + "alertThreshold": true, + "dataLinks": [] }, "percentage": false, - "pluginVersion": "7.5.7", 
"pointradius": 5, "points": false, "renderer": "flot", - "scopedVars": { - "db": { - "selected": false, - "text": "kv", - "value": "kv" - } - }, + "repeat": null, + "repeatDirection": null, "seriesOverrides": [], - "spaceLength": 10, + "span": null, "stack": false, "steppedLine": false, "targets": [ { - "expr": "tikv_engine_num_snapshots{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", db=\"$db\"}", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "topk(20,(\n avg((\n tikv_engine_block_cache_size_bytes\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$db\"}\n \n)) by (cf, instance) \n \n \n)) ", "format": "time_series", "hide": false, - "intervalFactor": 2, - "legendFormat": "{{instance}}", + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{instance}}-{{cf}}", "metric": "", - "refId": "A", - "step": 10 + "query": "topk(20,(\n avg((\n tikv_engine_block_cache_size_bytes\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$db\"}\n \n)) by (cf, instance) \n \n \n)) ", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, - "title": "Number of snapshots", + "title": "Block cache size", "tooltip": { + "msResolution": true, "shared": true, "sort": 0, "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -30977,59 +40776,76 @@ }, "yaxes": [ { - "format": "short", + "decimals": null, + "format": "bytes", "label": null, "logBase": 1, "max": null, - "min": "0", + "min": null, "show": true }, { + "decimals": null, "format": "short", "label": null, "logBase": 1, "max": null, - "min": "0", + "min": null, "show": true } ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } }, { "aliasColors": {}, "bars": false, - "dashLength": 
10, - "dashes": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", - "decimals": 1, - "description": "The compression ratio of each level", + "description": "The hit rate of memtable", + "editable": true, + "error": false, "fieldConfig": { - "defaults": {}, - "overrides": [] + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } }, "fill": 1, - "fillGradient": 0, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, "gridPos": { - "h": 8, + "h": 7, "w": 12, "x": 12, - "y": 171 + "y": 49 }, - "hiddenSeries": false, - "id": 863, + "height": null, + "hideTimeOverride": false, + "id": 297, + "interval": null, + "isNew": true, "legend": { "alignAsTable": true, "avg": false, "current": true, + "hideEmpty": true, + "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, - "sort": "current", + "sort": "max", "sortDesc": true, "total": false, "values": true @@ -31037,51 +40853,55 @@ "lines": true, "linewidth": 1, "links": [], - "nullPointMode": "null", + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", "options": { - "alertThreshold": true + "alertThreshold": true, + "dataLinks": [] }, "percentage": false, - "pluginVersion": "7.5.7", "pointradius": 5, "points": false, "renderer": "flot", - "scopedVars": { - "db": { - "selected": false, - "text": "kv", - "value": "kv" - } - }, + "repeat": null, + "repeatDirection": null, "seriesOverrides": [], - "spaceLength": 10, + "span": null, "stack": false, "steppedLine": false, "targets": [ { - "expr": "avg(tikv_engine_compression_ratio{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", db=\"$db\"}) by (cf, level)", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "(sum(rate(\n tikv_engine_memtable_efficiency\n 
{k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$db\",type=\"memtable_hit\"}\n [$__rate_interval]\n)) / (sum(rate(\n tikv_engine_memtable_efficiency\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$db\",type=\"memtable_hit\"}\n [$__rate_interval]\n)) + sum(rate(\n tikv_engine_memtable_efficiency\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$db\",type=\"memtable_miss\"}\n [$__rate_interval]\n)) ))", "format": "time_series", "hide": false, - "intervalFactor": 2, - "legendFormat": "{{cf}} - level - {{level}}", + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "hit", "metric": "", - "refId": "A", - "step": 10 + "query": "(sum(rate(\n tikv_engine_memtable_efficiency\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$db\",type=\"memtable_hit\"}\n [$__rate_interval]\n)) / (sum(rate(\n tikv_engine_memtable_efficiency\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$db\",type=\"memtable_hit\"}\n [$__rate_interval]\n)) + sum(rate(\n tikv_engine_memtable_efficiency\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$db\",type=\"memtable_miss\"}\n [$__rate_interval]\n)) ))", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, - "title": "Compression ratio", + "title": "Memtable hit", "tooltip": { + "msResolution": true, "shared": true, "sort": 0, "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -31089,57 +40909,76 @@ }, "yaxes": [ { - "format": "short", + "decimals": null, + "format": "percentunit", "label": null, "logBase": 1, "max": null, - "min": "0", + "min": null, "show": true }, { + "decimals": 
null, "format": "short", "label": null, "logBase": 1, "max": null, - "min": "0", + "min": null, "show": true } ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } }, { "aliasColors": {}, "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", - "description": "The number of SST files for different column families in each level", + "description": "The flow of different kinds of block cache operations", + "editable": true, + "error": false, "fieldConfig": { - "defaults": {}, - "overrides": [] + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } }, "fill": 1, - "fillGradient": 0, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, "gridPos": { - "h": 8, + "h": 7, "w": 12, "x": 0, - "y": 179 + "y": 56 }, - "hiddenSeries": false, - "id": 2002, + "height": null, + "hideTimeOverride": false, + "id": 298, + "interval": null, + "isNew": true, "legend": { "alignAsTable": true, "avg": false, "current": true, + "hideEmpty": true, + "hideZero": true, "max": true, - "min": true, + "min": false, "rightSide": true, "show": true, - "sort": "current", + "sideWidth": null, + "sort": "max", "sortDesc": true, "total": false, "values": true @@ -31147,48 +40986,145 @@ "lines": true, "linewidth": 1, "links": [], - "nullPointMode": "null", + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", "options": { - "alertThreshold": true + "alertThreshold": true, + "dataLinks": [] }, "percentage": false, - "pluginVersion": "7.5.7", "pointradius": 5, "points": false, "renderer": "flot", - "scopedVars": { - "db": { - "selected": false, - "text": "kv", - "value": "kv" - } - }, + "repeat": null, + "repeatDirection": null, "seriesOverrides": [], - "spaceLength": 10, + "span": null, "stack": false, "steppedLine": false, "targets": [ 
{ - "expr": "avg(tikv_engine_num_files_at_level{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", db=\"$db\"}) by (cf, level)", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_engine_flow_bytes\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$db\",type=\"block_cache_byte_read\"}\n [$__rate_interval]\n)) ", + "format": "time_series", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "total_read", + "metric": "", + "query": "sum(rate(\n tikv_engine_flow_bytes\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$db\",type=\"block_cache_byte_read\"}\n [$__rate_interval]\n)) ", + "refId": "", + "step": 10, + "target": "" + }, + { + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_engine_flow_bytes\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$db\",type=\"block_cache_byte_write\"}\n [$__rate_interval]\n)) ", + "format": "time_series", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "total_written", + "metric": "", + "query": "sum(rate(\n tikv_engine_flow_bytes\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$db\",type=\"block_cache_byte_write\"}\n [$__rate_interval]\n)) ", + "refId": "", + "step": 10, + "target": "" + }, + { + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_engine_cache_efficiency\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$db\",type=\"block_cache_data_bytes_insert\"}\n [$__rate_interval]\n)) ", + "format": "time_series", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "data_insert", + "metric": "", + "query": "sum(rate(\n tikv_engine_cache_efficiency\n 
{k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$db\",type=\"block_cache_data_bytes_insert\"}\n [$__rate_interval]\n)) ", + "refId": "", + "step": 10, + "target": "" + }, + { + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_engine_cache_efficiency\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$db\",type=\"block_cache_filter_bytes_insert\"}\n [$__rate_interval]\n)) ", + "format": "time_series", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "filter_insert", + "metric": "", + "query": "sum(rate(\n tikv_engine_cache_efficiency\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$db\",type=\"block_cache_filter_bytes_insert\"}\n [$__rate_interval]\n)) ", + "refId": "", + "step": 10, + "target": "" + }, + { + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_engine_cache_efficiency\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$db\",type=\"block_cache_filter_bytes_evict\"}\n [$__rate_interval]\n)) ", + "format": "time_series", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "filter_evict", + "metric": "", + "query": "sum(rate(\n tikv_engine_cache_efficiency\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$db\",type=\"block_cache_filter_bytes_evict\"}\n [$__rate_interval]\n)) ", + "refId": "", + "step": 10, + "target": "" + }, + { + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_engine_cache_efficiency\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$db\",type=\"block_cache_index_bytes_insert\"}\n [$__rate_interval]\n)) ", + "format": "time_series", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "index_insert", + "metric": 
"", + "query": "sum(rate(\n tikv_engine_cache_efficiency\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$db\",type=\"block_cache_index_bytes_insert\"}\n [$__rate_interval]\n)) ", + "refId": "", + "step": 10, + "target": "" + }, + { + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_engine_cache_efficiency\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$db\",type=\"block_cache_index_bytes_evict\"}\n [$__rate_interval]\n)) ", "format": "time_series", - "intervalFactor": 2, - "legendFormat": "cf-{{cf}}, level-{{level}}", - "refId": "A" + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "index_evict", + "metric": "", + "query": "sum(rate(\n tikv_engine_cache_efficiency\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$db\",type=\"block_cache_index_bytes_evict\"}\n [$__rate_interval]\n)) ", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, - "title": "Number files at each level", + "title": "Block cache flow", "tooltip": { + "msResolution": true, "shared": true, "sort": 0, "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -31196,14 +41132,16 @@ }, "yaxes": [ { - "format": "short", + "decimals": null, + "format": "binBps", "label": null, - "logBase": 1, + "logBase": 10, "max": null, "min": null, "show": true }, { + "decimals": null, "format": "short", "label": null, "logBase": 1, @@ -31214,41 +41152,56 @@ ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } }, { "aliasColors": {}, "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", - "decimals": 1, - "description": "The time that the oldest unreleased 
snapshot survivals", + "description": "The hit rate of block cache", + "editable": true, + "error": false, "fieldConfig": { - "defaults": {}, - "overrides": [] + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } }, "fill": 1, - "fillGradient": 0, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, "gridPos": { - "h": 8, + "h": 7, "w": 12, "x": 12, - "y": 179 + "y": 56 }, - "hiddenSeries": false, - "id": 517, + "height": null, + "hideTimeOverride": false, + "id": 299, + "interval": null, + "isNew": true, "legend": { "alignAsTable": true, "avg": false, "current": true, + "hideEmpty": true, + "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, - "sort": "current", + "sort": "max", "sortDesc": true, "total": false, "values": true @@ -31256,51 +41209,115 @@ "lines": true, "linewidth": 1, "links": [], - "nullPointMode": "null", + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", "options": { - "alertThreshold": true + "alertThreshold": true, + "dataLinks": [] }, "percentage": false, - "pluginVersion": "7.5.7", "pointradius": 5, "points": false, "renderer": "flot", - "scopedVars": { - "db": { - "selected": false, - "text": "kv", - "value": "kv" - } - }, + "repeat": null, + "repeatDirection": null, "seriesOverrides": [], - "spaceLength": 10, + "span": null, "stack": false, "steppedLine": false, "targets": [ { - "expr": "tikv_engine_oldest_snapshot_duration{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", db=\"$db\"}", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "(sum(rate(\n tikv_engine_cache_efficiency\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$db\",type=\"block_cache_hit\"}\n [$__rate_interval]\n)) / (sum(rate(\n 
tikv_engine_cache_efficiency\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$db\",type=\"block_cache_hit\"}\n [$__rate_interval]\n)) + sum(rate(\n tikv_engine_cache_efficiency\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$db\",type=\"block_cache_miss\"}\n [$__rate_interval]\n)) ))", "format": "time_series", "hide": false, - "intervalFactor": 2, - "legendFormat": "{{instance}}", - "metric": "tikv_engine_oldest_snapshot_duration", - "refId": "A", - "step": 10 - } + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "all", + "metric": "", + "query": "(sum(rate(\n tikv_engine_cache_efficiency\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$db\",type=\"block_cache_hit\"}\n [$__rate_interval]\n)) / (sum(rate(\n tikv_engine_cache_efficiency\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$db\",type=\"block_cache_hit\"}\n [$__rate_interval]\n)) + sum(rate(\n tikv_engine_cache_efficiency\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$db\",type=\"block_cache_miss\"}\n [$__rate_interval]\n)) ))", + "refId": "", + "step": 10, + "target": "" + }, + { + "datasource": "${DS_TEST-CLUSTER}", + "expr": "(sum(rate(\n tikv_engine_cache_efficiency\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$db\",type=\"block_cache_data_hit\"}\n [$__rate_interval]\n)) / (sum(rate(\n tikv_engine_cache_efficiency\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$db\",type=\"block_cache_data_hit\"}\n [$__rate_interval]\n)) + sum(rate(\n tikv_engine_cache_efficiency\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$db\",type=\"block_cache_data_miss\"}\n [$__rate_interval]\n)) ))", + "format": "time_series", + "hide": false, + 
"instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "data", + "metric": "", + "query": "(sum(rate(\n tikv_engine_cache_efficiency\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$db\",type=\"block_cache_data_hit\"}\n [$__rate_interval]\n)) / (sum(rate(\n tikv_engine_cache_efficiency\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$db\",type=\"block_cache_data_hit\"}\n [$__rate_interval]\n)) + sum(rate(\n tikv_engine_cache_efficiency\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$db\",type=\"block_cache_data_miss\"}\n [$__rate_interval]\n)) ))", + "refId": "", + "step": 10, + "target": "" + }, + { + "datasource": "${DS_TEST-CLUSTER}", + "expr": "(sum(rate(\n tikv_engine_cache_efficiency\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$db\",type=\"block_cache_filter_hit\"}\n [$__rate_interval]\n)) / (sum(rate(\n tikv_engine_cache_efficiency\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$db\",type=\"block_cache_filter_hit\"}\n [$__rate_interval]\n)) + sum(rate(\n tikv_engine_cache_efficiency\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$db\",type=\"block_cache_filter_miss\"}\n [$__rate_interval]\n)) ))", + "format": "time_series", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "filter", + "metric": "", + "query": "(sum(rate(\n tikv_engine_cache_efficiency\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$db\",type=\"block_cache_filter_hit\"}\n [$__rate_interval]\n)) / (sum(rate(\n tikv_engine_cache_efficiency\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$db\",type=\"block_cache_filter_hit\"}\n [$__rate_interval]\n)) + sum(rate(\n 
tikv_engine_cache_efficiency\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$db\",type=\"block_cache_filter_miss\"}\n [$__rate_interval]\n)) ))", + "refId": "", + "step": 10, + "target": "" + }, + { + "datasource": "${DS_TEST-CLUSTER}", + "expr": "(sum(rate(\n tikv_engine_cache_efficiency\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$db\",type=\"block_cache_index_hit\"}\n [$__rate_interval]\n)) / (sum(rate(\n tikv_engine_cache_efficiency\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$db\",type=\"block_cache_index_hit\"}\n [$__rate_interval]\n)) + sum(rate(\n tikv_engine_cache_efficiency\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$db\",type=\"block_cache_index_miss\"}\n [$__rate_interval]\n)) ))", + "format": "time_series", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "index", + "metric": "", + "query": "(sum(rate(\n tikv_engine_cache_efficiency\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$db\",type=\"block_cache_index_hit\"}\n [$__rate_interval]\n)) / (sum(rate(\n tikv_engine_cache_efficiency\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$db\",type=\"block_cache_index_hit\"}\n [$__rate_interval]\n)) + sum(rate(\n tikv_engine_cache_efficiency\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$db\",type=\"block_cache_index_miss\"}\n [$__rate_interval]\n)) ))", + "refId": "", + "step": 10, + "target": "" + }, + { + "datasource": "${DS_TEST-CLUSTER}", + "expr": "(sum(rate(\n tikv_engine_bloom_efficiency\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$db\",type=\"bloom_prefix_useful\"}\n [$__rate_interval]\n)) / sum(rate(\n tikv_engine_bloom_efficiency\n 
{k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$db\",type=\"bloom_prefix_checked\"}\n [$__rate_interval]\n)) )", + "format": "time_series", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "bloom prefix", + "metric": "", + "query": "(sum(rate(\n tikv_engine_bloom_efficiency\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$db\",type=\"bloom_prefix_useful\"}\n [$__rate_interval]\n)) / sum(rate(\n tikv_engine_bloom_efficiency\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$db\",type=\"bloom_prefix_checked\"}\n [$__rate_interval]\n)) )", + "refId": "", + "step": 10, + "target": "" + } ], "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, - "title": "Oldest snapshots duration", + "title": "Block cache hit", "tooltip": { + "msResolution": true, "shared": true, "sort": 0, "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -31308,58 +41325,76 @@ }, "yaxes": [ { - "format": "s", + "decimals": null, + "format": "percentunit", "label": null, "logBase": 1, "max": null, - "min": "0", + "min": null, "show": true }, { + "decimals": null, "format": "short", "label": null, "logBase": 1, "max": null, - "min": "0", + "min": null, "show": true } ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } }, { "aliasColors": {}, "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", - "description": "Stall conditions changed of each column family", + "description": "The flow of different kinds of operations on keys", + "editable": true, + "error": false, "fieldConfig": { - "defaults": {}, - "overrides": [] + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } }, "fill": 1, 
- "fillGradient": 0, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, "gridPos": { - "h": 8, + "h": 7, "w": 12, "x": 0, - "y": 187 + "y": 63 }, - "hiddenSeries": false, - "id": 2381, + "height": null, + "hideTimeOverride": false, + "id": 300, + "interval": null, + "isNew": true, "legend": { "alignAsTable": true, "avg": false, "current": true, + "hideEmpty": true, "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, - "sort": "current", + "sideWidth": null, + "sort": "max", "sortDesc": true, "total": false, "values": true @@ -31367,48 +41402,85 @@ "lines": true, "linewidth": 1, "links": [], - "nullPointMode": "null", + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", "options": { - "alertThreshold": true + "alertThreshold": true, + "dataLinks": [] }, "percentage": false, - "pluginVersion": "7.5.7", "pointradius": 5, "points": false, "renderer": "flot", - "scopedVars": { - "db": { - "selected": false, - "text": "kv", - "value": "kv" - } - }, + "repeat": null, + "repeatDirection": null, "seriesOverrides": [], - "spaceLength": 10, + "span": null, "stack": false, "steppedLine": false, "targets": [ { - "expr": "tikv_engine_stall_conditions_changed{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", db=\"$db\"}", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_engine_flow_bytes\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$db\",type=\"keys_read\"}\n [$__rate_interval]\n)) ", "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{instance}}-{{cf}}-{{type}}", - "refId": "B" + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "read", + "metric": "", + "query": "sum(rate(\n tikv_engine_flow_bytes\n 
{k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$db\",type=\"keys_read\"}\n [$__rate_interval]\n)) ", + "refId": "", + "step": 10, + "target": "" + }, + { + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_engine_flow_bytes\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$db\",type=\"keys_written\"}\n [$__rate_interval]\n)) ", + "format": "time_series", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "written", + "metric": "", + "query": "sum(rate(\n tikv_engine_flow_bytes\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$db\",type=\"keys_written\"}\n [$__rate_interval]\n)) ", + "refId": "", + "step": 10, + "target": "" + }, + { + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_engine_compaction_num_corrupt_keys\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$db\"}\n [$__rate_interval]\n)) ", + "format": "time_series", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "corrupt", + "metric": "", + "query": "sum(rate(\n tikv_engine_compaction_num_corrupt_keys\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$db\"}\n [$__rate_interval]\n)) ", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, - "title": "Stall conditions changed of each CF", + "title": "Keys flow", "tooltip": { + "msResolution": true, "shared": true, "sort": 0, "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -31416,7 +41488,8 @@ }, "yaxes": [ { - "format": "short", + "decimals": null, + "format": "ops", "label": null, "logBase": 1, "max": null, @@ -31424,6 +41497,7 @@ 
"show": true }, { + "decimals": null, "format": "short", "label": null, "logBase": 1, @@ -31434,39 +41508,56 @@ ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } }, { "aliasColors": {}, "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", - "description": "The time consumed when ingesting SST files", + "description": "The count of different kinds of block cache operations", + "editable": true, + "error": false, "fieldConfig": { - "defaults": {}, - "overrides": [] + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } }, "fill": 1, - "fillGradient": 0, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, "gridPos": { - "h": 8, + "h": 7, "w": 12, "x": 12, - "y": 187 + "y": 63 }, - "hiddenSeries": false, - "id": 2003, + "height": null, + "hideTimeOverride": false, + "id": 301, + "interval": null, + "isNew": true, "legend": { "alignAsTable": true, "avg": false, "current": true, + "hideEmpty": true, + "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, - "sort": "current", + "sideWidth": null, + "sort": "max", "sortDesc": true, "total": false, "values": true @@ -31474,55 +41565,115 @@ "lines": true, "linewidth": 1, "links": [], - "nullPointMode": "null", + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", "options": { - "alertThreshold": true + "alertThreshold": true, + "dataLinks": [] }, "percentage": false, - "pluginVersion": "7.5.7", "pointradius": 5, "points": false, "renderer": "flot", - "scopedVars": { - "db": { - "selected": false, - "text": "kv", - "value": "kv" - } - }, + "repeat": null, + "repeatDirection": null, "seriesOverrides": [], - "spaceLength": 10, + "span": null, "stack": false, "steppedLine": false, "targets": [ { - "expr": "histogram_quantile(0.99, 
sum(rate(tikv_snapshot_ingest_sst_duration_seconds_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", db=\"$db\"}[1m])) by (le))", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_engine_cache_efficiency\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$db\",type=\"block_cache_add\"}\n [$__rate_interval]\n)) ", "format": "time_series", - "intervalFactor": 2, - "legendFormat": "99%", - "refId": "A" + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "total_add", + "metric": "", + "query": "sum(rate(\n tikv_engine_cache_efficiency\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$db\",type=\"block_cache_add\"}\n [$__rate_interval]\n)) ", + "refId": "", + "step": 10, + "target": "" + }, + { + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_engine_cache_efficiency\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$db\",type=\"block_cache_data_add\"}\n [$__rate_interval]\n)) ", + "format": "time_series", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "data_add", + "metric": "", + "query": "sum(rate(\n tikv_engine_cache_efficiency\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$db\",type=\"block_cache_data_add\"}\n [$__rate_interval]\n)) ", + "refId": "", + "step": 10, + "target": "" + }, + { + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_engine_cache_efficiency\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$db\",type=\"block_cache_filter_add\"}\n [$__rate_interval]\n)) ", + "format": "time_series", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "filter_add", + "metric": "", + "query": "sum(rate(\n 
tikv_engine_cache_efficiency\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$db\",type=\"block_cache_filter_add\"}\n [$__rate_interval]\n)) ", + "refId": "", + "step": 10, + "target": "" + }, + { + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_engine_cache_efficiency\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$db\",type=\"block_cache_index_add\"}\n [$__rate_interval]\n)) ", + "format": "time_series", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "index_add", + "metric": "", + "query": "sum(rate(\n tikv_engine_cache_efficiency\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$db\",type=\"block_cache_index_add\"}\n [$__rate_interval]\n)) ", + "refId": "", + "step": 10, + "target": "" }, { - "expr": "sum(rate(tikv_snapshot_ingest_sst_duration_seconds_sum{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", db=\"$db\"}[1m])) / sum(rate(tikv_snapshot_ingest_sst_duration_seconds_count{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", db=\"$db\"}[1m]))", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_engine_cache_efficiency\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$db\",type=\"block_cache_add_failures\"}\n [$__rate_interval]\n)) ", "format": "time_series", - "intervalFactor": 2, - "legendFormat": "average", - "refId": "B" + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "add_failures", + "metric": "", + "query": "sum(rate(\n tikv_engine_cache_efficiency\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$db\",type=\"block_cache_add_failures\"}\n [$__rate_interval]\n)) ", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], 
"timeFrom": null, - "timeRegions": [], "timeShift": null, - "title": "Ingest SST duration seconds", + "title": "Block cache operations", "tooltip": { + "msResolution": true, "shared": true, "sort": 0, "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -31530,7 +41681,8 @@ }, "yaxes": [ { - "format": "s", + "decimals": null, + "format": "ops", "label": null, "logBase": 1, "max": null, @@ -31538,6 +41690,7 @@ "show": true }, { + "decimals": null, "format": "short", "label": null, "logBase": 1, @@ -31548,40 +41701,56 @@ ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } }, { "aliasColors": {}, "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", - "decimals": 1, + "description": "The flow rate of read operations per type", + "editable": true, + "error": false, "fieldConfig": { - "defaults": {}, - "overrides": [] + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } }, "fill": 1, - "fillGradient": 0, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, "gridPos": { - "h": 8, + "h": 7, "w": 12, "x": 0, - "y": 195 + "y": 70 }, - "hiddenSeries": false, - "id": 2452, + "height": null, + "hideTimeOverride": false, + "id": 302, + "interval": null, + "isNew": true, "legend": { "alignAsTable": true, "avg": false, "current": true, + "hideEmpty": true, + "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, - "sort": "current", + "sort": "max", "sortDesc": true, "total": false, "values": true @@ -31589,51 +41758,70 @@ "lines": true, "linewidth": 1, "links": [], - "nullPointMode": "null", + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", 
"options": { - "alertThreshold": true + "alertThreshold": true, + "dataLinks": [] }, "percentage": false, - "pluginVersion": "7.5.7", "pointradius": 5, "points": false, "renderer": "flot", - "scopedVars": { - "db": { - "selected": false, - "text": "kv", - "value": "kv" - } - }, + "repeat": null, + "repeatDirection": null, "seriesOverrides": [], - "spaceLength": 10, + "span": null, "stack": false, "steppedLine": false, "targets": [ { - "expr": "sum(increase(tikv_engine_write_stall_reason{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", db=\"$db\"}[1m])) by (type)", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_engine_flow_bytes\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$db\",type=\"bytes_read\"}\n [$__rate_interval]\n)) ", "format": "time_series", "hide": false, - "intervalFactor": 2, - "legendFormat": "{{type}}", + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "get", + "metric": "", + "query": "sum(rate(\n tikv_engine_flow_bytes\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$db\",type=\"bytes_read\"}\n [$__rate_interval]\n)) ", + "refId": "", + "step": 10, + "target": "" + }, + { + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_engine_flow_bytes\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$db\",type=\"iter_bytes_read\"}\n [$__rate_interval]\n)) ", + "format": "time_series", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "scan", "metric": "", - "refId": "A", - "step": 10 + "query": "sum(rate(\n tikv_engine_flow_bytes\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$db\",type=\"iter_bytes_read\"}\n [$__rate_interval]\n)) ", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, - "timeRegions": [], 
"timeShift": null, - "title": "Write Stall Reason", + "title": "Read flow", "tooltip": { + "msResolution": true, "shared": true, "sort": 0, "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -31641,59 +41829,76 @@ }, "yaxes": [ { - "format": "short", + "decimals": null, + "format": "binBps", "label": null, "logBase": 1, "max": null, - "min": "0", + "min": null, "show": true }, { + "decimals": null, "format": "short", "label": null, "logBase": 1, "max": null, - "min": "0", + "min": null, "show": true } ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } }, { "aliasColors": {}, "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", - "decimals": 1, - "description": "The time which is caused by write stall", + "description": "The count of keys in each column family", + "editable": true, + "error": false, "fieldConfig": { - "defaults": {}, - "overrides": [] + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } }, "fill": 1, - "fillGradient": 0, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, "gridPos": { - "h": 8, + "h": 7, "w": 12, "x": 12, - "y": 195 + "y": 70 }, - "hiddenSeries": false, - "id": 87, + "height": null, + "hideTimeOverride": false, + "id": 303, + "interval": null, + "isNew": true, "legend": { "alignAsTable": true, "avg": false, "current": true, + "hideEmpty": true, + "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, - "sort": "current", + "sort": "max", "sortDesc": true, "total": false, "values": true @@ -31701,77 +41906,55 @@ "lines": true, "linewidth": 1, "links": [], - "nullPointMode": "null", + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": 
null, + "nullPointMode": "null as zero", "options": { - "alertThreshold": true + "alertThreshold": true, + "dataLinks": [] }, "percentage": false, - "pluginVersion": "7.5.7", "pointradius": 5, "points": false, "renderer": "flot", - "scopedVars": { - "db": { - "selected": false, - "text": "kv", - "value": "kv" - } - }, + "repeat": null, + "repeatDirection": null, "seriesOverrides": [], - "spaceLength": 10, + "span": null, "stack": false, "steppedLine": false, "targets": [ { - "expr": "max(tikv_engine_write_stall{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", db=\"$db\", type=\"write_stall_max\"})", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum((\n tikv_engine_estimate_num_keys\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$db\"}\n \n)) by (cf) ", "format": "time_series", - "intervalFactor": 2, - "legendFormat": "max", - "metric": "", - "refId": "A", - "step": 10 - }, - { - "expr": "avg(tikv_engine_write_stall{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", db=\"$db\", type=\"write_stall_percentile99\"})", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "99%", - "metric": "", - "refId": "B", - "step": 10 - }, - { - "expr": "avg(tikv_engine_write_stall{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", db=\"$db\", type=\"write_stall_percentile95\"})", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "95%", - "metric": "", - "refId": "C", - "step": 10 - }, - { - "expr": "avg(tikv_engine_write_stall{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", db=\"$db\", type=\"write_stall_average\"})", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "avg", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{cf}}", "metric": "", - "refId": "D", - "step": 10 + "query": 
"sum((\n tikv_engine_estimate_num_keys\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$db\"}\n \n)) by (cf) ", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, - "title": "Write stall duration", + "title": "Total keys", "tooltip": { + "msResolution": true, "shared": true, "sort": 0, "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -31779,14 +41962,16 @@ }, "yaxes": [ { - "format": "µs", + "decimals": null, + "format": "short", "label": null, - "logBase": 10, + "logBase": 1, "max": null, "min": null, "show": true }, { + "decimals": null, "format": "short", "label": null, "logBase": 1, @@ -31797,121 +41982,56 @@ ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } }, - { - "cards": { - "cardPadding": null, - "cardRound": null - }, - "color": { - "cardColor": "#b4ff00", - "colorScale": "sqrt", - "colorScheme": "interpolateSpectral", - "exponent": 0.5, - "mode": "spectrum" - }, - "dataFormat": "tsbuckets", - "datasource": "${DS_TEST-CLUSTER}", - "description": "The level that the external file ingests into", - "fieldConfig": { - "defaults": {}, - "overrides": [] - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 203 - }, - "heatmap": {}, - "hideZeroBuckets": true, - "highlightCards": true, - "id": 12712, - "legend": { - "show": false - }, - "links": [], - "reverseYBuckets": false, - "scopedVars": { - "db": { - "selected": false, - "text": "kv", - "value": "kv" - } - }, - "targets": [ - { - "exemplar": true, - "expr": "sum(delta(tikv_engine_ingestion_picked_level_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", db=\"$db\"}[1m])) by (le)", - "format": "heatmap", - "interval": "", - "intervalFactor": 2, - "legendFormat": "{{le}}", - "metric": "", - "refId": "A", 
- "step": 4 - } - ], - "timeFrom": null, - "timeShift": null, - "title": "Ingestion picked level", - "tooltip": { - "show": true, - "showHistogram": false - }, - "type": "heatmap", - "xAxis": { - "show": true - }, - "xBucketNumber": null, - "xBucketSize": null, - "yAxis": { - "decimals": 0, - "format": "short", - "logBase": 1, - "max": null, - "min": null, - "show": true, - "splitFactor": null - }, - "yBucketBound": "upper", - "yBucketNumber": null, - "yBucketSize": null - }, { "aliasColors": {}, "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", - "decimals": 1, - "description": "The memtable size of each column family", + "description": "The flow of different kinds of write operations", + "editable": true, + "error": false, "fieldConfig": { - "defaults": {}, - "overrides": [] + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } }, "fill": 1, - "fillGradient": 0, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, "gridPos": { - "h": 8, + "h": 7, "w": 12, - "x": 12, - "y": 203 + "x": 0, + "y": 77 }, - "hiddenSeries": false, - "id": 103, + "height": null, + "hideTimeOverride": false, + "id": 304, + "interval": null, + "isNew": true, "legend": { "alignAsTable": true, "avg": false, "current": true, + "hideEmpty": true, + "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, - "sort": "current", + "sort": "max", "sortDesc": true, "total": false, "values": true @@ -31919,49 +42039,70 @@ "lines": true, "linewidth": 1, "links": [], - "nullPointMode": "null", + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", "options": { - "alertThreshold": true + "alertThreshold": true, + "dataLinks": [] }, "percentage": false, - "pluginVersion": "7.5.7", "pointradius": 5, "points": 
false, "renderer": "flot", - "scopedVars": { - "db": { - "selected": false, - "text": "kv", - "value": "kv" - } - }, + "repeat": null, + "repeatDirection": null, "seriesOverrides": [], - "spaceLength": 10, + "span": null, "stack": false, "steppedLine": false, "targets": [ { - "expr": "avg(tikv_engine_memory_bytes{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", db=\"$db\", type=\"mem-tables\"}) by (cf)", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_engine_flow_bytes\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$db\",type=\"wal_file_bytes\"}\n [$__rate_interval]\n)) ", "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{cf}}", - "refId": "A", - "step": 10 + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "wal", + "metric": "", + "query": "sum(rate(\n tikv_engine_flow_bytes\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$db\",type=\"wal_file_bytes\"}\n [$__rate_interval]\n)) ", + "refId": "", + "step": 10, + "target": "" + }, + { + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_engine_flow_bytes\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$db\",type=\"bytes_written\"}\n [$__rate_interval]\n)) ", + "format": "time_series", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "write", + "metric": "", + "query": "sum(rate(\n tikv_engine_flow_bytes\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$db\",type=\"bytes_written\"}\n [$__rate_interval]\n)) ", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, - "title": "Memtable size", + "title": "Write flow", "tooltip": { + "msResolution": true, "shared": true, "sort": 0, "value_type": 
"individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -31969,7 +42110,8 @@ }, "yaxes": [ { - "format": "bytes", + "decimals": null, + "format": "binBps", "label": null, "logBase": 1, "max": null, @@ -31977,6 +42119,7 @@ "show": true }, { + "decimals": null, "format": "short", "label": null, "logBase": 1, @@ -31987,54 +42130,55 @@ ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } - } - ], - "repeat": "db", - "title": "RocksDB - $db", - "type": "row" - }, - { - "collapsed": true, - "datasource": null, - "gridPos": { - "h": 1, - "w": 24, - "x": 0, - "y": 34 - }, - "id": 12802, - "panels": [ + }, { "aliasColors": {}, "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", - "description": "The count of operations per second", + "description": "The bytes per read", + "editable": true, + "error": false, "fieldConfig": { - "defaults": {}, - "overrides": [] + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } }, "fill": 1, - "fillGradient": 0, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, "gridPos": { - "h": 8, + "h": 7, "w": 12, - "x": 0, - "y": 32 + "x": 12, + "y": 77 }, - "hiddenSeries": false, - "id": 12892, + "height": null, + "hideTimeOverride": false, + "id": 305, + "interval": null, + "isNew": true, "legend": { "alignAsTable": true, "avg": false, "current": true, + "hideEmpty": true, + "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, + "sideWidth": null, "sort": "max", "sortDesc": true, "total": false, @@ -32042,58 +42186,101 @@ }, "lines": true, "linewidth": 1, - "nullPointMode": "null", + "links": [], + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as 
zero", "options": { - "alertThreshold": true + "alertThreshold": true, + "dataLinks": [] }, "percentage": false, - "pluginVersion": "7.5.7", - "pointradius": 2, + "pointradius": 5, "points": false, "renderer": "flot", + "repeat": null, + "repeatDirection": null, "seriesOverrides": [], - "spaceLength": 10, + "span": null, "stack": false, "steppedLine": false, "targets": [ { - "exemplar": true, - "expr": "sum(rate(raft_engine_write_apply_duration_seconds_count{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m]))", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "max((\n tikv_engine_bytes_per_read\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$db\",type=\"bytes_per_read_max\"}\n \n)) ", + "format": "time_series", + "hide": false, + "instant": false, "interval": "", - "legendFormat": "write", - "queryType": "randomWalk", - "refId": "A" + "intervalFactor": 1, + "legendFormat": "max", + "metric": "", + "query": "max((\n tikv_engine_bytes_per_read\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$db\",type=\"bytes_per_read_max\"}\n \n)) ", + "refId": "", + "step": 10, + "target": "" + }, + { + "datasource": "${DS_TEST-CLUSTER}", + "expr": "avg((\n tikv_engine_bytes_per_read\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$db\",type=\"bytes_per_read_percentile99\"}\n \n)) ", + "format": "time_series", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "99%", + "metric": "", + "query": "avg((\n tikv_engine_bytes_per_read\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$db\",type=\"bytes_per_read_percentile99\"}\n \n)) ", + "refId": "", + "step": 10, + "target": "" }, { - "exemplar": true, - "expr": "sum(rate(raft_engine_read_entry_duration_seconds_count{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", 
instance=~\"$instance\"}[1m]))", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "avg((\n tikv_engine_bytes_per_read\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$db\",type=\"bytes_per_read_percentile95\"}\n \n)) ", + "format": "time_series", "hide": false, + "instant": false, "interval": "", - "legendFormat": "read_entry", - "refId": "B" + "intervalFactor": 1, + "legendFormat": "95%", + "metric": "", + "query": "avg((\n tikv_engine_bytes_per_read\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$db\",type=\"bytes_per_read_percentile95\"}\n \n)) ", + "refId": "", + "step": 10, + "target": "" }, { - "exemplar": true, - "expr": "sum(rate(raft_engine_read_message_duration_seconds_count{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m]))", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "avg((\n tikv_engine_bytes_per_read\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$db\",type=\"bytes_per_read_average\"}\n \n)) ", + "format": "time_series", "hide": false, + "instant": false, "interval": "", - "legendFormat": "read_message", - "refId": "C" + "intervalFactor": 1, + "legendFormat": "avg", + "metric": "", + "query": "avg((\n tikv_engine_bytes_per_read\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$db\",type=\"bytes_per_read_average\"}\n \n)) ", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, - "title": "Operation", + "title": "Bytes / Read", "tooltip": { + "msResolution": true, "shared": true, "sort": 0, "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -32101,14 +42288,16 @@ }, "yaxes": [ { - "format": "ops", + "decimals": null, + "format": "bytes", 
"label": null, - "logBase": 1, + "logBase": 10, "max": null, "min": null, "show": true }, { + "decimals": null, "format": "short", "label": null, "logBase": 1, @@ -32119,38 +42308,55 @@ ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } }, { "aliasColors": {}, "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", - "description": "The time used in write operation", + "description": "The flow rate of compaction operations per type", + "editable": true, + "error": false, "fieldConfig": { - "defaults": {}, - "overrides": [] + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } }, "fill": 1, - "fillGradient": 0, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, "gridPos": { - "h": 8, + "h": 7, "w": 12, - "x": 12, - "y": 32 + "x": 0, + "y": 84 }, - "hiddenSeries": false, - "id": 12893, + "height": null, + "hideTimeOverride": false, + "id": 306, + "interval": null, + "isNew": true, "legend": { "alignAsTable": true, "avg": false, "current": true, + "hideEmpty": true, + "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, + "sideWidth": null, "sort": "max", "sortDesc": true, "total": false, @@ -32158,66 +42364,86 @@ }, "lines": true, "linewidth": 1, - "nullPointMode": "null", + "links": [], + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", "options": { - "alertThreshold": true + "alertThreshold": true, + "dataLinks": [] }, "percentage": false, - "pluginVersion": "7.5.7", - "pointradius": 2, + "pointradius": 5, "points": false, "renderer": "flot", + "repeat": null, + "repeatDirection": null, "seriesOverrides": [], - "spaceLength": 10, + "span": null, "stack": false, "steppedLine": false, "targets": [ { - "exemplar": true, - "expr": 
"sum(rate(raft_engine_write_duration_seconds_sum{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])) / sum(rate(raft_engine_write_duration_seconds_count{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m]))", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_engine_compaction_flow_bytes\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$db\",type=\"bytes_read\"}\n [$__rate_interval]\n)) ", + "format": "time_series", + "hide": false, + "instant": false, "interval": "", - "legendFormat": "avg", - "queryType": "randomWalk", - "refId": "A" + "intervalFactor": 1, + "legendFormat": "read", + "metric": "", + "query": "sum(rate(\n tikv_engine_compaction_flow_bytes\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$db\",type=\"bytes_read\"}\n [$__rate_interval]\n)) ", + "refId": "", + "step": 10, + "target": "" }, { - "exemplar": true, - "expr": "histogram_quantile(0.95, sum(rate(raft_engine_write_duration_seconds_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])) by (le))", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_engine_compaction_flow_bytes\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$db\",type=\"bytes_written\"}\n [$__rate_interval]\n)) ", + "format": "time_series", "hide": false, + "instant": false, "interval": "", - "legendFormat": "95%", - "refId": "B" - }, - { - "exemplar": true, - "expr": "histogram_quantile(0.99, sum(rate(raft_engine_write_duration_seconds_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])) by (le))", - "hide": false, - "interval": "", - "legendFormat": "99%", - "refId": "C" + "intervalFactor": 1, + "legendFormat": "written", + "metric": "", + "query": "sum(rate(\n tikv_engine_compaction_flow_bytes\n 
{k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$db\",type=\"bytes_written\"}\n [$__rate_interval]\n)) ", + "refId": "", + "step": 10, + "target": "" }, { - "exemplar": true, - "expr": "histogram_quantile(0.999, sum(rate(raft_engine_write_duration_seconds_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])) by (le))", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_engine_flow_bytes\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$db\",type=\"flush_write_bytes\"}\n [$__rate_interval]\n)) ", + "format": "time_series", "hide": false, + "instant": false, "interval": "", - "legendFormat": "999%", - "refId": "D" + "intervalFactor": 1, + "legendFormat": "flushed", + "metric": "", + "query": "sum(rate(\n tikv_engine_flow_bytes\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$db\",type=\"flush_write_bytes\"}\n [$__rate_interval]\n)) ", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, - "title": "Write Duration", + "title": "Compaction flow", "tooltip": { + "msResolution": true, "shared": true, "sort": 0, "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -32225,14 +42451,16 @@ }, "yaxes": [ { - "format": "s", + "decimals": null, + "format": "binBps", "label": null, - "logBase": 2, + "logBase": 1, "max": null, "min": null, "show": true }, { + "decimals": null, "format": "short", "label": null, "logBase": 1, @@ -32243,38 +42471,55 @@ ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } }, { "aliasColors": {}, "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", - "description": "The I/O flow rate", + 
"description": "The bytes per write", + "editable": true, + "error": false, "fieldConfig": { - "defaults": {}, - "overrides": [] + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } }, "fill": 1, - "fillGradient": 0, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, "gridPos": { - "h": 8, + "h": 7, "w": 12, - "x": 0, - "y": 40 + "x": 12, + "y": 84 }, - "hiddenSeries": false, - "id": 12896, + "height": null, + "hideTimeOverride": false, + "id": 307, + "interval": null, + "isNew": true, "legend": { "alignAsTable": true, "avg": false, "current": true, + "hideEmpty": true, + "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, + "sideWidth": null, "sort": "max", "sortDesc": true, "total": false, @@ -32282,50 +42527,101 @@ }, "lines": true, "linewidth": 1, - "nullPointMode": "null", + "links": [], + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", "options": { - "alertThreshold": true + "alertThreshold": true, + "dataLinks": [] }, "percentage": false, - "pluginVersion": "7.5.7", - "pointradius": 2, + "pointradius": 5, "points": false, "renderer": "flot", + "repeat": null, + "repeatDirection": null, "seriesOverrides": [], - "spaceLength": 10, + "span": null, "stack": false, "steppedLine": false, "targets": [ { - "exemplar": true, - "expr": "sum(rate(raft_engine_write_size_sum{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m]))", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "max((\n tikv_engine_bytes_per_write\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$db\",type=\"bytes_per_write_max\"}\n \n)) ", + "format": "time_series", + "hide": false, + "instant": false, "interval": "", - "legendFormat": "write", - "queryType": "randomWalk", - "refId": "A" + 
"intervalFactor": 1, + "legendFormat": "max", + "metric": "", + "query": "max((\n tikv_engine_bytes_per_write\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$db\",type=\"bytes_per_write_max\"}\n \n)) ", + "refId": "", + "step": 10, + "target": "" + }, + { + "datasource": "${DS_TEST-CLUSTER}", + "expr": "avg((\n tikv_engine_bytes_per_write\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$db\",type=\"bytes_per_write_percentile99\"}\n \n)) ", + "format": "time_series", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "99%", + "metric": "", + "query": "avg((\n tikv_engine_bytes_per_write\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$db\",type=\"bytes_per_write_percentile99\"}\n \n)) ", + "refId": "", + "step": 10, + "target": "" + }, + { + "datasource": "${DS_TEST-CLUSTER}", + "expr": "avg((\n tikv_engine_bytes_per_write\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$db\",type=\"bytes_per_write_percentile95\"}\n \n)) ", + "format": "time_series", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "95%", + "metric": "", + "query": "avg((\n tikv_engine_bytes_per_write\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$db\",type=\"bytes_per_write_percentile95\"}\n \n)) ", + "refId": "", + "step": 10, + "target": "" }, { - "exemplar": true, - "expr": "sum(rate(raft_engine_background_rewrite_bytes_sum{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])) by (type)", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "avg((\n tikv_engine_bytes_per_write\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$db\",type=\"bytes_per_write_average\"}\n \n)) ", + "format": "time_series", "hide": 
false, + "instant": false, "interval": "", - "legendFormat": "rewrite {{type}}", - "refId": "B" + "intervalFactor": 1, + "legendFormat": "avg", + "metric": "", + "query": "avg((\n tikv_engine_bytes_per_write\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$db\",type=\"bytes_per_write_average\"}\n \n)) ", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, - "title": "Flow", + "title": "Bytes / Write", "tooltip": { + "msResolution": true, "shared": true, "sort": 0, "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -32333,7 +42629,8 @@ }, "yaxes": [ { - "format": "binBps", + "decimals": null, + "format": "bytes", "label": null, "logBase": 1, "max": null, @@ -32341,6 +42638,7 @@ "show": true }, { + "decimals": null, "format": "short", "label": null, "logBase": 1, @@ -32351,38 +42649,55 @@ ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } }, { "aliasColors": {}, "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", - "description": "99% duration breakdown of write operation", + "description": "The read amplification per TiKV instance", + "editable": true, + "error": false, "fieldConfig": { - "defaults": {}, - "overrides": [] + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } }, "fill": 1, - "fillGradient": 0, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, "gridPos": { - "h": 8, + "h": 7, "w": 12, - "x": 12, - "y": 40 + "x": 0, + "y": 91 }, - "hiddenSeries": false, - "id": 12895, + "height": null, + "hideTimeOverride": false, + "id": 308, + "interval": null, + "isNew": true, "legend": { 
"alignAsTable": true, "avg": false, "current": true, + "hideEmpty": true, + "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, + "sideWidth": null, "sort": "max", "sortDesc": true, "total": false, @@ -32390,58 +42705,56 @@ }, "lines": true, "linewidth": 1, - "nullPointMode": "null", + "links": [], + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", "options": { - "alertThreshold": true + "alertThreshold": true, + "dataLinks": [] }, "percentage": false, - "pluginVersion": "7.5.7", - "pointradius": 2, + "pointradius": 5, "points": false, "renderer": "flot", + "repeat": null, + "repeatDirection": null, "seriesOverrides": [], - "spaceLength": 10, + "span": null, "stack": false, "steppedLine": false, "targets": [ { - "exemplar": true, - "expr": "histogram_quantile(0.99, sum(rate(raft_engine_write_preprocess_duration_seconds_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])) by (le))", - "hide": false, - "interval": "", - "legendFormat": "wait", - "refId": "A" - }, - { - "exemplar": true, - "expr": "histogram_quantile(0.99, sum(rate(raft_engine_write_leader_duration_seconds_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])) by (le))", - "hide": false, - "interval": "", - "legendFormat": "wal", - "refId": "B" - }, - { - "exemplar": true, - "expr": "histogram_quantile(0.99, sum(rate(raft_engine_write_apply_duration_seconds_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])) by (le))", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "(sum(rate(\n tikv_engine_read_amp_flow_bytes\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$db\",type=\"read_amp_total_read_bytes\"}\n [$__rate_interval]\n)) by (instance) / sum(rate(\n tikv_engine_read_amp_flow_bytes\n 
{k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$db\",type=\"read_amp_estimate_useful_bytes\"}\n [$__rate_interval]\n)) by (instance) )", + "format": "time_series", "hide": false, + "instant": false, "interval": "", - "legendFormat": "apply", - "refId": "C" + "intervalFactor": 1, + "legendFormat": "{{instance}}", + "metric": "", + "query": "(sum(rate(\n tikv_engine_read_amp_flow_bytes\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$db\",type=\"read_amp_total_read_bytes\"}\n [$__rate_interval]\n)) by (instance) / sum(rate(\n tikv_engine_read_amp_flow_bytes\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$db\",type=\"read_amp_estimate_useful_bytes\"}\n [$__rate_interval]\n)) by (instance) )", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, - "title": "Write Duration Breakdown (99%)", + "title": "Read amplification", "tooltip": { + "msResolution": true, "shared": true, "sort": 0, "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -32449,14 +42762,16 @@ }, "yaxes": [ { - "format": "s", + "decimals": null, + "format": "short", "label": null, - "logBase": 2, + "logBase": 1, "max": null, "min": null, "show": true }, { + "decimals": null, "format": "short", "label": null, "logBase": 1, @@ -32467,38 +42782,55 @@ ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } }, { "aliasColors": {}, "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", - "description": "The bytes per write", + "description": "The pending bytes to be compacted", + "editable": true, + "error": false, "fieldConfig": { - "defaults": {}, - "overrides": [] + "defaults": { + "thresholds": { + "mode": 
"absolute", + "steps": [] + } + } }, "fill": 1, - "fillGradient": 0, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, "gridPos": { - "h": 8, + "h": 7, "w": 12, - "x": 0, - "y": 48 + "x": 12, + "y": 91 }, - "hiddenSeries": false, - "id": 12898, + "height": null, + "hideTimeOverride": false, + "id": 309, + "interval": null, + "isNew": true, "legend": { "alignAsTable": true, "avg": false, "current": true, + "hideEmpty": true, + "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, + "sideWidth": null, "sort": "max", "sortDesc": true, "total": false, @@ -32506,58 +42838,56 @@ }, "lines": true, "linewidth": 1, - "nullPointMode": "null", + "links": [], + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", "options": { - "alertThreshold": true + "alertThreshold": true, + "dataLinks": [] }, "percentage": false, - "pluginVersion": "7.5.7", - "pointradius": 2, + "pointradius": 5, "points": false, "renderer": "flot", + "repeat": null, + "repeatDirection": null, "seriesOverrides": [], - "spaceLength": 10, + "span": null, "stack": false, "steppedLine": false, "targets": [ { - "exemplar": true, - "expr": "sum(rate(raft_engine_write_size_sum{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])) / sum(rate(raft_engine_write_size_count{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m]))", - "interval": "", - "legendFormat": "avg", - "queryType": "randomWalk", - "refId": "A" - }, - { - "exemplar": true, - "expr": "histogram_quantile(0.99, sum(rate(raft_engine_write_size_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])) by (le))", - "hide": false, - "interval": "", - "legendFormat": "99%", - "refId": "B" - }, - { - "exemplar": true, - "expr": 
"histogram_quantile(0.999, sum(rate(raft_engine_write_size_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])) by (le))", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum((\n tikv_engine_pending_compaction_bytes\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$db\"}\n \n)) by (cf) ", + "format": "time_series", "hide": false, + "instant": false, "interval": "", - "legendFormat": "999%", - "refId": "C" + "intervalFactor": 1, + "legendFormat": "{{cf}}", + "metric": "", + "query": "sum((\n tikv_engine_pending_compaction_bytes\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$db\"}\n \n)) by (cf) ", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, - "title": "Bytes / Written", + "title": "Compaction pending bytes", "tooltip": { + "msResolution": true, "shared": true, "sort": 0, "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -32565,6 +42895,7 @@ }, "yaxes": [ { + "decimals": null, "format": "bytes", "label": null, "logBase": 1, @@ -32573,6 +42904,7 @@ "show": true }, { + "decimals": null, "format": "short", "label": null, "logBase": 1, @@ -32583,38 +42915,55 @@ ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } }, { "aliasColors": {}, "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", - "description": "999% duration breakdown of WAL write operation", + "description": "The number of snapshot of each TiKV instance", + "editable": true, + "error": false, "fieldConfig": { - "defaults": {}, - "overrides": [] + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } }, "fill": 1, - "fillGradient": 0, + "fillGradient": 1, + "grid": { + 
"threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, "gridPos": { - "h": 8, + "h": 7, "w": 12, - "x": 12, - "y": 48 + "x": 0, + "y": 98 }, - "hiddenSeries": false, - "id": 12933, + "height": null, + "hideTimeOverride": false, + "id": 310, + "interval": null, + "isNew": true, "legend": { "alignAsTable": true, "avg": false, "current": true, + "hideEmpty": true, + "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, + "sideWidth": null, "sort": "max", "sortDesc": true, "total": false, @@ -32622,66 +42971,56 @@ }, "lines": true, "linewidth": 1, - "nullPointMode": "null", + "links": [], + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", "options": { - "alertThreshold": true + "alertThreshold": true, + "dataLinks": [] }, "percentage": false, - "pluginVersion": "7.5.7", - "pointradius": 2, + "pointradius": 5, "points": false, "renderer": "flot", + "repeat": null, + "repeatDirection": null, "seriesOverrides": [], - "spaceLength": 10, + "span": null, "stack": false, "steppedLine": false, "targets": [ { - "exemplar": true, - "expr": "histogram_quantile(0.999, sum(rate(raft_engine_write_leader_duration_seconds_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])) by (le))", - "hide": false, - "interval": "", - "legendFormat": "total", - "refId": "D" - }, - { - "exemplar": true, - "expr": "histogram_quantile(0.999, sum(rate(raft_engine_sync_log_duration_seconds_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])) by (le))", - "hide": false, - "interval": "", - "legendFormat": "sync", - "refId": "A" - }, - { - "exemplar": true, - "expr": "histogram_quantile(0.999, sum(rate(raft_engine_allocate_log_duration_seconds_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])) by (le))", 
- "hide": false, - "interval": "", - "legendFormat": "allocate", - "refId": "G" - }, - { - "exemplar": true, - "expr": "histogram_quantile(0.999, sum(rate(raft_engine_rotate_log_duration_seconds_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])) by (le))", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "((\n tikv_engine_num_snapshots\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$db\"}\n \n)) ", + "format": "time_series", "hide": false, + "instant": false, "interval": "", - "legendFormat": "rotate", - "refId": "H" + "intervalFactor": 1, + "legendFormat": "{{instance}}", + "metric": "", + "query": "((\n tikv_engine_num_snapshots\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$db\"}\n \n)) ", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, - "title": "WAL Duration Breakdown (999%)", + "title": "Number of snapshots", "tooltip": { + "msResolution": true, "shared": true, "sort": 0, "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -32689,14 +43028,16 @@ }, "yaxes": [ { - "format": "s", + "decimals": null, + "format": "short", "label": null, - "logBase": 2, + "logBase": 1, "max": null, "min": null, "show": true }, { + "decimals": null, "format": "short", "label": null, "logBase": 1, @@ -32707,38 +43048,55 @@ ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } }, { "aliasColors": {}, "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", - "description": "The average number of files", + "description": "The compression ratio of each level", + "editable": true, + "error": false, "fieldConfig": { - "defaults": {}, - "overrides": [] + "defaults": { + "thresholds": 
{ + "mode": "absolute", + "steps": [] + } + } }, "fill": 1, - "fillGradient": 0, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, "gridPos": { - "h": 8, + "h": 7, "w": 12, - "x": 0, - "y": 56 + "x": 12, + "y": 98 }, - "hiddenSeries": false, - "id": 12899, + "height": null, + "hideTimeOverride": false, + "id": 311, + "interval": null, + "isNew": true, "legend": { "alignAsTable": true, "avg": false, "current": true, + "hideEmpty": true, + "hideZero": true, "max": true, - "min": true, + "min": false, "rightSide": true, "show": true, + "sideWidth": null, "sort": "max", "sortDesc": true, "total": false, @@ -32746,55 +43104,56 @@ }, "lines": true, "linewidth": 1, - "nullPointMode": "null", + "links": [], + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", "options": { - "alertThreshold": true + "alertThreshold": true, + "dataLinks": [] }, "percentage": false, - "pluginVersion": "7.5.7", - "pointradius": 2, + "pointradius": 5, "points": false, "renderer": "flot", + "repeat": null, + "repeatDirection": null, "seriesOverrides": [], - "spaceLength": 10, + "span": null, "stack": false, "steppedLine": false, "targets": [ { - "exemplar": true, - "expr": "avg(raft_engine_log_file_count{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}) by (type)", - "intervalFactor": 1, - "legendFormat": "{{type}}", - "refId": "A" - }, - { - "exemplar": true, - "expr": "avg(raft_engine_swap_file_count{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"})", - "intervalFactor": 1, - "legendFormat": "swap", - "refId": "B" - }, - { - "exemplar": true, - "expr": "avg(raft_engine_recycled_file_count{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}) by (type)", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "avg((\n 
tikv_engine_compression_ratio\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$db\"}\n \n)) by (cf, level) ", + "format": "time_series", + "hide": false, + "instant": false, + "interval": "", "intervalFactor": 1, - "legendFormat": "{{type}} - recycle", - "refId": "C" + "legendFormat": "{{cf}}-L{{level}}", + "metric": "", + "query": "avg((\n tikv_engine_compression_ratio\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$db\"}\n \n)) by (cf, level) ", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, - "title": "File Count", + "title": "Compression ratio", "tooltip": { + "msResolution": true, "shared": true, "sort": 0, "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -32802,6 +43161,7 @@ }, "yaxes": [ { + "decimals": null, "format": "short", "label": null, "logBase": 1, @@ -32810,6 +43170,7 @@ "show": true }, { + "decimals": null, "format": "short", "label": null, "logBase": 1, @@ -32820,38 +43181,55 @@ ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } }, { "aliasColors": {}, "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", - "description": "The 99% duration of operations other than write", + "description": "The number of SST files for different column families in each level", + "editable": true, + "error": false, "fieldConfig": { - "defaults": {}, - "overrides": [] + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } }, "fill": 1, - "fillGradient": 0, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, "gridPos": { - "h": 8, + "h": 7, "w": 12, - "x": 
12, - "y": 56 + "x": 0, + "y": 105 }, - "hiddenSeries": false, - "id": 12897, + "height": null, + "hideTimeOverride": false, + "id": 312, + "interval": null, + "isNew": true, "legend": { "alignAsTable": true, "avg": false, "current": true, + "hideEmpty": true, + "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, + "sideWidth": null, "sort": "max", "sortDesc": true, "total": false, @@ -32859,58 +43237,56 @@ }, "lines": true, "linewidth": 1, - "nullPointMode": "null", + "links": [], + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", "options": { - "alertThreshold": true + "alertThreshold": true, + "dataLinks": [] }, "percentage": false, - "pluginVersion": "7.5.7", - "pointradius": 2, + "pointradius": 5, "points": false, "renderer": "flot", + "repeat": null, + "repeatDirection": null, "seriesOverrides": [], - "spaceLength": 10, + "span": null, "stack": false, "steppedLine": false, "targets": [ { - "exemplar": true, - "expr": "histogram_quantile(0.99, sum(rate(raft_engine_read_entry_duration_seconds_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])) by (le))", - "hide": false, - "interval": "", - "legendFormat": "read_entry", - "refId": "B" - }, - { - "exemplar": true, - "expr": "histogram_quantile(0.99, sum(rate(raft_engine_read_message_duration_seconds_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])) by (le))", - "hide": false, - "interval": "", - "legendFormat": "read_message", - "refId": "D" - }, - { - "exemplar": true, - "expr": "histogram_quantile(0.99, sum(rate(raft_engine_purge_duration_seconds_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])) by (le))", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "avg((\n tikv_engine_num_files_at_level\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$db\"}\n 
\n)) by (cf, level) ", + "format": "time_series", "hide": false, + "instant": false, "interval": "", - "legendFormat": "purge", - "refId": "E" + "intervalFactor": 1, + "legendFormat": "{{cf}}-L{{level}}", + "metric": "", + "query": "avg((\n tikv_engine_num_files_at_level\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$db\"}\n \n)) by (cf, level) ", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, - "title": "Other Durations (99%)", + "title": "Number files at each level", "tooltip": { + "msResolution": true, "shared": true, "sort": 0, "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -32918,14 +43294,16 @@ }, "yaxes": [ { - "format": "s", + "decimals": null, + "format": "short", "label": null, - "logBase": 2, + "logBase": 1, "max": null, "min": null, "show": true }, { + "decimals": null, "format": "short", "label": null, "logBase": 1, @@ -32936,38 +43314,55 @@ ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } }, { "aliasColors": {}, "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", - "description": "The average number of log entries", + "description": "The time that the oldest unreleased snapshot survivals", + "editable": true, + "error": false, "fieldConfig": { - "defaults": {}, - "overrides": [] + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } }, "fill": 1, - "fillGradient": 0, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, "gridPos": { - "h": 8, + "h": 7, "w": 12, - "x": 0, - "y": 64 + "x": 12, + "y": 105 }, - "hiddenSeries": false, - "id": 12934, + "height": null, + 
"hideTimeOverride": false, + "id": 313, + "interval": null, + "isNew": true, "legend": { "alignAsTable": true, "avg": false, "current": true, + "hideEmpty": true, + "hideZero": true, "max": true, - "min": true, + "min": false, "rightSide": true, "show": true, + "sideWidth": null, "sort": "max", "sortDesc": true, "total": false, @@ -32975,43 +43370,56 @@ }, "lines": true, "linewidth": 1, - "nullPointMode": "null", + "links": [], + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", "options": { - "alertThreshold": true + "alertThreshold": true, + "dataLinks": [] }, "percentage": false, - "pluginVersion": "7.5.7", - "pointradius": 2, + "pointradius": 5, "points": false, "renderer": "flot", + "repeat": null, + "repeatDirection": null, "seriesOverrides": [], - "spaceLength": 10, + "span": null, "stack": false, "steppedLine": false, "targets": [ { - "exemplar": true, - "expr": "avg(raft_engine_log_entry_count{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}) by (type)", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "((\n tikv_engine_oldest_snapshot_duration\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$db\"}\n \n)) ", + "format": "time_series", "hide": false, + "instant": false, "interval": "", - "legendFormat": "{{type}}", - "queryType": "randomWalk", - "refId": "A" + "intervalFactor": 1, + "legendFormat": "{{instance}}", + "metric": "", + "query": "((\n tikv_engine_oldest_snapshot_duration\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$db\"}\n \n)) ", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, - "title": "Entry Count", + "title": "Oldest snapshots duration", "tooltip": { + "msResolution": true, "shared": true, "sort": 0, "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": 
"graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -33019,7 +43427,8 @@ }, "yaxes": [ { - "format": "short", + "decimals": null, + "format": "s", "label": null, "logBase": 1, "max": null, @@ -33027,6 +43436,7 @@ "show": true }, { + "decimals": null, "format": "short", "label": null, "logBase": 1, @@ -33037,99 +43447,112 @@ ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } - } - ], - "title": "Raft Engine", - "type": "row" - }, - { - "collapsed": true, - "datasource": null, - "gridPos": { - "h": 1, - "w": 24, - "x": 0, - "y": 35 - }, - "id": 3301, - "panels": [ + }, { "aliasColors": {}, "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", + "description": "Stall conditions changed of each column family", + "editable": true, + "error": false, + "fieldConfig": { + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } + }, "fill": 1, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, "gridPos": { - "h": 6, + "h": 7, "w": 12, "x": 0, - "y": 27 + "y": 112 }, - "id": 3555, + "height": null, + "hideTimeOverride": false, + "id": 314, + "interval": null, + "isNew": true, "legend": { "alignAsTable": true, "avg": false, "current": true, + "hideEmpty": true, + "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, + "sideWidth": null, + "sort": "max", + "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], - "nullPointMode": "null", - "paceLength": 10, + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", + "options": { + "alertThreshold": true, + "dataLinks": [] + }, "percentage": false, - "pointradius": 2, + "pointradius": 5, "points": false, "renderer": "flot", - "scopedVars": { - "titan_db": { - 
"selected": false, - "text": "kv", - "value": "kv" - } - }, + "repeat": null, + "repeatDirection": null, "seriesOverrides": [], - "spaceLength": 10, + "span": null, "stack": false, "steppedLine": false, "targets": [ { - "expr": "sum(tikv_engine_titandb_num_live_blob_file{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", db=\"$titan_db\"})", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "live blob file num", - "refId": "A" - }, - { - "expr": "sum(tikv_engine_titandb_num_obsolete_blob_file{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", db=\"$titan_db\"})", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "((\n tikv_engine_stall_conditions_changed\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$db\"}\n \n)) ", "format": "time_series", "hide": false, + "instant": false, + "interval": "", "intervalFactor": 1, - "legendFormat": "obsolete blob file num", - "refId": "B" + "legendFormat": "{{instance}}-{{cf}}-{{type}}", + "metric": "", + "query": "((\n tikv_engine_stall_conditions_changed\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$db\"}\n \n)) ", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, - "title": "Blob file count", + "title": "Stall conditions changed of each CF", "tooltip": { + "msResolution": true, "shared": true, "sort": 0, "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -33137,6 +43560,7 @@ }, "yaxes": [ { + "decimals": null, "format": "short", "label": null, "logBase": 1, @@ -33145,6 +43569,7 @@ "show": true }, { + "decimals": null, "format": "short", "label": null, "logBase": 1, @@ -33155,98 +43580,197 @@ ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 
} }, { "aliasColors": {}, "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", + "description": "The time consumed when ingesting SST files", + "editable": true, + "error": false, + "fieldConfig": { + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } + }, "fill": 1, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, "gridPos": { - "h": 6, + "h": 7, "w": 12, "x": 12, - "y": 27 + "y": 112 }, - "id": 3557, + "height": null, + "hideTimeOverride": false, + "id": 315, + "interval": null, + "isNew": true, "legend": { "alignAsTable": true, "avg": false, "current": true, + "hideEmpty": true, + "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, + "sideWidth": null, + "sort": "max", + "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], - "nullPointMode": "null", - "paceLength": 10, + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", + "options": { + "alertThreshold": true, + "dataLinks": [] + }, "percentage": false, - "pointradius": 2, + "pointradius": 5, "points": false, "renderer": "flot", - "scopedVars": { - "titan_db": { - "selected": false, - "text": "kv", - "value": "kv" + "repeat": null, + "repeatDirection": null, + "seriesOverrides": [ + { + "alias": "count", + "bars": false, + "dashLength": 1, + "dashes": true, + "fill": 2, + "fillBelowTo": null, + "lines": true, + "spaceLength": 1, + "transform": "negative-Y", + "yaxis": 2, + "zindex": -3 + }, + { + "alias": "avg", + "bars": false, + "fill": 7, + "fillBelowTo": null, + "lines": true, + "yaxis": 1, + "zindex": 0 } - }, - "seriesOverrides": [], - "spaceLength": 10, + ], + "span": null, "stack": false, "steppedLine": false, "targets": [ { - "expr": 
"sum(tikv_engine_titandb_live_blob_file_size{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", db=\"$titan_db\"})", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "histogram_quantile(0.9999,(\n sum(rate(\n tikv_snapshot_ingest_sst_duration_seconds_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$db\"}\n [$__rate_interval]\n)) by (le) \n \n \n)) ", "format": "time_series", - "intervalFactor": 2, - "legendFormat": "live blob file size", - "refId": "A" + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "99.99%", + "metric": "", + "query": "histogram_quantile(0.9999,(\n sum(rate(\n tikv_snapshot_ingest_sst_duration_seconds_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$db\"}\n [$__rate_interval]\n)) by (le) \n \n \n)) ", + "refId": "", + "step": 10, + "target": "" }, { - "expr": "sum(tikv_engine_titandb_obsolete_blob_file_size{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", db=\"$titan_db\"})", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "histogram_quantile(0.99,(\n sum(rate(\n tikv_snapshot_ingest_sst_duration_seconds_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$db\"}\n [$__rate_interval]\n)) by (le) \n \n \n)) ", "format": "time_series", - "intervalFactor": 2, - "legendFormat": "obsolete blob file size", - "refId": "B" - } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "Blob file size", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "bytes", - "label": null, - "logBase": 1, + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + 
"legendFormat": "99%", + "metric": "", + "query": "histogram_quantile(0.99,(\n sum(rate(\n tikv_snapshot_ingest_sst_duration_seconds_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$db\"}\n [$__rate_interval]\n)) by (le) \n \n \n)) ", + "refId": "", + "step": 10, + "target": "" + }, + { + "datasource": "${DS_TEST-CLUSTER}", + "expr": "(sum(rate(\n tikv_snapshot_ingest_sst_duration_seconds_sum\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$db\"}\n [$__rate_interval]\n)) / sum(rate(\n tikv_snapshot_ingest_sst_duration_seconds_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$db\"}\n [$__rate_interval]\n)) )", + "format": "time_series", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "avg", + "metric": "", + "query": "(sum(rate(\n tikv_snapshot_ingest_sst_duration_seconds_sum\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$db\"}\n [$__rate_interval]\n)) / sum(rate(\n tikv_snapshot_ingest_sst_duration_seconds_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$db\"}\n [$__rate_interval]\n)) )", + "refId": "", + "step": 10, + "target": "" + }, + { + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_snapshot_ingest_sst_duration_seconds_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$db\"}\n [$__rate_interval]\n)) ", + "format": "time_series", + "hide": true, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "count", + "metric": "", + "query": "sum(rate(\n tikv_snapshot_ingest_sst_duration_seconds_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$db\"}\n [$__rate_interval]\n)) ", + "refId": "", + "step": 10, + "target": "" + } + ], + "thresholds": [], 
+ "timeFrom": null, + "timeShift": null, + "title": "Ingest SST duration seconds", + "tooltip": { + "msResolution": true, + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "transformations": [], + "transparent": false, + "type": "graph", + "xaxis": { + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "decimals": null, + "format": "s", + "label": null, + "logBase": 1, "max": null, "min": null, "show": true }, { + "decimals": null, "format": "short", "label": null, "logBase": 1, @@ -33257,76 +43781,112 @@ ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } }, { "aliasColors": {}, "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", + "description": null, + "editable": true, + "error": false, + "fieldConfig": { + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } + }, "fill": 1, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, "gridPos": { - "h": 6, + "h": 7, "w": 12, "x": 0, - "y": 33 + "y": 119 }, - "id": 3523, + "height": null, + "hideTimeOverride": false, + "id": 316, + "interval": null, + "isNew": true, "legend": { "alignAsTable": true, "avg": false, "current": true, + "hideEmpty": true, + "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, + "sideWidth": null, + "sort": "max", + "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], - "nullPointMode": "null", - "paceLength": 10, + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", + "options": { + "alertThreshold": true, + "dataLinks": [] + }, "percentage": false, - "pointradius": 2, + "pointradius": 5, "points": false, "renderer": "flot", - "scopedVars": { - "titan_db": { - "selected": false, - "text": "kv", - 
"value": "kv" - } - }, + "repeat": null, + "repeatDirection": null, "seriesOverrides": [], - "spaceLength": 10, + "span": null, "stack": false, "steppedLine": false, "targets": [ { - "expr": "sum(tikv_engine_titandb_live_blob_size{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", db=\"$titan_db\"})", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_engine_write_stall_reason\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$db\"}\n [$__rate_interval]\n)) by (type) ", "format": "time_series", - "intervalFactor": 2, - "legendFormat": "live blob size", - "refId": "A" + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{type}}", + "metric": "", + "query": "sum(rate(\n tikv_engine_write_stall_reason\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$db\"}\n [$__rate_interval]\n)) by (type) ", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, - "title": "Live blob size", + "title": "Write Stall Reason", "tooltip": { + "msResolution": true, "shared": true, "sort": 0, "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -33334,7 +43894,8 @@ }, "yaxes": [ { - "format": "bytes", + "decimals": null, + "format": "short", "label": null, "logBase": 1, "max": null, @@ -33342,6 +43903,7 @@ "show": true }, { + "decimals": null, "format": "short", "label": null, "logBase": 1, @@ -33352,35 +43914,56 @@ ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } }, { "aliasColors": {}, "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", - "decimals": 1, - "description": "The blob cache size.", + "description": "The time which is caused by 
write stall", + "editable": true, + "error": false, + "fieldConfig": { + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } + }, "fill": 1, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, "gridPos": { - "h": 6, + "h": 7, "w": 12, "x": 12, - "y": 34 + "y": 119 }, - "id": 4655, + "height": null, + "hideTimeOverride": false, + "id": 317, + "interval": null, + "isNew": true, "legend": { "alignAsTable": true, "avg": false, "current": true, + "hideEmpty": true, + "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, - "sort": "current", + "sort": "max", "sortDesc": true, "total": false, "values": true @@ -33388,38 +43971,100 @@ "lines": true, "linewidth": 1, "links": [], - "nullPointMode": "null", + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", + "options": { + "alertThreshold": true, + "dataLinks": [] + }, "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", + "repeat": null, + "repeatDirection": null, "seriesOverrides": [], - "spaceLength": 10, + "span": null, "stack": false, "steppedLine": false, "targets": [ { - "expr": "topk(20, avg(tikv_engine_blob_cache_size_bytes{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", db=\"$titan_db\"}) by(cf, instance))", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "max((\n tikv_engine_write_stall\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$db\",type=\"write_stall_max\"}\n \n)) ", "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{instance}}-{{cf}}", - "refId": "A", - "step": 10 + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "max", + "metric": "", + "query": "max((\n tikv_engine_write_stall\n 
{k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$db\",type=\"write_stall_max\"}\n \n)) ", + "refId": "", + "step": 10, + "target": "" + }, + { + "datasource": "${DS_TEST-CLUSTER}", + "expr": "avg((\n tikv_engine_write_stall\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$db\",type=\"write_stall_percentile99\"}\n \n)) ", + "format": "time_series", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "99%", + "metric": "", + "query": "avg((\n tikv_engine_write_stall\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$db\",type=\"write_stall_percentile99\"}\n \n)) ", + "refId": "", + "step": 10, + "target": "" + }, + { + "datasource": "${DS_TEST-CLUSTER}", + "expr": "avg((\n tikv_engine_write_stall\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$db\",type=\"write_stall_percentile95\"}\n \n)) ", + "format": "time_series", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "95%", + "metric": "", + "query": "avg((\n tikv_engine_write_stall\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$db\",type=\"write_stall_percentile95\"}\n \n)) ", + "refId": "", + "step": 10, + "target": "" + }, + { + "datasource": "${DS_TEST-CLUSTER}", + "expr": "avg((\n tikv_engine_write_stall\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$db\",type=\"write_stall_average\"}\n \n)) ", + "format": "time_series", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "avg", + "metric": "", + "query": "avg((\n tikv_engine_write_stall\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$db\",type=\"write_stall_average\"}\n \n)) ", + "refId": "", + "step": 10, + "target": "" } 
], "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, - "title": "Blob cache size", + "title": "Write stall duration", "tooltip": { + "msResolution": true, "shared": true, "sort": 0, "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -33427,7 +44072,8 @@ }, "yaxes": [ { - "format": "bytes", + "decimals": null, + "format": "\u00b5s", "label": null, "logBase": 1, "max": null, @@ -33435,6 +44081,7 @@ "show": true }, { + "decimals": null, "format": "short", "label": null, "logBase": 1, @@ -33445,35 +44092,160 @@ ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } }, + { + "cacheTimeout": null, + "cards": { + "cardPadding": null, + "cardRound": null + }, + "color": { + "cardColor": "#b4ff00", + "colorScale": "sqrt", + "colorScheme": "interpolateSpectral", + "exponent": 0.5, + "max": null, + "min": null, + "mode": "spectrum" + }, + "dataFormat": "tsbuckets", + "datasource": "${DS_TEST-CLUSTER}", + "description": "The level that the external file ingests into", + "editable": true, + "error": false, + "fieldConfig": { + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } + }, + "gridPos": { + "h": 7, + "w": 12, + "x": 0, + "y": 126 + }, + "heatmap": {}, + "height": null, + "hideTimeOverride": false, + "hideZeroBuckets": true, + "highlightCards": true, + "id": 318, + "interval": null, + "legend": { + "show": false + }, + "links": [], + "maxDataPoints": 512, + "maxPerRow": null, + "minSpan": null, + "options": {}, + "repeat": null, + "repeatDirection": null, + "reverseYBuckets": false, + "span": null, + "targets": [ + { + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_engine_ingestion_picked_level_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$db\"}\n [$__rate_interval]\n)) by (le) ", + "format": "heatmap", + "hide": 
false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{le}}", + "metric": "", + "query": "sum(rate(\n tikv_engine_ingestion_picked_level_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$db\"}\n [$__rate_interval]\n)) by (le) ", + "refId": "", + "step": 10, + "target": "" + } + ], + "timeFrom": null, + "timeShift": null, + "title": "Ingestion picked level", + "tooltip": { + "msResolution": true, + "shared": true, + "showHistogram": true, + "sort": 0, + "value_type": "individual" + }, + "transformations": [], + "transparent": false, + "type": "heatmap", + "xAxis": { + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "xBucketNumber": null, + "xBucketSize": null, + "yAxis": { + "decimals": 1, + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + "yBucketBound": "upper", + "yBucketNumber": null, + "yBucketSize": null + }, { "aliasColors": {}, "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", - "decimals": 1, - "description": "The hit rate of block cache", - "fill": 0, + "description": "The memtable size of each column family", + "editable": true, + "error": false, + "fieldConfig": { + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } + }, + "fill": 1, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, "gridPos": { - "h": 6, + "h": 7, "w": 12, "x": 12, - "y": 33 + "y": 126 }, - "id": 4020, + "height": null, + "hideTimeOverride": false, + "id": 319, + "interval": null, + "isNew": true, "legend": { "alignAsTable": true, "avg": false, "current": true, + "hideEmpty": true, + "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, - "sort": "current", + "sort": 
"max", "sortDesc": true, "total": false, "values": true @@ -33481,47 +44253,55 @@ "lines": true, "linewidth": 1, "links": [], - "maxPerRow": 2, - "nullPointMode": "connected", + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", + "options": { + "alertThreshold": true, + "dataLinks": [] + }, "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", - "scopedVars": { - "titan_db": { - "selected": false, - "text": "kv", - "value": "kv" - } - }, + "repeat": null, + "repeatDirection": null, "seriesOverrides": [], - "spaceLength": 10, + "span": null, "stack": false, "steppedLine": false, "targets": [ { - "expr": "sum(rate(tikv_engine_blob_cache_efficiency{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", db=\"$titan_db\", type=\"blob_cache_hit\"}[1m])) / (sum(rate(tikv_engine_blob_cache_efficiency{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", db=\"$titan_db\", type=\"blob_cache_hit\"}[1m])) + sum(rate(tikv_engine_blob_cache_efficiency{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", db=\"$titan_db\", type=\"blob_cache_miss\"}[1m])))", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "avg((\n tikv_engine_memory_bytes\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$db\",type=\"mem-tables-all\"}\n \n)) by (cf) ", "format": "time_series", - "intervalFactor": 2, - "legendFormat": "all", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{cf}}", "metric": "", - "refId": "A", - "step": 10 + "query": "avg((\n tikv_engine_memory_bytes\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$db\",type=\"mem-tables-all\"}\n \n)) by (cf) ", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, - "title": "Blob cache hit", + "title": "Memtable size", 
"tooltip": { + "msResolution": true, "shared": true, "sort": 0, "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -33529,50 +44309,118 @@ }, "yaxes": [ { - "format": "percentunit", + "decimals": null, + "format": "bytes", "label": null, "logBase": 1, "max": null, - "min": "0", + "min": null, "show": true }, { - "format": "ops", + "decimals": null, + "format": "short", "label": null, "logBase": 1, "max": null, "min": null, - "show": false + "show": true } ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } - }, + } + ], + "repeat": "db", + "repeatDirection": null, + "span": null, + "targets": [], + "timeFrom": null, + "timeShift": null, + "title": "RocksDB - $db", + "transformations": [], + "transparent": false, + "type": "row" + }, + { + "cacheTimeout": null, + "collapsed": true, + "datasource": null, + "description": null, + "editable": true, + "error": false, + "fieldConfig": { + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } + }, + "gridPos": { + "h": 7, + "w": 24, + "x": 0, + "y": 0 + }, + "height": null, + "hideTimeOverride": false, + "id": 320, + "interval": null, + "links": [], + "maxDataPoints": 100, + "maxPerRow": null, + "minSpan": null, + "panels": [ { "aliasColors": {}, "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", + "description": "The count of operations per second", + "editable": true, + "error": false, + "fieldConfig": { + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } + }, "fill": 1, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, "gridPos": { - "h": 6, + "h": 7, "w": 12, "x": 0, - "y": 39 + "y": 0 }, - "id": 4023, + "height": null, + "hideTimeOverride": 
false, + "id": 321, + "interval": null, + "isNew": true, "legend": { "alignAsTable": true, "avg": false, "current": true, + "hideEmpty": true, + "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, - "sort": "current", + "sideWidth": null, + "sort": "max", "sortDesc": true, "total": false, "values": true @@ -33580,69 +44428,85 @@ "lines": true, "linewidth": 1, "links": [], - "nullPointMode": "null", - "paceLength": 10, + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", + "options": { + "alertThreshold": true, + "dataLinks": [] + }, "percentage": false, - "pointradius": 2, + "pointradius": 5, "points": false, "renderer": "flot", - "scopedVars": { - "titan_db": { - "selected": false, - "text": "kv", - "value": "kv" - } - }, + "repeat": null, + "repeatDirection": null, "seriesOverrides": [], - "spaceLength": 10, + "span": null, "stack": false, "steppedLine": false, "targets": [ { - "expr": "avg(tikv_engine_blob_iter_touch_blob_file_count{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", db=\"$titan_db\", type=\"blob_iter_touch_blob_file_count_average\"})", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n raft_engine_write_apply_duration_seconds_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) ", "format": "time_series", "hide": false, - "intervalFactor": 2, - "legendFormat": "avg", - "refId": "A" - }, - { - "expr": "avg(tikv_engine_blob_iter_touch_blob_file_count{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", db=\"$titan_db\", type=\"blob_iter_touch_blob_file_count_percentile95\"})", - "format": "time_series", + "instant": false, + "interval": "", "intervalFactor": 1, - "legendFormat": "95%", - "refId": "C" + "legendFormat": "write", + "metric": "", + "query": "sum(rate(\n raft_engine_write_apply_duration_seconds_count\n 
{k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) ", + "refId": "", + "step": 10, + "target": "" }, { - "expr": "avg(tikv_engine_blob_iter_touch_blob_file_count{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", db=\"$titan_db\", type=\"blob_iter_touch_blob_file_count_percentile99\"})", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n raft_engine_read_entry_duration_seconds_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) ", "format": "time_series", "hide": false, + "instant": false, + "interval": "", "intervalFactor": 1, - "legendFormat": "99%", - "refId": "D" + "legendFormat": "read_entry", + "metric": "", + "query": "sum(rate(\n raft_engine_read_entry_duration_seconds_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) ", + "refId": "", + "step": 10, + "target": "" }, { - "expr": "max(tikv_engine_blob_iter_touch_blob_file_count{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", db=\"$titan_db\", type=\"blob_iter_touch_blob_file_count_max\"})", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n raft_engine_read_message_duration_seconds_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) ", "format": "time_series", "hide": false, + "instant": false, + "interval": "", "intervalFactor": 1, - "legendFormat": "max", - "refId": "B" + "legendFormat": "read_message", + "metric": "", + "query": "sum(rate(\n raft_engine_read_message_duration_seconds_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) ", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, - "title": "Iter touched blob 
file count", + "title": "Operation", "tooltip": { + "msResolution": true, "shared": true, "sort": 0, "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -33650,7 +44514,8 @@ }, "yaxes": [ { - "format": "short", + "decimals": null, + "format": "ops", "label": null, "logBase": 1, "max": null, @@ -33658,6 +44523,7 @@ "show": true }, { + "decimals": null, "format": "short", "label": null, "logBase": 1, @@ -33668,76 +44534,180 @@ ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } }, { "aliasColors": {}, "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", + "description": "The time used in write operation", + "editable": true, + "error": false, + "fieldConfig": { + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } + }, "fill": 1, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, "gridPos": { - "h": 6, + "h": 7, "w": 12, "x": 12, - "y": 39 + "y": 0 }, - "id": 4025, + "height": null, + "hideTimeOverride": false, + "id": 322, + "interval": null, + "isNew": true, "legend": { "alignAsTable": true, "avg": false, "current": true, + "hideEmpty": true, + "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, + "sideWidth": null, + "sort": "max", + "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], - "nullPointMode": "null", - "paceLength": 10, + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", + "options": { + "alertThreshold": true, + "dataLinks": [] + }, "percentage": false, - "pointradius": 2, + "pointradius": 5, "points": false, "renderer": "flot", - "scopedVars": { - "titan_db": { - "selected": false, - 
"text": "kv", - "value": "kv" + "repeat": null, + "repeatDirection": null, + "seriesOverrides": [ + { + "alias": "count", + "bars": false, + "dashLength": 1, + "dashes": true, + "fill": 2, + "fillBelowTo": null, + "lines": true, + "spaceLength": 1, + "transform": "negative-Y", + "yaxis": 2, + "zindex": -3 + }, + { + "alias": "avg", + "bars": false, + "fill": 7, + "fillBelowTo": null, + "lines": true, + "yaxis": 1, + "zindex": 0 } - }, - "seriesOverrides": [], - "spaceLength": 10, + ], + "span": null, "stack": false, "steppedLine": false, "targets": [ { - "expr": "sum(tikv_engine_titandb_blob_file_discardable_ratio{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", db=\"$titan_db\"}) by (ratio)", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "histogram_quantile(0.9999,(\n sum(rate(\n raft_engine_write_duration_seconds_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (le) \n \n \n)) ", "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{ratio}}", - "refId": "A" + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "99.99%", + "metric": "", + "query": "histogram_quantile(0.9999,(\n sum(rate(\n raft_engine_write_duration_seconds_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (le) \n \n \n)) ", + "refId": "", + "step": 10, + "target": "" + }, + { + "datasource": "${DS_TEST-CLUSTER}", + "expr": "histogram_quantile(0.99,(\n sum(rate(\n raft_engine_write_duration_seconds_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (le) \n \n \n)) ", + "format": "time_series", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "99%", + "metric": "", + "query": "histogram_quantile(0.99,(\n sum(rate(\n 
raft_engine_write_duration_seconds_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (le) \n \n \n)) ", + "refId": "", + "step": 10, + "target": "" + }, + { + "datasource": "${DS_TEST-CLUSTER}", + "expr": "(sum(rate(\n raft_engine_write_duration_seconds_sum\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) / sum(rate(\n raft_engine_write_duration_seconds_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) )", + "format": "time_series", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "avg", + "metric": "", + "query": "(sum(rate(\n raft_engine_write_duration_seconds_sum\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) / sum(rate(\n raft_engine_write_duration_seconds_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) )", + "refId": "", + "step": 10, + "target": "" + }, + { + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n raft_engine_write_duration_seconds_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) ", + "format": "time_series", + "hide": true, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "count", + "metric": "", + "query": "sum(rate(\n raft_engine_write_duration_seconds_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) ", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, - "title": "Blob file discardable ratio distribution", + "title": "Write Duration", "tooltip": { + "msResolution": true, "shared": true, "sort": 0, "value_type": 
"individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -33745,7 +44715,8 @@ }, "yaxes": [ { - "format": "short", + "decimals": null, + "format": "s", "label": null, "logBase": 1, "max": null, @@ -33753,6 +44724,7 @@ "show": true }, { + "decimals": null, "format": "short", "label": null, "logBase": 1, @@ -33763,32 +44735,56 @@ ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } }, { "aliasColors": {}, "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", + "description": "The I/O flow rate", + "editable": true, + "error": false, + "fieldConfig": { + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } + }, "fill": 1, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, "gridPos": { - "h": 6, + "h": 7, "w": 12, "x": 0, - "y": 45 + "y": 7 }, - "id": 3414, + "height": null, + "hideTimeOverride": false, + "id": 323, + "interval": null, + "isNew": true, "legend": { "alignAsTable": true, "avg": false, "current": true, + "hideEmpty": true, + "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, - "sort": "current", + "sideWidth": null, + "sort": "max", "sortDesc": true, "total": false, "values": true @@ -33796,69 +44792,70 @@ "lines": true, "linewidth": 1, "links": [], - "nullPointMode": "null", - "paceLength": 10, + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", + "options": { + "alertThreshold": true, + "dataLinks": [] + }, "percentage": false, - "pointradius": 2, + "pointradius": 5, "points": false, "renderer": "flot", - "scopedVars": { - "titan_db": { - "selected": false, - "text": "kv", - "value": "kv" - } - }, + "repeat": null, + "repeatDirection": null, 
"seriesOverrides": [], - "spaceLength": 10, + "span": null, "stack": false, "steppedLine": false, "targets": [ { - "expr": "avg(tikv_engine_blob_key_size{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", db=\"$titan_db\", type=\"blob_key_size_average\"})", - "format": "time_series", - "hide": false, - "intervalFactor": 2, - "legendFormat": "avg", - "refId": "A" - }, - { - "expr": "avg(tikv_engine_blob_key_size{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", db=\"$titan_db\", type=\"blob_key_size_percentile95\"})", - "format": "time_series", - "intervalFactor": 1, - "legendFormat": "95%", - "refId": "C" - }, - { - "expr": "avg(tikv_engine_blob_key_size{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", db=\"$titan_db\", type=\"blob_key_size_percentile99\"})", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n raft_engine_write_size_sum\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) ", "format": "time_series", "hide": false, + "instant": false, + "interval": "", "intervalFactor": 1, - "legendFormat": "99%", - "refId": "D" + "legendFormat": "write", + "metric": "", + "query": "sum(rate(\n raft_engine_write_size_sum\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) ", + "refId": "", + "step": 10, + "target": "" }, { - "expr": "max(tikv_engine_blob_key_size{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", db=\"$titan_db\", type=\"blob_key_size_max\"})", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n raft_engine_background_rewrite_bytes_sum\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (type) ", "format": "time_series", "hide": false, + "instant": false, + "interval": "", "intervalFactor": 1, - 
"legendFormat": "max", - "refId": "B" + "legendFormat": "rewrite-{{type}}", + "metric": "", + "query": "sum(rate(\n raft_engine_background_rewrite_bytes_sum\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (type) ", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, - "title": "Blob key size", + "title": "Flow", "tooltip": { + "msResolution": true, "shared": true, "sort": 0, "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -33866,14 +44863,16 @@ }, "yaxes": [ { - "format": "bytes", + "decimals": null, + "format": "binBps", "label": null, - "logBase": 2, + "logBase": 1, "max": null, "min": null, "show": true }, { + "decimals": null, "format": "short", "label": null, "logBase": 1, @@ -33884,97 +44883,142 @@ ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } }, { "aliasColors": {}, "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", + "description": "99% duration breakdown of write operation", + "editable": true, + "error": false, + "fieldConfig": { + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } + }, "fill": 1, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, "gridPos": { - "h": 6, + "h": 7, "w": 12, "x": 12, - "y": 45 + "y": 7 }, - "id": 3446, + "height": null, + "hideTimeOverride": false, + "id": 324, + "interval": null, + "isNew": true, "legend": { "alignAsTable": true, "avg": false, "current": true, + "hideEmpty": true, + "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, + "sideWidth": null, + "sort": "max", + "sortDesc": true, "total": 
false, "values": true }, "lines": true, "linewidth": 1, "links": [], - "nullPointMode": "null", - "paceLength": 10, + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", + "options": { + "alertThreshold": true, + "dataLinks": [] + }, "percentage": false, - "pointradius": 2, + "pointradius": 5, "points": false, "renderer": "flot", - "scopedVars": { - "titan_db": { - "selected": false, - "text": "kv", - "value": "kv" - } - }, + "repeat": null, + "repeatDirection": null, "seriesOverrides": [], - "spaceLength": 10, + "span": null, "stack": false, "steppedLine": false, "targets": [ { - "expr": "avg(tikv_engine_blob_value_size{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", db=\"$titan_db\", type=\"blob_value_size_average\"})", - "format": "time_series", - "intervalFactor": 1, - "legendFormat": "avg", - "refId": "A" - }, - { - "expr": "avg(tikv_engine_blob_value_size{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", db=\"$titan_db\", type=\"blob_value_size_percentile95\"})", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "histogram_quantile(0.99,(\n sum(rate(\n raft_engine_write_preprocess_duration_seconds_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (le) \n \n \n)) ", "format": "time_series", + "hide": false, + "instant": false, + "interval": "", "intervalFactor": 1, - "legendFormat": "95%", - "refId": "B" + "legendFormat": "wait", + "metric": "", + "query": "histogram_quantile(0.99,(\n sum(rate(\n raft_engine_write_preprocess_duration_seconds_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (le) \n \n \n)) ", + "refId": "", + "step": 10, + "target": "" }, { - "expr": "avg(tikv_engine_blob_value_size{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", 
db=\"$titan_db\", type=\"blob_value_size_percentile99\"})", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "histogram_quantile(0.99,(\n sum(rate(\n raft_engine_write_leader_duration_seconds_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (le) \n \n \n)) ", "format": "time_series", + "hide": false, + "instant": false, + "interval": "", "intervalFactor": 1, - "legendFormat": "99%", - "refId": "C" + "legendFormat": "wal", + "metric": "", + "query": "histogram_quantile(0.99,(\n sum(rate(\n raft_engine_write_leader_duration_seconds_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (le) \n \n \n)) ", + "refId": "", + "step": 10, + "target": "" }, { - "expr": "max(tikv_engine_blob_value_size{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", db=\"$titan_db\", type=\"blob_value_size_max\"})", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "histogram_quantile(0.99,(\n sum(rate(\n raft_engine_write_apply_duration_seconds_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (le) \n \n \n)) ", "format": "time_series", + "hide": false, + "instant": false, + "interval": "", "intervalFactor": 1, - "legendFormat": "max", - "refId": "D" + "legendFormat": "apply", + "metric": "", + "query": "histogram_quantile(0.99,(\n sum(rate(\n raft_engine_write_apply_duration_seconds_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (le) \n \n \n)) ", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, - "title": "Blob value size", + "title": "Write Duration Breakdown (99%)", "tooltip": { + "msResolution": true, "shared": true, "sort": 0, "value_type": "individual" }, + "transformations": [], + 
"transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -33982,14 +45026,16 @@ }, "yaxes": [ { - "format": "bytes", + "decimals": null, + "format": "s", "label": null, - "logBase": 2, + "logBase": 1, "max": null, "min": null, "show": true }, { + "decimals": null, "format": "short", "label": null, "logBase": 1, @@ -34000,32 +45046,56 @@ ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } }, { "aliasColors": {}, "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", + "description": "The bytes per write", + "editable": true, + "error": false, + "fieldConfig": { + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } + }, "fill": 1, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, "gridPos": { - "h": 6, + "h": 7, "w": 12, "x": 0, - "y": 51 + "y": 14 }, - "id": 3746, + "height": null, + "hideTimeOverride": false, + "id": 325, + "interval": null, + "isNew": true, "legend": { "alignAsTable": true, "avg": false, "current": true, + "hideEmpty": true, + "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, - "sort": "current", + "sideWidth": null, + "sort": "max", "sortDesc": true, "total": false, "values": true @@ -34033,47 +45103,123 @@ "lines": true, "linewidth": 1, "links": [], - "nullPointMode": "null", - "paceLength": 10, + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", + "options": { + "alertThreshold": true, + "dataLinks": [] + }, "percentage": false, - "pointradius": 2, + "pointradius": 5, "points": false, "renderer": "flot", - "scopedVars": { - "titan_db": { - "selected": false, - "text": "kv", - "value": "kv" + "repeat": null, + "repeatDirection": null, + "seriesOverrides": [ + { + "alias": "count", + 
"bars": false, + "dashLength": 1, + "dashes": true, + "fill": 2, + "fillBelowTo": null, + "lines": true, + "spaceLength": 1, + "transform": "negative-Y", + "yaxis": 2, + "zindex": -3 + }, + { + "alias": "avg", + "bars": false, + "fill": 7, + "fillBelowTo": null, + "lines": true, + "yaxis": 1, + "zindex": 0 } - }, - "seriesOverrides": [], - "spaceLength": 10, + ], + "span": null, "stack": false, "steppedLine": false, "targets": [ { - "expr": "sum(rate(tikv_engine_blob_locate{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", db=\"$titan_db\", type=\"number_blob_get\"}[2m]))", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "histogram_quantile(0.9999,(\n sum(rate(\n raft_engine_write_size_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (le) \n \n \n)) ", "format": "time_series", "hide": false, + "instant": false, "interval": "", - "intervalFactor": 2, - "legendFormat": "get", - "refId": "A" + "intervalFactor": 1, + "legendFormat": "99.99%", + "metric": "", + "query": "histogram_quantile(0.9999,(\n sum(rate(\n raft_engine_write_size_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (le) \n \n \n)) ", + "refId": "", + "step": 10, + "target": "" + }, + { + "datasource": "${DS_TEST-CLUSTER}", + "expr": "histogram_quantile(0.99,(\n sum(rate(\n raft_engine_write_size_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (le) \n \n \n)) ", + "format": "time_series", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "99%", + "metric": "", + "query": "histogram_quantile(0.99,(\n sum(rate(\n raft_engine_write_size_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (le) \n \n \n)) ", + "refId": "", + "step": 10, + 
"target": "" + }, + { + "datasource": "${DS_TEST-CLUSTER}", + "expr": "(sum(rate(\n raft_engine_write_size_sum\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) / sum(rate(\n raft_engine_write_size_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) )", + "format": "time_series", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "avg", + "metric": "", + "query": "(sum(rate(\n raft_engine_write_size_sum\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) / sum(rate(\n raft_engine_write_size_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) )", + "refId": "", + "step": 10, + "target": "" + }, + { + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n raft_engine_write_size_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) ", + "format": "time_series", + "hide": true, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "count", + "metric": "", + "query": "sum(rate(\n raft_engine_write_size_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) ", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, - "title": "Blob get operations", + "title": "Bytes / Written", "tooltip": { + "msResolution": true, "shared": true, "sort": 0, "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -34081,7 +45227,8 @@ }, "yaxes": [ { - "format": "ops", + "decimals": null, + "format": "bytes", "label": null, "logBase": 1, "max": null, @@ 
-34089,6 +45236,7 @@ "show": true }, { + "decimals": null, "format": "short", "label": null, "logBase": 1, @@ -34099,97 +45247,157 @@ ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } }, { "aliasColors": {}, "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", + "description": "99.9% duration breakdown of WAL write operation", + "editable": true, + "error": false, + "fieldConfig": { + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } + }, "fill": 1, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, "gridPos": { - "h": 6, + "h": 7, "w": 12, "x": 12, - "y": 51 + "y": 14 }, - "id": 3655, + "height": null, + "hideTimeOverride": false, + "id": 326, + "interval": null, + "isNew": true, "legend": { "alignAsTable": true, "avg": false, "current": true, + "hideEmpty": true, + "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, + "sideWidth": null, + "sort": "max", + "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], - "nullPointMode": "null", - "paceLength": 10, + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", + "options": { + "alertThreshold": true, + "dataLinks": [] + }, "percentage": false, - "pointradius": 2, + "pointradius": 5, "points": false, "renderer": "flot", - "scopedVars": { - "titan_db": { - "selected": false, - "text": "kv", - "value": "kv" - } - }, + "repeat": null, + "repeatDirection": null, "seriesOverrides": [], - "spaceLength": 10, + "span": null, "stack": false, "steppedLine": false, "targets": [ { - "expr": "avg(tikv_engine_blob_get_micros_seconds{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", db=\"$titan_db\", type=~\".*_average\"}) by (type)", + "datasource":
"${DS_TEST-CLUSTER}", + "expr": "histogram_quantile(0.999,(\n sum(rate(\n raft_engine_write_leader_duration_seconds_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (le) \n \n \n)) ", "format": "time_series", - "intervalFactor": 2, - "legendFormat": "avg", - "refId": "A" + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "total", + "metric": "", + "query": "histogram_quantile(0.999,(\n sum(rate(\n raft_engine_write_leader_duration_seconds_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (le) \n \n \n)) ", + "refId": "", + "step": 10, + "target": "" }, { - "expr": "avg(tikv_engine_blob_get_micros_seconds{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", db=\"$titan_db\", type=~\".*_percentile95\"}) by (type)", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "histogram_quantile(0.999,(\n sum(rate(\n raft_engine_sync_log_duration_seconds_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (le) \n \n \n)) ", "format": "time_series", - "intervalFactor": 2, - "legendFormat": "95%", - "refId": "B" + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "sync", + "metric": "", + "query": "histogram_quantile(0.999,(\n sum(rate(\n raft_engine_sync_log_duration_seconds_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (le) \n \n \n)) ", + "refId": "", + "step": 10, + "target": "" }, { - "expr": "avg(tikv_engine_blob_get_micros_seconds{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", db=\"$titan_db\", type=~\".*_percentile99\"}) by (type)", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "histogram_quantile(0.999,(\n sum(rate(\n 
raft_engine_allocate_log_duration_seconds_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (le) \n \n \n)) ", "format": "time_series", - "intervalFactor": 2, - "legendFormat": "99%", - "refId": "C" + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "allocate", + "metric": "", + "query": "histogram_quantile(0.999,(\n sum(rate(\n raft_engine_allocate_log_duration_seconds_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (le) \n \n \n)) ", + "refId": "", + "step": 10, + "target": "" }, { - "expr": "max(tikv_engine_blob_get_micros_seconds{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", db=\"$titan_db\", type=~\".*_max\"}) by (type)", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "histogram_quantile(0.999,(\n sum(rate(\n raft_engine_rotate_log_duration_seconds_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (le) \n \n \n)) ", "format": "time_series", - "intervalFactor": 2, - "legendFormat": "max", - "refId": "D" + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "rotate", + "metric": "", + "query": "histogram_quantile(0.999,(\n sum(rate(\n raft_engine_rotate_log_duration_seconds_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (le) \n \n \n)) ", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, - "title": "Blob get duration", + "title": "WAL Duration Breakdown (99.9%)", "tooltip": { + "msResolution": true, "shared": true, "sort": 0, "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time",
"name": null, "show": true, @@ -34197,14 +45405,16 @@ }, "yaxes": [ { - "format": "µs", + "decimals": null, + "format": "s", "label": null, - "logBase": 2, + "logBase": 1, "max": null, "min": null, "show": true }, { + "decimals": null, "format": "short", "label": null, "logBase": 1, @@ -34215,94 +45425,142 @@ ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } }, { "aliasColors": {}, "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", + "description": "The average number of files", + "editable": true, + "error": false, + "fieldConfig": { + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } + }, "fill": 1, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, "gridPos": { - "h": 6, + "h": 7, "w": 12, "x": 0, - "y": 57 + "y": 21 }, - "id": 3338, + "height": null, + "hideTimeOverride": false, + "id": 327, + "interval": null, + "isNew": true, "legend": { "alignAsTable": true, "avg": false, "current": true, + "hideEmpty": true, + "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, + "sideWidth": null, + "sort": "max", + "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], - "nullPointMode": "null", - "paceLength": 10, + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", + "options": { + "alertThreshold": true, + "dataLinks": [] + }, "percentage": false, - "pointradius": 2, + "pointradius": 5, "points": false, "renderer": "flot", - "scopedVars": { - "titan_db": { - "selected": false, - "text": "kv", - "value": "kv" - } - }, + "repeat": null, + "repeatDirection": null, "seriesOverrides": [], - "spaceLength": 10, + "span": null, "stack": false, "steppedLine": false, "targets": [ { - "expr": 
"sum(rate(tikv_engine_blob_locate{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", db=\"$titan_db\", type=\"number_blob_seek\"}[2m]))", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "avg((\n raft_engine_log_file_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n \n)) by (type) ", "format": "time_series", "hide": false, + "instant": false, "interval": "", - "intervalFactor": 2, - "legendFormat": "seek", - "refId": "A" + "intervalFactor": 1, + "legendFormat": "{{type}}", + "metric": "", + "query": "avg((\n raft_engine_log_file_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n \n)) by (type) ", + "refId": "", + "step": 10, + "target": "" }, { - "expr": "sum(rate(tikv_engine_blob_locate{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", db=\"$titan_db\", type=\"number_blob_prev\"}[2m]))", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "avg((\n raft_engine_swap_file_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n \n)) ", "format": "time_series", "hide": false, + "instant": false, + "interval": "", "intervalFactor": 1, - "legendFormat": "prev", - "refId": "B" + "legendFormat": "swap", + "metric": "", + "query": "avg((\n raft_engine_swap_file_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n \n)) ", + "refId": "", + "step": 10, + "target": "" }, { - "expr": "sum(rate(tikv_engine_blob_locate{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", db=\"$titan_db\", type=\"number_blob_next\"}[2m]))", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "avg((\n raft_engine_recycled_file_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n \n)) by (type) ", "format": "time_series", "hide": false, + "instant": false, + "interval": "", "intervalFactor": 
1, - "legendFormat": "next", - "refId": "C" + "legendFormat": "{{type}}-recycle", + "metric": "", + "query": "avg((\n raft_engine_recycled_file_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n \n)) by (type) ", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, - "title": "Blob iter operations", + "title": "File Count", "tooltip": { + "msResolution": true, "shared": true, "sort": 0, "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -34310,7 +45568,8 @@ }, "yaxes": [ { - "format": "ops", + "decimals": null, + "format": "short", "label": null, "logBase": 1, "max": null, @@ -34318,6 +45577,7 @@ "show": true }, { + "decimals": null, "format": "short", "label": null, "logBase": 1, @@ -34328,32 +45588,56 @@ ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } }, { "aliasColors": {}, "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", + "description": "The 99.9% duration of operations other than write", + "editable": true, + "error": false, + "fieldConfig": { + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } + }, "fill": 1, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, "gridPos": { - "h": 6, + "h": 7, "w": 12, "x": 12, - "y": 57 + "y": 21 }, - "id": 3412, + "height": null, + "hideTimeOverride": false, + "id": 328, + "interval": null, + "isNew": true, "legend": { "alignAsTable": true, "avg": false, "current": true, + "hideEmpty": true, + "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, - "sort": "current", + "sideWidth": null, + "sort": "max", "sortDesc": true, "total":
false, "values": true @@ -34361,66 +45645,85 @@ "lines": true, "linewidth": 1, "links": [], - "nullPointMode": "null", - "paceLength": 10, + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", + "options": { + "alertThreshold": true, + "dataLinks": [] + }, "percentage": false, - "pointradius": 2, + "pointradius": 5, "points": false, "renderer": "flot", - "scopedVars": { - "titan_db": { - "selected": false, - "text": "kv", - "value": "kv" - } - }, + "repeat": null, + "repeatDirection": null, "seriesOverrides": [], - "spaceLength": 10, + "span": null, "stack": false, "steppedLine": false, "targets": [ { - "expr": "avg(tikv_engine_blob_seek_micros_seconds{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", db=\"$titan_db\", type=~\".*_average\"}) by (type)", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "avg", - "refId": "A" - }, - { - "expr": "avg(tikv_engine_blob_seek_micros_seconds{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", db=\"$titan_db\", type=~\".*_percentile95\"}) by (type)", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "histogram_quantile(0.999,(\n sum(rate(\n raft_engine_read_entry_duration_seconds_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (le) \n \n \n)) ", "format": "time_series", - "intervalFactor": 2, - "legendFormat": "95%", - "refId": "B" + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "read_entry", + "metric": "", + "query": "histogram_quantile(0.999,(\n sum(rate(\n raft_engine_read_entry_duration_seconds_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (le) \n \n \n)) ", + "refId": "", + "step": 10, + "target": "" }, { - "expr": "avg(tikv_engine_blob_seek_micros_seconds{k8s_cluster=\"$k8s_cluster\", 
tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", db=\"$titan_db\", type=~\".*_percentile99\"}) by (type)", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "histogram_quantile(0.999,(\n sum(rate(\n raft_engine_read_message_duration_seconds_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (le) \n \n \n)) ", "format": "time_series", - "intervalFactor": 2, - "legendFormat": "99%", - "refId": "C" + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "read_message", + "metric": "", + "query": "histogram_quantile(0.999,(\n sum(rate(\n raft_engine_read_message_duration_seconds_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (le) \n \n \n)) ", + "refId": "", + "step": 10, + "target": "" }, { - "expr": "max(tikv_engine_blob_seek_micros_seconds{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", db=\"$titan_db\", type=~\".*_max\"}) by (type)", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "histogram_quantile(0.999,(\n sum(rate(\n raft_engine_purge_duration_seconds_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (le) \n \n \n)) ", "format": "time_series", - "intervalFactor": 2, - "legendFormat": "max", - "refId": "D" + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "purge", + "metric": "", + "query": "histogram_quantile(0.999,(\n sum(rate(\n raft_engine_purge_duration_seconds_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (le) \n \n \n)) ", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, - "title": "Blob seek duration", + "title": "Other Durations (99.9%)", "tooltip": { +
"msResolution": true, "shared": true, "sort": 0, "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -34428,7 +45731,8 @@ }, "yaxes": [ { - "format": "µs", + "decimals": null, + "format": "s", "label": null, "logBase": 2, "max": null, @@ -34436,6 +45740,7 @@ "show": true }, { + "decimals": null, "format": "short", "label": null, "logBase": 1, @@ -34446,32 +45751,56 @@ ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } }, { "aliasColors": {}, "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", + "description": "The average number of log entries", + "editable": true, + "error": false, + "fieldConfig": { + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } + }, "fill": 1, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, "gridPos": { - "h": 6, - "w": 12, + "h": 7, + "w": 24, "x": 0, - "y": 63 + "y": 28 }, - "id": 4092, + "height": null, + "hideTimeOverride": false, + "id": 329, + "interval": null, + "isNew": true, "legend": { "alignAsTable": true, "avg": false, "current": true, + "hideEmpty": true, + "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, - "sort": "current", + "sideWidth": null, + "sort": "max", "sortDesc": true, "total": false, "values": true @@ -34479,66 +45808,55 @@ "lines": true, "linewidth": 1, "links": [], - "nullPointMode": "null", - "paceLength": 10, + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", + "options": { + "alertThreshold": true, + "dataLinks": [] + }, "percentage": false, - "pointradius": 2, + "pointradius": 5, "points": false, "renderer": "flot", - "scopedVars": { - "titan_db": { - "selected": false, - 
"text": "kv", - "value": "kv" - } - }, + "repeat": null, + "repeatDirection": null, "seriesOverrides": [], - "spaceLength": 10, + "span": null, "stack": false, "steppedLine": false, "targets": [ { - "expr": "avg(tikv_engine_blob_next_micros_seconds{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", db=\"$titan_db\", type=~\".*_average\"}) by (type)", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "avg", - "refId": "A" - }, - { - "expr": "avg(tikv_engine_blob_next_micros_seconds{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", db=\"$titan_db\", type=~\".*_percentile95\"}) by (type)", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "95%", - "refId": "B" - }, - { - "expr": "avg(tikv_engine_blob_next_micros_seconds{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", db=\"$titan_db\", type=~\".*_percentile99\"}) by (type)", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "99%", - "refId": "C" - }, - { - "expr": "max(tikv_engine_blob_next_micros_seconds{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", db=\"$titan_db\", type=~\".*_max\"}) by (type)", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "avg((\n raft_engine_log_entry_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n \n)) by (type) ", "format": "time_series", - "intervalFactor": 2, - "legendFormat": "max", - "refId": "D" + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{type}}", + "metric": "", + "query": "avg((\n raft_engine_log_entry_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n \n)) by (type) ", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, - "title": "Blob next duration", + "title": 
"Entry Count", "tooltip": { + "msResolution": true, "shared": true, "sort": 0, "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -34546,14 +45864,16 @@ }, "yaxes": [ { - "format": "µs", + "decimals": null, + "format": "short", "label": null, - "logBase": 2, + "logBase": 1, "max": null, "min": null, "show": true }, { + "decimals": null, "format": "short", "label": null, "logBase": 1, @@ -34564,32 +45884,98 @@ ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } - }, + } + ], + "repeat": null, + "repeatDirection": null, + "span": null, + "targets": [], + "timeFrom": null, + "timeShift": null, + "title": "Raft Engine", + "transformations": [], + "transparent": false, + "type": "row" + }, + { + "cacheTimeout": null, + "collapsed": true, + "datasource": null, + "description": null, + "editable": true, + "error": false, + "fieldConfig": { + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } + }, + "gridPos": { + "h": 7, + "w": 24, + "x": 0, + "y": 0 + }, + "height": null, + "hideTimeOverride": false, + "id": 330, + "interval": null, + "links": [], + "maxDataPoints": 100, + "maxPerRow": null, + "minSpan": null, + "panels": [ { "aliasColors": {}, "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", + "description": null, + "editable": true, + "error": false, + "fieldConfig": { + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } + }, "fill": 1, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, "gridPos": { - "h": 6, + "h": 7, "w": 12, - "x": 12, - "y": 63 + "x": 0, + "y": 0 }, - "id": 4093, + "height": null, + "hideTimeOverride": false, + "id": 331, + "interval": null, + "isNew": true, "legend": { 
"alignAsTable": true, "avg": false, "current": true, + "hideEmpty": true, + "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, - "sort": "current", + "sideWidth": null, + "sort": "max", "sortDesc": true, "total": false, "values": true @@ -34597,66 +45983,70 @@ "lines": true, "linewidth": 1, "links": [], - "nullPointMode": "null", - "paceLength": 10, + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", + "options": { + "alertThreshold": true, + "dataLinks": [] + }, "percentage": false, - "pointradius": 2, + "pointradius": 5, "points": false, "renderer": "flot", - "scopedVars": { - "titan_db": { - "selected": false, - "text": "kv", - "value": "kv" - } - }, + "repeat": null, + "repeatDirection": null, "seriesOverrides": [], - "spaceLength": 10, + "span": null, "stack": false, "steppedLine": false, "targets": [ { - "expr": "avg(tikv_engine_blob_prev_micros_seconds{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", db=\"$titan_db\", type=~\".*_average\"}) by (type)", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "avg", - "refId": "A" - }, - { - "expr": "avg(tikv_engine_blob_prev_micros_seconds{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", db=\"$titan_db\", type=~\".*_percentile95\"}) by (type)", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum((\n tikv_engine_titandb_num_live_blob_file\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$titan_db\"}\n \n)) ", "format": "time_series", - "intervalFactor": 2, - "legendFormat": "95%", - "refId": "B" - }, - { - "expr": "avg(tikv_engine_blob_prev_micros_seconds{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", db=\"$titan_db\", type=~\".*_percentile99\"}) by (type)", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "99%", - "refId": "C" + "hide": false, + 
"instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "live blob file num", + "metric": "", + "query": "sum((\n tikv_engine_titandb_num_live_blob_file\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$titan_db\"}\n \n)) ", + "refId": "", + "step": 10, + "target": "" }, { - "expr": "max(tikv_engine_blob_prev_micros_seconds{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", db=\"$titan_db\", type=~\".*_max\"}) by (type)", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum((\n tikv_engine_titandb_num_obsolete_blob_file\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$titan_db\"}\n \n)) ", "format": "time_series", - "intervalFactor": 2, - "legendFormat": "max", - "refId": "D" + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "obsolete blob file num", + "metric": "", + "query": "sum((\n tikv_engine_titandb_num_obsolete_blob_file\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$titan_db\"}\n \n)) ", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, - "title": "Blob prev duration", + "title": "Blob file count", "tooltip": { + "msResolution": true, "shared": true, "sort": 0, "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -34664,14 +46054,16 @@ }, "yaxes": [ { - "format": "µs", + "decimals": null, + "format": "none", "label": null, - "logBase": 2, + "logBase": 1, "max": null, "min": null, "show": true }, { + "decimals": null, "format": "short", "label": null, "logBase": 1, @@ -34682,76 +46074,127 @@ ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } }, { "aliasColors": {}, "bars": false, - "dashLength": 10, - "dashes": 
false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", + "description": null, + "editable": true, + "error": false, + "fieldConfig": { + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } + }, "fill": 1, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, "gridPos": { - "h": 6, + "h": 7, "w": 12, - "x": 0, - "y": 69 + "x": 12, + "y": 0 }, - "id": 3645, + "height": null, + "hideTimeOverride": false, + "id": 332, + "interval": null, + "isNew": true, "legend": { "alignAsTable": true, "avg": false, "current": true, + "hideEmpty": true, + "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, + "sideWidth": null, + "sort": "max", + "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], - "nullPointMode": "null", - "paceLength": 10, + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", + "options": { + "alertThreshold": true, + "dataLinks": [] + }, "percentage": false, - "pointradius": 2, + "pointradius": 5, "points": false, "renderer": "flot", - "scopedVars": { - "titan_db": { - "selected": false, - "text": "kv", - "value": "kv" - } - }, + "repeat": null, + "repeatDirection": null, "seriesOverrides": [], - "spaceLength": 10, + "span": null, "stack": false, "steppedLine": false, "targets": [ { - "expr": "sum(rate(tikv_engine_blob_flow_bytes{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", db=\"$titan_db\", type=~\"keys.*\"}[30s])) by (type)", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum((\n tikv_engine_titandb_live_blob_file_size\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$titan_db\"}\n \n)) ", "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{type}}", - "refId": "A" + "hide": false, + 
"instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "live blob file size", + "metric": "", + "query": "sum((\n tikv_engine_titandb_live_blob_file_size\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$titan_db\"}\n \n)) ", + "refId": "", + "step": 10, + "target": "" + }, + { + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum((\n tikv_engine_titandb_obsolete_blob_file_size\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$titan_db\"}\n \n)) ", + "format": "time_series", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "obsolete blob file size", + "metric": "", + "query": "sum((\n tikv_engine_titandb_obsolete_blob_file_size\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$titan_db\"}\n \n)) ", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, - "title": "Blob keys flow", + "title": "Blob file size", "tooltip": { + "msResolution": true, "shared": true, "sort": 0, "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -34759,7 +46202,8 @@ }, "yaxes": [ { - "format": "Bps", + "decimals": null, + "format": "bytes", "label": null, "logBase": 1, "max": null, @@ -34767,6 +46211,7 @@ "show": true }, { + "decimals": null, "format": "short", "label": null, "logBase": 1, @@ -34777,76 +46222,112 @@ ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } }, { "aliasColors": {}, "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", + "description": null, + "editable": true, + "error": false, + "fieldConfig": { + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } + }, "fill": 1, + 
"fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, "gridPos": { - "h": 6, + "h": 7, "w": 12, - "x": 12, - "y": 69 + "x": 0, + "y": 7 }, - "id": 3643, + "height": null, + "hideTimeOverride": false, + "id": 333, + "interval": null, + "isNew": true, "legend": { "alignAsTable": true, "avg": false, "current": true, + "hideEmpty": true, + "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, + "sideWidth": null, + "sort": "max", + "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], - "nullPointMode": "null", - "paceLength": 10, + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", + "options": { + "alertThreshold": true, + "dataLinks": [] + }, "percentage": false, - "pointradius": 2, + "pointradius": 5, "points": false, "renderer": "flot", - "scopedVars": { - "titan_db": { - "selected": false, - "text": "kv", - "value": "kv" - } - }, + "repeat": null, + "repeatDirection": null, "seriesOverrides": [], - "spaceLength": 10, + "span": null, "stack": false, "steppedLine": false, "targets": [ { - "expr": "sum(rate(tikv_engine_blob_flow_bytes{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", db=\"$titan_db\", type=~\"bytes.*\"}[2m])) by (type)", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum((\n tikv_engine_titandb_live_blob_size\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$titan_db\"}\n \n)) by (instance) ", "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{type}}", - "refId": "A" + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "live blob size", + "metric": "", + "query": "sum((\n tikv_engine_titandb_live_blob_size\n 
{k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$titan_db\"}\n \n)) by (instance) ", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, - "title": "Blob bytes flow", + "title": "Live blob size", "tooltip": { + "msResolution": true, "shared": true, "sort": 0, "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -34854,7 +46335,8 @@ }, "yaxes": [ { - "format": "Bps", + "decimals": null, + "format": "bytes", "label": null, "logBase": 1, "max": null, @@ -34862,6 +46344,7 @@ "show": true }, { + "decimals": null, "format": "short", "label": null, "logBase": 1, @@ -34872,97 +46355,112 @@ ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } }, { "aliasColors": {}, "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", + "description": "The hit rate of block cache", + "editable": true, + "error": false, + "fieldConfig": { + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } + }, "fill": 1, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, "gridPos": { - "h": 6, + "h": 7, "w": 12, - "x": 0, - "y": 75 + "x": 12, + "y": 7 }, - "id": 3657, + "height": null, + "hideTimeOverride": false, + "id": 334, + "interval": null, + "isNew": true, "legend": { "alignAsTable": true, "avg": false, "current": true, + "hideEmpty": true, + "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, + "sideWidth": null, + "sort": "max", + "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], - "nullPointMode": "null", - "paceLength": 10, + "maxDataPoints": null, + "maxPerRow": 
null, + "minSpan": null, + "nullPointMode": "null as zero", + "options": { + "alertThreshold": true, + "dataLinks": [] + }, "percentage": false, - "pointradius": 2, + "pointradius": 5, "points": false, "renderer": "flot", - "scopedVars": { - "titan_db": { - "selected": false, - "text": "kv", - "value": "kv" - } - }, + "repeat": null, + "repeatDirection": null, "seriesOverrides": [], - "spaceLength": 10, + "span": null, "stack": false, "steppedLine": false, "targets": [ { - "expr": "avg(tikv_engine_blob_file_read_micros_seconds{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", db=\"$titan_db\", type=\"blob_file_read_micros_average\"})", - "format": "time_series", - "intervalFactor": 1, - "legendFormat": "avg", - "refId": "A" - }, - { - "expr": "avg(tikv_engine_blob_file_read_micros_seconds{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", db=\"$titan_db\", type=\"blob_file_read_micros_percentile99\"})", - "format": "time_series", - "intervalFactor": 1, - "legendFormat": "99%", - "refId": "B" - }, - { - "expr": "avg(tikv_engine_blob_file_read_micros_seconds{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", db=\"$titan_db\", type=\"blob_file_read_micros_percentile95\"})", - "format": "time_series", - "intervalFactor": 1, - "legendFormat": "95%", - "refId": "C" - }, - { - "expr": "max(tikv_engine_blob_file_read_micros_seconds{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", db=\"$titan_db\", type=\"blob_file_read_micros_max\"})", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "(sum(rate(\n tikv_engine_blob_cache_efficiency\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$titan_db\",type=\"blob_cache_hit\"}\n [$__rate_interval]\n)) / (sum(rate(\n tikv_engine_blob_cache_efficiency\n 
{k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$titan_db\",type=\"blob_cache_hit\"}\n [$__rate_interval]\n)) + sum(rate(\n tikv_engine_blob_cache_efficiency\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$titan_db\",type=\"blob_cache_miss\"}\n [$__rate_interval]\n)) ))", "format": "time_series", + "hide": false, + "instant": false, + "interval": "", "intervalFactor": 1, - "legendFormat": "max", - "refId": "D" + "legendFormat": "all", + "metric": "", + "query": "(sum(rate(\n tikv_engine_blob_cache_efficiency\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$titan_db\",type=\"blob_cache_hit\"}\n [$__rate_interval]\n)) / (sum(rate(\n tikv_engine_blob_cache_efficiency\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$titan_db\",type=\"blob_cache_hit\"}\n [$__rate_interval]\n)) + sum(rate(\n tikv_engine_blob_cache_efficiency\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$titan_db\",type=\"blob_cache_miss\"}\n [$__rate_interval]\n)) ))", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, - "title": "Blob file read duration", + "title": "Blob cache hit", "tooltip": { + "msResolution": true, "shared": true, "sort": 0, "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -34970,14 +46468,16 @@ }, "yaxes": [ { - "format": "µs", + "decimals": null, + "format": "percentunit", "label": null, - "logBase": 2, + "logBase": 1, "max": null, "min": null, "show": true }, { + "decimals": null, "format": "short", "label": null, "logBase": 1, @@ -34988,97 +46488,157 @@ ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } }, { "aliasColors": {}, "bars": false, - 
"dashLength": 10, - "dashes": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", + "description": null, + "editable": true, + "error": false, + "fieldConfig": { + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } + }, "fill": 1, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, "gridPos": { - "h": 6, + "h": 7, "w": 12, - "x": 12, - "y": 75 + "x": 0, + "y": 14 }, - "id": 3408, + "height": null, + "hideTimeOverride": false, + "id": 335, + "interval": null, + "isNew": true, "legend": { "alignAsTable": true, "avg": false, "current": true, + "hideEmpty": true, + "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, + "sideWidth": null, + "sort": "max", + "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], - "nullPointMode": "null", - "paceLength": 10, + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", + "options": { + "alertThreshold": true, + "dataLinks": [] + }, "percentage": false, - "pointradius": 2, + "pointradius": 5, "points": false, "renderer": "flot", - "scopedVars": { - "titan_db": { - "selected": false, - "text": "kv", - "value": "kv" - } - }, + "repeat": null, + "repeatDirection": null, "seriesOverrides": [], - "spaceLength": 10, + "span": null, "stack": false, "steppedLine": false, "targets": [ { - "expr": "avg(tikv_engine_blob_file_write_micros_seconds{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", db=\"$titan_db\", type=\"blob_file_write_micros_average\"})", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "avg((\n tikv_engine_blob_iter_touch_blob_file_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$titan_db\",type=\"blob_iter_touch_blob_file_count_average\"}\n \n)) ", "format": 
"time_series", + "hide": false, + "instant": false, + "interval": "", "intervalFactor": 1, "legendFormat": "avg", - "refId": "A" + "metric": "", + "query": "avg((\n tikv_engine_blob_iter_touch_blob_file_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$titan_db\",type=\"blob_iter_touch_blob_file_count_average\"}\n \n)) ", + "refId": "", + "step": 10, + "target": "" }, { - "expr": "avg(tikv_engine_blob_file_write_micros_seconds{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", db=\"$titan_db\", type=\"blob_file_write_micros_percentile99\"})", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "avg((\n tikv_engine_blob_iter_touch_blob_file_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$titan_db\",type=\"blob_iter_touch_blob_file_count_percentile95\"}\n \n)) ", "format": "time_series", + "hide": false, + "instant": false, + "interval": "", "intervalFactor": 1, - "legendFormat": "99%", - "refId": "B" + "legendFormat": "95%", + "metric": "", + "query": "avg((\n tikv_engine_blob_iter_touch_blob_file_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$titan_db\",type=\"blob_iter_touch_blob_file_count_percentile95\"}\n \n)) ", + "refId": "", + "step": 10, + "target": "" }, { - "expr": "avg(tikv_engine_blob_file_write_micros_seconds{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", db=\"$titan_db\", type=\"blob_file_write_micros_percentile95\"})", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "avg((\n tikv_engine_blob_iter_touch_blob_file_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$titan_db\",type=\"blob_iter_touch_blob_file_count_percentile99\"}\n \n)) ", "format": "time_series", + "hide": false, + "instant": false, + "interval": "", "intervalFactor": 1, - "legendFormat": "95%", - "refId": "C" + 
"legendFormat": "99%", + "metric": "", + "query": "avg((\n tikv_engine_blob_iter_touch_blob_file_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$titan_db\",type=\"blob_iter_touch_blob_file_count_percentile99\"}\n \n)) ", + "refId": "", + "step": 10, + "target": "" }, { - "expr": "max(tikv_engine_blob_file_write_micros_seconds{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", db=\"$titan_db\", type=\"blob_file_write_micros_max\"})", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "max((\n tikv_engine_blob_iter_touch_blob_file_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$titan_db\",type=\"blob_iter_touch_blob_file_count_max\"}\n \n)) ", "format": "time_series", + "hide": false, + "instant": false, + "interval": "", "intervalFactor": 1, "legendFormat": "max", - "refId": "D" + "metric": "", + "query": "max((\n tikv_engine_blob_iter_touch_blob_file_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$titan_db\",type=\"blob_iter_touch_blob_file_count_max\"}\n \n)) ", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, - "title": "Blob file write duration", + "title": "Iter touched blob file count", "tooltip": { + "msResolution": true, "shared": true, "sort": 0, "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -35086,14 +46646,16 @@ }, "yaxes": [ { - "format": "µs", + "decimals": null, + "format": "none", "label": null, - "logBase": 2, + "logBase": 1, "max": null, "min": null, "show": true }, { + "decimals": null, "format": "short", "label": null, "logBase": 1, @@ -35104,76 +46666,112 @@ ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } }, { "aliasColors": {}, "bars": 
false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", + "description": "The blob cache size.", + "editable": true, + "error": false, + "fieldConfig": { + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } + }, "fill": 1, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, "gridPos": { - "h": 6, + "h": 7, "w": 12, - "x": 0, - "y": 81 + "x": 12, + "y": 14 }, - "id": 3651, + "height": null, + "hideTimeOverride": false, + "id": 336, + "interval": null, + "isNew": true, "legend": { "alignAsTable": true, "avg": false, "current": true, + "hideEmpty": true, + "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, + "sideWidth": null, + "sort": "max", + "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], - "nullPointMode": "null", - "paceLength": 10, + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", + "options": { + "alertThreshold": true, + "dataLinks": [] + }, "percentage": false, - "pointradius": 2, + "pointradius": 5, "points": false, "renderer": "flot", - "scopedVars": { - "titan_db": { - "selected": false, - "text": "kv", - "value": "kv" - } - }, + "repeat": null, + "repeatDirection": null, "seriesOverrides": [], - "spaceLength": 10, + "span": null, "stack": false, "steppedLine": false, "targets": [ { - "expr": "sum(rate(tikv_engine_blob_file_synced{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", db=\"$titan_db\"}[2m]))", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "topk(20,(\n avg((\n tikv_engine_blob_cache_size_bytes\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$titan_db\"}\n \n)) by (cf, instance) \n \n \n)) ", "format": "time_series", - "intervalFactor": 2, - 
"legendFormat": "sync", - "refId": "A" + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{instance}}-{{cf}}", + "metric": "", + "query": "topk(20,(\n avg((\n tikv_engine_blob_cache_size_bytes\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$titan_db\"}\n \n)) by (cf, instance) \n \n \n)) ", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, - "title": "Blob file sync operations", + "title": "Blob cache size", "tooltip": { + "msResolution": true, "shared": true, "sort": 0, "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -35181,7 +46779,8 @@ }, "yaxes": [ { - "format": "ops", + "decimals": null, + "format": "bytes", "label": null, "logBase": 1, "max": null, @@ -35189,6 +46788,7 @@ "show": true }, { + "decimals": null, "format": "short", "label": null, "logBase": 1, @@ -35199,97 +46799,157 @@ ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } }, { "aliasColors": {}, "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", + "description": null, + "editable": true, + "error": false, + "fieldConfig": { + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } + }, "fill": 1, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, "gridPos": { - "h": 6, + "h": 7, "w": 12, - "x": 12, - "y": 81 + "x": 0, + "y": 21 }, - "id": 3653, + "height": null, + "hideTimeOverride": false, + "id": 337, + "interval": null, + "isNew": true, "legend": { "alignAsTable": true, "avg": false, "current": true, + "hideEmpty": true, + "hideZero": true, "max": true, "min": false, "rightSide": 
true, "show": true, + "sideWidth": null, + "sort": "max", + "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], - "nullPointMode": "null", - "paceLength": 10, + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", + "options": { + "alertThreshold": true, + "dataLinks": [] + }, "percentage": false, - "pointradius": 2, + "pointradius": 5, "points": false, "renderer": "flot", - "scopedVars": { - "titan_db": { - "selected": false, - "text": "kv", - "value": "kv" - } - }, + "repeat": null, + "repeatDirection": null, "seriesOverrides": [], - "spaceLength": 10, + "span": null, "stack": false, "steppedLine": false, "targets": [ { - "expr": "avg(tikv_engine_blob_file_sync_micros_seconds{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", db=\"$titan_db\", type=\"blob_file_sync_micros_average\"})", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "avg((\n tikv_engine_blob_key_size\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$titan_db\",type=\"blob_key_size_average\"}\n \n)) ", "format": "time_series", - "intervalFactor": 2, + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, "legendFormat": "avg", - "refId": "A" + "metric": "", + "query": "avg((\n tikv_engine_blob_key_size\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$titan_db\",type=\"blob_key_size_average\"}\n \n)) ", + "refId": "", + "step": 10, + "target": "" }, { - "expr": "avg(tikv_engine_blob_file_sync_micros_seconds{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", db=\"$titan_db\", type=\"blob_file_sync_micros_percentile95\"})", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "avg((\n tikv_engine_blob_key_size\n 
{k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$titan_db\",type=\"blob_key_size_percentile95\"}\n \n)) ", "format": "time_series", + "hide": false, + "instant": false, + "interval": "", "intervalFactor": 1, "legendFormat": "95%", - "refId": "B" + "metric": "", + "query": "avg((\n tikv_engine_blob_key_size\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$titan_db\",type=\"blob_key_size_percentile95\"}\n \n)) ", + "refId": "", + "step": 10, + "target": "" }, { - "expr": "avg(tikv_engine_blob_file_sync_micros_seconds{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", db=\"$titan_db\", type=\"blob_file_sync_micros_percentile99\"})", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "avg((\n tikv_engine_blob_key_size\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$titan_db\",type=\"blob_key_size_percentile99\"}\n \n)) ", "format": "time_series", + "hide": false, + "instant": false, + "interval": "", "intervalFactor": 1, "legendFormat": "99%", - "refId": "C" + "metric": "", + "query": "avg((\n tikv_engine_blob_key_size\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$titan_db\",type=\"blob_key_size_percentile99\"}\n \n)) ", + "refId": "", + "step": 10, + "target": "" }, { - "expr": "max(tikv_engine_blob_file_sync_micros_seconds{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", db=\"$titan_db\", type=\"blob_file_sync_micros_max\"})", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "max((\n tikv_engine_blob_key_size\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$titan_db\",type=\"blob_key_size_max\"}\n \n)) ", "format": "time_series", + "hide": false, + "instant": false, + "interval": "", "intervalFactor": 1, "legendFormat": "max", - "refId": "D" + "metric": "", + "query": "max((\n 
tikv_engine_blob_key_size\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$titan_db\",type=\"blob_key_size_max\"}\n \n)) ", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, - "title": "Blob file sync duration", + "title": "Blob key size", "tooltip": { + "msResolution": true, "shared": true, "sort": 0, "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -35297,14 +46957,16 @@ }, "yaxes": [ { - "format": "µs", + "decimals": null, + "format": "bytes", "label": null, - "logBase": 2, + "logBase": 1, "max": null, "min": null, "show": true }, { + "decimals": null, "format": "short", "label": null, "logBase": 1, @@ -35315,77 +46977,157 @@ ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } }, { "aliasColors": {}, "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", + "description": null, + "editable": true, + "error": false, + "fieldConfig": { + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } + }, "fill": 1, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, "gridPos": { - "h": 6, + "h": 7, "w": 12, - "x": 0, - "y": 87 + "x": 12, + "y": 21 }, - "id": 5018, + "height": null, + "hideTimeOverride": false, + "id": 338, + "interval": null, + "isNew": true, "legend": { "alignAsTable": true, "avg": false, "current": true, + "hideEmpty": true, + "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, + "sideWidth": null, + "sort": "max", + "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], - "nullPointMode": "null", - "paceLength": 10, + 
"maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", + "options": { + "alertThreshold": true, + "dataLinks": [] + }, "percentage": false, - "pointradius": 2, + "pointradius": 5, "points": false, "renderer": "flot", - "scopedVars": { - "titan_db": { - "selected": false, - "text": "kv", - "value": "kv" - } - }, + "repeat": null, + "repeatDirection": null, "seriesOverrides": [], - "spaceLength": 10, + "span": null, "stack": false, "steppedLine": false, "targets": [ { - "expr": "sum(rate(tikv_engine_blob_gc_action_count{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", db=\"$titan_db\"}[2m])) by (type)", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "avg((\n tikv_engine_blob_value_size\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$titan_db\",type=\"blob_value_size_average\"}\n \n)) ", "format": "time_series", "hide": false, - "intervalFactor": 2, - "legendFormat": "{{type}}", - "refId": "B" + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "avg", + "metric": "", + "query": "avg((\n tikv_engine_blob_value_size\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$titan_db\",type=\"blob_value_size_average\"}\n \n)) ", + "refId": "", + "step": 10, + "target": "" + }, + { + "datasource": "${DS_TEST-CLUSTER}", + "expr": "avg((\n tikv_engine_blob_value_size\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$titan_db\",type=\"blob_value_size_percentile95\"}\n \n)) ", + "format": "time_series", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "95%", + "metric": "", + "query": "avg((\n tikv_engine_blob_value_size\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$titan_db\",type=\"blob_value_size_percentile95\"}\n \n)) ", + "refId": "", + "step": 10, + 
"target": "" + }, + { + "datasource": "${DS_TEST-CLUSTER}", + "expr": "avg((\n tikv_engine_blob_value_size\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$titan_db\",type=\"blob_value_size_percentile99\"}\n \n)) ", + "format": "time_series", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "99%", + "metric": "", + "query": "avg((\n tikv_engine_blob_value_size\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$titan_db\",type=\"blob_value_size_percentile99\"}\n \n)) ", + "refId": "", + "step": 10, + "target": "" + }, + { + "datasource": "${DS_TEST-CLUSTER}", + "expr": "max((\n tikv_engine_blob_value_size\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$titan_db\",type=\"blob_value_size_max\"}\n \n)) ", + "format": "time_series", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "max", + "metric": "", + "query": "max((\n tikv_engine_blob_value_size\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$titan_db\",type=\"blob_value_size_max\"}\n \n)) ", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, - "title": "Blob GC action", + "title": "Blob value size", "tooltip": { + "msResolution": true, "shared": true, "sort": 0, "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -35393,7 +47135,8 @@ }, "yaxes": [ { - "format": "short", + "decimals": null, + "format": "bytes", "label": null, "logBase": 1, "max": null, @@ -35401,6 +47144,7 @@ "show": true }, { + "decimals": null, "format": "short", "label": null, "logBase": 1, @@ -35411,97 +47155,112 @@ ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } }, 
{ "aliasColors": {}, "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", + "description": null, + "editable": true, + "error": false, + "fieldConfig": { + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } + }, "fill": 1, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, "gridPos": { - "h": 6, + "h": 7, "w": 12, - "x": 12, - "y": 87 + "x": 0, + "y": 28 }, - "id": 3410, + "height": null, + "hideTimeOverride": false, + "id": 339, + "interval": null, + "isNew": true, "legend": { "alignAsTable": true, "avg": false, "current": true, + "hideEmpty": true, + "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, + "sideWidth": null, + "sort": "max", + "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], - "nullPointMode": "null", - "paceLength": 10, + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", + "options": { + "alertThreshold": true, + "dataLinks": [] + }, "percentage": false, - "pointradius": 2, + "pointradius": 5, "points": false, "renderer": "flot", - "scopedVars": { - "titan_db": { - "selected": false, - "text": "kv", - "value": "kv" - } - }, + "repeat": null, + "repeatDirection": null, "seriesOverrides": [], - "spaceLength": 10, + "span": null, "stack": false, "steppedLine": false, "targets": [ { - "expr": "avg(tikv_engine_blob_gc_micros_seconds{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", db=\"$titan_db\", type=\"blob_gc_micros_average\"})", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "avg", - "refId": "A" - }, - { - "expr": "avg(tikv_engine_blob_gc_micros_seconds{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", db=\"$titan_db\", 
type=\"blob_gc_micros_percentile95\"})", - "format": "time_series", - "intervalFactor": 1, - "legendFormat": "95%", - "refId": "B" - }, - { - "expr": "avg(tikv_engine_blob_gc_micros_seconds{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", db=\"$titan_db\", type=\"blob_gc_micros_percentile99\"})", - "format": "time_series", - "intervalFactor": 1, - "legendFormat": "99%", - "refId": "C" - }, - { - "expr": "avg(tikv_engine_blob_gc_micros_seconds{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", db=\"$titan_db\", type=\"blob_gc_micros_max\"})", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_engine_blob_locate\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$titan_db\",type=\"number_blob_get\"}\n [$__rate_interval]\n)) ", "format": "time_series", + "hide": false, + "instant": false, + "interval": "", "intervalFactor": 1, - "legendFormat": "max", - "refId": "D" + "legendFormat": "get", + "metric": "", + "query": "sum(rate(\n tikv_engine_blob_locate\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$titan_db\",type=\"number_blob_get\"}\n [$__rate_interval]\n)) ", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, - "title": "Blob GC duration", + "title": "Blob get operations", "tooltip": { + "msResolution": true, "shared": true, "sort": 0, "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -35509,14 +47268,16 @@ }, "yaxes": [ { - "format": "µs", + "decimals": null, + "format": "ops", "label": null, - "logBase": 2, + "logBase": 1, "max": null, "min": null, "show": true }, { + "decimals": null, "format": "short", "label": null, "logBase": 1, @@ -35527,76 +47288,157 @@ ], "yaxis": { "align": false, - 
"alignLevel": null + "alignLevel": 0 } }, { "aliasColors": {}, "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", + "description": null, + "editable": true, + "error": false, + "fieldConfig": { + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } + }, "fill": 1, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, "gridPos": { - "h": 6, + "h": 7, "w": 12, - "x": 0, - "y": 93 + "x": 12, + "y": 28 }, - "id": 3649, + "height": null, + "hideTimeOverride": false, + "id": 340, + "interval": null, + "isNew": true, "legend": { "alignAsTable": true, "avg": false, "current": true, + "hideEmpty": true, + "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, + "sideWidth": null, + "sort": "max", + "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], - "nullPointMode": "null", - "paceLength": 10, + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", + "options": { + "alertThreshold": true, + "dataLinks": [] + }, "percentage": false, - "pointradius": 2, + "pointradius": 5, "points": false, "renderer": "flot", - "scopedVars": { - "titan_db": { - "selected": false, - "text": "kv", - "value": "kv" - } - }, + "repeat": null, + "repeatDirection": null, "seriesOverrides": [], - "spaceLength": 10, + "span": null, "stack": false, "steppedLine": false, "targets": [ { - "expr": "sum(rate(tikv_engine_blob_gc_flow_bytes{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", db=\"$titan_db\", type=~\"keys.*\"}[30s])) by (type)", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "avg((\n tikv_engine_blob_get_micros_seconds\n 
{k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$titan_db\",type=~\".*_average\"}\n \n)) by (type) ", "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{type}}", - "refId": "A" + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "avg-{{type}}", + "metric": "", + "query": "avg((\n tikv_engine_blob_get_micros_seconds\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$titan_db\",type=~\".*_average\"}\n \n)) by (type) ", + "refId": "", + "step": 10, + "target": "" + }, + { + "datasource": "${DS_TEST-CLUSTER}", + "expr": "avg((\n tikv_engine_blob_get_micros_seconds\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$titan_db\",type=~\".*_percentile95\"}\n \n)) by (type) ", + "format": "time_series", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "95%-{{type}}", + "metric": "", + "query": "avg((\n tikv_engine_blob_get_micros_seconds\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$titan_db\",type=~\".*_percentile95\"}\n \n)) by (type) ", + "refId": "", + "step": 10, + "target": "" + }, + { + "datasource": "${DS_TEST-CLUSTER}", + "expr": "avg((\n tikv_engine_blob_get_micros_seconds\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$titan_db\",type=~\".*_percentile99\"}\n \n)) by (type) ", + "format": "time_series", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "99%-{{type}}", + "metric": "", + "query": "avg((\n tikv_engine_blob_get_micros_seconds\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$titan_db\",type=~\".*_percentile99\"}\n \n)) by (type) ", + "refId": "", + "step": 10, + "target": "" + }, + { + "datasource": "${DS_TEST-CLUSTER}", + "expr": "max((\n 
tikv_engine_blob_get_micros_seconds\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$titan_db\",type=~\".*_max\"}\n \n)) by (type) ", + "format": "time_series", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "max-{{type}}", + "metric": "", + "query": "max((\n tikv_engine_blob_get_micros_seconds\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$titan_db\",type=~\".*_max\"}\n \n)) by (type) ", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, - "title": "Blob GC keys flow", + "title": "Blob get duration", "tooltip": { + "msResolution": true, "shared": true, "sort": 0, "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -35604,7 +47446,8 @@ }, "yaxes": [ { - "format": "Bps", + "decimals": null, + "format": "\u00b5s", "label": null, "logBase": 1, "max": null, @@ -35612,8 +47455,9 @@ "show": true }, { - "format": "decbytes", - "label": "", + "decimals": null, + "format": "short", + "label": null, "logBase": 1, "max": null, "min": null, @@ -35622,76 +47466,112 @@ ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } }, { "aliasColors": {}, "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", + "description": null, + "editable": true, + "error": false, + "fieldConfig": { + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } + }, "fill": 1, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, "gridPos": { - "h": 6, + "h": 7, "w": 12, - "x": 12, - "y": 93 + "x": 0, + "y": 35 }, - "id": 3340, + "height": null, + 
"hideTimeOverride": false, + "id": 341, + "interval": null, + "isNew": true, "legend": { "alignAsTable": true, "avg": false, "current": true, + "hideEmpty": true, + "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, + "sideWidth": null, + "sort": "max", + "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], - "nullPointMode": "null", - "paceLength": 10, + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", + "options": { + "alertThreshold": true, + "dataLinks": [] + }, "percentage": false, - "pointradius": 2, + "pointradius": 5, "points": false, "renderer": "flot", - "scopedVars": { - "titan_db": { - "selected": false, - "text": "kv", - "value": "kv" - } - }, + "repeat": null, + "repeatDirection": null, "seriesOverrides": [], - "spaceLength": 10, + "span": null, "stack": false, "steppedLine": false, "targets": [ { - "expr": "sum(rate(tikv_engine_blob_gc_flow_bytes{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", db=\"$titan_db\", type=~\"bytes.*\"}[30s])) by (type)", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum((\n tikv_engine_titandb_blob_file_discardable_ratio\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$titan_db\"}\n \n)) by (ratio) ", "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{type}}", - "refId": "A" + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{ratio}}", + "metric": "", + "query": "sum((\n tikv_engine_titandb_blob_file_discardable_ratio\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$titan_db\"}\n \n)) by (ratio) ", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, - "title": "Blob GC bytes flow", + "title": "Blob file discardable ratio distribution", 
"tooltip": { + "msResolution": true, "shared": true, "sort": 0, "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -35699,7 +47579,8 @@ }, "yaxes": [ { - "format": "Bps", + "decimals": null, + "format": "none", "label": null, "logBase": 1, "max": null, @@ -35707,8 +47588,9 @@ "show": true }, { - "format": "decbytes", - "label": "", + "decimals": null, + "format": "short", + "label": null, "logBase": 1, "max": null, "min": null, @@ -35717,97 +47599,142 @@ ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } }, { "aliasColors": {}, "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", + "description": null, + "editable": true, + "error": false, + "fieldConfig": { + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } + }, "fill": 1, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, "gridPos": { - "h": 6, + "h": 7, "w": 12, - "x": 0, - "y": 99 + "x": 12, + "y": 35 }, - "id": 4021, + "height": null, + "hideTimeOverride": false, + "id": 342, + "interval": null, + "isNew": true, "legend": { "alignAsTable": true, "avg": false, "current": true, + "hideEmpty": true, + "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, + "sideWidth": null, + "sort": "max", + "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], - "nullPointMode": "null", - "paceLength": 10, + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", + "options": { + "alertThreshold": true, + "dataLinks": [] + }, "percentage": false, - "pointradius": 2, + "pointradius": 5, "points": false, "renderer": "flot", - "scopedVars": { - "titan_db": { - "selected": 
false, - "text": "kv", - "value": "kv" - } - }, + "repeat": null, + "repeatDirection": null, "seriesOverrides": [], - "spaceLength": 10, + "span": null, "stack": false, "steppedLine": false, "targets": [ { - "expr": "avg(tikv_engine_blob_gc_input_file{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", db=\"$titan_db\", type=\"blob_gc_input_file_average\"})", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "avg", - "refId": "A" - }, - { - "expr": "avg(tikv_engine_blob_gc_input_file{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", db=\"$titan_db\", type=\"blob_gc_input_file_percentile95\"})", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_engine_blob_locate\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$titan_db\",type=\"number_blob_seek\"}\n [$__rate_interval]\n)) ", "format": "time_series", + "hide": false, + "instant": false, + "interval": "", "intervalFactor": 1, - "legendFormat": "95%", - "refId": "B" + "legendFormat": "seek", + "metric": "", + "query": "sum(rate(\n tikv_engine_blob_locate\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$titan_db\",type=\"number_blob_seek\"}\n [$__rate_interval]\n)) ", + "refId": "", + "step": 10, + "target": "" }, { - "expr": "avg(tikv_engine_blob_gc_input_file{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", db=\"$titan_db\", type=\"blob_gc_input_file_percentile99\"})", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_engine_blob_locate\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$titan_db\",type=\"number_blob_prev\"}\n [$__rate_interval]\n)) ", "format": "time_series", + "hide": false, + "instant": false, + "interval": "", "intervalFactor": 1, - "legendFormat": "99%", - "refId": "C" + "legendFormat": "prev", + "metric": "", + 
"query": "sum(rate(\n tikv_engine_blob_locate\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$titan_db\",type=\"number_blob_prev\"}\n [$__rate_interval]\n)) ", + "refId": "", + "step": 10, + "target": "" }, { - "expr": "max(tikv_engine_blob_gc_input_file{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", db=\"$titan_db\", type=\"blob_gc_input_file_max\"})", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_engine_blob_locate\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$titan_db\",type=\"number_blob_next\"}\n [$__rate_interval]\n)) ", "format": "time_series", + "hide": false, + "instant": false, + "interval": "", "intervalFactor": 1, - "legendFormat": "max", - "refId": "D" + "legendFormat": "next", + "metric": "", + "query": "sum(rate(\n tikv_engine_blob_locate\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$titan_db\",type=\"number_blob_next\"}\n [$__rate_interval]\n)) ", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, - "title": "Blob GC input file size", + "title": "Blob iter operations", "tooltip": { + "msResolution": true, "shared": true, "sort": 0, "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -35815,7 +47742,8 @@ }, "yaxes": [ { - "format": "decbytes", + "decimals": null, + "format": "ops", "label": null, "logBase": 1, "max": null, @@ -35823,6 +47751,7 @@ "show": true }, { + "decimals": null, "format": "short", "label": null, "logBase": 1, @@ -35833,97 +47762,157 @@ ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } }, { "aliasColors": {}, "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", + 
"description": null, + "editable": true, + "error": false, + "fieldConfig": { + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } + }, "fill": 1, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, "gridPos": { - "h": 6, + "h": 7, "w": 12, - "x": 12, - "y": 99 + "x": 0, + "y": 42 }, - "id": 4022, + "height": null, + "hideTimeOverride": false, + "id": 343, + "interval": null, + "isNew": true, "legend": { "alignAsTable": true, "avg": false, "current": true, + "hideEmpty": true, + "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, + "sideWidth": null, + "sort": "max", + "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], - "nullPointMode": "null", - "paceLength": 10, + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", + "options": { + "alertThreshold": true, + "dataLinks": [] + }, "percentage": false, - "pointradius": 2, + "pointradius": 5, "points": false, "renderer": "flot", - "scopedVars": { - "titan_db": { - "selected": false, - "text": "kv", - "value": "kv" - } - }, + "repeat": null, + "repeatDirection": null, "seriesOverrides": [], - "spaceLength": 10, + "span": null, "stack": false, "steppedLine": false, "targets": [ { - "expr": "avg(tikv_engine_blob_gc_output_file{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", db=\"$titan_db\", type=\"blob_gc_output_file_average\"})", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "avg((\n tikv_engine_blob_seek_micros_seconds\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$titan_db\",type=~\".*_average\"}\n \n)) ", "format": "time_series", - "intervalFactor": 2, + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, "legendFormat": "avg", - "refId": 
"A" + "metric": "", + "query": "avg((\n tikv_engine_blob_seek_micros_seconds\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$titan_db\",type=~\".*_average\"}\n \n)) ", + "refId": "", + "step": 10, + "target": "" }, { - "expr": "avg(tikv_engine_blob_gc_output_file{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", db=\"$titan_db\", type=\"blob_gc_output_file_percentile95\"})", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "avg((\n tikv_engine_blob_seek_micros_seconds\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$titan_db\",type=~\".*_percentile95\"}\n \n)) ", "format": "time_series", + "hide": false, + "instant": false, + "interval": "", "intervalFactor": 1, "legendFormat": "95%", - "refId": "B" + "metric": "", + "query": "avg((\n tikv_engine_blob_seek_micros_seconds\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$titan_db\",type=~\".*_percentile95\"}\n \n)) ", + "refId": "", + "step": 10, + "target": "" }, { - "expr": "avg(tikv_engine_blob_gc_output_file{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", db=\"$titan_db\", type=\"blob_gc_output_file_percentile99\"})", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "avg((\n tikv_engine_blob_seek_micros_seconds\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$titan_db\",type=~\".*_percentile99\"}\n \n)) ", "format": "time_series", + "hide": false, + "instant": false, + "interval": "", "intervalFactor": 1, "legendFormat": "99%", - "refId": "C" + "metric": "", + "query": "avg((\n tikv_engine_blob_seek_micros_seconds\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$titan_db\",type=~\".*_percentile99\"}\n \n)) ", + "refId": "", + "step": 10, + "target": "" }, { - "expr": 
"max(tikv_engine_blob_gc_outputt_file{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", db=\"$titan_db\", type=\"blob_gc_output_file_max\"})", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "max((\n tikv_engine_blob_seek_micros_seconds\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$titan_db\",type=~\".*_max\"}\n \n)) ", "format": "time_series", + "hide": false, + "instant": false, + "interval": "", "intervalFactor": 1, "legendFormat": "max", - "refId": "D" + "metric": "", + "query": "max((\n tikv_engine_blob_seek_micros_seconds\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$titan_db\",type=~\".*_max\"}\n \n)) ", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, - "title": "Blob GC output file size", + "title": "Blob seek duration", "tooltip": { + "msResolution": true, "shared": true, "sort": 0, "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -35931,7 +47920,8 @@ }, "yaxes": [ { - "format": "decbytes", + "decimals": null, + "format": "\u00b5s", "label": null, "logBase": 1, "max": null, @@ -35939,6 +47929,7 @@ "show": true }, { + "decimals": null, "format": "short", "label": null, "logBase": 1, @@ -35949,77 +47940,157 @@ ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } }, { "aliasColors": {}, "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", + "description": null, + "editable": true, + "error": false, + "fieldConfig": { + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } + }, "fill": 1, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": 
"rgba(234, 112, 112, 0.22)" + }, "gridPos": { - "h": 6, + "h": 7, "w": 12, - "x": 0, - "y": 105 + "x": 12, + "y": 42 }, - "id": 3344, + "height": null, + "hideTimeOverride": false, + "id": 344, + "interval": null, + "isNew": true, "legend": { "alignAsTable": true, "avg": false, "current": true, + "hideEmpty": true, + "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, + "sideWidth": null, + "sort": "max", + "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], - "nullPointMode": "null", - "paceLength": 10, + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", + "options": { + "alertThreshold": true, + "dataLinks": [] + }, "percentage": false, - "pointradius": 2, + "pointradius": 5, "points": false, "renderer": "flot", - "scopedVars": { - "titan_db": { - "selected": false, - "text": "kv", - "value": "kv" - } - }, + "repeat": null, + "repeatDirection": null, "seriesOverrides": [], - "spaceLength": 10, + "span": null, "stack": false, "steppedLine": false, "targets": [ { - "expr": "sum(rate(tikv_engine_blob_gc_file_count{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", db=\"$titan_db\"}[2m])) by (type)", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "avg((\n tikv_engine_blob_next_micros_seconds\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$titan_db\",type=~\".*_average\"}\n \n)) ", "format": "time_series", "hide": false, - "intervalFactor": 2, - "legendFormat": "{{type}}", - "refId": "B" + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "avg", + "metric": "", + "query": "avg((\n tikv_engine_blob_next_micros_seconds\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$titan_db\",type=~\".*_average\"}\n \n)) ", + "refId": "", + "step": 10, + "target": "" + }, + { + "datasource": "${DS_TEST-CLUSTER}", + 
"expr": "avg((\n tikv_engine_blob_next_micros_seconds\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$titan_db\",type=~\".*_percentile95\"}\n \n)) ", + "format": "time_series", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "95%", + "metric": "", + "query": "avg((\n tikv_engine_blob_next_micros_seconds\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$titan_db\",type=~\".*_percentile95\"}\n \n)) ", + "refId": "", + "step": 10, + "target": "" + }, + { + "datasource": "${DS_TEST-CLUSTER}", + "expr": "avg((\n tikv_engine_blob_next_micros_seconds\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$titan_db\",type=~\".*_percentile99\"}\n \n)) ", + "format": "time_series", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "99%", + "metric": "", + "query": "avg((\n tikv_engine_blob_next_micros_seconds\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$titan_db\",type=~\".*_percentile99\"}\n \n)) ", + "refId": "", + "step": 10, + "target": "" + }, + { + "datasource": "${DS_TEST-CLUSTER}", + "expr": "max((\n tikv_engine_blob_next_micros_seconds\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$titan_db\",type=~\".*_max\"}\n \n)) ", + "format": "time_series", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "max", + "metric": "", + "query": "max((\n tikv_engine_blob_next_micros_seconds\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$titan_db\",type=~\".*_max\"}\n \n)) ", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, - "title": "Blob GC file count", + "title": "Blob next duration", "tooltip": { + 
"msResolution": true, "shared": true, "sort": 0, "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -36027,7 +48098,8 @@ }, "yaxes": [ { - "format": "ops", + "decimals": null, + "format": "\u00b5s", "label": null, "logBase": 1, "max": null, @@ -36035,6 +48107,7 @@ "show": true }, { + "decimals": null, "format": "short", "label": null, "logBase": 1, @@ -36045,43 +48118,44 @@ ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } - } - ], - "repeat": "titan_db", - "title": "Titan - $titan_db", - "type": "row" - }, - { - "collapsed": true, - "datasource": null, - "gridPos": { - "h": 1, - "w": 24, - "x": 0, - "y": 36 - }, - "id": 2820, - "panels": [ + }, { "aliasColors": {}, "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", - "decimals": 1, + "description": null, "editable": true, "error": false, - "fill": 0, - "grid": {}, + "fieldConfig": { + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } + }, + "fill": 1, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, "gridPos": { - "h": 8, + "h": 7, "w": 12, "x": 0, - "y": 29 + "y": 49 }, - "id": 2991, + "height": null, + "hideTimeOverride": false, + "id": 345, + "interval": null, + "isNew": true, "legend": { "alignAsTable": true, "avg": false, @@ -36093,7 +48167,7 @@ "rightSide": true, "show": true, "sideWidth": null, - "sort": "current", + "sort": "max", "sortDesc": true, "total": false, "values": true @@ -36101,47 +48175,100 @@ "lines": true, "linewidth": 1, "links": [], - "nullPointMode": "null", + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", + "options": { + "alertThreshold": true, + "dataLinks": [] + }, "percentage": 
false, "pointradius": 5, "points": false, "renderer": "flot", + "repeat": null, + "repeatDirection": null, "seriesOverrides": [], - "spaceLength": 10, + "span": null, "stack": false, "steppedLine": false, "targets": [ { - "expr": "sum(rate(tikv_thread_cpu_seconds_total{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", name=~\"waiter_manager.*\"}[1m])) by (instance, name)", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "avg((\n tikv_engine_blob_prev_micros_seconds\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$titan_db\",type=~\".*_average\"}\n \n)) by (type) ", "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{instance}}-{{name}}", - "metric": "tikv_thread_cpu_seconds_total", - "refId": "A", - "step": 4 + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "avg-{{type}}", + "metric": "", + "query": "avg((\n tikv_engine_blob_prev_micros_seconds\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$titan_db\",type=~\".*_average\"}\n \n)) by (type) ", + "refId": "", + "step": 10, + "target": "" }, { - "expr": "sum(rate(tikv_thread_cpu_seconds_total{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", name=~\"deadlock_detect.*\"}[1m])) by (instance, name)", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "avg((\n tikv_engine_blob_prev_micros_seconds\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$titan_db\",type=~\".*_percentile95\"}\n \n)) by (type) ", "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{instance}}-{{name}}", - "refId": "B" + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "95%-{{type}}", + "metric": "", + "query": "avg((\n tikv_engine_blob_prev_micros_seconds\n 
{k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$titan_db\",type=~\".*_percentile95\"}\n \n)) by (type) ", + "refId": "", + "step": 10, + "target": "" + }, + { + "datasource": "${DS_TEST-CLUSTER}", + "expr": "avg((\n tikv_engine_blob_prev_micros_seconds\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$titan_db\",type=~\".*_percentile99\"}\n \n)) by (type) ", + "format": "time_series", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "99%-{{type}}", + "metric": "", + "query": "avg((\n tikv_engine_blob_prev_micros_seconds\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$titan_db\",type=~\".*_percentile99\"}\n \n)) by (type) ", + "refId": "", + "step": 10, + "target": "" + }, + { + "datasource": "${DS_TEST-CLUSTER}", + "expr": "max((\n tikv_engine_blob_prev_micros_seconds\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$titan_db\",type=~\".*_max\"}\n \n)) by (type) ", + "format": "time_series", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "max-{{type}}", + "metric": "", + "query": "max((\n tikv_engine_blob_prev_micros_seconds\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$titan_db\",type=~\".*_max\"}\n \n)) by (type) ", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, - "title": "Lock Manager Thread CPU", + "title": "Blob prev duration", "tooltip": { - "msResolution": false, + "msResolution": true, "shared": true, "sort": 0, "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -36149,7 +48276,8 @@ }, "yaxes": [ { - "format": "percentunit", + "decimals": null, + 
"format": "\u00b5s", "label": null, "logBase": 1, "max": null, @@ -36157,6 +48285,7 @@ "show": true }, { + "decimals": null, "format": "short", "label": null, "logBase": 1, @@ -36167,39 +48296,56 @@ ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } }, { "aliasColors": {}, "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", - "decimals": 1, + "description": null, "editable": true, "error": false, + "fieldConfig": { + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } + }, "fill": 1, - "grid": {}, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, "gridPos": { - "h": 8, + "h": 7, "w": 12, "x": 12, - "y": 29 + "y": 49 }, - "id": 2877, + "height": null, + "hideTimeOverride": false, + "id": 346, + "interval": null, + "isNew": true, "legend": { "alignAsTable": true, "avg": false, "current": true, - "hideEmpty": false, + "hideEmpty": true, "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, - "sort": "current", + "sort": "max", "sortDesc": true, "total": false, "values": true @@ -36207,39 +48353,55 @@ "lines": true, "linewidth": 1, "links": [], + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, "nullPointMode": "null as zero", + "options": { + "alertThreshold": true, + "dataLinks": [] + }, "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", + "repeat": null, + "repeatDirection": null, "seriesOverrides": [], - "spaceLength": 10, + "span": null, "stack": false, "steppedLine": false, "targets": [ { - "expr": "sum(rate(tikv_lock_manager_task_counter{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])) by (type)", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_engine_blob_flow_bytes\n 
{k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$titan_db\",type=~\"keys.*\"}\n [$__rate_interval]\n)) by (type) ", "format": "time_series", - "intervalFactor": 2, + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, "legendFormat": "{{type}}", - "refId": "A", - "step": 4 + "metric": "", + "query": "sum(rate(\n tikv_engine_blob_flow_bytes\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$titan_db\",type=~\"keys.*\"}\n [$__rate_interval]\n)) by (type) ", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, - "title": "Lock Manager Handled tasks", + "title": "Blob keys flow", "tooltip": { - "msResolution": false, + "msResolution": true, "shared": true, "sort": 0, "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -36247,14 +48409,16 @@ }, "yaxes": [ { - "format": "ops", + "decimals": null, + "format": "binBps", "label": null, "logBase": 1, "max": null, - "min": "0", + "min": null, "show": true }, { + "decimals": null, "format": "short", "label": null, "logBase": 1, @@ -36265,31 +48429,50 @@ ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } }, { "aliasColors": {}, "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", - "decimals": 1, + "description": null, + "editable": true, + "error": false, + "fieldConfig": { + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } + }, "fill": 1, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, "gridPos": { - "h": 8, + "h": 7, "w": 12, "x": 0, - "y": 37 + "y": 56 }, - "id": 2993, - "interval": "", + 
"height": null, + "hideTimeOverride": false, + "id": 347, + "interval": null, + "isNew": true, "legend": { "alignAsTable": true, "avg": false, "current": true, - "hideEmpty": false, - "hideZero": false, + "hideEmpty": true, + "hideZero": true, "max": true, "min": false, "rightSide": true, @@ -36303,53 +48486,100 @@ "lines": true, "linewidth": 1, "links": [], + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, "nullPointMode": "null as zero", + "options": { + "alertThreshold": true, + "dataLinks": [] + }, "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", + "repeat": null, + "repeatDirection": null, "seriesOverrides": [], - "spaceLength": 10, + "span": null, "stack": false, "steppedLine": false, "targets": [ { - "expr": "sum(rate(tikv_lock_manager_waiter_lifetime_duration_sum{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])) / sum(rate(tikv_lock_manager_waiter_lifetime_duration_count{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m]))", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "avg((\n tikv_engine_blob_file_read_micros_seconds\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$titan_db\",type=\"blob_file_read_micros_average\"}\n \n)) by (type) ", "format": "time_series", + "hide": false, + "instant": false, "interval": "", - "intervalFactor": 2, + "intervalFactor": 1, "legendFormat": "avg", - "refId": "A", - "step": 10 + "metric": "", + "query": "avg((\n tikv_engine_blob_file_read_micros_seconds\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$titan_db\",type=\"blob_file_read_micros_average\"}\n \n)) by (type) ", + "refId": "", + "step": 10, + "target": "" + }, + { + "datasource": "${DS_TEST-CLUSTER}", + "expr": "avg((\n tikv_engine_blob_file_read_micros_seconds\n 
{k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$titan_db\",type=\"blob_file_read_micros_percentile95\"}\n \n)) by (type) ", + "format": "time_series", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "95%", + "metric": "", + "query": "avg((\n tikv_engine_blob_file_read_micros_seconds\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$titan_db\",type=\"blob_file_read_micros_percentile95\"}\n \n)) by (type) ", + "refId": "", + "step": 10, + "target": "" }, { - "expr": "histogram_quantile(0.99, sum(rate(tikv_lock_manager_waiter_lifetime_duration_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])) by (le))", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "avg((\n tikv_engine_blob_file_read_micros_seconds\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$titan_db\",type=\"blob_file_read_micros_percentile99\"}\n \n)) by (type) ", "format": "time_series", + "hide": false, + "instant": false, + "interval": "", "intervalFactor": 1, "legendFormat": "99%", - "refId": "B" + "metric": "", + "query": "avg((\n tikv_engine_blob_file_read_micros_seconds\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$titan_db\",type=\"blob_file_read_micros_percentile99\"}\n \n)) by (type) ", + "refId": "", + "step": 10, + "target": "" }, { - "expr": "histogram_quantile(0.9999, sum(rate(tikv_lock_manager_waiter_lifetime_duration_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])) by (le))", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "max((\n tikv_engine_blob_file_read_micros_seconds\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$titan_db\",type=\"blob_file_read_micros_max\"}\n \n)) by (type) ", "format": "time_series", + "hide": false, + 
"instant": false, + "interval": "", "intervalFactor": 1, - "legendFormat": "99.99%", - "refId": "C" + "legendFormat": "max", + "metric": "", + "query": "max((\n tikv_engine_blob_file_read_micros_seconds\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$titan_db\",type=\"blob_file_read_micros_max\"}\n \n)) by (type) ", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, - "title": "Waiter lifetime duration", + "title": "Blob file read duration", "tooltip": { + "msResolution": true, "shared": true, "sort": 0, "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -36357,14 +48587,16 @@ }, "yaxes": [ { - "format": "s", + "decimals": null, + "format": "\u00b5s", "label": null, - "logBase": 2, + "logBase": 1, "max": null, "min": null, "show": true }, { + "decimals": null, "format": "short", "label": null, "logBase": 1, @@ -36375,39 +48607,56 @@ ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } }, { "aliasColors": {}, "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", - "decimals": 1, + "description": null, "editable": true, "error": false, + "fieldConfig": { + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } + }, "fill": 1, - "grid": {}, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, "gridPos": { - "h": 8, + "h": 7, "w": 12, "x": 12, - "y": 37 + "y": 56 }, - "id": 4018, + "height": null, + "hideTimeOverride": false, + "id": 348, + "interval": null, + "isNew": true, "legend": { "alignAsTable": true, "avg": false, "current": true, - "hideEmpty": false, + "hideEmpty": true, "hideZero": true, "max": true, 
"min": false, "rightSide": true, "show": true, "sideWidth": null, - "sort": "current", + "sort": "max", "sortDesc": true, "total": false, "values": true @@ -36415,46 +48664,55 @@ "lines": true, "linewidth": 1, "links": [], + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, "nullPointMode": "null as zero", + "options": { + "alertThreshold": true, + "dataLinks": [] + }, "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", + "repeat": null, + "repeatDirection": null, "seriesOverrides": [], - "spaceLength": 10, + "span": null, "stack": false, "steppedLine": false, "targets": [ { - "expr": "sum(max_over_time(tikv_lock_manager_wait_table_status{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[15s])) by (type)", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_engine_blob_flow_bytes\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$titan_db\",type=~\"bytes.*\"}\n [$__rate_interval]\n)) by (type) ", "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{type}}", - "refId": "A", - "step": 4 - }, - { - "expr": "sum(max_over_time(tikv_lock_wait_queue_entries_gauge_vec{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[15s])) by (type)", "hide": false, - "intervalFactor": 2, + "instant": false, + "interval": "", + "intervalFactor": 1, "legendFormat": "{{type}}", - "refId": "B" + "metric": "", + "query": "sum(rate(\n tikv_engine_blob_flow_bytes\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$titan_db\",type=~\"bytes.*\"}\n [$__rate_interval]\n)) by (type) ", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, - "title": "Lock Waiting Queue", + "title": "Blob bytes flow", "tooltip": { - "msResolution": false, + "msResolution": true, "shared": true, "sort": 0, "value_type": 
"individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -36462,14 +48720,16 @@ }, "yaxes": [ { - "format": "short", + "decimals": null, + "format": "binBps", "label": null, "logBase": 1, "max": null, - "min": "0", + "min": null, "show": true }, { + "decimals": null, "format": "short", "label": null, "logBase": 1, @@ -36480,30 +48740,50 @@ ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } }, { "aliasColors": {}, "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", - "decimals": 1, + "description": null, + "editable": true, + "error": false, + "fieldConfig": { + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } + }, "fill": 1, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, "gridPos": { - "h": 8, + "h": 7, "w": 12, "x": 0, - "y": 45 + "y": 63 }, - "id": 2995, + "height": null, + "hideTimeOverride": false, + "id": 349, + "interval": null, + "isNew": true, "legend": { "alignAsTable": true, "avg": false, "current": true, "hideEmpty": true, - "hideZero": false, + "hideZero": true, "max": true, "min": false, "rightSide": true, @@ -36517,44 +48797,100 @@ "lines": true, "linewidth": 1, "links": [], + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, "nullPointMode": "null as zero", + "options": { + "alertThreshold": true, + "dataLinks": [] + }, "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", + "repeat": null, + "repeatDirection": null, "seriesOverrides": [], - "spaceLength": 10, + "span": null, "stack": false, "steppedLine": false, "targets": [ { - "expr": "sum(rate(tikv_lock_manager_detect_duration_sum{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])) / 
sum(rate(tikv_lock_manager_detect_duration_count{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m]))", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "avg((\n tikv_engine_blob_file_write_micros_seconds\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$titan_db\",type=\"blob_file_write_micros_average\"}\n \n)) by (type) ", "format": "time_series", - "intervalFactor": 2, + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, "legendFormat": "avg", - "refId": "A" + "metric": "", + "query": "avg((\n tikv_engine_blob_file_write_micros_seconds\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$titan_db\",type=\"blob_file_write_micros_average\"}\n \n)) by (type) ", + "refId": "", + "step": 10, + "target": "" + }, + { + "datasource": "${DS_TEST-CLUSTER}", + "expr": "avg((\n tikv_engine_blob_file_write_micros_seconds\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$titan_db\",type=\"blob_file_write_micros_percentile95\"}\n \n)) by (type) ", + "format": "time_series", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "95%", + "metric": "", + "query": "avg((\n tikv_engine_blob_file_write_micros_seconds\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$titan_db\",type=\"blob_file_write_micros_percentile95\"}\n \n)) by (type) ", + "refId": "", + "step": 10, + "target": "" }, { - "expr": "histogram_quantile(0.99, sum(rate(tikv_lock_manager_detect_duration_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])) by (le))", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "avg((\n tikv_engine_blob_file_write_micros_seconds\n 
{k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$titan_db\",type=\"blob_file_write_micros_percentile99\"}\n \n)) by (type) ", "format": "time_series", - "intervalFactor": 2, + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, "legendFormat": "99%", - "refId": "B" + "metric": "", + "query": "avg((\n tikv_engine_blob_file_write_micros_seconds\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$titan_db\",type=\"blob_file_write_micros_percentile99\"}\n \n)) by (type) ", + "refId": "", + "step": 10, + "target": "" + }, + { + "datasource": "${DS_TEST-CLUSTER}", + "expr": "max((\n tikv_engine_blob_file_write_micros_seconds\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$titan_db\",type=\"blob_file_write_micros_max\"}\n \n)) by (type) ", + "format": "time_series", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "max", + "metric": "", + "query": "max((\n tikv_engine_blob_file_write_micros_seconds\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$titan_db\",type=\"blob_file_write_micros_max\"}\n \n)) by (type) ", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, - "title": "Deadlock detect duration", + "title": "Blob file write duration", "tooltip": { + "msResolution": true, "shared": true, "sort": 0, "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -36562,14 +48898,16 @@ }, "yaxes": [ { - "format": "s", + "decimals": null, + "format": "\u00b5s", "label": null, - "logBase": 2, + "logBase": 1, "max": null, "min": null, "show": true }, { + "decimals": null, "format": "short", "label": null, "logBase": 1, @@ -36580,39 +48918,56 @@ ], 
"yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } }, { "aliasColors": {}, "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", - "decimals": 1, + "description": null, "editable": true, "error": false, + "fieldConfig": { + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } + }, "fill": 1, - "grid": {}, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, "gridPos": { - "h": 8, + "h": 7, "w": 12, "x": 12, - "y": 45 + "y": 63 }, - "id": 2934, + "height": null, + "hideTimeOverride": false, + "id": 350, + "interval": null, + "isNew": true, "legend": { "alignAsTable": true, "avg": false, "current": true, "hideEmpty": true, - "hideZero": false, + "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, - "sort": "current", + "sort": "max", "sortDesc": true, "total": false, "values": true @@ -36620,39 +48975,55 @@ "lines": true, "linewidth": 1, "links": [], + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, "nullPointMode": "null as zero", + "options": { + "alertThreshold": true, + "dataLinks": [] + }, "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", + "repeat": null, + "repeatDirection": null, "seriesOverrides": [], - "spaceLength": 10, + "span": null, "stack": false, "steppedLine": false, "targets": [ { - "expr": "sum(rate(tikv_lock_manager_error_counter{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])) by (type)", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_engine_blob_file_synced\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$titan_db\"}\n [$__rate_interval]\n)) ", "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{type}}", - "refId": 
"A", - "step": 4 + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "sync", + "metric": "", + "query": "sum(rate(\n tikv_engine_blob_file_synced\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$titan_db\"}\n [$__rate_interval]\n)) ", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, - "title": "Detect error", + "title": "Blob file sync operations", "tooltip": { - "msResolution": false, + "msResolution": true, "shared": true, "sort": 0, "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -36660,14 +49031,16 @@ }, "yaxes": [ { + "decimals": null, "format": "ops", "label": null, "logBase": 1, "max": null, - "min": "0", + "min": null, "show": true }, { + "decimals": null, "format": "short", "label": null, "logBase": 1, @@ -36678,84 +49051,112 @@ ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } }, { "aliasColors": {}, - "bars": true, + "bars": false, "cacheTimeout": null, - "dashLength": 10, - "dashes": false, "datasource": "${DS_TEST-CLUSTER}", - "decimals": 0, + "description": null, "editable": true, "error": false, - "fill": 0, - "grid": {}, + "fieldConfig": { + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } + }, + "fill": 1, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, "gridPos": { - "h": 8, + "h": 7, "w": 12, "x": 0, - "y": 53 + "y": 70 }, - "id": 4019, + "height": null, + "hideTimeOverride": false, + "id": 351, + "interval": null, + "isNew": true, "legend": { "alignAsTable": true, "avg": false, "current": true, - "hideEmpty": false, - "hideZero": false, - "max": false, + "hideEmpty": true, + 
"hideZero": true, + "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, - "sort": "current", + "sort": "max", "sortDesc": true, "total": false, "values": true }, - "lines": false, + "lines": true, "linewidth": 1, "links": [], + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, "nullPointMode": "null as zero", + "options": { + "alertThreshold": true, + "dataLinks": [] + }, "percentage": false, - "pluginVersion": "6.1.6", "pointradius": 5, "points": false, "renderer": "flot", + "repeat": null, + "repeatDirection": null, "seriesOverrides": [], - "spaceLength": 10, + "span": null, "stack": false, "steppedLine": false, "targets": [ { - "expr": "sum(max_over_time(tikv_lock_manager_detector_leader_heartbeat{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[15s])) by (instance)", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_engine_blob_gc_action_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$titan_db\"}\n [$__rate_interval]\n)) by (type) ", "format": "time_series", "hide": false, "instant": false, "interval": "", - "intervalFactor": 2, - "legendFormat": "{{instance}}", - "refId": "A", - "step": 4 + "intervalFactor": 1, + "legendFormat": "{{type}}", + "metric": "", + "query": "sum(rate(\n tikv_engine_blob_gc_action_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$titan_db\"}\n [$__rate_interval]\n)) by (type) ", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, - "title": "Deadlock detector leader", + "title": "Blob GC action", "tooltip": { - "msResolution": false, + "msResolution": true, "shared": true, "sort": 0, "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -36763,57 +49164,76 
@@ }, "yaxes": [ { - "decimals": 0, + "decimals": null, "format": "none", - "label": "", + "label": null, "logBase": 1, - "max": "2", - "min": "0", - "show": false + "max": null, + "min": null, + "show": true }, { + "decimals": null, "format": "short", "label": null, "logBase": 1, "max": null, "min": null, - "show": false + "show": true } ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } }, { "aliasColors": {}, "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", + "description": null, + "editable": true, + "error": false, "fieldConfig": { - "defaults": {}, - "overrides": [] - }, + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } + }, "fill": 1, - "fillGradient": 0, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, "gridPos": { - "h": 8, + "h": 7, "w": 12, "x": 12, - "y": 65 + "y": 70 }, - "hiddenSeries": false, - "id": 23763572093, + "height": null, + "hideTimeOverride": false, + "id": 352, + "interval": null, + "isNew": true, "legend": { "alignAsTable": true, "avg": false, "current": true, + "hideEmpty": true, + "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, - "sort": "current", + "sideWidth": null, + "sort": "max", "sortDesc": true, "total": false, "values": true @@ -36821,44 +49241,100 @@ "lines": true, "linewidth": 1, "links": [], - "nullPointMode": "null", + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", "options": { - "alertThreshold": true + "alertThreshold": true, + "dataLinks": [] }, "percentage": false, - "pluginVersion": "7.5.11", "pointradius": 5, "points": false, "renderer": "flot", + "repeat": null, + "repeatDirection": null, "seriesOverrides": [], - "spaceLength": 10, + "span": null, "stack": false, "steppedLine": false, "targets": [ 
{ - "exemplar": true, - "expr": "tikv_pessimistic_lock_memory_size{tidb_cluster=\"$tidb_cluster\"}", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "avg((\n tikv_engine_blob_file_sync_micros_seconds\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$titan_db\",type=\"blob_file_sync_micros_average\"}\n \n)) by (type) ", "format": "time_series", "hide": false, + "instant": false, "interval": "", - "intervalFactor": 2, - "legendFormat": "{{instance}}", - "refId": "A" + "intervalFactor": 1, + "legendFormat": "avg", + "metric": "", + "query": "avg((\n tikv_engine_blob_file_sync_micros_seconds\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$titan_db\",type=\"blob_file_sync_micros_average\"}\n \n)) by (type) ", + "refId": "", + "step": 10, + "target": "" + }, + { + "datasource": "${DS_TEST-CLUSTER}", + "expr": "avg((\n tikv_engine_blob_file_sync_micros_seconds\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$titan_db\",type=\"blob_file_sync_micros_percentile95\"}\n \n)) by (type) ", + "format": "time_series", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "95%", + "metric": "", + "query": "avg((\n tikv_engine_blob_file_sync_micros_seconds\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$titan_db\",type=\"blob_file_sync_micros_percentile95\"}\n \n)) by (type) ", + "refId": "", + "step": 10, + "target": "" + }, + { + "datasource": "${DS_TEST-CLUSTER}", + "expr": "avg((\n tikv_engine_blob_file_sync_micros_seconds\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$titan_db\",type=\"blob_file_sync_micros_percentile99\"}\n \n)) by (type) ", + "format": "time_series", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "99%", + "metric": "", + "query": "avg((\n 
tikv_engine_blob_file_sync_micros_seconds\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$titan_db\",type=\"blob_file_sync_micros_percentile99\"}\n \n)) by (type) ", + "refId": "", + "step": 10, + "target": "" + }, + { + "datasource": "${DS_TEST-CLUSTER}", + "expr": "max((\n tikv_engine_blob_file_sync_micros_seconds\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$titan_db\",type=\"blob_file_sync_micros_max\"}\n \n)) by (type) ", + "format": "time_series", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "max", + "metric": "", + "query": "max((\n tikv_engine_blob_file_sync_micros_seconds\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$titan_db\",type=\"blob_file_sync_micros_max\"}\n \n)) by (type) ", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, - "title": "Total pessimistic locks memory size", + "title": "Blob file sync duration", "tooltip": { + "msResolution": true, "shared": true, "sort": 0, "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -36866,7 +49342,8 @@ }, "yaxes": [ { - "format": "bytes", + "decimals": null, + "format": "\u00b5s", "label": null, "logBase": 1, "max": null, @@ -36874,6 +49351,7 @@ "show": true }, { + "decimals": null, "format": "short", "label": null, "logBase": 1, @@ -36884,45 +49362,56 @@ ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } }, { "aliasColors": {}, "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", - "decimals": 1, + "description": null, "editable": true, "error": false, "fieldConfig": { - "defaults": {}, - "overrides": [] + "defaults": { + "thresholds": { + "mode": 
"absolute", + "steps": [] + } + } }, "fill": 1, - "fillGradient": 0, - "grid": {}, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, "gridPos": { - "h": 8, + "h": 7, "w": 12, "x": 0, - "y": 73 + "y": 77 }, - "hiddenSeries": false, - "id": 23763572094, + "height": null, + "hideTimeOverride": false, + "id": 353, + "interval": null, + "isNew": true, "legend": { "alignAsTable": true, "avg": false, "current": true, "hideEmpty": true, - "hideZero": false, + "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, - "sort": "current", + "sort": "max", "sortDesc": true, "total": false, "values": true @@ -36930,45 +49419,100 @@ "lines": true, "linewidth": 1, "links": [], + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, "nullPointMode": "null as zero", "options": { - "alertThreshold": true + "alertThreshold": true, + "dataLinks": [] }, "percentage": false, - "pluginVersion": "7.5.11", "pointradius": 5, "points": false, "renderer": "flot", + "repeat": null, + "repeatDirection": null, "seriesOverrides": [], - "spaceLength": 10, + "span": null, "stack": false, "steppedLine": false, "targets": [ { - "exemplar": true, - "expr": "sum(rate(tikv_in_memory_pessimistic_locking{tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])) by (result)", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "avg((\n tikv_engine_blob_gc_micros_seconds\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$titan_db\",type=\"blob_gc_micros_average\"}\n \n)) by (type) ", "format": "time_series", + "hide": false, + "instant": false, "interval": "", - "intervalFactor": 2, - "legendFormat": "{{result}}", - "refId": "A", - "step": 4 + "intervalFactor": 1, + "legendFormat": "avg", + "metric": "", + "query": "avg((\n tikv_engine_blob_gc_micros_seconds\n 
{k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$titan_db\",type=\"blob_gc_micros_average\"}\n \n)) by (type) ", + "refId": "", + "step": 10, + "target": "" + }, + { + "datasource": "${DS_TEST-CLUSTER}", + "expr": "avg((\n tikv_engine_blob_gc_micros_seconds\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$titan_db\",type=\"blob_gc_micros_percentile95\"}\n \n)) by (type) ", + "format": "time_series", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "95%", + "metric": "", + "query": "avg((\n tikv_engine_blob_gc_micros_seconds\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$titan_db\",type=\"blob_gc_micros_percentile95\"}\n \n)) by (type) ", + "refId": "", + "step": 10, + "target": "" + }, + { + "datasource": "${DS_TEST-CLUSTER}", + "expr": "avg((\n tikv_engine_blob_gc_micros_seconds\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$titan_db\",type=\"blob_gc_micros_percentile99\"}\n \n)) by (type) ", + "format": "time_series", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "99%", + "metric": "", + "query": "avg((\n tikv_engine_blob_gc_micros_seconds\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$titan_db\",type=\"blob_gc_micros_percentile99\"}\n \n)) by (type) ", + "refId": "", + "step": 10, + "target": "" + }, + { + "datasource": "${DS_TEST-CLUSTER}", + "expr": "max((\n tikv_engine_blob_gc_micros_seconds\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$titan_db\",type=\"blob_gc_micros_max\"}\n \n)) by (type) ", + "format": "time_series", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "max", + "metric": "", + "query": "max((\n tikv_engine_blob_gc_micros_seconds\n 
{k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$titan_db\",type=\"blob_gc_micros_max\"}\n \n)) by (type) ", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, - "title": "In-memory pessimistic locking result", + "title": "Blob GC duration", "tooltip": { - "msResolution": false, + "msResolution": true, "shared": true, "sort": 0, "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -36976,14 +49520,16 @@ }, "yaxes": [ { - "format": "ops", + "decimals": null, + "format": "\u00b5s", "label": null, "logBase": 1, "max": null, - "min": "0", + "min": null, "show": true }, { + "decimals": null, "format": "short", "label": null, "logBase": 1, @@ -36994,46 +49540,56 @@ ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } }, { "aliasColors": {}, "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", - "decimals": 1, - "description": "The number of active keys and waiters.", + "description": null, "editable": true, "error": false, "fieldConfig": { - "defaults": {}, - "overrides": [] + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } }, "fill": 1, - "fillGradient": 0, - "grid": {}, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, "gridPos": { - "h": 8, + "h": 7, "w": 12, "x": 12, - "y": 75 + "y": 77 }, - "hiddenSeries": false, - "id": 23763573091, + "height": null, + "hideTimeOverride": false, + "id": 354, + "interval": null, + "isNew": true, "legend": { "alignAsTable": true, "avg": false, "current": true, "hideEmpty": true, - "hideZero": false, + "hideZero": true, "max": true, "min": false, "rightSide": true, 
"show": true, "sideWidth": null, - "sort": "current", + "sort": "max", "sortDesc": true, "total": false, "values": true @@ -37041,45 +49597,55 @@ "lines": true, "linewidth": 1, "links": [], + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, "nullPointMode": "null as zero", "options": { - "alertThreshold": true + "alertThreshold": true, + "dataLinks": [] }, "percentage": false, - "pluginVersion": "7.5.11", "pointradius": 5, "points": false, "renderer": "flot", + "repeat": null, + "repeatDirection": null, "seriesOverrides": [], - "spaceLength": 10, + "span": null, "stack": false, "steppedLine": false, "targets": [ { - "exemplar": true, - "expr": "sum(tikv_lock_wait_queue_entries_gauge_vec{tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}) by (type)", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_engine_blob_gc_flow_bytes\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$titan_db\",type=~\"keys.*\"}\n [$__rate_interval]\n)) by (type) ", "format": "time_series", + "hide": false, + "instant": false, "interval": "", - "intervalFactor": 2, + "intervalFactor": 1, "legendFormat": "{{type}}", - "refId": "A", - "step": 4 + "metric": "", + "query": "sum(rate(\n tikv_engine_blob_gc_flow_bytes\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$titan_db\",type=~\"keys.*\"}\n [$__rate_interval]\n)) by (type) ", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, - "title": "Pessimistic lock activities", + "title": "Blob GC keys flow", "tooltip": { - "msResolution": false, + "msResolution": true, "shared": true, "sort": 0, "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -37087,14 +49653,16 @@ }, "yaxes": [ { - "format": "none", + "decimals": null, + "format": 
"binBps", "label": null, "logBase": 1, "max": null, - "min": "0", + "min": null, "show": true }, { + "decimals": null, "format": "short", "label": null, "logBase": 1, @@ -37105,170 +49673,157 @@ ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } }, - { - "cards": { - "cardPadding": null, - "cardRound": null - }, - "color": { - "cardColor": "#b4ff00", - "colorScale": "sqrt", - "colorScheme": "interpolateOranges", - "exponent": 0.5, - "mode": "spectrum" - }, - "dataFormat": "tsbuckets", - "datasource": "${DS_TEST-CLUSTER}", - "description": "The length includes the entering transaction itself", - "fieldConfig": { - "defaults": {}, - "overrides": [] - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 83 - }, - "heatmap": {}, - "hideZeroBuckets": true, - "highlightCards": true, - "id": 23763573092, - "legend": { - "show": false - }, - "links": [], - "pluginVersion": "7.5.11", - "reverseYBuckets": false, - "targets": [ - { - "exemplar": true, - "expr": "sum(increase(tikv_lock_wait_queue_length_bucket{tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])) by (le)", - "format": "heatmap", - "interval": "", - "intervalFactor": 2, - "legendFormat": "{{le}}", - "refId": "A", - "step": 4 - } - ], - "timeFrom": null, - "timeShift": null, - "title": "Lengths of lock wait queues when transaction enqueues", - "tooltip": { - "show": true, - "showHistogram": false - }, - "type": "heatmap", - "xAxis": { - "show": true - }, - "xBucketNumber": null, - "xBucketSize": null, - "yAxis": { - "decimals": null, - "format": "short", - "logBase": 1, - "max": null, - "min": null, - "show": true, - "splitFactor": null - }, - "yBucketBound": "auto", - "yBucketNumber": null, - "yBucketSize": null - } - ], - "title": "Pessimistic Locking", - "type": "row" - }, - { - "collapsed": true, - "datasource": null, - "gridPos": { - "h": 1, - "w": 24, - "x": 0, - "y": 37 - }, - "id": 23763573235, - "panels": [ { "aliasColors": {}, "bars": false, - "dashLength": 10, - 
"dashes": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", + "description": null, + "editable": true, + "error": false, "fieldConfig": { "defaults": { - "unit": "percentunit" - }, - "overrides": [] + "thresholds": { + "mode": "absolute", + "steps": [] + } + } }, "fill": 1, - "fillGradient": 0, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, "gridPos": { "h": 7, - "w": 6, + "w": 12, "x": 0, - "y": 45 + "y": 84 }, - "hiddenSeries": false, - "id": 23763573350, + "height": null, + "hideTimeOverride": false, + "id": 355, + "interval": null, + "isNew": true, "legend": { + "alignAsTable": true, "avg": false, - "current": false, - "max": false, + "current": true, + "hideEmpty": true, + "hideZero": true, + "max": true, "min": false, + "rightSide": true, "show": true, + "sideWidth": null, + "sort": "max", + "sortDesc": true, "total": false, - "values": false + "values": true }, "lines": true, "linewidth": 1, - "nullPointMode": "null", + "links": [], + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", "options": { - "alertThreshold": true + "alertThreshold": true, + "dataLinks": [] }, "percentage": false, - "pluginVersion": "7.5.11", - "pointradius": 2, + "pointradius": 5, "points": false, "renderer": "flot", + "repeat": null, + "repeatDirection": null, "seriesOverrides": [], - "spaceLength": 10, + "span": null, "stack": false, "steppedLine": false, "targets": [ { - "exemplar": true, - "expr": "sum(rate(tikv_thread_cpu_seconds_total{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", name=~\"sst_.*\"}[1m])) by (instance)", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "avg((\n tikv_engine_blob_gc_input_file\n 
{k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$titan_db\",type=\"blob_gc_input_file_average\"}\n \n)) ", + "format": "time_series", + "hide": false, + "instant": false, "interval": "", - "legendFormat": "{{instance}}", - "queryType": "randomWalk", - "refId": "A" + "intervalFactor": 1, + "legendFormat": "avg", + "metric": "", + "query": "avg((\n tikv_engine_blob_gc_input_file\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$titan_db\",type=\"blob_gc_input_file_average\"}\n \n)) ", + "refId": "", + "step": 10, + "target": "" + }, + { + "datasource": "${DS_TEST-CLUSTER}", + "expr": "avg((\n tikv_engine_blob_gc_input_file\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$titan_db\",type=\"blob_gc_input_file_percentile95\"}\n \n)) ", + "format": "time_series", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "95%", + "metric": "", + "query": "avg((\n tikv_engine_blob_gc_input_file\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$titan_db\",type=\"blob_gc_input_file_percentile95\"}\n \n)) ", + "refId": "", + "step": 10, + "target": "" + }, + { + "datasource": "${DS_TEST-CLUSTER}", + "expr": "avg((\n tikv_engine_blob_gc_input_file\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$titan_db\",type=\"blob_gc_input_file_percentile99\"}\n \n)) ", + "format": "time_series", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "99%", + "metric": "", + "query": "avg((\n tikv_engine_blob_gc_input_file\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$titan_db\",type=\"blob_gc_input_file_percentile99\"}\n \n)) ", + "refId": "", + "step": 10, + "target": "" }, { + "datasource": "${DS_TEST-CLUSTER}", + "expr": "max((\n 
tikv_engine_blob_gc_input_file\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$titan_db\",type=\"blob_gc_input_file_max\"}\n \n)) ", + "format": "time_series", "hide": false, - "refId": "B" + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "max", + "metric": "", + "query": "max((\n tikv_engine_blob_gc_input_file\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$titan_db\",type=\"blob_gc_input_file_max\"}\n \n)) ", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, - "title": "CPU Usage", + "title": "Blob GC input file size", "tooltip": { + "msResolution": true, "shared": true, "sort": 0, "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -37276,7 +49831,8 @@ }, "yaxes": [ { - "format": "percentunit", + "decimals": null, + "format": "bytes", "label": null, "logBase": 1, "max": null, @@ -37284,6 +49840,7 @@ "show": true }, { + "decimals": null, "format": "short", "label": null, "logBase": 1, @@ -37294,117 +49851,112 @@ ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } }, { - "aliasColors": { - "(AP)apply-99": "#88509f", - "(AP)get_permit-99": "#922870", - "(AP)queuing-99": "#9d0041", - "(DL)exec_download-99": "#73a0fe", - "(DL)queue-99": "#7d78ce", - "exec_download-99": "light-orange", - "get_permit-99": "red", - "queuing-99": "blue", - "total-99": "rgb(252, 252, 252)" - }, - "bars": true, - "dashLength": 10, - "dashes": false, + "aliasColors": {}, + "bars": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", + "description": null, + "editable": true, + "error": false, "fieldConfig": { - "defaults": {}, - "overrides": [] + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } + }, + "fill": 1, + 
"fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" }, - "fill": 0, - "fillGradient": 0, "gridPos": { "h": 7, - "w": 9, - "x": 6, - "y": 45 + "w": 12, + "x": 12, + "y": 84 }, - "hiddenSeries": false, - "id": 23763573351, + "height": null, + "hideTimeOverride": false, + "id": 356, + "interval": null, + "isNew": true, "legend": { "alignAsTable": true, "avg": false, "current": true, + "hideEmpty": true, + "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, + "sideWidth": null, + "sort": "max", + "sortDesc": true, "total": false, "values": true }, - "lines": false, + "lines": true, "linewidth": 1, "links": [], - "nullPointMode": "null", + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", "options": { - "alertThreshold": true + "alertThreshold": true, + "dataLinks": [] }, "percentage": false, - "pluginVersion": "7.5.11", - "pointradius": 2, + "pointradius": 5, "points": false, "renderer": "flot", - "seriesOverrides": [ - { - "alias": "total-99", - "bars": false, - "fill": 2, - "lines": true, - "linewidth": 0, - "stack": false, - "yaxis": 2 - } - ], - "spaceLength": 10, - "stack": true, + "repeat": null, + "repeatDirection": null, + "seriesOverrides": [], + "span": null, + "stack": false, "steppedLine": false, "targets": [ { - "exemplar": true, - "expr": "histogram_quantile(0.99, avg(rate(tikv_import_rpc_duration_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", request=\"apply\"}[1m])) by (le, request))", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_engine_blob_gc_flow_bytes\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$titan_db\",type=~\"bytes.*\"}\n [$__rate_interval]\n)) by (type) ", "format": "time_series", "hide": false, + "instant": false, "interval": "", 
"intervalFactor": 1, - "legendFormat": "total-99", - "refId": "A" - }, - { - "exemplar": true, - "expr": "histogram_quantile(0.99, avg(rate(tikv_import_apply_duration_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", type=~\"queue|exec_download\"}[1m])) by (le, type))", - "hide": false, - "interval": "", - "legendFormat": "(DL){{type}}-99", - "refId": "C" - }, - { - "exemplar": true, - "expr": "histogram_quantile(0.99, avg(rate(tikv_import_engine_request_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])) by (le, type))", - "hide": false, - "interval": "", - "legendFormat": "(AP){{type}}-99", - "refId": "B" + "legendFormat": "{{type}}", + "metric": "", + "query": "sum(rate(\n tikv_engine_blob_gc_flow_bytes\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$titan_db\",type=~\"bytes.*\"}\n [$__rate_interval]\n)) by (type) ", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, - "title": "P99 RPC Duration", + "title": "Blob GC bytes flow", "tooltip": { + "msResolution": true, "shared": true, "sort": 0, "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -37412,7 +49964,8 @@ }, "yaxes": [ { - "format": "s", + "decimals": null, + "format": "binBps", "label": null, "logBase": 1, "max": null, @@ -37420,7 +49973,8 @@ "show": true }, { - "format": "s", + "decimals": null, + "format": "short", "label": null, "logBase": 1, "max": null, @@ -37430,41 +49984,56 @@ ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } }, { "aliasColors": {}, "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", - "decimals": 1, - "description": "", + "description": null, + "editable": true, + 
"error": false, "fieldConfig": { - "defaults": {}, - "overrides": [] + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } }, "fill": 1, - "fillGradient": 0, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, "gridPos": { "h": 7, - "w": 9, - "x": 15, - "y": 45 + "w": 12, + "x": 0, + "y": 91 }, - "hiddenSeries": false, - "id": 23763573352, + "height": null, + "hideTimeOverride": false, + "id": 357, + "interval": null, + "isNew": true, "legend": { "alignAsTable": true, "avg": false, "current": true, + "hideEmpty": true, + "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, - "sort": "current", + "sort": "max", "sortDesc": true, "total": false, "values": true @@ -37472,56 +50041,100 @@ "lines": true, "linewidth": 1, "links": [], - "nullPointMode": "null", + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", "options": { - "alertThreshold": true + "alertThreshold": true, + "dataLinks": [] }, "percentage": false, - "pluginVersion": "7.5.11", "pointradius": 5, "points": false, "renderer": "flot", + "repeat": null, + "repeatDirection": null, "seriesOverrides": [], - "spaceLength": 10, + "span": null, "stack": false, "steppedLine": false, "targets": [ { - "exemplar": true, - "expr": "sum(rate(tikv_import_rpc_duration_count{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", request=\"apply\"}[$__rate_interval])) by (instance, request)", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "avg((\n tikv_engine_blob_gc_output_file\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$titan_db\",type=\"blob_gc_output_file_average\"}\n \n)) ", "format": "time_series", "hide": false, + "instant": false, "interval": "", "intervalFactor": 1, - "legendFormat": 
"{{instance}} :: {{request}}", - "metric": "tikv_grpc_msg_duration_seconds_bucket", - "refId": "A", - "step": 10 + "legendFormat": "avg", + "metric": "", + "query": "avg((\n tikv_engine_blob_gc_output_file\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$titan_db\",type=\"blob_gc_output_file_average\"}\n \n)) ", + "refId": "", + "step": 10, + "target": "" }, { - "expr": "sum(rate(tikv_import_rpc_duration_count{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", request!=\"switch_mode\"}[30s])) by (request)", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "avg((\n tikv_engine_blob_gc_output_file\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$titan_db\",type=\"blob_gc_output_file_percentile95\"}\n \n)) ", "format": "time_series", - "hide": true, + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "95%", + "metric": "", + "query": "avg((\n tikv_engine_blob_gc_output_file\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$titan_db\",type=\"blob_gc_output_file_percentile95\"}\n \n)) ", + "refId": "", + "step": 10, + "target": "" + }, + { + "datasource": "${DS_TEST-CLUSTER}", + "expr": "avg((\n tikv_engine_blob_gc_output_file\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$titan_db\",type=\"blob_gc_output_file_percentile99\"}\n \n)) ", + "format": "time_series", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "99%", + "metric": "", + "query": "avg((\n tikv_engine_blob_gc_output_file\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$titan_db\",type=\"blob_gc_output_file_percentile99\"}\n \n)) ", + "refId": "", + "step": 10, + "target": "" + }, + { + "datasource": "${DS_TEST-CLUSTER}", + "expr": "max((\n 
tikv_engine_blob_gc_output_file\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$titan_db\",type=\"blob_gc_output_file_max\"}\n \n)) ", + "format": "time_series", + "hide": false, + "instant": false, + "interval": "", "intervalFactor": 1, - "legendFormat": "total - {{request}}", - "metric": "tikv_grpc_msg_duration_seconds_bucket", - "refId": "B", - "step": 10 + "legendFormat": "max", + "metric": "", + "query": "max((\n tikv_engine_blob_gc_output_file\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$titan_db\",type=\"blob_gc_output_file_max\"}\n \n)) ", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, - "title": "Import RPC Ops", + "title": "Blob GC output file size", "tooltip": { + "msResolution": true, "shared": true, "sort": 0, "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -37529,7 +50142,8 @@ }, "yaxes": [ { - "format": "ops", + "decimals": null, + "format": "bytes", "label": null, "logBase": 1, "max": null, @@ -37537,6 +50151,7 @@ "show": true }, { + "decimals": null, "format": "short", "label": null, "logBase": 1, @@ -37547,78 +50162,112 @@ ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } }, { "aliasColors": {}, "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", + "description": null, + "editable": true, + "error": false, "fieldConfig": { "defaults": { - "unit": "cps" - }, - "overrides": [] + "thresholds": { + "mode": "absolute", + "steps": [] + } + } }, "fill": 1, - "fillGradient": 0, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, "gridPos": { - "h": 8, - "w": 
6, - "x": 0, - "y": 52 + "h": 7, + "w": 12, + "x": 12, + "y": 91 }, - "hiddenSeries": false, - "id": 23763573032, + "height": null, + "hideTimeOverride": false, + "id": 358, + "interval": null, + "isNew": true, "legend": { + "alignAsTable": true, "avg": false, - "current": false, - "max": false, + "current": true, + "hideEmpty": true, + "hideZero": true, + "max": true, "min": false, + "rightSide": true, "show": true, + "sideWidth": null, + "sort": "max", + "sortDesc": true, "total": false, - "values": false + "values": true }, "lines": true, "linewidth": 1, - "nullPointMode": "null", + "links": [], + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", "options": { - "alertThreshold": true + "alertThreshold": true, + "dataLinks": [] }, "percentage": false, - "pluginVersion": "7.5.11", - "pointradius": 2, + "pointradius": 5, "points": false, "renderer": "flot", + "repeat": null, + "repeatDirection": null, "seriesOverrides": [], - "spaceLength": 10, + "span": null, "stack": false, "steppedLine": false, "targets": [ { - "exemplar": true, - "expr": "sum(rate(tikv_import_apply_cache_event{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[$__rate_interval])) by (instance, type)", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_engine_blob_gc_file_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$titan_db\"}\n [$__rate_interval]\n)) by (type) ", + "format": "time_series", + "hide": false, + "instant": false, "interval": "", - "legendFormat": "{{instance}} :: {{type}}", - "queryType": "randomWalk", - "refId": "A" + "intervalFactor": 1, + "legendFormat": "{{type}}", + "metric": "", + "query": "sum(rate(\n tikv_engine_blob_gc_file_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",db=\"$titan_db\"}\n [$__rate_interval]\n)) by (type) ", + "refId": "", + "step": 10, + "target": "" } ], 
"thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, - "title": "Cache Events", + "title": "Blob GC file count", "tooltip": { + "msResolution": true, "shared": true, "sort": 0, "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -37626,7 +50275,8 @@ }, "yaxes": [ { - "format": "cps", + "decimals": null, + "format": "ops", "label": null, "logBase": 1, "max": null, @@ -37634,6 +50284,7 @@ "show": true }, { + "decimals": null, "format": "short", "label": null, "logBase": 1, @@ -37644,285 +50295,169 @@ ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } - }, + } + ], + "repeat": "titan_db", + "repeatDirection": null, + "span": null, + "targets": [], + "timeFrom": null, + "timeShift": null, + "title": "Titan - $titan_db", + "transformations": [], + "transparent": false, + "type": "row" + }, + { + "cacheTimeout": null, + "collapsed": true, + "datasource": null, + "description": null, + "editable": true, + "error": false, + "fieldConfig": { + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } + }, + "gridPos": { + "h": 7, + "w": 24, + "x": 0, + "y": 0 + }, + "height": null, + "hideTimeOverride": false, + "id": 359, + "interval": null, + "links": [], + "maxDataPoints": 100, + "maxPerRow": null, + "minSpan": null, + "panels": [ { - "cards": { - "cardPadding": null, - "cardRound": 2 - }, - "color": { - "cardColor": "#b4ff00", - "colorScale": "sqrt", - "colorScheme": "interpolateSpectral", - "exponent": 0.5, - "mode": "spectrum" - }, - "dataFormat": "tsbuckets", + "aliasColors": {}, + "bars": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", + "description": null, + "editable": true, + "error": false, "fieldConfig": { - "defaults": {}, - "overrides": [] + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } + }, + "fill": 1, + "fillGradient": 1, + 
"grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" }, "gridPos": { - "h": 8, - "w": 6, - "x": 6, - "y": 52 + "h": 7, + "w": 12, + "x": 0, + "y": 0 }, - "heatmap": {}, - "hideZeroBuckets": true, - "highlightCards": true, - "id": 23763573348, - "legend": { - "show": false - }, - "pluginVersion": "7.5.11", - "reverseYBuckets": false, - "targets": [ - { - "exemplar": true, - "expr": "sum(increase(tikv_import_rpc_duration_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", request=\"apply\"}[$__rate_interval])) by (le)", - "format": "heatmap", - "interval": "", - "intervalFactor": 2, - "legendFormat": "{{le}}", - "queryType": "randomWalk", - "refId": "A" - } - ], - "title": "Overall RPC Duration", - "tooltip": { - "show": true, - "showHistogram": true - }, - "type": "heatmap", - "xAxis": { - "show": true - }, - "xBucketNumber": null, - "xBucketSize": null, - "yAxis": { - "decimals": null, - "format": "s", - "logBase": 1, - "max": null, - "min": null, - "show": true, - "splitFactor": null - }, - "yBucketBound": "auto", - "yBucketNumber": null, - "yBucketSize": null - }, - { - "cards": { - "cardPadding": null, - "cardRound": 2 - }, - "color": { - "cardColor": "#b4ff00", - "colorScale": "sqrt", - "colorScheme": "interpolateSpectral", - "exponent": 0.5, - "mode": "spectrum" - }, - "dataFormat": "tsbuckets", - "datasource": "${DS_TEST-CLUSTER}", - "fieldConfig": { - "defaults": {}, - "overrides": [] - }, - "gridPos": { - "h": 8, - "w": 6, - "x": 12, - "y": 52 - }, - "heatmap": {}, - "hideZeroBuckets": true, - "highlightCards": true, - "id": 23763573558, - "legend": { - "show": false - }, - "pluginVersion": "7.5.11", - "reverseYBuckets": false, - "targets": [ - { - "exemplar": true, - "expr": "sum(increase(tikv_import_apply_duration_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", 
type=\"exec_download\"}[$__rate_interval])) by (le)", - "format": "heatmap", - "interval": "", - "intervalFactor": 2, - "legendFormat": "{{le}}", - "queryType": "randomWalk", - "refId": "A" - } - ], - "title": "Read File into Memory Duration", - "tooltip": { - "show": true, - "showHistogram": true - }, - "type": "heatmap", - "xAxis": { - "show": true - }, - "xBucketNumber": null, - "xBucketSize": null, - "yAxis": { - "decimals": null, - "format": "s", - "logBase": 1, - "max": null, - "min": null, - "show": true, - "splitFactor": null - }, - "yBucketBound": "auto", - "yBucketNumber": null, - "yBucketSize": null - }, - { - "cards": { - "cardPadding": null, - "cardRound": 2 - }, - "color": { - "cardColor": "#37872D", - "colorScale": "sqrt", - "colorScheme": "interpolateSpectral", - "exponent": 0.5, - "mode": "spectrum" - }, - "dataFormat": "tsbuckets", - "datasource": "${DS_TEST-CLUSTER}", - "fieldConfig": { - "defaults": {}, - "overrides": [] - }, - "gridPos": { - "h": 8, - "w": 6, - "x": 18, - "y": 52 - }, - "heatmap": {}, - "hideZeroBuckets": true, - "highlightCards": true, - "id": 23763573229, - "legend": { - "show": false - }, - "pluginVersion": "7.5.11", - "reverseYBuckets": false, - "targets": [ - { - "exemplar": true, - "expr": "sum(increase(tikv_import_engine_request_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", type=\"queuing\"}[$__rate_interval])) by (le)", - "format": "heatmap", - "interval": "", - "intervalFactor": 2, - "legendFormat": "{{le}}", - "queryType": "randomWalk", - "refId": "A" - } - ], - "title": "Queuing Time", - "tooltip": { - "show": true, - "showHistogram": true - }, - "type": "heatmap", - "xAxis": { - "show": true - }, - "xBucketNumber": null, - "xBucketSize": null, - "yAxis": { - "decimals": null, - "format": "s", - "logBase": 1, - "max": null, - "min": null, - "show": true, - "splitFactor": null - }, - "yBucketBound": "auto", - "yBucketNumber": null, - "yBucketSize": null - }, - { - 
"aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${DS_TEST-CLUSTER}", - "fieldConfig": { - "defaults": { - "unit": "bytes" - }, - "overrides": [] - }, - "fill": 1, - "fillGradient": 0, - "gridPos": { - "h": 6, - "w": 6, - "x": 0, - "y": 60 - }, - "hiddenSeries": false, - "id": 23763573349, + "height": null, + "hideTimeOverride": false, + "id": 360, + "interval": null, + "isNew": true, "legend": { + "alignAsTable": true, "avg": false, - "current": false, - "max": false, + "current": true, + "hideEmpty": true, + "hideZero": true, + "max": true, "min": false, + "rightSide": true, "show": true, + "sideWidth": null, + "sort": "max", + "sortDesc": true, "total": false, - "values": false + "values": true }, "lines": true, "linewidth": 1, - "nullPointMode": "null", + "links": [], + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", "options": { - "alertThreshold": true + "alertThreshold": true, + "dataLinks": [] }, "percentage": false, - "pluginVersion": "7.5.11", - "pointradius": 2, + "pointradius": 5, "points": false, "renderer": "flot", + "repeat": null, + "repeatDirection": null, "seriesOverrides": [], - "spaceLength": 10, + "span": null, "stack": false, "steppedLine": false, "targets": [ { - "exemplar": true, - "expr": "rate(tikv_import_apply_bytes_sum{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[$__rate_interval])", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_thread_cpu_seconds_total\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",name=~\"waiter_manager.*\"}\n [$__rate_interval]\n)) by (instance, name) ", + "format": "time_series", + "hide": false, + "instant": false, "interval": "", - "legendFormat": "{{instance}}", - "queryType": "randomWalk", - "refId": "A" + "intervalFactor": 1, + "legendFormat": "{{instance}}-{{name}}", + "metric": "", + "query": "sum(rate(\n 
tikv_thread_cpu_seconds_total\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",name=~\"waiter_manager.*\"}\n [$__rate_interval]\n)) by (instance, name) ", + "refId": "", + "step": 10, + "target": "" + }, + { + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_thread_cpu_seconds_total\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",name=~\"deadlock_detect.*\"}\n [$__rate_interval]\n)) by (instance, name) ", + "format": "time_series", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{instance}}-{{name}}", + "metric": "", + "query": "sum(rate(\n tikv_thread_cpu_seconds_total\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",name=~\"deadlock_detect.*\"}\n [$__rate_interval]\n)) by (instance, name) ", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, - "title": "Apply Request Throughput", + "title": "Lock Manager Thread CPU", "tooltip": { + "msResolution": true, "shared": true, "sort": 0, "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -37930,7 +50465,8 @@ }, "yaxes": [ { - "format": "bytes", + "decimals": null, + "format": "percentunit", "label": null, "logBase": 1, "max": null, @@ -37938,6 +50474,7 @@ "show": true }, { + "decimals": null, "format": "short", "label": null, "logBase": 1, @@ -37948,287 +50485,112 @@ ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } }, - { - "cards": { - "cardPadding": null, - "cardRound": 2 - }, - "color": { - "cardColor": "#b4ff00", - "colorScale": "sqrt", - "colorScheme": "interpolateBlues", - "exponent": 0.5, - "mode": "spectrum" - }, - "dataFormat": "tsbuckets", - "datasource": "${DS_TEST-CLUSTER}", - "fieldConfig": { - "defaults": {}, - 
"overrides": [] - }, - "gridPos": { - "h": 8, - "w": 6, - "x": 6, - "y": 60 - }, - "heatmap": {}, - "hideZeroBuckets": true, - "highlightCards": true, - "id": 23763573344, - "legend": { - "show": false - }, - "pluginVersion": "7.5.11", - "reverseYBuckets": false, - "targets": [ - { - "exemplar": true, - "expr": "sum(increase(tikv_import_download_bytes_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[$__rate_interval])) by (le)", - "format": "heatmap", - "interval": "", - "intervalFactor": 2, - "legendFormat": "{{le}}", - "queryType": "randomWalk", - "refId": "A" - } - ], - "title": "Downloaded File Size", - "tooltip": { - "show": true, - "showHistogram": true - }, - "type": "heatmap", - "xAxis": { - "show": true - }, - "xBucketNumber": null, - "xBucketSize": null, - "yAxis": { - "decimals": null, - "format": "decbytes", - "logBase": 1, - "max": null, - "min": null, - "show": true, - "splitFactor": null - }, - "yBucketBound": "auto", - "yBucketNumber": null, - "yBucketSize": null - }, - { - "cards": { - "cardPadding": null, - "cardRound": 2 - }, - "color": { - "cardColor": "#b4ff00", - "colorScale": "sqrt", - "colorScheme": "interpolatePurples", - "exponent": 0.5, - "mode": "spectrum" - }, - "dataFormat": "tsbuckets", - "datasource": "${DS_TEST-CLUSTER}", - "fieldConfig": { - "defaults": {}, - "overrides": [] - }, - "gridPos": { - "h": 8, - "w": 6, - "x": 12, - "y": 60 - }, - "heatmap": {}, - "hideZeroBuckets": true, - "highlightCards": true, - "id": 23763573233, - "legend": { - "show": false - }, - "pluginVersion": "7.5.11", - "reverseYBuckets": false, - "targets": [ - { - "exemplar": true, - "expr": "sum(increase(tikv_import_apply_bytes_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[$__rate_interval])) by (le)", - "format": "heatmap", - "interval": "", - "intervalFactor": 2, - "legendFormat": "{{le}}", - "queryType": "randomWalk", - "refId": "A" - } - ], - "title": "Apply 
Batch Size", - "tooltip": { - "show": true, - "showHistogram": true - }, - "tooltipDecimals": null, - "type": "heatmap", - "xAxis": { - "show": true - }, - "xBucketNumber": null, - "xBucketSize": null, - "yAxis": { - "decimals": null, - "format": "decbytes", - "logBase": 1, - "max": null, - "min": null, - "show": true, - "splitFactor": null - }, - "yBucketBound": "auto", - "yBucketNumber": null, - "yBucketSize": null - }, - { - "cards": { - "cardPadding": null, - "cardRound": 2 - }, - "color": { - "cardColor": "#b4ff00", - "colorScale": "sqrt", - "colorScheme": "interpolateSpectral", - "exponent": 0.5, - "mode": "spectrum" - }, - "dataFormat": "tsbuckets", - "datasource": "${DS_TEST-CLUSTER}", - "fieldConfig": { - "defaults": {}, - "overrides": [] - }, - "gridPos": { - "h": 8, - "w": 6, - "x": 18, - "y": 60 - }, - "heatmap": {}, - "hideZeroBuckets": true, - "highlightCards": true, - "id": 23763573230, - "legend": { - "show": false - }, - "pluginVersion": "7.5.11", - "reverseYBuckets": false, - "targets": [ - { - "exemplar": true, - "expr": "sum(increase(tikv_import_engine_request_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", type=\"get_permit\"}[$__rate_interval])) by (le)", - "format": "heatmap", - "interval": "", - "intervalFactor": 2, - "legendFormat": "{{le}}", - "queryType": "randomWalk", - "refId": "A" - } - ], - "title": "Blocked by Concurrency Time", - "tooltip": { - "show": true, - "showHistogram": true - }, - "type": "heatmap", - "xAxis": { - "show": true - }, - "xBucketNumber": null, - "xBucketSize": null, - "yAxis": { - "decimals": null, - "format": "s", - "logBase": 1, - "max": null, - "min": null, - "show": true, - "splitFactor": null - }, - "yBucketBound": "auto", - "yBucketNumber": null, - "yBucketSize": null - }, { "aliasColors": {}, "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", + "description": null, + "editable": true, + "error": 
false, "fieldConfig": { "defaults": { - "unit": "ops" - }, - "overrides": [] + "thresholds": { + "mode": "absolute", + "steps": [] + } + } }, "fill": 1, - "fillGradient": 0, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, "gridPos": { - "h": 5, - "w": 6, - "x": 0, - "y": 66 + "h": 7, + "w": 12, + "x": 12, + "y": 0 }, - "hiddenSeries": false, - "id": 23763573118, + "height": null, + "hideTimeOverride": false, + "id": 361, + "interval": null, + "isNew": true, "legend": { + "alignAsTable": true, "avg": false, - "current": false, - "max": false, + "current": true, + "hideEmpty": true, + "hideZero": true, + "max": true, "min": false, + "rightSide": true, "show": true, + "sideWidth": null, + "sort": "max", + "sortDesc": true, "total": false, - "values": false + "values": true }, "lines": true, "linewidth": 1, - "nullPointMode": "null", + "links": [], + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", "options": { - "alertThreshold": true + "alertThreshold": true, + "dataLinks": [] }, "percentage": false, - "pluginVersion": "7.5.11", - "pointradius": 2, + "pointradius": 5, "points": false, "renderer": "flot", + "repeat": null, + "repeatDirection": null, "seriesOverrides": [], - "spaceLength": 10, + "span": null, "stack": false, "steppedLine": false, "targets": [ { - "exemplar": true, - "expr": "rate(tikv_import_applier_event{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", type=\"begin_req\"}[$__rate_interval])", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_lock_manager_task_counter\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (type) ", + "format": "time_series", "hide": false, + "instant": false, "interval": "", - "legendFormat": "{{instance}} :: {{type}}", - 
"queryType": "randomWalk", - "refId": "A" + "intervalFactor": 1, + "legendFormat": "{{type}}", + "metric": "", + "query": "sum(rate(\n tikv_lock_manager_task_counter\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (type) ", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, - "title": "Apply Request Speed", + "title": "Lock Manager Handled tasks", "tooltip": { + "msResolution": true, "shared": true, "sort": 0, "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -38236,6 +50598,7 @@ }, "yaxes": [ { + "decimals": null, "format": "ops", "label": null, "logBase": 1, @@ -38244,6 +50607,7 @@ "show": true }, { + "decimals": null, "format": "short", "label": null, "logBase": 1, @@ -38254,78 +50618,180 @@ ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } }, { "aliasColors": {}, "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", + "description": "", + "editable": true, + "error": false, "fieldConfig": { "defaults": { - "unit": "decbytes" - }, - "overrides": [] + "thresholds": { + "mode": "absolute", + "steps": [] + } + } }, "fill": 1, - "fillGradient": 0, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, "gridPos": { - "h": 8, - "w": 6, - "x": 6, - "y": 68 + "h": 7, + "w": 12, + "x": 0, + "y": 7 }, - "hiddenSeries": false, - "id": 23763573346, + "height": null, + "hideTimeOverride": false, + "id": 362, + "interval": null, + "isNew": true, "legend": { + "alignAsTable": true, "avg": false, - "current": false, - "max": false, + "current": true, + "hideEmpty": true, + "hideZero": true, + "max": true, "min": 
false, + "rightSide": true, "show": true, + "sideWidth": null, + "sort": "max", + "sortDesc": true, "total": false, - "values": false + "values": true }, "lines": true, "linewidth": 1, - "nullPointMode": "null", + "links": [], + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", "options": { - "alertThreshold": true + "alertThreshold": true, + "dataLinks": [] }, "percentage": false, - "pluginVersion": "7.5.11", - "pointradius": 2, + "pointradius": 5, "points": false, "renderer": "flot", - "seriesOverrides": [], - "spaceLength": 10, + "repeat": null, + "repeatDirection": null, + "seriesOverrides": [ + { + "alias": "count", + "bars": false, + "dashLength": 1, + "dashes": true, + "fill": 2, + "fillBelowTo": null, + "lines": true, + "spaceLength": 1, + "transform": "negative-Y", + "yaxis": 2, + "zindex": -3 + }, + { + "alias": "avg", + "bars": false, + "fill": 7, + "fillBelowTo": null, + "lines": true, + "yaxis": 1, + "zindex": 0 + } + ], + "span": null, "stack": false, "steppedLine": false, "targets": [ { - "exemplar": true, - "expr": "tikv_import_apply_cached_bytes{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "histogram_quantile(0.9999,(\n sum(rate(\n tikv_lock_manager_waiter_lifetime_duration_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (le) \n \n \n)) ", + "format": "time_series", + "hide": false, + "instant": false, "interval": "", - "legendFormat": "{{instance}}", - "queryType": "randomWalk", - "refId": "A" + "intervalFactor": 1, + "legendFormat": "99.99%", + "metric": "", + "query": "histogram_quantile(0.9999,(\n sum(rate(\n tikv_lock_manager_waiter_lifetime_duration_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (le) \n \n \n)) ", + "refId": "", + "step": 10, + "target": "" 
+ }, + { + "datasource": "${DS_TEST-CLUSTER}", + "expr": "histogram_quantile(0.99,(\n sum(rate(\n tikv_lock_manager_waiter_lifetime_duration_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (le) \n \n \n)) ", + "format": "time_series", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "99%", + "metric": "", + "query": "histogram_quantile(0.99,(\n sum(rate(\n tikv_lock_manager_waiter_lifetime_duration_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (le) \n \n \n)) ", + "refId": "", + "step": 10, + "target": "" + }, + { + "datasource": "${DS_TEST-CLUSTER}", + "expr": "(sum(rate(\n tikv_lock_manager_waiter_lifetime_duration_sum\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) / sum(rate(\n tikv_lock_manager_waiter_lifetime_duration_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) )", + "format": "time_series", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "avg", + "metric": "", + "query": "(sum(rate(\n tikv_lock_manager_waiter_lifetime_duration_sum\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) / sum(rate(\n tikv_lock_manager_waiter_lifetime_duration_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) )", + "refId": "", + "step": 10, + "target": "" + }, + { + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_lock_manager_waiter_lifetime_duration_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) ", + "format": "time_series", + "hide": true, + "instant": false, + "interval": "", + 
"intervalFactor": 1, + "legendFormat": "count", + "metric": "", + "query": "sum(rate(\n tikv_lock_manager_waiter_lifetime_duration_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) ", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, - "title": "Cached File in Memory", + "title": "Waiter lifetime duration", "tooltip": { + "msResolution": true, "shared": true, "sort": 0, "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -38333,14 +50799,16 @@ }, "yaxes": [ { - "format": "decbytes", + "decimals": null, + "format": "s", "label": null, - "logBase": 1, + "logBase": 2, "max": null, "min": null, "show": true }, { + "decimals": null, "format": "short", "label": null, "logBase": 1, @@ -38351,76 +50819,127 @@ ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } }, { "aliasColors": {}, - "bars": true, - "dashLength": 10, - "dashes": false, + "bars": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", + "description": null, + "editable": true, + "error": false, "fieldConfig": { - "defaults": {}, - "overrides": [] + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } }, "fill": 1, - "fillGradient": 0, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, "gridPos": { - "h": 8, - "w": 6, + "h": 7, + "w": 12, "x": 12, - "y": 68 + "y": 7 }, - "hiddenSeries": false, - "id": 23763573119, + "height": null, + "hideTimeOverride": false, + "id": 363, + "interval": null, + "isNew": true, "legend": { + "alignAsTable": true, "avg": false, - "current": false, - "max": false, + "current": true, + "hideEmpty": true, + "hideZero": true, + "max": true, 
"min": false, + "rightSide": true, "show": true, - "total": false, - "values": false - }, - "lines": false, + "sideWidth": null, + "sort": "max", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, "linewidth": 1, - "nullPointMode": "null", + "links": [], + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", "options": { - "alertThreshold": true + "alertThreshold": true, + "dataLinks": [] }, "percentage": false, - "pluginVersion": "7.5.11", - "pointradius": 2, + "pointradius": 5, "points": false, "renderer": "flot", + "repeat": null, + "repeatDirection": null, "seriesOverrides": [], - "spaceLength": 10, - "stack": true, + "span": null, + "stack": false, "steppedLine": false, "targets": [ { - "exemplar": true, - "expr": "increase(tikv_import_applier_event{instance=~\"$instance\", type!=\"begin_req\"}[$__rate_interval])", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(max_over_time(\n tikv_lock_manager_wait_table_status\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [30s]\n)) by (type) ", + "format": "time_series", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{type}}", + "metric": "", + "query": "sum(max_over_time(\n tikv_lock_manager_wait_table_status\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [30s]\n)) by (type) ", + "refId": "", + "step": 10, + "target": "" + }, + { + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(max_over_time(\n tikv_lock_wait_queue_entries_gauge_vec\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [30s]\n)) by (type) ", + "format": "time_series", + "hide": false, + "instant": false, "interval": "", - "intervalFactor": 3, - "legendFormat": "{{instance}} :: {{type}}", - "refId": "A" + "intervalFactor": 1, + "legendFormat": "{{type}}", + "metric": "", + "query": "sum(max_over_time(\n 
tikv_lock_wait_queue_entries_gauge_vec\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [30s]\n)) by (type) ", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, - "title": "Engine Requests Unfinished", + "title": "Lock Waiting Queue", "tooltip": { + "msResolution": true, "shared": true, "sort": 0, "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -38428,6 +50947,7 @@ }, "yaxes": [ { + "decimals": null, "format": "short", "label": null, "logBase": 1, @@ -38436,6 +50956,7 @@ "show": true }, { + "decimals": null, "format": "short", "label": null, "logBase": 1, @@ -38446,149 +50967,180 @@ ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } }, - { - "cards": { - "cardPadding": null, - "cardRound": 2 - }, - "color": { - "cardColor": "#b4ff00", - "colorScale": "sqrt", - "colorScheme": "interpolateSpectral", - "exponent": 0.5, - "mode": "spectrum" - }, - "dataFormat": "tsbuckets", - "datasource": "${DS_TEST-CLUSTER}", - "fieldConfig": { - "defaults": {}, - "overrides": [] - }, - "gridPos": { - "h": 8, - "w": 6, - "x": 18, - "y": 68 - }, - "heatmap": {}, - "hideZeroBuckets": true, - "highlightCards": true, - "id": 23763573231, - "legend": { - "show": false - }, - "pluginVersion": "7.5.11", - "reverseYBuckets": false, - "targets": [ - { - "exemplar": true, - "expr": "sum(increase(tikv_import_engine_request_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", type=\"apply\"}[$__rate_interval])) by (le)", - "format": "heatmap", - "interval": "", - "intervalFactor": 2, - "legendFormat": "{{le}}", - "queryType": "randomWalk", - "refId": "A" - } - ], - "title": "Apply Time", - "tooltip": { - "show": true, - "showHistogram": true - }, - "type": "heatmap", - "xAxis": { - "show": true - }, 
- "xBucketNumber": null, - "xBucketSize": null, - "yAxis": { - "decimals": null, - "format": "s", - "logBase": 1, - "max": null, - "min": null, - "show": true, - "splitFactor": null - }, - "yBucketBound": "auto", - "yBucketNumber": null, - "yBucketSize": null - }, { "aliasColors": {}, "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", "description": "", + "editable": true, + "error": false, "fieldConfig": { "defaults": { - "unit": "bytes" - }, - "overrides": [] + "thresholds": { + "mode": "absolute", + "steps": [] + } + } }, "fill": 1, - "fillGradient": 0, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, "gridPos": { - "h": 5, - "w": 6, + "h": 7, + "w": 12, "x": 0, - "y": 71 + "y": 14 }, - "hiddenSeries": false, - "id": 23763573449, + "height": null, + "hideTimeOverride": false, + "id": 364, + "interval": null, + "isNew": true, "legend": { + "alignAsTable": true, "avg": false, - "current": false, - "max": false, + "current": true, + "hideEmpty": true, + "hideZero": true, + "max": true, "min": false, + "rightSide": true, "show": true, + "sideWidth": null, + "sort": "max", + "sortDesc": true, "total": false, - "values": false + "values": true }, "lines": true, "linewidth": 1, - "nullPointMode": "null", + "links": [], + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", "options": { - "alertThreshold": true + "alertThreshold": true, + "dataLinks": [] }, "percentage": false, - "pluginVersion": "7.5.11", - "pointradius": 2, + "pointradius": 5, "points": false, "renderer": "flot", - "seriesOverrides": [], - "spaceLength": 10, + "repeat": null, + "repeatDirection": null, + "seriesOverrides": [ + { + "alias": "count", + "bars": false, + "dashLength": 1, + "dashes": true, + "fill": 2, + "fillBelowTo": null, + "lines": true, + 
"spaceLength": 1, + "transform": "negative-Y", + "yaxis": 2, + "zindex": -3 + }, + { + "alias": "avg", + "bars": false, + "fill": 7, + "fillBelowTo": null, + "lines": true, + "yaxis": 1, + "zindex": 0 + } + ], + "span": null, "stack": false, "steppedLine": false, "targets": [ { - "exemplar": true, - "expr": "sum(tikv_server_mem_trace_sum{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", name=~\"raftstore-.*\"}) by (instance)", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "histogram_quantile(0.9999,(\n sum(rate(\n tikv_lock_manager_detect_duration_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (le) \n \n \n)) ", + "format": "time_series", "hide": false, + "instant": false, "interval": "", - "legendFormat": "{{instance}}", - "queryType": "randomWalk", - "refId": "A" + "intervalFactor": 1, + "legendFormat": "99.99%", + "metric": "", + "query": "histogram_quantile(0.9999,(\n sum(rate(\n tikv_lock_manager_detect_duration_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (le) \n \n \n)) ", + "refId": "", + "step": 10, + "target": "" + }, + { + "datasource": "${DS_TEST-CLUSTER}", + "expr": "histogram_quantile(0.99,(\n sum(rate(\n tikv_lock_manager_detect_duration_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (le) \n \n \n)) ", + "format": "time_series", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "99%", + "metric": "", + "query": "histogram_quantile(0.99,(\n sum(rate(\n tikv_lock_manager_detect_duration_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (le) \n \n \n)) ", + "refId": "", + "step": 10, + "target": "" + }, + { + "datasource": "${DS_TEST-CLUSTER}", + "expr": "(sum(rate(\n 
tikv_lock_manager_detect_duration_sum\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) / sum(rate(\n tikv_lock_manager_detect_duration_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) )", + "format": "time_series", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "avg", + "metric": "", + "query": "(sum(rate(\n tikv_lock_manager_detect_duration_sum\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) / sum(rate(\n tikv_lock_manager_detect_duration_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) )", + "refId": "", + "step": 10, + "target": "" + }, + { + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_lock_manager_detect_duration_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) ", + "format": "time_series", + "hide": true, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "count", + "metric": "", + "query": "sum(rate(\n tikv_lock_manager_detect_duration_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) ", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, - "title": "Raft Store Memory Usage", + "title": "Deadlock detect duration", "tooltip": { + "msResolution": true, "shared": true, "sort": 0, "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -38596,14 +51148,16 @@ }, "yaxes": [ { - "format": "bytes", + "decimals": null, + "format": "s", "label": null, - "logBase": 1, + "logBase": 2, 
"max": null, "min": null, "show": true }, { + "decimals": null, "format": "short", "label": null, "logBase": 1, @@ -38614,59 +51168,56 @@ ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } - } - ], - "title": "Point In Time Restore", - "type": "row" - }, - { - "collapsed": true, - "datasource": null, - "gridPos": { - "h": 1, - "w": 24, - "x": 0, - "y": 38 - }, - "id": 8389, - "panels": [ + }, { "aliasColors": {}, "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", - "decimals": 1, - "description": " \tThe CPU utilization of resolved ts worker", + "description": null, "editable": true, "error": false, "fieldConfig": { - "defaults": {}, - "overrides": [] + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } + }, + "fill": 1, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" }, - "fill": 0, - "fillGradient": 0, - "grid": {}, "gridPos": { - "h": 8, - "w": 8, - "x": 0, - "y": 39 + "h": 7, + "w": 12, + "x": 12, + "y": 14 }, - "hiddenSeries": false, - "id": 8385, + "height": null, + "hideTimeOverride": false, + "id": 365, + "interval": null, + "isNew": true, "legend": { "alignAsTable": true, "avg": false, "current": true, + "hideEmpty": true, + "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, - "sort": "current", + "sort": "max", "sortDesc": true, "total": false, "values": true @@ -38674,44 +51225,55 @@ "lines": true, "linewidth": 1, "links": [], - "nullPointMode": "null", + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", "options": { - "alertThreshold": true + "alertThreshold": true, + "dataLinks": [] }, "percentage": false, - "pluginVersion": "7.5.11", "pointradius": 5, "points": false, "renderer": "flot", + "repeat": null, + "repeatDirection": 
null, "seriesOverrides": [], - "spaceLength": 10, + "span": null, "stack": false, "steppedLine": false, "targets": [ { - "expr": "sum(rate(tikv_thread_cpu_seconds_total{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", name=~\"resolved_ts.*\"}[1m])) by (instance)", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_lock_manager_error_counter\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (type) ", "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{instance}}", - "metric": "tikv_thread_cpu_seconds_total", - "refId": "A", - "step": 4 + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{type}}", + "metric": "", + "query": "sum(rate(\n tikv_lock_manager_error_counter\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (type) ", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, - "title": "Resolved TS Worker CPU", + "title": "Detect error", "tooltip": { - "msResolution": false, + "msResolution": true, "shared": true, "sort": 0, "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -38719,14 +51281,16 @@ }, "yaxes": [ { - "format": "percentunit", + "decimals": null, + "format": "ops", "label": null, "logBase": 1, "max": null, - "min": "0", + "min": null, "show": true }, { + "decimals": null, "format": "short", "label": null, "logBase": 1, @@ -38737,44 +51301,56 @@ ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } }, { "aliasColors": {}, "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", - "decimals": 1, - "description": " \tThe CPU utilization of advance 
ts worker", + "description": null, "editable": true, "error": false, "fieldConfig": { - "defaults": {}, - "overrides": [] + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } + }, + "fill": 1, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" }, - "fill": 0, - "fillGradient": 0, - "grid": {}, "gridPos": { - "h": 8, - "w": 8, - "x": 8, - "y": 39 + "h": 7, + "w": 12, + "x": 0, + "y": 21 }, - "hiddenSeries": false, - "id": 9162, + "height": null, + "hideTimeOverride": false, + "id": 366, + "interval": null, + "isNew": true, "legend": { "alignAsTable": true, "avg": false, "current": true, + "hideEmpty": true, + "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, - "sort": "current", + "sort": "max", "sortDesc": true, "total": false, "values": true @@ -38782,44 +51358,55 @@ "lines": true, "linewidth": 1, "links": [], - "nullPointMode": "null", + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", "options": { - "alertThreshold": true + "alertThreshold": true, + "dataLinks": [] }, "percentage": false, - "pluginVersion": "7.5.11", "pointradius": 5, "points": false, "renderer": "flot", + "repeat": null, + "repeatDirection": null, "seriesOverrides": [], - "spaceLength": 10, + "span": null, "stack": false, "steppedLine": false, "targets": [ { - "expr": "sum(rate(tikv_thread_cpu_seconds_total{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", name=~\"advance_ts.*\"}[1m])) by (instance)", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(max_over_time(\n tikv_lock_manager_detector_leader_heartbeat\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [30s]\n)) by (instance) ", "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{instance}}-tso", - 
"metric": "tikv_thread_cpu_seconds_total", - "refId": "A", - "step": 4 + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{instance}}", + "metric": "", + "query": "sum(max_over_time(\n tikv_lock_manager_detector_leader_heartbeat\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [30s]\n)) by (instance) ", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, - "title": "Advance ts Worker CPU", + "title": "Deadlock detector leader", "tooltip": { - "msResolution": false, + "msResolution": true, "shared": true, "sort": 0, "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -38827,14 +51414,16 @@ }, "yaxes": [ { - "format": "percentunit", + "decimals": null, + "format": "none", "label": null, "logBase": 1, "max": null, - "min": "0", + "min": null, "show": true }, { + "decimals": null, "format": "short", "label": null, "logBase": 1, @@ -38845,44 +51434,56 @@ ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } }, { "aliasColors": {}, "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", - "decimals": 1, - "description": " \tThe CPU utilization of scan lock worker", + "description": null, "editable": true, "error": false, "fieldConfig": { - "defaults": {}, - "overrides": [] + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } + }, + "fill": 1, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" }, - "fill": 0, - "fillGradient": 0, - "grid": {}, "gridPos": { - "h": 8, - "w": 8, - "x": 16, - "y": 39 + "h": 7, + "w": 12, + "x": 12, + "y": 21 }, - "hiddenSeries": false, - "id": 
9164, + "height": null, + "hideTimeOverride": false, + "id": 367, + "interval": null, + "isNew": true, "legend": { "alignAsTable": true, "avg": false, "current": true, + "hideEmpty": true, + "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, - "sort": "current", + "sort": "max", "sortDesc": true, "total": false, "values": true @@ -38890,44 +51491,55 @@ "lines": true, "linewidth": 1, "links": [], - "nullPointMode": "null", + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", "options": { - "alertThreshold": true + "alertThreshold": true, + "dataLinks": [] }, "percentage": false, - "pluginVersion": "7.5.11", "pointradius": 5, "points": false, "renderer": "flot", + "repeat": null, + "repeatDirection": null, "seriesOverrides": [], - "spaceLength": 10, + "span": null, "stack": false, "steppedLine": false, "targets": [ { - "expr": "sum(rate(tikv_thread_cpu_seconds_total{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", name=~\"inc_scan.*\"}[1m])) by (instance)", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "((\n tikv_pessimistic_lock_memory_size\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n \n)) ", "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{instance}}-scan", - "metric": "tikv_thread_cpu_seconds_total", - "refId": "A", - "step": 4 + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{instance}}", + "metric": "", + "query": "((\n tikv_pessimistic_lock_memory_size\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n \n)) ", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, - "title": "Scan lock Worker CPU", + "title": "Total pessimistic locks memory size", "tooltip": { - "msResolution": false, + "msResolution": true, 
"shared": true, "sort": 0, "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -38935,14 +51547,16 @@ }, "yaxes": [ { - "format": "percentunit", + "decimals": null, + "format": "bytes", "label": null, "logBase": 1, "max": null, - "min": "0", + "min": null, "show": true }, { + "decimals": null, "format": "short", "label": null, "logBase": 1, @@ -38953,43 +51567,56 @@ ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } }, { "aliasColors": {}, "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", - "description": "The gap between resolved ts (the maximum candidate of safe-ts) and current time.", + "description": null, "editable": true, "error": false, "fieldConfig": { - "defaults": {}, - "overrides": [] + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } }, "fill": 1, - "fillGradient": 0, - "grid": {}, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, "gridPos": { - "h": 8, + "h": 7, "w": 12, "x": 0, - "y": 47 + "y": 28 }, - "hiddenSeries": false, - "id": 8387, + "height": null, + "hideTimeOverride": false, + "id": 368, + "interval": null, + "isNew": true, "legend": { "alignAsTable": true, "avg": false, "current": true, "hideEmpty": true, + "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, - "sort": "current", + "sideWidth": null, + "sort": "max", "sortDesc": true, "total": false, "values": true @@ -38997,43 +51624,55 @@ "lines": true, "linewidth": 1, "links": [], + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, "nullPointMode": "null as zero", "options": { - "alertThreshold": true + "alertThreshold": true, + "dataLinks": [] }, "percentage": false, - "pluginVersion": 
"7.5.11", "pointradius": 5, "points": false, "renderer": "flot", + "repeat": null, + "repeatDirection": null, "seriesOverrides": [], - "spaceLength": 10, + "span": null, "stack": false, "steppedLine": false, "targets": [ { - "expr": "sum(tikv_resolved_ts_min_resolved_ts_gap_millis{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}) by (instance)", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_in_memory_pessimistic_locking\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (result) ", "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{instance}}", - "refId": "A", - "step": 60 + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{result}}", + "metric": "", + "query": "sum(rate(\n tikv_in_memory_pessimistic_locking\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (result) ", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, - "title": "Max gap of resolved-ts", + "title": "In-memory pessimistic locking result", "tooltip": { - "msResolution": false, + "msResolution": true, "shared": true, "sort": 0, - "value_type": "cumulative" + "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -39041,7 +51680,8 @@ }, "yaxes": [ { - "format": "ms", + "decimals": null, + "format": "ops", "label": null, "logBase": 1, "max": null, @@ -39049,6 +51689,7 @@ "show": true }, { + "decimals": null, "format": "short", "label": null, "logBase": 1, @@ -39059,43 +51700,56 @@ ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } }, { "aliasColors": {}, "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, "datasource": 
"${DS_TEST-CLUSTER}", - "description": "The gap between safe ts and current time", + "description": "The number of active keys and waiters.", "editable": true, "error": false, "fieldConfig": { - "defaults": {}, - "overrides": [] + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } }, "fill": 1, - "fillGradient": 0, - "grid": {}, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, "gridPos": { - "h": 8, + "h": 7, "w": 12, "x": 12, - "y": 47 + "y": 28 }, - "hiddenSeries": false, - "id": 23763573805, + "height": null, + "hideTimeOverride": false, + "id": 369, + "interval": null, + "isNew": true, "legend": { "alignAsTable": true, "avg": false, "current": true, "hideEmpty": true, + "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, - "sort": "current", + "sideWidth": null, + "sort": "max", "sortDesc": true, "total": false, "values": true @@ -39103,45 +51757,55 @@ "lines": true, "linewidth": 1, "links": [], + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, "nullPointMode": "null as zero", "options": { - "alertThreshold": true + "alertThreshold": true, + "dataLinks": [] }, "percentage": false, - "pluginVersion": "7.5.11", "pointradius": 5, "points": false, "renderer": "flot", + "repeat": null, + "repeatDirection": null, "seriesOverrides": [], - "spaceLength": 10, + "span": null, "stack": false, "steppedLine": false, "targets": [ { - "exemplar": true, - "expr": "sum(tikv_resolved_ts_min_safe_ts_gap_millis{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}) by (instance)", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum((\n tikv_lock_wait_queue_entries_gauge_vec\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n \n)) by (type) ", "format": "time_series", + "hide": false, + "instant": false, "interval": 
"", - "intervalFactor": 2, - "legendFormat": "{{instance}}", - "refId": "A", - "step": 60 + "intervalFactor": 1, + "legendFormat": "{{type}}", + "metric": "", + "query": "sum((\n tikv_lock_wait_queue_entries_gauge_vec\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n \n)) by (type) ", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, - "title": "Max gap of safe-ts", + "title": "Pessimistic lock activities", "tooltip": { - "msResolution": false, + "msResolution": true, "shared": true, "sort": 0, - "value_type": "cumulative" + "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -39149,7 +51813,8 @@ }, "yaxes": [ { - "format": "ms", + "decimals": null, + "format": "none", "label": null, "logBase": 1, "max": null, @@ -39157,6 +51822,7 @@ "show": true }, { + "decimals": null, "format": "short", "label": null, "logBase": 1, @@ -39167,159 +51833,160 @@ ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } }, { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, + "cards": { + "cardPadding": null, + "cardRound": null + }, + "color": { + "cardColor": "#b4ff00", + "colorScale": "sqrt", + "colorScheme": "interpolateSpectral", + "exponent": 0.5, + "max": null, + "min": null, + "mode": "spectrum" + }, + "dataFormat": "tsbuckets", "datasource": "${DS_TEST-CLUSTER}", - "decimals": 1, - "description": "The region that has minimal resolved ts", + "description": "The length includes the entering transaction itself", "editable": true, "error": false, "fieldConfig": { - "defaults": {}, - "overrides": [] + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } }, - "fill": 0, - "fillGradient": 0, - "grid": {}, "gridPos": { - "h": 8, + "h": 7, "w": 12, "x": 0, - "y": 55 
+ "y": 35 }, - "hiddenSeries": false, - "id": 23763572078, + "heatmap": {}, + "height": null, + "hideTimeOverride": false, + "hideZeroBuckets": true, + "highlightCards": true, + "id": 370, + "interval": null, "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "max": true, - "min": false, - "rightSide": true, - "show": true, - "sideWidth": null, - "sort": "current", - "sortDesc": true, - "total": false, - "values": true + "show": false }, - "lines": true, - "linewidth": 1, "links": [], - "nullPointMode": "null", - "options": { - "alertThreshold": true - }, - "percentage": false, - "pluginVersion": "7.5.11", - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - { - "alias": "total", - "lines": false - } - ], - "spaceLength": 10, - "stack": false, - "steppedLine": false, + "maxDataPoints": 512, + "maxPerRow": null, + "minSpan": null, + "options": {}, + "repeat": null, + "repeatDirection": null, + "reverseYBuckets": false, + "span": null, "targets": [ { - "exemplar": true, - "expr": "sum(tikv_resolved_ts_min_resolved_ts_region{tidb_cluster=~\"$tidb_cluster.*\", instance=~\"$instance\"}) by (instance)", - "format": "time_series", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_lock_wait_queue_length_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (le) ", + "format": "heatmap", "hide": false, + "instant": false, "interval": "", "intervalFactor": 1, - "legendFormat": "{{type}}", - "refId": "A", - "step": 10 + "legendFormat": "{{le}}", + "metric": "", + "query": "sum(rate(\n tikv_lock_wait_queue_length_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (le) ", + "refId": "", + "step": 10, + "target": "" } ], - "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, - "title": "Min Resolved TS Region", + "title": "Lengths of lock wait 
queues when transaction enqueues", "tooltip": { - "msResolution": false, + "msResolution": true, "shared": true, + "showHistogram": true, "sort": 0, - "value_type": "cumulative" + "value_type": "individual" }, - "type": "graph", - "xaxis": { - "buckets": null, + "transformations": [], + "transparent": false, + "type": "heatmap", + "xAxis": { "mode": "time", "name": null, "show": true, "values": [] }, - "yaxes": [ - { - "format": "none", - "label": null, - "logBase": 1, - "max": null, - "min": "0", - "show": true - }, - { - "format": "none", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } + "xBucketNumber": null, + "xBucketSize": null, + "yAxis": { + "decimals": 1, + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + "yBucketBound": "upper", + "yBucketNumber": null, + "yBucketSize": null }, { "aliasColors": {}, "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", - "decimals": 1, - "description": "The region that has minimal safe ts", + "description": "The duration scan in-memory pessimistic locks with read lock", "editable": true, "error": false, "fieldConfig": { - "defaults": {}, - "overrides": [] + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } + }, + "fill": 1, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" }, - "fill": 0, - "fillGradient": 0, - "grid": {}, "gridPos": { - "h": 8, + "h": 7, "w": 12, "x": 12, - "y": 55 + "y": 35 }, - "hiddenSeries": false, - "id": 23763573804, + "height": null, + "hideTimeOverride": false, + "id": 371, + "interval": null, + "isNew": true, "legend": { "alignAsTable": true, "avg": false, "current": true, + "hideEmpty": true, + "hideZero": true, "max": true, "min": false, 
"rightSide": true, "show": true, "sideWidth": null, - "sort": "current", + "sort": "max", "sortDesc": true, "total": false, "values": true @@ -39327,51 +51994,123 @@ "lines": true, "linewidth": 1, "links": [], - "nullPointMode": "null", + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", "options": { - "alertThreshold": true + "alertThreshold": true, + "dataLinks": [] }, "percentage": false, - "pluginVersion": "7.5.11", "pointradius": 5, "points": false, "renderer": "flot", + "repeat": null, + "repeatDirection": null, "seriesOverrides": [ { - "alias": "total", - "lines": false + "alias": "count", + "bars": false, + "dashLength": 1, + "dashes": true, + "fill": 2, + "fillBelowTo": null, + "lines": true, + "spaceLength": 1, + "transform": "negative-Y", + "yaxis": 2, + "zindex": -3 + }, + { + "alias": "avg", + "bars": false, + "fill": 7, + "fillBelowTo": null, + "lines": true, + "yaxis": 1, + "zindex": 0 } ], - "spaceLength": 10, + "span": null, "stack": false, "steppedLine": false, "targets": [ { - "exemplar": true, - "expr": "sum(tikv_resolved_ts_min_safe_ts_region{tidb_cluster=~\"$tidb_cluster.*\", instance=~\"$instance\"}) by (instance)", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "histogram_quantile(0.9999,(\n sum(rate(\n tikv_storage_mvcc_scan_lock_read_duration_seconds_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (type, le) \n \n \n)) ", "format": "time_series", "hide": false, + "instant": false, "interval": "", "intervalFactor": 1, - "legendFormat": "{{type}}", - "refId": "A", - "step": 10 + "legendFormat": "99.99%-{{type}}", + "metric": "", + "query": "histogram_quantile(0.9999,(\n sum(rate(\n tikv_storage_mvcc_scan_lock_read_duration_seconds_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (type, le) \n \n \n)) ", + "refId": "", + "step": 10, + "target": "" 
+ }, + { + "datasource": "${DS_TEST-CLUSTER}", + "expr": "histogram_quantile(0.99,(\n sum(rate(\n tikv_storage_mvcc_scan_lock_read_duration_seconds_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (type, le) \n \n \n)) ", + "format": "time_series", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "99%-{{type}}", + "metric": "", + "query": "histogram_quantile(0.99,(\n sum(rate(\n tikv_storage_mvcc_scan_lock_read_duration_seconds_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (type, le) \n \n \n)) ", + "refId": "", + "step": 10, + "target": "" + }, + { + "datasource": "${DS_TEST-CLUSTER}", + "expr": "(sum(rate(\n tikv_storage_mvcc_scan_lock_read_duration_seconds_sum\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (type) / sum(rate(\n tikv_storage_mvcc_scan_lock_read_duration_seconds_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (type) )", + "format": "time_series", + "hide": true, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "avg-{{type}}", + "metric": "", + "query": "(sum(rate(\n tikv_storage_mvcc_scan_lock_read_duration_seconds_sum\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (type) / sum(rate(\n tikv_storage_mvcc_scan_lock_read_duration_seconds_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (type) )", + "refId": "", + "step": 10, + "target": "" + }, + { + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_storage_mvcc_scan_lock_read_duration_seconds_count\n 
{k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (type) ", + "format": "time_series", + "hide": true, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "count-{{type}}", + "metric": "", + "query": "sum(rate(\n tikv_storage_mvcc_scan_lock_read_duration_seconds_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (type) ", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, - "title": "Min Safe TS Region", + "title": "In-memory scan lock read duration", "tooltip": { - "msResolution": false, + "msResolution": true, "shared": true, "sort": 0, - "value_type": "cumulative" + "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -39379,15 +52118,17 @@ }, "yaxes": [ { - "format": "none", + "decimals": null, + "format": "s", "label": null, - "logBase": 1, + "logBase": 2, "max": null, - "min": "0", + "min": null, "show": true }, { - "format": "none", + "decimals": null, + "format": "short", "label": null, "logBase": 1, "max": null, @@ -39397,114 +52138,98 @@ ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } - }, - { - "cards": { - "cardPadding": null, - "cardRound": null - }, - "color": { - "cardColor": "#b4ff00", - "colorScale": "sqrt", - "colorScheme": "interpolateSpectral", - "exponent": 0.5, - "mode": "spectrum" - }, - "dataFormat": "tsbuckets", - "datasource": "${DS_TEST-CLUSTER}", - "description": "The time consumed when handle a check leader request", - "fieldConfig": { - "defaults": {}, - "overrides": [] - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 63 - }, - "heatmap": {}, - "hideZeroBuckets": true, - "highlightCards": true, - "id": 9168, - "legend": { - "show": false - }, - "links": 
[], - "reverseYBuckets": false, - "targets": [ - { - "expr": "sum(delta(tikv_resolved_ts_check_leader_duration_seconds_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])) by (le)", - "format": "heatmap", - "intervalFactor": 2, - "legendFormat": "{{le}}", - "metric": "", - "refId": "A", - "step": 4 - } - ], - "timeFrom": null, - "timeShift": null, - "title": "Check leader duration", - "tooltip": { - "show": true, - "showHistogram": false - }, - "type": "heatmap", - "xAxis": { - "show": true - }, - "xBucketNumber": null, - "xBucketSize": null, - "yAxis": { - "decimals": 0, - "format": "s", - "logBase": 1, - "max": null, - "min": null, - "show": true, - "splitFactor": null - }, - "yBucketBound": "upper", - "yBucketNumber": null, - "yBucketSize": null - }, + } + ], + "repeat": null, + "repeatDirection": null, + "span": null, + "targets": [], + "timeFrom": null, + "timeShift": null, + "title": "Pessimistic Locking", + "transformations": [], + "transparent": false, + "type": "row" + }, + { + "cacheTimeout": null, + "collapsed": true, + "datasource": null, + "description": null, + "editable": true, + "error": false, + "fieldConfig": { + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } + }, + "gridPos": { + "h": 7, + "w": 24, + "x": 0, + "y": 0 + }, + "height": null, + "hideTimeOverride": false, + "id": 372, + "interval": null, + "links": [], + "maxDataPoints": 100, + "maxPerRow": null, + "minSpan": null, + "panels": [ { "aliasColors": {}, "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", - "description": "The gap between resolved ts of leaders and current time", + "description": null, "editable": true, "error": false, "fieldConfig": { - "defaults": {}, - "overrides": [] + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } }, "fill": 1, - "fillGradient": 0, - "grid": {}, + "fillGradient": 1, + "grid": { + 
"threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, "gridPos": { - "h": 8, + "h": 7, "w": 12, - "x": 12, - "y": 63 + "x": 0, + "y": 0 }, - "hiddenSeries": false, - "id": 23763572077, + "height": null, + "hideTimeOverride": false, + "id": 373, + "interval": null, + "isNew": true, "legend": { "alignAsTable": true, "avg": false, "current": true, "hideEmpty": true, + "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, - "sort": "current", + "sideWidth": null, + "sort": "max", "sortDesc": true, "total": false, "values": true @@ -39512,45 +52237,55 @@ "lines": true, "linewidth": 1, "links": [], + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, "nullPointMode": "null as zero", "options": { - "alertThreshold": true + "alertThreshold": true, + "dataLinks": [] }, "percentage": false, - "pluginVersion": "7.5.11", "pointradius": 5, "points": false, "renderer": "flot", + "repeat": null, + "repeatDirection": null, "seriesOverrides": [], - "spaceLength": 10, + "span": null, "stack": false, "steppedLine": false, "targets": [ { - "exemplar": true, - "expr": "sum(tikv_resolved_ts_min_leader_resolved_ts_gap_millis{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}) by (instance)", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_thread_cpu_seconds_total\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",name=~\"sst_.*\"}\n [$__rate_interval]\n)) by (instance) ", "format": "time_series", + "hide": false, + "instant": false, "interval": "", - "intervalFactor": 2, + "intervalFactor": 1, "legendFormat": "{{instance}}", - "refId": "A", - "step": 60 + "metric": "", + "query": "sum(rate(\n tikv_thread_cpu_seconds_total\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",name=~\"sst_.*\"}\n [$__rate_interval]\n)) by (instance) ", + 
"refId": "", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, - "title": "Max gap of resolved-ts in region leaders", + "title": "CPU Usage", "tooltip": { - "msResolution": false, + "msResolution": true, "shared": true, "sort": 0, - "value_type": "cumulative" + "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -39558,7 +52293,8 @@ }, "yaxes": [ { - "format": "ms", + "decimals": null, + "format": "percentunit", "label": null, "logBase": 1, "max": null, @@ -39566,6 +52302,7 @@ "show": true }, { + "decimals": null, "format": "short", "label": null, "logBase": 1, @@ -39576,43 +52313,56 @@ ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } }, { "aliasColors": {}, "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", - "description": "Bucketed histogram of region count in a check leader request", + "description": null, "editable": true, "error": false, "fieldConfig": { - "defaults": {}, - "overrides": [] + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } }, "fill": 1, - "fillGradient": 0, - "grid": {}, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, "gridPos": { - "h": 8, + "h": 7, "w": 12, - "x": 0, - "y": 71 + "x": 12, + "y": 0 }, - "hiddenSeries": false, - "id": 12308, + "height": null, + "hideTimeOverride": false, + "id": 374, + "interval": null, + "isNew": true, "legend": { "alignAsTable": true, "avg": false, "current": true, "hideEmpty": true, + "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, - "sort": "current", + "sideWidth": null, + "sort": "max", "sortDesc": true, "total": false, "values": true @@ -39620,45 +52370,85 
@@ "lines": true, "linewidth": 1, "links": [], + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, "nullPointMode": "null as zero", "options": { - "alertThreshold": true + "alertThreshold": true, + "dataLinks": [] }, "percentage": false, - "pluginVersion": "7.5.11", "pointradius": 5, "points": false, "renderer": "flot", + "repeat": null, + "repeatDirection": null, "seriesOverrides": [], - "spaceLength": 10, + "span": null, "stack": false, "steppedLine": false, "targets": [ { - "expr": "histogram_quantile(0.99, sum(rate(tikv_check_leader_request_item_count_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])) by (le, instance))", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "histogram_quantile(0.99,(\n sum(rate(\n tikv_import_rpc_duration_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",request=\"apply\"}\n [$__rate_interval]\n)) by (le) \n \n \n)) ", "format": "time_series", "hide": false, - "intervalFactor": 2, - "legendFormat": "{{instance}}", - "metric": "tikv_snapshot_size_bucket", - "refId": "A", - "step": 40 + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "total-99", + "metric": "", + "query": "histogram_quantile(0.99,(\n sum(rate(\n tikv_import_rpc_duration_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",request=\"apply\"}\n [$__rate_interval]\n)) by (le) \n \n \n)) ", + "refId": "", + "step": 10, + "target": "" + }, + { + "datasource": "${DS_TEST-CLUSTER}", + "expr": "histogram_quantile(0.99,(\n sum(rate(\n tikv_import_apply_duration_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",type=~\"queue|exec_download\"}\n [$__rate_interval]\n)) by (type, le) \n \n \n)) ", + "format": "time_series", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "(DL){{type}}-99", + "metric": "", + "query": 
"histogram_quantile(0.99,(\n sum(rate(\n tikv_import_apply_duration_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",type=~\"queue|exec_download\"}\n [$__rate_interval]\n)) by (type, le) \n \n \n)) ", + "refId": "", + "step": 10, + "target": "" + }, + { + "datasource": "${DS_TEST-CLUSTER}", + "expr": "histogram_quantile(0.99,(\n sum(rate(\n tikv_import_engine_request_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (type, le) \n \n \n)) ", + "format": "time_series", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "(AP){{type}}-99", + "metric": "", + "query": "histogram_quantile(0.99,(\n sum(rate(\n tikv_import_engine_request_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (type, le) \n \n \n)) ", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, - "title": "99% CheckLeader request region count", + "title": "P99 RPC Duration", "tooltip": { - "msResolution": false, + "msResolution": true, "shared": true, "sort": 0, "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -39666,7 +52456,8 @@ }, "yaxes": [ { - "format": "short", + "decimals": null, + "format": "s", "label": null, "logBase": 1, "max": null, @@ -39674,6 +52465,7 @@ "show": true }, { + "decimals": null, "format": "short", "label": null, "logBase": 1, @@ -39684,44 +52476,56 @@ ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } }, { "aliasColors": {}, "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", - "decimals": 1, - "description": "The region that its leader has minimal resolved ts.", + 
"description": "", "editable": true, "error": false, "fieldConfig": { - "defaults": {}, - "overrides": [] + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } + }, + "fill": 1, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" }, - "fill": 0, - "fillGradient": 0, - "grid": {}, "gridPos": { - "h": 8, + "h": 7, "w": 12, - "x": 12, - "y": 71 + "x": 0, + "y": 7 }, - "hiddenSeries": false, - "id": 23763572079, + "height": null, + "hideTimeOverride": false, + "id": 375, + "interval": null, + "isNew": true, "legend": { "alignAsTable": true, "avg": false, "current": true, + "hideEmpty": true, + "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, - "sort": "current", + "sort": "max", "sortDesc": true, "total": false, "values": true @@ -39729,51 +52533,70 @@ "lines": true, "linewidth": 1, "links": [], - "nullPointMode": "null", + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", "options": { - "alertThreshold": true + "alertThreshold": true, + "dataLinks": [] }, "percentage": false, - "pluginVersion": "7.5.11", "pointradius": 5, "points": false, "renderer": "flot", - "seriesOverrides": [ - { - "alias": "total", - "lines": false - } - ], - "spaceLength": 10, + "repeat": null, + "repeatDirection": null, + "seriesOverrides": [], + "span": null, "stack": false, "steppedLine": false, "targets": [ { - "exemplar": true, - "expr": "sum(tikv_resolved_ts_min_leader_resolved_ts_region{tidb_cluster=~\"$tidb_cluster.*\", instance=~\"$instance\"}) by (instance)", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_import_rpc_duration_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",request=\"apply\"}\n [$__rate_interval]\n)) by (instance, request) ", "format": "time_series", "hide": false, + 
"instant": false, "interval": "", "intervalFactor": 1, - "legendFormat": "{{type}}", - "refId": "A", - "step": 10 + "legendFormat": "{{instance}}-{{request}}", + "metric": "", + "query": "sum(rate(\n tikv_import_rpc_duration_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",request=\"apply\"}\n [$__rate_interval]\n)) by (instance, request) ", + "refId": "", + "step": 10, + "target": "" + }, + { + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_import_rpc_duration_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",request!=\"switch_mode\"}\n [$__rate_interval]\n)) by (request) ", + "format": "time_series", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "total-{{request}}", + "metric": "", + "query": "sum(rate(\n tikv_import_rpc_duration_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",request!=\"switch_mode\"}\n [$__rate_interval]\n)) by (request) ", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, - "title": "Min Leader Resolved TS Region", + "title": "Import RPC Ops", "tooltip": { - "msResolution": false, + "msResolution": true, "shared": true, "sort": 0, - "value_type": "cumulative" + "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -39781,15 +52604,17 @@ }, "yaxes": [ { - "format": "none", + "decimals": null, + "format": "ops", "label": null, "logBase": 1, "max": null, - "min": "0", + "min": null, "show": true }, { - "format": "none", + "decimals": null, + "format": "short", "label": null, "logBase": 1, "max": null, @@ -39799,44 +52624,56 @@ ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } }, { "aliasColors": {}, "bars": false, - "dashLength": 10, - 
"dashes": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", - "decimals": 1, - "description": "Total bytes in memory of resolved-ts observe regions's lock heap", + "description": null, "editable": true, "error": false, "fieldConfig": { - "defaults": {}, - "overrides": [] + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } + }, + "fill": 1, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" }, - "fill": 0, - "fillGradient": 0, - "grid": {}, "gridPos": { - "h": 8, + "h": 7, "w": 12, - "x": 0, - "y": 79 + "x": 12, + "y": 7 }, - "hiddenSeries": false, - "id": 8379, + "height": null, + "hideTimeOverride": false, + "id": 376, + "interval": null, + "isNew": true, "legend": { "alignAsTable": true, "avg": false, "current": true, + "hideEmpty": true, + "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, - "sort": "current", + "sort": "max", "sortDesc": true, "total": false, "values": true @@ -39844,43 +52681,55 @@ "lines": true, "linewidth": 1, "links": [], - "nullPointMode": "null", + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", "options": { - "alertThreshold": true + "alertThreshold": true, + "dataLinks": [] }, "percentage": false, - "pluginVersion": "7.5.11", "pointradius": 5, "points": false, "renderer": "flot", + "repeat": null, + "repeatDirection": null, "seriesOverrides": [], - "spaceLength": 10, + "span": null, "stack": false, "steppedLine": false, "targets": [ { - "expr": "avg(tikv_resolved_ts_lock_heap_bytes{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}) by (instance)", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_import_apply_cache_event\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n 
[$__rate_interval]\n)) by (type, instance) ", "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{instance}}", - "refId": "A", - "step": 10 + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{type}}-{{instance}}", + "metric": "", + "query": "sum(rate(\n tikv_import_apply_cache_event\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (type, instance) ", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, - "title": "Lock heap size", + "title": "Cache Events", "tooltip": { - "msResolution": false, + "msResolution": true, "shared": true, "sort": 0, "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -39888,14 +52737,16 @@ }, "yaxes": [ { - "format": "bytes", + "decimals": null, + "format": "cps", "label": null, "logBase": 1, "max": null, - "min": "0", + "min": null, "show": true }, { + "decimals": null, "format": "short", "label": null, "logBase": 1, @@ -39906,270 +52757,368 @@ ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } }, { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, + "cards": { + "cardPadding": null, + "cardRound": null + }, + "color": { + "cardColor": "#b4ff00", + "colorScale": "sqrt", + "colorScheme": "interpolateSpectral", + "exponent": 0.5, + "max": null, + "min": null, + "mode": "spectrum" + }, + "dataFormat": "tsbuckets", "datasource": "${DS_TEST-CLUSTER}", - "decimals": 1, - "description": "The status of resolved-ts observe regions", + "description": null, "editable": true, "error": false, "fieldConfig": { - "defaults": {}, - "overrides": [] + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } }, - "fill": 0, - "fillGradient": 0, - 
"grid": {}, "gridPos": { - "h": 8, + "h": 7, "w": 12, - "x": 12, - "y": 79 + "x": 0, + "y": 14 }, - "hiddenSeries": false, - "id": 8377, + "heatmap": {}, + "height": null, + "hideTimeOverride": false, + "hideZeroBuckets": true, + "highlightCards": true, + "id": 377, + "interval": null, "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "max": true, - "min": false, - "rightSide": true, - "show": true, - "sideWidth": null, - "sort": "current", - "sortDesc": true, - "total": false, - "values": true + "show": false }, - "lines": true, - "linewidth": 1, "links": [], - "nullPointMode": "null", - "options": { - "alertThreshold": true - }, - "percentage": false, - "pluginVersion": "7.5.11", - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - { - "alias": "total", - "lines": false - } - ], - "spaceLength": 10, - "stack": false, - "steppedLine": false, + "maxDataPoints": 512, + "maxPerRow": null, + "minSpan": null, + "options": {}, + "repeat": null, + "repeatDirection": null, + "reverseYBuckets": false, + "span": null, "targets": [ { - "expr": "sum(tikv_resolved_ts_region_resolve_status{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}) by (type)", - "format": "time_series", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_import_rpc_duration_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",request=\"apply\"}\n [$__rate_interval]\n)) by (le) ", + "format": "heatmap", "hide": false, - "intervalFactor": 2, - "legendFormat": "{{type}}", - "refId": "A", - "step": 10 + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{le}}", + "metric": "", + "query": "sum(rate(\n tikv_import_rpc_duration_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",request=\"apply\"}\n [$__rate_interval]\n)) by (le) ", + "refId": "", + "step": 10, + "target": "" } ], - "thresholds": 
[], "timeFrom": null, - "timeRegions": [], "timeShift": null, - "title": "Observe region status", + "title": "Overall RPC Duration", "tooltip": { - "msResolution": false, + "msResolution": true, "shared": true, + "showHistogram": true, "sort": 0, - "value_type": "cumulative" + "value_type": "individual" }, - "type": "graph", - "xaxis": { - "buckets": null, + "transformations": [], + "transparent": false, + "type": "heatmap", + "xAxis": { "mode": "time", "name": null, "show": true, "values": [] }, - "yaxes": [ - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": "0", - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } + "xBucketNumber": null, + "xBucketSize": null, + "yAxis": { + "decimals": 1, + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + "yBucketBound": "upper", + "yBucketNumber": null, + "yBucketSize": null }, { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, + "cards": { + "cardPadding": null, + "cardRound": null + }, + "color": { + "cardColor": "#b4ff00", + "colorScale": "sqrt", + "colorScheme": "interpolateSpectral", + "exponent": 0.5, + "max": null, + "min": null, + "mode": "spectrum" + }, + "dataFormat": "tsbuckets", "datasource": "${DS_TEST-CLUSTER}", - "decimals": 1, - "description": "The count of fail to advance resolved-ts", + "description": null, "editable": true, "error": false, "fieldConfig": { - "defaults": {}, - "overrides": [] + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } }, - "fill": 0, - "fillGradient": 0, - "grid": {}, "gridPos": { - "h": 8, + "h": 7, "w": 12, - "x": 0, - "y": 87 + "x": 12, + "y": 14 }, - "hiddenSeries": false, - "id": 9166, + "heatmap": {}, + "height": null, + "hideTimeOverride": false, + "hideZeroBuckets": true, + 
"highlightCards": true, + "id": 378, + "interval": null, "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "hideEmpty": false, - "max": true, - "min": false, - "rightSide": true, - "show": true, - "sideWidth": null, - "sort": "current", - "sortDesc": true, - "total": false, - "values": true + "show": false }, - "lines": true, - "linewidth": 1, "links": [], - "nullPointMode": "null", - "options": { - "alertThreshold": true - }, - "percentage": false, - "pluginVersion": "7.5.11", - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - { - "alias": "total", - "lines": false - } - ], - "spaceLength": 10, - "stack": false, - "steppedLine": false, + "maxDataPoints": 512, + "maxPerRow": null, + "minSpan": null, + "options": {}, + "repeat": null, + "repeatDirection": null, + "reverseYBuckets": false, + "span": null, "targets": [ { - "expr": "sum(delta(tikv_resolved_ts_fail_advance_count{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])) by (instance, reason)", - "format": "time_series", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_import_apply_duration_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",type=\"exec_download\"}\n [$__rate_interval]\n)) by (le) ", + "format": "heatmap", "hide": false, - "intervalFactor": 2, - "legendFormat": "{{instance}}-{{reason}}", - "refId": "A", - "step": 10 + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{le}}", + "metric": "", + "query": "sum(rate(\n tikv_import_apply_duration_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",type=\"exec_download\"}\n [$__rate_interval]\n)) by (le) ", + "refId": "", + "step": 10, + "target": "" } ], - "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, - "title": "Fail advance ts count", + "title": "Read File into Memory Duration", "tooltip": { - 
"msResolution": false, + "msResolution": true, "shared": true, + "showHistogram": true, "sort": 0, - "value_type": "cumulative" + "value_type": "individual" }, - "type": "graph", - "xaxis": { - "buckets": null, + "transformations": [], + "transparent": false, + "type": "heatmap", + "xAxis": { "mode": "time", "name": null, "show": true, "values": [] }, - "yaxes": [ - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": "0", - "show": true - }, + "xBucketNumber": null, + "xBucketSize": null, + "yAxis": { + "decimals": 1, + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + "yBucketBound": "upper", + "yBucketNumber": null, + "yBucketSize": null + }, + { + "cacheTimeout": null, + "cards": { + "cardPadding": null, + "cardRound": null + }, + "color": { + "cardColor": "#b4ff00", + "colorScale": "sqrt", + "colorScheme": "interpolateSpectral", + "exponent": 0.5, + "max": null, + "min": null, + "mode": "spectrum" + }, + "dataFormat": "tsbuckets", + "datasource": "${DS_TEST-CLUSTER}", + "description": null, + "editable": true, + "error": false, + "fieldConfig": { + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } + }, + "gridPos": { + "h": 7, + "w": 12, + "x": 0, + "y": 21 + }, + "heatmap": {}, + "height": null, + "hideTimeOverride": false, + "hideZeroBuckets": true, + "highlightCards": true, + "id": 379, + "interval": null, + "legend": { + "show": false + }, + "links": [], + "maxDataPoints": 512, + "maxPerRow": null, + "minSpan": null, + "options": {}, + "repeat": null, + "repeatDirection": null, + "reverseYBuckets": false, + "span": null, + "targets": [ { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_import_engine_request_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",type=\"queuing\"}\n [$__rate_interval]\n)) 
by (le) ", + "format": "heatmap", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{le}}", + "metric": "", + "query": "sum(rate(\n tikv_import_engine_request_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",type=\"queuing\"}\n [$__rate_interval]\n)) by (le) ", + "refId": "", + "step": 10, + "target": "" } ], - "yaxis": { - "align": false, - "alignLevel": null - } + "timeFrom": null, + "timeShift": null, + "title": "Queuing Time", + "tooltip": { + "msResolution": true, + "shared": true, + "showHistogram": true, + "sort": 0, + "value_type": "individual" + }, + "transformations": [], + "transparent": false, + "type": "heatmap", + "xAxis": { + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "xBucketNumber": null, + "xBucketSize": null, + "yAxis": { + "decimals": 1, + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + "yBucketBound": "upper", + "yBucketNumber": null, + "yBucketSize": null }, { "aliasColors": {}, "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", - "description": "Bucketed histogram of the check leader request size", + "description": null, "editable": true, "error": false, "fieldConfig": { - "defaults": {}, - "overrides": [] + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } }, "fill": 1, - "fillGradient": 0, - "grid": {}, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, "gridPos": { - "h": 8, + "h": 7, "w": 12, "x": 12, - "y": 87 + "y": 21 }, - "hiddenSeries": false, - "id": 8383, + "height": null, + "hideTimeOverride": false, + "id": 380, + "interval": null, + "isNew": true, "legend": { "alignAsTable": true, "avg": false, "current": true, "hideEmpty": true, + "hideZero": 
true, "max": true, "min": false, "rightSide": true, "show": true, - "sort": "current", + "sideWidth": null, + "sort": "max", "sortDesc": true, "total": false, "values": true @@ -40177,53 +53126,55 @@ "lines": true, "linewidth": 1, "links": [], + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, "nullPointMode": "null as zero", "options": { - "alertThreshold": true + "alertThreshold": true, + "dataLinks": [] }, "percentage": false, - "pluginVersion": "7.5.11", "pointradius": 5, "points": false, "renderer": "flot", + "repeat": null, + "repeatDirection": null, "seriesOverrides": [], - "spaceLength": 10, + "span": null, "stack": false, "steppedLine": false, "targets": [ { - "expr": "histogram_quantile(0.99, sum(rate(tikv_check_leader_request_size_bytes_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])) by (le, instance))", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_import_apply_bytes_sum\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (instance) ", "format": "time_series", "hide": false, - "intervalFactor": 2, - "legendFormat": "{{instance}}", - "metric": "tikv_snapshot_size_bucket", - "refId": "A", - "step": 40 - }, - { - "expr": "histogram_quantile(0.99, sum(rate(tikv_check_leader_request_item_count_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])) by (le, instance))", - "format": "time_series", - "hide": true, + "instant": false, + "interval": "", "intervalFactor": 1, - "legendFormat": "{{instance}}-check-num", - "refId": "B" + "legendFormat": "{{instance}}", + "metric": "", + "query": "sum(rate(\n tikv_import_apply_bytes_sum\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (instance) ", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, - "timeRegions": [], 
"timeShift": null, - "title": "99% CheckLeader request size", + "title": "Apply Request Throughput", "tooltip": { - "msResolution": false, + "msResolution": true, "shared": true, "sort": 0, "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -40231,6 +53182,7 @@ }, "yaxes": [ { + "decimals": null, "format": "bytes", "label": null, "logBase": 1, @@ -40239,6 +53191,7 @@ "show": true }, { + "decimals": null, "format": "short", "label": null, "logBase": 1, @@ -40249,160 +53202,368 @@ ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } }, { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, + "cards": { + "cardPadding": null, + "cardRound": null + }, + "color": { + "cardColor": "#b4ff00", + "colorScale": "sqrt", + "colorScheme": "interpolateSpectral", + "exponent": 0.5, + "max": null, + "min": null, + "mode": "spectrum" + }, + "dataFormat": "tsbuckets", "datasource": "${DS_TEST-CLUSTER}", - "decimals": 1, - "description": "Total bytes of pending commands in the channel", + "description": null, "editable": true, "error": false, "fieldConfig": { - "defaults": {}, - "overrides": [] + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } }, - "fill": 0, - "fillGradient": 0, - "grid": {}, "gridPos": { - "h": 8, + "h": 7, "w": 12, "x": 0, - "y": 95 + "y": 28 }, - "hiddenSeries": false, - "id": 8381, + "heatmap": {}, + "height": null, + "hideTimeOverride": false, + "hideZeroBuckets": true, + "highlightCards": true, + "id": 381, + "interval": null, "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "max": true, - "min": false, - "rightSide": true, - "show": true, - "sideWidth": null, - "sort": "current", - "sortDesc": true, - "total": false, - "values": true + "show": false }, - "lines": true, - "linewidth": 1, "links": [], - "nullPointMode": 
"null", - "options": { - "alertThreshold": true - }, - "percentage": false, - "pluginVersion": "7.5.11", - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "spaceLength": 10, - "stack": false, - "steppedLine": false, + "maxDataPoints": 512, + "maxPerRow": null, + "minSpan": null, + "options": {}, + "repeat": null, + "repeatDirection": null, + "reverseYBuckets": false, + "span": null, "targets": [ { - "expr": "avg(tikv_resolved_ts_channel_penging_cmd_bytes_total{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}) by (instance)", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{instance}}", - "refId": "A", - "step": 10 + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_import_download_bytes_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (le) ", + "format": "heatmap", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{le}}", + "metric": "", + "query": "sum(rate(\n tikv_import_download_bytes_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (le) ", + "refId": "", + "step": 10, + "target": "" } ], - "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, - "title": "Pending command size", + "title": "Downloaded File Size", "tooltip": { - "msResolution": false, + "msResolution": true, "shared": true, + "showHistogram": true, "sort": 0, "value_type": "individual" }, - "type": "graph", - "xaxis": { - "buckets": null, + "transformations": [], + "transparent": false, + "type": "heatmap", + "xAxis": { "mode": "time", "name": null, "show": true, "values": [] }, - "yaxes": [ + "xBucketNumber": null, + "xBucketSize": null, + "yAxis": { + "decimals": 1, + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + 
"yBucketBound": "upper", + "yBucketNumber": null, + "yBucketSize": null + }, + { + "cacheTimeout": null, + "cards": { + "cardPadding": null, + "cardRound": null + }, + "color": { + "cardColor": "#b4ff00", + "colorScale": "sqrt", + "colorScheme": "interpolateSpectral", + "exponent": 0.5, + "max": null, + "min": null, + "mode": "spectrum" + }, + "dataFormat": "tsbuckets", + "datasource": "${DS_TEST-CLUSTER}", + "description": null, + "editable": true, + "error": false, + "fieldConfig": { + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } + }, + "gridPos": { + "h": 7, + "w": 12, + "x": 12, + "y": 28 + }, + "heatmap": {}, + "height": null, + "hideTimeOverride": false, + "hideZeroBuckets": true, + "highlightCards": true, + "id": 382, + "interval": null, + "legend": { + "show": false + }, + "links": [], + "maxDataPoints": 512, + "maxPerRow": null, + "minSpan": null, + "options": {}, + "repeat": null, + "repeatDirection": null, + "reverseYBuckets": false, + "span": null, + "targets": [ { - "format": "bytes", - "label": null, - "logBase": 1, - "max": null, - "min": "0", - "show": true - }, + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_import_apply_bytes_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (le) ", + "format": "heatmap", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{le}}", + "metric": "", + "query": "sum(rate(\n tikv_import_apply_bytes_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (le) ", + "refId": "", + "step": 10, + "target": "" + } + ], + "timeFrom": null, + "timeShift": null, + "title": "Apply Batch Size", + "tooltip": { + "msResolution": true, + "shared": true, + "showHistogram": true, + "sort": 0, + "value_type": "individual" + }, + "transformations": [], + "transparent": false, + "type": "heatmap", + 
"xAxis": { + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "xBucketNumber": null, + "xBucketSize": null, + "yAxis": { + "decimals": 1, + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + "yBucketBound": "upper", + "yBucketNumber": null, + "yBucketSize": null + }, + { + "cacheTimeout": null, + "cards": { + "cardPadding": null, + "cardRound": null + }, + "color": { + "cardColor": "#b4ff00", + "colorScale": "sqrt", + "colorScheme": "interpolateSpectral", + "exponent": 0.5, + "max": null, + "min": null, + "mode": "spectrum" + }, + "dataFormat": "tsbuckets", + "datasource": "${DS_TEST-CLUSTER}", + "description": null, + "editable": true, + "error": false, + "fieldConfig": { + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } + }, + "gridPos": { + "h": 7, + "w": 12, + "x": 0, + "y": 35 + }, + "heatmap": {}, + "height": null, + "hideTimeOverride": false, + "hideZeroBuckets": true, + "highlightCards": true, + "id": 383, + "interval": null, + "legend": { + "show": false + }, + "links": [], + "maxDataPoints": 512, + "maxPerRow": null, + "minSpan": null, + "options": {}, + "repeat": null, + "repeatDirection": null, + "reverseYBuckets": false, + "span": null, + "targets": [ { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_import_engine_request_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",type=\"get_permit\"}\n [$__rate_interval]\n)) by (le) ", + "format": "heatmap", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{le}}", + "metric": "", + "query": "sum(rate(\n tikv_import_engine_request_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",type=\"get_permit\"}\n [$__rate_interval]\n)) by (le) ", + "refId": "", + "step": 
10, + "target": "" } ], - "yaxis": { - "align": false, - "alignLevel": null - } - } - ], - "title": "Resolved-TS", - "type": "row" - }, - { - "collapsed": true, - "datasource": null, - "gridPos": { - "h": 1, - "w": 24, - "x": 0, - "y": 39 - }, - "id": 2763, - "panels": [ + "timeFrom": null, + "timeShift": null, + "title": "Blocked by Concurrency Time", + "tooltip": { + "msResolution": true, + "shared": true, + "showHistogram": true, + "sort": 0, + "value_type": "individual" + }, + "transformations": [], + "transparent": false, + "type": "heatmap", + "xAxis": { + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "xBucketNumber": null, + "xBucketSize": null, + "yAxis": { + "decimals": 1, + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + "yBucketBound": "upper", + "yBucketNumber": null, + "yBucketSize": null + }, { "aliasColors": {}, "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", + "description": null, + "editable": true, + "error": false, "fieldConfig": { - "defaults": {}, - "overrides": [] + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } }, "fill": 1, - "fillGradient": 0, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, "gridPos": { "h": 7, "w": 12, - "x": 0, - "y": 44 + "x": 12, + "y": 35 }, - "hiddenSeries": false, - "id": 23763573729, + "height": null, + "hideTimeOverride": false, + "id": 384, + "interval": null, + "isNew": true, "legend": { "alignAsTable": true, "avg": false, "current": true, + "hideEmpty": true, + "hideZero": true, "max": true, - "min": true, + "min": false, "rightSide": true, "show": true, - "sort": "current", + "sideWidth": null, + "sort": "max", "sortDesc": true, "total": false, "values": true @@ -40410,42 +53571,55 @@ "lines": true, "linewidth": 
1, "links": [], - "nullPointMode": "null", + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", "options": { - "alertThreshold": true + "alertThreshold": true, + "dataLinks": [] }, "percentage": false, - "pluginVersion": "7.5.11", "pointradius": 5, "points": false, "renderer": "flot", + "repeat": null, + "repeatDirection": null, "seriesOverrides": [], - "spaceLength": 10, + "span": null, "stack": false, "steppedLine": false, "targets": [ { - "expr": "tikv_allocator_stats{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_import_applier_event\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",type=\"begin_req\"}\n [$__rate_interval]\n)) by (instance, type) ", "format": "time_series", "hide": false, - "intervalFactor": 2, - "legendFormat": "{{type}}", - "refId": "A" + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{instance}}-{{type}}", + "metric": "", + "query": "sum(rate(\n tikv_import_applier_event\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",type=\"begin_req\"}\n [$__rate_interval]\n)) by (instance, type) ", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, - "title": "Allocator Stats", + "title": "Apply Request Speed", "tooltip": { + "msResolution": true, "shared": true, "sort": 0, "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -40453,7 +53627,8 @@ }, "yaxes": [ { - "format": "decbytes", + "decimals": null, + "format": "ops", "label": null, "logBase": 1, "max": null, @@ -40461,6 +53636,7 @@ "show": true }, { + "decimals": null, "format": "short", "label": null, "logBase": 1, @@ -40471,85 +53647,112 @@ ], 
"yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } }, { "aliasColors": {}, "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", + "description": null, + "editable": true, + "error": false, "fieldConfig": { "defaults": { - "unit": "binBps" - }, - "overrides": [] + "thresholds": { + "mode": "absolute", + "steps": [] + } + } + }, + "fill": 1, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" }, - "fill": 0, - "fillGradient": 0, "gridPos": { "h": 7, "w": 12, - "x": 12, - "y": 44 + "x": 0, + "y": 42 }, - "hiddenSeries": false, - "id": 23763573730, + "height": null, + "hideTimeOverride": false, + "id": 385, + "interval": null, + "isNew": true, "legend": { "alignAsTable": true, "avg": false, "current": true, + "hideEmpty": true, + "hideZero": true, "max": true, - "min": true, + "min": false, "rightSide": true, "show": true, - "sort": "current", + "sideWidth": null, + "sort": "max", "sortDesc": true, "total": false, "values": true }, "lines": true, - "linewidth": 2, + "linewidth": 1, "links": [], - "nullPointMode": "null", + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", "options": { - "alertThreshold": true + "alertThreshold": true, + "dataLinks": [] }, "percentage": false, - "pluginVersion": "7.5.11", "pointradius": 5, "points": false, "renderer": "flot", + "repeat": null, + "repeatDirection": null, "seriesOverrides": [], - "spaceLength": 10, + "span": null, "stack": false, "steppedLine": false, "targets": [ { - "exemplar": true, - "expr": "sum(rate(tikv_allocator_thread_allocation{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", type=\"alloc\"}[$__rate_interval])) by (thread_name) - sum(rate(tikv_allocator_thread_allocation{k8s_cluster=\"$k8s_cluster\", 
tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", type=\"dealloc\"}[$__rate_interval])) by (thread_name) != 0", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum((\n tikv_import_apply_cached_bytes\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n \n)) by (instance) ", "format": "time_series", "hide": false, + "instant": false, "interval": "", - "intervalFactor": 2, - "legendFormat": "{{thread_name}}", - "refId": "A" + "intervalFactor": 1, + "legendFormat": "{{instance}}", + "metric": "", + "query": "sum((\n tikv_import_apply_cached_bytes\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n \n)) by (instance) ", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, - "title": "Send Allocated(+) / Release Received(-) Bytes Rate", + "title": "Cached File in Memory", "tooltip": { + "msResolution": true, "shared": true, - "sort": 2, + "sort": 0, "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -40557,7 +53760,8 @@ }, "yaxes": [ { - "format": "binBps", + "decimals": null, + "format": "bytes", "label": null, "logBase": 1, "max": null, @@ -40565,6 +53769,7 @@ "show": true }, { + "decimals": null, "format": "short", "label": null, "logBase": 1, @@ -40575,83 +53780,112 @@ ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } }, { "aliasColors": {}, - "bars": true, - "dashLength": 10, - "dashes": false, + "bars": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", + "description": null, + "editable": true, + "error": false, "fieldConfig": { - "defaults": {}, - "overrides": [] + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } }, "fill": 1, - "fillGradient": 0, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": 
"rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, "gridPos": { "h": 7, "w": 12, - "x": 0, - "y": 51 + "x": 12, + "y": 42 }, - "hiddenSeries": false, - "id": 2696, + "height": null, + "hideTimeOverride": false, + "id": 386, + "interval": null, + "isNew": true, "legend": { "alignAsTable": true, "avg": false, - "current": false, + "current": true, + "hideEmpty": true, + "hideZero": true, "max": true, "min": false, "rightSide": true, - "show": false, - "sort": "current", + "show": true, + "sideWidth": null, + "sort": "max", "sortDesc": true, "total": false, "values": true }, - "lines": false, + "lines": true, "linewidth": 1, "links": [], - "nullPointMode": "null", + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", "options": { - "alertThreshold": true + "alertThreshold": true, + "dataLinks": [] }, "percentage": false, - "pluginVersion": "7.5.11", "pointradius": 5, "points": false, "renderer": "flot", + "repeat": null, + "repeatDirection": null, "seriesOverrides": [], - "spaceLength": 10, - "stack": true, + "span": null, + "stack": false, "steppedLine": false, "targets": [ { - "exemplar": true, - "expr": "sum(delta(tikv_allocator_thread_allocation{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", type=\"alloc\"}[$__rate_interval])) by (thread_name)", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_import_applier_event\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",type!=\"begin_req\"}\n [$__rate_interval]\n)) by (instance, type) ", "format": "time_series", "hide": false, + "instant": false, "interval": "", - "intervalFactor": 2, - "legendFormat": "{{thread_name}}", - "refId": "A" + "intervalFactor": 1, + "legendFormat": "{{instance}}-{{type}}", + "metric": "", + "query": "sum(rate(\n tikv_import_applier_event\n 
{k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",type!=\"begin_req\"}\n [$__rate_interval]\n)) by (instance, type) ", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, - "title": "Newly Allocated Bytes by Thread", + "title": "Engine Requests Unfinished", "tooltip": { + "msResolution": true, "shared": true, - "sort": 2, + "sort": 0, "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -40659,7 +53893,8 @@ }, "yaxes": [ { - "format": "decbytes", + "decimals": null, + "format": "short", "label": null, "logBase": 1, "max": null, @@ -40667,6 +53902,7 @@ "show": true }, { + "decimals": null, "format": "short", "label": null, "logBase": 1, @@ -40677,83 +53913,216 @@ ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } }, + { + "cacheTimeout": null, + "cards": { + "cardPadding": null, + "cardRound": null + }, + "color": { + "cardColor": "#b4ff00", + "colorScale": "sqrt", + "colorScheme": "interpolateSpectral", + "exponent": 0.5, + "max": null, + "min": null, + "mode": "spectrum" + }, + "dataFormat": "tsbuckets", + "datasource": "${DS_TEST-CLUSTER}", + "description": null, + "editable": true, + "error": false, + "fieldConfig": { + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } + }, + "gridPos": { + "h": 7, + "w": 12, + "x": 0, + "y": 49 + }, + "heatmap": {}, + "height": null, + "hideTimeOverride": false, + "hideZeroBuckets": true, + "highlightCards": true, + "id": 387, + "interval": null, + "legend": { + "show": false + }, + "links": [], + "maxDataPoints": 512, + "maxPerRow": null, + "minSpan": null, + "options": {}, + "repeat": null, + "repeatDirection": null, + "reverseYBuckets": false, + "span": null, + "targets": [ + { + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n 
tikv_import_engine_request_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",type=\"apply\"}\n [$__rate_interval]\n)) by (le) ", + "format": "heatmap", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{le}}", + "metric": "", + "query": "sum(rate(\n tikv_import_engine_request_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",type=\"apply\"}\n [$__rate_interval]\n)) by (le) ", + "refId": "", + "step": 10, + "target": "" + } + ], + "timeFrom": null, + "timeShift": null, + "title": "Apply Time", + "tooltip": { + "msResolution": true, + "shared": true, + "showHistogram": true, + "sort": 0, + "value_type": "individual" + }, + "transformations": [], + "transparent": false, + "type": "heatmap", + "xAxis": { + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "xBucketNumber": null, + "xBucketSize": null, + "yAxis": { + "decimals": 1, + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + "yBucketBound": "upper", + "yBucketNumber": null, + "yBucketSize": null + }, { "aliasColors": {}, - "bars": true, - "dashLength": 10, - "dashes": false, + "bars": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", + "description": "", + "editable": true, + "error": false, "fieldConfig": { - "defaults": {}, - "overrides": [] + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } }, "fill": 1, - "fillGradient": 0, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, "gridPos": { "h": 7, "w": 12, "x": 12, - "y": 51 + "y": 49 }, - "hiddenSeries": false, - "id": 23763573731, + "height": null, + "hideTimeOverride": false, + "id": 388, + "interval": null, + "isNew": true, "legend": { "alignAsTable": true, "avg": false, - "current": 
false, + "current": true, + "hideEmpty": true, + "hideZero": true, "max": true, "min": false, "rightSide": true, - "show": false, - "sort": "current", + "show": true, + "sideWidth": null, + "sort": "max", "sortDesc": true, "total": false, "values": true }, - "lines": false, + "lines": true, "linewidth": 1, "links": [], - "nullPointMode": "null", + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", "options": { - "alertThreshold": true + "alertThreshold": true, + "dataLinks": [] }, "percentage": false, - "pluginVersion": "7.5.11", "pointradius": 5, "points": false, "renderer": "flot", + "repeat": null, + "repeatDirection": null, "seriesOverrides": [], - "spaceLength": 10, - "stack": true, + "span": null, + "stack": false, "steppedLine": false, "targets": [ { - "exemplar": true, - "expr": "sum(delta(tikv_allocator_thread_allocation{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", type=\"alloc\"}[$__rate_interval])) by (thread_name)", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum((\n tikv_server_mem_trace_sum\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",name=~\"raftstore-.*\"}\n \n)) by (instance) ", "format": "time_series", "hide": false, + "instant": false, "interval": "", - "intervalFactor": 2, - "legendFormat": "{{thread_name}}", - "refId": "A" + "intervalFactor": 1, + "legendFormat": "{{instance}}", + "metric": "", + "query": "sum((\n tikv_server_mem_trace_sum\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",name=~\"raftstore-.*\"}\n \n)) by (instance) ", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, - "title": "Recently Released Bytes by Thread", + "title": "Raft Store Memory Usage", "tooltip": { + "msResolution": true, "shared": true, - "sort": 2, + "sort": 0, "value_type": "individual" }, + "transformations": [], 
+ "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -40761,7 +54130,8 @@ }, "yaxes": [ { - "format": "decbytes", + "decimals": null, + "format": "bytes", "label": null, "logBase": 1, "max": null, @@ -40769,6 +54139,7 @@ "show": true }, { + "decimals": null, "format": "short", "label": null, "logBase": 1, @@ -40779,60 +54150,98 @@ ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } } ], "repeat": null, - "title": "Memory", + "repeatDirection": null, + "span": null, + "targets": [], + "timeFrom": null, + "timeShift": null, + "title": "Point In Time Restore", + "transformations": [], + "transparent": false, "type": "row" }, { + "cacheTimeout": null, "collapsed": true, "datasource": null, + "description": null, + "editable": true, + "error": false, + "fieldConfig": { + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } + }, "gridPos": { - "h": 1, + "h": 7, "w": 24, "x": 0, - "y": 40 + "y": 0 }, - "id": 3922, + "height": null, + "hideTimeOverride": false, + "id": 389, + "interval": null, + "links": [], + "maxDataPoints": 100, + "maxPerRow": null, + "minSpan": null, "panels": [ { "aliasColors": {}, "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", - "decimals": 1, + "description": "The CPU utilization of resolved ts worker", "editable": true, "error": false, "fieldConfig": { - "defaults": {}, - "overrides": [] + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } + }, + "fill": 1, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" }, - "fill": 0, - "fillGradient": 0, - "grid": {}, "gridPos": { "h": 7, - "w": 12, + "w": 8, "x": 0, - "y": 65 + "y": 0 }, - "hiddenSeries": false, - "id": 3924, + "height": null, + "hideTimeOverride": false, + "id": 390, + 
"interval": null, + "isNew": true, "legend": { "alignAsTable": true, "avg": false, "current": true, "hideEmpty": true, + "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, - "sort": "current", + "sort": "max", "sortDesc": true, "total": false, "values": true @@ -40840,71 +54249,55 @@ "lines": true, "linewidth": 1, "links": [], - "nullPointMode": "null", + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", "options": { - "alertThreshold": true + "alertThreshold": true, + "dataLinks": [] }, "percentage": false, - "pluginVersion": "7.5.7", "pointradius": 5, "points": false, "renderer": "flot", - "seriesOverrides": [ - { - "alias": "/backup-auto-throttle/", - "fill": 5, - "fillGradient": 2, - "linewidth": 0 - } - ], - "spaceLength": 10, + "repeat": null, + "repeatDirection": null, + "seriesOverrides": [], + "span": null, "stack": false, "steppedLine": false, "targets": [ { - "expr": "sum(rate(tikv_thread_cpu_seconds_total{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", name=~\"b.*k.*w.*k.*\"}[1m])) by (instance)", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_thread_cpu_seconds_total\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",name=~\"resolved_ts.*\"}\n [$__rate_interval]\n)) by (instance) ", "format": "time_series", - "intervalFactor": 2, - "legendFormat": "backup-{{instance}}", - "metric": "tikv_thread_cpu_seconds_total", - "refId": "A", - "step": 4 - }, - { - "exemplar": true, - "expr": "sum(rate(tikv_thread_cpu_seconds_total{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", name=~\"backup_io\"}[1m])) by (instance)", - "format": "time_series", - "hide": true, - "interval": "", - "intervalFactor": 2, - "legendFormat": "backup-io-{{instance}}", - "metric": "tikv_thread_cpu_seconds_total", - "refId": "B", - "step": 4 - }, - { - "exemplar": 
true, - "expr": "tikv_backup_softlimit{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}", "hide": false, + "instant": false, "interval": "", - "legendFormat": "backup-auto-throttle-{{instance}}", - "refId": "C" + "intervalFactor": 1, + "legendFormat": "{{instance}}", + "metric": "", + "query": "sum(rate(\n tikv_thread_cpu_seconds_total\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",name=~\"resolved_ts.*\"}\n [$__rate_interval]\n)) by (instance) ", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, - "title": "Backup CPU Utilization", + "title": "Resolved TS Worker CPU", "tooltip": { - "msResolution": false, + "msResolution": true, "shared": true, "sort": 0, "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -40912,6 +54305,7 @@ }, "yaxes": [ { + "decimals": null, "format": "percentunit", "label": null, "logBase": 1, @@ -40920,6 +54314,7 @@ "show": true }, { + "decimals": null, "format": "short", "label": null, "logBase": 1, @@ -40930,144 +54325,112 @@ ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } }, - { - "columns": [ - { - "text": "Current", - "value": "current" - } - ], - "datasource": "${DS_TEST-CLUSTER}", - "fieldConfig": { - "defaults": {}, - "overrides": [] - }, - "fontSize": "100%", - "gridPos": { - "h": 7, - "w": 4, - "x": 12, - "y": 65 - }, - "id": 3926, - "links": [], - "pageSize": null, - "scroll": true, - "showHeader": true, - "sort": { - "col": 0, - "desc": true - }, - "styles": [ - { - "alias": "Time", - "align": "auto", - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "pattern": "Time", - "type": "date" - }, - { - "alias": "", - "align": "auto", - "colorMode": null, - "colors": [ - "rgba(245, 54, 54, 0.9)", - "rgba(237, 129, 40, 0.89)", - "rgba(50, 172, 45, 0.97)" 
- ], - "decimals": 2, - "pattern": "/.*/", - "thresholds": [], - "type": "number", - "unit": "short" - } - ], - "targets": [ - { - "expr": "sum(tikv_backup_thread_pool_size{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}) by(instance)", - "format": "time_series", - "interval": "", - "intervalFactor": 1, - "legendFormat": "{{instance}}", - "refId": "A" - } - ], - "timeFrom": null, - "timeShift": null, - "title": "Backup Thread Count", - "transform": "timeseries_aggregations", - "type": "table" - }, { "aliasColors": {}, "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", - "description": "", + "description": "The CPU utilization of advance ts worker", + "editable": true, + "error": false, "fieldConfig": { - "defaults": {}, - "overrides": [] + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } }, "fill": 1, - "fillGradient": 0, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 32 + "h": 7, + "w": 8, + "x": 8, + "y": 0 }, - "hiddenSeries": false, - "id": 23763571993, + "height": null, + "hideTimeOverride": false, + "id": 391, + "interval": null, + "isNew": true, "legend": { + "alignAsTable": true, "avg": false, - "current": false, - "max": false, + "current": true, + "hideEmpty": true, + "hideZero": true, + "max": true, "min": false, + "rightSide": true, "show": true, + "sideWidth": null, + "sort": "max", + "sortDesc": true, "total": false, - "values": false + "values": true }, "lines": true, "linewidth": 1, - "nullPointMode": "null", + "links": [], + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", "options": { - "alertThreshold": true + "alertThreshold": true, + "dataLinks": [] }, "percentage": false, - 
"pluginVersion": "7.5.7", - "pointradius": 2, + "pointradius": 5, "points": false, "renderer": "flot", + "repeat": null, + "repeatDirection": null, "seriesOverrides": [], - "spaceLength": 10, + "span": null, "stack": false, "steppedLine": false, "targets": [ { - "exemplar": true, - "expr": "sum(rate(tikv_cloud_request_duration_seconds_count{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])) by (cloud, req)", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_thread_cpu_seconds_total\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",name=~\"advance_ts.*\"}\n [$__rate_interval]\n)) by (instance) ", + "format": "time_series", + "hide": false, + "instant": false, "interval": "", - "legendFormat": "{{cloud}}-{{req}}", - "refId": "A" + "intervalFactor": 1, + "legendFormat": "{{instance}}", + "metric": "", + "query": "sum(rate(\n tikv_thread_cpu_seconds_total\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",name=~\"advance_ts.*\"}\n [$__rate_interval]\n)) by (instance) ", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, - "title": "cloud request", + "title": "Advance ts Worker CPU", "tooltip": { + "msResolution": true, "shared": true, "sort": 0, "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -41075,7 +54438,8 @@ }, "yaxes": [ { - "format": "short", + "decimals": null, + "format": "percentunit", "label": null, "logBase": 1, "max": null, @@ -41083,6 +54447,7 @@ "show": true }, { + "decimals": null, "format": "short", "label": null, "logBase": 1, @@ -41093,81 +54458,112 @@ ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } }, { "aliasColors": {}, "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, 
"datasource": "${DS_TEST-CLUSTER}", - "decimals": 2, - "description": "", + "description": "The CPU utilization of scan lock worker", + "editable": true, + "error": false, "fieldConfig": { - "defaults": {}, - "overrides": [] + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } }, "fill": 1, - "fillGradient": 0, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, "gridPos": { "h": 7, "w": 8, "x": 16, - "y": 65 + "y": 0 }, - "hiddenSeries": false, - "id": 5264, + "height": null, + "hideTimeOverride": false, + "id": 392, + "interval": null, + "isNew": true, "legend": { "alignAsTable": true, "avg": false, "current": true, + "hideEmpty": true, + "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, + "sideWidth": null, + "sort": "max", + "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, "nullPointMode": "null as zero", "options": { - "alertThreshold": true + "alertThreshold": true, + "dataLinks": [] }, "percentage": false, - "pluginVersion": "7.5.7", - "pointradius": 2, + "pointradius": 5, "points": false, "renderer": "flot", + "repeat": null, + "repeatDirection": null, "seriesOverrides": [], - "spaceLength": 10, + "span": null, "stack": false, "steppedLine": false, "targets": [ { - "expr": "delta(tikv_backup_error_counter{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_thread_cpu_seconds_total\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",name=~\"inc_scan.*\"}\n [$__rate_interval]\n)) by (instance) ", "format": "time_series", "hide": false, + "instant": false, + "interval": "", "intervalFactor": 1, - "legendFormat": 
"{{instance}}-{{error}}", - "refId": "D" + "legendFormat": "{{instance}}", + "metric": "", + "query": "sum(rate(\n tikv_thread_cpu_seconds_total\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",name=~\"inc_scan.*\"}\n [$__rate_interval]\n)) by (instance) ", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, - "title": "Backup Errors", + "title": "Scan lock Worker CPU", "tooltip": { + "msResolution": true, "shared": true, "sort": 0, "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -41175,220 +54571,341 @@ }, "yaxes": [ { - "format": "none", + "decimals": null, + "format": "percentunit", "label": null, "logBase": 1, "max": null, - "min": "0", + "min": null, "show": true }, { - "format": "none", + "decimals": null, + "format": "short", "label": null, "logBase": 1, "max": null, - "min": "0", + "min": null, "show": true } ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } }, { - "cards": { - "cardPadding": 0, - "cardRound": 0 - }, - "color": { - "cardColor": "#FF9830", - "colorScale": "linear", - "colorScheme": "interpolateSpectral", - "exponent": 0.5, - "max": null, - "min": 0, - "mode": "spectrum" - }, - "dataFormat": "tsbuckets", + "aliasColors": {}, + "bars": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", - "description": "", + "description": "The gap between resolved ts (the maximum candidate of safe-ts) and current time.", + "editable": true, + "error": false, "fieldConfig": { - "defaults": {}, - "overrides": [] + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } + }, + "fill": 1, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" }, "gridPos": { 
"h": 7, - "w": 8, + "w": 12, "x": 0, - "y": 72 + "y": 7 }, - "heatmap": {}, - "hideZeroBuckets": true, - "highlightCards": true, - "id": 3927, + "height": null, + "hideTimeOverride": false, + "id": 393, + "interval": null, + "isNew": true, "legend": { "alignAsTable": true, "avg": false, "current": true, + "hideEmpty": true, + "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, - "sort": "current", + "sideWidth": null, + "sort": "max", "sortDesc": true, "total": false, "values": true }, + "lines": true, + "linewidth": 1, "links": [], - "reverseYBuckets": false, + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", + "options": { + "alertThreshold": true, + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "repeatDirection": null, + "seriesOverrides": [], + "span": null, + "stack": false, + "steppedLine": false, "targets": [ { - "expr": "max(rate(tikv_backup_range_size_bytes_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", cf=\"write\"}[1m])) by (le)", - "format": "heatmap", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum((\n tikv_resolved_ts_min_resolved_ts_gap_millis\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n \n)) by (instance) ", + "format": "time_series", + "hide": false, "instant": false, - "intervalFactor": 2, - "legendFormat": "{{le}}", - "refId": "A" + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{instance}}", + "metric": "", + "query": "sum((\n tikv_resolved_ts_min_resolved_ts_gap_millis\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n \n)) by (instance) ", + "refId": "", + "step": 10, + "target": "" } ], - "title": "Backup Write CF SST Size", + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Max gap of resolved-ts", "tooltip": { - "show": 
true, - "showHistogram": true - }, - "tooltipDecimals": 1, - "type": "heatmap", - "xAxis": { - "show": true + "msResolution": true, + "shared": true, + "sort": 0, + "value_type": "individual" }, - "xBucketNumber": null, - "xBucketSize": null, - "yAxis": { - "decimals": 1, - "format": "bytes", - "logBase": 1, - "max": null, - "min": null, + "transformations": [], + "transparent": false, + "type": "graph", + "xaxis": { + "mode": "time", + "name": null, "show": true, - "splitFactor": null + "values": [] }, - "yBucketBound": "upper", - "yBucketNumber": null, - "yBucketSize": null + "yaxes": [ + { + "decimals": null, + "format": "ms", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "decimals": null, + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": 0 + } }, { - "cards": { - "cardPadding": 0, - "cardRound": 0 - }, - "color": { - "cardColor": "#FF9830", - "colorScale": "linear", - "colorScheme": "interpolateSpectral", - "exponent": 0.5, - "max": null, - "min": 0, - "mode": "spectrum" - }, - "dataFormat": "tsbuckets", + "aliasColors": {}, + "bars": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", - "description": "", + "description": "The gap between now() and the minimal (non-zero) safe ts for followers", + "editable": true, + "error": false, "fieldConfig": { - "defaults": {}, - "overrides": [] + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } + }, + "fill": 1, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" }, "gridPos": { "h": 7, - "w": 8, - "x": 8, - "y": 72 + "w": 12, + "x": 12, + "y": 7 }, - "heatmap": {}, - "hideZeroBuckets": true, - "highlightCards": true, - "id": 5266, + "height": null, + "hideTimeOverride": false, + "id": 394, + "interval": null, + 
"isNew": true, "legend": { "alignAsTable": true, "avg": false, "current": true, + "hideEmpty": true, + "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, - "sort": "current", + "sideWidth": null, + "sort": "max", "sortDesc": true, "total": false, "values": true }, + "lines": true, + "linewidth": 1, "links": [], - "reverseYBuckets": false, + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", + "options": { + "alertThreshold": true, + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "repeatDirection": null, + "seriesOverrides": [], + "span": null, + "stack": false, + "steppedLine": false, "targets": [ { - "expr": "max(rate(tikv_backup_range_size_bytes_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", cf=\"default\"}[1m])) by (le)", - "format": "heatmap", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum((\n tikv_resolved_ts_min_follower_safe_ts_gap_millis\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n \n)) by (instance) ", + "format": "time_series", + "hide": false, "instant": false, - "intervalFactor": 2, - "legendFormat": "{{le}}", - "refId": "A" + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{instance}}", + "metric": "", + "query": "sum((\n tikv_resolved_ts_min_follower_safe_ts_gap_millis\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n \n)) by (instance) ", + "refId": "", + "step": 10, + "target": "" } ], - "title": "Backup Default CF SST Size", + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Max gap of follower safe-ts", "tooltip": { - "show": true, - "showHistogram": true - }, - "tooltipDecimals": 1, - "type": "heatmap", - "xAxis": { - "show": true + "msResolution": true, + "shared": true, + "sort": 0, + "value_type": "individual" }, - 
"xBucketNumber": null, - "xBucketSize": null, - "yAxis": { - "decimals": 1, - "format": "bytes", - "logBase": 1, - "max": null, - "min": null, + "transformations": [], + "transparent": false, + "type": "graph", + "xaxis": { + "mode": "time", + "name": null, "show": true, - "splitFactor": null + "values": [] }, - "yBucketBound": "upper", - "yBucketNumber": null, - "yBucketSize": null + "yaxes": [ + { + "decimals": null, + "format": "ms", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "decimals": null, + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": 0 + } }, { "aliasColors": {}, "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", + "description": "The region that has minimal resolved ts", "editable": true, "error": false, "fieldConfig": { - "defaults": {}, - "overrides": [] + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } }, "fill": 1, - "fillGradient": 0, - "grid": {}, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, "gridPos": { "h": 7, - "w": 8, - "x": 16, - "y": 72 + "w": 12, + "x": 0, + "y": 14 }, - "hiddenSeries": false, - "id": 3929, + "height": null, + "hideTimeOverride": false, + "id": 395, + "interval": null, + "isNew": true, "legend": { "alignAsTable": true, "avg": false, "current": true, + "hideEmpty": true, + "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, + "sideWidth": null, "sort": "max", "sortDesc": true, "total": false, @@ -41397,60 +54914,55 @@ "lines": true, "linewidth": 1, "links": [], - "nullPointMode": "null", + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", "options": { - "alertThreshold": true + 
"alertThreshold": true, + "dataLinks": [] }, "percentage": false, - "pluginVersion": "7.5.7", "pointradius": 5, "points": false, "renderer": "flot", - "seriesOverrides": [ - { - "alias": "total", - "yaxis": 2 - } - ], - "spaceLength": 10, + "repeat": null, + "repeatDirection": null, + "seriesOverrides": [], + "span": null, "stack": false, "steppedLine": false, "targets": [ { - "expr": "sum(rate(tikv_backup_range_size_bytes_sum{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m]))", - "format": "time_series", - "hide": false, - "intervalFactor": 2, - "legendFormat": "total", - "metric": "", - "refId": "A", - "step": 4 - }, - { - "expr": "rate(tikv_backup_range_size_bytes_sum{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\"}[1m])", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum((\n tikv_resolved_ts_min_resolved_ts_region\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n \n)) by (instance) ", "format": "time_series", "hide": false, - "intervalFactor": 2, - "legendFormat": "{{instance}} {{cf}}", + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{instance}}", "metric": "", - "refId": "B", - "step": 4 + "query": "sum((\n tikv_resolved_ts_min_resolved_ts_region\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n \n)) by (instance) ", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, - "title": "Backup SST Generation Throughput", + "title": "Min Resolved TS Region", "tooltip": { - "msResolution": false, + "msResolution": true, "shared": true, "sort": 0, - "value_type": "cumulative" + "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -41458,7 +54970,8 @@ }, "yaxes": [ { - "format": "Bps", + "decimals": null, + 
"format": "none", "label": null, "logBase": 1, "max": null, @@ -41466,7 +54979,8 @@ "show": true }, { - "format": "Bps", + "decimals": null, + "format": "short", "label": null, "logBase": 1, "max": null, @@ -41476,283 +54990,55 @@ ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } }, - { - "cards": { - "cardPadding": 0, - "cardRound": 0 - }, - "color": { - "cardColor": "#FF9830", - "colorScale": "linear", - "colorScheme": "interpolateSpectral", - "exponent": 0.5, - "max": null, - "min": 0, - "mode": "spectrum" - }, - "dataFormat": "tsbuckets", - "datasource": "${DS_TEST-CLUSTER}", - "description": "", - "fieldConfig": { - "defaults": {}, - "overrides": [] - }, - "gridPos": { - "h": 7, - "w": 4, - "x": 0, - "y": 79 - }, - "heatmap": {}, - "hideZeroBuckets": true, - "highlightCards": true, - "id": 5597, - "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "max": true, - "min": false, - "rightSide": true, - "show": true, - "sort": "current", - "sortDesc": true, - "total": false, - "values": true - }, - "links": [], - "reverseYBuckets": false, - "targets": [ - { - "expr": "max(rate(tikv_backup_range_duration_seconds_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", type=\"snapshot\"}[1m])) by (le)", - "format": "heatmap", - "instant": false, - "intervalFactor": 2, - "legendFormat": "{{le}}", - "refId": "A" - } - ], - "title": "Backup Scan SST Duration", - "tooltip": { - "show": true, - "showHistogram": true - }, - "tooltipDecimals": 1, - "type": "heatmap", - "xAxis": { - "show": true - }, - "xBucketNumber": null, - "xBucketSize": null, - "yAxis": { - "decimals": 1, - "format": "s", - "logBase": 1, - "max": null, - "min": null, - "show": true, - "splitFactor": null - }, - "yBucketBound": "upper", - "yBucketNumber": null, - "yBucketSize": null - }, - { - "cards": { - "cardPadding": 0, - "cardRound": 0 - }, - "color": { - "cardColor": "#FF9830", - "colorScale": "linear", - "colorScheme": 
"interpolateSpectral", - "exponent": 0.5, - "max": null, - "min": 0, - "mode": "spectrum" - }, - "dataFormat": "tsbuckets", - "datasource": "${DS_TEST-CLUSTER}", - "description": "", - "fieldConfig": { - "defaults": {}, - "overrides": [] - }, - "gridPos": { - "h": 7, - "w": 6, - "x": 4, - "y": 79 - }, - "heatmap": {}, - "hideZeroBuckets": true, - "highlightCards": true, - "id": 3931, - "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "max": true, - "min": false, - "rightSide": true, - "show": true, - "sort": "current", - "sortDesc": true, - "total": false, - "values": true - }, - "links": [], - "reverseYBuckets": false, - "targets": [ - { - "expr": "max(rate(tikv_backup_range_duration_seconds_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", type=\"scan\"}[1m])) by (le)", - "format": "heatmap", - "instant": false, - "intervalFactor": 2, - "legendFormat": "{{le}}", - "refId": "A" - } - ], - "title": "Backup Scan SST Duration", - "tooltip": { - "show": true, - "showHistogram": true - }, - "tooltipDecimals": 1, - "type": "heatmap", - "xAxis": { - "show": true - }, - "xBucketNumber": null, - "xBucketSize": null, - "yAxis": { - "decimals": 1, - "format": "s", - "logBase": 1, - "max": null, - "min": null, - "show": true, - "splitFactor": null - }, - "yBucketBound": "upper", - "yBucketNumber": null, - "yBucketSize": null - }, - { - "cards": { - "cardPadding": 0, - "cardRound": 0 - }, - "color": { - "cardColor": "#FF9830", - "colorScale": "linear", - "colorScheme": "interpolateSpectral", - "exponent": 0.5, - "max": null, - "min": 0, - "mode": "spectrum" - }, - "dataFormat": "tsbuckets", - "datasource": "${DS_TEST-CLUSTER}", - "description": "", - "fieldConfig": { - "defaults": {}, - "overrides": [] - }, - "gridPos": { - "h": 7, - "w": 6, - "x": 10, - "y": 79 - }, - "heatmap": {}, - "hideZeroBuckets": true, - "highlightCards": true, - "id": 6905, - "legend": { - "alignAsTable": true, - "avg": false, - 
"current": true, - "max": true, - "min": false, - "rightSide": true, - "show": true, - "sort": "current", - "sortDesc": true, - "total": false, - "values": true - }, - "links": [], - "reverseYBuckets": false, - "targets": [ - { - "expr": "max(rate(tikv_backup_range_duration_seconds_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", type=~\"save.*\"}[1m])) by (le)", - "format": "heatmap", - "instant": false, - "intervalFactor": 2, - "legendFormat": "{{le}}", - "refId": "A" - } - ], - "title": "Backup Save SST Duration", - "tooltip": { - "show": true, - "showHistogram": true - }, - "tooltipDecimals": 1, - "type": "heatmap", - "xAxis": { - "show": true - }, - "xBucketNumber": null, - "xBucketSize": null, - "yAxis": { - "decimals": 1, - "format": "s", - "logBase": 1, - "max": null, - "min": null, - "show": true, - "splitFactor": null - }, - "yBucketBound": "upper", - "yBucketNumber": null, - "yBucketSize": null - }, { "aliasColors": {}, "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", + "description": "The region id of the follower that has minimal safe ts", "editable": true, "error": false, "fieldConfig": { - "defaults": {}, - "overrides": [] + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } }, "fill": 1, - "fillGradient": 0, - "grid": {}, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, "gridPos": { "h": 7, - "w": 8, - "x": 16, - "y": 79 + "w": 12, + "x": 12, + "y": 14 }, - "hiddenSeries": false, - "id": 3928, + "height": null, + "hideTimeOverride": false, + "id": 396, + "interval": null, + "isNew": true, "legend": { "alignAsTable": true, "avg": false, "current": true, + "hideEmpty": true, + "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, + "sideWidth": null, "sort": "max", 
"sortDesc": true, "total": false, @@ -41761,60 +55047,55 @@ "lines": true, "linewidth": 1, "links": [], - "nullPointMode": "null", + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", "options": { - "alertThreshold": true + "alertThreshold": true, + "dataLinks": [] }, "percentage": false, - "pluginVersion": "7.5.7", "pointradius": 5, "points": false, "renderer": "flot", + "repeat": null, + "repeatDirection": null, "seriesOverrides": [], - "spaceLength": 10, + "span": null, "stack": false, "steppedLine": false, "targets": [ { - "expr": "histogram_quantile(0.99, sum(rate(tikv_backup_range_duration_seconds_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])) by (le, type))", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum((\n tikv_resolved_ts_min_follower_safe_ts_region\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n \n)) by (instance) ", "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{type}} - 99%", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{instance}}", "metric": "", - "refId": "A", - "step": 4 - }, - { - "expr": "histogram_quantile(0.95, sum(rate(tikv_backup_range_duration_seconds_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])) by (le, type))", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{type}} - 95%", - "refId": "B", - "step": 4 - }, - { - "expr": "sum(rate(tikv_backup_range_duration_seconds_sum{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])) by (type) / sum(rate(tikv_backup_range_duration_seconds_count{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])) by (type)", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{type}} - avg", - "refId": "C", - "step": 4 + "query": 
"sum((\n tikv_resolved_ts_min_follower_safe_ts_region\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n \n)) by (instance) ", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, - "title": "Backup SST Duration", + "title": "Min Safe TS Follower Region", "tooltip": { - "msResolution": false, + "msResolution": true, "shared": true, "sort": 0, - "value_type": "cumulative" + "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -41822,7 +55103,8 @@ }, "yaxes": [ { - "format": "s", + "decimals": null, + "format": "none", "label": null, "logBase": 1, "max": null, @@ -41830,6 +55112,7 @@ "show": true }, { + "decimals": null, "format": "short", "label": null, "logBase": 1, @@ -41840,85 +55123,108 @@ ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } }, { + "cacheTimeout": null, "cards": { - "cardPadding": 0, - "cardRound": 0 + "cardPadding": null, + "cardRound": null }, "color": { - "cardColor": "#FF9830", - "colorScale": "linear", + "cardColor": "#b4ff00", + "colorScale": "sqrt", "colorScheme": "interpolateSpectral", "exponent": 0.5, "max": null, - "min": 0, + "min": null, "mode": "spectrum" }, "dataFormat": "tsbuckets", "datasource": "${DS_TEST-CLUSTER}", - "description": "", + "description": "The time consumed when handle a check leader request", + "editable": true, + "error": false, "fieldConfig": { - "defaults": {}, - "overrides": [] + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } }, "gridPos": { "h": 7, "w": 12, "x": 0, - "y": 86 + "y": 21 }, "heatmap": {}, + "height": null, + "hideTimeOverride": false, "hideZeroBuckets": true, "highlightCards": true, - "id": 3930, + "id": 397, + "interval": null, "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "max": true, - "min": 
false, - "rightSide": true, - "show": true, - "sort": "current", - "sortDesc": true, - "total": false, - "values": true + "show": false }, "links": [], + "maxDataPoints": 512, + "maxPerRow": null, + "minSpan": null, + "options": {}, + "repeat": null, + "repeatDirection": null, "reverseYBuckets": false, + "span": null, "targets": [ { - "expr": "max(rate(tikv_external_storage_create_seconds_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])) by (le)", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_resolved_ts_check_leader_duration_seconds_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (le) ", "format": "heatmap", + "hide": false, "instant": false, - "intervalFactor": 2, + "interval": "", + "intervalFactor": 1, "legendFormat": "{{le}}", - "refId": "A" + "metric": "", + "query": "sum(rate(\n tikv_resolved_ts_check_leader_duration_seconds_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (le) ", + "refId": "", + "step": 10, + "target": "" } ], - "title": "External Storage Create Duration", + "timeFrom": null, + "timeShift": null, + "title": "Check leader duration", "tooltip": { - "show": true, - "showHistogram": true + "msResolution": true, + "shared": true, + "showHistogram": true, + "sort": 0, + "value_type": "individual" }, - "tooltipDecimals": 1, + "transformations": [], + "transparent": false, "type": "heatmap", "xAxis": { - "show": true + "mode": "time", + "name": null, + "show": true, + "values": [] }, "xBucketNumber": null, "xBucketSize": null, "yAxis": { "decimals": 1, "format": "s", + "label": null, "logBase": 1, "max": null, "min": null, - "show": true, - "splitFactor": null + "show": true }, "yBucketBound": "upper", "yBucketNumber": null, @@ -41927,37 +55233,50 @@ { "aliasColors": {}, "bars": false, - "dashLength": 10, - "dashes": false, + 
"cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", - "decimals": 1, - "description": "", + "description": "The gap between resolved ts of leaders and current time", "editable": true, "error": false, "fieldConfig": { - "defaults": {}, - "overrides": [] + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } }, "fill": 1, - "grid": {}, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, "gridPos": { "h": 7, "w": 12, "x": 12, - "y": 86 + "y": 21 }, - "id": 4936, + "height": null, + "hideTimeOverride": false, + "id": 398, + "interval": null, + "isNew": true, "legend": { "alignAsTable": true, "avg": false, "current": true, - "hideEmpty": false, + "hideEmpty": true, + "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, - "sort": "current", + "sort": "max", "sortDesc": true, "total": false, "values": true @@ -41965,47 +55284,55 @@ "lines": true, "linewidth": 1, "links": [], + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, "nullPointMode": "null as zero", + "options": { + "alertThreshold": true, + "dataLinks": [] + }, "percentage": false, - "pluginVersion": "7.5.7", "pointradius": 5, "points": false, "renderer": "flot", + "repeat": null, + "repeatDirection": null, "seriesOverrides": [], - "spaceLength": 10, + "span": null, "stack": false, "steppedLine": false, "targets": [ { - "expr": "histogram_quantile(1, sum(rate(tikv_external_storage_create_seconds_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])) by (le,type))", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{type}}-100%", - "refId": "E" - }, - { - "expr": "histogram_quantile(0.99, sum(rate(tikv_external_storage_create_seconds_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])) by 
(le,type))", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum((\n tikv_resolved_ts_min_leader_resolved_ts_gap_millis\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n \n)) by (instance) ", "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{type}}-99%", - "refId": "A", - "step": 4 + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{instance}}", + "metric": "", + "query": "sum((\n tikv_resolved_ts_min_leader_resolved_ts_gap_millis\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n \n)) by (instance) ", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, - "title": "External Storage Create Duration", + "title": "Max gap of resolved-ts in region leaders", "tooltip": { - "msResolution": false, + "msResolution": true, "shared": true, - "sort": 1, - "value_type": "cumulative" + "sort": 0, + "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -42014,7 +55341,7 @@ "yaxes": [ { "decimals": null, - "format": "s", + "format": "ms", "label": null, "logBase": 1, "max": null, @@ -42022,53 +55349,67 @@ "show": true }, { + "decimals": null, "format": "short", "label": null, "logBase": 1, "max": null, "min": null, - "show": false + "show": true } ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } }, { "aliasColors": {}, "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", - "decimals": 1, - "description": "", + "description": "Bucketed histogram of region count in a check leader request", "editable": true, "error": false, "fieldConfig": { - "defaults": {}, - "overrides": [] + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } }, "fill": 1, - 
"grid": {}, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, "gridPos": { "h": 7, "w": 12, "x": 0, - "y": 93 + "y": 28 }, - "id": 5267, + "height": null, + "hideTimeOverride": false, + "id": 399, + "interval": null, + "isNew": true, "legend": { "alignAsTable": true, "avg": false, "current": true, - "hideEmpty": false, + "hideEmpty": true, + "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, - "sort": "current", + "sort": "max", "sortDesc": true, "total": false, "values": true @@ -42076,47 +55417,55 @@ "lines": true, "linewidth": 1, "links": [], + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, "nullPointMode": "null as zero", + "options": { + "alertThreshold": true, + "dataLinks": [] + }, "percentage": false, - "pluginVersion": "7.5.7", "pointradius": 5, "points": false, "renderer": "flot", + "repeat": null, + "repeatDirection": null, "seriesOverrides": [], - "spaceLength": 10, + "span": null, "stack": false, "steppedLine": false, "targets": [ { - "expr": "histogram_quantile(1, sum(rate(tikv_coprocessor_request_duration_seconds_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", req=~\"analyze.*|checksum.*\"}[1m])) by (le,req))", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{req}}-100%", - "refId": "E" - }, - { - "expr": "histogram_quantile(0.99, sum(rate(tikv_coprocessor_request_duration_seconds_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", req=~\"analyze.*|checksum.*\"}[1m])) by (le,req))", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "histogram_quantile(0.99,(\n sum(rate(\n tikv_check_leader_request_item_count_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (instance, le) \n \n \n)) 
", "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{req}}-99%", - "refId": "A", - "step": 4 + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{instance}}", + "metric": "", + "query": "histogram_quantile(0.99,(\n sum(rate(\n tikv_check_leader_request_item_count_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (instance, le) \n \n \n)) ", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, - "title": "Checksum Request Duration", + "title": "99% CheckLeader request region count", "tooltip": { - "msResolution": false, + "msResolution": true, "shared": true, - "sort": 1, - "value_type": "cumulative" + "sort": 0, + "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -42125,7 +55474,7 @@ "yaxes": [ { "decimals": null, - "format": "s", + "format": "none", "label": null, "logBase": 1, "max": null, @@ -42133,94 +55482,227 @@ "show": true }, { + "decimals": null, "format": "short", "label": null, "logBase": 1, "max": null, "min": null, - "show": false + "show": true } ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } }, { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, + "cards": { + "cardPadding": null, + "cardRound": null + }, + "color": { + "cardColor": "#b4ff00", + "colorScale": "sqrt", + "colorScheme": "interpolateSpectral", + "exponent": 0.5, + "max": null, + "min": null, + "mode": "spectrum" + }, + "dataFormat": "tsbuckets", "datasource": "${DS_TEST-CLUSTER}", - "decimals": 1, + "description": "The backoff duration before starting initial scan", "editable": true, "error": false, "fieldConfig": { - "defaults": {}, - "overrides": [] + "defaults": { + "thresholds": { + 
"mode": "absolute", + "steps": [] + } + } }, - "fill": 0, - "grid": {}, "gridPos": { "h": 7, "w": 12, "x": 12, - "y": 93 + "y": 28 }, - "id": 5269, + "heatmap": {}, + "height": null, + "hideTimeOverride": false, + "hideZeroBuckets": true, + "highlightCards": true, + "id": 400, + "interval": null, "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "hideEmpty": true, - "max": true, - "min": false, - "rightSide": true, - "show": true, - "sideWidth": null, - "sort": "current", - "sortDesc": true, - "total": false, - "values": true + "show": false }, - "lines": false, + "links": [], + "maxDataPoints": 512, + "maxPerRow": null, + "minSpan": null, + "options": {}, + "repeat": null, + "repeatDirection": null, + "reverseYBuckets": false, + "span": null, + "targets": [ + { + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_resolved_ts_initial_scan_backoff_duration_seconds_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (le) ", + "format": "heatmap", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{le}}", + "metric": "", + "query": "sum(rate(\n tikv_resolved_ts_initial_scan_backoff_duration_seconds_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (le) ", + "refId": "", + "step": 10, + "target": "" + } + ], + "timeFrom": null, + "timeShift": null, + "title": "Initial scan backoff duration", + "tooltip": { + "msResolution": true, + "shared": true, + "showHistogram": true, + "sort": 0, + "value_type": "individual" + }, + "transformations": [], + "transparent": false, + "type": "heatmap", + "xAxis": { + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "xBucketNumber": null, + "xBucketSize": null, + "yAxis": { + "decimals": 1, + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + 
"yBucketBound": "upper", + "yBucketNumber": null, + "yBucketSize": null + }, + { + "aliasColors": {}, + "bars": false, + "cacheTimeout": null, + "datasource": "${DS_TEST-CLUSTER}", + "description": "Total bytes in memory of resolved-ts observe regions's lock heap", + "editable": true, + "error": false, + "fieldConfig": { + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } + }, + "fill": 1, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, + "gridPos": { + "h": 7, + "w": 12, + "x": 0, + "y": 35 + }, + "height": null, + "hideTimeOverride": false, + "id": 401, + "interval": null, + "isNew": true, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": true, + "hideZero": true, + "max": true, + "min": false, + "rightSide": true, + "show": true, + "sideWidth": null, + "sort": "max", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, "linewidth": 1, "links": [], - "nullPointMode": "null", + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", + "options": { + "alertThreshold": true, + "dataLinks": [] + }, "percentage": false, - "pluginVersion": "7.5.7", - "pointradius": 2, - "points": true, + "pointradius": 5, + "points": false, "renderer": "flot", + "repeat": null, + "repeatDirection": null, "seriesOverrides": [], - "spaceLength": 10, + "span": null, "stack": false, "steppedLine": false, "targets": [ { - "expr": "rate(node_disk_io_time_seconds_total{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\"}[1m])", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "avg((\n tikv_resolved_ts_lock_heap_bytes\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n \n)) by (instance) ", "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{instance}} - {{device}}", - 
"metric": "tikv_thread_cpu_seconds_total", - "refId": "A", - "step": 4 + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{instance}}", + "metric": "", + "query": "avg((\n tikv_resolved_ts_lock_heap_bytes\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n \n)) by (instance) ", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, - "title": "IO Utilization", + "title": "Lock heap size", "tooltip": { - "msResolution": false, + "msResolution": true, "shared": true, "sort": 0, "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -42228,7 +55710,8 @@ }, "yaxes": [ { - "format": "percentunit", + "decimals": null, + "format": "bytes", "label": null, "logBase": 1, "max": null, @@ -42236,6 +55719,7 @@ "show": true }, { + "decimals": null, "format": "short", "label": null, "logBase": 1, @@ -42246,42 +55730,56 @@ ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } }, { "aliasColors": {}, "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", - "decimals": 1, + "description": "The region that its leader has minimal resolved ts.", "editable": true, "error": false, "fieldConfig": { - "defaults": {}, - "overrides": [] + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } + }, + "fill": 1, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" }, - "fill": 0, - "grid": {}, "gridPos": { "h": 7, "w": 12, - "x": 0, - "y": 100 + "x": 12, + "y": 35 }, - "id": 5925, + "height": null, + "hideTimeOverride": false, + "id": 402, + "interval": null, + "isNew": true, "legend": { "alignAsTable": true, 
"avg": false, "current": true, "hideEmpty": true, + "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, - "sort": "current", + "sort": "max", "sortDesc": true, "total": false, "values": true @@ -42289,67 +55787,55 @@ "lines": true, "linewidth": 1, "links": [], - "nullPointMode": "null", + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", + "options": { + "alertThreshold": true, + "dataLinks": [] + }, "percentage": false, - "pluginVersion": "7.5.7", "pointradius": 5, "points": false, "renderer": "flot", - "seriesOverrides": [ - { - "alias": "/import-count.*/", - "yaxis": 2 - } - ], - "spaceLength": 10, + "repeat": null, + "repeatDirection": null, + "seriesOverrides": [], + "span": null, "stack": false, "steppedLine": false, "targets": [ { - "expr": "sum(rate(tikv_thread_cpu_seconds_total{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", name=~\"sst_.*\"}[1m])) by (instance)", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum((\n tikv_resolved_ts_min_leader_resolved_ts_region\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n \n)) by (instance) ", "format": "time_series", "hide": false, - "intervalFactor": 2, - "legendFormat": "import-{{instance}}", - "metric": "tikv_thread_cpu_seconds_total", - "refId": "A", - "step": 4 - }, - { - "expr": "sum(rate(tikv_thread_cpu_seconds_total{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", name=~\"sst_.*\"}[1m])) by (instance, tid) > 0", - "format": "time_series", - "hide": true, - "intervalFactor": 2, - "legendFormat": "backup-{{instance}}-{{tid}}", - "metric": "tikv_thread_cpu_seconds_total", - "refId": "C", - "step": 4 - }, - { - "expr": "count(rate(tikv_thread_cpu_seconds_total{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", name=~\"sst_.*\"}[1m])) by (instance)", - "format": 
"time_series", - "hide": true, - "intervalFactor": 2, - "legendFormat": "import-count-{{instance}}", - "metric": "tikv_thread_cpu_seconds_total", - "refId": "D", - "step": 4 + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{instance}}", + "metric": "", + "query": "sum((\n tikv_resolved_ts_min_leader_resolved_ts_region\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n \n)) by (instance) ", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, - "title": "Import CPU Utilization", + "title": "Min Leader Resolved TS Region", "tooltip": { - "msResolution": false, + "msResolution": true, "shared": true, "sort": 0, "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -42357,7 +55843,8 @@ }, "yaxes": [ { - "format": "percentunit", + "decimals": null, + "format": "none", "label": null, "logBase": 1, "max": null, @@ -42365,6 +55852,7 @@ "show": true }, { + "decimals": null, "format": "short", "label": null, "logBase": 1, @@ -42375,145 +55863,112 @@ ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } }, - { - "columns": [ - { - "text": "Current", - "value": "current" - } - ], - "datasource": "${DS_TEST-CLUSTER}", - "fieldConfig": { - "defaults": {}, - "overrides": [] - }, - "fontSize": "100%", - "gridPos": { - "h": 7, - "w": 4, - "x": 12, - "y": 100 - }, - "id": 5926, - "links": [], - "pageSize": null, - "scroll": true, - "showHeader": true, - "sort": { - "col": 0, - "desc": true - }, - "styles": [ - { - "alias": "Time", - "align": "auto", - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "pattern": "Time", - "type": "date" - }, - { - "alias": "", - "align": "auto", - "colorMode": null, - "colors": [ - "rgba(245, 54, 54, 0.9)", - "rgba(237, 129, 40, 0.89)", - "rgba(50, 172, 45, 0.97)" - ], - "decimals": 2, - 
"pattern": "/.*/", - "thresholds": [], - "type": "number", - "unit": "short" - } - ], - "targets": [ - { - "expr": "count(rate(tikv_thread_cpu_seconds_total{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", name=~\"sst_.*\"}[1m])) by (instance)", - "format": "time_series", - "interval": "", - "intervalFactor": 1, - "legendFormat": "{{instance}}", - "refId": "A" - } - ], - "timeFrom": null, - "timeShift": null, - "title": "Import Thread Count", - "transform": "timeseries_aggregations", - "type": "table" - }, { "aliasColors": {}, "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", - "decimals": 2, - "description": "", + "description": "The status of resolved-ts observe regions", + "editable": true, + "error": false, "fieldConfig": { - "defaults": {}, - "overrides": [] + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } }, "fill": 1, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, "gridPos": { "h": 7, - "w": 8, - "x": 16, - "y": 100 + "w": 12, + "x": 0, + "y": 42 }, - "id": 5932, + "height": null, + "hideTimeOverride": false, + "id": 403, + "interval": null, + "isNew": true, "legend": { "alignAsTable": true, "avg": false, "current": true, + "hideEmpty": true, + "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, + "sort": "max", + "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, "nullPointMode": "null as zero", + "options": { + "alertThreshold": true, + "dataLinks": [] + }, "percentage": false, - "pluginVersion": "7.5.7", - "pointradius": 2, + "pointradius": 5, "points": false, "renderer": "flot", + "repeat": null, + "repeatDirection": null, "seriesOverrides": 
[], - "spaceLength": 10, + "span": null, "stack": false, "steppedLine": false, "targets": [ { - "expr": "delta(tikv_import_error_counter{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum((\n tikv_resolved_ts_region_resolve_status\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n \n)) by (type) ", "format": "time_series", "hide": false, + "instant": false, + "interval": "", "intervalFactor": 1, - "legendFormat": "{{type}} {{error}} {{instance}}", - "refId": "D" + "legendFormat": "{{type}}", + "metric": "", + "query": "sum((\n tikv_resolved_ts_region_resolve_status\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n \n)) by (type) ", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, - "title": "Import Errors", + "title": "Observe region status", "tooltip": { + "msResolution": true, "shared": true, "sort": 0, "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -42521,100 +55976,147 @@ }, "yaxes": [ { + "decimals": null, "format": "none", "label": null, "logBase": 1, "max": null, - "min": "0", + "min": null, "show": true }, { - "format": "none", + "decimals": null, + "format": "short", "label": null, "logBase": 1, "max": null, - "min": "0", + "min": null, "show": true } ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } }, { "aliasColors": {}, "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", + "description": "The count of fail to advance resolved-ts", + "editable": true, + "error": false, "fieldConfig": { - "defaults": {}, - "overrides": [] + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } }, 
"fill": 1, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, "gridPos": { "h": 7, "w": 12, - "x": 0, - "y": 107 + "x": 12, + "y": 42 }, - "id": 5931, + "height": null, + "hideTimeOverride": false, + "id": 404, + "interval": null, + "isNew": true, "legend": { "alignAsTable": true, "avg": false, "current": true, + "hideEmpty": true, + "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, + "sideWidth": null, + "sort": "max", + "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], - "nullPointMode": "null", + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", + "options": { + "alertThreshold": true, + "dataLinks": [] + }, "percentage": false, - "pluginVersion": "7.5.7", - "pointradius": 2, + "pointradius": 5, "points": false, "renderer": "flot", + "repeat": null, + "repeatDirection": null, "seriesOverrides": [], - "spaceLength": 10, + "span": null, "stack": false, "steppedLine": false, "targets": [ { - "expr": "histogram_quantile(0.99, avg(rate(tikv_import_rpc_duration_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])) by (le, request))", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(delta(\n tikv_resolved_ts_fail_advance_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (instance, reason) ", "format": "time_series", "hide": false, + "instant": false, + "interval": "", "intervalFactor": 1, - "legendFormat": "{{request}}-99%", - "refId": "A" + "legendFormat": "{{instance}}-{{reason}}", + "metric": "", + "query": "sum(delta(\n tikv_resolved_ts_fail_advance_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (instance, reason) ", + 
"refId": "", + "step": 10, + "target": "" }, { - "expr": "histogram_quantile(0.5, sum(rate(tikv_import_rpc_duration_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])) by (le, request, instance))", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(delta(\n tikv_raftstore_check_stale_peer\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (instance) ", "format": "time_series", - "hide": true, + "hide": false, + "instant": false, + "interval": "", "intervalFactor": 1, - "legendFormat": "{{instance}}-{{request}}-50%", - "refId": "B" + "legendFormat": "{{instance}}-stale-peer", + "metric": "", + "query": "sum(delta(\n tikv_raftstore_check_stale_peer\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (instance) ", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, - "title": "Import RPC Duration", + "title": "Fail advance ts count", "tooltip": { + "msResolution": true, "shared": true, "sort": 0, "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -42622,7 +56124,8 @@ }, "yaxes": [ { - "format": "s", + "decimals": null, + "format": "none", "label": null, "logBase": 1, "max": null, @@ -42630,6 +56133,7 @@ "show": true }, { + "decimals": null, "format": "short", "label": null, "logBase": 1, @@ -42640,39 +56144,56 @@ ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } }, { "aliasColors": {}, "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", - "decimals": 1, - "description": "", + "description": "Bucketed histogram of the check leader request size", + "editable": true, + "error": false, "fieldConfig": { - "defaults": {}, - 
"overrides": [] + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } }, "fill": 1, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, "gridPos": { "h": 7, "w": 12, - "x": 12, - "y": 107 + "x": 0, + "y": 49 }, - "id": 6267, + "height": null, + "hideTimeOverride": false, + "id": 405, + "interval": null, + "isNew": true, "legend": { "alignAsTable": true, "avg": false, "current": true, + "hideEmpty": true, + "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, - "sort": "current", + "sort": "max", "sortDesc": true, "total": false, "values": true @@ -42680,51 +56201,70 @@ "lines": true, "linewidth": 1, "links": [], - "nullPointMode": "null", + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", + "options": { + "alertThreshold": true, + "dataLinks": [] + }, "percentage": false, - "pluginVersion": "7.5.7", "pointradius": 5, "points": false, "renderer": "flot", + "repeat": null, + "repeatDirection": null, "seriesOverrides": [], - "spaceLength": 10, + "span": null, "stack": false, "steppedLine": false, "targets": [ { - "expr": "sum(rate(tikv_import_rpc_duration_count{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", request!=\"switch_mode\"}[30s])) by (instance, request)", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "histogram_quantile(0.99,(\n sum(rate(\n tikv_check_leader_request_size_bytes_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (instance, le) \n \n \n)) ", "format": "time_series", "hide": false, + "instant": false, + "interval": "", "intervalFactor": 1, - "legendFormat": "{{instance}} - {{request}}", - "metric": "tikv_grpc_msg_duration_seconds_bucket", - "refId": "A", - "step": 10 + "legendFormat": 
"{{instance}}", + "metric": "", + "query": "histogram_quantile(0.99,(\n sum(rate(\n tikv_check_leader_request_size_bytes_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (instance, le) \n \n \n)) ", + "refId": "", + "step": 10, + "target": "" }, { - "expr": "sum(rate(tikv_import_rpc_duration_count{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", request!=\"switch_mode\"}[30s])) by (request)", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "histogram_quantile(0.99,(\n sum(rate(\n tikv_check_leader_request_item_count_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (instance, le) \n \n \n)) ", "format": "time_series", - "hide": true, + "hide": false, + "instant": false, + "interval": "", "intervalFactor": 1, - "legendFormat": "total - {{request}}", - "metric": "tikv_grpc_msg_duration_seconds_bucket", - "refId": "B", - "step": 10 + "legendFormat": "{{instance}}-check-num", + "metric": "", + "query": "histogram_quantile(0.99,(\n sum(rate(\n tikv_check_leader_request_item_count_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (instance, le) \n \n \n)) ", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, - "title": "Import RPC Ops", + "title": "99% CheckLeader request size", "tooltip": { + "msResolution": true, "shared": true, "sort": 0, "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -42732,7 +56272,8 @@ }, "yaxes": [ { - "format": "ops", + "decimals": null, + "format": "bytes", "label": null, "logBase": 1, "max": null, @@ -42740,6 +56281,7 @@ "show": true }, { + "decimals": null, "format": "short", "label": null, 
"logBase": 1, @@ -42750,652 +56292,420 @@ ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } }, { - "cards": { - "cardPadding": 0, - "cardRound": 0 - }, - "color": { - "cardColor": "#FF9830", - "colorScale": "linear", - "colorScheme": "interpolateSpectral", - "exponent": 0.5, - "max": null, - "min": 0, - "mode": "spectrum" - }, - "dataFormat": "tsbuckets", + "aliasColors": {}, + "bars": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", - "description": "", + "description": "Total bytes of pending commands in the channel", + "editable": true, + "error": false, "fieldConfig": { - "defaults": {}, - "overrides": [] + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } + }, + "fill": 1, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" }, "gridPos": { "h": 7, - "w": 6, - "x": 0, - "y": 114 + "w": 12, + "x": 12, + "y": 49 }, - "heatmap": {}, - "hideZeroBuckets": true, - "highlightCards": true, - "id": 5930, + "height": null, + "hideTimeOverride": false, + "id": 406, + "interval": null, + "isNew": true, "legend": { "alignAsTable": true, "avg": false, "current": true, + "hideEmpty": true, + "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, - "sort": "current", + "sideWidth": null, + "sort": "max", "sortDesc": true, "total": false, "values": true }, + "lines": true, + "linewidth": 1, "links": [], - "reverseYBuckets": false, + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", + "options": { + "alertThreshold": true, + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "repeatDirection": null, + "seriesOverrides": [], + "span": null, + "stack": false, + "steppedLine": false, "targets": [ { - "expr": 
"max(rate(tikv_import_rpc_duration_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", request=~\"download|write\"}[1m])) by (le)", - "format": "heatmap", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "avg((\n tikv_resolved_ts_channel_penging_cmd_bytes_total\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n \n)) by (instance) ", + "format": "time_series", + "hide": false, "instant": false, - "intervalFactor": 2, - "legendFormat": "{{le}}", - "refId": "A" + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{instance}}", + "metric": "", + "query": "avg((\n tikv_resolved_ts_channel_penging_cmd_bytes_total\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n \n)) by (instance) ", + "refId": "", + "step": 10, + "target": "" } ], - "title": "Import Write/Download RPC Duration", + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Pending command size", "tooltip": { - "show": true, - "showHistogram": true - }, - "tooltipDecimals": 1, - "type": "heatmap", - "xAxis": { - "show": true + "msResolution": true, + "shared": true, + "sort": 0, + "value_type": "individual" }, - "xBucketNumber": null, - "xBucketSize": null, - "yAxis": { - "decimals": 1, - "format": "s", - "logBase": 1, - "max": null, - "min": null, + "transformations": [], + "transparent": false, + "type": "graph", + "xaxis": { + "mode": "time", + "name": null, "show": true, - "splitFactor": null + "values": [] }, - "yBucketBound": "upper", - "yBucketNumber": null, - "yBucketSize": null - }, + "yaxes": [ + { + "decimals": null, + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "decimals": null, + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": 0 + } + } + ], + "repeat": null, + "repeatDirection": null, + 
"span": null, + "targets": [], + "timeFrom": null, + "timeShift": null, + "title": "Resolved TS", + "transformations": [], + "transparent": false, + "type": "row" + }, + { + "cacheTimeout": null, + "collapsed": true, + "datasource": null, + "description": null, + "editable": true, + "error": false, + "fieldConfig": { + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } + }, + "gridPos": { + "h": 7, + "w": 24, + "x": 0, + "y": 0 + }, + "height": null, + "hideTimeOverride": false, + "id": 407, + "interval": null, + "links": [], + "maxDataPoints": 100, + "maxPerRow": null, + "minSpan": null, + "panels": [ { - "cards": { - "cardPadding": 0, - "cardRound": 0 - }, - "color": { - "cardColor": "#FF9830", - "colorScale": "linear", - "colorScheme": "interpolateSpectral", - "exponent": 0.5, - "max": null, - "min": 0, - "mode": "spectrum" - }, - "dataFormat": "tsbuckets", + "aliasColors": {}, + "bars": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", - "description": "", + "description": null, + "editable": true, + "error": false, "fieldConfig": { - "defaults": {}, - "overrides": [] + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } + }, + "fill": 1, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" }, "gridPos": { "h": 7, - "w": 6, - "x": 6, - "y": 114 + "w": 12, + "x": 0, + "y": 0 }, - "heatmap": {}, - "hideZeroBuckets": true, - "highlightCards": true, - "id": 5929, + "height": null, + "hideTimeOverride": false, + "id": 408, + "interval": null, + "isNew": true, "legend": { "alignAsTable": true, "avg": false, "current": true, + "hideEmpty": true, + "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, - "sort": "current", + "sideWidth": null, + "sort": "max", "sortDesc": true, "total": false, "values": true }, + "lines": true, + "linewidth": 1, "links": [], - 
"reverseYBuckets": false, + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", + "options": { + "alertThreshold": true, + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "repeatDirection": null, + "seriesOverrides": [], + "span": null, + "stack": false, + "steppedLine": false, "targets": [ { - "expr": "max(rate(tikv_import_download_duration_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", type=\"queue\"}[1m])) by (le)", - "format": "heatmap", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum((\n tikv_allocator_stats\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n \n)) by (instance, type) ", + "format": "time_series", + "hide": false, "instant": false, - "intervalFactor": 2, - "legendFormat": "{{le}}", - "refId": "A" + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{instance}}-{{type}}", + "metric": "", + "query": "sum((\n tikv_allocator_stats\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n \n)) by (instance, type) ", + "refId": "", + "step": 10, + "target": "" } ], - "title": "Import Wait Duration", - "tooltip": { - "show": true, - "showHistogram": true - }, - "tooltipDecimals": 1, - "type": "heatmap", - "xAxis": { - "show": true - }, - "xBucketNumber": null, - "xBucketSize": null, - "yAxis": { - "decimals": 1, - "format": "s", - "logBase": 1, - "max": null, - "min": null, - "show": true, - "splitFactor": null - }, - "yBucketBound": "upper", - "yBucketNumber": null, - "yBucketSize": null - }, - { - "cards": { - "cardPadding": 0, - "cardRound": 0 - }, - "color": { - "cardColor": "#FF9830", - "colorScale": "linear", - "colorScheme": "interpolateSpectral", - "exponent": 0.5, - "max": null, - "min": 0, - "mode": "spectrum" - }, - "dataFormat": "tsbuckets", - "datasource": "${DS_TEST-CLUSTER}", - 
"description": "", - "fieldConfig": { - "defaults": {}, - "overrides": [] - }, - "gridPos": { - "h": 7, - "w": 6, - "x": 12, - "y": 114 - }, - "heatmap": {}, - "hideZeroBuckets": true, - "highlightCards": true, - "id": 6906, - "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "max": true, - "min": false, - "rightSide": true, - "show": true, - "sort": "current", - "sortDesc": true, - "total": false, - "values": true - }, - "links": [], - "reverseYBuckets": false, - "targets": [ - { - "expr": "max(rate(tikv_import_download_duration_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", type=\"read\"}[1m])) by (le)", - "format": "heatmap", - "instant": false, - "intervalFactor": 2, - "legendFormat": "{{le}}", - "refId": "A" - } - ], - "title": "Import Read SST Duration", + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Allocator Stats", "tooltip": { - "show": true, - "showHistogram": true - }, - "tooltipDecimals": 1, - "type": "heatmap", - "xAxis": { - "show": true - }, - "xBucketNumber": null, - "xBucketSize": null, - "yAxis": { - "decimals": 1, - "format": "s", - "logBase": 1, - "max": null, - "min": null, - "show": true, - "splitFactor": null - }, - "yBucketBound": "upper", - "yBucketNumber": null, - "yBucketSize": null - }, - { - "cards": { - "cardPadding": 0, - "cardRound": 0 - }, - "color": { - "cardColor": "#FF9830", - "colorScale": "linear", - "colorScheme": "interpolateSpectral", - "exponent": 0.5, - "max": null, - "min": 0, - "mode": "spectrum" - }, - "dataFormat": "tsbuckets", - "datasource": "${DS_TEST-CLUSTER}", - "description": "", - "fieldConfig": { - "defaults": {}, - "overrides": [] - }, - "gridPos": { - "h": 7, - "w": 6, - "x": 18, - "y": 114 + "msResolution": true, + "shared": true, + "sort": 0, + "value_type": "individual" }, - "heatmap": {}, - "hideZeroBuckets": true, - "highlightCards": true, - "id": 5928, - "legend": { - "alignAsTable": true, - "avg": false, - 
"current": true, - "max": true, - "min": false, - "rightSide": true, + "transformations": [], + "transparent": false, + "type": "graph", + "xaxis": { + "mode": "time", + "name": null, "show": true, - "sort": "current", - "sortDesc": true, - "total": false, - "values": true + "values": [] }, - "links": [], - "reverseYBuckets": false, - "targets": [ + "yaxes": [ { - "expr": "max(rate(tikv_import_download_duration_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", type=\"rewrite\"}[1m])) by (le)", - "format": "heatmap", - "instant": false, - "intervalFactor": 2, - "legendFormat": "{{le}}", - "refId": "A" - } - ], - "title": "Import Rewrite SST Duration", - "tooltip": { - "show": true, - "showHistogram": true - }, - "tooltipDecimals": 1, - "type": "heatmap", - "xAxis": { - "show": true - }, - "xBucketNumber": null, - "xBucketSize": null, - "yAxis": { - "decimals": 1, - "format": "s", - "logBase": 1, - "max": null, - "min": null, - "show": true, - "splitFactor": null - }, - "yBucketBound": "upper", - "yBucketNumber": null, - "yBucketSize": null - }, - { - "cards": { - "cardPadding": 0, - "cardRound": 0 - }, - "color": { - "cardColor": "#FF9830", - "colorScale": "linear", - "colorScheme": "interpolateSpectral", - "exponent": 0.5, - "max": null, - "min": 0, - "mode": "spectrum" - }, - "dataFormat": "tsbuckets", - "datasource": "${DS_TEST-CLUSTER}", - "description": "", - "fieldConfig": { - "defaults": {}, - "overrides": [] - }, - "gridPos": { - "h": 7, - "w": 6, - "x": 0, - "y": 121 - }, - "heatmap": {}, - "hideZeroBuckets": true, - "highlightCards": true, - "id": 5939, - "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "max": true, - "min": false, - "rightSide": true, - "show": true, - "sort": "current", - "sortDesc": true, - "total": false, - "values": true - }, - "links": [], - "reverseYBuckets": false, - "targets": [ + "decimals": null, + "format": "bytes", + "label": null, + "logBase": 1, + "max": 
null, + "min": null, + "show": true + }, { - "expr": "max(rate(tikv_import_rpc_duration_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", request=~\"ingest\"}[1m])) by (le)", - "format": "heatmap", - "instant": false, - "intervalFactor": 2, - "legendFormat": "{{le}}", - "refId": "A" + "decimals": null, + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true } ], - "title": "Import Ingest RPC Duration", - "tooltip": { - "show": true, - "showHistogram": true - }, - "tooltipDecimals": 1, - "type": "heatmap", - "xAxis": { - "show": true - }, - "xBucketNumber": null, - "xBucketSize": null, - "yAxis": { - "decimals": 1, - "format": "s", - "logBase": 1, - "max": null, - "min": null, - "show": true, - "splitFactor": null - }, - "yBucketBound": "upper", - "yBucketNumber": null, - "yBucketSize": null + "yaxis": { + "align": false, + "alignLevel": 0 + } }, { - "cards": { - "cardPadding": 0, - "cardRound": 0 - }, - "color": { - "cardColor": "#FF9830", - "colorScale": "linear", - "colorScheme": "interpolateSpectral", - "exponent": 0.5, - "max": null, - "min": 0, - "mode": "spectrum" - }, - "dataFormat": "tsbuckets", + "aliasColors": {}, + "bars": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", - "description": "", + "description": null, + "editable": true, + "error": false, "fieldConfig": { - "defaults": {}, - "overrides": [] - }, - "gridPos": { - "h": 7, - "w": 6, - "x": 6, - "y": 121 - }, - "heatmap": {}, - "hideZeroBuckets": true, - "highlightCards": true, - "id": 5938, - "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "max": true, - "min": false, - "rightSide": true, - "show": true, - "sort": "current", - "sortDesc": true, - "total": false, - "values": true - }, - "links": [], - "reverseYBuckets": false, - "targets": [ - { - "expr": "max(rate(tikv_import_ingest_duration_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", 
instance=~\"$instance\", type=~\"ingest\"}[1m])) by (le)", - "format": "heatmap", - "instant": false, - "intervalFactor": 2, - "legendFormat": "{{le}}", - "refId": "A" + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } } - ], - "title": "Import Ingest SST Duration", - "tooltip": { - "show": true, - "showHistogram": true - }, - "tooltipDecimals": 1, - "type": "heatmap", - "xAxis": { - "show": true - }, - "xBucketNumber": null, - "xBucketSize": null, - "yAxis": { - "decimals": 1, - "format": "s", - "logBase": 1, - "max": null, - "min": null, - "show": true, - "splitFactor": null - }, - "yBucketBound": "upper", - "yBucketNumber": null, - "yBucketSize": null - }, - { - "cards": { - "cardPadding": 0, - "cardRound": 0 }, - "color": { - "cardColor": "#FF9830", - "colorScale": "linear", - "colorScheme": "interpolateSpectral", - "exponent": 0.5, - "max": null, - "min": 0, - "mode": "spectrum" - }, - "dataFormat": "tsbuckets", - "datasource": "${DS_TEST-CLUSTER}", - "description": "", - "fieldConfig": { - "defaults": {}, - "overrides": [] + "fill": 1, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" }, "gridPos": { "h": 7, - "w": 6, + "w": 12, "x": 12, - "y": 121 + "y": 0 }, - "heatmap": {}, - "hideZeroBuckets": true, - "highlightCards": true, - "id": 5937, + "height": null, + "hideTimeOverride": false, + "id": 409, + "interval": null, + "isNew": true, "legend": { "alignAsTable": true, "avg": false, "current": true, + "hideEmpty": true, + "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, - "sort": "current", + "sideWidth": null, + "sort": "max", "sortDesc": true, "total": false, "values": true }, - "links": [], - "reverseYBuckets": false, - "targets": [ - { - "expr": "max(rate(tikv_import_ingest_byte{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])) by (le)", 
- "format": "heatmap", - "instant": false, - "intervalFactor": 2, - "legendFormat": "{{le}}", - "refId": "A" - } - ], - "title": "Import Ingest SST Bytes", - "tooltip": { - "show": true, - "showHistogram": true - }, - "tooltipDecimals": 1, - "type": "heatmap", - "xAxis": { - "show": true - }, - "xBucketNumber": null, - "xBucketSize": null, - "yAxis": { - "decimals": 1, - "format": "s", - "logBase": 1, - "max": null, - "min": null, - "show": true, - "splitFactor": null - }, - "yBucketBound": "upper", - "yBucketNumber": null, - "yBucketSize": null - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${DS_TEST-CLUSTER}", - "fieldConfig": { - "defaults": {}, - "overrides": [] - }, - "fill": 1, - "gridPos": { - "h": 7, - "w": 6, - "x": 18, - "y": 121 - }, - "id": 5927, - "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "max": true, - "min": false, - "rightSide": false, - "show": true, - "total": false, - "values": true - }, "lines": true, "linewidth": 1, "links": [], - "nullPointMode": "null", + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", + "options": { + "alertThreshold": true, + "dataLinks": [] + }, "percentage": false, - "pluginVersion": "7.5.7", - "pointradius": 2, + "pointradius": 5, "points": false, "renderer": "flot", - "seriesOverrides": [ - { - "alias": "total", - "yaxis": 2 - } - ], - "spaceLength": 10, + "repeat": null, + "repeatDirection": null, + "seriesOverrides": [], + "span": null, "stack": false, "steppedLine": false, "targets": [ { - "expr": "sum(rate(tikv_import_download_bytes_sum{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])) by (instance)", - "format": "time_series", - "intervalFactor": 1, - "legendFormat": "{{instance}}", - "refId": "A" - }, - { - "expr": "sum(rate(tikv_import_download_bytes_sum{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", 
instance=~\"$instance\"}[1m]))", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "(sum(rate(\n tikv_allocator_thread_allocation\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",type=\"alloc\"}\n [$__rate_interval]\n)) by (thread_name) - sum(rate(\n tikv_allocator_thread_allocation\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",type=\"dealloc\"}\n [$__rate_interval]\n)) by (thread_name) )", "format": "time_series", + "hide": false, + "instant": false, + "interval": "", "intervalFactor": 1, - "legendFormat": "total", - "refId": "B" + "legendFormat": "{{thread_name}}", + "metric": "", + "query": "(sum(rate(\n tikv_allocator_thread_allocation\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",type=\"alloc\"}\n [$__rate_interval]\n)) by (thread_name) - sum(rate(\n tikv_allocator_thread_allocation\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",type=\"dealloc\"}\n [$__rate_interval]\n)) by (thread_name) )", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, - "title": "Import Download SST Throughput", + "title": "Send Allocated(+) / Release Received(-) Bytes Rate", "tooltip": { + "msResolution": true, "shared": true, "sort": 0, "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -43403,7 +56713,8 @@ }, "yaxes": [ { - "format": "Bps", + "decimals": null, + "format": "binBps", "label": null, "logBase": 1, "max": null, @@ -43411,7 +56722,8 @@ "show": true }, { - "format": "Bps", + "decimals": null, + "format": "short", "label": null, "logBase": 1, "max": null, @@ -43421,77 +56733,112 @@ ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } }, { "aliasColors": {}, "bars": false, - "dashLength": 10, - "dashes": false, 
+ "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", - "decimals": 2, - "description": "", + "description": null, + "editable": true, + "error": false, "fieldConfig": { - "defaults": {}, - "overrides": [] + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } }, "fill": 1, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, "gridPos": { "h": 7, "w": 12, "x": 0, - "y": 128 + "y": 7 }, - "id": 12309, + "height": null, + "hideTimeOverride": false, + "id": 410, + "interval": null, + "isNew": true, "legend": { "alignAsTable": true, "avg": false, "current": true, + "hideEmpty": true, + "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, + "sort": "max", + "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, "nullPointMode": "null as zero", + "options": { + "alertThreshold": true, + "dataLinks": [] + }, "percentage": false, - "pluginVersion": "7.5.7", - "pointradius": 2, + "pointradius": 5, "points": false, "renderer": "flot", + "repeat": null, + "repeatDirection": null, "seriesOverrides": [], - "spaceLength": 10, + "span": null, "stack": false, "steppedLine": false, "targets": [ { - "expr": "delta(tikv_import_local_write_keys{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_allocator_thread_allocation\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",type=\"alloc\"}\n [$__rate_interval]\n)) by (thread_name) ", "format": "time_series", "hide": false, + "instant": false, + "interval": "", "intervalFactor": 1, - "legendFormat": "{{type}} {{instance}}", - "refId": "D" + "legendFormat": "{{thread_name}}", + "metric": "", + 
"query": "sum(rate(\n tikv_allocator_thread_allocation\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",type=\"alloc\"}\n [$__rate_interval]\n)) by (thread_name) ", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, - "title": "Import Local Write keys", + "title": "Newly Allocated Bytes by Thread", "tooltip": { + "msResolution": true, "shared": true, "sort": 0, "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -43499,95 +56846,132 @@ }, "yaxes": [ { - "format": "none", + "decimals": null, + "format": "bytes", "label": null, "logBase": 1, "max": null, - "min": "0", + "min": null, "show": true }, { - "format": "none", + "decimals": null, + "format": "short", "label": null, "logBase": 1, "max": null, - "min": "0", + "min": null, "show": true } ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } }, { "aliasColors": {}, "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", - "decimals": 2, - "description": "", + "description": null, + "editable": true, + "error": false, "fieldConfig": { - "defaults": {}, - "overrides": [] + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } }, "fill": 1, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, "gridPos": { "h": 7, "w": 12, "x": 12, - "y": 128 + "y": 7 }, - "id": 12310, + "height": null, + "hideTimeOverride": false, + "id": 411, + "interval": null, + "isNew": true, "legend": { "alignAsTable": true, "avg": false, "current": true, + "hideEmpty": true, + "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, + "sort": "max", + 
"sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, "nullPointMode": "null as zero", + "options": { + "alertThreshold": true, + "dataLinks": [] + }, "percentage": false, - "pluginVersion": "7.5.7", - "pointradius": 2, + "pointradius": 5, "points": false, "renderer": "flot", + "repeat": null, + "repeatDirection": null, "seriesOverrides": [], - "spaceLength": 10, + "span": null, "stack": false, "steppedLine": false, "targets": [ { - "expr": "delta(tikv_import_local_write_bytes{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_allocator_thread_allocation\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",type=\"dealloc\"}\n [$__rate_interval]\n)) by (thread_name) ", "format": "time_series", "hide": false, + "instant": false, + "interval": "", "intervalFactor": 1, - "legendFormat": "{{type}} {{instance}}", - "refId": "D" + "legendFormat": "{{thread_name}}", + "metric": "", + "query": "sum(rate(\n tikv_allocator_thread_allocation\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",type=\"dealloc\"}\n [$__rate_interval]\n)) by (thread_name) ", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, - "title": "Import Local Write bytes", + "title": "Recently Released Bytes by Thread", "tooltip": { + "msResolution": true, "shared": true, "sort": 0, "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -43595,207 +56979,204 @@ }, "yaxes": [ { - "format": "decbytes", + "decimals": null, + "format": "bytes", "label": null, "logBase": 1, "max": null, - "min": "0", + "min": null, "show": true }, { - "format": 
"none", + "decimals": null, + "format": "short", "label": null, "logBase": 1, "max": null, - "min": "0", + "min": null, "show": true } ], "yaxis": { "align": false, - "alignLevel": null - } - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${DS_TEST-CLUSTER}", - "description": "The accumulated TTL expired KV count during backup", - "fieldConfig": { - "defaults": {}, - "overrides": [] - }, - "fill": 1, - "fillGradient": 0, - "gridPos": { - "h": 7, - "w": 12, - "x": 0, - "y": 135 - }, - "hiddenSeries": false, - "id": 23763572861, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 1, - "nullPointMode": "null", - "options": { - "alertThreshold": true - }, - "percentage": false, - "pluginVersion": "7.5.11", - "pointradius": 2, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "exemplar": true, - "expr": "sum(tikv_backup_raw_expired_count{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}) by (instance)", - "hide": true, - "interval": "", - "legendFormat": "{{instance}}", - "queryType": "randomWalk", - "refId": "A" - }, - { - "exemplar": true, - "expr": "sum(tikv_backup_raw_expired_count{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"})", - "hide": false, - "interval": "", - "legendFormat": "sum", - "refId": "B" - } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "TTL Expired", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - 
"show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null + "alignLevel": 0 } } ], - "title": "Backup & Import", + "repeat": null, + "repeatDirection": null, + "span": null, + "targets": [], + "timeFrom": null, + "timeShift": null, + "title": "Memory", + "transformations": [], + "transparent": false, "type": "row" }, { + "cacheTimeout": null, "collapsed": true, "datasource": null, + "description": null, + "editable": true, + "error": false, + "fieldConfig": { + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } + }, "gridPos": { - "h": 1, + "h": 7, "w": 24, "x": 0, - "y": 41 + "y": 0 }, - "id": 4466, + "height": null, + "hideTimeOverride": false, + "id": 412, + "interval": null, + "links": [], + "maxDataPoints": 100, + "maxPerRow": null, + "minSpan": null, "panels": [ { "aliasColors": {}, "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", - "description": "Total number of encryption data keys in use", + "description": null, + "editable": true, + "error": false, + "fieldConfig": { + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } + }, "fill": 1, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, "gridPos": { - "h": 8, - "w": 12, + "h": 7, + "w": 8, "x": 0, - "y": 58 + "y": 0 }, - "id": 4464, + "height": null, + "hideTimeOverride": false, + "id": 413, + "interval": null, + "isNew": true, "legend": { "alignAsTable": true, "avg": false, "current": true, + "hideEmpty": true, + "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, + "sideWidth": null, + "sort": "max", + "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], - "nullPointMode": 
"null", + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", + "options": { + "alertThreshold": true, + "dataLinks": [] + }, "percentage": false, - "pointradius": 2, + "pointradius": 5, "points": false, "renderer": "flot", + "repeat": null, + "repeatDirection": null, "seriesOverrides": [], - "spaceLength": 10, + "span": null, "stack": false, "steppedLine": false, "targets": [ { - "expr": "tikv_encryption_data_key_storage_total{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\"}", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_thread_cpu_seconds_total\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",name=~\"b.*k.*w.*k.*\"}\n [$__rate_interval]\n)) by (instance) ", "format": "time_series", + "hide": false, + "instant": false, + "interval": "", "intervalFactor": 1, - "legendFormat": "{{instance}}", - "refId": "A" + "legendFormat": "backup-{{instance}}", + "metric": "", + "query": "sum(rate(\n tikv_thread_cpu_seconds_total\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",name=~\"b.*k.*w.*k.*\"}\n [$__rate_interval]\n)) by (instance) ", + "refId": "", + "step": 10, + "target": "" + }, + { + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_thread_cpu_seconds_total\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",name=~\"backup_io\"}\n [$__rate_interval]\n)) by (instance) ", + "format": "time_series", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "backup-io-{{instance}}", + "metric": "", + "query": "sum(rate(\n tikv_thread_cpu_seconds_total\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",name=~\"backup_io\"}\n [$__rate_interval]\n)) by (instance) ", + "refId": "", + "step": 10, + "target": "" + }, + { + "datasource": "${DS_TEST-CLUSTER}", + "expr": "((\n tikv_backup_softlimit\n 
{k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n \n)) ", + "format": "time_series", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "backup-auto-throttle-{{instance}}", + "metric": "", + "query": "((\n tikv_backup_softlimit\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n \n)) ", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, - "title": "Encryption data keys", + "title": "Backup CPU Utilization", "tooltip": { + "msResolution": true, "shared": true, "sort": 0, "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -43803,8 +57184,8 @@ }, "yaxes": [ { - "decimals": 0, - "format": "short", + "decimals": null, + "format": "percentunit", "label": null, "logBase": 1, "max": null, @@ -43812,7 +57193,7 @@ "show": true }, { - "decimals": 0, + "decimals": null, "format": "short", "label": null, "logBase": 1, @@ -43823,69 +57204,112 @@ ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } }, { "aliasColors": {}, "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", - "description": "Number of files being encrypted", + "description": null, + "editable": true, + "error": false, + "fieldConfig": { + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } + }, "fill": 1, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 58 + "h": 7, + "w": 8, + "x": 8, + "y": 0 }, - "id": 4554, + "height": null, + "hideTimeOverride": false, + "id": 414, + "interval": null, + "isNew": true, "legend": { 
"alignAsTable": true, "avg": false, "current": true, + "hideEmpty": true, + "hideZero": true, "max": true, - "min": true, + "min": false, "rightSide": true, "show": true, + "sideWidth": null, + "sort": "max", + "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], - "nullPointMode": "null", + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", + "options": { + "alertThreshold": true, + "dataLinks": [] + }, "percentage": false, - "pointradius": 2, + "pointradius": 5, "points": false, "renderer": "flot", + "repeat": null, + "repeatDirection": null, "seriesOverrides": [], - "spaceLength": 10, + "span": null, "stack": false, "steppedLine": false, "targets": [ { - "expr": "tikv_encryption_file_num{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\"}", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum((\n tikv_backup_thread_pool_size\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n \n)) by (instance) ", "format": "time_series", + "hide": false, + "instant": false, + "interval": "", "intervalFactor": 1, "legendFormat": "{{instance}}", - "refId": "A" + "metric": "", + "query": "sum((\n tikv_backup_thread_pool_size\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n \n)) by (instance) ", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, - "title": "Encrypted files", + "title": "Backup Thread Count", "tooltip": { + "msResolution": true, "shared": true, "sort": 0, "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -43893,7 +57317,8 @@ }, "yaxes": [ { - "format": "short", + "decimals": null, + "format": "none", "label": null, "logBase": 1, "max": null, @@ -43901,6 +57326,7 @@ "show": true }, { + "decimals": 
null, "format": "short", "label": null, "logBase": 1, @@ -43911,69 +57337,112 @@ ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } }, { "aliasColors": {}, "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", - "description": "Flag to indicate if encryption is initialized", + "description": "", + "editable": true, + "error": false, + "fieldConfig": { + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } + }, "fill": 1, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 66 + "h": 7, + "w": 8, + "x": 16, + "y": 0 }, - "id": 4555, + "height": null, + "hideTimeOverride": false, + "id": 415, + "interval": null, + "isNew": true, "legend": { "alignAsTable": true, "avg": false, "current": true, - "max": false, + "hideEmpty": true, + "hideZero": true, + "max": true, "min": false, "rightSide": true, "show": true, + "sideWidth": null, + "sort": "max", + "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], - "nullPointMode": "null", + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", + "options": { + "alertThreshold": true, + "dataLinks": [] + }, "percentage": false, - "pointradius": 2, + "pointradius": 5, "points": false, "renderer": "flot", + "repeat": null, + "repeatDirection": null, "seriesOverrides": [], - "spaceLength": 10, + "span": null, "stack": false, "steppedLine": false, "targets": [ { - "expr": "tikv_encryption_is_initialized{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\"}", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(delta(\n tikv_backup_error_counter\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by 
(instance, error) ", "format": "time_series", + "hide": false, + "instant": false, + "interval": "", "intervalFactor": 1, - "legendFormat": "{{instance}}", - "refId": "A" + "legendFormat": "{{instance}}-{{error}}", + "metric": "", + "query": "sum(delta(\n tikv_backup_error_counter\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (instance, error) ", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, - "title": "Encryption initialized", + "title": "Backup Errors", "tooltip": { + "msResolution": true, "shared": true, "sort": 0, "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -43981,8 +57450,8 @@ }, "yaxes": [ { - "decimals": 0, - "format": "short", + "decimals": null, + "format": "none", "label": null, "logBase": 1, "max": null, @@ -43990,7 +57459,7 @@ "show": true }, { - "decimals": 0, + "decimals": null, "format": "short", "label": null, "logBase": 1, @@ -44001,270 +57470,335 @@ ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } }, { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, + "cards": { + "cardPadding": null, + "cardRound": null + }, + "color": { + "cardColor": "#b4ff00", + "colorScale": "sqrt", + "colorScheme": "interpolateSpectral", + "exponent": 0.5, + "max": null, + "min": null, + "mode": "spectrum" + }, + "dataFormat": "tsbuckets", "datasource": "${DS_TEST-CLUSTER}", - "description": "Total size of encryption meta files", - "fill": 1, + "description": null, + "editable": true, + "error": false, + "fieldConfig": { + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } + }, "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 66 + "h": 7, + "w": 8, + "x": 0, + "y": 7 }, - "id": 4556, + "heatmap": {}, 
+ "height": null, + "hideTimeOverride": false, + "hideZeroBuckets": true, + "highlightCards": true, + "id": 416, + "interval": null, "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "max": true, - "min": true, - "rightSide": true, - "show": true, - "total": false, - "values": true + "show": false }, - "lines": true, - "linewidth": 1, "links": [], - "nullPointMode": "null", - "percentage": false, - "pointradius": 2, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "spaceLength": 10, - "stack": false, - "steppedLine": false, + "maxDataPoints": 512, + "maxPerRow": null, + "minSpan": null, + "options": {}, + "repeat": null, + "repeatDirection": null, + "reverseYBuckets": false, + "span": null, "targets": [ { - "expr": "tikv_encryption_meta_file_size_bytes{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\"}", - "format": "time_series", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_backup_range_size_bytes_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",cf=\"write\"}\n [$__rate_interval]\n)) by (le) ", + "format": "heatmap", + "hide": false, + "instant": false, + "interval": "", "intervalFactor": 1, - "legendFormat": "{{name}}-{{instance}}", - "refId": "A" + "legendFormat": "{{le}}", + "metric": "", + "query": "sum(rate(\n tikv_backup_range_size_bytes_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",cf=\"write\"}\n [$__rate_interval]\n)) by (le) ", + "refId": "", + "step": 10, + "target": "" } ], - "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, - "title": "Encryption meta files size", + "title": "Backup Write CF SST Size", "tooltip": { + "msResolution": true, "shared": true, + "showHistogram": true, "sort": 0, "value_type": "individual" }, - "type": "graph", - "xaxis": { - "buckets": null, + "transformations": [], + "transparent": false, + "type": "heatmap", + "xAxis": { "mode": 
"time", "name": null, "show": true, "values": [] }, - "yaxes": [ - { - "format": "decbytes", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } + "xBucketNumber": null, + "xBucketSize": null, + "yAxis": { + "decimals": 1, + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + "yBucketBound": "upper", + "yBucketNumber": null, + "yBucketSize": null }, { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, + "cards": { + "cardPadding": null, + "cardRound": null + }, + "color": { + "cardColor": "#b4ff00", + "colorScale": "sqrt", + "colorScheme": "interpolateSpectral", + "exponent": 0.5, + "max": null, + "min": null, + "mode": "spectrum" + }, + "dataFormat": "tsbuckets", "datasource": "${DS_TEST-CLUSTER}", - "description": "", - "fill": 1, + "description": null, + "editable": true, + "error": false, + "fieldConfig": { + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } + }, "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 74 + "h": 7, + "w": 8, + "x": 8, + "y": 7 }, - "id": 4557, + "heatmap": {}, + "height": null, + "hideTimeOverride": false, + "hideZeroBuckets": true, + "highlightCards": true, + "id": 417, + "interval": null, "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "max": true, - "min": false, - "rightSide": true, - "show": true, - "total": false, - "values": true + "show": false }, - "lines": true, - "linewidth": 1, "links": [], - "nullPointMode": "null", - "percentage": false, - "pointradius": 2, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "spaceLength": 10, - "stack": false, - "steppedLine": false, + "maxDataPoints": 512, + "maxPerRow": null, + "minSpan": null, + "options": {}, + "repeat": 
null, + "repeatDirection": null, + "reverseYBuckets": false, + "span": null, "targets": [ { - "expr": "sum(rate(tikv_coprocessor_rocksdb_perf{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\" ,metric=\"encrypt_data_nanos\"}[1m])) by (req)", - "format": "time_series", - "intervalFactor": 1, - "legendFormat": "encrypt-{{req}}", - "refId": "A" - }, - { - "expr": "sum(rate(tikv_coprocessor_rocksdb_perf{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\" ,metric=\"decrypt_data_nanos\"}[1m])) by (req)", - "format": "time_series", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_backup_range_size_bytes_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",cf=\"default\"}\n [$__rate_interval]\n)) by (le) ", + "format": "heatmap", + "hide": false, + "instant": false, + "interval": "", "intervalFactor": 1, - "legendFormat": "decrypt-{{req}}", - "refId": "B" + "legendFormat": "{{le}}", + "metric": "", + "query": "sum(rate(\n tikv_backup_range_size_bytes_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",cf=\"default\"}\n [$__rate_interval]\n)) by (le) ", + "refId": "", + "step": 10, + "target": "" } ], - "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, - "title": "Encrypt/decrypt data nanos", + "title": "Backup Default CF SST Size", "tooltip": { + "msResolution": true, "shared": true, + "showHistogram": true, "sort": 0, "value_type": "individual" }, - "type": "graph", - "xaxis": { - "buckets": null, + "transformations": [], + "transparent": false, + "type": "heatmap", + "xAxis": { "mode": "time", "name": null, "show": true, "values": [] }, - "yaxes": [ - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - 
"align": false, - "alignLevel": null - } + "xBucketNumber": null, + "xBucketSize": null, + "yAxis": { + "decimals": 1, + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + "yBucketBound": "upper", + "yBucketNumber": null, + "yBucketSize": null }, { "aliasColors": {}, "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", - "description": "Writing or reading file duration (second)", + "description": null, + "editable": true, + "error": false, + "fieldConfig": { + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } + }, "fill": 1, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 74 + "h": 7, + "w": 8, + "x": 16, + "y": 7 }, - "id": 4559, + "height": null, + "hideTimeOverride": false, + "id": 418, + "interval": null, + "isNew": true, "legend": { "alignAsTable": true, "avg": false, "current": true, + "hideEmpty": true, + "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, + "sideWidth": null, + "sort": "max", + "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, "nullPointMode": "null as zero", + "options": { + "alertThreshold": true, + "dataLinks": [] + }, "percentage": false, - "pointradius": 2, + "pointradius": 5, "points": false, "renderer": "flot", + "repeat": null, + "repeatDirection": null, "seriesOverrides": [], - "spaceLength": 10, + "span": null, "stack": false, "steppedLine": false, "targets": [ { - "expr": "histogram_quantile(1, sum(rate(tikv_encryption_write_read_file_duration_seconds_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])) by (le, type, operation))", 
+ "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_backup_range_size_bytes_sum\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) ", "format": "time_series", + "hide": false, "instant": false, "interval": "", - "intervalFactor": 2, - "legendFormat": "max-{{type}}-{{operation}}", - "refId": "A" - }, - { - "expr": "histogram_quantile(0.95, sum(rate(tikv_encryption_write_read_file_duration_seconds_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])) by (le, type, operation))", - "format": "time_series", - "interval": "", - "intervalFactor": 2, - "legendFormat": "95%-{{type}}-{{operation}}", - "refId": "B" + "intervalFactor": 1, + "legendFormat": "total", + "metric": "", + "query": "sum(rate(\n tikv_backup_range_size_bytes_sum\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) ", + "refId": "", + "step": 10, + "target": "" }, { - "expr": "sum(rate(tikv_encryption_write_read_file_duration_seconds_sum{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])) by (le, type, operation) / sum(rate(tikv_encryption_write_read_file_duration_seconds_count{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])) by (le, type, operation)", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_backup_range_size_bytes_sum\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (instance, cf) ", "format": "time_series", + "hide": false, + "instant": false, "interval": "", - "intervalFactor": 2, - "legendFormat": "avg-{{type}}-{{operation}}", - "refId": "C" + "intervalFactor": 1, + "legendFormat": "{{instance}}-{{cf}}", + "metric": "", + "query": "sum(rate(\n tikv_backup_range_size_bytes_sum\n 
{k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (instance, cf) ", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, - "title": "Read/write encryption meta duration", + "title": "Backup SST Generation Throughput", "tooltip": { + "msResolution": true, "shared": true, "sort": 0, "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -44273,7 +57807,7 @@ "yaxes": [ { "decimals": null, - "format": "s", + "format": "binBps", "label": null, "logBase": 1, "max": null, @@ -44281,6 +57815,7 @@ "show": true }, { + "decimals": null, "format": "short", "label": null, "logBase": 1, @@ -44291,609 +57826,1170 @@ ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } - } - ], - "title": "Encryption", - "type": "row" - }, - { - "collapsed": true, - "datasource": null, - "gridPos": { - "h": 1, - "w": 24, - "x": 0, - "y": 42 - }, - "id": 13016, - "panels": [ + }, { + "cacheTimeout": null, + "cards": { + "cardPadding": null, + "cardRound": null + }, + "color": { + "cardColor": "#b4ff00", + "colorScale": "sqrt", + "colorScheme": "interpolateSpectral", + "exponent": 0.5, + "max": null, + "min": null, + "mode": "spectrum" + }, + "dataFormat": "tsbuckets", "datasource": "${DS_TEST-CLUSTER}", + "description": null, + "editable": true, + "error": false, "fieldConfig": { "defaults": { - "color": { - "mode": "thresholds" - }, - "mappings": [ - { - "from": "", - "id": 1, - "text": "Disabled", - "to": "", - "type": 1, - "value": "0" - }, - { - "from": "", - "id": 2, - "text": "Enabled", - "to": "", - "type": 1, - "value": "1" - } - ], - "noValue": "Disabled", "thresholds": { "mode": "absolute", - "steps": [ - { - "color": "rgba(0, 0, 0, 0.2)", - "value": null - }, - { - "color": "dark-red", - "value": 0 - }, - { - "color": 
"dark-green", - "value": 1 - } - ] + "steps": [] } - }, - "overrides": [] + } }, "gridPos": { - "h": 4, - "w": 5, + "h": 7, + "w": 6, "x": 0, - "y": 55 + "y": 14 }, - "id": 14361, - "options": { - "colorMode": "background", - "graphMode": "none", - "justifyMode": "auto", - "orientation": "auto", - "reduceOptions": { - "calcs": [ - "last" - ], - "fields": "", - "values": false - }, - "text": {}, - "textMode": "auto" + "heatmap": {}, + "height": null, + "hideTimeOverride": false, + "hideZeroBuckets": true, + "highlightCards": true, + "id": 419, + "interval": null, + "legend": { + "show": false }, - "pluginVersion": "7.5.11", + "links": [], + "maxDataPoints": 512, + "maxPerRow": null, + "minSpan": null, + "options": {}, + "repeat": null, + "repeatDirection": null, + "reverseYBuckets": false, + "span": null, "targets": [ { - "exemplar": true, - "expr": "tikv_log_backup_enabled{instance=~\"$instance\"}", - "instant": true, + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_backup_range_duration_seconds_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",type=\"snapshot\"}\n [$__rate_interval]\n)) by (le) ", + "format": "heatmap", + "hide": false, + "instant": false, "interval": "", - "legendFormat": "{{ instance }}", - "queryType": "randomWalk", - "refId": "A" + "intervalFactor": 1, + "legendFormat": "{{le}}", + "metric": "", + "query": "sum(rate(\n tikv_backup_range_duration_seconds_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",type=\"snapshot\"}\n [$__rate_interval]\n)) by (le) ", + "refId": "", + "step": 10, + "target": "" } ], "timeFrom": null, "timeShift": null, - "title": "Endpoint Status", + "title": "Backup Scan SST Duration", + "tooltip": { + "msResolution": true, + "shared": true, + "showHistogram": true, + "sort": 0, + "value_type": "individual" + }, "transformations": [], - "type": "stat" + "transparent": false, + "type": "heatmap", + "xAxis": { + "mode": 
"time", + "name": null, + "show": true, + "values": [] + }, + "xBucketNumber": null, + "xBucketSize": null, + "yAxis": { + "decimals": 1, + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + "yBucketBound": "upper", + "yBucketNumber": null, + "yBucketSize": null }, { + "cacheTimeout": null, + "cards": { + "cardPadding": null, + "cardRound": null + }, + "color": { + "cardColor": "#b4ff00", + "colorScale": "sqrt", + "colorScheme": "interpolateSpectral", + "exponent": 0.5, + "max": null, + "min": null, + "mode": "spectrum" + }, + "dataFormat": "tsbuckets", "datasource": "${DS_TEST-CLUSTER}", - "description": "The average flush size of last 30mins.", + "description": null, + "editable": true, + "error": false, "fieldConfig": { "defaults": { - "color": { - "mode": "thresholds" - }, - "mappings": [], "thresholds": { "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - } - ] - }, - "unit": "bytes" - }, - "overrides": [] + "steps": [] + } + } }, "gridPos": { - "h": 8, - "w": 8, - "x": 5, - "y": 55 + "h": 7, + "w": 6, + "x": 6, + "y": 14 }, - "id": 14507, - "options": { - "colorMode": "value", - "graphMode": "none", - "justifyMode": "auto", - "orientation": "auto", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - }, - "text": {}, - "textMode": "auto" + "heatmap": {}, + "height": null, + "hideTimeOverride": false, + "hideZeroBuckets": true, + "highlightCards": true, + "id": 420, + "interval": null, + "legend": { + "show": false }, - "pluginVersion": "7.5.11", + "links": [], + "maxDataPoints": 512, + "maxPerRow": null, + "minSpan": null, + "options": {}, + "repeat": null, + "repeatDirection": null, + "reverseYBuckets": false, + "span": null, "targets": [ { - "exemplar": true, - "expr": "increase(tikv_log_backup_flush_file_size_sum{instance=~\"$instance\"}[30m]) / on(instance) 
increase(tikv_log_backup_flush_duration_sec_count{stage=~\"save_files\",instance=~\"$instance\"}[30m])", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_backup_range_duration_seconds_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",type=\"scan\"}\n [$__rate_interval]\n)) by (le) ", + "format": "heatmap", "hide": false, - "instant": true, + "instant": false, "interval": "", - "legendFormat": "{{ instance }}", - "refId": "B" + "intervalFactor": 1, + "legendFormat": "{{le}}", + "metric": "", + "query": "sum(rate(\n tikv_backup_range_duration_seconds_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",type=\"scan\"}\n [$__rate_interval]\n)) by (le) ", + "refId": "", + "step": 10, + "target": "" } ], "timeFrom": null, "timeShift": null, - "title": "Average Flush Size ", - "type": "stat" + "title": "Backup Scan SST Duration", + "tooltip": { + "msResolution": true, + "shared": true, + "showHistogram": true, + "sort": 0, + "value_type": "individual" + }, + "transformations": [], + "transparent": false, + "type": "heatmap", + "xAxis": { + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "xBucketNumber": null, + "xBucketSize": null, + "yAxis": { + "decimals": 1, + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + "yBucketBound": "upper", + "yBucketNumber": null, + "yBucketSize": null }, { + "cacheTimeout": null, + "cards": { + "cardPadding": null, + "cardRound": null + }, + "color": { + "cardColor": "#b4ff00", + "colorScale": "sqrt", + "colorScheme": "interpolateSpectral", + "exponent": 0.5, + "max": null, + "min": null, + "mode": "spectrum" + }, + "dataFormat": "tsbuckets", "datasource": "${DS_TEST-CLUSTER}", - "description": "The current total flushed file number of this run.", + "description": null, + "editable": true, + "error": false, "fieldConfig": { "defaults": { - "color": { - "mode": 
"thresholds" - }, - "mappings": [], "thresholds": { "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - } - ] - }, - "unit": "short" - }, - "overrides": [] + "steps": [] + } + } }, "gridPos": { - "h": 8, - "w": 8, - "x": 13, - "y": 55 + "h": 7, + "w": 6, + "x": 12, + "y": 14 }, - "id": 14363, - "options": { - "colorMode": "value", - "graphMode": "none", - "justifyMode": "auto", - "orientation": "auto", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - }, - "text": {}, - "textMode": "auto" + "heatmap": {}, + "height": null, + "hideTimeOverride": false, + "hideZeroBuckets": true, + "highlightCards": true, + "id": 421, + "interval": null, + "legend": { + "show": false }, - "pluginVersion": "7.5.11", + "links": [], + "maxDataPoints": 512, + "maxPerRow": null, + "minSpan": null, + "options": {}, + "repeat": null, + "repeatDirection": null, + "reverseYBuckets": false, + "span": null, "targets": [ { - "exemplar": true, - "expr": "round(increase(tikv_log_backup_flush_file_size_count{instance=~\"$instance\"}[30m]))", - "instant": true, + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_backup_range_duration_seconds_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",type=~\"save.*\"}\n [$__rate_interval]\n)) by (le) ", + "format": "heatmap", + "hide": false, + "instant": false, "interval": "", - "legendFormat": "{{ instance }}", - "queryType": "randomWalk", - "refId": "A" + "intervalFactor": 1, + "legendFormat": "{{le}}", + "metric": "", + "query": "sum(rate(\n tikv_backup_range_duration_seconds_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",type=~\"save.*\"}\n [$__rate_interval]\n)) by (le) ", + "refId": "", + "step": 10, + "target": "" } ], "timeFrom": null, "timeShift": null, - "title": "Flushed Files (Last 30m) Per Host", - "type": "stat" + "title": "Backup Save SST Duration", + "tooltip": { + 
"msResolution": true, + "shared": true, + "showHistogram": true, + "sort": 0, + "value_type": "individual" + }, + "transformations": [], + "transparent": false, + "type": "heatmap", + "xAxis": { + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "xBucketNumber": null, + "xBucketSize": null, + "yAxis": { + "decimals": 1, + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + "yBucketBound": "upper", + "yBucketNumber": null, + "yBucketSize": null }, { + "aliasColors": {}, + "bars": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", - "description": "This is the summary of the file count has been flushed, summered by the data each TiKV has flushed since last boot. \n**NOTE: The size may get reduced if some of TiKVs reboot.**", + "description": null, + "editable": true, + "error": false, "fieldConfig": { "defaults": { - "color": { - "mode": "thresholds" - }, - "mappings": [], "thresholds": { "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - } - ] - }, - "unit": "short" - }, - "overrides": [] + "steps": [] + } + } + }, + "fill": 1, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" }, "gridPos": { - "h": 2, - "w": 3, - "x": 21, - "y": 55 + "h": 7, + "w": 6, + "x": 18, + "y": 14 + }, + "height": null, + "hideTimeOverride": false, + "id": 422, + "interval": null, + "isNew": true, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": true, + "hideZero": true, + "max": true, + "min": false, + "rightSide": true, + "show": true, + "sideWidth": null, + "sort": "max", + "sortDesc": true, + "total": false, + "values": true }, - "id": 14508, + "lines": true, + "linewidth": 1, + "links": [], + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", "options": { - "colorMode": 
"value", - "graphMode": "none", - "justifyMode": "auto", - "orientation": "auto", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - }, - "text": {}, - "textMode": "auto" + "alertThreshold": true, + "dataLinks": [] }, - "pluginVersion": "7.5.11", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "repeatDirection": null, + "seriesOverrides": [], + "span": null, + "stack": false, + "steppedLine": false, "targets": [ { - "exemplar": true, - "expr": "round(sum(increase(tikv_log_backup_flush_duration_sec_count{stage=~\"save_files\",instance=~\"$instance\"}[30m])))", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "histogram_quantile(0.999,(\n sum(rate(\n tikv_backup_range_duration_seconds_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (type, le) \n \n \n)) ", + "format": "time_series", "hide": false, - "instant": true, + "instant": false, "interval": "", - "legendFormat": "{{ instance }}", - "refId": "B" + "intervalFactor": 1, + "legendFormat": "{{type}}-99.9%", + "metric": "", + "query": "histogram_quantile(0.999,(\n sum(rate(\n tikv_backup_range_duration_seconds_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (type, le) \n \n \n)) ", + "refId": "", + "step": 10, + "target": "" + }, + { + "datasource": "${DS_TEST-CLUSTER}", + "expr": "histogram_quantile(0.99,(\n sum(rate(\n tikv_backup_range_duration_seconds_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (type, le) \n \n \n)) ", + "format": "time_series", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{type}}-99%", + "metric": "", + "query": "histogram_quantile(0.99,(\n sum(rate(\n tikv_backup_range_duration_seconds_bucket\n 
{k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (type, le) \n \n \n)) ", + "refId": "", + "step": 10, + "target": "" + }, + { + "datasource": "${DS_TEST-CLUSTER}", + "expr": "(sum((\n tikv_backup_range_duration_seconds_sum\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n \n)) by (type) / sum((\n tikv_backup_range_duration_seconds_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n \n)) by (type) )", + "format": "time_series", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{type}}-avg", + "metric": "", + "query": "(sum((\n tikv_backup_range_duration_seconds_sum\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n \n)) by (type) / sum((\n tikv_backup_range_duration_seconds_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n \n)) by (type) )", + "refId": "", + "step": 10, + "target": "" } ], + "thresholds": [], "timeFrom": null, "timeShift": null, - "title": "Flush Times (Last 30m)", - "type": "stat" - }, - { - "datasource": "${DS_TEST-CLUSTER}", - "description": "This is the summary of the size has been flushed, summered by the data each TiKV has flushed since last boot. 
\n**NOTE: The size may get reduced if some of TiKVs reboot.**", - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - } - ] - }, - "unit": "bytes" - }, - "overrides": [] + "title": "Backup SST Duration", + "tooltip": { + "msResolution": true, + "shared": true, + "sort": 0, + "value_type": "individual" }, - "gridPos": { - "h": 3, - "w": 3, - "x": 21, - "y": 57 + "transformations": [], + "transparent": false, + "type": "graph", + "xaxis": { + "mode": "time", + "name": null, + "show": true, + "values": [] }, - "id": 14362, - "options": { - "colorMode": "value", - "graphMode": "none", - "justifyMode": "auto", - "orientation": "auto", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false + "yaxes": [ + { + "decimals": null, + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true }, - "text": {}, - "textMode": "auto" - }, - "pluginVersion": "7.5.11", - "targets": [ { - "exemplar": true, - "expr": "sum(increase(tikv_log_backup_flush_file_size_sum{instance=~\"$instance\"}[30m]))", - "hide": false, - "instant": true, - "interval": "", - "legendFormat": "{{ instance }}", - "refId": "B" + "decimals": null, + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true } ], - "timeFrom": null, - "timeShift": null, - "title": "Total Flushed Size (Last 30m)", - "type": "stat" + "yaxis": { + "align": false, + "alignLevel": 0 + } }, { + "cacheTimeout": null, + "cards": { + "cardPadding": null, + "cardRound": null + }, + "color": { + "cardColor": "#b4ff00", + "colorScale": "sqrt", + "colorScheme": "interpolateSpectral", + "exponent": 0.5, + "max": null, + "min": null, + "mode": "spectrum" + }, + "dataFormat": "tsbuckets", "datasource": "${DS_TEST-CLUSTER}", + "description": null, + "editable": true, + "error": false, "fieldConfig": { 
"defaults": { - "color": { - "mode": "thresholds" - }, - "mappings": [ - { - "from": "", - "id": 1, - "text": "Running", - "to": "", - "type": 1, - "value": "0" - }, - { - "from": "", - "id": 2, - "text": "Paused", - "to": "", - "type": 1, - "value": "1" - }, - { - "from": "", - "id": 3, - "text": "Error", - "to": "", - "type": 1, - "value": "2" - } - ], - "noValue": "Disabled", "thresholds": { "mode": "absolute", - "steps": [ - { - "color": "rgba(0, 0, 0, 0.2)", - "value": null - }, - { - "color": "dark-green", - "value": 0 - }, - { - "color": "#EAB839", - "value": 1 - }, - { - "color": "dark-red", - "value": 2 - } - ] + "steps": [] } - }, - "overrides": [] + } }, "gridPos": { - "h": 4, - "w": 2, + "h": 7, + "w": 12, "x": 0, - "y": 59 + "y": 21 }, - "id": 14907, - "options": { - "colorMode": "background", - "graphMode": "none", - "justifyMode": "auto", - "orientation": "auto", - "reduceOptions": { - "calcs": [ - "last" - ], - "fields": "", - "values": false - }, - "text": {}, - "textMode": "auto" + "heatmap": {}, + "height": null, + "hideTimeOverride": false, + "hideZeroBuckets": true, + "highlightCards": true, + "id": 423, + "interval": null, + "legend": { + "show": false }, - "pluginVersion": "7.5.11", + "links": [], + "maxDataPoints": 512, + "maxPerRow": null, + "minSpan": null, + "options": {}, + "repeat": null, + "repeatDirection": null, + "reverseYBuckets": false, + "span": null, "targets": [ { - "exemplar": true, - "expr": "min(tikv_log_backup_task_status{instance=~\"$instance\"})", - "instant": true, + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_external_storage_create_seconds_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (le) ", + "format": "heatmap", + "hide": false, + "instant": false, "interval": "", - "legendFormat": "{{ instance }}", - "queryType": "randomWalk", - "refId": "A" + "intervalFactor": 1, + "legendFormat": "{{le}}", + "metric": "", + "query": 
"sum(rate(\n tikv_external_storage_create_seconds_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (le) ", + "refId": "", + "step": 10, + "target": "" } ], "timeFrom": null, "timeShift": null, - "title": "Task Status", + "title": "External Storage Create Duration", + "tooltip": { + "msResolution": true, + "shared": true, + "showHistogram": true, + "sort": 0, + "value_type": "individual" + }, "transformations": [], - "type": "stat" + "transparent": false, + "type": "heatmap", + "xAxis": { + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "xBucketNumber": null, + "xBucketSize": null, + "yAxis": { + "decimals": 1, + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + "yBucketBound": "upper", + "yBucketNumber": null, + "yBucketSize": null }, { + "aliasColors": {}, + "bars": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", + "description": "", + "editable": true, + "error": false, "fieldConfig": { "defaults": { - "color": { - "mode": "thresholds" - }, - "mappings": [], - "min": 1, "thresholds": { "mode": "absolute", - "steps": [ - { - "color": "dark-blue", - "value": null - } - ] - }, - "unit": "none" - }, - "overrides": [] + "steps": [] + } + } + }, + "fill": 1, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" }, "gridPos": { - "h": 4, - "w": 3, - "x": 2, - "y": 59 + "h": 7, + "w": 12, + "x": 12, + "y": 21 + }, + "height": null, + "hideTimeOverride": false, + "id": 424, + "interval": null, + "isNew": true, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": true, + "hideZero": true, + "max": true, + "min": false, + "rightSide": true, + "show": true, + "sideWidth": null, + "sort": "max", + "sortDesc": true, + "total": false, + "values": true }, - "id": 
15361, + "lines": true, + "linewidth": 1, + "links": [], + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", "options": { - "colorMode": "background", - "graphMode": "none", - "justifyMode": "center", - "orientation": "auto", - "reduceOptions": { - "calcs": [ - "last" - ], - "fields": "", - "values": false - }, - "text": {}, - "textMode": "name" + "alertThreshold": true, + "dataLinks": [] }, - "pluginVersion": "7.5.11", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "repeatDirection": null, + "seriesOverrides": [ + { + "alias": "count", + "bars": false, + "dashLength": 1, + "dashes": true, + "fill": 2, + "fillBelowTo": null, + "lines": true, + "spaceLength": 1, + "transform": "negative-Y", + "yaxis": 2, + "zindex": -3 + }, + { + "alias": "avg", + "bars": false, + "fill": 7, + "fillBelowTo": null, + "lines": true, + "yaxis": 1, + "zindex": 0 + } + ], + "span": null, + "stack": false, + "steppedLine": false, "targets": [ { - "exemplar": true, - "expr": "tidb_log_backup_advancer_owner > 0", - "instant": true, + "datasource": "${DS_TEST-CLUSTER}", + "expr": "histogram_quantile(0.9999,(\n sum(rate(\n tikv_external_storage_create_seconds_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (le) \n \n \n)) ", + "format": "time_series", + "hide": false, + "instant": false, "interval": "", - "legendFormat": "{{ instance }}", - "refId": "A" + "intervalFactor": 1, + "legendFormat": "99.99%", + "metric": "", + "query": "histogram_quantile(0.9999,(\n sum(rate(\n tikv_external_storage_create_seconds_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (le) \n \n \n)) ", + "refId": "", + "step": 10, + "target": "" + }, + { + "datasource": "${DS_TEST-CLUSTER}", + "expr": "histogram_quantile(0.99,(\n sum(rate(\n 
tikv_external_storage_create_seconds_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (le) \n \n \n)) ", + "format": "time_series", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "99%", + "metric": "", + "query": "histogram_quantile(0.99,(\n sum(rate(\n tikv_external_storage_create_seconds_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (le) \n \n \n)) ", + "refId": "", + "step": 10, + "target": "" + }, + { + "datasource": "${DS_TEST-CLUSTER}", + "expr": "(sum(rate(\n tikv_external_storage_create_seconds_sum\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) / sum(rate(\n tikv_external_storage_create_seconds_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) )", + "format": "time_series", + "hide": true, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "avg", + "metric": "", + "query": "(sum(rate(\n tikv_external_storage_create_seconds_sum\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) / sum(rate(\n tikv_external_storage_create_seconds_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) )", + "refId": "", + "step": 10, + "target": "" + }, + { + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_external_storage_create_seconds_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) ", + "format": "time_series", + "hide": true, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "count", + "metric": "", + "query": "sum(rate(\n tikv_external_storage_create_seconds_count\n 
{k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) ", + "refId": "", + "step": 10, + "target": "" } ], + "thresholds": [], "timeFrom": null, "timeShift": null, - "title": "Advancer Owner", - "type": "stat" + "title": "External Storage Create Duration", + "tooltip": { + "msResolution": true, + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "transformations": [], + "transparent": false, + "type": "graph", + "xaxis": { + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "decimals": null, + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "decimals": null, + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": 0 + } }, { + "aliasColors": {}, + "bars": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", - "description": "This is the summary of the file count has been flushed, summered by the data each TiKV has flushed since last boot. 
\n**NOTE: The size may get reduced if some of TiKVs reboot.**", + "description": "", + "editable": true, + "error": false, "fieldConfig": { "defaults": { - "color": { - "mode": "thresholds" - }, - "mappings": [], "thresholds": { "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - } - ] - }, - "unit": "short" - }, - "overrides": [] + "steps": [] + } + } + }, + "fill": 1, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" }, "gridPos": { - "h": 3, - "w": 3, - "x": 21, - "y": 60 + "h": 7, + "w": 12, + "x": 0, + "y": 28 + }, + "height": null, + "hideTimeOverride": false, + "id": 425, + "interval": null, + "isNew": true, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": true, + "hideZero": true, + "max": true, + "min": false, + "rightSide": true, + "show": true, + "sideWidth": null, + "sort": "max", + "sortDesc": true, + "total": false, + "values": true }, - "id": 14911, + "lines": true, + "linewidth": 1, + "links": [], + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", "options": { - "colorMode": "value", - "graphMode": "none", - "justifyMode": "auto", - "orientation": "auto", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false + "alertThreshold": true, + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "repeatDirection": null, + "seriesOverrides": [ + { + "alias": "count", + "bars": false, + "dashLength": 1, + "dashes": true, + "fill": 2, + "fillBelowTo": null, + "lines": true, + "spaceLength": 1, + "transform": "negative-Y", + "yaxis": 2, + "zindex": -3 }, - "text": {}, - "textMode": "auto" + { + "alias": "avg", + "bars": false, + "fill": 7, + "fillBelowTo": null, + "lines": true, + "yaxis": 1, + "zindex": 0 + } + ], + "span": 
null, + "stack": false, + "steppedLine": false, + "targets": [ + { + "datasource": "${DS_TEST-CLUSTER}", + "expr": "histogram_quantile(0.9999,(\n sum(rate(\n tikv_coprocessor_request_duration_seconds_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",req=~\"analyze.*|checksum.*\"}\n [$__rate_interval]\n)) by (req, le) \n \n \n)) ", + "format": "time_series", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "99.99%-{{req}}", + "metric": "", + "query": "histogram_quantile(0.9999,(\n sum(rate(\n tikv_coprocessor_request_duration_seconds_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",req=~\"analyze.*|checksum.*\"}\n [$__rate_interval]\n)) by (req, le) \n \n \n)) ", + "refId": "", + "step": 10, + "target": "" + }, + { + "datasource": "${DS_TEST-CLUSTER}", + "expr": "histogram_quantile(0.99,(\n sum(rate(\n tikv_coprocessor_request_duration_seconds_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",req=~\"analyze.*|checksum.*\"}\n [$__rate_interval]\n)) by (req, le) \n \n \n)) ", + "format": "time_series", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "99%-{{req}}", + "metric": "", + "query": "histogram_quantile(0.99,(\n sum(rate(\n tikv_coprocessor_request_duration_seconds_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",req=~\"analyze.*|checksum.*\"}\n [$__rate_interval]\n)) by (req, le) \n \n \n)) ", + "refId": "", + "step": 10, + "target": "" + }, + { + "datasource": "${DS_TEST-CLUSTER}", + "expr": "(sum(rate(\n tikv_coprocessor_request_duration_seconds_sum\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",req=~\"analyze.*|checksum.*\"}\n [$__rate_interval]\n)) by (req) / sum(rate(\n tikv_coprocessor_request_duration_seconds_count\n 
{k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",req=~\"analyze.*|checksum.*\"}\n [$__rate_interval]\n)) by (req) )", + "format": "time_series", + "hide": true, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "avg-{{req}}", + "metric": "", + "query": "(sum(rate(\n tikv_coprocessor_request_duration_seconds_sum\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",req=~\"analyze.*|checksum.*\"}\n [$__rate_interval]\n)) by (req) / sum(rate(\n tikv_coprocessor_request_duration_seconds_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",req=~\"analyze.*|checksum.*\"}\n [$__rate_interval]\n)) by (req) )", + "refId": "", + "step": 10, + "target": "" + }, + { + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_coprocessor_request_duration_seconds_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",req=~\"analyze.*|checksum.*\"}\n [$__rate_interval]\n)) by (req) ", + "format": "time_series", + "hide": true, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "count-{{req}}", + "metric": "", + "query": "sum(rate(\n tikv_coprocessor_request_duration_seconds_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",req=~\"analyze.*|checksum.*\"}\n [$__rate_interval]\n)) by (req) ", + "refId": "", + "step": 10, + "target": "" + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Checksum Request Duration", + "tooltip": { + "msResolution": true, + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "transformations": [], + "transparent": false, + "type": "graph", + "xaxis": { + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "decimals": null, + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + 
"decimals": null, + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": 0 + } + }, + { + "aliasColors": {}, + "bars": false, + "cacheTimeout": null, + "datasource": "${DS_TEST-CLUSTER}", + "description": null, + "editable": true, + "error": false, + "fieldConfig": { + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } + }, + "fill": 1, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, + "gridPos": { + "h": 7, + "w": 12, + "x": 12, + "y": 28 + }, + "height": null, + "hideTimeOverride": false, + "id": 426, + "interval": null, + "isNew": true, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": true, + "hideZero": true, + "max": true, + "min": false, + "rightSide": true, + "show": true, + "sideWidth": null, + "sort": "max", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", + "options": { + "alertThreshold": true, + "dataLinks": [] }, - "pluginVersion": "7.5.11", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "repeatDirection": null, + "seriesOverrides": [], + "span": null, + "stack": false, + "steppedLine": false, "targets": [ { - "exemplar": true, - "expr": "round(sum(increase(tikv_log_backup_flush_file_size_count{instance=~\"$instance\"}[30m])))", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n node_disk_io_time_seconds_total\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (instance, device) ", + "format": "time_series", "hide": false, - "instant": true, + "instant": false, "interval": 
"", - "legendFormat": "{{ instance }}", - "refId": "B" + "intervalFactor": 1, + "legendFormat": "{{instance}}-{{device}}", + "metric": "", + "query": "sum(rate(\n node_disk_io_time_seconds_total\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (instance, device) ", + "refId": "", + "step": 10, + "target": "" } ], + "thresholds": [], "timeFrom": null, "timeShift": null, - "title": "Flush Files (Last 30m)", - "type": "stat" + "title": "IO Utilization", + "tooltip": { + "msResolution": true, + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "transformations": [], + "transparent": false, + "type": "graph", + "xaxis": { + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "decimals": null, + "format": "percentunit", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "decimals": null, + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": 0 + } }, { "aliasColors": {}, "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", - "decimals": 1, - "description": "The CPU utilization of log backup threads. 
\n**(Note this is the average usage for a period of time, some peak of CPU usage may be lost.)**", + "description": null, "editable": true, "error": false, "fieldConfig": { - "defaults": {}, - "overrides": [] + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } + }, + "fill": 1, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" }, - "fill": 0, - "fillGradient": 0, - "grid": {}, "gridPos": { - "h": 10, - "w": 6, + "h": 7, + "w": 8, "x": 0, - "y": 63 + "y": 35 }, - "hiddenSeries": false, - "id": 13262, + "height": null, + "hideTimeOverride": false, + "id": 427, + "interval": null, + "isNew": true, "legend": { "alignAsTable": true, - "avg": true, + "avg": false, "current": true, "hideEmpty": true, + "hideZero": true, "max": true, "min": false, - "rightSide": false, + "rightSide": true, "show": true, "sideWidth": null, - "sort": "current", + "sort": "max", "sortDesc": true, "total": false, "values": true @@ -44901,714 +58997,5913 @@ "lines": true, "linewidth": 1, "links": [], - "nullPointMode": "null", + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", "options": { - "alertThreshold": true + "alertThreshold": true, + "dataLinks": [] }, "percentage": false, - "pluginVersion": "7.5.11", "pointradius": 5, "points": false, "renderer": "flot", + "repeat": null, + "repeatDirection": null, "seriesOverrides": [], - "spaceLength": 10, + "span": null, "stack": false, "steppedLine": false, "targets": [ { - "exemplar": true, - "expr": "sum(rate(tikv_thread_cpu_seconds_total{tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", name=~\"backup_stream|log-backup-scan(-[0-9]+)?\"}[2m])) by (instance)", - "format": "time_series", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_thread_cpu_seconds_total\n 
{k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",name=~\"sst_.*\"}\n [$__rate_interval]\n)) by (instance) ", + "format": "time_series", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "import-{{instance}}", + "metric": "", + "query": "sum(rate(\n tikv_thread_cpu_seconds_total\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",name=~\"sst_.*\"}\n [$__rate_interval]\n)) by (instance) ", + "refId": "", + "step": 10, + "target": "" + }, + { + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_thread_cpu_seconds_total\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",name=~\"sst_.*\"}\n [$__rate_interval]\n)) by (instance, tid) > 0", + "format": "time_series", + "hide": true, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "import-{{instance}}-{{tid}}", + "metric": "", + "query": "sum(rate(\n tikv_thread_cpu_seconds_total\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",name=~\"sst_.*\"}\n [$__rate_interval]\n)) by (instance, tid) > 0", + "refId": "", + "step": 10, + "target": "" + }, + { + "datasource": "${DS_TEST-CLUSTER}", + "expr": "count(rate(\n tikv_thread_cpu_seconds_total\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",name=~\"sst_.*\"}\n [$__rate_interval]\n)) by (instance) ", + "format": "time_series", + "hide": true, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "import-count-{{instance}}", + "metric": "", + "query": "count(rate(\n tikv_thread_cpu_seconds_total\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",name=~\"sst_.*\"}\n [$__rate_interval]\n)) by (instance) ", + "refId": "", + "step": 10, + "target": "" + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Import CPU Utilization", 
+ "tooltip": { + "msResolution": true, + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "transformations": [], + "transparent": false, + "type": "graph", + "xaxis": { + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "decimals": null, + "format": "percentunit", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "decimals": null, + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": 0 + } + }, + { + "aliasColors": {}, + "bars": false, + "cacheTimeout": null, + "datasource": "${DS_TEST-CLUSTER}", + "description": null, + "editable": true, + "error": false, + "fieldConfig": { + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } + }, + "fill": 1, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, + "gridPos": { + "h": 7, + "w": 8, + "x": 8, + "y": 35 + }, + "height": null, + "hideTimeOverride": false, + "id": 428, + "interval": null, + "isNew": true, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": true, + "hideZero": true, + "max": true, + "min": false, + "rightSide": true, + "show": true, + "sideWidth": null, + "sort": "max", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", + "options": { + "alertThreshold": true, + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "repeatDirection": null, + "seriesOverrides": [], + "span": null, + "stack": false, + "steppedLine": false, + "targets": [ + { + "datasource": "${DS_TEST-CLUSTER}", + "expr": "count(rate(\n 
tikv_thread_cpu_seconds_total\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",name=~\"sst_.*\"}\n [$__rate_interval]\n)) by (instance) ", + "format": "time_series", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{instance}}", + "metric": "", + "query": "count(rate(\n tikv_thread_cpu_seconds_total\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",name=~\"sst_.*\"}\n [$__rate_interval]\n)) by (instance) ", + "refId": "", + "step": 10, + "target": "" + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Import Thread Count", + "tooltip": { + "msResolution": true, + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "transformations": [], + "transparent": false, + "type": "graph", + "xaxis": { + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "decimals": null, + "format": "none", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "decimals": null, + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": 0 + } + }, + { + "aliasColors": {}, + "bars": false, + "cacheTimeout": null, + "datasource": "${DS_TEST-CLUSTER}", + "description": null, + "editable": true, + "error": false, + "fieldConfig": { + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } + }, + "fill": 1, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, + "gridPos": { + "h": 7, + "w": 8, + "x": 16, + "y": 35 + }, + "height": null, + "hideTimeOverride": false, + "id": 429, + "interval": null, + "isNew": true, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": true, + "hideZero": true, 
+ "max": true, + "min": false, + "rightSide": true, + "show": true, + "sideWidth": null, + "sort": "max", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", + "options": { + "alertThreshold": true, + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "repeatDirection": null, + "seriesOverrides": [], + "span": null, + "stack": false, + "steppedLine": false, + "targets": [ + { + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(delta(\n tikv_import_error_counter\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (type, error, instance) ", + "format": "time_series", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{type}}-{{error}}-{{instance}}", + "metric": "", + "query": "sum(delta(\n tikv_import_error_counter\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (type, error, instance) ", + "refId": "", + "step": 10, + "target": "" + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Import Errors", + "tooltip": { + "msResolution": true, + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "transformations": [], + "transparent": false, + "type": "graph", + "xaxis": { + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "decimals": null, + "format": "none", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "decimals": null, + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": 0 + } + }, + { + "aliasColors": {}, + "bars": false, + "cacheTimeout": 
null, + "datasource": "${DS_TEST-CLUSTER}", + "description": "", + "editable": true, + "error": false, + "fieldConfig": { + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } + }, + "fill": 1, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, + "gridPos": { + "h": 7, + "w": 12, + "x": 0, + "y": 42 + }, + "height": null, + "hideTimeOverride": false, + "id": 430, + "interval": null, + "isNew": true, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": true, + "hideZero": true, + "max": true, + "min": false, + "rightSide": true, + "show": true, + "sideWidth": null, + "sort": "max", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", + "options": { + "alertThreshold": true, + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "repeatDirection": null, + "seriesOverrides": [ + { + "alias": "count", + "bars": false, + "dashLength": 1, + "dashes": true, + "fill": 2, + "fillBelowTo": null, + "lines": true, + "spaceLength": 1, + "transform": "negative-Y", + "yaxis": 2, + "zindex": -3 + }, + { + "alias": "avg", + "bars": false, + "fill": 7, + "fillBelowTo": null, + "lines": true, + "yaxis": 1, + "zindex": 0 + } + ], + "span": null, + "stack": false, + "steppedLine": false, + "targets": [ + { + "datasource": "${DS_TEST-CLUSTER}", + "expr": "histogram_quantile(0.9999,(\n sum(rate(\n tikv_import_rpc_duration_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (request, le) \n \n \n)) ", + "format": "time_series", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + 
"legendFormat": "99.99%-{{request}}", + "metric": "", + "query": "histogram_quantile(0.9999,(\n sum(rate(\n tikv_import_rpc_duration_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (request, le) \n \n \n)) ", + "refId": "", + "step": 10, + "target": "" + }, + { + "datasource": "${DS_TEST-CLUSTER}", + "expr": "histogram_quantile(0.99,(\n sum(rate(\n tikv_import_rpc_duration_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (request, le) \n \n \n)) ", + "format": "time_series", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "99%-{{request}}", + "metric": "", + "query": "histogram_quantile(0.99,(\n sum(rate(\n tikv_import_rpc_duration_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (request, le) \n \n \n)) ", + "refId": "", + "step": 10, + "target": "" + }, + { + "datasource": "${DS_TEST-CLUSTER}", + "expr": "(sum(rate(\n tikv_import_rpc_duration_sum\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (request) / sum(rate(\n tikv_import_rpc_duration_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (request) )", + "format": "time_series", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "avg-{{request}}", + "metric": "", + "query": "(sum(rate(\n tikv_import_rpc_duration_sum\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (request) / sum(rate(\n tikv_import_rpc_duration_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (request) )", + "refId": "", + "step": 10, + "target": "" + }, + { 
+ "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_import_rpc_duration_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (request) ", + "format": "time_series", + "hide": true, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "count-{{request}}", + "metric": "", + "query": "sum(rate(\n tikv_import_rpc_duration_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (request) ", + "refId": "", + "step": 10, + "target": "" + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Import RPC Duration", + "tooltip": { + "msResolution": true, + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "transformations": [], + "transparent": false, + "type": "graph", + "xaxis": { + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "decimals": null, + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "decimals": null, + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": 0 + } + }, + { + "aliasColors": {}, + "bars": false, + "cacheTimeout": null, + "datasource": "${DS_TEST-CLUSTER}", + "description": null, + "editable": true, + "error": false, + "fieldConfig": { + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } + }, + "fill": 1, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, + "gridPos": { + "h": 7, + "w": 12, + "x": 12, + "y": 42 + }, + "height": null, + "hideTimeOverride": false, + "id": 431, + "interval": null, + "isNew": true, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + 
"hideEmpty": true, + "hideZero": true, + "max": true, + "min": false, + "rightSide": true, + "show": true, + "sideWidth": null, + "sort": "max", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", + "options": { + "alertThreshold": true, + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "repeatDirection": null, + "seriesOverrides": [], + "span": null, + "stack": false, + "steppedLine": false, + "targets": [ + { + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_import_rpc_duration_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",request!=\"switch_mode\"}\n [$__rate_interval]\n)) by (request) ", + "format": "time_series", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{request}}", + "metric": "", + "query": "sum(rate(\n tikv_import_rpc_duration_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",request!=\"switch_mode\"}\n [$__rate_interval]\n)) by (request) ", + "refId": "", + "step": 10, + "target": "" + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Import RPC Ops", + "tooltip": { + "msResolution": true, + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "transformations": [], + "transparent": false, + "type": "graph", + "xaxis": { + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "decimals": null, + "format": "ops", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "decimals": null, + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": 0 + } + }, + { + 
"cacheTimeout": null, + "cards": { + "cardPadding": null, + "cardRound": null + }, + "color": { + "cardColor": "#b4ff00", + "colorScale": "sqrt", + "colorScheme": "interpolateSpectral", + "exponent": 0.5, + "max": null, + "min": null, + "mode": "spectrum" + }, + "dataFormat": "tsbuckets", + "datasource": "${DS_TEST-CLUSTER}", + "description": null, + "editable": true, + "error": false, + "fieldConfig": { + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } + }, + "gridPos": { + "h": 7, + "w": 6, + "x": 0, + "y": 49 + }, + "heatmap": {}, + "height": null, + "hideTimeOverride": false, + "hideZeroBuckets": true, + "highlightCards": true, + "id": 432, + "interval": null, + "legend": { + "show": false + }, + "links": [], + "maxDataPoints": 512, + "maxPerRow": null, + "minSpan": null, + "options": {}, + "repeat": null, + "repeatDirection": null, + "reverseYBuckets": false, + "span": null, + "targets": [ + { + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_import_rpc_duration_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",request=~\"download|write\"}\n [$__rate_interval]\n)) by (le) ", + "format": "heatmap", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{le}}", + "metric": "", + "query": "sum(rate(\n tikv_import_rpc_duration_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",request=~\"download|write\"}\n [$__rate_interval]\n)) by (le) ", + "refId": "", + "step": 10, + "target": "" + } + ], + "timeFrom": null, + "timeShift": null, + "title": "Import Write/Download RPC Duration", + "tooltip": { + "msResolution": true, + "shared": true, + "showHistogram": true, + "sort": 0, + "value_type": "individual" + }, + "transformations": [], + "transparent": false, + "type": "heatmap", + "xAxis": { + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "xBucketNumber": null, + 
"xBucketSize": null, + "yAxis": { + "decimals": 1, + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + "yBucketBound": "upper", + "yBucketNumber": null, + "yBucketSize": null + }, + { + "cacheTimeout": null, + "cards": { + "cardPadding": null, + "cardRound": null + }, + "color": { + "cardColor": "#b4ff00", + "colorScale": "sqrt", + "colorScheme": "interpolateSpectral", + "exponent": 0.5, + "max": null, + "min": null, + "mode": "spectrum" + }, + "dataFormat": "tsbuckets", + "datasource": "${DS_TEST-CLUSTER}", + "description": null, + "editable": true, + "error": false, + "fieldConfig": { + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } + }, + "gridPos": { + "h": 7, + "w": 6, + "x": 6, + "y": 49 + }, + "heatmap": {}, + "height": null, + "hideTimeOverride": false, + "hideZeroBuckets": true, + "highlightCards": true, + "id": 433, + "interval": null, + "legend": { + "show": false + }, + "links": [], + "maxDataPoints": 512, + "maxPerRow": null, + "minSpan": null, + "options": {}, + "repeat": null, + "repeatDirection": null, + "reverseYBuckets": false, + "span": null, + "targets": [ + { + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_import_download_duration_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",type=\"queue\"}\n [$__rate_interval]\n)) by (le) ", + "format": "heatmap", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{le}}", + "metric": "", + "query": "sum(rate(\n tikv_import_download_duration_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",type=\"queue\"}\n [$__rate_interval]\n)) by (le) ", + "refId": "", + "step": 10, + "target": "" + } + ], + "timeFrom": null, + "timeShift": null, + "title": "Import Wait Duration", + "tooltip": { + "msResolution": true, + "shared": true, + "showHistogram": true, + "sort": 0, + "value_type": 
"individual" + }, + "transformations": [], + "transparent": false, + "type": "heatmap", + "xAxis": { + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "xBucketNumber": null, + "xBucketSize": null, + "yAxis": { + "decimals": 1, + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + "yBucketBound": "upper", + "yBucketNumber": null, + "yBucketSize": null + }, + { + "cacheTimeout": null, + "cards": { + "cardPadding": null, + "cardRound": null + }, + "color": { + "cardColor": "#b4ff00", + "colorScale": "sqrt", + "colorScheme": "interpolateSpectral", + "exponent": 0.5, + "max": null, + "min": null, + "mode": "spectrum" + }, + "dataFormat": "tsbuckets", + "datasource": "${DS_TEST-CLUSTER}", + "description": null, + "editable": true, + "error": false, + "fieldConfig": { + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } + }, + "gridPos": { + "h": 7, + "w": 6, + "x": 12, + "y": 49 + }, + "heatmap": {}, + "height": null, + "hideTimeOverride": false, + "hideZeroBuckets": true, + "highlightCards": true, + "id": 434, + "interval": null, + "legend": { + "show": false + }, + "links": [], + "maxDataPoints": 512, + "maxPerRow": null, + "minSpan": null, + "options": {}, + "repeat": null, + "repeatDirection": null, + "reverseYBuckets": false, + "span": null, + "targets": [ + { + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_import_download_duration_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",type=\"read\"}\n [$__rate_interval]\n)) by (le) ", + "format": "heatmap", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{le}}", + "metric": "", + "query": "sum(rate(\n tikv_import_download_duration_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",type=\"read\"}\n [$__rate_interval]\n)) by (le) ", + "refId": "", + "step": 10, + "target": "" 
+ } + ], + "timeFrom": null, + "timeShift": null, + "title": "Import Read SST Duration", + "tooltip": { + "msResolution": true, + "shared": true, + "showHistogram": true, + "sort": 0, + "value_type": "individual" + }, + "transformations": [], + "transparent": false, + "type": "heatmap", + "xAxis": { + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "xBucketNumber": null, + "xBucketSize": null, + "yAxis": { + "decimals": 1, + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + "yBucketBound": "upper", + "yBucketNumber": null, + "yBucketSize": null + }, + { + "cacheTimeout": null, + "cards": { + "cardPadding": null, + "cardRound": null + }, + "color": { + "cardColor": "#b4ff00", + "colorScale": "sqrt", + "colorScheme": "interpolateSpectral", + "exponent": 0.5, + "max": null, + "min": null, + "mode": "spectrum" + }, + "dataFormat": "tsbuckets", + "datasource": "${DS_TEST-CLUSTER}", + "description": null, + "editable": true, + "error": false, + "fieldConfig": { + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } + }, + "gridPos": { + "h": 7, + "w": 6, + "x": 18, + "y": 49 + }, + "heatmap": {}, + "height": null, + "hideTimeOverride": false, + "hideZeroBuckets": true, + "highlightCards": true, + "id": 435, + "interval": null, + "legend": { + "show": false + }, + "links": [], + "maxDataPoints": 512, + "maxPerRow": null, + "minSpan": null, + "options": {}, + "repeat": null, + "repeatDirection": null, + "reverseYBuckets": false, + "span": null, + "targets": [ + { + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_import_download_duration_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",type=\"rewrite\"}\n [$__rate_interval]\n)) by (le) ", + "format": "heatmap", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{le}}", + "metric": "", + "query": "sum(rate(\n 
tikv_import_download_duration_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",type=\"rewrite\"}\n [$__rate_interval]\n)) by (le) ", + "refId": "", + "step": 10, + "target": "" + } + ], + "timeFrom": null, + "timeShift": null, + "title": "Import Rewrite SST Duration", + "tooltip": { + "msResolution": true, + "shared": true, + "showHistogram": true, + "sort": 0, + "value_type": "individual" + }, + "transformations": [], + "transparent": false, + "type": "heatmap", + "xAxis": { + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "xBucketNumber": null, + "xBucketSize": null, + "yAxis": { + "decimals": 1, + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + "yBucketBound": "upper", + "yBucketNumber": null, + "yBucketSize": null + }, + { + "cacheTimeout": null, + "cards": { + "cardPadding": null, + "cardRound": null + }, + "color": { + "cardColor": "#b4ff00", + "colorScale": "sqrt", + "colorScheme": "interpolateSpectral", + "exponent": 0.5, + "max": null, + "min": null, + "mode": "spectrum" + }, + "dataFormat": "tsbuckets", + "datasource": "${DS_TEST-CLUSTER}", + "description": null, + "editable": true, + "error": false, + "fieldConfig": { + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } + }, + "gridPos": { + "h": 7, + "w": 6, + "x": 0, + "y": 56 + }, + "heatmap": {}, + "height": null, + "hideTimeOverride": false, + "hideZeroBuckets": true, + "highlightCards": true, + "id": 436, + "interval": null, + "legend": { + "show": false + }, + "links": [], + "maxDataPoints": 512, + "maxPerRow": null, + "minSpan": null, + "options": {}, + "repeat": null, + "repeatDirection": null, + "reverseYBuckets": false, + "span": null, + "targets": [ + { + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_import_rpc_duration_bucket\n 
{k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",request=~\"ingest\"}\n [$__rate_interval]\n)) by (le) ", + "format": "heatmap", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{le}}", + "metric": "", + "query": "sum(rate(\n tikv_import_rpc_duration_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",request=~\"ingest\"}\n [$__rate_interval]\n)) by (le) ", + "refId": "", + "step": 10, + "target": "" + } + ], + "timeFrom": null, + "timeShift": null, + "title": "Import Ingest RPC Duration", + "tooltip": { + "msResolution": true, + "shared": true, + "showHistogram": true, + "sort": 0, + "value_type": "individual" + }, + "transformations": [], + "transparent": false, + "type": "heatmap", + "xAxis": { + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "xBucketNumber": null, + "xBucketSize": null, + "yAxis": { + "decimals": 1, + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + "yBucketBound": "upper", + "yBucketNumber": null, + "yBucketSize": null + }, + { + "cacheTimeout": null, + "cards": { + "cardPadding": null, + "cardRound": null + }, + "color": { + "cardColor": "#b4ff00", + "colorScale": "sqrt", + "colorScheme": "interpolateSpectral", + "exponent": 0.5, + "max": null, + "min": null, + "mode": "spectrum" + }, + "dataFormat": "tsbuckets", + "datasource": "${DS_TEST-CLUSTER}", + "description": null, + "editable": true, + "error": false, + "fieldConfig": { + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } + }, + "gridPos": { + "h": 7, + "w": 6, + "x": 6, + "y": 56 + }, + "heatmap": {}, + "height": null, + "hideTimeOverride": false, + "hideZeroBuckets": true, + "highlightCards": true, + "id": 437, + "interval": null, + "legend": { + "show": false + }, + "links": [], + "maxDataPoints": 512, + "maxPerRow": null, + "minSpan": null, + "options": {}, 
+ "repeat": null, + "repeatDirection": null, + "reverseYBuckets": false, + "span": null, + "targets": [ + { + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_import_ingest_duration_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",type=~\"ingest\"}\n [$__rate_interval]\n)) by (le) ", + "format": "heatmap", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{le}}", + "metric": "", + "query": "sum(rate(\n tikv_import_ingest_duration_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",type=~\"ingest\"}\n [$__rate_interval]\n)) by (le) ", + "refId": "", + "step": 10, + "target": "" + } + ], + "timeFrom": null, + "timeShift": null, + "title": "Import Ingest SST Duration", + "tooltip": { + "msResolution": true, + "shared": true, + "showHistogram": true, + "sort": 0, + "value_type": "individual" + }, + "transformations": [], + "transparent": false, + "type": "heatmap", + "xAxis": { + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "xBucketNumber": null, + "xBucketSize": null, + "yAxis": { + "decimals": 1, + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + "yBucketBound": "upper", + "yBucketNumber": null, + "yBucketSize": null + }, + { + "cacheTimeout": null, + "cards": { + "cardPadding": null, + "cardRound": null + }, + "color": { + "cardColor": "#b4ff00", + "colorScale": "sqrt", + "colorScheme": "interpolateSpectral", + "exponent": 0.5, + "max": null, + "min": null, + "mode": "spectrum" + }, + "dataFormat": "tsbuckets", + "datasource": "${DS_TEST-CLUSTER}", + "description": null, + "editable": true, + "error": false, + "fieldConfig": { + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } + }, + "gridPos": { + "h": 7, + "w": 6, + "x": 12, + "y": 56 + }, + "heatmap": {}, + "height": null, + "hideTimeOverride": false, + 
"hideZeroBuckets": true, + "highlightCards": true, + "id": 438, + "interval": null, + "legend": { + "show": false + }, + "links": [], + "maxDataPoints": 512, + "maxPerRow": null, + "minSpan": null, + "options": {}, + "repeat": null, + "repeatDirection": null, + "reverseYBuckets": false, + "span": null, + "targets": [ + { + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_import_ingest_byte_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (le) ", + "format": "heatmap", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{le}}", + "metric": "", + "query": "sum(rate(\n tikv_import_ingest_byte_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (le) ", + "refId": "", + "step": 10, + "target": "" + } + ], + "timeFrom": null, + "timeShift": null, + "title": "Import Ingest SST Bytes", + "tooltip": { + "msResolution": true, + "shared": true, + "showHistogram": true, + "sort": 0, + "value_type": "individual" + }, + "transformations": [], + "transparent": false, + "type": "heatmap", + "xAxis": { + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "xBucketNumber": null, + "xBucketSize": null, + "yAxis": { + "decimals": 1, + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + "yBucketBound": "upper", + "yBucketNumber": null, + "yBucketSize": null + }, + { + "aliasColors": {}, + "bars": false, + "cacheTimeout": null, + "datasource": "${DS_TEST-CLUSTER}", + "description": null, + "editable": true, + "error": false, + "fieldConfig": { + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } + }, + "fill": 1, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, + 
"gridPos": { + "h": 7, + "w": 6, + "x": 18, + "y": 56 + }, + "height": null, + "hideTimeOverride": false, + "id": 439, + "interval": null, + "isNew": true, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": true, + "hideZero": true, + "max": true, + "min": false, + "rightSide": true, + "show": true, + "sideWidth": null, + "sort": "max", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", + "options": { + "alertThreshold": true, + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "repeatDirection": null, + "seriesOverrides": [], + "span": null, + "stack": false, + "steppedLine": false, + "targets": [ + { + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_import_download_bytes_sum\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (instance) ", + "format": "time_series", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{instance}}", + "metric": "", + "query": "sum(rate(\n tikv_import_download_bytes_sum\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (instance) ", + "refId": "", + "step": 10, + "target": "" + }, + { + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_import_download_bytes_sum\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) ", + "format": "time_series", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "total", + "metric": "", + "query": "sum(rate(\n tikv_import_download_bytes_sum\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n 
[$__rate_interval]\n)) ", + "refId": "", + "step": 10, + "target": "" + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Import Download SST Throughput", + "tooltip": { + "msResolution": true, + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "transformations": [], + "transparent": false, + "type": "graph", + "xaxis": { + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "decimals": null, + "format": "binBps", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "decimals": null, + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": 0 + } + }, + { + "aliasColors": {}, + "bars": false, + "cacheTimeout": null, + "datasource": "${DS_TEST-CLUSTER}", + "description": null, + "editable": true, + "error": false, + "fieldConfig": { + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } + }, + "fill": 1, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, + "gridPos": { + "h": 7, + "w": 12, + "x": 0, + "y": 63 + }, + "height": null, + "hideTimeOverride": false, + "id": 440, + "interval": null, + "isNew": true, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": true, + "hideZero": true, + "max": true, + "min": false, + "rightSide": true, + "show": true, + "sideWidth": null, + "sort": "max", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", + "options": { + "alertThreshold": true, + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + 
"repeatDirection": null, + "seriesOverrides": [], + "span": null, + "stack": false, + "steppedLine": false, + "targets": [ + { + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(delta(\n tikv_import_local_write_keys\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (type, instance) ", + "format": "time_series", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{type}}-{{instance}}", + "metric": "", + "query": "sum(delta(\n tikv_import_local_write_keys\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (type, instance) ", + "refId": "", + "step": 10, + "target": "" + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Import Local Write keys", + "tooltip": { + "msResolution": true, + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "transformations": [], + "transparent": false, + "type": "graph", + "xaxis": { + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "decimals": null, + "format": "none", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "decimals": null, + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": 0 + } + }, + { + "aliasColors": {}, + "bars": false, + "cacheTimeout": null, + "datasource": "${DS_TEST-CLUSTER}", + "description": null, + "editable": true, + "error": false, + "fieldConfig": { + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } + }, + "fill": 1, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, + "gridPos": { + "h": 7, + "w": 12, + "x": 12, + "y": 63 + }, + "height": null, + 
"hideTimeOverride": false, + "id": 441, + "interval": null, + "isNew": true, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": true, + "hideZero": true, + "max": true, + "min": false, + "rightSide": true, + "show": true, + "sideWidth": null, + "sort": "max", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", + "options": { + "alertThreshold": true, + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "repeatDirection": null, + "seriesOverrides": [], + "span": null, + "stack": false, + "steppedLine": false, + "targets": [ + { + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_import_local_write_bytes\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (type, instance) ", + "format": "time_series", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{type}}-{{instance}}", + "metric": "", + "query": "sum(rate(\n tikv_import_local_write_bytes\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (type, instance) ", + "refId": "", + "step": 10, + "target": "" + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Import Local Write bytes", + "tooltip": { + "msResolution": true, + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "transformations": [], + "transparent": false, + "type": "graph", + "xaxis": { + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "decimals": null, + "format": "binBps", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "decimals": null, + "format": "short", + "label": null, + 
"logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": 0 + } + }, + { + "aliasColors": {}, + "bars": false, + "cacheTimeout": null, + "datasource": "${DS_TEST-CLUSTER}", + "description": null, + "editable": true, + "error": false, + "fieldConfig": { + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } + }, + "fill": 1, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, + "gridPos": { + "h": 7, + "w": 12, + "x": 0, + "y": 70 + }, + "height": null, + "hideTimeOverride": false, + "id": 442, + "interval": null, + "isNew": true, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": true, + "hideZero": true, + "max": true, + "min": false, + "rightSide": true, + "show": true, + "sideWidth": null, + "sort": "max", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", + "options": { + "alertThreshold": true, + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "repeatDirection": null, + "seriesOverrides": [], + "span": null, + "stack": false, + "steppedLine": false, + "targets": [ + { + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum((\n tikv_backup_raw_expired_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n \n)) by (instance) ", + "format": "time_series", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{instance}}", + "metric": "", + "query": "sum((\n tikv_backup_raw_expired_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n \n)) by (instance) ", + "refId": "", 
+ "step": 10, + "target": "" + }, + { + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum((\n tikv_backup_raw_expired_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n \n)) ", + "format": "time_series", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "sum", + "metric": "", + "query": "sum((\n tikv_backup_raw_expired_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n \n)) ", + "refId": "", + "step": 10, + "target": "" + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "TTL Expired", + "tooltip": { + "msResolution": true, + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "transformations": [], + "transparent": false, + "type": "graph", + "xaxis": { + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "decimals": null, + "format": "none", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "decimals": null, + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": 0 + } + }, + { + "aliasColors": {}, + "bars": false, + "cacheTimeout": null, + "datasource": "${DS_TEST-CLUSTER}", + "description": "", + "editable": true, + "error": false, + "fieldConfig": { + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } + }, + "fill": 1, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, + "gridPos": { + "h": 7, + "w": 12, + "x": 12, + "y": 70 + }, + "height": null, + "hideTimeOverride": false, + "id": 443, + "interval": null, + "isNew": true, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": true, + "hideZero": true, + "max": true, + "min": 
false, + "rightSide": true, + "show": true, + "sideWidth": null, + "sort": "max", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", + "options": { + "alertThreshold": true, + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "repeatDirection": null, + "seriesOverrides": [], + "span": null, + "stack": false, + "steppedLine": false, + "targets": [ + { + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_cloud_request_duration_seconds_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (cloud, req) ", + "format": "time_series", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{cloud}}-{{req}}", + "metric": "", + "query": "sum(rate(\n tikv_cloud_request_duration_seconds_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (cloud, req) ", + "refId": "", + "step": 10, + "target": "" + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "cloud request", + "tooltip": { + "msResolution": true, + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "transformations": [], + "transparent": false, + "type": "graph", + "xaxis": { + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "decimals": null, + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "decimals": null, + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": 0 + } + } + ], + "repeat": null, + "repeatDirection": null, + "span": null, + "targets": [], + 
"timeFrom": null, + "timeShift": null, + "title": "Backup & Import", + "transformations": [], + "transparent": false, + "type": "row" + }, + { + "cacheTimeout": null, + "collapsed": true, + "datasource": null, + "description": null, + "editable": true, + "error": false, + "fieldConfig": { + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } + }, + "gridPos": { + "h": 7, + "w": 24, + "x": 0, + "y": 0 + }, + "height": null, + "hideTimeOverride": false, + "id": 444, + "interval": null, + "links": [], + "maxDataPoints": 100, + "maxPerRow": null, + "minSpan": null, + "panels": [ + { + "aliasColors": {}, + "bars": false, + "cacheTimeout": null, + "datasource": "${DS_TEST-CLUSTER}", + "description": "Total number of encryption data keys in use", + "editable": true, + "error": false, + "fieldConfig": { + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } + }, + "fill": 1, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, + "gridPos": { + "h": 7, + "w": 12, + "x": 0, + "y": 0 + }, + "height": null, + "hideTimeOverride": false, + "id": 445, + "interval": null, + "isNew": true, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": true, + "hideZero": true, + "max": true, + "min": false, + "rightSide": true, + "show": true, + "sideWidth": null, + "sort": "max", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", + "options": { + "alertThreshold": true, + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "repeatDirection": null, + "seriesOverrides": [], + "span": null, + "stack": false, + "steppedLine": false, + "targets": [ + { + 
"datasource": "${DS_TEST-CLUSTER}", + "expr": "sum((\n tikv_encryption_data_key_storage_total\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n \n)) by (instance) ", + "format": "time_series", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{instance}}", + "metric": "", + "query": "sum((\n tikv_encryption_data_key_storage_total\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n \n)) by (instance) ", + "refId": "", + "step": 10, + "target": "" + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Encryption data keys", + "tooltip": { + "msResolution": true, + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "transformations": [], + "transparent": false, + "type": "graph", + "xaxis": { + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "decimals": null, + "format": "none", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "decimals": null, + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": 0 + } + }, + { + "aliasColors": {}, + "bars": false, + "cacheTimeout": null, + "datasource": "${DS_TEST-CLUSTER}", + "description": "Number of files being encrypted", + "editable": true, + "error": false, + "fieldConfig": { + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } + }, + "fill": 1, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, + "gridPos": { + "h": 7, + "w": 12, + "x": 12, + "y": 0 + }, + "height": null, + "hideTimeOverride": false, + "id": 446, + "interval": null, + "isNew": true, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + 
"hideEmpty": true, + "hideZero": true, + "max": true, + "min": false, + "rightSide": true, + "show": true, + "sideWidth": null, + "sort": "max", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", + "options": { + "alertThreshold": true, + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "repeatDirection": null, + "seriesOverrides": [], + "span": null, + "stack": false, + "steppedLine": false, + "targets": [ + { + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum((\n tikv_encryption_file_num\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n \n)) by (instance) ", + "format": "time_series", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{instance}}", + "metric": "", + "query": "sum((\n tikv_encryption_file_num\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n \n)) by (instance) ", + "refId": "", + "step": 10, + "target": "" + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Encrypted files", + "tooltip": { + "msResolution": true, + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "transformations": [], + "transparent": false, + "type": "graph", + "xaxis": { + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "decimals": null, + "format": "none", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "decimals": null, + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": 0 + } + }, + { + "aliasColors": {}, + "bars": false, + "cacheTimeout": null, + "datasource": "${DS_TEST-CLUSTER}", + 
"description": "Flag to indicate if encryption is initialized", + "editable": true, + "error": false, + "fieldConfig": { + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } + }, + "fill": 1, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, + "gridPos": { + "h": 7, + "w": 12, + "x": 0, + "y": 7 + }, + "height": null, + "hideTimeOverride": false, + "id": 447, + "interval": null, + "isNew": true, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": true, + "hideZero": true, + "max": true, + "min": false, + "rightSide": true, + "show": true, + "sideWidth": null, + "sort": "max", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", + "options": { + "alertThreshold": true, + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "repeatDirection": null, + "seriesOverrides": [], + "span": null, + "stack": false, + "steppedLine": false, + "targets": [ + { + "datasource": "${DS_TEST-CLUSTER}", + "expr": "((\n tikv_encryption_is_initialized\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n \n)) ", + "format": "time_series", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{instance}}", + "metric": "", + "query": "((\n tikv_encryption_is_initialized\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n \n)) ", + "refId": "", + "step": 10, + "target": "" + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Encryption initialized", + "tooltip": { + "msResolution": true, + "shared": true, + "sort": 0, + "value_type": 
"individual" + }, + "transformations": [], + "transparent": false, + "type": "graph", + "xaxis": { + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "decimals": null, + "format": "none", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "decimals": null, + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": 0 + } + }, + { + "aliasColors": {}, + "bars": false, + "cacheTimeout": null, + "datasource": "${DS_TEST-CLUSTER}", + "description": "Total size of encryption meta files", + "editable": true, + "error": false, + "fieldConfig": { + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } + }, + "fill": 1, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, + "gridPos": { + "h": 7, + "w": 12, + "x": 12, + "y": 7 + }, + "height": null, + "hideTimeOverride": false, + "id": 448, + "interval": null, + "isNew": true, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": true, + "hideZero": true, + "max": true, + "min": false, + "rightSide": true, + "show": true, + "sideWidth": null, + "sort": "max", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", + "options": { + "alertThreshold": true, + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "repeatDirection": null, + "seriesOverrides": [], + "span": null, + "stack": false, + "steppedLine": false, + "targets": [ + { + "datasource": "${DS_TEST-CLUSTER}", + "expr": "((\n tikv_encryption_meta_file_size_bytes\n 
{k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n \n)) ", + "format": "time_series", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{name}}-{{instance}}", + "metric": "", + "query": "((\n tikv_encryption_meta_file_size_bytes\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n \n)) ", + "refId": "", + "step": 10, + "target": "" + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Encryption meta files size", + "tooltip": { + "msResolution": true, + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "transformations": [], + "transparent": false, + "type": "graph", + "xaxis": { + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "decimals": null, + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "decimals": null, + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": 0 + } + }, + { + "aliasColors": {}, + "bars": false, + "cacheTimeout": null, + "datasource": "${DS_TEST-CLUSTER}", + "description": "", + "editable": true, + "error": false, + "fieldConfig": { + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } + }, + "fill": 1, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, + "gridPos": { + "h": 7, + "w": 12, + "x": 0, + "y": 14 + }, + "height": null, + "hideTimeOverride": false, + "id": 449, + "interval": null, + "isNew": true, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": true, + "hideZero": true, + "max": true, + "min": false, + "rightSide": true, + "show": true, + "sideWidth": null, + "sort": "max", + 
"sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", + "options": { + "alertThreshold": true, + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "repeatDirection": null, + "seriesOverrides": [], + "span": null, + "stack": false, + "steppedLine": false, + "targets": [ + { + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_coprocessor_rocksdb_perf\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",metric=\"encrypt_data_nanos\"}\n [$__rate_interval]\n)) by (req) ", + "format": "time_series", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "encrypt-{{req}}", + "metric": "", + "query": "sum(rate(\n tikv_coprocessor_rocksdb_perf\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",metric=\"encrypt_data_nanos\"}\n [$__rate_interval]\n)) by (req) ", + "refId": "", + "step": 10, + "target": "" + }, + { + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_coprocessor_rocksdb_perf\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",metric=\"decrypt_data_nanos\"}\n [$__rate_interval]\n)) by (req) ", + "format": "time_series", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "decrypt-{{req}}", + "metric": "", + "query": "sum(rate(\n tikv_coprocessor_rocksdb_perf\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",metric=\"decrypt_data_nanos\"}\n [$__rate_interval]\n)) by (req) ", + "refId": "", + "step": 10, + "target": "" + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Encrypt/decrypt data nanos", + "tooltip": { + "msResolution": true, + "shared": true, + 
"sort": 0, + "value_type": "individual" + }, + "transformations": [], + "transparent": false, + "type": "graph", + "xaxis": { + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "decimals": null, + "format": "none", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "decimals": null, + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": 0 + } + }, + { + "aliasColors": {}, + "bars": false, + "cacheTimeout": null, + "datasource": "${DS_TEST-CLUSTER}", + "description": "Writing or reading file duration (second)", + "editable": true, + "error": false, + "fieldConfig": { + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } + }, + "fill": 1, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, + "gridPos": { + "h": 7, + "w": 12, + "x": 12, + "y": 14 + }, + "height": null, + "hideTimeOverride": false, + "id": 450, + "interval": null, + "isNew": true, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": true, + "hideZero": true, + "max": true, + "min": false, + "rightSide": true, + "show": true, + "sideWidth": null, + "sort": "max", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", + "options": { + "alertThreshold": true, + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "repeatDirection": null, + "seriesOverrides": [ + { + "alias": "count", + "bars": false, + "dashLength": 1, + "dashes": true, + "fill": 2, + "fillBelowTo": null, + "lines": true, + "spaceLength": 1, + "transform": 
"negative-Y", + "yaxis": 2, + "zindex": -3 + }, + { + "alias": "avg", + "bars": false, + "fill": 7, + "fillBelowTo": null, + "lines": true, + "yaxis": 1, + "zindex": 0 + } + ], + "span": null, + "stack": false, + "steppedLine": false, + "targets": [ + { + "datasource": "${DS_TEST-CLUSTER}", + "expr": "histogram_quantile(0.9999,(\n sum(rate(\n tikv_encryption_write_read_file_duration_seconds_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (le) \n \n \n)) ", + "format": "time_series", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "99.99%", + "metric": "", + "query": "histogram_quantile(0.9999,(\n sum(rate(\n tikv_encryption_write_read_file_duration_seconds_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (le) \n \n \n)) ", + "refId": "", + "step": 10, + "target": "" + }, + { + "datasource": "${DS_TEST-CLUSTER}", + "expr": "histogram_quantile(0.99,(\n sum(rate(\n tikv_encryption_write_read_file_duration_seconds_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (le) \n \n \n)) ", + "format": "time_series", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "99%", + "metric": "", + "query": "histogram_quantile(0.99,(\n sum(rate(\n tikv_encryption_write_read_file_duration_seconds_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (le) \n \n \n)) ", + "refId": "", + "step": 10, + "target": "" + }, + { + "datasource": "${DS_TEST-CLUSTER}", + "expr": "(sum(rate(\n tikv_encryption_write_read_file_duration_seconds_sum\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) / sum(rate(\n 
tikv_encryption_write_read_file_duration_seconds_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) )", + "format": "time_series", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "avg", + "metric": "", + "query": "(sum(rate(\n tikv_encryption_write_read_file_duration_seconds_sum\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) / sum(rate(\n tikv_encryption_write_read_file_duration_seconds_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) )", + "refId": "", + "step": 10, + "target": "" + }, + { + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_encryption_write_read_file_duration_seconds_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) ", + "format": "time_series", + "hide": true, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "count", + "metric": "", + "query": "sum(rate(\n tikv_encryption_write_read_file_duration_seconds_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) ", + "refId": "", + "step": 10, + "target": "" + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Read/write encryption meta duration", + "tooltip": { + "msResolution": true, + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "transformations": [], + "transparent": false, + "type": "graph", + "xaxis": { + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "decimals": null, + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "decimals": null, + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": 
true + } + ], + "yaxis": { + "align": false, + "alignLevel": 0 + } + } + ], + "repeat": null, + "repeatDirection": null, + "span": null, + "targets": [], + "timeFrom": null, + "timeShift": null, + "title": "Encryption", + "transformations": [], + "transparent": false, + "type": "row" + }, + { + "cacheTimeout": null, + "collapsed": true, + "datasource": null, + "description": null, + "editable": true, + "error": false, + "fieldConfig": { + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } + }, + "gridPos": { + "h": 7, + "w": 24, + "x": 0, + "y": 0 + }, + "height": null, + "hideTimeOverride": false, + "id": 451, + "interval": null, + "links": [], + "maxDataPoints": 100, + "maxPerRow": null, + "minSpan": null, + "panels": [ + { + "cacheTimeout": null, + "datasource": "${DS_TEST-CLUSTER}", + "description": null, + "editable": true, + "error": false, + "fieldConfig": { + "defaults": { + "custom": {}, + "decimals": null, + "mappings": [ + { + "options": { + "0": { + "color": "red", + "index": null, + "text": "Disabled" + }, + "1": { + "color": "green", + "index": null, + "text": "Enabled" + } + }, + "type": "value" + } + ], + "noValue": "none", + "thresholds": { + "mode": "absolute", + "steps": "" + }, + "unit": "none" + }, + "overrides": [] + }, + "gridPos": { + "h": 7, + "w": 6, + "x": 0, + "y": 0 + }, + "height": null, + "hideTimeOverride": false, + "id": 452, + "interval": null, + "links": [], + "maxDataPoints": 100, + "maxPerRow": null, + "minSpan": null, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "repeat": null, + "repeatDirection": null, + "span": null, + "targets": [ + { + "datasource": "${DS_TEST-CLUSTER}", + "expr": "((\n tikv_log_backup_enabled\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n \n)) ", + 
"format": "time_series", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{ instance }}", + "metric": "", + "query": "((\n tikv_log_backup_enabled\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n \n)) ", + "refId": "", + "step": 10, + "target": "" + } + ], + "timeFrom": null, + "timeShift": null, + "title": "Endpoint Status", + "transformations": [], + "transparent": false, + "type": "stat" + }, + { + "cacheTimeout": null, + "datasource": "${DS_TEST-CLUSTER}", + "description": null, + "editable": true, + "error": false, + "fieldConfig": { + "defaults": { + "custom": {}, + "decimals": null, + "mappings": [ + { + "options": { + "0": { + "color": "green", + "index": null, + "text": "Running" + }, + "1": { + "color": "yellow", + "index": null, + "text": "Paused" + }, + "2": { + "color": "red", + "index": null, + "text": "Error" + } + }, + "type": "value" + } + ], + "noValue": "none", + "thresholds": { + "mode": "absolute", + "steps": "" + }, + "unit": "none" + }, + "overrides": [] + }, + "gridPos": { + "h": 7, + "w": 6, + "x": 6, + "y": 0 + }, + "height": null, + "hideTimeOverride": false, + "id": 453, + "interval": null, + "links": [], + "maxDataPoints": 100, + "maxPerRow": null, + "minSpan": null, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "repeat": null, + "repeatDirection": null, + "span": null, + "targets": [ + { + "datasource": "${DS_TEST-CLUSTER}", + "expr": "min((\n tikv_log_backup_task_status\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n \n)) by (instance) ", + "format": "time_series", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{instance}}", + "metric": "", + "query": "min((\n 
tikv_log_backup_task_status\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n \n)) by (instance) ", + "refId": "", + "step": 10, + "target": "" + } + ], + "timeFrom": null, + "timeShift": null, + "title": "Task Status", + "transformations": [], + "transparent": false, + "type": "stat" + }, + { + "cacheTimeout": null, + "datasource": "${DS_TEST-CLUSTER}", + "description": null, + "editable": true, + "error": false, + "fieldConfig": { + "defaults": { + "custom": {}, + "decimals": null, + "mappings": null, + "noValue": "none", + "thresholds": { + "mode": "absolute", + "steps": "" + }, + "unit": "none" + }, + "overrides": [] + }, + "gridPos": { + "h": 7, + "w": 6, + "x": 12, + "y": 0 + }, + "height": null, + "hideTimeOverride": false, + "id": 454, + "interval": null, + "links": [], + "maxDataPoints": 100, + "maxPerRow": null, + "minSpan": null, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "name" + }, + "repeat": null, + "repeatDirection": null, + "span": null, + "targets": [ + { + "datasource": "${DS_TEST-CLUSTER}", + "expr": "tidb_log_backup_advancer_owner > 0", + "format": "time_series", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{ instance }}", + "metric": "", + "query": "tidb_log_backup_advancer_owner > 0", + "refId": "", + "step": 10, + "target": "" + } + ], + "timeFrom": null, + "timeShift": null, + "title": "Advancer Owner", + "transformations": [], + "transparent": false, + "type": "stat" + }, + { + "cacheTimeout": null, + "datasource": "${DS_TEST-CLUSTER}", + "description": "The average flush size of last 30mins.", + "editable": true, + "error": false, + "fieldConfig": { + "defaults": { + "custom": {}, + "decimals": null, + "mappings": null, + "noValue": "none", + "thresholds": { + "mode": 
"absolute", + "steps": "" + }, + "unit": "bytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 7, + "w": 6, + "x": 18, + "y": 0 + }, + "height": null, + "hideTimeOverride": false, + "id": 455, + "interval": null, + "links": [], + "maxDataPoints": 100, + "maxPerRow": null, + "minSpan": null, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "repeat": null, + "repeatDirection": null, + "span": null, + "targets": [ + { + "datasource": "${DS_TEST-CLUSTER}", + "expr": "(sum(increase(\n tikv_log_backup_flush_file_size_sum\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [30m]\n)) by (instance) / sum(increase(\n tikv_log_backup_flush_duration_sec_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",stage=~\"save_files\"}\n [30m]\n)) by (instance) )", + "format": "time_series", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{ instance }}", + "metric": "", + "query": "(sum(increase(\n tikv_log_backup_flush_file_size_sum\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [30m]\n)) by (instance) / sum(increase(\n tikv_log_backup_flush_duration_sec_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",stage=~\"save_files\"}\n [30m]\n)) by (instance) )", + "refId": "", + "step": 10, + "target": "" + } + ], + "timeFrom": null, + "timeShift": null, + "title": "Average Flush Size", + "transformations": [], + "transparent": false, + "type": "stat" + }, + { + "cacheTimeout": null, + "datasource": "${DS_TEST-CLUSTER}", + "description": "The current total flushed file number of this run.", + "editable": true, + "error": false, + "fieldConfig": { + "defaults": { + "custom": {}, + "decimals": 0, 
+ "mappings": null, + "noValue": "none", + "thresholds": { + "mode": "absolute", + "steps": "" + }, + "unit": "none" + }, + "overrides": [] + }, + "gridPos": { + "h": 7, + "w": 6, + "x": 0, + "y": 7 + }, + "height": null, + "hideTimeOverride": false, + "id": 456, + "interval": null, + "links": [], + "maxDataPoints": 100, + "maxPerRow": null, + "minSpan": null, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "repeat": null, + "repeatDirection": null, + "span": null, + "targets": [ + { + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(delta(\n tikv_log_backup_flush_file_size_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [30m]\n)) by (instance) > 0", + "format": "time_series", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{instance}}", + "metric": "", + "query": "sum(delta(\n tikv_log_backup_flush_file_size_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [30m]\n)) by (instance) > 0", + "refId": "", + "step": 10, + "target": "" + } + ], + "timeFrom": null, + "timeShift": null, + "title": "Flushed Files (Last 30m) Per Host", + "transformations": [], + "transparent": false, + "type": "stat" + }, + { + "cacheTimeout": null, + "datasource": "${DS_TEST-CLUSTER}", + "description": "This is the summary of the file count has been flushed, summered by the data each TiKV has flushed since last boot.\n**NOTE: The size may get reduced if some of TiKVs reboot.**", + "editable": true, + "error": false, + "fieldConfig": { + "defaults": { + "custom": {}, + "decimals": 0, + "mappings": null, + "noValue": "none", + "thresholds": { + "mode": "absolute", + "steps": "" + }, + "unit": "none" + }, + "overrides": [] + }, + "gridPos": { + "h": 7, + "w": 6, + 
"x": 6, + "y": 7 + }, + "height": null, + "hideTimeOverride": false, + "id": 457, + "interval": null, + "links": [], + "maxDataPoints": 100, + "maxPerRow": null, + "minSpan": null, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "repeat": null, + "repeatDirection": null, + "span": null, + "targets": [ + { + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(delta(\n tikv_log_backup_flush_duration_sec_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",stage=~\"save_files\"}\n [30m]\n)) by (instance) ", + "format": "time_series", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{instance}}", + "metric": "", + "query": "sum(delta(\n tikv_log_backup_flush_duration_sec_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",stage=~\"save_files\"}\n [30m]\n)) by (instance) ", + "refId": "", + "step": 10, + "target": "" + } + ], + "timeFrom": null, + "timeShift": null, + "title": "Flush Times (Last 30m)", + "transformations": [], + "transparent": false, + "type": "stat" + }, + { + "cacheTimeout": null, + "datasource": "${DS_TEST-CLUSTER}", + "description": "This is the summary of the size has been flushed, summered by the data each TiKV has flushed since last boot.\n**NOTE: The size may get reduced if some of TiKVs reboot.**", + "editable": true, + "error": false, + "fieldConfig": { + "defaults": { + "custom": {}, + "decimals": null, + "mappings": null, + "noValue": "none", + "thresholds": { + "mode": "absolute", + "steps": "" + }, + "unit": "bytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 7, + "w": 6, + "x": 12, + "y": 7 + }, + "height": null, + "hideTimeOverride": false, + "id": 458, + "interval": null, + "links": [], + "maxDataPoints": 100, + 
"maxPerRow": null, + "minSpan": null, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "repeat": null, + "repeatDirection": null, + "span": null, + "targets": [ + { + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(delta(\n tikv_log_backup_flush_file_size_sum\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [30m]\n)) by (instance) ", + "format": "time_series", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{instance}}", + "metric": "", + "query": "sum(delta(\n tikv_log_backup_flush_file_size_sum\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [30m]\n)) by (instance) ", + "refId": "", + "step": 10, + "target": "" + } + ], + "timeFrom": null, + "timeShift": null, + "title": "Total Flushed Size (Last 30m)", + "transformations": [], + "transparent": false, + "type": "stat" + }, + { + "cacheTimeout": null, + "datasource": "${DS_TEST-CLUSTER}", + "description": "This is the summary of the file count has been flushed, summered by the data each TiKV has flushed since last boot.\n**NOTE: The size may get reduced if some of TiKVs reboot.**", + "editable": true, + "error": false, + "fieldConfig": { + "defaults": { + "custom": {}, + "decimals": 0, + "mappings": null, + "noValue": "none", + "thresholds": { + "mode": "absolute", + "steps": "" + }, + "unit": "none" + }, + "overrides": [] + }, + "gridPos": { + "h": 7, + "w": 6, + "x": 18, + "y": 7 + }, + "height": null, + "hideTimeOverride": false, + "id": 459, + "interval": null, + "links": [], + "maxDataPoints": 100, + "maxPerRow": null, + "minSpan": null, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + 
"lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "repeat": null, + "repeatDirection": null, + "span": null, + "targets": [ + { + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(delta(\n tikv_log_backup_flush_file_size_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [30m]\n)) by (instance) ", + "format": "time_series", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{instance}}", + "metric": "", + "query": "sum(delta(\n tikv_log_backup_flush_file_size_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [30m]\n)) by (instance) ", + "refId": "", + "step": 10, + "target": "" + } + ], + "timeFrom": null, + "timeShift": null, + "title": "Flush Files (Last 30m)", + "transformations": [], + "transparent": false, + "type": "stat" + }, + { + "aliasColors": {}, + "bars": false, + "cacheTimeout": null, + "datasource": "${DS_TEST-CLUSTER}", + "description": "The CPU utilization of log backup threads. 
\n**(Note this is the average usage for a period of time, some peak of CPU usage may be lost.)**", + "editable": true, + "error": false, + "fieldConfig": { + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } + }, + "fill": 1, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, + "gridPos": { + "h": 7, + "w": 12, + "x": 0, + "y": 14 + }, + "height": null, + "hideTimeOverride": false, + "id": 460, + "interval": null, + "isNew": true, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": true, + "hideZero": true, + "max": true, + "min": false, + "rightSide": true, + "show": true, + "sideWidth": null, + "sort": "max", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", + "options": { + "alertThreshold": true, + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "repeatDirection": null, + "seriesOverrides": [], + "span": null, + "stack": false, + "steppedLine": false, + "targets": [ + { + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_thread_cpu_seconds_total\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",name=~\"backup_stream|log-backup-scan(-[0-9]+)?\"}\n [$__rate_interval]\n)) by (instance) ", + "format": "time_series", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{instance}}", + "metric": "", + "query": "sum(rate(\n tikv_thread_cpu_seconds_total\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",name=~\"backup_stream|log-backup-scan(-[0-9]+)?\"}\n [$__rate_interval]\n)) by (instance) ", + "refId": "", + "step": 
10, + "target": "" + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "CPU Usage", + "tooltip": { + "msResolution": true, + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "transformations": [], + "transparent": false, + "type": "graph", + "xaxis": { + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "decimals": null, + "format": "percentunit", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "decimals": null, + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": 0 + } + }, + { + "aliasColors": {}, + "bars": false, + "cacheTimeout": null, + "datasource": "${DS_TEST-CLUSTER}", + "description": "", + "editable": true, + "error": false, + "fieldConfig": { + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } + }, + "fill": 1, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, + "gridPos": { + "h": 7, + "w": 12, + "x": 12, + "y": 14 + }, + "height": null, + "hideTimeOverride": false, + "id": 461, + "interval": null, + "isNew": true, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": true, + "hideZero": true, + "max": true, + "min": false, + "rightSide": true, + "show": true, + "sideWidth": null, + "sort": "max", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", + "options": { + "alertThreshold": true, + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "repeatDirection": null, + "seriesOverrides": [], + "span": null, + "stack": 
false, + "steppedLine": false, + "targets": [ + { + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_log_backup_handle_kv_batch_sum\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (instance) ", + "format": "time_series", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{instance}}", + "metric": "", + "query": "sum(rate(\n tikv_log_backup_handle_kv_batch_sum\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (instance) ", + "refId": "", + "step": 10, + "target": "" + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Handle Event Rate", + "tooltip": { + "msResolution": true, + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "transformations": [], + "transparent": false, + "type": "graph", + "xaxis": { + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "decimals": null, + "format": "ops", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "decimals": null, + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": 0 + } + }, + { + "aliasColors": {}, + "bars": false, + "cacheTimeout": null, + "datasource": "${DS_TEST-CLUSTER}", + "description": "The data rate of initial scanning emitting events.", + "editable": true, + "error": false, + "fieldConfig": { + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } + }, + "fill": 1, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, + "gridPos": { + "h": 7, + "w": 12, + "x": 0, + "y": 21 + }, + "height": null, + "hideTimeOverride": false, + "id": 462, + "interval": 
null, + "isNew": true, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": true, + "hideZero": true, + "max": true, + "min": false, + "rightSide": true, + "show": true, + "sideWidth": null, + "sort": "max", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", + "options": { + "alertThreshold": true, + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "repeatDirection": null, + "seriesOverrides": [], + "span": null, + "stack": false, + "steppedLine": false, + "targets": [ + { + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_log_backup_incremental_scan_bytes_sum\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (instance) ", + "format": "time_series", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{instance}}", + "metric": "", + "query": "sum(rate(\n tikv_log_backup_incremental_scan_bytes_sum\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (instance) ", + "refId": "", + "step": 10, + "target": "" + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Initial Scan Generate Event Throughput", + "tooltip": { + "msResolution": true, + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "transformations": [], + "transparent": false, + "type": "graph", + "xaxis": { + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "decimals": null, + "format": "binBps", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "decimals": null, + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 
null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": 0 + } + }, + { + "aliasColors": {}, + "bars": false, + "cacheTimeout": null, + "datasource": "${DS_TEST-CLUSTER}", + "description": null, + "editable": true, + "error": false, + "fieldConfig": { + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } + }, + "fill": 1, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, + "gridPos": { + "h": 7, + "w": 12, + "x": 12, + "y": 21 + }, + "height": null, + "hideTimeOverride": false, + "id": 463, + "interval": null, + "isNew": true, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": true, + "hideZero": true, + "max": true, + "min": false, + "rightSide": true, + "show": true, + "sideWidth": null, + "sort": "max", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", + "options": { + "alertThreshold": true, + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "repeatDirection": null, + "seriesOverrides": [], + "span": null, + "stack": false, + "steppedLine": false, + "targets": [ + { + "datasource": "${DS_TEST-CLUSTER}", + "expr": "(time() * 1000 - max((\n tidb_log_backup_last_checkpoint\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n \n)) by (task) / 262144 > 0)", + "format": "time_series", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{ task }}", + "metric": "", + "query": "(time() * 1000 - max((\n tidb_log_backup_last_checkpoint\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n \n)) by (task) / 262144 > 
0)", + "refId": "", + "step": 10, + "target": "" + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Abnormal Checkpoint TS Lag", + "tooltip": { + "msResolution": true, + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "transformations": [], + "transparent": false, + "type": "graph", + "xaxis": { + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "decimals": null, + "format": "ms", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "decimals": null, + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": 0 + } + }, + { + "aliasColors": {}, + "bars": false, + "cacheTimeout": null, + "datasource": "${DS_TEST-CLUSTER}", + "description": "The estimated memory usage by the streaming backup module.", + "editable": true, + "error": false, + "fieldConfig": { + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } + }, + "fill": 1, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, + "gridPos": { + "h": 7, + "w": 12, + "x": 0, + "y": 28 + }, + "height": null, + "hideTimeOverride": false, + "id": 464, + "interval": null, + "isNew": true, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": true, + "hideZero": true, + "max": true, + "min": false, + "rightSide": true, + "show": true, + "sideWidth": null, + "sort": "max", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", + "options": { + "alertThreshold": true, + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + 
"repeat": null, + "repeatDirection": null, + "seriesOverrides": [], + "span": null, + "stack": false, + "steppedLine": false, + "targets": [ + { + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum((\n tikv_log_backup_heap_memory\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n \n)) by (instance) ", + "format": "time_series", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{instance}}", + "metric": "", + "query": "sum((\n tikv_log_backup_heap_memory\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n \n)) by (instance) ", + "refId": "", + "step": 10, + "target": "" + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Memory Of Events", + "tooltip": { + "msResolution": true, + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "transformations": [], + "transparent": false, + "type": "graph", + "xaxis": { + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "decimals": null, + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "decimals": null, + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": 0 + } + }, + { + "aliasColors": {}, + "bars": false, + "cacheTimeout": null, + "datasource": "${DS_TEST-CLUSTER}", + "description": "", + "editable": true, + "error": false, + "fieldConfig": { + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } + }, + "fill": 1, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, + "gridPos": { + "h": 7, + "w": 12, + "x": 12, + "y": 28 + }, + "height": null, + "hideTimeOverride": false, + "id": 465, + "interval": null, + "isNew": 
true, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": true, + "hideZero": true, + "max": true, + "min": false, + "rightSide": true, + "show": true, + "sideWidth": null, + "sort": "max", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", + "options": { + "alertThreshold": true, + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "repeatDirection": null, + "seriesOverrides": [], + "span": null, + "stack": false, + "steppedLine": false, + "targets": [ + { + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum((\n tikv_log_backup_observed_region\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n \n)) by (instance) ", + "format": "time_series", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{instance}}", + "metric": "", + "query": "sum((\n tikv_log_backup_observed_region\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n \n)) by (instance) ", + "refId": "", + "step": 10, + "target": "" + }, + { + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum((\n tikv_log_backup_observed_region\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n \n)) by (instance) ", + "format": "time_series", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{instance}}-total", + "metric": "", + "query": "sum((\n tikv_log_backup_observed_region\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n \n)) by (instance) ", + "refId": "", + "step": 10, + "target": "" + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Observed Region Count", + "tooltip": { + 
"msResolution": true, + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "transformations": [], + "transparent": false, + "type": "graph", + "xaxis": { + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "decimals": null, + "format": "none", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "decimals": null, + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": 0 + } + }, + { + "aliasColors": {}, + "bars": false, + "cacheTimeout": null, + "datasource": "${DS_TEST-CLUSTER}", + "description": "The errors met when backing up.\n**They are retryable, don't worry.**", + "editable": true, + "error": false, + "fieldConfig": { + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } + }, + "fill": 1, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, + "gridPos": { + "h": 7, + "w": 8, + "x": 0, + "y": 35 + }, + "height": null, + "hideTimeOverride": false, + "id": 466, + "interval": null, + "isNew": true, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": true, + "hideZero": true, + "max": true, + "min": false, + "rightSide": true, + "show": true, + "sideWidth": null, + "sort": "max", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", + "options": { + "alertThreshold": true, + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "repeatDirection": null, + "seriesOverrides": [], + "span": null, + "stack": false, + "steppedLine": false, + "targets": [ + { + "datasource": 
"${DS_TEST-CLUSTER}", + "expr": "sum(delta(\n tikv_log_backup_errors\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [1m]\n)) by (type, instance) ", + "format": "time_series", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{type}}-{{instance}}", + "metric": "", + "query": "sum(delta(\n tikv_log_backup_errors\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [1m]\n)) by (type, instance) ", + "refId": "", + "step": 10, + "target": "" + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Errors", + "tooltip": { + "msResolution": true, + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "transformations": [], + "transparent": false, + "type": "graph", + "xaxis": { + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "decimals": null, + "format": "opm", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "decimals": null, + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": 0 + } + }, + { + "aliasColors": {}, + "bars": false, + "cacheTimeout": null, + "datasource": "${DS_TEST-CLUSTER}", + "description": "The errors met when backing up.", + "editable": true, + "error": false, + "fieldConfig": { + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } + }, + "fill": 1, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, + "gridPos": { + "h": 7, + "w": 8, + "x": 8, + "y": 35 + }, + "height": null, + "hideTimeOverride": false, + "id": 467, + "interval": null, + "isNew": true, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": true, + "hideZero": 
true, + "max": true, + "min": false, + "rightSide": true, + "show": true, + "sideWidth": null, + "sort": "max", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", + "options": { + "alertThreshold": true, + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "repeatDirection": null, + "seriesOverrides": [], + "span": null, + "stack": false, + "steppedLine": false, + "targets": [ + { + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(delta(\n tikv_log_backup_fatal_errors\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [1m]\n)) by (type, instance) ", + "format": "time_series", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{type}}-{{instance}}", + "metric": "", + "query": "sum(delta(\n tikv_log_backup_fatal_errors\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [1m]\n)) by (type, instance) ", + "refId": "", + "step": 10, + "target": "" + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Fatal Errors", + "tooltip": { + "msResolution": true, + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "transformations": [], + "transparent": false, + "type": "graph", + "xaxis": { + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "decimals": null, + "format": "opm", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "decimals": null, + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": 0 + } + }, + { + "aliasColors": {}, + "bars": false, + "cacheTimeout": null, + "datasource": "${DS_TEST-CLUSTER}", 
+ "description": null, + "editable": true, + "error": false, + "fieldConfig": { + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } + }, + "fill": 1, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, + "gridPos": { + "h": 7, + "w": 8, + "x": 16, + "y": 35 + }, + "height": null, + "hideTimeOverride": false, + "id": 468, + "interval": null, + "isNew": true, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": true, + "hideZero": true, + "max": true, + "min": false, + "rightSide": true, + "show": true, + "sideWidth": null, + "sort": "max", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null", + "options": { + "alertThreshold": true, + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "repeatDirection": null, + "seriesOverrides": [ + { + "alias": "Current Time", + "bars": false, + "dashes": true, + "fill": 0, + "fillBelowTo": null, + "lines": true, + "yaxis": 1, + "zindex": 0 + } + ], + "span": null, + "stack": false, + "steppedLine": false, + "targets": [ + { + "datasource": "${DS_TEST-CLUSTER}", + "expr": "max((\n tidb_log_backup_last_checkpoint\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n \n)) by (task) / 262144 > 0", + "format": "time_series", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{task}}", + "metric": "", + "query": "max((\n tidb_log_backup_last_checkpoint\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n \n)) by (task) / 262144 > 0", + "refId": "", + "step": 10, + "target": "" + }, + { + "datasource": 
"${DS_TEST-CLUSTER}", + "expr": "time() * 1000", + "format": "time_series", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "Current Time", + "metric": "", + "query": "time() * 1000", + "refId": "", + "step": 10, + "target": "" + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Checkpoint TS of Tasks", + "tooltip": { + "msResolution": true, + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "transformations": [], + "transparent": false, + "type": "graph", + "xaxis": { + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "decimals": null, + "format": "dateTimeAsIsoNoDateIfToday", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "decimals": null, + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": 0 + } + }, + { + "cacheTimeout": null, + "cards": { + "cardPadding": null, + "cardRound": null + }, + "color": { + "cardColor": "#b4ff00", + "colorScale": "sqrt", + "colorScheme": "interpolateSpectral", + "exponent": 0.5, + "max": null, + "min": null, + "mode": "spectrum" + }, + "dataFormat": "tsbuckets", + "datasource": "${DS_TEST-CLUSTER}", + "description": "The duration of flushing a batch of file.", + "editable": true, + "error": false, + "fieldConfig": { + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } + }, + "gridPos": { + "h": 7, + "w": 6, + "x": 0, + "y": 42 + }, + "heatmap": {}, + "height": null, + "hideTimeOverride": false, + "hideZeroBuckets": true, + "highlightCards": true, + "id": 469, + "interval": null, + "legend": { + "show": false + }, + "links": [], + "maxDataPoints": 512, + "maxPerRow": null, + "minSpan": null, + "options": {}, + "repeat": null, + "repeatDirection": null, + "reverseYBuckets": false, + "span": null, + "targets": [ + { + "datasource": 
"${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_log_backup_flush_duration_sec_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",stage=~\"save_files\"}\n [$__rate_interval]\n)) by (le) ", + "format": "heatmap", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{le}}", + "metric": "", + "query": "sum(rate(\n tikv_log_backup_flush_duration_sec_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",stage=~\"save_files\"}\n [$__rate_interval]\n)) by (le) ", + "refId": "", + "step": 10, + "target": "" + } + ], + "timeFrom": null, + "timeShift": null, + "title": "Flush Duration", + "tooltip": { + "msResolution": true, + "shared": true, + "showHistogram": true, + "sort": 0, + "value_type": "individual" + }, + "transformations": [], + "transparent": false, + "type": "heatmap", + "xAxis": { + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "xBucketNumber": null, + "xBucketSize": null, + "yAxis": { + "decimals": 1, + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + "yBucketBound": "upper", + "yBucketNumber": null, + "yBucketSize": null + }, + { + "cacheTimeout": null, + "cards": { + "cardPadding": null, + "cardRound": null + }, + "color": { + "cardColor": "#b4ff00", + "colorScale": "sqrt", + "colorScheme": "interpolateSpectral", + "exponent": 0.5, + "max": null, + "min": null, + "mode": "spectrum" + }, + "dataFormat": "tsbuckets", + "datasource": "${DS_TEST-CLUSTER}", + "description": "The duration of scanning the initial data from local DB and transform them into apply events.", + "editable": true, + "error": false, + "fieldConfig": { + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } + }, + "gridPos": { + "h": 7, + "w": 6, + "x": 6, + "y": 42 + }, + "heatmap": {}, + "height": null, + "hideTimeOverride": false, + "hideZeroBuckets": true, + 
"highlightCards": true, + "id": 470, + "interval": null, + "legend": { + "show": false + }, + "links": [], + "maxDataPoints": 512, + "maxPerRow": null, + "minSpan": null, + "options": {}, + "repeat": null, + "repeatDirection": null, + "reverseYBuckets": false, + "span": null, + "targets": [ + { + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_log_backup_initial_scan_duration_sec_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (le) ", + "format": "heatmap", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{le}}", + "metric": "", + "query": "sum(rate(\n tikv_log_backup_initial_scan_duration_sec_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (le) ", + "refId": "", + "step": 10, + "target": "" + } + ], + "timeFrom": null, + "timeShift": null, + "title": "Initial scanning duration", + "tooltip": { + "msResolution": true, + "shared": true, + "showHistogram": true, + "sort": 0, + "value_type": "individual" + }, + "transformations": [], + "transparent": false, + "type": "heatmap", + "xAxis": { + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "xBucketNumber": null, + "xBucketSize": null, + "yAxis": { + "decimals": 1, + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + "yBucketBound": "upper", + "yBucketNumber": null, + "yBucketSize": null + }, + { + "cacheTimeout": null, + "cards": { + "cardPadding": null, + "cardRound": null + }, + "color": { + "cardColor": "#b4ff00", + "colorScale": "sqrt", + "colorScheme": "interpolateSpectral", + "exponent": 0.5, + "max": null, + "min": null, + "mode": "spectrum" + }, + "dataFormat": "tsbuckets", + "datasource": "${DS_TEST-CLUSTER}", + "description": "The duration of converting a raft request into a apply event.", + "editable": true, + "error": false, 
+ "fieldConfig": { + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } + }, + "gridPos": { + "h": 7, + "w": 6, + "x": 12, + "y": 42 + }, + "heatmap": {}, + "height": null, + "hideTimeOverride": false, + "hideZeroBuckets": true, + "highlightCards": true, + "id": 471, + "interval": null, + "legend": { + "show": false + }, + "links": [], + "maxDataPoints": 512, + "maxPerRow": null, + "minSpan": null, + "options": {}, + "repeat": null, + "repeatDirection": null, + "reverseYBuckets": false, + "span": null, + "targets": [ + { + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_log_backup_event_handle_duration_sec_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",stage=~\"to_stream_event\"}\n [$__rate_interval]\n)) by (le) ", + "format": "heatmap", "hide": false, + "instant": false, "interval": "", - "intervalFactor": 2, - "legendFormat": "{{name}}", - "metric": "tikv_thread_cpu_seconds_total", - "refId": "A", - "step": 4 + "intervalFactor": 1, + "legendFormat": "{{le}}", + "metric": "", + "query": "sum(rate(\n tikv_log_backup_event_handle_duration_sec_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",stage=~\"to_stream_event\"}\n [$__rate_interval]\n)) by (le) ", + "refId": "", + "step": 10, + "target": "" } ], - "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, - "title": "CPU Usage", + "title": "Convert Raft Event duration", "tooltip": { - "msResolution": false, + "msResolution": true, "shared": true, + "showHistogram": true, "sort": 0, "value_type": "individual" }, - "type": "graph", - "xaxis": { - "buckets": null, + "transformations": [], + "transparent": false, + "type": "heatmap", + "xAxis": { "mode": "time", "name": null, "show": true, "values": [] }, - "yaxes": [ - { - "format": "percentunit", - "label": null, - "logBase": 1, - "max": null, - "min": "0", - "show": true - }, - { - "format": "short", - "label": 
null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } + "xBucketNumber": null, + "xBucketSize": null, + "yAxis": { + "decimals": 1, + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + "yBucketBound": "upper", + "yBucketNumber": null, + "yBucketSize": null }, { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, + "cards": { + "cardPadding": null, + "cardRound": null + }, + "color": { + "cardColor": "#b4ff00", + "colorScale": "sqrt", + "colorScheme": "interpolateSpectral", + "exponent": 0.5, + "max": null, + "min": null, + "mode": "spectrum" + }, + "dataFormat": "tsbuckets", "datasource": "${DS_TEST-CLUSTER}", - "description": "", + "description": "The duration of waiting the mutex of the controller.", + "editable": true, + "error": false, "fieldConfig": { - "defaults": {}, - "overrides": [] + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } }, - "fill": 1, - "fillGradient": 0, "gridPos": { - "h": 10, + "h": 7, "w": 6, - "x": 6, - "y": 63 + "x": 18, + "y": 42 }, - "hiddenSeries": false, - "id": 12843, + "heatmap": {}, + "height": null, + "hideTimeOverride": false, + "hideZeroBuckets": true, + "highlightCards": true, + "id": 472, + "interval": null, "legend": { - "alignAsTable": true, - "avg": true, - "current": true, - "max": false, - "min": false, - "show": true, - "total": false, - "values": true + "show": false }, - "lines": true, - "linewidth": 1, "links": [], - "nullPointMode": "null", - "options": { - "alertThreshold": true - }, - "percentage": false, - "pluginVersion": "7.5.11", - "pointradius": 2, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "spaceLength": 10, - "stack": false, - "steppedLine": false, + "maxDataPoints": 512, + "maxPerRow": null, + "minSpan": null, + "options": {}, + "repeat": null, + "repeatDirection": null, + 
"reverseYBuckets": false, + "span": null, "targets": [ { - "exemplar": true, - "expr": "rate(tikv_log_backup_handle_kv_batch_sum{tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])", - "format": "time_series", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_log_backup_event_handle_duration_sec_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",stage=~\"get_router_lock\"}\n [$__rate_interval]\n)) by (le) ", + "format": "heatmap", + "hide": false, "instant": false, "interval": "", - "intervalFactor": 2, - "legendFormat": "{{instance}}", - "refId": "A" + "intervalFactor": 1, + "legendFormat": "{{le}}", + "metric": "", + "query": "sum(rate(\n tikv_log_backup_event_handle_duration_sec_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",stage=~\"get_router_lock\"}\n [$__rate_interval]\n)) by (le) ", + "refId": "", + "step": 10, + "target": "" } ], - "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, - "title": "Handle Event Rate", + "title": "Wait for Lock Duration", "tooltip": { + "msResolution": true, "shared": true, + "showHistogram": true, "sort": 0, "value_type": "individual" }, - "type": "graph", - "xaxis": { - "buckets": null, + "transformations": [], + "transparent": false, + "type": "heatmap", + "xAxis": { "mode": "time", "name": null, "show": true, "values": [] }, - "yaxes": [ - { - "format": "ops", - "label": null, - "logBase": 1, - "max": null, - "min": "0", - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } + "xBucketNumber": null, + "xBucketSize": null, + "yAxis": { + "decimals": 1, + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + "yBucketBound": "upper", + "yBucketNumber": null, + "yBucketSize": null }, { - "aliasColors": {}, - 
"bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, + "cards": { + "cardPadding": null, + "cardRound": null + }, + "color": { + "cardColor": "#b4ff00", + "colorScale": "sqrt", + "colorScheme": "interpolateSpectral", + "exponent": 0.5, + "max": null, + "min": null, + "mode": "spectrum" + }, + "dataFormat": "tsbuckets", "datasource": "${DS_TEST-CLUSTER}", - "description": "The data rate of initial scanning emitting events.", + "description": "The number of KV-modify of each raft command observed.", + "editable": true, + "error": false, "fieldConfig": { - "defaults": {}, - "overrides": [] + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } }, - "fill": 1, - "fillGradient": 0, "gridPos": { - "h": 10, + "h": 7, "w": 6, - "x": 12, - "y": 63 + "x": 0, + "y": 49 }, - "hiddenSeries": false, - "id": 14135, + "heatmap": {}, + "height": null, + "hideTimeOverride": false, + "hideZeroBuckets": true, + "highlightCards": true, + "id": 473, + "interval": null, "legend": { - "alignAsTable": true, - "avg": true, - "current": false, - "max": true, - "min": false, - "show": true, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "nullPointMode": "null", - "options": { - "alertThreshold": true + "show": false }, - "percentage": false, - "pluginVersion": "7.5.11", - "pointradius": 2, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "spaceLength": 10, - "stack": false, - "steppedLine": false, + "links": [], + "maxDataPoints": 512, + "maxPerRow": null, + "minSpan": null, + "options": {}, + "repeat": null, + "repeatDirection": null, + "reverseYBuckets": false, + "span": null, "targets": [ { - "exemplar": true, - "expr": "rate(tikv_log_backup_incremental_scan_bytes_sum{instance=~\"$instance\"}[$__rate_interval])", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_log_backup_handle_kv_batch_bucket\n 
{k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (le) ", + "format": "heatmap", + "hide": false, + "instant": false, "interval": "", - "intervalFactor": 2, - "legendFormat": "{{instance}}", - "queryType": "randomWalk", - "refId": "A" + "intervalFactor": 1, + "legendFormat": "{{le}}", + "metric": "", + "query": "sum(rate(\n tikv_log_backup_handle_kv_batch_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (le) ", + "refId": "", + "step": 10, + "target": "" } ], - "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, - "title": "Initial Scan Generate Event Throughput", + "title": "Command Batch Size", "tooltip": { + "msResolution": true, "shared": true, + "showHistogram": true, "sort": 0, "value_type": "individual" }, - "type": "graph", - "xaxis": { - "buckets": null, + "transformations": [], + "transparent": false, + "type": "heatmap", + "xAxis": { "mode": "time", "name": null, "show": true, "values": [] }, - "yaxes": [ - { - "format": "binBps", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": "0", - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } + "xBucketNumber": null, + "xBucketSize": null, + "yAxis": { + "decimals": 1, + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + "yBucketBound": "upper", + "yBucketNumber": null, + "yBucketSize": null }, { - "alert": { - "alertRuleTags": {}, - "conditions": [ - { - "evaluator": { - "params": [ - 600000 - ], - "type": "gt" - }, - "operator": { - "type": "and" - }, - "query": { - "params": [ - "A", - "5m", - "now" - ] - }, - "reducer": { - "params": [], - "type": "avg" - }, - "type": "query" - } - ], - "executionErrorState": "alerting", - "for": "5m", - "frequency": "1m", 
- "handler": 1, - "name": "Checkpoint Lag Too Huge", - "noDataState": "no_data", - "notifications": [] + "cacheTimeout": null, + "cards": { + "cardPadding": null, + "cardRound": null }, - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, + "color": { + "cardColor": "#b4ff00", + "colorScale": "sqrt", + "colorScheme": "interpolateSpectral", + "exponent": 0.5, + "max": null, + "min": null, + "mode": "spectrum" + }, + "dataFormat": "tsbuckets", "datasource": "${DS_TEST-CLUSTER}", + "description": "The total cost of saving an event into temporary file.", + "editable": true, + "error": false, "fieldConfig": { "defaults": { - "unit": "ms" - }, - "overrides": [] + "thresholds": { + "mode": "absolute", + "steps": [] + } + } }, - "fill": 1, - "fillGradient": 0, "gridPos": { - "h": 10, + "h": 7, "w": 6, - "x": 18, - "y": 63 + "x": 6, + "y": 49 }, - "hiddenSeries": false, - "id": 14774, + "heatmap": {}, + "height": null, + "hideTimeOverride": false, + "hideZeroBuckets": true, + "highlightCards": true, + "id": 474, + "interval": null, "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 1, - "nullPointMode": "null", - "options": { - "alertThreshold": true + "show": false }, - "percentage": false, - "pluginVersion": "7.5.11", - "pointradius": 2, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "spaceLength": 10, - "stack": false, - "steppedLine": false, + "links": [], + "maxDataPoints": 512, + "maxPerRow": null, + "minSpan": null, + "options": {}, + "repeat": null, + "repeatDirection": null, + "reverseYBuckets": false, + "span": null, "targets": [ { - "exemplar": true, - "expr": "time() * 1000 - max(tidb_log_backup_last_checkpoint / 262144 > 0) by (task)", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_log_backup_event_handle_duration_sec_bucket\n 
{k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",stage=~\"save_to_temp_file\"}\n [$__rate_interval]\n)) by (le) ", + "format": "heatmap", + "hide": false, "instant": false, "interval": "", - "legendFormat": "{{ task }}", - "refId": "A" - }, - { - "exemplar": true, - "expr": "time() * 1000", - "hide": true, - "interval": "", - "legendFormat": "Current Time", - "refId": "B" - } - ], - "thresholds": [ - { - "colorMode": "critical", - "fill": true, - "line": true, - "op": "gt", - "value": 600000, - "visible": true + "intervalFactor": 1, + "legendFormat": "{{le}}", + "metric": "", + "query": "sum(rate(\n tikv_log_backup_event_handle_duration_sec_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",stage=~\"save_to_temp_file\"}\n [$__rate_interval]\n)) by (le) ", + "refId": "", + "step": 10, + "target": "" } ], "timeFrom": null, - "timeRegions": [], "timeShift": null, - "title": "Abnormal Checkpoint TS Lag", + "title": "Save to Temp File Duration", "tooltip": { + "msResolution": true, "shared": true, + "showHistogram": true, "sort": 0, "value_type": "individual" }, - "type": "graph", - "xaxis": { - "buckets": null, + "transformations": [], + "transparent": false, + "type": "heatmap", + "xAxis": { "mode": "time", "name": null, "show": true, "values": [] }, - "yaxes": [ - { - "format": "ms", - "label": null, - "logBase": 1, - "max": "3000000", - "min": "0", - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } + "xBucketNumber": null, + "xBucketSize": null, + "yAxis": { + "decimals": 1, + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + "yBucketBound": "upper", + "yBucketNumber": null, + "yBucketSize": null }, { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, + 
"cards": { + "cardPadding": null, + "cardRound": null + }, + "color": { + "cardColor": "#b4ff00", + "colorScale": "sqrt", + "colorScheme": "interpolateSpectral", + "exponent": 0.5, + "max": null, + "min": null, + "mode": "spectrum" + }, + "dataFormat": "tsbuckets", "datasource": "${DS_TEST-CLUSTER}", - "description": "The estimated memory usage by the streaming backup module.", + "description": "The total cost of writing a event into temporary file.\nComparing to the ***Save*** duration, it doesn't contain the time cost of routing the task by range / task.", + "editable": true, + "error": false, "fieldConfig": { - "defaults": {}, - "overrides": [] + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } }, - "fill": 1, - "fillGradient": 0, "gridPos": { - "h": 10, + "h": 7, "w": 6, - "x": 0, - "y": 73 + "x": 12, + "y": 49 }, - "hiddenSeries": false, - "id": 13100, + "heatmap": {}, + "height": null, + "hideTimeOverride": false, + "hideZeroBuckets": true, + "highlightCards": true, + "id": 475, + "interval": null, "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "max": true, - "min": false, - "show": true, - "total": false, - "values": true + "show": false }, - "lines": true, - "linewidth": 1, "links": [], - "nullPointMode": "null", - "options": { - "alertThreshold": true - }, - "percentage": false, - "pluginVersion": "7.5.11", - "pointradius": 2, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "spaceLength": 10, - "stack": false, - "steppedLine": false, + "maxDataPoints": 512, + "maxPerRow": null, + "minSpan": null, + "options": {}, + "repeat": null, + "repeatDirection": null, + "reverseYBuckets": false, + "span": null, "targets": [ { - "exemplar": true, - "expr": "tikv_log_backup_heap_memory{instance=~\"$instance\"}", - "format": "time_series", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_log_backup_on_event_duration_seconds_bucket\n 
{k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",stage=\"write_to_tempfile\"}\n [$__rate_interval]\n)) by (le) ", + "format": "heatmap", + "hide": false, "instant": false, "interval": "", - "intervalFactor": 2, - "legendFormat": "{{instance}}", - "refId": "A" + "intervalFactor": 1, + "legendFormat": "{{le}}", + "metric": "", + "query": "sum(rate(\n tikv_log_backup_on_event_duration_seconds_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",stage=\"write_to_tempfile\"}\n [$__rate_interval]\n)) by (le) ", + "refId": "", + "step": 10, + "target": "" } ], - "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, - "title": "Memory Of Events", + "title": "Write to Temp File Duration", "tooltip": { + "msResolution": true, "shared": true, + "showHistogram": true, "sort": 0, "value_type": "individual" }, - "type": "graph", - "xaxis": { - "buckets": null, + "transformations": [], + "transparent": false, + "type": "heatmap", + "xAxis": { "mode": "time", "name": null, "show": true, "values": [] }, - "yaxes": [ - { - "format": "bytes", - "label": null, - "logBase": 1, - "max": null, - "min": "0", - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } + "xBucketNumber": null, + "xBucketSize": null, + "yAxis": { + "decimals": 1, + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + "yBucketBound": "upper", + "yBucketNumber": null, + "yBucketSize": null }, { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, + "cards": { + "cardPadding": null, + "cardRound": null + }, + "color": { + "cardColor": "#b4ff00", + "colorScale": "sqrt", + "colorScheme": "interpolateSpectral", + "exponent": 0.5, + "max": null, + "min": null, + "mode": "spectrum" + }, + 
"dataFormat": "tsbuckets", "datasource": "${DS_TEST-CLUSTER}", - "description": "", + "description": "The duration of collecting metadata and call the UNIX system call *write* for each event.", + "editable": true, + "error": false, "fieldConfig": { - "defaults": {}, - "overrides": [] + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } }, - "fill": 1, - "fillGradient": 0, "gridPos": { - "h": 10, + "h": 7, "w": 6, - "x": 6, - "y": 73 + "x": 18, + "y": 49 }, - "hiddenSeries": false, - "id": 14630, + "heatmap": {}, + "height": null, + "hideTimeOverride": false, + "hideZeroBuckets": true, + "highlightCards": true, + "id": 476, + "interval": null, "legend": { - "avg": false, - "current": true, - "max": false, - "min": false, - "show": true, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "nullPointMode": "null", - "options": { - "alertThreshold": true + "show": false }, - "percentage": false, - "pluginVersion": "7.5.11", - "pointradius": 2, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - { - "alias": "total", - "yaxis": 2 - } - ], - "spaceLength": 10, - "stack": false, - "steppedLine": false, + "links": [], + "maxDataPoints": 512, + "maxPerRow": null, + "minSpan": null, + "options": {}, + "repeat": null, + "repeatDirection": null, + "reverseYBuckets": false, + "span": null, "targets": [ { - "exemplar": true, - "expr": "tikv_log_backup_observed_region{instance=~\"$instance\"}", - "interval": "", - "intervalFactor": 2, - "legendFormat": "{{instance}}", - "queryType": "randomWalk", - "refId": "A" - }, - { - "exemplar": true, - "expr": "sum(tikv_log_backup_observed_region{instance=~\"$instance\"})", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_log_backup_on_event_duration_seconds_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",stage=\"syscall_write\"}\n [$__rate_interval]\n)) by (le) ", + "format": "heatmap", "hide": false, + "instant": 
false, "interval": "", - "legendFormat": "total", - "refId": "B" + "intervalFactor": 1, + "legendFormat": "{{le}}", + "metric": "", + "query": "sum(rate(\n tikv_log_backup_on_event_duration_seconds_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",stage=\"syscall_write\"}\n [$__rate_interval]\n)) by (le) ", + "refId": "", + "step": 10, + "target": "" } ], - "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, - "title": "Observed Region Count", + "title": "System Write Call Duration", "tooltip": { + "msResolution": true, "shared": true, + "showHistogram": true, "sort": 0, "value_type": "individual" }, - "type": "graph", - "xaxis": { - "buckets": null, + "transformations": [], + "transparent": false, + "type": "heatmap", + "xAxis": { "mode": "time", "name": null, "show": true, "values": [] }, - "yaxes": [ - { - "format": "none", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "none", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } + "xBucketNumber": null, + "xBucketSize": null, + "yAxis": { + "decimals": 1, + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + "yBucketBound": "upper", + "yBucketNumber": null, + "yBucketSize": null }, { "aliasColors": {}, - "bars": true, - "dashLength": 10, - "dashes": false, + "bars": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", - "description": "The errors met when backing up.\n**They are retryable, don't worry.**", + "description": "The internal message type count.", + "editable": true, + "error": false, "fieldConfig": { - "defaults": {}, - "overrides": [] + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } }, "fill": 1, - "fillGradient": 0, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 
200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, "gridPos": { - "h": 5, - "w": 6, - "x": 12, - "y": 73 + "h": 7, + "w": 8, + "x": 0, + "y": 56 }, - "hiddenSeries": false, - "id": 13101, + "height": null, + "hideTimeOverride": false, + "id": 477, + "interval": null, + "isNew": true, "legend": { + "alignAsTable": true, "avg": false, - "current": false, + "current": true, + "hideEmpty": true, "hideZero": true, - "max": false, + "max": true, "min": false, - "show": false, + "rightSide": true, + "show": true, + "sideWidth": null, + "sort": "max", + "sortDesc": true, "total": false, - "values": false + "values": true }, - "lines": false, + "lines": true, "linewidth": 1, "links": [], - "nullPointMode": "null", + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", "options": { - "alertThreshold": true + "alertThreshold": true, + "dataLinks": [] }, "percentage": false, - "pluginVersion": "7.5.11", - "pointradius": 2, + "pointradius": 5, "points": false, "renderer": "flot", + "repeat": null, + "repeatDirection": null, "seriesOverrides": [], - "spaceLength": 10, - "stack": true, - "steppedLine": true, + "span": null, + "stack": false, + "steppedLine": false, "targets": [ { - "exemplar": true, - "expr": "increase(tikv_log_backup_errors{instance=~\"$instance\"}[$__interval])", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_log_backup_interal_actor_acting_duration_sec_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (message) ", "format": "time_series", "hide": false, "instant": false, - "interval": "1m", - "intervalFactor": 2, - "legendFormat": "{{type}}@{{instance}}", - "refId": "A" - }, - { - "exemplar": true, - "expr": "tikv_log_backup_errors{instance=~\"$instance\"}", - "hide": true, - "interval": "1m", - "intervalFactor": 2, - "legendFormat": "", - "refId": "B" + "interval": "", + 
"intervalFactor": 1, + "legendFormat": "{{message}}", + "metric": "", + "query": "sum(rate(\n tikv_log_backup_interal_actor_acting_duration_sec_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (message) ", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, - "title": "Errors", + "title": "Internal Message Type", "tooltip": { + "msResolution": true, "shared": true, "sort": 0, "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -45616,14 +64911,16 @@ }, "yaxes": [ { - "format": "none", + "decimals": null, + "format": "ops", "label": null, - "logBase": 1, + "logBase": 2, "max": null, - "min": "0", + "min": null, "show": true }, { + "decimals": null, "format": "short", "label": null, "logBase": 1, @@ -45634,90 +64931,112 @@ ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } }, { "aliasColors": {}, "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", + "description": "The internal handling message duration.", + "editable": true, + "error": false, "fieldConfig": { - "defaults": {}, - "overrides": [] + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } }, "fill": 1, - "fillGradient": 0, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, "gridPos": { - "h": 10, - "w": 6, - "x": 18, - "y": 73 + "h": 7, + "w": 8, + "x": 8, + "y": 56 }, - "hiddenSeries": false, - "id": 14910, + "height": null, + "hideTimeOverride": false, + "id": 478, + "interval": null, + "isNew": true, "legend": { + "alignAsTable": true, "avg": false, - "current": false, - "max": false, + "current": true, + 
"hideEmpty": true, + "hideZero": true, + "max": true, "min": false, + "rightSide": true, "show": true, + "sideWidth": null, + "sort": "max", + "sortDesc": true, "total": false, - "values": false + "values": true }, "lines": true, "linewidth": 1, - "nullPointMode": "null", + "links": [], + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", "options": { - "alertThreshold": true + "alertThreshold": true, + "dataLinks": [] }, "percentage": false, - "pluginVersion": "7.5.11", - "pointradius": 2, + "pointradius": 5, "points": false, "renderer": "flot", - "seriesOverrides": [ - { - "alias": "Current Time", - "dashes": true, - "fill": 0 - } - ], - "spaceLength": 10, + "repeat": null, + "repeatDirection": null, + "seriesOverrides": [], + "span": null, "stack": false, "steppedLine": false, "targets": [ { - "exemplar": true, - "expr": "max(tidb_log_backup_last_checkpoint{instance=~\"$instance\"} / 262144 > 0) by (task)", - "instant": false, - "interval": "", - "legendFormat": "{{ task }}", - "refId": "A" - }, - { - "exemplar": true, - "expr": "time() * 1000", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "histogram_quantile(0.99,(\n sum(rate(\n tikv_log_backup_interal_actor_acting_duration_sec_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (message, le) \n \n \n)) ", + "format": "time_series", "hide": false, + "instant": false, "interval": "", - "legendFormat": "Current Time", - "refId": "B" + "intervalFactor": 1, + "legendFormat": "{{message}}", + "metric": "", + "query": "histogram_quantile(0.99,(\n sum(rate(\n tikv_log_backup_interal_actor_acting_duration_sec_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (message, le) \n \n \n)) ", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, - "title": 
"Checkpoint TS of Tasks", + "title": "Internal Message Handling Duration (P99)", "tooltip": { + "msResolution": true, "shared": true, "sort": 0, "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -45725,7 +65044,8 @@ }, "yaxes": [ { - "format": "dateTimeAsIsoNoDateIfToday", + "decimals": null, + "format": "s", "label": null, "logBase": 1, "max": null, @@ -45733,6 +65053,7 @@ "show": true }, { + "decimals": null, "format": "short", "label": null, "logBase": 1, @@ -45743,100 +65064,112 @@ ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } }, { "aliasColors": {}, - "bars": true, - "dashLength": 10, - "dashes": false, + "bars": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", - "description": "The errors met when backing up.", + "description": "The internal handling message duration.", + "editable": true, + "error": false, "fieldConfig": { - "defaults": {}, - "overrides": [] + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } }, "fill": 1, - "fillGradient": 0, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, "gridPos": { - "h": 5, - "w": 6, - "x": 12, - "y": 78 + "h": 7, + "w": 8, + "x": 16, + "y": 56 }, - "hiddenSeries": false, - "id": 14908, + "height": null, + "hideTimeOverride": false, + "id": 479, + "interval": null, + "isNew": true, "legend": { + "alignAsTable": true, "avg": false, - "current": false, + "current": true, + "hideEmpty": true, "hideZero": true, - "max": false, + "max": true, "min": false, - "show": false, + "rightSide": true, + "show": true, + "sideWidth": null, + "sort": "max", + "sortDesc": true, "total": false, - "values": false + "values": true }, - "lines": false, + "lines": true, "linewidth": 1, "links": [], - "nullPointMode": 
"null", + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", "options": { - "alertThreshold": true + "alertThreshold": true, + "dataLinks": [] }, "percentage": false, - "pluginVersion": "7.5.11", - "pointradius": 2, + "pointradius": 5, "points": false, "renderer": "flot", + "repeat": null, + "repeatDirection": null, "seriesOverrides": [], - "spaceLength": 10, - "stack": true, - "steppedLine": true, + "span": null, + "stack": false, + "steppedLine": false, "targets": [ { - "exemplar": true, - "expr": "increase(tikv_log_backup_fatal_errors{instance=~\"$instance\"}[$__interval])", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "histogram_quantile(0.9,(\n sum(rate(\n tikv_log_backup_interal_actor_acting_duration_sec_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (message, le) \n \n \n)) ", "format": "time_series", "hide": false, "instant": false, - "interval": "1m", - "intervalFactor": 2, - "legendFormat": "{{type}}@{{instance}}", - "refId": "A" - }, - { - "exemplar": true, - "expr": "", - "hide": true, - "interval": "1m", - "intervalFactor": 2, - "legendFormat": "", - "refId": "B" - } - ], - "thresholds": [ - { - "colorMode": "critical", - "fill": true, - "line": true, - "op": "gt", - "value": 0, - "yaxis": "left" + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{message}}", + "metric": "", + "query": "histogram_quantile(0.9,(\n sum(rate(\n tikv_log_backup_interal_actor_acting_duration_sec_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (message, le) \n \n \n)) ", + "refId": "", + "step": 10, + "target": "" } ], + "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, - "title": "Fatal Errors", + "title": "Internal Message Handling Duration (P90)", "tooltip": { + "msResolution": true, "shared": true, "sort": 0, "value_type": "individual" }, 
+ "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -45844,14 +65177,16 @@ }, "yaxes": [ { - "format": "none", + "decimals": null, + "format": "s", "label": null, "logBase": 1, "max": null, - "min": "0", + "min": null, "show": true }, { + "decimals": null, "format": "short", "label": null, "logBase": 1, @@ -45862,939 +65197,852 @@ ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } }, { - "cards": { - "cardPadding": 0, - "cardRound": 0 - }, - "color": { - "cardColor": "#FF9830", - "colorScale": "linear", - "colorScheme": "interpolateBlues", - "exponent": 0.5, - "max": null, - "min": 0, - "mode": "spectrum" - }, - "dataFormat": "tsbuckets", - "datasource": "${DS_TEST-CLUSTER}", - "description": "The duration of flushing a batch of file.", - "fieldConfig": { - "defaults": {}, - "overrides": [] - }, - "gridPos": { - "h": 7, - "w": 6, - "x": 0, - "y": 83 - }, - "heatmap": {}, - "hideZeroBuckets": true, - "highlightCards": true, - "id": 14078, - "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "max": true, - "min": false, - "rightSide": true, - "show": true, - "sort": "current", - "sortDesc": true, - "total": false, - "values": true - }, - "links": [], - "reverseYBuckets": false, - "targets": [ - { - "exemplar": true, - "expr": "sum(increase(tikv_log_backup_flush_duration_sec_bucket{tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", stage=~\"save_files\"}[$__interval])) by (le)", - "format": "heatmap", - "instant": false, - "interval": "", - "intervalFactor": 2, - "legendFormat": "{{le}}", - "refId": "A" - } - ], - "title": "Flush Duration", - "tooltip": { - "show": true, - "showHistogram": true - }, - "tooltipDecimals": 1, - "type": "heatmap", - "xAxis": { - "show": true - }, - "xBucketNumber": null, - "xBucketSize": null, - "yAxis": { - "decimals": 1, - "format": "s", - "logBase": 1, - "max": null, - "min": null, - "show": true, - 
"splitFactor": null - }, - "yBucketBound": "upper", - "yBucketNumber": null, - "yBucketSize": null - }, - { - "cards": { - "cardPadding": 0, - "cardRound": 0 - }, - "color": { - "cardColor": "#FF9830", - "colorScale": "linear", - "colorScheme": "interpolateReds", - "exponent": 0.5, - "max": null, - "min": 0, - "mode": "spectrum" - }, - "dataFormat": "tsbuckets", - "datasource": "${DS_TEST-CLUSTER}", - "description": "The duration of scanning the initial data from local DB and transform them into apply events. \n", - "fieldConfig": { - "defaults": {}, - "overrides": [] - }, - "gridPos": { - "h": 7, - "w": 6, - "x": 6, - "y": 83 - }, - "heatmap": {}, - "hideZeroBuckets": true, - "highlightCards": true, - "id": 14136, - "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "max": true, - "min": false, - "rightSide": true, - "show": true, - "sort": "current", - "sortDesc": true, - "total": false, - "values": true - }, - "links": [], - "reverseYBuckets": false, - "targets": [ - { - "exemplar": true, - "expr": "sum(increase(tikv_log_backup_initial_scan_duration_sec_bucket{tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[$__interval])) by (le)", - "format": "heatmap", - "instant": false, - "interval": "", - "intervalFactor": 2, - "legendFormat": "{{le}}", - "refId": "A" - } - ], - "title": "Initial scanning duration", - "tooltip": { - "show": true, - "showHistogram": true - }, - "tooltipDecimals": 1, - "type": "heatmap", - "xAxis": { - "show": true - }, - "xBucketNumber": null, - "xBucketSize": null, - "yAxis": { - "decimals": 1, - "format": "s", - "logBase": 1, - "max": null, - "min": null, - "show": true, - "splitFactor": null - }, - "yBucketBound": "upper", - "yBucketNumber": null, - "yBucketSize": null - }, - { - "cards": { - "cardPadding": 0, - "cardRound": 0 - }, - "color": { - "cardColor": "#FF9830", - "colorScale": "linear", - "colorScheme": "interpolateGreens", - "exponent": 0.5, - "max": null, - "min": 0, - "mode": "spectrum" - }, - 
"dataFormat": "tsbuckets", + "aliasColors": {}, + "bars": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", - "description": "The duration of converting a raft request into a apply event. \n*This duration is for consuming a batch of events.*", + "description": "The internal read throughput of RocksDB during initial scanning. This panel can roughly present the read through to the hard disk of initial scanning.", + "editable": true, + "error": false, "fieldConfig": { - "defaults": {}, - "overrides": [] - }, - "gridPos": { - "h": 7, - "w": 6, - "x": 12, - "y": 83 - }, - "heatmap": {}, - "hideZeroBuckets": true, - "highlightCards": true, - "id": 13934, - "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "max": true, - "min": false, - "rightSide": true, - "show": true, - "sort": "current", - "sortDesc": true, - "total": false, - "values": true - }, - "links": [], - "reverseYBuckets": false, - "targets": [ - { - "exemplar": true, - "expr": "sum(increase(tikv_log_backup_event_handle_duration_sec_bucket{tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", stage=~\"to_stream_event\"}[$__interval])) by (le)", - "format": "heatmap", - "instant": false, - "interval": "", - "intervalFactor": 2, - "legendFormat": "{{le}}", - "refId": "A" + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } } - ], - "title": "Convert Raft Event duration", - "tooltip": { - "show": true, - "showHistogram": true - }, - "tooltipDecimals": 1, - "type": "heatmap", - "xAxis": { - "show": true - }, - "xBucketNumber": null, - "xBucketSize": null, - "yAxis": { - "decimals": 1, - "format": "s", - "logBase": 1, - "max": null, - "min": null, - "show": true, - "splitFactor": null - }, - "yBucketBound": "upper", - "yBucketNumber": null, - "yBucketSize": null - }, - { - "cards": { - "cardPadding": 0, - "cardRound": 0 - }, - "color": { - "cardColor": "#FF9830", - "colorScale": "linear", - "colorScheme": "interpolateGreens", - "exponent": 0.5, - 
"max": null, - "min": 0, - "mode": "spectrum" }, - "dataFormat": "tsbuckets", - "datasource": "${DS_TEST-CLUSTER}", - "description": "The duration of waiting the mutex of the controller. \n*This duration is for consuming a batch of events.*", - "fieldConfig": { - "defaults": {}, - "overrides": [] + "fill": 1, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" }, "gridPos": { "h": 7, - "w": 6, - "x": 18, - "y": 83 + "w": 12, + "x": 0, + "y": 63 }, - "heatmap": {}, - "hideZeroBuckets": true, - "highlightCards": true, - "id": 12840, + "height": null, + "hideTimeOverride": false, + "id": 480, + "interval": null, + "isNew": true, "legend": { "alignAsTable": true, "avg": false, "current": true, + "hideEmpty": true, + "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, - "sort": "current", + "sideWidth": null, + "sort": "max", "sortDesc": true, "total": false, "values": true }, + "lines": true, + "linewidth": 1, "links": [], - "reverseYBuckets": false, + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", + "options": { + "alertThreshold": true, + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "repeatDirection": null, + "seriesOverrides": [], + "span": null, + "stack": false, + "steppedLine": false, "targets": [ { - "exemplar": true, - "expr": "sum(increase(tikv_log_backup_event_handle_duration_sec_bucket{tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", stage=~\"get_router_lock\"}[$__interval])) by (le)", - "format": "heatmap", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_log_backup_initial_scan_operations\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",op=~\"read_bytes\"}\n [$__rate_interval]\n)) by (cf) ", + "format": "time_series", + 
"hide": false, "instant": false, "interval": "", - "intervalFactor": 2, - "legendFormat": "{{le}}", - "refId": "A" + "intervalFactor": 1, + "legendFormat": "{{cf}}", + "metric": "", + "query": "sum(rate(\n tikv_log_backup_initial_scan_operations\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",op=~\"read_bytes\"}\n [$__rate_interval]\n)) by (cf) ", + "refId": "", + "step": 10, + "target": "" } ], - "title": "Wait for Lock Duration", + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Initial Scan RocksDB Throughput", "tooltip": { - "show": true, - "showHistogram": true - }, - "tooltipDecimals": 1, - "type": "heatmap", - "xAxis": { - "show": true + "msResolution": true, + "shared": true, + "sort": 0, + "value_type": "individual" }, - "xBucketNumber": null, - "xBucketSize": null, - "yAxis": { - "decimals": 1, - "format": "s", - "logBase": 1, - "max": null, - "min": null, + "transformations": [], + "transparent": false, + "type": "graph", + "xaxis": { + "mode": "time", + "name": null, "show": true, - "splitFactor": null + "values": [] }, - "yBucketBound": "upper", - "yBucketNumber": null, - "yBucketSize": null + "yaxes": [ + { + "decimals": null, + "format": "binBps", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "decimals": null, + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": 0 + } }, { - "cards": { - "cardPadding": 0, - "cardRound": 0 - }, - "color": { - "cardColor": "#FF9830", - "colorScale": "linear", - "colorScheme": "interpolateCividis", - "exponent": 0.5, - "max": null, - "min": 0, - "mode": "spectrum" - }, - "dataFormat": "tsbuckets", + "aliasColors": {}, + "bars": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", - "description": "The number of KV-modify of each raft command observed.", + "description": "Misc statistics of RocksDB 
during initial scanning.", + "editable": true, + "error": false, "fieldConfig": { - "defaults": {}, - "overrides": [] + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } + }, + "fill": 1, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" }, "gridPos": { "h": 7, - "w": 6, - "x": 0, - "y": 90 + "w": 12, + "x": 12, + "y": 63 }, - "heatmap": {}, - "hideZeroBuckets": true, - "highlightCards": true, - "id": 15059, + "height": null, + "hideTimeOverride": false, + "id": 481, + "interval": null, + "isNew": true, "legend": { "alignAsTable": true, "avg": false, "current": true, + "hideEmpty": true, + "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, - "sort": "current", + "sideWidth": null, + "sort": "max", "sortDesc": true, "total": false, "values": true }, + "lines": true, + "linewidth": 1, "links": [], - "reverseYBuckets": false, + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", + "options": { + "alertThreshold": true, + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "repeatDirection": null, + "seriesOverrides": [], + "span": null, + "stack": false, + "steppedLine": false, "targets": [ { - "exemplar": true, - "expr": "sum(increase(tikv_log_backup_handle_kv_batch_bucket{tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[$__interval])) by (le)", - "format": "heatmap", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_log_backup_initial_scan_operations\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",op!~\"read_bytes\"}\n [$__rate_interval]\n)) by (cf, op) > 0", + "format": "time_series", + "hide": false, "instant": false, "interval": "", - "intervalFactor": 2, - "legendFormat": "{{le}}", - "refId": "A" + 
"intervalFactor": 1, + "legendFormat": "{{cf}}-{{op}}", + "metric": "", + "query": "sum(rate(\n tikv_log_backup_initial_scan_operations\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",op!~\"read_bytes\"}\n [$__rate_interval]\n)) by (cf, op) > 0", + "refId": "", + "step": 10, + "target": "" } ], - "title": "Command Batch Size", + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Initial Scan RocksDB Operation", "tooltip": { - "show": true, - "showHistogram": true - }, - "tooltipDecimals": 1, - "type": "heatmap", - "xAxis": { - "show": true + "msResolution": true, + "shared": true, + "sort": 0, + "value_type": "individual" }, - "xBucketNumber": null, - "xBucketSize": null, - "yAxis": { - "decimals": 1, - "format": "short", - "logBase": 1, - "max": null, - "min": null, + "transformations": [], + "transparent": false, + "type": "graph", + "xaxis": { + "mode": "time", + "name": null, "show": true, - "splitFactor": null + "values": [] }, - "yBucketBound": "upper", - "yBucketNumber": null, - "yBucketSize": null + "yaxes": [ + { + "decimals": null, + "format": "ops", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "decimals": null, + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": 0 + } }, { - "cards": { - "cardPadding": 0, - "cardRound": 0 - }, - "color": { - "cardColor": "#FF9830", - "colorScale": "linear", - "colorScheme": "interpolateSpectral", - "exponent": 0.5, - "max": null, - "min": 0, - "mode": "spectrum" - }, - "dataFormat": "tsbuckets", + "aliasColors": {}, + "bars": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", - "description": "The total cost of saving an event into temporary file. 
\n*This duration is for consuming a batch of events.*", + "description": "The reason of triggering initial scanning.", + "editable": true, + "error": false, "fieldConfig": { - "defaults": {}, - "overrides": [] + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } + }, + "fill": 1, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" }, "gridPos": { "h": 7, - "w": 6, - "x": 6, - "y": 90 + "w": 12, + "x": 0, + "y": 70 }, - "heatmap": {}, - "hideZeroBuckets": true, - "highlightCards": true, - "id": 12841, + "height": null, + "hideTimeOverride": false, + "id": 482, + "interval": null, + "isNew": true, "legend": { "alignAsTable": true, "avg": false, "current": true, + "hideEmpty": true, + "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, - "sort": "current", + "sideWidth": null, + "sort": "max", "sortDesc": true, "total": false, "values": true }, + "lines": true, + "linewidth": 1, "links": [], - "reverseYBuckets": false, + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", + "options": { + "alertThreshold": true, + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "repeatDirection": null, + "seriesOverrides": [], + "span": null, + "stack": false, + "steppedLine": false, "targets": [ { - "exemplar": true, - "expr": "sum(increase(tikv_log_backup_event_handle_duration_sec_bucket{tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", stage=~\"save_to_temp_file\"}[$__interval])) by (le)", - "format": "heatmap", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_log_backup_initial_scan_reason\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (reason) ", + "format": "time_series", + "hide": false, 
"instant": false, "interval": "", - "intervalFactor": 2, - "legendFormat": "{{le}}", - "refId": "A" + "intervalFactor": 1, + "legendFormat": "{{reason}}", + "metric": "", + "query": "sum(rate(\n tikv_log_backup_initial_scan_reason\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (reason) ", + "refId": "", + "step": 10, + "target": "" } ], - "title": "Save to Temp File Duration", + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Initial Scanning Trigger Reason", "tooltip": { - "show": true, - "showHistogram": true - }, - "tooltipDecimals": 1, - "type": "heatmap", - "xAxis": { - "show": true + "msResolution": true, + "shared": true, + "sort": 0, + "value_type": "individual" }, - "xBucketNumber": null, - "xBucketSize": null, - "yAxis": { - "decimals": 1, - "format": "s", - "logBase": 1, - "max": null, - "min": null, + "transformations": [], + "transparent": false, + "type": "graph", + "xaxis": { + "mode": "time", + "name": null, "show": true, - "splitFactor": null + "values": [] }, - "yBucketBound": "upper", - "yBucketNumber": null, - "yBucketSize": null + "yaxes": [ + { + "decimals": null, + "format": "none", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "decimals": null, + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": 0 + } }, { - "cards": { - "cardPadding": 0, - "cardRound": 0 - }, - "color": { - "cardColor": "#FF9830", - "colorScale": "linear", - "colorScheme": "interpolateSpectral", - "exponent": 0.5, - "max": null, - "min": 0, - "mode": "spectrum" - }, - "dataFormat": "tsbuckets", + "aliasColors": {}, + "bars": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", - "description": "The total cost of writing a event into temporary file.\nComparing to the ***Save*** duration, it doesn't contain the time cost of 
routing the task by range / task. \n*This duration is for consuming a batch of events, for one region or one table.*", + "description": "", + "editable": true, + "error": false, "fieldConfig": { - "defaults": {}, - "overrides": [] + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } + }, + "fill": 1, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" }, "gridPos": { "h": 7, - "w": 6, + "w": 12, "x": 12, - "y": 90 + "y": 70 }, - "heatmap": {}, - "hideZeroBuckets": true, - "highlightCards": true, - "id": 13552, + "height": null, + "hideTimeOverride": false, + "id": 483, + "interval": null, + "isNew": true, "legend": { "alignAsTable": true, "avg": false, "current": true, + "hideEmpty": true, + "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, - "sort": "current", + "sideWidth": null, + "sort": "max", "sortDesc": true, "total": false, "values": true }, + "lines": true, + "linewidth": 1, "links": [], - "reverseYBuckets": false, + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", + "options": { + "alertThreshold": true, + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "repeatDirection": null, + "seriesOverrides": [], + "span": null, + "stack": false, + "steppedLine": false, "targets": [ { - "exemplar": true, - "expr": "sum(increase(tikv_log_backup_on_event_duration_seconds_bucket{tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", stage=\"write_to_tempfile\"}[$__interval])) by (le)", - "format": "heatmap", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_log_backup_metadata_key_operation\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (type) ", + "format": "time_series", + "hide": 
false, "instant": false, "interval": "", - "intervalFactor": 2, - "legendFormat": "{{le}}", - "refId": "A" + "intervalFactor": 1, + "legendFormat": "{{type}}", + "metric": "", + "query": "sum(rate(\n tikv_log_backup_metadata_key_operation\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (type) ", + "refId": "", + "step": 10, + "target": "" } ], - "title": "Write to Temp File Duration", + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Region Checkpoint Key Putting", "tooltip": { - "show": true, - "showHistogram": true - }, - "tooltipDecimals": 1, - "type": "heatmap", - "xAxis": { - "show": true + "msResolution": true, + "shared": true, + "sort": 0, + "value_type": "individual" }, - "xBucketNumber": null, - "xBucketSize": null, - "yAxis": { - "decimals": 1, - "format": "s", - "logBase": 1, - "max": null, - "min": null, + "transformations": [], + "transparent": false, + "type": "graph", + "xaxis": { + "mode": "time", + "name": null, "show": true, - "splitFactor": null + "values": [] }, - "yBucketBound": "upper", - "yBucketNumber": null, - "yBucketSize": null + "yaxes": [ + { + "decimals": null, + "format": "cps", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "decimals": null, + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": 0 + } }, { + "cacheTimeout": null, "cards": { - "cardPadding": 0, - "cardRound": 0 + "cardPadding": null, + "cardRound": null }, "color": { - "cardColor": "#FF9830", - "colorScale": "linear", + "cardColor": "#b4ff00", + "colorScale": "sqrt", "colorScheme": "interpolateSpectral", "exponent": 0.5, "max": null, - "min": 0, + "min": null, "mode": "spectrum" }, "dataFormat": "tsbuckets", "datasource": "${DS_TEST-CLUSTER}", - "description": "The duration of collecting metadata and call the UNIX system call *write* for 
each event. \n*This duration is for consuming a batch of events, for one region or one table.*", + "description": null, + "editable": true, + "error": false, "fieldConfig": { - "defaults": {}, - "overrides": [] + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } }, "gridPos": { "h": 7, - "w": 6, - "x": 18, - "y": 90 + "w": 12, + "x": 0, + "y": 77 }, "heatmap": {}, + "height": null, + "hideTimeOverride": false, "hideZeroBuckets": true, "highlightCards": true, - "id": 13551, + "id": 484, + "interval": null, "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "max": true, - "min": false, - "rightSide": true, - "show": true, - "sort": "current", - "sortDesc": true, - "total": false, - "values": true + "show": false }, "links": [], + "maxDataPoints": 512, + "maxPerRow": null, + "minSpan": null, + "options": {}, + "repeat": null, + "repeatDirection": null, "reverseYBuckets": false, + "span": null, "targets": [ { - "exemplar": true, - "expr": "sum(increase(tikv_log_backup_on_event_duration_seconds_bucket{tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", stage=\"syscall_write\"}[$__interval])) by (le)", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tidb_log_backup_advancer_batch_size_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",type=\"checkpoint\"}\n [$__rate_interval]\n)) by (le) ", "format": "heatmap", + "hide": false, "instant": false, "interval": "", - "intervalFactor": 2, + "intervalFactor": 1, "legendFormat": "{{le}}", - "refId": "A" - } - ], - "title": "System Write Call Duration", - "tooltip": { - "show": true, - "showHistogram": true - }, - "tooltipDecimals": 1, - "type": "heatmap", - "xAxis": { - "show": true - }, - "xBucketNumber": null, - "xBucketSize": null, - "yAxis": { - "decimals": 1, - "format": "s", - "logBase": 1, - "max": null, - "min": null, - "show": true, - "splitFactor": null - }, - "yBucketBound": "upper", - "yBucketNumber": null, - 
"yBucketSize": null - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${DS_TEST-CLUSTER}", - "description": "The internal message type count.", - "fieldConfig": { - "defaults": {}, - "overrides": [] - }, - "fill": 1, - "fillGradient": 0, - "gridPos": { - "h": 6, - "w": 12, - "x": 0, - "y": 97 - }, - "hiddenSeries": false, - "id": 14914, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": false, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 1, - "nullPointMode": "null", - "options": { - "alertThreshold": true - }, - "percentage": false, - "pluginVersion": "7.5.11", - "pointradius": 2, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "exemplar": true, - "expr": "sum(rate(tikv_log_backup_interal_actor_acting_duration_sec_count{instance=~\"$instance\"}[$__rate_interval])) by (message)", - "interval": "", - "legendFormat": "{{ message }}", - "queryType": "randomWalk", - "refId": "A" + "metric": "", + "query": "sum(rate(\n tidb_log_backup_advancer_batch_size_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",type=\"checkpoint\"}\n [$__rate_interval]\n)) by (le) ", + "refId": "", + "step": 10, + "target": "" } ], - "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, - "title": "Internal Message Type", + "title": "Request Checkpoint Batch Size", "tooltip": { + "msResolution": true, "shared": true, + "showHistogram": true, "sort": 0, "value_type": "individual" }, - "type": "graph", - "xaxis": { - "buckets": null, + "transformations": [], + "transparent": false, + "type": "heatmap", + "xAxis": { "mode": "time", "name": null, "show": true, "values": [] }, - "yaxes": [ - { - "format": "ops", - "label": null, - "logBase": 2, - "max": null, - "min": "0", - "show": true - }, - { - "format": 
"short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } + "xBucketNumber": null, + "xBucketSize": null, + "yAxis": { + "decimals": 1, + "format": "none", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + "yBucketBound": "upper", + "yBucketNumber": null, + "yBucketSize": null }, { - "aliasColors": { - "watch_task": "orange" + "cacheTimeout": null, + "cards": { + "cardPadding": null, + "cardRound": null }, - "bars": false, - "dashLength": 10, - "dashes": false, + "color": { + "cardColor": "#b4ff00", + "colorScale": "sqrt", + "colorScheme": "interpolateSpectral", + "exponent": 0.5, + "max": null, + "min": null, + "mode": "spectrum" + }, + "dataFormat": "tsbuckets", "datasource": "${DS_TEST-CLUSTER}", - "description": "The internal handling message duration.", + "description": null, + "editable": true, + "error": false, "fieldConfig": { "defaults": { - "unit": "s" - }, - "overrides": [] + "thresholds": { + "mode": "absolute", + "steps": [] + } + } }, - "fill": 1, - "fillGradient": 0, "gridPos": { - "h": 6, - "w": 6, + "h": 7, + "w": 12, "x": 12, - "y": 97 + "y": 77 }, - "hiddenSeries": false, - "id": 14912, + "heatmap": {}, + "height": null, + "hideTimeOverride": false, + "hideZeroBuckets": true, + "highlightCards": true, + "id": 485, + "interval": null, "legend": { - "alignAsTable": false, - "avg": false, - "current": false, - "hideEmpty": false, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 1, - "nullPointMode": "null", - "options": { - "alertThreshold": true + "show": false }, - "percentage": false, - "pluginVersion": "7.5.11", - "pointradius": 2, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "spaceLength": 10, - "stack": false, - "steppedLine": false, + "links": [], + "maxDataPoints": 512, + "maxPerRow": null, + "minSpan": null, + "options": 
{}, + "repeat": null, + "repeatDirection": null, + "reverseYBuckets": false, + "span": null, "targets": [ { - "exemplar": true, - "expr": "sum(histogram_quantile(0.99, rate(tikv_log_backup_interal_actor_acting_duration_sec_bucket{instance=~\"$instance\"}[10m]))) by (message)", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tidb_log_backup_advancer_tick_duration_sec_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",step=\"tick\"}\n [$__rate_interval]\n)) by (le) ", + "format": "heatmap", + "hide": false, + "instant": false, "interval": "", - "legendFormat": "{{ message }}", - "queryType": "randomWalk", - "refId": "A" + "intervalFactor": 1, + "legendFormat": "{{le}}", + "metric": "", + "query": "sum(rate(\n tidb_log_backup_advancer_tick_duration_sec_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",step=\"tick\"}\n [$__rate_interval]\n)) by (le) ", + "refId": "", + "step": 10, + "target": "" } ], - "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, - "title": "Internal Message Handling Duration (P99)", + "title": "Tick Duration", "tooltip": { + "msResolution": true, "shared": true, + "showHistogram": true, "sort": 0, "value_type": "individual" }, - "type": "graph", - "xaxis": { - "buckets": null, + "transformations": [], + "transparent": false, + "type": "heatmap", + "xAxis": { "mode": "time", "name": null, "show": true, "values": [] }, - "yaxes": [ - { - "format": "s", - "label": null, - "logBase": 1, - "max": null, - "min": "0", - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": false - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } + "xBucketNumber": null, + "xBucketSize": null, + "yAxis": { + "decimals": 1, + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + "yBucketBound": "upper", + "yBucketNumber": null, 
+ "yBucketSize": null }, { "aliasColors": {}, "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", - "description": "The internal handling message duration.", + "description": "The reason of advancer failed to be advanced.", + "editable": true, + "error": false, "fieldConfig": { - "defaults": {}, - "overrides": [] + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } }, "fill": 1, - "fillGradient": 0, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, "gridPos": { - "h": 6, - "w": 6, - "x": 18, - "y": 97 + "h": 7, + "w": 12, + "x": 0, + "y": 84 }, - "hiddenSeries": false, - "id": 14913, + "height": null, + "hideTimeOverride": false, + "id": 486, + "interval": null, + "isNew": true, "legend": { + "alignAsTable": true, "avg": false, - "current": false, - "max": false, + "current": true, + "hideEmpty": true, + "hideZero": true, + "max": true, "min": false, + "rightSide": true, "show": true, + "sideWidth": null, + "sort": "max", + "sortDesc": true, "total": false, - "values": false + "values": true }, "lines": true, "linewidth": 1, - "nullPointMode": "null", + "links": [], + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", "options": { - "alertThreshold": true + "alertThreshold": true, + "dataLinks": [] }, "percentage": false, - "pluginVersion": "7.5.11", - "pointradius": 2, + "pointradius": 5, "points": false, "renderer": "flot", + "repeat": null, + "repeatDirection": null, "seriesOverrides": [], - "spaceLength": 10, + "span": null, "stack": false, "steppedLine": false, "targets": [ { - "exemplar": true, - "expr": "sum(histogram_quantile(0.9, rate(tikv_log_backup_interal_actor_acting_duration_sec_bucket{instance=~\"$instance\"}[10m]))) by (message)", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n 
tidb_log_backup_region_request_failure\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",reason!=\"retryable-scan-region\"}\n [$__rate_interval]\n)) by (reason) ", + "format": "time_series", + "hide": false, + "instant": false, "interval": "", - "legendFormat": "{{ message }}", - "queryType": "randomWalk", - "refId": "A" + "intervalFactor": 1, + "legendFormat": "{{reason}}", + "metric": "", + "query": "sum(rate(\n tidb_log_backup_region_request_failure\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",reason!=\"retryable-scan-region\"}\n [$__rate_interval]\n)) by (reason) ", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, - "title": "Internal Message Handling Duration (P90)", + "title": "Region Checkpoint Failure Reason", "tooltip": { + "msResolution": true, "shared": true, "sort": 0, "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -46802,14 +66050,16 @@ }, "yaxes": [ { - "format": "s", + "decimals": null, + "format": "none", "label": null, "logBase": 1, "max": null, - "min": "0", + "min": null, "show": true }, { + "decimals": null, "format": "short", "label": null, "logBase": 1, @@ -46820,78 +66070,112 @@ ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } }, { "aliasColors": {}, "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", - "description": "The internal read throughput of RocksDB during initial scanning. 
This panel can roughly present the read through to the hard disk of initial scanning.", + "description": "The result of getting region checkpoints.", + "editable": true, + "error": false, "fieldConfig": { - "defaults": {}, - "overrides": [] + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } }, "fill": 1, - "fillGradient": 0, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, "gridPos": { - "h": 6, - "w": 6, - "x": 0, - "y": 103 + "h": 7, + "w": 12, + "x": 12, + "y": 84 }, - "hiddenSeries": false, - "id": 14271, + "height": null, + "hideTimeOverride": false, + "id": 487, + "interval": null, + "isNew": true, "legend": { + "alignAsTable": true, "avg": false, - "current": false, - "max": false, + "current": true, + "hideEmpty": true, + "hideZero": true, + "max": true, "min": false, "rightSide": true, "show": true, + "sideWidth": null, + "sort": "max", + "sortDesc": true, "total": false, - "values": false + "values": true }, "lines": true, "linewidth": 1, - "nullPointMode": "null", + "links": [], + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", "options": { - "alertThreshold": true + "alertThreshold": true, + "dataLinks": [] }, "percentage": false, - "pluginVersion": "7.5.11", - "pointradius": 2, + "pointradius": 5, "points": false, "renderer": "flot", + "repeat": null, + "repeatDirection": null, "seriesOverrides": [], - "spaceLength": 10, + "span": null, "stack": false, "steppedLine": false, "targets": [ { - "exemplar": true, - "expr": "sum(rate(tikv_log_backup_initial_scan_operations{instance=~\"$instance\", op=~\"read_bytes\"}[$__rate_interval])) BY (op, cf)", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tidb_log_backup_region_request\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by 
(result) ", + "format": "time_series", + "hide": false, + "instant": false, "interval": "", - "legendFormat": "{{ cf }}", - "queryType": "randomWalk", - "refId": "A" + "intervalFactor": 1, + "legendFormat": "{{result}}", + "metric": "", + "query": "sum(rate(\n tidb_log_backup_region_request\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (result) ", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, - "title": "Initial Scan RocksDB Throughput ", + "title": "Request Result", "tooltip": { + "msResolution": true, "shared": true, "sort": 0, "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -46899,14 +66183,16 @@ }, "yaxes": [ { - "format": "binBps", + "decimals": null, + "format": "none", "label": null, - "logBase": 2, + "logBase": 1, "max": null, - "min": "0", + "min": null, "show": true }, { + "decimals": null, "format": "short", "label": null, "logBase": 1, @@ -46917,78 +66203,112 @@ ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } }, { "aliasColors": {}, "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", - "description": "Misc statistics of RocksDB during initial scanning.", + "description": "The internal handling message duration.", + "editable": true, + "error": false, "fieldConfig": { - "defaults": {}, - "overrides": [] + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } }, "fill": 1, - "fillGradient": 0, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, "gridPos": { - "h": 6, - "w": 6, - "x": 6, - "y": 103 + "h": 7, + "w": 12, + "x": 0, + "y": 91 }, - 
"hiddenSeries": false, - "id": 14270, + "height": null, + "hideTimeOverride": false, + "id": 488, + "interval": null, + "isNew": true, "legend": { + "alignAsTable": true, "avg": false, - "current": false, - "max": false, + "current": true, + "hideEmpty": true, + "hideZero": true, + "max": true, "min": false, "rightSide": true, "show": true, + "sideWidth": null, + "sort": "max", + "sortDesc": true, "total": false, - "values": false + "values": true }, "lines": true, "linewidth": 1, - "nullPointMode": "null", + "links": [], + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", "options": { - "alertThreshold": true + "alertThreshold": true, + "dataLinks": [] }, "percentage": false, - "pluginVersion": "7.5.11", - "pointradius": 2, + "pointradius": 5, "points": false, "renderer": "flot", + "repeat": null, + "repeatDirection": null, "seriesOverrides": [], - "spaceLength": 10, + "span": null, "stack": false, "steppedLine": false, "targets": [ { - "exemplar": true, - "expr": "sum(rate(tikv_log_backup_initial_scan_operations{instance=~\"$instance\", op!~\"read_bytes\"}[$__rate_interval])) BY (op, cf) > 0", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "histogram_quantile(0.99,(\n sum(rate(\n tidb_log_backup_advancer_tick_duration_sec_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (step, le) \n \n \n)) ", + "format": "time_series", + "hide": false, + "instant": false, "interval": "", - "legendFormat": "{{ cf }}/{{ op }}", - "queryType": "randomWalk", - "refId": "A" + "intervalFactor": 1, + "legendFormat": "{{ step }}", + "metric": "", + "query": "histogram_quantile(0.99,(\n sum(rate(\n tidb_log_backup_advancer_tick_duration_sec_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (step, le) \n \n \n)) ", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": 
null, - "timeRegions": [], "timeShift": null, - "title": "Initial Scan RocksDB Operation ", + "title": "Tick Duration (P99)", "tooltip": { + "msResolution": true, "shared": true, "sort": 0, "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -46996,14 +66316,16 @@ }, "yaxes": [ { - "format": "ops", + "decimals": null, + "format": "s", "label": null, - "logBase": 2, + "logBase": 1, "max": null, - "min": "0", + "min": null, "show": true }, { + "decimals": null, "format": "short", "label": null, "logBase": 1, @@ -47014,80 +66336,112 @@ ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } }, { - "aliasColors": { - "leader-changed": "blue", - "region-changed": "purple" - }, - "bars": true, - "dashLength": 10, - "dashes": false, + "aliasColors": {}, + "bars": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", - "description": "The reason of triggering initial scanning.", + "description": "The internal handling message duration.", + "editable": true, + "error": false, "fieldConfig": { - "defaults": {}, - "overrides": [] + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } }, "fill": 1, - "fillGradient": 0, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, "gridPos": { - "h": 6, - "w": 6, + "h": 7, + "w": 12, "x": 12, - "y": 103 + "y": 91 }, - "hiddenSeries": false, - "id": 14915, + "height": null, + "hideTimeOverride": false, + "id": 489, + "interval": null, + "isNew": true, "legend": { + "alignAsTable": true, "avg": false, - "current": false, - "max": false, + "current": true, + "hideEmpty": true, + "hideZero": true, + "max": true, "min": false, + "rightSide": true, "show": true, + "sideWidth": null, + "sort": "max", + "sortDesc": true, "total": false, - "values": 
false + "values": true }, - "lines": false, + "lines": true, "linewidth": 1, - "nullPointMode": "null", + "links": [], + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", "options": { - "alertThreshold": false + "alertThreshold": true, + "dataLinks": [] }, "percentage": false, - "pluginVersion": "7.5.11", - "pointradius": 2, + "pointradius": 5, "points": false, "renderer": "flot", + "repeat": null, + "repeatDirection": null, "seriesOverrides": [], - "spaceLength": 10, - "stack": true, + "span": null, + "stack": false, "steppedLine": false, "targets": [ { - "exemplar": true, - "expr": "sum(increase(tikv_log_backup_initial_scan_reason{instance=~\"$instance\"}[$__rate_interval])) by (reason)", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "histogram_quantile(0.9,(\n sum(rate(\n tidb_log_backup_advancer_tick_duration_sec_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (step, le) \n \n \n)) ", + "format": "time_series", + "hide": false, + "instant": false, "interval": "", - "legendFormat": "{{ message }}", - "queryType": "randomWalk", - "refId": "A" + "intervalFactor": 1, + "legendFormat": "{{ step }}", + "metric": "", + "query": "histogram_quantile(0.9,(\n sum(rate(\n tidb_log_backup_advancer_tick_duration_sec_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (step, le) \n \n \n)) ", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, - "title": "Initial Scanning Trigger Reason", + "title": "Tick Duration (P90)", "tooltip": { + "msResolution": true, "shared": true, "sort": 0, "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -47095,7 +66449,8 @@ }, "yaxes": [ { - "format": "none", + 
"decimals": null, + "format": "s", "label": null, "logBase": 1, "max": null, @@ -47103,6 +66458,7 @@ "show": true }, { + "decimals": null, "format": "short", "label": null, "logBase": 1, @@ -47113,80 +66469,112 @@ ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } }, { - "aliasColors": { - "del": "dark-red", - "put": "green" - }, + "aliasColors": {}, "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", - "description": "", + "description": "The frequent of getting region level checkpoint.", + "editable": true, + "error": false, "fieldConfig": { - "defaults": {}, - "overrides": [] + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } }, "fill": 1, - "fillGradient": 0, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, "gridPos": { - "h": 6, - "w": 6, - "x": 18, - "y": 103 + "h": 7, + "w": 12, + "x": 0, + "y": 98 }, - "hiddenSeries": false, - "id": 15176, + "height": null, + "hideTimeOverride": false, + "id": 490, + "interval": null, + "isNew": true, "legend": { + "alignAsTable": true, "avg": false, - "current": false, - "max": false, + "current": true, + "hideEmpty": true, + "hideZero": true, + "max": true, "min": false, + "rightSide": true, "show": true, + "sideWidth": null, + "sort": "max", + "sortDesc": true, "total": false, - "values": false + "values": true }, "lines": true, "linewidth": 1, - "nullPointMode": "null", + "links": [], + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", "options": { - "alertThreshold": true + "alertThreshold": true, + "dataLinks": [] }, "percentage": false, - "pluginVersion": "7.5.11", - "pointradius": 2, + "pointradius": 5, "points": false, "renderer": "flot", + "repeat": null, + "repeatDirection": null, "seriesOverrides": [], - "spaceLength": 10, + 
"span": null, "stack": false, "steppedLine": false, "targets": [ { - "exemplar": true, - "expr": "sum(rate(tikv_log_backup_metadata_key_operation{instance=~\"$instance\"}[$__rate_interval])) by (type)", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tidb_log_backup_advancer_tick_duration_sec_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",step=\"get-regions-in-range\"}\n [$__rate_interval]\n)) by (step, instance) ", + "format": "time_series", + "hide": false, + "instant": false, "interval": "", - "legendFormat": "{{ type }}", - "queryType": "randomWalk", - "refId": "A" + "intervalFactor": 1, + "legendFormat": "{{step}}-{{instance}}", + "metric": "", + "query": "sum(rate(\n tidb_log_backup_advancer_tick_duration_sec_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",step=\"get-regions-in-range\"}\n [$__rate_interval]\n)) by (step, instance) ", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, - "title": "Region Checkpoint Key Putting", + "title": "Get Region Operation Count", "tooltip": { + "msResolution": true, "shared": true, "sort": 0, "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -47194,7 +66582,8 @@ }, "yaxes": [ { - "format": "cps", + "decimals": null, + "format": "none", "label": null, "logBase": 1, "max": null, @@ -47202,6 +66591,7 @@ "show": true }, { + "decimals": null, "format": "short", "label": null, "logBase": 1, @@ -47212,261 +66602,287 @@ ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } }, { - "cards": { - "cardPadding": 0, - "cardRound": 0 - }, - "color": { - "cardColor": "#FF9830", - "colorScale": "linear", - "colorScheme": "interpolateSpectral", - "exponent": 0.5, - "max": null, - "min": 0, - "mode": "spectrum" - }, - 
"dataFormat": "tsbuckets", + "aliasColors": {}, + "bars": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", - "description": "", + "description": "The variant of checkpoint group.", + "editable": true, + "error": false, "fieldConfig": { - "defaults": {}, - "overrides": [] - }, - "gridPos": { - "h": 7, - "w": 6, - "x": 0, - "y": 109 - }, - "heatmap": {}, - "hideZeroBuckets": true, - "highlightCards": true, - "id": 15544, - "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "max": true, - "min": false, - "rightSide": true, - "show": true, - "sort": "current", - "sortDesc": true, - "total": false, - "values": true - }, - "links": [], - "reverseYBuckets": false, - "targets": [ - { - "exemplar": true, - "expr": "sum(increase(tidb_log_backup_advancer_batch_size_bucket{type=\"checkpoint\"}[$__interval])) by (le)", - "format": "heatmap", - "instant": false, - "interval": "", - "intervalFactor": 2, - "legendFormat": "{{le}}", - "refId": "A" + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } } - ], - "title": "Request Checkpoint Batch Size", - "tooltip": { - "show": true, - "showHistogram": true - }, - "tooltipDecimals": 1, - "type": "heatmap", - "xAxis": { - "show": true - }, - "xBucketNumber": null, - "xBucketSize": null, - "yAxis": { - "decimals": 1, - "format": "none", - "logBase": 1, - "max": null, - "min": null, - "show": true, - "splitFactor": null - }, - "yBucketBound": "upper", - "yBucketNumber": null, - "yBucketSize": null - }, - { - "cards": { - "cardPadding": 0, - "cardRound": 0 - }, - "color": { - "cardColor": "#FF9830", - "colorScale": "linear", - "colorScheme": "interpolateSpectral", - "exponent": 0.5, - "max": null, - "min": 0, - "mode": "spectrum" }, - "dataFormat": "tsbuckets", - "datasource": "${DS_TEST-CLUSTER}", - "description": "", - "fieldConfig": { - "defaults": {}, - "overrides": [] + "fill": 1, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 
0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" }, "gridPos": { "h": 7, - "w": 6, - "x": 6, - "y": 109 + "w": 12, + "x": 12, + "y": 98 }, - "heatmap": {}, - "hideZeroBuckets": true, - "highlightCards": true, - "id": 15716, + "height": null, + "hideTimeOverride": false, + "id": 491, + "interval": null, + "isNew": true, "legend": { "alignAsTable": true, "avg": false, "current": true, + "hideEmpty": true, + "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, - "sort": "current", + "sideWidth": null, + "sort": "max", "sortDesc": true, "total": false, "values": true }, + "lines": true, + "linewidth": 1, "links": [], - "reverseYBuckets": false, + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", + "options": { + "alertThreshold": true, + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "repeatDirection": null, + "seriesOverrides": [], + "span": null, + "stack": false, + "steppedLine": false, "targets": [ { - "exemplar": true, - "expr": "sum(increase(tidb_log_backup_advancer_tick_duration_sec_bucket{tidb_cluster=\"$tidb_cluster\", step=~\"tick\"}[$__interval])) by (le)", - "format": "heatmap", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tidb_log_backup_advancer_tick_duration_sec_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",step=\"try-advance\"}\n [$__rate_interval]\n)) by (step, instance) ", + "format": "time_series", + "hide": false, "instant": false, "interval": "", - "intervalFactor": 2, - "legendFormat": "{{le}}", - "refId": "A" + "intervalFactor": 1, + "legendFormat": "{{step}}-{{instance}}", + "metric": "", + "query": "sum(rate(\n tidb_log_backup_advancer_tick_duration_sec_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\",step=\"try-advance\"}\n [$__rate_interval]\n)) by (step, 
instance) ", + "refId": "", + "step": 10, + "target": "" } ], - "title": "Tick Duration", + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Try Advance Trigger Time", "tooltip": { - "show": true, - "showHistogram": true - }, - "tooltipDecimals": 1, - "type": "heatmap", - "xAxis": { - "show": true + "msResolution": true, + "shared": true, + "sort": 0, + "value_type": "individual" }, - "xBucketNumber": null, - "xBucketSize": null, - "yAxis": { - "decimals": 1, - "format": "s", - "logBase": 1, - "max": null, - "min": null, + "transformations": [], + "transparent": false, + "type": "graph", + "xaxis": { + "mode": "time", + "name": null, "show": true, - "splitFactor": null + "values": [] }, - "yBucketBound": "upper", - "yBucketNumber": null, - "yBucketSize": null - }, + "yaxes": [ + { + "decimals": null, + "format": "none", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "decimals": null, + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": 0 + } + } + ], + "repeat": null, + "repeatDirection": null, + "span": null, + "targets": [], + "timeFrom": null, + "timeShift": null, + "title": "Backup Log", + "transformations": [], + "transparent": false, + "type": "row" + }, + { + "cacheTimeout": null, + "collapsed": true, + "datasource": null, + "description": null, + "editable": true, + "error": false, + "fieldConfig": { + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } + }, + "gridPos": { + "h": 7, + "w": 24, + "x": 0, + "y": 0 + }, + "height": null, + "hideTimeOverride": false, + "id": 492, + "interval": null, + "links": [], + "maxDataPoints": 100, + "maxPerRow": null, + "minSpan": null, + "panels": [ { - "aliasColors": { - "epoch-not-match": "purple", - "not-leader": "blue", - "watch_task": "orange" - }, - "bars": true, - "dashLength": 10, - "dashes": false, + "aliasColors": {}, + 
"bars": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", - "description": "The reason of advancer failed to be advanced.", + "description": "The changing trend of the slowness on I/O operations. 'value > 0' means the related store might have a slow trend.", + "editable": true, + "error": false, "fieldConfig": { "defaults": { - "unit": "none" - }, - "overrides": [] + "thresholds": { + "mode": "absolute", + "steps": [] + } + } }, "fill": 1, - "fillGradient": 0, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, "gridPos": { "h": 7, - "w": 6, - "x": 12, - "y": 109 + "w": 12, + "x": 0, + "y": 0 }, - "hiddenSeries": false, - "id": 23763572666, + "height": null, + "hideTimeOverride": false, + "id": 493, + "interval": null, + "isNew": true, "legend": { - "alignAsTable": false, + "alignAsTable": true, "avg": false, - "current": false, - "hideEmpty": false, - "max": false, + "current": true, + "hideEmpty": true, + "hideZero": true, + "max": true, "min": false, + "rightSide": true, "show": true, + "sideWidth": null, + "sort": "max", + "sortDesc": true, "total": false, - "values": false + "values": true }, - "lines": false, + "lines": true, "linewidth": 1, - "nullPointMode": "null", + "links": [], + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", "options": { - "alertThreshold": true + "alertThreshold": true, + "dataLinks": [] }, "percentage": false, - "pluginVersion": "7.5.11", - "pointradius": 2, + "pointradius": 5, "points": false, "renderer": "flot", + "repeat": null, + "repeatDirection": null, "seriesOverrides": [], - "spaceLength": 10, - "stack": true, + "span": null, + "stack": false, "steppedLine": false, "targets": [ { - "exemplar": true, - "expr": "sum(increase(tidb_log_backup_region_request_failure{reason!=\"retryable-scan-region\"}[$__interval])) by (reason)", - "hide": 
false, - "interval": "", - "intervalFactor": 2, - "legendFormat": "{{ reason }}", - "queryType": "randomWalk", - "refId": "A" - }, - { - "exemplar": true, - "expr": "", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum((\n tikv_raftstore_slow_trend\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n \n)) by (instance) ", + "format": "time_series", "hide": false, + "instant": false, "interval": "", - "legendFormat": "", - "refId": "B" + "intervalFactor": 1, + "legendFormat": "{{instance}}", + "metric": "", + "query": "sum((\n tikv_raftstore_slow_trend\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n \n)) by (instance) ", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, - "title": "Region Checkpoint Failure Reason", + "title": "Slow Trend", "tooltip": { + "msResolution": true, "shared": true, "sort": 0, "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -47474,15 +66890,17 @@ }, "yaxes": [ { + "decimals": null, "format": "none", "label": null, "logBase": 1, "max": null, - "min": "0", + "min": null, "show": true }, { - "format": "none", + "decimals": null, + "format": "short", "label": null, "logBase": 1, "max": null, @@ -47492,101 +66910,112 @@ ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } }, { - "aliasColors": { - "fail": "red", - "success": "green", - "watch_task": "orange" - }, - "bars": true, - "dashLength": 10, - "dashes": false, + "aliasColors": {}, + "bars": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", - "description": "The result of getting region checkpoints.", + "description": "The changing trend of QPS on each store. 
'value < 0' means the QPS has a dropping trend.", + "editable": true, + "error": false, "fieldConfig": { "defaults": { - "unit": "none" - }, - "overrides": [] + "thresholds": { + "mode": "absolute", + "steps": [] + } + } }, "fill": 1, - "fillGradient": 0, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, "gridPos": { "h": 7, - "w": 6, - "x": 18, - "y": 109 + "w": 12, + "x": 12, + "y": 0 }, - "hiddenSeries": false, - "id": 23763572665, + "height": null, + "hideTimeOverride": false, + "id": 494, + "interval": null, + "isNew": true, "legend": { - "alignAsTable": false, + "alignAsTable": true, "avg": false, - "current": false, - "hideEmpty": false, - "max": false, + "current": true, + "hideEmpty": true, + "hideZero": true, + "max": true, "min": false, + "rightSide": true, "show": true, + "sideWidth": null, + "sort": "max", + "sortDesc": true, "total": false, - "values": false + "values": true }, - "lines": false, + "lines": true, "linewidth": 1, - "nullPointMode": "null", + "links": [], + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", "options": { - "alertThreshold": true + "alertThreshold": true, + "dataLinks": [] }, "percentage": false, - "pluginVersion": "7.5.11", - "pointradius": 2, + "pointradius": 5, "points": false, "renderer": "flot", - "seriesOverrides": [ - { - "alias": "fail", - "transform": "negative-Y", - "yaxis": 2 - } - ], - "spaceLength": 10, + "repeat": null, + "repeatDirection": null, + "seriesOverrides": [], + "span": null, "stack": false, "steppedLine": false, "targets": [ { - "exemplar": true, - "expr": "sum(increase(tidb_log_backup_region_request[$__interval])) by (result)", - "hide": false, - "interval": "", - "intervalFactor": 2, - "legendFormat": "{{ result }}", - "queryType": "randomWalk", - "refId": "A" - }, - { - "exemplar": true, - "expr": "", + "datasource": 
"${DS_TEST-CLUSTER}", + "expr": "sum((\n tikv_raftstore_slow_trend_result\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n \n)) by (instance) ", + "format": "time_series", "hide": false, + "instant": false, "interval": "", - "legendFormat": "", - "refId": "B" + "intervalFactor": 1, + "legendFormat": "{{instance}}", + "metric": "", + "query": "sum((\n tikv_raftstore_slow_trend_result\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n \n)) by (instance) ", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, - "title": "Request Result", + "title": "QPS Changing Trend", "tooltip": { + "msResolution": true, "shared": true, "sort": 0, "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -47594,15 +67023,17 @@ }, "yaxes": [ { + "decimals": null, "format": "none", "label": null, "logBase": 1, "max": null, - "min": "0", + "min": null, "show": true }, { - "format": "none", + "decimals": null, + "format": "short", "label": null, "logBase": 1, "max": null, @@ -47612,96 +67043,112 @@ ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } }, { - "aliasColors": { - "watch_task": "orange" - }, + "aliasColors": {}, "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", - "description": "The internal handling message duration.", + "description": "The sampling latency of recent queries. 
A larger value indicates that the store is more likely to be the slowest store.", + "editable": true, + "error": false, "fieldConfig": { "defaults": { - "unit": "s" - }, - "overrides": [] + "thresholds": { + "mode": "absolute", + "steps": [] + } + } }, "fill": 1, - "fillGradient": 0, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, "gridPos": { "h": 7, - "w": 6, + "w": 12, "x": 0, - "y": 116 + "y": 7 }, - "hiddenSeries": false, - "id": 15359, + "height": null, + "hideTimeOverride": false, + "id": 495, + "interval": null, + "isNew": true, "legend": { - "alignAsTable": false, + "alignAsTable": true, "avg": false, - "current": false, - "hideEmpty": false, - "max": false, + "current": true, + "hideEmpty": true, + "hideZero": true, + "max": true, "min": false, + "rightSide": true, "show": true, + "sideWidth": null, + "sort": "max", + "sortDesc": true, "total": false, - "values": false + "values": true }, "lines": true, "linewidth": 1, - "nullPointMode": "null", + "links": [], + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", "options": { - "alertThreshold": true + "alertThreshold": true, + "dataLinks": [] }, "percentage": false, - "pluginVersion": "7.5.11", - "pointradius": 2, + "pointradius": 5, "points": false, "renderer": "flot", - "seriesOverrides": [ - { - "alias": "consistency-check", - "yaxis": 1 - }, - { - "alias": "get-checkpoints-of-store", - "yaxis": 2 - }, - { - "alias": "get-checkpoints-in-range", - "yaxis": 2 - } - ], - "spaceLength": 10, + "repeat": null, + "repeatDirection": null, + "seriesOverrides": [], + "span": null, "stack": false, "steppedLine": false, "targets": [ { - "exemplar": true, - "expr": "sum(histogram_quantile(0.99, rate(tidb_log_backup_advancer_tick_duration_sec_bucket[10m]))) by (step)", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum((\n 
tikv_raftstore_slow_trend_l0\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n \n)) by (instance) ", + "format": "time_series", + "hide": false, + "instant": false, "interval": "", - "legendFormat": "{{ step }}", - "queryType": "randomWalk", - "refId": "A" + "intervalFactor": 1, + "legendFormat": "{{instance}}", + "metric": "", + "query": "sum((\n tikv_raftstore_slow_trend_l0\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n \n)) by (instance) ", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, - "title": "Tick Duration (P99)", + "title": "AVG Sampling Latency", "tooltip": { + "msResolution": true, "shared": true, "sort": 0, "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -47709,15 +67156,17 @@ }, "yaxes": [ { - "format": "s", + "decimals": null, + "format": "\u00b5s", "label": null, "logBase": 1, "max": null, - "min": "0", + "min": null, "show": true }, { - "format": "s", + "decimals": null, + "format": "short", "label": null, "logBase": 1, "max": null, @@ -47727,96 +67176,112 @@ ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } }, { - "aliasColors": { - "watch_task": "orange" - }, + "aliasColors": {}, "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", - "description": "The internal handling message duration.", + "description": "The QPS of each store.", + "editable": true, + "error": false, "fieldConfig": { "defaults": { - "unit": "s" - }, - "overrides": [] + "thresholds": { + "mode": "absolute", + "steps": [] + } + } }, "fill": 1, - "fillGradient": 0, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": 
"rgba(234, 112, 112, 0.22)" + }, "gridPos": { "h": 7, - "w": 6, - "x": 6, - "y": 116 + "w": 12, + "x": 12, + "y": 7 }, - "hiddenSeries": false, - "id": 15360, + "height": null, + "hideTimeOverride": false, + "id": 496, + "interval": null, + "isNew": true, "legend": { - "alignAsTable": false, + "alignAsTable": true, "avg": false, - "current": false, - "hideEmpty": false, - "max": false, + "current": true, + "hideEmpty": true, + "hideZero": true, + "max": true, "min": false, + "rightSide": true, "show": true, + "sideWidth": null, + "sort": "max", + "sortDesc": true, "total": false, - "values": false + "values": true }, "lines": true, "linewidth": 1, - "nullPointMode": "null", + "links": [], + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", "options": { - "alertThreshold": true + "alertThreshold": true, + "dataLinks": [] }, "percentage": false, - "pluginVersion": "7.5.11", - "pointradius": 2, + "pointradius": 5, "points": false, "renderer": "flot", - "seriesOverrides": [ - { - "alias": "get-checkpoints-of-store", - "yaxis": 2 - }, - { - "alias": "get-checkpoints-in-range", - "yaxis": 2 - }, - { - "alias": "consistency-check", - "yaxis": 1 - } - ], - "spaceLength": 10, + "repeat": null, + "repeatDirection": null, + "seriesOverrides": [], + "span": null, "stack": false, "steppedLine": false, "targets": [ { - "exemplar": true, - "expr": "sum(histogram_quantile(0.9, rate(tidb_log_backup_advancer_tick_duration_sec_bucket[10m]))) by (step)", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum((\n tikv_raftstore_slow_trend_result_value\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n \n)) by (instance) ", + "format": "time_series", + "hide": false, + "instant": false, "interval": "", - "legendFormat": "{{ step }}", - "queryType": "randomWalk", - "refId": "A" + "intervalFactor": 1, + "legendFormat": "{{instance}}", + "metric": "", + "query": "sum((\n 
tikv_raftstore_slow_trend_result_value\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n \n)) by (instance) ", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, - "title": "Tick Duration (P90)", + "title": "QPS of each store", "tooltip": { + "msResolution": true, "shared": true, "sort": 0, "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -47824,15 +67289,17 @@ }, "yaxes": [ { - "format": "s", + "decimals": null, + "format": "ops", "label": null, "logBase": 1, "max": null, - "min": "0", + "min": null, "show": true }, { - "format": "s", + "decimals": null, + "format": "short", "label": null, "logBase": 1, "max": null, @@ -47842,96 +67309,222 @@ ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } - }, + } + ], + "repeat": null, + "repeatDirection": null, + "span": null, + "targets": [], + "timeFrom": null, + "timeShift": null, + "title": "Slow Trend Statistics", + "transformations": [], + "transparent": false, + "type": "row" + }, + { + "cacheTimeout": null, + "collapsed": true, + "datasource": null, + "description": null, + "editable": true, + "error": false, + "fieldConfig": { + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } + }, + "gridPos": { + "h": 7, + "w": 24, + "x": 0, + "y": 0 + }, + "height": null, + "hideTimeOverride": false, + "id": 497, + "interval": null, + "links": [], + "maxDataPoints": 100, + "maxPerRow": null, + "minSpan": null, + "panels": [ { - "aliasColors": { - "watch_task": "orange" - }, + "aliasColors": {}, "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", - "description": "The frequent of getting region level checkpoint.", + "description": "The 99 quantile durtion of status server API requests", + 
"editable": true, + "error": false, "fieldConfig": { "defaults": { - "unit": "none" - }, - "overrides": [] + "thresholds": { + "mode": "absolute", + "steps": [] + } + } }, "fill": 1, - "fillGradient": 0, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, "gridPos": { "h": 7, - "w": 6, - "x": 12, - "y": 116 + "w": 12, + "x": 0, + "y": 0 }, - "hiddenSeries": false, - "id": 23763572733, + "height": null, + "hideTimeOverride": false, + "id": 498, + "interval": null, + "isNew": true, "legend": { - "alignAsTable": false, + "alignAsTable": true, "avg": false, - "current": false, - "hideEmpty": false, - "max": false, + "current": true, + "hideEmpty": true, + "hideZero": true, + "max": true, "min": false, + "rightSide": true, "show": true, + "sideWidth": null, + "sort": "max", + "sortDesc": true, "total": false, - "values": false + "values": true }, "lines": true, "linewidth": 1, - "nullPointMode": "null", + "links": [], + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", "options": { - "alertThreshold": true + "alertThreshold": true, + "dataLinks": [] }, "percentage": false, - "pluginVersion": "7.5.11", - "pointradius": 2, + "pointradius": 5, "points": false, "renderer": "flot", + "repeat": null, + "repeatDirection": null, "seriesOverrides": [ { - "alias": "get-checkpoints-of-store", - "yaxis": 2 - }, - { - "alias": "get-checkpoints-in-range", - "yaxis": 2 + "alias": "count", + "bars": false, + "dashLength": 1, + "dashes": true, + "fill": 2, + "fillBelowTo": null, + "lines": true, + "spaceLength": 1, + "transform": "negative-Y", + "yaxis": 2, + "zindex": -3 }, { - "alias": "consistency-check", - "yaxis": 2 + "alias": "avg", + "bars": false, + "fill": 7, + "fillBelowTo": null, + "lines": true, + "yaxis": 1, + "zindex": 0 } ], - "spaceLength": 10, + "span": null, "stack": false, "steppedLine": false, 
"targets": [ { - "exemplar": true, - "expr": "rate(tidb_log_backup_advancer_tick_duration_sec_count{step=\"get-regions-in-range\"}[$__rate_interval])", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "histogram_quantile(0.9999,(\n sum(rate(\n tikv_status_server_request_duration_seconds_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (path, le) \n \n \n)) ", + "format": "time_series", + "hide": true, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "99.99%-{{path}}", + "metric": "", + "query": "histogram_quantile(0.9999,(\n sum(rate(\n tikv_status_server_request_duration_seconds_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (path, le) \n \n \n)) ", + "refId": "", + "step": 10, + "target": "" + }, + { + "datasource": "${DS_TEST-CLUSTER}", + "expr": "histogram_quantile(0.99,(\n sum(rate(\n tikv_status_server_request_duration_seconds_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (path, le) \n \n \n)) ", + "format": "time_series", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "99%-{{path}}", + "metric": "", + "query": "histogram_quantile(0.99,(\n sum(rate(\n tikv_status_server_request_duration_seconds_bucket\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (path, le) \n \n \n)) ", + "refId": "", + "step": 10, + "target": "" + }, + { + "datasource": "${DS_TEST-CLUSTER}", + "expr": "(sum(rate(\n tikv_status_server_request_duration_seconds_sum\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (path) / sum(rate(\n tikv_status_server_request_duration_seconds_count\n 
{k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (path) )", + "format": "time_series", + "hide": true, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "avg-{{path}}", + "metric": "", + "query": "(sum(rate(\n tikv_status_server_request_duration_seconds_sum\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (path) / sum(rate(\n tikv_status_server_request_duration_seconds_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (path) )", + "refId": "", + "step": 10, + "target": "" + }, + { + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_status_server_request_duration_seconds_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (path) ", + "format": "time_series", + "hide": true, + "instant": false, "interval": "", - "legendFormat": "{{ step }} {{ instance }}", - "queryType": "randomWalk", - "refId": "A" + "intervalFactor": 1, + "legendFormat": "count-{{path}}", + "metric": "", + "query": "sum(rate(\n tikv_status_server_request_duration_seconds_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (path) ", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, - "title": "Get Region Operation Count", + "title": "Status API Request Duration", "tooltip": { + "msResolution": true, "shared": true, "sort": 0, "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -47939,15 +67532,17 @@ }, "yaxes": [ { - "format": "none", + "decimals": null, + "format": "s", "label": null, "logBase": 1, "max": null, - "min": "0", + 
"min": null, "show": true }, { - "format": "s", + "decimals": null, + "format": "short", "label": null, "logBase": 1, "max": null, @@ -47957,97 +67552,112 @@ ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } }, { - "aliasColors": { - "watch_task": "orange" - }, - "bars": true, - "dashLength": 10, - "dashes": false, + "aliasColors": {}, + "bars": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", - "description": "The variant of checkpoint group.", + "description": null, + "editable": true, + "error": false, "fieldConfig": { "defaults": { - "unit": "none" - }, - "overrides": [] + "thresholds": { + "mode": "absolute", + "steps": [] + } + } }, "fill": 1, - "fillGradient": 0, + "fillGradient": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, "gridPos": { "h": 7, - "w": 6, - "x": 18, - "y": 116 + "w": 12, + "x": 12, + "y": 0 }, - "hiddenSeries": false, - "id": 23763572734, + "height": null, + "hideTimeOverride": false, + "id": 499, + "interval": null, + "isNew": true, "legend": { - "alignAsTable": false, + "alignAsTable": true, "avg": false, - "current": false, - "hideEmpty": false, - "max": false, + "current": true, + "hideEmpty": true, + "hideZero": true, + "max": true, "min": false, + "rightSide": true, "show": true, + "sideWidth": null, + "sort": "max", + "sortDesc": true, "total": false, - "values": false + "values": true }, - "lines": false, + "lines": true, "linewidth": 1, - "nullPointMode": "null", + "links": [], + "maxDataPoints": null, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "null as zero", "options": { - "alertThreshold": true + "alertThreshold": true, + "dataLinks": [] }, "percentage": false, - "pluginVersion": "7.5.11", - "pointradius": 2, + "pointradius": 5, "points": false, "renderer": "flot", - "seriesOverrides": [ - { - "alias": "get-checkpoints-of-store", - "yaxis": 2 - }, - { - 
"alias": "get-checkpoints-in-range", - "yaxis": 2 - }, - { - "alias": "consistency-check", - "yaxis": 2 - } - ], - "spaceLength": 10, + "repeat": null, + "repeatDirection": null, + "seriesOverrides": [], + "span": null, "stack": false, "steppedLine": false, "targets": [ { - "exemplar": true, - "expr": "increase(tidb_log_backup_advancer_tick_duration_sec_count{step=\"try-advance\"}[$__interval])", + "datasource": "${DS_TEST-CLUSTER}", + "expr": "sum(rate(\n tikv_status_server_request_duration_seconds_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (path) ", + "format": "time_series", + "hide": false, + "instant": false, "interval": "", - "intervalFactor": 2, - "legendFormat": "{{ step }} {{ instance }}", - "queryType": "randomWalk", - "refId": "A" + "intervalFactor": 1, + "legendFormat": "{{path}}", + "metric": "", + "query": "sum(rate(\n tikv_status_server_request_duration_seconds_count\n {k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\"}\n [$__rate_interval]\n)) by (path) ", + "refId": "", + "step": 10, + "target": "" } ], "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, - "title": "Try Advance Trigger Time", + "title": "Status API Request (op/s)", "tooltip": { + "msResolution": true, "shared": true, "sort": 0, "value_type": "individual" }, + "transformations": [], + "transparent": false, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", "name": null, "show": true, @@ -48055,15 +67665,17 @@ }, "yaxes": [ { - "format": "none", + "decimals": null, + "format": "ops", "label": null, "logBase": 1, "max": null, - "min": "0", + "min": null, "show": true }, { - "format": "s", + "decimals": null, + "format": "short", "label": null, "logBase": 1, "max": null, @@ -48073,179 +67685,189 @@ ], "yaxis": { "align": false, - "alignLevel": null + "alignLevel": 0 } } ], - "title": "Backup Log", + "repeat": null, + "repeatDirection": 
null, + "span": null, + "targets": [], + "timeFrom": null, + "timeShift": null, + "title": "Status Server", + "transformations": [], + "transparent": false, "type": "row" } ], "refresh": "1m", - "schemaVersion": 27, + "rows": [], + "schemaVersion": 14, + "sharedCrosshair": false, "style": "dark", "tags": [], "templating": { "list": [ { "allValue": null, - "current": {}, + "auto": false, + "auto_count": 30, + "auto_min": "10s", + "current": { + "selected": false, + "tags": [], + "text": null, + "value": null + }, "datasource": "${DS_TEST-CLUSTER}", - "definition": "", - "description": null, - "error": null, "hide": 2, "includeAll": false, - "label": "K8s-cluster", + "label": "k8s_cluster", "multi": false, "name": "k8s_cluster", "options": [], - "query": { - "query": "label_values(tikv_engine_block_cache_size_bytes, k8s_cluster)", - "refId": "quota-k8s_cluster-Variable-Query" - }, + "query": "label_values(tikv_engine_block_cache_size_bytes, k8s_cluster)", "refresh": 2, - "regex": "", - "skipUrlSync": false, + "regex": null, "sort": 1, - "tagValuesQuery": "", - "tags": [], - "tagsQuery": "", + "tagValuesQuery": null, + "tagsQuery": null, "type": "query", "useTags": false }, { "allValue": null, - "current": {}, + "auto": false, + "auto_count": 30, + "auto_min": "10s", + "current": { + "selected": false, + "tags": [], + "text": null, + "value": null + }, "datasource": "${DS_TEST-CLUSTER}", - "definition": "", - "description": null, - "error": null, "hide": 2, "includeAll": false, "label": "tidb_cluster", "multi": false, "name": "tidb_cluster", "options": [], - "query": { - "query": "label_values(tikv_engine_block_cache_size_bytes{k8s_cluster=\"$k8s_cluster\"}, tidb_cluster)", - "refId": "quota-tidb_cluster-Variable-Query" - }, + "query": "label_values(tikv_engine_block_cache_size_bytes{k8s_cluster =\"$k8s_cluster\"}, tidb_cluster)", "refresh": 2, - "regex": "", - "skipUrlSync": false, + "regex": null, "sort": 1, - "tagValuesQuery": "", - "tags": [], - "tagsQuery": "", + 
"tagValuesQuery": null, + "tagsQuery": null, "type": "query", "useTags": false }, { "allValue": null, - "current": {}, + "auto": false, + "auto_count": 30, + "auto_min": "10s", + "current": { + "selected": false, + "tags": [], + "text": null, + "value": null + }, "datasource": "${DS_TEST-CLUSTER}", - "definition": "", - "description": null, - "error": null, "hide": 0, "includeAll": true, "label": "db", "multi": true, "name": "db", "options": [], - "query": { - "query": "label_values(tikv_engine_block_cache_size_bytes{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\"}, db)", - "refId": "quota-db-Variable-Query" - }, - "refresh": 1, - "regex": "", - "skipUrlSync": false, + "query": "label_values(tikv_engine_block_cache_size_bytes{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\"}, db)", + "refresh": 2, + "regex": null, "sort": 1, - "tagValuesQuery": "", - "tags": [], - "tagsQuery": "", + "tagValuesQuery": null, + "tagsQuery": null, "type": "query", "useTags": false }, { "allValue": null, - "current": {}, + "auto": false, + "auto_count": 30, + "auto_min": "10s", + "current": { + "selected": false, + "tags": [], + "text": null, + "value": null + }, "datasource": "${DS_TEST-CLUSTER}", - "definition": "label_values(tikv_storage_command_total{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\"}, type)", - "description": null, - "error": null, "hide": 0, "includeAll": true, "label": "command", "multi": true, "name": "command", "options": [], - "query": { - "query": "query_result(tikv_storage_command_total{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\"} != 0)", - "refId": "StandardVariableQuery" - }, - "refresh": 1, - "regex": "/type=\"([^\"]+)\"/", - "skipUrlSync": false, + "query": "query_result(tikv_storage_command_total{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\"} != 0)", + "refresh": 2, + "regex": "/\\btype=\"([^\"]+)\"/", "sort": 1, - "tagValuesQuery": "", - "tags": [], - "tagsQuery": "", + 
"tagValuesQuery": null, + "tagsQuery": null, "type": "query", "useTags": false }, { "allValue": ".*", - "current": {}, + "auto": false, + "auto_count": 30, + "auto_min": "10s", + "current": { + "selected": false, + "tags": [], + "text": null, + "value": null + }, "datasource": "${DS_TEST-CLUSTER}", - "definition": "", - "description": null, - "error": null, "hide": 0, "includeAll": true, - "label": "Instance", + "label": "instance", "multi": false, "name": "instance", "options": [], - "query": { - "query": "label_values(tikv_engine_size_bytes{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\"}, instance)", - "refId": "quota-instance-Variable-Query" - }, - "refresh": 1, - "regex": "", - "skipUrlSync": false, + "query": "label_values(tikv_engine_size_bytes{k8s_cluster =\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\"}, instance)", + "refresh": 2, + "regex": null, "sort": 1, - "tagValuesQuery": "", - "tags": [], - "tagsQuery": "", + "tagValuesQuery": null, + "tagsQuery": null, "type": "query", "useTags": false }, { "allValue": null, - "current": {}, + "auto": false, + "auto_count": 30, + "auto_min": "10s", + "current": { + "selected": false, + "tags": [], + "text": null, + "value": null + }, "datasource": "${DS_TEST-CLUSTER}", - "definition": "label_values(tikv_engine_titandb_num_live_blob_file{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\"}, db)", - "description": null, - "error": null, "hide": 2, "includeAll": true, "label": "titan_db", "multi": true, "name": "titan_db", "options": [], - "query": { - "query": "label_values(tikv_engine_titandb_num_live_blob_file{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\"}, db)", - "refId": "quota-titan_db-Variable-Query" - }, - "refresh": 1, - "regex": "", - "skipUrlSync": false, + "query": "label_values(tikv_engine_titandb_num_live_blob_file{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\"}, db)", + "refresh": 2, + "regex": null, "sort": 1, - "tagValuesQuery": "", - "tags": [], - 
"tagsQuery": "", + "tagValuesQuery": null, + "tagsQuery": null, "type": "query", "useTags": false } @@ -48256,6 +67878,7 @@ "to": "now" }, "timepicker": { + "hidden": false, "refresh_intervals": [ "5s", "10s", @@ -48283,5 +67906,5 @@ "timezone": "browser", "title": "Test-Cluster-TiKV-Details", "uid": "RDVQiEzZz", - "version": 1 -} \ No newline at end of file + "version": 0 +} diff --git a/metrics/grafana/tikv_details.json.sha256 b/metrics/grafana/tikv_details.json.sha256 new file mode 100644 index 00000000000..cc9c7769755 --- /dev/null +++ b/metrics/grafana/tikv_details.json.sha256 @@ -0,0 +1 @@ +75c3d3d71080a5e3bd40273bc2250797ab929e6c6ab46df89cad79d837531a2d ./metrics/grafana/tikv_details.json diff --git a/metrics/grafana/tikv_fast_tune.json b/metrics/grafana/tikv_fast_tune.json index 85e9d5c7f02..f5c3a634c77 100644 --- a/metrics/grafana/tikv_fast_tune.json +++ b/metrics/grafana/tikv_fast_tune.json @@ -2712,7 +2712,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum(rate(tikv_futurepool_pending_task_total{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", name=~\"sched-worker-.*\"}[1m]))", + "expr": "sum(avg_over_time(tikv_futurepool_pending_task_total{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", name=~\"sched-worker-.*\"}[1m]))", "format": "time_series", "hide": false, "intervalFactor": 1, @@ -5629,7 +5629,7 @@ "steppedLine": false, "targets": [ { - "expr": "histogram_quantile(0.99, sum(rate(tikv_coprocessor_request_wait_seconds_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", req=~\"select|index\"}[1m])) by (le))", + "expr": "histogram_quantile(0.99, sum(rate(tikv_coprocessor_request_wait_seconds_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", req=~\"select|index\", type=\"all\"}[1m])) by (le))", "format": "time_series", "hide": false, "intervalFactor": 1, @@ -5645,14 +5645,14 @@ "refId": 
"A" }, { - "expr": "histogram_quantile(0.999, sum(rate(tikv_coprocessor_request_wait_seconds_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", req=~\"select|index\"}[1m])) by (le))", + "expr": "histogram_quantile(0.999, sum(rate(tikv_coprocessor_request_wait_seconds_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", req=~\"select|index\", type=\"all\"}[1m])) by (le))", "format": "time_series", "intervalFactor": 1, "legendFormat": "duration-999%", "refId": "B" }, { - "expr": "histogram_quantile(1, sum(rate(tikv_coprocessor_request_wait_seconds_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", req=~\"select|index\"}[1m])) by (le))", + "expr": "histogram_quantile(1, sum(rate(tikv_coprocessor_request_wait_seconds_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", req=~\"select|index\", type=\"all\"}[1m])) by (le))", "format": "time_series", "hide": true, "intervalFactor": 1, @@ -5763,7 +5763,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum(rate(tikv_futurepool_pending_task_total{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", name=~\"cop-normal\"}[1m]))", + "expr": "sum(avg_over_time(tikv_futurepool_pending_task_total{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", name=~\"cop-normal\"}[1m]))", "format": "time_series", "hide": false, "intervalFactor": 1, diff --git a/metrics/grafana/tikv_trouble_shooting.json b/metrics/grafana/tikv_trouble_shooting.json index bf1fd5baacf..f4f5261ad3c 100644 --- a/metrics/grafana/tikv_trouble_shooting.json +++ b/metrics/grafana/tikv_trouble_shooting.json @@ -3995,14 +3995,14 @@ "steppedLine": false, "targets": [ { - "expr": "histogram_quantile(0.9999, sum(rate(tikv_coprocessor_request_wait_seconds_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", 
instance=~\"$instance\"}[1m])) by (le,req))", + "expr": "histogram_quantile(0.9999, sum(rate(tikv_coprocessor_request_wait_seconds_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", type=\"all\"}[1m])) by (le,req))", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{req}}-99.99%", "refId": "D" }, { - "expr": "histogram_quantile(0.99, sum(rate(tikv_coprocessor_request_wait_seconds_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])) by (le,req))", + "expr": "histogram_quantile(0.99, sum(rate(tikv_coprocessor_request_wait_seconds_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", type=\"all\"}[1m])) by (le,req))", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{req}}-99%", @@ -4010,7 +4010,7 @@ "step": 4 }, { - "expr": "histogram_quantile(0.95, sum(rate(tikv_coprocessor_request_wait_seconds_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])) by (le,req))", + "expr": "histogram_quantile(0.95, sum(rate(tikv_coprocessor_request_wait_seconds_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", type=\"all\"}[1m])) by (le,req))", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{req}}-95%", @@ -4234,7 +4234,7 @@ "steppedLine": false, "targets": [ { - "expr": "histogram_quantile(0.95, sum(rate(tikv_coprocessor_request_wait_seconds_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])) by (le, instance,req))", + "expr": "histogram_quantile(0.95, sum(rate(tikv_coprocessor_request_wait_seconds_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", type=\"all\"}[1m])) by (le, instance,req))", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}}-{{req}}", diff --git 
a/proxy_components/engine_store_ffi/Cargo.toml b/proxy_components/engine_store_ffi/Cargo.toml index 04a97f7758d..21eb18ce61d 100644 --- a/proxy_components/engine_store_ffi/Cargo.toml +++ b/proxy_components/engine_store_ffi/Cargo.toml @@ -23,9 +23,14 @@ test-engines-panic = [ "engine_test/test-engines-panic", ] -cloud-aws = ["sst_importer/cloud-aws"] -cloud-gcp = ["sst_importer/cloud-gcp"] -cloud-azure = ["sst_importer/cloud-azure"] +cloud-aws = ["encryption_export/cloud-aws"] +cloud-gcp = ["encryption_export/cloud-gcp"] +cloud-azure = ["encryption_export/cloud-azure"] + +# TODO use encryption/openssl-vendored if later supports +openssl-vendored = [ + "openssl/vendored" +] [dependencies] batch-system = { workspace = true, default-features = false } @@ -36,7 +41,7 @@ collections = { workspace = true } crossbeam = "0.8" derivative = "2" encryption = { workspace = true, default-features = false } - +openssl = { workspace = true } # TODO only for feature engine_rocks = { workspace = true, default-features = false } # Should be [dev-dependencies] but we need to control the features # https://github.com/rust-lang/cargo/issues/6915 diff --git a/proxy_components/engine_store_ffi/src/core/forwarder.rs b/proxy_components/engine_store_ffi/src/core/forwarder.rs index 7bc06ad427f..84d5bbc9616 100644 --- a/proxy_components/engine_store_ffi/src/core/forwarder.rs +++ b/proxy_components/engine_store_ffi/src/core/forwarder.rs @@ -54,7 +54,7 @@ pub struct ProxyForwarder { pub engine_store_server_helper: &'static EngineStoreServerHelper, pub engine: TiFlashEngine, pub raft_engine: ER, - pub sst_importer: Arc, + pub sst_importer: Arc>, pub pre_handle_snapshot_ctx: Arc>, pub apply_snap_pool: Option>>, pub pending_delete_ssts: Arc>>, @@ -90,7 +90,7 @@ impl ProxyForwarder { store_id: u64, engine: engine_tiflash::MixedModeEngine, raft_engine: ER, - sst_importer: Arc, + sst_importer: Arc>, trans: T, snap_mgr: SnapManager, packed_envs: PackedEnvs, diff --git 
a/proxy_components/engine_store_ffi/src/observer.rs b/proxy_components/engine_store_ffi/src/observer.rs index 41f47754ec9..5da31809092 100644 --- a/proxy_components/engine_store_ffi/src/observer.rs +++ b/proxy_components/engine_store_ffi/src/observer.rs @@ -21,7 +21,10 @@ use raftstore::{ }; use sst_importer::SstImporter; -use crate::core::{DebugStruct, PackedEnvs, ProxyForwarder}; +use crate::{ + core::{DebugStruct, PackedEnvs, ProxyForwarder}, + TiFlashEngine, +}; // TiFlash observer's priority should be higher than all other observers, to // avoid being bypassed. @@ -38,7 +41,7 @@ impl TiFlashObserver { store_id: u64, engine: engine_tiflash::MixedModeEngine, raft_engine: ER, - sst_importer: Arc, + sst_importer: Arc>, trans: T, snap_mgr: SnapManager, packed_envs: PackedEnvs, diff --git a/proxy_components/engine_tiflash/Cargo.toml b/proxy_components/engine_tiflash/Cargo.toml index f70029b4732..4c3297f5deb 100644 --- a/proxy_components/engine_tiflash/Cargo.toml +++ b/proxy_components/engine_tiflash/Cargo.toml @@ -36,6 +36,11 @@ test-engines-panic = [ "engine_test/test-engines-panic", ] +# TODO use encryption/openssl-vendored if later supports +openssl-vendored = [ + "openssl/vendored" +] + [dependencies] api_version = { workspace = true, default-features = false } case_macros = { workspace = true } @@ -43,6 +48,7 @@ collections = { workspace = true, default-features = false } derive_more = "0.99.3" encryption = { workspace = true, default-features = false } +openssl = { workspace = true } # TODO only for feature engine_rocks = { workspace = true, default-features = false } engine_traits = { workspace = true, default-features = false } # TODO: Imported for `test-` features to make cargo clippy happy. 
diff --git a/proxy_components/engine_tiflash/src/cf_options.rs b/proxy_components/engine_tiflash/src/cf_options.rs index 1162c67f210..6a2372fb31f 100644 --- a/proxy_components/engine_tiflash/src/cf_options.rs +++ b/proxy_components/engine_tiflash/src/cf_options.rs @@ -40,6 +40,23 @@ impl RocksCfOptions { pub fn into_raw(self) -> RawCfOptions { self.0 } + + pub fn set_flush_size(&mut self, f: usize) -> Result<()> { + if let Some(m) = self.0.get_write_buffer_manager() { + m.set_flush_size(f); + } else { + return Err(box_err!("write buffer manager not found")); + } + Ok(()) + } + + pub fn get_flush_size(&self) -> Result { + if let Some(m) = self.0.get_write_buffer_manager() { + return Ok(m.flush_size() as u64); + } + + Err(box_err!("write buffer manager not found")) + } } impl Deref for RocksCfOptions { diff --git a/proxy_components/engine_tiflash/src/compact.rs b/proxy_components/engine_tiflash/src/compact.rs index 199b7d9f3be..f64c9a7d49e 100644 --- a/proxy_components/engine_tiflash/src/compact.rs +++ b/proxy_components/engine_tiflash/src/compact.rs @@ -121,6 +121,10 @@ impl CompactExt for RocksEngine { db.compact_files_cf(handle, &opts, &files, output_level) .map_err(r2e) } + + fn check_in_range(&self, start: Option<&[u8]>, end: Option<&[u8]>) -> Result<()> { + self.as_inner().check_in_range(start, end).map_err(r2e) + } } #[cfg(test)] diff --git a/proxy_components/engine_tiflash/src/db_options.rs b/proxy_components/engine_tiflash/src/db_options.rs index c9ef2cfda98..64a763a19eb 100644 --- a/proxy_components/engine_tiflash/src/db_options.rs +++ b/proxy_components/engine_tiflash/src/db_options.rs @@ -112,6 +112,14 @@ impl DbOptions for RocksDbOptions { fn set_titandb_options(&mut self, opts: &Self::TitanDbOptions) { self.0.set_titandb_options(opts.as_raw()) } + + fn get_flush_size(&self) -> Result { + if let Some(m) = self.0.get_write_buffer_manager() { + return Ok(m.flush_size() as u64); + } + + Err(box_err!("write buffer manager not found")) + } } pub struct 
RocksTitanDbOptions(RawTitanDBOptions); diff --git a/proxy_components/engine_tiflash/src/encryption.rs b/proxy_components/engine_tiflash/src/encryption.rs index 494fbc3cf45..75dc407e3c3 100644 --- a/proxy_components/engine_tiflash/src/encryption.rs +++ b/proxy_components/engine_tiflash/src/encryption.rs @@ -2,11 +2,10 @@ use std::{io::Result, sync::Arc}; -use encryption::{self, DataKeyManager}; -use engine_traits::{EncryptionKeyManager, EncryptionMethod, FileEncryptionInfo}; +use encryption::{DataKeyManager, FileEncryptionInfo}; +use kvproto::encryptionpb::EncryptionMethod; use rocksdb::{ - DBEncryptionMethod, EncryptionKeyManager as DBEncryptionKeyManager, - FileEncryptionInfo as DBFileEncryptionInfo, + DBEncryptionMethod, EncryptionKeyManager, FileEncryptionInfo as DBFileEncryptionInfo, }; use crate::{r2e, raw::Env}; @@ -15,23 +14,29 @@ use crate::{r2e, raw::Env}; pub(crate) fn get_env( base_env: Option>, key_manager: Option>, -) -> engine_traits::Result> { - let base_env = base_env.unwrap_or_else(|| Arc::new(Env::default())); +) -> engine_traits::Result>> { if let Some(manager) = key_manager { - Ok(Arc::new( + let base_env = base_env.unwrap_or_else(|| Arc::new(Env::default())); + Ok(Some(Arc::new( Env::new_key_managed_encrypted_env(base_env, WrappedEncryptionKeyManager { manager }) .map_err(r2e)?, - )) + ))) } else { Ok(base_env) } } -pub struct WrappedEncryptionKeyManager { - manager: Arc, +pub struct WrappedEncryptionKeyManager { + manager: Arc, } -impl DBEncryptionKeyManager for WrappedEncryptionKeyManager { +impl WrappedEncryptionKeyManager { + pub fn new(manager: Arc) -> Self { + Self { manager } + } +} + +impl EncryptionKeyManager for WrappedEncryptionKeyManager { fn get_file(&self, fname: &str) -> Result { self.manager .get_file(fname) diff --git a/proxy_components/engine_tiflash/src/engine.rs b/proxy_components/engine_tiflash/src/engine.rs index 7a90ab9952d..27c566a8a02 100644 --- a/proxy_components/engine_tiflash/src/engine.rs +++ 
b/proxy_components/engine_tiflash/src/engine.rs @@ -12,7 +12,7 @@ use std::{ pub(crate) use details::RocksEngine; pub use details::RocksEngine as MixedModeEngine; use engine_rocks::RocksSnapshot; -use engine_traits::{Checkpointable, Checkpointer, Error, KvEngine, Result}; +use engine_traits::{Checkpointable, Checkpointer, Error, KvEngine, Result, SnapshotContext}; use rocksdb::DB; use crate::{ @@ -126,8 +126,8 @@ impl RocksEngine { impl KvEngine for RocksEngine { type Snapshot = RocksSnapshot; - fn snapshot(&self) -> RocksSnapshot { - self.rocks.snapshot() + fn snapshot(&self, x: Option) -> RocksSnapshot { + self.rocks.snapshot(x) } fn sync(&self) -> Result<()> { diff --git a/proxy_components/engine_tiflash/src/lib.rs b/proxy_components/engine_tiflash/src/lib.rs index aabf133618f..5226b6f41bb 100644 --- a/proxy_components/engine_tiflash/src/lib.rs +++ b/proxy_components/engine_tiflash/src/lib.rs @@ -128,5 +128,5 @@ pub fn get_env( limiter: Option>, ) -> engine_traits::Result> { let env = encryption::get_env(None /* base_env */, key_manager)?; - file_system::get_env(Some(env), limiter) + file_system::get_env(env, limiter) } diff --git a/proxy_components/engine_tiflash/src/misc.rs b/proxy_components/engine_tiflash/src/misc.rs index 29d665ce563..6a590b974c1 100644 --- a/proxy_components/engine_tiflash/src/misc.rs +++ b/proxy_components/engine_tiflash/src/misc.rs @@ -444,6 +444,11 @@ impl MiscExt for RocksEngine { .get(); Ok(n) } + + type DiskEngine = RocksEngine; + fn get_disk_engine(&self) -> &Self::DiskEngine { + self + } } #[cfg(test)] diff --git a/proxy_components/engine_tiflash/src/sst.rs b/proxy_components/engine_tiflash/src/sst.rs index e0d9d818b42..1030b7aa17f 100644 --- a/proxy_components/engine_tiflash/src/sst.rs +++ b/proxy_components/engine_tiflash/src/sst.rs @@ -2,21 +2,20 @@ use std::{path::PathBuf, sync::Arc}; -use engine_rocks::encryption::WrappedEncryptionKeyManager; +use ::encryption::DataKeyManager; use engine_traits::{ - EncryptionKeyManager, 
Error, ExternalSstFileInfo, IterOptions, Iterator, RefIterable, Result, - SstCompressionType, SstExt, SstMetaInfo, SstReader, SstWriter, SstWriterBuilder, CF_DEFAULT, + Error, ExternalSstFileInfo, IterOptions, Iterator, RefIterable, Result, SstCompressionType, + SstExt, SstReader, SstWriter, SstWriterBuilder, CF_DEFAULT, }; use fail::fail_point; -use kvproto::import_sstpb::SstMeta; +use file_system::get_io_rate_limiter; use rocksdb::{ rocksdb::supported_compression, ColumnFamilyOptions, DBCompressionType, DBIterator, Env, EnvOptions, ExternalSstFileInfo as RawExternalSstFileInfo, SequentialFile, SstFileReader, SstFileWriter, DB, }; -use tikv_util::box_err; -use crate::{engine::RocksEngine, options::RocksReadOptions, r2e}; +use crate::{engine::RocksEngine, get_env, options::RocksReadOptions, r2e}; impl SstExt for RocksEngine { type SstReader = RocksSstReader; @@ -29,19 +28,6 @@ pub struct RocksSstReader { } impl RocksSstReader { - pub fn sst_meta_info(&self, sst: SstMeta) -> SstMetaInfo { - let mut meta = SstMetaInfo { - total_kvs: 0, - total_bytes: 0, - meta: sst, - }; - self.inner.read_table_properties(|p| { - meta.total_kvs = p.num_entries(); - meta.total_bytes = p.raw_key_size() + p.raw_value_size(); - }); - meta - } - pub fn open_with_env(path: &str, env: Option>) -> Result { let mut cf_options = ColumnFamilyOptions::new(); if let Some(env) = env { @@ -62,20 +48,23 @@ impl RocksSstReader { } impl SstReader for RocksSstReader { - fn open(path: &str) -> Result { - Self::open_with_env(path, None) - } - fn open_encrypted(path: &str, mgr: Arc) -> Result { - let env = Env::new_key_managed_encrypted_env( - Arc::default(), - WrappedEncryptionKeyManager::new(mgr), - ) - .map_err(|err| Error::Other(box_err!("failed to open encrypted env: {}", err)))?; - Self::open_with_env(path, Some(Arc::new(env))) + fn open(path: &str, mgr: Option>) -> Result { + let env = get_env(mgr, get_io_rate_limiter())?; + Self::open_with_env(path, Some(env)) } + fn verify_checksum(&self) -> 
Result<()> { - self.inner.verify_checksum().map_err(r2e)?; - Ok(()) + self.inner.verify_checksum().map_err(r2e) + } + + fn kv_count_and_size(&self) -> (u64, u64) { + let mut count = 0; + let mut bytes = 0; + self.inner.read_table_properties(|p| { + count = p.num_entries(); + bytes = p.raw_key_size() + p.raw_value_size(); + }); + (count, bytes) } } @@ -386,7 +375,7 @@ mod tests { let mut writer = RocksSstWriterBuilder::new() .set_cf(CF_DEFAULT) .set_db(&engine) - .build(p.as_os_str().to_str().unwrap()) + .build(p.to_str().unwrap()) .unwrap(); writer.put(k, v).unwrap(); let sst_file = writer.finish().unwrap(); @@ -401,7 +390,7 @@ mod tests { .set_in_memory(true) .set_cf(CF_DEFAULT) .set_db(&engine) - .build(p.as_os_str().to_str().unwrap()) + .build(p.to_str().unwrap()) .unwrap(); writer.put(k, v).unwrap(); let mut buf = vec![]; diff --git a/proxy_components/engine_tiflash/src/sst_partitioner.rs b/proxy_components/engine_tiflash/src/sst_partitioner.rs index fc1dcd40270..f642a94f28f 100644 --- a/proxy_components/engine_tiflash/src/sst_partitioner.rs +++ b/proxy_components/engine_tiflash/src/sst_partitioner.rs @@ -23,6 +23,8 @@ impl rocksdb::SstPartitionerFactory output_level: context.output_level, smallest_key: context.smallest_key, largest_key: context.largest_key, + next_level_boundaries: context.next_level_boundaries.clone(), + next_level_sizes: context.next_level_sizes.clone(), }; self.0.create_partitioner(&ctx).map(RocksSstPartitioner) } diff --git a/proxy_components/mock-engine-store/Cargo.toml b/proxy_components/mock-engine-store/Cargo.toml index 0328f80f3fc..c12122ce471 100644 --- a/proxy_components/mock-engine-store/Cargo.toml +++ b/proxy_components/mock-engine-store/Cargo.toml @@ -31,6 +31,7 @@ test-engines-panic = [ "proxy_server/test-engines-panic", ] testexport = ["proxy_server/testexport"] +openssl-vendored = ["tikv/openssl-vendored", "openssl/vendored"] [dependencies] api_version = { workspace = true, default-features = false } @@ -40,6 +41,7 @@ 
collections = { workspace = true } concurrency_manager = { workspace = true, default-features = false } crossbeam = "0.8" encryption = { workspace = true, default-features = false } +openssl = { workspace = true } # TODO only for feature encryption_export = { workspace = true, default-features = false } engine_rocks = { workspace = true, default-features = false } engine_store_ffi = { workspace = true, default-features = false } diff --git a/proxy_components/mock-engine-store/src/mock_cluster/v1/cluster.rs b/proxy_components/mock-engine-store/src/mock_cluster/v1/cluster.rs index 00b71f7b71d..e2d62d4a9c6 100644 --- a/proxy_components/mock-engine-store/src/mock_cluster/v1/cluster.rs +++ b/proxy_components/mock-engine-store/src/mock_cluster/v1/cluster.rs @@ -10,6 +10,7 @@ use std::{ use collections::{HashMap, HashSet}; use encryption::DataKeyManager; +use engine_traits::SnapshotContext; // mock cluster use engine_traits::{Engines, KvEngine, CF_DEFAULT}; use file_system::IoRateLimiter; @@ -45,7 +46,7 @@ use test_pd_client::TestPdClient; use test_raftstore::{ is_error_response, make_cb, new_admin_request, new_delete_cmd, new_peer, new_put_cf_cmd, new_region_leader_cmd, new_request, new_status_request, new_store, new_tikv_config, - new_transfer_leader_cmd, sleep_ms, FilterFactory, + new_transfer_leader_cmd, sleep_ms, }; use tikv::server::Result as ServerResult; use tikv_util::{ @@ -57,7 +58,11 @@ use tikv_util::{ use tokio::sync::oneshot; use txn_types::WriteBatchFlags; -use super::{common::*, transport_simulate::Filter, util::*}; +use super::{ + common::*, + transport_simulate::{Filter, FilterFactory}, + util::*, +}; // We simulate 3 or 5 nodes, each has a store. 
// Sometimes, we use fixed id to test, which means the id @@ -113,23 +118,25 @@ pub trait Simulator { fn read( &mut self, + snap_ctx: Option, batch_id: Option, request: RaftCmdRequest, timeout: Duration, ) -> Result { let node_id = request.get_header().get_peer().get_store_id(); - let (cb, mut rx) = make_cb(&request); - self.async_read(node_id, batch_id, request, cb); + let (cb, mut rx) = make_cb::(&request); + self.async_read(snap_ctx, node_id, batch_id, request, cb); rx.recv_timeout(timeout) .map_err(|_| Error::Timeout(format!("request timeout for {:?}", timeout))) } fn async_read( &mut self, + snap_ctx: Option, node_id: u64, batch_id: Option, request: RaftCmdRequest, - cb: Callback, + cb: Callback, ); fn call_command_on_node( @@ -138,7 +145,7 @@ pub trait Simulator { request: RaftCmdRequest, timeout: Duration, ) -> Result { - let (cb, mut rx) = make_cb(&request); + let (cb, mut rx) = make_cb::(&request); match self.async_command_on_node(node_id, request, cb) { Ok(()) => {} @@ -397,7 +404,7 @@ impl> Cluster { } } let ret = if is_read { - self.sim.wl().read(None, request.clone(), timeout) + self.sim.wl().read(None, None, request.clone(), timeout) } else { self.sim.rl().call_command(request.clone(), timeout) }; @@ -946,6 +953,7 @@ impl> Cluster { split_keys: vec![split_key], callback: cb, source: "test".into(), + share_source_region_size: false, }, ) .unwrap(); diff --git a/proxy_components/mock-engine-store/src/mock_cluster/v1/mod.rs b/proxy_components/mock-engine-store/src/mock_cluster/v1/mod.rs index 89073a80970..c7a34bd593b 100644 --- a/proxy_components/mock-engine-store/src/mock_cluster/v1/mod.rs +++ b/proxy_components/mock-engine-store/src/mock_cluster/v1/mod.rs @@ -1,4 +1,5 @@ // Copyright 2023 TiKV Project Authors. Licensed under Apache-2.0. 
+#![allow(clippy::type_complexity)] pub mod cluster; pub mod cluster_ext_v1; diff --git a/proxy_components/mock-engine-store/src/mock_cluster/v1/node.rs b/proxy_components/mock-engine-store/src/mock_cluster/v1/node.rs index 5c94546666d..275d093a0f2 100644 --- a/proxy_components/mock-engine-store/src/mock_cluster/v1/node.rs +++ b/proxy_components/mock-engine-store/src/mock_cluster/v1/node.rs @@ -10,7 +10,7 @@ use concurrency_manager::ConcurrencyManager; use encryption_export::DataKeyManager; use engine_rocks::RocksSnapshot; use engine_store_ffi::core::DebugStruct; -use engine_traits::{Engines, MiscExt, Peekable}; +use engine_traits::{Engines, MiscExt, Peekable, SnapshotContext}; use kvproto::{ metapb, raft_cmdpb::*, @@ -56,7 +56,10 @@ use super::{ pub struct ChannelTransportCore { snap_paths: HashMap, - routers: HashMap>>, + routers: HashMap< + u64, + SimulateTransport, TiFlashEngine>, + >, } #[derive(Clone)] @@ -179,19 +182,18 @@ impl Transport for ChannelTransport { fn flush(&mut self) {} } -type SimulateChannelTransport = SimulateTransport; - +type SimulateChannelTransport = SimulateTransport; pub struct NodeCluster { trans: ChannelTransport, pd_client: Arc, nodes: HashMap>, snap_mgrs: HashMap, cfg_controller: Option, - simulate_trans: HashMap, + simulate_trans: HashMap>, concurrency_managers: HashMap, #[allow(clippy::type_complexity)] post_create_coprocessor_host: Option)>>, - pub importer: Option>, + pub importer: Option>>, } impl std::panic::UnwindSafe for NodeCluster {} @@ -217,7 +219,8 @@ impl NodeCluster { pub fn get_node_router( &self, node_id: u64, - ) -> SimulateTransport> { + ) -> SimulateTransport, TiFlashEngine> + { self.trans .core .lock() @@ -525,10 +528,11 @@ impl Simulator for NodeCluster { fn async_read( &mut self, + snap_ctx: Option, node_id: u64, batch_id: Option, request: RaftCmdRequest, - cb: Callback, + cb: Callback<::Snapshot>, ) { if !self .trans @@ -546,7 +550,7 @@ impl Simulator for NodeCluster { } let mut guard = 
self.trans.core.lock().unwrap(); let router = guard.routers.get_mut(&node_id).unwrap(); - router.read(batch_id, request, cb).unwrap(); + router.read(snap_ctx, batch_id, request, cb).unwrap(); } fn send_raft_msg(&mut self, msg: raft_serverpb::RaftMessage) -> Result<()> { diff --git a/proxy_components/mock-engine-store/src/mock_cluster/v1/server.rs b/proxy_components/mock-engine-store/src/mock_cluster/v1/server.rs index ca45b6e7e97..1d305d9ac2b 100644 --- a/proxy_components/mock-engine-store/src/mock_cluster/v1/server.rs +++ b/proxy_components/mock-engine-store/src/mock_cluster/v1/server.rs @@ -15,7 +15,7 @@ use concurrency_manager::ConcurrencyManager; use encryption_export::DataKeyManager; use engine_rocks::RocksSnapshot; use engine_store_ffi::core::DebugStruct; -use engine_traits::{Engines, MiscExt}; +use engine_traits::{Engines, MiscExt, SnapshotContext}; use futures::executor::block_on; use grpcio::{ChannelBuilder, EnvBuilder, Environment, Error as GrpcError, Service}; use grpcio_health::HealthService; @@ -83,12 +83,13 @@ use transport_simulate::SimulateTransport; use txn_types::TxnExtraScheduler; use super::{common::*, Cluster, Simulator, *}; +use crate::mock_cluster::v1::transport_simulate::Filter; type SimulateStoreTransport = - SimulateTransport>; + SimulateTransport, TiFlashEngine>; type SimulateRaftExtension = ::RaftExtension; type SimulateServerTransport = - SimulateTransport>; + SimulateTransport, TiFlashEngine>; pub type SimulateEngine = RaftKv; @@ -112,8 +113,8 @@ impl StoreAddrResolver for AddressMap { fn resolve( &self, store_id: u64, - cb: Box) + Send>, - ) -> ServerResult<()> { + cb: Box) + Send>, + ) -> resolve::Result<()> { let addr = self.get(store_id); match addr { Some(addr) => cb(Ok(addr)), @@ -146,7 +147,7 @@ pub struct ServerCluster { addrs: AddressMap, pub storages: HashMap, pub region_info_accessors: HashMap, - pub importers: HashMap>, + pub importers: HashMap>>, pub pending_services: HashMap, pub coprocessor_hooks: HashMap, pub 
health_services: HashMap, @@ -396,6 +397,7 @@ impl ServerCluster { Arc::clone(&importer), None, None, // TODO resource_ctl + Arc::new(region_info_accessor.clone()), ); let check_leader_runner = @@ -724,10 +726,11 @@ impl Simulator for ServerCluster { fn async_read( &mut self, + snap_ctx: Option, node_id: u64, batch_id: Option, request: RaftCmdRequest, - cb: Callback, + cb: Callback<::Snapshot>, ) { match self.metas.get_mut(&node_id) { None => { @@ -737,7 +740,9 @@ impl Simulator for ServerCluster { cb.invoke_with_response(resp); } Some(meta) => { - meta.sim_router.read(batch_id, request, cb).unwrap(); + meta.sim_router + .read(snap_ctx, batch_id, request, cb) + .unwrap(); } }; } @@ -782,26 +787,13 @@ impl Simulator for ServerCluster { self.call_command_on_node(node_id, request, timeout) } - fn read( - &mut self, - batch_id: Option, - request: RaftCmdRequest, - timeout: Duration, - ) -> Result { - let node_id = request.get_header().get_peer().get_store_id(); - let (cb, mut rx) = test_raftstore::make_cb(&request); - self.async_read(node_id, batch_id, request, cb); - rx.recv_timeout(timeout) - .map_err(|_| RaftError::Timeout(format!("request timeout for {:?}", timeout))) - } - fn call_command_on_node( &self, node_id: u64, request: RaftCmdRequest, timeout: Duration, ) -> Result { - let (cb, mut rx) = test_raftstore::make_cb(&request); + let (cb, mut rx) = test_raftstore::make_cb::(&request); match self.async_command_on_node(node_id, request, cb) { Ok(()) => {} @@ -815,11 +807,11 @@ impl Simulator for ServerCluster { .map_err(|e| RaftError::Timeout(format!("request timeout for {:?}: {:?}", timeout, e))) } - fn add_send_filter(&mut self, _node_id: u64, _filter: Box) { + fn add_send_filter(&mut self, _node_id: u64, _filter: Box) { todo!() } - fn add_recv_filter(&mut self, _node_id: u64, _filter: Box) { + fn add_recv_filter(&mut self, _node_id: u64, _filter: Box) { todo!() } } diff --git a/proxy_components/mock-engine-store/src/mock_cluster/v1/transport_simulate.rs 
b/proxy_components/mock-engine-store/src/mock_cluster/v1/transport_simulate.rs index 939972ed23b..5bab509d221 100644 --- a/proxy_components/mock-engine-store/src/mock_cluster/v1/transport_simulate.rs +++ b/proxy_components/mock-engine-store/src/mock_cluster/v1/transport_simulate.rs @@ -1,4 +1,4 @@ -// Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. +// Copyright 2016 TiKV Project Authors. Licensed under Apache-2.0. use std::{ marker::PhantomData, @@ -11,7 +11,7 @@ use std::{ use collections::{HashMap, HashSet}; use crossbeam::channel::TrySendError; -use engine_rocks::RocksSnapshot; +use engine_traits::{KvEngine, SnapshotContext}; use kvproto::{raft_cmdpb::RaftCmdRequest, raft_serverpb::RaftMessage}; use raft::eraftpb::MessageType; use raftstore::{ @@ -22,12 +22,8 @@ use raftstore::{ }, DiscardReason, Error, Result as RaftStoreResult, Result, }; -// Exported for v2_compat tests. -pub use test_raftstore::FilterFactory; use tikv_util::{error, time::ThreadReadId, Either, HandyRwLock}; -use super::common::*; - pub fn check_messages(msgs: &[RaftMessage]) -> Result<()> { if msgs.is_empty() { Err(Error::Transport(DiscardReason::Filtered)) @@ -36,7 +32,15 @@ pub fn check_messages(msgs: &[RaftMessage]) -> Result<()> { } } -pub use test_raftstore::Filter; +pub trait Filter: Send + Sync { + /// `before` is run before sending the messages. + fn before(&self, msgs: &mut Vec) -> Result<()>; + /// `after` is run after sending the messages, + /// so that the returned value could be changed if necessary. + fn after(&self, res: Result<()>) -> Result<()> { + res + } +} /// Emits a notification for each given message type that it sees. 
#[allow(dead_code)] @@ -136,16 +140,19 @@ impl Filter for DelayFilter { } #[derive(Clone)] -pub struct SimulateTransport { +pub struct SimulateTransport { filters: Arc>>>, ch: C, + + _p: PhantomData, } -impl SimulateTransport { - pub fn new(ch: C) -> SimulateTransport { +impl SimulateTransport { + pub fn new(ch: C) -> SimulateTransport { SimulateTransport { filters: Arc::new(RwLock::new(vec![])), ch, + _p: PhantomData, } } @@ -158,8 +165,7 @@ impl SimulateTransport { } } -#[allow(clippy::significant_drop_in_scrutinee)] -fn filter_send( +pub fn filter_send( filters: &Arc>>>, msg: RaftMessage, mut h: H, @@ -186,14 +192,13 @@ where } } } - let l = filters[..taken].iter().rev(); - for filter in l { + for filter in filters[..taken].iter().rev() { res = filter.after(res); } res } -impl Transport for SimulateTransport { +impl Transport for SimulateTransport { fn send(&mut self, m: RaftMessage) -> Result<()> { let ch = &mut self.ch; filter_send(&self.filters, m, |m| ch.send(m)) @@ -212,49 +217,52 @@ impl Transport for SimulateTransport { } } -impl> StoreRouter for SimulateTransport { - fn send(&self, msg: StoreMsg) -> Result<()> { +impl> StoreRouter for SimulateTransport { + fn send(&self, msg: StoreMsg) -> Result<()> { StoreRouter::send(&self.ch, msg) } } -impl> ProposalRouter for SimulateTransport { +impl> ProposalRouter<::Snapshot> + for SimulateTransport +{ fn send( &self, - cmd: RaftCommand, - ) -> std::result::Result<(), TrySendError>> { - ProposalRouter::::send(&self.ch, cmd) + cmd: RaftCommand<::Snapshot>, + ) -> std::result::Result<(), TrySendError::Snapshot>>> { + ProposalRouter::<::Snapshot>::send(&self.ch, cmd) } } -impl> CasualRouter for SimulateTransport { - fn send(&self, region_id: u64, msg: CasualMessage) -> Result<()> { - CasualRouter::::send(&self.ch, region_id, msg) +impl> CasualRouter for SimulateTransport { + fn send(&self, region_id: u64, msg: CasualMessage) -> Result<()> { + CasualRouter::::send(&self.ch, region_id, msg) } } -impl> SignificantRouter 
for SimulateTransport { - fn significant_send(&self, region_id: u64, msg: SignificantMsg) -> Result<()> { +impl> SignificantRouter for SimulateTransport { + fn significant_send(&self, region_id: u64, msg: SignificantMsg) -> Result<()> { self.ch.significant_send(region_id, msg) } } -impl> RaftStoreRouter for SimulateTransport { +impl> RaftStoreRouter for SimulateTransport { fn send_raft_msg(&self, msg: RaftMessage) -> Result<()> { filter_send(&self.filters, msg, |m| self.ch.send_raft_msg(m)) } - fn broadcast_normal(&self, _: impl FnMut() -> PeerMsg) {} + fn broadcast_normal(&self, _: impl FnMut() -> PeerMsg) {} } -impl> LocalReadRouter for SimulateTransport { +impl> LocalReadRouter for SimulateTransport { fn read( &mut self, + snap_ctx: Option, read_id: Option, req: RaftCmdRequest, - cb: Callback, + cb: Callback, ) -> RaftStoreResult<()> { - self.ch.read(read_id, req, cb) + self.ch.read(snap_ctx, read_id, req, cb) } fn release_snapshot_cache(&mut self) { @@ -262,9 +270,9 @@ impl> LocalReadRouter for Simul } } -// pub trait FilterFactory { -// fn generate(&self, node_id: u64) -> Vec>; -// } +pub trait FilterFactory { + fn generate(&self, node_id: u64) -> Vec>; +} #[derive(Default)] pub struct DefaultFilterFactory(PhantomData); @@ -374,7 +382,6 @@ pub struct RegionPacketFilter { drop_type: Vec, skip_type: Vec, dropped_messages: Option>>>, - #[allow(clippy::type_complexity)] msg_callback: Option>, } @@ -510,7 +517,7 @@ impl Filter for SnapshotFilter { /// simultaneous delivery of multiple snapshots from different peers. It /// collects the snapshots from different peers and drop the subsequent /// snapshots from the same peers. Currently, if there are more than 1 snapshots -/// in this filter, all the snapshots will be dilivered at once. +/// in this filter, all the snapshots will be delivered at once. 
pub struct CollectSnapshotFilter { dropped: AtomicBool, stale: AtomicBool, @@ -777,7 +784,6 @@ impl RandomLatencyFilter { } impl Filter for RandomLatencyFilter { - #[allow(clippy::significant_drop_in_scrutinee)] fn before(&self, msgs: &mut Vec) -> Result<()> { let mut to_send = vec![]; let mut to_delay = vec![]; @@ -831,18 +837,18 @@ impl Filter for LeaseReadFilter { #[derive(Clone)] pub struct DropMessageFilter { - ty: MessageType, + retain: Arc bool + Sync + Send>, } impl DropMessageFilter { - pub fn new(ty: MessageType) -> DropMessageFilter { - DropMessageFilter { ty } + pub fn new(retain: Arc bool + Sync + Send>) -> DropMessageFilter { + DropMessageFilter { retain } } } impl Filter for DropMessageFilter { fn before(&self, msgs: &mut Vec) -> Result<()> { - msgs.retain(|m| m.get_message().get_msg_type() != self.ty); + msgs.retain(|m| (self.retain)(m)); Ok(()) } } diff --git a/proxy_components/mock-engine-store/src/mock_cluster/v1/util.rs b/proxy_components/mock-engine-store/src/mock_cluster/v1/util.rs index 7b0ee683079..49bc2c45296 100644 --- a/proxy_components/mock-engine-store/src/mock_cluster/v1/util.rs +++ b/proxy_components/mock-engine-store/src/mock_cluster/v1/util.rs @@ -75,7 +75,7 @@ pub fn create_tiflash_test_engine( let kv_path_str = kv_path.to_str().unwrap(); let kv_db_opt = cfg.rocksdb.build_opt( - &cfg.rocksdb.build_resources(env.clone()), + &cfg.rocksdb.build_resources(env.clone(), cfg.storage.engine), cfg.storage.engine, ); diff --git a/proxy_components/proxy_ffi/Cargo.toml b/proxy_components/proxy_ffi/Cargo.toml index 6b32f62c7f8..0b49155e49e 100644 --- a/proxy_components/proxy_ffi/Cargo.toml +++ b/proxy_components/proxy_ffi/Cargo.toml @@ -23,8 +23,14 @@ test-engines-panic = [ "engine_test/test-engines-panic", ] +# TODO use encryption/openssl-vendored if later supports +openssl-vendored = [ + "openssl/vendored" +] + [dependencies] encryption = { workspace = true, default-features = false } +openssl = { workspace = true } # TODO only for feature 
engine_rocks = { workspace = true, default-features = false } engine_traits = { workspace = true, default-features = false } engine_test = { workspace = true, default-features = false } diff --git a/proxy_components/proxy_ffi/src/encryption_impls.rs b/proxy_components/proxy_ffi/src/encryption_impls.rs index 4abed27d62e..d45f4e453ad 100644 --- a/proxy_components/proxy_ffi/src/encryption_impls.rs +++ b/proxy_components/proxy_ffi/src/encryption_impls.rs @@ -1,6 +1,7 @@ // Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. -use engine_traits::{EncryptionKeyManager, EncryptionMethod, FileEncryptionInfo}; +use encryption::FileEncryptionInfo; +use kvproto::encryptionpb::EncryptionMethod; use super::{ get_engine_store_server_helper, diff --git a/proxy_components/proxy_server/Cargo.toml b/proxy_components/proxy_server/Cargo.toml index c9cf2bfca0f..d843816c169 100644 --- a/proxy_components/proxy_server/Cargo.toml +++ b/proxy_components/proxy_server/Cargo.toml @@ -34,6 +34,7 @@ nortcheck = ["engine_rocks/nortcheck", "engine_tiflash/nortcheck"] backup-stream-debug = ["backup-stream/backup-stream-debug"] testexport = ["engine_tiflash/testexport", "engine_store_ffi/testexport", "tikv/testexport"] pprof-fp = ["tikv/pprof-fp"] +openssl-vendored = ["tikv/openssl-vendored", "openssl/vendored"] [dependencies] api_version = { workspace = true } @@ -74,7 +75,7 @@ mime = "0.3.13" nix = "0.23" online_config = { workspace = true } -openssl = "0.10" +openssl = { workspace = true } pd_client = { workspace = true, default-features = false } pin-project = "1.0" pprof = { version = "0.11", default-features = false, features = ["flamegraph", "protobuf-codec", "cpp"] } diff --git a/proxy_components/proxy_server/src/common_override.rs b/proxy_components/proxy_server/src/common_override.rs index a5acb7dbc4a..19820f0948a 100644 --- a/proxy_components/proxy_server/src/common_override.rs +++ b/proxy_components/proxy_server/src/common_override.rs @@ -96,7 +96,11 @@ impl 
ConfiguredRaftEngine for engine_rocks::RocksEngine { fn register_config(&self, cfg_controller: &mut ConfigController) { cfg_controller.register( tikv::config::Module::Raftdb, - Box::new(DbConfigManger::new(self.clone(), DbType::Raft)), + Box::new(DbConfigManger::new( + cfg_controller.get_current().rocksdb, + self.clone(), + DbType::Raft, + )), ); } } @@ -146,8 +150,9 @@ impl ConfiguredRaftEngine for PSLogEngine { _key_manager: &Option>, _block_cache: &Cache, ) -> (Self, Option>) { - // create a dummy file in raft engine dir to pass initial config check - let raft_engine_path = _config.raft_engine.config().dir + "/ps_engine"; + // Create a dummy file in raft engine dir to pass initial config check + // See raftengine_exists. + let raft_engine_path = _config.raft_engine.config().dir + "/ps_engine.raftlog"; let path = Path::new(&raft_engine_path); if !path.exists() { File::create(path).unwrap(); diff --git a/proxy_components/proxy_server/src/engine.rs b/proxy_components/proxy_server/src/engine.rs index 0f9d7d72bd9..14ba9b9ccda 100644 --- a/proxy_components/proxy_server/src/engine.rs +++ b/proxy_components/proxy_server/src/engine.rs @@ -70,6 +70,11 @@ impl ConfigurableDb for ProxyRocksEngine { opt.set_flush_size(f).map_err(Box::from) } + fn set_cf_flush_size(&self, cf: &str, f: usize) -> ConfigRes { + let mut cf_option = self.inner.get_options_cf(cf)?; + cf_option.set_flush_size(f).map_err(Box::from) + } + fn set_flush_oldest_first(&self, f: bool) -> ConfigRes { let mut opt = self.get_db_options(); opt.set_flush_oldest_first(f).map_err(Box::from) diff --git a/proxy_components/proxy_server/src/lib.rs b/proxy_components/proxy_server/src/lib.rs index 6317fd1eeb5..dd6d2d6a5b5 100644 --- a/proxy_components/proxy_server/src/lib.rs +++ b/proxy_components/proxy_server/src/lib.rs @@ -1,5 +1,6 @@ // Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. 
#![allow(incomplete_features)] +#![feature(stmt_expr_attributes)] #![recursion_limit = "256"] #![feature(specialization)] diff --git a/proxy_components/proxy_server/src/run.rs b/proxy_components/proxy_server/src/run.rs index 8cdd359e05b..2aa90faaea9 100644 --- a/proxy_components/proxy_server/src/run.rs +++ b/proxy_components/proxy_server/src/run.rs @@ -517,7 +517,11 @@ impl TiKvServer { let cfg_controller = self.cfg_controller.as_mut().unwrap(); cfg_controller.register( tikv::config::Module::Rocksdb, - Box::new(DbConfigManger::new(proxy_rocks_engine, DbType::Kv)), + Box::new(DbConfigManger::new( + cfg_controller.get_current().rocksdb, + proxy_rocks_engine, + DbType::Kv, + )), ); let reg = TabletRegistry::new( @@ -580,7 +584,7 @@ struct Servers { lock_mgr: LockManager, server: LocalServer, node: Node, - importer: Arc, + importer: Arc>, debugger: DebuggerImpl>, LockManager, F>, } @@ -867,6 +871,7 @@ impl TiKvServer { engines.engine.clone(), resource_ctl, CleanupMethod::Remote(self.core.background_worker.remote()), + true, )) } else { None @@ -1370,7 +1375,9 @@ impl TiKvServer { servers.importer.clone(), None, self.resource_manager.clone(), + Arc::new(self.region_info_accessor.clone()), ); + if servers .server .register_service(create_import_sst(import_service)) diff --git a/proxy_components/proxy_server/src/setup.rs b/proxy_components/proxy_server/src/setup.rs index 606ce2c0243..78792ead118 100644 --- a/proxy_components/proxy_server/src/setup.rs +++ b/proxy_components/proxy_server/src/setup.rs @@ -1,5 +1,4 @@ // Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. 
- use std::borrow::ToOwned; use clap::ArgMatches; @@ -37,11 +36,15 @@ pub fn overwrite_config_with_cmd_args( println!("arg matches is {:?}", matches); if let Some(level) = matches.value_of("log-level") { config.log.level = logger::get_level_by_string(level).unwrap().into(); + // For backward compatibility + #[allow(deprecated)] config.log_level = slog::Level::Info.into(); } if let Some(file) = matches.value_of("log-file") { config.log.file.filename = file.to_owned(); + // For backward compatibility + #[allow(deprecated)] config.log_file = "".to_owned(); } diff --git a/proxy_components/proxy_server/src/status_server.rs b/proxy_components/proxy_server/src/status_server/mod.rs similarity index 99% rename from proxy_components/proxy_server/src/status_server.rs rename to proxy_components/proxy_server/src/status_server/mod.rs index f451a637791..6542ffcf8ef 100644 --- a/proxy_components/proxy_server/src/status_server.rs +++ b/proxy_components/proxy_server/src/status_server/mod.rs @@ -1,5 +1,7 @@ // Copyright 2018 TiKV Project Authors. Licensed under Apache-2.0.
+mod profile; + use std::{ error::Error as StdError, marker::PhantomData, @@ -37,19 +39,17 @@ use openssl::{ x509::X509, }; use pin_project::pin_project; +use profile::{ + activate_heap_profile, deactivate_heap_profile, jeprof_heap_profile, list_heap_profiles, + read_file, start_one_cpu_profile, start_one_heap_profile, +}; use raftstore::store::{transport::CasualRouter, CasualMessage}; use regex::Regex; use security::{self, SecurityConfig}; use serde_json::Value; use tikv::{ config::{ConfigController, LogLevel}, - server::{ - status_server::{ - activate_heap_profile, deactivate_heap_profile, jeprof_heap_profile, - list_heap_profiles, read_file, start_one_cpu_profile, start_one_heap_profile, - }, - Result, - }, + server::Result, }; use tikv_util::{ error, logger::set_log_level, metrics::dump, sys::thread::ThreadBuildWrapper, diff --git a/proxy_components/proxy_server/src/status_server/profile.rs b/proxy_components/proxy_server/src/status_server/profile.rs new file mode 100644 index 00000000000..b3d91d3bea6 --- /dev/null +++ b/proxy_components/proxy_server/src/status_server/profile.rs @@ -0,0 +1,459 @@ +// Copyright 2021 TiKV Project Authors. Licensed under Apache-2.0. +use std::{ + fs::{File, Metadata}, + io::Read, + path::PathBuf, + pin::Pin, + process::Command, + sync::Mutex as StdMutex, + time::{Duration, UNIX_EPOCH}, +}; + +use chrono::{offset::Local, DateTime}; +use futures::{ + channel::oneshot::{self, Sender}, + future::BoxFuture, + select, + task::{Context, Poll}, + Future, FutureExt, Stream, StreamExt, +}; +use lazy_static::lazy_static; +use pprof::protos::Message; +use regex::Regex; +use tempfile::{NamedTempFile, TempDir}; +#[cfg(not(test))] +use tikv_alloc::{activate_prof, deactivate_prof, dump_prof}; +use tokio::sync::{Mutex, MutexGuard}; + +#[cfg(test)] +pub use self::test_utils::TEST_PROFILE_MUTEX; +#[cfg(test)] +use self::test_utils::{activate_prof, deactivate_prof, dump_prof}; + +// File name suffix for periodically dumped heap profiles. 
+const HEAP_PROFILE_SUFFIX: &str = ".heap"; + +lazy_static! { + // If it's locked it means there is already a heap or CPU profiling in progress. + static ref PROFILE_MUTEX: Mutex<()> = Mutex::new(()); + // The channel is used to deactivate a profiling. + static ref PROFILE_ACTIVE: StdMutex, TempDir)>> = StdMutex::new(None); + + // To normalize thread names. + static ref THREAD_NAME_RE: Regex = + Regex::new(r"^(?P[a-z-_ :]+?)(-?\d)*$").unwrap(); + static ref THREAD_NAME_REPLACE_SEPERATOR_RE: Regex = Regex::new(r"[_ ]").unwrap(); +} + +type OnEndFn = Box Result + Send + 'static>; + +struct ProfileGuard<'a, I, T> { + _guard: MutexGuard<'a, ()>, + item: Option, + on_end: Option>, + end: BoxFuture<'static, Result<(), String>>, +} + +impl<'a, I, T> Unpin for ProfileGuard<'a, I, T> {} + +impl<'a, I, T> ProfileGuard<'a, I, T> { + fn new( + on_start: F1, + on_end: F2, + end: BoxFuture<'static, Result<(), String>>, + ) -> Result, String> + where + F1: FnOnce() -> Result, + F2: FnOnce(I) -> Result + Send + 'static, + { + let _guard = match PROFILE_MUTEX.try_lock() { + Ok(guard) => guard, + _ => return Err("Already in Profiling".to_owned()), + }; + let item = on_start()?; + Ok(ProfileGuard { + _guard, + item: Some(item), + on_end: Some(Box::new(on_end) as OnEndFn), + end, + }) + } +} + +impl<'a, I, T> Future for ProfileGuard<'a, I, T> { + type Output = Result; + fn poll(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll { + match self.end.as_mut().poll(cx) { + Poll::Ready(res) => { + let item = self.item.take().unwrap(); + let on_end = self.on_end.take().unwrap(); + let r = match (res, on_end(item)) { + (Ok(_), r) => r, + (Err(errmsg), _) => Err(errmsg), + }; + Poll::Ready(r) + } + Poll::Pending => Poll::Pending, + } + } +} + +/// Trigger a heap profile and return the content.
+#[allow(dead_code)] +pub async fn start_one_heap_profile(end: F, use_jeprof: bool) -> Result, String> +where + F: Future> + Send + 'static, +{ + let on_start = || activate_prof().map_err(|e| format!("activate_prof: {}", e)); + + let on_end = move |_| { + deactivate_prof().map_err(|e| format!("deactivate_prof: {}", e))?; + let f = NamedTempFile::new().map_err(|e| format!("create tmp file fail: {}", e))?; + let path = f.path().to_str().unwrap(); + dump_prof(path).map_err(|e| format!("dump_prof: {}", e))?; + if use_jeprof { + jeprof_heap_profile(path) + } else { + read_file(path) + } + }; + + ProfileGuard::new(on_start, on_end, end.boxed())?.await +} + +/// Activate heap profile and call `callback` if successful. +/// `deactivate_heap_profile` can only be called after it's notified from +/// `callback`. +pub async fn activate_heap_profile( + dump_period: S, + store_path: PathBuf, + callback: F, +) -> Result<(), String> +where + S: Stream> + Send + Unpin + 'static, + F: FnOnce() + Send + 'static, +{ + let (tx, rx) = oneshot::channel(); + let dir = tempfile::Builder::new() + .prefix("heap-") + .tempdir_in(store_path) + .map_err(|e| format!("create temp directory: {}", e))?; + let dir_path = dir.path().to_str().unwrap().to_owned(); + + let on_start = move || { + let mut activate = PROFILE_ACTIVE.lock().unwrap(); + assert!(activate.is_none()); + activate_prof().map_err(|e| format!("activate_prof: {}", e))?; + *activate = Some((tx, dir)); + callback(); + info!("periodical heap profiling is started"); + Ok(()) + }; + + let on_end = |_| { + deactivate_heap_profile(); + deactivate_prof().map_err(|e| format!("deactivate_prof: {}", e)) + }; + + let end = async move { + select!
{ + _ = rx.fuse() => { + info!("periodical heap profiling is canceled"); + Ok(()) + }, + res = dump_heap_profile_periodically(dump_period, dir_path).fuse() => { + warn!("the heap profiling dump loop shouldn't break"; "res" => ?res); + res + } + } + }; + + ProfileGuard::new(on_start, on_end, end.boxed())?.await +} + +/// Deactivate heap profile. Return `false` if it hasn't been activated. +pub fn deactivate_heap_profile() -> bool { + let mut activate = PROFILE_ACTIVE.lock().unwrap(); + activate.take().is_some() +} + +/// Trigger one cpu profile. +pub async fn start_one_cpu_profile( + end: F, + frequency: i32, + protobuf: bool, +) -> Result, String> +where + F: Future> + Send + 'static, +{ + let on_start = || { + let guard = pprof::ProfilerGuardBuilder::default() + .frequency(frequency) + .blocklist(&["libc", "libgcc", "pthread", "vdso"]) + .build() + .map_err(|e| format!("pprof::ProfilerGuardBuilder::build fail: {}", e))?; + Ok(guard) + }; + + let on_end = move |guard: pprof::ProfilerGuard<'static>| { + let report = guard + .report() + .frames_post_processor(move |frames| { + let name = extract_thread_name(&frames.thread_name); + frames.thread_name = name; + }) + .build() + .map_err(|e| format!("create cpu profiling report fail: {}", e))?; + let mut body = Vec::new(); + if protobuf { + let profile = report + .pprof() + .map_err(|e| format!("generate pprof from report fail: {}", e))?; + profile + .write_to_vec(&mut body) + .map_err(|e| format!("encode pprof into bytes fail: {}", e))?; + } else { + report + .flamegraph(&mut body) + .map_err(|e| format!("generate flamegraph from report fail: {}", e))?; + } + Ok(body) + }; + + ProfileGuard::new(on_start, on_end, end.boxed())?.await +} + +pub fn read_file(path: &str) -> Result, String> { + let mut f = File::open(path).map_err(|e| format!("open {} fail: {}", path, e))?; + let mut buf = Vec::new(); + f.read_to_end(&mut buf) + .map_err(|e| format!("read {} fail: {}", path, e))?; + Ok(buf) +} + +pub fn 
jeprof_heap_profile(path: &str) -> Result, String> { + info!("using jeprof to process {}", path); + let output = Command::new("./jeprof") + .args(["--show_bytes", "./bin/tikv-server", path, "--svg"]) + .output() + .map_err(|e| format!("jeprof: {}", e))?; + if !output.status.success() { + let stderr = std::str::from_utf8(&output.stderr).unwrap_or("invalid utf8"); + return Err(format!("jeprof stderr: {:?}", stderr)); + } + Ok(output.stdout) +} + +pub fn list_heap_profiles() -> Result, String> { + let path = match &*PROFILE_ACTIVE.lock().unwrap() { + Some((_, ref dir)) => dir.path().to_str().unwrap().to_owned(), + None => return Ok(vec![]), + }; + + let dir = std::fs::read_dir(path).map_err(|e| format!("read dir fail: {}", e))?; + let mut profiles = Vec::new(); + for item in dir { + let item = match item { + Ok(x) => x, + _ => continue, + }; + let f = item.path().to_str().unwrap().to_owned(); + if !f.ends_with(HEAP_PROFILE_SUFFIX) { + continue; + } + let ct = item.metadata().map(|x| last_change_epoch(&x)).unwrap(); + let dt = DateTime::::from(UNIX_EPOCH + Duration::from_secs(ct)); + profiles.push((f, dt.format("%Y-%m-%d %H:%M:%S").to_string())); + } + + // Reverse sort them. 
+ profiles.sort_by(|x, y| y.1.cmp(&x.1)); + info!("list_heap_profiles gets {} items", profiles.len()); + Ok(profiles) +} + +async fn dump_heap_profile_periodically(mut period: S, dir: String) -> Result<(), String> +where + S: Stream> + Send + Unpin + 'static, +{ + let mut id = 0; + while let Some(res) = period.next().await { + res?; + id += 1; + let path = format!("{}/{:0>6}{}", dir, id, HEAP_PROFILE_SUFFIX); + dump_prof(&path).map_err(|e| format!("dump_prof: {}", e))?; + info!("a heap profile is dumped to {}", path); + } + Ok(()) +} + +fn extract_thread_name(thread_name: &str) -> String { + THREAD_NAME_RE + .captures(thread_name) + .and_then(|cap| { + cap.name("thread_name").map(|thread_name| { + THREAD_NAME_REPLACE_SEPERATOR_RE + .replace_all(thread_name.as_str(), "-") + .into_owned() + }) + }) + .unwrap_or_else(|| thread_name.to_owned()) +} + +// Re-define some heap profiling functions because heap-profiling is not enabled +// for tests. +#[cfg(test)] +mod test_utils { + use std::sync::Mutex; + + use tikv_alloc::error::ProfResult; + + lazy_static! 
{ + pub static ref TEST_PROFILE_MUTEX: Mutex<()> = Mutex::new(()); + } + + pub fn activate_prof() -> ProfResult<()> { + Ok(()) + } + pub fn deactivate_prof() -> ProfResult<()> { + Ok(()) + } + pub fn dump_prof(_: &str) -> ProfResult<()> { + Ok(()) + } +} + +#[cfg(unix)] +fn last_change_epoch(metadata: &Metadata) -> u64 { + use std::os::unix::fs::MetadataExt; + metadata.ctime() as u64 +} + +#[cfg(not(unix))] +fn last_change_epoch(metadata: &Metadata) -> u64 { + 0 +} + +#[cfg(test)] +mod tests { + use std::sync::mpsc::sync_channel; + + use futures::{channel::mpsc, executor::block_on, SinkExt}; + use tokio::runtime; + + use super::*; + + #[test] + fn test_last_change_epoch() { + let f = tempfile::tempfile().unwrap(); + assert!(last_change_epoch(&f.metadata().unwrap()) > 0); + } + + #[test] + fn test_extract_thread_name() { + assert_eq!(&extract_thread_name("test-name-1"), "test-name"); + assert_eq!(&extract_thread_name("grpc-server-5"), "grpc-server"); + assert_eq!(&extract_thread_name("rocksdb:bg1000"), "rocksdb:bg"); + assert_eq!(&extract_thread_name("raftstore-1-100"), "raftstore"); + assert_eq!(&extract_thread_name("snap sender1000"), "snap-sender"); + assert_eq!(&extract_thread_name("snap_sender1000"), "snap-sender"); + } + + // Test there is at most 1 concurrent profiling. 
+ #[test] + fn test_profile_guard_concurrency() { + use std::{thread, time::Duration}; + + use futures::{channel::oneshot, TryFutureExt}; + + let _test_guard = TEST_PROFILE_MUTEX.lock().unwrap(); + let rt = runtime::Builder::new_multi_thread() + .worker_threads(4) + .build() + .unwrap(); + + let expected = "Already in Profiling"; + + let (tx1, rx1) = oneshot::channel(); + let rx1 = rx1.map_err(|_| "channel canceled".to_owned()); + let res1 = rt.spawn(start_one_cpu_profile(rx1, 99, false)); + thread::sleep(Duration::from_millis(100)); + + let (_tx2, rx2) = oneshot::channel(); + let rx2 = rx2.map_err(|_| "channel canceled".to_owned()); + let res2 = rt.spawn(start_one_cpu_profile(rx2, 99, false)); + assert_eq!(block_on(res2).unwrap().unwrap_err(), expected); + + let (_tx2, rx2) = oneshot::channel(); + let rx2 = rx2.map_err(|_| "channel canceled".to_owned()); + let res2 = rt.spawn(start_one_heap_profile(rx2, false)); + assert_eq!(block_on(res2).unwrap().unwrap_err(), expected); + + let (_tx2, rx2) = mpsc::channel(1); + let res2 = rt.spawn(activate_heap_profile(rx2, std::env::temp_dir(), || {})); + assert_eq!(block_on(res2).unwrap().unwrap_err(), expected); + + drop(tx1); + block_on(res1).unwrap().unwrap_err(); + } + + #[test] + fn test_profile_guard_toggle() { + let _test_guard = TEST_PROFILE_MUTEX.lock().unwrap(); + let rt = runtime::Builder::new_multi_thread() + .worker_threads(4) + .build() + .unwrap(); + + // Test activated profiling can be stopped by canceling the period stream. + let (tx, rx) = mpsc::channel(1); + let res = rt.spawn(activate_heap_profile(rx, std::env::temp_dir(), || {})); + drop(tx); + block_on(res).unwrap().unwrap(); + + // Test activated profiling can be stopped by the handle. 
+ let (tx, rx) = sync_channel::(1); + let on_activated = move || drop(tx); + let check_activated = move || rx.recv().is_err(); + + let (_tx, _rx) = mpsc::channel(1); + let res = rt.spawn(activate_heap_profile( + _rx, + std::env::temp_dir(), + on_activated, + )); + assert!(check_activated()); + assert!(deactivate_heap_profile()); + block_on(res).unwrap().unwrap(); + } + + #[test] + fn test_heap_profile_exit() { + let _test_guard = TEST_PROFILE_MUTEX.lock().unwrap(); + let rt = runtime::Builder::new_multi_thread() + .worker_threads(4) + .build() + .unwrap(); + + // Test heap profiling can be stopped by sending an error. + let (mut tx, rx) = mpsc::channel(1); + let res = rt.spawn(activate_heap_profile(rx, std::env::temp_dir(), || {})); + block_on(tx.send(Err("test".to_string()))).unwrap(); + block_on(res).unwrap().unwrap_err(); + + // Test heap profiling can be activated again. + let (tx, rx) = sync_channel::(1); + let on_activated = move || drop(tx); + let check_activated = move || rx.recv().is_err(); + + let (_tx, _rx) = mpsc::channel(1); + let res = rt.spawn(activate_heap_profile( + _rx, + std::env::temp_dir(), + on_activated, + )); + assert!(check_activated()); + assert!(deactivate_heap_profile()); + block_on(res).unwrap().unwrap(); + } +} diff --git a/proxy_components/proxy_test_raftstore_v2/Cargo.toml b/proxy_components/proxy_test_raftstore_v2/Cargo.toml index 738499916a7..b5fdaba5185 100644 --- a/proxy_components/proxy_test_raftstore_v2/Cargo.toml +++ b/proxy_components/proxy_test_raftstore_v2/Cargo.toml @@ -37,7 +37,7 @@ engine_traits = { workspace = true } fail = "0.5" file_system = { workspace = true } futures = "0.3" -grpcio = { workspace = true } +grpcio = { version = "0.10", default-features = false, features = ["openssl-vendored", "protobuf-codec"] } grpcio-health = { version = "0.10", default-features = false, features = ["protobuf-codec"] } keys = { workspace = true } kvproto = { git = "https://github.com/pingcap/kvproto.git" } diff --git 
a/proxy_components/proxy_test_raftstore_v2/src/node.rs b/proxy_components/proxy_test_raftstore_v2/src/node.rs index a02af6ad177..f5e6259d383 100644 --- a/proxy_components/proxy_test_raftstore_v2/src/node.rs +++ b/proxy_components/proxy_test_raftstore_v2/src/node.rs @@ -142,13 +142,13 @@ impl Default for ChannelTransport { } } -type SimulateChannelTransport = SimulateTransport; +type SimulateChannelTransport = SimulateTransport, EK>; pub struct NodeCluster { trans: ChannelTransport, pd_client: Arc, nodes: HashMap>, - simulate_trans: HashMap, + simulate_trans: HashMap>, concurrency_managers: HashMap, // snap_mgrs: HashMap, } diff --git a/proxy_scripts/ci_check.sh b/proxy_scripts/ci_check.sh index a442a943c67..a38316b1afd 100755 --- a/proxy_scripts/ci_check.sh +++ b/proxy_scripts/ci_check.sh @@ -17,7 +17,9 @@ elif [[ $M == "testold" ]]; then chmod +x ./proxy_scripts/tikv-code-consistency.sh ./proxy_scripts/tikv-code-consistency.sh echo "Finish tikv code consistency" - # exit # If we depend TiKV as a Cargo component, the following is not necessary, and can fail. + exit # If we depend TiKV as a Cargo component, the following is not necessary, and can fail. + # TODO we have to let tests support openssl-vendored. 
+ yum install openssl openssl-devel -y cargo test --features "$ENABLE_FEATURES" --package tests --test failpoints cases::test_normal cargo test --features "$ENABLE_FEATURES" --package tests --test failpoints cases::test_bootstrap cargo test --features "$ENABLE_FEATURES" --package tests --test failpoints cases::test_compact_log @@ -34,7 +36,7 @@ elif [[ $M == "testold" ]]; then elif [[ $M == "testnew" ]]; then export ENGINE_LABEL_VALUE=tiflash export RUST_BACKTRACE=full - export ENABLE_FEATURES="test-engine-kv-rocksdb test-engine-raft-raft-engine" + export ENABLE_FEATURES="test-engine-kv-rocksdb test-engine-raft-raft-engine openssl-vendored" cargo check --package proxy_server --features="$ENABLE_FEATURES" # tests based on mock-engine-store, with compat for new proxy cargo test --package proxy_tests --features="$ENABLE_FEATURES" --test proxy shared::write diff --git a/proxy_scripts/clippy.sh b/proxy_scripts/clippy.sh index 5cdb9a5d0aa..3c44e1de8b3 100755 --- a/proxy_scripts/clippy.sh +++ b/proxy_scripts/clippy.sh @@ -1,7 +1,7 @@ set -uxeo pipefail export ENGINE_LABEL_VALUE=tiflash export RUST_BACKTRACE=full -export ENGINE_FEATURES="test-engine-kv-rocksdb test-engine-raft-raft-engine testexport" +export ENGINE_FEATURES="test-engine-kv-rocksdb test-engine-raft-raft-engine testexport openssl-vendored" rustup component add clippy # TODO We use --manifest-path as a wordaround. 
cargo clippy --package proxy_ffi --features "$ENGINE_FEATURES" --manifest-path proxy_components/proxy_ffi/Cargo.toml --no-deps -- -Dwarnings -A clippy::result_large_err -A clippy::clone_on_copy -A clippy::upper_case_acronyms -A clippy::missing_safety_doc diff --git a/proxy_scripts/tikv-code-consistency.sh b/proxy_scripts/tikv-code-consistency.sh index 09326b9d763..7227349c088 100755 --- a/proxy_scripts/tikv-code-consistency.sh +++ b/proxy_scripts/tikv-code-consistency.sh @@ -2,10 +2,10 @@ git remote rm tikv_up set -uxeo pipefail # ssh-keyscan -t rsa github.com >> ~/.ssh/known_hosts git remote add tikv_up https://github.com/tikv/tikv.git -git fetch tikv_up release-7.1 -if [[ $(git diff HEAD `git merge-base HEAD tikv_up/release-7.1` --name-only -- components | wc -l) -ne 0 ]]; then +git fetch tikv_up a0e8a7a163302bc9a7be5fd5a903b6a156797eb8 +if [[ $(git diff HEAD `git merge-base HEAD tikv_up/a0e8a7a163302bc9a7be5fd5a903b6a156797eb8` --name-only -- components | wc -l) -ne 0 ]]; then exit 1 fi -if [[ $(git diff HEAD `git merge-base HEAD tikv_up/release-7.1` --name-only -- src | wc -l) -ne 0 ]]; then +if [[ $(git diff HEAD `git merge-base HEAD tikv_up/a0e8a7a163302bc9a7be5fd5a903b6a156797eb8` --name-only -- src | wc -l) -ne 0 ]]; then exit 1 fi \ No newline at end of file diff --git a/proxy_tests/Cargo.toml b/proxy_tests/Cargo.toml index d0b0e65339f..35f0bebe765 100644 --- a/proxy_tests/Cargo.toml +++ b/proxy_tests/Cargo.toml @@ -12,9 +12,9 @@ required-features = ["failpoints", "testexport"] [features] default = ["failpoints", "testexport", "test-engine-kv-rocksdb", "cloud-aws", "cloud-gcp", "cloud-azure"] failpoints = ["fail/failpoints", "tikv/failpoints"] -cloud-aws = ["external_storage_export/cloud-aws"] -cloud-gcp = ["external_storage_export/cloud-gcp"] -cloud-azure = ["external_storage_export/cloud-azure"] +cloud-aws = ["encryption_export/cloud-aws"] +cloud-gcp = ["encryption_export/cloud-gcp"] +cloud-azure = ["encryption_export/cloud-azure"] testexport = 
["raftstore/testexport", "raftstore-v2/testexport", "tikv/testexport", "engine_tiflash/testexport", "engine_store_ffi/testexport"] profiling = ["profiler/profiling"] @@ -44,7 +44,7 @@ snmalloc = ["tikv/snmalloc"] mem-profiling = ["tikv/mem-profiling"] sse = ["tikv/sse"] portable = ["tikv/portable"] - +openssl-vendored = ["tikv/openssl-vendored"] enable-pagestorage = [] [dependencies] @@ -57,6 +57,7 @@ collections = { workspace = true } crc64fast = "0.1" crossbeam = "0.8" encryption = { workspace = true } +encryption_export = { workspace = true } engine_rocks_helper = { workspace = true } engine_store_ffi = { workspace = true, default-features = false } engine_test = { workspace = true, default-features = false } @@ -65,7 +66,7 @@ error_code = { workspace = true } fail = "0.5" file_system = { workspace = true } futures = "0.3" -grpcio = { workspace = true } +grpcio = { version = "0.10", default-features = false, features = ["openssl-vendored", "protobuf-codec"] } grpcio-health = { version = "0.10", default-features = false } kvproto = { workspace = true } libc = "0.2" @@ -117,7 +118,6 @@ criterion-cpu-time = "0.1" engine_rocks = { workspace = true } engine_test = { workspace = true } engine_traits = { workspace = true } -external_storage_export = { workspace = true } file_system = { workspace = true } hex = "0.4" hyper = { version = "0.14", default-features = false, features = ["runtime"] } diff --git a/proxy_tests/proxy/shared/encryption.rs b/proxy_tests/proxy/shared/encryption.rs new file mode 100644 index 00000000000..f284d07a139 --- /dev/null +++ b/proxy_tests/proxy/shared/encryption.rs @@ -0,0 +1,34 @@ +// Copyright 2023 TiKV Project Authors. Licensed under Apache-2.0. 
+ +use kvproto::encryptionpb::EncryptionMethod; +use proxy_ffi::interfaces_ffi::EncryptionMethod as FFIEncryptionMethod; + +use crate::utils::v1::*; + +#[test] +fn test_encryption_match() { + assert_eq!( + FFIEncryptionMethod::from(EncryptionMethod::Unknown), + FFIEncryptionMethod::Unknown + ); + assert_eq!( + FFIEncryptionMethod::from(EncryptionMethod::Plaintext), + FFIEncryptionMethod::Plaintext + ); + assert_eq!( + FFIEncryptionMethod::from(EncryptionMethod::Aes128Ctr), + FFIEncryptionMethod::Aes128Ctr + ); + assert_eq!( + FFIEncryptionMethod::from(EncryptionMethod::Aes192Ctr), + FFIEncryptionMethod::Aes192Ctr + ); + assert_eq!( + FFIEncryptionMethod::from(EncryptionMethod::Aes256Ctr), + FFIEncryptionMethod::Aes256Ctr + ); + assert_eq!( + FFIEncryptionMethod::from(EncryptionMethod::Sm4Ctr), + FFIEncryptionMethod::SM4Ctr + ); +} diff --git a/proxy_tests/proxy/shared/mod.rs b/proxy_tests/proxy/shared/mod.rs index 12e24c39e70..7a20b9d4483 100644 --- a/proxy_tests/proxy/shared/mod.rs +++ b/proxy_tests/proxy/shared/mod.rs @@ -1,6 +1,7 @@ // Copyright 2023 TiKV Project Authors. Licensed under Apache-2.0. 
mod config; +mod encryption; mod engine; mod fast_add_peer; mod ffi; diff --git a/proxy_tests/proxy/utils/v1.rs b/proxy_tests/proxy/utils/v1.rs index 8e19f35c105..f90afcf408f 100644 --- a/proxy_tests/proxy/utils/v1.rs +++ b/proxy_tests/proxy/utils/v1.rs @@ -9,6 +9,7 @@ pub use mock_engine_store::mock_cluster::v1::{ }, Cluster, Simulator, }; +use mock_engine_store::mock_cluster::TiFlashEngine; use rand::seq::SliceRandom; use sst_importer::SstImporter; use test_sst_importer::gen_sst_file_with_kvs; @@ -90,7 +91,10 @@ pub fn new_split_region_cluster(count: u64) -> (Cluster, Arc (PathBuf, Arc) { +pub fn create_tmp_importer( + cfg: &MixedClusterConfig, + kv_path: &str, +) -> (PathBuf, Arc>) { let dir = Path::new(kv_path).join("import-sst"); let importer = { Arc::new( diff --git a/proxy_tests/proxy/v2_compat/simple_write.rs b/proxy_tests/proxy/v2_compat/simple_write.rs index 51845dd4eb1..4f81ebac5d2 100644 --- a/proxy_tests/proxy/v2_compat/simple_write.rs +++ b/proxy_tests/proxy/v2_compat/simple_write.rs @@ -36,20 +36,22 @@ fn test_write_simple() { let trans1 = Mutex::new(cluster_v1.sim.read().unwrap().get_router(2).unwrap()); let trans2 = Mutex::new(cluster_v2.sim.read().unwrap().get_router(1).unwrap()); - let factory1 = ForwardFactory { + let factory1 = ForwardFactoryV1 { node_id: 1, chain_send: Arc::new(move |m| { info!("send to trans2"; "msg" => ?m); let _ = trans2.lock().unwrap().send_raft_message(Box::new(m)); }), + keep_msg: true, }; cluster_v1.add_send_filter(factory1); - let factory2 = ForwardFactory { + let factory2 = ForwardFactoryV2 { node_id: 2, chain_send: Arc::new(move |m| { info!("send to trans1"; "msg" => ?m); let _ = trans1.lock().unwrap().send_raft_message(m); }), + keep_msg: true, }; cluster_v2.add_send_filter(factory2); diff --git a/proxy_tests/proxy/v2_compat/tablet_snapshot.rs b/proxy_tests/proxy/v2_compat/tablet_snapshot.rs index 5ff84bc2fc1..697fea181ec 100644 --- a/proxy_tests/proxy/v2_compat/tablet_snapshot.rs +++ 
b/proxy_tests/proxy/v2_compat/tablet_snapshot.rs @@ -336,20 +336,22 @@ fn test_handle_snapshot() { .pd_client .must_add_peer(r21, new_learner_peer(2, 10)); - let factory1 = ForwardFactory { + let factory1 = ForwardFactoryV1 { node_id: 1, chain_send: Arc::new(move |m| { info!("send to trans2"; "msg" => ?m); let _ = trans2.lock().unwrap().send_raft_message(Box::new(m)); }), + keep_msg: true, }; cluster_v1.add_send_filter(factory1); - let factory2 = ForwardFactory { + let factory2 = ForwardFactoryV2 { node_id: 2, chain_send: Arc::new(move |m| { info!("send to trans1"; "msg" => ?m); let _ = trans1.lock().unwrap().send_raft_message(m); }), + keep_msg: true, }; cluster_v2.add_send_filter(factory2); diff --git a/proxy_tests/proxy/v2_compat/utils.rs b/proxy_tests/proxy/v2_compat/utils.rs index 4d0a5120f31..9a2b7518567 100644 --- a/proxy_tests/proxy/v2_compat/utils.rs +++ b/proxy_tests/proxy/v2_compat/utils.rs @@ -3,33 +3,87 @@ use std::sync::Arc; use kvproto::raft_serverpb::RaftMessage; -use mock_engine_store::mock_cluster::v1::transport_simulate::{Filter, FilterFactory}; use raftstore::errors::Result; -pub struct ForwardFactory { +pub struct ForwardFactoryV2 { pub node_id: u64, pub chain_send: Arc, + pub keep_msg: bool, } -impl FilterFactory for ForwardFactory { - fn generate(&self, _: u64) -> Vec> { - vec![Box::new(ForwardFilter { +impl test_raftstore::FilterFactory for ForwardFactoryV2 { + fn generate(&self, _: u64) -> Vec> { + vec![Box::new(ForwardFilterV2 { node_id: self.node_id, chain_send: self.chain_send.clone(), + keep_msg: self.keep_msg, })] } } -pub struct ForwardFilter { - node_id: u64, - chain_send: Arc, +pub struct ForwardFilterV2 { + pub node_id: u64, + pub chain_send: Arc, + pub keep_msg: bool, } -impl Filter for ForwardFilter { +impl test_raftstore::Filter for ForwardFilterV2 { fn before(&self, msgs: &mut Vec) -> Result<()> { - for m in msgs.drain(..) 
{ - if self.node_id == m.get_to_peer().get_store_id() { - (self.chain_send)(m); + if self.keep_msg { + for m in msgs { + if self.node_id == m.get_to_peer().get_store_id() { + (self.chain_send)(m.clone()); + } + } + } else { + for m in msgs.drain(..) { + if self.node_id == m.get_to_peer().get_store_id() { + (self.chain_send)(m); + } + } + } + Ok(()) + } +} + +pub struct ForwardFactoryV1 { + pub node_id: u64, + pub chain_send: Arc, + pub keep_msg: bool, +} + +impl mock_engine_store::mock_cluster::v1::transport_simulate::FilterFactory for ForwardFactoryV1 { + fn generate( + &self, + _: u64, + ) -> Vec> { + vec![Box::new(ForwardFilterV1 { + node_id: self.node_id, + chain_send: self.chain_send.clone(), + keep_msg: self.keep_msg, + })] + } +} + +pub struct ForwardFilterV1 { + pub node_id: u64, + pub chain_send: Arc, + pub keep_msg: bool, +} + +impl mock_engine_store::mock_cluster::v1::transport_simulate::Filter for ForwardFilterV1 { + fn before(&self, msgs: &mut Vec) -> Result<()> { + if self.keep_msg { + for m in msgs { + if self.node_id == m.get_to_peer().get_store_id() { + (self.chain_send)(m.clone()); + } + } + } else { + for m in msgs.drain(..) 
{ + if self.node_id == m.get_to_peer().get_store_id() { + (self.chain_send)(m); + } } } Ok(()) diff --git a/raftstore-proxy/Cargo.toml b/raftstore-proxy/Cargo.toml index 5ab8af974a7..1916b7b6228 100644 --- a/raftstore-proxy/Cargo.toml +++ b/raftstore-proxy/Cargo.toml @@ -34,6 +34,7 @@ nortcheck = ["proxy_server/nortcheck"] backup-stream-debug = ["proxy_server/backup-stream-debug"] pprof-fp = ["proxy_server/pprof-fp"] +openssl-vendored = ["proxy_server/openssl-vendored"] [lib] name = "raftstore_proxy" diff --git a/scripts/check-bins.py b/scripts/check-bins.py index 1255472a76a..cd5a4879f27 100644 --- a/scripts/check-bins.py +++ b/scripts/check-bins.py @@ -14,13 +14,29 @@ "online_config", "online_config_derive", "tidb_query_codegen", "panic_hook", "fuzz", "fuzzer_afl", "fuzzer_honggfuzz", "fuzzer_libfuzzer", "coprocessor_plugin_api", "example_coprocessor_plugin", "memory_trace_macros", "case_macros", - "tracker", "test_raftstore_macro" + "tracker", "test_raftstore_macro", "crypto" } JEMALLOC_SYMBOL = ["je_arena_boot", " malloc"] SYS_LIB = ["libstdc++"] +def ensure_link(args, require_static, libs): + p = os.popen("uname") + if "Linux" not in p.readline(): + return + for bin in args: + p = os.popen("ldd " + bin) + requires = set(l.split()[0] for l in p.readlines()) + for lib in libs: + if any(lib in r for r in requires): + if require_static: + pr("error: %s should not requires dynamic library %s\n" % (bin, lib)) + sys.exit(1) + elif not require_static: + pr("error: %s should requires dynamic library %s\n" % (bin, lib)) + sys.exit(1) + def pr(s): if sys.stdout.isatty(): sys.stdout.write("\x1b[2K\r" + s) @@ -72,6 +88,24 @@ def check_sse(executable): print("fix this by building tikv with ROCKSDB_SYS_SSE=1") sys.exit(1) +def is_openssl_vendored_enabled(features): + return "openssl-vendored" in features + +def check_openssl(executable, is_static_link): + openssl_libs = ["libcrypto", "libssl"] + ensure_link([executable], is_static_link, openssl_libs) + if is_static_link: + 
return + openssl_symbols = ["EVP_", "OPENSSL"] + p = os.popen('nm %s | grep -iE " (t|T) (%s)"' % (executable, "|".join(openssl_symbols))) + lines = p.readlines() + if lines: + pr( + "error: %s contains OpenSSL symbol %s in text section:\n%s\n" + % (executable, openssl_symbols, "".join(lines)) + ) + sys.exit(1) + def check_tests(features): if not is_jemalloc_enabled(features): print("jemalloc not enabled, skip check!") @@ -95,28 +129,22 @@ def check_tests(features): pr("Checking binary %s" % name) check_jemalloc(executable) + check_openssl(executable, True) pr("") print("Done, takes %.2fs." % (time.time() - start)) -def ensure_link(args): - p = os.popen("uname") - if "Linux" not in p.readline(): - return - for bin in args: - p = os.popen("ldd " + bin) - requires = set(l.split()[0] for l in p.readlines()) - for lib in SYS_LIB: - if any(lib in r for r in requires): - pr("error: %s should not requires dynamic library %s\n" % (bin, lib)) - sys.exit(1) - def check_release(enabled_features, args): - ensure_link(args) + # Ensure statically link SYS_LIB. 
+ ensure_link(args, True, SYS_LIB) checked_features = [] if is_jemalloc_enabled(enabled_features): checked_features.append("jemalloc") if is_sse_enabled(enabled_features): checked_features.append("SSE4.2") + if is_openssl_vendored_enabled(enabled_features): + checked_features.append("static-link-openssl") + else: + checked_features.append("dynamic-link-openssl") if not checked_features: print("Both jemalloc and SSE4.2 are disabled, skip check") return @@ -127,7 +155,8 @@ def check_release(enabled_features, args): check_jemalloc(arg) if is_sse_enabled(enabled_features): check_sse(arg) - pr("%s %s \033[32menabled\033[0m\n" % (arg, " ".join(checked_features))) + check_openssl(arg, is_openssl_vendored_enabled(enabled_features)) + pr("%s [%s] \033[32menabled\033[0m\n" % (arg, " ".join(checked_features))) def main(): argv = sys.argv diff --git a/scripts/check-dashboards b/scripts/check-dashboards new file mode 100755 index 00000000000..fdb73c28168 --- /dev/null +++ b/scripts/check-dashboards @@ -0,0 +1,14 @@ +#!/usr/bin/env bash + +set -euo pipefail + +for sha256 in ./metrics/grafana/*.sha256; do + if ! sha256sum -c "$sha256"; then + dashboard=$(basename "$sha256" .sha256) + echo "Please avoid manually modifying $dashboard" + echo "Try ./scripts/gen-tikv-details-dashboard" + exit 1 + fi +done + +echo "Dashboards check passed." diff --git a/scripts/gen-tikv-details-dashboard b/scripts/gen-tikv-details-dashboard new file mode 100755 index 00000000000..2c91cf3dbb9 --- /dev/null +++ b/scripts/gen-tikv-details-dashboard @@ -0,0 +1,29 @@ +#!/usr/bin/env bash + +set -euo pipefail + +docker build -t tikv-dashboard-gen -f - . 
< /metrics/grafana/\$name.json.sha256 + done + " diff --git a/src/config/configurable.rs b/src/config/configurable.rs index 6fe9409c1c0..c92b01cf465 100644 --- a/src/config/configurable.rs +++ b/src/config/configurable.rs @@ -15,6 +15,7 @@ pub trait ConfigurableDb { fn set_rate_bytes_per_sec(&self, rate_bytes_per_sec: i64) -> ConfigRes; fn set_rate_limiter_auto_tuned(&self, auto_tuned: bool) -> ConfigRes; fn set_flush_size(&self, f: usize) -> ConfigRes; + fn set_cf_flush_size(&self, cf: &str, f: usize) -> ConfigRes; fn set_flush_oldest_first(&self, f: bool) -> ConfigRes; fn set_shared_block_cache_capacity(&self, capacity: usize) -> ConfigRes; fn set_high_priority_background_threads(&self, n: i32, allow_reduce: bool) -> ConfigRes; @@ -57,6 +58,11 @@ impl ConfigurableDb for RocksEngine { opt.set_flush_size(f).map_err(Box::from) } + fn set_cf_flush_size(&self, cf: &str, f: usize) -> ConfigRes { + let mut cf_option = self.get_options_cf(cf)?; + cf_option.set_flush_size(f).map_err(Box::from) + } + fn set_flush_oldest_first(&self, f: bool) -> ConfigRes { let mut opt = self.get_db_options(); opt.set_flush_oldest_first(f).map_err(Box::from) @@ -171,6 +177,17 @@ impl ConfigurableDb for TabletRegistry { }) } + fn set_cf_flush_size(&self, cf: &str, f: usize) -> ConfigRes { + loop_registry(self, |cache| { + if let Some(latest) = cache.latest() { + latest.set_cf_flush_size(cf, f)?; + Ok(false) + } else { + Ok(true) + } + }) + } + fn set_flush_oldest_first(&self, f: bool) -> ConfigRes { loop_registry(self, |cache| { if let Some(latest) = cache.latest() { diff --git a/src/config/mod.rs b/src/config/mod.rs index 5c7f1424c38..c0c2a679b5a 100644 --- a/src/config/mod.rs +++ b/src/config/mod.rs @@ -110,6 +110,7 @@ const RAFT_ENGINE_MEMORY_LIMIT_RATE: f64 = 0.15; const WRITE_BUFFER_MEMORY_LIMIT_RATE: f64 = 0.2; // Too large will increase Raft Engine memory usage. 
const WRITE_BUFFER_MEMORY_LIMIT_MAX: u64 = ReadableSize::gb(8).0; +const DEFAULT_LOCK_BUFFER_MEMORY_LIMIT: ReadableSize = ReadableSize::mb(128); /// Configs that actually took effect in the last run pub const LAST_CONFIG_FILE: &str = "last_tikv.toml"; @@ -135,6 +136,8 @@ pub struct TitanCfConfig { #[online_config(skip)] pub blob_file_compression: CompressionType, #[online_config(skip)] + pub zstd_dict_size: ReadableSize, + #[online_config(skip)] pub blob_cache_size: ReadableSize, #[online_config(skip)] pub min_gc_batch_size: ReadableSize, @@ -146,6 +149,7 @@ pub struct TitanCfConfig { #[online_config(skip)] #[doc(hidden)] #[serde(skip_serializing)] + #[deprecated = "Titan doesn't need to sample anymore"] pub sample_ratio: Option, #[online_config(skip)] pub merge_small_file_threshold: ReadableSize, @@ -156,18 +160,20 @@ pub struct TitanCfConfig { pub range_merge: bool, #[online_config(skip)] pub max_sorted_runs: i32, - // deprecated. #[online_config(skip)] #[doc(hidden)] #[serde(skip_serializing)] + #[deprecated = "The feature is removed"] pub gc_merge_rewrite: bool, } impl Default for TitanCfConfig { + #[allow(deprecated)] fn default() -> Self { Self { min_blob_size: ReadableSize::kb(1), // disable titan default - blob_file_compression: CompressionType::Lz4, + blob_file_compression: CompressionType::Zstd, + zstd_dict_size: ReadableSize::kb(0), blob_cache_size: ReadableSize::mb(0), min_gc_batch_size: ReadableSize::mb(16), max_gc_batch_size: ReadableSize::mb(64), @@ -188,6 +194,15 @@ impl TitanCfConfig { let mut opts = RocksTitanDbOptions::new(); opts.set_min_blob_size(self.min_blob_size.0); opts.set_blob_file_compression(self.blob_file_compression.into()); + // To try zstd dict compression, set dict size to 4k, sample size to 100X dict + // size + opts.set_compression_options( + -14, // window_bits + 32767, // level + 0, // strategy + self.zstd_dict_size.0 as i32, // zstd dict size + self.zstd_dict_size.0 as i32 * 100, // zstd sample size + ); 
opts.set_blob_cache(self.blob_cache_size.0 as usize, -1, false, 0.0); opts.set_min_gc_batch_size(self.min_gc_batch_size.0); opts.set_max_gc_batch_size(self.max_gc_batch_size.0); @@ -200,6 +215,7 @@ impl TitanCfConfig { opts } + #[allow(deprecated)] fn validate(&self) -> Result<(), Box> { if self.gc_merge_rewrite { return Err( @@ -243,22 +259,30 @@ const RAFTDB_DEFAULT_BACKGROUND_JOB_LIMITS: BackgroundJobLimits = BackgroundJobL // `defaults` serves as an upper bound for returning limits. fn get_background_job_limits_impl( + engine_type: EngineType, cpu_num: u32, defaults: &BackgroundJobLimits, ) -> BackgroundJobLimits { // At the minimum, we should have two background jobs: one for flush and one for // compaction. Otherwise, the number of background jobs should not exceed // cpu_num - 1. - let max_background_jobs = cmp::max(2, cmp::min(defaults.max_background_jobs, cpu_num - 1)); + let mut max_background_jobs = cmp::max(2, cmp::min(defaults.max_background_jobs, cpu_num - 1)); // Scale flush threads proportionally to cpu cores. Also make sure the number of // flush threads doesn't exceed total jobs. let max_background_flushes = cmp::min( (max_background_jobs + 3) / 4, defaults.max_background_flushes, ); - // Cap max_sub_compactions to allow at least two compactions. - let max_compactions = max_background_jobs - max_background_flushes; + + // set the default compaction threads differently for v1 and v2: + // v1: cap max_sub_compactions to allow at least two compactions. + // v2: decrease the compaction threads to make the qps more stable. 
+ let max_compactions = match engine_type { + EngineType::RaftKv => max_background_jobs - max_background_flushes, + EngineType::RaftKv2 => (max_background_jobs + 3) / 4, + }; let max_sub_compactions: u32 = (max_compactions - 1).clamp(1, defaults.max_sub_compactions); + max_background_jobs = max_background_flushes + max_compactions; // Maximum background GC threads for Titan let max_titan_background_gc = cmp::min(defaults.max_titan_background_gc, cpu_num); @@ -270,9 +294,12 @@ fn get_background_job_limits_impl( } } -fn get_background_job_limits(defaults: &BackgroundJobLimits) -> BackgroundJobLimits { +fn get_background_job_limits( + engine_type: EngineType, + defaults: &BackgroundJobLimits, +) -> BackgroundJobLimits { let cpu_num = cmp::max(SysQuota::cpu_cores_quota() as u32, 1); - get_background_job_limits_impl(cpu_num, defaults) + get_background_job_limits_impl(engine_type, cpu_num, defaults) } macro_rules! cf_config { @@ -311,6 +338,7 @@ macro_rules! cf_config { #[online_config(skip)] pub compression_per_level: [DBCompressionType; 7], pub write_buffer_size: Option, + pub write_buffer_limit: Option, pub max_write_buffer_number: i32, #[online_config(skip)] pub min_write_buffer_number_to_merge: i32, @@ -643,6 +671,7 @@ macro_rules! build_cf_opt { $cf_name, provider.clone(), $opt.compaction_guard_min_output_file_size.0, + $opt.max_compaction_bytes.0, ) .unwrap(); cf_opts.set_sst_partitioner_factory(factory); @@ -668,6 +697,7 @@ macro_rules! 
build_cf_opt { pub struct CfResources { pub cache: Cache, pub compaction_thread_limiters: HashMap<&'static str, ConcurrentTaskLimiter>, + pub write_buffer_managers: HashMap<&'static str, Arc>, } cf_config!(DefaultCfConfig); @@ -734,6 +764,7 @@ impl Default for DefaultCfConfig { ttl: None, periodic_compaction_seconds: None, titan: TitanCfConfig::default(), + write_buffer_limit: None, } } } @@ -832,6 +863,9 @@ impl DefaultCfConfig { } } cf_opts.set_titan_cf_options(&self.titan.build_opts()); + if let Some(write_buffer_manager) = shared.write_buffer_managers.get(CF_DEFAULT) { + cf_opts.set_write_buffer_manager(write_buffer_manager); + } cf_opts } } @@ -906,6 +940,7 @@ impl Default for WriteCfConfig { ttl: None, periodic_compaction_seconds: None, titan, + write_buffer_limit: None, } } } @@ -962,6 +997,9 @@ impl WriteCfConfig { .unwrap(); } cf_opts.set_titan_cf_options(&self.titan.build_opts()); + if let Some(write_buffer_manager) = shared.write_buffer_managers.get(CF_WRITE) { + cf_opts.set_write_buffer_manager(write_buffer_manager); + } cf_opts } } @@ -1028,6 +1066,7 @@ impl Default for LockCfConfig { ttl: None, periodic_compaction_seconds: None, titan, + write_buffer_limit: None, } } } @@ -1062,6 +1101,9 @@ impl LockCfConfig { .unwrap(); } cf_opts.set_titan_cf_options(&self.titan.build_opts()); + if let Some(write_buffer_manager) = shared.write_buffer_managers.get(CF_LOCK) { + cf_opts.set_write_buffer_manager(write_buffer_manager); + } cf_opts } } @@ -1127,6 +1169,7 @@ impl Default for RaftCfConfig { ttl: None, periodic_compaction_seconds: None, titan, + write_buffer_limit: None, } } } @@ -1170,7 +1213,7 @@ impl Default for TitanDbConfig { enabled: false, dirname: "".to_owned(), disable_gc: false, - max_background_gc: 4, + max_background_gc: 1, purge_obsolete_files_period: ReadableDuration::secs(10), } } @@ -1235,10 +1278,10 @@ pub struct DbConfig { #[serde(with = "rocks_config::rate_limiter_mode_serde")] #[online_config(skip)] pub rate_limiter_mode: 
DBRateLimiterMode, - // deprecated. use rate_limiter_auto_tuned. - #[online_config(skip)] + #[online_config(hidden)] #[doc(hidden)] #[serde(skip_serializing)] + #[deprecated = "The configuration has been removed. Use `rate_limiter_auto_tuned` instead"] pub auto_tuned: Option, pub rate_limiter_auto_tuned: bool, pub bytes_per_sync: ReadableSize, @@ -1290,20 +1333,16 @@ pub struct DbResources { } impl Default for DbConfig { + #[allow(deprecated)] fn default() -> DbConfig { - let bg_job_limits = get_background_job_limits(&KVDB_DEFAULT_BACKGROUND_JOB_LIMITS); - let titan_config = TitanDbConfig { - max_background_gc: bg_job_limits.max_titan_background_gc as i32, - ..Default::default() - }; DbConfig { wal_recovery_mode: DBRecoveryMode::PointInTime, wal_dir: "".to_owned(), wal_ttl_seconds: 0, wal_size_limit: ReadableSize::kb(0), max_total_wal_size: None, - max_background_jobs: bg_job_limits.max_background_jobs as i32, - max_background_flushes: bg_job_limits.max_background_flushes as i32, + max_background_jobs: 0, + max_background_flushes: 0, max_manifest_file_size: ReadableSize::mb(128), create_if_missing: true, max_open_files: 40960, @@ -1322,7 +1361,7 @@ impl Default for DbConfig { rate_limiter_auto_tuned: true, bytes_per_sync: ReadableSize::mb(1), wal_bytes_per_sync: ReadableSize::kb(512), - max_sub_compactions: bg_job_limits.max_sub_compactions, + max_sub_compactions: 0, writable_file_max_buffer_size: ReadableSize::mb(1), use_direct_io_for_flush_and_compaction: false, enable_pipelined_write: false, @@ -1337,7 +1376,7 @@ impl Default for DbConfig { writecf: WriteCfConfig::default(), lockcf: LockCfConfig::default(), raftcf: RaftCfConfig::default(), - titan: titan_config, + titan: TitanDbConfig::default(), } } } @@ -1385,21 +1424,48 @@ impl DbConfig { // strategy is consistent with single RocksDB. 
self.defaultcf.max_compactions.get_or_insert(1); self.writecf.max_compactions.get_or_insert(1); - if self.lockcf.write_buffer_size.is_none() { - self.lockcf.write_buffer_size = Some(ReadableSize::mb(4)); - } + self.lockcf + .write_buffer_size + .get_or_insert(ReadableSize::mb(32)); + self.lockcf + .write_buffer_limit + .get_or_insert(DEFAULT_LOCK_BUFFER_MEMORY_LIMIT); } } + let bg_job_limits = get_background_job_limits(engine, &KVDB_DEFAULT_BACKGROUND_JOB_LIMITS); + if self.max_background_jobs == 0 { + self.max_background_jobs = bg_job_limits.max_background_jobs as i32; + } + if self.max_background_flushes == 0 { + self.max_background_flushes = bg_job_limits.max_background_flushes as i32; + } + if self.max_sub_compactions == 0 { + self.max_sub_compactions = bg_job_limits.max_sub_compactions; + } + if self.titan.max_background_gc == 0 { + self.titan.max_background_gc = bg_job_limits.max_titan_background_gc as i32; + } } - pub fn build_resources(&self, env: Arc) -> DbResources { + pub fn build_resources(&self, env: Arc, engine: EngineType) -> DbResources { let rate_limiter = if self.rate_bytes_per_sec.0 > 0 { + // for raft-v2, we use a longer window to make the compaction io smoother + let (tune_per_secs, window_size, recent_size) = match engine { + // 1s tune duraion, long term window is 5m, short term window is 30s. + // this is the default settings. 
+ EngineType::RaftKv => (1, 300, 30), + // 5s tune duraion, long term window is 1h, short term window is 5m + EngineType::RaftKv2 => (5, 720, 60), + }; Some(Arc::new(RateLimiter::new_writeampbased_with_auto_tuned( self.rate_bytes_per_sec.0 as i64, (self.rate_limiter_refill_period.as_millis() * 1000) as i64, 10, // fairness self.rate_limiter_mode, self.rate_limiter_auto_tuned, + tune_per_secs, + window_size, + recent_size, ))) } else { None @@ -1510,9 +1576,29 @@ impl DbConfig { ConcurrentTaskLimiter::new(CF_RAFT, n), ); } + let mut write_buffer_managers = HashMap::default(); + self.lockcf.write_buffer_limit.map(|limit| { + write_buffer_managers.insert( + CF_LOCK, + Arc::new(WriteBufferManager::new(limit.0 as usize, 0f32, true)), + ) + }); + self.defaultcf.write_buffer_limit.map(|limit| { + write_buffer_managers.insert( + CF_DEFAULT, + Arc::new(WriteBufferManager::new(limit.0 as usize, 0f32, true)), + ) + }); + self.writecf.write_buffer_limit.map(|limit| { + write_buffer_managers.insert( + CF_WRITE, + Arc::new(WriteBufferManager::new(limit.0 as usize, 0f32, true)), + ) + }); CfResources { cache, compaction_thread_limiters, + write_buffer_managers, } } @@ -1556,6 +1642,9 @@ impl DbConfig { self.writecf.validate()?; self.raftcf.validate()?; self.titan.validate()?; + if self.raftcf.write_buffer_limit.is_some() { + return Err("raftcf does not support cf based write buffer manager".into()); + } if self.enable_unordered_write { if self.titan.enabled { return Err("RocksDB.unordered_write does not support Titan".into()); @@ -1660,6 +1749,7 @@ impl Default for RaftDefaultCfConfig { ttl: None, periodic_compaction_seconds: None, titan: TitanCfConfig::default(), + write_buffer_limit: None, } } } @@ -1752,7 +1842,9 @@ pub struct RaftDbConfig { impl Default for RaftDbConfig { fn default() -> RaftDbConfig { - let bg_job_limits = get_background_job_limits(&RAFTDB_DEFAULT_BACKGROUND_JOB_LIMITS); + // raftdb should only be used for raftkv + let bg_job_limits = + 
get_background_job_limits(EngineType::RaftKv, &RAFTDB_DEFAULT_BACKGROUND_JOB_LIMITS); let titan_config = TitanDbConfig { max_background_gc: bg_job_limits.max_titan_background_gc as i32, ..Default::default() @@ -1903,13 +1995,14 @@ pub enum DbType { } pub struct DbConfigManger { + cfg: DbConfig, db: D, db_type: DbType, } impl DbConfigManger { - pub fn new(db: D, db_type: DbType) -> Self { - DbConfigManger { db, db_type } + pub fn new(cfg: DbConfig, db: D, db_type: DbType) -> Self { + DbConfigManger { cfg, db, db_type } } } @@ -1944,10 +2037,31 @@ impl DbConfigManger { _ => Err(format!("invalid cf {:?} for db {:?}", cf, self.db_type).into()), } } + + fn update_background_cfg( + &self, + max_background_jobs: i32, + max_background_flushes: i32, + ) -> Result<(), Box> { + assert!(max_background_jobs > 0 && max_background_flushes > 0); + let max_background_compacts = + std::cmp::max(max_background_jobs - max_background_flushes, 1); + self.db + .set_db_config(&[("max_background_jobs", &max_background_jobs.to_string())])?; + self.db.set_db_config(&[( + "max_background_flushes", + &max_background_flushes.to_string(), + )])?; + self.db.set_db_config(&[( + "max_background_compactions", + &max_background_compacts.to_string(), + )]) + } } impl ConfigManager for DbConfigManger { fn dispatch(&mut self, change: ConfigChange) -> Result<(), Box> { + self.cfg.update(change.clone())?; let change_str = format!("{:?}", change); let mut change: Vec<(String, ConfigValue)> = change.into_iter().collect(); let cf_config = change.drain_filter(|(name, _)| name.ends_with("cf")); @@ -1966,6 +2080,15 @@ impl ConfigManager for DbConfigManger { cf_change.insert(name, value); } } + if let Some(f) = cf_change.remove("write_buffer_limit") { + if cf_name != CF_LOCK { + return Err( + "cf write buffer manager is only supportted for lock cf now".into() + ); + } + let size: ReadableSize = f.into(); + self.db.set_cf_flush_size(cf_name, size.0 as usize)?; + } if !cf_change.is_empty() { let cf_change = 
config_value_to_string(cf_change.into_iter().collect()); let cf_change_slice = config_to_slice(&cf_change); @@ -1996,7 +2119,8 @@ impl ConfigManager for DbConfigManger { .drain_filter(|(name, _)| name == "write_buffer_limit") .next() { - self.db.set_flush_size(size.1.into())?; + let size: ReadableSize = size.1.into(); + self.db.set_flush_size(size.0 as usize)?; } if let Some(f) = change @@ -2011,8 +2135,7 @@ impl ConfigManager for DbConfigManger { .next() { let max_background_jobs: i32 = background_jobs_config.1.into(); - self.db - .set_db_config(&[("max_background_jobs", &max_background_jobs.to_string())])?; + self.update_background_cfg(max_background_jobs, self.cfg.max_background_flushes)?; } if let Some(background_subcompactions_config) = change @@ -2029,10 +2152,7 @@ impl ConfigManager for DbConfigManger { .next() { let max_background_flushes: i32 = background_flushes_config.1.into(); - self.db.set_db_config(&[( - "max_background_flushes", - &max_background_flushes.to_string(), - )])?; + self.update_background_cfg(self.cfg.max_background_jobs, max_background_flushes)?; } if !change.is_empty() { @@ -2113,7 +2233,6 @@ pub struct UnifiedReadPoolConfig { pub max_thread_count: usize, #[online_config(skip)] pub stack_size: ReadableSize, - #[online_config(skip)] pub max_tasks_per_worker: usize, pub auto_adjust_pool_size: bool, // FIXME: Add more configs when they are effective in yatp @@ -2762,6 +2881,7 @@ pub struct BackupStreamConfig { pub initial_scan_pending_memory_quota: ReadableSize, #[online_config(skip)] pub initial_scan_rate_limit: ReadableSize, + pub initial_scan_concurrency: usize, } impl BackupStreamConfig { @@ -2789,6 +2909,9 @@ impl BackupStreamConfig { ) .into()); } + if self.initial_scan_concurrency == 0 { + return Err("the `initial_scan_concurrency` shouldn't be zero".into()); + } Ok(()) } } @@ -2816,6 +2939,7 @@ impl Default for BackupStreamConfig { file_size_limit, initial_scan_pending_memory_quota: ReadableSize(quota_size as _), 
initial_scan_rate_limit: ReadableSize::mb(60), + initial_scan_concurrency: 6, temp_file_memory_quota: cache_size, } } @@ -2830,8 +2954,17 @@ pub struct CdcConfig { // TODO(hi-rustin): Consider resizing the thread pool based on `incremental_scan_threads`. #[online_config(skip)] pub incremental_scan_threads: usize, + // The number of scan tasks that is allowed to run concurrently. pub incremental_scan_concurrency: usize, + // The number of scan tasks that is allowed to be created. In other words, + // there will be at most `incremental_scan_concurrency_limit - incremental_scan_concurrency` + // number of scan tasks that is waitting to run. + pub incremental_scan_concurrency_limit: usize, + /// Limit scan speed based on disk I/O traffic. pub incremental_scan_speed_limit: ReadableSize, + /// Limit scan speed based on memory accesing traffic. + #[doc(hidden)] + pub incremental_fetch_speed_limit: ReadableSize, /// `TsFilter` can increase speed and decrease resource usage when /// incremental content is much less than total content. However in /// other cases, `TsFilter` can make performance worse because it needs @@ -2852,13 +2985,15 @@ pub struct CdcConfig { pub old_value_cache_memory_quota: ReadableSize, // Deprecated! preserved for compatibility check. - #[online_config(skip)] + #[online_config(hidden)] #[doc(hidden)] #[serde(skip_serializing)] + #[deprecated = "The configuration has been removed."] pub old_value_cache_size: usize, } impl Default for CdcConfig { + #[allow(deprecated)] fn default() -> Self { Self { min_ts_interval: ReadableDuration::secs(1), @@ -2867,9 +3002,12 @@ impl Default for CdcConfig { incremental_scan_threads: 4, // At most 6 concurrent running tasks. incremental_scan_concurrency: 6, + // At most 10000 tasks can exist simultaneously. + incremental_scan_concurrency_limit: 10000, // TiCDC requires a SSD, the typical write speed of SSD // is more than 500MB/s, so 128MB/s is enough. 
incremental_scan_speed_limit: ReadableSize::mb(128), + incremental_fetch_speed_limit: ReadableSize::mb(512), incremental_scan_ts_filter_ratio: 0.2, tso_worker_threads: 1, // 512MB memory for CDC sink. @@ -2907,6 +3045,14 @@ impl CdcConfig { ); self.incremental_scan_concurrency = self.incremental_scan_threads } + if self.incremental_scan_concurrency_limit < self.incremental_scan_concurrency { + warn!( + "cdc.incremental-scan-concurrency-limit must be larger than cdc.incremental-scan-concurrency, + change it to {}", + self.incremental_scan_concurrency + ); + self.incremental_scan_concurrency_limit = self.incremental_scan_concurrency + } if self.incremental_scan_ts_filter_ratio < 0.0 || self.incremental_scan_ts_filter_ratio > 1.0 { @@ -2937,6 +3083,8 @@ pub struct ResolvedTsConfig { pub advance_ts_interval: ReadableDuration, #[online_config(skip)] pub scan_lock_pool_size: usize, + pub memory_quota: ReadableSize, + pub incremental_scan_concurrency: usize, } impl ResolvedTsConfig { @@ -2957,6 +3105,8 @@ impl Default for ResolvedTsConfig { enable: true, advance_ts_interval: ReadableDuration::secs(20), scan_lock_pool_size: 2, + memory_quota: ReadableSize::mb(256), + incremental_scan_concurrency: 6, } } } @@ -3093,6 +3243,63 @@ impl ConfigManager for LogConfigManager { } } +#[derive(Clone, Serialize, Deserialize, PartialEq, Debug, OnlineConfig)] +#[serde(default)] +#[serde(rename_all = "kebab-case")] +pub struct MemoryConfig { + // Whether enable the heap profiling which may have a bit performance overhead about 2% for the + // default sample rate. + pub enable_heap_profiling: bool, + + // Average interval between allocation samples, as measured in bytes of allocation activity. + // Increasing the sampling interval decreases profile fidelity, but also decreases the + // computational overhead. + // The default sample interval is 512 KB. It only accepts power of two, otherwise it will be + // rounded up to the next power of two. 
+ pub profiling_sample_per_bytes: ReadableSize, +} + +impl Default for MemoryConfig { + fn default() -> Self { + Self { + enable_heap_profiling: true, + profiling_sample_per_bytes: ReadableSize::kb(512), + } + } +} + +impl MemoryConfig { + pub fn init(&self) { + if self.enable_heap_profiling { + if let Err(e) = tikv_alloc::activate_prof() { + error!("failed to enable heap profiling"; "err" => ?e); + return; + } + tikv_alloc::set_prof_sample(self.profiling_sample_per_bytes.0).unwrap(); + } + } +} + +pub struct MemoryConfigManager; + +impl ConfigManager for MemoryConfigManager { + fn dispatch(&mut self, changes: ConfigChange) -> CfgResult<()> { + if let Some(ConfigValue::Bool(enable)) = changes.get("enable_heap_profiling") { + if *enable { + tikv_alloc::activate_prof()?; + } else { + tikv_alloc::deactivate_prof()?; + } + } + + if let Some(ConfigValue::Size(sample_rate)) = changes.get("profiling_sample_per_bytes") { + tikv_alloc::set_prof_sample(*sample_rate).unwrap(); + } + info!("update memory config"; "config" => ?changes); + Ok(()) + } +} + #[derive(Clone, Serialize, Deserialize, PartialEq, Debug, OnlineConfig)] #[serde(default)] #[serde(rename_all = "kebab-case")] @@ -3143,21 +3350,29 @@ pub struct TikvConfig { #[online_config(hidden)] pub cfg_path: String, - // Deprecated! These configuration has been moved to LogConfig. - // They are preserved for compatibility check. 
#[doc(hidden)] - #[online_config(skip)] + #[online_config(hidden)] + #[serde(skip_serializing)] + #[deprecated = "The configuration has been moved to log.level."] pub log_level: LogLevel, #[doc(hidden)] - #[online_config(skip)] + #[online_config(hidden)] + #[serde(skip_serializing)] + #[deprecated = "The configuration has been moved to log.file.filename."] pub log_file: String, #[doc(hidden)] - #[online_config(skip)] + #[online_config(hidden)] + #[serde(skip_serializing)] + #[deprecated = "The configuration has been moved to log.format."] pub log_format: LogFormat, - #[online_config(skip)] + #[online_config(hidden)] + #[serde(skip_serializing)] + #[deprecated = "The configuration has been moved to log.file.max_days."] pub log_rotation_timespan: ReadableDuration, #[doc(hidden)] - #[online_config(skip)] + #[online_config(hidden)] + #[serde(skip_serializing)] + #[deprecated = "The configuration has been moved to log.file.max_size."] pub log_rotation_size: ReadableSize, #[online_config(skip)] @@ -3185,9 +3400,15 @@ pub struct TikvConfig { #[online_config(skip)] pub memory_usage_high_water: f64, + // Memory quota used for in-memory engine. 0 means not enable it. 
+ pub region_cache_memory_limit: ReadableSize, + #[online_config(submodule)] pub log: LogConfig, + #[online_config(submodule)] + pub memory: MemoryConfig, + #[online_config(submodule)] pub quota: QuotaConfig, @@ -3265,6 +3486,7 @@ pub struct TikvConfig { } impl Default for TikvConfig { + #[allow(deprecated)] fn default() -> TikvConfig { TikvConfig { cfg_path: "".to_owned(), @@ -3280,7 +3502,9 @@ impl Default for TikvConfig { abort_on_panic: false, memory_usage_limit: None, memory_usage_high_water: 0.9, + region_cache_memory_limit: ReadableSize::mb(0), log: LogConfig::default(), + memory: MemoryConfig::default(), quota: QuotaConfig::default(), readpool: ReadPoolConfig::default(), server: ServerConfig::default(), @@ -3343,8 +3567,24 @@ impl TikvConfig { .unwrap() .to_owned(); } - self.raft_store.raftdb_path = self.infer_raft_db_path(None)?; - self.raft_engine.config.dir = self.infer_raft_engine_path(None)?; + + match ( + self.raft_store.raftdb_path.is_empty(), + self.raft_engine.config.dir.is_empty(), + ) { + (false, true) => { + // If raftdb_path is specified, raft_engine_path will inherit it, this will be + // useful when updating from older version. 
+ self.raft_engine.config.dir = + self.infer_raft_engine_path(Some(self.raft_store.raftdb_path.as_str()))?; + self.raft_store.raftdb_path = self.infer_raft_db_path(None)?; + } + _ => { + self.raft_store.raftdb_path = self.infer_raft_db_path(None)?; + self.raft_engine.config.dir = self.infer_raft_engine_path(None)?; + } + } + if self.log_backup.temp_path.is_empty() { self.log_backup.temp_path = config::canonicalize_sub_path(&self.storage.data_dir, "log-backup-temp")?; @@ -3633,7 +3873,8 @@ impl TikvConfig { self.raft_engine.validate()?; self.server.validate()?; self.pd.validate()?; - self.coprocessor.validate()?; + self.coprocessor + .validate(self.storage.engine == EngineType::RaftKv2)?; self.raft_store.validate( self.coprocessor.region_split_size(), self.coprocessor.enable_region_bucket(), @@ -3658,6 +3899,7 @@ impl TikvConfig { // As the init of `logger` is very early, this adjust needs to be separated and // called immediately after parsing the command line. + #[allow(deprecated)] pub fn logger_compatible_adjust(&mut self) { let default_tikv_cfg = TikvConfig::default(); let default_log_cfg = LogConfig::default(); @@ -3709,6 +3951,7 @@ impl TikvConfig { } } + #[allow(deprecated)] pub fn compatible_adjust(&mut self) { let default_raft_store = RaftstoreConfig::default(); let default_coprocessor = CopConfig::default(); @@ -3871,7 +4114,10 @@ impl TikvConfig { last_cfg.raftdb.wal_dir, self.raftdb.wal_dir )); } - if last_raft_engine_dir != self.raft_engine.config.dir { + + if RaftDataStateMachine::raftengine_exists(Path::new(&last_raft_engine_dir)) + && last_raft_engine_dir != self.raft_engine.config.dir + { return Err(format!( "raft engine dir have been changed, former is '{}', \ current is '{}', please check if it is expected.", @@ -4316,6 +4562,7 @@ pub enum Module { BackupStream, Quota, Log, + Memory, Unknown(String), } @@ -4344,6 +4591,7 @@ impl From<&str> for Module { "resource_metering" => Module::ResourceMetering, "quota" => Module::Quota, "log" => 
Module::Log, + "memory" => Module::Memory, n => Module::Unknown(n.to_owned()), } } @@ -4527,6 +4775,21 @@ mod tests { }, }; + fn create_mock_raftdb(path: &Path) { + fs::create_dir_all(path).unwrap(); + fs::File::create(path.join("CURRENT")).unwrap(); + } + + fn create_mock_raftengine(path: &Path) { + fs::create_dir_all(path).unwrap(); + fs::File::create(path.join("0000000000000001.raftlog")).unwrap(); + } + + fn create_mock_kv_data(path: &Path) { + fs::create_dir_all(path.join("db")).unwrap(); + fs::File::create(path.join("db").join("CURRENT")).unwrap(); + } + #[test] fn test_case_macro() { let h = kebab_case!(HelloWorld); @@ -4577,7 +4840,8 @@ mod tests { tikv_cfg.raft_engine.mut_config().dir = "/raft/wal_dir".to_owned(); tikv_cfg.validate().unwrap(); - tikv_cfg.check_critical_cfg_with(&last_cfg).unwrap_err(); + // no actual raft engine data + tikv_cfg.check_critical_cfg_with(&last_cfg).unwrap(); last_cfg.raft_engine.mut_config().dir = "/raft/wal_dir".to_owned(); tikv_cfg.validate().unwrap(); @@ -4628,6 +4892,213 @@ mod tests { ); } } + + let test_dir = tempfile::Builder::new() + .tempdir() + .unwrap() + .into_path() + .join("unittest_raft_engine_dir"); + let data_dir = test_dir.join("data"); + + // simulate tikv restart + // enable raft engine: true + // need dump data from raftdb: false + // custom raft dir: true + { + let raft_dir = test_dir.join("raft"); + tikv_cfg = TikvConfig::default(); + last_cfg = TikvConfig::default(); + + last_cfg.raft_engine.mut_config().dir = raft_dir.to_str().unwrap().to_owned(); + last_cfg.raft_store.raftdb_path = data_dir.join("raft").to_str().unwrap().to_owned(); + last_cfg.storage.data_dir = data_dir.to_str().unwrap().to_owned(); + tikv_cfg.raft_engine.mut_config().dir = raft_dir.to_str().unwrap().to_owned(); + tikv_cfg.raft_store.raftdb_path = data_dir.join("raft").to_str().unwrap().to_owned(); + tikv_cfg.storage.data_dir = data_dir.to_str().unwrap().to_owned(); + + create_mock_raftengine(&raft_dir); + 
create_mock_kv_data(&data_dir); + + tikv_cfg.validate().unwrap(); + tikv_cfg.check_critical_cfg_with(&last_cfg).unwrap(); + fs::remove_dir_all(&test_dir).unwrap(); + } + + // simulate tikv restart + // enable raft engine: true + // need dump data from raftdb: false + // custom raft dir: false + { + tikv_cfg = TikvConfig::default(); + last_cfg = TikvConfig::default(); + + last_cfg.raft_engine.mut_config().dir = + data_dir.join("raft-engine").to_str().unwrap().to_owned(); + last_cfg.raft_store.raftdb_path = data_dir.join("raft").to_str().unwrap().to_owned(); + last_cfg.storage.data_dir = data_dir.to_str().unwrap().to_owned(); + tikv_cfg.raft_engine.mut_config().dir = + data_dir.join("raft-engine").to_str().unwrap().to_owned(); + tikv_cfg.raft_store.raftdb_path = data_dir.join("raft").to_str().unwrap().to_owned(); + tikv_cfg.storage.data_dir = data_dir.to_str().unwrap().to_owned(); + + create_mock_kv_data(&data_dir); + create_mock_raftengine(&data_dir.join("raft-engine")); + + tikv_cfg.validate().unwrap(); + tikv_cfg.check_critical_cfg_with(&last_cfg).unwrap(); + fs::remove_dir_all(&test_dir).unwrap(); + } + + // simulate tikv update + // enable raft engine: true + // need dump data from raftdb: true + // custom raft dir: false + { + tikv_cfg = TikvConfig::default(); + last_cfg = TikvConfig::default(); + + last_cfg.raft_engine.mut_config().dir = + data_dir.join("raft-engine").to_str().unwrap().to_owned(); + last_cfg.raft_store.raftdb_path = data_dir.join("raft").to_str().unwrap().to_owned(); + last_cfg.storage.data_dir = data_dir.to_str().unwrap().to_owned(); + tikv_cfg.raft_engine.mut_config().dir = + data_dir.join("raft-engine").to_str().unwrap().to_owned(); + tikv_cfg.raft_store.raftdb_path = data_dir.join("raft").to_str().unwrap().to_owned(); + tikv_cfg.storage.data_dir = data_dir.to_str().unwrap().to_owned(); + + create_mock_kv_data(&data_dir); + create_mock_raftdb(&data_dir.join("raft")); + + tikv_cfg.validate().unwrap(); + 
tikv_cfg.check_critical_cfg_with(&last_cfg).unwrap(); + fs::remove_dir_all(&test_dir).unwrap(); + } + + // multi raft engine dir + { + tikv_cfg = TikvConfig::default(); + last_cfg = TikvConfig::default(); + + last_cfg.raft_engine.mut_config().dir = + data_dir.join("raft-engine").to_str().unwrap().to_owned(); + last_cfg.raft_store.raftdb_path = data_dir.join("raft").to_str().unwrap().to_owned(); + last_cfg.storage.data_dir = data_dir.to_str().unwrap().to_owned(); + tikv_cfg.raft_engine.mut_config().dir = + data_dir.join("raft-engine").to_str().unwrap().to_owned(); + tikv_cfg.raft_store.raftdb_path = data_dir.join("raft").to_str().unwrap().to_owned(); + tikv_cfg.storage.data_dir = data_dir.to_str().unwrap().to_owned(); + + create_mock_kv_data(&data_dir); + create_mock_raftdb(&data_dir.join("raft")); + create_mock_raftengine(&data_dir.join("raft-engine")); + + tikv_cfg.validate().unwrap_err(); + tikv_cfg.check_critical_cfg_with(&last_cfg).unwrap(); + fs::remove_dir_all(&test_dir).unwrap(); + } + + // simulate tikv update with custom raft dir + // enable raft engine: true + // need dump data from raftdb: true + // custom raft dir: false + { + tikv_cfg = TikvConfig::default(); + last_cfg = TikvConfig::default(); + + last_cfg.raft_engine.mut_config().dir = + data_dir.join("raft-engine").to_str().unwrap().to_owned(); + last_cfg.raft_store.raftdb_path = test_dir.join("raft").to_str().unwrap().to_owned(); + last_cfg.storage.data_dir = data_dir.to_str().unwrap().to_owned(); + tikv_cfg.raft_engine.mut_config().dir = + test_dir.join("raft-engine").to_str().unwrap().to_owned(); + tikv_cfg.raft_store.raftdb_path = test_dir.join("raft").to_str().unwrap().to_owned(); + tikv_cfg.storage.data_dir = data_dir.to_str().unwrap().to_owned(); + + create_mock_kv_data(&data_dir); + create_mock_raftdb(&test_dir.join("raft")); + + tikv_cfg.validate().unwrap(); + tikv_cfg.check_critical_cfg_with(&last_cfg).unwrap(); + fs::remove_dir_all(&test_dir).unwrap(); + } + + // simulate tikv update with 
custom raft dir + // enable raft engine: true + // need dump data from raftdb: true + // custom raft dir: false + { + tikv_cfg = TikvConfig::default(); + last_cfg = TikvConfig::default(); + + last_cfg.raft_engine.mut_config().dir = + data_dir.join("raft-engine").to_str().unwrap().to_owned(); + last_cfg.raft_store.raftdb_path = test_dir.join("raft").to_str().unwrap().to_owned(); + last_cfg.storage.data_dir = data_dir.to_str().unwrap().to_owned(); + tikv_cfg.raft_engine.mut_config().dir = "".to_owned(); + tikv_cfg.raft_store.raftdb_path = test_dir.join("raft").to_str().unwrap().to_owned(); + tikv_cfg.storage.data_dir = data_dir.to_str().unwrap().to_owned(); + + create_mock_kv_data(&data_dir); + create_mock_raftdb(&test_dir.join("raft")); + + tikv_cfg.validate().unwrap(); + assert_eq!( + tikv_cfg.raft_engine.config.dir, + test_dir.join("raft").join("raft-engine").to_str().unwrap() + ); + tikv_cfg.check_critical_cfg_with(&last_cfg).unwrap(); + fs::remove_dir_all(&test_dir).unwrap(); + } + + // simulate tikv downgrade to raftdb + // need dump data from raft-engine + // custom raft dir: false + { + tikv_cfg = TikvConfig::default(); + last_cfg = TikvConfig::default(); + + last_cfg.raft_engine.mut_config().dir = + data_dir.join("raft-engine").to_str().unwrap().to_owned(); + last_cfg.raft_store.raftdb_path = data_dir.join("raft").to_str().unwrap().to_owned(); + last_cfg.storage.data_dir = data_dir.to_str().unwrap().to_owned(); + last_cfg.raft_engine.enable = true; + + tikv_cfg.raft_engine.mut_config().dir = + data_dir.join("raft-engine").to_str().unwrap().to_owned(); + tikv_cfg.raft_engine.enable = false; + tikv_cfg.raft_store.raftdb_path = data_dir.join("raft").to_str().unwrap().to_owned(); + tikv_cfg.storage.data_dir = data_dir.to_str().unwrap().to_owned(); + + create_mock_kv_data(&data_dir); + create_mock_raftengine(&data_dir.join("raft-engine")); + + tikv_cfg.validate().unwrap(); + tikv_cfg.check_critical_cfg_with(&last_cfg).unwrap(); + 
fs::remove_dir_all(&test_dir).unwrap(); + } + + { + tikv_cfg = TikvConfig::default(); + last_cfg = TikvConfig::default(); + + last_cfg.raft_engine.mut_config().dir = + data_dir.join("raft-engine").to_str().unwrap().to_owned(); + last_cfg.raft_store.raftdb_path = data_dir.join("raft").to_str().unwrap().to_owned(); + last_cfg.storage.data_dir = data_dir.to_str().unwrap().to_owned(); + last_cfg.raft_engine.enable = true; + + tikv_cfg.raft_engine.mut_config().dir = "".to_owned(); + tikv_cfg.raft_engine.enable = false; + tikv_cfg.raft_store.raftdb_path = data_dir.join("raft").to_str().unwrap().to_owned(); + tikv_cfg.storage.data_dir = data_dir.to_str().unwrap().to_owned(); + + create_mock_kv_data(&data_dir); + create_mock_raftengine(&data_dir.join("raft-engine")); + + tikv_cfg.validate().unwrap_err(); + tikv_cfg.check_critical_cfg_with(&last_cfg).unwrap_err(); + fs::remove_dir_all(&test_dir).unwrap(); + } } #[test] @@ -4647,7 +5118,7 @@ mod tests { assert_eq!(last_cfg_metadata.modified().unwrap(), first_modified); // write to file when config is the inequivalent of last one. 
- cfg.log_level = slog::Level::Warning.into(); + cfg.log.level = slog::Level::Warning.into(); persist_config(&cfg).unwrap(); last_cfg_metadata = last_cfg_path.metadata().unwrap(); assert_ne!(last_cfg_metadata.modified().unwrap(), first_modified); @@ -4763,7 +5234,9 @@ mod tests { fn test_rocks_rate_limit_zero() { let mut tikv_cfg = TikvConfig::default(); tikv_cfg.rocksdb.rate_bytes_per_sec = ReadableSize(0); - let resource = tikv_cfg.rocksdb.build_resources(Arc::new(Env::default())); + let resource = tikv_cfg + .rocksdb + .build_resources(Arc::new(Env::default()), tikv_cfg.storage.engine); tikv_cfg .rocksdb .build_opt(&resource, tikv_cfg.storage.engine); @@ -4927,7 +5400,9 @@ mod tests { Arc, ) { assert_eq!(F::TAG, cfg.storage.api_version()); - let resource = cfg.rocksdb.build_resources(Arc::default()); + let resource = cfg + .rocksdb + .build_resources(Arc::default(), cfg.storage.engine); let engine = RocksDBEngine::new( &cfg.storage.data_dir, Some(cfg.rocksdb.build_opt(&resource, cfg.storage.engine)), @@ -4958,7 +5433,11 @@ mod tests { let cfg_controller = ConfigController::new(cfg); cfg_controller.register( Module::Rocksdb, - Box::new(DbConfigManger::new(engine.clone(), DbType::Kv)), + Box::new(DbConfigManger::new( + cfg_controller.get_current().rocksdb, + engine.clone(), + DbType::Kv, + )), ); let (scheduler, receiver) = dummy_scheduler(); cfg_controller.register( @@ -5097,6 +5576,7 @@ mod tests { cfg.rocksdb.defaultcf.block_cache_size = Some(ReadableSize::mb(8)); cfg.rocksdb.rate_bytes_per_sec = ReadableSize::mb(64); cfg.rocksdb.rate_limiter_auto_tuned = false; + cfg.rocksdb.lockcf.write_buffer_limit = Some(ReadableSize::mb(1)); cfg.validate().unwrap(); let (storage, cfg_controller, ..) 
= new_engines::(cfg); let db = storage.get_engine().get_rocksdb(); @@ -5108,6 +5588,7 @@ mod tests { .update_config("rocksdb.max-background-jobs", "8") .unwrap(); assert_eq!(db.get_db_options().get_max_background_jobs(), 8); + assert_eq!(db.get_db_options().get_max_background_compactions(), 6); // update max_background_flushes, set to a bigger value assert_eq!(db.get_db_options().get_max_background_flushes(), 2); @@ -5116,6 +5597,7 @@ mod tests { .update_config("rocksdb.max-background-flushes", "5") .unwrap(); assert_eq!(db.get_db_options().get_max_background_flushes(), 5); + assert_eq!(db.get_db_options().get_max_background_compactions(), 3); // update rate_bytes_per_sec assert_eq!( @@ -5131,6 +5613,40 @@ mod tests { ReadableSize::mb(128).0 as i64 ); + cfg_controller + .update_config("rocksdb.write-buffer-limit", "10MB") + .unwrap(); + let flush_size = db.get_db_options().get_flush_size().unwrap(); + assert_eq!(flush_size, ReadableSize::mb(10).0); + + cfg_controller + .update_config("rocksdb.lockcf.write-buffer-limit", "22MB") + .unwrap(); + let cf_opt = db.get_options_cf("lock").unwrap(); + let flush_size = cf_opt.get_flush_size().unwrap(); + assert_eq!(flush_size, ReadableSize::mb(22).0); + + cfg_controller + .update_config("rocksdb.lockcf.write-buffer-size", "102MB") + .unwrap(); + let cf_opt = db.get_options_cf("lock").unwrap(); + let bsize = cf_opt.get_write_buffer_size(); + assert_eq!(bsize, ReadableSize::mb(102).0); + + cfg_controller + .update_config("rocksdb.writecf.write-buffer-size", "102MB") + .unwrap(); + let cf_opt = db.get_options_cf("write").unwrap(); + let bsize = cf_opt.get_write_buffer_size(); + assert_eq!(bsize, ReadableSize::mb(102).0); + + cfg_controller + .update_config("rocksdb.defaultcf.write-buffer-size", "102MB") + .unwrap(); + let cf_opt = db.get_options_cf("default").unwrap(); + let bsize = cf_opt.get_write_buffer_size(); + assert_eq!(bsize, ReadableSize::mb(102).0); + // update some configs on default cf let cf_opts = 
db.get_options_cf(CF_DEFAULT).unwrap(); assert_eq!(cf_opts.get_disable_auto_compactions(), false); @@ -5200,7 +5716,7 @@ mod tests { } #[test] - fn test_change_logconfig() { + fn test_change_log_config() { let (cfg, _dir) = TikvConfig::with_tmp().unwrap(); let cfg_controller = ConfigController::new(cfg); @@ -5222,6 +5738,37 @@ mod tests { ); } + #[test] + #[cfg(feature = "mem-profiling")] + fn test_change_memory_config() { + let (cfg, _dir) = TikvConfig::with_tmp().unwrap(); + let cfg_controller = ConfigController::new(cfg); + + cfg_controller.register(Module::Memory, Box::new(MemoryConfigManager)); + cfg_controller + .update_config("memory.enable_heap_profiling", "false") + .unwrap(); + assert_eq!(tikv_alloc::is_profiling_active(), false); + cfg_controller + .update_config("memory.enable_heap_profiling", "true") + .unwrap(); + assert_eq!(tikv_alloc::is_profiling_active(), true); + + cfg_controller + .update_config("memory.profiling_sample_per_bytes", "1MB") + .unwrap(); + assert_eq!( + cfg_controller + .get_current() + .memory + .profiling_sample_per_bytes, + ReadableSize::mb(1), + ); + cfg_controller + .update_config("memory.profiling_sample_per_bytes", "invalid") + .unwrap_err(); + } + #[test] fn test_dispatch_titan_blob_run_mode_config() { let mut cfg = TikvConfig::default(); @@ -5239,7 +5786,28 @@ mod tests { let diff = config_value_to_string(diff.into_iter().collect()); assert_eq!(diff.len(), 1); assert_eq!(diff[0].0.as_str(), "blob_run_mode"); - assert_eq!(diff[0].1.as_str(), "fallback"); + assert_eq!(diff[0].1.as_str(), "kFallback"); + } + + #[test] + fn test_update_titan_blob_run_mode_config() { + let mut cfg = TikvConfig::default(); + cfg.rocksdb.titan.enabled = true; + let (_, cfg_controller, ..) 
= new_engines::(cfg); + for run_mode in [ + "kFallback", + "kNormal", + "kReadOnly", + "fallback", + "normal", + "read-only", + ] { + let change = HashMap::from([( + "rocksdb.defaultcf.titan.blob-run-mode".to_string(), + run_mode.to_string(), + )]); + cfg_controller.update_without_persist(change).unwrap(); + } } #[test] @@ -5775,61 +6343,95 @@ mod tests { #[test] fn test_background_job_limits() { - // cpu num = 1 + for engine in [EngineType::RaftKv, EngineType::RaftKv2] { + // cpu num = 1 + assert_eq!( + get_background_job_limits_impl( + engine, + 1, // cpu_num + &KVDB_DEFAULT_BACKGROUND_JOB_LIMITS + ), + BackgroundJobLimits { + max_background_jobs: 2, + max_background_flushes: 1, + max_sub_compactions: 1, + max_titan_background_gc: 1, + } + ); + assert_eq!( + get_background_job_limits_impl( + engine, + 1, // cpu_num + &RAFTDB_DEFAULT_BACKGROUND_JOB_LIMITS + ), + BackgroundJobLimits { + max_background_jobs: 2, + max_background_flushes: 1, + max_sub_compactions: 1, + max_titan_background_gc: 1, + } + ); + // cpu num = 2 + assert_eq!( + get_background_job_limits_impl( + EngineType::RaftKv, + 2, // cpu_num + &KVDB_DEFAULT_BACKGROUND_JOB_LIMITS + ), + BackgroundJobLimits { + max_background_jobs: 2, + max_background_flushes: 1, + max_sub_compactions: 1, + max_titan_background_gc: 2, + } + ); + assert_eq!( + get_background_job_limits_impl( + EngineType::RaftKv, + 2, // cpu_num + &RAFTDB_DEFAULT_BACKGROUND_JOB_LIMITS + ), + BackgroundJobLimits { + max_background_jobs: 2, + max_background_flushes: 1, + max_sub_compactions: 1, + max_titan_background_gc: 2, + } + ); + } + + // cpu num = 4 assert_eq!( get_background_job_limits_impl( - 1, // cpu_num + EngineType::RaftKv, + 4, // cpu_num &KVDB_DEFAULT_BACKGROUND_JOB_LIMITS ), BackgroundJobLimits { - max_background_jobs: 2, - max_background_flushes: 1, - max_sub_compactions: 1, - max_titan_background_gc: 1, - } - ); - assert_eq!( - get_background_job_limits_impl( - 1, // cpu_num - &RAFTDB_DEFAULT_BACKGROUND_JOB_LIMITS - ), - 
BackgroundJobLimits { - max_background_jobs: 2, + max_background_jobs: 3, max_background_flushes: 1, max_sub_compactions: 1, - max_titan_background_gc: 1, + max_titan_background_gc: 4, } ); - // cpu num = 2 assert_eq!( get_background_job_limits_impl( - 2, // cpu_num + EngineType::RaftKv2, + 4, // cpu_num &KVDB_DEFAULT_BACKGROUND_JOB_LIMITS ), BackgroundJobLimits { max_background_jobs: 2, max_background_flushes: 1, max_sub_compactions: 1, - max_titan_background_gc: 2, - } - ); - assert_eq!( - get_background_job_limits_impl( - 2, // cpu_num - &RAFTDB_DEFAULT_BACKGROUND_JOB_LIMITS - ), - BackgroundJobLimits { - max_background_jobs: 2, - max_background_flushes: 1, - max_sub_compactions: 1, - max_titan_background_gc: 2, + max_titan_background_gc: 4, } ); - // cpu num = 4 assert_eq!( get_background_job_limits_impl( + EngineType::RaftKv, 4, // cpu_num - &KVDB_DEFAULT_BACKGROUND_JOB_LIMITS + &RAFTDB_DEFAULT_BACKGROUND_JOB_LIMITS ), BackgroundJobLimits { max_background_jobs: 3, @@ -5838,33 +6440,36 @@ mod tests { max_titan_background_gc: 4, } ); + // cpu num = 8 assert_eq!( get_background_job_limits_impl( - 4, // cpu_num - &RAFTDB_DEFAULT_BACKGROUND_JOB_LIMITS + EngineType::RaftKv, + 8, // cpu_num + &KVDB_DEFAULT_BACKGROUND_JOB_LIMITS ), BackgroundJobLimits { - max_background_jobs: 3, - max_background_flushes: 1, - max_sub_compactions: 1, + max_background_jobs: 7, + max_background_flushes: 2, + max_sub_compactions: 3, max_titan_background_gc: 4, } ); - // cpu num = 8 assert_eq!( get_background_job_limits_impl( + EngineType::RaftKv2, 8, // cpu_num &KVDB_DEFAULT_BACKGROUND_JOB_LIMITS ), BackgroundJobLimits { - max_background_jobs: 7, + max_background_jobs: 4, max_background_flushes: 2, - max_sub_compactions: 3, + max_sub_compactions: 1, max_titan_background_gc: 4, } ); assert_eq!( get_background_job_limits_impl( + EngineType::RaftKv, 8, // cpu_num &RAFTDB_DEFAULT_BACKGROUND_JOB_LIMITS ), @@ -5873,6 +6478,7 @@ mod tests { // cpu num = 16 assert_eq!( 
get_background_job_limits_impl( + EngineType::RaftKv, 16, // cpu_num &KVDB_DEFAULT_BACKGROUND_JOB_LIMITS ), @@ -5880,6 +6486,20 @@ mod tests { ); assert_eq!( get_background_job_limits_impl( + EngineType::RaftKv2, + 16, // cpu_num + &KVDB_DEFAULT_BACKGROUND_JOB_LIMITS + ), + BackgroundJobLimits { + max_background_jobs: 6, + max_background_flushes: 3, + max_sub_compactions: 2, + max_titan_background_gc: 4, + } + ); + assert_eq!( + get_background_job_limits_impl( + EngineType::RaftKv, 16, // cpu_num &RAFTDB_DEFAULT_BACKGROUND_JOB_LIMITS ), @@ -5966,6 +6586,9 @@ mod tests { default_cfg .server .optimize_for(default_cfg.coprocessor.region_split_size()); + default_cfg + .raft_store + .optimize_for(default_cfg.storage.engine == EngineType::RaftKv2); default_cfg.security.redact_info_log = Some(false); default_cfg.coprocessor.region_max_size = Some(default_cfg.coprocessor.region_max_size()); default_cfg.coprocessor.region_max_keys = Some(default_cfg.coprocessor.region_max_keys()); @@ -6067,12 +6690,12 @@ mod tests { assert_eq!(default_cfg.coprocessor.region_split_size(), SPLIT_SIZE); assert!(!default_cfg.coprocessor.enable_region_bucket()); - assert_eq!(default_cfg.split.qps_threshold, DEFAULT_QPS_THRESHOLD); + assert_eq!(default_cfg.split.qps_threshold(), DEFAULT_QPS_THRESHOLD); assert_eq!( - default_cfg.split.region_cpu_overload_threshold_ratio, + default_cfg.split.region_cpu_overload_threshold_ratio(), REGION_CPU_OVERLOAD_THRESHOLD_RATIO ); - assert_eq!(default_cfg.split.byte_threshold, DEFAULT_BYTE_THRESHOLD); + assert_eq!(default_cfg.split.byte_threshold(), DEFAULT_BYTE_THRESHOLD); let mut default_cfg = TikvConfig::default(); default_cfg.storage.engine = EngineType::RaftKv2; @@ -6082,15 +6705,15 @@ mod tests { RAFTSTORE_V2_SPLIT_SIZE ); assert_eq!( - default_cfg.split.qps_threshold, + default_cfg.split.qps_threshold(), DEFAULT_BIG_REGION_QPS_THRESHOLD ); assert_eq!( - default_cfg.split.region_cpu_overload_threshold_ratio, + 
default_cfg.split.region_cpu_overload_threshold_ratio(), BIG_REGION_CPU_OVERLOAD_THRESHOLD_RATIO ); assert_eq!( - default_cfg.split.byte_threshold, + default_cfg.split.byte_threshold(), DEFAULT_BIG_REGION_BYTE_THRESHOLD ); assert!(default_cfg.coprocessor.enable_region_bucket()); @@ -6098,21 +6721,25 @@ mod tests { let mut default_cfg = TikvConfig::default(); default_cfg.coprocessor.region_split_size = Some(ReadableSize::mb(500)); default_cfg.coprocessor.optimize_for(false); - default_cfg.coprocessor.validate().unwrap(); + default_cfg.coprocessor.validate(false).unwrap(); assert_eq!( default_cfg.coprocessor.region_split_size(), ReadableSize::mb(500) ); + assert!(!default_cfg.coprocessor.enable_region_bucket()); + default_cfg.coprocessor.validate(true).unwrap(); assert!(default_cfg.coprocessor.enable_region_bucket()); let mut default_cfg = TikvConfig::default(); default_cfg.coprocessor.region_split_size = Some(ReadableSize::mb(500)); default_cfg.coprocessor.optimize_for(true); - default_cfg.coprocessor.validate().unwrap(); + default_cfg.coprocessor.validate(false).unwrap(); assert_eq!( default_cfg.coprocessor.region_split_size(), ReadableSize::mb(500) ); + assert!(!default_cfg.coprocessor.enable_region_bucket()); + default_cfg.coprocessor.validate(true).unwrap(); assert!(default_cfg.coprocessor.enable_region_bucket()); } @@ -6187,6 +6814,15 @@ mod tests { let mut cfg: TikvConfig = toml::from_str(content).unwrap(); cfg.validate().unwrap(); + let content = r#" + [cdc] + incremental-scan-concurrency = 6 + incremental-scan-concurrency-limit = 0 + "#; + let mut cfg: TikvConfig = toml::from_str(content).unwrap(); + cfg.validate().unwrap(); + assert!(cfg.cdc.incremental_scan_concurrency_limit >= cfg.cdc.incremental_scan_concurrency); + let content = r#" [storage] engine = "partitioned-raft-kv" @@ -6425,4 +7061,67 @@ mod tests { Some(ReadableSize::gb(1)) ); } + + #[test] + fn test_compact_check_default() { + let content = r#" + [raftstore] + region-compact-check-step = 50 + 
"#; + let mut cfg: TikvConfig = toml::from_str(content).unwrap(); + cfg.validate().unwrap(); + assert_eq!(cfg.raft_store.region_compact_check_step.unwrap(), 50); + assert_eq!( + cfg.raft_store + .region_compact_redundant_rows_percent + .unwrap(), + 20 + ); + + let content = r#" + [raftstore] + region-compact-check-step = 50 + [storage] + engine = "partitioned-raft-kv" + "#; + let mut cfg: TikvConfig = toml::from_str(content).unwrap(); + cfg.validate().unwrap(); + assert_eq!(cfg.raft_store.region_compact_check_step.unwrap(), 50); + assert_eq!( + cfg.raft_store + .region_compact_redundant_rows_percent + .unwrap(), + 20 + ); + + let content = r#" + [raftstore] + region-compact-redundant-rows-percent = 50 + "#; + let mut cfg: TikvConfig = toml::from_str(content).unwrap(); + cfg.validate().unwrap(); + assert_eq!(cfg.raft_store.region_compact_check_step.unwrap(), 100); + assert_eq!( + cfg.raft_store + .region_compact_redundant_rows_percent + .unwrap(), + 50 + ); + + let content = r#" + [raftstore] + region-compact-redundant-rows-percent = 50 + [storage] + engine = "partitioned-raft-kv" + "#; + let mut cfg: TikvConfig = toml::from_str(content).unwrap(); + cfg.validate().unwrap(); + assert_eq!(cfg.raft_store.region_compact_check_step.unwrap(), 5); + assert_eq!( + cfg.raft_store + .region_compact_redundant_rows_percent + .unwrap(), + 50 + ); + } } diff --git a/src/coprocessor/dag/mod.rs b/src/coprocessor/dag/mod.rs index 31a6df181d5..bd077c5c0ba 100644 --- a/src/coprocessor/dag/mod.rs +++ b/src/coprocessor/dag/mod.rs @@ -143,7 +143,9 @@ fn handle_qe_response( can_be_cached: bool, data_version: Option, ) -> Result { - use tidb_query_common::error::ErrorInner; + use tidb_query_common::error::{ErrorInner, EvaluateError}; + + use crate::coprocessor::Error; match result { Ok((sel_resp, range)) => { @@ -162,6 +164,7 @@ fn handle_qe_response( } Err(err) => match *err.0 { ErrorInner::Storage(err) => Err(err.into()), + ErrorInner::Evaluate(EvaluateError::DeadlineExceeded) => 
Err(Error::DeadlineExceeded), ErrorInner::Evaluate(err) => { let mut resp = Response::default(); let mut sel_resp = SelectResponse::default(); @@ -179,7 +182,9 @@ fn handle_qe_response( fn handle_qe_stream_response( result: tidb_query_common::Result<(Option<(StreamResponse, IntervalRange)>, bool)>, ) -> Result<(Option, bool)> { - use tidb_query_common::error::ErrorInner; + use tidb_query_common::error::{ErrorInner, EvaluateError}; + + use crate::coprocessor::Error; match result { Ok((Some((s_resp, range)), finished)) => { @@ -192,6 +197,7 @@ fn handle_qe_stream_response( Ok((None, finished)) => Ok((None, finished)), Err(err) => match *err.0 { ErrorInner::Storage(err) => Err(err.into()), + ErrorInner::Evaluate(EvaluateError::DeadlineExceeded) => Err(Error::DeadlineExceeded), ErrorInner::Evaluate(err) => { let mut resp = Response::default(); let mut s_resp = StreamResponse::default(); @@ -203,3 +209,43 @@ fn handle_qe_stream_response( }, } } + +#[cfg(test)] +mod tests { + use anyhow::anyhow; + use protobuf::Message; + use tidb_query_common::error::{Error as CommonError, EvaluateError, StorageError}; + + use super::*; + use crate::coprocessor::Error; + + #[test] + fn test_handle_qe_response() { + // Ok Response + let ok_res = Ok((SelectResponse::default(), None)); + let res = handle_qe_response(ok_res, true, Some(1)).unwrap(); + assert!(res.can_be_cached); + assert_eq!(res.get_cache_last_version(), 1); + let mut select_res = SelectResponse::new(); + Message::merge_from_bytes(&mut select_res, res.get_data()).unwrap(); + assert!(!select_res.has_error()); + + // Storage Error + let storage_err = CommonError::from(StorageError(anyhow!("unknown"))); + let res = handle_qe_response(Err(storage_err), false, None); + assert!(matches!(res, Err(Error::Other(_)))); + + // Evaluate Error + let err = CommonError::from(EvaluateError::DeadlineExceeded); + let res = handle_qe_response(Err(err), false, None); + assert!(matches!(res, Err(Error::DeadlineExceeded))); + + let err = 
CommonError::from(EvaluateError::InvalidCharacterString { + charset: "test".into(), + }); + let res = handle_qe_response(Err(err), false, None).unwrap(); + let mut select_res = SelectResponse::new(); + Message::merge_from_bytes(&mut select_res, res.get_data()).unwrap(); + assert_eq!(select_res.get_error().get_code(), 1300); + } +} diff --git a/src/coprocessor/endpoint.rs b/src/coprocessor/endpoint.rs index 7a12c7493e5..001d1e94ca0 100644 --- a/src/coprocessor/endpoint.rs +++ b/src/coprocessor/endpoint.rs @@ -19,7 +19,9 @@ use resource_metering::{FutureExt, ResourceTagFactory, StreamExt}; use tidb_query_common::execute_stats::ExecSummary; use tikv_alloc::trace::MemoryTraceGuard; use tikv_kv::SnapshotExt; -use tikv_util::{quota_limiter::QuotaLimiter, time::Instant}; +use tikv_util::{ + deadline::set_deadline_exceeded_busy_error, quota_limiter::QuotaLimiter, time::Instant, +}; use tipb::{AnalyzeReq, AnalyzeType, ChecksumRequest, ChecksumScanOn, DagRequest, ExecType}; use tokio::sync::Semaphore; use txn_types::Lock; @@ -511,6 +513,10 @@ impl Endpoint { .get_resource_control_context() .get_resource_group_name(), req_ctx.context.get_request_source(), + req_ctx + .context + .get_resource_control_context() + .get_override_priority(), ) }); // box the tracker so that moving it is cheap. 
@@ -544,8 +550,9 @@ impl Endpoint { if let Err(busy_err) = self.read_pool.check_busy_threshold(Duration::from_millis( req.get_context().get_busy_threshold_ms() as u64, )) { - let mut resp = coppb::Response::default(); - resp.mut_region_error().set_server_is_busy(busy_err); + let mut pb_error = errorpb::Error::new(); + pb_error.set_server_is_busy(busy_err); + let resp = make_error_response(Error::Region(pb_error)); return Either::Left(async move { resp.into() }); } @@ -756,6 +763,10 @@ impl Endpoint { .get_resource_control_context() .get_resource_group_name(), req_ctx.context.get_request_source(), + req_ctx + .context + .get_resource_control_context() + .get_override_priority(), ) }); let key_ranges = req_ctx @@ -810,77 +821,62 @@ impl Endpoint { } } +macro_rules! make_error_response_common { + ($resp:expr, $tag:expr, $e:expr) => {{ + match $e { + Error::Region(e) => { + $tag = storage::get_tag_from_header(&e); + $resp.set_region_error(e); + } + Error::Locked(info) => { + $tag = "meet_lock"; + $resp.set_locked(info); + } + Error::DeadlineExceeded => { + $tag = "deadline_exceeded"; + let mut err = errorpb::Error::default(); + set_deadline_exceeded_busy_error(&mut err); + err.set_message($e.to_string()); + $resp.set_region_error(err); + } + Error::MaxPendingTasksExceeded => { + $tag = "max_pending_tasks_exceeded"; + let mut server_is_busy_err = errorpb::ServerIsBusy::default(); + server_is_busy_err.set_reason($e.to_string()); + let mut errorpb = errorpb::Error::default(); + errorpb.set_message($e.to_string()); + errorpb.set_server_is_busy(server_is_busy_err); + $resp.set_region_error(errorpb); + } + Error::Other(_) => { + $tag = "other"; + warn!("unexpected other error encountered processing coprocessor task"; + "error" => ?&$e, + ); + $resp.set_other_error($e.to_string()); + } + }; + COPR_REQ_ERROR.with_label_values(&[$tag]).inc(); + }}; +} + fn make_error_batch_response(batch_resp: &mut coppb::StoreBatchTaskResponse, e: Error) { - warn!( + debug!( "batch cop task 
error-response"; "err" => %e ); let tag; - match e { - Error::Region(e) => { - tag = storage::get_tag_from_header(&e); - batch_resp.set_region_error(e); - } - Error::Locked(info) => { - tag = "meet_lock"; - batch_resp.set_locked(info); - } - Error::DeadlineExceeded => { - tag = "deadline_exceeded"; - batch_resp.set_other_error(e.to_string()); - } - Error::MaxPendingTasksExceeded => { - tag = "max_pending_tasks_exceeded"; - let mut server_is_busy_err = errorpb::ServerIsBusy::default(); - server_is_busy_err.set_reason(e.to_string()); - let mut errorpb = errorpb::Error::default(); - errorpb.set_message(e.to_string()); - errorpb.set_server_is_busy(server_is_busy_err); - batch_resp.set_region_error(errorpb); - } - Error::Other(_) => { - tag = "other"; - batch_resp.set_other_error(e.to_string()); - } - }; - COPR_REQ_ERROR.with_label_values(&[tag]).inc(); + make_error_response_common!(batch_resp, tag, e); } fn make_error_response(e: Error) -> coppb::Response { - warn!( + debug!( "error-response"; "err" => %e ); - let mut resp = coppb::Response::default(); let tag; - match e { - Error::Region(e) => { - tag = storage::get_tag_from_header(&e); - resp.set_region_error(e); - } - Error::Locked(info) => { - tag = "meet_lock"; - resp.set_locked(info); - } - Error::DeadlineExceeded => { - tag = "deadline_exceeded"; - resp.set_other_error(e.to_string()); - } - Error::MaxPendingTasksExceeded => { - tag = "max_pending_tasks_exceeded"; - let mut server_is_busy_err = errorpb::ServerIsBusy::default(); - server_is_busy_err.set_reason(e.to_string()); - let mut errorpb = errorpb::Error::default(); - errorpb.set_message(e.to_string()); - errorpb.set_server_is_busy(server_is_busy_err); - resp.set_region_error(errorpb); - } - Error::Other(_) => { - tag = "other"; - resp.set_other_error(e.to_string()); - } - }; - COPR_REQ_ERROR.with_label_values(&[tag]).inc(); + let mut resp = coppb::Response::default(); + make_error_response_common!(resp, tag, e); resp } @@ -1945,7 +1941,11 @@ mod tests { let 
resp = block_on(copr.handle_unary_request(config, handler_builder)).unwrap(); assert_eq!(resp.get_data().len(), 0); - assert!(!resp.get_other_error().is_empty()); + let region_err = resp.get_region_error(); + assert_eq!( + region_err.get_server_is_busy().reason, + "deadline is exceeded".to_string() + ); } { @@ -1962,7 +1962,11 @@ mod tests { let resp = block_on(copr.handle_unary_request(config, handler_builder)).unwrap(); assert_eq!(resp.get_data().len(), 0); - assert!(!resp.get_other_error().is_empty()); + let region_err = resp.get_region_error(); + assert_eq!( + region_err.get_server_is_busy().reason, + "deadline is exceeded".to_string() + ); } } @@ -2014,4 +2018,18 @@ mod tests { let resp = block_on(copr.parse_and_handle_unary_request(req, None)); assert_eq!(resp.get_locked().get_key(), b"key"); } + + #[test] + fn test_make_error_response() { + let resp = make_error_response(Error::DeadlineExceeded); + let region_err = resp.get_region_error(); + assert_eq!( + region_err.get_server_is_busy().reason, + "deadline is exceeded".to_string() + ); + assert_eq!( + region_err.get_message(), + "Coprocessor task terminated due to exceeding the deadline" + ); + } } diff --git a/src/coprocessor/metrics.rs b/src/coprocessor/metrics.rs index 64905b3dfba..02f45d35311 100644 --- a/src/coprocessor/metrics.rs +++ b/src/coprocessor/metrics.rs @@ -208,6 +208,12 @@ impl CopLocalMetrics { pub fn local_read_stats(&self) -> &ReadStats { &self.local_read_stats } + + #[cfg(test)] + pub fn clear(&mut self) { + self.local_read_stats.region_infos.clear(); + self.local_read_stats.region_buckets.clear(); + } } thread_local! 
{ diff --git a/src/coprocessor/mod.rs b/src/coprocessor/mod.rs index 140d3c0476e..fcd16f9b947 100644 --- a/src/coprocessor/mod.rs +++ b/src/coprocessor/mod.rs @@ -159,7 +159,11 @@ impl ReqContext { cache_match_version: Option, perf_level: PerfLevel, ) -> Self { - let deadline = Deadline::from_now(max_handle_duration); + let mut deadline_duration = max_handle_duration; + if context.max_execution_duration_ms > 0 { + deadline_duration = Duration::from_millis(context.max_execution_duration_ms); + } + let deadline = Deadline::from_now(deadline_duration); let bypass_locks = TsSet::from_u64s(context.take_resolved_locks()); let access_locks = TsSet::from_u64s(context.take_committed_locks()); let lower_bound = match ranges.first().as_ref() { @@ -235,6 +239,23 @@ lazy_static! { mod tests { use super::*; + fn default_req_ctx_with_ctx_duration( + context: kvrpcpb::Context, + max_handle_duration: Duration, + ) -> ReqContext { + ReqContext::new( + ReqTag::test, + context, + Vec::new(), + max_handle_duration, + None, + None, + TimeStamp::max(), + None, + PerfLevel::EnableCount, + ) + } + #[test] fn test_build_task_id() { let mut ctx = ReqContext::default_for_test(); @@ -246,4 +267,27 @@ mod tests { ctx.context.set_task_id(0); assert_eq!(ctx.build_task_id(), start_ts); } + + #[test] + fn test_deadline_from_req_ctx() { + let ctx = kvrpcpb::Context::default(); + let max_handle_duration = Duration::from_millis(100); + let req_ctx = default_req_ctx_with_ctx_duration(ctx, max_handle_duration); + // sleep at least 100ms + std::thread::sleep(Duration::from_millis(200)); + req_ctx + .deadline + .check() + .expect_err("deadline should exceed"); + + let mut ctx = kvrpcpb::Context::default(); + ctx.max_execution_duration_ms = 100_000; + let req_ctx = default_req_ctx_with_ctx_duration(ctx, max_handle_duration); + // sleep at least 100ms + std::thread::sleep(Duration::from_millis(200)); + req_ctx + .deadline + .check() + .expect("deadline should not exceed"); + } } diff --git 
a/src/coprocessor/tracker.rs b/src/coprocessor/tracker.rs index 18eaa0b6e98..cacf69d2c61 100644 --- a/src/coprocessor/tracker.rs +++ b/src/coprocessor/tracker.rs @@ -6,8 +6,10 @@ use ::tracker::{get_tls_tracker_token, with_tls_tracker}; use engine_traits::{PerfContext, PerfContextExt, PerfContextKind}; use kvproto::{kvrpcpb, kvrpcpb::ScanDetailV2}; use pd_client::BucketMeta; +use protobuf::Message; use tikv_kv::Engine; use tikv_util::time::{self, Duration, Instant}; +use tipb::ResourceGroupTag; use txn_types::Key; use super::metrics::*; @@ -264,8 +266,16 @@ impl Tracker { .unwrap_or_default() }); + let source_stmt = self.req_ctx.context.get_source_stmt(); with_tls_tracker(|tracker| { + let mut req_tag = ResourceGroupTag::new(); + req_tag + .merge_from_bytes(&tracker.req_info.resource_group_tag) + .unwrap_or_default(); info!(#"slow_log", "slow-query"; + "connection_id" => source_stmt.get_connection_id(), + "session_alias" => source_stmt.get_session_alias(), + "query_digest" => hex::encode(req_tag.get_sql_digest()), "region_id" => &self.req_ctx.context.get_region_id(), "remote_host" => &self.req_ctx.peer, "total_lifetime" => ?self.req_lifetime, @@ -350,20 +360,24 @@ impl Tracker { false }; - tls_collect_query( - region_id, - peer, - start_key.as_encoded(), - end_key.as_encoded(), - reverse_scan, - ); - tls_collect_read_flow( - self.req_ctx.context.get_region_id(), - Some(start_key.as_encoded()), - Some(end_key.as_encoded()), - &total_storage_stats, - self.buckets.as_ref(), - ); + // only collect metrics for select and index, exclude transient read flow such + // like analyze and checksum. 
+ if self.req_ctx.tag == ReqTag::select || self.req_ctx.tag == ReqTag::index { + tls_collect_query( + region_id, + peer, + start_key.as_encoded(), + end_key.as_encoded(), + reverse_scan, + ); + tls_collect_read_flow( + self.req_ctx.context.get_region_id(), + Some(start_key.as_encoded()), + Some(end_key.as_encoded()), + &total_storage_stats, + self.buckets.as_ref(), + ); + } self.current_stage = TrackerState::Tracked; } @@ -427,6 +441,36 @@ impl Drop for Tracker { if let TrackerState::ItemFinished(_) = self.current_stage { self.on_finish_all_items(); } + + if self.current_stage != TrackerState::AllItemFinished + && self.req_ctx.deadline.check().is_err() + { + // record deadline exceeded error log. + let total_lifetime = self.request_begin_at.saturating_elapsed(); + let source_stmt = self.req_ctx.context.get_source_stmt(); + let first_range = self.req_ctx.ranges.first(); + let some_table_id = first_range.as_ref().map(|range| { + tidb_query_datatype::codec::table::decode_table_id(range.get_start()) + .unwrap_or_default() + }); + warn!("query deadline exceeded"; + "current_stage" => ?self.current_stage, + "connection_id" => source_stmt.get_connection_id(), + "session_alias" => source_stmt.get_session_alias(), + "region_id" => &self.req_ctx.context.get_region_id(), + "remote_host" => &self.req_ctx.peer, + "total_lifetime" => ?total_lifetime, + "wait_time" => ?self.wait_time, + "wait_time.schedule" => ?self.schedule_wait_time, + "wait_time.snapshot" => ?self.snapshot_wait_time, + "handler_build_time" => ?self.handler_build_time, + "total_process_time" => ?self.total_process_time, + "total_suspend_time" => ?self.total_suspend_time, + "txn_start_ts" => self.req_ctx.txn_start_ts, + "table_id" => some_table_id, + "tag" => self.req_ctx.tag.get_str(), + ); + } } } @@ -443,69 +487,86 @@ mod tests { #[test] fn test_track() { - let mut context = kvrpcpb::Context::default(); - context.set_region_id(1); - - let mut req_ctx = ReqContext::new( - ReqTag::test, - context, - vec![], - 
Duration::from_secs(0), - None, - None, - TimeStamp::max(), - None, - PerfLevel::EnableCount, - ); - req_ctx.lower_bound = vec![ - 116, 128, 0, 0, 0, 0, 0, 0, 184, 95, 114, 128, 0, 0, 0, 0, 0, 70, 67, - ]; - req_ctx.upper_bound = vec![ - 116, 128, 0, 0, 0, 0, 0, 0, 184, 95, 114, 128, 0, 0, 0, 0, 0, 70, 167, - ]; - let mut track: Tracker = Tracker::new(req_ctx, Duration::default()); - let mut bucket = BucketMeta::default(); - bucket.region_id = 1; - bucket.version = 1; - bucket.keys = vec![ - vec![ - 116, 128, 0, 0, 0, 0, 0, 0, 255, 179, 95, 114, 128, 0, 0, 0, 0, 255, 0, 175, 155, - 0, 0, 0, 0, 0, 250, - ], - vec![ - 116, 128, 0, 255, 255, 255, 255, 255, 255, 254, 0, 0, 0, 0, 0, 0, 0, 248, - ], - ]; - bucket.sizes = vec![10]; - track.buckets = Some(Arc::new(bucket)); - - let mut stat = Statistics::default(); - stat.write.flow_stats.read_keys = 10; - track.total_storage_stats = stat; - - track.track(); - drop(track); - TLS_COP_METRICS.with(|m| { - assert_eq!( - 10, - m.borrow() - .local_read_stats() - .region_infos - .get(&1) - .unwrap() - .flow - .read_keys - ); - assert_eq!( - vec![10], - m.borrow() - .local_read_stats() - .region_buckets - .get(&1) - .unwrap() - .stats - .read_keys + let check = move |tag: ReqTag, flow: u64| { + let mut context = kvrpcpb::Context::default(); + context.set_region_id(1); + let mut req_ctx = ReqContext::new( + tag, + context, + vec![], + Duration::from_secs(0), + None, + None, + TimeStamp::max(), + None, + PerfLevel::EnableCount, ); - }); + + req_ctx.lower_bound = vec![ + 116, 128, 0, 0, 0, 0, 0, 0, 184, 95, 114, 128, 0, 0, 0, 0, 0, 70, 67, + ]; + req_ctx.upper_bound = vec![ + 116, 128, 0, 0, 0, 0, 0, 0, 184, 95, 114, 128, 0, 0, 0, 0, 0, 70, 167, + ]; + let mut track: Tracker = Tracker::new(req_ctx, Duration::default()); + let mut bucket = BucketMeta::default(); + bucket.region_id = 1; + bucket.version = 1; + bucket.keys = vec![ + vec![ + 116, 128, 0, 0, 0, 0, 0, 0, 255, 179, 95, 114, 128, 0, 0, 0, 0, 255, 0, 175, + 155, 0, 0, 0, 0, 
0, 250, + ], + vec![ + 116, 128, 0, 255, 255, 255, 255, 255, 255, 254, 0, 0, 0, 0, 0, 0, 0, 248, + ], + ]; + bucket.sizes = vec![10]; + track.buckets = Some(Arc::new(bucket)); + + let mut stat = Statistics::default(); + stat.write.flow_stats.read_keys = 10; + track.total_storage_stats = stat; + + track.track(); + drop(track); + TLS_COP_METRICS.with(|m| { + if flow > 0 { + assert_eq!( + flow as usize, + m.borrow() + .local_read_stats() + .region_infos + .get(&1) + .unwrap() + .flow + .read_keys + ); + assert_eq!( + flow, + m.borrow() + .local_read_stats() + .region_buckets + .get(&1) + .unwrap() + .stats + .read_keys[0] + ); + } else { + assert!(m.borrow().local_read_stats().region_infos.get(&1).is_none()); + assert!( + m.borrow() + .local_read_stats() + .region_buckets + .get(&1) + .is_none() + ); + } + + m.borrow_mut().clear(); + }); + }; + check(ReqTag::select, 10); + check(ReqTag::analyze_full_sampling, 0); } } diff --git a/src/import/sst_service.rs b/src/import/sst_service.rs index 0c81873c130..d5b5c7c4103 100644 --- a/src/import/sst_service.rs +++ b/src/import/sst_service.rs @@ -5,7 +5,10 @@ use std::{ convert::identity, future::Future, path::PathBuf, - sync::{Arc, Mutex}, + sync::{ + atomic::{AtomicU64, Ordering}, + Arc, Mutex, + }, time::Duration, }; @@ -20,9 +23,16 @@ use kvproto::{ errorpb, import_sstpb::{ Error as ImportPbError, ImportSst, Range, RawWriteRequest_oneof_chunk as RawChunk, SstMeta, - SwitchMode, WriteRequest_oneof_chunk as Chunk, *, + SuspendImportRpcRequest, SuspendImportRpcResponse, SwitchMode, + WriteRequest_oneof_chunk as Chunk, *, }, kvrpcpb::Context, + metapb::RegionEpoch, +}; +use raftstore::{ + coprocessor::{RegionInfo, RegionInfoProvider}, + store::util::is_epoch_stale, + RegionInfoAccessor, }; use raftstore_v2::StoreMeta; use resource_control::{with_resource_limiter, ResourceGroupManager}; @@ -35,13 +45,13 @@ use tikv_kv::{ }; use tikv_util::{ config::ReadableSize, - future::create_stream_with_buffer, + 
future::{create_stream_with_buffer, paired_future_callback}, sys::thread::ThreadBuildWrapper, time::{Instant, Limiter}, HandyRwLock, }; use tokio::{runtime::Runtime, time::sleep}; -use txn_types::{Key, WriteRef, WriteType}; +use txn_types::{Key, TimeStamp, WriteRef, WriteType}; use super::{ make_rpc_error, @@ -49,6 +59,7 @@ use super::{ }; use crate::{ import::duplicate_detect::DuplicateDetector, + send_rpc_response, server::CONFIG_ROCKSDB_GAUGE, storage::{self, errors::extract_region_error_from_error}, }; @@ -80,6 +91,10 @@ const WIRE_EXTRA_BYTES: usize = 12; /// [`raft_writer::ThrottledTlsEngineWriter`]. There aren't too many items held /// in the writer. So we can run the GC less frequently. const WRITER_GC_INTERVAL: Duration = Duration::from_secs(300); +/// The max time of suspending requests. +/// This may save us from some client sending insane value to the server. +const SUSPEND_REQUEST_MAX_SECS: u64 = // 6h + 6 * 60 * 60; fn transfer_error(err: storage::Error) -> ImportPbError { let mut e = ImportPbError::default(); @@ -111,16 +126,20 @@ pub struct ImportSstService { tablets: LocalTablets, engine: E, threads: Arc, - importer: Arc, + importer: Arc>, limiter: Limiter, task_slots: Arc>>, raft_entry_max_size: ReadableSize, + region_info_accessor: Arc, writer: raft_writer::ThrottledTlsEngineWriter, // it's some iff multi-rocksdb is enabled store_meta: Option>>>, resource_manager: Option>, + + // When less than now, don't accept any requests. + suspend_req_until: Arc, } struct RequestCollector { @@ -165,6 +184,9 @@ impl RequestCollector { } fn accept_kv(&mut self, cf: &str, is_delete: bool, k: Vec, v: Vec) { + debug!("Accepting KV."; "cf" => %cf, + "key" => %log_wrappers::Value::key(&k), + "value" => %log_wrappers::Value::key(&v)); // Need to skip the empty key/value that could break the transaction or cause // data corruption. see details at https://github.com/pingcap/tiflow/issues/5468. 
if k.is_empty() || (!is_delete && v.is_empty()) { @@ -300,9 +322,10 @@ impl ImportSstService { raft_entry_max_size: ReadableSize, engine: E, tablets: LocalTablets, - importer: Arc, + importer: Arc>, store_meta: Option>>>, resource_manager: Option>, + region_info_accessor: Arc, ) -> Self { let props = tikv_util::thread_group::current_properties(); let eng = Mutex::new(engine.clone()); @@ -327,7 +350,7 @@ impl ImportSstService { if let LocalTablets::Singleton(tablet) = &tablets { importer.start_switch_mode_check(threads.handle(), Some(tablet.clone())); } else { - importer.start_switch_mode_check::(threads.handle(), None); + importer.start_switch_mode_check(threads.handle(), None); } let writer = raft_writer::ThrottledTlsEngineWriter::default(); @@ -350,9 +373,11 @@ impl ImportSstService { limiter: Limiter::new(f64::INFINITY), task_slots: Arc::new(Mutex::new(HashSet::default())), raft_entry_max_size, + region_info_accessor, writer, store_meta, resource_manager, + suspend_req_until: Arc::new(AtomicU64::new(0)), } } @@ -360,7 +385,7 @@ impl ImportSstService { self.cfg.clone() } - async fn tick(importer: Arc, cfg: ConfigManager) { + async fn tick(importer: Arc>, cfg: ConfigManager) { loop { sleep(Duration::from_secs(10)).await; @@ -538,7 +563,7 @@ impl ImportSstService { async fn apply_imp( mut req: ApplyRequest, - importer: Arc, + importer: Arc>, writer: raft_writer::ThrottledTlsEngineWriter, limiter: Limiter, max_raft_size: usize, @@ -567,7 +592,6 @@ impl ImportSstService { let buff = importer .read_from_kv_file( meta, - rule, ext_storage.clone(), req.get_storage_backend(), &limiter, @@ -579,6 +603,7 @@ impl ImportSstService { meta.get_start_ts(), meta.get_restore_ts(), buff, + rule, |k, v| collector.accept_kv(meta.get_cf(), meta.get_is_delete(), k, v), )? { if let Some(range) = range.as_mut() { @@ -616,6 +641,86 @@ impl ImportSstService { Ok(range) } + + /// Check whether we should suspend the current request. 
+ fn check_suspend(&self) -> Result<()> { + let now = TimeStamp::physical_now(); + let suspend_until = self.suspend_req_until.load(Ordering::SeqCst); + if now < suspend_until { + Err(Error::Suspended { + time_to_lease_expire: Duration::from_millis(suspend_until - now), + }) + } else { + Ok(()) + } + } + + /// suspend requests for a period. + /// + /// # returns + /// + /// whether for now, the requests has already been suspended. + pub fn suspend_requests(&self, for_time: Duration) -> bool { + let now = TimeStamp::physical_now(); + let last_suspend_until = self.suspend_req_until.load(Ordering::SeqCst); + let suspended = now < last_suspend_until; + let suspend_until = TimeStamp::physical_now() + for_time.as_millis() as u64; + self.suspend_req_until + .store(suspend_until, Ordering::SeqCst); + suspended + } + + /// allow all requests to enter. + /// + /// # returns + /// + /// whether requests has already been previously suspended. + pub fn allow_requests(&self) -> bool { + let now = TimeStamp::physical_now(); + let last_suspend_until = self.suspend_req_until.load(Ordering::SeqCst); + let suspended = now < last_suspend_until; + self.suspend_req_until.store(0, Ordering::SeqCst); + suspended + } +} + +fn check_local_region_stale( + region_id: u64, + epoch: &RegionEpoch, + local_region_info: Option, +) -> Result<()> { + match local_region_info { + Some(local_region_info) => { + let local_region_epoch = local_region_info.region.region_epoch.unwrap(); + + // when local region epoch is stale, client can retry write later + if is_epoch_stale(&local_region_epoch, epoch) { + return Err(Error::RequestTooNew(format!( + "request region {} is ahead of local region, local epoch {:?}, request epoch {:?}, please retry write later", + region_id, local_region_epoch, epoch + ))); + } + // when local region epoch is ahead, client need to rescan region from PD to get + // latest region later + if is_epoch_stale(epoch, &local_region_epoch) { + return Err(Error::RequestTooOld(format!( + 
"request region {} is staler than local region, local epoch {:?}, request epoch {:?}", + region_id, local_region_epoch, epoch + ))); + } + + // not match means to rescan + Ok(()) + } + None => { + // when region not found, we can't tell whether it's stale or ahead, so we just + // return the safest case + Err(Error::RequestTooOld(format!( + "region {} is not found", + region_id + ))) + } + } } #[macro_export] @@ -629,6 +734,7 @@ macro_rules! impl_write { ) { let import = self.importer.clone(); let tablets = self.tablets.clone(); + let region_info_accessor = self.region_info_accessor.clone(); let (rx, buf_driver) = create_stream_with_buffer(stream, self.cfg.rl().stream_channel_window); let mut rx = rx.map_err(Error::from); @@ -637,12 +743,15 @@ macro_rules! impl_write { let label = stringify!($fn); let resource_manager = self.resource_manager.clone(); let handle_task = async move { - let res = async move { - let first_req = rx.try_next().await?; + let (res, rx) = async move { + let first_req = match rx.try_next().await { + Ok(r) => r, + Err(e) => return (Err(e), Some(rx)), + }; let (meta, resource_limiter) = match first_req { Some(r) => { let limiter = resource_manager.as_ref().and_then(|m| { - m.get_resource_limiter( + m.get_background_resource_limiter( r.get_context() .get_resource_control_context() .get_resource_group_name(), @@ -651,18 +760,49 @@ macro_rules! impl_write { }); match r.chunk { Some($chunk_ty::Meta(m)) => (m, limiter), - _ => return Err(Error::InvalidChunk), + _ => return (Err(Error::InvalidChunk), Some(rx)), } } - _ => return Err(Error::InvalidChunk), + _ => return (Err(Error::InvalidChunk), Some(rx)), }; + // wait the region epoch on this TiKV to catch up with the epoch + // in request, which comes from PD and represents the majority + // peers' status. 
let region_id = meta.get_region_id(); + let (cb, f) = paired_future_callback(); + if let Err(e) = region_info_accessor + .find_region_by_id(region_id, cb) + .map_err(|e| { + // when region not found, we can't tell whether it's stale or ahead, so + // we just return the safest case + Error::RequestTooOld(format!( + "failed to find region {} err {:?}", + region_id, e + )) + }) + { + return (Err(e), Some(rx)); + }; + let res = match f.await { + Ok(r) => r, + Err(e) => return (Err(From::from(e)), Some(rx)), + }; + if let Err(e) = + check_local_region_stale(region_id, meta.get_region_epoch(), res) + { + return (Err(e), Some(rx)); + }; + let tablet = match tablets.get(region_id) { Some(t) => t, None => { - return Err(Error::Engine( - format!("region {} not found", region_id).into(), - )); + return ( + Err(Error::RequestTooOld(format!( + "region {} not found", + region_id + ))), + Some(rx), + ); } }; @@ -670,10 +810,10 @@ macro_rules! impl_write { Ok(w) => w, Err(e) => { error!("build writer failed {:?}", e); - return Err(Error::InvalidChunk); + return (Err(Error::InvalidChunk), Some(rx)); } }; - let (writer, resource_limiter) = rx + let result = rx .try_fold( (writer, resource_limiter), |(mut writer, limiter), req| async move { @@ -690,7 +830,11 @@ macro_rules! impl_write { .map(|w| (w, limiter)) }, ) - .await?; + .await; + let (writer, resource_limiter) = match result { + Ok(r) => r, + Err(e) => return (Err(e), None), + }; let finish_fn = async { let metas = writer.finish()?; @@ -699,13 +843,18 @@ macro_rules! 
impl_write { }; let metas: Result<_> = with_resource_limiter(finish_fn, resource_limiter).await; - let metas = metas?; + let metas = match metas { + Ok(r) => r, + Err(e) => return (Err(e), None), + }; let mut resp = $resp_ty::default(); resp.set_metas(metas.into()); - Ok(resp) + (Ok(resp), None) } .await; $crate::send_rpc_response!(res, sink, label, timer); + // don't drop rx before send response + _ = rx; }; self.threads.spawn(buf_driver); @@ -911,7 +1060,7 @@ impl ImportSst for ImportSstService { let tablets = self.tablets.clone(); let start = Instant::now(); let resource_limiter = self.resource_manager.as_ref().and_then(|r| { - r.get_resource_limiter( + r.get_background_resource_limiter( req.get_context() .get_resource_control_context() .get_resource_group_name(), @@ -949,7 +1098,7 @@ impl ImportSst for ImportSstService { }; let res = with_resource_limiter( - importer.download_ext::( + importer.download_ext( req.get_sst(), req.get_storage_backend(), req.get_name(), @@ -990,6 +1139,10 @@ impl ImportSst for ImportSstService { ) { let label = "ingest"; let timer = Instant::now_coarse(); + if let Err(err) = self.check_suspend() { + ctx.spawn(async move { crate::send_rpc_response!(Err(err), sink, label, timer) }); + return; + } let mut resp = IngestResponse::default(); let region_id = req.get_context().get_region_id(); @@ -1033,6 +1186,10 @@ impl ImportSst for ImportSstService { ) { let label = "multi-ingest"; let timer = Instant::now_coarse(); + if let Err(err) = self.check_suspend() { + ctx.spawn(async move { crate::send_rpc_response!(Err(err), sink, label, timer) }); + return; + } let mut resp = IngestResponse::default(); if let Some(errorpb) = self.check_write_stall(req.get_context().get_region_id()) { @@ -1237,6 +1394,37 @@ impl ImportSst for ImportSstService { RawChunk, new_raw_writer ); + + fn suspend_import_rpc( + &mut self, + ctx: RpcContext<'_>, + req: SuspendImportRpcRequest, + sink: UnarySink, + ) { + let label = "suspend_import_rpc"; + let timer = 
Instant::now_coarse(); + + if req.should_suspend_imports && req.get_duration_in_secs() > SUSPEND_REQUEST_MAX_SECS { + ctx.spawn(async move { + send_rpc_response!(Err(Error::Io( + std::io::Error::new(std::io::ErrorKind::InvalidInput, + format!("you are going to suspend the import RPCs too long. (for {} seconds, max acceptable duration is {} seconds)", + req.get_duration_in_secs(), SUSPEND_REQUEST_MAX_SECS)))), sink, label, timer); + }); + return; + } + + let suspended = if req.should_suspend_imports { + info!("suspend incoming import RPCs."; "for_second" => req.get_duration_in_secs(), "caller" => req.get_caller()); + self.suspend_requests(Duration::from_secs(req.get_duration_in_secs())) + } else { + info!("allow incoming import RPCs."; "caller" => req.get_caller()); + self.allow_requests() + }; + let mut resp = SuspendImportRpcResponse::default(); + resp.set_already_suspended(suspended); + ctx.spawn(async move { send_rpc_response!(Ok(resp), sink, label, timer) }); + } } // add error statistics from pb error response @@ -1296,14 +1484,19 @@ mod test { use engine_traits::{CF_DEFAULT, CF_WRITE}; use kvproto::{ kvrpcpb::Context, - metapb::RegionEpoch, + metapb::{Region, RegionEpoch}, raft_cmdpb::{RaftCmdRequest, Request}, }; - use protobuf::Message; + use protobuf::{Message, SingularPtrField}; + use raft::StateRole::Follower; + use raftstore::RegionInfo; use tikv_kv::{Modify, WriteData}; use txn_types::{Key, TimeStamp, Write, WriteBatchFlags, WriteType}; - use crate::{import::sst_service::RequestCollector, server::raftkv}; + use crate::{ + import::sst_service::{check_local_region_stale, RequestCollector}, + server::raftkv, + }; fn write(key: &[u8], ty: WriteType, commit_ts: u64, start_ts: u64) -> (Vec, Vec) { let k = Key::from_raw(key).append_ts(TimeStamp::new(commit_ts)); @@ -1587,4 +1780,72 @@ mod test { } assert_eq!(total, 100); } + + #[test] + fn test_write_rpc_check_region_epoch() { + let mut req_epoch = RegionEpoch { + conf_ver: 10, + version: 10, + 
..Default::default() + }; + // test for region not found + let result = check_local_region_stale(1, &req_epoch, None); + assert!(result.is_err()); + // check error message contains "rescan region later", client will match this + // string pattern + assert!( + result + .unwrap_err() + .to_string() + .contains("rescan region later") + ); + + let mut local_region_info = RegionInfo { + region: Region { + id: 1, + region_epoch: SingularPtrField::some(req_epoch.clone()), + ..Default::default() + }, + role: Follower, + buckets: 1, + }; + // test the local region epoch is same as request + let result = check_local_region_stale(1, &req_epoch, Some(local_region_info.clone())); + result.unwrap(); + + // test the local region epoch is ahead of request + local_region_info + .region + .region_epoch + .as_mut() + .unwrap() + .conf_ver = 11; + let result = check_local_region_stale(1, &req_epoch, Some(local_region_info.clone())); + assert!(result.is_err()); + // check error message contains "rescan region later", client will match this + // string pattern + assert!( + result + .unwrap_err() + .to_string() + .contains("rescan region later") + ); + + req_epoch.conf_ver = 11; + let result = check_local_region_stale(1, &req_epoch, Some(local_region_info.clone())); + result.unwrap(); + + // test the local region epoch is staler than request + req_epoch.version = 12; + let result = check_local_region_stale(1, &req_epoch, Some(local_region_info)); + assert!(result.is_err()); + // check error message contains "retry write later", client will match this + // string pattern + assert!( + result + .unwrap_err() + .to_string() + .contains("retry write later") + ); + } } diff --git a/src/lib.rs b/src/lib.rs index b3e9ebaf8e8..acccb2f55e5 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -79,8 +79,8 @@ pub fn tikv_version_info(build_time: Option<&str>) -> String { } /// return the build version of tikv-server -pub fn tikv_build_version() -> &'static str { - env!("CARGO_PKG_VERSION") +pub fn 
tikv_build_version() -> String { + env!("CARGO_PKG_VERSION").to_owned() } /// Prints the tikv version information to the standard output. diff --git a/src/read_pool.rs b/src/read_pool.rs index a5898ea4f63..2ea6c7e36b2 100644 --- a/src/read_pool.rs +++ b/src/read_pool.rs @@ -17,10 +17,11 @@ use kvproto::{errorpb, kvrpcpb::CommandPri}; use online_config::{ConfigChange, ConfigManager, ConfigValue, Result as CfgResult}; use prometheus::{core::Metric, Histogram, IntCounter, IntGauge}; use resource_control::{ - with_resource_limiter, ControlledFuture, ResourceController, ResourceLimiter, TaskMetadata, + with_resource_limiter, ControlledFuture, ResourceController, ResourceLimiter, }; use thiserror::Error; use tikv_util::{ + resource_control::TaskMetadata, sys::{cpu_time::ProcessStat, SysQuota}, time::Instant, worker::{Runnable, RunnableWithTimer, Scheduler, Worker}, @@ -270,6 +271,21 @@ impl ReadPoolHandle { } } + pub fn set_max_tasks_per_worker(&mut self, tasks_per_thread: usize) { + match self { + ReadPoolHandle::FuturePools { .. } => { + unreachable!() + } + ReadPoolHandle::Yatp { + max_tasks, + pool_size, + .. + } => { + *max_tasks = tasks_per_thread.saturating_mul(*pool_size); + } + } + } + pub fn get_ewma_time_slice(&self) -> Option { match self { ReadPoolHandle::FuturePools { .. 
} => None, @@ -312,6 +328,10 @@ impl ReadPoolHandle { let mut busy_err = errorpb::ServerIsBusy::default(); busy_err.set_reason("estimated wait time exceeds threshold".to_owned()); busy_err.estimated_wait_ms = u32::try_from(estimated_wait.as_millis()).unwrap_or(u32::MAX); + warn!("Already many pending tasks in the read queue, task is rejected"; + "busy_threshold" => ?&busy_threshold, + "busy_err" => ?&busy_err, + ); Err(busy_err) } } @@ -429,6 +449,7 @@ pub fn build_yatp_read_pool( engine: E, resource_ctl: Option>, cleanup_method: CleanupMethod, + enable_task_wait_metrics: bool, ) -> ReadPool { let unified_read_pool_name = get_unified_read_pool_name(); build_yatp_read_pool_with_name( @@ -438,6 +459,7 @@ pub fn build_yatp_read_pool( resource_ctl, cleanup_method, unified_read_pool_name, + enable_task_wait_metrics, ) } @@ -448,6 +470,7 @@ pub fn build_yatp_read_pool_with_name( resource_ctl: Option>, cleanup_method: CleanupMethod, unified_read_pool_name: String, + enable_task_wait_metrics: bool, ) -> ReadPool { let raftkv = Arc::new(Mutex::new(engine)); let builder = YatpPoolBuilder::new(ReporterTicker { reporter }) @@ -472,7 +495,9 @@ pub fn build_yatp_read_pool_with_name( }) .before_stop(|| unsafe { destroy_tls_engine::(); - }); + }) + .enable_task_wait_metrics(enable_task_wait_metrics); + let pool = if let Some(ref r) = resource_ctl { builder.build_priority_pool(r.clone()) } else { @@ -583,6 +608,9 @@ impl Runnable for ReadPoolConfigRunner { self.cur_thread_count = self.core_thread_count; } } + Task::MaxTasksPerWorker(s) => { + self.handle.set_max_tasks_per_worker(s); + } } } } @@ -667,6 +695,7 @@ impl ReadPoolConfigRunner { enum Task { PoolSize(usize), AutoAdjust(bool), + MaxTasksPerWorker(usize), } impl std::fmt::Display for Task { @@ -674,6 +703,7 @@ impl std::fmt::Display for Task { match self { Task::PoolSize(s) => write!(f, "PoolSize({})", *s), Task::AutoAdjust(s) => write!(f, "AutoAdjust({})", *s), + Task::MaxTasksPerWorker(s) => write!(f, 
"MaxTasksPerWorker({})", *s), } } } @@ -726,6 +756,10 @@ impl ConfigManager for ReadPoolConfigManager { if let Some(ConfigValue::Bool(b)) = unified.get("auto_adjust_pool_size") { self.scheduler.schedule(Task::AutoAdjust(*b))?; } + if let Some(ConfigValue::Usize(max_tasks)) = unified.get("max_tasks_per_worker") { + self.scheduler + .schedule(Task::MaxTasksPerWorker(*max_tasks))?; + } } info!( "readpool config changed"; @@ -796,8 +830,16 @@ mod tests { // max running tasks number should be 2*1 = 2 let engine = TestEngineBuilder::new().build().unwrap(); - let pool = - build_yatp_read_pool(&config, DummyReporter, engine, None, CleanupMethod::InPlace); + let name = "test-yatp-full"; + let pool = build_yatp_read_pool_with_name( + &config, + DummyReporter, + engine, + None, + CleanupMethod::InPlace, + name.to_owned(), + false, + ); let gen_task = || { let (tx, rx) = oneshot::channel::<()>(); @@ -831,6 +873,12 @@ mod tests { handle .spawn(task4, CommandPri::Normal, 4, TaskMetadata::default(), None) .unwrap(); + assert_eq!( + UNIFIED_READ_POOL_RUNNING_TASKS + .with_label_values(&[name]) + .get(), + 2 + ); } #[test] @@ -844,8 +892,14 @@ mod tests { // max running tasks number should be 2*1 = 2 let engine = TestEngineBuilder::new().build().unwrap(); - let pool = - build_yatp_read_pool(&config, DummyReporter, engine, None, CleanupMethod::InPlace); + let pool = build_yatp_read_pool( + &config, + DummyReporter, + engine, + None, + CleanupMethod::InPlace, + false, + ); let gen_task = || { let (tx, rx) = oneshot::channel::<()>(); @@ -897,11 +951,17 @@ mod tests { max_tasks_per_worker: 1, ..Default::default() }; - // max running tasks number should be 2*1 = 2 + // max running tasks number for each priority should be 2*1 = 2 let engine = TestEngineBuilder::new().build().unwrap(); - let pool = - build_yatp_read_pool(&config, DummyReporter, engine, None, CleanupMethod::InPlace); + let pool = build_yatp_read_pool( + &config, + DummyReporter, + engine, + None, + CleanupMethod::InPlace, 
+ false, + ); let gen_task = || { let (tx, rx) = oneshot::channel::<()>(); @@ -931,6 +991,9 @@ mod tests { _ => panic!("should return full error"), } + // TODO: move running task by priority to read_pool. + // spawn a high-priority task, should not return Full error. + tx1.send(()).unwrap(); tx2.send(()).unwrap(); thread::sleep(Duration::from_millis(300)); @@ -1027,6 +1090,7 @@ mod tests { resource_manager, CleanupMethod::InPlace, name.clone(), + false, ); let gen_task = || { diff --git a/src/server/config.rs b/src/server/config.rs index 013d1a66238..4e66e5802c0 100644 --- a/src/server/config.rs +++ b/src/server/config.rs @@ -187,26 +187,27 @@ pub struct Config { #[online_config(skip)] pub labels: HashMap, - // deprecated. use readpool.coprocessor.xx_concurrency. #[doc(hidden)] #[serde(skip_serializing)] - #[online_config(skip)] + #[online_config(hidden)] + #[deprecated = "The configuration has been moved to readpool.coprocessor.*_concurrency."] pub end_point_concurrency: Option, - // deprecated. use readpool.coprocessor.stack_size. #[doc(hidden)] #[serde(skip_serializing)] - #[online_config(skip)] + #[online_config(hidden)] + #[deprecated = "The configuration has been moved to readpool.coprocessor.stack_size."] pub end_point_stack_size: Option, - // deprecated. use readpool.coprocessor.max_tasks_per_worker_xx. 
#[doc(hidden)] #[serde(skip_serializing)] - #[online_config(skip)] + #[online_config(hidden)] + #[deprecated = "The configuration has been moved to readpool.coprocessor.max_tasks_per_worker_*."] pub end_point_max_tasks: Option, } impl Default for Config { + #[allow(deprecated)] fn default() -> Config { let cpu_num = SysQuota::cpu_cores_quota(); let background_thread_count = if cpu_num > 16.0 { 3 } else { 2 }; diff --git a/src/server/debug.rs b/src/server/debug.rs index 9e01852455c..70e1df855d5 100644 --- a/src/server/debug.rs +++ b/src/server/debug.rs @@ -1111,9 +1111,11 @@ async fn async_key_range_flashback_to_version ?resp.get_error(), "region_err" => ?resp.get_region_error()); - return Err(Error::FlashbackFailed( - "exec prepare flashback failed.".into(), - )); + return Err(Error::FlashbackFailed(format!( + "exec prepare flashback failed: resp err is: {:?}, region err is: {:?}", + resp.get_error(), + resp.get_region_error() + ))); } } else { let mut req = kvrpcpb::FlashbackToVersionRequest::new(); @@ -1127,9 +1129,11 @@ async fn async_key_range_flashback_to_version ?resp.get_error(), "region_err" => ?resp.get_region_error()); - return Err(Error::FlashbackFailed( - "exec finish flashback failed.".into(), - )); + return Err(Error::FlashbackFailed(format!( + "exec finish flashback failed: resp err is: {:?}, region err is: {:?}", + resp.get_error(), + resp.get_region_error() + ))); } } Ok(()) diff --git a/src/server/debug2.rs b/src/server/debug2.rs index e914b353760..4230828dff1 100644 --- a/src/server/debug2.rs +++ b/src/server/debug2.rs @@ -10,7 +10,7 @@ use engine_traits::{ TabletRegistry, CF_DEFAULT, CF_LOCK, CF_RAFT, CF_WRITE, }; use futures::future::Future; -use keys::{data_key, DATA_MAX_KEY, DATA_PREFIX_KEY}; +use keys::{data_key, enc_end_key, enc_start_key, DATA_MAX_KEY, DATA_PREFIX_KEY}; use kvproto::{ debugpb::Db as DbType, kvrpcpb::MvccInfo, @@ -36,6 +36,34 @@ use crate::{ storage::mvcc::{MvccInfoCollector, MvccInfoScanner}, }; +// `key1` and `key2` 
should both be start_key or end_key. +fn smaller_key<'a>(key1: &'a [u8], key2: &'a [u8], is_end_key: bool) -> &'a [u8] { + if is_end_key && key1.is_empty() { + return key2; + } + if is_end_key && key2.is_empty() { + return key1; + } + if key1 < key2 { + return key1; + } + key2 +} + +// `key1` and `key2` should both be start_key or end_key. +fn larger_key<'a>(key1: &'a [u8], key2: &'a [u8], is_end_key: bool) -> &'a [u8] { + if is_end_key && key1.is_empty() { + return key1; + } + if is_end_key && key2.is_empty() { + return key2; + } + if key1 < key2 { + return key2; + } + key1 +} + // return the region containing the seek_key or the next region if not existed fn seek_region( seek_key: &[u8], @@ -98,11 +126,16 @@ impl MvccInfoIteratorV2 { )?; let tablet = tablet_cache.latest().unwrap(); + let region_start_key = enc_start_key(first_region_state.get_region()); + let region_end_key = enc_end_key(first_region_state.get_region()); + let iter_start = larger_key(start, ®ion_start_key, false); + let iter_end = smaller_key(end, ®ion_end_key, true); + assert!(!iter_start.is_empty() && !iter_start.is_empty()); let scanner = Some( MvccInfoScanner::new( |cf, opts| tablet.iterator_opt(cf, opts).map_err(|e| box_err!(e)), - if start.is_empty() { None } else { Some(start) }, - if end.is_empty() { None } else { Some(end) }, + Some(iter_start), + Some(iter_end), MvccInfoCollector::default(), ) .map_err(|e| -> Error { box_err!(e) })?, @@ -171,19 +204,16 @@ impl Iterator for MvccInfoIteratorV2 { ) .unwrap(); let tablet = tablet_cache.latest().unwrap(); + let region_start_key = enc_start_key(&self.cur_region); + let region_end_key = enc_end_key(&self.cur_region); + let iter_start = larger_key(&self.start, ®ion_start_key, false); + let iter_end = smaller_key(&self.end, ®ion_end_key, true); + assert!(!iter_start.is_empty() && !iter_start.is_empty()); self.scanner = Some( MvccInfoScanner::new( |cf, opts| tablet.iterator_opt(cf, opts).map_err(|e| box_err!(e)), - if self.start.is_empty() { - 
None - } else { - Some(self.start.as_bytes()) - }, - if self.end.is_empty() { - None - } else { - Some(self.end.as_bytes()) - }, + Some(iter_start), + Some(iter_end), MvccInfoCollector::default(), ) .unwrap(), @@ -658,19 +688,19 @@ impl Debugger for DebuggerImplV2 { fn region_size>(&self, region_id: u64, cfs: Vec) -> Result> { match self.raft_engine.get_region_state(region_id, u64::MAX) { Ok(Some(region_state)) => { - if region_state.get_state() != PeerState::Normal { - return Err(Error::NotFound(format!( - "region {:?} has been deleted", - region_id - ))); - } let region = region_state.get_region(); + let state = region_state.get_state(); let start_key = &keys::data_key(region.get_start_key()); let end_key = &keys::data_end_key(region.get_end_key()); let mut sizes = vec![]; let mut tablet_cache = get_tablet_cache(&self.tablet_reg, region.id, Some(region_state))?; - let tablet = tablet_cache.latest().unwrap(); + let Some(tablet) = tablet_cache.latest() else { + return Err(Error::NotFound(format!( + "tablet not found, region_id={:?}, peer_state={:?}", + region_id, state + ))); + }; for cf in cfs { let mut size = 0; box_try!(tablet.scan(cf.as_ref(), start_key, end_key, false, |k, v| { @@ -701,7 +731,7 @@ impl Debugger for DebuggerImplV2 { )); } - let mut region_states = get_all_region_states_with_normal_state(&self.raft_engine); + let mut region_states = get_all_active_region_states(&self.raft_engine); region_states.sort_by(|r1, r2| { r1.get_region() @@ -756,12 +786,21 @@ impl Debugger for DebuggerImplV2 { fn get_all_regions_in_store(&self) -> Result> { let mut region_ids = vec![]; + let raft_engine = &self.raft_engine; self.raft_engine .for_each_raft_group::(&mut |region_id| { + let region_state = raft_engine + .get_region_state(region_id, u64::MAX) + .unwrap() + .unwrap(); + if region_state.state == PeerState::Tombstone { + return Ok(()); + } region_ids.push(region_id); Ok(()) }) .unwrap(); + region_ids.sort_unstable(); Ok(region_ids) } @@ -814,21 +853,29 @@ impl 
Debugger for DebuggerImplV2 { Err(e) => return Err(Error::EngineTrait(e)), }; - if region_state.state != PeerState::Normal { - return Err(Error::NotFound(format!("none region {:?}", region_id))); + let state = region_state.get_state(); + if state == PeerState::Tombstone { + return Err(Error::NotFound(format!( + "region {:?} is tombstone", + region_id + ))); } - let region = region_state.get_region(); - let start = keys::enc_start_key(region); - let end = keys::enc_end_key(region); - - let mut tablet_cache = - get_tablet_cache(&self.tablet_reg, region.id, Some(region_state.clone())).unwrap(); - let tablet = tablet_cache.latest().unwrap(); + let region = region_state.get_region().clone(); + let start = keys::enc_start_key(®ion); + let end = keys::enc_end_key(®ion); + + let mut tablet_cache = get_tablet_cache(&self.tablet_reg, region.id, Some(region_state))?; + let Some(tablet) = tablet_cache.latest() else { + return Err(Error::NotFound(format!( + "tablet not found, region_id={:?}, peer_state={:?}", + region_id, state + ))); + }; let mut res = dump_write_cf_properties(tablet, &start, &end)?; let mut res1 = dump_default_cf_properties(tablet, &start, &end)?; res.append(&mut res1); - let middle_key = match box_try!(get_region_approximate_middle(tablet, region)) { + let middle_key = match box_try!(get_region_approximate_middle(tablet, ®ion)) { Some(data_key) => keys::origin_key(&data_key).to_vec(), None => Vec::new(), }; @@ -1072,9 +1119,7 @@ fn get_tablet_cache( } } -fn get_all_region_states_with_normal_state( - raft_engine: &ER, -) -> Vec { +fn get_all_active_region_states(raft_engine: &ER) -> Vec { let mut region_states = vec![]; raft_engine .for_each_raft_group::(&mut |region_id| { @@ -1082,7 +1127,7 @@ fn get_all_region_states_with_normal_state( .get_region_state(region_id, u64::MAX) .unwrap() .unwrap(); - if region_state.state == PeerState::Normal { + if region_state.state != PeerState::Tombstone { region_states.push(region_state); } Ok(()) @@ -1103,7 +1148,7 @@ fn 
deivde_regions_for_concurrency( registry: &TabletRegistry, threads: u64, ) -> Result>> { - let region_states = get_all_region_states_with_normal_state(raft_engine); + let region_states = get_all_active_region_states(raft_engine); if threads == 1 { return Ok(vec![ @@ -1154,38 +1199,28 @@ fn deivde_regions_for_concurrency( Ok(regions_groups) } -// `key1` and `key2` should both be start_key or end_key. -fn smaller_key<'a>(key1: &'a [u8], key2: &'a [u8], end_key: bool) -> &'a [u8] { - if end_key && key1.is_empty() { - return key2; - } - if end_key && key2.is_empty() { - return key1; - } - if key1 < key2 { - return key1; - } - key2 -} +#[cfg(any(test, feature = "testexport"))] +pub fn new_debugger(path: &std::path::Path) -> DebuggerImplV2 { + use crate::{config::TikvConfig, server::KvEngineFactoryBuilder}; -// `key1` and `key2` should both be start_key or end_key. -fn larger_key<'a>(key1: &'a [u8], key2: &'a [u8], end_key: bool) -> &'a [u8] { - if end_key && key1.is_empty() { - return key1; - } - if end_key && key2.is_empty() { - return key2; - } - if key1 < key2 { - return key2; - } - key1 + let mut cfg = TikvConfig::default(); + cfg.storage.data_dir = path.to_str().unwrap().to_string(); + cfg.raft_store.raftdb_path = cfg.infer_raft_db_path(None).unwrap(); + cfg.raft_engine.mut_config().dir = cfg.infer_raft_engine_path(None).unwrap(); + let cache = cfg.storage.block_cache.build_shared_cache(); + let env = cfg.build_shared_rocks_env(None, None).unwrap(); + + let factory = KvEngineFactoryBuilder::new(env, &cfg, cache, None).build(); + let reg = TabletRegistry::new(Box::new(factory), path).unwrap(); + + let raft_engine = + raft_log_engine::RaftLogEngine::new(cfg.raft_engine.config(), None, None).unwrap(); + + DebuggerImplV2::new(reg, raft_engine, ConfigController::default()) } #[cfg(test)] mod tests { - use std::path::Path; - use collections::HashMap; use engine_traits::{ RaftEngineReadOnly, RaftLogBatch, SyncMutable, ALL_CFS, CF_DEFAULT, CF_LOCK, CF_WRITE, @@ -1196,36 
+1231,13 @@ mod tests { raft_serverpb::*, }; use raft::prelude::EntryType; - use raft_log_engine::RaftLogEngine; use raftstore::store::RAFT_INIT_LOG_INDEX; use tikv_util::store::new_peer; use super::*; - use crate::{ - config::TikvConfig, - server::KvEngineFactoryBuilder, - storage::{txn::tests::must_prewrite_put, TestEngineBuilder}, - }; - const INITIAL_TABLET_INDEX: u64 = 5; const INITIAL_APPLY_INDEX: u64 = 5; - fn new_debugger(path: &Path) -> DebuggerImplV2 { - let mut cfg = TikvConfig::default(); - cfg.storage.data_dir = path.to_str().unwrap().to_string(); - cfg.raft_store.raftdb_path = cfg.infer_raft_db_path(None).unwrap(); - cfg.raft_engine.mut_config().dir = cfg.infer_raft_engine_path(None).unwrap(); - let cache = cfg.storage.block_cache.build_shared_cache(); - let env = cfg.build_shared_rocks_env(None, None).unwrap(); - - let factory = KvEngineFactoryBuilder::new(env, &cfg, cache, None).build(); - let reg = TabletRegistry::new(Box::new(factory), path).unwrap(); - - let raft_engine = RaftLogEngine::new(cfg.raft_engine.config(), None, None).unwrap(); - - DebuggerImplV2::new(reg, raft_engine, ConfigController::default()) - } - impl DebuggerImplV2 { fn set_store_id(&self, store_id: u64) { let mut ident = self.get_store_ident().unwrap_or_default(); @@ -1455,126 +1467,10 @@ mod tests { let mut wb = raft_engine.log_batch(10); wb.put_region_state(region_id, 10, &state).unwrap(); raft_engine.consume(&mut wb, true).unwrap(); + debugger.tablet_reg.remove(region_id); debugger.region_size(region_id, cfs.clone()).unwrap_err(); } - // For simplicity, the format of the key is inline with data in - // prepare_data_on_disk - fn extract_key(key: &[u8]) -> &[u8] { - &key[1..4] - } - - // Prepare some data - // Data for each region: - // Region 1: k00 .. k04 - // Region 2: k05 .. k09 - // Region 3: k10 .. k14 - // Region 4: k15 .. k19 - // Region 5: k20 .. k24 - // Region 6: k26 .. 
k28 - fn prepare_data_on_disk(path: &Path) { - let mut cfg = TikvConfig::default(); - cfg.storage.data_dir = path.to_str().unwrap().to_string(); - cfg.raft_store.raftdb_path = cfg.infer_raft_db_path(None).unwrap(); - cfg.raft_engine.mut_config().dir = cfg.infer_raft_engine_path(None).unwrap(); - cfg.gc.enable_compaction_filter = false; - let cache = cfg.storage.block_cache.build_shared_cache(); - let env = cfg.build_shared_rocks_env(None, None).unwrap(); - - let factory = KvEngineFactoryBuilder::new(env, &cfg, cache, None).build(); - let reg = TabletRegistry::new(Box::new(factory), path).unwrap(); - - let raft_engine = RaftLogEngine::new(cfg.raft_engine.config(), None, None).unwrap(); - let mut wb = raft_engine.log_batch(5); - for i in 0..6 { - let mut region = metapb::Region::default(); - let start_key = format!("k{:02}", i * 5); - let end_key = format!("k{:02}", (i + 1) * 5); - region.set_id(i + 1); - region.set_start_key(start_key.into_bytes()); - region.set_end_key(end_key.into_bytes()); - let mut region_state = RegionLocalState::default(); - region_state.set_tablet_index(INITIAL_TABLET_INDEX); - if region.get_id() == 4 { - region_state.set_state(PeerState::Tombstone); - } else if region.get_id() == 6 { - region.set_start_key(b"k26".to_vec()); - region.set_end_key(b"k28".to_vec()); - } - region_state.set_region(region); - - let tablet_path = reg.tablet_path(i + 1, INITIAL_TABLET_INDEX); - // Use tikv_kv::RocksEngine instead of loading tablet from registry in order to - // use prewrite method to prepare mvcc data - let mut engine = TestEngineBuilder::new().path(tablet_path).build().unwrap(); - for i in i * 5..(i + 1) * 5 { - let key = format!("zk{:02}", i); - let val = format!("val{:02}", i); - // Use prewrite only is enough for preparing mvcc data - must_prewrite_put( - &mut engine, - key.as_bytes(), - val.as_bytes(), - key.as_bytes(), - 10, - ); - } - - wb.put_region_state(i + 1, INITIAL_APPLY_INDEX, ®ion_state) - .unwrap(); - } - raft_engine.consume(&mut wb, 
true).unwrap(); - } - - #[test] - fn test_scan_mvcc() { - let dir = test_util::temp_dir("test-debugger", false); - prepare_data_on_disk(dir.path()); - let debugger = new_debugger(dir.path()); - // Test scan with bad start, end or limit. - assert!(debugger.scan_mvcc(b"z", b"", 0).is_err()); - assert!(debugger.scan_mvcc(b"z", b"x", 3).is_err()); - - let verify_scanner = - |range, scanner: &mut dyn Iterator, MvccInfo)>>| { - for i in range { - let key = format!("k{:02}", i).into_bytes(); - assert_eq!(key, extract_key(&scanner.next().unwrap().unwrap().0)); - } - }; - - // full scann - let mut scanner = debugger.scan_mvcc(b"", b"", 100).unwrap(); - verify_scanner(0..15, &mut scanner); - verify_scanner(20..25, &mut scanner); - verify_scanner(26..28, &mut scanner); - assert!(scanner.next().is_none()); - - // Range has more elements than limit - let mut scanner = debugger.scan_mvcc(b"zk01", b"zk09", 5).unwrap(); - verify_scanner(1..6, &mut scanner); - assert!(scanner.next().is_none()); - - // Range has less elements than limit - let mut scanner = debugger.scan_mvcc(b"zk07", b"zk10", 10).unwrap(); - verify_scanner(7..10, &mut scanner); - assert!(scanner.next().is_none()); - - // Start from the key where no region contains it - let mut scanner = debugger.scan_mvcc(b"zk16", b"", 100).unwrap(); - verify_scanner(20..25, &mut scanner); - verify_scanner(26..28, &mut scanner); - assert!(scanner.next().is_none()); - - // Scan a range not existed in the cluster - let mut scanner = debugger.scan_mvcc(b"zk16", b"zk19", 100).unwrap(); - assert!(scanner.next().is_none()); - - // The end key is less than the start_key of the first region - let mut scanner = debugger.scan_mvcc(b"", b"zj", 100).unwrap(); - assert!(scanner.next().is_none()); - } - #[test] fn test_compact() { let dir = test_util::temp_dir("test-debugger", false); @@ -2050,9 +1946,9 @@ mod tests { assert_eq!(region_info_2, region_info_2_before); } - #[test] // It tests that the latest apply state cannot be read as it is 
invisible // on persisted_applied + #[test] fn test_drop_unapplied_raftlog_2() { let dir = test_util::temp_dir("test-debugger", false); let debugger = new_debugger(dir.path()); @@ -2088,4 +1984,34 @@ mod tests { 80 ); } + + #[test] + fn test_get_all_regions_in_store() { + let dir = test_util::temp_dir("test-debugger", false); + let debugger = new_debugger(dir.path()); + let raft_engine = &debugger.raft_engine; + + init_region_state(raft_engine, 1, &[100, 101], 1); + init_region_state(raft_engine, 3, &[100, 101], 1); + init_region_state(raft_engine, 4, &[100, 101], 1); + + let mut lb = raft_engine.log_batch(3); + + let mut put_tombsotne_region = |region_id: u64| { + let mut region = metapb::Region::default(); + region.set_id(region_id); + let mut region_state = RegionLocalState::default(); + region_state.set_state(PeerState::Tombstone); + region_state.set_region(region.clone()); + lb.put_region_state(region_id, INITIAL_APPLY_INDEX, ®ion_state) + .unwrap(); + raft_engine.consume(&mut lb, true).unwrap(); + }; + + put_tombsotne_region(2); + put_tombsotne_region(5); + + let regions = debugger.get_all_regions_in_store().unwrap(); + assert_eq!(regions, vec![1, 3, 4]); + } } diff --git a/src/server/engine_factory.rs b/src/server/engine_factory.rs index 85de282b137..3593c01ca7f 100644 --- a/src/server/engine_factory.rs +++ b/src/server/engine_factory.rs @@ -56,7 +56,7 @@ impl KvEngineFactoryBuilder { flow_listener: None, sst_recovery_sender: None, encryption_key_manager: key_manager, - db_resources: config.rocksdb.build_resources(env), + db_resources: config.rocksdb.build_resources(env, config.storage.engine), cf_resources: config.rocksdb.build_cf_resources(cache), state_storage: None, lite: false, diff --git a/src/server/gc_worker/compaction_filter.rs b/src/server/gc_worker/compaction_filter.rs index 665824a1bac..2bea0cf347b 100644 --- a/src/server/gc_worker/compaction_filter.rs +++ b/src/server/gc_worker/compaction_filter.rs @@ -888,7 +888,7 @@ pub mod test_utils { 
cfg.ratio_threshold = ratio_threshold; } cfg.enable_compaction_filter = true; - GcWorkerConfigManager(Arc::new(VersionTrack::new(cfg))) + GcWorkerConfigManager(Arc::new(VersionTrack::new(cfg)), None) }; let feature_gate = { let feature_gate = FeatureGate::default(); diff --git a/src/server/gc_worker/config.rs b/src/server/gc_worker/config.rs index 1816dd845e1..809c55e1268 100644 --- a/src/server/gc_worker/config.rs +++ b/src/server/gc_worker/config.rs @@ -3,7 +3,10 @@ use std::sync::Arc; use online_config::{ConfigChange, ConfigManager, OnlineConfig}; -use tikv_util::config::{ReadableSize, VersionTrack}; +use tikv_util::{ + config::{ReadableSize, VersionTrack}, + yatp_pool::FuturePool, +}; const DEFAULT_GC_RATIO_THRESHOLD: f64 = 1.1; pub const DEFAULT_GC_BATCH_KEYS: usize = 512; @@ -22,6 +25,8 @@ pub struct GcConfig { /// greater than 5.0.0. Change `compaction_filter_skip_version_check` /// can enable it by force. pub compaction_filter_skip_version_check: bool, + /// gc threads count + pub num_threads: usize, } impl Default for GcConfig { @@ -32,6 +37,7 @@ impl Default for GcConfig { max_write_bytes_per_sec: ReadableSize(DEFAULT_GC_MAX_WRITE_BYTES_PER_SEC), enable_compaction_filter: true, compaction_filter_skip_version_check: false, + num_threads: 1, } } } @@ -41,12 +47,15 @@ impl GcConfig { if self.batch_keys == 0 { return Err("gc.batch_keys should not be 0".into()); } + if self.num_threads == 0 { + return Err("gc.thread_count should not be 0".into()); + } Ok(()) } } #[derive(Clone, Default)] -pub struct GcWorkerConfigManager(pub Arc>); +pub struct GcWorkerConfigManager(pub Arc>, pub Option); impl ConfigManager for GcWorkerConfigManager { fn dispatch( @@ -55,6 +64,16 @@ impl ConfigManager for GcWorkerConfigManager { ) -> std::result::Result<(), Box> { { let change = change.clone(); + if let Some(pool) = self.1.as_ref() { + if let Some(v) = change.get("num_threads") { + let pool_size: usize = v.into(); + pool.scale_pool_size(pool_size); + info!( + "GC worker thread 
count is changed"; + "new_thread_count" => pool_size, + ); + } + } self.0 .update(move |cfg: &mut GcConfig| cfg.update(change))?; } diff --git a/src/server/gc_worker/gc_manager.rs b/src/server/gc_worker/gc_manager.rs index be18f8216d5..d9c5287b67d 100644 --- a/src/server/gc_worker/gc_manager.rs +++ b/src/server/gc_worker/gc_manager.rs @@ -4,7 +4,7 @@ use std::{ cmp::Ordering, sync::{ atomic::{AtomicU64, Ordering as AtomicOrdering}, - mpsc, Arc, + mpsc, Arc, Condvar, Mutex, }, thread::{self, Builder as ThreadBuilder, JoinHandle}, time::Duration, @@ -20,10 +20,10 @@ use txn_types::{Key, TimeStamp}; use super::{ compaction_filter::is_compaction_filter_allowed, config::GcWorkerConfigManager, - gc_worker::{sync_gc, GcSafePointProvider, GcTask}, + gc_worker::{schedule_gc, GcSafePointProvider, GcTask}, Result, }; -use crate::{server::metrics::*, tikv_util::sys::thread::StdThreadBuildWrapper}; +use crate::{server::metrics::*, storage::Callback, tikv_util::sys::thread::StdThreadBuildWrapper}; const POLL_SAFE_POINT_INTERVAL_SECS: u64 = 10; @@ -245,6 +245,8 @@ pub(super) struct GcManager GcManager { @@ -254,6 +256,7 @@ impl GcMan worker_scheduler: Scheduler>, cfg_tracker: GcWorkerConfigManager, feature_gate: FeatureGate, + concurrent_tasks: usize, ) -> GcManager { GcManager { cfg, @@ -263,6 +266,7 @@ impl GcMan gc_manager_ctx: GcManagerContext::new(), cfg_tracker, feature_gate, + max_concurrent_tasks: concurrent_tasks, } } @@ -442,13 +446,27 @@ impl GcMan let mut progress = Some(Key::from_encoded(BEGIN_KEY.to_vec())); // Records how many region we have GC-ed. - let mut processed_regions = 0; + let mut scheduled_regions = 0; + let task_controller = Arc::new((Mutex::new(0), Condvar::new())); + // the task_controller is the combination to control the number + // of tasks The mutex is used for protecting the number of current + // tasks, while the condvar is used for notifying/get notified when the + // number of current tasks is changed. 
+ let (lock, cvar) = &*task_controller; + let maybe_wait = |max_tasks| { + let mut current_tasks: std::sync::MutexGuard<'_, usize> = lock.lock().unwrap(); + while *current_tasks > max_tasks { + // Wait until the number of current tasks is below the limit + current_tasks = cvar.wait(current_tasks).unwrap(); + } + }; info!("gc_worker: auto gc starts"; "safe_point" => self.curr_safe_point()); // The following loop iterates all regions whose leader is on this TiKV and does // GC on them. At the same time, check whether safe_point is updated // periodically. If it's updated, rewinding will happen. + loop { self.gc_manager_ctx.check_stopped()?; if is_compaction_filter_allowed(&self.cfg_tracker.value(), &self.feature_gate) { @@ -462,9 +480,9 @@ impl GcMan // We have worked to the end and we need to rewind. Restart from beginning. progress = Some(Key::from_encoded(BEGIN_KEY.to_vec())); need_rewind = false; - info!("gc_worker: auto gc rewinds"; "processed_regions" => processed_regions); + info!("gc_worker: auto gc rewinds"; "scheduled_regions" => scheduled_regions); - processed_regions = 0; + scheduled_regions = 0; // Set the metric to zero to show that rewinding has happened. AUTO_GC_PROCESSED_REGIONS_GAUGE_VEC .with_label_values(&[PROCESS_TYPE_GC]) @@ -483,19 +501,40 @@ impl GcMan if finished { // We have worked to the end of the TiKV or our progress has reached `end`, and // we don't need to rewind. In this case, the round of GC has finished. - info!("gc_worker: auto gc finishes"; "processed_regions" => processed_regions); - return Ok(()); + info!("gc_worker: all regions task are scheduled"; + "processed_regions" => scheduled_regions, + ); + break; } } - assert!(progress.is_some()); // Before doing GC, check whether safe_point is updated periodically to // determine if rewinding is needed. 
self.check_if_need_rewind(&progress, &mut need_rewind, &mut end); - progress = self.gc_next_region(progress.unwrap(), &mut processed_regions)?; + let controller: Arc<(Mutex, Condvar)> = Arc::clone(&task_controller); + let cb = Box::new(move |_res| { + let (lock, cvar) = &*controller; + let mut current_tasks = lock.lock().unwrap(); + *current_tasks -= 1; + cvar.notify_one(); + AUTO_GC_PROCESSED_REGIONS_GAUGE_VEC + .with_label_values(&[PROCESS_TYPE_GC]) + .inc(); + }); + maybe_wait(self.max_concurrent_tasks - 1); + let mut current_tasks = lock.lock().unwrap(); + progress = self.async_gc_next_region(progress.unwrap(), cb, &mut current_tasks)?; + scheduled_regions += 1; } + + // wait for all tasks finished + self.gc_manager_ctx.check_stopped()?; + maybe_wait(0); + info!("gc_worker: auto gc finishes"; "processed_regions" => scheduled_regions); + + Ok(()) } /// Checks whether we need to rewind in this round of GC. Only used in @@ -536,13 +575,14 @@ impl GcMan } } - /// Does GC on the next region after `from_key`. Returns the end key of the - /// region it processed. If we have processed to the end of all regions, - /// returns `None`. - fn gc_next_region( + /// Does GC on the next region after `from_key` asynchronously. Returns the + /// end key of the region it processed. If we have processed to the end + /// of all regions, returns `None`. + fn async_gc_next_region( &mut self, from_key: Key, - processed_regions: &mut usize, + callback: Callback<()>, + running_tasks: &mut usize, ) -> GcManagerResult> { // Get the information of the next region to do GC. let (region, next_key) = self.get_next_gc_context(from_key); @@ -552,16 +592,16 @@ impl GcMan let hex_end = format!("{:?}", log_wrappers::Value::key(region.get_end_key())); debug!("trying gc"; "region_id" => region.id, "start_key" => &hex_start, "end_key" => &hex_end); - if let Err(e) = sync_gc(&self.worker_scheduler, region, self.curr_safe_point()) { - // Ignore the error and continue, since it's useless to retry this. 
- // TODO: Find a better way to handle errors. Maybe we should retry. - warn!("failed gc"; "start_key" => &hex_start, "end_key" => &hex_end, "err" => ?e); - } - - *processed_regions += 1; - AUTO_GC_PROCESSED_REGIONS_GAUGE_VEC - .with_label_values(&[PROCESS_TYPE_GC]) - .inc(); + let _ = schedule_gc( + &self.worker_scheduler, + region, + self.curr_safe_point(), + callback, + ) + .map(|_| { + *running_tasks += 1; + Ok::<(), GcManagerError>(()) + }); Ok(next_key) } @@ -710,8 +750,16 @@ mod tests { impl GcManagerTestUtil { pub fn new(regions: BTreeMap, RegionInfo>) -> Self { let (gc_task_sender, gc_task_receiver) = channel(); - let worker = WorkerBuilder::new("test-gc-manager").create(); - let scheduler = worker.start("gc-manager", MockGcRunner { tx: gc_task_sender }); + let worker = WorkerBuilder::new("test-gc-manager") + .thread_count(2) + .create(); + let scheduler = worker.start( + "gc-manager", + MockGcRunner { + tx: gc_task_sender.clone(), + }, + ); + worker.start("gc-manager", MockGcRunner { tx: gc_task_sender }); let (safe_point_sender, safe_point_receiver) = channel(); @@ -731,6 +779,7 @@ mod tests { scheduler, GcWorkerConfigManager::default(), Default::default(), + 2, ); Self { gc_manager: Some(gc_manager), diff --git a/src/server/gc_worker/gc_worker.rs b/src/server/gc_worker/gc_worker.rs index c608470ba87..a0537a478d0 100644 --- a/src/server/gc_worker/gc_worker.rs +++ b/src/server/gc_worker/gc_worker.rs @@ -34,6 +34,7 @@ use tikv_util::{ Either, }; use txn_types::{Key, TimeStamp}; +use yatp::{task::future::TaskCell, Remote}; use super::{ check_need_gc, @@ -178,7 +179,7 @@ where } /// Used to perform GC operations on the engine. 
-pub struct GcRunner { +pub struct GcRunnerCore { store_id: u64, engine: E, @@ -193,6 +194,26 @@ pub struct GcRunner { stats_map: HashMap, } +impl Clone for GcRunnerCore { + fn clone(&self) -> Self { + GcRunnerCore { + store_id: self.store_id, + engine: self.engine.clone(), + flow_info_sender: self.flow_info_sender.clone(), + limiter: self.limiter.clone(), + cfg: self.cfg.clone(), + cfg_tracker: self.cfg_tracker.clone(), + stats_map: HashMap::default(), + } + } +} + +/// Used to perform GC operations on the engine. +pub struct GcRunner { + inner: GcRunnerCore, + pool: Remote, +} + pub const MAX_RAW_WRITE_SIZE: usize = 32 * 1024; pub struct MvccRaw { @@ -282,7 +303,7 @@ fn init_snap_ctx(store_id: u64, region: &Region) -> Context { ctx } -impl GcRunner { +impl GcRunnerCore { pub fn new( store_id: u64, engine: E, @@ -918,18 +939,12 @@ impl GcRunner { error!("failed to flush deletes, will leave garbage"; "err" => ?e); } } -} - -impl Runnable for GcRunner { - type Task = GcTask; #[inline] fn run(&mut self, task: GcTask) { let _io_type_guard = WithIoType::new(IoType::Gc); let enum_label = task.get_enum_label(); - GC_GCTASK_COUNTER_STATIC.get(enum_label).inc(); - let timer = SlowTimer::from_secs(GC_TASK_SLOW_SECONDS); let update_metrics = |is_err| { GC_TASK_DURATION_HISTOGRAM_VEC @@ -941,9 +956,6 @@ impl Runnable for GcRunner { } }; - // Refresh config before handle task - self.refresh_cfg(); - match task { GcTask::Gc { region, @@ -1062,6 +1074,37 @@ impl Runnable for GcRunner { } } +impl GcRunner { + pub fn new( + store_id: u64, + engine: E, + flow_info_sender: Sender, + cfg_tracker: Tracker, + cfg: GcConfig, + pool: Remote, + ) -> Self { + Self { + inner: GcRunnerCore::new(store_id, engine, flow_info_sender, cfg_tracker, cfg), + pool, + } + } +} + +impl Runnable for GcRunner { + type Task = GcTask; + + #[inline] + fn run(&mut self, task: GcTask) { + // Refresh config before handle task + self.inner.refresh_cfg(); + + let mut inner = self.inner.clone(); + 
self.pool.spawn(async move { + inner.run(task); + }); + } +} + /// When we failed to schedule a `GcTask` to `GcRunner`, use this to handle the /// `ScheduleError`. fn handle_gc_task_schedule_error(e: ScheduleError>) -> Result<()> { @@ -1081,7 +1124,7 @@ fn handle_gc_task_schedule_error(e: ScheduleError>) -> Res } /// Schedules a `GcTask` to the `GcRunner`. -fn schedule_gc( +pub fn schedule_gc( scheduler: &Scheduler>, region: Region, safe_point: TimeStamp, @@ -1174,13 +1217,18 @@ impl GcWorker { feature_gate: FeatureGate, region_info_provider: Arc, ) -> Self { - let worker_builder = WorkerBuilder::new("gc-worker").pending_capacity(GC_MAX_PENDING_TASKS); + let worker_builder = WorkerBuilder::new("gc-worker") + .pending_capacity(GC_MAX_PENDING_TASKS) + .thread_count(cfg.num_threads); let worker = worker_builder.create().lazy_build("gc-worker"); let worker_scheduler = worker.scheduler(); GcWorker { engine, flow_info_sender: Some(flow_info_sender), - config_manager: GcWorkerConfigManager(Arc::new(VersionTrack::new(cfg))), + config_manager: GcWorkerConfigManager( + Arc::new(VersionTrack::new(cfg)), + Some(worker.pool()), + ), refs: Arc::new(AtomicUsize::new(1)), worker: Arc::new(Mutex::new(worker)), worker_scheduler, @@ -1219,6 +1267,7 @@ impl GcWorker { self.scheduler(), self.config_manager.clone(), self.feature_gate.clone(), + self.config_manager.value().num_threads, ) .start()?; *handle = Some(new_handle); @@ -1226,14 +1275,20 @@ impl GcWorker { } pub fn start(&mut self, store_id: u64) -> Result<()> { + let mut worker = self.worker.lock().unwrap(); let runner = GcRunner::new( store_id, self.engine.clone(), self.flow_info_sender.take().unwrap(), - self.config_manager.0.clone().tracker("gc-woker".to_owned()), + self.config_manager + .0 + .clone() + .tracker("gc-worker".to_owned()), self.config_manager.value().clone(), + worker.remote(), ); - self.worker.lock().unwrap().start(runner); + worker.start(runner); + Ok(()) } @@ -1296,6 +1351,10 @@ impl GcWorker { pub fn 
get_config_manager(&self) -> GcWorkerConfigManager { self.config_manager.clone() } + + pub fn get_worker_thread_count(&self) -> usize { + self.worker.lock().unwrap().pool_size() + } } #[cfg(any(test, feature = "testexport"))] @@ -1486,6 +1545,7 @@ mod tests { use engine_traits::Peekable as _; use futures::executor::block_on; use kvproto::{kvrpcpb::ApiVersion, metapb::Peer}; + use online_config::{ConfigChange, ConfigManager, ConfigValue}; use raft::StateRole; use raftstore::coprocessor::{ region_info_accessor::{MockRegionInfoProvider, RegionInfoAccessor}, @@ -1634,10 +1694,12 @@ mod tests { region2.mut_peers().push(new_peer(store_id, 2)); region2.set_start_key(split_key.to_vec()); + let mut gc_config = GcConfig::default(); + gc_config.num_threads = 2; let mut gc_worker = GcWorker::new( engine, tx, - GcConfig::default(), + gc_config, gate, Arc::new(MockRegionInfoProvider::new(vec![region1, region2])), ); @@ -1810,10 +1872,12 @@ mod tests { let mut host = CoprocessorHost::::default(); let ri_provider = RegionInfoAccessor::new(&mut host); + let mut gc_config = GcConfig::default(); + gc_config.num_threads = 2; let mut gc_worker = GcWorker::new( prefixed_engine.clone(), tx, - GcConfig::default(), + gc_config, feature_gate, Arc::new(ri_provider.clone()), ); @@ -1902,13 +1966,13 @@ mod tests { let (tx, _rx) = mpsc::channel(); let cfg = GcConfig::default(); - let mut runner = GcRunner::new( + let mut runner = GcRunnerCore::new( store_id, prefixed_engine.clone(), tx, - GcWorkerConfigManager(Arc::new(VersionTrack::new(cfg.clone()))) + GcWorkerConfigManager(Arc::new(VersionTrack::new(cfg.clone())), None) .0 - .tracker("gc-woker".to_owned()), + .tracker("gc-worker".to_owned()), cfg, ); @@ -1966,13 +2030,13 @@ mod tests { let (tx, _rx) = mpsc::channel(); let cfg = GcConfig::default(); - let mut runner = GcRunner::new( + let mut runner = GcRunnerCore::new( store_id, prefixed_engine.clone(), tx, - GcWorkerConfigManager(Arc::new(VersionTrack::new(cfg.clone()))) + 
GcWorkerConfigManager(Arc::new(VersionTrack::new(cfg.clone())), None) .0 - .tracker("gc-woker".to_owned()), + .tracker("gc-worker".to_owned()), cfg, ); @@ -2067,13 +2131,13 @@ mod tests { let (tx, _rx) = mpsc::channel(); let cfg = GcConfig::default(); - let mut runner = GcRunner::new( + let mut runner = GcRunnerCore::new( 1, prefixed_engine.clone(), tx, - GcWorkerConfigManager(Arc::new(VersionTrack::new(cfg.clone()))) + GcWorkerConfigManager(Arc::new(VersionTrack::new(cfg.clone())), None) .0 - .tracker("gc-woker".to_owned()), + .tracker("gc-worker".to_owned()), cfg, ); @@ -2202,10 +2266,12 @@ mod tests { let mut region = Region::default(); region.mut_peers().push(new_peer(store_id, 1)); + let mut gc_config = GcConfig::default(); + gc_config.num_threads = 2; let mut gc_worker = GcWorker::new( engine.clone(), tx, - GcConfig::default(), + gc_config, gate, Arc::new(MockRegionInfoProvider::new(vec![region.clone()])), ); @@ -2333,7 +2399,7 @@ mod tests { ) -> ( MultiRocksEngine, Arc, - GcRunner, + GcRunnerCore, Vec, mpsc::Receiver, ) { @@ -2386,13 +2452,13 @@ mod tests { ])); let cfg = GcConfig::default(); - let gc_runner = GcRunner::new( + let gc_runner = GcRunnerCore::new( store_id, engine.clone(), tx, - GcWorkerConfigManager(Arc::new(VersionTrack::new(cfg.clone()))) + GcWorkerConfigManager(Arc::new(VersionTrack::new(cfg.clone())), None) .0 - .tracker("gc-woker".to_owned()), + .tracker("gc-worker".to_owned()), cfg, ); @@ -2564,13 +2630,13 @@ mod tests { let ri_provider = Arc::new(MockRegionInfoProvider::new(vec![r1, r2])); let cfg = GcConfig::default(); - let mut gc_runner = GcRunner::new( + let mut gc_runner = GcRunnerCore::new( store_id, engine.clone(), tx, - GcWorkerConfigManager(Arc::new(VersionTrack::new(cfg.clone()))) + GcWorkerConfigManager(Arc::new(VersionTrack::new(cfg.clone())), None) .0 - .tracker("gc-woker".to_owned()), + .tracker("gc-worker".to_owned()), cfg, ); @@ -2756,4 +2822,33 @@ mod tests { test_destroy_range_for_multi_rocksdb_impl(b"k05", b"k195", 
vec![1, 2]); test_destroy_range_for_multi_rocksdb_impl(b"k099", b"k25", vec![2, 3]); } + + #[test] + fn test_update_gc_thread_count() { + let engine = TestEngineBuilder::new().build().unwrap(); + let (tx, _rx) = mpsc::channel(); + let gate = FeatureGate::default(); + gate.set_version("5.0.0").unwrap(); + let mut gc_config = GcConfig::default(); + gc_config.num_threads = 1; + let gc_worker = GcWorker::new( + engine, + tx, + gc_config, + gate, + Arc::new(MockRegionInfoProvider::new(vec![])), + ); + let mut config_change = ConfigChange::new(); + config_change.insert(String::from("num_threads"), ConfigValue::Usize(5)); + let mut cfg_manager = gc_worker.get_config_manager(); + cfg_manager.dispatch(config_change).unwrap(); + + assert_eq!(gc_worker.get_worker_thread_count(), 5); + + let mut config_change = ConfigChange::new(); + config_change.insert(String::from("num_threads"), ConfigValue::Usize(2)); + cfg_manager.dispatch(config_change).unwrap(); + + assert_eq!(gc_worker.get_worker_thread_count(), 2); + } } diff --git a/src/server/lock_manager/deadlock.rs b/src/server/lock_manager/deadlock.rs index 9583df80dd6..fd749cc3175 100644 --- a/src/server/lock_manager/deadlock.rs +++ b/src/server/lock_manager/deadlock.rs @@ -1119,7 +1119,7 @@ pub mod tests { use tikv_util::worker::FutureWorker; use super::*; - use crate::server::resolve::Callback; + use crate::server::resolve; #[test] fn test_detect_table() { @@ -1467,15 +1467,6 @@ pub mod tests { impl PdClient for MockPdClient {} - #[derive(Clone)] - pub(crate) struct MockResolver; - - impl StoreAddrResolver for MockResolver { - fn resolve(&self, _store_id: u64, _cb: Callback) -> Result<()> { - Err(Error::Other(box_err!("unimplemented"))) - } - } - fn start_deadlock_detector( host: &mut CoprocessorHost, ) -> (FutureWorker, Scheduler) { @@ -1485,7 +1476,7 @@ pub mod tests { let detector_runner = Detector::new( 1, Arc::new(MockPdClient {}), - MockResolver {}, + resolve::MockStoreAddrResolver::default(), 
Arc::new(SecurityManager::new(&SecurityConfig::default()).unwrap()), waiter_mgr_scheduler, &Config::default(), diff --git a/src/server/lock_manager/mod.rs b/src/server/lock_manager/mod.rs index 243d533a0e5..c42531ae0fd 100644 --- a/src/server/lock_manager/mod.rs +++ b/src/server/lock_manager/mod.rs @@ -318,7 +318,7 @@ mod tests { use self::{deadlock::tests::*, metrics::*, waiter_manager::tests::*}; use super::*; - use crate::storage::lock_manager::LockDigest; + use crate::{server::resolve::MockStoreAddrResolver, storage::lock_manager::LockDigest}; fn start_lock_manager() -> LockManager { let mut coprocessor_host = CoprocessorHost::::default(); @@ -336,7 +336,7 @@ mod tests { .start( 1, Arc::new(MockPdClient {}), - MockResolver {}, + MockStoreAddrResolver::default(), Arc::new(SecurityManager::new(&SecurityConfig::default()).unwrap()), &cfg, ) diff --git a/src/server/metrics.rs b/src/server/metrics.rs index 2745be59a71..c55a0c0ae8a 100644 --- a/src/server/metrics.rs +++ b/src/server/metrics.rs @@ -86,6 +86,7 @@ make_auto_flush_static_metric! { failed, success, tombstone, + not_found, } pub label_enum ReplicaReadLockCheckResult { @@ -98,6 +99,13 @@ make_auto_flush_static_metric! { fail, } + pub label_enum ResourcePriority { + high, + medium, + low, + unknown, + } + pub struct GcCommandCounterVec: LocalIntCounter { "type" => GcCommandKind, } @@ -133,6 +141,7 @@ make_auto_flush_static_metric! { pub struct GrpcMsgHistogramVec: LocalHistogram { "type" => GrpcTypeKind, + "priority" => ResourcePriority, } pub struct ReplicaReadLockCheckHistogramVec: LocalHistogram { @@ -208,10 +217,11 @@ lazy_static! { &["type"] ) .unwrap(); + // TODO: deprecate the "name" label in v8.0. 
pub static ref GRPC_RESOURCE_GROUP_COUNTER_VEC: IntCounterVec = register_int_counter_vec!( "tikv_grpc_resource_group_total", "Total number of handle grpc message for each resource group", - &["name"] + &["name", "resource_group"] ) .unwrap(); pub static ref GRPC_PROXY_MSG_COUNTER_VEC: IntCounterVec = register_int_counter_vec!( @@ -233,7 +243,7 @@ lazy_static! { pub static ref GRPC_MSG_HISTOGRAM_VEC: HistogramVec = register_histogram_vec!( "tikv_grpc_msg_duration_seconds", "Bucketed histogram of grpc server messages", - &["type"], + &["type","priority"], exponential_buckets(5e-5, 2.0, 22).unwrap() // 50us ~ 104s ) .unwrap(); @@ -400,6 +410,13 @@ lazy_static! { &["type", "store_id"] ) .unwrap(); + pub static ref RAFT_CLIENT_WAIT_CONN_READY_DURATION_HISTOGRAM_VEC: HistogramVec = register_histogram_vec!( + "tikv_server_raft_client_wait_ready_duration", + "Duration of wait raft client connection ready", + &["to"], + exponential_buckets(5e-5, 2.0, 22).unwrap() // 50us ~ 104s + ) + .unwrap(); pub static ref RAFT_MESSAGE_FLUSH_COUNTER: RaftMessageFlushCounterVec = register_static_int_counter_vec!( RaftMessageFlushCounterVec, @@ -599,3 +616,19 @@ pub fn record_request_source_metrics(source: String, duration: Duration) { } }); } + +impl From for ResourcePriority { + fn from(priority: u64) -> Self { + // the mapping definition of priority in TIDB repo, + // see: https://github.com/tikv/tikv/blob/a0dbe2d0b893489015fc99ae73c6646f7989fe32/components/resource_control/src/resource_group.rs#L79-L89 + if priority == 0 { + Self::unknown + } else if priority < 6 { + Self::low + } else if priority < 11 { + Self::medium + } else { + Self::high + } + } +} diff --git a/src/server/node.rs b/src/server/node.rs index 228f679ed14..bf19cb6c005 100644 --- a/src/server/node.rs +++ b/src/server/node.rs @@ -167,7 +167,7 @@ where pd_worker: LazyWorker>, store_meta: Arc>, coprocessor_host: CoprocessorHost, - importer: Arc, + importer: Arc>, split_check_scheduler: Scheduler, auto_split_controller: 
AutoSplitController, concurrency_manager: ConcurrencyManager, @@ -291,7 +291,7 @@ where }; if should_check { // Check if there are only TiDB data in the engine - let snapshot = engines.kv.snapshot(); + let snapshot = engines.kv.snapshot(None); for cf in DATA_CFS { for (start, end) in TIDB_RANGES_COMPLEMENT { let mut unexpected_data_key = None; @@ -455,7 +455,7 @@ where pd_worker: LazyWorker>, store_meta: Arc>, coprocessor_host: CoprocessorHost, - importer: Arc, + importer: Arc>, split_check_scheduler: Scheduler, auto_split_controller: AutoSplitController, concurrency_manager: ConcurrencyManager, diff --git a/src/server/raft_client.rs b/src/server/raft_client.rs index f30e5b36045..700d409c129 100644 --- a/src/server/raft_client.rs +++ b/src/server/raft_client.rs @@ -40,14 +40,18 @@ use tikv_kv::RaftExtension; use tikv_util::{ config::{Tracker, VersionTrack}, lru::LruCache, + time::duration_to_sec, timer::GLOBAL_TIMER_HANDLE, worker::Scheduler, }; use yatp::{task::future::TaskCell, ThreadPool}; use crate::server::{ - self, load_statistics::ThreadLoadPool, metrics::*, snap::Task as SnapTask, Config, - StoreAddrResolver, + load_statistics::ThreadLoadPool, + metrics::*, + resolve::{Error as ResolveError, Result as ResolveResult}, + snap::Task as SnapTask, + Config, StoreAddrResolver, }; pub struct MetadataSourceStoreId {} @@ -642,7 +646,7 @@ where S: StoreAddrResolver, R: RaftExtension + Unpin + 'static, { - fn resolve(&self) -> impl Future> { + fn resolve(&self) -> impl Future> { let (tx, rx) = oneshot::channel(); let store_id = self.store_id; let res = self.builder.resolver.resolve( @@ -673,7 +677,7 @@ where res?; match rx.await { Ok(a) => a, - Err(_) => Err(server::Error::Other( + Err(_) => Err(ResolveError::Other( "failed to receive resolve result".into(), )), } @@ -811,7 +815,13 @@ async fn start( let mut last_wake_time = None; let backoff_duration = back_end.builder.cfg.value().raft_client_max_backoff.0; let mut addr_channel = None; + let mut begin = None; + let 
mut try_count = 0; loop { + if begin.is_none() { + begin = Some(Instant::now()); + } + try_count += 1; maybe_backoff(backoff_duration, &mut last_wake_time).await; let f = back_end.resolve(); let addr = match f.await { @@ -824,8 +834,7 @@ async fn start( RESOLVE_STORE_COUNTER.with_label_values(&["failed"]).inc(); back_end.clear_pending_message("resolve"); error_unknown!(?e; "resolve store address failed"; "store_id" => back_end.store_id,); - // TOMBSTONE - if format!("{}", e).contains("has been removed") { + if let ResolveError::StoreTombstone(_) = e { let mut pool = pool.lock().unwrap(); if let Some(s) = pool.connections.remove(&(back_end.store_id, conn_id)) { s.set_conn_state(ConnState::Disconnected); @@ -860,7 +869,19 @@ async fn start( .report_store_unreachable(back_end.store_id); continue; } else { - debug!("connection established"; "store_id" => back_end.store_id, "addr" => %addr); + let wait_conn_duration = begin.unwrap_or_else(Instant::now).elapsed(); + info!("connection established"; + "store_id" => back_end.store_id, + "addr" => %addr, + "cost" => ?wait_conn_duration, + "msg_count" => ?back_end.queue.len(), + "try_count" => try_count, + ); + RAFT_CLIENT_WAIT_CONN_READY_DURATION_HISTOGRAM_VEC + .with_label_values(&[addr.as_str()]) + .observe(duration_to_sec(wait_conn_duration)); + begin = None; + try_count = 0; } let client = TikvClient::new(channel); @@ -940,7 +961,7 @@ struct CachedQueue { /// ```text /// for m in msgs { /// if !raft_client.send(m) { -/// // handle error. +/// // handle error. 
/// } /// } /// raft_client.flush(); diff --git a/src/server/raftkv/mod.rs b/src/server/raftkv/mod.rs index 2074d469310..9f42925b6d4 100644 --- a/src/server/raftkv/mod.rs +++ b/src/server/raftkv/mod.rs @@ -22,7 +22,7 @@ use std::{ use collections::{HashMap, HashSet}; use concurrency_manager::ConcurrencyManager; -use engine_traits::{CfName, KvEngine, MvccProperties, Snapshot}; +use engine_traits::{CfName, KvEngine, MvccProperties, Snapshot, SnapshotContext}; use futures::{future::BoxFuture, task::AtomicWaker, Future, Stream, StreamExt, TryFutureExt}; use kvproto::{ errorpb, @@ -644,10 +644,15 @@ where })); let tracker = store_cb.read_tracker().unwrap(); + let snap_ctx = ctx.start_ts.map(|ts| SnapshotContext { + read_ts: ts.into_inner(), + region_id: ctx.pb_ctx.get_region_id(), + }); + if res.is_ok() { res = self .router - .read(ctx.read_id, cmd, store_cb) + .read(snap_ctx, ctx.read_id, cmd, store_cb) .map_err(kv::Error::from); } async move { @@ -683,7 +688,7 @@ where tracker.metrics.read_index_propose_wait_nanos as f64 / 1_000_000_000.0, ); - // snapshot may be hanlded by lease read in raftstore + // snapshot may be handled by lease read in raftstore if tracker.metrics.read_index_confirm_wait_nanos > 0 { ASYNC_REQUESTS_DURATIONS_VEC .snapshot_read_index_confirm diff --git a/src/server/raftkv/raft_extension.rs b/src/server/raftkv/raft_extension.rs index d3178842489..733d60c838c 100644 --- a/src/server/raftkv/raft_extension.rs +++ b/src/server/raftkv/raft_extension.rs @@ -121,6 +121,7 @@ where split_keys, callback: raftstore::store::Callback::write(cb), source: source.into(), + share_source_region_size: false, }; let res = self.router.send_casual_msg(region_id, req); Box::pin(async move { diff --git a/src/server/raftkv2/mod.rs b/src/server/raftkv2/mod.rs index dacc90a91f0..321a6614350 100644 --- a/src/server/raftkv2/mod.rs +++ b/src/server/raftkv2/mod.rs @@ -19,7 +19,13 @@ use kvproto::{ }; pub use node::NodeV2; pub use raft_extension::Extension; -use 
raftstore::store::{util::encode_start_ts_into_flag_data, RegionSnapshot}; +use raftstore::{ + store::{ + cmd_resp, msg::ErrorCallback, util::encode_start_ts_into_flag_data, RaftCmdExtraOpts, + RegionSnapshot, + }, + Error, +}; use raftstore_v2::{ router::{ message::SimpleWrite, CmdResChannelBuilder, CmdResEvent, CmdResStream, PeerMsg, RaftRouter, @@ -28,6 +34,7 @@ use raftstore_v2::{ }; use tikv_kv::{Modify, WriteEvent}; use tikv_util::time::Instant; +use tracker::{get_tls_tracker_token, GLOBAL_TRACKERS}; use txn_types::{TxnExtra, TxnExtraScheduler, WriteBatchFlags}; use super::{ @@ -172,7 +179,7 @@ impl tikv_kv::Engine for RaftKv2 { .set_key_ranges(mem::take(&mut ctx.key_ranges).into()); } ASYNC_REQUESTS_COUNTER_VEC.snapshot.all.inc(); - let begin_instant = Instant::now_coarse(); + let begin_instant = Instant::now(); let mut header = new_request_header(ctx.pb_ctx); let mut flags = 0; @@ -195,14 +202,49 @@ impl tikv_kv::Engine for RaftKv2 { let mut cmd = RaftCmdRequest::default(); cmd.set_header(header); cmd.set_requests(vec![req].into()); - let f = self.router.snapshot(cmd); + let res: tikv_kv::Result<()> = (|| { + fail_point!("raftkv_async_snapshot_err", |_| { + Err(box_err!("injected error for async_snapshot")) + }); + Ok(()) + })(); + let f = if res.is_err() { + None + } else { + Some(self.router.snapshot(cmd)) + }; + async move { - let res = f.await; + res?; + let res = f.unwrap().await; match res { Ok(snap) => { - ASYNC_REQUESTS_DURATIONS_VEC - .snapshot - .observe(begin_instant.saturating_elapsed_secs()); + let elapse = begin_instant.saturating_elapsed_secs(); + let tracker = get_tls_tracker_token(); + GLOBAL_TRACKERS.with_tracker(tracker, |tracker| { + if tracker.metrics.read_index_propose_wait_nanos > 0 { + ASYNC_REQUESTS_DURATIONS_VEC + .snapshot_read_index_propose_wait + .observe( + tracker.metrics.read_index_propose_wait_nanos as f64 + / 1_000_000_000.0, + ); + // snapshot may be handled by lease read in raftstore + if 
tracker.metrics.read_index_confirm_wait_nanos > 0 { + ASYNC_REQUESTS_DURATIONS_VEC + .snapshot_read_index_confirm + .observe( + tracker.metrics.read_index_confirm_wait_nanos as f64 + / 1_000_000_000.0, + ); + } + } else if tracker.metrics.local_read { + ASYNC_REQUESTS_DURATIONS_VEC + .snapshot_local_read + .observe(elapse); + } + }); + ASYNC_REQUESTS_DURATIONS_VEC.snapshot.observe(elapse); ASYNC_REQUESTS_COUNTER_VEC.snapshot.success.inc(); Ok(snap) } @@ -241,6 +283,17 @@ impl tikv_kv::Engine for RaftKv2 { let region_id = ctx.region_id; ASYNC_REQUESTS_COUNTER_VEC.write.all.inc(); + + let inject_region_not_found = (|| { + // If rid is some, only the specified region reports error. + // If rid is None, all regions report error. + fail_point!("raftkv_early_error_report", |rid| -> bool { + rid.and_then(|rid| rid.parse().ok()) + .map_or(true, |rid: u64| rid == region_id) + }); + false + })(); + let begin_instant = Instant::now_coarse(); let mut header = Box::new(new_request_header(ctx)); let mut flags = 0; @@ -275,17 +328,25 @@ impl tikv_kv::Engine for RaftKv2 { }); } let (ch, sub) = builder.build(); - let msg = PeerMsg::SimpleWrite(SimpleWrite { - header, - data, - ch, - send_time: Instant::now_coarse(), - }); - let res = self - .router - .store_router() - .check_send(region_id, msg) - .map_err(tikv_kv::Error::from); + let res = if inject_region_not_found { + ch.report_error(cmd_resp::new_error(Error::RegionNotFound(region_id))); + Err(tikv_kv::Error::from(Error::RegionNotFound(region_id))) + } else { + let msg = PeerMsg::SimpleWrite(SimpleWrite { + header, + data, + ch, + send_time: Instant::now_coarse(), + extra_opts: RaftCmdExtraOpts { + deadline: batch.deadline, + disk_full_opt: batch.disk_full_opt, + }, + }); + self.router + .store_router() + .check_send(region_id, msg) + .map_err(tikv_kv::Error::from) + }; (Transform { resp: CmdResStream::new(sub), early_err: res.err(), diff --git a/src/server/raftkv2/node.rs b/src/server/raftkv2/node.rs index 
d9b17c5d35c..5fce5c0024b 100644 --- a/src/server/raftkv2/node.rs +++ b/src/server/raftkv2/node.rs @@ -113,7 +113,7 @@ where pd_worker: LazyWorker, store_cfg: Arc>, state: &Mutex, - sst_importer: Arc, + sst_importer: Arc>, key_manager: Option>, grpc_service_mgr: GrpcServiceManager, ) -> Result<()> @@ -218,7 +218,7 @@ where background: Worker, pd_worker: LazyWorker, store_cfg: Arc>, - sst_importer: Arc, + sst_importer: Arc>, key_manager: Option>, grpc_service_mgr: GrpcServiceManager, ) -> Result<()> diff --git a/src/server/raftkv2/raft_extension.rs b/src/server/raftkv2/raft_extension.rs index f2f433999b9..8b15c73fb65 100644 --- a/src/server/raftkv2/raft_extension.rs +++ b/src/server/raftkv2/raft_extension.rs @@ -49,6 +49,11 @@ impl tikv_kv::RaftExtension for Extension .send_control(StoreMsg::StoreUnreachable { to_store_id }); } + fn report_store_maybe_tombstone(&self, store_id: u64) { + self.router + .broadcast_normal(|| PeerMsg::StoreMaybeTombstone { store_id }); + } + fn report_snapshot_status( &self, region_id: u64, @@ -71,7 +76,7 @@ impl tikv_kv::RaftExtension for Extension split_keys: Vec>, source: String, ) -> futures::future::BoxFuture<'static, tikv_kv::Result>> { - let (msg, sub) = PeerMsg::request_split(region_epoch, split_keys, source); + let (msg, sub) = PeerMsg::request_split(region_epoch, split_keys, source, true); let res = self.router.check_send(region_id, msg); Box::pin(async move { res?; diff --git a/src/server/resolve.rs b/src/server/resolve.rs index c831ff28d17..013511183e2 100644 --- a/src/server/resolve.rs +++ b/src/server/resolve.rs @@ -1,6 +1,7 @@ // Copyright 2016 TiKV Project Authors. Licensed under Apache-2.0. 
use std::{ + error::Error as StdError, fmt::{self, Display, Formatter}, sync::{Arc, Mutex}, }; @@ -9,16 +10,28 @@ use collections::HashMap; use kvproto::replication_modepb::ReplicationMode; use pd_client::{take_peer_address, PdClient}; use raftstore::store::GlobalReplicationState; +use thiserror::Error; use tikv_kv::RaftExtension; use tikv_util::{ + info, time::Instant, worker::{Runnable, Scheduler, Worker}, }; -use super::{metrics::*, Result}; +use super::metrics::*; const STORE_ADDRESS_REFRESH_SECONDS: u64 = 60; +#[derive(Debug, Error)] +pub enum Error { + #[error("{0:?}")] + Other(#[from] Box), + #[error("store {0} has been removed")] + StoreTombstone(u64), +} + +pub type Result = std::result::Result; + pub type Callback = Box) + Send>; pub fn store_address_refresh_interval_secs() -> u64 { @@ -95,9 +108,21 @@ where // it explicitly. Err(pd_client::Error::StoreTombstone(_)) => { RESOLVE_STORE_COUNTER_STATIC.tombstone.inc(); - return Err(box_err!("store {} has been removed", store_id)); + self.router.report_store_maybe_tombstone(store_id); + return Err(Error::StoreTombstone(store_id)); + } + Err(e) => { + // Tombstone store may be removed manually or automatically + // after 30 days of deletion. PD returns + // "invalid store ID %d, not found" for such store id. 
+ // See https://github.com/tikv/pd/blob/v7.3.0/server/grpc_service.go#L777-L780 + if format!("{:?}", e).contains("not found") { + RESOLVE_STORE_COUNTER_STATIC.not_found.inc(); + info!("resolve store not found"; "store_id" => store_id); + self.router.report_store_maybe_tombstone(store_id); + } + return Err(box_err!(e)); } - Err(e) => return Err(box_err!(e)), }; let mut group_id = None; let mut state = self.state.lock().unwrap(); @@ -181,6 +206,25 @@ impl StoreAddrResolver for PdStoreAddrResolver { } } +#[derive(Clone)] +pub struct MockStoreAddrResolver { + pub resolve_fn: Arc Result<()> + Send + Sync>, +} + +impl StoreAddrResolver for MockStoreAddrResolver { + fn resolve(&self, store_id: u64, cb: Callback) -> Result<()> { + (self.resolve_fn)(store_id, cb) + } +} + +impl Default for MockStoreAddrResolver { + fn default() -> MockStoreAddrResolver { + MockStoreAddrResolver { + resolve_fn: Arc::new(|_, _| unimplemented!()), + } + } +} + #[cfg(test)] mod tests { use std::{net::SocketAddr, ops::Sub, str::FromStr, sync::Arc, thread, time::Duration}; diff --git a/src/server/server.rs b/src/server/server.rs index 948930ae7ae..09782be4e16 100644 --- a/src/server/server.rs +++ b/src/server/server.rs @@ -437,6 +437,7 @@ pub mod test_router { use engine_rocks::{RocksEngine, RocksSnapshot}; use kvproto::raft_serverpb::RaftMessage; use raftstore::{router::RaftStoreRouter, store::*, Result as RaftStoreResult}; + use tikv_util::time::Instant as TiInstant; use super::*; @@ -496,12 +497,10 @@ pub mod test_router { impl RaftStoreRouter for TestRaftStoreRouter { fn send_raft_msg(&self, msg: RaftMessage) -> RaftStoreResult<()> { - let _ = self - .tx - .send(Either::Left(PeerMsg::RaftMessage(InspectedRaftMessage { - heap_size: 0, - msg, - }))); + let _ = self.tx.send(Either::Left(PeerMsg::RaftMessage( + InspectedRaftMessage { heap_size: 0, msg }, + Some(TiInstant::now()), + ))); Ok(()) } @@ -533,8 +532,8 @@ mod tests { use super::{ super::{ - resolve::{Callback as ResolveCallback, 
StoreAddrResolver}, - Config, Result, + resolve::{self, Callback as ResolveCallback, StoreAddrResolver}, + Config, }, *, }; @@ -552,7 +551,7 @@ mod tests { } impl StoreAddrResolver for MockResolver { - fn resolve(&self, _: u64, cb: ResolveCallback) -> Result<()> { + fn resolve(&self, _: u64, cb: ResolveCallback) -> resolve::Result<()> { if self.quick_fail.load(Ordering::SeqCst) { return Err(box_err!("quick fail")); } diff --git a/src/server/service/batch.rs b/src/server/service/batch.rs index ba377bed4d2..3cc9a45e9dc 100644 --- a/src/server/service/batch.rs +++ b/src/server/service/batch.rs @@ -12,7 +12,7 @@ use tracker::{with_tls_tracker, RequestInfo, RequestType, Tracker, TrackerToken, use crate::{ server::{ - metrics::{GrpcTypeKind, REQUEST_BATCH_SIZE_HISTOGRAM_VEC}, + metrics::{GrpcTypeKind, ResourcePriority, REQUEST_BATCH_SIZE_HISTOGRAM_VEC}, service::kv::{batch_commands_response, GrpcRequestDuration, MeasuredSingleResponse}, }, storage::{ @@ -162,6 +162,7 @@ impl ResponseBatchConsumer<(Option>, Statistics)> for GetCommandResponse res: Result<(Option>, Statistics)>, begin: Instant, request_source: String, + resource_priority: ResourcePriority, ) { let mut resp = GetResponse::default(); if let Some(err) = extract_region_error(&res) { @@ -185,9 +186,13 @@ impl ResponseBatchConsumer<(Option>, Statistics)> for GetCommandResponse cmd: Some(batch_commands_response::response::Cmd::Get(resp)), ..Default::default() }; - let mesure = - GrpcRequestDuration::new(begin, GrpcTypeKind::kv_batch_get_command, request_source); - let task = MeasuredSingleResponse::new(id, res, mesure); + let measure = GrpcRequestDuration::new( + begin, + GrpcTypeKind::kv_batch_get_command, + request_source, + resource_priority, + ); + let task = MeasuredSingleResponse::new(id, res, measure); if self.tx.send_with(task, WakePolicy::Immediately).is_err() { error!("KvService response batch commands fail"); } @@ -201,6 +206,7 @@ impl ResponseBatchConsumer>> for GetCommandResponseConsumer { res: 
Result>>, begin: Instant, request_source: String, + resource_priority: ResourcePriority, ) { let mut resp = RawGetResponse::default(); if let Some(err) = extract_region_error(&res) { @@ -216,9 +222,13 @@ impl ResponseBatchConsumer>> for GetCommandResponseConsumer { cmd: Some(batch_commands_response::response::Cmd::RawGet(resp)), ..Default::default() }; - let mesure = - GrpcRequestDuration::new(begin, GrpcTypeKind::raw_batch_get_command, request_source); - let task = MeasuredSingleResponse::new(id, res, mesure); + let measure = GrpcRequestDuration::new( + begin, + GrpcTypeKind::raw_batch_get_command, + request_source, + resource_priority, + ); + let task = MeasuredSingleResponse::new(id, res, measure); if self.tx.send_with(task, WakePolicy::Immediately).is_err() { error!("KvService response batch commands fail"); } @@ -241,6 +251,15 @@ fn future_batch_get_command( .zip(gets.iter()) .map(|(id, req)| (*id, req.get_context().get_request_source().to_string())) .collect(); + + let group_priority = gets + .first() + .unwrap() + .get_context() + .get_resource_control_context() + .get_override_priority(); + let resource_priority = ResourcePriority::from(group_priority); + let res = storage.batch_get_command( gets, requests, @@ -266,6 +285,7 @@ fn future_batch_get_command( begin_instant, GrpcTypeKind::kv_batch_get_command, source, + resource_priority, ); let task = MeasuredSingleResponse::new(id, res, measure); if tx.send_with(task, WakePolicy::Immediately).is_err() { @@ -292,6 +312,15 @@ fn future_batch_raw_get_command( .zip(gets.iter()) .map(|(id, req)| (*id, req.get_context().get_request_source().to_string())) .collect(); + + let group_priority = gets + .first() + .unwrap() + .get_context() + .get_resource_control_context() + .get_override_priority(); + let resource_priority = ResourcePriority::from(group_priority); + let res = storage.raw_batch_get_command( gets, requests, @@ -312,6 +341,7 @@ fn future_batch_raw_get_command( begin_instant, 
GrpcTypeKind::raw_batch_get_command, source, + resource_priority, ); let task = MeasuredSingleResponse::new(id, res, measure); if tx.send_with(task, WakePolicy::Immediately).is_err() { diff --git a/src/server/service/kv.rs b/src/server/service/kv.rs index 5a4327ba46e..02bfca0473e 100644 --- a/src/server/service/kv.rs +++ b/src/server/service/kv.rs @@ -198,11 +198,13 @@ macro_rules! handle_request { let source = req.get_context().get_request_source().to_owned(); let resource_control_ctx = req.get_context().get_resource_control_context(); + let mut resource_group_priority = ResourcePriority::unknown; if let Some(resource_manager) = &self.resource_manager { resource_manager.consume_penalty(resource_control_ctx); + resource_group_priority= ResourcePriority::from(resource_control_ctx.override_priority); } GRPC_RESOURCE_GROUP_COUNTER_VEC - .with_label_values(&[resource_control_ctx.get_resource_group_name()]) + .with_label_values(&[resource_control_ctx.get_resource_group_name(), resource_control_ctx.get_resource_group_name()]) .inc(); let resp = $future_name(&self.storage, req); let task = async move { @@ -212,6 +214,7 @@ macro_rules! 
handle_request { sink.success(resp).await?; GRPC_MSG_HISTOGRAM_STATIC .$fn_name + .get(resource_group_priority) .observe(elapsed.as_secs_f64()); record_request_source_metrics(source, elapsed); ServerResult::Ok(()) @@ -430,6 +433,7 @@ impl Tikv for Service { sink.success(resp).await?; GRPC_MSG_HISTOGRAM_STATIC .kv_prepare_flashback_to_version + .unknown .observe(elapsed.as_secs_f64()); record_request_source_metrics(source, elapsed); ServerResult::Ok(()) @@ -461,6 +465,7 @@ impl Tikv for Service { sink.success(resp).await?; GRPC_MSG_HISTOGRAM_STATIC .kv_flashback_to_version + .unknown .observe(elapsed.as_secs_f64()); record_request_source_metrics(source, elapsed); ServerResult::Ok(()) @@ -480,21 +485,29 @@ impl Tikv for Service { forward_unary!(self.proxy, coprocessor, ctx, req, sink); let source = req.get_context().get_request_source().to_owned(); let resource_control_ctx = req.get_context().get_resource_control_context(); + let mut resource_group_priority = ResourcePriority::unknown; if let Some(resource_manager) = &self.resource_manager { resource_manager.consume_penalty(resource_control_ctx); + resource_group_priority = + ResourcePriority::from(resource_control_ctx.override_priority); } + GRPC_RESOURCE_GROUP_COUNTER_VEC - .with_label_values(&[resource_control_ctx.get_resource_group_name()]) + .with_label_values(&[ + resource_control_ctx.get_resource_group_name(), + resource_control_ctx.get_resource_group_name(), + ]) .inc(); let begin_instant = Instant::now(); let future = future_copr(&self.copr, Some(ctx.peer()), req); let task = async move { let resp = future.await?.consume(); - sink.success(resp).await?; let elapsed = begin_instant.saturating_elapsed(); + sink.success(resp).await?; GRPC_MSG_HISTOGRAM_STATIC .coprocessor + .get(resource_group_priority) .observe(elapsed.as_secs_f64()); record_request_source_metrics(source, elapsed); ServerResult::Ok(()) @@ -518,21 +531,28 @@ impl Tikv for Service { ) { let source = 
req.get_context().get_request_source().to_owned(); let resource_control_ctx = req.get_context().get_resource_control_context(); + let mut resource_group_priority = ResourcePriority::unknown; if let Some(resource_manager) = &self.resource_manager { resource_manager.consume_penalty(resource_control_ctx); + resource_group_priority = + ResourcePriority::from(resource_control_ctx.override_priority); } GRPC_RESOURCE_GROUP_COUNTER_VEC - .with_label_values(&[resource_control_ctx.get_resource_group_name()]) + .with_label_values(&[ + resource_control_ctx.get_resource_group_name(), + resource_control_ctx.get_resource_group_name(), + ]) .inc(); let begin_instant = Instant::now(); let future = future_raw_coprocessor(&self.copr_v2, &self.storage, req); let task = async move { let resp = future.await?; - sink.success(resp).await?; let elapsed = begin_instant.saturating_elapsed(); + sink.success(resp).await?; GRPC_MSG_HISTOGRAM_STATIC .raw_coprocessor + .get(resource_group_priority) .observe(elapsed.as_secs_f64()); record_request_source_metrics(source, elapsed); ServerResult::Ok(()) @@ -580,10 +600,11 @@ impl Tikv for Service { if let Err(e) = res { resp.set_error(format!("{}", e)); } - sink.success(resp).await?; let elapsed = begin_instant.saturating_elapsed(); + sink.success(resp).await?; GRPC_MSG_HISTOGRAM_STATIC .unsafe_destroy_range + .unknown .observe(elapsed.as_secs_f64()); record_request_source_metrics(source, elapsed); ServerResult::Ok(()) @@ -607,11 +628,17 @@ impl Tikv for Service { ) { let begin_instant = Instant::now(); let resource_control_ctx = req.get_context().get_resource_control_context(); + let mut resource_group_priority = ResourcePriority::unknown; if let Some(resource_manager) = &self.resource_manager { resource_manager.consume_penalty(resource_control_ctx); + resource_group_priority = + ResourcePriority::from(resource_control_ctx.override_priority); } GRPC_RESOURCE_GROUP_COUNTER_VEC - .with_label_values(&[resource_control_ctx.get_resource_group_name()]) + 
.with_label_values(&[ + resource_control_ctx.get_resource_group_name(), + resource_control_ctx.get_resource_group_name(), + ]) .inc(); let mut stream = self @@ -628,6 +655,7 @@ impl Tikv for Service { Ok(_) => { GRPC_MSG_HISTOGRAM_STATIC .coprocessor_stream + .get(resource_group_priority) .observe(begin_instant.saturating_elapsed().as_secs_f64()); let _ = sink.close().await; } @@ -863,10 +891,11 @@ impl Tikv for Service { } } } - sink.success(resp).await?; GRPC_MSG_HISTOGRAM_STATIC .split_region + .unknown .observe(begin_instant.saturating_elapsed().as_secs_f64()); + sink.success(resp).await?; ServerResult::Ok(()) } .map_err(|e| { @@ -1015,6 +1044,10 @@ impl Tikv for Service { .schedule(CheckLeaderTask::CheckLeader { leaders, cb }) .map_err(|e| Error::Other(format!("{}", e).into()))?; let regions = resp.await?; + GRPC_MSG_HISTOGRAM_STATIC + .check_leader + .unknown + .observe(begin_instant.saturating_elapsed().as_secs_f64()); let mut resp = CheckLeaderResponse::default(); resp.set_ts(ts); resp.set_regions(regions); @@ -1029,6 +1062,7 @@ impl Tikv for Service { let elapsed = begin_instant.saturating_elapsed(); GRPC_MSG_HISTOGRAM_STATIC .check_leader + .unknown .observe(elapsed.as_secs_f64()); ServerResult::Ok(()) } @@ -1099,6 +1133,7 @@ fn response_batch_commands_request( begin: Instant, label: GrpcTypeKind, source: String, + resource_priority: ResourcePriority, ) where MemoryTraceGuard: From, F: Future> + Send + 'static, @@ -1109,6 +1144,7 @@ fn response_batch_commands_request( begin, label, source, + resource_priority, }; let task = MeasuredSingleResponse::new(id, resp, measure); if let Err(e) = tx.send_with(task, WakePolicy::Immediately) { @@ -1147,15 +1183,18 @@ fn handle_batch_commands_request( // For some invalid requests. 
let begin_instant = Instant::now(); let resp = future::ok(batch_commands_response::Response::default()); - response_batch_commands_request(id, resp, tx.clone(), begin_instant, GrpcTypeKind::invalid, String::default()); + response_batch_commands_request(id, resp, tx.clone(), begin_instant, GrpcTypeKind::invalid, String::default(), ResourcePriority::unknown); }, Some(batch_commands_request::request::Cmd::Get(req)) => { let resource_control_ctx = req.get_context().get_resource_control_context(); + let mut resource_group_priority = ResourcePriority::unknown; if let Some(resource_manager) = resource_manager { resource_manager.consume_penalty(resource_control_ctx); + resource_group_priority = ResourcePriority::from(resource_control_ctx.override_priority); } + GRPC_RESOURCE_GROUP_COUNTER_VEC - .with_label_values(&[resource_control_ctx.get_resource_group_name()]) + .with_label_values(&[ resource_control_ctx.get_resource_group_name(), resource_control_ctx.get_resource_group_name()]) .inc(); if batcher.as_mut().map_or(false, |req_batch| { req_batch.can_batch_get(&req) @@ -1167,16 +1206,18 @@ fn handle_batch_commands_request( let resp = future_get(storage, req) .map_ok(oneof!(batch_commands_response::response::Cmd::Get)) .map_err(|_| GRPC_MSG_FAIL_COUNTER.kv_get.inc()); - response_batch_commands_request(id, resp, tx.clone(), begin_instant, GrpcTypeKind::kv_get, source); + response_batch_commands_request(id, resp, tx.clone(), begin_instant, GrpcTypeKind::kv_get, source,resource_group_priority); } }, Some(batch_commands_request::request::Cmd::RawGet(req)) => { let resource_control_ctx = req.get_context().get_resource_control_context(); + let mut resource_group_priority = ResourcePriority::unknown; if let Some(resource_manager) = resource_manager { resource_manager.consume_penalty(resource_control_ctx); + resource_group_priority = ResourcePriority::from(resource_control_ctx.override_priority); } GRPC_RESOURCE_GROUP_COUNTER_VEC - 
.with_label_values(&[resource_control_ctx.get_resource_group_name()]) + .with_label_values(&[resource_control_ctx.get_resource_group_name(), resource_control_ctx.get_resource_group_name()]) .inc(); if batcher.as_mut().map_or(false, |req_batch| { req_batch.can_batch_raw_get(&req) @@ -1188,25 +1229,27 @@ fn handle_batch_commands_request( let resp = future_raw_get(storage, req) .map_ok(oneof!(batch_commands_response::response::Cmd::RawGet)) .map_err(|_| GRPC_MSG_FAIL_COUNTER.raw_get.inc()); - response_batch_commands_request(id, resp, tx.clone(), begin_instant, GrpcTypeKind::raw_get, source); + response_batch_commands_request(id, resp, tx.clone(), begin_instant, GrpcTypeKind::raw_get, source,resource_group_priority); } }, - Some(batch_commands_request::request::Cmd::Coprocessor(mut req)) => { + Some(batch_commands_request::request::Cmd::Coprocessor(req)) => { let resource_control_ctx = req.get_context().get_resource_control_context(); + let mut resource_group_priority = ResourcePriority::unknown; if let Some(resource_manager) = resource_manager { resource_manager.consume_penalty(resource_control_ctx); + resource_group_priority = ResourcePriority::from(resource_control_ctx.override_priority ); } GRPC_RESOURCE_GROUP_COUNTER_VEC - .with_label_values(&[resource_control_ctx.get_resource_group_name()]) - .inc(); + .with_label_values(&[resource_control_ctx.get_resource_group_name(), resource_control_ctx.get_resource_group_name()]) + .inc(); let begin_instant = Instant::now(); - let source = req.mut_context().take_request_source(); + let source = req.get_context().get_request_source().to_owned(); let resp = future_copr(copr, Some(peer.to_string()), req) .map_ok(|resp| { resp.map(oneof!(batch_commands_response::response::Cmd::Coprocessor)) }) .map_err(|_| GRPC_MSG_FAIL_COUNTER.coprocessor.inc()); - response_batch_commands_request(id, resp, tx.clone(), begin_instant, GrpcTypeKind::coprocessor, source); + response_batch_commands_request(id, resp, tx.clone(), begin_instant, 
GrpcTypeKind::coprocessor, source,resource_group_priority); }, Some(batch_commands_request::request::Cmd::Empty(req)) => { let begin_instant = Instant::now(); @@ -1223,22 +1266,25 @@ fn handle_batch_commands_request( begin_instant, GrpcTypeKind::invalid, String::default(), + ResourcePriority::unknown, ); } - $(Some(batch_commands_request::request::Cmd::$cmd(mut req)) => { + $(Some(batch_commands_request::request::Cmd::$cmd(req)) => { let resource_control_ctx = req.get_context().get_resource_control_context(); + let mut resource_group_priority = ResourcePriority::unknown; if let Some(resource_manager) = resource_manager { resource_manager.consume_penalty(resource_control_ctx); + resource_group_priority = ResourcePriority::from(resource_control_ctx.override_priority); } GRPC_RESOURCE_GROUP_COUNTER_VEC - .with_label_values(&[resource_control_ctx.get_resource_group_name()]) + .with_label_values(&[resource_control_ctx.get_resource_group_name(), resource_control_ctx.get_resource_group_name()]) .inc(); let begin_instant = Instant::now(); - let source = req.mut_context().take_request_source(); + let source = req.get_context().get_request_source().to_owned(); let resp = $future_fn($($arg,)* req) .map_ok(oneof!(batch_commands_response::response::Cmd::$cmd)) .map_err(|_| GRPC_MSG_FAIL_COUNTER.$metric_name.inc()); - response_batch_commands_request(id, resp, tx.clone(), begin_instant, GrpcTypeKind::$metric_name, source); + response_batch_commands_request(id, resp, tx.clone(), begin_instant, GrpcTypeKind::$metric_name, source,resource_group_priority); })* Some(batch_commands_request::request::Cmd::Import(_)) => unimplemented!(), } @@ -1288,10 +1334,12 @@ fn handle_measures_for_batch_commands(measures: &mut MeasuredBatchResponse) { label, begin, source, + resource_priority, } = measure; let elapsed = now.saturating_duration_since(begin); GRPC_MSG_HISTOGRAM_STATIC .get(label) + .get(resource_priority) .observe(elapsed.as_secs_f64()); record_request_source_metrics(source, elapsed); 
let exec_details = resp.cmd.as_mut().and_then(|cmd| match cmd { @@ -2234,13 +2282,20 @@ pub struct GrpcRequestDuration { pub begin: Instant, pub label: GrpcTypeKind, pub source: String, + pub resource_priority: ResourcePriority, } impl GrpcRequestDuration { - pub fn new(begin: Instant, label: GrpcTypeKind, source: String) -> Self { + pub fn new( + begin: Instant, + label: GrpcTypeKind, + source: String, + resource_priority: ResourcePriority, + ) -> Self { GrpcRequestDuration { begin, label, source, + resource_priority, } } } diff --git a/src/server/status_server/jeprof.in b/src/server/status_server/jeprof.in new file mode 100644 index 00000000000..cadf15d7d8e --- /dev/null +++ b/src/server/status_server/jeprof.in @@ -0,0 +1,5727 @@ +#! /usr/bin/env perl + +# Copyright (c) 1998-2007, Google Inc. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above +# copyright notice, this list of conditions and the following disclaimer +# in the documentation and/or other materials provided with the +# distribution. +# * Neither the name of Google Inc. nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +# --- +# Program for printing the profile generated by common/profiler.cc, +# or by the heap profiler (common/debugallocation.cc) +# +# The profile contains a sequence of entries of the form: +# <count> <stack trace> +# This program parses the profile, and generates user-readable +# output. +# +# Examples: +# +# % tools/jeprof "program" "profile" +# Enters "interactive" mode +# +# % tools/jeprof --text "program" "profile" +# Generates one line per procedure +# +# % tools/jeprof --gv "program" "profile" +# Generates annotated call-graph and displays via "gv" +# +# % tools/jeprof --gv --focus=Mutex "program" "profile" +# Restrict to code paths that involve an entry that matches "Mutex" +# +# % tools/jeprof --gv --focus=Mutex --ignore=string "program" "profile" +# Restrict to code paths that involve an entry that matches "Mutex" +# and does not match "string" +# +# % tools/jeprof --list=IBF_CheckDocid "program" "profile" +# Generates source listing of all routines with at least one +# sample that match the --list=<regexp> pattern. The listing is +# annotated with the flat and cumulative sample counts at each line. +# +# % tools/jeprof --disasm=IBF_CheckDocid "program" "profile" +# Generates disassembly listing of all routines with at least one +# sample that match the --disasm=<regexp> pattern. The listing is +# annotated with the flat and cumulative sample counts at each PC value. +# +# TODO: Use color to indicate files?
+ +use strict; +use warnings; +use Getopt::Long; +use Cwd; + +my $JEPROF_VERSION = "unknown"; +my $PPROF_VERSION = "2.0"; + +# These are the object tools we use which can come from a +# user-specified location using --tools, from the JEPROF_TOOLS +# environment variable, or from the environment. +my %obj_tool_map = ( + "objdump" => "objdump", + "nm" => "nm", + "addr2line" => "addr2line", + "c++filt" => "c++filt", + ## ConfigureObjTools may add architecture-specific entries: + #"nm_pdb" => "nm-pdb", # for reading windows (PDB-format) executables + #"addr2line_pdb" => "addr2line-pdb", # ditto + #"otool" => "otool", # equivalent of objdump on OS X +); +# NOTE: these are lists, so you can put in commandline flags if you want. +my @DOT = ("dot"); # leave non-absolute, since it may be in /usr/local +my @GV = ("gv"); +my @EVINCE = ("evince"); # could also be xpdf or perhaps acroread +my @KCACHEGRIND = ("kcachegrind"); +my @PS2PDF = ("ps2pdf"); +# These are used for dynamic profiles +my @URL_FETCHER = ("curl", "-s", "--fail"); + +# These are the web pages that servers need to support for dynamic profiles +my $HEAP_PAGE = "/pprof/heap"; +my $PROFILE_PAGE = "/pprof/profile"; # must support cgi-param "?seconds=#" +my $PMUPROFILE_PAGE = "/pprof/pmuprofile(?:\\?.*)?"; # must support cgi-param + # ?seconds=#&event=x&period=n +my $GROWTH_PAGE = "/pprof/growth"; +my $CONTENTION_PAGE = "/pprof/contention"; +my $WALL_PAGE = "/pprof/wall(?:\\?.*)?"; # accepts options like namefilter +my $FILTEREDPROFILE_PAGE = "/pprof/filteredprofile(?:\\?.*)?"; +my $CENSUSPROFILE_PAGE = "/pprof/censusprofile(?:\\?.*)?"; # must support cgi-param + # "?seconds=#", + # "?tags_regexp=#" and + # "?type=#". +my $SYMBOL_PAGE = "/pprof/symbol"; # must support symbol lookup via POST +my $PROGRAM_NAME_PAGE = "/pprof/cmdline"; + +# These are the web pages that can be named on the command line. +# All the alternatives must begin with /. +my $PROFILES = "($HEAP_PAGE|$PROFILE_PAGE|$PMUPROFILE_PAGE|" . 
+ "$GROWTH_PAGE|$CONTENTION_PAGE|$WALL_PAGE|" . + "$FILTEREDPROFILE_PAGE|$CENSUSPROFILE_PAGE)"; + +# default binary name +my $UNKNOWN_BINARY = "(unknown)"; + +# There is a pervasive dependency on the length (in hex characters, +# i.e., nibbles) of an address, distinguishing between 32-bit and +# 64-bit profiles. To err on the safe size, default to 64-bit here: +my $address_length = 16; + +my $dev_null = "/dev/null"; +if (! -e $dev_null && $^O =~ /MSWin/) { # $^O is the OS perl was built for + $dev_null = "nul"; +} + +# A list of paths to search for shared object files +my @prefix_list = (); + +# Special routine name that should not have any symbols. +# Used as separator to parse "addr2line -i" output. +my $sep_symbol = '_fini'; +my $sep_address = undef; + +##### Argument parsing ##### + +sub usage_string { + return < + is a space separated list of profile names. +jeprof [options] + is a list of profile files where each file contains + the necessary symbol mappings as well as profile data (likely generated + with --raw). +jeprof [options] + is a remote form. Symbols are obtained from host:port$SYMBOL_PAGE + + Each name can be: + /path/to/profile - a path to a profile file + host:port[/] - a location of a service to get profile from + + The / can be $HEAP_PAGE, $PROFILE_PAGE, /pprof/pmuprofile, + $GROWTH_PAGE, $CONTENTION_PAGE, /pprof/wall, + $CENSUSPROFILE_PAGE, or /pprof/filteredprofile. + For instance: + jeprof http://myserver.com:80$HEAP_PAGE + If / is omitted, the service defaults to $PROFILE_PAGE (cpu profiling). +jeprof --symbols + Maps addresses to symbol names. In this mode, stdin should be a + list of library mappings, in the same format as is found in the heap- + and cpu-profile files (this loosely matches that of /proc/self/maps + on linux), followed by a list of hex addresses to map, one per line. 
+ + For more help with querying remote servers, including how to add the + necessary server-side support code, see this filename (or one like it): + + /usr/doc/gperftools-$PPROF_VERSION/pprof_remote_servers.html + +Options: + --cum Sort by cumulative data + --base= Subtract from before display + --interactive Run in interactive mode (interactive "help" gives help) [default] + --seconds= Length of time for dynamic profiles [default=30 secs] + --add_lib= Read additional symbols and line info from the given library + --lib_prefix=

Comma separated list of library path prefixes + +Reporting Granularity: + --addresses Report at address level + --lines Report at source line level + --functions Report at function level [default] + --files Report at source file level + +Output type: + --text Generate text report + --callgrind Generate callgrind format to stdout + --gv Generate Postscript and display + --evince Generate PDF and display + --web Generate SVG and display + --list= Generate source listing of matching routines + --disasm= Generate disassembly of matching routines + --symbols Print demangled symbol names found at given addresses + --dot Generate DOT file to stdout + --ps Generate Postcript to stdout + --pdf Generate PDF to stdout + --svg Generate SVG to stdout + --gif Generate GIF to stdout + --raw Generate symbolized jeprof data (useful with remote fetch) + --collapsed Generate collapsed stacks for building flame graphs + (see http://www.brendangregg.com/flamegraphs.html) + +Heap-Profile Options: + --inuse_space Display in-use (mega)bytes [default] + --inuse_objects Display in-use objects + --alloc_space Display allocated (mega)bytes + --alloc_objects Display allocated objects + --show_bytes Display space in bytes + --drop_negative Ignore negative differences + +Contention-profile options: + --total_delay Display total delay at each region [default] + --contentions Display number of delays at each region + --mean_delay Display mean delay at each region + +Call-graph Options: + --nodecount= Show at most so many nodes [default=80] + --nodefraction= Hide nodes below *total [default=.005] + --edgefraction= Hide edges below *total [default=.001] + --maxdegree= Max incoming/outgoing edges per node [default=8] + --focus= Focus on backtraces with nodes matching + --thread= Show profile for thread + --ignore= Ignore backtraces with nodes matching + --scale= Set GV scaling [default=0] + --heapcheck Make nodes with non-0 object counts + (i.e. 
direct leak generators) more visible + --retain= Retain only nodes that match + --exclude= Exclude all nodes that match + +Miscellaneous: + --tools=[,...] \$PATH for object tool pathnames + --test Run unit tests + --help This message + --version Version information + --debug-syms-by-id (Linux only) Find debug symbol files by build ID as well as by name + +Environment Variables: + JEPROF_TMPDIR Profiles directory. Defaults to \$HOME/jeprof + JEPROF_TOOLS Prefix for object tools pathnames + +Examples: + +jeprof /bin/ls ls.prof + Enters "interactive" mode +jeprof --text /bin/ls ls.prof + Outputs one line per procedure +jeprof --web /bin/ls ls.prof + Displays annotated call-graph in web browser +jeprof --gv /bin/ls ls.prof + Displays annotated call-graph via 'gv' +jeprof --gv --focus=Mutex /bin/ls ls.prof + Restricts to code paths including a .*Mutex.* entry +jeprof --gv --focus=Mutex --ignore=string /bin/ls ls.prof + Code paths including Mutex but not string +jeprof --list=getdir /bin/ls ls.prof + (Per-line) annotated source listing for getdir() +jeprof --disasm=getdir /bin/ls ls.prof + (Per-PC) annotated disassembly for getdir() + +jeprof http://localhost:1234/ + Enters "interactive" mode +jeprof --text localhost:1234 + Outputs one line per procedure for localhost:1234 +jeprof --raw localhost:1234 > ./local.raw +jeprof --text ./local.raw + Fetches a remote profile for later analysis and then + analyzes it in text mode. +EOF +} + +sub version_string { + return < \$main::opt_help, + "version!" => \$main::opt_version, + "cum!" => \$main::opt_cum, + "base=s" => \$main::opt_base, + "seconds=i" => \$main::opt_seconds, + "add_lib=s" => \$main::opt_lib, + "lib_prefix=s" => \$main::opt_lib_prefix, + "functions!" => \$main::opt_functions, + "lines!" => \$main::opt_lines, + "addresses!" => \$main::opt_addresses, + "files!" => \$main::opt_files, + "text!" => \$main::opt_text, + "callgrind!" 
=> \$main::opt_callgrind, + "list=s" => \$main::opt_list, + "disasm=s" => \$main::opt_disasm, + "symbols!" => \$main::opt_symbols, + "gv!" => \$main::opt_gv, + "evince!" => \$main::opt_evince, + "web!" => \$main::opt_web, + "dot!" => \$main::opt_dot, + "ps!" => \$main::opt_ps, + "pdf!" => \$main::opt_pdf, + "svg!" => \$main::opt_svg, + "gif!" => \$main::opt_gif, + "raw!" => \$main::opt_raw, + "collapsed!" => \$main::opt_collapsed, + "interactive!" => \$main::opt_interactive, + "nodecount=i" => \$main::opt_nodecount, + "nodefraction=f" => \$main::opt_nodefraction, + "edgefraction=f" => \$main::opt_edgefraction, + "maxdegree=i" => \$main::opt_maxdegree, + "focus=s" => \$main::opt_focus, + "thread=s" => \$main::opt_thread, + "ignore=s" => \$main::opt_ignore, + "scale=i" => \$main::opt_scale, + "heapcheck" => \$main::opt_heapcheck, + "retain=s" => \$main::opt_retain, + "exclude=s" => \$main::opt_exclude, + "inuse_space!" => \$main::opt_inuse_space, + "inuse_objects!" => \$main::opt_inuse_objects, + "alloc_space!" => \$main::opt_alloc_space, + "alloc_objects!" => \$main::opt_alloc_objects, + "show_bytes!" => \$main::opt_show_bytes, + "drop_negative!" => \$main::opt_drop_negative, + "total_delay!" => \$main::opt_total_delay, + "contentions!" => \$main::opt_contentions, + "mean_delay!" => \$main::opt_mean_delay, + "tools=s" => \$main::opt_tools, + "test!" => \$main::opt_test, + "debug!" => \$main::opt_debug, + "debug-syms-by-id!" 
=> \$main::opt_debug_syms_by_id, + # Undocumented flags used only by unittests: + "test_stride=i" => \$main::opt_test_stride, + ) || usage("Invalid option(s)"); + + # Deal with the standard --help and --version + if ($main::opt_help) { + print usage_string(); + exit(0); + } + + if ($main::opt_version) { + print version_string(); + exit(0); + } + + # Disassembly/listing/symbols mode requires address-level info + if ($main::opt_disasm || $main::opt_list || $main::opt_symbols) { + $main::opt_functions = 0; + $main::opt_lines = 0; + $main::opt_addresses = 1; + $main::opt_files = 0; + } + + # Check heap-profiling flags + if ($main::opt_inuse_space + + $main::opt_inuse_objects + + $main::opt_alloc_space + + $main::opt_alloc_objects > 1) { + usage("Specify at most on of --inuse/--alloc options"); + } + + # Check output granularities + my $grains = + $main::opt_functions + + $main::opt_lines + + $main::opt_addresses + + $main::opt_files + + 0; + if ($grains > 1) { + usage("Only specify one output granularity option"); + } + if ($grains == 0) { + $main::opt_functions = 1; + } + + # Check output modes + my $modes = + $main::opt_text + + $main::opt_callgrind + + ($main::opt_list eq '' ? 0 : 1) + + ($main::opt_disasm eq '' ? 0 : 1) + + ($main::opt_symbols == 0 ? 
0 : 1) + + $main::opt_gv + + $main::opt_evince + + $main::opt_web + + $main::opt_dot + + $main::opt_ps + + $main::opt_pdf + + $main::opt_svg + + $main::opt_gif + + $main::opt_raw + + $main::opt_collapsed + + $main::opt_interactive + + 0; + if ($modes > 1) { + usage("Only specify one output mode"); + } + if ($modes == 0) { + if (-t STDOUT) { # If STDOUT is a tty, activate interactive mode + $main::opt_interactive = 1; + } else { + $main::opt_text = 1; + } + } + + if ($main::opt_test) { + RunUnitTests(); + # Should not return + exit(1); + } + + # Binary name and profile arguments list + $main::prog = ""; + @main::pfile_args = (); + + # Remote profiling without a binary (using $SYMBOL_PAGE instead) + if (@ARGV > 0) { + if (IsProfileURL($ARGV[0])) { + $main::use_symbol_page = 1; + } elsif (IsSymbolizedProfileFile($ARGV[0])) { + $main::use_symbolized_profile = 1; + $main::prog = $UNKNOWN_BINARY; # will be set later from the profile file + } + } + + if ($main::use_symbol_page || $main::use_symbolized_profile) { + # We don't need a binary! + my %disabled = ('--lines' => $main::opt_lines, + '--disasm' => $main::opt_disasm); + for my $option (keys %disabled) { + usage("$option cannot be used without a binary") if $disabled{$option}; + } + # Set $main::prog later... 
+ scalar(@ARGV) || usage("Did not specify profile file"); + } elsif ($main::opt_symbols) { + # --symbols needs a binary-name (to run nm on, etc) but not profiles + $main::prog = shift(@ARGV) || usage("Did not specify program"); + } else { + $main::prog = shift(@ARGV) || usage("Did not specify program"); + scalar(@ARGV) || usage("Did not specify profile file"); + } + + # Parse profile file/location arguments + foreach my $farg (@ARGV) { + if ($farg =~ m/(.*)\@([0-9]+)(|\/.*)$/ ) { + my $machine = $1; + my $num_machines = $2; + my $path = $3; + for (my $i = 0; $i < $num_machines; $i++) { + unshift(@main::pfile_args, "$i.$machine$path"); + } + } else { + unshift(@main::pfile_args, $farg); + } + } + + if ($main::use_symbol_page) { + unless (IsProfileURL($main::pfile_args[0])) { + error("The first profile should be a remote form to use $SYMBOL_PAGE\n"); + } + CheckSymbolPage(); + $main::prog = FetchProgramName(); + } elsif (!$main::use_symbolized_profile) { # may not need objtools! + ConfigureObjTools($main::prog) + } + + # Break the opt_lib_prefix into the prefix_list array + @prefix_list = split (',', $main::opt_lib_prefix); + + # Remove trailing / from the prefixes, in the list to prevent + # searching things like /my/path//lib/mylib.so + foreach (@prefix_list) { + s|/+$||; + } + + # Flag to prevent us from trying over and over to use + # elfutils if it's not installed (used only with + # --debug-syms-by-id option). + $main::gave_up_on_elfutils = 0; +} + +sub FilterAndPrint { + my ($profile, $symbols, $libs, $thread) = @_; + + # Get total data in profile + my $total = TotalProfile($profile); + + # Remove uniniteresting stack items + $profile = RemoveUninterestingFrames($symbols, $profile); + + # Focus? + if ($main::opt_focus ne '') { + $profile = FocusProfile($symbols, $profile, $main::opt_focus); + } + + # Ignore? 
+ if ($main::opt_ignore ne '') { + $profile = IgnoreProfile($symbols, $profile, $main::opt_ignore); + } + + my $calls = ExtractCalls($symbols, $profile); + + # Reduce profiles to required output granularity, and also clean + # each stack trace so a given entry exists at most once. + my $reduced = ReduceProfile($symbols, $profile); + + # Get derived profiles + my $flat = FlatProfile($reduced); + my $cumulative = CumulativeProfile($reduced); + + # Print + if (!$main::opt_interactive) { + if ($main::opt_disasm) { + PrintDisassembly($libs, $flat, $cumulative, $main::opt_disasm); + } elsif ($main::opt_list) { + PrintListing($total, $libs, $flat, $cumulative, $main::opt_list, 0); + } elsif ($main::opt_text) { + # Make sure the output is empty when have nothing to report + # (only matters when --heapcheck is given but we must be + # compatible with old branches that did not pass --heapcheck always): + if ($total != 0) { + printf("Total%s: %s %s\n", + (defined($thread) ? " (t$thread)" : ""), + Unparse($total), Units()); + } + PrintText($symbols, $flat, $cumulative, -1); + } elsif ($main::opt_raw) { + PrintSymbolizedProfile($symbols, $profile, $main::prog); + } elsif ($main::opt_collapsed) { + PrintCollapsedStacks($symbols, $profile); + } elsif ($main::opt_callgrind) { + PrintCallgrind($calls); + } else { + if (PrintDot($main::prog, $symbols, $profile, $flat, $cumulative, $total)) { + if ($main::opt_gv) { + RunGV(TempName($main::next_tmpfile, "ps"), ""); + } elsif ($main::opt_evince) { + RunEvince(TempName($main::next_tmpfile, "pdf"), ""); + } elsif ($main::opt_web) { + my $tmp = TempName($main::next_tmpfile, "svg"); + RunWeb($tmp); + # The command we run might hand the file name off + # to an already running browser instance and then exit. + # Normally, we'd remove $tmp on exit (right now), + # but fork a child to remove $tmp a little later, so that the + # browser has time to load it first. 
+ delete $main::tempnames{$tmp}; + if (fork() == 0) { + sleep 5; + unlink($tmp); + exit(0); + } + } + } else { + cleanup(); + exit(1); + } + } + } else { + InteractiveMode($profile, $symbols, $libs, $total); + } +} + +sub Main() { + Init(); + $main::collected_profile = undef; + @main::profile_files = (); + $main::op_time = time(); + + # Printing symbols is special and requires a lot less info that most. + if ($main::opt_symbols) { + PrintSymbols(*STDIN); # Get /proc/maps and symbols output from stdin + return; + } + + # Fetch all profile data + FetchDynamicProfiles(); + + # this will hold symbols that we read from the profile files + my $symbol_map = {}; + + # Read one profile, pick the last item on the list + my $data = ReadProfile($main::prog, pop(@main::profile_files)); + my $profile = $data->{profile}; + my $pcs = $data->{pcs}; + my $libs = $data->{libs}; # Info about main program and shared libraries + $symbol_map = MergeSymbols($symbol_map, $data->{symbols}); + + # Add additional profiles, if available. 
+ if (scalar(@main::profile_files) > 0) { + foreach my $pname (@main::profile_files) { + my $data2 = ReadProfile($main::prog, $pname); + $profile = AddProfile($profile, $data2->{profile}); + $pcs = AddPcs($pcs, $data2->{pcs}); + $symbol_map = MergeSymbols($symbol_map, $data2->{symbols}); + } + } + + # Subtract base from profile, if specified + if ($main::opt_base ne '') { + my $base = ReadProfile($main::prog, $main::opt_base); + $profile = SubtractProfile($profile, $base->{profile}); + $pcs = AddPcs($pcs, $base->{pcs}); + $symbol_map = MergeSymbols($symbol_map, $base->{symbols}); + } + + # Collect symbols + my $symbols; + if ($main::use_symbolized_profile) { + $symbols = FetchSymbols($pcs, $symbol_map); + } elsif ($main::use_symbol_page) { + $symbols = FetchSymbols($pcs); + } else { + # TODO(csilvers): $libs uses the /proc/self/maps data from profile1, + # which may differ from the data from subsequent profiles, especially + # if they were run on different machines. Use appropriate libs for + # each pc somehow. + $symbols = ExtractSymbols($libs, $pcs); + } + + if (!defined($main::opt_thread)) { + FilterAndPrint($profile, $symbols, $libs); + } + if (defined($data->{threads})) { + foreach my $thread (sort { $a <=> $b } keys(%{$data->{threads}})) { + if (defined($main::opt_thread) && + ($main::opt_thread eq '*' || $main::opt_thread == $thread)) { + my $thread_profile = $data->{threads}{$thread}; + FilterAndPrint($thread_profile, $symbols, $libs, $thread); + } + } + } + + cleanup(); + exit(0); +} + +##### Entry Point ##### + +Main(); + +# Temporary code to detect if we're running on a Goobuntu system. 
# These systems don't have the right stuff installed for the special
# Readline libraries to work, so as a temporary workaround, we default
# to using the normal stdio code, rather than the fancier readline-based
# code.
#
# Returns 0 when /lib/libtermcap.so.2 exists (readline should be okay)
# and 1 otherwise.
sub ReadlineMightFail {
  return (-e '/lib/libtermcap.so.2') ? 0 : 1;
}

# Displays the file $fname with the viewer configured in @GV.
#   $fname - file to display
#   $bg    - "" to wait for the viewer, or " &" to run it in the background
# The viewer is first probed with "--version" to decide whether it accepts
# double-dash options.
sub RunGV {
  my $fname = shift;
  my $bg = shift;       # "" or " &" if we should run in background
  if (!system(ShellEscape(@GV, "--version") . " >$dev_null 2>&1")) {
    # Options using double dash are supported by this gv version.
    # Also, turn on noantialias to better handle bug in gv for
    # postscript files with large dimensions.
    # TODO: Maybe we should not pass the --noantialias flag
    # if the gv version is known to work properly without the flag.
    system(ShellEscape(@GV, "--scale=$main::opt_scale", "--noantialias", $fname)
           . $bg);
  } else {
    # Old gv version - only supports options that use single dash.
    print STDERR ShellEscape(@GV, "-scale", $main::opt_scale) . "\n";
    system(ShellEscape(@GV, "-scale", "$main::opt_scale", $fname) . $bg);
  }
}

# Displays the file $fname with the viewer configured in @EVINCE.
#   $fname - file to display
#   $bg    - "" to wait for the viewer, or " &" to run it in the background
sub RunEvince {
  my $fname = shift;
  my $bg = shift;       # "" or " &" if we should run in background
  system(ShellEscape(@EVINCE, $fname) . $bg);
}

# Opens the file $fname in a web browser.
# On Darwin (as reported by `uname`) the file is handed to /usr/bin/open;
# elsewhere each candidate browser is tried in order until one exits with
# status 0.  Prints a message to STDERR if none of them succeeds.
sub RunWeb {
  my $fname = shift;
  print STDERR "Loading web page file:///$fname\n";

  if (`uname` =~ /Darwin/) {
    # OS X: open will use standard preference for SVG files.
    system("/usr/bin/open", $fname);
    return;
  }

  # Some kind of Unix; try generic symlinks, then specific browsers.
  # (Stop once we find one.)
  # Works best if the browser is already running.
  my @alt = (
    "/etc/alternatives/gnome-www-browser",
    "/etc/alternatives/x-www-browser",
    "google-chrome",
    "firefox",
  );
  # Not named $b: that is the special variable used by sort blocks.
  foreach my $browser (@alt) {
    if (system($browser, $fname) == 0) {
      return;
    }
  }

  print STDERR "Could not load web browser.\n";
}

# Announces on STDERR and then runs the viewer configured in @KCACHEGRIND
# on the file $fname.
#   $fname - callgrind file to display
#   $bg    - "" to wait for the viewer, or " &" to run it in the background
sub RunKcachegrind {
  my $fname = shift;
  my $bg = shift;       # "" or " &" if we should run in background
  print STDERR "Starting '@KCACHEGRIND " . $fname . $bg . "'\n";
  system(ShellEscape(@KCACHEGRIND, $fname) . $bg);
}


##### Interactive helper routines #####

# Runs the interactive prompt: reads commands one at a time and passes each
# to InteractiveCommand() until it returns false or input ends.
#   $orig_profile, $symbols, $libs, $total - forwarded unchanged to
#                                            InteractiveCommand()
# Term::ReadLine is used when STDIN is a terminal, ReadlineMightFail() is
# false and the module loads; otherwise lines are read directly from STDIN.
sub InteractiveMode {
  $| = 1;  # Make output unbuffered for interactive mode
  my ($orig_profile, $symbols, $libs, $total) = @_;

  print STDERR "Welcome to jeprof! For help, type 'help'.\n";

  # Use ReadLine if it's installed and input comes from a console.
  if ( -t STDIN &&
       !ReadlineMightFail() &&
       defined(eval {require Term::ReadLine}) ) {
    my $term = Term::ReadLine->new('jeprof');
    # NOTE(review): unlike the plain-stdin loop below, this loop does not
    # save/restore $main::opt_lines around each command -- confirm whether
    # that difference is intended.
    while ( defined ($_ = $term->readline('(jeprof) '))) {
      $term->addhistory($_) if /\S/;
      if (!InteractiveCommand($orig_profile, $symbols, $libs, $total, $_)) {
        last;    # exit when we get an interactive command to quit
      }
    }
  } else {       # don't have readline
    while (1) {
      print STDERR "(jeprof) ";
      $_ = <STDIN>;
      last if ! defined $_ ;
      s/\r//g;         # turn windows-looking lines into unix-looking lines

      # Save some flags that might be reset by InteractiveCommand()
      my $save_opt_lines = $main::opt_lines;

      if (!InteractiveCommand($orig_profile, $symbols, $libs, $total, $_)) {
        last;    # exit when we get an interactive command to quit
      }

      # Restore flags
      $main::opt_lines = $save_opt_lines;
    }
  }
}

# Takes two args: orig profile, and command to run.
+# Returns 1 if we should keep going, or 0 if we were asked to quit +sub InteractiveCommand { + my($orig_profile, $symbols, $libs, $total, $command) = @_; + $_ = $command; # just to make future m//'s easier + if (!defined($_)) { + print STDERR "\n"; + return 0; + } + if (m/^\s*quit/) { + return 0; + } + if (m/^\s*help/) { + InteractiveHelpMessage(); + return 1; + } + # Clear all the mode options -- mode is controlled by "$command" + $main::opt_text = 0; + $main::opt_callgrind = 0; + $main::opt_disasm = 0; + $main::opt_list = 0; + $main::opt_gv = 0; + $main::opt_evince = 0; + $main::opt_cum = 0; + + if (m/^\s*(text|top)(\d*)\s*(.*)/) { + $main::opt_text = 1; + + my $line_limit = ($2 ne "") ? int($2) : 10; + + my $routine; + my $ignore; + ($routine, $ignore) = ParseInteractiveArgs($3); + + my $profile = ProcessProfile($total, $orig_profile, $symbols, "", $ignore); + my $reduced = ReduceProfile($symbols, $profile); + + # Get derived profiles + my $flat = FlatProfile($reduced); + my $cumulative = CumulativeProfile($reduced); + + PrintText($symbols, $flat, $cumulative, $line_limit); + return 1; + } + if (m/^\s*callgrind\s*([^ \n]*)/) { + $main::opt_callgrind = 1; + + # Get derived profiles + my $calls = ExtractCalls($symbols, $orig_profile); + my $filename = $1; + if ( $1 eq '' ) { + $filename = TempName($main::next_tmpfile, "callgrind"); + } + PrintCallgrind($calls, $filename); + if ( $1 eq '' ) { + RunKcachegrind($filename, " & "); + $main::next_tmpfile++; + } + + return 1; + } + if (m/^\s*(web)?list\s*(.+)/) { + my $html = (defined($1) && ($1 eq "web")); + $main::opt_list = 1; + + my $routine; + my $ignore; + ($routine, $ignore) = ParseInteractiveArgs($2); + + my $profile = ProcessProfile($total, $orig_profile, $symbols, "", $ignore); + my $reduced = ReduceProfile($symbols, $profile); + + # Get derived profiles + my $flat = FlatProfile($reduced); + my $cumulative = CumulativeProfile($reduced); + + PrintListing($total, $libs, $flat, $cumulative, $routine, $html); + 
return 1; + } + if (m/^\s*disasm\s*(.+)/) { + $main::opt_disasm = 1; + + my $routine; + my $ignore; + ($routine, $ignore) = ParseInteractiveArgs($1); + + # Process current profile to account for various settings + my $profile = ProcessProfile($total, $orig_profile, $symbols, "", $ignore); + my $reduced = ReduceProfile($symbols, $profile); + + # Get derived profiles + my $flat = FlatProfile($reduced); + my $cumulative = CumulativeProfile($reduced); + + PrintDisassembly($libs, $flat, $cumulative, $routine); + return 1; + } + if (m/^\s*(gv|web|evince)\s*(.*)/) { + $main::opt_gv = 0; + $main::opt_evince = 0; + $main::opt_web = 0; + if ($1 eq "gv") { + $main::opt_gv = 1; + } elsif ($1 eq "evince") { + $main::opt_evince = 1; + } elsif ($1 eq "web") { + $main::opt_web = 1; + } + + my $focus; + my $ignore; + ($focus, $ignore) = ParseInteractiveArgs($2); + + # Process current profile to account for various settings + my $profile = ProcessProfile($total, $orig_profile, $symbols, + $focus, $ignore); + my $reduced = ReduceProfile($symbols, $profile); + + # Get derived profiles + my $flat = FlatProfile($reduced); + my $cumulative = CumulativeProfile($reduced); + + if (PrintDot($main::prog, $symbols, $profile, $flat, $cumulative, $total)) { + if ($main::opt_gv) { + RunGV(TempName($main::next_tmpfile, "ps"), " &"); + } elsif ($main::opt_evince) { + RunEvince(TempName($main::next_tmpfile, "pdf"), " &"); + } elsif ($main::opt_web) { + RunWeb(TempName($main::next_tmpfile, "svg")); + } + $main::next_tmpfile++; + } + return 1; + } + if (m/^\s*$/) { + return 1; + } + print STDERR "Unknown command: try 'help'.\n"; + return 1; +} + + +sub ProcessProfile { + my $total_count = shift; + my $orig_profile = shift; + my $symbols = shift; + my $focus = shift; + my $ignore = shift; + + # Process current profile to account for various settings + my $profile = $orig_profile; + printf("Total: %s %s\n", Unparse($total_count), Units()); + if ($focus ne '') { + $profile = FocusProfile($symbols, 
$profile, $focus); + my $focus_count = TotalProfile($profile); + printf("After focusing on '%s': %s %s of %s (%0.1f%%)\n", + $focus, + Unparse($focus_count), Units(), + Unparse($total_count), ($focus_count*100.0) / $total_count); + } + if ($ignore ne '') { + $profile = IgnoreProfile($symbols, $profile, $ignore); + my $ignore_count = TotalProfile($profile); + printf("After ignoring '%s': %s %s of %s (%0.1f%%)\n", + $ignore, + Unparse($ignore_count), Units(), + Unparse($total_count), + ($ignore_count*100.0) / $total_count); + } + + return $profile; +} + +sub InteractiveHelpMessage { + print STDERR <{$k}; + my @addrs = split(/\n/, $k); + if ($#addrs >= 0) { + my $depth = $#addrs + 1; + # int(foo / 2**32) is the only reliable way to get rid of bottom + # 32 bits on both 32- and 64-bit systems. + print pack('L*', $count & 0xFFFFFFFF, int($count / 2**32)); + print pack('L*', $depth & 0xFFFFFFFF, int($depth / 2**32)); + + foreach my $full_addr (@addrs) { + my $addr = $full_addr; + $addr =~ s/0x0*//; # strip off leading 0x, zeroes + if (length($addr) > 16) { + print STDERR "Invalid address in profile: $full_addr\n"; + next; + } + my $low_addr = substr($addr, -8); # get last 8 hex chars + my $high_addr = substr($addr, -16, 8); # get up to 8 more hex chars + print pack('L*', hex('0x' . $low_addr), hex('0x' . $high_addr)); + } + } + } +} + +# Print symbols and profile data +sub PrintSymbolizedProfile { + my $symbols = shift; + my $profile = shift; + my $prog = shift; + + $SYMBOL_PAGE =~ m,[^/]+$,; # matches everything after the last slash + my $symbol_marker = $&; + + print '--- ', $symbol_marker, "\n"; + if (defined($prog)) { + print 'binary=', $prog, "\n"; + } + while (my ($pc, $name) = each(%{$symbols})) { + my $sep = ' '; + print '0x', $pc; + # We have a list of function names, which include the inlined + # calls. They are separated (and terminated) by --, which is + # illegal in function names. 
+ for (my $j = 2; $j <= $#{$name}; $j += 3) { + print $sep, $name->[$j]; + $sep = '--'; + } + print "\n"; + } + print '---', "\n"; + + my $profile_marker; + if ($main::profile_type eq 'heap') { + $HEAP_PAGE =~ m,[^/]+$,; # matches everything after the last slash + $profile_marker = $&; + } elsif ($main::profile_type eq 'growth') { + $GROWTH_PAGE =~ m,[^/]+$,; # matches everything after the last slash + $profile_marker = $&; + } elsif ($main::profile_type eq 'contention') { + $CONTENTION_PAGE =~ m,[^/]+$,; # matches everything after the last slash + $profile_marker = $&; + } else { # elsif ($main::profile_type eq 'cpu') + $PROFILE_PAGE =~ m,[^/]+$,; # matches everything after the last slash + $profile_marker = $&; + } + + print '--- ', $profile_marker, "\n"; + if (defined($main::collected_profile)) { + # if used with remote fetch, simply dump the collected profile to output. + open(SRC, "<$main::collected_profile"); + while () { + print $_; + } + close(SRC); + } else { + # --raw/http: For everything to work correctly for non-remote profiles, we + # would need to extend PrintProfileData() to handle all possible profile + # types, re-enable the code that is currently disabled in ReadCPUProfile() + # and FixCallerAddresses(), and remove the remote profile dumping code in + # the block above. + die "--raw/http: jeprof can only dump remote profiles for --raw\n"; + # dump a cpu-format profile to standard out + PrintProfileData($profile); + } +} + +# Print text output +sub PrintText { + my $symbols = shift; + my $flat = shift; + my $cumulative = shift; + my $line_limit = shift; + + my $total = TotalProfile($flat); + + # Which profile to sort by? + my $s = $main::opt_cum ? 
$cumulative : $flat; + + my $running_sum = 0; + my $lines = 0; + foreach my $k (sort { GetEntry($s, $b) <=> GetEntry($s, $a) || $a cmp $b } + keys(%{$cumulative})) { + my $f = GetEntry($flat, $k); + my $c = GetEntry($cumulative, $k); + $running_sum += $f; + + my $sym = $k; + if (exists($symbols->{$k})) { + $sym = $symbols->{$k}->[0] . " " . $symbols->{$k}->[1]; + if ($main::opt_addresses) { + $sym = $k . " " . $sym; + } + } + + if ($f != 0 || $c != 0) { + printf("%8s %6s %6s %8s %6s %s\n", + Unparse($f), + Percent($f, $total), + Percent($running_sum, $total), + Unparse($c), + Percent($c, $total), + $sym); + } + $lines++; + last if ($line_limit >= 0 && $lines >= $line_limit); + } +} + +# Callgrind format has a compression for repeated function and file +# names. You show the name the first time, and just use its number +# subsequently. This can cut down the file to about a third or a +# quarter of its uncompressed size. $key and $val are the key/value +# pair that would normally be printed by callgrind; $map is a map from +# value to number. +sub CompressedCGName { + my($key, $val, $map) = @_; + my $idx = $map->{$val}; + # For very short keys, providing an index hurts rather than helps. + if (length($val) <= 3) { + return "$key=$val\n"; + } elsif (defined($idx)) { + return "$key=($idx)\n"; + } else { + # scalar(keys $map) gives the number of items in the map. + $idx = scalar(keys(%{$map})) + 1; + $map->{$val} = $idx; + return "$key=($idx) $val\n"; + } +} + +# Print the call graph in a way that's suiteable for callgrind. 
+sub PrintCallgrind { + my $calls = shift; + my $filename; + my %filename_to_index_map; + my %fnname_to_index_map; + + if ($main::opt_interactive) { + $filename = shift; + print STDERR "Writing callgrind file to '$filename'.\n" + } else { + $filename = "&STDOUT"; + } + open(CG, ">$filename"); + printf CG ("events: Hits\n\n"); + foreach my $call ( map { $_->[0] } + sort { $a->[1] cmp $b ->[1] || + $a->[2] <=> $b->[2] } + map { /([^:]+):(\d+):([^ ]+)( -> ([^:]+):(\d+):(.+))?/; + [$_, $1, $2] } + keys %$calls ) { + my $count = int($calls->{$call}); + $call =~ /([^:]+):(\d+):([^ ]+)( -> ([^:]+):(\d+):(.+))?/; + my ( $caller_file, $caller_line, $caller_function, + $callee_file, $callee_line, $callee_function ) = + ( $1, $2, $3, $5, $6, $7 ); + + # TODO(csilvers): for better compression, collect all the + # caller/callee_files and functions first, before printing + # anything, and only compress those referenced more than once. + printf CG CompressedCGName("fl", $caller_file, \%filename_to_index_map); + printf CG CompressedCGName("fn", $caller_function, \%fnname_to_index_map); + if (defined $6) { + printf CG CompressedCGName("cfl", $callee_file, \%filename_to_index_map); + printf CG CompressedCGName("cfn", $callee_function, \%fnname_to_index_map); + printf CG ("calls=$count $callee_line\n"); + } + printf CG ("$caller_line $count\n\n"); + } +} + +# Print disassembly for all all routines that match $main::opt_disasm +sub PrintDisassembly { + my $libs = shift; + my $flat = shift; + my $cumulative = shift; + my $disasm_opts = shift; + + my $total = TotalProfile($flat); + + foreach my $lib (@{$libs}) { + my $symbol_table = GetProcedureBoundaries($lib->[0], $disasm_opts); + my $offset = AddressSub($lib->[1], $lib->[3]); + foreach my $routine (sort ByName keys(%{$symbol_table})) { + my $start_addr = $symbol_table->{$routine}->[0]; + my $end_addr = $symbol_table->{$routine}->[1]; + # See if there are any samples in this routine + my $length = hex(AddressSub($end_addr, 
$start_addr)); + my $addr = AddressAdd($start_addr, $offset); + for (my $i = 0; $i < $length; $i++) { + if (defined($cumulative->{$addr})) { + PrintDisassembledFunction($lib->[0], $offset, + $routine, $flat, $cumulative, + $start_addr, $end_addr, $total); + last; + } + $addr = AddressInc($addr); + } + } + } +} + +# Return reference to array of tuples of the form: +# [start_address, filename, linenumber, instruction, limit_address] +# E.g., +# ["0x806c43d", "/foo/bar.cc", 131, "ret", "0x806c440"] +sub Disassemble { + my $prog = shift; + my $offset = shift; + my $start_addr = shift; + my $end_addr = shift; + + my $objdump = $obj_tool_map{"objdump"}; + my $cmd = ShellEscape($objdump, "-C", "-d", "-l", "--no-show-raw-insn", + "--start-address=0x$start_addr", + "--stop-address=0x$end_addr", $prog); + open(OBJDUMP, "$cmd |") || error("$cmd: $!\n"); + my @result = (); + my $filename = ""; + my $linenumber = -1; + my $last = ["", "", "", ""]; + while () { + s/\r//g; # turn windows-looking lines into unix-looking lines + chop; + if (m|\s*([^:\s]+):(\d+)\s*$|) { + # Location line of the form: + # : + $filename = $1; + $linenumber = $2; + } elsif (m/^ +([0-9a-f]+):\s*(.*)/) { + # Disassembly line -- zero-extend address to full length + my $addr = HexExtend($1); + my $k = AddressAdd($addr, $offset); + $last->[4] = $k; # Store ending address for previous instruction + $last = [$k, $filename, $linenumber, $2, $end_addr]; + push(@result, $last); + } + } + close(OBJDUMP); + return @result; +} + +# The input file should contain lines of the form /proc/maps-like +# output (same format as expected from the profiles) or that looks +# like hex addresses (like "0xDEADBEEF"). We will parse all +# /proc/maps output, and for all the hex addresses, we will output +# "short" symbol names, one per line, in the same order as the input. +sub PrintSymbols { + my $maps_and_symbols_file = shift; + + # ParseLibraries expects pcs to be in a set. Fine by us... 
+ my @pclist = (); # pcs in sorted order + my $pcs = {}; + my $map = ""; + foreach my $line (<$maps_and_symbols_file>) { + $line =~ s/\r//g; # turn windows-looking lines into unix-looking lines + if ($line =~ /\b(0x[0-9a-f]+)\b/i) { + push(@pclist, HexExtend($1)); + $pcs->{$pclist[-1]} = 1; + } else { + $map .= $line; + } + } + + my $libs = ParseLibraries($main::prog, $map, $pcs); + my $symbols = ExtractSymbols($libs, $pcs); + + foreach my $pc (@pclist) { + # ->[0] is the shortname, ->[2] is the full name + print(($symbols->{$pc}->[0] || "??") . "\n"); + } +} + + +# For sorting functions by name +sub ByName { + return ShortFunctionName($a) cmp ShortFunctionName($b); +} + +# Print source-listing for all all routines that match $list_opts +sub PrintListing { + my $total = shift; + my $libs = shift; + my $flat = shift; + my $cumulative = shift; + my $list_opts = shift; + my $html = shift; + + my $output = \*STDOUT; + my $fname = ""; + + if ($html) { + # Arrange to write the output to a temporary file + $fname = TempName($main::next_tmpfile, "html"); + $main::next_tmpfile++; + if (!open(TEMP, ">$fname")) { + print STDERR "$fname: $!\n"; + return; + } + $output = \*TEMP; + print $output HtmlListingHeader(); + printf $output ("
%s
Total: %s %s
\n", + $main::prog, Unparse($total), Units()); + } + + my $listed = 0; + foreach my $lib (@{$libs}) { + my $symbol_table = GetProcedureBoundaries($lib->[0], $list_opts); + my $offset = AddressSub($lib->[1], $lib->[3]); + foreach my $routine (sort ByName keys(%{$symbol_table})) { + # Print if there are any samples in this routine + my $start_addr = $symbol_table->{$routine}->[0]; + my $end_addr = $symbol_table->{$routine}->[1]; + my $length = hex(AddressSub($end_addr, $start_addr)); + my $addr = AddressAdd($start_addr, $offset); + for (my $i = 0; $i < $length; $i++) { + if (defined($cumulative->{$addr})) { + $listed += PrintSource( + $lib->[0], $offset, + $routine, $flat, $cumulative, + $start_addr, $end_addr, + $html, + $output); + last; + } + $addr = AddressInc($addr); + } + } + } + + if ($html) { + if ($listed > 0) { + print $output HtmlListingFooter(); + close($output); + RunWeb($fname); + } else { + close($output); + unlink($fname); + } + } +} + +sub HtmlListingHeader { + return <<'EOF'; + + + +Pprof listing + + + + +EOF +} + +sub HtmlListingFooter { + return <<'EOF'; + + +EOF +} + +sub HtmlEscape { + my $text = shift; + $text =~ s/&/&/g; + $text =~ s//>/g; + return $text; +} + +# Returns the indentation of the line, if it has any non-whitespace +# characters. Otherwise, returns -1. +sub Indentation { + my $line = shift; + if (m/^(\s*)\S/) { + return length($1); + } else { + return -1; + } +} + +# If the symbol table contains inlining info, Disassemble() may tag an +# instruction with a location inside an inlined function. But for +# source listings, we prefer to use the location in the function we +# are listing. So use MapToSymbols() to fetch full location +# information for each instruction and then pick out the first +# location from a location list (location list contains callers before +# callees in case of inlining). 
+# +# After this routine has run, each entry in $instructions contains: +# [0] start address +# [1] filename for function we are listing +# [2] line number for function we are listing +# [3] disassembly +# [4] limit address +# [5] most specific filename (may be different from [1] due to inlining) +# [6] most specific line number (may be different from [2] due to inlining) +sub GetTopLevelLineNumbers { + my ($lib, $offset, $instructions) = @_; + my $pcs = []; + for (my $i = 0; $i <= $#{$instructions}; $i++) { + push(@{$pcs}, $instructions->[$i]->[0]); + } + my $symbols = {}; + MapToSymbols($lib, $offset, $pcs, $symbols); + for (my $i = 0; $i <= $#{$instructions}; $i++) { + my $e = $instructions->[$i]; + push(@{$e}, $e->[1]); + push(@{$e}, $e->[2]); + my $addr = $e->[0]; + my $sym = $symbols->{$addr}; + if (defined($sym)) { + if ($#{$sym} >= 2 && $sym->[1] =~ m/^(.*):(\d+)$/) { + $e->[1] = $1; # File name + $e->[2] = $2; # Line number + } + } + } +} + +# Print source-listing for one routine +sub PrintSource { + my $prog = shift; + my $offset = shift; + my $routine = shift; + my $flat = shift; + my $cumulative = shift; + my $start_addr = shift; + my $end_addr = shift; + my $html = shift; + my $output = shift; + + # Disassemble all instructions (just to get line numbers) + my @instructions = Disassemble($prog, $offset, $start_addr, $end_addr); + GetTopLevelLineNumbers($prog, $offset, \@instructions); + + # Hack 1: assume that the first source file encountered in the + # disassembly contains the routine + my $filename = undef; + for (my $i = 0; $i <= $#instructions; $i++) { + if ($instructions[$i]->[2] >= 0) { + $filename = $instructions[$i]->[1]; + last; + } + } + if (!defined($filename)) { + print STDERR "no filename found in $routine\n"; + return 0; + } + + # Hack 2: assume that the largest line number from $filename is the + # end of the procedure. 
This is typically safe since if P1 contains + # an inlined call to P2, then P2 usually occurs earlier in the + # source file. If this does not work, we might have to compute a + # density profile or just print all regions we find. + my $lastline = 0; + for (my $i = 0; $i <= $#instructions; $i++) { + my $f = $instructions[$i]->[1]; + my $l = $instructions[$i]->[2]; + if (($f eq $filename) && ($l > $lastline)) { + $lastline = $l; + } + } + + # Hack 3: assume the first source location from "filename" is the start of + # the source code. + my $firstline = 1; + for (my $i = 0; $i <= $#instructions; $i++) { + if ($instructions[$i]->[1] eq $filename) { + $firstline = $instructions[$i]->[2]; + last; + } + } + + # Hack 4: Extend last line forward until its indentation is less than + # the indentation we saw on $firstline + my $oldlastline = $lastline; + { + if (!open(FILE, "<$filename")) { + print STDERR "$filename: $!\n"; + return 0; + } + my $l = 0; + my $first_indentation = -1; + while () { + s/\r//g; # turn windows-looking lines into unix-looking lines + $l++; + my $indent = Indentation($_); + if ($l >= $firstline) { + if ($first_indentation < 0 && $indent >= 0) { + $first_indentation = $indent; + last if ($first_indentation == 0); + } + } + if ($l >= $lastline && $indent >= 0) { + if ($indent >= $first_indentation) { + $lastline = $l+1; + } else { + last; + } + } + } + close(FILE); + } + + # Assign all samples to the range $firstline,$lastline, + # Hack 4: If an instruction does not occur in the range, its samples + # are moved to the next instruction that occurs in the range. 
+ my $samples1 = {}; # Map from line number to flat count + my $samples2 = {}; # Map from line number to cumulative count + my $running1 = 0; # Unassigned flat counts + my $running2 = 0; # Unassigned cumulative counts + my $total1 = 0; # Total flat counts + my $total2 = 0; # Total cumulative counts + my %disasm = (); # Map from line number to disassembly + my $running_disasm = ""; # Unassigned disassembly + my $skip_marker = "---\n"; + if ($html) { + $skip_marker = ""; + for (my $l = $firstline; $l <= $lastline; $l++) { + $disasm{$l} = ""; + } + } + my $last_dis_filename = ''; + my $last_dis_linenum = -1; + my $last_touched_line = -1; # To detect gaps in disassembly for a line + foreach my $e (@instructions) { + # Add up counts for all address that fall inside this instruction + my $c1 = 0; + my $c2 = 0; + for (my $a = $e->[0]; $a lt $e->[4]; $a = AddressInc($a)) { + $c1 += GetEntry($flat, $a); + $c2 += GetEntry($cumulative, $a); + } + + if ($html) { + my $dis = sprintf(" %6s %6s \t\t%8s: %s ", + HtmlPrintNumber($c1), + HtmlPrintNumber($c2), + UnparseAddress($offset, $e->[0]), + CleanDisassembly($e->[3])); + + # Append the most specific source line associated with this instruction + if (length($dis) < 80) { $dis .= (' ' x (80 - length($dis))) }; + $dis = HtmlEscape($dis); + my $f = $e->[5]; + my $l = $e->[6]; + if ($f ne $last_dis_filename) { + $dis .= sprintf("%s:%d", + HtmlEscape(CleanFileName($f)), $l); + } elsif ($l ne $last_dis_linenum) { + # De-emphasize the unchanged file name portion + $dis .= sprintf("%s" . 
+ ":%d", + HtmlEscape(CleanFileName($f)), $l); + } else { + # De-emphasize the entire location + $dis .= sprintf("%s:%d", + HtmlEscape(CleanFileName($f)), $l); + } + $last_dis_filename = $f; + $last_dis_linenum = $l; + $running_disasm .= $dis; + $running_disasm .= "\n"; + } + + $running1 += $c1; + $running2 += $c2; + $total1 += $c1; + $total2 += $c2; + my $file = $e->[1]; + my $line = $e->[2]; + if (($file eq $filename) && + ($line >= $firstline) && + ($line <= $lastline)) { + # Assign all accumulated samples to this line + AddEntry($samples1, $line, $running1); + AddEntry($samples2, $line, $running2); + $running1 = 0; + $running2 = 0; + if ($html) { + if ($line != $last_touched_line && $disasm{$line} ne '') { + $disasm{$line} .= "\n"; + } + $disasm{$line} .= $running_disasm; + $running_disasm = ''; + $last_touched_line = $line; + } + } + } + + # Assign any leftover samples to $lastline + AddEntry($samples1, $lastline, $running1); + AddEntry($samples2, $lastline, $running2); + if ($html) { + if ($lastline != $last_touched_line && $disasm{$lastline} ne '') { + $disasm{$lastline} .= "\n"; + } + $disasm{$lastline} .= $running_disasm; + } + + if ($html) { + printf $output ( + "

%s

%s\n
\n" .
+      "Total:%6s %6s (flat / cumulative %s)\n",
+      HtmlEscape(ShortFunctionName($routine)),
+      HtmlEscape(CleanFileName($filename)),
+      Unparse($total1),
+      Unparse($total2),
+      Units());
+  } else {
+    printf $output (
+      "ROUTINE ====================== %s in %s\n" .
+      "%6s %6s Total %s (flat / cumulative)\n",
+      ShortFunctionName($routine),
+      CleanFileName($filename),
+      Unparse($total1),
+      Unparse($total2),
+      Units());
+  }
+  if (!open(FILE, "<$filename")) {
+    print STDERR "$filename: $!\n";
+    return 0;
+  }
+  my $l = 0;
+  while () {
+    s/\r//g;         # turn windows-looking lines into unix-looking lines
+    $l++;
+    if ($l >= $firstline - 5 &&
+        (($l <= $oldlastline + 5) || ($l <= $lastline))) {
+      chop;
+      my $text = $_;
+      if ($l == $firstline) { print $output $skip_marker; }
+      my $n1 = GetEntry($samples1, $l);
+      my $n2 = GetEntry($samples2, $l);
+      if ($html) {
+        # Emit a span that has one of the following classes:
+        #    livesrc -- has samples
+        #    deadsrc -- has disassembly, but with no samples
+        #    nop     -- has no matching disassembly
+        # Also emit an optional span containing disassembly.
+        my $dis = $disasm{$l};
+        my $asm = "";
+        if (defined($dis) && $dis ne '') {
+          $asm = "" . $dis . "";
+        }
+        my $source_class = (($n1 + $n2 > 0)
+                            ? "livesrc"
+                            : (($asm ne "") ? "deadsrc" : "nop"));
+        printf $output (
+          "%5d " .
+          "%6s %6s %s%s\n",
+          $l, $source_class,
+          HtmlPrintNumber($n1),
+          HtmlPrintNumber($n2),
+          HtmlEscape($text),
+          $asm);
+      } else {
+        printf $output(
+          "%6s %6s %4d: %s\n",
+          UnparseAlt($n1),
+          UnparseAlt($n2),
+          $l,
+          $text);
+      }
+      if ($l == $lastline)  { print $output $skip_marker; }
+    };
+  }
+  close(FILE);
+  if ($html) {
+    print $output "
\n"; + } + return 1; +} + +# Return the source line for the specified file/linenumber. +# Returns undef if not found. +sub SourceLine { + my $file = shift; + my $line = shift; + + # Look in cache + if (!defined($main::source_cache{$file})) { + if (100 < scalar keys(%main::source_cache)) { + # Clear the cache when it gets too big + $main::source_cache = (); + } + + # Read all lines from the file + if (!open(FILE, "<$file")) { + print STDERR "$file: $!\n"; + $main::source_cache{$file} = []; # Cache the negative result + return undef; + } + my $lines = []; + push(@{$lines}, ""); # So we can use 1-based line numbers as indices + while () { + push(@{$lines}, $_); + } + close(FILE); + + # Save the lines in the cache + $main::source_cache{$file} = $lines; + } + + my $lines = $main::source_cache{$file}; + if (($line < 0) || ($line > $#{$lines})) { + return undef; + } else { + return $lines->[$line]; + } +} + +# Print disassembly for one routine with interspersed source if available +sub PrintDisassembledFunction { + my $prog = shift; + my $offset = shift; + my $routine = shift; + my $flat = shift; + my $cumulative = shift; + my $start_addr = shift; + my $end_addr = shift; + my $total = shift; + + # Disassemble all instructions + my @instructions = Disassemble($prog, $offset, $start_addr, $end_addr); + + # Make array of counts per instruction + my @flat_count = (); + my @cum_count = (); + my $flat_total = 0; + my $cum_total = 0; + foreach my $e (@instructions) { + # Add up counts for all address that fall inside this instruction + my $c1 = 0; + my $c2 = 0; + for (my $a = $e->[0]; $a lt $e->[4]; $a = AddressInc($a)) { + $c1 += GetEntry($flat, $a); + $c2 += GetEntry($cumulative, $a); + } + push(@flat_count, $c1); + push(@cum_count, $c2); + $flat_total += $c1; + $cum_total += $c2; + } + + # Print header with total counts + printf("ROUTINE ====================== %s\n" . 
+ "%6s %6s %s (flat, cumulative) %.1f%% of total\n", + ShortFunctionName($routine), + Unparse($flat_total), + Unparse($cum_total), + Units(), + ($cum_total * 100.0) / $total); + + # Process instructions in order + my $current_file = ""; + for (my $i = 0; $i <= $#instructions; ) { + my $e = $instructions[$i]; + + # Print the new file name whenever we switch files + if ($e->[1] ne $current_file) { + $current_file = $e->[1]; + my $fname = $current_file; + $fname =~ s|^\./||; # Trim leading "./" + + # Shorten long file names + if (length($fname) >= 58) { + $fname = "..." . substr($fname, -55); + } + printf("-------------------- %s\n", $fname); + } + + # TODO: Compute range of lines to print together to deal with + # small reorderings. + my $first_line = $e->[2]; + my $last_line = $first_line; + my %flat_sum = (); + my %cum_sum = (); + for (my $l = $first_line; $l <= $last_line; $l++) { + $flat_sum{$l} = 0; + $cum_sum{$l} = 0; + } + + # Find run of instructions for this range of source lines + my $first_inst = $i; + while (($i <= $#instructions) && + ($instructions[$i]->[2] >= $first_line) && + ($instructions[$i]->[2] <= $last_line)) { + $e = $instructions[$i]; + $flat_sum{$e->[2]} += $flat_count[$i]; + $cum_sum{$e->[2]} += $cum_count[$i]; + $i++; + } + my $last_inst = $i - 1; + + # Print source lines + for (my $l = $first_line; $l <= $last_line; $l++) { + my $line = SourceLine($current_file, $l); + if (!defined($line)) { + $line = "?\n"; + next; + } else { + $line =~ s/^\s+//; + } + printf("%6s %6s %5d: %s", + UnparseAlt($flat_sum{$l}), + UnparseAlt($cum_sum{$l}), + $l, + $line); + } + + # Print disassembly + for (my $x = $first_inst; $x <= $last_inst; $x++) { + my $e = $instructions[$x]; + printf("%6s %6s %8s: %6s\n", + UnparseAlt($flat_count[$x]), + UnparseAlt($cum_count[$x]), + UnparseAddress($offset, $e->[0]), + CleanDisassembly($e->[3])); + } + } +} + +# Print DOT graph +sub PrintDot { + my $prog = shift; + my $symbols = shift; + my $raw = shift; + my $flat = 
shift; + my $cumulative = shift; + my $overall_total = shift; + + # Get total + my $local_total = TotalProfile($flat); + my $nodelimit = int($main::opt_nodefraction * $local_total); + my $edgelimit = int($main::opt_edgefraction * $local_total); + my $nodecount = $main::opt_nodecount; + + # Find nodes to include + my @list = (sort { abs(GetEntry($cumulative, $b)) <=> + abs(GetEntry($cumulative, $a)) + || $a cmp $b } + keys(%{$cumulative})); + my $last = $nodecount - 1; + if ($last > $#list) { + $last = $#list; + } + while (($last >= 0) && + (abs(GetEntry($cumulative, $list[$last])) <= $nodelimit)) { + $last--; + } + if ($last < 0) { + print STDERR "No nodes to print\n"; + return 0; + } + + if ($nodelimit > 0 || $edgelimit > 0) { + printf STDERR ("Dropping nodes with <= %s %s; edges with <= %s abs(%s)\n", + Unparse($nodelimit), Units(), + Unparse($edgelimit), Units()); + } + + # Open DOT output file + my $output; + my $escaped_dot = ShellEscape(@DOT); + my $escaped_ps2pdf = ShellEscape(@PS2PDF); + if ($main::opt_gv) { + my $escaped_outfile = ShellEscape(TempName($main::next_tmpfile, "ps")); + $output = "| $escaped_dot -Tps2 >$escaped_outfile"; + } elsif ($main::opt_evince) { + my $escaped_outfile = ShellEscape(TempName($main::next_tmpfile, "pdf")); + $output = "| $escaped_dot -Tps2 | $escaped_ps2pdf - $escaped_outfile"; + } elsif ($main::opt_ps) { + $output = "| $escaped_dot -Tps2"; + } elsif ($main::opt_pdf) { + $output = "| $escaped_dot -Tps2 | $escaped_ps2pdf - -"; + } elsif ($main::opt_web || $main::opt_svg) { + # We need to post-process the SVG, so write to a temporary file always. 
+ my $escaped_outfile = ShellEscape(TempName($main::next_tmpfile, "svg")); + $output = "| $escaped_dot -Tsvg >$escaped_outfile"; + } elsif ($main::opt_gif) { + $output = "| $escaped_dot -Tgif"; + } else { + $output = ">&STDOUT"; + } + open(DOT, $output) || error("$output: $!\n"); + + # Title + printf DOT ("digraph \"%s; %s %s\" {\n", + $prog, + Unparse($overall_total), + Units()); + if ($main::opt_pdf) { + # The output is more printable if we set the page size for dot. + printf DOT ("size=\"8,11\"\n"); + } + printf DOT ("node [width=0.375,height=0.25];\n"); + + # Print legend + printf DOT ("Legend [shape=box,fontsize=24,shape=plaintext," . + "label=\"%s\\l%s\\l%s\\l%s\\l%s\\l\"];\n", + $prog, + sprintf("Total %s: %s", Units(), Unparse($overall_total)), + sprintf("Focusing on: %s", Unparse($local_total)), + sprintf("Dropped nodes with <= %s abs(%s)", + Unparse($nodelimit), Units()), + sprintf("Dropped edges with <= %s %s", + Unparse($edgelimit), Units()) + ); + + # Print nodes + my %node = (); + my $nextnode = 1; + foreach my $a (@list[0..$last]) { + # Pick font size + my $f = GetEntry($flat, $a); + my $c = GetEntry($cumulative, $a); + + my $fs = 8; + if ($local_total > 0) { + $fs = 8 + (50.0 * sqrt(abs($f * 1.0 / $local_total))); + } + + $node{$a} = $nextnode++; + my $sym = $a; + $sym =~ s/\s+/\\n/g; + $sym =~ s/::/\\n/g; + + # Extra cumulative info to print for non-leaves + my $extra = ""; + if ($f != $c) { + $extra = sprintf("\\rof %s (%s)", + Unparse($c), + Percent($c, $local_total)); + } + my $style = ""; + if ($main::opt_heapcheck) { + if ($f > 0) { + # make leak-causing nodes more visible (add a background) + $style = ",style=filled,fillcolor=gray" + } elsif ($f < 0) { + # make anti-leak-causing nodes (which almost never occur) + # stand out as well (triple border) + $style = ",peripheries=3" + } + } + + printf DOT ("N%d [label=\"%s\\n%s (%s)%s\\r" . 
+ "\",shape=box,fontsize=%.1f%s];\n", + $node{$a}, + $sym, + Unparse($f), + Percent($f, $local_total), + $extra, + $fs, + $style, + ); + } + + # Get edges and counts per edge + my %edge = (); + my $n; + my $fullname_to_shortname_map = {}; + FillFullnameToShortnameMap($symbols, $fullname_to_shortname_map); + foreach my $k (keys(%{$raw})) { + # TODO: omit low %age edges + $n = $raw->{$k}; + my @translated = TranslateStack($symbols, $fullname_to_shortname_map, $k); + for (my $i = 1; $i <= $#translated; $i++) { + my $src = $translated[$i]; + my $dst = $translated[$i-1]; + #next if ($src eq $dst); # Avoid self-edges? + if (exists($node{$src}) && exists($node{$dst})) { + my $edge_label = "$src\001$dst"; + if (!exists($edge{$edge_label})) { + $edge{$edge_label} = 0; + } + $edge{$edge_label} += $n; + } + } + } + + # Print edges (process in order of decreasing counts) + my %indegree = (); # Number of incoming edges added per node so far + my %outdegree = (); # Number of outgoing edges added per node so far + foreach my $e (sort { $edge{$b} <=> $edge{$a} } keys(%edge)) { + my @x = split(/\001/, $e); + $n = $edge{$e}; + + # Initialize degree of kept incoming and outgoing edges if necessary + my $src = $x[0]; + my $dst = $x[1]; + if (!exists($outdegree{$src})) { $outdegree{$src} = 0; } + if (!exists($indegree{$dst})) { $indegree{$dst} = 0; } + + my $keep; + if ($indegree{$dst} == 0) { + # Keep edge if needed for reachability + $keep = 1; + } elsif (abs($n) <= $edgelimit) { + # Drop if we are below --edgefraction + $keep = 0; + } elsif ($outdegree{$src} >= $main::opt_maxdegree || + $indegree{$dst} >= $main::opt_maxdegree) { + # Keep limited number of in/out edges per node + $keep = 0; + } else { + $keep = 1; + } + + if ($keep) { + $outdegree{$src}++; + $indegree{$dst}++; + + # Compute line width based on edge count + my $fraction = abs($local_total ? 
(3 * ($n / $local_total)) : 0); + if ($fraction > 1) { $fraction = 1; } + my $w = $fraction * 2; + if ($w < 1 && ($main::opt_web || $main::opt_svg)) { + # SVG output treats line widths < 1 poorly. + $w = 1; + } + + # Dot sometimes segfaults if given edge weights that are too large, so + # we cap the weights at a large value + my $edgeweight = abs($n) ** 0.7; + if ($edgeweight > 100000) { $edgeweight = 100000; } + $edgeweight = int($edgeweight); + + my $style = sprintf("setlinewidth(%f)", $w); + if ($x[1] =~ m/\(inline\)/) { + $style .= ",dashed"; + } + + # Use a slightly squashed function of the edge count as the weight + printf DOT ("N%s -> N%s [label=%s, weight=%d, style=\"%s\"];\n", + $node{$x[0]}, + $node{$x[1]}, + Unparse($n), + $edgeweight, + $style); + } + } + + print DOT ("}\n"); + close(DOT); + + if ($main::opt_web || $main::opt_svg) { + # Rewrite SVG to be more usable inside web browser. + RewriteSvg(TempName($main::next_tmpfile, "svg")); + } + + return 1; +} + +sub RewriteSvg { + my $svgfile = shift; + + open(SVG, $svgfile) || die "open temp svg: $!"; + my @svg = ; + close(SVG); + unlink $svgfile; + my $svg = join('', @svg); + + # Dot's SVG output is + # + # + # + # ... + # + # + # + # Change it to + # + # + # $svg_javascript + # + # + # ... + # + # + # + + # Fix width, height; drop viewBox. + $svg =~ s/(?s) above first + my $svg_javascript = SvgJavascript(); + my $viewport = "\n"; + $svg =~ s/ above . + $svg =~ s/(.*)(<\/svg>)/$1<\/g>$2/; + $svg =~ s/$svgfile") || die "open $svgfile: $!"; + print SVG $svg; + close(SVG); + } +} + +sub SvgJavascript { + return <<'EOF'; + +EOF +} + +# Provides a map from fullname to shortname for cases where the +# shortname is ambiguous. The symlist has both the fullname and +# shortname for all symbols, which is usually fine, but sometimes -- +# such as overloaded functions -- two different fullnames can map to +# the same shortname. In that case, we use the address of the +# function to disambiguate the two. 
This function fills in a map that +# maps fullnames to modified shortnames in such cases. If a fullname +# is not present in the map, the 'normal' shortname provided by the +# symlist is the appropriate one to use. +sub FillFullnameToShortnameMap { + my $symbols = shift; + my $fullname_to_shortname_map = shift; + my $shortnames_seen_once = {}; + my $shortnames_seen_more_than_once = {}; + + foreach my $symlist (values(%{$symbols})) { + # TODO(csilvers): deal with inlined symbols too. + my $shortname = $symlist->[0]; + my $fullname = $symlist->[2]; + if ($fullname !~ /<[0-9a-fA-F]+>$/) { # fullname doesn't end in an address + next; # the only collisions we care about are when addresses differ + } + if (defined($shortnames_seen_once->{$shortname}) && + $shortnames_seen_once->{$shortname} ne $fullname) { + $shortnames_seen_more_than_once->{$shortname} = 1; + } else { + $shortnames_seen_once->{$shortname} = $fullname; + } + } + + foreach my $symlist (values(%{$symbols})) { + my $shortname = $symlist->[0]; + my $fullname = $symlist->[2]; + # TODO(csilvers): take in a list of addresses we care about, and only + # store in the map if $symlist->[1] is in that list. Saves space. + next if defined($fullname_to_shortname_map->{$fullname}); + if (defined($shortnames_seen_more_than_once->{$shortname})) { + if ($fullname =~ /<0*([^>]*)>$/) { # fullname has address at end of it + $fullname_to_shortname_map->{$fullname} = "$shortname\@$1"; + } + } + } +} + +# Return a small number that identifies the argument. +# Multiple calls with the same argument will return the same number. +# Calls with different arguments will return different numbers. 
+sub ShortIdFor {
+  my $key = shift;
+  # Ids are memoized in the global %main::uniqueid, which is what makes
+  # repeated calls with the same key return the same id.
+  my $id = $main::uniqueid{$key};
+  if (!defined($id)) {
+    # First sighting of this key: the new id is the number of keys stored
+    # so far plus one, so ids count up from 1.
+    $id = keys(%main::uniqueid) + 1;
+    $main::uniqueid{$key} = $id;
+  }
+  return $id;
+}
+
+# Translate a stack of addresses into a stack of symbols
+#
+# Arguments:
+#   $symbols - hash ref keyed by address. Each value is an array ref that
+#              is read here in groups of three elements per frame:
+#              [short name, file:line, full name]. More than one group
+#              means the address carries inlined frames.
+#   $fullname_to_shortname_map - hash ref keyed by full name; when a full
+#              name is present, its value replaces the short name.
+#   $k - the stack, as newline-separated addresses.
+#
+# Returns the list of translated entries. The shape of each entry depends
+# on the global options:
+#   opt_disasm / opt_list : the raw address (no symbol lookup at all)
+#   opt_addresses         : "address function file:line"
+#   opt_lines             : "function file:line" (address if both unknown)
+#   opt_functions         : function name (address if it is '??')
+#   opt_files             : file:line with the trailing ":N" removed
+#                           (address if the location is unknown or empty)
+#   none of the above     : the address, once per input address
+sub TranslateStack {
+  my $symbols = shift;
+  my $fullname_to_shortname_map = shift;
+  my $k = shift;
+
+  my @addrs = split(/\n/, $k);
+  my @result = ();
+  for (my $i = 0; $i <= $#addrs; $i++) {
+    my $a = $addrs[$i];
+
+    # Skip large addresses since they sometimes show up as fake entries on RH9
+    # (string comparison: only addresses longer than 8 characters that
+    # sort after "7fffffffffffffff" are dropped)
+    if (length($a) > 8 && $a gt "7fffffffffffffff") {
+      next;
+    }
+
+    if ($main::opt_disasm || $main::opt_list) {
+      # We want just the address for the key
+      push(@result, $a);
+      next;
+    }
+
+    my $symlist = $symbols->{$a};
+    if (!defined($symlist)) {
+      # No symbol information: fall back to a single group whose short and
+      # full names are the address itself and whose location is empty.
+      $symlist = [$a, "", $a];
+    }
+
+    # We can have a sequence of symbols for a particular entry
+    # (more than one symbol in the case of inlining).  Callers
+    # come before callees in symlist, so walk backwards since
+    # the translated stack should contain callees before callers.
+    for (my $j = $#{$symlist}; $j >= 2; $j -= 3) {
+      my $func = $symlist->[$j-2];
+      my $fileline = $symlist->[$j-1];
+      my $fullfunc = $symlist->[$j];
+      if (defined($fullname_to_shortname_map->{$fullfunc})) {
+        $func = $fullname_to_shortname_map->{$fullfunc};
+      }
+      # Every group except the first one in symlist ($j == 2) is tagged
+      # as an inlined frame.
+      if ($j > 2) {
+        $func = "$func (inline)";
+      }
+
+      # Do not merge nodes corresponding to Callback::Run since that
+      # causes confusing cycles in dot display.  Instead, we synthesize
+      # a unique name for this frame per caller.
+      if ($func =~ m/Callback.*::Run$/) {
+        # NOTE(review): the id is derived from the previous entry in
+        # @addrs (or 0 for the first entry) -- confirm that entry is the
+        # frame the comment above calls the "caller".
+        my $caller = ($i > 0) ? $addrs[$i-1] : 0;
+        $func = "Run#" . ShortIdFor($caller);
+      }
+
+      if ($main::opt_addresses) {
+        push(@result, "$a $func $fileline");
+      } elsif ($main::opt_lines) {
+        if ($func eq '??' && $fileline eq '??:0') {
+          push(@result, "$a");
+        } else {
+          push(@result, "$func $fileline");
+        }
+      } elsif ($main::opt_functions) {
+        if ($func eq '??') {
+          push(@result, "$a");
+        } else {
+          push(@result, $func);
+        }
+      } elsif ($main::opt_files) {
+        if ($fileline eq '??:0' || $fileline eq '') {
+          push(@result, "$a");
+        } else {
+          my $f = $fileline;
+          $f =~ s/:\d+$//;
+          push(@result, $f);
+        }
+      } else {
+        push(@result, $a);
+        last;  # Do not print inlined info
+      }
+    }
+  }
+
+  # print join(",", @addrs), " => ", join(",", @result), "\n";
+  return @result;
+}
+
+# Generate percent string for a number and a total
+#
+# Returns $num as a percentage of $tot with one decimal place and a
+# trailing "%".  When $tot is zero no division is attempted; the result is
+# "nan" for a zero $num, "+inf" for a positive one and "-inf" otherwise.
+sub Percent {
+  my $num = shift;
+  my $tot = shift;
+  if ($tot != 0) {
+    return sprintf("%.1f%%", $num * 100.0 / $tot);
+  } else {
+    return ($num == 0) ? "nan" : (($num > 0) ? "+inf" : "-inf");
+  }
+}
+
+# Generate pretty-printed form of number
+#
+# The format depends on the global profile type and options:
+#   heap / growth profiles:
+#     object counts requested (opt_inuse_objects or opt_alloc_objects),
+#     or opt_show_bytes set  -> integer
+#     otherwise              -> $num / 1048576 with one decimal place
+#   contention profiles without opt_contentions
+#                            -> $num / 1e9 with three decimal places
+#   everything else          -> integer
+sub Unparse {
+  my $num = shift;
+  if ($main::profile_type eq 'heap' || $main::profile_type eq 'growth') {
+    if ($main::opt_inuse_objects || $main::opt_alloc_objects) {
+      return sprintf("%d", $num);
+    } else {
+      if ($main::opt_show_bytes) {
+        return sprintf("%d", $num);
+      } else {
+        return sprintf("%.1f", $num / 1048576.0);
+      }
+    }
+  } elsif ($main::profile_type eq 'contention' && !$main::opt_contentions) {
+    return sprintf("%.3f", $num / 1e9); # Convert nanoseconds to seconds
+  } else {
+    return sprintf("%d", $num);
+  }
+}
+
+# Alternate pretty-printed form: 0 maps to "."
sub UnparseAlt {
  # Same as Unparse, except that a zero count is rendered as ".".
  my ($num) = @_;
  return ($num == 0) ? "." : Unparse($num);
}

# Alternate pretty-printed form: 0 maps to ""
sub HtmlPrintNumber {
  my ($num) = @_;
  return ($num == 0) ? "" : Unparse($num);
}

# Return output units matching the scaling applied by Unparse
sub Units {
  my $type = $main::profile_type;

  if ($type eq 'heap' || $type eq 'growth') {
    return "objects" if $main::opt_inuse_objects || $main::opt_alloc_objects;
    return $main::opt_show_bytes ? "B" : "MB";
  }
  return "seconds" if $type eq 'contention' && !$main::opt_contentions;
  return "samples";
}

##### Profile manipulation code #####

# Generate flattened profile:
#   If count is charged to stack [a,b,c,d], in generated profile,
#   it will be charged to [a]
sub FlatProfile {
  my ($profile) = @_;
  my $flat = {};
  foreach my $stack (keys(%{$profile})) {
    my @frames = split(/\n/, $stack);
    # An empty key has no leaf frame to charge.
    AddEntry($flat, $frames[0], $profile->{$stack}) if @frames;
  }
  return $flat;
}

# Generate cumulative profile:
#   If count is charged to stack [a,b,c,d], in generated profile,
#   it will be charged to [a], [b], [c], [d]
sub CumulativeProfile {
  my ($profile) = @_;
  my $cumulative = {};
  foreach my $stack (keys(%{$profile})) {
    my $count = $profile->{$stack};
    foreach my $frame (split(/\n/, $stack)) {
      AddEntry($cumulative, $frame, $count);
    }
  }
  return $cumulative;
}

# If the second-youngest PC on the stack is always the same, returns
# that pc.  Otherwise, returns undef.
sub IsSecondPcAlwaysTheSame {
  # Scans every stack in the profile and compares the frame at index 1.
  # Gives undef as soon as a stack has fewer than two frames or two
  # stacks disagree; an empty profile also gives undef.
  my ($profile) = @_;

  my $common;
  foreach my $stack (keys(%{$profile})) {
    my @frames = split(/\n/, $stack);
    return undef if $#frames < 1;
    if (!defined($common)) {
      $common = $frames[1];
    } elsif ($common ne $frames[1]) {
      return undef;
    }
  }
  return $common;
}

# Returns the list of display names for one address, innermost inlined
# frame first.  Every frame except the outermost is tagged "[inline]".
# An address without symbol information is returned as itself.
sub ExtractSymbolNameInlineStack {
  my ($symbols, $address) = @_;

  my @stack = ();

  if (!exists $symbols->{$address}) {
    # If we can't get a symbol name, at least fill in the address.
    push(@stack, $address);
    return @stack;
  }

  # The entry is a flat list of triples; walk it from the last triple to
  # the first, reading the file and full-name slots of each.
  my @triples = @{$symbols->{$address}};
  for (my $i = $#triples; $i > 0; $i -= 3) {
    my ($file, $fn) = @triples[$i - 1, $i];

    $file = "??:0" if $file eq "?" || $file eq ":0";
    # If we can't get the symbol name, at least use the file information.
    $fn = $file if $fn eq '??';

    push(@stack, $fn . (($i == 2) ? "" : "[inline]"));
  }

  return @stack;
}

# Returns "file:line:shortname" for an address.
sub ExtractSymbolLocation {
  my ($symbols, $address) = @_;

  # 'addr2line' outputs "??:0" for unknown locations; we do the
  # same to be consistent.
  return "??:0:unknown" unless exists $symbols->{$address};

  my $entry = $symbols->{$address};
  my $file = $entry->[1];
  $file = "??:0" if $file eq "?";
  return $file . ":" . $entry->[0];
}

# Extracts a graph of calls.
sub ExtractCalls {
  # Builds a map of counts keyed by location.  The leaf of every stack is
  # charged under its own location; each caller/callee pair is charged
  # under "caller -> callee".
  my ($symbols, $profile) = @_;

  my $calls = {};
  while (my ($stack_trace, $count) = each %$profile) {
    my ($leaf, @callers) = split(/\n/, $stack_trace);
    my $destination = ExtractSymbolLocation($symbols, $leaf);
    AddEntry($calls, $destination, $count);
    foreach my $caller_pc (@callers) {
      my $source = ExtractSymbolLocation($symbols, $caller_pc);
      AddEntry($calls, "$source -> $destination", $count);
      $destination = $source;
    }
  }

  return $calls;
}

# Applies the --retain / --exclude regexps to every frame of every stack.
# Frames are matched by their short symbol name (or by address when no
# symbol is known).  Stacks that end up empty are dropped.  When neither
# option is set the input profile is returned untouched.
sub FilterFrames {
  my ($symbols, $profile) = @_;

  return $profile if $main::opt_retain eq '' && $main::opt_exclude eq '';

  my $filtered = {};
  foreach my $stack (keys(%{$profile})) {
    my @kept = ();
    foreach my $pc (split(/\n/, $stack)) {
      my $sym = exists($symbols->{$pc}) ? $symbols->{$pc}->[0] : $pc;
      next if $main::opt_retain ne '' && $sym !~ m/$main::opt_retain/;
      next if $main::opt_exclude ne '' && $sym =~ m/$main::opt_exclude/;
      push(@kept, $pc);
    }
    if (scalar(@kept) > 0) {
      AddEntry($filtered, join("\n", @kept), $profile->{$stack});
    }
  }

  return $filtered;
}

# Prints one "frame;frame;... count" line per stack, outermost frame first.
sub PrintCollapsedStacks {
  my ($symbols, $profile) = @_;

  while (my ($stack_trace, $count) = each %$profile) {
    my @names = map { ExtractSymbolNameInlineStack($symbols, $_) }
                split(/\n/, $stack_trace);
    printf("%s %d\n", join(";", reverse(@names)), $count);
  }
}

# Strips profiler- and allocator-internal frames from every stack, then
# applies FilterFrames to the result.
sub RemoveUninterestingFrames {
  my ($symbols, $profile) = @_;

  # Function names to skip, chosen by profile type
  my %skip = ();
  my $skip_regexp = 'NOMATCH';
  my $type = $main::profile_type;

  if ($type eq 'heap' || $type eq 'growth') {
    my @allocator_names = (
      '@JEMALLOC_PREFIX@calloc',
      'cfree',
      '@JEMALLOC_PREFIX@malloc',
      'je_malloc_default',
      'newImpl',
      'void* newImpl',
      'fallbackNewImpl',
      'void* fallbackNewImpl',
      '@JEMALLOC_PREFIX@free',
      '@JEMALLOC_PREFIX@memalign',
      '@JEMALLOC_PREFIX@posix_memalign',
      '@JEMALLOC_PREFIX@aligned_alloc',
      'pvalloc',
      '@JEMALLOC_PREFIX@valloc',
      '@JEMALLOC_PREFIX@realloc',
      '@JEMALLOC_PREFIX@mallocx',
      '@JEMALLOC_PREFIX@rallocx',
      'do_rallocx',
      '@JEMALLOC_PREFIX@xallocx',
      '@JEMALLOC_PREFIX@dallocx',
      '@JEMALLOC_PREFIX@sdallocx',
      '@JEMALLOC_PREFIX@sdallocx_noflags',
      'tc_calloc',
      'tc_cfree',
      'tc_malloc',
      'tc_free',
      'tc_memalign',
      'tc_posix_memalign',
      'tc_pvalloc',
      'tc_valloc',
      'tc_realloc',
      'tc_new',
      'tc_delete',
      'tc_newarray',
      'tc_deletearray',
      'tc_new_nothrow',
      'tc_newarray_nothrow',
      'do_malloc',
      '::do_malloc',   # new name -- got moved to an unnamed ns
      '::do_malloc_or_cpp_alloc',
      'DoSampledAllocation',
      'simple_alloc::allocate',
      '__malloc_alloc_template::allocate',
      '__builtin_delete',
      '__builtin_new',
      '__builtin_vec_delete',
      '__builtin_vec_new',
      'operator new',
      'operator new[]',
      # The entry to our memory-allocation routines on OS X
      'malloc_zone_malloc',
      'malloc_zone_calloc',
      'malloc_zone_valloc',
      'malloc_zone_realloc',
      'malloc_zone_memalign',
      'malloc_zone_free',
      # These mark the beginning/end of our custom sections
      '__start_google_malloc',
      '__stop_google_malloc',
      '__start_malloc_hook',
      '__stop_malloc_hook',
    );
    foreach my $name (@allocator_names) {
      $skip{$name} = 1;
      $skip{"_" . $name} = 1;   # Mach (OS X) adds a _ prefix to everything
    }
    # TODO: Remove TCMalloc once everything has been
    #       moved into the tcmalloc:: namespace and we have flushed
    #       old code out of the system.
    $skip_regexp = "TCMalloc|^tcmalloc::";
  } elsif ($type eq 'contention') {
    my @lock_names = (
      'base::RecordLockProfileData',
      'base::SubmitMutexProfileData',
      'base::SubmitSpinLockProfileData',
      'Mutex::Unlock',
      'Mutex::UnlockSlow',
      'Mutex::ReaderUnlock',
      'MutexLock::~MutexLock',
      'SpinLock::Unlock',
      'SpinLock::SlowUnlock',
      'SpinLockHolder::~SpinLockHolder',
    );
    $skip{$_} = 1 foreach @lock_names;
  } elsif ($type eq 'cpu') {
    # Drop signal handlers used for CPU profile collection
    # TODO(dpeng): this should not be necessary; it's taken
    #    care of by the general 2nd-pc mechanism below.
    my @handler_names = (
      'ProfileData::Add',           # historical
      'ProfileData::prof_handler',  # historical
      'CpuProfiler::prof_handler',
      '__FRAME_END__',
      '__pthread_sighandler',
      '__restore',
    );
    $skip{$_} = 1 foreach @handler_names;
  } else {
    # Nothing skipped for unknown types
  }

  if ($type eq 'cpu') {
    # If all the second-youngest program counters are the same,
    # this STRONGLY suggests that it is an artifact of measurement,
    # i.e., stack frames pushed by the CPU profiler signal handler.
    # Hence, we delete them.
    # (The topmost PC is read from the signal structure, not from
    # the stack, so it does not get involved.)
    while (my $second_pc = IsSecondPcAlwaysTheSame($profile)) {
      # Report the symbol name when one is known, else the raw pc.
      if (exists($symbols->{$second_pc})) {
        $second_pc = $symbols->{$second_pc}->[0];
      }
      print STDERR "Removing $second_pc from all stack traces.\n";

      my $trimmed = {};
      foreach my $stack (keys(%{$profile})) {
        my @frames = split(/\n/, $stack);
        splice(@frames, 1, 1);
        AddEntry($trimmed, join("\n", @frames), $profile->{$stack});
      }
      $profile = $trimmed;
    }
  }

  my $result = {};
  foreach my $stack (keys(%{$profile})) {
    my @path = ();
    foreach my $pc (split(/\n/, $stack)) {
      if (exists($symbols->{$pc})) {
        my $func = $symbols->{$pc}->[0];
        if ($skip{$func} || ($func =~ m/$skip_regexp/)) {
          # Throw away the portion of the backtrace seen so far, under the
          # assumption that previous frames were for functions internal to the
          # allocator.
          @path = ();
          next;
        }
      }
      push(@path, $pc);
    }
    AddEntry($result, join("\n", @path), $profile->{$stack});
  }

  return FilterFrames($symbols, $result);
}

# Reduce profile to granularity given by user
sub ReduceProfile {
  my ($symbols, $profile) = @_;

  my $fullname_to_shortname_map = {};
  FillFullnameToShortnameMap($symbols, $fullname_to_shortname_map);

  my $reduced = {};
  foreach my $stack (keys(%{$profile})) {
    my @translated = TranslateStack($symbols, $fullname_to_shortname_map, $stack);

    # To avoid double-counting due to recursion, keep only the first
    # occurrence of each entry.  Pre-seeding '' makes empty entries vanish.
    my %seen = ('' => 1);
    my @path = grep { !$seen{$_}++ } @translated;

    AddEntry($reduced, join("\n", @path), $profile->{$stack});
  }
  return $reduced;
}

# Does the specified symbol array match the regexp?
sub SymbolMatches {
  # $sym is a flat list of (shortname, file:line, fullname) triples, or
  # undef.  Returns 1 when the short name or the file:line of any triple
  # matches $re, 0 otherwise.
  my ($sym, $re) = @_;
  return 0 unless defined($sym);

  for (my $i = 0; $i < $#{$sym}; $i += 3) {
    return 1 if $sym->[$i] =~ m/$re/ || $sym->[$i+1] =~ m/$re/;
  }
  return 0;
}

# Focus only on paths involving specified regexps
sub FocusProfile {
  my ($symbols, $profile, $focus) = @_;

  my $result = {};
  foreach my $k (keys(%{$profile})) {
    my $count = $profile->{$k};
    foreach my $a (split(/\n/, $k)) {
      # Keep the stack if any frame matches by address/shortname/fileline
      if (($a =~ m/$focus/) || SymbolMatches($symbols->{$a}, $focus)) {
        AddEntry($result, $k, $count);
        last;
      }
    }
  }
  return $result;
}

# Focus only on paths not involving specified regexps
sub IgnoreProfile {
  my ($symbols, $profile, $ignore) = @_;

  my $result = {};
  foreach my $k (keys(%{$profile})) {
    my $matched = 0;
    foreach my $a (split(/\n/, $k)) {
      # Drop the stack if any frame matches by address/shortname/fileline
      if (($a =~ m/$ignore/) || SymbolMatches($symbols->{$a}, $ignore)) {
        $matched = 1;
        last;
      }
    }
    AddEntry($result, $k, $profile->{$k}) unless $matched;
  }
  return $result;
}

# Get total count in profile
sub TotalProfile {
  my ($profile) = @_;
  my $total = 0;
  $total += $profile->{$_} foreach keys(%{$profile});
  return $total;
}

# Returns a new profile holding the per-key sum of A and B
sub AddProfile {
  my ($A, $B) = @_;

  my $R = {};
  foreach my $side ($A, $B) {
    foreach my $k (keys(%{$side})) {
      AddEntry($R, $k, $side->{$k});
    }
  }
  return $R;
}

# Merges symbol maps into a new map; entries of B win on key collisions.
# B may be undef.
sub MergeSymbols {
  my ($A, $B) = @_;

  my $R = {};
  foreach my $k (keys(%{$A})) {
    $R->{$k} = $A->{$k};
  }
  if (defined($B)) {
    foreach my $k (keys(%{$B})) {
      $R->{$k} = $B->{$k};
    }
  }
  return $R;
}


# Returns the union of two pc sets (every key of A or B maps to 1)
sub AddPcs {
  my ($A, $B) = @_;

  my $R = {};
  $R->{$_} = 1 foreach keys(%{$A});
  $R->{$_} = 1 foreach keys(%{$B});
  return $R;
}

# Subtract B from A.  With --drop_negative, negative differences are
# clamped to zero and keys present only in B are ignored.
sub SubtractProfile {
  my ($A, $B) = @_;

  my $R = {};
  foreach my $k (keys(%{$A})) {
    my $v = $A->{$k} - GetEntry($B, $k);
    if ($v < 0 && $main::opt_drop_negative) {
      $v = 0;
    }
    AddEntry($R, $k, $v);
  }
  if (!$main::opt_drop_negative) {
    # Take care of when subtracted profile has more entries
    foreach my $k (keys(%{$B})) {
      if (!exists($A->{$k})) {
        AddEntry($R, $k, 0 - $B->{$k});
      }
    }
  }
  return $R;
}

# Get entry from profile; zero if not present
sub GetEntry {
  my ($profile, $k) = @_;
  return exists($profile->{$k}) ? $profile->{$k} : 0;
}

# Add entry to specified profile (creating the key at zero if needed)
sub AddEntry {
  my ($profile, $k, $n) = @_;
  if (!exists($profile->{$k})) {
    $profile->{$k} = 0;
  }
  $profile->{$k} += $n;
}

# Add a stack of entries to specified profile, and add them to the $pcs
# list.  $stack is a whitespace-separated list of addresses; each one is
# passed through HexExtend before use.
sub AddEntries {
  my ($profile, $pcs, $stack, $count) = @_;
  my @k = ();

  foreach my $e (split(/\s+/, $stack)) {
    my $pc = HexExtend($e);
    $pcs->{$pc} = 1;
    push @k, $pc;
  }
  AddEntry($profile, (join "\n", @k), $count);
}

##### Code to profile a server dynamically #####

# Fetches the symbol page and reports an error unless its first line
# announces a non-zero number of symbols.
sub CheckSymbolPage {
  my $url = SymbolPageURL();
  my $command = ShellEscape(@URL_FETCHER, $url);
  open(SYMBOL, "$command |") or error($command);
  my $line = <SYMBOL>;
  close(SYMBOL);
  # Test for "no output" before touching $line, so the substitution below
  # never runs on undef.
  unless (defined($line)) {
    error("$url doesn't exist\n");
  }
  $line =~ s/\r//g;         # turn windows-looking lines into unix-looking lines

  if ($line =~ /^num_symbols:\s+(\d+)$/) {
    if ($1 == 0) {
      error("Stripped binary. No symbols available.\n");
    }
  } else {
    error("Failed to get the number of symbols from $url\n");
  }
}

# Returns 0 (after a note on stderr) when the name is an existing local
# file, 1 otherwise.
sub IsProfileURL {
  my $profile_name = shift;
  if (-f $profile_name) {
    printf STDERR "Using local file $profile_name.\n";
    return 0;
  }
  return 1;
}

# Splits a profile URL into (host, base URL, profile path).  Returns the
# empty list for an undefined or empty name.
sub ParseProfileURL {
  my $profile_name = shift;

  if (!defined($profile_name) || $profile_name eq "") {
    return ();
  }

  # Split profile URL - matches all non-empty strings, so no test.
  $profile_name =~ m,^(https?://)?([^/]+)(.*?)(/|$PROFILES)?$,;

  my $proto = $1 || "http://";
  my $hostport = $2;
  my $prefix = $3;
  my $profile = $4 || "/";

  my $host = $hostport;
  $host =~ s/:.*//;

  my $baseurl = "$proto$hostport$prefix";
  return ($host, $baseurl, $profile);
}

# We fetch symbols from the first profile argument.
sub SymbolPageURL {
  my ($host, $baseURL, $path) = ParseProfileURL($main::pfile_args[0]);
  return "$baseURL$SYMBOL_PAGE";
}

# Fetches the program-name page of the first profile argument and returns
# its first line, cut at the first NUL and with line feeds removed.
sub FetchProgramName() {
  my ($host, $baseURL, $path) = ParseProfileURL($main::pfile_args[0]);
  my $url = "$baseURL$PROGRAM_NAME_PAGE";
  my $command_line = ShellEscape(@URL_FETCHER, $url);
  open(CMDLINE, "$command_line |") or error($command_line);
  my $cmdline = <CMDLINE>;
  close(CMDLINE);
  error("Failed to get program name from $url\n") unless defined($cmdline);
  $cmdline =~ s/\r//g;   # turn windows-looking lines into unix-looking lines
  $cmdline =~ s/\x00.+//;  # Remove argv[1] and later arguments.
  $cmdline =~ s!\n!!g;  # Remove LFs.
  return $cmdline;
}

# Gee, curl's -L (--location) option isn't reliable at least
# with its 7.12.3 version.  Curl will forget to post data if
# there is a redirection.  This function is a workaround for
# curl.  Redirection happens on borg hosts.
sub ResolveRedirectionForCurl {
  # Issues a --head request and returns the target of the last
  # "Location:" header seen, or the original URL when there is none.
  my $url = shift;
  my $command_line = ShellEscape(@URL_FETCHER, "--head", $url);
  open(CMDLINE, "$command_line |") or error($command_line);
  while (my $header = <CMDLINE>) {
    $header =~ s/\r//g;  # turn windows-looking lines into unix-looking lines
    if ($header =~ /^Location: (.*)/) {
      $url = $1;
    }
  }
  close(CMDLINE);
  return $url;
}

# Add a timeout flag to URL_FETCHER.  Returns a new list.
# The fetcher is returned unchanged when $timeout is undef or when the
# command is neither "curl -s" nor rpcget.
sub AddFetchTimeout {
  my $timeout = shift;
  my @fetcher = @_;
  if (defined($timeout)) {
    my $joined = join(" ", @fetcher);
    if ($joined =~ m/\bcurl -s/) {
      push(@fetcher, "--max-time", sprintf("%d", $timeout));
    } elsif ($joined =~ m/\brpcget\b/) {
      push(@fetcher, sprintf("--deadline=%d", $timeout));
    }
  }
  return @fetcher;
}

# Reads a symbol map from the file handle name given as $1, returning
# the resulting symbol map.  Also processes variables relating to symbols.
# Currently, the only variable processed is 'binary=' which updates
# $main::prog to have the correct program name.
sub ReadSymbols {
  my $in = shift;
  my $map = {};
  while (<$in>) {
    s/\r//g;         # turn windows-looking lines into unix-looking lines
    # Removes all the leading zeroes from the symbols, see comment below.
    if (m/^0x0*([0-9a-f]+)\s+(.+)/) {
      $map->{$1} = $2;
    } elsif (m/^---/) {
      # A "---" line ends the symbol section.
      last;
    } elsif (m/^([a-z][^=]*)=(.*)$/ ) {
      my ($variable, $value) = ($1, $2);
      for ($variable, $value) {
        s/^\s+//;
        s/\s+$//;
      }
      if ($variable eq "binary") {
        if ($main::prog ne $UNKNOWN_BINARY && $main::prog ne $value) {
          printf STDERR ("Warning: Mismatched binary name '%s', using '%s'.\n",
                         $main::prog, $value);
        }
        $main::prog = $value;
      } else {
        printf STDERR ("Ignoring unknown variable in symbols list: " .
            "'%s' = '%s'\n", $variable, $value);
      }
    }
  }
  return $map;
}

# Percent-encodes every character outside A-Za-z0-9 and -_.!~*'()
sub URLEncode {
  my $str = shift;
  $str =~ s/([^A-Za-z0-9\-_.!~*'()])/ sprintf "%%%02x", ord $1 /eg;
  return $str;
}

# Appends the --retain / --exclude regexps, when set, as a query string.
sub AppendSymbolFilterParams {
  my $url = shift;
  my @params = ();
  if ($main::opt_retain ne '') {
    push(@params, sprintf("retain=%s", URLEncode($main::opt_retain)));
  }
  if ($main::opt_exclude ne '') {
    push(@params, sprintf("exclude=%s", URLEncode($main::opt_exclude)));
  }
  if (scalar @params > 0) {
    $url = sprintf("%s?%s", $url, join("&", @params));
  }
  return $url;
}

# Fetches and processes symbols to prepare them for use in the profile output
# code.  If the optional 'symbol_map' arg is not given, fetches symbols from
# $SYMBOL_PAGE for all PC values found in profile.  Otherwise, the raw symbols
# are assumed to have already been fetched into 'symbol_map' and are simply
# extracted and processed.
sub FetchSymbols {
  my $pcset = shift;
  my $symbol_map = shift;

  my %seen = ();
  my @pcs = grep { !$seen{$_}++ } keys(%$pcset);  # uniq

  if (!defined($symbol_map)) {
    my $post_data = join("+", sort((map {"0x" . "$_"} @pcs)));

    # Fail loudly if the post file cannot be written, rather than posting
    # an empty or stale file later.
    open(POSTFILE, ">$main::tmpfile_sym")
        or error("$main::tmpfile_sym: $!\n");
    print POSTFILE $post_data;
    close(POSTFILE);

    my $url = SymbolPageURL();

    my $command_line;
    if (join(" ", @URL_FETCHER) =~ m/\bcurl -s/) {
      $url = ResolveRedirectionForCurl($url);
      $url = AppendSymbolFilterParams($url);
      $command_line = ShellEscape(@URL_FETCHER, "-d", "\@$main::tmpfile_sym",
                                  $url);
    } else {
      $url = AppendSymbolFilterParams($url);
      $command_line = (ShellEscape(@URL_FETCHER, "--post", $url)
                       . " < " . ShellEscape($main::tmpfile_sym));
    }
    # We use c++filt in case $SYMBOL_PAGE gives us mangled symbols.
    my $escaped_cppfilt = ShellEscape($obj_tool_map{"c++filt"});
    open(SYMBOL, "$command_line | $escaped_cppfilt |") or error($command_line);
    $symbol_map = ReadSymbols(*SYMBOL{IO});
    close(SYMBOL);
  }

  my $symbols = {};
  foreach my $pc (@pcs) {
    # For 64 bits binaries, symbols are extracted with 8 leading zeroes.
    # Then /symbol reads the long symbols in as uint64, and outputs
    # the result with a "0x%08llx" format which get rid of the zeroes.
    # By removing all the leading zeroes in both $pc and the symbols from
    # /symbol, the symbols match and are retrievable from the map.
    my $shortpc = $pc;
    $shortpc =~ s/^0*//;
    # Each line may have a list of names, which includes the function
    # and also other functions it has inlined.  They are separated (in
    # PrintSymbolizedProfile), by --, which is illegal in function names.
    my $fullnames;
    if (defined($symbol_map->{$shortpc})) {
      $fullnames = $symbol_map->{$shortpc};
    } else {
      $fullnames = "0x" . $pc;  # Just use addresses
    }
    my $sym = [];
    $symbols->{$pc} = $sym;
    foreach my $fullname (split("--", $fullnames)) {
      my $name = ShortFunctionName($fullname);
      push(@{$sym}, $name, "?", $fullname);
    }
  }
  return $symbols;
}

# Returns the part of a path after the last slash
sub BaseName {
  my $file_name = shift;
  $file_name =~ s!^.*/!!;  # Remove directory name
  return $file_name;
}

# Builds "<binary basename>.<$main::op_time>.<host>" for a profile URL
sub MakeProfileBaseName {
  my ($binary_name, $profile_name) = @_;
  my ($host, $baseURL, $path) = ParseProfileURL($profile_name);
  my $binary_shortname = BaseName($binary_name);
  return sprintf("%s.%s.%s",
                 $binary_shortname, $main::op_time, $host);
}

# Resolves one profile argument to a local file name.
#   - An existing local file is returned as is.
#   - Otherwise the argument is treated as a URL.  With $fetch_name_only > 0
#     only the destination file name is computed and returned; otherwise the
#     profile is downloaded to a temporary name and renamed into place.
# Sets $main::profile_type from the URL path as a side effect.
sub FetchDynamicProfile {
  my $binary_name = shift;
  my $profile_name = shift;
  my $fetch_name_only = shift;
  my $encourage_patience = shift;

  if (!IsProfileURL($profile_name)) {
    return $profile_name;
  } else {
    my ($host, $baseURL, $path) = ParseProfileURL($profile_name);
    if ($path eq "" || $path eq "/") {
      # Missing type specifier defaults to cpu-profile
      $path = $PROFILE_PAGE;
    }

    my $profile_file = MakeProfileBaseName($binary_name, $profile_name);

    my $url = "$baseURL$path";
    my $fetch_timeout = undef;
    if ($path =~ m/$PROFILE_PAGE|$PMUPROFILE_PAGE/) {
      if ($path =~ m/[?]/) {
        $url .= "&";
      } else {
        $url .= "?";
      }
      $url .= sprintf("seconds=%d", $main::opt_seconds);
      $fetch_timeout = $main::opt_seconds * 1.01 + 60;
      # Set $profile_type for consumption by PrintSymbolizedProfile.
      $main::profile_type = 'cpu';
    } else {
      # For non-CPU profiles, we add a type-extension to
      # the target profile file name.
      my $suffix = $path;
      $suffix =~ s,/,.,g;
      $profile_file .= $suffix;
      # Set $profile_type for consumption by PrintSymbolizedProfile.
      if ($path =~ m/$HEAP_PAGE/) {
        $main::profile_type = 'heap';
      } elsif ($path =~ m/$GROWTH_PAGE/) {
        $main::profile_type = 'growth';
      } elsif ($path =~ m/$CONTENTION_PAGE/) {
        $main::profile_type = 'contention';
      }
    }

    my $profile_dir = $ENV{"JEPROF_TMPDIR"} || ($ENV{HOME} . "/jeprof");
    if (! -d $profile_dir) {
      mkdir($profile_dir)
          || die("Unable to create profile directory $profile_dir: $!\n");
    }
    my $tmp_profile = "$profile_dir/.tmp.$profile_file";
    my $real_profile = "$profile_dir/$profile_file";

    if ($fetch_name_only > 0) {
      return $real_profile;
    }

    my @fetcher = AddFetchTimeout($fetch_timeout, @URL_FETCHER);
    my $cmd = ShellEscape(@fetcher, $url) . " > " . ShellEscape($tmp_profile);
    if ($path =~ m/$PROFILE_PAGE|$PMUPROFILE_PAGE|$CENSUSPROFILE_PAGE/){
      print STDERR "Gathering CPU profile from $url for $main::opt_seconds seconds to\n ${real_profile}\n";
      if ($encourage_patience) {
        print STDERR "Be patient...\n";
      }
    } else {
      print STDERR "Fetching $path profile from $url to\n ${real_profile}\n";
    }

    (system($cmd) == 0) || error("Failed to get profile: $cmd: $!\n");
    (system("mv", $tmp_profile, $real_profile) == 0) || error("Unable to rename profile\n");
    print STDERR "Wrote profile to $real_profile\n";
    $main::collected_profile = $real_profile;
    return $main::collected_profile;
  }
}

# Collect profiles in parallel
sub FetchDynamicProfiles {
  my $items = scalar(@main::pfile_args);
  my $levels = log($items) / log(2);

  if ($items == 1) {
    $main::profile_files[0] = FetchDynamicProfile($main::prog, $main::pfile_args[0], 0, 1);
  } else {
    # math rounding issues
    if ((2 ** $levels) < $items) {
      $levels++;
    }
    # First pass only computes the destination file names; the actual
    # downloads happen in the forked processes below.
    my $count = scalar(@main::pfile_args);
    for (my $i = 0; $i < $count; $i++) {
      $main::profile_files[$i] = FetchDynamicProfile($main::prog, $main::pfile_args[$i], 1, 0);
    }
    print STDERR "Fetching $count profiles, Be patient...\n";
    FetchDynamicProfilesRecurse($levels, 0, 0);
    $main::collected_profile = join(" \\\n ", @main::profile_files);
  }
}

# Recursively fork a process to get enough processes
# collecting profiles.  Each fork appends one bit to $position: 0 in the
# parent, 1 in the child.
sub FetchDynamicProfilesRecurse {
  my $maxlevel = shift;
  my $level = shift;
  my $position = shift;

  if (my $pid = fork()) {
    $position = 0 | ($position << 1);
    TryCollectProfile($maxlevel, $level, $position);
    wait;
  } else {
    $position = 1 | ($position << 1);
    TryCollectProfile($maxlevel, $level, $position);
    cleanup();
    exit(0);
  }
}

# Collect a single profile once the deepest level is reached; otherwise
# recurse one level further.
sub TryCollectProfile {
  my $maxlevel = shift;
  my $level = shift;
  my $position = shift;

  if ($level >= ($maxlevel - 1)) {
    if ($position < scalar(@main::pfile_args)) {
      FetchDynamicProfile($main::prog, $main::pfile_args[$position], 0, 0);
    }
  } else {
    FetchDynamicProfilesRecurse($maxlevel, $level+1, $position);
  }
}

##### Parsing code #####

# Provide a small streaming-read module to handle very large
# cpu-profile files.  Stream in chunks along a sliding window.
# Provides an interface to get one 'slot', correctly handling
# endian-ness differences.  A slot is one 32-bit or 64-bit word
# (depending on the input profile).  We tell endianness and bit-size
# for the profile by looking at the first 8 bytes: in cpu profiles,
# the second slot is always 3 (we'll accept anything that's not 0).
BEGIN {
  package CpuProfileStream;

  # Constructor.  $file is the open profile handle; $fname is only used
  # in error messages.  Reads the first slots to detect word size and
  # byte order.
  sub new {
    my ($class, $file, $fname) = @_;
    my $self = { file        => $file,
                 base        => 0,
                 stride      => 512 * 1024,   # must be a multiple of bitsize/8
                 slots       => [],
                 unpack_code => "",           # N for big-endian, V for little
                 perl_is_64bit => 1,          # matters if profile is 64-bit
    };
    bless $self, $class;
    # Let unittests adjust the stride
    if ($main::opt_test_stride > 0) {
      $self->{stride} = $main::opt_test_stride;
    }
    # Read the first two slots to figure out bitsize and endianness.
    my $slots = $self->{slots};
    my $str;
    read($self->{file}, $str, 8);
    # Set the global $address_length based on what we see here.
    # 8 is 32-bit (8 hexadecimal chars); 16 is 64-bit (16 hexadecimal chars).
    $address_length = ($str eq (chr(0)x8)) ? 16 : 8;
    if ($address_length == 8) {
      if (substr($str, 6, 2) eq chr(0)x2) {
        $self->{unpack_code} = 'V';  # Little-endian.
      } elsif (substr($str, 4, 2) eq chr(0)x2) {
        $self->{unpack_code} = 'N';  # Big-endian
      } else {
        ::error("$fname: header size >= 2**16\n");
      }
      @$slots = unpack($self->{unpack_code} . "*", $str);
    } else {
      # If we're a 64-bit profile, check if we're a 64-bit-capable
      # perl.  Otherwise, each slot will be represented as a float
      # instead of an int64, losing precision and making all the
      # 64-bit addresses wrong.  We won't complain yet, but will
      # later if we ever see a value that doesn't fit in 32 bits.
      my $has_q = 0;
      eval { $has_q = pack("Q", "1") ? 1 : 1; };
      if (!$has_q) {
        $self->{perl_is_64bit} = 0;
      }
      read($self->{file}, $str, 8);
      if (substr($str, 4, 4) eq chr(0)x4) {
        # We'd love to use 'Q', but it's a) not universal, b) not endian-proof.
        $self->{unpack_code} = 'V';  # Little-endian.
      } elsif (substr($str, 0, 4) eq chr(0)x4) {
        $self->{unpack_code} = 'N';  # Big-endian
      } else {
        ::error("$fname: header size >= 2**32\n");
      }
      my @pair = unpack($self->{unpack_code} . "*", $str);
      # Since we know one of the pair is 0, it's fine to just add them.
      @$slots = (0, $pair[0] + $pair[1]);
    }
    return $self;
  }

  # Load more data when we access slots->get(X) which is not yet in memory.
  # Replaces the current window with the next stride-sized chunk.
  sub overflow {
    my ($self) = @_;
    my $slots = $self->{slots};
    $self->{base} += $#$slots + 1;   # skip over data we're replacing
    my $str;
    read($self->{file}, $str, $self->{stride});
    if ($address_length == 8) {      # the 32-bit case
      # This is the easy case: unpack provides 32-bit unpacking primitives.
      @$slots = unpack($self->{unpack_code} . "*", $str);
    } else {
      # We need to unpack 32 bits at a time and combine.
      my @b32_values = unpack($self->{unpack_code} . "*", $str);
      my @b64_values = ();
      for (my $i = 0; $i < $#b32_values; $i += 2) {
        # TODO(csilvers): if this is a 32-bit perl, the math below
        #    could end up in a too-large int, which perl will promote
        #    to a double, losing necessary precision.  Deal with that.
        #    Right now, we just die.
        my ($lo, $hi) = ($b32_values[$i], $b32_values[$i+1]);
        if ($self->{unpack_code} eq 'N') {    # big-endian
          ($lo, $hi) = ($hi, $lo);
        }
        my $value = $lo + $hi * (2**32);
        if (!$self->{perl_is_64bit} &&   # check value is exactly represented
            (($value % (2**32)) != $lo || int($value / (2**32)) != $hi)) {
          ::error("Need a 64-bit perl to process this 64-bit profile.\n");
        }
        push(@b64_values, $value);
      }
      @$slots = @b64_values;
    }
  }

  # Access the i-th long in the file (logically), or -1 at EOF.
  sub get {
    my ($self, $idx) = @_;
    my $slots = $self->{slots};
    while ($#$slots >= 0) {
      if ($idx < $self->{base}) {
        # The only time we expect a reference to $slots[$i - something]
        # after referencing $slots[$i] is reading the very first header.
        # Since $stride > |header|, that shouldn't cause any lookback
        # errors.  And everything after the header is sequential.
        print STDERR "Unexpected look-back reading CPU profile";
        return -1;   # shrug, don't know what better to return
      } elsif ($idx > $self->{base} + $#$slots) {
        $self->overflow();
      } else {
        return $slots->[$idx - $self->{base}];
      }
    }
    # If we get here, $slots is [], which means we've reached EOF
    return -1;  # unique since slots is supposed to hold unsigned numbers
  }
}

# Reads the top, 'header' section of a profile, and returns the last
# line of the header, commonly called a 'header line'.  The header
# section of a profile consists of zero or more 'command' lines that
# are instructions to jeprof, which jeprof executes when reading the
# header.  All 'command' lines start with a %.  After the command
# lines is the 'header line', which is a profile-specific line that
# indicates what type of profile it is, and perhaps other global
# information about the profile.  For instance, here's a header line
# for a heap profile:
#   heap profile: 53: 38236 [ 5525: 1284029] @ heapprofile
# For historical reasons, the CPU profile does not contain a text-
# readable header line.
If the profile looks like a CPU profile, +# this function returns "". If no header line could be found, this +# function returns undef. +# +# The following commands are recognized: +# %warn -- emit the rest of this line to stderr, prefixed by 'WARNING:' +# +# The input file should be in binmode. +sub ReadProfileHeader { + local *PROFILE = shift; + my $firstchar = ""; + my $line = ""; + read(PROFILE, $firstchar, 1); + seek(PROFILE, -1, 1); # unread the firstchar + if ($firstchar !~ /[[:print:]]/) { # is not a text character + return ""; + } + while (defined($line = )) { + $line =~ s/\r//g; # turn windows-looking lines into unix-looking lines + if ($line =~ /^%warn\s+(.*)/) { # 'warn' command + # Note this matches both '%warn blah\n' and '%warn\n'. + print STDERR "WARNING: $1\n"; # print the rest of the line + } elsif ($line =~ /^%/) { + print STDERR "Ignoring unknown command from profile header: $line"; + } else { + # End of commands, must be the header line. + return $line; + } + } + return undef; # got to EOF without seeing a header line +} + +sub IsSymbolizedProfileFile { + my $file_name = shift; + if (!(-e $file_name) || !(-r $file_name)) { + return 0; + } + # Check if the file contains a symbol-section marker. 
+ open(TFILE, "<$file_name"); + binmode TFILE; + my $firstline = ReadProfileHeader(*TFILE); + close(TFILE); + if (!$firstline) { + return 0; + } + $SYMBOL_PAGE =~ m,[^/]+$,; # matches everything after the last slash + my $symbol_marker = $&; + return $firstline =~ /^--- *$symbol_marker/; +} + +# Parse profile generated by common/profiler.cc and return a reference +# to a map: +# $result->{version} Version number of profile file +# $result->{period} Sampling period (in microseconds) +# $result->{profile} Profile object +# $result->{threads} Map of thread IDs to profile objects +# $result->{map} Memory map info from profile +# $result->{pcs} Hash of all PC values seen, key is hex address +sub ReadProfile { + my $prog = shift; + my $fname = shift; + my $result; # return value + + $CONTENTION_PAGE =~ m,[^/]+$,; # matches everything after the last slash + my $contention_marker = $&; + $GROWTH_PAGE =~ m,[^/]+$,; # matches everything after the last slash + my $growth_marker = $&; + $SYMBOL_PAGE =~ m,[^/]+$,; # matches everything after the last slash + my $symbol_marker = $&; + $PROFILE_PAGE =~ m,[^/]+$,; # matches everything after the last slash + my $profile_marker = $&; + $HEAP_PAGE =~ m,[^/]+$,; # matches everything after the last slash + my $heap_marker = $&; + + # Look at first line to see if it is a heap or a CPU profile. + # CPU profile may start with no header at all, and just binary data + # (starting with \0\0\0\0) -- in that case, don't try to read the + # whole firstline, since it may be gigabytes(!) of data. 
+ open(PROFILE, "<$fname") || error("$fname: $!\n"); + binmode PROFILE; # New perls do UTF-8 processing + my $header = ReadProfileHeader(*PROFILE); + if (!defined($header)) { # means "at EOF" + error("Profile is empty.\n"); + } + + my $symbols; + if ($header =~ m/^--- *$symbol_marker/o) { + # Verify that the user asked for a symbolized profile + if (!$main::use_symbolized_profile) { + # we have both a binary and symbolized profiles, abort + error("FATAL ERROR: Symbolized profile\n $fname\ncannot be used with " . + "a binary arg. Try again without passing\n $prog\n"); + } + # Read the symbol section of the symbolized profile file. + $symbols = ReadSymbols(*PROFILE{IO}); + # Read the next line to get the header for the remaining profile. + $header = ReadProfileHeader(*PROFILE) || ""; + } + + if ($header =~ m/^--- *($heap_marker|$growth_marker)/o) { + # Skip "--- ..." line for profile types that have their own headers. + $header = ReadProfileHeader(*PROFILE) || ""; + } + + $main::profile_type = ''; + + if ($header =~ m/^heap profile:.*$growth_marker/o) { + $main::profile_type = 'growth'; + $result = ReadHeapProfile($prog, *PROFILE, $header); + } elsif ($header =~ m/^heap profile:/) { + $main::profile_type = 'heap'; + $result = ReadHeapProfile($prog, *PROFILE, $header); + } elsif ($header =~ m/^heap/) { + $main::profile_type = 'heap'; + $result = ReadThreadedHeapProfile($prog, $fname, $header); + } elsif ($header =~ m/^--- *$contention_marker/o) { + $main::profile_type = 'contention'; + $result = ReadSynchProfile($prog, *PROFILE); + } elsif ($header =~ m/^--- *Stacks:/) { + print STDERR + "Old format contention profile: mistakenly reports " . 
+ "condition variable signals as lock contentions.\n"; + $main::profile_type = 'contention'; + $result = ReadSynchProfile($prog, *PROFILE); + } elsif ($header =~ m/^--- *$profile_marker/) { + # the binary cpu profile data starts immediately after this line + $main::profile_type = 'cpu'; + $result = ReadCPUProfile($prog, $fname, *PROFILE); + } else { + if (defined($symbols)) { + # a symbolized profile contains a format we don't recognize, bail out + error("$fname: Cannot recognize profile section after symbols.\n"); + } + # no ascii header present -- must be a CPU profile + $main::profile_type = 'cpu'; + $result = ReadCPUProfile($prog, $fname, *PROFILE); + } + + close(PROFILE); + + # if we got symbols along with the profile, return those as well + if (defined($symbols)) { + $result->{symbols} = $symbols; + } + + return $result; +} + +# Subtract one from caller pc so we map back to call instr. +# However, don't do this if we're reading a symbolized profile +# file, in which case the subtract-one was done when the file +# was written. +# +# We apply the same logic to all readers, though ReadCPUProfile uses an +# independent implementation. +sub FixCallerAddresses { + my $stack = shift; + # --raw/http: Always subtract one from pc's, because PrintSymbolizedProfile() + # dumps unadjusted profiles. + { + $stack =~ /(\s)/; + my $delimiter = $1; + my @addrs = split(' ', $stack); + my @fixedaddrs; + $#fixedaddrs = $#addrs; + if ($#addrs >= 0) { + $fixedaddrs[0] = $addrs[0]; + } + for (my $i = 1; $i <= $#addrs; $i++) { + $fixedaddrs[$i] = AddressSub($addrs[$i], "0x1"); + } + return join $delimiter, @fixedaddrs; + } +} + +# CPU profile reader +sub ReadCPUProfile { + my $prog = shift; + my $fname = shift; # just used for logging + local *PROFILE = shift; + my $version; + my $period; + my $i; + my $profile = {}; + my $pcs = {}; + + # Parse string into array of slots. + my $slots = CpuProfileStream->new(*PROFILE, $fname); + + # Read header. 
The current header version is a 5-element structure + # containing: + # 0: header count (always 0) + # 1: header "words" (after this one: 3) + # 2: format version (0) + # 3: sampling period (usec) + # 4: unused padding (always 0) + if ($slots->get(0) != 0 ) { + error("$fname: not a profile file, or old format profile file\n"); + } + $i = 2 + $slots->get(1); + $version = $slots->get(2); + $period = $slots->get(3); + # Do some sanity checking on these header values. + if ($version > (2**32) || $period > (2**32) || $i > (2**32) || $i < 5) { + error("$fname: not a profile file, or corrupted profile file\n"); + } + + # Parse profile + while ($slots->get($i) != -1) { + my $n = $slots->get($i++); + my $d = $slots->get($i++); + if ($d > (2**16)) { # TODO(csilvers): what's a reasonable max-stack-depth? + my $addr = sprintf("0%o", $i * ($address_length == 8 ? 4 : 8)); + print STDERR "At index $i (address $addr):\n"; + error("$fname: stack trace depth >= 2**32\n"); + } + if ($slots->get($i) == 0) { + # End of profile data marker + $i += $d; + last; + } + + # Make key out of the stack entries + my @k = (); + for (my $j = 0; $j < $d; $j++) { + my $pc = $slots->get($i+$j); + # Subtract one from caller pc so we map back to call instr. 
+ $pc--; + $pc = sprintf("%0*x", $address_length, $pc); + $pcs->{$pc} = 1; + push @k, $pc; + } + + AddEntry($profile, (join "\n", @k), $n); + $i += $d; + } + + # Parse map + my $map = ''; + seek(PROFILE, $i * 4, 0); + read(PROFILE, $map, (stat PROFILE)[7]); + + my $r = {}; + $r->{version} = $version; + $r->{period} = $period; + $r->{profile} = $profile; + $r->{libs} = ParseLibraries($prog, $map, $pcs); + $r->{pcs} = $pcs; + + return $r; +} + +sub HeapProfileIndex { + my $index = 1; + if ($main::opt_inuse_space) { + $index = 1; + } elsif ($main::opt_inuse_objects) { + $index = 0; + } elsif ($main::opt_alloc_space) { + $index = 3; + } elsif ($main::opt_alloc_objects) { + $index = 2; + } + return $index; +} + +sub ReadMappedLibraries { + my $fh = shift; + my $map = ""; + # Read the /proc/self/maps data + while (<$fh>) { + s/\r//g; # turn windows-looking lines into unix-looking lines + $map .= $_; + } + return $map; +} + +sub ReadMemoryMap { + my $fh = shift; + my $map = ""; + # Read /proc/self/maps data as formatted by DumpAddressMap() + my $buildvar = ""; + while () { + s/\r//g; # turn windows-looking lines into unix-looking lines + # Parse "build=" specification if supplied + if (m/^\s*build=(.*)\n/) { + $buildvar = $1; + } + + # Expand "$build" variable if available + $_ =~ s/\$build\b/$buildvar/g; + + $map .= $_; + } + return $map; +} + +sub AdjustSamples { + my ($sample_adjustment, $sampling_algorithm, $n1, $s1, $n2, $s2) = @_; + if ($sample_adjustment) { + if ($sampling_algorithm == 2) { + # Remote-heap version 2 + # The sampling frequency is the rate of a Poisson process. 
+ # This means that the probability of sampling an allocation of + # size X with sampling rate Y is 1 - exp(-X/Y) + if ($n1 != 0) { + my $ratio = (($s1*1.0)/$n1)/($sample_adjustment); + my $scale_factor = 1/(1 - exp(-$ratio)); + $n1 *= $scale_factor; + $s1 *= $scale_factor; + } + if ($n2 != 0) { + my $ratio = (($s2*1.0)/$n2)/($sample_adjustment); + my $scale_factor = 1/(1 - exp(-$ratio)); + $n2 *= $scale_factor; + $s2 *= $scale_factor; + } + } else { + # Remote-heap version 1 + my $ratio; + $ratio = (($s1*1.0)/$n1)/($sample_adjustment); + if ($ratio < 1) { + $n1 /= $ratio; + $s1 /= $ratio; + } + $ratio = (($s2*1.0)/$n2)/($sample_adjustment); + if ($ratio < 1) { + $n2 /= $ratio; + $s2 /= $ratio; + } + } + } + return ($n1, $s1, $n2, $s2); +} + +sub ReadHeapProfile { + my $prog = shift; + local *PROFILE = shift; + my $header = shift; + + my $index = HeapProfileIndex(); + + # Find the type of this profile. The header line looks like: + # heap profile: 1246: 8800744 [ 1246: 8800744] @ /266053 + # There are two pairs , the first inuse objects/space, and the + # second allocated objects/space. This is followed optionally by a profile + # type, and if that is present, optionally by a sampling frequency. + # For remote heap profiles (v1): + # The interpretation of the sampling frequency is that the profiler, for + # each sample, calculates a uniformly distributed random integer less than + # the given value, and records the next sample after that many bytes have + # been allocated. Therefore, the expected sample interval is half of the + # given frequency. By default, if not specified, the expected sample + # interval is 128KB. Only remote-heap-page profiles are adjusted for + # sample size. + # For remote heap profiles (v2): + # The sampling frequency is the rate of a Poisson process. 
This means that + # the probability of sampling an allocation of size X with sampling rate Y + # is 1 - exp(-X/Y) + # For version 2, a typical header line might look like this: + # heap profile: 1922: 127792360 [ 1922: 127792360] @ _v2/524288 + # the trailing number (524288) is the sampling rate. (Version 1 showed + # double the 'rate' here) + my $sampling_algorithm = 0; + my $sample_adjustment = 0; + chomp($header); + my $type = "unknown"; + if ($header =~ m"^heap profile:\s*(\d+):\s+(\d+)\s+\[\s*(\d+):\s+(\d+)\](\s*@\s*([^/]*)(/(\d+))?)?") { + if (defined($6) && ($6 ne '')) { + $type = $6; + my $sample_period = $8; + # $type is "heapprofile" for profiles generated by the + # heap-profiler, and either "heap" or "heap_v2" for profiles + # generated by sampling directly within tcmalloc. It can also + # be "growth" for heap-growth profiles. The first is typically + # found for profiles generated locally, and the others for + # remote profiles. + if (($type eq "heapprofile") || ($type !~ /heap/) ) { + # No need to adjust for the sampling rate with heap-profiler-derived data + $sampling_algorithm = 0; + } elsif ($type =~ /_v2/) { + $sampling_algorithm = 2; # version 2 sampling + if (defined($sample_period) && ($sample_period ne '')) { + $sample_adjustment = int($sample_period); + } + } else { + $sampling_algorithm = 1; # version 1 sampling + if (defined($sample_period) && ($sample_period ne '')) { + $sample_adjustment = int($sample_period)/2; + } + } + } else { + # We detect whether or not this is a remote-heap profile by checking + # that the total-allocated stats ($n2,$s2) are exactly the + # same as the in-use stats ($n1,$s1). It is remotely conceivable + # that a non-remote-heap profile may pass this check, but it is hard + # to imagine how that could happen. + # In this case it's so old it's guaranteed to be remote-heap version 1. 
+ my ($n1, $s1, $n2, $s2) = ($1, $2, $3, $4); + if (($n1 == $n2) && ($s1 == $s2)) { + # This is likely to be a remote-heap based sample profile + $sampling_algorithm = 1; + } + } + } + + if ($sampling_algorithm > 0) { + # For remote-heap generated profiles, adjust the counts and sizes to + # account for the sample rate (we sample once every 128KB by default). + if ($sample_adjustment == 0) { + # Turn on profile adjustment. + $sample_adjustment = 128*1024; + print STDERR "Adjusting heap profiles for 1-in-128KB sampling rate\n"; + } else { + printf STDERR ("Adjusting heap profiles for 1-in-%d sampling rate\n", + $sample_adjustment); + } + if ($sampling_algorithm > 1) { + # We don't bother printing anything for the original version (version 1) + printf STDERR "Heap version $sampling_algorithm\n"; + } + } + + my $profile = {}; + my $pcs = {}; + my $map = ""; + + while () { + s/\r//g; # turn windows-looking lines into unix-looking lines + if (/^MAPPED_LIBRARIES:/) { + $map .= ReadMappedLibraries(*PROFILE); + last; + } + + if (/^--- Memory map:/) { + $map .= ReadMemoryMap(*PROFILE); + last; + } + + # Read entry of the form: + # : [: ] @ a1 a2 a3 ... an + s/^\s*//; + s/\s*$//; + if (m/^\s*(\d+):\s+(\d+)\s+\[\s*(\d+):\s+(\d+)\]\s+@\s+(.*)$/) { + my $stack = $5; + my ($n1, $s1, $n2, $s2) = ($1, $2, $3, $4); + my @counts = AdjustSamples($sample_adjustment, $sampling_algorithm, + $n1, $s1, $n2, $s2); + AddEntries($profile, $pcs, FixCallerAddresses($stack), $counts[$index]); + } + } + + my $r = {}; + $r->{version} = "heap"; + $r->{period} = 1; + $r->{profile} = $profile; + $r->{libs} = ParseLibraries($prog, $map, $pcs); + $r->{pcs} = $pcs; + return $r; +} + +sub ReadThreadedHeapProfile { + my ($prog, $fname, $header) = @_; + + my $index = HeapProfileIndex(); + my $sampling_algorithm = 0; + my $sample_adjustment = 0; + chomp($header); + my $type = "unknown"; + # Assuming a very specific type of header for now. 
+ if ($header =~ m"^heap_v2/(\d+)") { + $type = "_v2"; + $sampling_algorithm = 2; + $sample_adjustment = int($1); + } + if ($type ne "_v2" || !defined($sample_adjustment)) { + die "Threaded heap profiles require v2 sampling with a sample rate\n"; + } + + my $profile = {}; + my $thread_profiles = {}; + my $pcs = {}; + my $map = ""; + my $stack = ""; + + while () { + s/\r//g; + if (/^MAPPED_LIBRARIES:/) { + $map .= ReadMappedLibraries(*PROFILE); + last; + } + + if (/^--- Memory map:/) { + $map .= ReadMemoryMap(*PROFILE); + last; + } + + # Read entry of the form: + # @ a1 a2 ... an + # t*: : [: ] + # t1: : [: ] + # ... + # tn: : [: ] + s/^\s*//; + s/\s*$//; + if (m/^@\s+(.*)$/) { + $stack = $1; + } elsif (m/^\s*(t(\*|\d+)):\s+(\d+):\s+(\d+)\s+\[\s*(\d+):\s+(\d+)\]$/) { + if ($stack eq "") { + # Still in the header, so this is just a per-thread summary. + next; + } + my $thread = $2; + my ($n1, $s1, $n2, $s2) = ($3, $4, $5, $6); + my @counts = AdjustSamples($sample_adjustment, $sampling_algorithm, + $n1, $s1, $n2, $s2); + if ($thread eq "*") { + AddEntries($profile, $pcs, FixCallerAddresses($stack), $counts[$index]); + } else { + if (!exists($thread_profiles->{$thread})) { + $thread_profiles->{$thread} = {}; + } + AddEntries($thread_profiles->{$thread}, $pcs, + FixCallerAddresses($stack), $counts[$index]); + } + } + } + + my $r = {}; + $r->{version} = "heap"; + $r->{period} = 1; + $r->{profile} = $profile; + $r->{threads} = $thread_profiles; + $r->{libs} = ParseLibraries($prog, $map, $pcs); + $r->{pcs} = $pcs; + return $r; +} + +sub ReadSynchProfile { + my $prog = shift; + local *PROFILE = shift; + my $header = shift; + + my $map = ''; + my $profile = {}; + my $pcs = {}; + my $sampling_period = 1; + my $cyclespernanosec = 2.8; # Default assumption for old binaries + my $seen_clockrate = 0; + my $line; + + my $index = 0; + if ($main::opt_total_delay) { + $index = 0; + } elsif ($main::opt_contentions) { + $index = 1; + } elsif ($main::opt_mean_delay) { + $index = 2; + } 
+ + while ( $line = ) { + $line =~ s/\r//g; # turn windows-looking lines into unix-looking lines + if ( $line =~ /^\s*(\d+)\s+(\d+) \@\s*(.*?)\s*$/ ) { + my ($cycles, $count, $stack) = ($1, $2, $3); + + # Convert cycles to nanoseconds + $cycles /= $cyclespernanosec; + + # Adjust for sampling done by application + $cycles *= $sampling_period; + $count *= $sampling_period; + + my @values = ($cycles, $count, $cycles / $count); + AddEntries($profile, $pcs, FixCallerAddresses($stack), $values[$index]); + + } elsif ( $line =~ /^(slow release).*thread \d+ \@\s*(.*?)\s*$/ || + $line =~ /^\s*(\d+) \@\s*(.*?)\s*$/ ) { + my ($cycles, $stack) = ($1, $2); + if ($cycles !~ /^\d+$/) { + next; + } + + # Convert cycles to nanoseconds + $cycles /= $cyclespernanosec; + + # Adjust for sampling done by application + $cycles *= $sampling_period; + + AddEntries($profile, $pcs, FixCallerAddresses($stack), $cycles); + + } elsif ( $line =~ m/^([a-z][^=]*)=(.*)$/ ) { + my ($variable, $value) = ($1,$2); + for ($variable, $value) { + s/^\s+//; + s/\s+$//; + } + if ($variable eq "cycles/second") { + $cyclespernanosec = $value / 1e9; + $seen_clockrate = 1; + } elsif ($variable eq "sampling period") { + $sampling_period = $value; + } elsif ($variable eq "ms since reset") { + # Currently nothing is done with this value in jeprof + # So we just silently ignore it for now + } elsif ($variable eq "discarded samples") { + # Currently nothing is done with this value in jeprof + # So we just silently ignore it for now + } else { + printf STDERR ("Ignoring unnknown variable in /contention output: " . 
+ "'%s' = '%s'\n",$variable,$value); + } + } else { + # Memory map entry + $map .= $line; + } + } + + if (!$seen_clockrate) { + printf STDERR ("No cycles/second entry in profile; Guessing %.1f GHz\n", + $cyclespernanosec); + } + + my $r = {}; + $r->{version} = 0; + $r->{period} = $sampling_period; + $r->{profile} = $profile; + $r->{libs} = ParseLibraries($prog, $map, $pcs); + $r->{pcs} = $pcs; + return $r; +} + +# Given a hex value in the form "0x1abcd" or "1abcd", return either +# "0001abcd" or "000000000001abcd", depending on the current (global) +# address length. +sub HexExtend { + my $addr = shift; + + $addr =~ s/^(0x)?0*//; + my $zeros_needed = $address_length - length($addr); + if ($zeros_needed < 0) { + printf STDERR "Warning: address $addr is longer than address length $address_length\n"; + return $addr; + } + return ("0" x $zeros_needed) . $addr; +} + +##### Symbol extraction ##### + +# Aggressively search the lib_prefix values for the given library +# If all else fails, just return the name of the library unmodified. +# If the lib_prefix is "/my/path,/other/path" and $file is "/lib/dir/mylib.so" +# it will search the following locations in this order, until it finds a file: +# /my/path/lib/dir/mylib.so +# /other/path/lib/dir/mylib.so +# /my/path/dir/mylib.so +# /other/path/dir/mylib.so +# /my/path/mylib.so +# /other/path/mylib.so +# /lib/dir/mylib.so (returned as last resort) +sub FindLibrary { + my $file = shift; + my $suffix = $file; + + # Search for the library as described above + do { + foreach my $prefix (@prefix_list) { + my $fullpath = $prefix . $suffix; + if (-e $fullpath) { + return $fullpath; + } + } + } while ($suffix =~ s|^/[^/]+/|/|); + return $file; +} + +# Return path to library with debugging symbols. +# For libc libraries, the copy in /usr/lib/debug contains debugging symbols +sub DebuggingLibrary { + my $file = shift; + + if ($file !~ m|^/|) { + return undef; + } + + # Find debug symbol file if it's named after the library's name. 
+ + if (-f "/usr/lib/debug$file") { + if($main::opt_debug) { print STDERR "found debug info for $file in /usr/lib/debug$file\n"; } + return "/usr/lib/debug$file"; + } elsif (-f "/usr/lib/debug$file.debug") { + if($main::opt_debug) { print STDERR "found debug info for $file in /usr/lib/debug$file.debug\n"; } + return "/usr/lib/debug$file.debug"; + } + + if(!$main::opt_debug_syms_by_id) { + if($main::opt_debug) { print STDERR "no debug symbols found for $file\n" }; + return undef; + } + + # Find debug file if it's named after the library's build ID. + + my $readelf = ''; + if (!$main::gave_up_on_elfutils) { + $readelf = qx/eu-readelf -n ${file}/; + if ($?) { + print STDERR "Cannot run eu-readelf. To use --debug-syms-by-id you must be on Linux, with elfutils installed.\n"; + $main::gave_up_on_elfutils = 1; + return undef; + } + my $buildID = $1 if $readelf =~ /Build ID: ([A-Fa-f0-9]+)/s; + if (defined $buildID && length $buildID > 0) { + my $symbolFile = '/usr/lib/debug/.build-id/' . substr($buildID, 0, 2) . '/' . substr($buildID, 2) . '.debug'; + if (-e $symbolFile) { + if($main::opt_debug) { print STDERR "found debug symbol file $symbolFile for $file\n" }; + return $symbolFile; + } else { + if($main::opt_debug) { print STDERR "no debug symbol file found for $file, build ID: $buildID\n" }; + return undef; + } + } + } + + if($main::opt_debug) { print STDERR "no debug symbols found for $file, build ID unknown\n" }; + return undef; +} + + +# Parse text section header of a library using objdump +sub ParseTextSectionHeaderFromObjdump { + my $lib = shift; + + my $size = undef; + my $vma; + my $file_offset; + # Get objdump output from the library file to figure out how to + # map between mapped addresses and addresses in the library. 
+ my $cmd = ShellEscape($obj_tool_map{"objdump"}, "-h", $lib); + open(OBJDUMP, "$cmd |") || error("$cmd: $!\n"); + while () { + s/\r//g; # turn windows-looking lines into unix-looking lines + # Idx Name Size VMA LMA File off Algn + # 10 .text 00104b2c 420156f0 420156f0 000156f0 2**4 + # For 64-bit objects, VMA and LMA will be 16 hex digits, size and file + # offset may still be 8. But AddressSub below will still handle that. + my @x = split; + if (($#x >= 6) && ($x[1] eq '.text')) { + $size = $x[2]; + $vma = $x[3]; + $file_offset = $x[5]; + last; + } + } + close(OBJDUMP); + + if (!defined($size)) { + return undef; + } + + my $r = {}; + $r->{size} = $size; + $r->{vma} = $vma; + $r->{file_offset} = $file_offset; + + return $r; +} + +# Parse text section header of a library using otool (on OS X) +sub ParseTextSectionHeaderFromOtool { + my $lib = shift; + + my $size = undef; + my $vma = undef; + my $file_offset = undef; + # Get otool output from the library file to figure out how to + # map between mapped addresses and addresses in the library. + my $command = ShellEscape($obj_tool_map{"otool"}, "-l", $lib); + open(OTOOL, "$command |") || error("$command: $!\n"); + my $cmd = ""; + my $sectname = ""; + my $segname = ""; + foreach my $line () { + $line =~ s/\r//g; # turn windows-looking lines into unix-looking lines + # Load command <#> + # cmd LC_SEGMENT + # [...] + # Section + # sectname __text + # segname __TEXT + # addr 0x000009f8 + # size 0x00018b9e + # offset 2552 + # align 2^2 (4) + # We will need to strip off the leading 0x from the hex addresses, + # and convert the offset into hex. 
+ if ($line =~ /Load command/) { + $cmd = ""; + $sectname = ""; + $segname = ""; + } elsif ($line =~ /Section/) { + $sectname = ""; + $segname = ""; + } elsif ($line =~ /cmd (\w+)/) { + $cmd = $1; + } elsif ($line =~ /sectname (\w+)/) { + $sectname = $1; + } elsif ($line =~ /segname (\w+)/) { + $segname = $1; + } elsif (!(($cmd eq "LC_SEGMENT" || $cmd eq "LC_SEGMENT_64") && + $sectname eq "__text" && + $segname eq "__TEXT")) { + next; + } elsif ($line =~ /\baddr 0x([0-9a-fA-F]+)/) { + $vma = $1; + } elsif ($line =~ /\bsize 0x([0-9a-fA-F]+)/) { + $size = $1; + } elsif ($line =~ /\boffset ([0-9]+)/) { + $file_offset = sprintf("%016x", $1); + } + if (defined($vma) && defined($size) && defined($file_offset)) { + last; + } + } + close(OTOOL); + + if (!defined($vma) || !defined($size) || !defined($file_offset)) { + return undef; + } + + my $r = {}; + $r->{size} = $size; + $r->{vma} = $vma; + $r->{file_offset} = $file_offset; + + return $r; +} + +sub ParseTextSectionHeader { + # obj_tool_map("otool") is only defined if we're in a Mach-O environment + if (defined($obj_tool_map{"otool"})) { + my $r = ParseTextSectionHeaderFromOtool(@_); + if (defined($r)){ + return $r; + } + } + # If otool doesn't work, or we don't have it, fall back to objdump + return ParseTextSectionHeaderFromObjdump(@_); +} + +# Split /proc/pid/maps dump into a list of libraries +sub ParseLibraries { + return if $main::use_symbol_page; # We don't need libraries info. + my $prog = Cwd::abs_path(shift); + my $map = shift; + my $pcs = shift; + + my $result = []; + my $h = "[a-f0-9]+"; + my $zero_offset = HexExtend("0"); + + my $buildvar = ""; + foreach my $l (split("\n", $map)) { + if ($l =~ m/^\s*build=(.*)$/) { + $buildvar = $1; + } + + my $start; + my $finish; + my $offset; + my $lib; + if ($l =~ /^($h)-($h)\s+..x.\s+($h)\s+\S+:\S+\s+\d+\s+(\S+\.(so|dll|dylib|bundle)((\.\d+)+\w*(\.\d+){0,3})?)$/i) { + # Full line from /proc/self/maps. 
Example: + # 40000000-40015000 r-xp 00000000 03:01 12845071 /lib/ld-2.3.2.so + $start = HexExtend($1); + $finish = HexExtend($2); + $offset = HexExtend($3); + $lib = $4; + $lib =~ s|\\|/|g; # turn windows-style paths into unix-style paths + } elsif ($l =~ /^\s*($h)-($h):\s*(\S+\.so(\.\d+)*)/) { + # Cooked line from DumpAddressMap. Example: + # 40000000-40015000: /lib/ld-2.3.2.so + $start = HexExtend($1); + $finish = HexExtend($2); + $offset = $zero_offset; + $lib = $3; + } elsif (($l =~ /^($h)-($h)\s+..x.\s+($h)\s+\S+:\S+\s+\d+\s+(\S+)$/i) && ($4 eq $prog)) { + # PIEs and address space randomization do not play well with our + # default assumption that main executable is at lowest + # addresses. So we're detecting main executable in + # /proc/self/maps as well. + $start = HexExtend($1); + $finish = HexExtend($2); + $offset = HexExtend($3); + $lib = $4; + $lib =~ s|\\|/|g; # turn windows-style paths into unix-style paths + } + # FreeBSD 10.0 virtual memory map /proc/curproc/map as defined in + # function procfs_doprocmap (sys/fs/procfs/procfs_map.c) + # + # Example: + # 0x800600000 0x80061a000 26 0 0xfffff800035a0000 r-x 75 33 0x1004 COW NC vnode /libexec/ld-elf.s + # o.1 NCH -1 + elsif ($l =~ /^(0x$h)\s(0x$h)\s\d+\s\d+\s0x$h\sr-x\s\d+\s\d+\s0x\d+\s(COW|NCO)\s(NC|NNC)\svnode\s(\S+\.so(\.\d+)*)/) { + $start = HexExtend($1); + $finish = HexExtend($2); + $offset = $zero_offset; + $lib = FindLibrary($5); + + } else { + next; + } + + # Expand "$build" variable if available + $lib =~ s/\$build\b/$buildvar/g; + + $lib = FindLibrary($lib); + + # Check for pre-relocated libraries, which use pre-relocated symbol tables + # and thus require adjusting the offset that we'll use to translate + # VM addresses into symbol table addresses. + # Only do this if we're not going to fetch the symbol table from a + # debugging copy of the library. 
+ if (!DebuggingLibrary($lib)) { + my $text = ParseTextSectionHeader($lib); + if (defined($text)) { + my $vma_offset = AddressSub($text->{vma}, $text->{file_offset}); + $offset = AddressAdd($offset, $vma_offset); + } + } + + if($main::opt_debug) { printf STDERR "$start:$finish ($offset) $lib\n"; } + push(@{$result}, [$lib, $start, $finish, $offset]); + } + + # Append special entry for additional library (not relocated) + if ($main::opt_lib ne "") { + my $text = ParseTextSectionHeader($main::opt_lib); + if (defined($text)) { + my $start = $text->{vma}; + my $finish = AddressAdd($start, $text->{size}); + + push(@{$result}, [$main::opt_lib, $start, $finish, $start]); + } + } + + # Append special entry for the main program. This covers + # 0..max_pc_value_seen, so that we assume pc values not found in one + # of the library ranges will be treated as coming from the main + # program binary. + my $min_pc = HexExtend("0"); + my $max_pc = $min_pc; # find the maximal PC value in any sample + foreach my $pc (keys(%{$pcs})) { + if (HexExtend($pc) gt $max_pc) { $max_pc = HexExtend($pc); } + } + push(@{$result}, [$prog, $min_pc, $max_pc, $zero_offset]); + + return $result; +} + +# Add two hex addresses of length $address_length. +# Run jeprof --test for unit test if this is changed. +sub AddressAdd { + my $addr1 = shift; + my $addr2 = shift; + my $sum; + + if ($address_length == 8) { + # Perl doesn't cope with wraparound arithmetic, so do it explicitly: + $sum = (hex($addr1)+hex($addr2)) % (0x10000000 * 16); + return sprintf("%08x", $sum); + + } else { + # Do the addition in 7-nibble chunks to trivialize carry handling. 
+ + if ($main::opt_debug and $main::opt_test) { + print STDERR "AddressAdd $addr1 + $addr2 = "; + } + + my $a1 = substr($addr1,-7); + $addr1 = substr($addr1,0,-7); + my $a2 = substr($addr2,-7); + $addr2 = substr($addr2,0,-7); + $sum = hex($a1) + hex($a2); + my $c = 0; + if ($sum > 0xfffffff) { + $c = 1; + $sum -= 0x10000000; + } + my $r = sprintf("%07x", $sum); + + $a1 = substr($addr1,-7); + $addr1 = substr($addr1,0,-7); + $a2 = substr($addr2,-7); + $addr2 = substr($addr2,0,-7); + $sum = hex($a1) + hex($a2) + $c; + $c = 0; + if ($sum > 0xfffffff) { + $c = 1; + $sum -= 0x10000000; + } + $r = sprintf("%07x", $sum) . $r; + + $sum = hex($addr1) + hex($addr2) + $c; + if ($sum > 0xff) { $sum -= 0x100; } + $r = sprintf("%02x", $sum) . $r; + + if ($main::opt_debug and $main::opt_test) { print STDERR "$r\n"; } + + return $r; + } +} + + +# Subtract two hex addresses of length $address_length. +# Run jeprof --test for unit test if this is changed. +sub AddressSub { + my $addr1 = shift; + my $addr2 = shift; + my $diff; + + if ($address_length == 8) { + # Perl doesn't cope with wraparound arithmetic, so do it explicitly: + $diff = (hex($addr1)-hex($addr2)) % (0x10000000 * 16); + return sprintf("%08x", $diff); + + } else { + # Do the addition in 7-nibble chunks to trivialize borrow handling. + # if ($main::opt_debug) { print STDERR "AddressSub $addr1 - $addr2 = "; } + + my $a1 = hex(substr($addr1,-7)); + $addr1 = substr($addr1,0,-7); + my $a2 = hex(substr($addr2,-7)); + $addr2 = substr($addr2,0,-7); + my $b = 0; + if ($a2 > $a1) { + $b = 1; + $a1 += 0x10000000; + } + $diff = $a1 - $a2; + my $r = sprintf("%07x", $diff); + + $a1 = hex(substr($addr1,-7)); + $addr1 = substr($addr1,0,-7); + $a2 = hex(substr($addr2,-7)) + $b; + $addr2 = substr($addr2,0,-7); + $b = 0; + if ($a2 > $a1) { + $b = 1; + $a1 += 0x10000000; + } + $diff = $a1 - $a2; + $r = sprintf("%07x", $diff) . 
$r; + + $a1 = hex($addr1); + $a2 = hex($addr2) + $b; + if ($a2 > $a1) { $a1 += 0x100; } + $diff = $a1 - $a2; + $r = sprintf("%02x", $diff) . $r; + + # if ($main::opt_debug) { print STDERR "$r\n"; } + + return $r; + } +} + +# Increment a hex addresses of length $address_length. +# Run jeprof --test for unit test if this is changed. +sub AddressInc { + my $addr = shift; + my $sum; + + if ($address_length == 8) { + # Perl doesn't cope with wraparound arithmetic, so do it explicitly: + $sum = (hex($addr)+1) % (0x10000000 * 16); + return sprintf("%08x", $sum); + + } else { + # Do the addition in 7-nibble chunks to trivialize carry handling. + # We are always doing this to step through the addresses in a function, + # and will almost never overflow the first chunk, so we check for this + # case and exit early. + + # if ($main::opt_debug) { print STDERR "AddressInc $addr1 = "; } + + my $a1 = substr($addr,-7); + $addr = substr($addr,0,-7); + $sum = hex($a1) + 1; + my $r = sprintf("%07x", $sum); + if ($sum <= 0xfffffff) { + $r = $addr . $r; + # if ($main::opt_debug) { print STDERR "$r\n"; } + return HexExtend($r); + } else { + $r = "0000000"; + } + + $a1 = substr($addr,-7); + $addr = substr($addr,0,-7); + $sum = hex($a1) + 1; + $r = sprintf("%07x", $sum) . $r; + if ($sum <= 0xfffffff) { + $r = $addr . $r; + # if ($main::opt_debug) { print STDERR "$r\n"; } + return HexExtend($r); + } else { + $r = "00000000000000"; + } + + $sum = hex($addr) + 1; + if ($sum > 0xff) { $sum -= 0x100; } + $r = sprintf("%02x", $sum) . $r; + + # if ($main::opt_debug) { print STDERR "$r\n"; } + return $r; + } +} + +# Extract symbols for all PC values found in profile +sub ExtractSymbols { + my $libs = shift; + my $pcset = shift; + + my $symbols = {}; + + # Map each PC value to the containing library. To make this faster, + # we sort libraries by their starting pc value (highest first), and + # advance through the libraries as we advance the pc. 
Sometimes the + # addresses of libraries may overlap with the addresses of the main + # binary, so to make sure the libraries 'win', we iterate over the + # libraries in reverse order (which assumes the binary doesn't start + # in the middle of a library, which seems a fair assumption). + my @pcs = (sort { $a cmp $b } keys(%{$pcset})); # pcset is 0-extended strings + foreach my $lib (sort {$b->[1] cmp $a->[1]} @{$libs}) { + my $libname = $lib->[0]; + my $start = $lib->[1]; + my $finish = $lib->[2]; + my $offset = $lib->[3]; + + # Use debug library if it exists + my $debug_libname = DebuggingLibrary($libname); + if ($debug_libname) { + $libname = $debug_libname; + } + + # Get list of pcs that belong in this library. + my $contained = []; + my ($start_pc_index, $finish_pc_index); + # Find smallest finish_pc_index such that $finish < $pc[$finish_pc_index]. + for ($finish_pc_index = $#pcs + 1; $finish_pc_index > 0; + $finish_pc_index--) { + last if $pcs[$finish_pc_index - 1] le $finish; + } + # Find smallest start_pc_index such that $start <= $pc[$start_pc_index]. + for ($start_pc_index = $finish_pc_index; $start_pc_index > 0; + $start_pc_index--) { + last if $pcs[$start_pc_index - 1] lt $start; + } + # This keeps PC values higher than $pc[$finish_pc_index] in @pcs, + # in case there are overlaps in libraries and the main binary. 
+ @{$contained} = splice(@pcs, $start_pc_index, + $finish_pc_index - $start_pc_index); + # Map to symbols + MapToSymbols($libname, AddressSub($start, $offset), $contained, $symbols); + } + + return $symbols; +} + +# Map list of PC values to symbols for a given image +sub MapToSymbols { + my $image = shift; + my $offset = shift; + my $pclist = shift; + my $symbols = shift; + + my $debug = 0; + + # Ignore empty binaries + if ($#{$pclist} < 0) { return; } + + # Figure out the addr2line command to use + my $addr2line = $obj_tool_map{"addr2line"}; + my $cmd = ShellEscape($addr2line, "-f", "-C", "-e", $image); + if (exists $obj_tool_map{"addr2line_pdb"}) { + $addr2line = $obj_tool_map{"addr2line_pdb"}; + $cmd = ShellEscape($addr2line, "--demangle", "-f", "-C", "-e", $image); + } + + # If "addr2line" isn't installed on the system at all, just use + # nm to get what info we can (function names, but not line numbers). + if (system(ShellEscape($addr2line, "--help") . " >$dev_null 2>&1") != 0) { + MapSymbolsWithNM($image, $offset, $pclist, $symbols); + return; + } + + # "addr2line -i" can produce a variable number of lines per input + # address, with no separator that allows us to tell when data for + # the next address starts. So we find the address for a special + # symbol (_fini) and interleave this address between all real + # addresses passed to addr2line. The name of this special symbol + # can then be used as a separator. + $sep_address = undef; # May be filled in by MapSymbolsWithNM() + my $nm_symbols = {}; + MapSymbolsWithNM($image, $offset, $pclist, $nm_symbols); + if (defined($sep_address)) { + # Only add " -i" to addr2line if the binary supports it. + # addr2line --help returns 0, but not if it sees an unknown flag first. 
+ if (system("$cmd -i --help >$dev_null 2>&1") == 0) { + $cmd .= " -i"; + } else { + $sep_address = undef; # no need for sep_address if we don't support -i + } + } + + # Make file with all PC values with intervening 'sep_address' so + # that we can reliably detect the end of inlined function list + open(ADDRESSES, ">$main::tmpfile_sym") || error("$main::tmpfile_sym: $!\n"); + if ($debug) { print("---- $image ---\n"); } + for (my $i = 0; $i <= $#{$pclist}; $i++) { + # addr2line always reads hex addresses, and does not need '0x' prefix. + if ($debug) { printf STDERR ("%s\n", $pclist->[$i]); } + printf ADDRESSES ("%s\n", AddressSub($pclist->[$i], $offset)); + if (defined($sep_address)) { + printf ADDRESSES ("%s\n", $sep_address); + } + } + close(ADDRESSES); + if ($debug) { + print("----\n"); + system("cat", $main::tmpfile_sym); + print("----\n"); + system("$cmd < " . ShellEscape($main::tmpfile_sym)); + print("----\n"); + } + + open(SYMBOLS, "$cmd <" . ShellEscape($main::tmpfile_sym) . " |") + || error("$cmd: $!\n"); + my $count = 0; # Index in pclist + while () { + # Read fullfunction and filelineinfo from next pair of lines + s/\r?\n$//g; + my $fullfunction = $_; + $_ = ; + s/\r?\n$//g; + my $filelinenum = $_; + + if (defined($sep_address) && $fullfunction eq $sep_symbol) { + # Terminating marker for data for this address + $count++; + next; + } + + $filelinenum =~ s|\\|/|g; # turn windows-style paths into unix-style paths + + my $pcstr = $pclist->[$count]; + my $function = ShortFunctionName($fullfunction); + my $nms = $nm_symbols->{$pcstr}; + if (defined($nms)) { + if ($fullfunction eq '??') { + # nm found a symbol for us. + $function = $nms->[0]; + $fullfunction = $nms->[2]; + } else { + # MapSymbolsWithNM tags each routine with its starting address, + # useful in case the image has multiple occurrences of this + # routine. (It uses a syntax that resembles template parameters, + # that are automatically stripped out by ShortFunctionName().) 
+ # addr2line does not provide the same information. So we check + # if nm disambiguated our symbol, and if so take the annotated + # (nm) version of the routine-name. TODO(csilvers): this won't + # catch overloaded, inlined symbols, which nm doesn't see. + # Better would be to do a check similar to nm's, in this fn. + if ($nms->[2] =~ m/^\Q$function\E/) { # sanity check it's the right fn + $function = $nms->[0]; + $fullfunction = $nms->[2]; + } + } + } + + # Prepend to accumulated symbols for pcstr + # (so that caller comes before callee) + my $sym = $symbols->{$pcstr}; + if (!defined($sym)) { + $sym = []; + $symbols->{$pcstr} = $sym; + } + unshift(@{$sym}, $function, $filelinenum, $fullfunction); + if ($debug) { printf STDERR ("%s => [%s]\n", $pcstr, join(" ", @{$sym})); } + if (!defined($sep_address)) { + # Inlining is off, so this entry ends immediately + $count++; + } + } + close(SYMBOLS); +} + +# Use nm to map the list of referenced PCs to symbols. Return true iff we +# are able to read procedure information via nm. +sub MapSymbolsWithNM { + my $image = shift; + my $offset = shift; + my $pclist = shift; + my $symbols = shift; + + # Get nm output sorted by increasing address + my $symbol_table = GetProcedureBoundaries($image, "."); + if (!%{$symbol_table}) { + return 0; + } + # Start addresses are already the right length (8 or 16 hex digits). + my @names = sort { $symbol_table->{$a}->[0] cmp $symbol_table->{$b}->[0] } + keys(%{$symbol_table}); + + if ($#names < 0) { + # No symbols: just use addresses + foreach my $pc (@{$pclist}) { + my $pcstr = "0x" . 
$pc; + $symbols->{$pc} = [$pcstr, "?", $pcstr]; + } + return 0; + } + + # Sort addresses so we can do a join against nm output + my $index = 0; + my $fullname = $names[0]; + my $name = ShortFunctionName($fullname); + foreach my $pc (sort { $a cmp $b } @{$pclist}) { + # Adjust for mapped offset + my $mpc = AddressSub($pc, $offset); + while (($index < $#names) && ($mpc ge $symbol_table->{$fullname}->[1])){ + $index++; + $fullname = $names[$index]; + $name = ShortFunctionName($fullname); + } + if ($mpc lt $symbol_table->{$fullname}->[1]) { + $symbols->{$pc} = [$name, "?", $fullname]; + } else { + my $pcstr = "0x" . $pc; + $symbols->{$pc} = [$pcstr, "?", $pcstr]; + } + } + return 1; +} + +sub ShortFunctionName { + my $function = shift; + while ($function =~ s/\([^()]*\)(\s*const)?//g) { } # Argument types + while ($function =~ s/<[^<>]*>//g) { } # Remove template arguments + $function =~ s/^.*\s+(\w+::)/$1/; # Remove leading type + return $function; +} + +# Trim overly long symbols found in disassembler output +sub CleanDisassembly { + my $d = shift; + while ($d =~ s/\([^()%]*\)(\s*const)?//g) { } # Argument types, not (%rax) + while ($d =~ s/(\w+)<[^<>]*>/$1/g) { } # Remove template arguments + return $d; +} + +# Clean file name for display +sub CleanFileName { + my ($f) = @_; + $f =~ s|^/proc/self/cwd/||; + $f =~ s|^\./||; + return $f; +} + +# Make address relative to section and clean up for display +sub UnparseAddress { + my ($offset, $address) = @_; + $address = AddressSub($address, $offset); + $address =~ s/^0x//; + $address =~ s/^0*//; + return $address; +} + +##### Miscellaneous ##### + +# Find the right versions of the above object tools to use. The +# argument is the program file being analyzed, and should be an ELF +# 32-bit or ELF 64-bit executable file. 
The location of the tools +# is determined by considering the following options in this order: +# 1) --tools option, if set +# 2) JEPROF_TOOLS environment variable, if set +# 3) the environment +sub ConfigureObjTools { + my $prog_file = shift; + + # Check for the existence of $prog_file because /usr/bin/file does not + # predictably return error status in prod. + (-e $prog_file) || error("$prog_file does not exist.\n"); + + my $file_type = undef; + if (-e "/usr/bin/file") { + # Follow symlinks (at least for systems where "file" supports that). + my $escaped_prog_file = ShellEscape($prog_file); + $file_type = `/usr/bin/file -L $escaped_prog_file 2>$dev_null || + /usr/bin/file $escaped_prog_file`; + } elsif ($^O eq "MSWin32") { + # String equality is required here: a numeric '==' turns both sides + # into 0 and would match on every platform. + $file_type = "MS Windows"; + } else { + print STDERR "WARNING: Can't determine the file type of $prog_file"; + } + + if ($file_type =~ /64-bit/) { + # Change $address_length to 16 if the program file is ELF 64-bit. + # We can't detect this from many (most?) heap or lock contention + # profiles, since the actual addresses referenced are generally in low + # memory even for 64-bit programs. + $address_length = 16; + } + + if ($file_type =~ /MS Windows/) { + # For windows, we provide a version of nm and addr2line as part of + # the opensource release, which is capable of parsing + # Windows-style PDB executables. It should live in the path, or + # in the same directory as jeprof. + $obj_tool_map{"nm_pdb"} = "nm-pdb"; + $obj_tool_map{"addr2line_pdb"} = "addr2line-pdb"; + } + + if ($file_type =~ /Mach-O/) { + # OS X uses otool to examine Mach-O files, rather than objdump. + $obj_tool_map{"otool"} = "otool"; + $obj_tool_map{"addr2line"} = "false"; # no addr2line + $obj_tool_map{"objdump"} = "false"; # no objdump + } + + # Go fill in %obj_tool_map with the pathnames to use: + foreach my $tool (keys %obj_tool_map) { + $obj_tool_map{$tool} = ConfigureTool($obj_tool_map{$tool}); + } +} + +# Returns the path of a caller-specified object tool.
If --tools or +# JEPROF_TOOLS are specified, then returns the full path to the tool +# with that prefix. Otherwise, returns the path unmodified (which +# means we will look for it on PATH). +sub ConfigureTool { + my $tool = shift; + my $path; + + # --tools (or $JEPROF_TOOLS) is a comma separated list, where each + # item is either a) a pathname prefix, or b) a map of the form + # <tool>:<path>. First we look for an entry of type (b) for our + # tool. If one is found, we use it. Otherwise, we consider all the + # pathname prefixes in turn, until one yields an existing file. If + # none does, we use a default path. + my $tools = $main::opt_tools || $ENV{"JEPROF_TOOLS"} || ""; + if ($tools =~ m/(,|^)\Q$tool\E:([^,]*)/) { + $path = $2; + # TODO(csilvers): sanity-check that $path exists? Hard if it's relative. + } elsif ($tools ne '') { + foreach my $prefix (split(',', $tools)) { + next if ($prefix =~ /:/); # ignore "tool:fullpath" entries in the list + if (-x $prefix . $tool) { + $path = $prefix . $tool; + last; + } + } + if (!$path) { + error("No '$tool' found with prefix specified by " . + "--tools (or \$JEPROF_TOOLS) '$tools'\n"); + } + } else { + # ... otherwise use the version that exists in the same directory as + # jeprof. If there's nothing there, use $PATH. + $0 =~ m,[^/]*$,; # this is everything after the last slash + my $dirname = $`; # this is everything up to and including the last slash + if (-x "$dirname$tool") { + $path = "$dirname$tool"; + } else { + $path = $tool; + } + } + if ($main::opt_debug) { print STDERR "Using '$path' for '$tool'.\n"; } + return $path; +} + +# Quotes each argument for interpolation into a shell command line and +# returns the words joined by single spaces. A word containing any +# character outside the safe set is wrapped in single quotes, with every +# embedded single quote rewritten as '\''. +sub ShellEscape { + my @escaped_words = (); + foreach my $word (@_) { + my $escaped_word = $word; + if ($word =~ m![^a-zA-Z0-9/.,_=-]!)
{ # check for anything not in whitelist + # /g is essential: every quote must be escaped, not just the first. + $escaped_word =~ s/'/'\\''/g; + $escaped_word = "'$escaped_word'"; + } + push(@escaped_words, $escaped_word); + } + return join(" ", @escaped_words); +} + +sub cleanup { + unlink($main::tmpfile_sym); + unlink(keys %main::tempnames); + + # We leave any collected profiles in $HOME/jeprof in case the user wants + # to look at them later. We print a message informing them of this. + if ((scalar(@main::profile_files) > 0) && + defined($main::collected_profile)) { + if (scalar(@main::profile_files) == 1) { + print STDERR "Dynamically gathered profile is in $main::collected_profile\n"; + } + print STDERR "If you want to investigate this profile further, you can do:\n"; + print STDERR "\n"; + print STDERR " jeprof \\\n"; + print STDERR " $main::prog \\\n"; + print STDERR " $main::collected_profile\n"; + print STDERR "\n"; + } +} + +sub sighandler { + cleanup(); + exit(1); +} + +sub error { + my $msg = shift; + print STDERR $msg; + cleanup(); + exit(1); +} + + +# Run $nm_command and get all the resulting procedure boundaries whose +# names match "$regexp" and returns them in a hashtable mapping from +# procedure name to a two-element vector of [start address, end address] +sub GetProcedureBoundariesViaNm { + my $escaped_nm_command = shift; # shell-escaped + my $regexp = shift; + + my $symbol_table = {}; + open(NM, "$escaped_nm_command |") || error("$escaped_nm_command: $!\n"); + my $last_start = "0"; + my $routine = ""; + while (<NM>) { + s/\r//g; # turn windows-looking lines into unix-looking lines + if (m/^\s*([0-9a-f]+) (.) (..*)/) { + my $start_val = $1; + my $type = $2; + my $this_routine = $3; + + # It's possible for two symbols to share the same address, if + # one is a zero-length variable (like __start_google_malloc) or + # one symbol is a weak alias to another (like __libc_malloc). + # In such cases, we want to ignore all values except for the + # actual symbol, which in nm-speak has type "T".
The logic + # below does this, though it's a bit tricky: what happens when + # we have a series of lines with the same address, is the first + # one gets queued up to be processed. However, it won't + # *actually* be processed until later, when we read a line with + # a different address. That means that as long as we're reading + # lines with the same address, we have a chance to replace that + # item in the queue, which we do whenever we see a 'T' entry -- + # that is, a line with type 'T'. If we never see a 'T' entry, + # we'll just go ahead and process the first entry (which never + # got touched in the queue), and ignore the others. + if ($start_val eq $last_start && $type =~ /t/i) { + # We are the 'T' symbol at this address, replace previous symbol. + $routine = $this_routine; + next; + } elsif ($start_val eq $last_start) { + # We're not the 'T' symbol at this address, so ignore us. + next; + } + + if ($this_routine eq $sep_symbol) { + $sep_address = HexExtend($start_val); + } + + # Tag this routine with the starting address in case the image + # has multiple occurrences of this routine. We use a syntax + # that resembles template parameters that are automatically + # stripped out by ShortFunctionName() + $this_routine .= "<$start_val>"; + + if (defined($routine) && $routine =~ m/$regexp/) { + $symbol_table->{$routine} = [HexExtend($last_start), + HexExtend($start_val)]; + } + $last_start = $start_val; + $routine = $this_routine; + } elsif (m/^Loaded image name: (.+)/) { + # The win32 nm workalike emits information about the binary it is using. + if ($main::opt_debug) { print STDERR "Using Image $1\n"; } + } elsif (m/^PDB file name: (.+)/) { + # The win32 nm workalike emits information about the pdb it is using. + if ($main::opt_debug) { print STDERR "Using PDB $1\n"; } + } + } + close(NM); + # Handle the last line in the nm output. Unfortunately, we don't know + # how big this last symbol is, because we don't know how big the file + # is.
For now, we just give it a size of 0. + # TODO(csilvers): do better here. + if (defined($routine) && $routine =~ m/$regexp/) { + $symbol_table->{$routine} = [HexExtend($last_start), + HexExtend($last_start)]; + } + return $symbol_table; +} + +# Gets the procedure boundaries for all routines in "$image" whose names +# match "$regexp" and returns them in a hashtable mapping from procedure +# name to a two-element vector of [start address, end address]. +# Will return an empty map if nm is not installed or not working properly. +sub GetProcedureBoundaries { + my $image = shift; + my $regexp = shift; + + # If $image doesn't start with /, then put ./ in front of it. This works + # around an obnoxious bug in our probing of nm -f behavior. + # "nm -f $image" is supposed to fail on GNU nm, but if: + # + # a. $image starts with [BbSsPp] (for example, bin/foo/bar), AND + # b. you have a.out in your current directory (a not uncommon occurrence) + # + # then "nm -f $image" succeeds because -f only looks at the first letter of + # the argument, which looks valid because it's [BbSsPp], and then since + # there's no image provided, it looks for a.out and finds it. + # + # This regex makes sure that $image starts with . or /, forcing the -f + # parsing to fail since . and / are not valid formats. + $image =~ s#^[^/]#./$&#; + + # For libc libraries, the copy in /usr/lib/debug contains debugging symbols + my $debugging = DebuggingLibrary($image); + if ($debugging) { + $image = $debugging; + } + + my $nm = $obj_tool_map{"nm"}; + my $cppfilt = $obj_tool_map{"c++filt"}; + + # nm can fail for two reasons: 1) $image isn't a debug library; 2) nm + # binary doesn't support --demangle. In addition, for OS X we need + # to use the -f flag to get 'flat' nm output (otherwise we don't sort + # properly and get incorrect results). Unfortunately, GNU nm uses -f + # in an incompatible way. So first we test whether our nm supports + # --demangle and -f.
+ my $demangle_flag = ""; + my $cppfilt_flag = ""; + my $to_devnull = ">$dev_null 2>&1"; + if (system(ShellEscape($nm, "--demangle", $image) . $to_devnull) == 0) { + # In this mode, we do "nm --demangle <image>" + $demangle_flag = "--demangle"; + $cppfilt_flag = ""; + } elsif (system(ShellEscape($cppfilt, $image) . $to_devnull) == 0) { + # In this mode, we do "nm <image> | c++filt" + $cppfilt_flag = " | " . ShellEscape($cppfilt); + }; + my $flatten_flag = ""; + if (system(ShellEscape($nm, "-f", $image) . $to_devnull) == 0) { + $flatten_flag = "-f"; + } + + # Finally, in the case $image isn't a debug library, we try again with + # -D to at least get *exported* symbols. If we can't use --demangle, + # we use c++filt instead, if it exists on this system. + my @nm_commands = (ShellEscape($nm, "-n", $flatten_flag, $demangle_flag, + $image) . " 2>$dev_null $cppfilt_flag", + ShellEscape($nm, "-D", "-n", $flatten_flag, $demangle_flag, + $image) . " 2>$dev_null $cppfilt_flag", + # 6nm is for Go binaries + ShellEscape("6nm", "$image") . " 2>$dev_null | sort", + ); + + # If the executable is an MS Windows PDB-format executable, we'll + # have set up obj_tool_map("nm_pdb"). In this case, we actually + # want to use both unix nm and windows-specific nm_pdb, since + # PDB-format executables can apparently include dwarf .o files. + if (exists $obj_tool_map{"nm_pdb"}) { + push(@nm_commands, + ShellEscape($obj_tool_map{"nm_pdb"}, "--demangle", $image) + . " 2>$dev_null"); + } + + foreach my $nm_command (@nm_commands) { + my $symbol_table = GetProcedureBoundariesViaNm($nm_command, $regexp); + return $symbol_table if (%{$symbol_table}); + } + my $symbol_table = {}; + return $symbol_table; +} + + +# The test vectors for AddressAdd/Sub/Inc are 8-16-nibble hex strings. +# To make them more readable, we add underscores at interesting places. +# This routine removes the underscores, producing the canonical representation +# used by jeprof to represent addresses, particularly in the tested routines.
+sub CanonicalHex { + my $arg = shift; + return join '', (split '_',$arg); +} + + +# Unit test for AddressAdd: +sub AddressAddUnitTest { + my $test_data_8 = shift; + my $test_data_16 = shift; + my $error_count = 0; + my $fail_count = 0; + my $pass_count = 0; + # print STDERR "AddressAddUnitTest: ", 1+$#{$test_data_8}, " tests\n"; + + # First a few 8-nibble addresses. Note that this implementation uses + # plain old arithmetic, so a quick sanity check along with verifying what + # happens to overflow (we want it to wrap): + $address_length = 8; + foreach my $row (@{$test_data_8}) { + if ($main::opt_debug and $main::opt_test) { print STDERR "@{$row}\n"; } + my $sum = AddressAdd ($row->[0], $row->[1]); + if ($sum ne $row->[2]) { + printf STDERR "ERROR: %s != %s + %s = %s\n", $sum, + $row->[0], $row->[1], $row->[2]; + ++$fail_count; + } else { + ++$pass_count; + } + } + printf STDERR "AddressAdd 32-bit tests: %d passes, %d failures\n", + $pass_count, $fail_count; + $error_count = $fail_count; + $fail_count = 0; + $pass_count = 0; + + # Now 16-nibble addresses. + $address_length = 16; + foreach my $row (@{$test_data_16}) { + if ($main::opt_debug and $main::opt_test) { print STDERR "@{$row}\n"; } + my $sum = AddressAdd (CanonicalHex($row->[0]), CanonicalHex($row->[1])); + my $expected = join '', (split '_',$row->[2]); + if ($sum ne CanonicalHex($row->[2])) { + printf STDERR "ERROR: %s != %s + %s = %s\n", $sum, + $row->[0], $row->[1], $row->[2]; + ++$fail_count; + } else { + ++$pass_count; + } + } + printf STDERR "AddressAdd 64-bit tests: %d passes, %d failures\n", + $pass_count, $fail_count; + $error_count += $fail_count; + + return $error_count; +} + + +# Unit test for AddressSub: +sub AddressSubUnitTest { + my $test_data_8 = shift; + my $test_data_16 = shift; + my $error_count = 0; + my $fail_count = 0; + my $pass_count = 0; + # print STDERR "AddressSubUnitTest: ", 1+$#{$test_data_8}, " tests\n"; + + # First a few 8-nibble addresses. 
Note that this implementation uses + # plain old arithmetic, so a quick sanity check along with verifying what + # happens to overflow (we want it to wrap): + $address_length = 8; + foreach my $row (@{$test_data_8}) { + if ($main::opt_debug and $main::opt_test) { print STDERR "@{$row}\n"; } + my $sum = AddressSub ($row->[0], $row->[1]); + if ($sum ne $row->[3]) { + printf STDERR "ERROR: %s != %s - %s = %s\n", $sum, + $row->[0], $row->[1], $row->[3]; + ++$fail_count; + } else { + ++$pass_count; + } + } + printf STDERR "AddressSub 32-bit tests: %d passes, %d failures\n", + $pass_count, $fail_count; + $error_count = $fail_count; + $fail_count = 0; + $pass_count = 0; + + # Now 16-nibble addresses. + $address_length = 16; + foreach my $row (@{$test_data_16}) { + if ($main::opt_debug and $main::opt_test) { print STDERR "@{$row}\n"; } + my $sum = AddressSub (CanonicalHex($row->[0]), CanonicalHex($row->[1])); + if ($sum ne CanonicalHex($row->[3])) { + printf STDERR "ERROR: %s != %s - %s = %s\n", $sum, + $row->[0], $row->[1], $row->[3]; + ++$fail_count; + } else { + ++$pass_count; + } + } + printf STDERR "AddressSub 64-bit tests: %d passes, %d failures\n", + $pass_count, $fail_count; + $error_count += $fail_count; + + return $error_count; +} + + +# Unit test for AddressInc: +sub AddressIncUnitTest { + my $test_data_8 = shift; + my $test_data_16 = shift; + my $error_count = 0; + my $fail_count = 0; + my $pass_count = 0; + # print STDERR "AddressIncUnitTest: ", 1+$#{$test_data_8}, " tests\n"; + + # First a few 8-nibble addresses. 
Note that this implementation uses + # plain old arithmetic, so a quick sanity check along with verifying what + # happens to overflow (we want it to wrap): + $address_length = 8; + foreach my $row (@{$test_data_8}) { + if ($main::opt_debug and $main::opt_test) { print STDERR "@{$row}\n"; } + my $sum = AddressInc ($row->[0]); + if ($sum ne $row->[4]) { + printf STDERR "ERROR: %s != %s + 1 = %s\n", $sum, + $row->[0], $row->[4]; + ++$fail_count; + } else { + ++$pass_count; + } + } + printf STDERR "AddressInc 32-bit tests: %d passes, %d failures\n", + $pass_count, $fail_count; + $error_count = $fail_count; + $fail_count = 0; + $pass_count = 0; + + # Now 16-nibble addresses. + $address_length = 16; + foreach my $row (@{$test_data_16}) { + if ($main::opt_debug and $main::opt_test) { print STDERR "@{$row}\n"; } + my $sum = AddressInc (CanonicalHex($row->[0])); + if ($sum ne CanonicalHex($row->[4])) { + printf STDERR "ERROR: %s != %s + 1 = %s\n", $sum, + $row->[0], $row->[4]; + ++$fail_count; + } else { + ++$pass_count; + } + } + printf STDERR "AddressInc 64-bit tests: %d passes, %d failures\n", + $pass_count, $fail_count; + $error_count += $fail_count; + + return $error_count; +} + + +# Driver for unit tests. +# Currently just the address add/subtract/increment routines for 64-bit. +sub RunUnitTests { + my $error_count = 0; + + # This is a list of tuples [a, b, a+b, a-b, a+1] + my $unit_test_data_8 = [ + [qw(aaaaaaaa 50505050 fafafafa 5a5a5a5a aaaaaaab)], + [qw(50505050 aaaaaaaa fafafafa a5a5a5a6 50505051)], + [qw(ffffffff aaaaaaaa aaaaaaa9 55555555 00000000)], + [qw(00000001 ffffffff 00000000 00000002 00000002)], + [qw(00000001 fffffff0 fffffff1 00000011 00000002)], + ]; + my $unit_test_data_16 = [ + # The implementation handles data in 7-nibble chunks, so those are the + # interesting boundaries. 
+ [qw(aaaaaaaa 50505050 + 00_000000f_afafafa 00_0000005_a5a5a5a 00_000000a_aaaaaab)], + [qw(50505050 aaaaaaaa + 00_000000f_afafafa ff_ffffffa_5a5a5a6 00_0000005_0505051)], + [qw(ffffffff aaaaaaaa + 00_000001a_aaaaaa9 00_0000005_5555555 00_0000010_0000000)], + [qw(00000001 ffffffff + 00_0000010_0000000 ff_ffffff0_0000002 00_0000000_0000002)], + [qw(00000001 fffffff0 + 00_000000f_ffffff1 ff_ffffff0_0000011 00_0000000_0000002)], + + [qw(00_a00000a_aaaaaaa 50505050 + 00_a00000f_afafafa 00_a000005_a5a5a5a 00_a00000a_aaaaaab)], + [qw(0f_fff0005_0505050 aaaaaaaa + 0f_fff000f_afafafa 0f_ffefffa_5a5a5a6 0f_fff0005_0505051)], + [qw(00_000000f_fffffff 01_800000a_aaaaaaa + 01_800001a_aaaaaa9 fe_8000005_5555555 00_0000010_0000000)], + [qw(00_0000000_0000001 ff_fffffff_fffffff + 00_0000000_0000000 00_0000000_0000002 00_0000000_0000002)], + [qw(00_0000000_0000001 ff_fffffff_ffffff0 + ff_fffffff_ffffff1 00_0000000_0000011 00_0000000_0000002)], + ]; + + $error_count += AddressAddUnitTest($unit_test_data_8, $unit_test_data_16); + $error_count += AddressSubUnitTest($unit_test_data_8, $unit_test_data_16); + $error_count += AddressIncUnitTest($unit_test_data_8, $unit_test_data_16); + if ($error_count > 0) { + print STDERR $error_count, " errors: FAILED\n"; + } else { + print STDERR "PASS\n"; + } + exit ($error_count); +} \ No newline at end of file diff --git a/src/server/status_server/metrics.rs b/src/server/status_server/metrics.rs new file mode 100644 index 00000000000..9786ebd0a10 --- /dev/null +++ b/src/server/status_server/metrics.rs @@ -0,0 +1,13 @@ +// Copyright 2023 TiKV Project Authors. Licensed under Apache-2.0. + +use prometheus::{exponential_buckets, register_histogram_vec, HistogramVec}; + +lazy_static::lazy_static! 
{ + pub static ref STATUS_REQUEST_DURATION: HistogramVec = register_histogram_vec!( + "tikv_status_server_request_duration_seconds", + "Bucketed histogram of TiKV status server request duration", + &["method", "path"], + exponential_buckets(0.0001, 2.0, 24).unwrap() // 0.1ms ~ 1677.7s + ) + .unwrap(); +} diff --git a/src/server/status_server/mod.rs b/src/server/status_server/mod.rs index 679f21fdf6c..90c966d13e2 100644 --- a/src/server/status_server/mod.rs +++ b/src/server/status_server/mod.rs @@ -1,11 +1,13 @@ // Copyright 2018 TiKV Project Authors. Licensed under Apache-2.0. +mod metrics; /// Provides profilers for TiKV. mod profile; + use std::{ + env::args, error::Error as StdError, net::SocketAddr, - path::PathBuf, pin::Pin, str::{self, FromStr}, sync::Arc, @@ -17,7 +19,7 @@ use async_stream::stream; use collections::HashMap; use flate2::{write::GzEncoder, Compression}; use futures::{ - compat::{Compat01As03, Stream01CompatExt}, + compat::Compat01As03, future::{ok, poll_fn}, prelude::*, }; @@ -33,16 +35,14 @@ use hyper::{ Body, Method, Request, Response, Server, StatusCode, }; use kvproto::resource_manager::ResourceGroup; +use metrics::STATUS_REQUEST_DURATION; use online_config::OnlineConfig; use openssl::{ ssl::{Ssl, SslAcceptor, SslContext, SslFiletype, SslMethod, SslVerifyMode}, x509::X509, }; use pin_project::pin_project; -pub use profile::{ - activate_heap_profile, deactivate_heap_profile, jeprof_heap_profile, list_heap_profiles, - read_file, start_one_cpu_profile, start_one_heap_profile, -}; +use profile::*; use prometheus::TEXT_FORMAT; use regex::Regex; use resource_control::ResourceGroupManager; @@ -58,7 +58,7 @@ use tikv_util::{ }; use tokio::{ io::{AsyncRead, AsyncWrite}, - runtime::{Builder, Handle, Runtime}, + runtime::{Builder, Runtime}, sync::oneshot::{self, Receiver, Sender}, }; use tokio_openssl::SslStream; @@ -92,7 +92,6 @@ pub struct StatusServer { cfg_controller: ConfigController, router: R, security_config: Arc, - store_path: PathBuf, 
resource_manager: Option>, grpc_service_mgr: GrpcServiceManager, } @@ -106,7 +105,6 @@ where cfg_controller: ConfigController, security_config: Arc, router: R, - store_path: PathBuf, resource_manager: Option>, grpc_service_mgr: GrpcServiceManager, ) -> Result { @@ -129,105 +127,28 @@ where cfg_controller, router, security_config, - store_path, resource_manager, grpc_service_mgr, }) } - fn list_heap_prof(_req: Request) -> hyper::Result> { - let profiles = match list_heap_profiles() { - Ok(s) => s, - Err(e) => return Ok(make_response(StatusCode::INTERNAL_SERVER_ERROR, e)), - }; - - let text = profiles - .into_iter() - .map(|(f, ct)| format!("{}\t\t{}", f, ct)) - .collect::>() - .join("\n") - .into_bytes(); - - let response = Response::builder() - .header("Content-Type", mime::TEXT_PLAIN.to_string()) - .header("Content-Length", text.len()) - .body(text.into()) - .unwrap(); - Ok(response) - } - - async fn activate_heap_prof( - req: Request, - store_path: PathBuf, - ) -> hyper::Result> { - let query = req.uri().query().unwrap_or(""); - let query_pairs: HashMap<_, _> = url::form_urlencoded::parse(query.as_bytes()).collect(); - - let interval: u64 = match query_pairs.get("interval") { - Some(val) => match val.parse() { - Ok(val) => val, - Err(err) => return Ok(make_response(StatusCode::BAD_REQUEST, err.to_string())), - }, - None => 60, - }; - - let interval = Duration::from_secs(interval); - let period = GLOBAL_TIMER_HANDLE - .interval(Instant::now() + interval, interval) - .compat() - .map_ok(|_| ()) - .map_err(|_| TIMER_CANCELED.to_owned()) - .into_stream(); - let (tx, rx) = oneshot::channel(); - let callback = move || tx.send(()).unwrap_or_default(); - let res = Handle::current().spawn(activate_heap_profile(period, store_path, callback)); - if rx.await.is_ok() { - let msg = "activate heap profile success"; - Ok(make_response(StatusCode::OK, msg)) - } else { - let errmsg = format!("{:?}", res.await); - Ok(make_response(StatusCode::INTERNAL_SERVER_ERROR, errmsg)) - } - } 
- - fn deactivate_heap_prof(_req: Request) -> hyper::Result> { - let body = if deactivate_heap_profile() { - "deactivate heap profile success" - } else { - "no heap profile is running" - }; - Ok(make_response(StatusCode::OK, body)) - } - - #[allow(dead_code)] async fn dump_heap_prof_to_resp(req: Request) -> hyper::Result> { let query = req.uri().query().unwrap_or(""); let query_pairs: HashMap<_, _> = url::form_urlencoded::parse(query.as_bytes()).collect(); let use_jeprof = query_pairs.get("jeprof").map(|x| x.as_ref()) == Some("true"); - let result = if let Some(name) = query_pairs.get("name") { + let result = { + let file = match dump_one_heap_profile() { + Ok(file) => file, + Err(e) => return Ok(make_response(StatusCode::INTERNAL_SERVER_ERROR, e)), + }; + let path = file.path(); if use_jeprof { - jeprof_heap_profile(name) + jeprof_heap_profile(path.to_str().unwrap()) } else { - read_file(name) + read_file(path.to_str().unwrap()) } - } else { - let mut seconds = 10; - if let Some(s) = query_pairs.get("seconds") { - match s.parse() { - Ok(val) => seconds = val, - Err(_) => { - let errmsg = "request should have seconds argument".to_owned(); - return Ok(make_response(StatusCode::BAD_REQUEST, errmsg)); - } - } - } - let timer = GLOBAL_TIMER_HANDLE.delay(Instant::now() + Duration::from_secs(seconds)); - let end = Compat01As03::new(timer) - .map_err(|_| TIMER_CANCELED.to_owned()) - .into_future(); - start_one_heap_profile(end, use_jeprof).await }; match result { @@ -283,11 +204,100 @@ where }) } + async fn get_cmdline(_req: Request) -> hyper::Result> { + let args = args().into_iter().fold(String::new(), |mut a, b| { + a.push_str(&b); + a.push('\x00'); + a + }); + let response = Response::builder() + .header("Content-Type", mime::TEXT_PLAIN.to_string()) + .header("X-Content-Type-Options", "nosniff") + .body(args.into()) + .unwrap(); + Ok(response) + } + + async fn get_symbol_count(req: Request) -> hyper::Result> { + assert_eq!(req.method(), Method::GET); + // We don't 
know how many symbols we have, but we + // do have symbol information. pprof only cares whether + // this number is 0 (no symbols available) or > 0. + let text = "num_symbols: 1\n"; + let response = Response::builder() + .header("Content-Type", mime::TEXT_PLAIN.to_string()) + .header("X-Content-Type-Options", "nosniff") + .header("Content-Length", text.len()) + .body(text.into()) + .unwrap(); + Ok(response) + } + + // The request and response format follows pprof remote server + // https://gperftools.github.io/gperftools/pprof_remote_servers.html + // Here is the go pprof implementation: + // https://github.com/golang/go/blob/3857a89e7eb872fa22d569e70b7e076bec74ebbb/src/net/http/pprof/pprof.go#L191 + async fn get_symbol(req: Request) -> hyper::Result> { + assert_eq!(req.method(), Method::POST); + let mut text = String::new(); + let body_bytes = hyper::body::to_bytes(req.into_body()).await?; + let body = String::from_utf8(body_bytes.to_vec()).unwrap(); + + // The request body is a list of addr to be resolved joined by '+'. + // Resolve addrs with addr2line and write the symbols each per line in + // response. + for pc in body.split('+') { + let addr = usize::from_str_radix(pc.trim_start_matches("0x"), 16).unwrap_or(0); + if addr == 0 { + info!("invalid addr: {}", addr); + continue; + } + + // Would be multiple symbols if inlined. 
+ let mut syms = vec![]; + backtrace::resolve(addr as *mut std::ffi::c_void, |sym| { + let name = sym + .name() + .unwrap_or_else(|| backtrace::SymbolName::new(b"")); + syms.push(name.to_string()); + }); + + if !syms.is_empty() { + // join inline functions with '--' + let f = syms.join("--"); + // should be + text.push_str(format!("{:#x} {}\n", addr, f).as_str()); + } else { + info!("can't resolve mapped addr: {:#x}", addr); + text.push_str(format!("{:#x} ??\n", addr).as_str()); + } + } + let response = Response::builder() + .header("Content-Type", mime::TEXT_PLAIN.to_string()) + .header("X-Content-Type-Options", "nosniff") + .header("Content-Length", text.len()) + .body(text.into()) + .unwrap(); + Ok(response) + } + async fn update_config( cfg_controller: ConfigController, req: Request, ) -> hyper::Result> { let mut body = Vec::new(); + let mut persist = true; + if let Some(query) = req.uri().query() { + let query_pairs: HashMap<_, _> = + url::form_urlencoded::parse(query.as_bytes()).collect(); + persist = match query_pairs.get("persist") { + Some(val) => match val.parse() { + Ok(val) => val, + Err(err) => return Ok(make_response(StatusCode::BAD_REQUEST, err.to_string())), + }, + None => true, + }; + } req.into_body() .try_for_each(|bytes| { body.extend(bytes); @@ -295,7 +305,11 @@ where }) .await?; Ok(match decode_json(&body) { - Ok(change) => match cfg_controller.update(change) { + Ok(change) => match if persist { + cfg_controller.update(change) + } else { + cfg_controller.update_without_persist(change) + } { Err(e) => { if let Some(e) = e.downcast_ref::() { make_response( @@ -585,7 +599,6 @@ where let security_config = self.security_config.clone(); let cfg_controller = self.cfg_controller.clone(); let router = self.router.clone(); - let store_path = self.store_path.clone(); let resource_manager = self.resource_manager.clone(); let grpc_service_mgr = self.grpc_service_mgr.clone(); // Start to serve. 
@@ -594,7 +607,6 @@ where let security_config = security_config.clone(); let cfg_controller = cfg_controller.clone(); let router = router.clone(); - let store_path = store_path.clone(); let resource_manager = resource_manager.clone(); let grpc_service_mgr = grpc_service_mgr.clone(); async move { @@ -604,7 +616,6 @@ where let security_config = security_config.clone(); let cfg_controller = cfg_controller.clone(); let router = router.clone(); - let store_path = store_path.clone(); let resource_manager = resource_manager.clone(); let grpc_service_mgr = grpc_service_mgr.clone(); async move { @@ -637,21 +648,39 @@ where )); } - match (method, path.as_ref()) { + let mut is_unknown_path = false; + let start = Instant::now(); + let res = match (method.clone(), path.as_ref()) { (Method::GET, "/metrics") => { Self::handle_get_metrics(req, &cfg_controller) } (Method::GET, "/status") => Ok(Response::default()), - (Method::GET, "/debug/pprof/heap_list") => Self::list_heap_prof(req), + (Method::GET, "/debug/pprof/heap_list") => { + Ok(make_response( + StatusCode::GONE, + "Deprecated, heap profiling is always enabled by default, just use /debug/pprof/heap to get the heap profile when needed", + )) + } (Method::GET, "/debug/pprof/heap_activate") => { - Self::activate_heap_prof(req, store_path).await + Ok(make_response( + StatusCode::GONE, + "Deprecated, use config `memory.enable_heap_profiling` to toggle", + )) } (Method::GET, "/debug/pprof/heap_deactivate") => { - Self::deactivate_heap_prof(req) + Ok(make_response( + StatusCode::GONE, + "Deprecated, use config `memory.enable_heap_profiling` to toggle", + )) + } + (Method::GET, "/debug/pprof/heap") => { + Self::dump_heap_prof_to_resp(req).await + } + (Method::GET, "/debug/pprof/cmdline") => Self::get_cmdline(req).await, + (Method::GET, "/debug/pprof/symbol") => { + Self::get_symbol_count(req).await } - // (Method::GET, "/debug/pprof/heap") => { - // Self::dump_heap_prof_to_resp(req).await - // } + (Method::POST, 
"/debug/pprof/symbol") => Self::get_symbol(req).await, (Method::GET, "/config") => { Self::get_config(req, &cfg_controller).await } @@ -693,8 +722,21 @@ where (Method::PUT, "/resume_grpc") => { Self::handle_resume_grpc(grpc_service_mgr).await } - _ => Ok(make_response(StatusCode::NOT_FOUND, "path not found")), - } + _ => { + is_unknown_path = true; + Ok(make_response(StatusCode::NOT_FOUND, "path not found")) + }, + }; + // Using "unknown" for unknown paths to void creating high cardinality. + let path_label = if is_unknown_path { + "unknown".to_owned() + } else { + path + }; + STATUS_REQUEST_DURATION + .with_label_values(&[method.as_str(), &path_label]) + .observe(start.elapsed().as_secs_f64()); + res } })) } @@ -1109,13 +1151,11 @@ mod tests { #[test] fn test_status_service() { - let temp_dir = tempfile::TempDir::new().unwrap(); let mut status_server = StatusServer::new( 1, ConfigController::default(), Arc::new(SecurityConfig::default()), MockRouter, - temp_dir.path().to_path_buf(), None, GrpcServiceManager::dummy(), ) @@ -1159,13 +1199,11 @@ mod tests { #[test] fn test_config_endpoint() { - let temp_dir = tempfile::TempDir::new().unwrap(); let mut status_server = StatusServer::new( 1, ConfigController::default(), Arc::new(SecurityConfig::default()), MockRouter, - temp_dir.path().to_path_buf(), None, GrpcServiceManager::dummy(), ) @@ -1202,17 +1240,84 @@ mod tests { status_server.stop(); } + #[test] + fn test_update_config_endpoint() { + let test_config = |persist: bool| { + let temp_dir = tempfile::TempDir::new().unwrap(); + let mut config = TikvConfig::default(); + config.cfg_path = temp_dir + .path() + .join("tikv.toml") + .to_str() + .unwrap() + .to_string(); + let mut status_server = StatusServer::new( + 1, + ConfigController::new(config), + Arc::new(SecurityConfig::default()), + MockRouter, + None, + GrpcServiceManager::dummy(), + ) + .unwrap(); + let addr = "127.0.0.1:0".to_owned(); + let _ = status_server.start(addr); + let client = Client::new(); + let 
uri = if persist { + Uri::builder() + .scheme("http") + .authority(status_server.listening_addr().to_string().as_str()) + .path_and_query("/config") + .build() + .unwrap() + } else { + Uri::builder() + .scheme("http") + .authority(status_server.listening_addr().to_string().as_str()) + .path_and_query("/config?persist=false") + .build() + .unwrap() + }; + let mut req = Request::new(Body::from("{\"coprocessor.region-split-size\": \"1GB\"}")); + *req.method_mut() = Method::POST; + *req.uri_mut() = uri.clone(); + let handle = status_server.thread_pool.spawn(async move { + let resp = client.request(req).await.unwrap(); + assert_eq!(resp.status(), StatusCode::OK); + }); + block_on(handle).unwrap(); + + let client = Client::new(); + let handle2 = status_server.thread_pool.spawn(async move { + let resp = client.get(uri).await.unwrap(); + assert_eq!(resp.status(), StatusCode::OK); + let mut v = Vec::new(); + resp.into_body() + .try_for_each(|bytes| { + v.extend(bytes); + ok(()) + }) + .await + .unwrap(); + let resp_json = String::from_utf8_lossy(&v).to_string(); + assert!(resp_json.contains("\"region-split-size\":\"1GiB\"")); + }); + block_on(handle2).unwrap(); + status_server.stop(); + }; + test_config(true); + test_config(false); + } + #[cfg(feature = "failpoints")] #[test] fn test_status_service_fail_endpoints() { let _guard = fail::FailScenario::setup(); - let temp_dir = tempfile::TempDir::new().unwrap(); let mut status_server = StatusServer::new( 1, ConfigController::default(), Arc::new(SecurityConfig::default()), MockRouter, - temp_dir.path().to_path_buf(), None, GrpcServiceManager::dummy(), ) @@ -1324,13 +1429,11 @@ mod tests { #[test] fn test_status_service_fail_endpoints_can_trigger_fails() { let _guard = fail::FailScenario::setup(); - let temp_dir = tempfile::TempDir::new().unwrap(); let mut status_server = StatusServer::new( 1, ConfigController::default(), Arc::new(SecurityConfig::default()), MockRouter, - temp_dir.path().to_path_buf(), None, 
GrpcServiceManager::dummy(), ) @@ -1370,13 +1473,11 @@ mod tests { #[test] fn test_status_service_fail_endpoints_should_give_404_when_failpoints_are_disable() { let _guard = fail::FailScenario::setup(); - let temp_dir = tempfile::TempDir::new().unwrap(); let mut status_server = StatusServer::new( 1, ConfigController::default(), Arc::new(SecurityConfig::default()), MockRouter, - temp_dir.path().to_path_buf(), None, GrpcServiceManager::dummy(), ) @@ -1408,13 +1509,11 @@ mod tests { } fn do_test_security_status_service(allowed_cn: HashSet, expected: bool) { - let temp_dir = tempfile::TempDir::new().unwrap(); let mut status_server = StatusServer::new( 1, ConfigController::default(), Arc::new(new_security_cfg(Some(allowed_cn))), MockRouter, - temp_dir.path().to_path_buf(), None, GrpcServiceManager::dummy(), ) @@ -1481,15 +1580,12 @@ mod tests { #[cfg(feature = "mem-profiling")] #[test] - #[ignore] fn test_pprof_heap_service() { - let temp_dir = tempfile::TempDir::new().unwrap(); let mut status_server = StatusServer::new( 1, ConfigController::default(), Arc::new(SecurityConfig::default()), MockRouter, - temp_dir.path().to_path_buf(), None, GrpcServiceManager::dummy(), ) @@ -1515,13 +1611,11 @@ mod tests { #[test] fn test_pprof_profile_service() { let _test_guard = TEST_PROFILE_MUTEX.lock().unwrap(); - let temp_dir = tempfile::TempDir::new().unwrap(); let mut status_server = StatusServer::new( 1, ConfigController::default(), Arc::new(SecurityConfig::default()), MockRouter, - temp_dir.path().to_path_buf(), None, GrpcServiceManager::dummy(), ) @@ -1547,16 +1641,65 @@ mod tests { status_server.stop(); } + #[test] + fn test_pprof_symbol_service() { + let _test_guard = TEST_PROFILE_MUTEX.lock().unwrap(); + let mut status_server = StatusServer::new( + 1, + ConfigController::default(), + Arc::new(SecurityConfig::default()), + MockRouter, + None, + GrpcServiceManager::dummy(), + ) + .unwrap(); + let addr = "127.0.0.1:0".to_owned(); + let _ = status_server.start(addr); + let 
client = Client::new(); + + let mut addr = None; + backtrace::trace(|f| { + addr = Some(f.ip()); + false + }); + assert!(addr.is_some()); + + let uri = Uri::builder() + .scheme("http") + .authority(status_server.listening_addr().to_string().as_str()) + .path_and_query("/debug/pprof/symbol") + .build() + .unwrap(); + let req = Request::builder() + .method(Method::POST) + .uri(uri) + .body(Body::from(format!("{:p}", addr.unwrap()))) + .unwrap(); + let handle = status_server + .thread_pool + .spawn(async move { client.request(req).await.unwrap() }); + let resp = block_on(handle).unwrap(); + assert_eq!(resp.status(), StatusCode::OK); + let body_bytes = block_on(hyper::body::to_bytes(resp.into_body())).unwrap(); + assert!( + String::from_utf8(body_bytes.as_ref().to_owned()) + .unwrap() + .split(' ') + .last() + .unwrap() + .starts_with("backtrace::backtrace") + ); + status_server.stop(); + } + #[test] fn test_metrics() { let _test_guard = TEST_PROFILE_MUTEX.lock().unwrap(); - let temp_dir = tempfile::TempDir::new().unwrap(); let mut status_server = StatusServer::new( 1, ConfigController::default(), Arc::new(SecurityConfig::default()), MockRouter, - temp_dir.path().to_path_buf(), None, GrpcServiceManager::dummy(), ) @@ -1607,13 +1750,11 @@ mod tests { #[test] fn test_change_log_level() { - let temp_dir = tempfile::TempDir::new().unwrap(); let mut status_server = StatusServer::new( 1, ConfigController::default(), Arc::new(SecurityConfig::default()), MockRouter, - temp_dir.path().to_path_buf(), None, GrpcServiceManager::dummy(), ) @@ -1663,13 +1804,11 @@ mod tests { let cfgs = [TikvConfig::default(), multi_rocks_cfg]; let resp_strs = ["raft-kv", "partitioned-raft-kv"]; for (cfg, resp_str) in IntoIterator::into_iter(cfgs).zip(resp_strs) { - let temp_dir = tempfile::TempDir::new().unwrap(); let mut status_server = StatusServer::new( 1, ConfigController::new(cfg), Arc::new(SecurityConfig::default()), MockRouter, - temp_dir.path().to_path_buf(), None, 
GrpcServiceManager::dummy(), ) @@ -1702,13 +1841,11 @@ mod tests { multi_rocks_cfg.storage.engine = EngineType::RaftKv2; let cfgs = [TikvConfig::default(), multi_rocks_cfg]; for cfg in IntoIterator::into_iter(cfgs) { - let temp_dir = tempfile::TempDir::new().unwrap(); let mut status_server = StatusServer::new( 1, ConfigController::new(cfg), Arc::new(SecurityConfig::default()), MockRouter, - temp_dir.path().to_path_buf(), None, GrpcServiceManager::dummy(), ) diff --git a/src/server/status_server/profile.rs b/src/server/status_server/profile.rs index b3d91d3bea6..582e02066f8 100644 --- a/src/server/status_server/profile.rs +++ b/src/server/status_server/profile.rs @@ -1,43 +1,32 @@ // Copyright 2021 TiKV Project Authors. Licensed under Apache-2.0. use std::{ - fs::{File, Metadata}, - io::Read, - path::PathBuf, + fs::File, + io::{Read, Write}, pin::Pin, - process::Command, - sync::Mutex as StdMutex, - time::{Duration, UNIX_EPOCH}, + process::{Command, Stdio}, + sync::Mutex, }; -use chrono::{offset::Local, DateTime}; use futures::{ - channel::oneshot::{self, Sender}, future::BoxFuture, - select, task::{Context, Poll}, - Future, FutureExt, Stream, StreamExt, + Future, FutureExt, }; use lazy_static::lazy_static; use pprof::protos::Message; use regex::Regex; -use tempfile::{NamedTempFile, TempDir}; +use tempfile::NamedTempFile; #[cfg(not(test))] -use tikv_alloc::{activate_prof, deactivate_prof, dump_prof}; -use tokio::sync::{Mutex, MutexGuard}; +use tikv_alloc::dump_prof; #[cfg(test)] -pub use self::test_utils::TEST_PROFILE_MUTEX; +use self::test_utils::dump_prof; #[cfg(test)] -use self::test_utils::{activate_prof, deactivate_prof, dump_prof}; - -// File name suffix for periodically dumped heap profiles. -const HEAP_PROFILE_SUFFIX: &str = ".heap"; +pub use self::test_utils::TEST_PROFILE_MUTEX; lazy_static! { - // If it's locked it means there are already a heap or CPU profiling. 
- static ref PROFILE_MUTEX: Mutex<()> = Mutex::new(()); - // The channel is used to deactivate a profiling. - static ref PROFILE_ACTIVE: StdMutex, TempDir)>> = StdMutex::new(None); + // If it's some it means there are already a CPU profiling. + static ref CPU_PROFILE_ACTIVE: Mutex> = Mutex::new(None); // To normalize thread names. static ref THREAD_NAME_RE: Regex = @@ -47,32 +36,26 @@ lazy_static! { type OnEndFn = Box Result + Send + 'static>; -struct ProfileGuard<'a, I, T> { - _guard: MutexGuard<'a, ()>, +struct ProfileRunner { item: Option, on_end: Option>, end: BoxFuture<'static, Result<(), String>>, } -impl<'a, I, T> Unpin for ProfileGuard<'a, I, T> {} +impl Unpin for ProfileRunner {} -impl<'a, I, T> ProfileGuard<'a, I, T> { +impl ProfileRunner { fn new( on_start: F1, on_end: F2, end: BoxFuture<'static, Result<(), String>>, - ) -> Result, String> + ) -> Result where F1: FnOnce() -> Result, F2: FnOnce(I) -> Result + Send + 'static, { - let _guard = match PROFILE_MUTEX.try_lock() { - Ok(guard) => guard, - _ => return Err("Already in Profiling".to_owned()), - }; let item = on_start()?; - Ok(ProfileGuard { - _guard, + Ok(ProfileRunner { item: Some(item), on_end: Some(Box::new(on_end) as OnEndFn), end, @@ -80,7 +63,7 @@ impl<'a, I, T> ProfileGuard<'a, I, T> { } } -impl<'a, I, T> Future for ProfileGuard<'a, I, T> { +impl Future for ProfileRunner { type Output = Result; fn poll(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll { match self.end.as_mut().poll(cx) { @@ -98,83 +81,12 @@ impl<'a, I, T> Future for ProfileGuard<'a, I, T> { } } -/// Trigger a heap profie and return the content. 
-#[allow(dead_code)] -pub async fn start_one_heap_profile(end: F, use_jeprof: bool) -> Result, String> -where - F: Future> + Send + 'static, -{ - let on_start = || activate_prof().map_err(|e| format!("activate_prof: {}", e)); - - let on_end = move |_| { - deactivate_prof().map_err(|e| format!("deactivate_prof: {}", e))?; - let f = NamedTempFile::new().map_err(|e| format!("create tmp file fail: {}", e))?; - let path = f.path().to_str().unwrap(); - dump_prof(path).map_err(|e| format!("dump_prof: {}", e))?; - if use_jeprof { - jeprof_heap_profile(path) - } else { - read_file(path) - } - }; - - ProfileGuard::new(on_start, on_end, end.boxed())?.await -} - -/// Activate heap profile and call `callback` if successfully. -/// `deactivate_heap_profile` can only be called after it's notified from -/// `callback`. -pub async fn activate_heap_profile( - dump_period: S, - store_path: PathBuf, - callback: F, -) -> Result<(), String> -where - S: Stream> + Send + Unpin + 'static, - F: FnOnce() + Send + 'static, -{ - let (tx, rx) = oneshot::channel(); - let dir = tempfile::Builder::new() - .prefix("heap-") - .tempdir_in(store_path) - .map_err(|e| format!("create temp directory: {}", e))?; - let dir_path = dir.path().to_str().unwrap().to_owned(); - - let on_start = move || { - let mut activate = PROFILE_ACTIVE.lock().unwrap(); - assert!(activate.is_none()); - activate_prof().map_err(|e| format!("activate_prof: {}", e))?; - *activate = Some((tx, dir)); - callback(); - info!("periodical heap profiling is started"); - Ok(()) - }; - - let on_end = |_| { - deactivate_heap_profile(); - deactivate_prof().map_err(|e| format!("deactivate_prof: {}", e)) - }; - - let end = async move { - select! 
{ - _ = rx.fuse() => { - info!("periodical heap profiling is canceled"); - Ok(()) - }, - res = dump_heap_profile_periodically(dump_period, dir_path).fuse() => { - warn!("the heap profiling dump loop shouldn't break"; "res" => ?res); - res - } - } - }; - - ProfileGuard::new(on_start, on_end, end.boxed())?.await -} - -/// Deactivate heap profile. Return `false` if it hasn't been activated. -pub fn deactivate_heap_profile() -> bool { - let mut activate = PROFILE_ACTIVE.lock().unwrap(); - activate.take().is_some() +/// Trigger a heap profile and return the content. +pub fn dump_one_heap_profile() -> Result { + let f = NamedTempFile::new().map_err(|e| format!("create tmp file fail: {}", e))?; + let path = f.path(); + dump_prof(path.to_str().unwrap()).map_err(|e| format!("dump_prof: {}", e))?; + Ok(f) } /// Trigger one cpu profile. @@ -186,7 +98,14 @@ pub async fn start_one_cpu_profile( where F: Future> + Send + 'static, { + if CPU_PROFILE_ACTIVE.lock().unwrap().is_some() { + return Err("Already in CPU Profiling".to_owned()); + } + let on_start = || { + let mut activate = CPU_PROFILE_ACTIVE.lock().unwrap(); + assert!(activate.is_none()); + *activate = Some(()); let guard = pprof::ProfilerGuardBuilder::default() .frequency(frequency) .blocklist(&["libc", "libgcc", "pthread", "vdso"]) @@ -217,10 +136,13 @@ where .flamegraph(&mut body) .map_err(|e| format!("generate flamegraph from report fail: {}", e))?; } + drop(guard); + *CPU_PROFILE_ACTIVE.lock().unwrap() = None; + Ok(body) }; - ProfileGuard::new(on_start, on_end, end.boxed())?.await + ProfileRunner::new(on_start, on_end, end.boxed())?.await } pub fn read_file(path: &str) -> Result, String> { @@ -233,9 +155,26 @@ pub fn read_file(path: &str) -> Result, String> { pub fn jeprof_heap_profile(path: &str) -> Result, String> { info!("using jeprof to process {}", path); - let output = Command::new("./jeprof") - .args(["--show_bytes", "./bin/tikv-server", path, "--svg"]) - .output() + let bin = 
std::env::current_exe().map_err(|e| format!("get current exe path fail: {}", e))?; + let mut jeprof = Command::new("perl") + .args([ + "/dev/stdin", + "--show_bytes", + &bin.as_os_str().to_string_lossy(), + path, + "--svg", + ]) + .stdin(Stdio::piped()) + .spawn() + .map_err(|e| format!("spawn jeprof fail: {}", e))?; + jeprof + .stdin + .take() + .unwrap() + .write_all(include_bytes!("jeprof.in")) + .unwrap(); + let output = jeprof + .wait_with_output() .map_err(|e| format!("jeprof: {}", e))?; if !output.status.success() { let stderr = std::str::from_utf8(&output.stderr).unwrap_or("invalid utf8"); @@ -244,49 +183,6 @@ pub fn jeprof_heap_profile(path: &str) -> Result, String> { Ok(output.stdout) } -pub fn list_heap_profiles() -> Result, String> { - let path = match &*PROFILE_ACTIVE.lock().unwrap() { - Some((_, ref dir)) => dir.path().to_str().unwrap().to_owned(), - None => return Ok(vec![]), - }; - - let dir = std::fs::read_dir(path).map_err(|e| format!("read dir fail: {}", e))?; - let mut profiles = Vec::new(); - for item in dir { - let item = match item { - Ok(x) => x, - _ => continue, - }; - let f = item.path().to_str().unwrap().to_owned(); - if !f.ends_with(HEAP_PROFILE_SUFFIX) { - continue; - } - let ct = item.metadata().map(|x| last_change_epoch(&x)).unwrap(); - let dt = DateTime::::from(UNIX_EPOCH + Duration::from_secs(ct)); - profiles.push((f, dt.format("%Y-%m-%d %H:%M:%S").to_string())); - } - - // Reverse sort them. 
- profiles.sort_by(|x, y| y.1.cmp(&x.1)); - info!("list_heap_profiles gets {} items", profiles.len()); - Ok(profiles) -} - -async fn dump_heap_profile_periodically(mut period: S, dir: String) -> Result<(), String> -where - S: Stream> + Send + Unpin + 'static, -{ - let mut id = 0; - while let Some(res) = period.next().await { - res?; - id += 1; - let path = format!("{}/{:0>6}{}", dir, id, HEAP_PROFILE_SUFFIX); - dump_prof(&path).map_err(|e| format!("dump_prof: {}", e))?; - info!("a heap profile is dumped to {}", path); - } - Ok(()) -} - fn extract_thread_name(thread_name: &str) -> String { THREAD_NAME_RE .captures(thread_name) @@ -312,43 +208,18 @@ mod test_utils { pub static ref TEST_PROFILE_MUTEX: Mutex<()> = Mutex::new(()); } - pub fn activate_prof() -> ProfResult<()> { - Ok(()) - } - pub fn deactivate_prof() -> ProfResult<()> { - Ok(()) - } pub fn dump_prof(_: &str) -> ProfResult<()> { Ok(()) } } -#[cfg(unix)] -fn last_change_epoch(metadata: &Metadata) -> u64 { - use std::os::unix::fs::MetadataExt; - metadata.ctime() as u64 -} - -#[cfg(not(unix))] -fn last_change_epoch(metadata: &Metadata) -> u64 { - 0 -} - #[cfg(test)] mod tests { - use std::sync::mpsc::sync_channel; - - use futures::{channel::mpsc, executor::block_on, SinkExt}; + use futures::executor::block_on; use tokio::runtime; use super::*; - #[test] - fn test_last_change_epoch() { - let f = tempfile::tempfile().unwrap(); - assert!(last_change_epoch(&f.metadata().unwrap()) > 0); - } - #[test] fn test_extract_thread_name() { assert_eq!(&extract_thread_name("test-name-1"), "test-name"); @@ -372,7 +243,7 @@ mod tests { .build() .unwrap(); - let expected = "Already in Profiling"; + let expected = "Already in CPU Profiling"; let (tx1, rx1) = oneshot::channel(); let rx1 = rx1.map_err(|_| "channel canceled".to_owned()); @@ -384,76 +255,7 @@ mod tests { let res2 = rt.spawn(start_one_cpu_profile(rx2, 99, false)); assert_eq!(block_on(res2).unwrap().unwrap_err(), expected); - let (_tx2, rx2) = oneshot::channel(); - 
let rx2 = rx2.map_err(|_| "channel canceled".to_owned()); - let res2 = rt.spawn(start_one_heap_profile(rx2, false)); - assert_eq!(block_on(res2).unwrap().unwrap_err(), expected); - - let (_tx2, rx2) = mpsc::channel(1); - let res2 = rt.spawn(activate_heap_profile(rx2, std::env::temp_dir(), || {})); - assert_eq!(block_on(res2).unwrap().unwrap_err(), expected); - drop(tx1); block_on(res1).unwrap().unwrap_err(); } - - #[test] - fn test_profile_guard_toggle() { - let _test_guard = TEST_PROFILE_MUTEX.lock().unwrap(); - let rt = runtime::Builder::new_multi_thread() - .worker_threads(4) - .build() - .unwrap(); - - // Test activated profiling can be stopped by canceling the period stream. - let (tx, rx) = mpsc::channel(1); - let res = rt.spawn(activate_heap_profile(rx, std::env::temp_dir(), || {})); - drop(tx); - block_on(res).unwrap().unwrap(); - - // Test activated profiling can be stopped by the handle. - let (tx, rx) = sync_channel::(1); - let on_activated = move || drop(tx); - let check_activated = move || rx.recv().is_err(); - - let (_tx, _rx) = mpsc::channel(1); - let res = rt.spawn(activate_heap_profile( - _rx, - std::env::temp_dir(), - on_activated, - )); - assert!(check_activated()); - assert!(deactivate_heap_profile()); - block_on(res).unwrap().unwrap(); - } - - #[test] - fn test_heap_profile_exit() { - let _test_guard = TEST_PROFILE_MUTEX.lock().unwrap(); - let rt = runtime::Builder::new_multi_thread() - .worker_threads(4) - .build() - .unwrap(); - - // Test heap profiling can be stopped by sending an error. - let (mut tx, rx) = mpsc::channel(1); - let res = rt.spawn(activate_heap_profile(rx, std::env::temp_dir(), || {})); - block_on(tx.send(Err("test".to_string()))).unwrap(); - block_on(res).unwrap().unwrap_err(); - - // Test heap profiling can be activated again. 
- let (tx, rx) = sync_channel::(1); - let on_activated = move || drop(tx); - let check_activated = move || rx.recv().is_err(); - - let (_tx, _rx) = mpsc::channel(1); - let res = rt.spawn(activate_heap_profile( - _rx, - std::env::temp_dir(), - on_activated, - )); - assert!(check_activated()); - assert!(deactivate_heap_profile()); - block_on(res).unwrap().unwrap(); - } } diff --git a/src/server/tablet_snap.rs b/src/server/tablet_snap.rs index ca869f5c761..997a932be9d 100644 --- a/src/server/tablet_snap.rs +++ b/src/server/tablet_snap.rs @@ -35,7 +35,7 @@ use std::{ use collections::HashMap; use crc64fast::Digest; use encryption_export::{DataKeyImporter, DataKeyManager}; -use engine_traits::{Checkpointer, EncryptionKeyManager, KvEngine, TabletRegistry}; +use engine_traits::{Checkpointer, KvEngine, TabletRegistry}; use file_system::{IoType, OpenOptions, WithIoType}; use futures::{ future::FutureExt, diff --git a/src/server/ttl/ttl_compaction_filter.rs b/src/server/ttl/ttl_compaction_filter.rs index 06fc6981cf2..be4f0df6cf4 100644 --- a/src/server/ttl/ttl_compaction_filter.rs +++ b/src/server/ttl/ttl_compaction_filter.rs @@ -11,9 +11,23 @@ use engine_rocks::{ RocksTtlProperties, }; use engine_traits::raw_ttl::ttl_current_ts; +use prometheus::*; use crate::server::metrics::TTL_CHECKER_ACTIONS_COUNTER_VEC; +lazy_static! 
{ + pub static ref TTL_EXPIRE_KV_SIZE_COUNTER: IntCounter = register_int_counter!( + "tikv_ttl_expire_kv_size_total", + "Total size of rawkv ttl expire", + ) + .unwrap(); + pub static ref TTL_EXPIRE_KV_COUNT_COUNTER: IntCounter = register_int_counter!( + "tikv_ttl_expire_kv_count_total", + "Total number of rawkv ttl expire", + ) + .unwrap(); +} + #[derive(Default)] pub struct TtlCompactionFilterFactory { _phantom: PhantomData, @@ -41,10 +55,7 @@ impl CompactionFilterFactory for TtlCompactionFilterFactory { } let name = CString::new("ttl_compaction_filter").unwrap(); - let filter = TtlCompactionFilter:: { - ts: current, - _phantom: PhantomData, - }; + let filter = TtlCompactionFilter::::new(); Some((name, filter)) } @@ -56,6 +67,28 @@ impl CompactionFilterFactory for TtlCompactionFilterFactory { pub struct TtlCompactionFilter { ts: u64, _phantom: PhantomData, + expire_count: u64, + expire_size: u64, +} + +impl Drop for TtlCompactionFilter { + fn drop(&mut self) { + // Accumulate counters would slightly improve performance as prometheus counters + // are atomic variables underlying + TTL_EXPIRE_KV_SIZE_COUNTER.inc_by(self.expire_size); + TTL_EXPIRE_KV_COUNT_COUNTER.inc_by(self.expire_count); + } +} + +impl TtlCompactionFilter { + fn new() -> Self { + Self { + ts: ttl_current_ts(), + _phantom: PhantomData, + expire_count: 0, + expire_size: 0, + } + } } impl CompactionFilter for TtlCompactionFilter { @@ -83,7 +116,11 @@ impl CompactionFilter for TtlCompactionFilter { Ok(RawValue { expire_ts: Some(expire_ts), .. 
- }) if expire_ts <= self.ts => CompactionFilterDecision::Remove, + }) if expire_ts <= self.ts => { + self.expire_size += key.len() as u64 + value.len() as u64; + self.expire_count += 1; + CompactionFilterDecision::Remove + } Err(err) => { TTL_CHECKER_ACTIONS_COUNTER_VEC .with_label_values(&["ts_error"]) diff --git a/src/storage/config.rs b/src/storage/config.rs index a40db2c424b..91c98ebf57b 100644 --- a/src/storage/config.rs +++ b/src/storage/config.rs @@ -31,6 +31,13 @@ const DEFAULT_SCHED_PENDING_WRITE_MB: u64 = 100; const DEFAULT_RESERVED_SPACE_GB: u64 = 5; const DEFAULT_RESERVED_RAFT_SPACE_GB: u64 = 1; +// In tests, we've observed 1.2M entries in the TxnStatusCache. We +// conservatively set the limit to 5M entries in total. +// As TxnStatusCache have 128 slots by default. We round it to 5.12M. +// This consumes at most around 300MB memory theoretically, but usually it's +// much less as it's hard to see the capacity being used up. +const DEFAULT_TXN_STATUS_CACHE_CAPACITY: usize = 40_000 * 128; + // Block cache capacity used when TikvConfig isn't validated. It should only // occur in tests. 
const FALLBACK_BLOCK_CACHE_CAPACITY: ReadableSize = ReadableSize::mb(128); @@ -76,6 +83,8 @@ pub struct Config { pub background_error_recovery_window: ReadableDuration, /// Interval to check TTL for all SSTs, pub ttl_check_poll_interval: ReadableDuration, + #[online_config(skip)] + pub txn_status_cache_capacity: usize, #[online_config(submodule)] pub flow_control: FlowControlConfig, #[online_config(submodule)] @@ -105,6 +114,7 @@ impl Default for Config { api_version: 1, enable_ttl: false, ttl_check_poll_interval: ReadableDuration::hours(12), + txn_status_cache_capacity: DEFAULT_TXN_STATUS_CACHE_CAPACITY, flow_control: FlowControlConfig::default(), block_cache: BlockCacheConfig::default(), io_rate_limit: IoRateLimitConfig::default(), diff --git a/src/storage/errors.rs b/src/storage/errors.rs index 0e7db9ffc96..b603b904708 100644 --- a/src/storage/errors.rs +++ b/src/storage/errors.rs @@ -12,7 +12,7 @@ use std::{ use error_code::{self, ErrorCode, ErrorCodeExt}; use kvproto::{errorpb, kvrpcpb, kvrpcpb::ApiVersion}; use thiserror::Error; -use tikv_util::deadline::DeadlineError; +use tikv_util::deadline::{set_deadline_exceeded_busy_error, DeadlineError}; use txn_types::{KvPair, TimeStamp}; use crate::storage::{ @@ -222,7 +222,6 @@ impl Display for ErrorHeaderKind { const SCHEDULER_IS_BUSY: &str = "scheduler is busy"; const GC_WORKER_IS_BUSY: &str = "gc worker is busy"; -const DEADLINE_EXCEEDED: &str = "deadline is exceeded"; /// Get the `ErrorHeaderKind` enum that corresponds to the error in the protobuf /// message. Returns `ErrorHeaderKind::Other` if no match found. 
@@ -319,9 +318,8 @@ pub fn extract_region_error_from_error(e: &Error) -> Option { } Error(box ErrorInner::DeadlineExceeded) => { let mut err = errorpb::Error::default(); - let mut server_is_busy_err = errorpb::ServerIsBusy::default(); - server_is_busy_err.set_reason(DEADLINE_EXCEEDED.to_owned()); - err.set_server_is_busy(server_is_busy_err); + err.set_message(e.to_string()); + set_deadline_exceeded_busy_error(&mut err); Some(err) } _ => None, diff --git a/src/storage/metrics.rs b/src/storage/metrics.rs index e9477b56b0f..cf7956d76b7 100644 --- a/src/storage/metrics.rs +++ b/src/storage/metrics.rs @@ -370,6 +370,13 @@ make_static_metric! { keys, }, } + + pub struct TxnStatusCacheSizeGauge: IntGauge { + "type" => { + used, + allocated, + } + } } lazy_static! { @@ -601,4 +608,12 @@ lazy_static! { exponential_buckets(1.0, 2.0, 16).unwrap() ) .unwrap(); + + pub static ref SCHED_TXN_STATUS_CACHE_SIZE: TxnStatusCacheSizeGauge = register_static_int_gauge_vec!( + TxnStatusCacheSizeGauge, + "tikv_scheduler_txn_status_cache_size", + "Statistics of size and capacity of txn status cache (represented in count of entries)", + &["type"] + ) + .unwrap(); } diff --git a/src/storage/mod.rs b/src/storage/mod.rs index 0d4679fbe18..13d868849f4 100644 --- a/src/storage/mod.rs +++ b/src/storage/mod.rs @@ -80,6 +80,7 @@ use engine_traits::{ raw_ttl::ttl_to_expire_ts, CfName, CF_DEFAULT, CF_LOCK, CF_WRITE, DATA_CFS, DATA_CFS_LEN, }; use futures::{future::Either, prelude::*}; +use itertools::Itertools; use kvproto::{ kvrpcpb::{ ApiVersion, ChecksumAlgorithm, CommandPri, Context, GetRequest, IsolationLevel, KeyRange, @@ -97,7 +98,7 @@ use tikv_util::{ deadline::Deadline, future::try_poll, quota_limiter::QuotaLimiter, - time::{duration_to_ms, Instant, ThreadReadId}, + time::{duration_to_ms, duration_to_sec, Instant, ThreadReadId}, }; use tracker::{ clear_tls_tracker_token, set_tls_tracker_token, with_tls_tracker, TrackedFuture, TrackerToken, @@ -121,7 +122,7 @@ pub use self::{ use 
self::{kv::SnapContext, test_util::latest_feature_gate}; use crate::{ read_pool::{ReadPool, ReadPoolHandle}, - server::lock_manager::waiter_manager, + server::{lock_manager::waiter_manager, metrics::ResourcePriority}, storage::{ config::Config, kv::{with_tls_engine, Modify, WriteData}, @@ -609,6 +610,7 @@ impl Storage { r.get_resource_limiter( ctx.get_resource_control_context().get_resource_group_name(), ctx.get_request_source(), + ctx.get_resource_control_context().get_override_priority(), ) }); let priority_tag = get_priority_tag(priority); @@ -645,7 +647,7 @@ impl Storage { Self::check_api_version(api_version, ctx.api_version, CMD, [key.as_encoded()])?; - let command_duration = tikv_util::time::Instant::now(); + let command_duration = Instant::now(); // The bypass_locks and access_locks set will be checked at most once. // `TsSet::vec` is more efficient here. @@ -697,12 +699,15 @@ impl Storage { &statistics, buckets.as_ref(), ); + let now = Instant::now(); SCHED_PROCESSING_READ_HISTOGRAM_STATIC .get(CMD) - .observe(begin_instant.saturating_elapsed_secs()); - SCHED_HISTOGRAM_VEC_STATIC - .get(CMD) - .observe(command_duration.saturating_elapsed_secs()); + .observe(duration_to_sec( + now.saturating_duration_since(begin_instant), + )); + SCHED_HISTOGRAM_VEC_STATIC.get(CMD).observe(duration_to_sec( + now.saturating_duration_since(command_duration), + )); let read_bytes = key.len() + result @@ -765,20 +770,27 @@ impl Storage { ids: Vec, trackers: Vec, consumer: P, - begin_instant: tikv_util::time::Instant, + begin_instant: Instant, ) -> impl Future> { const CMD: CommandKind = CommandKind::batch_get_command; // all requests in a batch have the same region, epoch, term, replica_read let priority = requests[0].get_context().get_priority(); let metadata = TaskMetadata::from_ctx(requests[0].get_context().get_resource_control_context()); + let resource_group_name = requests[0] + .get_context() + .get_resource_control_context() + .get_resource_group_name(); + let 
group_priority = requests[0] + .get_context() + .get_resource_control_context() + .get_override_priority(); + let resource_priority = ResourcePriority::from(group_priority); let resource_limiter = self.resource_manager.as_ref().and_then(|r| { r.get_resource_limiter( - requests[0] - .get_context() - .get_resource_control_context() - .get_resource_group_name(), + resource_group_name, requests[0].get_context().get_request_source(), + group_priority, ) }); let concurrency_manager = self.concurrency_manager.clone(); @@ -805,7 +817,7 @@ impl Storage { KV_COMMAND_KEYREAD_HISTOGRAM_STATIC .get(CMD) .observe(requests.len() as f64); - let command_duration = tikv_util::time::Instant::now(); + let command_duration = Instant::now(); let read_id = Some(ThreadReadId::new()); let mut statistics = Statistics::default(); let mut req_snaps = vec![]; @@ -854,7 +866,7 @@ impl Storage { snap_ctx } Err(e) => { - consumer.consume(id, Err(e), begin_instant, source); + consumer.consume(id, Err(e), begin_instant, source, resource_priority); continue; } }; @@ -893,7 +905,13 @@ impl Storage { ) = req_snap; let snap_res = snap.await; if let Err(e) = deadline.check() { - consumer.consume(id, Err(Error::from(e)), begin_instant, source); + consumer.consume( + id, + Err(Error::from(e)), + begin_instant, + source, + resource_priority, + ); continue; } @@ -925,6 +943,7 @@ impl Storage { .map(|v| (v, stat)), begin_instant, source, + resource_priority, ); } Err(e) => { @@ -933,12 +952,13 @@ impl Storage { Err(Error::from(txn::Error::from(e))), begin_instant, source, + resource_priority, ); } } }), Err(e) => { - consumer.consume(id, Err(e), begin_instant, source); + consumer.consume(id, Err(e), begin_instant, source, resource_priority); } } } @@ -975,6 +995,7 @@ impl Storage { r.get_resource_limiter( ctx.get_resource_control_context().get_resource_group_name(), ctx.get_request_source(), + ctx.get_resource_control_context().get_override_priority(), ) }); let priority_tag = get_priority_tag(priority); @@ 
-1019,7 +1040,7 @@ impl Storage { keys.iter().map(Key::as_encoded), )?; - let command_duration = tikv_util::time::Instant::now(); + let command_duration = Instant::now(); let bypass_locks = TsSet::from_u64s(ctx.take_resolved_locks()); let access_locks = TsSet::from_u64s(ctx.take_committed_locks()); @@ -1086,12 +1107,15 @@ impl Storage { (result, stats) }); metrics::tls_collect_scan_details(CMD, &stats); + let now = Instant::now(); SCHED_PROCESSING_READ_HISTOGRAM_STATIC .get(CMD) - .observe(begin_instant.saturating_elapsed_secs()); - SCHED_HISTOGRAM_VEC_STATIC - .get(CMD) - .observe(command_duration.saturating_elapsed_secs()); + .observe(duration_to_sec( + now.saturating_duration_since(begin_instant), + )); + SCHED_HISTOGRAM_VEC_STATIC.get(CMD).observe(duration_to_sec( + now.saturating_duration_since(command_duration), + )); let read_bytes = stats.cf_statistics(CF_DEFAULT).flow_stats.read_bytes + stats.cf_statistics(CF_LOCK).flow_stats.read_bytes @@ -1164,6 +1188,7 @@ impl Storage { r.get_resource_limiter( ctx.get_resource_control_context().get_resource_group_name(), ctx.get_request_source(), + ctx.get_resource_control_context().get_override_priority(), ) }); let priority_tag = get_priority_tag(priority); @@ -1217,7 +1242,7 @@ impl Storage { if reverse_scan { std::mem::swap(&mut start_key, &mut end_key); } - let command_duration = tikv_util::time::Instant::now(); + let command_duration = Instant::now(); let bypass_locks = TsSet::from_u64s(ctx.take_resolved_locks()); let access_locks = TsSet::from_u64s(ctx.take_committed_locks()); @@ -1296,12 +1321,15 @@ impl Storage { &statistics, buckets.as_ref(), ); + let now = Instant::now(); SCHED_PROCESSING_READ_HISTOGRAM_STATIC .get(CMD) - .observe(begin_instant.saturating_elapsed_secs()); - SCHED_HISTOGRAM_VEC_STATIC - .get(CMD) - .observe(command_duration.saturating_elapsed_secs()); + .observe(duration_to_sec( + now.saturating_duration_since(begin_instant), + )); + SCHED_HISTOGRAM_VEC_STATIC.get(CMD).observe(duration_to_sec( 
+ now.saturating_duration_since(command_duration), + )); res.map_err(Error::from).map(|results| { KV_COMMAND_KEYREAD_HISTOGRAM_STATIC @@ -1337,6 +1365,7 @@ impl Storage { r.get_resource_limiter( ctx.get_resource_control_context().get_resource_group_name(), ctx.get_request_source(), + ctx.get_resource_control_context().get_override_priority(), ) }); let priority_tag = get_priority_tag(priority); @@ -1383,7 +1412,7 @@ impl Storage { // which resolves locks on regions, and boundary of regions will be out of range // of TiDB keys. - let command_duration = tikv_util::time::Instant::now(); + let command_duration = Instant::now(); concurrency_manager.update_max_ts(max_ts); let begin_instant = Instant::now(); @@ -1430,6 +1459,15 @@ impl Storage { Some(ScanMode::Forward), !ctx.get_not_fill_cache(), ); + let memory_locks = reader + .load_in_memory_pessimisitic_lock_range( + start_key.as_ref(), + end_key.as_ref(), + |_, lock| lock.start_ts <= max_ts, + limit, + ) + .map_err(txn::Error::from); + let (memory_lock_kv_pairs, _) = memory_locks?; let result = reader .scan_locks( start_key.as_ref(), @@ -1440,8 +1478,18 @@ impl Storage { .map_err(txn::Error::from); statistics.add(&reader.statistics); let (kv_pairs, _) = result?; - let mut locks = Vec::with_capacity(kv_pairs.len()); - for (key, lock) in kv_pairs { + + // Merge the results from in-memory pessimistic locks and the lock cf. + // The result order is decided by the key. 
+ let memory_lock_iter = memory_lock_kv_pairs.into_iter(); + let lock_iter = kv_pairs.into_iter(); + let merged_iter = memory_lock_iter + .merge_by(lock_iter, |(memory_key, _), (key, _)| memory_key <= key); + let mut locks = Vec::with_capacity(limit); + for (key, lock) in merged_iter { + if limit > 0 && locks.len() >= limit { + break; + } let lock_info = lock.into_lock_info(key.into_raw().map_err(txn::Error::from)?); locks.push(lock_info); @@ -1455,12 +1503,15 @@ impl Storage { &statistics, buckets.as_ref(), ); + let now = Instant::now(); SCHED_PROCESSING_READ_HISTOGRAM_STATIC .get(CMD) - .observe(begin_instant.saturating_elapsed_secs()); - SCHED_HISTOGRAM_VEC_STATIC - .get(CMD) - .observe(command_duration.saturating_elapsed_secs()); + .observe(duration_to_sec( + now.saturating_duration_since(begin_instant), + )); + SCHED_HISTOGRAM_VEC_STATIC.get(CMD).observe(duration_to_sec( + now.saturating_duration_since(command_duration), + )); Ok(locks) }) @@ -1650,6 +1701,7 @@ impl Storage { r.get_resource_limiter( ctx.get_resource_control_context().get_resource_group_name(), ctx.get_request_source(), + ctx.get_resource_control_context().get_override_priority(), ) }); let priority_tag = get_priority_tag(priority); @@ -1669,7 +1721,7 @@ impl Storage { Self::check_api_version(api_version, ctx.api_version, CMD, [&key])?; - let command_duration = tikv_util::time::Instant::now(); + let command_duration = Instant::now(); let snap_ctx = SnapContext { pb_ctx: &ctx, ..Default::default() @@ -1704,12 +1756,15 @@ impl Storage { &stats, buckets.as_ref(), ); + let now = Instant::now(); SCHED_PROCESSING_READ_HISTOGRAM_STATIC .get(CMD) - .observe(begin_instant.saturating_elapsed_secs()); - SCHED_HISTOGRAM_VEC_STATIC - .get(CMD) - .observe(command_duration.saturating_elapsed_secs()); + .observe(duration_to_sec( + now.saturating_duration_since(begin_instant), + )); + SCHED_HISTOGRAM_VEC_STATIC.get(CMD).observe(duration_to_sec( + now.saturating_duration_since(command_duration), + )); r } } @@ 
-1732,13 +1787,20 @@ impl Storage { // all requests in a batch have the same region, epoch, term, replica_read let priority = gets[0].get_context().get_priority(); let metadata = TaskMetadata::from_ctx(gets[0].get_context().get_resource_control_context()); + let resource_group_name = gets[0] + .get_context() + .get_resource_control_context() + .get_resource_group_name(); + let group_priority = gets[0] + .get_context() + .get_resource_control_context() + .get_override_priority(); + let resource_priority = ResourcePriority::from(group_priority); let resource_limiter = self.resource_manager.as_ref().and_then(|r| { r.get_resource_limiter( - gets[0] - .get_context() - .get_resource_control_context() - .get_resource_group_name(), + resource_group_name, gets[0].get_context().get_request_source(), + group_priority, ) }); let priority_tag = get_priority_tag(priority); @@ -1776,7 +1838,7 @@ impl Storage { .map_err(Error::from)?; } - let command_duration = tikv_util::time::Instant::now(); + let command_duration = Instant::now(); let read_id = Some(ThreadReadId::new()); let mut snaps = vec![]; for (mut req, id) in gets.into_iter().zip(ids) { @@ -1820,6 +1882,7 @@ impl Storage { .map_err(Error::from), begin_instant, ctx.take_request_source(), + resource_priority, ); tls_collect_read_flow( ctx.get_region_id(), @@ -1835,22 +1898,32 @@ impl Storage { Err(e), begin_instant, ctx.take_request_source(), + resource_priority, ); } } } Err(e) => { - consumer.consume(id, Err(e), begin_instant, ctx.take_request_source()); + consumer.consume( + id, + Err(e), + begin_instant, + ctx.take_request_source(), + resource_priority, + ); } } } + let now = Instant::now(); SCHED_PROCESSING_READ_HISTOGRAM_STATIC .get(CMD) - .observe(begin_instant.saturating_elapsed_secs()); - SCHED_HISTOGRAM_VEC_STATIC - .get(CMD) - .observe(command_duration.saturating_elapsed_secs()); + .observe(duration_to_sec( + now.saturating_duration_since(begin_instant), + )); + 
SCHED_HISTOGRAM_VEC_STATIC.get(CMD).observe(duration_to_sec( + now.saturating_duration_since(command_duration), + )); Ok(()) } .in_resource_metering_tag(resource_tag), @@ -1875,6 +1948,7 @@ impl Storage { r.get_resource_limiter( ctx.get_resource_control_context().get_resource_group_name(), ctx.get_request_source(), + ctx.get_resource_control_context().get_override_priority(), ) }); let priority_tag = get_priority_tag(priority); @@ -1896,7 +1970,7 @@ impl Storage { Self::check_api_version(api_version, ctx.api_version, CMD, &keys)?; - let command_duration = tikv_util::time::Instant::now(); + let command_duration = Instant::now(); let snap_ctx = SnapContext { pb_ctx: &ctx, ..Default::default() @@ -1947,12 +2021,15 @@ impl Storage { KV_COMMAND_KEYREAD_HISTOGRAM_STATIC .get(CMD) .observe(stats.data.flow_stats.read_keys as f64); + let now = Instant::now(); SCHED_PROCESSING_READ_HISTOGRAM_STATIC .get(CMD) - .observe(begin_instant.saturating_elapsed_secs()); - SCHED_HISTOGRAM_VEC_STATIC - .get(CMD) - .observe(command_duration.saturating_elapsed_secs()); + .observe(duration_to_sec( + now.saturating_duration_since(begin_instant), + )); + SCHED_HISTOGRAM_VEC_STATIC.get(CMD).observe(duration_to_sec( + now.saturating_duration_since(command_duration), + )); Ok(result) } } @@ -2028,7 +2105,7 @@ impl Storage { if let Err(e) = deadline.check() { return callback(Err(Error::from(e))); } - let command_duration = tikv_util::time::Instant::now(); + let command_duration = Instant::now(); if let Err(e) = Self::check_causal_ts_flushed(&mut ctx, CMD).await { return callback(Err(e)); @@ -2140,7 +2217,7 @@ impl Storage { if let Err(e) = deadline.check() { return callback(Err(Error::from(e))); } - let command_duration = tikv_util::time::Instant::now(); + let command_duration = Instant::now(); if let Err(e) = Self::check_causal_ts_flushed(&mut ctx, CMD).await { return callback(Err(e)); @@ -2205,7 +2282,7 @@ impl Storage { if let Err(e) = deadline.check() { return callback(Err(Error::from(e))); 
} - let command_duration = tikv_util::time::Instant::now(); + let command_duration = Instant::now(); if let Err(e) = Self::check_causal_ts_flushed(&mut ctx, CMD).await { return callback(Err(e)); @@ -2266,7 +2343,7 @@ impl Storage { if let Err(e) = deadline.check() { return callback(Err(Error::from(e))); } - let command_duration = tikv_util::time::Instant::now(); + let command_duration = Instant::now(); let start_key = F::encode_raw_key_owned(start_key, None); let end_key = F::encode_raw_key_owned(end_key, None); @@ -2314,7 +2391,7 @@ impl Storage { if let Err(e) = deadline.check() { return callback(Err(Error::from(e))); } - let command_duration = tikv_util::time::Instant::now(); + let command_duration = Instant::now(); if let Err(e) = Self::check_causal_ts_flushed(&mut ctx, CMD).await { return callback(Err(e)); @@ -2378,6 +2455,7 @@ impl Storage { r.get_resource_limiter( ctx.get_resource_control_context().get_resource_group_name(), ctx.get_request_source(), + ctx.get_resource_control_context().get_override_priority(), ) }); let priority_tag = get_priority_tag(priority); @@ -2400,7 +2478,7 @@ impl Storage { [(Some(&start_key), end_key.as_ref())], )?; - let command_duration = tikv_util::time::Instant::now(); + let command_duration = Instant::now(); let snap_ctx = SnapContext { pb_ctx: &ctx, ..Default::default() @@ -2477,12 +2555,15 @@ impl Storage { .get(CMD) .observe(statistics.data.flow_stats.read_keys as f64); metrics::tls_collect_scan_details(CMD, &statistics); + let now = Instant::now(); SCHED_PROCESSING_READ_HISTOGRAM_STATIC .get(CMD) - .observe(begin_instant.saturating_elapsed_secs()); - SCHED_HISTOGRAM_VEC_STATIC - .get(CMD) - .observe(command_duration.saturating_elapsed_secs()); + .observe(duration_to_sec( + now.saturating_duration_since(begin_instant), + )); + SCHED_HISTOGRAM_VEC_STATIC.get(CMD).observe(duration_to_sec( + now.saturating_duration_since(command_duration), + )); result } @@ -2512,6 +2593,7 @@ impl Storage { r.get_resource_limiter( 
ctx.get_resource_control_context().get_resource_group_name(), ctx.get_request_source(), + ctx.get_resource_control_context().get_override_priority(), ) }); let priority_tag = get_priority_tag(priority); @@ -2542,7 +2624,7 @@ impl Storage { .map(|range| (Some(range.get_start_key()), Some(range.get_end_key()))), )?; - let command_duration = tikv_util::time::Instant::now(); + let command_duration = Instant::now(); let snap_ctx = SnapContext { pb_ctx: &ctx, ..Default::default() @@ -2640,12 +2722,15 @@ impl Storage { .get(CMD) .observe(statistics.data.flow_stats.read_keys as f64); metrics::tls_collect_scan_details(CMD, &statistics); + let now = Instant::now(); SCHED_PROCESSING_READ_HISTOGRAM_STATIC .get(CMD) - .observe(begin_instant.saturating_elapsed_secs()); - SCHED_HISTOGRAM_VEC_STATIC - .get(CMD) - .observe(command_duration.saturating_elapsed_secs()); + .observe(duration_to_sec( + now.saturating_duration_since(begin_instant), + )); + SCHED_HISTOGRAM_VEC_STATIC.get(CMD).observe(duration_to_sec( + now.saturating_duration_since(command_duration), + )); Ok(result) } } @@ -2671,6 +2756,7 @@ impl Storage { r.get_resource_limiter( ctx.get_resource_control_context().get_resource_group_name(), ctx.get_request_source(), + ctx.get_resource_control_context().get_override_priority(), ) }); let priority_tag = get_priority_tag(priority); @@ -2690,7 +2776,7 @@ impl Storage { Self::check_api_version(api_version, ctx.api_version, CMD, [&key])?; - let command_duration = tikv_util::time::Instant::now(); + let command_duration = Instant::now(); let snap_ctx = SnapContext { pb_ctx: &ctx, ..Default::default() @@ -2725,12 +2811,15 @@ impl Storage { &stats, buckets.as_ref(), ); + let now = Instant::now(); SCHED_PROCESSING_READ_HISTOGRAM_STATIC .get(CMD) - .observe(begin_instant.saturating_elapsed_secs()); - SCHED_HISTOGRAM_VEC_STATIC - .get(CMD) - .observe(command_duration.saturating_elapsed_secs()); + .observe(duration_to_sec( + now.saturating_duration_since(begin_instant), + )); + 
SCHED_HISTOGRAM_VEC_STATIC.get(CMD).observe(duration_to_sec( + now.saturating_duration_since(command_duration), + )); r } } @@ -2849,6 +2938,7 @@ impl Storage { r.get_resource_limiter( ctx.get_resource_control_context().get_resource_group_name(), ctx.get_request_source(), + ctx.get_resource_control_context().get_override_priority(), ) }); let priority_tag = get_priority_tag(priority); @@ -2887,7 +2977,7 @@ impl Storage { range.set_end_key(end_key.into_encoded()); } - let command_duration = tikv_util::time::Instant::now(); + let command_duration = Instant::now(); let snap_ctx = SnapContext { pb_ctx: &ctx, ..Default::default() @@ -2898,7 +2988,7 @@ impl Storage { let store = RawStore::new(snapshot, api_version); let cf = Self::rawkv_cf("", api_version)?; - let begin_instant = tikv_util::time::Instant::now(); + let begin_instant = Instant::now(); let mut stats = Vec::with_capacity(ranges.len()); let ret = store .raw_checksum_ranges(cf, &ranges, &mut stats) @@ -2913,12 +3003,15 @@ impl Storage { buckets.as_ref(), ); }); + let now = Instant::now(); SCHED_PROCESSING_READ_HISTOGRAM_STATIC .get(CMD) - .observe(begin_instant.saturating_elapsed().as_secs_f64()); - SCHED_HISTOGRAM_VEC_STATIC - .get(CMD) - .observe(command_duration.saturating_elapsed().as_secs_f64()); + .observe(duration_to_sec( + now.saturating_duration_since(begin_instant), + )); + SCHED_HISTOGRAM_VEC_STATIC.get(CMD).observe(duration_to_sec( + now.saturating_duration_since(command_duration), + )); ret } @@ -3293,7 +3386,8 @@ impl TestStorageBuilder { } else { None }; - + let manager = Arc::new(ResourceGroupManager::default()); + let resource_ctl = manager.derive_controller("test".into(), false); Storage::from_engine( self.engine, &self.config, @@ -3311,11 +3405,8 @@ impl TestStorageBuilder { Arc::new(QuotaLimiter::default()), latest_feature_gate(), ts_provider, - Some(Arc::new(ResourceController::new_for_test( - "test".to_owned(), - false, - ))), - None, + Some(resource_ctl), + Some(manager), ) } @@ -3328,7 
+3419,8 @@ impl TestStorageBuilder { &crate::config::StorageReadPoolConfig::default_for_test(), engine.clone(), ); - + let manager = Arc::new(ResourceGroupManager::default()); + let resource_ctl = manager.derive_controller("test".into(), false); Storage::from_engine( engine, &self.config, @@ -3346,16 +3438,14 @@ impl TestStorageBuilder { Arc::new(QuotaLimiter::default()), latest_feature_gate(), None, - Some(Arc::new(ResourceController::new_for_test( - "test".to_owned(), - false, - ))), - None, + Some(resource_ctl), + Some(manager), ) } pub fn build_for_resource_controller( self, + resource_manager: Arc, resource_controller: Arc, ) -> Result, L, F>> { let engine = TxnTestEngine { @@ -3385,7 +3475,7 @@ impl TestStorageBuilder { latest_feature_gate(), None, Some(resource_controller), - None, + Some(resource_manager), ) } } @@ -3397,6 +3487,7 @@ pub trait ResponseBatchConsumer: Send { res: Result, begin: Instant, request_source: String, + resource_priority: ResourcePriority, ); } @@ -3695,8 +3786,9 @@ pub mod test_util { &self, id: u64, res: Result<(Option>, Statistics)>, - _: tikv_util::time::Instant, + _: Instant, _source: String, + _resource_priority: ResourcePriority, ) { self.data.lock().unwrap().push(GetResult { id, @@ -3710,8 +3802,9 @@ pub mod test_util { &self, id: u64, res: Result>>, - _: tikv_util::time::Instant, + _: Instant, _source: String, + _resource_priority: ResourcePriority, ) { self.data.lock().unwrap().push(GetResult { id, res }); } @@ -3826,6 +3919,7 @@ mod tests { commands, commands::{AcquirePessimisticLock, Prewrite}, tests::must_rollback, + txn_status_cache::TxnStatusCache, Error as TxnError, ErrorInner as TxnErrorInner, }, types::{PessimisticLockKeyResult, PessimisticLockResults}, @@ -3857,6 +3951,7 @@ mod tests { statistics: &mut Statistics::default(), async_apply_prewrite: false, raw_ext: None, + txn_status_cache: &TxnStatusCache::new_for_test(), }, ) .unwrap(); @@ -7321,6 +7416,126 @@ mod tests { ); } + #[test] + fn 
test_scan_lock_with_memory_lock() { + for in_memory_pessimistic_lock_enabled in [false, true] { + let txn_ext = Arc::new(TxnExt::default()); + let lock_mgr = MockLockManager::new(); + let storage = TestStorageBuilderApiV1::new(lock_mgr.clone()) + .pipelined_pessimistic_lock(in_memory_pessimistic_lock_enabled) + .in_memory_pessimistic_lock(in_memory_pessimistic_lock_enabled) + .build_for_txn(txn_ext.clone()) + .unwrap(); + let (tx, rx) = channel(); + storage + .sched_txn_command( + commands::AcquirePessimisticLock::new( + vec![(Key::from_raw(b"a"), false), (Key::from_raw(b"b"), false)], + b"a".to_vec(), + 20.into(), + 3000, + true, + 20.into(), + Some(WaitTimeout::Millis(1000)), + false, + 21.into(), + false, + false, + false, + Context::default(), + ), + expect_ok_callback(tx.clone(), 0), + ) + .unwrap(); + rx.recv().unwrap(); + if in_memory_pessimistic_lock_enabled { + // Check if the lock exists in the memory buffer. + let pessimistic_locks = txn_ext.pessimistic_locks.read(); + let lock = pessimistic_locks.get(&Key::from_raw(b"a")).unwrap(); + assert_eq!( + lock, + &( + PessimisticLock { + primary: Box::new(*b"a"), + start_ts: 20.into(), + ttl: 3000, + for_update_ts: 20.into(), + min_commit_ts: 21.into(), + last_change: LastChange::NotExist, + is_locked_with_conflict: false, + }, + false + ) + ); + } + + storage + .sched_txn_command( + commands::Prewrite::with_defaults( + vec![ + Mutation::make_put(Key::from_raw(b"x"), b"foo".to_vec()), + Mutation::make_put(Key::from_raw(b"y"), b"foo".to_vec()), + Mutation::make_put(Key::from_raw(b"z"), b"foo".to_vec()), + ], + b"x".to_vec(), + 10.into(), + ), + expect_ok_callback(tx, 0), + ) + .unwrap(); + rx.recv().unwrap(); + + let (lock_a, lock_b, lock_x, lock_y, lock_z) = ( + { + let mut lock = LockInfo::default(); + lock.set_primary_lock(b"a".to_vec()); + lock.set_lock_version(20); + lock.set_lock_for_update_ts(20); + lock.set_key(b"a".to_vec()); + lock.set_min_commit_ts(21); + lock.set_lock_type(Op::PessimisticLock); + 
lock.set_lock_ttl(3000); + lock + }, + { + let mut lock = LockInfo::default(); + lock.set_primary_lock(b"a".to_vec()); + lock.set_lock_version(20); + lock.set_lock_for_update_ts(20); + lock.set_key(b"b".to_vec()); + lock.set_min_commit_ts(21); + lock.set_lock_type(Op::PessimisticLock); + lock.set_lock_ttl(3000); + lock + }, + { + let mut lock = LockInfo::default(); + lock.set_primary_lock(b"x".to_vec()); + lock.set_lock_version(10); + lock.set_key(b"x".to_vec()); + lock + }, + { + let mut lock = LockInfo::default(); + lock.set_primary_lock(b"x".to_vec()); + lock.set_lock_version(10); + lock.set_key(b"y".to_vec()); + lock + }, + { + let mut lock = LockInfo::default(); + lock.set_primary_lock(b"x".to_vec()); + lock.set_lock_version(10); + lock.set_key(b"z".to_vec()); + lock + }, + ); + let res = block_on(storage.scan_lock(Context::default(), 101.into(), None, None, 10)) + .unwrap(); + assert_eq!(res, vec![lock_a, lock_b, lock_x, lock_y, lock_z,]); + } + } + #[test] fn test_scan_lock() { let storage = TestStorageBuilderApiV1::new(MockLockManager::new()) @@ -10842,4 +11057,507 @@ mod tests { // Prewrite still succeeds rx.recv().unwrap().unwrap(); } + + #[test] + fn test_prewrite_cached_committed_transaction_do_not_skip_constraint_check() { + let storage = TestStorageBuilderApiV1::new(MockLockManager::new()) + .build() + .unwrap(); + let cm = storage.concurrency_manager.clone(); + let k1 = Key::from_raw(b"k1"); + let pk = b"pk"; + // Simulate the case that the current TiKV instance have a non-unique + // index key of a pessimistic transaction. It won't be pessimistic + // locked, and prewrite skips constraint checks. + // Simulate the case that a prewrite is performed twice, with async + // commit enabled, and max_ts changes when the second request arrives. + + // A retrying prewrite request arrives. 
+ cm.update_max_ts(20.into()); + let mut ctx = Context::default(); + ctx.set_is_retry_request(true); + let (tx, rx) = channel(); + storage + .sched_txn_command( + commands::PrewritePessimistic::new( + vec![( + Mutation::make_put(k1.clone(), b"v".to_vec()), + SkipPessimisticCheck, + )], + pk.to_vec(), + 10.into(), + 3000, + 10.into(), + 1, + 11.into(), + 0.into(), + Some(vec![]), + false, + AssertionLevel::Off, + vec![], + ctx, + ), + Box::new(move |res| { + tx.send(res).unwrap(); + }), + ) + .unwrap(); + + let res = rx.recv().unwrap().unwrap(); + assert_eq!(res.min_commit_ts, 21.into()); + + // Commit it. + let (tx, rx) = channel(); + storage + .sched_txn_command( + commands::Commit::new(vec![k1.clone()], 10.into(), 21.into(), Context::default()), + expect_ok_callback(tx, 0), + ) + .unwrap(); + rx.recv().unwrap(); + + // The txn's status is cached + assert_eq!( + storage + .sched + .get_txn_status_cache() + .get_no_promote(10.into()) + .unwrap(), + 21.into() + ); + + // Check committed; push max_ts to 30 + assert_eq!( + block_on(storage.get(Context::default(), k1.clone(), 30.into())) + .unwrap() + .0, + Some(b"v".to_vec()) + ); + + let (tx, rx) = channel(); + storage + .sched_txn_command( + commands::PrewritePessimistic::new( + vec![( + Mutation::make_put(k1.clone(), b"v".to_vec()), + SkipPessimisticCheck, + )], + pk.to_vec(), + 10.into(), + 3000, + 10.into(), + 1, + 11.into(), + 0.into(), + Some(vec![]), + false, + AssertionLevel::Off, + vec![], + Context::default(), + ), + Box::new(move |res| { + tx.send(res).unwrap(); + }), + ) + .unwrap(); + let res = rx.recv().unwrap().unwrap(); + assert_eq!(res.min_commit_ts, 21.into()); + + // Key must not be locked. 
+ assert_eq!( + block_on(storage.get(Context::default(), k1, 50.into())) + .unwrap() + .0, + Some(b"v".to_vec()) + ); + } + + #[test] + fn test_updating_txn_status_cache() { + let storage = TestStorageBuilderApiV1::new(MockLockManager::new()) + .build() + .unwrap(); + let cm = storage.concurrency_manager.clone(); + + // Commit + let (tx, rx) = channel(); + storage + .sched_txn_command( + commands::PrewritePessimistic::new( + vec![( + Mutation::make_put(Key::from_raw(b"k1"), b"v1".to_vec()), + SkipPessimisticCheck, + )], + b"k1".to_vec(), + 10.into(), + 3000, + 10.into(), + 1, + 11.into(), + 0.into(), + Some(vec![]), + false, + AssertionLevel::Off, + vec![], + Context::default(), + ), + expect_ok_callback(tx.clone(), 0), + ) + .unwrap(); + rx.recv().unwrap(); + assert!( + storage + .sched + .get_txn_status_cache() + .get_no_promote(10.into()) + .is_none() + ); + + storage + .sched_txn_command( + commands::Commit::new( + vec![Key::from_raw(b"k1")], + 10.into(), + 20.into(), + Context::default(), + ), + expect_ok_callback(tx.clone(), 0), + ) + .unwrap(); + rx.recv().unwrap(); + assert_eq!( + storage + .sched + .get_txn_status_cache() + .get_no_promote(10.into()) + .unwrap(), + 20.into() + ); + + // Unsuccessful commit won't update cache + storage + .sched_txn_command( + commands::Commit::new( + vec![Key::from_raw(b"k2")], + 30.into(), + 40.into(), + Context::default(), + ), + expect_fail_callback(tx, 0, |_| ()), + ) + .unwrap(); + rx.recv().unwrap(); + assert!( + storage + .sched + .get_txn_status_cache() + .get_no_promote(30.into()) + .is_none() + ); + + // 1PC update + let (tx, rx) = channel(); + cm.update_max_ts(59.into()); + storage + .sched_txn_command( + Prewrite::new( + vec![Mutation::make_put(Key::from_raw(b"k3"), b"v3".to_vec())], + b"k3".to_vec(), + 50.into(), + 3000, + false, + 1, + 51.into(), + 0.into(), + Some(vec![]), + true, + AssertionLevel::Off, + Context::default(), + ), + Box::new(move |res| { + tx.send(res).unwrap(); + }), + ) + .unwrap(); + let 
res = rx.recv().unwrap().unwrap(); + assert_eq!(res.one_pc_commit_ts, 60.into()); + assert_eq!( + storage + .sched + .get_txn_status_cache() + .get_no_promote(50.into()) + .unwrap(), + 60.into() + ); + + // Resolve lock commit + let (tx, rx) = channel(); + storage + .sched_txn_command( + Prewrite::new( + vec![Mutation::make_put(Key::from_raw(b"k4"), b"v4".to_vec())], + b"pk".to_vec(), + 70.into(), + 3000, + false, + 1, + 0.into(), + 0.into(), + None, + false, + AssertionLevel::Off, + Context::default(), + ), + expect_ok_callback(tx.clone(), 0), + ) + .unwrap(); + rx.recv().unwrap(); + + storage + .sched_txn_command( + commands::ResolveLockReadPhase::new( + vec![(TimeStamp::from(70), TimeStamp::from(80))] + .into_iter() + .collect(), + None, + Context::default(), + ), + expect_ok_callback(tx.clone(), 0), + ) + .unwrap(); + rx.recv().unwrap(); + assert_eq!( + storage + .sched + .get_txn_status_cache() + .get_no_promote(70.into()) + .unwrap(), + 80.into() + ); + + // Resolve lock lite + storage + .sched_txn_command( + Prewrite::new( + vec![Mutation::make_put(Key::from_raw(b"k5"), b"v5".to_vec())], + b"pk".to_vec(), + 90.into(), + 3000, + false, + 1, + 0.into(), + 0.into(), + None, + false, + AssertionLevel::Off, + Context::default(), + ), + expect_ok_callback(tx.clone(), 0), + ) + .unwrap(); + rx.recv().unwrap(); + + storage + .sched_txn_command( + commands::ResolveLockLite::new( + 90.into(), + 100.into(), + vec![Key::from_raw(b"k5")], + Context::default(), + ), + expect_ok_callback(tx.clone(), 0), + ) + .unwrap(); + rx.recv().unwrap(); + assert_eq!( + storage + .sched + .get_txn_status_cache() + .get_no_promote(90.into()) + .unwrap(), + 100.into() + ); + + // CheckTxnStatus: uncommitted transaction + storage + .sched_txn_command( + commands::CheckTxnStatus::new( + Key::from_raw(b"k1"), + 9.into(), + 110.into(), + 110.into(), + true, + false, + false, + false, + Context::default(), + ), + expect_ok_callback(tx.clone(), 0), + ) + .unwrap(); + rx.recv().unwrap(); + 
assert!( + storage + .sched + .get_txn_status_cache() + .get_no_promote(9.into()) + .is_none() + ); + + // CheckTxnStatus: committed transaction + storage.sched.get_txn_status_cache().remove(10.into()); + storage + .sched_txn_command( + commands::CheckTxnStatus::new( + Key::from_raw(b"k1"), + 10.into(), + 110.into(), + 110.into(), + true, + false, + false, + false, + Context::default(), + ), + expect_ok_callback(tx.clone(), 0), + ) + .unwrap(); + rx.recv().unwrap(); + assert_eq!( + storage + .sched + .get_txn_status_cache() + .get_no_promote(10.into()) + .unwrap(), + 20.into() + ); + + // CheckSecondaryLocks: uncommitted transaction + storage + .sched_txn_command( + Prewrite::new( + vec![Mutation::make_put(Key::from_raw(b"k6"), b"v6".to_vec())], + b"pk".to_vec(), + 120.into(), + 3000, + false, + 1, + 0.into(), + 0.into(), + Some(vec![]), + false, + AssertionLevel::Off, + Context::default(), + ), + expect_ok_callback(tx.clone(), 0), + ) + .unwrap(); + rx.recv().unwrap(); + + // Lock exists but the transaction status is still unknown + storage + .sched_txn_command( + commands::CheckSecondaryLocks::new( + vec![Key::from_raw(b"k6")], + 120.into(), + Context::default(), + ), + expect_ok_callback(tx.clone(), 0), + ) + .unwrap(); + rx.recv().unwrap(); + assert!( + storage + .sched + .get_txn_status_cache() + .get_no_promote(120.into()) + .is_none() + ); + + // One of the lock doesn't exist so the transaction becomes rolled-back status. 
+ storage + .sched_txn_command( + commands::CheckSecondaryLocks::new( + vec![Key::from_raw(b"k6"), Key::from_raw(b"k7")], + 120.into(), + Context::default(), + ), + expect_ok_callback(tx.clone(), 0), + ) + .unwrap(); + rx.recv().unwrap(); + assert!( + storage + .sched + .get_txn_status_cache() + .get_no_promote(120.into()) + .is_none() + ); + + // CheckSecondaryLocks: committed transaction + storage + .sched_txn_command( + Prewrite::new( + vec![ + Mutation::make_put(Key::from_raw(b"k8"), b"v8".to_vec()), + Mutation::make_put(Key::from_raw(b"k9"), b"v9".to_vec()), + ], + b"pk".to_vec(), + 130.into(), + 3000, + false, + 1, + 0.into(), + 0.into(), + Some(vec![]), + false, + AssertionLevel::Off, + Context::default(), + ), + expect_ok_callback(tx.clone(), 0), + ) + .unwrap(); + rx.recv().unwrap(); + // Commit one of the key + storage + .sched_txn_command( + commands::Commit::new( + vec![Key::from_raw(b"k9")], + 130.into(), + 140.into(), + Context::default(), + ), + expect_ok_callback(tx.clone(), 0), + ) + .unwrap(); + rx.recv().unwrap(); + assert_eq!( + storage + .sched + .get_txn_status_cache() + .remove(130.into()) + .unwrap(), + 140.into() + ); + + storage + .sched_txn_command( + commands::CheckSecondaryLocks::new( + vec![Key::from_raw(b"k8"), Key::from_raw(b"k9")], + 130.into(), + Context::default(), + ), + expect_ok_callback(tx, 0), + ) + .unwrap(); + rx.recv().unwrap(); + assert_eq!( + storage + .sched + .get_txn_status_cache() + .get_no_promote(130.into()) + .unwrap(), + 140.into() + ); + } } diff --git a/src/storage/mvcc/metrics.rs b/src/storage/mvcc/metrics.rs index 3c4bda63f7e..eaef1134d81 100644 --- a/src/storage/mvcc/metrics.rs +++ b/src/storage/mvcc/metrics.rs @@ -36,6 +36,11 @@ make_static_metric! { write_not_loaded_skip } + pub label_enum ScanLockReadTimeSource { + resolve_lock, + pessimistic_rollback, + } + pub struct MvccConflictCounterVec: IntCounter { "type" => MvccConflictKind, } @@ -51,6 +56,17 @@ make_static_metric! 
{ pub struct MvccPrewriteAssertionPerfCounterVec: IntCounter { "type" => MvccPrewriteAssertionPerfKind, } + + pub struct MvccPrewriteRequestAfterCommitCounterVec: IntCounter { + "type" => { + non_retry_req, + retry_req, + }, + } + + pub struct ScanLockReadTimeVec: Histogram { + "type" => ScanLockReadTimeSource, + } } lazy_static! { @@ -104,4 +120,21 @@ lazy_static! { ) .unwrap() }; + pub static ref MVCC_PREWRITE_REQUEST_AFTER_COMMIT_COUNTER_VEC: MvccPrewriteRequestAfterCommitCounterVec = { + register_static_int_counter_vec!( + MvccPrewriteRequestAfterCommitCounterVec, + "tikv_storage_mvcc_prewrite_request_after_commit_counter", + "Counter of prewrite requests of already-committed transactions that are determined by checking TxnStatucCache", + &["type"] + ) + .unwrap() + }; + pub static ref SCAN_LOCK_READ_TIME_VEC: ScanLockReadTimeVec = register_static_histogram_vec!( + ScanLockReadTimeVec, + "tikv_storage_mvcc_scan_lock_read_duration_seconds", + "Bucketed histogram of memory lock read lock hold for scan lock", + &["type"], + exponential_buckets(0.00001, 2.0, 20).unwrap() + ) + .unwrap(); } diff --git a/src/storage/mvcc/reader/reader.rs b/src/storage/mvcc/reader/reader.rs index 48158eda946..257789b4765 100644 --- a/src/storage/mvcc/reader/reader.rs +++ b/src/storage/mvcc/reader/reader.rs @@ -8,9 +8,12 @@ use kvproto::{ errorpb::{self, EpochNotMatch, FlashbackInProgress, StaleCommand}, kvrpcpb::Context, }; -use raftstore::store::LocksStatus; +use raftstore::store::{LocksStatus, PeerPessimisticLocks}; use tikv_kv::{SnapshotExt, SEEK_BOUND}; -use txn_types::{Key, LastChange, Lock, OldValue, TimeStamp, Value, Write, WriteRef, WriteType}; +use tikv_util::time::Instant; +use txn_types::{ + Key, LastChange, Lock, OldValue, PessimisticLock, TimeStamp, Value, Write, WriteRef, WriteType, +}; use crate::storage::{ kv::{ @@ -18,6 +21,7 @@ use crate::storage::{ }, mvcc::{ default_not_found_error, + metrics::SCAN_LOCK_READ_TIME_VEC, reader::{OverlappedWrite, TxnCommitRecord}, 
Result, }, @@ -251,44 +255,76 @@ impl MvccReader { Ok(res) } - fn load_in_memory_pessimistic_lock(&self, key: &Key) -> Result> { - self.snapshot - .ext() - .get_txn_ext() - .and_then(|txn_ext| { - // If the term or region version has changed, do not read the lock table. - // Instead, just return a StaleCommand or EpochNotMatch error, so the - // client will not receive a false error because the lock table has been - // cleared. - let locks = txn_ext.pessimistic_locks.read(); - if self.term != 0 && locks.term != self.term { - let mut err = errorpb::Error::default(); - err.set_stale_command(StaleCommand::default()); - return Some(Err(KvError::from(err).into())); - } - if self.version != 0 && locks.version != self.version { - let mut err = errorpb::Error::default(); - // We don't know the current regions. Just return an empty EpochNotMatch error. - err.set_epoch_not_match(EpochNotMatch::default()); - return Some(Err(KvError::from(err).into())); - } - // If the region is in the flashback state, it should not be allowed to read the - // locks. - if locks.status == LocksStatus::IsInFlashback && !self.allow_in_flashback { - let mut err = errorpb::Error::default(); - err.set_flashback_in_progress(FlashbackInProgress::default()); - return Some(Err(KvError::from(err).into())); + fn check_term_version_status(&self, locks: &PeerPessimisticLocks) -> Result<()> { + // If the term or region version has changed, do not read the lock table. + // Instead, just return a StaleCommand or EpochNotMatch error, so the + // client will not receive a false error because the lock table has been + // cleared. 
+ if self.term != 0 && locks.term != self.term { + let mut err = errorpb::Error::default(); + err.set_stale_command(StaleCommand::default()); + return Err(KvError::from(err).into()); + } + if self.version != 0 && locks.version != self.version { + let mut err = errorpb::Error::default(); + err.set_epoch_not_match(EpochNotMatch::default()); + return Err(KvError::from(err).into()); + } + if locks.status == LocksStatus::IsInFlashback && !self.allow_in_flashback { + let mut err = errorpb::Error::default(); + err.set_flashback_in_progress(FlashbackInProgress::default()); + return Err(KvError::from(err).into()); + } + Ok(()) + } + + pub fn load_in_memory_pessimisitic_lock_range( + &self, + start_key: Option<&Key>, + end_key: Option<&Key>, + filter: F, + scan_limit: usize, + ) -> Result<(Vec<(Key, Lock)>, bool)> + where + F: Fn(&Key, &PessimisticLock) -> bool, + { + if let Some(txn_ext) = self.snapshot.ext().get_txn_ext() { + let begin_instant = Instant::now(); + let res = match self.check_term_version_status(&txn_ext.pessimistic_locks.read()) { + Ok(_) => { + // Scan locks within the specified range and filter by max_ts. + Ok(txn_ext + .pessimistic_locks + .read() + .scan_locks(start_key, end_key, filter, scan_limit)) } + Err(e) => Err(e), + }; + let elapsed = begin_instant.saturating_elapsed(); + SCAN_LOCK_READ_TIME_VEC + .resolve_lock + .observe(elapsed.as_secs_f64()); - locks.get(key).map(|(lock, _)| { - // For write commands that are executed in serial, it should be impossible - // to read a deleted lock. - // For read commands in the scheduler, it should read the lock marked deleted - // because the lock is not actually deleted from the underlying storage. 
- Ok(lock.to_lock()) - }) - }) - .transpose() + res + } else { + Ok((vec![], false)) + } + } + + fn load_in_memory_pessimistic_lock(&self, key: &Key) -> Result> { + if let Some(txn_ext) = self.snapshot.ext().get_txn_ext() { + let locks = txn_ext.pessimistic_locks.read(); + self.check_term_version_status(&locks)?; + Ok(locks.get(key).map(|(lock, _)| { + // For write commands that are executed in serial, it should be impossible + // to read a deleted lock. + // For read commands in the scheduler, it should read the lock marked deleted + // because the lock is not actually deleted from the underlying storage. + lock.to_lock() + })) + } else { + Ok(None) + } } fn get_scan_mode(&self, allow_backward: bool) -> ScanMode { diff --git a/src/storage/txn/actions/prewrite.rs b/src/storage/txn/actions/prewrite.rs index 90f739b8705..64e22a13585 100644 --- a/src/storage/txn/actions/prewrite.rs +++ b/src/storage/txn/actions/prewrite.rs @@ -239,6 +239,7 @@ impl LockStatus { } /// A single mutation to be prewritten. 
+#[derive(Debug)] struct PrewriteMutation<'a> { key: Key, value: Option, @@ -677,6 +678,12 @@ impl<'a> PrewriteMutation<'a> { if self.skip_constraint_check() { self.check_for_newer_version(reader)?; } + let (write, commit_ts) = write + .as_ref() + .map(|(w, ts)| (Some(w), Some(ts))) + .unwrap_or((None, None)); + error!("assertion failure"; "assertion" => ?self.assertion, "write" => ?write, + "commit_ts" => commit_ts, "mutation" => ?self); assertion_err?; } diff --git a/src/storage/txn/commands/acquire_pessimistic_lock.rs b/src/storage/txn/commands/acquire_pessimistic_lock.rs index 58c33706bbc..ceb7957c926 100644 --- a/src/storage/txn/commands/acquire_pessimistic_lock.rs +++ b/src/storage/txn/commands/acquire_pessimistic_lock.rs @@ -183,6 +183,7 @@ impl WriteCommand for AcquirePessimisticLock new_acquired_locks, lock_guards: vec![], response_policy: ResponsePolicy::OnProposed, + known_txn_status: vec![], }) } } diff --git a/src/storage/txn/commands/acquire_pessimistic_lock_resumed.rs b/src/storage/txn/commands/acquire_pessimistic_lock_resumed.rs index 7640edd7c0c..a1e2e6fc119 100644 --- a/src/storage/txn/commands/acquire_pessimistic_lock_resumed.rs +++ b/src/storage/txn/commands/acquire_pessimistic_lock_resumed.rs @@ -194,6 +194,7 @@ impl WriteCommand for AcquirePessimisticLockR new_acquired_locks, lock_guards: vec![], response_policy: ResponsePolicy::OnProposed, + known_txn_status: vec![], }) } } @@ -239,6 +240,7 @@ mod tests { txn::{ commands::pessimistic_rollback::tests::must_success as must_pessimistic_rollback, tests::{must_commit, must_pessimistic_locked, must_prewrite_put, must_rollback}, + txn_status_cache::TxnStatusCache, }, TestEngineBuilder, }; @@ -275,6 +277,7 @@ mod tests { statistics: &mut Default::default(), async_apply_prewrite: false, raw_ext: None, + txn_status_cache: &TxnStatusCache::new_for_test(), }, ) .unwrap(); diff --git a/src/storage/txn/commands/atomic_store.rs b/src/storage/txn/commands/atomic_store.rs index 9a54895e7e2..4bca5d514c5 100644 
--- a/src/storage/txn/commands/atomic_store.rs +++ b/src/storage/txn/commands/atomic_store.rs @@ -63,6 +63,7 @@ impl WriteCommand for RawAtomicStore { new_acquired_locks: vec![], lock_guards: raw_ext.into_iter().map(|r| r.key_guard).collect(), response_policy: ResponsePolicy::OnApplied, + known_txn_status: vec![], }) } } @@ -77,7 +78,9 @@ mod tests { use super::*; use crate::storage::{ - lock_manager::MockLockManager, txn::scheduler::get_raw_ext, Statistics, TestEngineBuilder, + lock_manager::MockLockManager, + txn::{scheduler::get_raw_ext, txn_status_cache::TxnStatusCache}, + Statistics, TestEngineBuilder, }; #[test] @@ -116,6 +119,7 @@ mod tests { statistics: &mut statistic, async_apply_prewrite: false, raw_ext, + txn_status_cache: &TxnStatusCache::new_for_test(), }; let cmd: Command = cmd.into(); let write_result = cmd.process_write(snap, context).unwrap(); diff --git a/src/storage/txn/commands/check_secondary_locks.rs b/src/storage/txn/commands/check_secondary_locks.rs index 92985c4d90d..ceb169f79b2 100644 --- a/src/storage/txn/commands/check_secondary_locks.rs +++ b/src/storage/txn/commands/check_secondary_locks.rs @@ -201,6 +201,12 @@ impl WriteCommand for CheckSecondaryLocks { } } + let write_result_known_txn_status = + if let SecondaryLocksStatus::Committed(commit_ts) = &result { + vec![(self.start_ts, *commit_ts)] + } else { + vec![] + }; let mut rows = 0; if let SecondaryLocksStatus::RolledBack = &result { // One row is mutated only when a secondary lock is rolled back. 
@@ -220,6 +226,7 @@ impl WriteCommand for CheckSecondaryLocks { new_acquired_locks, lock_guards: vec![], response_policy: ResponsePolicy::OnApplied, + known_txn_status: write_result_known_txn_status, }) } } @@ -235,7 +242,10 @@ pub mod tests { kv::TestEngineBuilder, lock_manager::MockLockManager, mvcc::tests::*, - txn::{commands::WriteCommand, scheduler::DEFAULT_EXECUTION_DURATION_LIMIT, tests::*}, + txn::{ + commands::WriteCommand, scheduler::DEFAULT_EXECUTION_DURATION_LIMIT, tests::*, + txn_status_cache::TxnStatusCache, + }, Engine, }; @@ -265,6 +275,7 @@ pub mod tests { statistics: &mut Default::default(), async_apply_prewrite: false, raw_ext: None, + txn_status_cache: &TxnStatusCache::new_for_test(), }, ) .unwrap(); @@ -303,6 +314,7 @@ pub mod tests { statistics: &mut Default::default(), async_apply_prewrite: false, raw_ext: None, + txn_status_cache: &TxnStatusCache::new_for_test(), }, ) .unwrap(); diff --git a/src/storage/txn/commands/check_txn_status.rs b/src/storage/txn/commands/check_txn_status.rs index dc99ebf3b01..9e9a6cc0895 100644 --- a/src/storage/txn/commands/check_txn_status.rs +++ b/src/storage/txn/commands/check_txn_status.rs @@ -131,6 +131,12 @@ impl WriteCommand for CheckTxnStatus { let mut released_locks = ReleasedLocks::new(); released_locks.push(released); + let write_result_known_txn_status = if let TxnStatus::Committed { commit_ts } = &txn_status + { + vec![(self.lock_ts, *commit_ts)] + } else { + vec![] + }; let pr = ProcessResult::TxnStatus { txn_status }; let new_acquired_locks = txn.take_new_locks(); let mut write_data = WriteData::from_modifies(txn.into_modifies()); @@ -145,6 +151,7 @@ impl WriteCommand for CheckTxnStatus { new_acquired_locks, lock_guards: vec![], response_policy: ResponsePolicy::OnApplied, + known_txn_status: write_result_known_txn_status, }) } } @@ -168,6 +175,7 @@ pub mod tests { commands::{pessimistic_rollback, WriteCommand, WriteContext}, scheduler::DEFAULT_EXECUTION_DURATION_LIMIT, tests::*, + 
txn_status_cache::TxnStatusCache, }, types::TxnStatus, ProcessResult, TestEngineBuilder, @@ -211,6 +219,7 @@ pub mod tests { statistics: &mut Default::default(), async_apply_prewrite: false, raw_ext: None, + txn_status_cache: &TxnStatusCache::new_for_test(), }, ) .unwrap(); @@ -259,6 +268,7 @@ pub mod tests { statistics: &mut Default::default(), async_apply_prewrite: false, raw_ext: None, + txn_status_cache: &TxnStatusCache::new_for_test(), }, ) .map(|r| { diff --git a/src/storage/txn/commands/cleanup.rs b/src/storage/txn/commands/cleanup.rs index 302c4fe1308..886094a7f34 100644 --- a/src/storage/txn/commands/cleanup.rs +++ b/src/storage/txn/commands/cleanup.rs @@ -80,6 +80,7 @@ impl WriteCommand for Cleanup { new_acquired_locks, lock_guards: vec![], response_policy: ResponsePolicy::OnApplied, + known_txn_status: vec![], }) } } diff --git a/src/storage/txn/commands/commit.rs b/src/storage/txn/commands/commit.rs index 4f05df8fe83..8daff9b2aee 100644 --- a/src/storage/txn/commands/commit.rs +++ b/src/storage/txn/commands/commit.rs @@ -80,6 +80,7 @@ impl WriteCommand for Commit { new_acquired_locks, lock_guards: vec![], response_policy: ResponsePolicy::OnApplied, + known_txn_status: vec![(self.lock_ts, self.commit_ts)], }) } } diff --git a/src/storage/txn/commands/compare_and_swap.rs b/src/storage/txn/commands/compare_and_swap.rs index ca9213b57d3..3725de47273 100644 --- a/src/storage/txn/commands/compare_and_swap.rs +++ b/src/storage/txn/commands/compare_and_swap.rs @@ -117,6 +117,7 @@ impl WriteCommand for RawCompareAndSwap { new_acquired_locks: vec![], lock_guards, response_policy: ResponsePolicy::OnApplied, + known_txn_status: vec![], }) } } @@ -134,8 +135,9 @@ mod tests { use super::*; use crate::storage::{ - lock_manager::MockLockManager, txn::scheduler::get_raw_ext, Engine, Statistics, - TestEngineBuilder, + lock_manager::MockLockManager, + txn::{scheduler::get_raw_ext, txn_status_cache::TxnStatusCache}, + Engine, Statistics, TestEngineBuilder, }; #[test] @@ 
-215,6 +217,7 @@ mod tests { statistics: &mut statistic, async_apply_prewrite: false, raw_ext, + txn_status_cache: &TxnStatusCache::new_for_test(), }; let ret = cmd.cmd.process_write(snap, context)?; match ret.pr { @@ -269,6 +272,7 @@ mod tests { statistics: &mut statistic, async_apply_prewrite: false, raw_ext, + txn_status_cache: &TxnStatusCache::new_for_test(), }; let cmd: Command = cmd.into(); let write_result = cmd.process_write(snap, context).unwrap(); diff --git a/src/storage/txn/commands/flashback_to_version.rs b/src/storage/txn/commands/flashback_to_version.rs index 37d288fa266..efbeefa2494 100644 --- a/src/storage/txn/commands/flashback_to_version.rs +++ b/src/storage/txn/commands/flashback_to_version.rs @@ -185,6 +185,7 @@ impl WriteCommand for FlashbackToVersion { new_acquired_locks: vec![], lock_guards: vec![], response_policy: ResponsePolicy::OnApplied, + known_txn_status: vec![], }) } } diff --git a/src/storage/txn/commands/mod.rs b/src/storage/txn/commands/mod.rs index 5896d6562f1..dabef707e61 100644 --- a/src/storage/txn/commands/mod.rs +++ b/src/storage/txn/commands/mod.rs @@ -70,7 +70,7 @@ use crate::storage::{ }, metrics, mvcc::{Lock as MvccLock, MvccReader, ReleasedLock, SnapshotReader}, - txn::{latch, ProcessResult, Result}, + txn::{latch, txn_status_cache::TxnStatusCache, ProcessResult, Result}, types::{ MvccInfo, PessimisticLockParameters, PessimisticLockResults, PrewriteResult, SecondaryLocksStatus, StorageCallbackType, TxnStatus, @@ -422,6 +422,12 @@ pub struct WriteResult { pub new_acquired_locks: Vec, pub lock_guards: Vec, pub response_policy: ResponsePolicy, + /// The txn status that can be inferred by the successful writing. This will + /// be used to update the cache. + /// + /// Currently only commit_ts of committed transactions will be collected. + /// Rolled-back transactions may also be collected in the future. 
+ pub known_txn_status: Vec<(TimeStamp, TimeStamp)>, } pub struct WriteResultLockInfo { @@ -573,6 +579,7 @@ pub struct WriteContext<'a, L: LockManager> { pub statistics: &'a mut Statistics, pub async_apply_prewrite: bool, pub raw_ext: Option, // use for apiv2 + pub txn_status_cache: &'a TxnStatusCache, } pub struct ReaderWithStats<'a, S: Snapshot> { @@ -823,6 +830,7 @@ pub mod test_util { statistics, async_apply_prewrite: false, raw_ext: None, + txn_status_cache: &TxnStatusCache::new_for_test(), }; let ret = cmd.cmd.process_write(snap, context)?; let res = match ret.pr { @@ -983,6 +991,7 @@ pub mod test_util { statistics, async_apply_prewrite: false, raw_ext: None, + txn_status_cache: &TxnStatusCache::new_for_test(), }; let ret = cmd.cmd.process_write(snap, context)?; @@ -1008,6 +1017,7 @@ pub mod test_util { statistics, async_apply_prewrite: false, raw_ext: None, + txn_status_cache: &TxnStatusCache::new_for_test(), }; let ret = cmd.cmd.process_write(snap, context)?; diff --git a/src/storage/txn/commands/pause.rs b/src/storage/txn/commands/pause.rs index 5d3aa7f6d2f..1f5d40b2d4e 100644 --- a/src/storage/txn/commands/pause.rs +++ b/src/storage/txn/commands/pause.rs @@ -53,6 +53,7 @@ impl WriteCommand for Pause { new_acquired_locks: vec![], lock_guards: vec![], response_policy: ResponsePolicy::OnApplied, + known_txn_status: vec![], }) } } diff --git a/src/storage/txn/commands/pessimistic_rollback.rs b/src/storage/txn/commands/pessimistic_rollback.rs index 4e0bf8c8c56..531eb256c40 100644 --- a/src/storage/txn/commands/pessimistic_rollback.rs +++ b/src/storage/txn/commands/pessimistic_rollback.rs @@ -96,6 +96,7 @@ impl WriteCommand for PessimisticRollback { new_acquired_locks, lock_guards: vec![], response_policy: ResponsePolicy::OnApplied, + known_txn_status: vec![], }) } } @@ -116,6 +117,7 @@ pub mod tests { commands::{WriteCommand, WriteContext}, scheduler::DEFAULT_EXECUTION_DURATION_LIMIT, tests::*, + txn_status_cache::TxnStatusCache, }, TestEngineBuilder, }; @@ 
-146,6 +148,7 @@ pub mod tests { statistics: &mut Default::default(), async_apply_prewrite: false, raw_ext: None, + txn_status_cache: &TxnStatusCache::new_for_test(), }; let result = command.process_write(snapshot, write_context).unwrap(); write(engine, &ctx, result.to_be_write.modifies); diff --git a/src/storage/txn/commands/prewrite.rs b/src/storage/txn/commands/prewrite.rs index 10446db6292..34c98dab156 100644 --- a/src/storage/txn/commands/prewrite.rs +++ b/src/storage/txn/commands/prewrite.rs @@ -24,7 +24,7 @@ use crate::storage::{ kv::WriteData, lock_manager::LockManager, mvcc::{ - has_data_in_range, Error as MvccError, ErrorInner as MvccErrorInner, MvccTxn, + has_data_in_range, metrics::*, Error as MvccError, ErrorInner as MvccErrorInner, MvccTxn, Result as MvccResult, SnapshotReader, TxnCommitRecord, }, txn::{ @@ -489,6 +489,36 @@ impl Prewriter { snapshot: impl Snapshot, mut context: WriteContext<'_, impl LockManager>, ) -> Result { + // Handle special cases about retried prewrite requests for pessimistic + // transactions. + if let TransactionKind::Pessimistic(_) = self.kind.txn_kind() { + if let Some(commit_ts) = context.txn_status_cache.get_no_promote(self.start_ts) { + fail_point!("before_prewrite_txn_status_cache_hit"); + if self.ctx.is_retry_request { + MVCC_PREWRITE_REQUEST_AFTER_COMMIT_COUNTER_VEC + .retry_req + .inc(); + } else { + MVCC_PREWRITE_REQUEST_AFTER_COMMIT_COUNTER_VEC + .non_retry_req + .inc(); + } + warn!("prewrite request received due to transaction is known to be already committed"; "start_ts" => %self.start_ts, "commit_ts" => %commit_ts); + // In normal cases if the transaction is committed, then the key should have + // been already prewritten successfully. But in order to + // simplify code as well as prevent possible corner cases or + // special cases in the future, we disallow skipping constraint + // check in this case. 
+ // We regard this request as a retried request no matter if it really is (the + // original request may arrive later than retried request due to + // network latency, in which case we'd better handle it like a + // retried request). + self.ctx.is_retry_request = true; + } else { + fail_point!("before_prewrite_txn_status_cache_miss"); + } + } + self.kind .can_skip_constraint_check(&mut self.mutations, &snapshot, &mut context)?; self.check_max_ts_synced(&snapshot)?; @@ -748,6 +778,11 @@ impl Prewriter { new_acquired_locks, lock_guards, response_policy: ResponsePolicy::OnApplied, + known_txn_status: if !one_pc_commit_ts.is_zero() { + vec![(self.start_ts, one_pc_commit_ts)] + } else { + vec![] + }, } } else { // Skip write stage if some keys are locked. @@ -768,6 +803,7 @@ impl Prewriter { new_acquired_locks: vec![], lock_guards: vec![], response_policy: ResponsePolicy::OnApplied, + known_txn_status: vec![], } }; @@ -1002,6 +1038,7 @@ mod tests { must_acquire_pessimistic_lock, must_acquire_pessimistic_lock_err, must_commit, must_prewrite_put_err_impl, must_prewrite_put_impl, must_rollback, }, + txn_status_cache::TxnStatusCache, Error, ErrorInner, }, types::TxnStatus, @@ -1647,6 +1684,7 @@ mod tests { statistics: &mut Statistics::default(), async_apply_prewrite: false, raw_ext: None, + txn_status_cache: &TxnStatusCache::new_for_test(), } }; } @@ -1818,6 +1856,7 @@ mod tests { statistics: &mut statistics, async_apply_prewrite: case.async_apply_prewrite, raw_ext: None, + txn_status_cache: &TxnStatusCache::new_for_test(), }; let mut engine = TestEngineBuilder::new().build().unwrap(); let snap = engine.snapshot(Default::default()).unwrap(); @@ -1932,6 +1971,7 @@ mod tests { statistics: &mut statistics, async_apply_prewrite: false, raw_ext: None, + txn_status_cache: &TxnStatusCache::new_for_test(), }; let snap = engine.snapshot(Default::default()).unwrap(); let result = cmd.cmd.process_write(snap, context).unwrap(); @@ -1960,6 +2000,7 @@ mod tests { statistics: &mut 
statistics, async_apply_prewrite: false, raw_ext: None, + txn_status_cache: &TxnStatusCache::new_for_test(), }; let snap = engine.snapshot(Default::default()).unwrap(); let result = cmd.cmd.process_write(snap, context).unwrap(); @@ -2043,6 +2084,7 @@ mod tests { statistics: &mut statistics, async_apply_prewrite: false, raw_ext: None, + txn_status_cache: &TxnStatusCache::new_for_test(), }; let snap = engine.snapshot(Default::default()).unwrap(); let result = cmd.cmd.process_write(snap, context).unwrap(); @@ -2075,6 +2117,7 @@ mod tests { statistics: &mut statistics, async_apply_prewrite: false, raw_ext: None, + txn_status_cache: &TxnStatusCache::new_for_test(), }; let snap = engine.snapshot(Default::default()).unwrap(); let result = cmd.cmd.process_write(snap, context).unwrap(); @@ -2345,6 +2388,7 @@ mod tests { statistics: &mut statistics, async_apply_prewrite: false, raw_ext: None, + txn_status_cache: &TxnStatusCache::new_for_test(), }; let snap = engine.snapshot(Default::default()).unwrap(); assert!(prewrite_cmd.cmd.process_write(snap, context).is_err()); @@ -2369,6 +2413,7 @@ mod tests { statistics: &mut statistics, async_apply_prewrite: false, raw_ext: None, + txn_status_cache: &TxnStatusCache::new_for_test(), }; let snap = engine.snapshot(Default::default()).unwrap(); assert!(prewrite_cmd.cmd.process_write(snap, context).is_err()); @@ -2575,6 +2620,7 @@ mod tests { statistics: &mut statistics, async_apply_prewrite: false, raw_ext: None, + txn_status_cache: &TxnStatusCache::new_for_test(), }; let snap = engine.snapshot(Default::default()).unwrap(); let res = prewrite_cmd.cmd.process_write(snap, context).unwrap(); diff --git a/src/storage/txn/commands/resolve_lock.rs b/src/storage/txn/commands/resolve_lock.rs index f3d141807e8..cd01fc60475 100644 --- a/src/storage/txn/commands/resolve_lock.rs +++ b/src/storage/txn/commands/resolve_lock.rs @@ -83,6 +83,7 @@ impl WriteCommand for ResolveLock { let mut scan_key = self.scan_key.take(); let rows = key_locks.len(); 
let mut released_locks = ReleasedLocks::new(); + let mut known_txn_status = vec![]; for (current_key, current_lock) in key_locks { txn.start_ts = current_lock.ts; reader.start_ts = current_lock.ts; @@ -103,7 +104,10 @@ impl WriteCommand for ResolveLock { // type. They could be left if the transaction is finally committed and // pessimistic conflict retry happens during execution. match commit(&mut txn, &mut reader, current_key.clone(), commit_ts) { - Ok(res) => res, + Ok(res) => { + known_txn_status.push((current_lock.ts, commit_ts)); + res + } Err(MvccError(box MvccErrorInner::TxnLockNotFound { .. })) if current_lock.is_pessimistic_lock() => { @@ -125,6 +129,9 @@ impl WriteCommand for ResolveLock { } } + known_txn_status.sort(); + known_txn_status.dedup(); + let pr = if scan_key.is_none() { ProcessResult::Res } else { @@ -151,6 +158,7 @@ impl WriteCommand for ResolveLock { new_acquired_locks, lock_guards: vec![], response_policy: ResponsePolicy::OnApplied, + known_txn_status, }) } } diff --git a/src/storage/txn/commands/resolve_lock_lite.rs b/src/storage/txn/commands/resolve_lock_lite.rs index 63fe201596d..318e5d57313 100644 --- a/src/storage/txn/commands/resolve_lock_lite.rs +++ b/src/storage/txn/commands/resolve_lock_lite.rs @@ -63,6 +63,11 @@ impl WriteCommand for ResolveLockLite { }); } + let known_txn_status = if !self.commit_ts.is_zero() { + vec![(self.start_ts, self.commit_ts)] + } else { + vec![] + }; let new_acquired_locks = txn.take_new_locks(); let mut write_data = WriteData::from_modifies(txn.into_modifies()); write_data.set_allowed_on_disk_almost_full(); @@ -76,6 +81,7 @@ impl WriteCommand for ResolveLockLite { new_acquired_locks, lock_guards: vec![], response_policy: ResponsePolicy::OnApplied, + known_txn_status, }) } } diff --git a/src/storage/txn/commands/rollback.rs b/src/storage/txn/commands/rollback.rs index f3b674f4916..df60767e716 100644 --- a/src/storage/txn/commands/rollback.rs +++ b/src/storage/txn/commands/rollback.rs @@ -71,6 +71,7 @@ 
impl WriteCommand for Rollback { new_acquired_locks, lock_guards: vec![], response_policy: ResponsePolicy::OnApplied, + known_txn_status: vec![], }) } } diff --git a/src/storage/txn/commands/txn_heart_beat.rs b/src/storage/txn/commands/txn_heart_beat.rs index 448395fc436..c900464099a 100644 --- a/src/storage/txn/commands/txn_heart_beat.rs +++ b/src/storage/txn/commands/txn_heart_beat.rs @@ -96,6 +96,7 @@ impl WriteCommand for TxnHeartBeat { new_acquired_locks, lock_guards: vec![], response_policy: ResponsePolicy::OnApplied, + known_txn_status: vec![], }) } } @@ -111,7 +112,10 @@ pub mod tests { kv::TestEngineBuilder, lock_manager::MockLockManager, mvcc::tests::*, - txn::{commands::WriteCommand, scheduler::DEFAULT_EXECUTION_DURATION_LIMIT, tests::*}, + txn::{ + commands::WriteCommand, scheduler::DEFAULT_EXECUTION_DURATION_LIMIT, tests::*, + txn_status_cache::TxnStatusCache, + }, Engine, }; @@ -143,6 +147,7 @@ pub mod tests { statistics: &mut Default::default(), async_apply_prewrite: false, raw_ext: None, + txn_status_cache: &TxnStatusCache::new_for_test(), }, ) .unwrap(); @@ -185,6 +190,7 @@ pub mod tests { statistics: &mut Default::default(), async_apply_prewrite: false, raw_ext: None, + txn_status_cache: &TxnStatusCache::new_for_test(), }, ) .is_err() diff --git a/src/storage/txn/latch.rs b/src/storage/txn/latch.rs index a662d9bab79..c76d71d7c7a 100644 --- a/src/storage/txn/latch.rs +++ b/src/storage/txn/latch.rs @@ -62,8 +62,6 @@ impl Latch { } self.waiting.push_front(item); } - // FIXME: remove this clippy attribute once https://github.com/rust-lang/rust-clippy/issues/6784 is fixed. 
- #[allow(clippy::manual_flatten)] for it in self.waiting.iter_mut() { if let Some((v, _)) = it { if *v == key_hash { diff --git a/src/storage/txn/mod.rs b/src/storage/txn/mod.rs index 640c534fc86..8c30ae0a068 100644 --- a/src/storage/txn/mod.rs +++ b/src/storage/txn/mod.rs @@ -6,6 +6,7 @@ pub mod commands; pub mod flow_controller; pub mod sched_pool; pub mod scheduler; +pub mod txn_status_cache; mod actions; mod latch; diff --git a/src/storage/txn/sched_pool.rs b/src/storage/txn/sched_pool.rs index 19736304373..c6d7b477db0 100644 --- a/src/storage/txn/sched_pool.rs +++ b/src/storage/txn/sched_pool.rs @@ -12,7 +12,9 @@ use kvproto::{kvrpcpb::CommandPri, pdpb::QueryKind}; use pd_client::{Feature, FeatureGate}; use prometheus::local::*; use raftstore::store::WriteStats; -use resource_control::{ControlledFuture, ResourceController, TaskMetadata}; +use resource_control::{ + with_resource_limiter, ControlledFuture, ResourceController, ResourceGroupManager, TaskMetadata, +}; use tikv_util::{ sys::SysQuota, yatp_pool::{Full, FuturePool, PoolTicker, YatpPoolBuilder}, @@ -101,6 +103,7 @@ impl VanillaQueue { struct PriorityQueue { worker_pool: FuturePool, resource_ctl: Arc, + resource_mgr: Arc, } impl PriorityQueue { @@ -118,15 +121,23 @@ impl PriorityQueue { // TODO: maybe use a better way to generate task_id let task_id = rand::random::(); let group_name = metadata.group_name().to_owned(); + let resource_limiter = self.resource_mgr.get_resource_limiter( + unsafe { std::str::from_utf8_unchecked(&group_name) }, + "", + metadata.override_priority() as u64, + ); let mut extras = Extras::new_multilevel(task_id, fixed_level); extras.set_metadata(metadata.to_vec()); self.worker_pool.spawn_with_extras( - ControlledFuture::new( - async move { - f.await; - }, - self.resource_ctl.clone(), - group_name, + with_resource_limiter( + ControlledFuture::new( + async move { + f.await; + }, + self.resource_ctl.clone(), + group_name, + ), + resource_limiter, ), extras, ) @@ -155,6 +166,7 @@ 
impl SchedPool { reporter: R, feature_gate: FeatureGate, resource_ctl: Option>, + resource_mgr: Option>, ) -> Self { let builder = |pool_size: usize, name_prefix: &str| { let engine = Arc::new(Mutex::new(engine.clone())); @@ -181,6 +193,7 @@ impl SchedPool { destroy_tls_engine::(); tls_flush(&reporter); }) + .enable_task_wait_metrics(true) }; let vanilla = VanillaQueue { worker_pool: builder(pool_size, "sched-worker-pool").build_future_pool(), @@ -191,6 +204,7 @@ impl SchedPool { worker_pool: builder(pool_size, "sched-worker-priority") .build_priority_future_pool(r.clone()), resource_ctl: r.clone(), + resource_mgr: resource_mgr.unwrap(), }); let queue_type = if resource_ctl.is_some() { QueueType::Dynamic diff --git a/src/storage/txn/scheduler.rs b/src/storage/txn/scheduler.rs index 4df7033c21a..6d087d894df 100644 --- a/src/storage/txn/scheduler.rs +++ b/src/storage/txn/scheduler.rs @@ -83,6 +83,7 @@ use crate::{ flow_controller::FlowController, latch::{Latches, Lock}, sched_pool::{tls_collect_query, tls_collect_scan_details, SchedPool}, + txn_status_cache::TxnStatusCache, Error, ErrorInner, ProcessResult, }, types::StorageCallback, @@ -293,6 +294,8 @@ struct TxnSchedulerInner { quota_limiter: Arc, resource_manager: Option>, feature_gate: FeatureGate, + + txn_status_cache: TxnStatusCache, } #[inline] @@ -469,6 +472,7 @@ impl TxnScheduler { reporter, feature_gate.clone(), resource_ctl, + resource_manager.clone(), ), control_mutex: Arc::new(tokio::sync::Mutex::new(false)), lock_mgr, @@ -484,6 +488,7 @@ impl TxnScheduler { quota_limiter, resource_manager, feature_gate, + txn_status_cache: TxnStatusCache::new(config.txn_status_cache_capacity), }); slow_log!( @@ -815,6 +820,7 @@ impl TxnScheduler { pipelined: bool, async_apply_prewrite: bool, new_acquired_locks: Vec, + known_txn_status: Vec<(TimeStamp, TimeStamp)>, tag: CommandKind, metadata: TaskMetadata<'_>, sched_details: &SchedulerDetails, @@ -837,6 +843,17 @@ impl TxnScheduler { debug!("write command finished"; 
"cid" => cid, "pipelined" => pipelined, "async_apply_prewrite" => async_apply_prewrite); drop(lock_guards); + + if result.is_ok() && !known_txn_status.is_empty() { + // Update cache before calling the callback. + // Reversing the order can lead to test failures as the cache may still + // remain not updated after receiving signal from the callback. + let now = std::time::SystemTime::now(); + for (start_ts, commit_ts) in known_txn_status { + self.inner.txn_status_cache.insert(start_ts, commit_ts, now); + } + } + let tctx = self.inner.dequeue_task_context(cid); let mut do_wake_up = !tctx.woken_up_resumable_lock_requests.is_empty(); @@ -1220,6 +1237,10 @@ impl TxnScheduler { .get_resource_control_context() .get_resource_group_name(), task.cmd.ctx().get_request_source(), + task.cmd + .ctx() + .get_resource_control_context() + .get_override_priority(), ) }); let mut sample = quota_limiter.new_sample(true); @@ -1258,6 +1279,7 @@ impl TxnScheduler { statistics: &mut sched_details.stat, async_apply_prewrite: self.inner.enable_async_apply_prewrite, raw_ext, + txn_status_cache: &self.inner.txn_status_cache, }; let begin_instant = Instant::now(); let res = unsafe { @@ -1279,10 +1301,14 @@ impl TxnScheduler { // TODO: write bytes can be a bit inaccurate due to error requests or in-memory // pessimistic locks. 
sample.add_write_bytes(write_bytes); - // estimate the cpu time for write by the schdule cpu time and write bytes - let expected_dur = (sample.cpu_time() + Duration::from_micros(write_bytes as u64)) - * SCHEDULER_CPU_TIME_FACTOR; if let Some(limiter) = resource_limiter { + let expected_dur = if limiter.is_background() { + // estimate the cpu time for write by the schduling cpu time and write bytes + (sample.cpu_time() + Duration::from_micros(write_bytes as u64)) + * SCHEDULER_CPU_TIME_FACTOR + } else { + sample.cpu_time() + }; limiter .async_consume( expected_dur, @@ -1328,6 +1354,7 @@ impl TxnScheduler { new_acquired_locks, lock_guards, response_policy, + known_txn_status, } = match deadline .check() .map_err(StorageError::from) @@ -1406,6 +1433,7 @@ impl TxnScheduler { false, false, new_acquired_locks, + known_txn_status, tag, metadata, sched_details, @@ -1441,6 +1469,7 @@ impl TxnScheduler { false, false, new_acquired_locks, + known_txn_status, tag, metadata, sched_details, @@ -1636,6 +1665,7 @@ impl TxnScheduler { pipelined, is_async_apply_prewrite, new_acquired_locks, + known_txn_status, tag, metadata, sched_details, @@ -1665,10 +1695,15 @@ impl TxnScheduler { // it may break correctness. // However, not release latch will cause deadlock which may ultimately block all // following txns, so we panic here. - panic!( - "response channel is unexpectedly dropped, tag {:?}, cid {}", - tag, cid - ); + // + // todo(spadea): Now, we only panic if it's not shutting down, although even in + // close, this behavior is not acceptable. 
+ if !tikv_util::thread_group::is_shutdown(!cfg!(test)) { + panic!( + "response channel is unexpectedly dropped, tag {:?}, cid {}", + tag, cid + ); + } } /// Returns whether it succeeds to write pessimistic locks to the in-memory @@ -1874,6 +1909,11 @@ impl TxnScheduler { .push_lock_wait(entry, Default::default()); } } + + #[cfg(test)] + pub fn get_txn_status_cache(&self) -> &TxnStatusCache { + &self.inner.txn_status_cache + } } pub async fn get_raw_ext( @@ -1997,6 +2037,8 @@ mod tests { enable_async_apply_prewrite: false, ..Default::default() }; + let resource_manager = Arc::new(ResourceGroupManager::default()); + let controller = resource_manager.derive_controller("test".into(), false); ( TxnScheduler::new( engine.clone(), @@ -2014,11 +2056,8 @@ mod tests { ResourceTagFactory::new_for_test(), Arc::new(QuotaLimiter::default()), latest_feature_gate(), - Some(Arc::new(ResourceController::new_for_test( - "test".to_owned(), - true, - ))), - None, + Some(controller), + Some(resource_manager), ), engine, ) @@ -2353,6 +2392,8 @@ mod tests { }; let feature_gate = FeatureGate::default(); feature_gate.set_version("6.0.0").unwrap(); + let resource_manager = Arc::new(ResourceGroupManager::default()); + let controller = resource_manager.derive_controller("test".into(), false); let scheduler = TxnScheduler::new( engine, @@ -2370,11 +2411,8 @@ mod tests { ResourceTagFactory::new_for_test(), Arc::new(QuotaLimiter::default()), feature_gate.clone(), - Some(Arc::new(ResourceController::new_for_test( - "test".to_owned(), - true, - ))), - None, + Some(controller), + Some(resource_manager), ); // Use sync mode if pipelined_pessimistic_lock is false. assert_eq!(scheduler.pessimistic_lock_mode(), PessimisticLockMode::Sync); diff --git a/src/storage/txn/txn_status_cache.rs b/src/storage/txn/txn_status_cache.rs new file mode 100644 index 00000000000..ab50bd0412e --- /dev/null +++ b/src/storage/txn/txn_status_cache.rs @@ -0,0 +1,978 @@ +// Copyright 2023 TiKV Project Authors. 
Licensed under Apache-2.0. + +//! This module implements a cache for the status of recently finished +//! transactions. When a transaction is committed or rolled back, we store the +//! information in the cache for a while. Later, in some cases, one can find +//! the transaction status without accessing the physical storage. This helps +//! to quickly find out the transaction status in some cases. +//! +//! > **Note:** +//! > * Currently, only committed transactions are cached. We may also cache +//! > rolled-back transactions in the future. +//! > * Currently, the cache is only used to filter unnecessary stale prewrite +//! > requests. We will also consider using the cache for other purposes in the +//! > future. +//! +//! ## Why we need this? +//! +//! ### For filtering out unwanted late-arrived stale prewrite requests +//! +//! This solves a problem which has a complicated background. +//! +//! There's such an optimization in pessimistic transactions when TiKV runs +//! together with TiDB: non-unique index keys don't need to be pessimistic- +//! locked, and WRITE CF doesn't need to be checked either when prewriting. The +//! correctness in case there's any kind of conflict will be protected by +//! the corresponding row key, as the index key is never written without +//! writing the corresponding row key. +//! +//! However, it was later found to be problematic, especially with async commit +//! and 1PC, as the prewrite requests on these index keys lose idempotency. +//! You can see [this issue](https://github.com/tikv/tikv/issues/11187) to see +//! how it causes problems, including those that affect transaction +//! correctness. +//! +//! The problem happens when the prewrite request to the same index key is +//! sent more than once. Our first solution is to add an `is_retry_request` flag +//! to the second (or even more) requests, which are sent due to retrying from +//! the client side. But it's still imperfect, considering that it's +//!
theoretically possible that the original request arrives at TiKV later than +//! the retried one. In fact, we once observed this happening in an environment +//! where the network is terribly unstable. +//! +//! Our second solution, in addition to the previous one, is to use this cache. +//! Each committed transaction should be guaranteed to be kept in the cache for +//! [a long-enough time](CACHE_ITEMS_REQUIRED_KEEP_TIME). When a prewrite +//! request is received, it should check the cache before executing. If it finds +//! that its transaction is already committed, it won't skip the constraint +//! check in WRITE CF. Note that if the index key is already committed but the +//! transaction info is not cached, then a late-arrived prewrite request cannot +//! be protected by this mechanism. This means we shouldn't miss any cacheable +//! transactions, and it is the reason why committed transactions should be +//! cached for *a long-enough time*. +//! +//! Unfortunately, the solution is still imperfect. As is already known, it +//! may still be problematic due to the following reasons: +//! +//! 1. We don't have a mechanism to refuse requests for which +//! more than [CACHE_ITEMS_REQUIRED_KEEP_TIME] has passed since they were sent. +//! 2. To prevent the cache from consuming much more memory than expected, +//! we have a limit to the capacity (though the limit is very large), and it's +//! configurable (so the cache can be disabled, see how the `capacity` parameter +//! of function [TxnStatusCache::new] is used) as a way to escape from potential +//! faults. +//! 3. The cache can't be synced across different TiKV instances. +//! +//! The third case above needs a more detailed explanation. This is +//! an example of the problem: +//! +//! 1. The client tries to send a prewrite request to TiKV A, which holds the +//! leader of the region containing an index key. The request is not received by +//! TiKV A and the client retries. +//! 2.
The leader is transferred to TiKV B, and the retried prewrite request +//! is sent to it and processed successfully. +//! 3. The transaction is committed on TiKV B, without TiKV A knowing about it. +//! 4. The leader is transferred back to TiKV A. +//! 5. The original request arrives at TiKV A and is executed. As the +//! status of the transaction is not in the cache in TiKV A, the prewrite +//! request will be handled in the normal way, skipping constraint checks. +//! +//! As of the time when this module was written, the above remaining cases have +//! not yet been handled, considering the extremely low probability of them +//! happening and the high complexity of fixing them. +//! +//! The perfect and most elegant way to fix all of these problems is to never +//! skip constraint checks or never skip pessimistic locks for index keys. +//! In other words, totally remove the above optimization on index keys. +//! But for historical reasons, this may lead to significant performance +//! regression in existing clusters. +//! +//! ### For reading data locked by large transactions more efficiently +//! +//! * Note: the `TxnStatusCache` is designed with this usage in mind, but is not +//! used for it yet. +//! +//! Consider the case where a very large transaction has locked a lot of keys +//! after prewriting, while many simple reads and writes execute frequently, so +//! these simple transactions frequently meet the locks left by the large +//! transaction. It will be very inefficient for these small transactions to +//! come back to the client and start the resolve-lock procedure. Even if the +//! client side has the cache of that transaction, it still wastes an RTT. +//! +//! There would be more possibilities if we had such a cache in TiKV: for +//! read requests, it can check the cache to know whether it can read from the +//! lock; and for write requests, if it finds the transaction of that lock is +//! already committed, it can merge together the resolve-lock-committing and the +//!
write operation that the request needs to perform. + +use std::{ + sync::{atomic::AtomicU64, Arc}, + time::{Duration, SystemTime, UNIX_EPOCH}, +}; + +use crossbeam::utils::CachePadded; +use parking_lot::Mutex; +use tikv_util::{ + lru, + lru::{GetTailEntry, LruCache}, +}; +use txn_types::TimeStamp; + +use crate::storage::metrics::*; + +const TXN_STATUS_CACHE_SLOTS: usize = 128; + +/// An cache item should be kept for at least this time. +/// Actually this should be guaranteed only for committed transactions. See +/// [this section](# +/// for-filtering-out-unwanted-late-arrived-stale-prewrite-requests) for details +/// about why this is needed. +const CACHE_ITEMS_REQUIRED_KEEP_TIME: Duration = Duration::from_secs(30); + +struct CacheEntry { + commit_ts: TimeStamp, + /// The system timestamp in milliseconds when the entry is inserted to the + /// cache. + insert_time: u64, +} + +/// Defines the policy to evict expired entries from the cache. +/// [`TxnStatusCache`] needs to keep entries for a while, so the common +/// policy that only limiting capacity is not proper to be used here. +struct TxnStatusCacheEvictPolicy { + required_keep_time_millis: u64, + #[cfg(test)] + simulated_system_time: Option>, +} + +impl TxnStatusCacheEvictPolicy { + fn new( + required_keep_time: Duration, + #[allow(unused_variables)] simulated_system_time: Option>, + ) -> Self { + Self { + required_keep_time_millis: required_keep_time.as_millis() as u64, + #[cfg(test)] + simulated_system_time, + } + } + + #[inline] + #[cfg(not(test))] + fn now(&self) -> SystemTime { + SystemTime::now() + } + + /// When used in tests, the system time can be simulated by controlling the + /// field `simulated_system_time`. + #[inline] + #[cfg(test)] + fn now(&self) -> SystemTime { + // Always get the system time to simulate the latency. 
+ let now = SystemTime::now(); + if let Some(pseudo_system_time) = &self.simulated_system_time { + UNIX_EPOCH + + std::time::Duration::from_millis( + pseudo_system_time.load(std::sync::atomic::Ordering::Acquire), + ) + } else { + now + } + } +} + +impl lru::EvictPolicy for TxnStatusCacheEvictPolicy { + fn should_evict( + &self, + current_size: usize, + capacity: usize, + get_tail_entry: &impl GetTailEntry, + ) -> bool { + // See how much time has been elapsed since the tail entry is inserted. + // If it's long enough, remove it. + if let Some((_, v)) = get_tail_entry.get_tail_entry() { + if self.now().duration_since(UNIX_EPOCH).unwrap().as_millis() as u64 + > self.required_keep_time_millis + v.insert_time + { + return true; + } + } + + // If the capacity limit is exceeded, remove it. + current_size > capacity + } +} + +type TxnStatusCacheSlot = + LruCache; + +/// The cache for storing transaction status. It holds recent +/// `start_ts` -> `commit_ts` pairs for a while, which can be useful for quickly +/// but not strictly determining transaction status. +/// +/// `TxnStatusCache` is divided into several slots +/// to make the lock more fine-grained. Each slot uses an [`LruCache`] as the +/// internal implementation, with customized evict policy. However, we do not +/// always adopt the LRU behavior. Some operation to an existing entry in the +/// cache won't promote it to the most-recent place. +/// +/// Note that the `TxnStatusCache` updates metrics in some operations assuming +/// there's at most one instance of `TxnStatusCache` in a process. +pub struct TxnStatusCache { + slots: Vec>>, + is_enabled: bool, +} + +unsafe impl Sync for TxnStatusCache {} + +impl TxnStatusCache { + fn new_impl( + slots: usize, + required_keep_time: Duration, + capacity: usize, + simulated_system_time: Option>, + ) -> Self { + if capacity == 0 { + return Self { + slots: vec![], + is_enabled: false, + }; + } + + // The limit of the LruCache of each slot. 
+ let allowed_capacity_per_slot = capacity / slots; + // The total memory allocated initially by the LruCache's internal data + // structure for all slots. + + let mut initial_allocated_capacity_total = 0; + let res = Self { + slots: (0..slots) + .map(|_| { + let cache = LruCache::new( + allowed_capacity_per_slot, + 0, + lru::CountTracker::default(), + TxnStatusCacheEvictPolicy::new( + required_keep_time, + simulated_system_time.clone(), + ), + ); + let allocated_capacity = cache.internal_allocated_capacity(); + initial_allocated_capacity_total += allocated_capacity; + Mutex::new(cache).into() + }) + .collect(), + is_enabled: true, + }; + SCHED_TXN_STATUS_CACHE_SIZE + .allocated + .set(initial_allocated_capacity_total as i64); + res + } + + pub fn new(capacity: usize) -> Self { + Self::with_slots_and_time_limit( + TXN_STATUS_CACHE_SLOTS, + CACHE_ITEMS_REQUIRED_KEEP_TIME, + capacity, + ) + } + + #[cfg(test)] + pub fn new_for_test() -> Self { + // 1M capacity should be enough for tests. + Self::with_slots_and_time_limit(16, CACHE_ITEMS_REQUIRED_KEEP_TIME, 1 << 20) + } + + pub fn with_slots_and_time_limit( + slots: usize, + required_keep_time: Duration, + capacity: usize, + ) -> Self { + Self::new_impl(slots, required_keep_time, capacity, None) + } + + /// Create a `TxnStatusCache` instance for test purpose, with simulating + /// system time enabled. This helps when testing functionalities that are + /// related to system time. + /// + /// An `AtomicU64` will be returned. Store timestamps + /// in milliseconds in it to control the time. 
+ #[cfg(test)] + fn with_simulated_system_time( + slots: usize, + requried_keep_time: Duration, + capacity: usize, + ) -> (Self, Arc) { + let system_time = Arc::new(AtomicU64::new(0)); + let res = Self::new_impl( + slots, + requried_keep_time, + capacity, + Some(system_time.clone()), + ); + (res, system_time) + } + + fn slot_index(&self, start_ts: TimeStamp) -> usize { + fxhash::hash(&start_ts) % self.slots.len() + } + + /// Insert a transaction status into the cache. The current system time + /// should be passed from outside to avoid getting system time repeatedly + /// when multiple items is being inserted. + /// + /// If the transaction's information is already in the cache, it will + /// **NOT** be promoted to the most-recent place of the internal LRU. + pub fn insert(&self, start_ts: TimeStamp, commit_ts: TimeStamp, now: SystemTime) { + if !self.is_enabled { + return; + } + + let insert_time = now.duration_since(UNIX_EPOCH).unwrap().as_millis() as u64; + let mut slot = self.slots[self.slot_index(start_ts)].lock(); + let previous_size = slot.size(); + let previous_allocated = slot.internal_allocated_capacity(); + slot.insert_if_not_exist( + start_ts, + CacheEntry { + commit_ts, + insert_time, + }, + ); + let size = slot.size(); + let allocated = slot.internal_allocated_capacity(); + // Update statistics. + // CAUTION: Assuming that only one TxnStatusCache instance is in a TiKV process. + SCHED_TXN_STATUS_CACHE_SIZE + .used + .add(size as i64 - previous_size as i64); + SCHED_TXN_STATUS_CACHE_SIZE + .allocated + .add(allocated as i64 - previous_allocated as i64); + } + + /// Try to get an item from the cache, without promoting the item (if + /// exists) to the most recent place. 
+ pub fn get_no_promote(&self, start_ts: TimeStamp) -> Option { + if !self.is_enabled { + return None; + } + + let slot = self.slots[self.slot_index(start_ts)].lock(); + slot.get_no_promote(&start_ts).map(|entry| entry.commit_ts) + } + + pub fn get(&self, start_ts: TimeStamp) -> Option { + if !self.is_enabled { + return None; + } + + let mut slot = self.slots[self.slot_index(start_ts)].lock(); + slot.get(&start_ts).map(|entry| entry.commit_ts) + } + + /// Remove an entry from the cache. We usually don't need to remove anything + /// from the `TxnStatusCache`, but it's useful in tests to construct cache- + /// miss cases. + #[cfg(test)] + pub fn remove(&self, start_ts: TimeStamp) -> Option { + if !self.is_enabled { + return None; + } + + let res = { + let mut slot = self.slots[self.slot_index(start_ts)].lock(); + slot.remove(&start_ts).map(|e| e.commit_ts) + }; + debug_assert!(self.get_no_promote(start_ts).is_none()); + res + } +} + +#[cfg(test)] +mod tests { + use std::{ + sync::{ + atomic::{AtomicU64, Ordering}, + Arc, + }, + time::{Duration, Instant, SystemTime}, + }; + + use rand::{prelude::SliceRandom, Rng}; + + use super::*; + + fn bench_insert_impl(b: &mut test::Bencher, init_size: usize) { + let (c, time) = TxnStatusCache::with_simulated_system_time( + TXN_STATUS_CACHE_SLOTS, + Duration::from_millis(init_size as u64), + 1 << 20, + ); + let start_time = SystemTime::now(); + // Spread these items evenly in a specific time limit, so that every time + // a new item is inserted, an item will be popped out. + for i in 1..=init_size { + c.insert( + (i as u64).into(), + (i as u64 + 1).into(), + start_time + Duration::from_millis(i as u64), + ); + } + let mut current_time_shift = (init_size + 1) as u64; + b.iter(|| { + let simulated_now = start_time + Duration::from_millis(current_time_shift); + // Simulate the system time advancing. 
+ time.store( + simulated_now + .duration_since(UNIX_EPOCH) + .unwrap() + .as_millis() as u64, + Ordering::Release, + ); + c.insert( + current_time_shift.into(), + (current_time_shift + 1).into(), + simulated_now, + ); + current_time_shift += 1; + }); + test::black_box(&c); + } + + fn bench_get_impl(b: &mut test::Bencher, init_size: usize) { + let c = TxnStatusCache::with_slots_and_time_limit( + TXN_STATUS_CACHE_SLOTS, + CACHE_ITEMS_REQUIRED_KEEP_TIME, + 1 << 20, + ); + let now = SystemTime::now(); + for i in 1..=init_size { + c.insert( + (i as u64).into(), + (i as u64 + 1).into(), + now + Duration::from_millis(i as u64), + ); + } + let rand_range = if init_size == 0 { 10000 } else { init_size } as u64; + b.iter(|| { + let ts = rand::thread_rng().gen_range(0u64, rand_range); + let res = c.get_no_promote(ts.into()); + test::black_box(&res); + }) + } + + #[bench] + fn bench_insert_empty(b: &mut test::Bencher) { + bench_insert_impl(b, 0); + } + + #[bench] + fn bench_insert_100000(b: &mut test::Bencher) { + bench_insert_impl(b, 100000); + } + + #[bench] + fn bench_get_empty(b: &mut test::Bencher) { + bench_get_impl(b, 0); + } + + #[bench] + fn bench_get_100000(b: &mut test::Bencher) { + bench_get_impl(b, 100000); + } + + /// A simple statistic tool for collecting a set of data and calculating the + /// average, stddev, and percentiles (by using a linear histogram). + /// Data is collected in u128, and results are given in f64. 
+ struct SimpleStatistics { + sum: u128, + sum_square: u128, + count: usize, + bucket_width: u128, + buckets: Vec, + } + + impl SimpleStatistics { + fn new(bucket_width: u128) -> Self { + Self { + sum: 0, + sum_square: 0, + count: 0, + bucket_width, + buckets: vec![], + } + } + + /// Merge another instance into the current one + fn add(&mut self, other: Self) { + self.sum += other.sum; + self.sum_square += other.sum_square; + self.count += other.count; + assert_eq!(self.bucket_width, other.bucket_width); + if self.buckets.len() < other.buckets.len() { + self.buckets.resize(other.buckets.len(), 0); + } + for (count, other_count) in self.buckets.iter_mut().zip(other.buckets.iter()) { + *count += *other_count + } + } + + fn avg(&self) -> f64 { + self.sum as f64 / (self.count as f64) + } + + fn stddev(&self) -> f64 { + let avg = self.avg(); + let sum_sqr_diff: f64 = + (self.sum_square as f64) - (self.sum as f64 * avg * 2.0) + avg * self.count as f64; + (sum_sqr_diff / (self.count - 1) as f64).sqrt() + } + + /// Calculate the percentile value at specified position (should be in + /// range [0, 1]) + fn percentile(&self, position: f64) -> f64 { + let mut bucket = self.buckets.len(); + let mut prefix_sum = self.count; + while bucket > 0 { + bucket -= 1; + prefix_sum -= self.buckets[bucket]; + let prefix_percentile = prefix_sum as f64 / self.count as f64; + if prefix_percentile <= position { + assert_le!(prefix_sum as f64, position * self.count as f64); + assert_lt!( + position * self.count as f64, + (prefix_sum + self.buckets[bucket]) as f64 + ); + break; + } + } + + bucket as f64 * self.bucket_width as f64 + + (position * self.count as f64 - prefix_sum as f64) * self.bucket_width as f64 + / self.buckets[bucket] as f64 + } + + fn observe(&mut self, value: u128) { + self.sum += value; + self.sum_square += value * value; + self.count += 1; + let bucket = (value / self.bucket_width) as usize; + if self.buckets.len() <= bucket { + self.buckets.resize(bucket + 1, 0); + } + 
self.buckets[bucket] += 1; + } + } + + fn bench_concurrent_impl( + name: &str, + threads: usize, + function: impl Fn(u64) -> T + Send + Sync + 'static, + ) { + let start_time = Instant::now(); + // Run the benchmark code repeatedly for 10 seconds. + const TIME_LIMIT: Duration = Duration::from_secs(10); + let iteration = Arc::new(AtomicU64::new(0)); + + // Make the lifetime checker happy. + let function = Arc::new(function); + + let mut handles = Vec::with_capacity(threads); + for _ in 0..threads { + let f = function.clone(); + let iteration = iteration.clone(); + let handle = std::thread::spawn(move || { + let mut stats = SimpleStatistics::new(20); + loop { + if start_time.elapsed() > TIME_LIMIT { + break; + } + let i = iteration.fetch_add(1, Ordering::SeqCst); + let iter_start_time = Instant::now(); + test::black_box(f(i)); + let duration = iter_start_time.elapsed(); + stats.observe(duration.as_nanos()); + } + stats + }); + handles.push(handle); + } + + let mut total_stats = SimpleStatistics::new(20); + for h in handles { + total_stats.add(h.join().unwrap()); + } + + println!( + "benchmark {}: duration per iter: avg: {:?}, stddev: {:?}, percentile .99: {:?}, percentile .999: {:?}", + name, + Duration::from_nanos(total_stats.avg() as u64), + Duration::from_nanos(total_stats.stddev() as u64), + Duration::from_nanos(total_stats.percentile(0.99) as u64), + Duration::from_nanos(total_stats.percentile(0.999) as u64), + ); + } + + fn bench_txn_status_cache_concurrent_impl( + threads: usize, + init_size: usize, + simulate_contention: bool, + get_before_insert: bool, + ) { + let slots = if simulate_contention { + 1 + } else { + TXN_STATUS_CACHE_SLOTS + }; + let (c, time) = TxnStatusCache::with_simulated_system_time( + slots, + Duration::from_millis(init_size as u64), + 1 << 20, + ); + let start_time = SystemTime::now(); + for i in 1..=init_size { + c.insert( + (i as u64).into(), + (i as u64 + 1).into(), + start_time + Duration::from_millis(i as u64), + ); + } + + let name 
= format!( + "bench_concurrent_{}_{}_size{}{}", + if get_before_insert { + "get_and_insert" + } else { + "insert" + }, + threads, + init_size, + if simulate_contention { + "_contention" + } else { + "" + }, + ); + + bench_concurrent_impl(&name, threads, move |iter| { + let time_shift = init_size as u64 + iter; + let now = start_time + Duration::from_millis(time_shift); + time.store( + now.duration_since(UNIX_EPOCH).unwrap().as_millis() as u64, + Ordering::Release, + ); + + if get_before_insert { + test::black_box(c.get_no_promote(time_shift.into())); + } + c.insert(time_shift.into(), (time_shift + 1).into(), now); + test::black_box(&c); + }); + } + + #[bench] + #[ignore] + fn bench_txn_status_cache_concurrent(_b: &mut test::Bencher) { + // This case is implemented to run the concurrent benchmark in a handy way + // just like running other normal benchmarks. However, it doesn't seem + // to be possible to benchmark an operation in concurrent way by using + // either the built-in bencher or criterion. + // Here we test it in our own way without using the built-in bencher, + // and output the result by stdout. + // When you need to run this benchmark, comment out the `#[ignore]` and + // add --nocapture in your benchmark command line to get the result. 
+ bench_txn_status_cache_concurrent_impl(16, 10000, false, false); + bench_txn_status_cache_concurrent_impl(16, 10000, true, false); + bench_txn_status_cache_concurrent_impl(16, 10000, false, true); + bench_txn_status_cache_concurrent_impl(16, 10000, true, true); + bench_txn_status_cache_concurrent_impl(64, 10000, false, false); + bench_txn_status_cache_concurrent_impl(64, 10000, true, false); + bench_txn_status_cache_concurrent_impl(64, 10000, false, true); + bench_txn_status_cache_concurrent_impl(64, 10000, true, true); + } + + #[test] + fn test_insert_and_get() { + let c = TxnStatusCache::new_for_test(); + assert!(c.get_no_promote(1.into()).is_none()); + + let now = SystemTime::now(); + + c.insert(1.into(), 2.into(), now); + assert_eq!(c.get_no_promote(1.into()).unwrap(), 2.into()); + c.insert(3.into(), 4.into(), now); + assert_eq!(c.get_no_promote(3.into()).unwrap(), 4.into()); + + // This won't actually happen, since a transaction will never have commit info + // with two different commit_ts. We just use this to check replacing + // won't happen. 
+ c.insert(1.into(), 4.into(), now); + assert_eq!(c.get_no_promote(1.into()).unwrap(), 2.into()); + + let mut start_ts_list: Vec<_> = (1..100).step_by(2).map(TimeStamp::from).collect(); + start_ts_list.shuffle(&mut rand::thread_rng()); + for &start_ts in &start_ts_list { + let commit_ts = start_ts.next(); + c.insert(start_ts, commit_ts, now); + } + start_ts_list.shuffle(&mut rand::thread_rng()); + for &start_ts in &start_ts_list { + let commit_ts = start_ts.next(); + assert_eq!(c.get_no_promote(start_ts).unwrap(), commit_ts); + } + } + + #[test] + fn test_evicting_expired() { + let (c, time) = + TxnStatusCache::with_simulated_system_time(1, Duration::from_millis(1000), 1000); + let time_base = SystemTime::now(); + let set_time = |offset_millis: u64| { + time.store( + time_base.duration_since(UNIX_EPOCH).unwrap().as_millis() as u64 + offset_millis, + Ordering::Release, + ) + }; + let now = || UNIX_EPOCH + Duration::from_millis(time.load(Ordering::Acquire)); + + set_time(0); + assert_lt!( + time_base.duration_since(now()).unwrap(), + Duration::from_millis(1) + ); + + c.insert(1.into(), 2.into(), now()); + set_time(1); + c.insert(3.into(), 4.into(), now()); + set_time(2); + c.insert(5.into(), 6.into(), now()); + // Size should be calculated by count. + assert_eq!(c.slots[0].lock().size(), 3); + + // Insert entry 1 again. So if entry 1 is the first one to be popped out, it + // verifies that inserting an existing key won't promote it. + c.insert(1.into(), 2.into(), now()); + + // All the 3 entries are kept + assert_eq!(c.get_no_promote(1.into()).unwrap(), 2.into()); + assert_eq!(c.get_no_promote(3.into()).unwrap(), 4.into()); + assert_eq!(c.get_no_promote(5.into()).unwrap(), 6.into()); + + set_time(1001); + c.insert(7.into(), 8.into(), now()); + // Entry 1 will be popped out. 
+ assert!(c.get_no_promote(1.into()).is_none()); + assert_eq!(c.get_no_promote(3.into()).unwrap(), 4.into()); + assert_eq!(c.get_no_promote(5.into()).unwrap(), 6.into()); + set_time(1004); + c.insert(9.into(), 10.into(), now()); + // It pops more than 1 entries if there are many expired items at the tail. + // Entry 3 and 5 will be popped out. + assert!(c.get_no_promote(1.into()).is_none()); + assert!(c.get_no_promote(3.into()).is_none()); + assert!(c.get_no_promote(5.into()).is_none()); + assert_eq!(c.get_no_promote(7.into()).unwrap(), 8.into()); + assert_eq!(c.get_no_promote(9.into()).unwrap(), 10.into()); + + // Now the cache's contents are: + // 7@1001, 9@1004 + // Test `get` promotes an entry and entries are not in order on insert time. + assert_eq!(c.get(7.into()).unwrap(), 8.into()); + set_time(2003); + c.insert(11.into(), 12.into(), now()); + assert_eq!(c.get_no_promote(7.into()).unwrap(), 8.into()); + assert_eq!(c.get_no_promote(9.into()).unwrap(), 10.into()); + assert_eq!(c.get_no_promote(11.into()).unwrap(), 12.into()); + + set_time(2005); + c.insert(13.into(), 14.into(), now()); + assert!(c.get_no_promote(7.into()).is_none()); + assert!(c.get_no_promote(9.into()).is_none()); + assert_eq!(c.get_no_promote(11.into()).unwrap(), 12.into()); + + // Now the cache's contents are: + // 11@2003, 13@2005 + // Test inserting existed entries. + // According to the implementation of LruCache, though it won't do any update to + // the content, it still check the tail to see if anything can be + // evicted. + set_time(3004); + c.insert(13.into(), 14.into(), now()); + assert!(c.get_no_promote(11.into()).is_none()); + assert_eq!(c.get_no_promote(13.into()).unwrap(), 14.into()); + + set_time(3006); + c.insert(13.into(), 14.into(), now()); + assert!(c.get_no_promote(13.into()).is_none()); + + // Now the cache is empty. + c.insert(15.into(), 16.into(), now()); + set_time(3008); + c.insert(17.into(), 18.into(), now()); + // Test inserting existed entry doesn't promote it. 
+ // Re-insert 15. + set_time(3009); + c.insert(15.into(), 16.into(), now()); + set_time(4007); + c.insert(19.into(), 20.into(), now()); + // 15's insert time is not updated, and is at the tail of the LRU, so it should + // be popped. + assert!(c.get_no_promote(15.into()).is_none()); + assert_eq!(c.get_no_promote(17.into()).unwrap(), 18.into()); + + // Now the cache's contents are: + // 17@3008, 19@4007 + // Test system time being changed, which can lead to current time being less + // than entries' insert time. + set_time(2000); + c.insert(21.into(), 22.into(), now()); + assert_eq!(c.get_no_promote(17.into()).unwrap(), 18.into()); + assert_eq!(c.get_no_promote(19.into()).unwrap(), 20.into()); + assert_eq!(c.get_no_promote(21.into()).unwrap(), 22.into()); + set_time(3500); + c.insert(23.into(), 24.into(), now()); + assert_eq!(c.get_no_promote(21.into()).unwrap(), 22.into()); + assert_eq!(c.get(17.into()).unwrap(), 18.into()); + assert_eq!(c.get(19.into()).unwrap(), 20.into()); + assert_eq!(c.get(23.into()).unwrap(), 24.into()); + // `get` promotes the entries, and entry 21 is put to the tail. + c.insert(23.into(), 24.into(), now()); + assert_eq!(c.get_no_promote(17.into()).unwrap(), 18.into()); + assert_eq!(c.get_no_promote(19.into()).unwrap(), 20.into()); + assert!(c.get_no_promote(21.into()).is_none()); + assert_eq!(c.get_no_promote(23.into()).unwrap(), 24.into()); + + // Now the cache's contents are: + // 17@3008, 19@4007, 23@3500 + // The time passed to `insert` may differ from the time fetched in + // the `TxnStatusCacheEvictPolicy` as they are fetched at different time. 
+ set_time(4009); + // Insert with time 4007, but check with time 4009 + c.insert(25.into(), 26.into(), now() - Duration::from_millis(2)); + assert!(c.get_no_promote(17.into()).is_none()); + assert_eq!(c.get_no_promote(19.into()).unwrap(), 20.into()); + + // The cache's contents: + // 19@4007, 23@3500, 25@4007 + set_time(4010); + c.insert(27.into(), 28.into(), now()); + // The cache's contents: + // 19@4007, 23@3500, 25@4007, 27@4010 + + // It's also possible to check with a lower time considering that system time + // may be changed. Insert with time 5018, but check with time 5008 + set_time(5008); + c.insert(29.into(), 30.into(), now() + Duration::from_millis(10)); + assert!(c.get_no_promote(19.into()).is_none()); + assert!(c.get_no_promote(23.into()).is_none()); + assert!(c.get_no_promote(25.into()).is_none()); + assert_eq!(c.get_no_promote(27.into()).unwrap(), 28.into()); + assert_eq!(c.get_no_promote(29.into()).unwrap(), 30.into()); + + // Now the cache's contents are: + // 27@4010, 29@5018 + // Considering the case that system time is being changed, it's even + // possible that the entry being inserted is already expired + // comparing to the current time. It doesn't matter whether the + // entry will be dropped immediately or not. We just ensure it won't + // trigger more troubles. 
+ set_time(7000); + c.insert(31.into(), 32.into(), now() - Duration::from_millis(1001)); + assert!(c.get_no_promote(27.into()).is_none()); + assert!(c.get_no_promote(29.into()).is_none()); + assert!(c.get_no_promote(31.into()).is_none()); + assert_eq!(c.slots[0].lock().size(), 0); + } + + #[test] + fn test_setting_capacity() { + let c = TxnStatusCache::new_impl(2, Duration::from_millis(1000), 10, None); + assert!(c.is_enabled); + assert_eq!(c.slots.len(), 2); + assert_eq!(c.slots[0].lock().capacity(), 5); + assert_eq!(c.slots[1].lock().capacity(), 5); + + let c = TxnStatusCache::new_impl(2, Duration::from_millis(1000), 0, None); + assert!(!c.is_enabled); + assert_eq!(c.slots.len(), 0); + // All operations are noops and won't cause panic or return any incorrect + // result. + c.insert(1.into(), 2.into(), SystemTime::now()); + assert!(c.get_no_promote(1.into()).is_none()); + assert!(c.get(1.into()).is_none()); + } + + #[test] + fn test_evicting_by_capacity() { + let (c, time) = + TxnStatusCache::with_simulated_system_time(1, Duration::from_millis(1000), 5); + let time_base = SystemTime::now(); + let set_time = |offset_millis: u64| { + time.store( + time_base.duration_since(UNIX_EPOCH).unwrap().as_millis() as u64 + offset_millis, + Ordering::Release, + ) + }; + let now = || UNIX_EPOCH + Duration::from_millis(time.load(Ordering::Acquire)); + + set_time(0); + c.insert(1.into(), 2.into(), now()); + set_time(2); + c.insert(3.into(), 4.into(), now()); + set_time(4); + c.insert(5.into(), 6.into(), now()); + set_time(6); + c.insert(7.into(), 8.into(), now()); + + // The cache can keep at most 5 entries. + set_time(8); + c.insert(9.into(), 10.into(), now()); + // Entry 1 not evicted. 5 entries in the cache currently + assert_eq!(c.slots[0].lock().len(), 5); + assert_eq!(c.get_no_promote(1.into()).unwrap(), 2.into()); + set_time(10); + c.insert(11.into(), 12.into(), now()); + // Entry 1 evicted. Still 5 entries in the cache. 
+ assert_eq!(c.slots[0].lock().len(), 5); + assert!(c.get_no_promote(1.into()).is_none()); + assert_eq!(c.get_no_promote(3.into()).unwrap(), 4.into()); + + // Nothing will be evicted after trying to insert an existing key. + c.insert(11.into(), 12.into(), now()); + assert_eq!(c.slots[0].lock().len(), 5); + assert_eq!(c.get_no_promote(3.into()).unwrap(), 4.into()); + + // Current contents (key@time): + // 3@2, 5@4, 7@6. 9@8, 11@10 + // Evicting by time works as well. + set_time(1005); + c.insert(13.into(), 14.into(), now()); + assert_eq!(c.slots[0].lock().len(), 4); + assert!(c.get_no_promote(3.into()).is_none()); + assert!(c.get_no_promote(5.into()).is_none()); + assert_eq!(c.get_no_promote(7.into()).unwrap(), 8.into()); + + // Reorder the entries by `get` to prepare for testing the next case. + assert_eq!(c.get(7.into()).unwrap(), 8.into()); + assert_eq!(c.get(9.into()).unwrap(), 10.into()); + assert_eq!(c.get(11.into()).unwrap(), 12.into()); + + c.insert(15.into(), 16.into(), now()); + // Current contents: + // 13@1005, 7@6. 9@8, 11@10, 15@1005 + assert_eq!(c.slots[0].lock().len(), 5); + // Expired entries that are not the tail can be evicted after the tail + // is evicted due to capacity exceeded. 
+ set_time(1011); + c.insert(17.into(), 18.into(), now()); + assert_eq!(c.slots[0].lock().len(), 2); + assert!(c.get_no_promote(13.into()).is_none()); + assert!(c.get_no_promote(7.into()).is_none()); + assert!(c.get_no_promote(9.into()).is_none()); + assert!(c.get_no_promote(11.into()).is_none()); + assert_eq!(c.get(15.into()).unwrap(), 16.into()); + assert_eq!(c.get(17.into()).unwrap(), 18.into()); + } +} diff --git a/tests/Cargo.toml b/tests/Cargo.toml index 158e56abcb1..79f5439736d 100644 --- a/tests/Cargo.toml +++ b/tests/Cargo.toml @@ -40,11 +40,8 @@ harness = false path = "benches/deadlock_detector/mod.rs" [features] -default = ["failpoints", "testexport", "test-engine-kv-rocksdb", "test-engine-raft-raft-engine", "cloud-aws", "cloud-gcp", "cloud-azure"] +default = ["failpoints", "testexport", "test-engine-kv-rocksdb", "test-engine-raft-raft-engine"] failpoints = ["fail/failpoints", "tikv/failpoints", "pd_client/failpoints"] -cloud-aws = ["external_storage_export/cloud-aws"] -cloud-gcp = ["external_storage_export/cloud-gcp"] -cloud-azure = ["external_storage_export/cloud-azure"] testexport = ["raftstore/testexport", "tikv/testexport", "pd_client/testexport"] profiling = ["profiler/profiling"] @@ -120,7 +117,6 @@ uuid = { version = "0.8.1", features = ["serde", "v4"] } procinfo = { git = "https://github.com/tikv/procinfo-rs", rev = "7693954bd1dd86eb1709572fd7b62fd5f7ff2ea1" } [dev-dependencies] -arrow = "13.0" byteorder = "1.2" # See https://bheisler.github.io/criterion.rs/book/user_guide/known_limitations.html for the usage # of `real_blackbox` feature. 
@@ -131,7 +127,7 @@ criterion-cpu-time = "0.1" engine_rocks = { workspace = true } engine_test = { workspace = true } engine_traits = { workspace = true } -external_storage_export = { workspace = true } +external_storage ={ workspace = true } file_system = { workspace = true } hyper = { version = "0.14", default-features = false, features = ["runtime"] } keys = { workspace = true } diff --git a/tests/benches/misc/coprocessor/codec/chunk/chunk.rs b/tests/benches/misc/coprocessor/codec/chunk/chunk.rs deleted file mode 100644 index 4c033f2a80d..00000000000 --- a/tests/benches/misc/coprocessor/codec/chunk/chunk.rs +++ /dev/null @@ -1,176 +0,0 @@ -// Copyright 2018 TiKV Project Authors. Licensed under Apache-2.0. - -use std::sync::Arc; - -use arrow::{ - array, - datatypes::{self, DataType, Field}, - record_batch::RecordBatch, -}; -use tidb_query_datatype::{codec::Datum, prelude::*, FieldTypeFlag, FieldTypeTp}; -use tipb::FieldType; - -pub struct Chunk { - pub data: RecordBatch, -} - -impl Chunk { - pub fn get_datum(&self, col_id: usize, row_id: usize, field_type: &FieldType) -> Datum { - if self.data.column(col_id).is_null(row_id) { - return Datum::Null; - } - - match field_type.as_accessor().tp() { - FieldTypeTp::Tiny - | FieldTypeTp::Short - | FieldTypeTp::Int24 - | FieldTypeTp::Long - | FieldTypeTp::LongLong - | FieldTypeTp::Year => { - if field_type - .as_accessor() - .flag() - .contains(FieldTypeFlag::UNSIGNED) - { - let data = self - .data - .column(col_id) - .as_any() - .downcast_ref::() - .unwrap(); - - Datum::U64(data.value(row_id)) - } else { - let data = self - .data - .column(col_id) - .as_any() - .downcast_ref::() - .unwrap(); - - Datum::I64(data.value(row_id)) - } - } - FieldTypeTp::Float | FieldTypeTp::Double => { - let data = self - .data - .column(col_id) - .as_any() - .downcast_ref::() - .unwrap(); - Datum::F64(data.value(row_id)) - } - _ => unreachable!(), - } - } -} - -pub struct ChunkBuilder { - columns: Vec, -} - -impl ChunkBuilder { - pub fn 
new(cols: usize, rows: usize) -> ChunkBuilder { - ChunkBuilder { - columns: vec![ColumnsBuilder::new(rows); cols], - } - } - - pub fn build(self, tps: &[FieldType]) -> Chunk { - let mut fields = Vec::with_capacity(tps.len()); - let mut arrays: Vec> = Vec::with_capacity(tps.len()); - for (field_type, column) in tps.iter().zip(self.columns.into_iter()) { - match field_type.as_accessor().tp() { - FieldTypeTp::Tiny - | FieldTypeTp::Short - | FieldTypeTp::Int24 - | FieldTypeTp::Long - | FieldTypeTp::LongLong - | FieldTypeTp::Year => { - if field_type - .as_accessor() - .flag() - .contains(FieldTypeFlag::UNSIGNED) - { - let (f, d) = column.into_u64_array(); - fields.push(f); - arrays.push(d); - } else { - let (f, d) = column.into_i64_array(); - fields.push(f); - arrays.push(d); - } - } - FieldTypeTp::Float | FieldTypeTp::Double => { - let (f, d) = column.into_f64_array(); - fields.push(f); - arrays.push(d); - } - _ => unreachable!(), - }; - } - let schema = datatypes::Schema::new(fields); - let batch = RecordBatch::try_new(Arc::new(schema), arrays).unwrap(); - Chunk { data: batch } - } - - pub fn append_datum(&mut self, col_id: usize, data: Datum) { - self.columns[col_id].append_datum(data) - } -} - -#[derive(Clone)] -pub struct ColumnsBuilder { - data: Vec, -} - -impl ColumnsBuilder { - fn new(rows: usize) -> ColumnsBuilder { - ColumnsBuilder { - data: Vec::with_capacity(rows), - } - } - - fn append_datum(&mut self, data: Datum) { - self.data.push(data) - } - - fn into_i64_array(self) -> (Field, Arc) { - let field = Field::new("", DataType::Int64, true); - let mut data: Vec> = Vec::with_capacity(self.data.len()); - for v in self.data { - match v { - Datum::Null => data.push(None), - Datum::I64(v) => data.push(Some(v)), - _ => unreachable!(), - } - } - (field, Arc::new(array::PrimitiveArray::from(data))) - } - - fn into_u64_array(self) -> (Field, Arc) { - let field = Field::new("", DataType::UInt64, true); - let mut data: Vec> = Vec::with_capacity(self.data.len()); - for 
v in self.data { - match v { - Datum::Null => data.push(None), - Datum::U64(v) => data.push(Some(v)), - _ => unreachable!(), - } - } - (field, Arc::new(array::PrimitiveArray::from(data))) - } - - fn into_f64_array(self) -> (Field, Arc) { - let field = Field::new("", DataType::Float64, true); - let mut data: Vec> = Vec::with_capacity(self.data.len()); - for v in self.data { - match v { - Datum::Null => data.push(None), - Datum::F64(v) => data.push(Some(v)), - _ => unreachable!(), - } - } - (field, Arc::new(array::PrimitiveArray::from(data))) - } -} diff --git a/tests/benches/misc/coprocessor/codec/chunk/mod.rs b/tests/benches/misc/coprocessor/codec/chunk/mod.rs deleted file mode 100644 index f956e2cb14e..00000000000 --- a/tests/benches/misc/coprocessor/codec/chunk/mod.rs +++ /dev/null @@ -1,140 +0,0 @@ -// Copyright 2018 TiKV Project Authors. Licensed under Apache-2.0. - -mod chunk; - -use test::Bencher; -use tidb_query_datatype::{ - codec::{ - chunk::{Chunk, ChunkEncoder}, - datum::Datum, - mysql::*, - }, - FieldTypeTp, -}; -use tipb::FieldType; - -#[bench] -fn bench_encode_chunk(b: &mut Bencher) { - let rows = 1024; - let fields: Vec = vec![ - FieldTypeTp::LongLong.into(), - FieldTypeTp::LongLong.into(), - FieldTypeTp::VarChar.into(), - FieldTypeTp::VarChar.into(), - FieldTypeTp::NewDecimal.into(), - FieldTypeTp::Json.into(), - ]; - let mut chunk = Chunk::new(&fields, rows); - for row_id in 0..rows { - let s = format!("{}.123435", row_id); - let bs = Datum::Bytes(s.as_bytes().to_vec()); - let dec = Datum::Dec(s.parse().unwrap()); - let json = Datum::Json(Json::from_string(s).unwrap()); - chunk.append_datum(0, &Datum::Null).unwrap(); - chunk.append_datum(1, &Datum::I64(row_id as i64)).unwrap(); - chunk.append_datum(2, &bs).unwrap(); - chunk.append_datum(3, &bs).unwrap(); - chunk.append_datum(4, &dec).unwrap(); - chunk.append_datum(5, &json).unwrap(); - } - - b.iter(|| { - let mut buf = vec![]; - buf.write_chunk(&chunk).unwrap(); - }); -} - -#[bench] -fn 
bench_chunk_build_tidb(b: &mut Bencher) { - let rows = 1024; - let fields: Vec = vec![FieldTypeTp::LongLong.into(), FieldTypeTp::LongLong.into()]; - - b.iter(|| { - let mut chunk = Chunk::new(&fields, rows); - for row_id in 0..rows { - chunk.append_datum(0, &Datum::Null).unwrap(); - chunk.append_datum(1, &Datum::I64(row_id as i64)).unwrap(); - } - }); -} - -#[bench] -fn bench_chunk_build_official(b: &mut Bencher) { - let rows = 1024; - let fields: Vec = vec![FieldTypeTp::LongLong.into(), FieldTypeTp::LongLong.into()]; - - b.iter(|| { - let mut chunk = chunk::ChunkBuilder::new(fields.len(), rows); - for row_id in 0..rows { - chunk.append_datum(0, Datum::Null); - chunk.append_datum(1, Datum::I64(row_id as i64)); - } - chunk.build(&fields); - }); -} - -#[bench] -fn bench_chunk_iter_tidb(b: &mut Bencher) { - let rows = 1024; - let fields: Vec = vec![FieldTypeTp::LongLong.into(), FieldTypeTp::Double.into()]; - let mut chunk = Chunk::new(&fields, rows); - for row_id in 0..rows { - if row_id & 1 == 0 { - chunk.append_datum(0, &Datum::Null).unwrap(); - } else { - chunk.append_datum(0, &Datum::I64(row_id as i64)).unwrap(); - } - chunk.append_datum(1, &Datum::F64(row_id as f64)).unwrap(); - } - - b.iter(|| { - let mut col1 = 0; - let mut col2 = 0.0; - for row in chunk.iter() { - col1 += match row.get_datum(0, &fields[0]).unwrap() { - Datum::I64(v) => v, - Datum::Null => 0, - _ => unreachable!(), - }; - col2 += match row.get_datum(1, &fields[1]).unwrap() { - Datum::F64(v) => v, - _ => unreachable!(), - }; - } - assert_eq!(col1, 262_144); - assert!(!(523_776.0 - col2).is_normal()); - }); -} - -#[bench] -fn bench_chunk_iter_official(b: &mut Bencher) { - let rows = 1024; - let fields: Vec = vec![FieldTypeTp::LongLong.into(), FieldTypeTp::Double.into()]; - let mut chunk = chunk::ChunkBuilder::new(fields.len(), rows); - for row_id in 0..rows { - if row_id & 1 == 0 { - chunk.append_datum(0, Datum::Null); - } else { - chunk.append_datum(0, Datum::I64(row_id as i64)); - } - - 
chunk.append_datum(1, Datum::F64(row_id as f64)); - } - let chunk = chunk.build(&fields); - b.iter(|| { - let (mut col1, mut col2) = (0, 0.0); - for row_id in 0..chunk.data.num_rows() { - col1 += match chunk.get_datum(0, row_id, &fields[0]) { - Datum::I64(v) => v, - Datum::Null => 0, - _ => unreachable!(), - }; - col2 += match chunk.get_datum(1, row_id, &fields[1]) { - Datum::F64(v) => v, - _ => unreachable!(), - }; - } - assert_eq!(col1, 262_144); - assert!(!(523_776.0 - col2).is_normal()); - }); -} diff --git a/tests/benches/misc/coprocessor/codec/mod.rs b/tests/benches/misc/coprocessor/codec/mod.rs index 274ec362377..082f1c55894 100644 --- a/tests/benches/misc/coprocessor/codec/mod.rs +++ b/tests/benches/misc/coprocessor/codec/mod.rs @@ -1,6 +1,5 @@ // Copyright 2018 TiKV Project Authors. Licensed under Apache-2.0. -mod chunk; mod mysql; use byteorder::{BigEndian, ByteOrder, LittleEndian}; diff --git a/tests/benches/misc/raftkv/mod.rs b/tests/benches/misc/raftkv/mod.rs index d567edd5add..2650434c80f 100644 --- a/tests/benches/misc/raftkv/mod.rs +++ b/tests/benches/misc/raftkv/mod.rs @@ -5,7 +5,7 @@ use std::sync::{Arc, RwLock}; use collections::HashSet; use crossbeam::channel::TrySendError; use engine_rocks::{RocksEngine, RocksSnapshot}; -use engine_traits::{KvEngine, ALL_CFS, CF_DEFAULT}; +use engine_traits::{KvEngine, SnapshotContext, ALL_CFS, CF_DEFAULT}; use futures::future::FutureExt; use kvproto::{ kvrpcpb::{Context, ExtraOp as TxnExtraOp}, @@ -53,7 +53,7 @@ impl SyncBenchRouter { cmd_resp::bind_term(&mut response, 1); match cmd.callback { Callback::Read { cb, .. 
} => { - let snapshot = self.db.snapshot(); + let snapshot = self.db.snapshot(None); let region = Arc::new(self.region.to_owned()); cb(ReadResponse { response, @@ -121,6 +121,7 @@ impl RaftStoreRouter for SyncBenchRouter { impl LocalReadRouter for SyncBenchRouter { fn read( &mut self, + _: Option, _: Option, req: RaftCmdRequest, cb: Callback, @@ -142,7 +143,7 @@ fn new_engine() -> (TempDir, RocksEngine) { #[bench] fn bench_async_snapshots_noop(b: &mut test::Bencher) { let (_dir, db) = new_engine(); - let snapshot = db.snapshot(); + let snapshot = db.snapshot(None); let resp = ReadResponse { response: RaftCmdResponse::default(), snapshot: Some(RegionSnapshot::from_snapshot( diff --git a/tests/benches/raftstore/mod.rs b/tests/benches/raftstore/mod.rs index 05c602824c2..98b348722da 100644 --- a/tests/benches/raftstore/mod.rs +++ b/tests/benches/raftstore/mod.rs @@ -18,7 +18,10 @@ fn enc_write_kvs(db: &RocksEngine, kvs: &[(Vec, Vec)]) { wb.write().unwrap(); } -fn prepare_cluster(cluster: &mut Cluster, initial_kvs: &[(Vec, Vec)]) { +fn prepare_cluster>( + cluster: &mut Cluster, + initial_kvs: &[(Vec, Vec)], +) { cluster.run(); for engines in cluster.engines.values() { enc_write_kvs(&engines.kv, initial_kvs); @@ -35,7 +38,7 @@ struct SetConfig { fn bench_set(b: &mut Bencher<'_>, input: &SetConfig) where - T: Simulator, + T: Simulator, F: ClusterFactory, { let mut cluster = input.factory.build(input.nodes); @@ -57,7 +60,7 @@ struct GetConfig { fn bench_get(b: &mut Bencher<'_>, input: &GetConfig) where - T: Simulator, + T: Simulator, F: ClusterFactory, { let mut cluster = input.factory.build(input.nodes); @@ -84,7 +87,7 @@ struct DeleteConfig { fn bench_delete(b: &mut Bencher<'_>, input: &DeleteConfig) where - T: Simulator, + T: Simulator, F: ClusterFactory, { let mut cluster = input.factory.build(input.nodes); @@ -105,7 +108,7 @@ where fn bench_raft_cluster(c: &mut Criterion, factory: F, label: &str) where - T: Simulator + 'static, + T: Simulator + 'static, F: 
ClusterFactory, { let nodes_coll = vec![1, 3, 5]; @@ -136,15 +139,15 @@ where group.finish(); } -trait ClusterFactory: Clone + fmt::Debug + 'static { - fn build(&self, nodes: usize) -> Cluster; +trait ClusterFactory>: Clone + fmt::Debug + 'static { + fn build(&self, nodes: usize) -> Cluster; } #[derive(Clone)] struct NodeClusterFactory; -impl ClusterFactory for NodeClusterFactory { - fn build(&self, nodes: usize) -> Cluster { +impl ClusterFactory> for NodeClusterFactory { + fn build(&self, nodes: usize) -> Cluster> { new_node_cluster(1, nodes) } } @@ -158,8 +161,8 @@ impl fmt::Debug for NodeClusterFactory { #[derive(Clone)] struct ServerClusterFactory; -impl ClusterFactory for ServerClusterFactory { - fn build(&self, nodes: usize) -> Cluster { +impl ClusterFactory> for ServerClusterFactory { + fn build(&self, nodes: usize) -> Cluster> { new_server_cluster(1, nodes) } } diff --git a/tests/failpoints/cases/mod.rs b/tests/failpoints/cases/mod.rs index 9c90211c073..ed2b8d79f9c 100644 --- a/tests/failpoints/cases/mod.rs +++ b/tests/failpoints/cases/mod.rs @@ -7,14 +7,17 @@ mod test_bootstrap; mod test_cmd_epoch_checker; mod test_conf_change; mod test_coprocessor; +mod test_debugger; mod test_disk_full; mod test_early_apply; mod test_encryption; +mod test_engine; mod test_gc_metrics; mod test_gc_worker; mod test_hibernate; mod test_import_service; mod test_kv_service; +mod test_life; mod test_local_read; mod test_memory_usage_limit; mod test_merge; diff --git a/tests/failpoints/cases/test_async_io.rs b/tests/failpoints/cases/test_async_io.rs index 3d53b9c5f14..8ce349805b0 100644 --- a/tests/failpoints/cases/test_async_io.rs +++ b/tests/failpoints/cases/test_async_io.rs @@ -8,13 +8,15 @@ use std::{ use pd_client::PdClient; use raft::eraftpb::MessageType; use test_raftstore::*; +use test_raftstore_macro::test_case; use tikv_util::HandyRwLock; // Test if the entries can be committed and applied on followers even when // leader's io is paused. 
-#[test] +#[test_case(test_raftstore::new_node_cluster)] +#[test_case(test_raftstore_v2::new_node_cluster)] fn test_async_io_commit_without_leader_persist() { - let mut cluster = new_node_cluster(0, 3); + let mut cluster = new_cluster(0, 3); cluster.cfg.raft_store.cmd_batch_concurrent_ready_max_count = 0; cluster.cfg.raft_store.store_io_pool_size = 2; let pd_client = Arc::clone(&cluster.pd_client); @@ -49,9 +51,10 @@ fn test_async_io_commit_without_leader_persist() { /// Test if the leader delays its destroy after applying conf change to /// remove itself. -#[test] +#[test_case(test_raftstore::new_node_cluster)] +#[test_case(test_raftstore_v2::new_node_cluster)] fn test_async_io_delay_destroy_after_conf_change() { - let mut cluster = new_node_cluster(0, 3); + let mut cluster = new_cluster(0, 3); cluster.cfg.raft_store.store_io_pool_size = 2; let pd_client = Arc::clone(&cluster.pd_client); pd_client.disable_default_operator(); @@ -93,6 +96,9 @@ fn test_async_io_delay_destroy_after_conf_change() { /// Test if the peer can be destroyed when it receives a tombstone msg and /// its snapshot is persisting. +/// +/// Note: snapshot flow is changed, so partitioned-raft-kv does not support this +/// test. #[test] fn test_async_io_cannot_destroy_when_persist_snapshot() { let mut cluster = new_node_cluster(0, 3); @@ -176,6 +182,9 @@ fn test_async_io_cannot_destroy_when_persist_snapshot() { } /// Test if the peer can handle ready when its snapshot is persisting. +/// +/// Note: snapshot flow is changed, so partitioned-raft-kv does not support this +/// test. 
#[test] fn test_async_io_cannot_handle_ready_when_persist_snapshot() { let mut cluster = new_node_cluster(0, 3); diff --git a/tests/failpoints/cases/test_bootstrap.rs b/tests/failpoints/cases/test_bootstrap.rs index 8dc2eb8b371..9b4663616ed 100644 --- a/tests/failpoints/cases/test_bootstrap.rs +++ b/tests/failpoints/cases/test_bootstrap.rs @@ -2,6 +2,7 @@ use std::sync::{Arc, RwLock}; +use engine_rocks::RocksEngine; use engine_traits::Peekable; use kvproto::{kvrpcpb::ApiVersion, metapb, raft_serverpb}; use test_pd_client::TestPdClient; @@ -9,7 +10,9 @@ use test_raftstore::*; fn test_bootstrap_half_way_failure(fp: &str) { let pd_client = Arc::new(TestPdClient::new(0, false)); - let sim = Arc::new(RwLock::new(NodeCluster::new(pd_client.clone()))); + let sim = Arc::new(RwLock::new(NodeCluster::::new( + pd_client.clone(), + ))); let mut cluster = Cluster::new(0, 5, sim, pd_client, ApiVersion::V1); // Try to start this node, return after persisted some keys. diff --git a/tests/failpoints/cases/test_cmd_epoch_checker.rs b/tests/failpoints/cases/test_cmd_epoch_checker.rs index 73bc741d9bb..7c39dd2589b 100644 --- a/tests/failpoints/cases/test_cmd_epoch_checker.rs +++ b/tests/failpoints/cases/test_cmd_epoch_checker.rs @@ -5,12 +5,12 @@ use std::{ time::Duration, }; -use engine_rocks::RocksSnapshot; +use engine_rocks::{RocksEngine, RocksSnapshot}; use kvproto::raft_cmdpb::{RaftCmdRequest, RaftCmdResponse}; use raft::eraftpb::MessageType; use raftstore::store::msg::*; use test_raftstore::*; -use tikv_util::{mpsc::future, HandyRwLock}; +use tikv_util::{future::block_on_timeout, mpsc::future, HandyRwLock}; struct CbReceivers { proposed: mpsc::Receiver<()>, @@ -61,7 +61,7 @@ impl CbReceivers { fn make_cb(cmd: &RaftCmdRequest) -> (Callback, CbReceivers) { let (proposed_tx, proposed_rx) = mpsc::channel(); let (committed_tx, committed_rx) = mpsc::channel(); - let (cb, applied_rx) = make_cb_ext( + let (cb, applied_rx) = make_cb_ext::( cmd, Some(Box::new(move || 
proposed_tx.send(()).unwrap())), Some(Box::new(move || committed_tx.send(()).unwrap())), @@ -76,7 +76,10 @@ fn make_cb(cmd: &RaftCmdRequest) -> (Callback, CbReceivers) { ) } -fn make_write_req(cluster: &mut Cluster, k: &[u8]) -> RaftCmdRequest { +fn make_write_req( + cluster: &mut Cluster>, + k: &[u8], +) -> RaftCmdRequest { let r = cluster.get_region(k); let mut req = new_request( r.get_id(), @@ -399,9 +402,7 @@ fn test_accept_proposal_during_conf_change() { let conf_change_fp = "apply_on_conf_change_all_1"; fail::cfg(conf_change_fp, "pause").unwrap(); let mut add_peer_rx = cluster.async_add_peer(r, new_peer(2, 2)).unwrap(); - add_peer_rx - .recv_timeout(Duration::from_millis(100)) - .unwrap_err(); + block_on_timeout(add_peer_rx.as_mut(), Duration::from_millis(100)).unwrap_err(); // Conf change doesn't affect proposals. let write_req = make_write_req(&mut cluster, b"k"); @@ -419,8 +420,7 @@ fn test_accept_proposal_during_conf_change() { fail::remove(conf_change_fp); assert!( - !add_peer_rx - .recv_timeout(Duration::from_secs(1)) + !block_on_timeout(add_peer_rx, Duration::from_secs(1)) .unwrap() .get_header() .has_error() diff --git a/tests/failpoints/cases/test_conf_change.rs b/tests/failpoints/cases/test_conf_change.rs index c3612e64127..6f91a2ff55b 100644 --- a/tests/failpoints/cases/test_conf_change.rs +++ b/tests/failpoints/cases/test_conf_change.rs @@ -110,7 +110,7 @@ fn test_write_after_destroy() { let mut epoch = cluster.pd_client.get_region_epoch(r1); let mut admin_req = new_admin_request(r1, &epoch, conf_change); admin_req.mut_header().set_peer(new_peer(1, 1)); - let (cb1, mut rx1) = make_cb(&admin_req); + let (cb1, mut rx1) = make_cb_rocks(&admin_req); let engines_3 = cluster.get_all_engines(3); let region = block_on(cluster.pd_client.get_region_by_id(r1)) .unwrap() @@ -126,7 +126,7 @@ fn test_write_after_destroy() { .async_command_on_node(1, admin_req, cb1) .unwrap(); for _ in 0..100 { - let (cb2, _rx2) = make_cb(&put); + let (cb2, _rx2) = 
make_cb_rocks(&put); cluster .sim .rl() diff --git a/tests/failpoints/cases/test_coprocessor.rs b/tests/failpoints/cases/test_coprocessor.rs index 0710f778aa7..be9d978b23a 100644 --- a/tests/failpoints/cases/test_coprocessor.rs +++ b/tests/failpoints/cases/test_coprocessor.rs @@ -31,8 +31,15 @@ fn test_deadline() { fail::cfg("deadline_check_fail", "return()").unwrap(); let resp = handle_request(&endpoint, req); - - assert!(resp.get_other_error().contains("exceeding the deadline")); + let region_err = resp.get_region_error(); + assert_eq!( + region_err.get_server_is_busy().reason, + "deadline is exceeded".to_string() + ); + assert_eq!( + region_err.get_message(), + "Coprocessor task terminated due to exceeding the deadline" + ); } #[test] @@ -46,8 +53,15 @@ fn test_deadline_2() { fail::cfg("rockskv_async_snapshot", "panic").unwrap(); fail::cfg("deadline_check_fail", "return()").unwrap(); let resp = handle_request(&endpoint, req); - - assert!(resp.get_other_error().contains("exceeding the deadline")); + let region_err = resp.get_region_error(); + assert_eq!( + region_err.get_server_is_busy().reason, + "deadline is exceeded".to_string() + ); + assert_eq!( + region_err.get_message(), + "Coprocessor task terminated due to exceeding the deadline" + ); } /// Test deadline exceeded when request is handling @@ -80,12 +94,14 @@ fn test_deadline_3() { let mut resp = SelectResponse::default(); resp.merge_from_bytes(cop_resp.get_data()).unwrap(); - assert!( - cop_resp.other_error.contains("exceeding the deadline") - || resp - .get_error() - .get_msg() - .contains("exceeding the deadline") + let region_err = cop_resp.get_region_error(); + assert_eq!( + region_err.get_server_is_busy().reason, + "deadline is exceeded".to_string() + ); + assert_eq!( + region_err.get_message(), + "Coprocessor task terminated due to exceeding the deadline" ); } diff --git a/tests/failpoints/cases/test_debugger.rs b/tests/failpoints/cases/test_debugger.rs new file mode 100644 index 
00000000000..f70ebcb6d32 --- /dev/null +++ b/tests/failpoints/cases/test_debugger.rs @@ -0,0 +1,147 @@ +// Copyright 2023 TiKV Project Authors. Licensed under Apache-2.0. + +use std::path::Path; + +use engine_traits::{RaftEngine, RaftLogBatch, TabletRegistry}; +use kvproto::{ + kvrpcpb::MvccInfo, + metapb, + raft_serverpb::{PeerState, RegionLocalState}, +}; +use raft_log_engine::RaftLogEngine; +use test_raftstore::new_peer; +use tikv::{ + config::TikvConfig, + server::{debug::Debugger, debug2::new_debugger, KvEngineFactoryBuilder}, + storage::{txn::tests::must_prewrite_put, TestEngineBuilder}, +}; + +const INITIAL_TABLET_INDEX: u64 = 5; +const INITIAL_APPLY_INDEX: u64 = 5; + +// Prepare some data +// Region meta range and rocksdb range of each region: +// Region 1: k01 .. k04 rocksdb: zk00 .. zk04 +// Region 2: k05 .. k09 rocksdb: zk05 .. zk09 +// Region 3: k10 .. k14 rocksdb: zk10 .. zk14 +// Region 4: k15 .. k19 rocksdb: zk15 .. zk19 +// Region 5: k20 .. k24 rocksdb: zk20 .. zk24 +// Region 6: k26 .. k27 rocksdb: zk25 .. 
zk29 +fn prepare_data_on_disk(path: &Path) { + let mut cfg = TikvConfig::default(); + cfg.storage.data_dir = path.to_str().unwrap().to_string(); + cfg.raft_store.raftdb_path = cfg.infer_raft_db_path(None).unwrap(); + cfg.raft_engine.mut_config().dir = cfg.infer_raft_engine_path(None).unwrap(); + cfg.gc.enable_compaction_filter = false; + let cache = cfg.storage.block_cache.build_shared_cache(); + let env = cfg.build_shared_rocks_env(None, None).unwrap(); + + let factory = KvEngineFactoryBuilder::new(env, &cfg, cache, None).build(); + let reg = TabletRegistry::new(Box::new(factory), path).unwrap(); + + let raft_engine = RaftLogEngine::new(cfg.raft_engine.config(), None, None).unwrap(); + let mut wb = raft_engine.log_batch(5); + for i in 0..6 { + let mut region = metapb::Region::default(); + let start_key = if i != 0 { + format!("k{:02}", i * 5) + } else { + String::from("k01") + }; + let end_key = format!("k{:02}", (i + 1) * 5); + region.set_id(i + 1); + region.set_start_key(start_key.into_bytes()); + region.set_end_key(end_key.into_bytes()); + let mut region_state = RegionLocalState::default(); + region_state.set_tablet_index(INITIAL_TABLET_INDEX); + if region.get_id() == 4 { + region_state.set_state(PeerState::Tombstone); + } else if region.get_id() == 6 { + region.set_start_key(b"k26".to_vec()); + region.set_end_key(b"k28".to_vec()); + } + // add dummy peer to pass verification + region.mut_peers().push(new_peer(0, 0)); + region_state.set_region(region); + + let tablet_path = reg.tablet_path(i + 1, INITIAL_TABLET_INDEX); + // Use tikv_kv::RocksEngine instead of loading tablet from registry in order to + // use prewrite method to prepare mvcc data + let mut engine = TestEngineBuilder::new().path(tablet_path).build().unwrap(); + for i in i * 5..(i + 1) * 5 { + let key = format!("zk{:02}", i); + let val = format!("val{:02}", i); + // Use prewrite only is enough for preparing mvcc data + must_prewrite_put( + &mut engine, + key.as_bytes(), + val.as_bytes(), + 
+ key.as_bytes(), + 10, + ); + } + + wb.put_region_state(i + 1, INITIAL_APPLY_INDEX, &region_state) + .unwrap(); + } + raft_engine.consume(&mut wb, true).unwrap(); +} + +// For simplicity, the format of the key is inline with data in +// prepare_data_on_disk +fn extract_key(key: &[u8]) -> &[u8] { + &key[1..4] +} + +#[test] +fn test_scan_mvcc() { + // We deliberately make region meta not match with rocksdb, set unlimited range + // compaction filter to avoid trim operation. + fail::cfg("unlimited_range_compaction_filter", "return").unwrap(); + + let dir = test_util::temp_dir("test-debugger", false); + prepare_data_on_disk(dir.path()); + let debugger = new_debugger(dir.path()); + // Test scan with bad start, end or limit. + assert!(debugger.scan_mvcc(b"z", b"", 0).is_err()); + assert!(debugger.scan_mvcc(b"z", b"x", 3).is_err()); + + let verify_scanner = + |range, scanner: &mut dyn Iterator, MvccInfo)>>| { + for i in range { + let key = format!("k{:02}", i).into_bytes(); + assert_eq!(key, extract_key(&scanner.next().unwrap().unwrap().0)); + } + }; + + // full scan + let mut scanner = debugger.scan_mvcc(b"", b"", 100).unwrap(); + verify_scanner(1..15, &mut scanner); + verify_scanner(20..25, &mut scanner); + verify_scanner(26..28, &mut scanner); + assert!(scanner.next().is_none()); + + // Range has more elements than limit + let mut scanner = debugger.scan_mvcc(b"zk01", b"zk09", 5).unwrap(); + verify_scanner(1..6, &mut scanner); + assert!(scanner.next().is_none()); + + // Range has less elements than limit + let mut scanner = debugger.scan_mvcc(b"zk07", b"zk10", 10).unwrap(); + verify_scanner(7..10, &mut scanner); + assert!(scanner.next().is_none()); + + // Start from the key where no region contains it + let mut scanner = debugger.scan_mvcc(b"zk16", b"", 100).unwrap(); + verify_scanner(20..25, &mut scanner); + verify_scanner(26..28, &mut scanner); + assert!(scanner.next().is_none()); + + // Scan a range not existed in the cluster + let mut scanner = 
debugger.scan_mvcc(b"zk16", b"zk19", 100).unwrap(); + assert!(scanner.next().is_none()); + + // The end key is less than the start_key of the first region + let mut scanner = debugger.scan_mvcc(b"", b"zj", 100).unwrap(); + assert!(scanner.next().is_none()); +} diff --git a/tests/failpoints/cases/test_disk_full.rs b/tests/failpoints/cases/test_disk_full.rs index bd4271be12d..d8b3fadb054 100644 --- a/tests/failpoints/cases/test_disk_full.rs +++ b/tests/failpoints/cases/test_disk_full.rs @@ -5,12 +5,12 @@ use std::{thread, time::Duration}; use kvproto::{ disk_usage::DiskUsage, kvrpcpb::{DiskFullOpt, Op}, - metapb::Region, raft_cmdpb::*, }; use raft::eraftpb::MessageType; use raftstore::store::msg::*; use test_raftstore::*; +use test_raftstore_macro::test_case; use tikv_util::{config::ReadableDuration, future::block_on_timeout, time::Instant}; fn assert_disk_full(resp: &RaftCmdResponse) { @@ -34,148 +34,147 @@ fn get_fp(usage: DiskUsage, store_id: u64) -> String { } // check the region new leader is elected. -fn assert_region_leader_changed( - cluster: &mut Cluster, - region_id: u64, - original_leader: u64, -) { - let timer = Instant::now(); - loop { - if timer.saturating_elapsed() > Duration::from_secs(5) { - panic!("Leader cannot change when the only disk full node is leader"); +macro_rules! 
assert_region_leader_changed { + ($cluster:expr, $region_id:expr, $original_leader:expr) => {{ + let timer = Instant::now(); + loop { + if timer.saturating_elapsed() > Duration::from_secs(5) { + panic!("Leader cannot change when the only disk full node is leader"); + } + let new_leader = $cluster.query_leader(1, $region_id, Duration::from_secs(1)); + if new_leader.is_none() { + sleep_ms(10); + continue; + } + if new_leader.unwrap().get_id() == $original_leader { + sleep_ms(10); + continue; + } else { + break; + } } - let new_leader = cluster.query_leader(1, region_id, Duration::from_secs(1)); - if new_leader.is_none() { - sleep_ms(10); - continue; - } - if new_leader.unwrap().get_id() == original_leader { - sleep_ms(10); - continue; - } else { - break; - } - } + }}; } -fn ensure_disk_usage_is_reported( - cluster: &mut Cluster, - peer_id: u64, - store_id: u64, - region: &Region, -) { - let peer = new_peer(store_id, peer_id); - let key = region.get_start_key(); - let ch = async_read_on_peer(cluster, peer, region.clone(), key, true, true); - block_on_timeout(ch, Duration::from_secs(1)).unwrap(); +macro_rules! ensure_disk_usage_is_reported { + ($cluster:expr, $peer_id:expr, $store_id:expr, $region:expr) => {{ + let peer = new_peer($store_id, $peer_id); + let key = $region.get_start_key(); + let ch = async_read_on_peer($cluster, peer, $region.clone(), key, true, true); + block_on_timeout(ch, Duration::from_secs(1)).unwrap(); + }}; } -fn test_disk_full_leader_behaviors(usage: DiskUsage) { - let mut cluster = new_node_cluster(0, 3); - cluster.pd_client.disable_default_operator(); - cluster.run(); - - // To ensure all replicas are not pending. 
- cluster.must_put(b"k1", b"v1"); - must_get_equal(&cluster.get_engine(1), b"k1", b"v1"); - must_get_equal(&cluster.get_engine(2), b"k1", b"v1"); - must_get_equal(&cluster.get_engine(3), b"k1", b"v1"); - - cluster.must_transfer_leader(1, new_peer(1, 1)); - fail::cfg(get_fp(usage, 1), "return").unwrap(); - - // Test new normal proposals won't be allowed when disk is full. - let old_last_index = cluster.raft_local_state(1, 1).last_index; - let mut rx = cluster.async_put(b"k2", b"v2").unwrap(); - assert_disk_full(&rx.recv_timeout(Duration::from_secs(2)).unwrap()); - let new_last_index = cluster.raft_local_state(1, 1).last_index; - assert_eq!(old_last_index, new_last_index); - - assert_region_leader_changed(&mut cluster, 1, 1); - fail::remove(get_fp(usage, 1)); - cluster.must_transfer_leader(1, new_peer(1, 1)); - fail::cfg(get_fp(usage, 1), "return").unwrap(); - - // merge/split is only allowed on disk almost full. - if usage != DiskUsage::AlreadyFull { - // Test split must be allowed when disk is full. - let region = cluster.get_region(b"k1"); - cluster.must_split(®ion, b"k1"); +#[test_case(test_raftstore::new_node_cluster)] +#[test_case(test_raftstore_v2::new_node_cluster)] +fn test_disk_full_leader_behaviors() { + for usage in [DiskUsage::AlmostFull, DiskUsage::AlreadyFull] { + let mut cluster = new_cluster(0, 3); + cluster.cfg.raft_store.gc_peer_check_interval = ReadableDuration::millis(500); // set gc duration for v2 + cluster.pd_client.disable_default_operator(); + cluster.run(); + + // To ensure all replicas are not pending. + cluster.must_put(b"k1", b"v1"); + must_get_equal(&cluster.get_engine(1), b"k1", b"v1"); + must_get_equal(&cluster.get_engine(2), b"k1", b"v1"); + must_get_equal(&cluster.get_engine(3), b"k1", b"v1"); + + cluster.must_transfer_leader(1, new_peer(1, 1)); + fail::cfg(get_fp(usage, 1), "return").unwrap(); + + // Test new normal proposals won't be allowed when disk is full. 
+ let old_last_index = cluster.raft_local_state(1, 1).last_index; + let rx = cluster.async_put(b"k2", b"v2").unwrap(); + assert_disk_full(&block_on_timeout(rx, Duration::from_secs(2)).unwrap()); + let new_last_index = cluster.raft_local_state(1, 1).last_index; + assert_eq!(old_last_index, new_last_index); + + assert_region_leader_changed!(&cluster, 1, 1); + fail::remove(get_fp(usage, 1)); + cluster.must_transfer_leader(1, new_peer(1, 1)); + fail::cfg(get_fp(usage, 1), "return").unwrap(); + + // merge/split is only allowed on disk almost full. + if usage != DiskUsage::AlreadyFull { + // Test split must be allowed when disk is full. + let region = cluster.get_region(b"k1"); + cluster.must_split(®ion, b"k1"); + } + // Test transfer leader should be allowed. + cluster.must_transfer_leader(1, new_peer(2, 2)); + + // Transfer the leadership back to store 1. + fail::remove(get_fp(usage, 1)); + cluster.must_transfer_leader(1, new_peer(1, 1)); + fail::cfg(get_fp(usage, 1), "return").unwrap(); + + // Test remove peer should be allowed. + cluster.pd_client.must_remove_peer(1, new_peer(3, 3)); + // Sleep for a while until the disk usage and peer changes have been synced. + thread::sleep(Duration::from_secs(1)); + must_get_none(&cluster.get_engine(3), b"k1"); + + // Test add peer should be allowed. It must be a higher peer-id in v2. + cluster.pd_client.must_add_peer(1, new_peer(3, 4)); + must_get_equal(&cluster.get_engine(3), b"k1", b"v1"); + + fail::remove(get_fp(usage, 1)); + // Sleep for a while before next case to make it clear. + thread::sleep(Duration::from_secs(1)); } - // Test transfer leader should be allowed. - cluster.must_transfer_leader(1, new_peer(2, 2)); - - // Transfer the leadership back to store 1. - fail::remove(get_fp(usage, 1)); - cluster.must_transfer_leader(1, new_peer(1, 1)); - fail::cfg(get_fp(usage, 1), "return").unwrap(); - - // Test remove peer should be allowed. 
- cluster.pd_client.must_remove_peer(1, new_peer(3, 3)); - must_get_none(&cluster.get_engine(3), b"k1"); - - // Test add peer should be allowed. - cluster.pd_client.must_add_peer(1, new_peer(3, 3)); - must_get_equal(&cluster.get_engine(3), b"k1", b"v1"); - - fail::remove(get_fp(usage, 1)); -} - -#[test] -fn test_disk_full_for_region_leader() { - test_disk_full_leader_behaviors(DiskUsage::AlmostFull); - test_disk_full_leader_behaviors(DiskUsage::AlreadyFull); -} - -fn test_disk_full_follower_behaviors(usage: DiskUsage) { - let mut cluster = new_node_cluster(0, 3); - cluster.pd_client.disable_default_operator(); - cluster.run(); - - // To ensure all replicas are not pending. - cluster.must_put(b"k1", b"v1"); - must_get_equal(&cluster.get_engine(1), b"k1", b"v1"); - must_get_equal(&cluster.get_engine(2), b"k1", b"v1"); - must_get_equal(&cluster.get_engine(3), b"k1", b"v1"); - - cluster.must_transfer_leader(1, new_peer(1, 1)); - fail::cfg(get_fp(usage, 2), "return").unwrap(); - - // Test followers will reject pre-transfer-leader command. - let epoch = cluster.get_region_epoch(1); - let transfer = new_admin_request(1, &epoch, new_transfer_leader_cmd(new_peer(2, 2))); - cluster - .call_command_on_leader(transfer, Duration::from_secs(3)) - .unwrap(); - assert_eq!(cluster.leader_of_region(1).unwrap(), new_peer(1, 1)); - cluster.must_put(b"k2", b"v2"); - - // Test leader shouldn't append entries to disk full followers. - let old_last_index = cluster.raft_local_state(1, 2).last_index; - cluster.must_put(b"k3", b"v3"); - let new_last_index = cluster.raft_local_state(1, 2).last_index; - assert_eq!(old_last_index, new_last_index); - must_get_none(&cluster.get_engine(2), b"k3"); - - // Test followers will response votes when disk is full. 
- cluster.add_send_filter(CloneFilterFactory( - RegionPacketFilter::new(1, 1) - .direction(Direction::Send) - .msg_type(MessageType::MsgRequestVoteResponse), - )); - cluster.must_transfer_leader(1, new_peer(3, 3)); - - fail::remove(get_fp(usage, 2)); } -#[test] -fn test_disk_full_for_region_follower() { - test_disk_full_follower_behaviors(DiskUsage::AlmostFull); - test_disk_full_follower_behaviors(DiskUsage::AlreadyFull); +#[test_case(test_raftstore::new_node_cluster)] +#[test_case(test_raftstore_v2::new_node_cluster)] +fn test_disk_full_follower_behaviors() { + for usage in [DiskUsage::AlmostFull, DiskUsage::AlreadyFull] { + let mut cluster = new_cluster(0, 3); + cluster.pd_client.disable_default_operator(); + cluster.run(); + + // To ensure all replicas are not pending. + cluster.must_put(b"k1", b"v1"); + must_get_equal(&cluster.get_engine(1), b"k1", b"v1"); + must_get_equal(&cluster.get_engine(2), b"k1", b"v1"); + must_get_equal(&cluster.get_engine(3), b"k1", b"v1"); + + cluster.must_transfer_leader(1, new_peer(1, 1)); + fail::cfg(get_fp(usage, 2), "return").unwrap(); + + // Test followers will reject pre-transfer-leader command. + let epoch = cluster.get_region_epoch(1); + let transfer = new_admin_request(1, &epoch, new_transfer_leader_cmd(new_peer(2, 2))); + cluster + .call_command_on_leader(transfer, Duration::from_secs(3)) + .unwrap(); + assert_eq!(cluster.leader_of_region(1).unwrap(), new_peer(1, 1)); + cluster.must_put(b"k2", b"v2"); + + // Test leader shouldn't append entries to disk full followers. + let old_last_index = cluster.raft_local_state(1, 2).last_index; + cluster.must_put(b"k3", b"v3"); + let new_last_index = cluster.raft_local_state(1, 2).last_index; + assert_eq!(old_last_index, new_last_index); + must_get_none(&cluster.get_engine(2), b"k3"); + + // Test followers will response votes when disk is full. 
+ cluster.add_send_filter(CloneFilterFactory( + RegionPacketFilter::new(1, 1) + .direction(Direction::Send) + .msg_type(MessageType::MsgRequestVoteResponse), + )); + cluster.must_transfer_leader(1, new_peer(3, 3)); + + fail::remove(get_fp(usage, 2)); + } } -fn test_disk_full_txn_behaviors(usage: DiskUsage) { - let mut cluster = new_server_cluster(0, 3); +#[test_case(test_raftstore::new_server_cluster)] +#[test_case(test_raftstore_v2::new_server_cluster)] +fn test_disk_full_txn_behaviors() { + let usage = DiskUsage::AlmostFull; + let mut cluster = new_cluster(0, 3); cluster.pd_client.disable_default_operator(); cluster.run(); @@ -199,7 +198,7 @@ fn test_disk_full_txn_behaviors(usage: DiskUsage) { DiskFullOpt::NotAllowedOnFull, ); assert!(res.get_region_error().has_disk_full()); - assert_region_leader_changed(&mut cluster, 1, 1); + assert_region_leader_changed!(&cluster, 1, 1); fail::remove(get_fp(usage, 1)); cluster.must_transfer_leader(1, new_peer(1, 1)); @@ -269,16 +268,13 @@ fn test_disk_full_txn_behaviors(usage: DiskUsage) { fail::remove(get_fp(usage, 1)); } -#[test] -fn test_disk_full_for_txn_operations() { - test_disk_full_txn_behaviors(DiskUsage::AlmostFull); -} - -#[test] +#[test_case(test_raftstore::new_node_cluster)] +#[test_case(test_raftstore_v2::new_node_cluster)] fn test_majority_disk_full() { - let mut cluster = new_node_cluster(0, 3); + let mut cluster = new_cluster(0, 3); // To ensure the thread has full store disk usage infomation. cluster.cfg.raft_store.store_batch_system.pool_size = 1; + cluster.cfg.raft_store.gc_peer_check_interval = ReadableDuration::millis(500); // set gc duration for v2 cluster.pd_client.disable_default_operator(); cluster.run(); @@ -295,12 +291,12 @@ fn test_majority_disk_full() { // To ensure followers have reported disk usages to the leader. 
for i in 1..3 { fail::cfg(get_fp(DiskUsage::AlmostFull, i + 1), "return").unwrap(); - ensure_disk_usage_is_reported(&mut cluster, i + 1, i + 1, ®ion); + ensure_disk_usage_is_reported!(&mut cluster, i + 1, i + 1, ®ion); } // Normal proposals will be rejected because of majority peers' disk full. - let mut ch = cluster.async_put(b"k2", b"v2").unwrap(); - let resp = ch.recv_timeout(Duration::from_secs(1)).unwrap(); + let ch = cluster.async_put(b"k2", b"v2").unwrap(); + let resp = block_on_timeout(ch, Duration::from_secs(1)).unwrap(); assert_eq!(disk_full_stores(&resp), vec![2, 3]); // Proposals with special `DiskFullOpt`s can be accepted even if all peers are @@ -310,8 +306,8 @@ fn test_majority_disk_full() { let put = new_request(1, epoch.clone(), reqs, false); let mut opts = RaftCmdExtraOpts::default(); opts.disk_full_opt = DiskFullOpt::AllowedOnAlmostFull; - let mut ch = cluster.async_request_with_opts(put, opts).unwrap(); - let resp = ch.recv_timeout(Duration::from_secs(1)).unwrap(); + let ch = cluster.async_request_with_opts(put, opts).unwrap(); + let resp = block_on_timeout(ch, Duration::from_secs(1)).unwrap(); assert!(!resp.get_header().has_error()); // Reset disk full status for peer 2 and 3. 2 follower reads must success @@ -319,14 +315,14 @@ fn test_majority_disk_full() { // new disk usages are reported. for i in 1..3 { fail::remove(get_fp(DiskUsage::AlmostFull, i + 1)); - ensure_disk_usage_is_reported(&mut cluster, i + 1, i + 1, ®ion); + ensure_disk_usage_is_reported!(&mut cluster, i + 1, i + 1, ®ion); must_get_equal(&cluster.get_engine(i + 1), b"k3", b"v3"); } // To ensure followers have reported disk usages to the leader. 
for i in 1..3 { fail::cfg(get_fp(DiskUsage::AlreadyFull, i + 1), "return").unwrap(); - ensure_disk_usage_is_reported(&mut cluster, i + 1, i + 1, ®ion); + ensure_disk_usage_is_reported!(&mut cluster, i + 1, i + 1, ®ion); } // Proposals with special `DiskFullOpt`s will still be rejected if majority @@ -335,17 +331,19 @@ fn test_majority_disk_full() { let put = new_request(1, epoch.clone(), reqs, false); let mut opts = RaftCmdExtraOpts::default(); opts.disk_full_opt = DiskFullOpt::AllowedOnAlmostFull; - let mut ch = cluster.async_request_with_opts(put, opts).unwrap(); - let resp = ch.recv_timeout(Duration::from_secs(10)).unwrap(); + let ch = cluster.async_request_with_opts(put, opts).unwrap(); + let resp = block_on_timeout(ch, Duration::from_secs(10)).unwrap(); assert_eq!(disk_full_stores(&resp), vec![2, 3]); // Peer 2 disk usage changes from already full to almost full. fail::remove(get_fp(DiskUsage::AlreadyFull, 2)); fail::cfg(get_fp(DiskUsage::AlmostFull, 2), "return").unwrap(); - ensure_disk_usage_is_reported(&mut cluster, 2, 2, ®ion); + ensure_disk_usage_is_reported!(&mut cluster, 2, 2, ®ion); - // Configuration change should be alloed. + // Configuration change should be allowed. cluster.pd_client.must_remove_peer(1, new_peer(2, 2)); + // Sleep for a while until the disk usage and peer changes have been synced. + thread::sleep(Duration::from_secs(1)); // After the last configuration change is applied, the raft group will be like // `[(1, DiskUsage::AlmostFull), (3, DiskUsage::AlreadyFull)]`. 
So no more @@ -354,8 +352,8 @@ fn test_majority_disk_full() { let put = new_request(1, epoch, reqs, false); let mut opts = RaftCmdExtraOpts::default(); opts.disk_full_opt = DiskFullOpt::AllowedOnAlmostFull; - let mut ch = cluster.async_request_with_opts(put, opts).unwrap(); - let resp = ch.recv_timeout(Duration::from_secs(1)).unwrap(); + let ch = cluster.async_request_with_opts(put, opts).unwrap(); + let resp = block_on_timeout(ch, Duration::from_secs(1)).unwrap(); assert_eq!(disk_full_stores(&resp), vec![3]); for i in 0..3 { @@ -364,9 +362,10 @@ fn test_majority_disk_full() { } } -#[test] +#[test_case(test_raftstore::new_node_cluster)] +#[test_case(test_raftstore_v2::new_node_cluster)] fn test_disk_full_followers_with_hibernate_regions() { - let mut cluster = new_node_cluster(0, 2); + let mut cluster = new_cluster(0, 2); // To ensure the thread has full store disk usage infomation. cluster.cfg.raft_store.store_batch_system.pool_size = 1; cluster.pd_client.disable_default_operator(); @@ -391,31 +390,13 @@ fn test_disk_full_followers_with_hibernate_regions() { must_get_equal(&cluster.get_engine(2), b"k2", b"v2"); } -// check the region new leader is elected. -fn assert_region_merged( - cluster: &mut Cluster, - left_region_key: &[u8], - right_region_key: &[u8], -) { - let timer = Instant::now(); - loop { - if timer.saturating_elapsed() > Duration::from_secs(5) { - panic!("region merge failed"); - } - let region_left = cluster.get_region(left_region_key); - let region_right = cluster.get_region(right_region_key); - if region_left.get_id() != region_right.get_id() { - sleep_ms(10); - continue; - } else { - break; - } - } -} - -#[test] +// #[test_case(test_raftstore_v2::new_server_cluster)] +// FIXME: #[test_case(test_raftstore_v2::new_server_cluster)] +// In v2 `must_try_merge` always return error. Also the last `must_merge` +// sometimes cannot get an updated min_matched. 
+#[test_case(test_raftstore::new_server_cluster)] fn test_merge_on_majority_disk_full() { - let mut cluster = new_server_cluster(0, 3); + let mut cluster = new_cluster(0, 3); // To ensure the thread has full store disk usage infomation. cluster.cfg.raft_store.store_batch_system.pool_size = 1; cluster.pd_client.disable_default_operator(); @@ -448,23 +429,42 @@ fn test_merge_on_majority_disk_full() { fail::cfg(get_fp(DiskUsage::AlmostFull, i), "return").unwrap(); } for peer in region1.get_peers().iter() { - ensure_disk_usage_is_reported(&mut cluster, peer.get_id(), peer.get_store_id(), ®ion1); + ensure_disk_usage_is_reported!(&mut cluster, peer.get_id(), peer.get_store_id(), ®ion1); } for peer in region2.get_peers().iter() { - ensure_disk_usage_is_reported(&mut cluster, peer.get_id(), peer.get_store_id(), ®ion2); + ensure_disk_usage_is_reported!(&mut cluster, peer.get_id(), peer.get_store_id(), ®ion2); } cluster.must_try_merge(region1.get_id(), region2.get_id()); - assert_region_merged(&mut cluster, b"k1", b"k3"); + + // check the region new leader is elected. + let assert_region_merged = |left_region_key: &[u8], right_region_key: &[u8]| { + let timer = Instant::now(); + loop { + if timer.saturating_elapsed() > Duration::from_secs(5) { + panic!("region merge failed"); + } + let region_left = cluster.get_region(left_region_key); + let region_right = cluster.get_region(right_region_key); + if region_left.get_id() != region_right.get_id() { + sleep_ms(10); + continue; + } else { + break; + } + } + }; + assert_region_merged(b"k1", b"k3"); for i in 1..3 { fail::remove(get_fp(DiskUsage::AlmostFull, i)); } } -#[test] +#[test_case(test_raftstore::new_server_cluster)] +#[test_case(test_raftstore_v2::new_server_cluster)] fn test_almost_and_already_full_behavior() { - let mut cluster = new_server_cluster(0, 5); + let mut cluster = new_cluster(0, 5); // To ensure the thread has full store disk usage infomation. 
cluster.cfg.raft_store.store_batch_system.pool_size = 1; cluster.pd_client.disable_default_operator(); @@ -481,7 +481,7 @@ fn test_almost_and_already_full_behavior() { fail::cfg(get_fp(DiskUsage::AlreadyFull, i), "return").unwrap(); } for i in 1..5 { - ensure_disk_usage_is_reported(&mut cluster, i + 1, i + 1, ®ion); + ensure_disk_usage_is_reported!(&mut cluster, i + 1, i + 1, ®ion); } let lead_client = PeerClient::new(&cluster, 1, new_peer(1, 1)); @@ -521,29 +521,10 @@ fn test_almost_and_already_full_behavior() { } } -fn wait_down_peers_reported( - cluster: &Cluster, - total_down_count: u64, - target_report_peer: u64, -) { - let mut peers = cluster.get_down_peers(); - let timer = Instant::now(); - loop { - if timer.saturating_elapsed() > Duration::from_secs(5) { - panic!("Leader cannot change when the only disk full node is leader"); - } - - if peers.len() == total_down_count as usize && peers.contains_key(&target_report_peer) { - return; - } - sleep_ms(10); - peers = cluster.get_down_peers(); - } -} - -#[test] +#[test_case(test_raftstore::new_server_cluster)] +#[test_case(test_raftstore_v2::new_server_cluster)] fn test_down_node_when_disk_full() { - let mut cluster = new_server_cluster(0, 5); + let mut cluster = new_cluster(0, 5); // To ensure the thread has full store disk usage infomation. 
cluster.cfg.raft_store.store_batch_system.pool_size = 1; cluster.cfg.raft_store.max_peer_down_duration = ReadableDuration::secs(1); @@ -555,7 +536,7 @@ fn test_down_node_when_disk_full() { let region = cluster.get_region(b"k1"); for i in 3..6 { fail::cfg(get_fp(DiskUsage::AlmostFull, i), "return").unwrap(); - ensure_disk_usage_is_reported(&mut cluster, i, i, ®ion); + ensure_disk_usage_is_reported!(&mut cluster, i, i, ®ion); } let lead_client = PeerClient::new(&cluster, 1, new_peer(1, 1)); @@ -574,7 +555,23 @@ fn test_down_node_when_disk_full() { ); cluster.stop_node(2); - wait_down_peers_reported(&cluster, 1, 2u64); + + let wait_down_peers_reported = |total_down_count: u64, target_report_peer: u64| { + let mut peers = cluster.get_down_peers(); + let timer = Instant::now(); + loop { + if timer.saturating_elapsed() > Duration::from_secs(5) { + panic!("Leader cannot change when the only disk full node is leader"); + } + + if peers.len() == total_down_count as usize && peers.contains_key(&target_report_peer) { + return; + } + sleep_ms(10); + peers = cluster.get_down_peers(); + } + }; + wait_down_peers_reported(1u64, 2u64); let prewrite_ts = get_tso(&cluster.pd_client); let res = lead_client.try_kv_prewrite( diff --git a/tests/failpoints/cases/test_early_apply.rs b/tests/failpoints/cases/test_early_apply.rs index a194ef74d8f..bf403fb4668 100644 --- a/tests/failpoints/cases/test_early_apply.rs +++ b/tests/failpoints/cases/test_early_apply.rs @@ -7,14 +7,16 @@ use std::sync::{ use raft::eraftpb::MessageType; use test_raftstore::*; +use test_raftstore_macro::test_case; // Test if a singleton can apply a log before persisting it. -#[test] +#[test_case(test_raftstore::new_node_cluster)] +#[test_case(test_raftstore_v2::new_node_cluster)] fn test_singleton_cannot_early_apply() { - let mut cluster = new_node_cluster(0, 1); + let mut cluster = new_cluster(0, 1); cluster.pd_client.disable_default_operator(); // So compact log will not be triggered automatically. 
- configure_for_request_snapshot(&mut cluster); + configure_for_request_snapshot(&mut cluster.cfg); cluster.run(); // Put one key first to cache leader. @@ -33,13 +35,14 @@ fn test_singleton_cannot_early_apply() { must_get_equal(&cluster.get_engine(1), b"k1", b"v1"); } -#[test] +#[test_case(test_raftstore::new_node_cluster)] +#[test_case(test_raftstore_v2::new_node_cluster)] fn test_multi_early_apply() { - let mut cluster = new_node_cluster(0, 3); + let mut cluster = new_cluster(0, 3); cluster.pd_client.disable_default_operator(); cluster.cfg.raft_store.store_batch_system.pool_size = 1; // So compact log will not be triggered automatically. - configure_for_request_snapshot(&mut cluster); + configure_for_request_snapshot(&mut cluster.cfg); cluster.run_conf_change(); // Check mixed regions can be scheduled correctly. @@ -68,9 +71,11 @@ fn test_multi_early_apply() { })), )); cluster.async_put(b"k4", b"v4").unwrap(); - // Sleep a while so that follower will send append response. + // Sleep a while so that follower will send append response sleep_ms(100); cluster.async_put(b"k11", b"v22").unwrap(); + // Sleep a while so that follower will send append response. + sleep_ms(100); // Now the store thread of store 1 pauses on `store_1_fp`. // Set `store_1_fp` again to make this store thread does not pause on it. // Then leader 1 will receive the append response and commit the log. @@ -92,6 +97,9 @@ fn test_multi_early_apply() { /// the peer to fix this issue. /// For simplicity, this test uses region merge to ensure that the apply state /// will be written to kv db before crash. 
+/// +/// Note: partitioned-raft-kv does not need this due to change in disk +/// persistence logic #[test] fn test_early_apply_yield_followed_with_many_entries() { let mut cluster = new_node_cluster(0, 3); diff --git a/tests/failpoints/cases/test_engine.rs b/tests/failpoints/cases/test_engine.rs new file mode 100644 index 00000000000..073f7276419 --- /dev/null +++ b/tests/failpoints/cases/test_engine.rs @@ -0,0 +1,139 @@ +// Copyright 2023 TiKV Project Authors. Licensed under Apache-2.0. + +use std::{ + sync::{mpsc::channel, Mutex}, + time::Duration, +}; + +use engine_traits::{MiscExt, CF_DEFAULT, CF_LOCK, CF_WRITE}; +use tikv_util::config::ReadableSize; + +fn dummy_string(len: usize) -> String { + String::from_utf8(vec![0; len]).unwrap() +} + +#[test] +fn test_write_buffer_manager() { + use test_raftstore_v2::*; + let count = 1; + let mut cluster = new_node_cluster(0, count); + cluster.cfg.rocksdb.lockcf.write_buffer_limit = Some(ReadableSize::kb(10)); + cluster.cfg.rocksdb.defaultcf.write_buffer_limit = Some(ReadableSize::kb(10)); + cluster.cfg.rocksdb.write_buffer_limit = Some(ReadableSize::kb(30)); + + // Let write buffer size small to make memtable request fewer memories. + // Otherwise, one single memory request can exceeds the write buffer limit set + // above. 
+ cluster.cfg.rocksdb.lockcf.write_buffer_size = Some(ReadableSize::kb(64)); + cluster.cfg.rocksdb.writecf.write_buffer_size = Some(ReadableSize::kb(64)); + cluster.cfg.rocksdb.defaultcf.write_buffer_size = Some(ReadableSize::kb(64)); + cluster.run(); + + let dummy = dummy_string(500); + let fp = "on_memtable_sealed"; + fail::cfg(fp, "return(lock)").unwrap(); + + for i in 0..10 { + let key = format!("key-{:03}", i); + for cf in &[CF_WRITE, CF_LOCK] { + cluster.must_put_cf(cf, key.as_bytes(), dummy.as_bytes()); + } + } + + fail::cfg(fp, "return(default)").unwrap(); + + for i in 0..10 { + let key = format!("key-{:03}", i); + for cf in &[CF_WRITE, CF_DEFAULT] { + cluster.must_put_cf(cf, key.as_bytes(), dummy.as_bytes()); + } + } + + fail::cfg(fp, "return(write)").unwrap(); + let dummy = dummy_string(1000); + for i in 0..10 { + let key = format!("key-{:03}", i); + cluster.must_put_cf(CF_WRITE, key.as_bytes(), dummy.as_bytes()); + } +} + +// The test mocks the senario before https://github.com/tikv/rocksdb/pull/347: +// note: before rocksdb/pull/347, lock is called before on_memtable_sealed. +// Case: +// Assume FlushMemtable cf1 (schedule flush task) and BackgroundCallFlush cf1 +// (execute flush task) are performed concurrently. +// t FlushMemtable cf1 BackgroundCallFlush cf1 +// 1. lock +// 2. convert memtable t2(seqno. 10-20) +// to immemtable +// 3. unlock +// 4. lock +// 5. pick memtables to flush: +// t1(0-10), t2(10-20) +// flush job(0-20) +// 6. finish flush +// 7. unlock +// 8. on_flush_completed: +// update last_flushed to 20 +// 9. 
on_memtable_sealed +// 10 > 20 *panic* +#[test] +fn test_rocksdb_listener() { + use test_raftstore_v2::*; + let count = 1; + let mut cluster = new_node_cluster(0, count); + // make flush thread num 1 to be easy to construct the case + cluster.cfg.rocksdb.max_background_flushes = 1; + cluster.run(); + + let r = cluster.get_region(b"k10"); + cluster.must_split(&r, b"k10"); + + for i in 0..20 { + let k = format!("k{:02}", i); + cluster.must_put(k.as_bytes(), b"val"); + } + + let r1 = cluster.get_region(b"k00").get_id(); + let r2 = cluster.get_region(b"k15").get_id(); + + let engine = cluster.get_engine(1); + let tablet1 = engine.get_tablet_by_id(r1).unwrap(); + let tablet2 = engine.get_tablet_by_id(r2).unwrap(); + + fail::cfg("on_flush_begin", "1*pause").unwrap(); + tablet1.flush_cf("default", false).unwrap(); // call flush 1 + std::thread::sleep(Duration::from_secs(1)); + + tablet2.flush_cf("default", false).unwrap(); // call flush 2 + for i in 20..30 { + let k = format!("k{:02}", i); + cluster.must_put(k.as_bytes(), b"val"); + } + fail::cfg("on_memtable_sealed", "pause").unwrap(); + + let h = std::thread::spawn(move || { + tablet2.flush_cf("default", true).unwrap(); + }); + + let (tx, rx) = channel(); + let tx = Mutex::new(tx); + fail::cfg_callback("on_flush_completed", move || { + let _ = tx.lock().unwrap().send(true); // call flush 3 + }) + .unwrap(); + fail::remove("on_flush_begin"); + + let _ = rx.recv(); // flush 1 done + // Now, flush 1 has done, flush 3 is blocked at on_memtable_sealed. + // Before https://github.com/tikv/rocksdb/pull/347, unlock will be called + // before calling on_memtable_sealed, so flush 2 can pick the memtable sealed by + // flush 3 and thus make the order chaos. + // Now, unlock will not be called, so we have to remove failpoint to avoid + // deadlock. 2 seconds is long enough to make the test failed before + // rocksdb/pull/347. 
+ std::thread::sleep(Duration::from_secs(2)); + fail::remove("on_memtable_sealed"); + + h.join().unwrap(); +} diff --git a/tests/failpoints/cases/test_gc_worker.rs b/tests/failpoints/cases/test_gc_worker.rs index d24ec85f040..50b71b59f47 100644 --- a/tests/failpoints/cases/test_gc_worker.rs +++ b/tests/failpoints/cases/test_gc_worker.rs @@ -14,9 +14,10 @@ use raftstore::coprocessor::{ RegionInfo, RegionInfoCallback, RegionInfoProvider, Result as CopResult, SeekRegionCallback, }; use test_raftstore::*; +use test_raftstore_macro::test_case; use tikv::{ server::gc_worker::{ - AutoGcConfig, GcSafePointProvider, GcTask, Result as GcWorkerResult, TestGcRunner, + sync_gc, AutoGcConfig, GcSafePointProvider, GcTask, Result as GcWorkerResult, TestGcRunner, }, storage::{ kv::TestEngineBuilder, @@ -61,11 +62,38 @@ fn test_error_in_compaction_filter() { fail::remove(fp); } +#[derive(Clone)] +struct MockSafePointProvider; +impl GcSafePointProvider for MockSafePointProvider { + fn get_safe_point(&self) -> GcWorkerResult { + Ok(TimeStamp::from(0)) + } +} + +#[derive(Clone)] +struct MockRegionInfoProvider; +impl RegionInfoProvider for MockRegionInfoProvider { + fn seek_region(&self, _: &[u8], _: SeekRegionCallback) -> CopResult<()> { + Ok(()) + } + fn find_region_by_id( + &self, + _: u64, + _: RegionInfoCallback>, + ) -> CopResult<()> { + Ok(()) + } + fn get_regions_in_range(&self, _start_key: &[u8], _end_key: &[u8]) -> CopResult> { + Ok(vec![]) + } +} + // Test GC worker can receive and handle orphan versions emit from write CF's // compaction filter correctly. 
-#[test] +#[test_case(test_raftstore::must_new_and_configure_cluster)] +#[test_case(test_raftstore_v2::must_new_and_configure_cluster)] fn test_orphan_versions_from_compaction_filter() { - let (cluster, leader, ctx) = must_new_and_configure_cluster(|cluster| { + let (cluster, leader, ctx) = new_cluster(|cluster| { cluster.cfg.gc.enable_compaction_filter = true; cluster.cfg.gc.compaction_filter_skip_version_check = true; cluster.pd_client.disable_default_operator(); @@ -76,8 +104,20 @@ fn test_orphan_versions_from_compaction_filter() { let channel = ChannelBuilder::new(env).connect(&cluster.sim.rl().get_addr(leader_store)); let client = TikvClient::new(channel); - init_compaction_filter(&cluster, leader_store); - let engine = cluster.engines.get(&leader_store).unwrap(); + // Call `start_auto_gc` like `cmd/src/server.rs` does. It will combine + // compaction filter and GC worker so that GC worker can help to process orphan + // versions on default CF. + { + let sim = cluster.sim.rl(); + let gc_worker = sim.get_gc_worker(leader_store); + gc_worker + .start_auto_gc( + AutoGcConfig::new(MockSafePointProvider, MockRegionInfoProvider, 1), + Arc::new(AtomicU64::new(0)), + ) + .unwrap(); + } + let engine = cluster.get_engine(leader_store); let pk = b"k1".to_vec(); let large_value = vec![b'x'; 300]; @@ -91,22 +131,23 @@ fn test_orphan_versions_from_compaction_filter() { if start_ts < 40 { let key = Key::from_raw(b"k1").append_ts(start_ts.into()); let key = data_key(key.as_encoded()); - assert!(engine.kv.get_value(&key).unwrap().is_some()); + assert!(engine.get_value(&key).unwrap().is_some()); } } let fp = "write_compaction_filter_flush_write_batch"; fail::cfg(fp, "return").unwrap(); - let mut gc_runner = TestGcRunner::new(100); - gc_runner.gc_scheduler = cluster.sim.rl().get_gc_worker(1).scheduler(); - gc_runner.gc(&engine.kv); + let gc_safe_ponit = TimeStamp::from(100); + let gc_scheduler = cluster.sim.rl().get_gc_worker(1).scheduler(); + let region = 
cluster.get_region(&pk); + sync_gc(&gc_scheduler, region, gc_safe_ponit).unwrap(); 'IterKeys: for &start_ts in &[10, 20, 30] { let key = Key::from_raw(b"k1").append_ts(start_ts.into()); let key = data_key(key.as_encoded()); for _ in 0..100 { - if engine.kv.get_value(&key).unwrap().is_some() { + if engine.get_value(&key).unwrap().is_some() { thread::sleep(Duration::from_millis(20)); continue; } @@ -117,47 +158,3 @@ fn test_orphan_versions_from_compaction_filter() { fail::remove(fp); } - -// Call `start_auto_gc` like `cmd/src/server.rs` does. It will combine -// compaction filter and GC worker so that GC worker can help to process orphan -// versions on default CF. -fn init_compaction_filter(cluster: &Cluster, store_id: u64) { - #[derive(Clone)] - struct MockSafePointProvider; - impl GcSafePointProvider for MockSafePointProvider { - fn get_safe_point(&self) -> GcWorkerResult { - Ok(TimeStamp::from(0)) - } - } - - #[derive(Clone)] - struct MockRegionInfoProvider; - impl RegionInfoProvider for MockRegionInfoProvider { - fn seek_region(&self, _: &[u8], _: SeekRegionCallback) -> CopResult<()> { - Ok(()) - } - fn find_region_by_id( - &self, - _: u64, - _: RegionInfoCallback>, - ) -> CopResult<()> { - Ok(()) - } - fn get_regions_in_range( - &self, - _start_key: &[u8], - _end_key: &[u8], - ) -> CopResult> { - Ok(vec![]) - } - } - - let sim = cluster.sim.rl(); - let gc_worker = sim.get_gc_worker(store_id); - gc_worker - .start_auto_gc( - AutoGcConfig::new(MockSafePointProvider, MockRegionInfoProvider, 1), - Arc::new(AtomicU64::new(0)), - ) - .unwrap(); -} diff --git a/tests/failpoints/cases/test_hibernate.rs b/tests/failpoints/cases/test_hibernate.rs index 616a4e5e196..d2eb9aa10dd 100644 --- a/tests/failpoints/cases/test_hibernate.rs +++ b/tests/failpoints/cases/test_hibernate.rs @@ -101,10 +101,10 @@ fn test_store_disconnect_with_hibernate() { cluster.cfg.raft_store.raft_election_timeout_ticks = 10; cluster.cfg.raft_store.unreachable_backoff = ReadableDuration::millis(500); 
cluster.cfg.server.raft_client_max_backoff = ReadableDuration::millis(200); - // So the random election timeout will always be 10, which makes the case more - // stable. + // Use a small range but still random election timeouts, which makes the case + // more stable. cluster.cfg.raft_store.raft_min_election_timeout_ticks = 10; - cluster.cfg.raft_store.raft_max_election_timeout_ticks = 11; + cluster.cfg.raft_store.raft_max_election_timeout_ticks = 13; configure_for_hibernate(&mut cluster.cfg); cluster.pd_client.disable_default_operator(); let r = cluster.run_conf_change(); @@ -116,7 +116,7 @@ fn test_store_disconnect_with_hibernate() { must_get_equal(&cluster.get_engine(3), b"k1", b"v1"); // Wait until all peers of region 1 hibernate. - thread::sleep(Duration::from_millis(base_tick_ms * 30)); + thread::sleep(Duration::from_millis(base_tick_ms * 40)); // Stop the region leader. fail::cfg("receive_raft_message_from_outside", "pause").unwrap(); @@ -128,7 +128,7 @@ fn test_store_disconnect_with_hibernate() { fail::remove("receive_raft_message_from_outside"); // Wait for a while. Peers of region 1 shouldn't hibernate. 
- thread::sleep(Duration::from_millis(base_tick_ms * 30)); + thread::sleep(Duration::from_millis(base_tick_ms * 40)); must_get_equal(&cluster.get_engine(2), b"k2", b"v2"); must_get_equal(&cluster.get_engine(3), b"k2", b"v2"); } diff --git a/tests/failpoints/cases/test_import_service.rs b/tests/failpoints/cases/test_import_service.rs index a2487456108..010d12177b6 100644 --- a/tests/failpoints/cases/test_import_service.rs +++ b/tests/failpoints/cases/test_import_service.rs @@ -7,10 +7,10 @@ use std::{ use file_system::calc_crc32; use futures::{executor::block_on, stream, SinkExt}; -use grpcio::{Result, WriteFlags}; -use kvproto::import_sstpb::*; +use grpcio::{ChannelBuilder, Environment, Result, WriteFlags}; +use kvproto::{import_sstpb::*, tikvpb_grpc::TikvClient}; use tempfile::{Builder, TempDir}; -use test_raftstore::Simulator; +use test_raftstore::{must_raw_put, Simulator}; use test_sst_importer::*; use tikv::config::TikvConfig; use tikv_util::{config::ReadableSize, HandyRwLock}; @@ -46,7 +46,7 @@ fn test_download_sst_blocking_sst_writer() { // Now perform a proper download. let mut download = DownloadRequest::default(); download.set_sst(meta.clone()); - download.set_storage_backend(external_storage_export::make_local_backend(temp_dir.path())); + download.set_storage_backend(external_storage::make_local_backend(temp_dir.path())); download.set_name("test.sst".to_owned()); download.mut_sst().mut_range().set_start(vec![sst_range.1]); download @@ -455,3 +455,73 @@ fn sst_file_count(paths: &Vec) -> u64 { } count } + +#[test] +fn test_flushed_applied_index_after_ingset() { + // disable data flushed + fail::cfg("on_flush_completed", "return()").unwrap(); + // disable data flushed + let (mut cluster, ctx, _tikv, import) = open_cluster_and_tikv_import_client_v2(None); + let temp_dir = Builder::new().prefix("test_ingest_sst").tempdir().unwrap(); + let sst_path = temp_dir.path().join("test.sst"); + + // Create clients. 
+ let env = Arc::new(Environment::new(1)); + let channel = ChannelBuilder::new(Arc::clone(&env)).connect(&cluster.sim.rl().get_addr(1)); + let client = TikvClient::new(channel); + + for i in 0..5 { + let sst_range = (i * 20, (i + 1) * 20); + let (mut meta, data) = gen_sst_file(sst_path.clone(), sst_range); + // No region id and epoch. + send_upload_sst(&import, &meta, &data).unwrap(); + let mut ingest = IngestRequest::default(); + ingest.set_context(ctx.clone()); + ingest.set_sst(meta.clone()); + meta.set_region_id(ctx.get_region_id()); + meta.set_region_epoch(ctx.get_region_epoch().clone()); + send_upload_sst(&import, &meta, &data).unwrap(); + ingest.set_sst(meta.clone()); + let resp = import.ingest(&ingest).unwrap(); + assert!(!resp.has_error(), "{:?}", resp.get_error()); + } + + // only 1 sst left because there is no more event to trigger a raft ready flush. + let count = sst_file_count(&cluster.paths); + assert_eq!(1, count); + + for i in 5..8 { + let sst_range = (i * 20, (i + 1) * 20); + let (mut meta, data) = gen_sst_file(sst_path.clone(), sst_range); + // No region id and epoch. + send_upload_sst(&import, &meta, &data).unwrap(); + let mut ingest = IngestRequest::default(); + ingest.set_context(ctx.clone()); + ingest.set_sst(meta.clone()); + meta.set_region_id(ctx.get_region_id()); + meta.set_region_epoch(ctx.get_region_epoch().clone()); + send_upload_sst(&import, &meta, &data).unwrap(); + ingest.set_sst(meta.clone()); + let resp = import.ingest(&ingest).unwrap(); + assert!(!resp.has_error(), "{:?}", resp.get_error()); + } + + // ingest more sst files, unflushed index still be 1. + let count = sst_file_count(&cluster.paths); + assert_eq!(1, count); + + // file a write to trigger ready flush, even if the write is not flushed. 
+ must_raw_put(&client, ctx, b"key1".to_vec(), b"value1".to_vec()); + let count = sst_file_count(&cluster.paths); + assert_eq!(0, count); + + // restart node, should not tirgger any ingest + fail::cfg("on_apply_ingest", "panic").unwrap(); + cluster.stop_node(1); + cluster.start().unwrap(); + let count = sst_file_count(&cluster.paths); + assert_eq!(0, count); + + fail::remove("on_apply_ingest"); + fail::remove("on_flush_completed"); +} diff --git a/tests/failpoints/cases/test_kv_service.rs b/tests/failpoints/cases/test_kv_service.rs index f3831bb984b..c8777282787 100644 --- a/tests/failpoints/cases/test_kv_service.rs +++ b/tests/failpoints/cases/test_kv_service.rs @@ -3,15 +3,22 @@ use std::{sync::Arc, time::Duration}; use grpcio::{ChannelBuilder, Environment}; -use kvproto::{kvrpcpb::*, tikvpb::TikvClient}; +use kvproto::{ + kvrpcpb::{PrewriteRequestPessimisticAction::SkipPessimisticCheck, *}, + tikvpb::TikvClient, +}; use test_raftstore::{ - must_kv_prewrite, must_new_cluster_and_kv_client, must_new_cluster_mul, + configure_for_lease_read, must_kv_commit, must_kv_have_locks, must_kv_prewrite, + must_kv_prewrite_with, must_new_cluster_mul, new_server_cluster, try_kv_prewrite_with, try_kv_prewrite_with_impl, }; +use test_raftstore_macro::test_case; +use tikv_util::{config::ReadableDuration, HandyRwLock}; -#[test] +#[test_case(test_raftstore::must_new_cluster_and_kv_client)] +#[test_case(test_raftstore_v2::must_new_cluster_and_kv_client)] fn test_batch_get_memory_lock() { - let (_cluster, client, ctx) = must_new_cluster_and_kv_client(); + let (_cluster, client, ctx) = new_cluster(); let mut req = BatchGetRequest::default(); req.set_context(ctx); @@ -27,9 +34,10 @@ fn test_batch_get_memory_lock() { fail::remove("raftkv_async_snapshot_err"); } -#[test] +#[test_case(test_raftstore::must_new_cluster_and_kv_client)] +#[test_case(test_raftstore_v2::must_new_cluster_and_kv_client)] fn test_kv_scan_memory_lock() { - let (_cluster, client, ctx) = 
must_new_cluster_and_kv_client(); + let (_cluster, client, ctx) = new_cluster(); let mut req = ScanRequest::default(); req.set_context(ctx); @@ -45,9 +53,10 @@ fn test_kv_scan_memory_lock() { fail::remove("raftkv_async_snapshot_err"); } -#[test] +#[test_case(test_raftstore::must_new_cluster_mul)] +#[test_case(test_raftstore_v2::must_new_cluster_mul)] fn test_snapshot_not_block_grpc() { - let (cluster, leader, ctx) = must_new_cluster_mul(1); + let (cluster, leader, ctx) = new_cluster(1); let env = Arc::new(Environment::new(1)); let channel = ChannelBuilder::new(env) .keepalive_time(Duration::from_millis(500)) @@ -72,6 +81,8 @@ fn test_snapshot_not_block_grpc() { fail::remove("after-snapshot"); } +// the result notify mechanism is different in raft-v2, so no need to add a +// equivalent case for v2. #[test] fn test_undetermined_write_err() { let (cluster, leader, ctx) = must_new_cluster_mul(1); @@ -91,6 +102,7 @@ fn test_undetermined_write_err() { &client, ctx, vec![mutation], + vec![], b"k".to_vec(), 10, 0, @@ -103,3 +115,158 @@ fn test_undetermined_write_err() { // The previous panic hasn't been captured. assert!(std::panic::catch_unwind(std::panic::AssertUnwindSafe(|| drop(cluster))).is_err()); } + +#[test] +fn test_stale_read_on_local_leader() { + let mut cluster = new_server_cluster(0, 1); + // Increase the election tick to make this test case running reliably. 
+ configure_for_lease_read(&mut cluster.cfg, Some(50), Some(10_000)); + let max_lease = Duration::from_secs(2); + cluster.cfg.raft_store.raft_store_max_leader_lease = ReadableDuration(max_lease); + cluster.pd_client.disable_default_operator(); + cluster.run(); + + let region_id = 1; + let leader = cluster.leader_of_region(region_id).unwrap(); + let epoch = cluster.get_region_epoch(region_id); + let mut ctx = Context::default(); + ctx.set_region_id(region_id); + ctx.set_peer(leader.clone()); + ctx.set_region_epoch(epoch); + let env = Arc::new(Environment::new(1)); + let channel = + ChannelBuilder::new(env).connect(&cluster.sim.rl().get_addr(leader.get_store_id())); + let client = TikvClient::new(channel); + + let (k, v) = (b"key".to_vec(), b"value".to_vec()); + let v1 = b"value1".to_vec(); + + // Write record. + let mut mutation = Mutation::default(); + mutation.set_op(Op::Put); + mutation.set_key(k.clone()); + mutation.set_value(v.clone()); + must_kv_prewrite(&client, ctx.clone(), vec![mutation], k.clone(), 10); + must_kv_commit(&client, ctx.clone(), vec![k.clone()], 10, 30, 30); + + // Prewrite and leave a lock. + let mut mutation = Mutation::default(); + mutation.set_op(Op::Put); + mutation.set_key(k.clone()); + mutation.set_value(v1); + must_kv_prewrite(&client, ctx.clone(), vec![mutation], k.clone(), 50); + + let mut req = GetRequest::default(); + req.set_context(ctx); + req.set_key(k); + req.version = 40; + req.mut_context().set_stale_read(true); + + // The stale read should fallback and succeed on the leader peer. 
+ let resp = client.kv_get(&req).unwrap(); + assert!(resp.error.is_none()); + assert!(resp.region_error.is_none()); + assert_eq!(v, resp.get_value()); +} + +#[test] +fn test_storage_do_not_update_txn_status_cache_on_write_error() { + let cache_hit_fp = "before_prewrite_txn_status_cache_hit"; + let cache_miss_fp = "before_prewrite_txn_status_cache_miss"; + + let (cluster, leader, ctx) = must_new_cluster_mul(1); + let env = Arc::new(Environment::new(1)); + let channel = ChannelBuilder::new(env) + .connect(&cluster.sim.read().unwrap().get_addr(leader.get_store_id())); + let client = TikvClient::new(channel); + + let pk = b"pk".to_vec(); + + // Case 1: Test write successfully. + + let mut mutation = Mutation::default(); + mutation.set_op(Op::Put); + mutation.set_key(b"k1".to_vec()); + mutation.set_value(b"v1".to_vec()); + must_kv_prewrite_with( + &client, + ctx.clone(), + vec![mutation.clone()], + vec![SkipPessimisticCheck], + pk.clone(), + 10, + 10, + true, + false, + ); + must_kv_commit(&client, ctx.clone(), vec![b"k1".to_vec()], 10, 15, 15); + + // Expect cache hit + fail::cfg(cache_miss_fp, "panic").unwrap(); + must_kv_prewrite_with( + &client, + ctx.clone(), + vec![mutation], + vec![SkipPessimisticCheck], + pk.clone(), + 10, + 10, + true, + false, + ); + // Key not locked. + must_kv_have_locks(&client, ctx.clone(), 19, b"k1", b"k2", &[]); + fail::remove(cache_miss_fp); + + // Case 2: Write failed. 
+ + let mut mutation = Mutation::default(); + mutation.set_op(Op::Put); + mutation.set_key(b"k2".to_vec()); + mutation.set_value(b"v2".to_vec()); + + try_kv_prewrite_with( + &client, + ctx.clone(), + vec![mutation.clone()], + vec![SkipPessimisticCheck], + pk.clone(), + 20, + 20, + true, + false, + ); + fail::cfg("raftkv_early_error_report", "return").unwrap(); + let mut commit_req = CommitRequest::default(); + commit_req.set_context(ctx.clone()); + commit_req.set_start_version(20); + commit_req.set_commit_version(25); + commit_req.set_keys(vec![b"k2".to_vec()].into()); + let commit_resp = client.kv_commit(&commit_req).unwrap(); + assert!(commit_resp.has_region_error()); + fail::remove("raftkv_early_error_report"); + must_kv_have_locks( + &client, + ctx.clone(), + 29, + b"k2", + b"k3", + &[(b"k2", Op::Put, 20, 20)], + ); + + // Expect cache miss + fail::cfg(cache_hit_fp, "panic").unwrap(); + try_kv_prewrite_with( + &client, + ctx.clone(), + vec![mutation], + vec![SkipPessimisticCheck], + pk, + 20, + 20, + true, + false, + ); + must_kv_have_locks(&client, ctx, 29, b"k2", b"k3", &[(b"k2", Op::Put, 20, 20)]); + fail::remove(cache_hit_fp); +} diff --git a/tests/failpoints/cases/test_life.rs b/tests/failpoints/cases/test_life.rs new file mode 100644 index 00000000000..2bc833075c6 --- /dev/null +++ b/tests/failpoints/cases/test_life.rs @@ -0,0 +1,36 @@ +// Copyright 2023 TiKV Project Authors. Licensed under Apache-2.0. 
+ +use std::sync::Arc; + +use test_raftstore::*; +use test_raftstore_macro::test_case; +use tikv_util::config::ReadableDuration; + +#[test_case(test_raftstore_v2::new_server_cluster)] +fn test_gc_peer_on_tombstone_store() { + let mut cluster = new_cluster(0, 3); + configure_for_merge(&mut cluster.cfg); + cluster.cfg.raft_store.gc_peer_check_interval = ReadableDuration::millis(500); + let pd_client = Arc::clone(&cluster.pd_client); + pd_client.disable_default_operator(); + cluster.run(); + cluster.must_put(b"k1", b"v1"); + + let region = cluster.get_region(b"k1"); + + let peer_on_store1 = find_peer(®ion, 1).unwrap().clone(); + let peer_on_store3 = find_peer(®ion, 3).unwrap().clone(); + cluster.must_transfer_leader(region.get_id(), peer_on_store1); + cluster.add_send_filter(IsolationFilterFactory::new(3)); + pd_client.must_remove_peer(region.get_id(), peer_on_store3); + + // Immediately invalidate store address cache. + fail::cfg("mock_store_refresh_interval_secs", "return(0)").unwrap(); + + // Shutdown store 3 and wait for gc peer ticks. 
+ cluster.stop_node(3); + cluster.clear_send_filters(); + sleep_ms(3 * cluster.cfg.raft_store.gc_peer_check_interval.as_millis()); + + cluster.must_empty_region_removed_records(region.get_id()); +} diff --git a/tests/failpoints/cases/test_merge.rs b/tests/failpoints/cases/test_merge.rs index 3cc72d44da1..cc7311bfe75 100644 --- a/tests/failpoints/cases/test_merge.rs +++ b/tests/failpoints/cases/test_merge.rs @@ -3,12 +3,14 @@ use std::{ sync::{ atomic::{AtomicBool, Ordering}, + mpsc::{channel, sync_channel, Sender}, *, }, thread, time::Duration, }; +use engine_rocks::RocksEngine; use engine_traits::{Peekable, CF_RAFT}; use grpcio::{ChannelBuilder, Environment}; use kvproto::{ @@ -19,15 +21,18 @@ use kvproto::{ use pd_client::PdClient; use raft::eraftpb::MessageType; use raftstore::store::*; +use raftstore_v2::router::{PeerMsg, PeerTick}; use test_raftstore::*; +use test_raftstore_macro::test_case; use tikv::storage::{kv::SnapshotExt, Snapshot}; -use tikv_util::{config::*, time::Instant, HandyRwLock}; +use tikv_util::{config::*, future::block_on_timeout, time::Instant, HandyRwLock}; use txn_types::{Key, LastChange, PessimisticLock}; /// Test if merge is rollback as expected. -#[test] +#[test_case(test_raftstore::new_node_cluster)] +#[test_case(test_raftstore_v2::new_node_cluster)] fn test_node_merge_rollback() { - let mut cluster = new_node_cluster(0, 3); + let mut cluster = new_cluster(0, 3); configure_for_merge(&mut cluster.cfg); let pd_client = Arc::clone(&cluster.pd_client); pd_client.disable_default_operator(); @@ -51,8 +56,16 @@ fn test_node_merge_rollback() { let schedule_merge_fp = "on_schedule_merge"; fail::cfg(schedule_merge_fp, "return()").unwrap(); - // The call is finished when prepare_merge is applied. 
- cluster.must_try_merge(region.get_id(), target_region.get_id()); + let (tx, rx) = channel(); + let tx = Mutex::new(tx); + fail::cfg_callback("on_apply_res_prepare_merge", move || { + tx.lock().unwrap().send(()).unwrap(); + }) + .unwrap(); + + cluster.merge_region(region.get_id(), target_region.get_id(), Callback::None); + // PrepareMerge is applied. + rx.recv().unwrap(); // Add a peer to trigger rollback. pd_client.must_add_peer(right.get_id(), new_peer(3, 5)); @@ -72,12 +85,7 @@ fn test_node_merge_rollback() { region.mut_region_epoch().set_version(4); for i in 1..3 { must_get_equal(&cluster.get_engine(i), b"k11", b"v11"); - let state_key = keys::region_state_key(region.get_id()); - let state: RegionLocalState = cluster - .get_engine(i) - .get_msg_cf(CF_RAFT, &state_key) - .unwrap() - .unwrap(); + let state = cluster.region_local_state(region.get_id(), i); assert_eq!(state.get_state(), PeerState::Normal); assert_eq!(*state.get_region(), region); } @@ -86,7 +94,10 @@ fn test_node_merge_rollback() { fail::cfg(schedule_merge_fp, "return()").unwrap(); let target_region = pd_client.get_region(b"k3").unwrap(); - cluster.must_try_merge(region.get_id(), target_region.get_id()); + cluster.merge_region(region.get_id(), target_region.get_id(), Callback::None); + // PrepareMerge is applied. + rx.recv().unwrap(); + let mut region = pd_client.get_region(b"k1").unwrap(); // Split to trigger rollback. 
@@ -101,12 +112,7 @@ fn test_node_merge_rollback() { region.mut_region_epoch().set_version(6); for i in 1..3 { must_get_equal(&cluster.get_engine(i), b"k12", b"v12"); - let state_key = keys::region_state_key(region.get_id()); - let state: RegionLocalState = cluster - .get_engine(i) - .get_msg_cf(CF_RAFT, &state_key) - .unwrap() - .unwrap(); + let state = cluster.region_local_state(region.get_id(), i); assert_eq!(state.get_state(), PeerState::Normal); assert_eq!(*state.get_region(), region); } @@ -1227,7 +1233,7 @@ fn test_prewrite_before_max_ts_is_synced() { let channel = ChannelBuilder::new(env).connect(&addr); let client = TikvClient::new(channel); - let do_prewrite = |cluster: &mut Cluster| { + let do_prewrite = |cluster: &mut Cluster>| { let region_id = right.get_id(); let leader = cluster.leader_of_region(region_id).unwrap(); let epoch = cluster.get_region_epoch(region_id); @@ -1532,7 +1538,7 @@ fn test_retry_pending_prepare_merge_fail() { let mut rx = cluster.async_put(b"k1", b"v11").unwrap(); propose_rx.recv_timeout(Duration::from_secs(2)).unwrap(); - rx.recv_timeout(Duration::from_millis(200)).unwrap_err(); + block_on_timeout(rx.as_mut(), Duration::from_millis(200)).unwrap_err(); // Then, start merging. PrepareMerge should become pending because applied_index // is smaller than proposed_index. @@ -1546,7 +1552,7 @@ fn test_retry_pending_prepare_merge_fail() { fail::cfg("disk_already_full_peer_1", "return").unwrap(); fail::cfg("disk_already_full_peer_2", "return").unwrap(); fail::remove("on_handle_apply"); - let res = rx.recv_timeout(Duration::from_secs(1)).unwrap(); + let res = block_on_timeout(rx, Duration::from_secs(1)).unwrap(); assert!(!res.get_header().has_error(), "{:?}", res); propose_rx.recv_timeout(Duration::from_secs(2)).unwrap(); @@ -1706,3 +1712,465 @@ fn test_destroy_source_peer_while_merging() { must_get_equal(&cluster.get_engine(i), b"k5", b"v5"); } } + +struct MsgTimeoutFilter { + // wrap with mutex to make tx Sync. 
+ tx: Mutex>, +} + +impl Filter for MsgTimeoutFilter { + fn before(&self, msgs: &mut Vec) -> raftstore::Result<()> { + let mut res = Vec::with_capacity(msgs.len()); + for m in msgs.drain(..) { + if m.get_message().msg_type == MessageType::MsgTimeoutNow { + self.tx.lock().unwrap().send(m).unwrap(); + } else { + res.push(m); + } + } + + *msgs = res; + check_messages(msgs) + } +} + +// Concurrent execution between transfer leader and merge can cause rollback and +// commit merge at the same time before this fix which corrupt the region. +// It can happen as this: +// Assume at the begin, leader of source and target are both on node-1 +// 1. node-1 transfer leader to node-2: execute up to sending MsgTimeoutNow +// (leader_transferre has been set), but before becoming follower. +// 2. node-1 source region propose, and apply PrepareMerge +// 3. node-1 target region propose CommitMerge but fail (due to +// leader_transferre being set) +// 4. node-1 source region successfully proposed rollback merge +// 5. node-2 target region became leader and apply the first no-op entry +// 6. 
node-2 target region successfully proposed commit merge +// Now, rollback at source region and commit at target region are both proposed +// and will be executed which will cause region corrupt +#[test] +fn test_concurrent_between_transfer_leader_and_merge() { + use test_raftstore_v2::*; + let mut cluster = new_node_cluster(0, 3); + configure_for_merge(&mut cluster.cfg); + cluster.run(); + + cluster.must_put(b"k1", b"v1"); + cluster.must_put(b"k3", b"v3"); + for i in 0..3 { + must_get_equal(&cluster.get_engine(i + 1), b"k1", b"v1"); + must_get_equal(&cluster.get_engine(i + 1), b"k3", b"v3"); + } + + let pd_client = Arc::clone(&cluster.pd_client); + let region = pd_client.get_region(b"k1").unwrap(); + cluster.must_split(®ion, b"k2"); + + let right = pd_client.get_region(b"k1").unwrap(); + let left = pd_client.get_region(b"k3").unwrap(); + cluster.must_transfer_leader( + left.get_id(), + left.get_peers() + .iter() + .find(|p| p.store_id == 1) + .cloned() + .unwrap(), + ); + + cluster.must_transfer_leader( + right.get_id(), + right + .get_peers() + .iter() + .find(|p| p.store_id == 1) + .cloned() + .unwrap(), + ); + + // Source region: 1, Target Region: 1000 + // Let target region in leader_transfering status by interceptting MsgTimeoutNow + // msg by using Filter. So we make node-1-1000 be in leader_transferring status + // for some time. + let (tx, rx_msg) = channel(); + let filter = MsgTimeoutFilter { tx: Mutex::new(tx) }; + cluster.add_send_filter_on_node(1, Box::new(filter)); + + pd_client.transfer_leader( + right.get_id(), + right + .get_peers() + .iter() + .find(|p| p.store_id == 2) + .cloned() + .unwrap(), + vec![], + ); + + let msg = rx_msg.recv().unwrap(); + + // Now, node-1-1000 is in leader_transferring status. After it reject proposing + // commit merge, make node-1-1 block before proposing rollback merge until + // node-2-1000 propose commit merge. 
+ + fail::cfg("on_reject_commit_merge_1", "pause").unwrap(); + + let router = cluster.get_router(2).unwrap(); + let (tx, rx) = channel(); + let tx = Mutex::new(tx); + let _ = fail::cfg_callback("propose_commit_merge_1", move || { + tx.lock().unwrap().send(()).unwrap(); + }); + + let (tx2, rx2) = channel(); + let tx2 = Mutex::new(tx2); + let _ = fail::cfg_callback("on_propose_commit_merge_success", move || { + tx2.lock().unwrap().send(()).unwrap(); + }); + + cluster.merge_region(left.get_id(), right.get_id(), Callback::None); + + // Actually, store 1 should not reach the line of propose_commit_merge_1 + let _ = rx.recv_timeout(Duration::from_secs(2)); + router + .force_send( + msg.get_region_id(), + PeerMsg::RaftMessage(Box::new(msg), None), + ) + .unwrap(); + + // Wait region 1 of node 2 to become leader + rx2.recv().unwrap(); + fail::remove("on_reject_commit_merge_1"); + + wait_region_epoch_change(&cluster, &right, Duration::from_secs(5)); + + let region = pd_client.get_region(b"k1").unwrap(); + assert_eq!(region.get_id(), right.get_id()); + assert_eq!(region.get_start_key(), right.get_start_key()); + assert_eq!(region.get_end_key(), left.get_end_key()); + + cluster.must_put(b"k4", b"v4"); +} + +#[test] +fn test_deterministic_commit_rollback_merge() { + use test_raftstore_v2::*; + let mut cluster = new_node_cluster(0, 3); + configure_for_merge(&mut cluster.cfg); + // Use a large election tick to stable test. + configure_for_lease_read(&mut cluster.cfg, None, Some(1000)); + // Use 2 threads for polling peers, so that they can run concurrently. 
+ cluster.cfg.raft_store.store_batch_system.pool_size = 2; + cluster.cfg.raft_store.store_batch_system.max_batch_size = Some(1); + cluster.run(); + + let pd_client = Arc::clone(&cluster.pd_client); + let region = pd_client.get_region(b"k1").unwrap(); + cluster.must_split(®ion, b"k2"); + + let left = pd_client.get_region(b"k1").unwrap(); + let right = pd_client.get_region(b"k3").unwrap(); + let right_1 = find_peer(&right, 1).unwrap().clone(); + cluster.must_transfer_leader(right.get_id(), right_1); + let left_2 = find_peer(&left, 2).unwrap().clone(); + cluster.must_transfer_leader(left.get_id(), left_2); + + cluster.must_put(b"k1", b"v1"); + cluster.must_put(b"k3", b"v3"); + for i in 0..3 { + must_get_equal(&cluster.get_engine(i + 1), b"k1", b"v1"); + must_get_equal(&cluster.get_engine(i + 1), b"k3", b"v3"); + } + + // Delay 1003 apply by dropping append response, so that proposal will fail + // due to applied_term != current_term. + let target_region_id = left.get_id(); + cluster.add_recv_filter_on_node( + 1, + Box::new(DropMessageFilter::new(Arc::new(move |m| { + if m.get_region_id() == target_region_id { + return m.get_message().get_msg_type() != MessageType::MsgAppendResponse; + } + true + }))), + ); + + let left_1 = find_peer(&left, 1).unwrap().clone(); + cluster.must_transfer_leader(left.get_id(), left_1); + + // left(1000) <- right(1). + let (tx1, rx1) = channel(); + let (tx2, rx2) = channel(); + let tx1 = Mutex::new(tx1); + let rx2 = Mutex::new(rx2); + fail::cfg_callback("on_propose_commit_merge_fail_store_1", move || { + tx1.lock().unwrap().send(()).unwrap(); + rx2.lock().unwrap().recv().unwrap(); + }) + .unwrap(); + cluster.merge_region(right.get_id(), left.get_id(), Callback::None); + + // Wait for target fails to propose commit merge. + rx1.recv_timeout(Duration::from_secs(5)).unwrap(); + // Let target apply continue, and new AskCommitMerge messages will propose + // commit merge successfully. 
+ cluster.clear_recv_filter_on_node(1); + + // Trigger a CheckMerge tick, so source will send a AskCommitMerge again. + fail::cfg("ask_target_peer_to_commit_merge_store_1", "pause").unwrap(); + let router = cluster.get_router(1).unwrap(); + router + .check_send(1, PeerMsg::Tick(PeerTick::CheckMerge)) + .unwrap(); + + // Send RejectCommitMerge to source. + tx2.send(()).unwrap(); + fail::remove("on_propose_commit_merge_fail_store_1"); + + // Wait for target applies to current term. + cluster.must_put(b"k1", b"v11"); + + // By remove the failpoint, CheckMerge tick sends a AskCommitMerge again. + fail::remove("ask_target_peer_to_commit_merge_store_1"); + // At this point, source region will propose rollback merge if commit merge + // is not deterministic. + + // Wait for source handle commit or rollback merge. + wait_region_epoch_change(&cluster, &left, Duration::from_secs(5)); + + // No matter commit merge or rollback merge, cluster must be available to + // process requests + cluster.must_put(b"k0", b"v0"); + cluster.must_put(b"k4", b"v4"); +} + +struct MsgVoteFilter {} + +impl Filter for MsgVoteFilter { + fn before(&self, msgs: &mut Vec) -> raftstore::Result<()> { + msgs.retain(|m| { + let msg_type = m.get_message().msg_type; + msg_type != MessageType::MsgRequestPreVote && msg_type != MessageType::MsgRequestVote + }); + check_messages(msgs) + } +} + +// Before the fix of this PR (#15649), after prepare merge, raft cmd can still +// be proposed if restart is involved. If the proposed raft cmd is CompactLog, +// panic can occur during fetch entries: see issue https://github.com/tikv/tikv/issues/15633. +// Consider the case: +// 1. node-1 apply PrepareMerge (assume log index 30), so it's in is_merging +// status which reject all proposals except for Rollback Merge +// 2. node-1 advance persisted_apply to 30 +// 3. node-1 restart and became leader. Now, it's not in is_merging status, so +// proposals can be proposed +// 4. 
node-1 propose CompactLog, replicate it to other nodes, and commit +// 5. node-0 apply PrepareMerge +// 6. node-0 apply CompactLog +// 6. node-0 fetches raft log entries which is required by +// AdminCmdType::CommitMerge and panic (due to compacted) +#[test] +fn test_restart_may_lose_merging_state() { + use test_raftstore_v2::*; + let mut cluster = new_node_cluster(0, 2); + configure_for_merge(&mut cluster.cfg); + cluster.cfg.raft_store.raft_log_gc_count_limit = Some(12); + cluster.cfg.raft_store.raft_log_gc_tick_interval = ReadableDuration::millis(10); + cluster.cfg.raft_store.raft_base_tick_interval = ReadableDuration::millis(10); + cluster.cfg.raft_store.merge_check_tick_interval = ReadableDuration::millis(10); + + cluster.run(); + fail::cfg("maybe_propose_compact_log", "return").unwrap(); + fail::cfg("on_ask_commit_merge", "return").unwrap(); + fail::cfg("flush_before_close_threshold", "return(0)").unwrap(); + + let (tx, rx) = channel(); + let tx = Mutex::new(tx); + fail::cfg_callback("on_apply_res_prepare_merge", move || { + tx.lock().unwrap().send(()).unwrap(); + }) + .unwrap(); + + let region = cluster.get_region(b""); + cluster.must_split(®ion, b"k20"); + + let source = cluster.get_region(b"k05"); + let target = cluster.get_region(b"k25"); + + cluster.add_send_filter_on_node(2, Box::new(MsgVoteFilter {})); + + cluster.must_transfer_leader( + source.id, + source + .get_peers() + .iter() + .find(|p| p.store_id == 1) + .cloned() + .unwrap(), + ); + cluster.must_transfer_leader( + target.id, + target + .get_peers() + .iter() + .find(|p| p.store_id == 1) + .cloned() + .unwrap(), + ); + + for i in 0..20 { + let k = format!("k{:02}", i); + cluster.must_put(k.as_bytes(), b"val"); + } + + cluster.merge_region(source.id, target.id, Callback::None); + + rx.recv().unwrap(); + let router = cluster.get_router(1).unwrap(); + let (tx, rx) = sync_channel(1); + let msg = PeerMsg::FlushBeforeClose { tx }; + router.force_send(source.id, msg).unwrap(); + rx.recv().unwrap(); + + 
let (tx, rx) = channel(); + let tx = Mutex::new(tx); + fail::cfg_callback("on_apply_res_commit_merge_2", move || { + tx.lock().unwrap().send(()).unwrap(); + }) + .unwrap(); + + cluster.stop_node(1); + // Need to avoid propose commit merge, before node 1 becomes leader. Otherwise, + // the commit merge will be rejected. + let (tx2, rx2) = channel(); + let tx2 = Mutex::new(tx2); + fail::cfg_callback("on_applied_current_term", move || { + tx2.lock().unwrap().send(()).unwrap(); + }) + .unwrap(); + + fail::remove("maybe_propose_compact_log"); + cluster.run_node(1).unwrap(); + + // we have two regions. + rx2.recv().unwrap(); + rx2.recv().unwrap(); + fail::remove("on_ask_commit_merge"); + // wait node 2 to apply commit merge + rx.recv_timeout(Duration::from_secs(10)).unwrap(); + + wait_region_epoch_change(&cluster, &target, Duration::from_secs(5)); + + let region = cluster.get_region(b"k1"); + assert_eq!(region.get_id(), target.get_id()); + assert_eq!(region.get_start_key(), source.get_start_key()); + assert_eq!(region.get_end_key(), target.get_end_key()); + + cluster.must_put(b"k400", b"v400"); +} + +// If a node is isolated during merge, and the target peer is replaced by a peer +// with a larger ID, then the snapshot of the target peer covers the source +// regions as well. +// In such cases, the snapshot becomes an "atomic_snapshot" which needs to +// destroy the source peer too. +// This test case checks the race between destroying the source peer by atomic +// snapshot and the gc message. The source peer must be successfully destroyed +// in this case. 
+#[test_case(test_raftstore::new_node_cluster)] +fn test_destroy_race_during_atomic_snapshot_after_merge() { + let mut cluster = new_cluster(0, 3); + configure_for_merge(&mut cluster.cfg); + cluster.run(); + let pd_client = Arc::clone(&cluster.pd_client); + pd_client.disable_default_operator(); + + cluster.must_transfer_leader(1, new_peer(1, 1)); + + cluster.must_put(b"k1", b"v1"); + cluster.must_put(b"k3", b"v3"); + + let region = cluster.get_region(b"k1"); + cluster.must_split(®ion, b"k2"); + let left = cluster.get_region(b"k1"); + let right = cluster.get_region(b"k3"); + + // Allow raft messages to source peer on store 3 before PrepareMerge. + let left_filter_block = Arc::new(atomic::AtomicBool::new(false)); + let left_filter_block_ = left_filter_block.clone(); + let left_blocked_messages = Arc::new(Mutex::new(vec![])); + let left_filter = RegionPacketFilter::new(left.get_id(), 3) + .direction(Direction::Recv) + .when(left_filter_block.clone()) + .reserve_dropped(left_blocked_messages.clone()) + .set_msg_callback(Arc::new(move |msg: &RaftMessage| { + debug!("dbg left msg_callback"; "msg" => ?msg); + if left_filter_block.load(atomic::Ordering::SeqCst) { + return; + } + for e in msg.get_message().get_entries() { + let ctx = raftstore::store::ProposalContext::from_bytes(&e.context); + if ctx.contains(raftstore::store::ProposalContext::PREPARE_MERGE) { + // Block further messages. + left_filter_block.store(true, atomic::Ordering::SeqCst); + } + } + })); + cluster.sim.wl().add_recv_filter(3, Box::new(left_filter)); + // Block messages to target peer on store 3. 
+ let right_filter_block = Arc::new(atomic::AtomicBool::new(true)); + let new_peer_id = 1004; + let (new_peer_id_tx, new_peer_id_rx) = std::sync::mpsc::channel(); + let new_peer_id_tx = Mutex::new(Some(new_peer_id_tx)); + let (new_peer_snap_tx, new_peer_snap_rx) = std::sync::mpsc::channel(); + let new_peer_snap_tx = Mutex::new(new_peer_snap_tx); + let right_filter = RegionPacketFilter::new(right.get_id(), 3) + .direction(Direction::Recv) + .when(right_filter_block.clone()) + .set_msg_callback(Arc::new(move |msg: &RaftMessage| { + debug!("dbg right msg_callback"; "msg" => ?msg); + if msg.get_to_peer().get_id() == new_peer_id { + let _ = new_peer_id_tx.lock().unwrap().take().map(|tx| tx.send(())); + if msg.get_message().get_msg_type() == MessageType::MsgSnapshot { + let _ = new_peer_snap_tx.lock().unwrap().send(()); + } + } + })); + cluster.sim.wl().add_recv_filter(3, Box::new(right_filter)); + pd_client.must_merge(left.get_id(), right.get_id()); + + // Make target peer on store 3 a stale peer. + pd_client.must_remove_peer(right.get_id(), find_peer(&right, 3).unwrap().to_owned()); + pd_client.must_add_peer(right.get_id(), new_peer(3, new_peer_id)); + // Unblock messages to target peer on store 3. + right_filter_block.store(false, atomic::Ordering::SeqCst); + // Wait for receiving new peer id message to destroy stale target peer. + new_peer_id_rx.recv_timeout(Duration::from_secs(5)).unwrap(); + cluster.must_region_not_exist(right.get_id(), 3); + // Let source peer continue prepare merge. It will fail to schedule merge, + // because the target peer is destroyed. + left_filter_block_.store(false, atomic::Ordering::SeqCst); + // Before sending blocked messages, make sure source peer is paused at + // destroy apply delegate, so that the new right peer snapshot will + // try to destroy source peer before applying snapshot. + fail::cfg("on_apply_handle_destroy", "pause").unwrap(); + // Send blocked messages to source peer. 
Prepare merge must fail to schedule + // CommitMerge because now target peer stale peer is destroyed. + let router = cluster.sim.wl().get_router(3).unwrap(); + for raft_msg in std::mem::take(&mut *left_blocked_messages.lock().unwrap()) { + router.send_raft_message(raft_msg).unwrap(); + } + // Wait the new right peer snapshot. + new_peer_snap_rx + .recv_timeout(Duration::from_secs(5)) + .unwrap(); + // Give it some time to step snapshot message. + sleep_ms(500); + // Let source peer destroy continue, so it races with atomic snapshot destroy. + fail::remove("on_apply_handle_destroy"); + + // New peer applies snapshot eventually. + cluster.must_transfer_leader(right.get_id(), new_peer(3, new_peer_id)); + cluster.must_put(b"k4", b"v4"); +} diff --git a/tests/failpoints/cases/test_rawkv.rs b/tests/failpoints/cases/test_rawkv.rs index a795422c120..b7886ce8267 100644 --- a/tests/failpoints/cases/test_rawkv.rs +++ b/tests/failpoints/cases/test_rawkv.rs @@ -3,6 +3,7 @@ use std::{sync::Arc, thread, time::Duration}; use causal_ts::{CausalTsProvider, CausalTsProviderImpl}; +use engine_rocks::RocksEngine; use futures::executor::block_on; use grpcio::{ChannelBuilder, Environment}; use kvproto::{ @@ -14,7 +15,7 @@ use test_raftstore::*; use tikv_util::{time::Instant, HandyRwLock}; struct TestSuite { - pub cluster: Cluster, + pub cluster: Cluster>, api_version: ApiVersion, } diff --git a/tests/failpoints/cases/test_read_execution_tracker.rs b/tests/failpoints/cases/test_read_execution_tracker.rs index c5ff93a70c1..7351044b297 100644 --- a/tests/failpoints/cases/test_read_execution_tracker.rs +++ b/tests/failpoints/cases/test_read_execution_tracker.rs @@ -2,13 +2,13 @@ use kvproto::kvrpcpb::*; use test_coprocessor::{init_with_data, DagSelect, ProductTable}; -use test_raftstore::{ - kv_batch_read, kv_read, must_kv_commit, must_kv_prewrite, must_new_cluster_and_kv_client, -}; +use test_raftstore::{kv_batch_read, kv_read, must_kv_commit, must_kv_prewrite}; +use 
test_raftstore_macro::test_case; -#[test] +#[test_case(test_raftstore::must_new_cluster_and_kv_client)] +#[test_case(test_raftstore_v2::must_new_cluster_and_kv_client)] fn test_read_execution_tracking() { - let (_cluster, client, ctx) = must_new_cluster_and_kv_client(); + let (_cluster, client, ctx) = new_cluster(); let (k1, v1) = (b"k1".to_vec(), b"v1".to_vec()); let (k2, v2) = (b"k2".to_vec(), b"v2".to_vec()); @@ -104,18 +104,21 @@ fn test_read_execution_tracking() { ); }; - fail::cfg("perform_read_index", "return()").unwrap(); + // return read_index twice: one for local reader and one for raftstore + fail::cfg("perform_read_index", "2*return()").unwrap(); // should perform read index let resp = kv_read(&client, ctx.clone(), k1.clone(), 100); read_index_checker(resp.get_exec_details_v2().get_scan_detail_v2()); + fail::cfg("perform_read_index", "2*return()").unwrap(); // should perform read index let resp = kv_batch_read(&client, ctx, vec![k1, k2], 100); read_index_checker(resp.get_exec_details_v2().get_scan_detail_v2()); + fail::cfg("perform_read_index", "2*return()").unwrap(); // should perform read index let resp = client.coprocessor(&coprocessor_request).unwrap(); diff --git a/tests/failpoints/cases/test_replica_read.rs b/tests/failpoints/cases/test_replica_read.rs index 773d721da8b..624e7a6f788 100644 --- a/tests/failpoints/cases/test_replica_read.rs +++ b/tests/failpoints/cases/test_replica_read.rs @@ -315,7 +315,7 @@ fn test_read_after_cleanup_range_for_snap() { request.mut_header().set_peer(p3); request.mut_header().set_replica_read(true); // Send follower read request to peer 3 - let (cb1, mut rx1) = make_cb(&request); + let (cb1, mut rx1) = make_cb_rocks(&request); cluster .sim .rl() @@ -619,7 +619,7 @@ fn test_batch_read_index_after_transfer_leader() { let mut req = new_request(1, epoch, vec![new_read_index_cmd()], true); req.mut_header().set_peer(new_peer(2, 2)); - let (cb, rx) = make_cb(&req); + let (cb, rx) = make_cb_rocks(&req); 
cluster.sim.rl().async_command_on_node(2, req, cb).unwrap(); resps.push(rx); } diff --git a/tests/failpoints/cases/test_replica_stale_read.rs b/tests/failpoints/cases/test_replica_stale_read.rs index b7d436d92d7..30ccda4fe21 100644 --- a/tests/failpoints/cases/test_replica_stale_read.rs +++ b/tests/failpoints/cases/test_replica_stale_read.rs @@ -2,20 +2,31 @@ use std::{sync::Arc, time::Duration}; +use engine_rocks::RocksEngine; use kvproto::{kvrpcpb::Op, metapb::Peer}; use pd_client::PdClient; use raft::eraftpb::MessageType; use test_pd_client::TestPdClient; use test_raftstore::*; -fn prepare_for_stale_read(leader: Peer) -> (Cluster, Arc, PeerClient) { +fn prepare_for_stale_read( + leader: Peer, +) -> ( + Cluster>, + Arc, + PeerClient, +) { prepare_for_stale_read_before_run(leader, None) } fn prepare_for_stale_read_before_run( leader: Peer, before_run: Option>, -) -> (Cluster, Arc, PeerClient) { +) -> ( + Cluster>, + Arc, + PeerClient, +) { let mut cluster = new_server_cluster(0, 3); let pd_client = Arc::clone(&cluster.pd_client); pd_client.disable_default_operator(); @@ -288,9 +299,11 @@ fn test_update_resoved_ts_before_apply_index() { sleep_ms(100); // The leader can't handle stale read with `commit_ts2` because its `safe_ts` - // can't update due to its `apply_index` not update + // can't update due to its `apply_index` not update. + // The request would be handled as a snapshot read on the valid leader peer + // after fallback. 
let resp = leader_client.kv_read(b"key1".to_vec(), commit_ts2); - assert!(resp.get_region_error().has_data_is_not_ready(),); + assert_eq!(resp.get_value(), b"value2"); // The follower can't handle stale read with `commit_ts2` because it don't // have enough data let resp = follower_client2.kv_read(b"key1".to_vec(), commit_ts2); @@ -667,10 +680,10 @@ fn test_stale_read_future_ts_not_update_max_ts() { b"key1".to_vec(), ); - // Perform stale read with a future ts should return error + // Perform stale read with a future ts, the stale read could be processed + // falling back to snapshot read on the leader peer. let read_ts = get_tso(&pd_client) + 10000000; - let resp = leader_client.kv_read(b"key1".to_vec(), read_ts); - assert!(resp.get_region_error().has_data_is_not_ready()); + leader_client.must_kv_read_equal(b"key1".to_vec(), b"value1".to_vec(), read_ts); // The `max_ts` should not updated by the stale read request, so we can prewrite // and commit `async_commit` transaction with a ts that smaller than the @@ -687,10 +700,10 @@ fn test_stale_read_future_ts_not_update_max_ts() { leader_client.must_kv_commit(vec![b"key2".to_vec()], prewrite_ts, commit_ts); leader_client.must_kv_read_equal(b"key2".to_vec(), b"value1".to_vec(), get_tso(&pd_client)); - // Perform stale read with a future ts should return error + // Perform stale read with a future ts, the stale read could be processed + // falling back to snapshot read on the leader peer. 
let read_ts = get_tso(&pd_client) + 10000000; - let resp = leader_client.kv_read(b"key1".to_vec(), read_ts); - assert!(resp.get_region_error().has_data_is_not_ready()); + leader_client.must_kv_read_equal(b"key2".to_vec(), b"value1".to_vec(), read_ts); // The `max_ts` should not updated by the stale read request, so 1pc transaction // with a ts that smaller than the `read_ts` should not be fallbacked to 2pc diff --git a/tests/failpoints/cases/test_snap.rs b/tests/failpoints/cases/test_snap.rs index 7748b1d2985..ca23b4c5a17 100644 --- a/tests/failpoints/cases/test_snap.rs +++ b/tests/failpoints/cases/test_snap.rs @@ -992,3 +992,20 @@ fn test_snapshot_send_failed() { sleep_ms(100); assert!(mgr.list_snapshot().unwrap().is_empty()); } + +#[test] +/// Test a corrupted snapshot can be detected and retry to generate a new one. +fn test_retry_corrupted_snapshot() { + let mut cluster = new_node_cluster(0, 3); + let pd_client = cluster.pd_client.clone(); + pd_client.disable_default_operator(); + + let r = cluster.run_conf_change(); + cluster.must_put(b"k1", b"v1"); + must_get_none(&cluster.get_engine(3), b"k1"); + pd_client.must_add_peer(r, new_peer(2, 2)); + fail::cfg("inject_sst_file_corruption", "return").unwrap(); + pd_client.must_add_peer(r, new_peer(3, 3)); + + must_get_equal(&cluster.get_engine(3), b"k1", b"v1"); +} diff --git a/tests/failpoints/cases/test_split_region.rs b/tests/failpoints/cases/test_split_region.rs index 3520de4e3ad..2ef3d499d22 100644 --- a/tests/failpoints/cases/test_split_region.rs +++ b/tests/failpoints/cases/test_split_region.rs @@ -1,5 +1,4 @@ // Copyright 2018 TiKV Project Authors. Licensed under Apache-2.0. 
- use std::{ sync::{ atomic::{AtomicBool, Ordering}, @@ -18,6 +17,7 @@ use kvproto::{ Mutation, Op, PessimisticLockRequest, PrewriteRequest, PrewriteRequestPessimisticAction::*, }, metapb::Region, + pdpb::CheckPolicy, raft_serverpb::{PeerState, RaftMessage}, tikvpb::TikvClient, }; @@ -32,6 +32,7 @@ use raftstore::{ Result, }; use test_raftstore::*; +use test_raftstore_macro::test_case; use tikv::storage::{kv::SnapshotExt, Snapshot}; use tikv_util::{ config::{ReadableDuration, ReadableSize}, @@ -41,6 +42,85 @@ use tikv_util::{ }; use txn_types::{Key, LastChange, PessimisticLock, TimeStamp}; +#[test] +fn test_meta_inconsistency() { + let mut cluster = new_server_cluster(0, 3); + cluster.cfg.raft_store.store_batch_system.pool_size = 2; + cluster.cfg.raft_store.store_batch_system.max_batch_size = Some(1); + cluster.cfg.raft_store.apply_batch_system.pool_size = 2; + cluster.cfg.raft_store.apply_batch_system.max_batch_size = Some(1); + cluster.cfg.raft_store.hibernate_regions = false; + cluster.cfg.raft_store.raft_log_gc_threshold = 1000; + let pd_client = Arc::clone(&cluster.pd_client); + pd_client.disable_default_operator(); + let region_id = cluster.run_conf_change(); + pd_client.must_add_peer(region_id, new_peer(2, 2)); + cluster.must_transfer_leader(region_id, new_peer(1, 1)); + cluster.must_put(b"k1", b"v1"); + + // Add new peer on node 3, its snapshot apply is paused. + fail::cfg("before_set_region_on_peer_3", "pause").unwrap(); + pd_client.must_add_peer(region_id, new_peer(3, 3)); + + // Let only heartbeat msg to pass so a replicate peer could be created on node 3 + // for peer 1003. + let region_packet_filter_region_1000_peer_1003 = + RegionPacketFilter::new(1000, 3).skip(MessageType::MsgHeartbeat); + cluster + .sim + .wl() + .add_recv_filter(3, Box::new(region_packet_filter_region_1000_peer_1003)); + + // Trigger a region split to create region 1000 with peer 1001, 1002 and 1003. 
+ let region = cluster.get_region(b""); + cluster.must_split(&region, b"k5"); + + // Schedule a larger peer id heartbeat msg to trigger peer destroy for peer + // 1003, pause it before the meta.lock operation so new region insertions by + // region split could go first. + // Thus an inconsistency could happen because the destroy is handled + // by an uninitialized peer but the new initialized region info is inserted into + // the meta by region split. + fail::cfg("before_destroy_peer_on_peer_1003", "pause").unwrap(); + let new_region = cluster.get_region(b"k4"); + let mut larger_id_msg = Box::<RaftMessage>::default(); + larger_id_msg.set_region_id(1000); + larger_id_msg.set_to_peer(new_peer(3, 1113)); + larger_id_msg.set_region_epoch(new_region.get_region_epoch().clone()); + larger_id_msg + .mut_region_epoch() + .set_conf_ver(new_region.get_region_epoch().get_conf_ver() + 1); + larger_id_msg.set_from_peer(new_peer(1, 1001)); + let raft_message = larger_id_msg.mut_message(); + raft_message.set_msg_type(MessageType::MsgHeartbeat); + raft_message.set_from(1001); + raft_message.set_to(1113); + raft_message.set_term(6); + cluster.sim.wl().send_raft_msg(*larger_id_msg).unwrap(); + thread::sleep(Duration::from_millis(500)); + + // Let snapshot apply continue on peer 3 from region 0, then region split would + // be applied too. + fail::remove("before_set_region_on_peer_3"); + thread::sleep(Duration::from_millis(2000)); + + // Let self destroy continue after the region split is finished. 
+ fail::remove("before_destroy_peer_on_peer_1003"); + sleep_ms(1000); + + // Clear the network partition nemesis, trigger a new region split, panic would + // be encountered The thread 'raftstore-3-1::test_message_order_3' panicked + // at 'meta corrupted: no region for 1000 7A6B35 when creating 1004 + // region_id: 1004 from_peer { id: 1005 store_id: 1 } to_peer { id: 1007 + // store_id: 3 } message { msg_type: MsgRequestPreVote to: 1007 from: 1005 + // term: 6 log_term: 5 index: 5 commit: 5 commit_term: 5 } region_epoch { + // conf_ver: 3 version: 3 } end_key: 6B32'. + cluster.sim.wl().clear_recv_filters(3); + let region = cluster.get_region(b"k1"); + cluster.must_split(®ion, b"k2"); + cluster.must_put(b"k1", b"v1"); +} + #[test] fn test_follower_slow_split() { let mut cluster = new_node_cluster(0, 3); @@ -268,6 +348,68 @@ impl Filter for PrevoteRangeFilter { } } +#[test_case(test_raftstore::new_node_cluster)] +#[test_case(test_raftstore_v2::new_node_cluster)] +fn test_region_size_after_split() { + let mut cluster = new_cluster(0, 1); + cluster.cfg.raft_store.right_derive_when_split = true; + cluster.cfg.raft_store.split_region_check_tick_interval = ReadableDuration::millis(100); + cluster.cfg.raft_store.pd_heartbeat_tick_interval = ReadableDuration::millis(100); + cluster.cfg.raft_store.region_split_check_diff = Some(ReadableSize(10)); + let region_max_size = 1440; + let region_split_size = 960; + cluster.cfg.coprocessor.region_max_size = Some(ReadableSize(region_max_size)); + cluster.cfg.coprocessor.region_split_size = Some(ReadableSize(region_split_size)); + let pd_client = cluster.pd_client.clone(); + pd_client.disable_default_operator(); + let _r = cluster.run_conf_change(); + + // insert 20 key value pairs into the cluster. + // from 000000001 to 000000020 + let mut range = 1..; + put_till_size(&mut cluster, region_max_size - 100, &mut range); + sleep_ms(100); + // disable check split. 
+ fail::cfg("on_split_region_check_tick", "return").unwrap(); + let max_key = put_till_size(&mut cluster, region_max_size, &mut range); + // split by use key, split region 1 to region 1 and region 2. + // region 1: ["000000010",""] + // region 2: ["","000000010") + let region = pd_client.get_region(&max_key).unwrap(); + cluster.must_split(®ion, b"000000010"); + let size = cluster + .pd_client + .get_region_approximate_size(region.get_id()) + .unwrap_or_default(); + assert!(size >= region_max_size - 100, "{}", size); + + let region = pd_client.get_region(b"000000009").unwrap(); + let size1 = cluster + .pd_client + .get_region_approximate_size(region.get_id()) + .unwrap_or_default(); + assert_eq!(0, size1, "{}", size1); + + // split region by size check, the region 1 will be split to region 1 and region + // 3. and the region3 will contains one half region size data. + let region = pd_client.get_region(&max_key).unwrap(); + pd_client.split_region(region.clone(), CheckPolicy::Scan, vec![]); + sleep_ms(200); + let size2 = cluster + .pd_client + .get_region_approximate_size(region.get_id()) + .unwrap_or_default(); + assert!(size > size2, "{}:{}", size, size2); + fail::remove("on_split_region_check_tick"); + + let region = pd_client.get_region(b"000000010").unwrap(); + let size3 = cluster + .pd_client + .get_region_approximate_size(region.get_id()) + .unwrap_or_default(); + assert!(size3 > 0, "{}", size3); +} + // Test if a peer is created from splitting when another initialized peer with // the same region id has already existed. In previous implementation, it can be // created and panic will happen because there are two initialized peer with the @@ -674,7 +816,7 @@ impl Filter for CollectSnapshotFilter { #[test] fn test_split_duplicated_batch() { let mut cluster = new_node_cluster(0, 3); - configure_for_request_snapshot(&mut cluster); + configure_for_request_snapshot(&mut cluster.cfg); // Disable raft log gc in this test case. 
cluster.cfg.raft_store.raft_log_gc_tick_interval = ReadableDuration::secs(60); // Use one thread to make it more possible to be fetched into one batch. @@ -1407,3 +1549,65 @@ fn test_split_region_with_no_valid_split_keys() { rx.recv_timeout(Duration::from_secs(5)).unwrap(); rx.try_recv().unwrap_err(); } + +/// This test case test if a split failed for some reason, +/// it can continue run split check and eventually the split will finish +#[test_case(test_raftstore::new_node_cluster)] +fn test_split_by_split_check_on_size() { + let mut cluster = new_cluster(0, 1); + cluster.cfg.raft_store.right_derive_when_split = true; + cluster.cfg.raft_store.split_region_check_tick_interval = ReadableDuration::millis(50); + cluster.cfg.raft_store.pd_heartbeat_tick_interval = ReadableDuration::millis(100); + cluster.cfg.raft_store.region_split_check_diff = Some(ReadableSize(10)); + let region_max_size = 1440; + let region_split_size = 960; + cluster.cfg.coprocessor.region_max_size = Some(ReadableSize(region_max_size)); + cluster.cfg.coprocessor.region_split_size = Some(ReadableSize(region_split_size)); + let pd_client = cluster.pd_client.clone(); + pd_client.disable_default_operator(); + let _r = cluster.run_conf_change(); + + // make first split fail + // 1*return means it would run "return" action once + fail::cfg("fail_pre_propose_split", "1*return").unwrap(); + + // Insert region_max_size into the cluster. 
+ // It should trigger the split + let mut range = 1..; + let key = put_till_size(&mut cluster, region_max_size / 2, &mut range); + let region = pd_client.get_region(&key).unwrap(); + put_till_size(&mut cluster, region_max_size / 2 + 100, &mut range); + // waiting the split, + cluster.wait_region_split(®ion); +} + +/// This test case test if a split failed for some reason, +/// it can continue run split check and eventually the split will finish +#[test_case(test_raftstore::new_node_cluster)] +fn test_split_by_split_check_on_keys() { + let mut cluster = new_cluster(0, 1); + cluster.cfg.raft_store.right_derive_when_split = true; + cluster.cfg.raft_store.split_region_check_tick_interval = ReadableDuration::millis(50); + cluster.cfg.raft_store.pd_heartbeat_tick_interval = ReadableDuration::millis(100); + cluster.cfg.raft_store.region_split_check_diff = Some(ReadableSize(10)); + let region_max_keys = 15; + let region_split_keys = 10; + cluster.cfg.coprocessor.region_max_keys = Some(region_max_keys); + cluster.cfg.coprocessor.region_split_keys = Some(region_split_keys); + let pd_client = cluster.pd_client.clone(); + pd_client.disable_default_operator(); + let _r = cluster.run_conf_change(); + + // make first split fail + // 1*return means it would run "return" action once + fail::cfg("fail_pre_propose_split", "1*return").unwrap(); + + // Insert region_max_size into the cluster. 
+ // It should trigger the split + let mut range = 1..; + let key = put_till_count(&mut cluster, region_max_keys / 2, &mut range); + let region = pd_client.get_region(&key).unwrap(); + put_till_count(&mut cluster, region_max_keys / 2 + 3, &mut range); + // waiting the split, + cluster.wait_region_split(®ion); +} diff --git a/tests/failpoints/cases/test_sst_recovery.rs b/tests/failpoints/cases/test_sst_recovery.rs index a4c1f10b5ae..05b0badd662 100644 --- a/tests/failpoints/cases/test_sst_recovery.rs +++ b/tests/failpoints/cases/test_sst_recovery.rs @@ -105,7 +105,7 @@ fn test_sst_recovery_overlap_range_sst_exist() { must_get_equal(&engine1, b"7", b"val_1"); // Validate the damaged sst has been deleted. - compact_files_to_target_level(&engine1, true, 3).unwrap(); + compact_files_to_target_level(&engine1, true, 6).unwrap(); let files = engine1.as_inner().get_live_files(); assert_eq!(files.get_files_count(), 1); @@ -179,8 +179,11 @@ fn compact_files_to_target_level( engine.compact_files_cf(CF_DEFAULT, file_names, Some(level), 1, false) } -fn create_tikv_cluster_with_one_node_damaged() --> (Cluster, Arc, RocksEngine) { +fn create_tikv_cluster_with_one_node_damaged() -> ( + Cluster>, + Arc, + RocksEngine, +) { let mut cluster = new_server_cluster(0, 3); let pd_client = cluster.pd_client.clone(); pd_client.disable_default_operator(); @@ -252,7 +255,7 @@ fn create_tikv_cluster_with_one_node_damaged() disturb_sst_file(&sst_path); // The sst file is damaged, so this action will fail. 
- assert_corruption(compact_files_to_target_level(&engine1, true, 3)); + assert_corruption(compact_files_to_target_level(&engine1, true, 6)); (cluster, pd_client, engine1) } diff --git a/tests/failpoints/cases/test_stale_peer.rs b/tests/failpoints/cases/test_stale_peer.rs index 39fa09ef014..80c73f03a16 100644 --- a/tests/failpoints/cases/test_stale_peer.rs +++ b/tests/failpoints/cases/test_stale_peer.rs @@ -12,6 +12,7 @@ use kvproto::raft_serverpb::{PeerState, RaftLocalState, RaftMessage}; use pd_client::PdClient; use raft::eraftpb::MessageType; use test_raftstore::*; +use test_raftstore_macro::test_case; use tikv_util::{config::ReadableDuration, time::Instant, HandyRwLock}; #[test] @@ -44,7 +45,8 @@ fn test_one_node_leader_missing() { fail::remove(check_stale_state); } -#[test] +#[test_case(test_raftstore::new_node_cluster)] +#[test_case(test_raftstore_v2::new_node_cluster)] fn test_node_update_localreader_after_removed() { let mut cluster = new_node_cluster(0, 6); let pd_client = cluster.pd_client.clone(); @@ -90,7 +92,8 @@ fn test_node_update_localreader_after_removed() { cluster.must_region_not_exist(r1, 2); } -#[test] +#[test_case(test_raftstore::new_node_cluster)] +#[test_case(test_raftstore_v2::new_node_cluster)] fn test_stale_learner_restart() { let mut cluster = new_node_cluster(0, 2); cluster.pd_client.disable_default_operator(); @@ -133,9 +136,11 @@ fn test_stale_learner_restart() { must_get_equal(&cluster.get_engine(2), b"k2", b"v2"); } +/// pass /// Test if a peer can be destroyed through tombstone msg when applying /// snapshot. 
-#[test] +#[test_case(test_raftstore::new_node_cluster)] +#[test_case(test_raftstore_v2::new_node_cluster)] fn test_stale_peer_destroy_when_apply_snapshot() { let mut cluster = new_node_cluster(0, 3); configure_for_snapshot(&mut cluster.cfg); @@ -210,9 +215,11 @@ fn test_stale_peer_destroy_when_apply_snapshot() { must_get_none(&cluster.get_engine(3), b"k1"); } +/// pass /// Test if destroy a uninitialized peer through tombstone msg would allow a /// staled peer be created again. -#[test] +#[test_case(test_raftstore::new_node_cluster)] +#[test_case(test_raftstore_v2::new_node_cluster)] fn test_destroy_uninitialized_peer_when_there_exists_old_peer() { // 4 stores cluster. let mut cluster = new_node_cluster(0, 4); @@ -291,7 +298,8 @@ fn test_destroy_uninitialized_peer_when_there_exists_old_peer() { /// Logs scan are now moved to raftlog gc threads. The case is to test if logs /// are still cleaned up when there is stale logs before first index during /// destroy. -#[test] +#[test_case(test_raftstore::new_node_cluster)] +#[test_case(test_raftstore_v2::new_node_cluster)] fn test_destroy_clean_up_logs_with_unfinished_log_gc() { let mut cluster = new_node_cluster(0, 3); cluster.cfg.raft_store.raft_log_gc_count_limit = Some(15); diff --git a/tests/failpoints/cases/test_stale_read.rs b/tests/failpoints/cases/test_stale_read.rs index 523bb54f7cb..ceb018fc610 100644 --- a/tests/failpoints/cases/test_stale_read.rs +++ b/tests/failpoints/cases/test_stale_read.rs @@ -6,6 +6,7 @@ use std::{ time::Duration, }; +use engine_rocks::RocksEngine; use kvproto::metapb::{Peer, Region}; use pd_client::PdClient; use raft::eraftpb::MessageType; @@ -83,7 +84,7 @@ fn stale_read_during_splitting(right_derive: bool) { } fn must_not_stale_read( - cluster: &mut Cluster, + cluster: &mut Cluster>, stale_key: &[u8], old_region: &Region, old_leader: &Peer, @@ -166,7 +167,7 @@ fn must_not_stale_read( } fn must_not_eq_on_key( - cluster: &mut Cluster, + cluster: &mut Cluster>, key: &[u8], value: &[u8], 
read_quorum: bool, @@ -325,7 +326,7 @@ fn test_read_index_when_transfer_leader_2() { // Increase the election tick to make this test case running reliably. configure_for_lease_read(&mut cluster.cfg, Some(50), Some(10_000)); // Stop log compaction to transfer leader with filter easier. - configure_for_request_snapshot(&mut cluster); + configure_for_request_snapshot(&mut cluster.cfg); let max_lease = Duration::from_secs(2); cluster.cfg.raft_store.raft_store_max_leader_lease = ReadableDuration(max_lease); @@ -455,7 +456,7 @@ fn test_read_after_peer_destroyed() { false, ); request.mut_header().set_peer(new_peer(1, 1)); - let (cb, mut rx) = make_cb(&request); + let (cb, mut rx) = make_cb_rocks(&request); cluster .sim .rl() diff --git a/tests/failpoints/cases/test_storage.rs b/tests/failpoints/cases/test_storage.rs index 57047bef9d4..fec1ccc931d 100644 --- a/tests/failpoints/cases/test_storage.rs +++ b/tests/failpoints/cases/test_storage.rs @@ -24,6 +24,7 @@ use kvproto::{ }; use resource_control::ResourceGroupManager; use test_raftstore::*; +use test_raftstore_macro::test_case; use tikv::{ config::{ConfigController, Module}, storage::{ @@ -44,10 +45,11 @@ use tikv::{ use tikv_util::{future::paired_future_callback, worker::dummy_scheduler, HandyRwLock}; use txn_types::{Key, Mutation, TimeStamp}; -#[test] +#[test_case(test_raftstore::new_server_cluster)] +#[test_case(test_raftstore_v2::new_server_cluster)] fn test_scheduler_leader_change_twice() { let snapshot_fp = "scheduler_async_snapshot_finish"; - let mut cluster = new_server_cluster(0, 2); + let mut cluster = new_cluster(0, 2); cluster.run(); let region0 = cluster.get_region(b""); let peers = region0.get_peers(); @@ -108,10 +110,11 @@ fn test_scheduler_leader_change_twice() { } } -#[test] +#[test_case(test_raftstore::new_server_cluster)] +#[test_case(test_raftstore_v2::new_server_cluster)] fn test_server_catching_api_error() { let raftkv_fp = "raftkv_early_error_report"; - let mut cluster = new_server_cluster(0, 1); 
+ let mut cluster = new_cluster(0, 1); cluster.run(); let region = cluster.get_region(b""); let leader = region.get_peers()[0].clone(); @@ -168,10 +171,11 @@ fn test_server_catching_api_error() { must_get_equal(&cluster.get_engine(1), b"k3", b"v3"); } -#[test] +#[test_case(test_raftstore::new_server_cluster)] +#[test_case(test_raftstore_v2::new_server_cluster)] fn test_raftkv_early_error_report() { let raftkv_fp = "raftkv_early_error_report"; - let mut cluster = new_server_cluster(0, 1); + let mut cluster = new_cluster(0, 1); cluster.run(); cluster.must_split(&cluster.get_region(b"k0"), b"k1"); @@ -233,10 +237,12 @@ fn test_raftkv_early_error_report() { fail::remove(raftkv_fp); } -#[test] +// FIXME: #[test_case(test_raftstore_v2::new_server_cluster)] +// Raftstore-v2 not support get the storage engine, returning `None` currently. +#[test_case(test_raftstore::new_server_cluster)] fn test_scale_scheduler_pool() { let snapshot_fp = "scheduler_start_execute"; - let mut cluster = new_server_cluster(0, 1); + let mut cluster = new_cluster(0, 1); cluster.run(); let origin_pool_size = cluster.cfg.storage.scheduler_worker_pool_size; @@ -332,9 +338,10 @@ fn test_scale_scheduler_pool() { fail::remove(snapshot_fp); } -#[test] +#[test_case(test_raftstore::new_server_cluster)] +#[test_case(test_raftstore_v2::new_server_cluster)] fn test_scheduler_pool_auto_switch_for_resource_ctl() { - let mut cluster = new_server_cluster(0, 1); + let mut cluster = new_cluster(0, 1); cluster.run(); let engine = cluster @@ -345,12 +352,12 @@ fn test_scheduler_pool_auto_switch_for_resource_ctl() { .get(&1) .unwrap() .clone(); - let resource_manager = ResourceGroupManager::default(); + let resource_manager = Arc::new(ResourceGroupManager::default()); let resource_ctl = resource_manager.derive_controller("test".to_string(), true); let storage = TestStorageBuilderApiV1::from_engine_and_lock_mgr(engine, MockLockManager::new()) .config(cluster.cfg.tikv.storage.clone()) - 
.build_for_resource_controller(resource_ctl) + .build_for_resource_controller(resource_manager.clone(), resource_ctl) .unwrap(); let region = cluster.get_region(b"k1"); @@ -1090,9 +1097,10 @@ fn test_async_apply_prewrite_impl( } } -#[test] +#[test_case(test_raftstore::new_server_cluster)] +#[test_case(test_raftstore_v2::new_server_cluster)] fn test_async_apply_prewrite() { - let mut cluster = new_server_cluster(0, 1); + let mut cluster = new_cluster(0, 1); cluster.run(); let engine = cluster @@ -1149,7 +1157,6 @@ fn test_async_apply_prewrite() { true, true, ); - test_async_apply_prewrite_impl( &storage, ctx.clone(), @@ -1188,9 +1195,10 @@ fn test_async_apply_prewrite() { ); } -#[test] +#[test_case(test_raftstore::new_server_cluster)] +#[test_case(test_raftstore_v2::new_server_cluster)] fn test_async_apply_prewrite_fallback() { - let mut cluster = new_server_cluster(0, 1); + let mut cluster = new_cluster(0, 1); cluster.run(); let engine = cluster @@ -1378,9 +1386,10 @@ fn test_async_apply_prewrite_1pc_impl( } } -#[test] +#[test_case(test_raftstore::new_server_cluster)] +#[test_case(test_raftstore_v2::new_server_cluster)] fn test_async_apply_prewrite_1pc() { - let mut cluster = new_server_cluster(0, 1); + let mut cluster = new_cluster(0, 1); cluster.run(); let engine = cluster @@ -1405,9 +1414,10 @@ fn test_async_apply_prewrite_1pc() { test_async_apply_prewrite_1pc_impl(&storage, ctx, b"key", b"value2", 20, true); } -#[test] +#[test_case(test_raftstore::new_server_cluster)] +#[test_case(test_raftstore_v2::new_server_cluster)] fn test_atomic_cas_lock_by_latch() { - let mut cluster = new_server_cluster(0, 1); + let mut cluster = new_cluster(0, 1); cluster.run(); let engine = cluster @@ -1493,9 +1503,10 @@ fn test_atomic_cas_lock_by_latch() { assert_eq!(b"v2".to_vec(), ret); } -#[test] +#[test_case(test_raftstore::new_server_cluster)] +#[test_case(test_raftstore_v2::new_server_cluster)] fn test_before_async_write_deadline() { - let mut cluster = new_server_cluster(0, 
1); + let mut cluster = new_cluster(0, 1); cluster.run(); let engine = cluster @@ -1532,12 +1543,13 @@ fn test_before_async_write_deadline() { )); } -#[test] +#[test_case(test_raftstore::new_server_cluster)] +#[test_case(test_raftstore_v2::new_server_cluster)] fn test_deadline_exceeded_on_get_and_batch_get() { use tikv_util::time::Instant; use tracker::INVALID_TRACKER_TOKEN; - let mut cluster = new_server_cluster(0, 1); + let mut cluster = new_cluster(0, 1); cluster.run(); let engine = cluster @@ -1591,9 +1603,10 @@ fn test_deadline_exceeded_on_get_and_batch_get() { fail::remove("after-snapshot"); } -#[test] +#[test_case(test_raftstore::new_server_cluster)] +#[test_case(test_raftstore_v2::new_server_cluster)] fn test_before_propose_deadline() { - let mut cluster = new_server_cluster(0, 1); + let mut cluster = new_cluster(0, 1); cluster.run(); let engine = cluster.sim.read().unwrap().storages[&1].clone(); @@ -1629,9 +1642,10 @@ fn test_before_propose_deadline() { ); } -#[test] +#[test_case(test_raftstore::new_server_cluster)] +#[test_case(test_raftstore_v2::new_server_cluster)] fn test_resolve_lock_deadline() { - let mut cluster = new_server_cluster(0, 1); + let mut cluster = new_cluster(0, 1); cluster.run(); let engine = cluster.sim.read().unwrap().storages[&1].clone(); @@ -1789,10 +1803,11 @@ fn test_mvcc_concurrent_commit_and_rollback_at_shutdown() { assert_eq!(get_resp.value, v); } -#[test] +#[test_case(test_raftstore::new_server_cluster)] +#[test_case(test_raftstore_v2::new_server_cluster)] fn test_raw_put_deadline() { let deadline_fp = "deadline_check_fail"; - let mut cluster = new_server_cluster(0, 1); + let mut cluster = new_cluster(0, 1); cluster.run(); let region = cluster.get_region(b""); let leader = region.get_peers()[0].clone(); diff --git a/tests/failpoints/cases/test_transaction.rs b/tests/failpoints/cases/test_transaction.rs index 14f4161c7ae..0b6e6269e95 100644 --- a/tests/failpoints/cases/test_transaction.rs +++ 
b/tests/failpoints/cases/test_transaction.rs @@ -2,6 +2,7 @@ use std::{ sync::{ + atomic::{AtomicBool, Ordering}, mpsc::{channel, sync_channel}, Arc, Mutex, }, @@ -9,13 +10,15 @@ use std::{ time::Duration, }; -use futures::executor::block_on; +use engine_traits::CF_DEFAULT; +use futures::{executor::block_on, StreamExt}; use grpcio::{ChannelBuilder, Environment}; use kvproto::{ kvrpcpb::{ self as pb, AssertionLevel, Context, GetRequest, Op, PessimisticLockRequest, PrewriteRequest, PrewriteRequestPessimisticAction::*, }, + raft_serverpb::RaftMessage, tikvpb::TikvClient, }; use raft::prelude::{ConfChangeType, MessageType}; @@ -45,7 +48,9 @@ use tikv::{ Snapshot, TestEngineBuilder, TestStorageBuilderApiV1, }, }; +use tikv_kv::{Engine, Modify, WriteData, WriteEvent}; use tikv_util::{ + config::ReadableDuration, store::{new_peer, peer::new_incoming_voter}, HandyRwLock, }; @@ -803,3 +808,97 @@ fn test_next_last_change_info_called_when_gc() { assert_eq!(h.join().unwrap().unwrap().as_slice(), b"v"); } + +fn must_put(ctx: &Context, engine: &E, key: &[u8], value: &[u8]) { + engine.put(ctx, Key::from_raw(key), value.to_vec()).unwrap(); +} + +fn must_delete(ctx: &Context, engine: &E, key: &[u8]) { + engine.delete(ctx, Key::from_raw(key)).unwrap(); +} + +// Before the fix, a proposal can be proposed twice, which is caused by that +// write proposal validation and propose are not atomic. So a raft message with +// higher term between them can make the proposal go to msg proposal +// forwarding logic. However, raft proposal forward logic is not compatible with +// the raft store, as the failed proposal makes client retry. The retried +// proposal coupled with forward proposal makes the proposal applied twice.
+#[test] +fn test_forbid_forward_propose() { + use test_raftstore_v2::*; + let count = 3; + let mut cluster = new_server_cluster(0, count); + cluster.cfg.raft_store.raft_base_tick_interval = ReadableDuration::millis(10); + cluster.cfg.raft_store.store_batch_system.pool_size = 2; + cluster.run(); + + let region = cluster.get_region(b""); + let peer1 = new_peer(1, 1); + let peer2 = new_peer(2, 2); + cluster.must_transfer_leader(region.id, peer2.clone()); + let storage = cluster.sim.rl().storages[&1].clone(); + let storage2 = cluster.sim.rl().storages[&2].clone(); + + let p = Arc::new(AtomicBool::new(false)); + let p2 = p.clone(); + let (tx, rx) = channel(); + let tx = Mutex::new(tx); + cluster.add_recv_filter_on_node( + 2, + Box::new(DropMessageFilter::new(Arc::new(move |_| { + if p2.load(Ordering::Relaxed) { + tx.lock().unwrap().send(()).unwrap(); + // One msg is enough + p2.store(false, Ordering::Relaxed); + true + } else { + false + } + }))), + ); + + let k = Key::from_raw(b"k"); + let mut ctx = Context::default(); + ctx.set_region_id(region.get_id()); + ctx.set_region_epoch(region.get_region_epoch().clone()); + ctx.set_peer(peer2); + + // block node when collecting message to make async write proposal and a raft + // message with higher term occur in a single batch.
+ fail::cfg("on_peer_collect_message_2", "pause").unwrap(); + let mut res = storage2.async_write( + &ctx, + WriteData::from_modifies(vec![Modify::Put(CF_DEFAULT, k.clone(), b"val".to_vec())]), + WriteEvent::EVENT_PROPOSED, + None, + ); + + // Make node 1 become leader + let router = cluster.get_router(1).unwrap(); + let mut raft_msg = RaftMessage::default(); + raft_msg.set_region_id(1); + raft_msg.set_to_peer(peer1.clone()); + raft_msg.set_region_epoch(region.get_region_epoch().clone()); + raft_msg + .mut_message() + .set_msg_type(MessageType::MsgTimeoutNow); + router.send_raft_message(Box::new(raft_msg)).unwrap(); + + std::thread::sleep(Duration::from_secs(1)); + + ctx.set_peer(peer1); + must_put(&ctx, &storage, b"k", b"val"); + must_delete(&ctx, &storage, b"k"); + + p.store(true, Ordering::Release); + rx.recv().unwrap(); + // Ensure the msg is sent by router. + std::thread::sleep(Duration::from_millis(100)); + fail::remove("on_peer_collect_message_2"); + + let r = block_on(async { res.next().await }).unwrap(); + assert!(matches!(r, WriteEvent::Finished(Err { .. 
}))); + + std::thread::sleep(Duration::from_secs(1)); + assert_eq!(cluster.get(k.as_encoded()), None); +} diff --git a/tests/failpoints/cases/test_unsafe_recovery.rs b/tests/failpoints/cases/test_unsafe_recovery.rs index cc33a01ff03..95d45c8e99c 100644 --- a/tests/failpoints/cases/test_unsafe_recovery.rs +++ b/tests/failpoints/cases/test_unsafe_recovery.rs @@ -440,3 +440,110 @@ fn test_unsafe_recovery_demotion_reentrancy() { assert_eq!(demoted, true); fail::remove("on_handle_apply_store_1"); } + +#[test_case(test_raftstore::new_node_cluster)] +#[test_case(test_raftstore_v2::new_node_cluster)] +fn test_unsafe_recovery_rollback_merge() { + let mut cluster = new_cluster(0, 3); + cluster.cfg.raft_store.raft_store_max_leader_lease = ReadableDuration::millis(40); + cluster.cfg.raft_store.merge_check_tick_interval = ReadableDuration::millis(20); + cluster.run(); + let nodes = Vec::from_iter(cluster.get_node_ids()); + assert_eq!(nodes.len(), 3); + + let pd_client = Arc::clone(&cluster.pd_client); + pd_client.disable_default_operator(); + + for i in 0..10 { + cluster.must_put(format!("k{}", i).as_bytes(), b"v"); + } + + // Block merge commit, let go of the merge prepare. + fail::cfg("on_schedule_merge", "return()").unwrap(); + + let region = pd_client.get_region(b"k1").unwrap(); + cluster.must_split(&region, b"k2"); + + let left = pd_client.get_region(b"k1").unwrap(); + let right = pd_client.get_region(b"k2").unwrap(); + + // Makes the leadership definite.
+ let left_peer_2 = find_peer(&left, nodes[2]).unwrap().to_owned(); + let right_peer_2 = find_peer(&right, nodes[2]).unwrap().to_owned(); + cluster.must_transfer_leader(left.get_id(), left_peer_2); + cluster.must_transfer_leader(right.get_id(), right_peer_2); + cluster.try_merge(left.get_id(), right.get_id()); + + let right_peer_0 = find_peer(&right, nodes[0]).unwrap().to_owned(); + pd_client.must_remove_peer(right.get_id(), right_peer_0); + cluster.must_remove_region(nodes[0], right.get_id()); + // Makes the group lose its quorum. + cluster.stop_node(nodes[1]); + cluster.stop_node(nodes[2]); + fail::remove("on_schedule_merge"); + { + let put = new_put_cmd(b"k2", b"v2"); + let req = new_request( + region.get_id(), + region.get_region_epoch().clone(), + vec![put], + true, + ); + // Majority is lost, can't propose command successfully. + cluster + .call_command_on_leader(req, Duration::from_millis(10)) + .unwrap_err(); + } + + cluster.must_enter_force_leader(left.get_id(), nodes[0], vec![nodes[1], nodes[2]]); + // Allow rollback merge to finish. + sleep_ms(100); + + // Construct recovery plan. + let mut plan = pdpb::RecoveryPlan::default(); + + let left_demote_peers: Vec = left + .get_peers() + .iter() + .filter(|&peer| peer.get_store_id() != nodes[0]) + .cloned() + .collect(); + let mut left_demote = pdpb::DemoteFailedVoters::default(); + left_demote.set_region_id(left.get_id()); + left_demote.set_failed_voters(left_demote_peers.into()); + plan.mut_demotes().push(left_demote); + + // Triggers the unsafe recovery plan execution.
+ pd_client.must_set_unsafe_recovery_plan(nodes[0], plan.clone()); + cluster.must_send_store_heartbeat(nodes[0]); + + let mut store_report = None; + for _ in 0..20 { + store_report = pd_client.must_get_store_report(nodes[0]); + if store_report.is_some() { + break; + } + sleep_ms(100); + } + assert_ne!(store_report, None); + // Demotion is done + let mut demoted = false; + for _ in 0..10 { + let new_left = block_on(pd_client.get_region_by_id(left.get_id())) + .unwrap() + .unwrap(); + assert_eq!(new_left.get_peers().len(), 3); + demoted = new_left + .get_peers() + .iter() + .filter(|peer| peer.get_store_id() != nodes[0]) + .all(|peer| peer.get_role() == metapb::PeerRole::Learner); + if demoted { + break; + } + sleep_ms(100); + } + assert_eq!(demoted, true); + + fail::remove("on_schedule_merge_ret_err"); +} diff --git a/tests/failpoints/cases/test_witness.rs b/tests/failpoints/cases/test_witness.rs index 02411ba1b76..f6fec8b35de 100644 --- a/tests/failpoints/cases/test_witness.rs +++ b/tests/failpoints/cases/test_witness.rs @@ -3,6 +3,7 @@ use std::{iter::FromIterator, sync::Arc, time::Duration}; use collections::HashMap; +use engine_rocks::RocksEngine; use futures::executor::block_on; use kvproto::{metapb, raft_serverpb::RaftApplyState}; use pd_client::PdClient; @@ -16,6 +17,7 @@ fn test_witness_update_region_in_local_reader() { cluster.run(); let nodes = Vec::from_iter(cluster.get_node_ids()); assert_eq!(nodes.len(), 3); + assert_eq!(nodes[2], 3); let pd_client = Arc::clone(&cluster.pd_client); pd_client.disable_default_operator(); @@ -51,7 +53,7 @@ fn test_witness_update_region_in_local_reader() { request.mut_header().set_replica_read(true); let resp = cluster - .read(None, request.clone(), Duration::from_millis(100)) + .read(None, None, request.clone(), Duration::from_millis(100)) .unwrap(); assert_eq!( resp.get_header().get_error().get_is_witness(), @@ -64,6 +66,52 @@ fn test_witness_update_region_in_local_reader() { 
fail::remove("change_peer_after_update_region_store_3"); } +// This case is almost the same as `test_witness_update_region_in_local_reader`, +// but it omits changing the peer to a witness, to ensure `peer_is_witness` +// won't be returned in a cluster without witnesses. +#[test] +fn test_witness_not_reported_while_disabled() { + let mut cluster = new_server_cluster(0, 3); + cluster.run(); + let nodes = Vec::from_iter(cluster.get_node_ids()); + assert_eq!(nodes.len(), 3); + assert_eq!(nodes[2], 3); + + let pd_client = Arc::clone(&cluster.pd_client); + pd_client.disable_default_operator(); + + let region = block_on(pd_client.get_region_by_id(1)).unwrap().unwrap(); + let peer_on_store1 = find_peer(&region, nodes[0]).unwrap().clone(); + cluster.must_transfer_leader(region.get_id(), peer_on_store1); + let peer_on_store3 = find_peer(&region, nodes[2]).unwrap().clone(); + + cluster.must_put(b"k0", b"v0"); + + // update region but the peer is not destroyed yet + fail::cfg("change_peer_after_update_region_store_3", "pause").unwrap(); + + cluster + .pd_client + .must_remove_peer(region.get_id(), peer_on_store3.clone()); + + let region = block_on(pd_client.get_region_by_id(1)).unwrap().unwrap(); + let mut request = new_request( + region.get_id(), + region.get_region_epoch().clone(), + vec![new_get_cmd(b"k0")], + false, + ); + request.mut_header().set_peer(peer_on_store3); + request.mut_header().set_replica_read(true); + + let resp = cluster + .read(None, None, request.clone(), Duration::from_millis(100)) + .unwrap(); + assert!(resp.get_header().has_error()); + assert!(!resp.get_header().get_error().has_is_witness()); + fail::remove("change_peer_after_update_region_store_3"); +} + // Test the case witness pull voter_replicated_index when has pending compact // cmd.
#[test] @@ -444,7 +492,7 @@ fn test_non_witness_replica_read() { request.mut_header().set_replica_read(true); let resp = cluster - .read(None, request, Duration::from_millis(100)) + .read(None, None, request, Duration::from_millis(100)) .unwrap(); assert_eq!( resp.get_header().get_error().get_is_witness(), @@ -469,13 +517,13 @@ fn test_non_witness_replica_read() { request.mut_header().set_replica_read(true); let resp = cluster - .read(None, request, Duration::from_millis(100)) + .read(None, None, request, Duration::from_millis(100)) .unwrap(); assert_eq!(resp.get_header().has_error(), false); } -fn must_get_error_is_witness( - cluster: &mut Cluster, +fn must_get_error_is_witness>( + cluster: &mut Cluster, region: &metapb::Region, cmd: kvproto::raft_cmdpb::Request, ) { diff --git a/tests/integrations/backup/mod.rs b/tests/integrations/backup/mod.rs index 4cfd4be07be..f89ef0c6faa 100644 --- a/tests/integrations/backup/mod.rs +++ b/tests/integrations/backup/mod.rs @@ -3,7 +3,7 @@ use std::{fs::File, time::Duration}; use engine_traits::{CF_DEFAULT, CF_WRITE}; -use external_storage_export::{create_storage, make_local_backend}; +use external_storage::{create_storage, make_local_backend}; use file_system::calc_crc32_bytes; use futures::{executor::block_on, AsyncReadExt, StreamExt}; use kvproto::{ diff --git a/tests/integrations/config/dynamic/pessimistic_txn.rs b/tests/integrations/config/dynamic/pessimistic_txn.rs index 7af5455a199..dc88bbd93a3 100644 --- a/tests/integrations/config/dynamic/pessimistic_txn.rs +++ b/tests/integrations/config/dynamic/pessimistic_txn.rs @@ -9,11 +9,7 @@ use security::SecurityManager; use test_pd_client::TestPdClient; use tikv::{ config::*, - server::{ - lock_manager::*, - resolve::{Callback, StoreAddrResolver}, - Error, Result, - }, + server::{lock_manager::*, resolve}, }; use tikv_util::config::ReadableDuration; @@ -27,14 +23,6 @@ fn test_config_validate() { invalid_cfg.validate().unwrap_err(); } -#[derive(Clone)] -struct MockResolver; 
-impl StoreAddrResolver for MockResolver { - fn resolve(&self, _store_id: u64, _cb: Callback) -> Result<()> { - Err(Error::Other(box_err!("unimplemented"))) - } -} - fn setup( cfg: TikvConfig, ) -> ( @@ -50,7 +38,7 @@ fn setup( .start( 1, pd_client, - MockResolver, + resolve::MockStoreAddrResolver::default(), security_mgr, &cfg.pessimistic_txn, ) diff --git a/tests/integrations/config/mod.rs b/tests/integrations/config/mod.rs index a65d4cfb46c..05cbde827d2 100644 --- a/tests/integrations/config/mod.rs +++ b/tests/integrations/config/mod.rs @@ -37,7 +37,7 @@ use tikv::{ BlockCacheConfig, Config as StorageConfig, EngineType, FlowControlConfig, IoRateLimitConfig, }, }; -use tikv_util::config::{LogFormat, ReadableDuration, ReadableSize}; +use tikv_util::config::{LogFormat, ReadableDuration, ReadableSchedule, ReadableSize}; mod dynamic; mod test_config_client; @@ -65,7 +65,6 @@ fn read_file_in_project_dir(path: &str) -> String { #[test] fn test_serde_custom_tikv_config() { let mut value = TikvConfig::default(); - value.log_rotation_timespan = ReadableDuration::days(1); value.log.level = Level::Critical.into(); value.log.file.filename = "foo".to_owned(); value.log.format = LogFormat::Json; @@ -77,6 +76,8 @@ fn test_serde_custom_tikv_config() { value.abort_on_panic = true; value.memory_usage_limit = Some(ReadableSize::gb(10)); value.memory_usage_high_water = 0.65; + value.memory.enable_heap_profiling = false; + value.memory.profiling_sample_per_bytes = ReadableSize::mb(1); value.server = ServerConfig { cluster_id: 0, // KEEP IT ZERO, it is skipped by serde. 
addr: "example.com:443".to_owned(), @@ -103,9 +104,6 @@ fn test_serde_custom_tikv_config() { grpc_stream_initial_window_size: ReadableSize(12_345), grpc_keepalive_time: ReadableDuration::secs(3), grpc_keepalive_timeout: ReadableDuration::secs(60), - end_point_concurrency: None, - end_point_max_tasks: None, - end_point_stack_size: None, end_point_recursion_limit: 100, end_point_stream_channel_size: 16, end_point_batch_row_limit: 64, @@ -125,6 +123,7 @@ fn test_serde_custom_tikv_config() { forward_max_connections_per_address: 5, reject_messages_on_memory_ratio: 0.8, simplify_metrics: false, + ..Default::default() }; value.readpool = ReadPoolConfig { unified: UnifiedReadPoolConfig { @@ -191,18 +190,17 @@ fn test_serde_custom_tikv_config() { raft_engine_purge_interval: ReadableDuration::minutes(20), max_manual_flush_rate: 5.0, raft_entry_cache_life_time: ReadableDuration::secs(12), - raft_reject_transfer_leader_duration: ReadableDuration::secs(3), split_region_check_tick_interval: ReadableDuration::secs(12), region_split_check_diff: Some(ReadableSize::mb(20)), region_compact_check_interval: ReadableDuration::secs(12), - clean_stale_peer_delay: ReadableDuration::secs(0), region_compact_check_step: Some(1_234), region_compact_min_tombstones: 999, region_compact_tombstones_percent: 33, region_compact_min_redundant_rows: 999, - region_compact_redundant_rows_percent: 33, + region_compact_redundant_rows_percent: Some(33), pd_heartbeat_tick_interval: ReadableDuration::minutes(12), pd_store_heartbeat_tick_interval: ReadableDuration::secs(12), + pd_report_min_resolved_ts_interval: ReadableDuration::millis(233), notify_capacity: 12_345, snap_mgr_gc_tick_interval: ReadableDuration::minutes(12), snap_gc_timeout: ReadableDuration::hours(12), @@ -212,6 +210,7 @@ fn test_serde_custom_tikv_config() { max_leader_missing_duration: ReadableDuration::hours(12), abnormal_leader_missing_duration: ReadableDuration::hours(6), peer_stale_state_check_interval: ReadableDuration::hours(2), + 
gc_peer_check_interval: ReadableDuration::days(1), leader_transfer_max_log_lag: 123, snap_apply_batch_size: ReadableSize::mb(12), snap_apply_copy_symlink: true, @@ -230,8 +229,6 @@ fn test_serde_custom_tikv_config() { use_delete_range: true, snap_generator_pool_size: 2, cleanup_import_sst_interval: ReadableDuration::minutes(12), - region_max_size: ReadableSize(0), - region_split_size: ReadableSize(0), local_read_batch_size: 33, apply_batch_system, store_batch_system, @@ -251,8 +248,7 @@ fn test_serde_custom_tikv_config() { io_reschedule_concurrent_max_count: 1234, io_reschedule_hotpot_duration: ReadableDuration::secs(4321), inspect_interval: ReadableDuration::millis(444), - report_min_resolved_ts_interval: ReadableDuration::millis(233), - raft_msg_flush_interval: ReadableDuration::micros(250), + inspect_cpu_util_thd: 0.666, check_leader_lease_interval: ReadableDuration::millis(123), renew_leader_lease_advance_duration: ReadableDuration::millis(456), reactive_memory_lock_tick_interval: ReadableDuration::millis(566), @@ -267,24 +263,28 @@ fn test_serde_custom_tikv_config() { check_request_snapshot_interval: ReadableDuration::minutes(1), slow_trend_unsensitive_cause: 10.0, slow_trend_unsensitive_result: 0.5, + slow_trend_network_io_factor: 0.0, enable_v2_compatible_learner: false, unsafe_disable_check_quorum: false, + periodic_full_compact_start_times: ReadableSchedule::default(), + periodic_full_compact_start_max_cpu: 0.1, + ..Default::default() }; value.pd = PdConfig::new(vec!["example.com:443".to_owned()]); let titan_cf_config = TitanCfConfig { min_blob_size: ReadableSize(2018), - blob_file_compression: CompressionType::Zstd, + blob_file_compression: CompressionType::Lz4, + zstd_dict_size: ReadableSize::kb(16), blob_cache_size: ReadableSize::gb(12), min_gc_batch_size: ReadableSize::kb(12), max_gc_batch_size: ReadableSize::mb(12), discardable_ratio: 0.00156, - sample_ratio: None, merge_small_file_threshold: ReadableSize::kb(21), blob_run_mode: BlobRunMode::Fallback, 
level_merge: true, range_merge: true, max_sorted_runs: 100, - gc_merge_rewrite: false, + ..Default::default() }; let titan_db_config = TitanDbConfig { enabled: true, @@ -315,7 +315,6 @@ fn test_serde_custom_tikv_config() { rate_bytes_per_sec: ReadableSize::kb(1), rate_limiter_refill_period: ReadableDuration::millis(10), rate_limiter_mode: DBRateLimiterMode::AllIo, - auto_tuned: None, rate_limiter_auto_tuned: false, bytes_per_sync: ReadableSize::mb(1), wal_bytes_per_sync: ReadableSize::kb(32), @@ -388,6 +387,7 @@ fn test_serde_custom_tikv_config() { max_compactions: Some(3), ttl: Some(ReadableDuration::days(10)), periodic_compaction_seconds: Some(ReadableDuration::days(10)), + write_buffer_limit: None, }, writecf: WriteCfConfig { block_size: ReadableSize::kb(12), @@ -433,18 +433,18 @@ fn test_serde_custom_tikv_config() { force_consistency_checks: true, titan: TitanCfConfig { min_blob_size: ReadableSize(1024), // default value - blob_file_compression: CompressionType::Lz4, + blob_file_compression: CompressionType::Zstd, + zstd_dict_size: ReadableSize::kb(0), blob_cache_size: ReadableSize::mb(0), min_gc_batch_size: ReadableSize::mb(16), max_gc_batch_size: ReadableSize::mb(64), discardable_ratio: 0.5, - sample_ratio: None, merge_small_file_threshold: ReadableSize::mb(8), blob_run_mode: BlobRunMode::ReadOnly, level_merge: false, range_merge: true, max_sorted_runs: 20, - gc_merge_rewrite: false, + ..Default::default() }, prop_size_index_distance: 4000000, prop_keys_index_distance: 40000, @@ -461,6 +461,7 @@ fn test_serde_custom_tikv_config() { max_compactions: Some(3), ttl: Some(ReadableDuration::days(10)), periodic_compaction_seconds: Some(ReadableDuration::days(10)), + write_buffer_limit: None, }, lockcf: LockCfConfig { block_size: ReadableSize::kb(12), @@ -506,18 +507,18 @@ fn test_serde_custom_tikv_config() { force_consistency_checks: true, titan: TitanCfConfig { min_blob_size: ReadableSize(1024), // default value - blob_file_compression: CompressionType::Lz4, + 
blob_file_compression: CompressionType::Zstd, + zstd_dict_size: ReadableSize::kb(0), blob_cache_size: ReadableSize::mb(0), min_gc_batch_size: ReadableSize::mb(16), max_gc_batch_size: ReadableSize::mb(64), discardable_ratio: 0.5, - sample_ratio: None, merge_small_file_threshold: ReadableSize::mb(8), blob_run_mode: BlobRunMode::ReadOnly, // default value level_merge: false, range_merge: true, max_sorted_runs: 20, - gc_merge_rewrite: false, + ..Default::default() }, prop_size_index_distance: 4000000, prop_keys_index_distance: 40000, @@ -534,6 +535,7 @@ fn test_serde_custom_tikv_config() { max_compactions: Some(3), ttl: Some(ReadableDuration::days(10)), periodic_compaction_seconds: Some(ReadableDuration::days(10)), + write_buffer_limit: Some(ReadableSize::mb(16)), }, raftcf: RaftCfConfig { block_size: ReadableSize::kb(12), @@ -579,18 +581,18 @@ fn test_serde_custom_tikv_config() { force_consistency_checks: true, titan: TitanCfConfig { min_blob_size: ReadableSize(1024), // default value - blob_file_compression: CompressionType::Lz4, + blob_file_compression: CompressionType::Zstd, + zstd_dict_size: ReadableSize::kb(0), blob_cache_size: ReadableSize::mb(0), min_gc_batch_size: ReadableSize::mb(16), max_gc_batch_size: ReadableSize::mb(64), discardable_ratio: 0.5, - sample_ratio: None, merge_small_file_threshold: ReadableSize::mb(8), blob_run_mode: BlobRunMode::ReadOnly, // default value level_merge: false, range_merge: true, max_sorted_runs: 20, - gc_merge_rewrite: false, + ..Default::default() }, prop_size_index_distance: 4000000, prop_keys_index_distance: 40000, @@ -607,8 +609,10 @@ fn test_serde_custom_tikv_config() { max_compactions: Some(3), ttl: Some(ReadableDuration::days(10)), periodic_compaction_seconds: Some(ReadableDuration::days(10)), + write_buffer_limit: None, }, titan: titan_db_config.clone(), + ..Default::default() }; value.raftdb = RaftDbConfig { info_log_level: LogLevel::Info, @@ -695,6 +699,7 @@ fn test_serde_custom_tikv_config() { max_compactions: 
Some(3), ttl: None, periodic_compaction_seconds: None, + write_buffer_limit: None, }, titan: titan_db_config, }; @@ -755,6 +760,7 @@ fn test_serde_custom_tikv_config() { other_priority: IoPriority::Low, }, background_error_recovery_window: ReadableDuration::hours(1), + txn_status_cache_capacity: 1000, }; value.coprocessor = CopConfig { split_region_on_table: false, @@ -828,6 +834,7 @@ fn test_serde_custom_tikv_config() { max_write_bytes_per_sec: ReadableSize::mb(10), enable_compaction_filter: false, compaction_filter_skip_version_check: true, + num_threads: 2, }; value.pessimistic_txn = PessimisticTxnConfig { wait_for_lock_timeout: ReadableDuration::millis(10), @@ -837,20 +844,24 @@ fn test_serde_custom_tikv_config() { }; value.cdc = CdcConfig { min_ts_interval: ReadableDuration::secs(4), - old_value_cache_size: 0, hibernate_regions_compatible: false, incremental_scan_threads: 3, incremental_scan_concurrency: 4, + incremental_scan_concurrency_limit: 5, incremental_scan_speed_limit: ReadableSize(7), + incremental_fetch_speed_limit: ReadableSize(8), incremental_scan_ts_filter_ratio: 0.7, tso_worker_threads: 2, old_value_cache_memory_quota: ReadableSize::mb(14), sink_memory_quota: ReadableSize::mb(7), + ..Default::default() }; value.resolved_ts = ResolvedTsConfig { enable: true, advance_ts_interval: ReadableDuration::secs(5), scan_lock_pool_size: 1, + memory_quota: ReadableSize::mb(1), + incremental_scan_concurrency: 7, }; value.causal_ts = CausalTsConfig { renew_interval: ReadableDuration::millis(100), @@ -858,10 +869,14 @@ fn test_serde_custom_tikv_config() { renew_batch_max_size: 8192, alloc_ahead_buffer: ReadableDuration::millis(3000), }; + value + .split + .optimize_for(value.coprocessor.region_max_size()); value.resource_control = ResourceControlConfig { enabled: false }; let custom = read_file_in_project_dir("integrations/config/test-custom.toml"); - let load = toml::from_str(&custom).unwrap(); + let mut load: TikvConfig = toml::from_str(&custom).unwrap(); + 
load.split.optimize_for(load.coprocessor.region_max_size()); assert_eq_debug(&value, &load); let dump = toml::to_string_pretty(&load).unwrap(); diff --git a/tests/integrations/config/test-cache-compatible.toml b/tests/integrations/config/test-cache-compatible.toml index 9fce88833ed..f91b5cdafc3 100644 --- a/tests/integrations/config/test-cache-compatible.toml +++ b/tests/integrations/config/test-cache-compatible.toml @@ -2,6 +2,8 @@ [log.file] +[memory] + [readpool.coprocessor] [readpool.storage] diff --git a/tests/integrations/config/test-custom.toml b/tests/integrations/config/test-custom.toml index 053e7c45939..9eb628b8dc5 100644 --- a/tests/integrations/config/test-custom.toml +++ b/tests/integrations/config/test-custom.toml @@ -1,13 +1,9 @@ -log-level = "info" -log-file = "" -log-format = "text" slow-log-file = "slow_foo" slow-log-threshold = "1s" -log-rotation-timespan = "1d" panic-when-unexpected-key-or-data = true abort-on-panic = true memory-usage-limit = "10GB" -memory-usage-high-water= 0.65 +memory-usage-high-water = 0.65 [log] level = "fatal" @@ -19,6 +15,10 @@ max-size = 1 max-backups = 2 max-days = 3 +[memory] +enable-heap-profiling = false +profiling-sample-per-bytes = "1MB" + [readpool.unified] min-thread-count = 5 max-thread-count = 10 @@ -101,6 +101,7 @@ reserve-space = "10GB" reserve-raft-space = "2GB" enable-ttl = true ttl-check-poll-interval = "0s" +txn-status-cache-capacity = 1000 [storage.block-cache] capacity = "40GB" @@ -133,9 +134,7 @@ export-priority = "high" other-priority = "low" [pd] -endpoints = [ - "example.com:443", -] +endpoints = ["example.com:443"] [metric] job = "tikv_1" @@ -172,6 +171,7 @@ region-compact-min-redundant-rows = 999 region-compact-redundant-rows-percent = 33 pd-heartbeat-tick-interval = "12m" pd-store-heartbeat-tick-interval = "12s" +pd-report-min-resolved-ts-interval = "233ms" snap-mgr-gc-tick-interval = "12m" snap-gc-timeout = "12h" snap-wait-split-duration = "12h" @@ -183,6 +183,7 @@ max-peer-down-duration = 
"12m" max-leader-missing-duration = "12h" abnormal-leader-missing-duration = "6h" peer-stale-state-check-interval = "2h" +gc-peer-check-interval = "1d" leader-transfer-max-log-lag = 123 snap-apply-batch-size = "12MB" snap-apply-copy-symlink = true @@ -220,13 +221,13 @@ waterfall-metrics = true io-reschedule-concurrent-max-count = 1234 io-reschedule-hotpot-duration = "4321s" inspect-interval = "444ms" +inspect-cpu-util-thd = 0.666 check-leader-lease-interval = "123ms" renew-leader-lease-advance-duration = "456ms" reactive-memory-lock-tick-interval = "566ms" reactive-memory-lock-timeout-tick = 8 check-long-uncommitted-interval = "1s" long-uncommitted-base-threshold = "1s" -report-min-resolved-ts-interval = "233ms" report-region-buckets-tick-interval = "1234s" max-snapshot-file-raw-size = "10GB" unreachable-backoff = "111s" @@ -300,15 +301,7 @@ bloom-filter-bits-per-key = 123 block-based-bloom-filter = true ribbon-filter-above-level = 1 read-amp-bytes-per-bit = 0 -compression-per-level = [ - "no", - "no", - "zstd", - "zstd", - "no", - "zstd", - "lz4", -] +compression-per-level = ["no", "no", "zstd", "zstd", "no", "zstd", "lz4"] bottommost-level-compression = "disable" bottommost-zstd-compression-dict-size = 1024 bottommost-zstd-compression-sample-size = 1024 @@ -346,7 +339,8 @@ periodic-compaction-seconds = "10d" [rocksdb.defaultcf.titan] min-blob-size = "2018B" -blob-file-compression = "zstd" +blob-file-compression = "lz4" +zstd-dict-size = "16KB" blob-cache-size = "12GB" min-gc-batch-size = "12KB" max-gc-batch-size = "12MB" @@ -371,15 +365,7 @@ bloom-filter-bits-per-key = 123 block-based-bloom-filter = true ribbon-filter-above-level = 1 read-amp-bytes-per-bit = 0 -compression-per-level = [ - "no", - "no", - "zstd", - "zstd", - "no", - "zstd", - "lz4", -] +compression-per-level = ["no", "no", "zstd", "zstd", "no", "zstd", "lz4"] write-buffer-size = "1MB" max-write-buffer-number = 12 min-write-buffer-number-to-merge = 12 @@ -425,16 +411,9 @@ bloom-filter-bits-per-key 
= 123 block-based-bloom-filter = true ribbon-filter-above-level = 1 read-amp-bytes-per-bit = 0 -compression-per-level = [ - "no", - "no", - "zstd", - "zstd", - "no", - "zstd", - "lz4", -] +compression-per-level = ["no", "no", "zstd", "zstd", "no", "zstd", "lz4"] write-buffer-size = "1MB" +write-buffer-limit = "16MB" max-write-buffer-number = 12 min-write-buffer-number-to-merge = 12 max-bytes-for-level-base = "12KB" @@ -479,15 +458,7 @@ bloom-filter-bits-per-key = 123 block-based-bloom-filter = true ribbon-filter-above-level = 1 read-amp-bytes-per-bit = 0 -compression-per-level = [ - "no", - "no", - "zstd", - "zstd", - "no", - "zstd", - "lz4", -] +compression-per-level = ["no", "no", "zstd", "zstd", "no", "zstd", "lz4"] write-buffer-size = "1MB" max-write-buffer-number = 12 min-write-buffer-number-to-merge = 12 @@ -565,15 +536,7 @@ bloom-filter-bits-per-key = 123 block-based-bloom-filter = true ribbon-filter-above-level = 1 read-amp-bytes-per-bit = 0 -compression-per-level = [ - "no", - "no", - "zstd", - "zstd", - "no", - "zstd", - "lz4", -] +compression-per-level = ["no", "no", "zstd", "zstd", "no", "zstd", "lz4"] write-buffer-size = "1MB" max-write-buffer-number = 12 min-write-buffer-number-to-merge = 12 @@ -605,7 +568,8 @@ max-compactions = 3 [raftdb.defaultcf.titan] min-blob-size = "2018B" -blob-file-compression = "zstd" +blob-file-compression = "lz4" +zstd-dict-size = "16KB" blob-cache-size = "12GB" min-gc-batch-size = "12KB" max-gc-batch-size = "12MB" @@ -633,9 +597,7 @@ ca-path = "invalid path" cert-path = "invalid path" key-path = "invalid path" redact-info-log = true -cert-allowed-cn = [ - "example.tikv.com", -] +cert-allowed-cn = ["example.tikv.com"] [security.encryption] data-encryption-method = "aes128-ctr" @@ -681,11 +643,12 @@ batch-keys = 256 max-write-bytes-per-sec = "10MB" enable-compaction-filter = false compaction-filter-skip-version-check = true +num-threads = 2 [pessimistic-txn] -enabled = false # test backward compatibility +enabled = false # 
test backward compatibility wait-for-lock-timeout = "10ms" -wake-up-delay-duration = 100 # test backward compatibility +wake-up-delay-duration = 100 # test backward compatibility pipelined = false in-memory = false @@ -695,7 +658,9 @@ old-value-cache-size = 0 hibernate-regions-compatible = false incremental-scan-threads = 3 incremental-scan-concurrency = 4 +incremental-scan-concurrency-limit = 5 incremental-scan-speed-limit = 7 +incremental-fetch-speed-limit = 8 incremental-scan-ts-filter-ratio = 0.7 tso-worker-threads = 2 old-value-cache-memory-quota = "14MB" @@ -705,6 +670,8 @@ sink-memory-quota = "7MB" enable = true advance-ts-interval = "5s" scan-lock-pool-size = 1 +memory-quota = "1MB" +incremental-scan-concurrency = 7 [split] detect-times = 10 diff --git a/tests/integrations/config/test-default.toml b/tests/integrations/config/test-default.toml index 23e53b9daf3..ca1abc0081b 100644 --- a/tests/integrations/config/test-default.toml +++ b/tests/integrations/config/test-default.toml @@ -2,6 +2,8 @@ [log.file] +[memory] + [readpool.unified] [readpool.storage] diff --git a/tests/integrations/coprocessor/test_select.rs b/tests/integrations/coprocessor/test_select.rs index 9af28b6e3d6..5bcd258947c 100644 --- a/tests/integrations/coprocessor/test_select.rs +++ b/tests/integrations/coprocessor/test_select.rs @@ -2,6 +2,7 @@ use std::{cmp, thread, time::Duration}; +use engine_rocks::RocksEngine; use engine_traits::CF_LOCK; use kvproto::{ coprocessor::{Request, Response, StoreBatchTask, StoreBatchTaskResponse}, @@ -2208,43 +2209,44 @@ fn test_batch_request() { true, ), ]; - let prepare_req = - |cluster: &mut Cluster, ranges: &Vec| -> Request { - let original_range = ranges.get(0).unwrap(); - let key_range = product.get_record_range(original_range.start, original_range.end); - let region_key = Key::from_raw(&key_range.start); - let mut req = DagSelect::from(&product) - .key_ranges(vec![key_range]) - .build_with(ctx.clone(), &[0]); - let mut new_ctx = Context::default(); 
- let new_region = cluster.get_region(region_key.as_encoded()); - let leader = cluster.leader_of_region(new_region.get_id()).unwrap(); - new_ctx.set_region_id(new_region.get_id()); - new_ctx.set_region_epoch(new_region.get_region_epoch().clone()); - new_ctx.set_peer(leader); - req.set_context(new_ctx); - req.set_start_ts(100); - - let batch_handle_ranges = &ranges.as_slice()[1..]; - for handle_range in batch_handle_ranges.iter() { - let range_start_key = Key::from_raw( - &product - .get_record_range(handle_range.start, handle_range.end) - .start, - ); - let batch_region = cluster.get_region(range_start_key.as_encoded()); - let batch_leader = cluster.leader_of_region(batch_region.get_id()).unwrap(); - let batch_key_ranges = - vec![product.get_record_range(handle_range.start, handle_range.end)]; - let mut store_batch_task = StoreBatchTask::new(); - store_batch_task.set_region_id(batch_region.get_id()); - store_batch_task.set_region_epoch(batch_region.get_region_epoch().clone()); - store_batch_task.set_peer(batch_leader); - store_batch_task.set_ranges(batch_key_ranges.into()); - req.tasks.push(store_batch_task); - } - req - }; + let prepare_req = |cluster: &mut Cluster>, + ranges: &Vec| + -> Request { + let original_range = ranges.get(0).unwrap(); + let key_range = product.get_record_range(original_range.start, original_range.end); + let region_key = Key::from_raw(&key_range.start); + let mut req = DagSelect::from(&product) + .key_ranges(vec![key_range]) + .build_with(ctx.clone(), &[0]); + let mut new_ctx = Context::default(); + let new_region = cluster.get_region(region_key.as_encoded()); + let leader = cluster.leader_of_region(new_region.get_id()).unwrap(); + new_ctx.set_region_id(new_region.get_id()); + new_ctx.set_region_epoch(new_region.get_region_epoch().clone()); + new_ctx.set_peer(leader); + req.set_context(new_ctx); + req.set_start_ts(100); + + let batch_handle_ranges = &ranges.as_slice()[1..]; + for handle_range in batch_handle_ranges.iter() { + let 
range_start_key = Key::from_raw( + &product + .get_record_range(handle_range.start, handle_range.end) + .start, + ); + let batch_region = cluster.get_region(range_start_key.as_encoded()); + let batch_leader = cluster.leader_of_region(batch_region.get_id()).unwrap(); + let batch_key_ranges = + vec![product.get_record_range(handle_range.start, handle_range.end)]; + let mut store_batch_task = StoreBatchTask::new(); + store_batch_task.set_region_id(batch_region.get_id()); + store_batch_task.set_region_epoch(batch_region.get_region_epoch().clone()); + store_batch_task.set_peer(batch_leader); + store_batch_task.set_ranges(batch_key_ranges.into()); + req.tasks.push(store_batch_task); + } + req + }; let verify_response = |result: &QueryResult, resp: &Response| { let (data, details, region_err, locked, other_err) = ( resp.get_data(), diff --git a/tests/integrations/import/mod.rs b/tests/integrations/import/mod.rs index 96e2c655e18..4de0fa26472 100644 --- a/tests/integrations/import/mod.rs +++ b/tests/integrations/import/mod.rs @@ -1,4 +1,5 @@ // Copyright 2018 TiKV Project Authors. Licensed under Apache-2.0. 
+mod test_apply_log; mod test_sst_service; mod util; diff --git a/tests/integrations/import/test_apply_log.rs b/tests/integrations/import/test_apply_log.rs new file mode 100644 index 00000000000..0b11a12002e --- /dev/null +++ b/tests/integrations/import/test_apply_log.rs @@ -0,0 +1,72 @@ +use engine_traits::CF_DEFAULT; +use external_storage::LocalStorage; +use kvproto::import_sstpb::ApplyRequest; +use tempfile::TempDir; + +use crate::import::util; + +#[test] +fn test_basic_apply() { + let (_cluster, ctx, tikv, import) = util::new_cluster_and_tikv_import_client(); + let tmp = TempDir::new().unwrap(); + let storage = LocalStorage::new(tmp.path()).unwrap(); + let default = [ + (b"k1", b"v1", 1), + (b"k2", b"v2", 2), + (b"k3", b"v3", 3), + (b"k4", b"v4", 4), + ]; + let default_rewritten = [(b"r1", b"v1", 1), (b"r2", b"v2", 2), (b"r3", b"v3", 3)]; + let mut sst_meta = util::make_plain_file(&storage, "file1.log", default.into_iter()); + util::register_range_for(&mut sst_meta, b"k1", b"k3a"); + let mut req = ApplyRequest::new(); + req.set_context(ctx.clone()); + req.set_rewrite_rules(vec![util::rewrite_for(&mut sst_meta, b"k", b"r")].into()); + req.set_metas(vec![sst_meta].into()); + req.set_storage_backend(util::local_storage(&tmp)); + import.apply(&req).unwrap(); + util::check_applied_kvs_cf(&tikv, &ctx, CF_DEFAULT, default_rewritten.into_iter()); +} + +#[test] +fn test_apply_twice() { + let (_cluster, ctx, tikv, import) = util::new_cluster_and_tikv_import_client(); + let tmp = TempDir::new().unwrap(); + let storage = LocalStorage::new(tmp.path()).unwrap(); + let default = [( + b"k1", + b"In this case, we are going to test write twice, but with different rewrite rule.", + 1, + )]; + let default_fst = [( + b"r1", + b"In this case, we are going to test write twice, but with different rewrite rule.", + 1, + )]; + let default_snd = [( + b"z1", + b"In this case, we are going to test write twice, but with different rewrite rule.", + 1, + )]; + + let mut sst_meta = 
util::make_plain_file(&storage, "file2.log", default.into_iter()); + util::register_range_for(&mut sst_meta, b"k1", b"k1a"); + let mut req = ApplyRequest::new(); + req.set_context(ctx.clone()); + req.set_rewrite_rules(vec![util::rewrite_for(&mut sst_meta, b"k", b"r")].into()); + req.set_metas(vec![sst_meta.clone()].into()); + req.set_storage_backend(util::local_storage(&tmp)); + import.apply(&req).unwrap(); + util::check_applied_kvs_cf(&tikv, &ctx, CF_DEFAULT, default_fst.into_iter()); + + util::register_range_for(&mut sst_meta, b"k1", b"k1a"); + req.set_rewrite_rules(vec![util::rewrite_for(&mut sst_meta, b"k", b"z")].into()); + req.set_metas(vec![sst_meta].into()); + import.apply(&req).unwrap(); + util::check_applied_kvs_cf( + &tikv, + &ctx, + CF_DEFAULT, + default_fst.into_iter().chain(default_snd.into_iter()), + ); +} diff --git a/tests/integrations/import/test_sst_service.rs b/tests/integrations/import/test_sst_service.rs index 22ab9c7d7fe..2eb1c10c72d 100644 --- a/tests/integrations/import/test_sst_service.rs +++ b/tests/integrations/import/test_sst_service.rs @@ -298,7 +298,7 @@ fn test_download_sst() { // Checks that downloading a non-existing storage returns error. 
let mut download = DownloadRequest::default(); download.set_sst(meta.clone()); - download.set_storage_backend(external_storage_export::make_local_backend(temp_dir.path())); + download.set_storage_backend(external_storage::make_local_backend(temp_dir.path())); download.set_name("missing.sst".to_owned()); let result = import.download(&download).unwrap(); @@ -555,3 +555,97 @@ fn test_duplicate_and_close() { req.set_mode(SwitchMode::Normal); import.switch_mode(&req).unwrap(); } + +#[test] +fn test_suspend_import() { + let (_cluster, ctx, tikv, import) = new_cluster_and_tikv_import_client(); + let sst_range = (0, 10); + let write = |sst_range: (u8, u8)| { + let mut meta = new_sst_meta(0, 0); + meta.set_region_id(ctx.get_region_id()); + meta.set_region_epoch(ctx.get_region_epoch().clone()); + + let mut keys = vec![]; + let mut values = vec![]; + for i in sst_range.0..sst_range.1 { + keys.push(vec![i]); + values.push(vec![i]); + } + send_write_sst(&import, &meta, keys, values, 1) + }; + let ingest = |sst_meta: &SstMeta| { + let mut ingest = IngestRequest::default(); + ingest.set_context(ctx.clone()); + ingest.set_sst(sst_meta.clone()); + import.ingest(&ingest) + }; + let multi_ingest = |sst_metas: &[SstMeta]| { + let mut multi_ingest = MultiIngestRequest::default(); + multi_ingest.set_context(ctx.clone()); + multi_ingest.set_ssts(sst_metas.to_vec().into()); + import.multi_ingest(&multi_ingest) + }; + let suspendctl = |for_time| { + let mut req = SuspendImportRpcRequest::default(); + req.set_caller("test_suspend_import".to_owned()); + if for_time == 0 { + req.set_should_suspend_imports(false); + } else { + req.set_should_suspend_imports(true); + req.set_duration_in_secs(for_time); + } + req + }; + + let write_res = write(sst_range).unwrap(); + assert_eq!(write_res.metas.len(), 1); + let sst = write_res.metas[0].clone(); + + assert!( + !import + .suspend_import_rpc(&suspendctl(6000)) + .unwrap() + .already_suspended + ); + let write_res = write(sst_range); + 
write_res.unwrap(); + let ingest_res = ingest(&sst); + assert_to_string_contains!(ingest_res.unwrap_err(), "Suspended"); + let multi_ingest_res = multi_ingest(&[sst.clone()]); + assert_to_string_contains!(multi_ingest_res.unwrap_err(), "Suspended"); + + assert!( + import + .suspend_import_rpc(&suspendctl(0)) + .unwrap() + .already_suspended + ); + + let ingest_res = ingest(&sst); + assert!(ingest_res.is_ok(), "{:?} => {:?}", sst, ingest_res); + + check_ingested_txn_kvs(&tikv, &ctx, sst_range, 2); + + // test timeout. + assert!( + !import + .suspend_import_rpc(&suspendctl(1)) + .unwrap() + .already_suspended + ); + let sst_range = (10, 20); + let write_res = write(sst_range); + let sst = write_res.unwrap().metas; + let res = multi_ingest(&sst); + assert_to_string_contains!(res.unwrap_err(), "Suspended"); + std::thread::sleep(Duration::from_secs(1)); + multi_ingest(&sst).unwrap(); + + // check an insane value should be rejected. + import + .suspend_import_rpc(&suspendctl(u64::MAX - 42)) + .unwrap_err(); + let sst_range = (20, 30); + let ssts = write(sst_range).unwrap(); + multi_ingest(ssts.get_metas()).unwrap(); +} diff --git a/tests/integrations/import/util.rs b/tests/integrations/import/util.rs index cc5d22d517d..92804860dd9 100644 --- a/tests/integrations/import/util.rs +++ b/tests/integrations/import/util.rs @@ -1,21 +1,36 @@ // Copyright 2020 TiKV Project Authors. Licensed under Apache-2.0. 
-use std::{sync::Arc, thread, time::Duration}; - +use std::{ + io::{Cursor, Write}, + sync::Arc, + thread, + time::Duration, +}; + +use collections::HashMap; use engine_rocks::RocksEngine; -use futures::{executor::block_on, stream, SinkExt}; +use engine_traits::CF_DEFAULT; +use external_storage::{ExternalStorage, UnpinReader}; +use futures::{executor::block_on, io::Cursor as AsyncCursor, stream, SinkExt}; use grpcio::{ChannelBuilder, Environment, Result, WriteFlags}; -use kvproto::{import_sstpb::*, kvrpcpb::*, tikvpb::*}; +use kvproto::{ + brpb::{Local, StorageBackend}, + import_sstpb::{KvMeta, *}, + kvrpcpb::*, + tikvpb::*, +}; use security::SecurityConfig; +use tempfile::TempDir; use test_raftstore::*; use test_raftstore_v2::{Cluster as ClusterV2, ServerCluster as ServerClusterV2}; use tikv::config::TikvConfig; -use tikv_util::HandyRwLock; +use tikv_util::{codec::stream_event::EventEncoder, stream::block_on_external_io, HandyRwLock}; +use txn_types::Key; use uuid::Uuid; const CLEANUP_SST_MILLIS: u64 = 10; -pub fn new_cluster(cfg: TikvConfig) -> (Cluster, Context) { +pub fn new_cluster(cfg: TikvConfig) -> (Cluster>, Context) { let count = 1; let mut cluster = new_server_cluster(0, count); cluster.cfg = Config { @@ -62,7 +77,12 @@ pub fn new_cluster_v2( pub fn open_cluster_and_tikv_import_client( cfg: Option, -) -> (Cluster, Context, TikvClient, ImportSstClient) { +) -> ( + Cluster>, + Context, + TikvClient, + ImportSstClient, +) { let cfg = cfg.unwrap_or_else(|| { let mut config = TikvConfig::default(); config.server.addr = "127.0.0.1:0".to_owned(); @@ -135,14 +155,18 @@ pub fn open_cluster_and_tikv_import_client_v2( (cluster, ctx, tikv, import) } -pub fn new_cluster_and_tikv_import_client() --> (Cluster, Context, TikvClient, ImportSstClient) { +pub fn new_cluster_and_tikv_import_client() -> ( + Cluster>, + Context, + TikvClient, + ImportSstClient, +) { open_cluster_and_tikv_import_client(None) } pub fn new_cluster_and_tikv_import_client_tde() -> ( 
tempfile::TempDir, - Cluster, + Cluster>, Context, TikvClient, ImportSstClient, @@ -246,6 +270,40 @@ pub fn check_ingested_kvs_cf(tikv: &TikvClient, ctx: &Context, cf: &str, sst_ran } } +#[track_caller] +pub fn check_applied_kvs_cf, V: AsRef<[u8]> + std::fmt::Debug>( + tikv: &TikvClient, + ctx: &Context, + cf: &str, + entries: impl Iterator, +) { + let mut get = RawBatchGetRequest::default(); + get.set_cf(cf.to_owned()); + get.set_context(ctx.clone()); + let mut keymap = HashMap::default(); + for (key, value, ts) in entries { + let the_key = Key::from_raw(key.as_ref()) + .append_ts(ts.into()) + .into_encoded(); + keymap.insert(the_key.clone(), value); + get.mut_keys().push(the_key); + } + for pair in tikv.raw_batch_get(&get).unwrap().get_pairs() { + let entry = keymap.remove(pair.get_key()).expect("unexpected key"); + assert_eq!( + entry.as_ref(), + pair.get_value(), + "key is {:?}", + pair.get_key() + ); + } + assert!( + keymap.is_empty(), + "not all keys consumed, remained {:?}", + keymap + ); +} + pub fn check_ingested_txn_kvs( tikv: &TikvClient, ctx: &Context, @@ -273,3 +331,67 @@ pub fn check_sst_deleted(client: &ImportSstClient, meta: &SstMeta, data: &[u8]) } send_upload_sst(client, meta, data).unwrap(); } + +pub fn make_plain_file(storage: &dyn ExternalStorage, name: &str, kvs: I) -> KvMeta +where + I: Iterator, + K: AsRef<[u8]>, + V: AsRef<[u8]>, +{ + let mut buf = vec![]; + let mut file = Cursor::new(&mut buf); + let mut start_ts: Option = None; + for (key, value, ts) in kvs { + let the_key = Key::from_raw(key.as_ref()) + .append_ts(ts.into()) + .into_encoded(); + start_ts = Some(start_ts.map_or(ts, |ts0| ts0.min(ts))); + for segment in EventEncoder::encode_event(&the_key, value.as_ref()) { + file.write_all(segment.as_ref()).unwrap(); + } + } + file.flush().unwrap(); + let len = buf.len() as u64; + block_on_external_io(storage.write(name, UnpinReader(Box::new(AsyncCursor::new(buf))), len)) + .unwrap(); + let mut meta = KvMeta::new(); + 
meta.set_start_ts(start_ts.unwrap_or_default()); + meta.set_length(len); + meta.set_restore_ts(u64::MAX); + meta.set_compression_type(kvproto::brpb::CompressionType::Unknown); + meta.set_name(name.to_owned()); + meta.set_cf(CF_DEFAULT.to_owned()); + meta +} + +pub fn rewrite_for(meta: &mut KvMeta, old_prefix: &[u8], new_prefix: &[u8]) -> RewriteRule { + assert_eq!(old_prefix.len(), new_prefix.len()); + fn rewrite(key: &mut Vec, old_prefix: &[u8], new_prefix: &[u8]) { + assert!(key.starts_with(old_prefix)); + let len = old_prefix.len(); + key.splice(..len, new_prefix.iter().cloned()); + } + rewrite(meta.mut_start_key(), old_prefix, new_prefix); + rewrite(meta.mut_end_key(), old_prefix, new_prefix); + let mut rule = RewriteRule::default(); + rule.set_old_key_prefix(old_prefix.to_vec()); + rule.set_new_key_prefix(new_prefix.to_vec()); + rule +} + +pub fn register_range_for(meta: &mut KvMeta, start: &[u8], end: &[u8]) { + let start = Key::from_raw(start); + let end = Key::from_raw(end); + meta.set_start_key(start.into_encoded()); + meta.set_end_key(end.into_encoded()); +} + +pub fn local_storage(tmp: &TempDir) -> StorageBackend { + let mut backend = StorageBackend::default(); + backend.set_local({ + let mut local = Local::default(); + local.set_path(tmp.path().to_str().unwrap().to_owned()); + local + }); + backend +} diff --git a/tests/integrations/raftstore/mod.rs b/tests/integrations/raftstore/mod.rs index 3bb93f6809b..998269afb98 100644 --- a/tests/integrations/raftstore/mod.rs +++ b/tests/integrations/raftstore/mod.rs @@ -15,6 +15,7 @@ mod test_life; mod test_merge; mod test_multi; mod test_prevote; +mod test_region_cache; mod test_region_change_observer; mod test_region_heartbeat; mod test_region_info_accessor; diff --git a/tests/integrations/raftstore/test_bootstrap.rs b/tests/integrations/raftstore/test_bootstrap.rs index b43a3d00d16..74b4a73da43 100644 --- a/tests/integrations/raftstore/test_bootstrap.rs +++ b/tests/integrations/raftstore/test_bootstrap.rs @@ 
-6,6 +6,7 @@ use std::{ }; use concurrency_manager::ConcurrencyManager; +use engine_rocks::RocksEngine; use engine_traits::{ DbOptionsExt, Engines, MiscExt, Peekable, RaftEngine, RaftEngineReadOnly, ALL_CFS, CF_DEFAULT, CF_LOCK, CF_RAFT, CF_WRITE, @@ -27,7 +28,7 @@ use tikv_util::{ worker::{dummy_scheduler, Builder as WorkerBuilder, LazyWorker}, }; -fn test_bootstrap_idempotent(cluster: &mut Cluster) { +fn test_bootstrap_idempotent>(cluster: &mut Cluster) { // assume that there is a node bootstrap the cluster and add region in pd // successfully cluster.add_first_region().unwrap(); @@ -49,7 +50,8 @@ fn test_node_bootstrap_with_prepared_data() { let cfg = new_tikv_config(0); let (_, system) = fsm::create_raft_batch_system(&cfg.raft_store, &None); - let simulate_trans = SimulateTransport::new(ChannelTransport::new()); + let simulate_trans = + SimulateTransport::<_, RocksEngine>::new(ChannelTransport::::new()); let tmp_path = Builder::new().prefix("test_cluster").tempdir().unwrap(); let engine = engine_rocks::util::new_engine(tmp_path.path().to_str().unwrap(), ALL_CFS).unwrap(); @@ -216,7 +218,7 @@ fn test_flush_before_stop() { let region = cluster.get_region(b"k60"); cluster.must_split(®ion, b"k070"); - fail::cfg("flush_before_cluse_threshold", "return(10)").unwrap(); + fail::cfg("flush_before_close_threshold", "return(10)").unwrap(); for i in 0..100 { let key = format!("k{:03}", i); @@ -252,6 +254,36 @@ fn test_flush_before_stop() { .unwrap(); } +// test flush_before_close will not flush forever +#[test] +fn test_flush_before_stop2() { + use test_raftstore_v2::*; + + let mut cluster = new_server_cluster(0, 3); + cluster.run(); + + fail::cfg("flush_before_close_threshold", "return(10)").unwrap(); + fail::cfg("on_flush_completed", "return").unwrap(); + + for i in 0..20 { + let key = format!("k{:03}", i); + cluster.must_put_cf(CF_WRITE, key.as_bytes(), b"val"); + cluster.must_put_cf(CF_LOCK, key.as_bytes(), b"val"); + } + + let router = cluster.get_router(1).unwrap(); 
+ let raft_engine = cluster.get_raft_engine(1); + + let (tx, rx) = sync_channel(1); + let msg = PeerMsg::FlushBeforeClose { tx }; + router.force_send(1, msg).unwrap(); + + rx.recv().unwrap(); + + let admin_flush = raft_engine.get_flushed_index(1, CF_RAFT).unwrap().unwrap(); + assert!(admin_flush < 10); +} + // We cannot use a flushed index to call `maybe_advance_admin_flushed` // consider a case: // 1. lock `k` with index 6 @@ -301,7 +333,7 @@ fn test_flush_index_exceed_last_modified() { ) .unwrap(); - fail::cfg("flush_before_cluse_threshold", "return(1)").unwrap(); + fail::cfg("flush_before_close_threshold", "return(1)").unwrap(); let router = cluster.get_router(1).unwrap(); let (tx, rx) = sync_channel(1); let msg = PeerMsg::FlushBeforeClose { tx }; diff --git a/tests/integrations/raftstore/test_clear_stale_data.rs b/tests/integrations/raftstore/test_clear_stale_data.rs index 8010d4c956c..69696a191d4 100644 --- a/tests/integrations/raftstore/test_clear_stale_data.rs +++ b/tests/integrations/raftstore/test_clear_stale_data.rs @@ -47,7 +47,7 @@ fn check_kv_in_all_cfs(db: &RocksEngine, i: u8, found: bool) { } } -fn test_clear_stale_data(cluster: &mut Cluster) { +fn test_clear_stale_data>(cluster: &mut Cluster) { // Disable compaction at level 0. 
cluster .cfg diff --git a/tests/integrations/raftstore/test_compact_after_delete.rs b/tests/integrations/raftstore/test_compact_after_delete.rs index 6ba405bb918..1bea73d85ea 100644 --- a/tests/integrations/raftstore/test_compact_after_delete.rs +++ b/tests/integrations/raftstore/test_compact_after_delete.rs @@ -6,7 +6,7 @@ use std::{ }; use collections::HashMap; -use engine_rocks::{raw::Range, util::get_cf_handle}; +use engine_rocks::{raw::Range, util::get_cf_handle, RocksEngine}; use engine_traits::{CachedTablet, MiscExt, CF_WRITE}; use keys::{data_key, DATA_MAX_KEY}; use test_raftstore::*; @@ -32,10 +32,11 @@ fn gen_delete_k(k: &[u8], commit_ts: TimeStamp) -> Vec { k.as_encoded().clone() } -fn test_compact_after_delete(cluster: &mut Cluster) { +fn test_compact_after_delete>(cluster: &mut Cluster) { cluster.cfg.raft_store.region_compact_check_interval = ReadableDuration::millis(100); cluster.cfg.raft_store.region_compact_min_tombstones = 500; cluster.cfg.raft_store.region_compact_tombstones_percent = 50; + cluster.cfg.raft_store.region_compact_redundant_rows_percent = Some(1); cluster.cfg.raft_store.region_compact_check_step = Some(1); cluster.cfg.rocksdb.titan.enabled = true; cluster.run(); @@ -97,8 +98,10 @@ fn test_node_compact_after_delete_v2() { cluster.cfg.raft_store.region_compact_tombstones_percent = 50; // disable it cluster.cfg.raft_store.region_compact_min_redundant_rows = 10000000; + cluster.cfg.raft_store.region_compact_redundant_rows_percent = Some(100); cluster.cfg.raft_store.region_compact_check_step = Some(2); - cluster.cfg.rocksdb.titan.enabled = true; + // TODO: v2 doesn't support titan. 
+ // cluster.cfg.rocksdb.titan.enabled = true; cluster.run(); let region = cluster.get_region(b""); @@ -166,10 +169,11 @@ fn test_node_compact_after_update_v2() { cluster.cfg.raft_store.region_compact_check_interval = ReadableDuration::millis(100); // disable it cluster.cfg.raft_store.region_compact_min_tombstones = 1000000; - cluster.cfg.raft_store.region_compact_redundant_rows_percent = 40; + cluster.cfg.raft_store.region_compact_redundant_rows_percent = Some(40); cluster.cfg.raft_store.region_compact_min_redundant_rows = 50; cluster.cfg.raft_store.region_compact_check_step = Some(2); - cluster.cfg.rocksdb.titan.enabled = true; + // TODO: titan is not supported in v2. + // cluster.cfg.rocksdb.titan.enabled = true; cluster.run(); let region = cluster.get_region(b""); diff --git a/tests/integrations/raftstore/test_compact_lock_cf.rs b/tests/integrations/raftstore/test_compact_lock_cf.rs index fbc7629c73f..2f3f882927e 100644 --- a/tests/integrations/raftstore/test_compact_lock_cf.rs +++ b/tests/integrations/raftstore/test_compact_lock_cf.rs @@ -1,17 +1,21 @@ // Copyright 2016 TiKV Project Authors. Licensed under Apache-2.0. -use engine_rocks::raw::DBStatisticsTickerType; +use engine_rocks::{raw::DBStatisticsTickerType, RocksEngine}; use engine_traits::{MiscExt, CF_LOCK}; use test_raftstore::*; use tikv_util::config::*; -fn flush(cluster: &mut Cluster) { +fn flush>(cluster: &mut Cluster) { for engines in cluster.engines.values() { engines.kv.flush_cf(CF_LOCK, true).unwrap(); } } -fn flush_then_check(cluster: &mut Cluster, interval: u64, written: bool) { +fn flush_then_check>( + cluster: &mut Cluster, + interval: u64, + written: bool, +) { flush(cluster); // Wait for compaction. sleep_ms(interval * 2); @@ -26,7 +30,7 @@ fn flush_then_check(cluster: &mut Cluster, interval: u64, writt } } -fn test_compact_lock_cf(cluster: &mut Cluster) { +fn test_compact_lock_cf>(cluster: &mut Cluster) { let interval = 500; // Set lock_cf_compact_interval. 
cluster.cfg.raft_store.lock_cf_compact_interval = ReadableDuration::millis(interval); diff --git a/tests/integrations/raftstore/test_compact_log.rs b/tests/integrations/raftstore/test_compact_log.rs index bc097dd27e9..fcafec4a82e 100644 --- a/tests/integrations/raftstore/test_compact_log.rs +++ b/tests/integrations/raftstore/test_compact_log.rs @@ -1,12 +1,13 @@ // Copyright 2016 TiKV Project Authors. Licensed under Apache-2.0. use collections::HashMap; +use engine_rocks::RocksEngine; use kvproto::raft_serverpb::RaftApplyState; use raftstore::store::*; use test_raftstore::*; use tikv_util::config::*; -fn test_compact_log(cluster: &mut Cluster) { +fn test_compact_log>(cluster: &mut Cluster) { cluster.run(); let mut before_states = HashMap::default(); @@ -42,7 +43,7 @@ fn test_compact_log(cluster: &mut Cluster) { ); } -fn test_compact_count_limit(cluster: &mut Cluster) { +fn test_compact_count_limit>(cluster: &mut Cluster) { cluster.cfg.raft_store.raft_log_gc_count_limit = Some(100); cluster.cfg.raft_store.raft_log_gc_threshold = 500; cluster.cfg.raft_store.raft_log_gc_size_limit = Some(ReadableSize::mb(20)); @@ -107,7 +108,7 @@ fn test_compact_count_limit(cluster: &mut Cluster) { ); } -fn test_compact_many_times(cluster: &mut Cluster) { +fn test_compact_many_times>(cluster: &mut Cluster) { let gc_limit: u64 = 100; cluster.cfg.raft_store.raft_log_gc_count_limit = Some(gc_limit); cluster.cfg.raft_store.raft_log_gc_threshold = 500; @@ -176,7 +177,7 @@ fn test_node_compact_many_times() { test_compact_many_times(&mut cluster); } -fn test_compact_size_limit(cluster: &mut Cluster) { +fn test_compact_size_limit>(cluster: &mut Cluster) { cluster.cfg.raft_store.raft_log_gc_count_limit = Some(100000); cluster.cfg.raft_store.raft_log_gc_size_limit = Some(ReadableSize::mb(1)); cluster.run(); @@ -251,7 +252,9 @@ fn test_node_compact_size_limit() { test_compact_size_limit(&mut cluster); } -fn test_compact_reserve_max_ticks(cluster: &mut Cluster) { +fn 
test_compact_reserve_max_ticks>( + cluster: &mut Cluster, +) { cluster.cfg.raft_store.raft_log_gc_count_limit = Some(100); cluster.cfg.raft_store.raft_log_gc_threshold = 500; cluster.cfg.raft_store.raft_log_gc_size_limit = Some(ReadableSize::mb(20)); diff --git a/tests/integrations/raftstore/test_early_apply.rs b/tests/integrations/raftstore/test_early_apply.rs index b30a861e2fe..91a63b1878c 100644 --- a/tests/integrations/raftstore/test_early_apply.rs +++ b/tests/integrations/raftstore/test_early_apply.rs @@ -2,6 +2,7 @@ use std::time::Duration; +use engine_rocks::RocksEngine; use engine_traits::{RaftEngine, RaftEngineDebug}; use kvproto::raft_serverpb::RaftLocalState; use raft::eraftpb::MessageType; @@ -43,10 +44,14 @@ enum DataLost { AllLost, } -fn test(cluster: &mut Cluster, action: A, check: C, mode: DataLost) -where - A: FnOnce(&mut Cluster), - C: FnOnce(&mut Cluster), +fn test( + cluster: &mut Cluster>, + action: A, + check: C, + mode: DataLost, +) where + A: FnOnce(&mut Cluster>), + C: FnOnce(&mut Cluster>), { let filter = match mode { DataLost::AllLost | DataLost::LeaderCommit => RegionPacketFilter::new(1, 1) @@ -109,7 +114,7 @@ fn test_early_apply(mode: DataLost) { let mut cluster = new_node_cluster(0, 3); cluster.pd_client.disable_default_operator(); // So compact log will not be triggered automatically. - configure_for_request_snapshot(&mut cluster); + configure_for_request_snapshot(&mut cluster.cfg); cluster.run(); if mode == DataLost::LeaderCommit || mode == DataLost::AllLost { cluster.must_transfer_leader(1, new_peer(1, 1)); @@ -175,7 +180,7 @@ fn test_update_internal_apply_index() { let mut cluster = new_node_cluster(0, 4); cluster.pd_client.disable_default_operator(); // So compact log will not be triggered automatically. 
- configure_for_request_snapshot(&mut cluster); + configure_for_request_snapshot(&mut cluster.cfg); cluster.run(); cluster.must_transfer_leader(1, new_peer(3, 3)); cluster.must_put(b"k1", b"v1"); diff --git a/tests/integrations/raftstore/test_flashback.rs b/tests/integrations/raftstore/test_flashback.rs index 5a28646db65..9ca6092e624 100644 --- a/tests/integrations/raftstore/test_flashback.rs +++ b/tests/integrations/raftstore/test_flashback.rs @@ -555,9 +555,11 @@ trait ClusterI { ) -> raftstore::Result; } -impl ClusterI for Cluster { +impl ClusterI for Cluster> { fn region_local_state(&self, region_id: u64, store_id: u64) -> RegionLocalState { - Cluster::::region_local_state(self, region_id, store_id) + Cluster::>::region_local_state( + self, region_id, store_id, + ) } fn query_leader( &self, @@ -565,14 +567,16 @@ impl ClusterI for Cluster { region_id: u64, timeout: Duration, ) -> Option { - Cluster::::query_leader(self, store_id, region_id, timeout) + Cluster::>::query_leader( + self, store_id, region_id, timeout, + ) } fn call_command( &self, request: RaftCmdRequest, timeout: Duration, ) -> raftstore::Result { - Cluster::::call_command(self, request, timeout) + Cluster::>::call_command(self, request, timeout) } } diff --git a/tests/integrations/raftstore/test_hibernate.rs b/tests/integrations/raftstore/test_hibernate.rs index 86962330f0f..6e3c64d7851 100644 --- a/tests/integrations/raftstore/test_hibernate.rs +++ b/tests/integrations/raftstore/test_hibernate.rs @@ -62,7 +62,7 @@ fn test_proposal_prevent_sleep() { true, ); request.mut_header().set_peer(new_peer(1, 1)); - let (cb, mut rx) = make_cb(&request); + let (cb, mut rx) = make_cb_rocks(&request); // send to peer 2 cluster .sim @@ -90,7 +90,7 @@ fn test_proposal_prevent_sleep() { let conf_change = new_change_peer_request(ConfChangeType::RemoveNode, new_peer(3, 3)); let mut admin_req = new_admin_request(1, region.get_region_epoch(), conf_change); admin_req.mut_header().set_peer(new_peer(1, 1)); - let (cb, 
_rx) = make_cb(&admin_req); + let (cb, _rx) = make_cb_rocks(&admin_req); cluster .sim .rl() @@ -482,7 +482,7 @@ fn test_leader_demoted_when_hibernated() { ); request.mut_header().set_peer(new_peer(3, 3)); // In case peer 3 is hibernated. - let (cb, _rx) = make_cb(&request); + let (cb, _rx) = make_cb_rocks(&request); cluster .sim .rl() diff --git a/tests/integrations/raftstore/test_joint_consensus.rs b/tests/integrations/raftstore/test_joint_consensus.rs index 282d0d0525c..e682aa9a656 100644 --- a/tests/integrations/raftstore/test_joint_consensus.rs +++ b/tests/integrations/raftstore/test_joint_consensus.rs @@ -2,6 +2,7 @@ use std::{sync::Arc, time::*}; +use engine_rocks::RocksEngine; use kvproto::{ metapb::{self, PeerRole, Region}, raft_cmdpb::{ChangePeerRequest, RaftCmdRequest, RaftCmdResponse}, @@ -10,7 +11,7 @@ use pd_client::PdClient; use raft::eraftpb::ConfChangeType; use raftstore::Result; use test_raftstore::*; -use tikv_util::{mpsc::future, store::find_peer}; +use tikv_util::{future::block_on_timeout, store::find_peer}; /// Tests multiple confchange commands can be done by one request #[test] @@ -164,24 +165,18 @@ fn test_request_in_joint_state() { // Isolated peer 2, so the old configuation can't reach quorum cluster.add_send_filter(IsolationFilterFactory::new(2)); - let mut rx = cluster + let rx = cluster .async_request(put_request(®ion, 1, b"k3", b"v3")) .unwrap(); - assert_eq!( - rx.recv_timeout(Duration::from_millis(100)), - Err(future::RecvTimeoutError::Timeout) - ); + block_on_timeout(rx, Duration::from_millis(100)).unwrap_err(); cluster.clear_send_filters(); // Isolated peer 3, so the new configuation can't reach quorum cluster.add_send_filter(IsolationFilterFactory::new(3)); - let mut rx = cluster + let rx = cluster .async_request(put_request(®ion, 1, b"k4", b"v4")) .unwrap(); - assert_eq!( - rx.recv_timeout(Duration::from_millis(100)), - Err(future::RecvTimeoutError::Timeout) - ); + block_on_timeout(rx, Duration::from_millis(100)).unwrap_err(); 
cluster.clear_send_filters(); // Leave joint @@ -479,12 +474,12 @@ fn test_leader_down_in_joint_state() { } fn call_conf_change_v2( - cluster: &mut Cluster, + cluster: &mut Cluster, region_id: u64, changes: Vec, ) -> Result where - T: Simulator, + T: Simulator, { let conf_change = new_change_peer_v2_request(changes); let epoch = cluster.pd_client.get_region_epoch(region_id); @@ -493,13 +488,13 @@ where } fn call_conf_change( - cluster: &mut Cluster, + cluster: &mut Cluster, region_id: u64, conf_change_type: ConfChangeType, peer: metapb::Peer, ) -> Result where - T: Simulator, + T: Simulator, { let conf_change = new_change_peer_request(conf_change_type, peer); let epoch = cluster.pd_client.get_region_epoch(region_id); @@ -507,9 +502,9 @@ where cluster.call_command_on_leader(admin_req, Duration::from_secs(3)) } -fn leave_joint(cluster: &mut Cluster, region_id: u64) -> Result +fn leave_joint(cluster: &mut Cluster, region_id: u64) -> Result where - T: Simulator, + T: Simulator, { call_conf_change_v2(cluster, region_id, vec![]) } diff --git a/tests/integrations/raftstore/test_lease_read.rs b/tests/integrations/raftstore/test_lease_read.rs index 60c87fd4e00..f9e6747b660 100644 --- a/tests/integrations/raftstore/test_lease_read.rs +++ b/tests/integrations/raftstore/test_lease_read.rs @@ -427,7 +427,7 @@ fn test_node_callback_when_destroyed() { let get = new_get_cmd(b"k1"); let mut req = new_request(1, epoch, vec![get], true); req.mut_header().set_peer(leader); - let (cb, mut rx) = make_cb(&req); + let (cb, mut rx) = make_cb_rocks(&req); cluster .sim .rl() @@ -481,7 +481,7 @@ fn test_read_index_stale_in_suspect_lease() { configure_for_lease_read(&mut cluster.cfg, Some(50), Some(10_000)); let max_lease = Duration::from_secs(2); // Stop log compaction to transfer leader with filter easier. 
- configure_for_request_snapshot(&mut cluster); + configure_for_request_snapshot(&mut cluster.cfg); cluster.cfg.raft_store.raft_store_max_leader_lease = ReadableDuration(max_lease); cluster.pd_client.disable_default_operator(); @@ -648,7 +648,7 @@ fn test_not_leader_read_lease() { true, ); req.mut_header().set_peer(new_peer(1, 1)); - let (cb, mut rx) = make_cb(&req); + let (cb, mut rx) = make_cb_rocks(&req); cluster.sim.rl().async_command_on_node(1, req, cb).unwrap(); cluster.must_transfer_leader(region_id, new_peer(3, 3)); @@ -701,7 +701,7 @@ fn test_read_index_after_write() { req.mut_header() .set_peer(new_peer(1, region_on_store1.get_id())); // Don't care about the first one's read index - let (cb, _) = make_cb(&req); + let (cb, _) = make_cb_rocks(&req); cluster.sim.rl().async_command_on_node(1, req, cb).unwrap(); cluster.must_put(b"k2", b"v2"); @@ -715,7 +715,7 @@ fn test_read_index_after_write() { ); req.mut_header() .set_peer(new_peer(1, region_on_store1.get_id())); - let (cb, mut rx) = make_cb(&req); + let (cb, mut rx) = make_cb_rocks(&req); cluster.sim.rl().async_command_on_node(1, req, cb).unwrap(); cluster.sim.wl().clear_recv_filters(2); diff --git a/tests/integrations/raftstore/test_life.rs b/tests/integrations/raftstore/test_life.rs index e940ca30a7c..809904c7f46 100644 --- a/tests/integrations/raftstore/test_life.rs +++ b/tests/integrations/raftstore/test_life.rs @@ -7,11 +7,9 @@ use std::{ use kvproto::raft_serverpb::{ExtraMessageType, PeerState, RaftMessage}; use raftstore::errors::Result; -use test_raftstore::{ - new_learner_peer, new_peer, sleep_ms, Filter, FilterFactory, Simulator as S1, -}; +use test_raftstore::{new_learner_peer, new_peer, Filter, FilterFactory, Simulator as S1}; use test_raftstore_v2::Simulator as S2; -use tikv_util::{time::Instant, HandyRwLock}; +use tikv_util::{config::ReadableDuration, time::Instant, HandyRwLock}; struct ForwardFactory { node_id: u64, @@ -64,6 +62,7 @@ fn test_gc_peer_tiflash_engine() { let mut cluster_v1 = 
test_raftstore::new_node_cluster(1, 2); let mut cluster_v2 = test_raftstore_v2::new_node_cluster(1, 2); cluster_v1.cfg.raft_store.enable_v2_compatible_learner = true; + cluster_v2.cfg.raft_store.gc_peer_check_interval = ReadableDuration::millis(500); cluster_v1.pd_client.disable_default_operator(); cluster_v2.pd_client.disable_default_operator(); let r11 = cluster_v1.run_conf_change(); @@ -124,26 +123,14 @@ fn test_gc_peer_tiflash_engine() { .must_remove_peer(r21, new_learner_peer(2, 10)); // Make sure leader cleans up removed_records. - let start = Instant::now(); - loop { - sleep_ms(500); - if cluster_v2 - .region_local_state(r21, 1) - .get_removed_records() - .is_empty() - { - break; - } - if start.saturating_elapsed() > Duration::from_secs(5) { - panic!("timeout"); - } - } + cluster_v2.must_empty_region_removed_records(r21); } #[test] fn test_gc_removed_peer() { let mut cluster = test_raftstore::new_node_cluster(1, 2); cluster.cfg.raft_store.enable_v2_compatible_learner = true; + cluster.cfg.raft_store.gc_peer_check_interval = ReadableDuration::millis(500); cluster.pd_client.disable_default_operator(); let region_id = cluster.run_conf_change(); diff --git a/tests/integrations/raftstore/test_merge.rs b/tests/integrations/raftstore/test_merge.rs index afc0c9afab4..8482feb8481 100644 --- a/tests/integrations/raftstore/test_merge.rs +++ b/tests/integrations/raftstore/test_merge.rs @@ -6,7 +6,7 @@ use api_version::{test_kv_format_impl, KvFormat}; use engine_traits::{CF_LOCK, CF_WRITE}; use kvproto::{ raft_cmdpb::CmdType, - raft_serverpb::{PeerState, RaftMessage, RegionLocalState}, + raft_serverpb::{ExtraMessageType, PeerState, RaftMessage, RegionLocalState}, }; use pd_client::PdClient; use raft::eraftpb::{ConfChangeType, MessageType}; @@ -14,7 +14,7 @@ use raftstore::store::{Callback, LocksStatus}; use test_raftstore::*; use test_raftstore_macro::test_case; use tikv::storage::{kv::SnapshotExt, Snapshot}; -use tikv_util::{config::*, HandyRwLock}; +use 
tikv_util::{config::*, future::block_on_timeout, HandyRwLock}; use txn_types::{Key, LastChange, PessimisticLock}; /// Test if merge is working as expected in a general condition. @@ -407,6 +407,122 @@ fn test_node_check_merged_message() { must_get_none(&engine3, b"v5"); } +/// Test if an uninitialized stale peer will be handled properly after merge. +#[test_case(test_raftstore::new_node_cluster)] +// #[test_case(test_raftstore_v2::new_node_cluster)] +fn test_node_gc_uninitialized_peer_after_merge() { + let mut cluster = new_cluster(0, 4); + configure_for_merge(&mut cluster.cfg); + ignore_merge_target_integrity(&mut cluster.cfg, &cluster.pd_client); + cluster.cfg.raft_store.raft_election_timeout_ticks = 5; + cluster.cfg.raft_store.raft_store_max_leader_lease = ReadableDuration::millis(40); + cluster.cfg.raft_store.max_leader_missing_duration = ReadableDuration::millis(150); + cluster.cfg.raft_store.abnormal_leader_missing_duration = ReadableDuration::millis(100); + cluster.cfg.raft_store.peer_stale_state_check_interval = ReadableDuration::millis(100); + + let pd_client = Arc::clone(&cluster.pd_client); + pd_client.disable_default_operator(); + + cluster.run_conf_change(); + + cluster.must_put(b"k1", b"v1"); + cluster.must_put(b"k3", b"v3"); + + // test if an uninitialized stale peer before conf removal is destroyed + // automatically + let region = pd_client.get_region(b"k1").unwrap(); + pd_client.must_add_peer(region.get_id(), new_peer(2, 2)); + pd_client.must_add_peer(region.get_id(), new_peer(3, 3)); + + cluster.must_split(&region, b"k2"); + let left = pd_client.get_region(b"k1").unwrap(); + let right = pd_client.get_region(b"k2").unwrap(); + + // Block snapshot messages, so that new peers will never be initialized. + cluster.add_send_filter(CloneFilterFactory( + RegionPacketFilter::new(left.get_id(), 4) + .msg_type(MessageType::MsgSnapshot) + .direction(Direction::Recv), + )); + // Add peer (4,4), remove peer (4,4) and then merge regions. 
+ // Peer (4,4) will be an uninitialized stale peer. + pd_client.must_add_peer(left.get_id(), new_peer(4, 4)); + cluster.must_region_exist(left.get_id(), 4); + cluster.add_send_filter(IsolationFilterFactory::new(4)); + pd_client.must_remove_peer(left.get_id(), new_peer(4, 4)); + pd_client.must_merge(left.get_id(), right.get_id()); + cluster.clear_send_filters(); + + // Wait for the peer (4,4) to be destroyed. + sleep_ms( + 2 * cluster + .cfg + .raft_store + .max_leader_missing_duration + .as_millis(), + ); + cluster.must_region_not_exist(left.get_id(), 4); +} + +/// Test leader missing should issue check stale peer requests. +#[test_case(test_raftstore::new_node_cluster)] +// #[test_case(test_raftstore_v2::new_node_cluster)] +fn test_node_gc_uninitialized_peer_after_merge_on_leader_missing() { + let mut cluster = new_cluster(0, 4); + configure_for_merge(&mut cluster.cfg); + ignore_merge_target_integrity(&mut cluster.cfg, &cluster.pd_client); + cluster.cfg.raft_store.raft_election_timeout_ticks = 5; + cluster.cfg.raft_store.raft_store_max_leader_lease = ReadableDuration::millis(40); + cluster.cfg.raft_store.peer_stale_state_check_interval = ReadableDuration::millis(100); + cluster.cfg.raft_store.abnormal_leader_missing_duration = ReadableDuration::millis(100); + // Set a large max_leader_missing_duration so that check stale peer will + // only be triggered by leader missing. 
+ cluster.cfg.raft_store.max_leader_missing_duration = ReadableDuration::hours(1); + + let pd_client = Arc::clone(&cluster.pd_client); + pd_client.disable_default_operator(); + + cluster.run_conf_change(); + + cluster.must_put(b"k1", b"v1"); + cluster.must_put(b"k3", b"v3"); + + // test if an uninitialized stale peer before conf removal is destroyed + // automatically + let region = pd_client.get_region(b"k1").unwrap(); + pd_client.must_add_peer(region.get_id(), new_peer(2, 2)); + pd_client.must_add_peer(region.get_id(), new_peer(3, 3)); + + cluster.must_split(&region, b"k2"); + let left = pd_client.get_region(b"k1").unwrap(); + let right = pd_client.get_region(b"k2").unwrap(); + + // Block snapshot messages, so that new peers will never be initialized. + cluster.add_send_filter(CloneFilterFactory( + RegionPacketFilter::new(left.get_id(), 4) + .msg_type(MessageType::MsgSnapshot) + .direction(Direction::Recv), + )); + // Add peer (4,4), remove peer (4,4) and then merge regions. + // Peer (4,4) will be an uninitialized stale peer. + pd_client.must_add_peer(left.get_id(), new_peer(4, 4)); + cluster.must_region_exist(left.get_id(), 4); + cluster.add_send_filter(IsolationFilterFactory::new(4)); + pd_client.must_remove_peer(left.get_id(), new_peer(4, 4)); + pd_client.must_merge(left.get_id(), right.get_id()); + cluster.clear_send_filters(); + + // Wait for the peer (4,4) to be destroyed. + sleep_ms( + 3 * cluster + .cfg + .raft_store + .abnormal_leader_missing_duration + .as_millis(), + ); + cluster.must_region_not_exist(left.get_id(), 4); +} + // Test if a merge handled properly when there is a unfinished slow split before // merge. // No v2, it requires all peers to be available to check trim status. @@ -1444,10 +1560,10 @@ fn test_merge_pessimistic_locks_when_gap_is_too_large() { // The gap is too large, so the previous merge should fail. And this new put // request should be allowed. 
- let mut res = cluster.async_put(b"k1", b"new_val").unwrap(); + let res = cluster.async_put(b"k1", b"new_val").unwrap(); cluster.clear_send_filters(); - res.recv_timeout(Duration::from_secs(5)).unwrap(); + block_on_timeout(res, Duration::from_secs(5)).unwrap(); assert_eq!(cluster.must_get(b"k1").unwrap(), b"new_val"); } @@ -1731,3 +1847,243 @@ fn test_prepare_merge_with_5_nodes_snapshot() { // Now leader should replicate more logs and figure out a safe index. pd_client.must_merge(left.get_id(), right.get_id()); } + +#[test_case(test_raftstore_v2::new_node_cluster)] +fn test_gc_source_removed_records_after_merge() { + let mut cluster = new_cluster(0, 3); + configure_for_merge(&mut cluster.cfg); + cluster.cfg.raft_store.gc_peer_check_interval = ReadableDuration::millis(500); + let pd_client = Arc::clone(&cluster.pd_client); + pd_client.disable_default_operator(); + cluster.run(); + cluster.must_put(b"k1", b"v1"); + cluster.must_put(b"k3", b"v3"); + + let region = cluster.get_region(b"k1"); + cluster.must_split(&region, b"k2"); + let left = cluster.get_region(b"k1"); + let right = cluster.get_region(b"k3"); + + let left_peer_on_store1 = find_peer(&left, 1).unwrap().clone(); + cluster.must_transfer_leader(left.get_id(), left_peer_on_store1); + must_get_equal(&cluster.get_engine(3), b"k1", b"v1"); + let left_peer_on_store3 = find_peer(&left, 3).unwrap().clone(); + pd_client.must_remove_peer(left.get_id(), left_peer_on_store3); + must_get_none(&cluster.get_engine(3), b"k1"); + + let right_peer_on_store1 = find_peer(&right, 1).unwrap().clone(); + cluster.must_transfer_leader(right.get_id(), right_peer_on_store1); + let right_peer_on_store3 = find_peer(&right, 3).unwrap().clone(); + cluster.add_send_filter(IsolationFilterFactory::new(3)); + pd_client.must_remove_peer(right.get_id(), right_peer_on_store3.clone()); + + // So cluster becomes + // left region: 1(leader) 2 | + // right region: 1(leader) 2 | 3 (removed but not yet destroyed) + // | means isolation. 
+ + // Merge right to left. + pd_client.must_merge(right.get_id(), left.get_id()); + let region_state = cluster.region_local_state(left.get_id(), 1); + assert!( + !region_state.get_merged_records()[0] + .get_source_removed_records() + .is_empty(), + "{:?}", + region_state + ); + assert!( + !region_state + .get_removed_records() + .iter() + .any(|p| p.get_id() == right_peer_on_store3.get_id()), + "{:?}", + region_state + ); + + // Clear filters and wait for gc peer ticks. + cluster.clear_send_filters(); + sleep_ms(3 * cluster.cfg.raft_store.gc_peer_check_interval.as_millis()); + + // Right region replica on store 3 must be removed. + cluster.must_region_not_exist(right.get_id(), 3); + + // Right region must clean up removed and merged records. + cluster.must_empty_region_merged_records(left.get_id()); + cluster.must_empty_region_removed_records(left.get_id()); +} + +#[test_case(test_raftstore_v2::new_node_cluster)] +fn test_gc_source_peers_forward_by_target_peer_after_merge() { + let mut cluster = new_cluster(0, 3); + configure_for_merge(&mut cluster.cfg); + cluster.cfg.raft_store.raft_log_gc_threshold = 40; + cluster.cfg.raft_store.raft_log_gc_count_limit = Some(40); + cluster.cfg.raft_store.merge_max_log_gap = 15; + cluster.cfg.raft_store.gc_peer_check_interval = ReadableDuration::millis(500); + let pd_client = Arc::clone(&cluster.pd_client); + pd_client.disable_default_operator(); + cluster.run(); + + let region = cluster.get_region(b"k1"); + cluster.must_split(&region, b"k2"); + let left = cluster.get_region(b"k1"); + let right = cluster.get_region(b"k3"); + + let left_peer_on_store1 = find_peer(&left, 1).unwrap().clone(); + cluster.must_transfer_leader(left.get_id(), left_peer_on_store1); + let right_peer_on_store1 = find_peer(&right, 1).unwrap().clone(); + cluster.must_transfer_leader(right.get_id(), right_peer_on_store1); + cluster.must_put(b"k1", b"v1"); + cluster.must_put(b"k3", b"v3"); + must_get_equal(&cluster.get_engine(3), b"k1", b"v1"); + 
must_get_equal(&cluster.get_engine(3), b"k3", b"v3"); + // Use DropMessageFilter to drop messages to store 3 without reporting error. + cluster.add_recv_filter_on_node( + 3, + Box::new(DropMessageFilter::new(Arc::new(|m| { + // Do not drop MsgAvailabilityRequest and MsgAvailabilityResponse + // messages, otherwise merge is blocked. + matches!( + m.get_extra_msg().get_type(), + ExtraMessageType::MsgAvailabilityRequest + | ExtraMessageType::MsgAvailabilityResponse + ) + }))), + ); + + // So cluster becomes + // left region: 1(leader) 2 | 3 + // right region: 1(leader) 2 | 3 + // | means isolation. + + // Merge left to right and remove left peer on store 3. + pd_client.must_merge(left.get_id(), right.get_id()); + let right_peer_on_store3 = find_peer(&right, 3).unwrap().clone(); + pd_client.must_remove_peer(right.get_id(), right_peer_on_store3); + let region_state = cluster.region_local_state(right.get_id(), 1); + assert!( + !region_state.get_merged_records().is_empty(), + "{:?}", + region_state + ); + + // So cluster becomes + // left region: merged + // right region: 1(leader) 2 | 3 (removed but not yet destroyed) + // | means isolation. + + let state1 = cluster.truncated_state(right.get_id(), 1); + (0..50).for_each(|i| cluster.must_put(b"k2", format!("v{}", i).as_bytes())); + // Wait to trigger compact raft log + cluster.wait_log_truncated(right.get_id(), 1, state1.get_index() + 1); + + // Clear filters and wait for gc peer ticks. + cluster.clear_recv_filter_on_node(3); + sleep_ms(3 * cluster.cfg.raft_store.gc_peer_check_interval.as_millis()); + + // Left region replica on store 3 must be removed. + cluster.must_region_not_exist(left.get_id(), 3); + // Right region must clean up removed and merged records. 
+ cluster.must_empty_region_merged_records(right.get_id()); + cluster.must_empty_region_removed_records(right.get_id()); +} + +#[test_case(test_raftstore_v2::new_node_cluster)] +fn test_gc_source_peers_forward_by_store_after_merge() { + let mut cluster = new_cluster(0, 3); + configure_for_merge(&mut cluster.cfg); + cluster.cfg.raft_store.gc_peer_check_interval = ReadableDuration::millis(500); + let pd_client = Arc::clone(&cluster.pd_client); + pd_client.disable_default_operator(); + cluster.run(); + + let region = cluster.get_region(b"k1"); + cluster.must_split(&region, b"k2"); + let left = cluster.get_region(b"k1"); + let right = cluster.get_region(b"k3"); + + let left_peer_on_store1 = find_peer(&left, 1).unwrap().clone(); + cluster.must_transfer_leader(left.get_id(), left_peer_on_store1); + let right_peer_on_store1 = find_peer(&right, 1).unwrap().clone(); + cluster.must_transfer_leader(right.get_id(), right_peer_on_store1); + cluster.must_put(b"k1", b"v1"); + cluster.must_put(b"k3", b"v3"); + must_get_equal(&cluster.get_engine(3), b"k1", b"v1"); + must_get_equal(&cluster.get_engine(3), b"k3", b"v3"); + // Drop GcPeerResponse. + cluster.add_recv_filter_on_node( + 1, + Box::new(DropMessageFilter::new(Arc::new(|m| { + m.get_extra_msg().get_type() != ExtraMessageType::MsgGcPeerResponse + }))), + ); + + // So cluster becomes + // left region: 1(leader) 2 | 3 + // right region: 1(leader) 2 | 3 + // | means isolation. + + // Merge left to right and remove left peer on store 3. + pd_client.must_merge(left.get_id(), right.get_id()); + let right_peer_on_store3 = find_peer(&right, 3).unwrap().clone(); + pd_client.must_remove_peer(right.get_id(), right_peer_on_store3); + // Right region replica on store 3 must be removed. 
+ cluster.must_region_not_exist(right.get_id(), 3); + let region_state = cluster.region_local_state(right.get_id(), 1); + assert!( + !region_state.get_merged_records().is_empty(), + "{:?}", + region_state + ); + assert!( + !region_state.get_removed_records().is_empty(), + "{:?}", + region_state + ); + + // So cluster becomes + // left region: merged + // right region: 1(leader) 2 | 3 (destroyed but not yet cleaned in removed + // records) + // | means isolation. + + // Clear filters and wait for gc peer ticks. + cluster.clear_recv_filter_on_node(1); + sleep_ms(3 * cluster.cfg.raft_store.gc_peer_check_interval.as_millis()); + + // Right region must clean up removed and merged records. + cluster.must_empty_region_merged_records(right.get_id()); + cluster.must_empty_region_removed_records(right.get_id()); +} + +#[test_case(test_raftstore_v2::new_node_cluster)] +fn test_gc_merged_record_in_time() { + let mut cluster = new_cluster(0, 3); + configure_for_merge(&mut cluster.cfg); + cluster.cfg.raft_store.gc_peer_check_interval = ReadableDuration::millis(100); + let pd_client = Arc::clone(&cluster.pd_client); + pd_client.disable_default_operator(); + cluster.run(); + + let region = cluster.get_region(b"k1"); + cluster.must_split(&region, b"k2"); + let left = cluster.get_region(b"k1"); + let right = cluster.get_region(b"k3"); + + let left_peer_on_store1 = find_peer(&left, 1).unwrap().clone(); + cluster.must_transfer_leader(left.get_id(), left_peer_on_store1); + let right_peer_on_store1 = find_peer(&right, 1).unwrap().clone(); + cluster.must_transfer_leader(right.get_id(), right_peer_on_store1); + + // Wait enough time to trigger gc peer, and if there is nothing to gc, + // leader skips registering gc peer tick. + sleep_ms(3 * cluster.cfg.raft_store.gc_peer_check_interval.as_millis()); + + // Merge left to right. 
+ pd_client.must_merge(left.get_id(), right.get_id()); + + // Once merge complete, gc peer tick should be registered and merged record + // will be cleaned up in time. + cluster.must_empty_region_merged_records(right.get_id()); +} diff --git a/tests/integrations/raftstore/test_multi.rs b/tests/integrations/raftstore/test_multi.rs index 8093a30872d..f40e6695599 100644 --- a/tests/integrations/raftstore/test_multi.rs +++ b/tests/integrations/raftstore/test_multi.rs @@ -6,6 +6,7 @@ use std::{ time::Duration, }; +use engine_rocks::RocksEngine; use engine_traits::Peekable; use kvproto::raft_cmdpb::RaftCmdResponse; use raft::eraftpb::MessageType; @@ -16,13 +17,15 @@ use tikv::storage::{kv::SnapshotExt, Snapshot}; use tikv_util::{config::*, HandyRwLock}; use txn_types::{Key, LastChange, PessimisticLock}; -fn test_multi_base(cluster: &mut Cluster) { +fn test_multi_base>(cluster: &mut Cluster) { cluster.run(); test_multi_base_after_bootstrap(cluster); } -fn test_multi_base_after_bootstrap(cluster: &mut Cluster) { +fn test_multi_base_after_bootstrap>( + cluster: &mut Cluster, +) { let (key, value) = (b"k1", b"v1"); cluster.must_put(key, value); @@ -49,7 +52,7 @@ fn test_multi_base_after_bootstrap(cluster: &mut Cluster) { // TODO add epoch not match test cases. 
} -fn test_multi_leader_crash(cluster: &mut Cluster) { +fn test_multi_leader_crash>(cluster: &mut Cluster) { cluster.run(); let (key1, value1) = (b"k1", b"v1"); @@ -90,7 +93,7 @@ fn test_multi_leader_crash(cluster: &mut Cluster) { must_get_none(&cluster.engines[&last_leader.get_store_id()].kv, key1); } -fn test_multi_cluster_restart(cluster: &mut Cluster) { +fn test_multi_cluster_restart>(cluster: &mut Cluster) { cluster.run(); let (key, value) = (b"k1", b"v1"); @@ -110,7 +113,10 @@ fn test_multi_cluster_restart(cluster: &mut Cluster) { assert_eq!(cluster.get(key), Some(value.to_vec())); } -fn test_multi_lost_majority(cluster: &mut Cluster, count: usize) { +fn test_multi_lost_majority>( + cluster: &mut Cluster, + count: usize, +) { cluster.run(); let leader = cluster.leader_of_region(1); @@ -129,8 +135,8 @@ fn test_multi_lost_majority(cluster: &mut Cluster, count: usize assert!(cluster.leader_of_region(1).is_none()); } -fn test_multi_random_restart( - cluster: &mut Cluster, +fn test_multi_random_restart>( + cluster: &mut Cluster, node_count: usize, restart_count: u32, ) { @@ -173,7 +179,7 @@ fn test_multi_server_base() { test_multi_base(&mut cluster) } -fn test_multi_latency(cluster: &mut Cluster) { +fn test_multi_latency>(cluster: &mut Cluster) { cluster.run(); cluster.add_send_filter(CloneFilterFactory(DelayFilter::new(Duration::from_millis( 30, @@ -195,7 +201,7 @@ fn test_multi_server_latency() { test_multi_latency(&mut cluster); } -fn test_multi_random_latency(cluster: &mut Cluster) { +fn test_multi_random_latency>(cluster: &mut Cluster) { cluster.run(); cluster.add_send_filter(CloneFilterFactory(RandomLatencyFilter::new(50))); test_multi_base_after_bootstrap(cluster); @@ -215,7 +221,7 @@ fn test_multi_server_random_latency() { test_multi_random_latency(&mut cluster); } -fn test_multi_drop_packet(cluster: &mut Cluster) { +fn test_multi_drop_packet>(cluster: &mut Cluster) { cluster.run(); cluster.add_send_filter(CloneFilterFactory(DropPacketFilter::new(30))); 
test_multi_base_after_bootstrap(cluster); @@ -295,7 +301,9 @@ fn test_multi_server_random_restart() { test_multi_random_restart(&mut cluster, count, 10); } -fn test_leader_change_with_uncommitted_log(cluster: &mut Cluster) { +fn test_leader_change_with_uncommitted_log>( + cluster: &mut Cluster, +) { cluster.cfg.raft_store.raft_election_timeout_ticks = 50; // disable compact log to make test more stable. cluster.cfg.raft_store.raft_log_gc_threshold = 1000; @@ -485,7 +493,9 @@ fn test_node_leader_change_with_log_overlap() { panic!("callback has not been called after 5s."); } -fn test_read_leader_with_unapplied_log(cluster: &mut Cluster) { +fn test_read_leader_with_unapplied_log>( + cluster: &mut Cluster, +) { cluster.cfg.raft_store.raft_election_timeout_ticks = 50; // disable compact log to make test more stable. cluster.cfg.raft_store.raft_log_gc_threshold = 1000; @@ -574,8 +584,8 @@ fn test_server_read_leader_with_unapplied_log() { test_read_leader_with_unapplied_log(&mut cluster); } -fn get_with_timeout( - cluster: &mut Cluster, +fn get_with_timeout>( + cluster: &mut Cluster, key: &[u8], read_quorum: bool, timeout: Duration, @@ -591,7 +601,9 @@ fn get_with_timeout( cluster.call_command_on_leader(req, timeout) } -fn test_remove_leader_with_uncommitted_log(cluster: &mut Cluster) { +fn test_remove_leader_with_uncommitted_log>( + cluster: &mut Cluster, +) { cluster.cfg.raft_store.raft_election_timeout_ticks = 50; // disable compact log to make test more stable. cluster.cfg.raft_store.raft_log_gc_threshold = 1000; @@ -717,7 +729,7 @@ fn test_node_dropped_proposal() { .expect("callback should have been called with in 5s."); } -fn test_consistency_check(cluster: &mut Cluster) { +fn test_consistency_check>(cluster: &mut Cluster) { cluster.cfg.raft_store.raft_election_timeout_ticks = 50; // disable compact log to make test more stable. 
cluster.cfg.raft_store.raft_log_gc_threshold = 1000; @@ -740,7 +752,7 @@ fn test_node_consistency_check() { test_consistency_check(&mut cluster); } -fn test_batch_write(cluster: &mut Cluster) { +fn test_batch_write>(cluster: &mut Cluster) { cluster.run(); let r = cluster.get_region(b""); cluster.must_split(&r, b"k3"); diff --git a/tests/integrations/raftstore/test_prevote.rs b/tests/integrations/raftstore/test_prevote.rs index c81b34f0435..c843154b121 100644 --- a/tests/integrations/raftstore/test_prevote.rs +++ b/tests/integrations/raftstore/test_prevote.rs @@ -6,6 +6,7 @@ use std::{ time::Duration, }; +use engine_rocks::RocksEngine; use raft::eraftpb::MessageType; use test_raftstore::*; use tikv_util::HandyRwLock; @@ -15,7 +16,10 @@ enum FailureType<'a> { Reboot(&'a [u64]), } -fn attach_prevote_notifiers(cluster: &Cluster, peer: u64) -> mpsc::Receiver<()> { +fn attach_prevote_notifiers>( + cluster: &Cluster, + peer: u64, +) -> mpsc::Receiver<()> { // Setup a notifier let (tx, rx) = mpsc::channel(); let response_notifier = Box::new(MessageTypeNotifier::new( @@ -37,8 +41,8 @@ fn attach_prevote_notifiers(cluster: &Cluster, peer: u64) -> mp // Validate that prevote is used in elections after partition or reboot of some // nodes. -fn test_prevote( - cluster: &mut Cluster, +fn test_prevote>( + cluster: &mut Cluster, failure_type: FailureType<'_>, leader_after_failure_id: impl Into>, detect_during_failure: impl Into>, @@ -219,7 +223,7 @@ fn test_prevote_reboot_minority_followers() { // Test isolating a minority of the cluster and make sure that the remove // themselves. 
-fn test_pair_isolated(cluster: &mut Cluster) { +fn test_pair_isolated>(cluster: &mut Cluster) { let region = 1; let pd_client = Arc::clone(&cluster.pd_client); @@ -246,7 +250,9 @@ fn test_server_pair_isolated() { test_pair_isolated(&mut cluster); } -fn test_isolated_follower_leader_does_not_change(cluster: &mut Cluster) { +fn test_isolated_follower_leader_does_not_change>( + cluster: &mut Cluster, +) { cluster.run(); cluster.must_transfer_leader(1, new_peer(1, 1)); cluster.must_put(b"k1", b"v1"); @@ -282,7 +288,9 @@ fn test_server_isolated_follower_leader_does_not_change() { test_isolated_follower_leader_does_not_change(&mut cluster); } -fn test_create_peer_from_pre_vote(cluster: &mut Cluster) { +fn test_create_peer_from_pre_vote>( + cluster: &mut Cluster, +) { let pd_client = Arc::clone(&cluster.pd_client); pd_client.disable_default_operator(); diff --git a/tests/integrations/raftstore/test_region_cache.rs b/tests/integrations/raftstore/test_region_cache.rs new file mode 100644 index 00000000000..4d95ff6701c --- /dev/null +++ b/tests/integrations/raftstore/test_region_cache.rs @@ -0,0 +1,17 @@ +// Copyright 2023 TiKV Project Authors. Licensed under Apache-2.0. + +use test_raftstore::new_node_cluster_with_hybrid_engine; + +#[test] +fn test_basic_read() { + let _cluster = new_node_cluster_with_hybrid_engine(1, 3); + // todo(SpadeA): add test logic +} + +#[test] +fn test_read_index() { + let _cluster = new_node_cluster_with_hybrid_engine(1, 3); + // todo(SpadeA): add test logic +} + +// todo(SpadeA): more tests when other relevant modules are ready. 
diff --git a/tests/integrations/raftstore/test_region_change_observer.rs b/tests/integrations/raftstore/test_region_change_observer.rs index 72bbfdc9b8f..4b37e8aa962 100644 --- a/tests/integrations/raftstore/test_region_change_observer.rs +++ b/tests/integrations/raftstore/test_region_change_observer.rs @@ -9,6 +9,7 @@ use std::{ time::Duration, }; +use engine_rocks::RocksEngine; use kvproto::metapb::Region; use raft::StateRole; use raftstore::coprocessor::{ @@ -39,7 +40,7 @@ impl RegionChangeObserver for TestObserver { } } -fn test_region_change_observer_impl(mut cluster: Cluster) { +fn test_region_change_observer_impl(mut cluster: Cluster>) { let pd_client = Arc::clone(&cluster.pd_client); pd_client.disable_default_operator(); diff --git a/tests/integrations/raftstore/test_region_heartbeat.rs b/tests/integrations/raftstore/test_region_heartbeat.rs index 29f7e8b10dd..1f9b7cb1eb8 100644 --- a/tests/integrations/raftstore/test_region_heartbeat.rs +++ b/tests/integrations/raftstore/test_region_heartbeat.rs @@ -6,6 +6,7 @@ use std::{ time::Duration, }; +use engine_rocks::RocksEngine; use test_raftstore::*; use test_raftstore_macro::test_case; use tikv_util::{ @@ -91,7 +92,7 @@ fn test_server_down_peers_without_hibernate_regions() { test_down_peers!(&mut cluster); } -fn test_pending_peers(cluster: &mut Cluster) { +fn test_pending_peers>(cluster: &mut Cluster) { let pd_client = Arc::clone(&cluster.pd_client); // Disable default max peer count check. 
pd_client.disable_default_operator(); diff --git a/tests/integrations/raftstore/test_region_info_accessor.rs b/tests/integrations/raftstore/test_region_info_accessor.rs index 24d90b66327..6da6c062e9e 100644 --- a/tests/integrations/raftstore/test_region_info_accessor.rs +++ b/tests/integrations/raftstore/test_region_info_accessor.rs @@ -6,6 +6,7 @@ use std::{ time::Duration, }; +use engine_rocks::RocksEngine; use kvproto::metapb::Region; use raft::StateRole; use raftstore::coprocessor::{RangeKey, RegionInfo, RegionInfoAccessor}; @@ -47,7 +48,10 @@ fn check_region_ranges(regions: &[(Region, StateRole)], ranges: &[(&[u8], &[u8]) }) } -fn test_region_info_accessor_impl(cluster: &mut Cluster, c: &RegionInfoAccessor) { +fn test_region_info_accessor_impl( + cluster: &mut Cluster>, + c: &RegionInfoAccessor, +) { for i in 0..9 { let k = format!("k{}", i).into_bytes(); let v = format!("v{}", i).into_bytes(); diff --git a/tests/integrations/raftstore/test_replication_mode.rs b/tests/integrations/raftstore/test_replication_mode.rs index 367ac63aabb..db373106402 100644 --- a/tests/integrations/raftstore/test_replication_mode.rs +++ b/tests/integrations/raftstore/test_replication_mode.rs @@ -1,14 +1,15 @@ // Copyright 2020 TiKV Project Authors. Licensed under Apache-2.0. 
-use std::{sync::Arc, thread, time::Duration}; +use std::{iter::FromIterator, sync::Arc, thread, time::Duration}; +use engine_rocks::RocksEngine; use kvproto::replication_modepb::*; use pd_client::PdClient; use raft::eraftpb::ConfChangeType; use test_raftstore::*; use tikv_util::{config::*, mpsc::future, HandyRwLock}; -fn prepare_cluster() -> Cluster { +fn prepare_cluster() -> Cluster> { let mut cluster = new_server_cluster(0, 3); cluster.pd_client.disable_default_operator(); cluster.pd_client.configure_dr_auto_sync("zone"); @@ -20,7 +21,7 @@ fn prepare_cluster() -> Cluster { cluster } -fn configure_for_snapshot(cluster: &mut Cluster) { +fn configure_for_snapshot(cluster: &mut Cluster>) { // Truncate the log quickly so that we can force sending snapshot. cluster.cfg.raft_store.raft_log_gc_tick_interval = ReadableDuration::millis(20); cluster.cfg.raft_store.raft_log_gc_count_limit = Some(2); @@ -28,12 +29,24 @@ fn configure_for_snapshot(cluster: &mut Cluster) { cluster.cfg.raft_store.snap_mgr_gc_tick_interval = ReadableDuration::millis(50); } -fn run_cluster(cluster: &mut Cluster) { +fn run_cluster(cluster: &mut Cluster>) { cluster.run(); cluster.must_transfer_leader(1, new_peer(1, 1)); cluster.must_put(b"k1", b"v0"); } +fn prepare_labels(cluster: &mut Cluster>) { + cluster.add_label(1, "dc", "dc1"); + cluster.add_label(2, "dc", "dc1"); + cluster.add_label(3, "dc", "dc2"); + cluster.add_label(1, "zone", "z1"); + cluster.add_label(2, "zone", "z2"); + cluster.add_label(3, "zone", "z3"); + cluster.add_label(1, "host", "h1"); + cluster.add_label(2, "host", "h2"); + cluster.add_label(3, "host", "h3"); +} + /// When using DrAutoSync replication mode, data should be replicated to /// different labels before committed. 
#[test] @@ -49,7 +62,7 @@ fn test_dr_auto_sync() { false, ); request.mut_header().set_peer(new_peer(1, 1)); - let (cb, mut rx) = make_cb(&request); + let (cb, mut rx) = make_cb_rocks(&request); cluster .sim .rl() @@ -71,7 +84,7 @@ fn test_dr_auto_sync() { false, ); request.mut_header().set_peer(new_peer(1, 1)); - let (cb, mut rx) = make_cb(&request); + let (cb, mut rx) = make_cb_rocks(&request); cluster .sim .rl() @@ -87,6 +100,67 @@ fn test_dr_auto_sync() { assert_eq!(state.state, RegionReplicationState::IntegrityOverLabel); } +// When in sync recover state, and the region is in joint state. The leave joint +// state should be committed successfully. +#[test] +fn test_sync_recover_joint_state() { + let mut cluster = new_server_cluster(0, 5); + cluster.pd_client.disable_default_operator(); + cluster.pd_client.configure_dr_auto_sync("zone"); + cluster.cfg.raft_store.pd_store_heartbeat_tick_interval = ReadableDuration::millis(50); + cluster.cfg.raft_store.raft_log_gc_threshold = 1; + cluster.add_label(1, "zone", "ES"); + cluster.add_label(2, "zone", "ES"); + cluster.add_label(3, "zone", "ES"); + cluster.add_label(4, "zone", "WS"); // old dr + cluster.add_label(5, "zone", "WS"); // new dr + + let pd_client = Arc::clone(&cluster.pd_client); + let region_id = cluster.run_conf_change(); + let nodes = Vec::from_iter(cluster.get_node_ids()); + assert_eq!(nodes.len(), 5); + cluster.must_put(b"k1", b"v1"); + + cluster + .pd_client + .switch_replication_mode(Some(DrAutoSyncState::Async), vec![]); + + pd_client.must_add_peer(region_id, new_peer(2, 2)); + pd_client.must_add_peer(region_id, new_peer(3, 3)); + pd_client.must_add_peer(region_id, new_peer(4, 4)); + pd_client.must_add_peer(region_id, new_learner_peer(5, 5)); + + // Make one node down + cluster.stop_node(4); + + // Switch to sync recover + cluster + .pd_client + .switch_replication_mode(Some(DrAutoSyncState::SyncRecover), vec![]); + + cluster.must_put(b"k2", b"v2"); + assert_eq!(cluster.must_get(b"k2").unwrap(), 
b"v2"); + + // Enter joint, now we have C_old(1, 2, 3, 4) and C_new(1, 2, 3, 5) + pd_client.must_joint_confchange( + region_id, + vec![ + (ConfChangeType::AddLearnerNode, new_learner_peer(4, 4)), + (ConfChangeType::AddNode, new_peer(5, 5)), + ], + ); + + let region = pd_client.get_region(b"k1").unwrap(); + cluster.must_split(&region, b"k2"); + let left = pd_client.get_region(b"k1").unwrap(); + let right = pd_client.get_region(b"k2").unwrap(); + assert_ne!(left.get_id(), right.get_id()); + + // Leave joint + pd_client.must_leave_joint(left.get_id()); + pd_client.must_leave_joint(right.get_id()); +} + #[test] fn test_sync_recover_after_apply_snapshot() { let mut cluster = prepare_cluster(); @@ -101,7 +175,7 @@ fn test_sync_recover_after_apply_snapshot() { false, ); request.mut_header().set_peer(new_peer(1, 1)); - let (cb, mut rx) = make_cb(&request); + let (cb, mut rx) = make_cb_rocks(&request); cluster .sim .rl() @@ -119,7 +193,7 @@ fn test_sync_recover_after_apply_snapshot() { // swith to async cluster .pd_client - .switch_replication_mode(DrAutoSyncState::Async, vec![]); + .switch_replication_mode(Some(DrAutoSyncState::Async), vec![]); rx.recv_timeout(Duration::from_millis(100)).unwrap(); must_get_equal(&cluster.get_engine(1), b"k2", b"v2"); thread::sleep(Duration::from_millis(100)); @@ -136,7 +210,7 @@ fn test_sync_recover_after_apply_snapshot() { cluster .pd_client - .switch_replication_mode(DrAutoSyncState::SyncRecover, vec![]); + .switch_replication_mode(Some(DrAutoSyncState::SyncRecover), vec![]); thread::sleep(Duration::from_millis(100)); // Add node 3 back, snapshot will apply cluster.clear_send_filters(); @@ -248,7 +322,7 @@ fn test_switching_replication_mode() { false, ); request.mut_header().set_peer(new_peer(1, 1)); - let (cb, mut rx) = make_cb(&request); + let (cb, mut rx) = make_cb_rocks(&request); cluster .sim .rl() @@ -265,7 +339,7 @@ fn test_switching_replication_mode() { cluster .pd_client - .switch_replication_mode(DrAutoSyncState::Async, vec![]); +
.switch_replication_mode(Some(DrAutoSyncState::Async), vec![]); rx.recv_timeout(Duration::from_millis(100)).unwrap(); must_get_equal(&cluster.get_engine(1), b"k2", b"v2"); thread::sleep(Duration::from_millis(100)); @@ -275,7 +349,7 @@ fn test_switching_replication_mode() { cluster .pd_client - .switch_replication_mode(DrAutoSyncState::SyncRecover, vec![]); + .switch_replication_mode(Some(DrAutoSyncState::SyncRecover), vec![]); thread::sleep(Duration::from_millis(100)); let mut request = new_request( region.get_id(), @@ -284,7 +358,7 @@ fn test_switching_replication_mode() { false, ); request.mut_header().set_peer(new_peer(1, 1)); - let (cb, mut rx) = make_cb(&request); + let (cb, mut rx) = make_cb_rocks(&request); cluster .sim .rl() @@ -312,7 +386,7 @@ fn test_switching_replication_mode() { false, ); request.mut_header().set_peer(new_peer(1, 1)); - let (cb, mut rx) = make_cb(&request); + let (cb, mut rx) = make_cb_rocks(&request); cluster .sim .rl() @@ -331,7 +405,7 @@ fn test_replication_mode_allowlist() { run_cluster(&mut cluster); cluster .pd_client - .switch_replication_mode(DrAutoSyncState::Async, vec![1]); + .switch_replication_mode(Some(DrAutoSyncState::Async), vec![1]); thread::sleep(Duration::from_millis(100)); // 2,3 are paused, so it should not be able to write. @@ -343,7 +417,7 @@ fn test_replication_mode_allowlist() { false, ); request.mut_header().set_peer(new_peer(1, 1)); - let (cb, mut rx) = make_cb(&request); + let (cb, mut rx) = make_cb_rocks(&request); cluster .sim .rl() @@ -357,7 +431,7 @@ fn test_replication_mode_allowlist() { // clear allowlist. 
cluster .pd_client - .switch_replication_mode(DrAutoSyncState::Async, vec![]); + .switch_replication_mode(Some(DrAutoSyncState::Async), vec![]); rx.recv_timeout(Duration::from_millis(100)).unwrap(); must_get_equal(&cluster.get_engine(1), b"k2", b"v2"); } @@ -431,7 +505,7 @@ fn test_migrate_replication_mode() { false, ); request.mut_header().set_peer(new_peer(1, 1)); - let (cb, mut rx) = make_cb(&request); + let (cb, mut rx) = make_cb_rocks(&request); cluster .sim .rl() @@ -456,6 +530,70 @@ fn test_migrate_replication_mode() { assert_eq!(state.state, RegionReplicationState::IntegrityOverLabel); } +#[test] +fn test_migrate_majority_to_drautosync() { + // 1. start cluster, enable dr-auto-sync and set labels. + let mut cluster = new_server_cluster(0, 3); + cluster.pd_client.disable_default_operator(); + cluster.cfg.raft_store.pd_store_heartbeat_tick_interval = ReadableDuration::millis(50); + cluster.cfg.raft_store.raft_log_gc_threshold = 10; + prepare_labels(&mut cluster); + cluster.run(); + cluster.must_transfer_leader(1, new_peer(1, 1)); + cluster.must_put(b"k1", b"v0"); + cluster.pd_client.configure_dr_auto_sync("dc"); + thread::sleep(Duration::from_millis(100)); + let region = cluster.get_region(b"k1"); + let mut request = new_request( + region.get_id(), + region.get_region_epoch().clone(), + vec![new_put_cf_cmd("default", b"k2", b"v2")], + false, + ); + request.mut_header().set_peer(new_peer(1, 1)); + let (cb, mut rx) = make_cb_rocks(&request); + cluster + .sim + .rl() + .async_command_on_node(1, request, cb) + .unwrap(); + assert_eq!(rx.recv_timeout(Duration::from_millis(100)).is_ok(), true); + must_get_equal(&cluster.get_engine(1), b"k2", b"v2"); + let state = cluster.pd_client.region_replication_status(region.get_id()); + assert_eq!(state.state_id, 1); + assert_eq!(state.state, RegionReplicationState::IntegrityOverLabel); + + // 2. switch to majority mode. 
+ cluster.pd_client.switch_replication_mode(None, vec![]); + thread::sleep(Duration::from_millis(150)); + + // 3. split the region and make a new region, the regions status must be + // SimpleMajority. + cluster.must_split(&region, b"m1"); + thread::sleep(Duration::from_millis(150)); + cluster.must_put(b"n4", b"v4"); + must_get_equal(&cluster.get_engine(1), b"n4", b"v4"); + let region_m = cluster.get_region(b"n4"); + let region_k = cluster.get_region(b"k1"); + + // 4. switch to dr-auto-sync mode, the new region generated at majority mode + // becomes IntegrityOverLabel again. + cluster + .pd_client + .switch_replication_mode(Some(DrAutoSyncState::SyncRecover), vec![]); + thread::sleep(Duration::from_millis(100)); + let state_m = cluster + .pd_client + .region_replication_status(region_m.get_id()); + let state_k = cluster + .pd_client + .region_replication_status(region_k.get_id()); + assert_eq!(state_m.state_id, 3); + assert_eq!(state_m.state, RegionReplicationState::IntegrityOverLabel); + assert_eq!(state_k.state_id, 3); + assert_eq!(state_k.state, RegionReplicationState::IntegrityOverLabel); +} + /// Tests if labels are loaded correctly after rolling start.
#[test] fn test_loading_label_after_rolling_start() { diff --git a/tests/integrations/raftstore/test_snap.rs b/tests/integrations/raftstore/test_snap.rs index 9eda281e9e4..23b2a37e6c9 100644 --- a/tests/integrations/raftstore/test_snap.rs +++ b/tests/integrations/raftstore/test_snap.rs @@ -12,6 +12,7 @@ use std::{ }; use collections::HashMap; +use engine_rocks::RocksEngine; use engine_traits::{Checkpointer, KvEngine, RaftEngineDebug}; use file_system::{IoOp, IoType}; use futures::executor::block_on; @@ -39,7 +40,10 @@ use tikv_util::{ HandyRwLock, }; -fn test_huge_snapshot(cluster: &mut Cluster, max_snapshot_file_size: u64) { +fn test_huge_snapshot>( + cluster: &mut Cluster, + max_snapshot_file_size: u64, +) { cluster.cfg.rocksdb.titan.enabled = true; cluster.cfg.raft_store.raft_log_gc_count_limit = Some(1000); cluster.cfg.raft_store.raft_log_gc_tick_interval = ReadableDuration::millis(10); @@ -227,8 +231,6 @@ fn test_server_snap_gc() { #[test_case(test_raftstore::new_node_cluster)] #[test_case(test_raftstore::new_server_cluster)] -#[test_case(test_raftstore_v2::new_node_cluster)] -#[test_case(test_raftstore_v2::new_server_cluster)] fn test_concurrent_snap() { let mut cluster = new_cluster(0, 3); // Test that the handling of snapshot is correct when there are multiple @@ -279,6 +281,59 @@ fn test_concurrent_snap() { must_get_equal(&cluster.get_engine(3), b"k4", b"v4"); } +#[test_case(test_raftstore_v2::new_node_cluster)] +#[test_case(test_raftstore_v2::new_server_cluster)] +fn test_concurrent_snap_v2() { + let mut cluster = new_cluster(0, 3); + // TODO: v2 doesn't support titan. + // Test that the handling of snapshot is correct when there are multiple + // snapshots which have overlapped region ranges arrive at the same + // raftstore. + // cluster.cfg.rocksdb.titan.enabled = true; + // Disable raft log gc in this test case. 
+ cluster.cfg.raft_store.raft_log_gc_tick_interval = ReadableDuration::secs(60); + // For raftstore v2, after split, follower delays first messages (see + // is_first_message() for details), so leader does not send snapshot to + // follower and CollectSnapshotFilter holds parent region snapshot forever. + // We need to set a short wait duration so that leader can send snapshot + // in time and thus CollectSnapshotFilter can send parent region snapshot. + cluster.cfg.raft_store.snap_wait_split_duration = ReadableDuration::millis(100); + + let pd_client = Arc::clone(&cluster.pd_client); + // Disable default max peer count check. + pd_client.disable_default_operator(); + + let r1 = cluster.run_conf_change(); + cluster.must_put(b"k1", b"v1"); + pd_client.must_add_peer(r1, new_peer(2, 2)); + // Force peer 2 to be followers all the way. + cluster.add_send_filter(CloneFilterFactory( + RegionPacketFilter::new(r1, 2) + .msg_type(MessageType::MsgRequestVote) + .direction(Direction::Send), + )); + cluster.must_transfer_leader(r1, new_peer(1, 1)); + cluster.must_put(b"k3", b"v3"); + // Pile up snapshots of overlapped region ranges and deliver them all at once. + let (tx, rx) = mpsc::channel(); + cluster.add_recv_filter_on_node(3, Box::new(CollectSnapshotFilter::new(tx))); + pd_client.must_add_peer(r1, new_peer(3, 3)); + let region = cluster.get_region(b"k1"); + // Ensure the snapshot of range ("", "") is sent and piled in filter. + if let Err(e) = rx.recv_timeout(Duration::from_secs(1)) { + panic!("the snapshot is not sent before split, e: {:?}", e); + } + // Split the region range and then there should be another snapshot for the + // split ranges. + cluster.must_split(&region, b"k2"); + must_get_equal(&cluster.get_engine(3), b"k3", b"v3"); + // Ensure the regions work after split.
+ cluster.must_put(b"k11", b"v11"); + must_get_equal(&cluster.get_engine(3), b"k11", b"v11"); + cluster.must_put(b"k4", b"v4"); + must_get_equal(&cluster.get_engine(3), b"k4", b"v4"); +} + #[test_case(test_raftstore::new_node_cluster)] #[test_case(test_raftstore::new_server_cluster)] #[test_case(test_raftstore_v2::new_node_cluster)] @@ -568,7 +623,7 @@ fn test_gen_during_heavy_recv() { let snap = do_snapshot( snap_mgr.clone(), &engine, - engine.snapshot(), + engine.snapshot(None), r2, snap_term, snap_apply_state, diff --git a/tests/integrations/raftstore/test_split_region.rs b/tests/integrations/raftstore/test_split_region.rs index c0f75487998..1dd5e7db6d0 100644 --- a/tests/integrations/raftstore/test_split_region.rs +++ b/tests/integrations/raftstore/test_split_region.rs @@ -7,6 +7,7 @@ use std::{ time::Duration, }; +use engine_rocks::RocksEngine; use engine_traits::{Peekable, CF_DEFAULT, CF_WRITE}; use keys::data_key; use kvproto::{ @@ -609,7 +610,7 @@ fn test_node_split_region_after_reboot_with_config_change() { sleep_ms(200); assert_eq!(pd_client.get_split_count(), 0); - // change the config to make the region splittable + // change the config to make the region splitable cluster.cfg.coprocessor.region_max_size = Some(ReadableSize(region_max_size / 3)); cluster.cfg.coprocessor.region_split_size = Some(ReadableSize(region_split_size / 3)); cluster.cfg.coprocessor.region_bucket_size = ReadableSize(region_split_size / 3); @@ -629,7 +630,10 @@ fn test_node_split_region_after_reboot_with_config_change() { } } -fn test_split_epoch_not_match(cluster: &mut Cluster, right_derive: bool) { +fn test_split_epoch_not_match>( + cluster: &mut Cluster, + right_derive: bool, +) { cluster.cfg.raft_store.right_derive_when_split = right_derive; cluster.run(); let pd_client = Arc::clone(&cluster.pd_client); @@ -976,14 +980,13 @@ fn test_refresh_region_bucket_keys() { cluster.run(); let pd_client = Arc::clone(&cluster.pd_client); + // case: init bucket info cluster.must_put(b"k11", 
b"v1"); let mut region = pd_client.get_region(b"k11").unwrap(); - let bucket = Bucket { keys: vec![b"k11".to_vec()], size: 1024 * 1024 * 200, }; - let mut expected_buckets = metapb::Buckets::default(); expected_buckets.set_keys(bucket.clone().keys.into()); expected_buckets @@ -997,6 +1000,8 @@ fn test_refresh_region_bucket_keys() { Option::None, Some(expected_buckets.clone()), ); + + // case: bucket range should refresh if epoch changed let conf_ver = region.get_region_epoch().get_conf_ver() + 1; region.mut_region_epoch().set_conf_ver(conf_ver); @@ -1018,6 +1023,7 @@ fn test_refresh_region_bucket_keys() { ); assert_eq!(bucket_version2, bucket_version + 1); + // case: stale epoch will not refresh buckets info let conf_ver = 0; region.mut_region_epoch().set_conf_ver(conf_ver); let bucket_version3 = cluster.refresh_region_bucket_keys( @@ -1028,6 +1034,7 @@ fn test_refresh_region_bucket_keys() { ); assert_eq!(bucket_version3, bucket_version2); + // case: bucket split // now the buckets is ["", "k12", ""]. 
further split ["", k12], [k12, ""] // buckets into more buckets let region = pd_client.get_region(b"k11").unwrap(); @@ -1066,6 +1073,7 @@ fn test_refresh_region_bucket_keys() { ); assert_eq!(bucket_version4, bucket_version3 + 1); + // case: merge buckets // remove k11~k12, k12~k121, k122~[] bucket let buckets = vec![ Bucket { @@ -1107,7 +1115,7 @@ fn test_refresh_region_bucket_keys() { assert_eq!(bucket_version5, bucket_version4 + 1); - // split the region + // case: split the region pd_client.must_split_region(region, pdpb::CheckPolicy::Usekey, vec![b"k11".to_vec()]); let mut buckets = vec![Bucket { keys: vec![b"k10".to_vec()], @@ -1132,7 +1140,7 @@ fn test_refresh_region_bucket_keys() { cluster.refresh_region_bucket_keys(&region, buckets, None, Some(expected_buckets.clone())); assert_eq!(bucket_version6, bucket_version5 + 1); - // merge the region + // case: merge the region pd_client.must_merge(left_id, right.get_id()); let region = pd_client.get_region(b"k10").unwrap(); let buckets = vec![Bucket { @@ -1145,6 +1153,7 @@ fn test_refresh_region_bucket_keys() { cluster.refresh_region_bucket_keys(&region, buckets, None, Some(expected_buckets.clone())); assert_eq!(bucket_version7, bucket_version6 + 1); + // case: nothing changed let bucket_version8 = cluster.refresh_region_bucket_keys( &region, vec![], @@ -1157,9 +1166,9 @@ fn test_refresh_region_bucket_keys() { #[test] fn test_gen_split_check_bucket_ranges() { - let count = 5; - let mut cluster = new_server_cluster(0, count); - cluster.cfg.coprocessor.region_bucket_size = ReadableSize(5); + let mut cluster = new_server_cluster(0, 1); + let region_bucket_size = ReadableSize::kb(1); + cluster.cfg.coprocessor.region_bucket_size = region_bucket_size; cluster.cfg.coprocessor.enable_region_bucket = Some(true); // disable report buckets; as it will reset the user traffic stats to randomize // the test result @@ -1169,14 +1178,15 @@ fn test_gen_split_check_bucket_ranges() { cluster.run(); let pd_client =
Arc::clone(&cluster.pd_client); - cluster.must_put(b"k11", b"v1"); - let region = pd_client.get_region(b"k11").unwrap(); + let mut range = 1..; + let mid_key = put_till_size(&mut cluster, region_bucket_size.0, &mut range); + let second_key = put_till_size(&mut cluster, region_bucket_size.0, &mut range); + let region = pd_client.get_region(&second_key).unwrap(); let bucket = Bucket { - keys: vec![b"k11".to_vec()], - size: 1024 * 1024 * 200, + keys: vec![mid_key.clone()], + size: region_bucket_size.0 * 2, }; - let mut expected_buckets = metapb::Buckets::default(); expected_buckets.set_keys(bucket.clone().keys.into()); expected_buckets @@ -1192,32 +1202,28 @@ fn test_gen_split_check_bucket_ranges() { Option::None, Some(expected_buckets.clone()), ); - cluster.must_put(b"k10", b"v1"); - cluster.must_put(b"k12", b"v1"); - let expected_bucket_ranges = vec![ - BucketRange(vec![], b"k11".to_vec()), - BucketRange(b"k11".to_vec(), vec![]), - ]; + // put some data into the right buckets, so the bucket range will be checked by + // split check. + let latest_key = put_till_size(&mut cluster, region_bucket_size.0 + 100, &mut range); + let expected_bucket_ranges = vec![BucketRange(mid_key.clone(), vec![])]; cluster.send_half_split_region_message(&region, Some(expected_bucket_ranges)); - // set fsm.peer.last_bucket_regions + // reset bucket stats. cluster.refresh_region_bucket_keys( &region, buckets, Option::None, Some(expected_buckets.clone()), ); - // because the diff between last_bucket_regions and bucket_regions is zero, - // bucket range for split check should be empty.
- let expected_bucket_ranges = vec![]; - cluster.send_half_split_region_message(&region, Some(expected_bucket_ranges)); - // split the region - pd_client.must_split_region(region, pdpb::CheckPolicy::Usekey, vec![b"k11".to_vec()]); + thread::sleep(Duration::from_millis(100)); + cluster.send_half_split_region_message(&region, Some(vec![])); - let left = pd_client.get_region(b"k10").unwrap(); - let right = pd_client.get_region(b"k12").unwrap(); + // split the region + pd_client.must_split_region(region, pdpb::CheckPolicy::Usekey, vec![second_key]); + let left = pd_client.get_region(&mid_key).unwrap(); + let right = pd_client.get_region(&latest_key).unwrap(); if right.get_id() == 1 { // the bucket_ranges should be None to refresh the bucket cluster.send_half_split_region_message(&right, None); @@ -1225,11 +1231,10 @@ fn test_gen_split_check_bucket_ranges() { // the bucket_ranges should be None to refresh the bucket cluster.send_half_split_region_message(&left, None); } - + thread::sleep(Duration::from_millis(300)); // merge the region pd_client.must_merge(left.get_id(), right.get_id()); - let region = pd_client.get_region(b"k10").unwrap(); - // the bucket_ranges should be None to refresh the bucket + let region = pd_client.get_region(&mid_key).unwrap(); cluster.send_half_split_region_message(&region, None); } diff --git a/tests/integrations/raftstore/test_stale_peer.rs b/tests/integrations/raftstore/test_stale_peer.rs index e12584d6c60..5ef90e30e94 100644 --- a/tests/integrations/raftstore/test_stale_peer.rs +++ b/tests/integrations/raftstore/test_stale_peer.rs @@ -4,10 +4,13 @@ use std::{sync::Arc, thread, time::*}; +use engine_rocks::RocksEngine; use engine_traits::{Peekable, CF_RAFT}; use kvproto::raft_serverpb::{PeerState, RegionLocalState}; +use pd_client::PdClient; use raft::eraftpb::MessageType; use test_raftstore::*; +use test_raftstore_macro::test_case; use tikv_util::{config::ReadableDuration, HandyRwLock}; /// A helper function for testing the behaviour of the gc of
stale peer @@ -28,7 +31,7 @@ use tikv_util::{config::ReadableDuration, HandyRwLock}; /// time, and it would check with pd to confirm whether it's still a member of /// the cluster. If not, it should destroy itself as a stale peer which is /// removed out already. -fn test_stale_peer_out_of_region(cluster: &mut Cluster) { +fn test_stale_peer_out_of_region>(cluster: &mut Cluster) { let pd_client = Arc::clone(&cluster.pd_client); // Disable default max peer number check. pd_client.disable_default_operator(); @@ -111,7 +114,10 @@ fn test_server_stale_peer_out_of_region() { /// time, and it's an initialized peer without any data. It would destroy itself /// as stale peer directly and should not impact other region data on the /// same store. -fn test_stale_peer_without_data(cluster: &mut Cluster, right_derive: bool) { +fn test_stale_peer_without_data>( + cluster: &mut Cluster, + right_derive: bool, +) { cluster.cfg.raft_store.right_derive_when_split = right_derive; let pd_client = Arc::clone(&cluster.pd_client); @@ -297,7 +303,7 @@ fn test_stale_learner_with_read_index() { ); request.mut_header().set_peer(new_peer(3, 3)); request.mut_header().set_replica_read(true); - let (cb, _) = make_cb(&request); + let (cb, _) = make_cb_rocks(&request); cluster .sim .rl() @@ -310,3 +316,48 @@ fn test_stale_learner_with_read_index() { let state: RegionLocalState = engine3.get_msg_cf(CF_RAFT, &state_key).unwrap().unwrap(); assert_eq!(state.get_state(), PeerState::Tombstone); } + +/// Test if an uninitialized stale peer will be removed after restart. 
+#[test_case(test_raftstore::new_node_cluster)] +// #[test_case(test_raftstore_v2::new_node_cluster)] +fn test_node_restart_gc_uninitialized_peer_after_merge() { + let mut cluster = new_cluster(0, 4); + configure_for_merge(&mut cluster.cfg); + ignore_merge_target_integrity(&mut cluster.cfg, &cluster.pd_client); + cluster.cfg.raft_store.raft_election_timeout_ticks = 5; + cluster.cfg.raft_store.raft_store_max_leader_lease = ReadableDuration::millis(40); + cluster.cfg.raft_store.max_leader_missing_duration = ReadableDuration::millis(150); + cluster.cfg.raft_store.abnormal_leader_missing_duration = ReadableDuration::millis(100); + cluster.cfg.raft_store.peer_stale_state_check_interval = ReadableDuration::millis(100); + + let pd_client = Arc::clone(&cluster.pd_client); + pd_client.disable_default_operator(); + + cluster.run_conf_change(); + + cluster.must_put(b"k1", b"v1"); + + // test if an uninitialized stale peer before conf removal is destroyed + // automatically + let region = pd_client.get_region(b"k1").unwrap(); + pd_client.must_add_peer(region.get_id(), new_peer(2, 2)); + pd_client.must_add_peer(region.get_id(), new_peer(3, 3)); + + // Block snapshot messages, so that new peers will never be initialized. + cluster.add_send_filter(CloneFilterFactory( + RegionPacketFilter::new(region.get_id(), 4) + .msg_type(MessageType::MsgSnapshot) + .direction(Direction::Recv), + )); + // Add peer (4,4), remove peer (4,4) and then merge regions. + // Peer (4,4) will be an an uninitialized stale peer. + pd_client.must_add_peer(region.get_id(), new_peer(4, 4)); + cluster.must_region_exist(region.get_id(), 4); + cluster.add_send_filter(IsolationFilterFactory::new(4)); + pd_client.must_remove_peer(region.get_id(), new_peer(4, 4)); + + // An uninitialized stale peer is removed automatically after restart. 
+ cluster.stop_node(4); + cluster.run_node(4).unwrap(); + cluster.must_region_not_exist(region.get_id(), 4); +} diff --git a/tests/integrations/raftstore/test_stale_read.rs b/tests/integrations/raftstore/test_stale_read.rs index 24e13003f7e..5de9bda1f64 100644 --- a/tests/integrations/raftstore/test_stale_read.rs +++ b/tests/integrations/raftstore/test_stale_read.rs @@ -8,7 +8,7 @@ use kvproto::{ metapb::{Peer, Region}, tikvpb_grpc::TikvClient, }; -use test_raftstore::{must_get_equal, new_mutation, new_peer, PeerClient}; +use test_raftstore::{must_get_equal, new_mutation, new_peer}; use test_raftstore_macro::test_case; use tikv_util::{config::ReadableDuration, time::Instant}; diff --git a/tests/integrations/raftstore/test_stats.rs b/tests/integrations/raftstore/test_stats.rs index 67e5e261dab..821fc19dff8 100644 --- a/tests/integrations/raftstore/test_stats.rs +++ b/tests/integrations/raftstore/test_stats.rs @@ -7,16 +7,18 @@ use std::{ }; use api_version::{test_kv_format_impl, KvFormat}; +use engine_rocks::RocksEngine; use engine_traits::MiscExt; use futures::{executor::block_on, SinkExt, StreamExt}; use grpcio::*; use kvproto::{kvrpcpb::*, pdpb::QueryKind, tikvpb::*, tikvpb_grpc::TikvClient}; use pd_client::PdClient; +use test_coprocessor::{DagSelect, ProductTable}; use test_raftstore::*; use tikv_util::{config::*, store::QueryStats}; use txn_types::Key; -fn check_available(cluster: &mut Cluster) { +fn check_available>(cluster: &mut Cluster) { let pd_client = Arc::clone(&cluster.pd_client); let engine = cluster.get_engine(1); @@ -42,7 +44,7 @@ fn check_available(cluster: &mut Cluster) { panic!("available not changed") } -fn test_simple_store_stats(cluster: &mut Cluster) { +fn test_simple_store_stats>(cluster: &mut Cluster) { let pd_client = Arc::clone(&cluster.pd_client); cluster.cfg.raft_store.pd_store_heartbeat_tick_interval = ReadableDuration::millis(20); @@ -141,7 +143,14 @@ fn test_store_heartbeat_report_hotspots() { fail::remove("mock_hotspot_threshold"); } 
-type Query = dyn Fn(Context, &Cluster, TikvClient, u64, u64, Vec); +type Query = dyn Fn( + Context, + &Cluster>, + TikvClient, + u64, + u64, + Vec, +); #[test] fn test_query_stats() { @@ -262,19 +271,10 @@ fn test_raw_query_stats_tmpl() { req.set_raw_get(get_req); req }); - batch_commands(&ctx, &client, get_command, &start_key); - assert!(check_split_key( - cluster, - F::encode_raw_key_owned(start_key.clone(), None).into_encoded(), - None - )); - if check_query_num_read( - cluster, - store_id, - region_id, - QueryKind::Get, - (i + 1) * 1000, - ) { + if i == 0 { + batch_commands(&ctx, &client, get_command, &start_key); + } + if check_query_num_read(cluster, store_id, region_id, QueryKind::Get, 1000) { flag = true; break; } @@ -284,14 +284,16 @@ fn test_raw_query_stats_tmpl() { fail::cfg("mock_hotspot_threshold", "return(0)").unwrap(); fail::cfg("mock_tick_interval", "return(0)").unwrap(); fail::cfg("mock_collect_tick_interval", "return(0)").unwrap(); - test_query_num::(raw_get, true); - test_query_num::(raw_batch_get, true); - test_query_num::(raw_scan, true); - test_query_num::(raw_batch_scan, true); + test_query_num::(raw_get, true, true); + test_query_num::(raw_batch_get, true, true); + test_query_num::(raw_scan, true, true); + test_query_num::(raw_batch_scan, true, true); if F::IS_TTL_ENABLED { - test_query_num::(raw_get_key_ttl, true); + test_query_num::(raw_get_key_ttl, true, true); } - test_query_num::(raw_batch_get_command, true); + // requests may failed caused by `EpochNotMatch` after split when auto split is + // enabled, disable it. 
+ test_query_num::(raw_batch_get_command, true, false); test_raw_delete_query::(); fail::remove("mock_tick_interval"); fail::remove("mock_hotspot_threshold"); @@ -385,19 +387,34 @@ fn test_txn_query_stats_tmpl() { req.set_get(get_req); req }); - batch_commands(&ctx, &client, get_command, &start_key); - assert!(check_split_key( - cluster, - Key::from_raw(&start_key).as_encoded().to_vec(), - None - )); - if check_query_num_read( - cluster, - store_id, - region_id, - QueryKind::Get, - (i + 1) * 1000, - ) { + if i == 0 { + batch_commands(&ctx, &client, get_command, &start_key); + } + if check_query_num_read(cluster, store_id, region_id, QueryKind::Get, 1000) { + flag = true; + break; + } + } + assert!(flag); + }); + let batch_coprocessor: Box = + Box::new(|ctx, cluster, client, store_id, region_id, start_key| { + let mut flag = false; + for i in 0..3 { + let coprocessor: Box = Box::new(|ctx, _start_key| { + let mut req = BatchCommandsRequestRequest::new(); + let table = ProductTable::new(); + let mut cop_req = DagSelect::from(&table).build(); + cop_req.set_context(ctx.clone()); + req.set_coprocessor(cop_req); + req + }); + if i == 0 { + batch_commands(&ctx, &client, coprocessor, &start_key); + } + // here cannot read any data, so expect is 0. may need fix. here mainly used to + // verify the request source is as expect. 
+ if check_query_num_read(cluster, store_id, region_id, QueryKind::Coprocessor, 0) { flag = true; break; } @@ -407,21 +424,26 @@ fn test_txn_query_stats_tmpl() { fail::cfg("mock_hotspot_threshold", "return(0)").unwrap(); fail::cfg("mock_tick_interval", "return(0)").unwrap(); fail::cfg("mock_collect_tick_interval", "return(0)").unwrap(); - test_query_num::(get, false); - test_query_num::(batch_get, false); - test_query_num::(scan, false); - test_query_num::(scan_lock, false); - test_query_num::(batch_get_command, false); + fail::cfg("only_check_source_task_name", "return(test_stats)").unwrap(); + test_query_num::(get, false, true); + test_query_num::(batch_get, false, true); + test_query_num::(scan, false, true); + test_query_num::(scan_lock, false, true); + // requests may failed caused by `EpochNotMatch` after split when auto split is + // enabled, disable it. + test_query_num::(batch_get_command, false, false); + test_query_num::(batch_coprocessor, false, false); test_txn_delete_query::(); test_pessimistic_lock(); test_rollback(); fail::remove("mock_tick_interval"); fail::remove("mock_hotspot_threshold"); fail::remove("mock_collect_tick_interval"); + fail::remove("only_check_source_task_name"); } fn raw_put( - _cluster: &Cluster, + _cluster: &Cluster>, client: &TikvClient, ctx: &Context, _store_id: u64, @@ -439,7 +461,7 @@ fn raw_put( } fn put( - cluster: &Cluster, + cluster: &Cluster>, client: &TikvClient, ctx: &Context, store_id: u64, @@ -501,10 +523,11 @@ fn put( } fn test_pessimistic_lock() { - let (cluster, client, ctx) = must_new_and_configure_cluster_and_kv_client(|cluster| { + let (cluster, client, mut ctx) = must_new_and_configure_cluster_and_kv_client(|cluster| { cluster.cfg.raft_store.pd_store_heartbeat_tick_interval = ReadableDuration::millis(50); }); + ctx.set_request_source("test_stats".to_owned()); let key = b"key2".to_vec(); let store_id = 1; put(&cluster, &client, &ctx, store_id, key.clone()); @@ -541,9 +564,10 @@ fn test_pessimistic_lock() { } 
pub fn test_rollback() { - let (cluster, client, ctx) = must_new_and_configure_cluster_and_kv_client(|cluster| { + let (cluster, client, mut ctx) = must_new_and_configure_cluster_and_kv_client(|cluster| { cluster.cfg.raft_store.pd_store_heartbeat_tick_interval = ReadableDuration::millis(50); }); + ctx.set_request_source("test_stats".to_owned()); let key = b"key2".to_vec(); let store_id = 1; put(&cluster, &client, &ctx, store_id, key.clone()); @@ -572,17 +596,23 @@ pub fn test_rollback() { )); } -fn test_query_num(query: Box, is_raw_kv: bool) { +fn test_query_num(query: Box, is_raw_kv: bool, auto_split: bool) { let (mut cluster, client, mut ctx) = must_new_and_configure_cluster_and_kv_client(|cluster| { cluster.cfg.raft_store.pd_store_heartbeat_tick_interval = ReadableDuration::millis(50); - cluster.cfg.split.qps_threshold = 0; + if auto_split { + cluster.cfg.split.qps_threshold = Some(0); + } else { + cluster.cfg.split.qps_threshold = Some(1000000); + } cluster.cfg.split.split_balance_score = 2.0; cluster.cfg.split.split_contained_score = 2.0; cluster.cfg.split.detect_times = 1; cluster.cfg.split.sample_threshold = 0; cluster.cfg.storage.set_api_version(F::TAG); + cluster.cfg.server.enable_request_batch = false; }); ctx.set_api_version(F::CLIENT_TAG); + ctx.set_request_source("test_stats".to_owned()); let mut k = b"key".to_vec(); // When a peer becomes leader, it can't read before committing to current term. 
@@ -610,6 +640,7 @@ fn test_raw_delete_query() { cluster.cfg.storage.set_api_version(F::TAG); }); ctx.set_api_version(F::CLIENT_TAG); + ctx.set_request_source("test_stats".to_owned()); raw_put::(&cluster, &client, &ctx, store_id, k.clone()); // Raw Delete @@ -635,10 +666,10 @@ fn test_txn_delete_query() { let store_id = 1; { - let (cluster, client, ctx) = must_new_and_configure_cluster_and_kv_client(|cluster| { + let (cluster, client, mut ctx) = must_new_and_configure_cluster_and_kv_client(|cluster| { cluster.cfg.raft_store.pd_store_heartbeat_tick_interval = ReadableDuration::millis(50); }); - + ctx.set_request_source("test_stats".to_owned()); put(&cluster, &client, &ctx, store_id, k.clone()); // DeleteRange let mut delete_req = DeleteRangeRequest::default(); @@ -651,7 +682,7 @@ fn test_txn_delete_query() { } fn check_query_num_read( - cluster: &Cluster, + cluster: &Cluster>, store_id: u64, region_id: u64, kind: QueryKind, @@ -677,7 +708,7 @@ fn check_query_num_read( } fn check_query_num_write( - cluster: &Cluster, + cluster: &Cluster>, store_id: u64, kind: QueryKind, expect: u64, @@ -697,7 +728,7 @@ fn check_query_num_write( } fn check_split_key( - cluster: &Cluster, + cluster: &Cluster>, start_key: Vec, end_key: Option>, ) -> bool { @@ -762,4 +793,13 @@ fn batch_commands( } }); rx.recv_timeout(Duration::from_secs(10)).unwrap(); + sleep_ms(100); + // triage metrics flush + for _ in 0..10 { + let mut req = ScanRequest::default(); + req.set_context(ctx.to_owned()); + req.start_key = start_key.to_owned(); + req.end_key = vec![]; + client.kv_scan(&req).unwrap(); + } } diff --git a/tests/integrations/raftstore/test_tombstone.rs b/tests/integrations/raftstore/test_tombstone.rs index c1cd0befcf1..f5c419ac65b 100644 --- a/tests/integrations/raftstore/test_tombstone.rs +++ b/tests/integrations/raftstore/test_tombstone.rs @@ -3,6 +3,7 @@ use std::{sync::Arc, thread, time::Duration}; use crossbeam::channel; +use engine_rocks::RocksEngine; use engine_traits::{CfNamesExt, 
Iterable, Peekable, RaftEngineDebug, SyncMutable, CF_RAFT}; use kvproto::raft_serverpb::{PeerState, RaftMessage, RegionLocalState, StoreIdent}; use protobuf::Message; @@ -10,7 +11,7 @@ use raft::eraftpb::MessageType; use test_raftstore::*; use tikv_util::{config::*, time::Instant}; -fn test_tombstone(cluster: &mut Cluster) { +fn test_tombstone>(cluster: &mut Cluster) { let pd_client = Arc::clone(&cluster.pd_client); // Disable default max peer number check. pd_client.disable_default_operator(); @@ -113,7 +114,7 @@ fn test_server_tombstone() { test_tombstone(&mut cluster); } -fn test_fast_destroy(cluster: &mut Cluster) { +fn test_fast_destroy>(cluster: &mut Cluster) { let pd_client = Arc::clone(&cluster.pd_client); // Disable default max peer number check. @@ -158,7 +159,7 @@ fn test_server_fast_destroy() { test_fast_destroy(&mut cluster); } -fn test_readd_peer(cluster: &mut Cluster) { +fn test_readd_peer>(cluster: &mut Cluster) { let pd_client = Arc::clone(&cluster.pd_client); // Disable default max peer number check. pd_client.disable_default_operator(); diff --git a/tests/integrations/raftstore/test_transport.rs b/tests/integrations/raftstore/test_transport.rs index 4ed3d8da160..cb1bcefbcad 100644 --- a/tests/integrations/raftstore/test_transport.rs +++ b/tests/integrations/raftstore/test_transport.rs @@ -1,8 +1,9 @@ // Copyright 2016 TiKV Project Authors. Licensed under Apache-2.0. 
+use engine_rocks::RocksEngine; use test_raftstore::*; -fn test_partition_write(cluster: &mut Cluster) { +fn test_partition_write>(cluster: &mut Cluster) { cluster.run(); let (key, value) = (b"k1", b"v1"); diff --git a/tests/integrations/raftstore/test_update_region_size.rs b/tests/integrations/raftstore/test_update_region_size.rs index f2ff0d4f217..22a5e1f4534 100644 --- a/tests/integrations/raftstore/test_update_region_size.rs +++ b/tests/integrations/raftstore/test_update_region_size.rs @@ -2,18 +2,19 @@ use std::{sync::Arc, thread, time}; +use engine_rocks::RocksEngine; use engine_traits::MiscExt; use pd_client::PdClient; use test_raftstore::*; use tikv_util::config::*; -fn flush(cluster: &mut Cluster) { +fn flush>(cluster: &mut Cluster) { for engines in cluster.engines.values() { engines.kv.flush_cfs(&[], true).unwrap(); } } -fn test_update_region_size(cluster: &mut Cluster) { +fn test_update_region_size>(cluster: &mut Cluster) { cluster.cfg.raft_store.pd_heartbeat_tick_interval = ReadableDuration::millis(50); cluster.cfg.raft_store.split_region_check_tick_interval = ReadableDuration::millis(50); cluster.cfg.raft_store.region_split_check_diff = Some(ReadableSize::kb(1)); @@ -24,7 +25,7 @@ fn test_update_region_size(cluster: &mut Cluster) { .level0_file_num_compaction_trigger = 10; cluster.start().unwrap(); - let batch_put = |cluster: &mut Cluster, mut start, end| { + let batch_put = |cluster: &mut Cluster, mut start, end| { while start < end { let next = std::cmp::min(end, start + 50); let requests = (start..next) diff --git a/tests/integrations/raftstore/test_witness.rs b/tests/integrations/raftstore/test_witness.rs index d4332403cea..e42ac75598e 100644 --- a/tests/integrations/raftstore/test_witness.rs +++ b/tests/integrations/raftstore/test_witness.rs @@ -7,6 +7,7 @@ use std::{ }; use collections::HashMap; +use engine_rocks::RocksEngine; use futures::executor::block_on; use kvproto::{ metapb, @@ -473,7 +474,7 @@ fn test_witness_replica_read() { 
request.mut_header().set_replica_read(true); let resp = cluster - .read(None, request, Duration::from_millis(100)) + .read(None, None, request, Duration::from_millis(100)) .unwrap(); assert_eq!( resp.get_header().get_error().get_is_witness(), @@ -484,8 +485,8 @@ fn test_witness_replica_read() { ); } -fn must_get_error_is_witness( - cluster: &mut Cluster, +fn must_get_error_is_witness>( + cluster: &mut Cluster, region: &metapb::Region, cmd: kvproto::raft_cmdpb::Request, ) { diff --git a/tests/integrations/resource_metering/test_cpu.rs b/tests/integrations/resource_metering/test_cpu.rs index c15bf445ed3..12d6fa4fbe0 100644 --- a/tests/integrations/resource_metering/test_cpu.rs +++ b/tests/integrations/resource_metering/test_cpu.rs @@ -12,6 +12,7 @@ use std::{ use concurrency_manager::ConcurrencyManager; use futures::{executor::block_on, StreamExt}; use kvproto::kvrpcpb::Context; +use resource_control::ResourceGroupManager; use test_coprocessor::{DagSelect, Insert, ProductTable, Store}; use tidb_query_datatype::codec::Datum; use tikv::{ @@ -95,7 +96,10 @@ pub fn test_reschedule_coprocessor() { let mut req = DagSelect::from(&table).build(); let mut ctx = Context::default(); ctx.set_resource_group_tag(tag.as_bytes().to_vec()); + ctx.set_request_source("test".to_owned()); req.set_context(ctx); + fail::cfg("only_check_source_task_name", "return(test)").unwrap(); + defer!(fail::remove("only_check_source_task_name")); assert!( !block_on(endpoint.parse_and_handle_unary_request(req, None)) .consume() @@ -229,7 +233,7 @@ fn setup_test_suite() -> (TestSuite, Store, Endpoint) cm, test_suite.get_tag_factory(), Arc::new(QuotaLimiter::default()), - None, + Some(Arc::new(ResourceGroupManager::default())), ); (test_suite, store, endpoint) } diff --git a/tests/integrations/resource_metering/test_read_keys.rs b/tests/integrations/resource_metering/test_read_keys.rs index f7a4ef86906..64c291049d9 100644 --- a/tests/integrations/resource_metering/test_read_keys.rs +++ 
b/tests/integrations/resource_metering/test_read_keys.rs @@ -4,6 +4,7 @@ use std::{sync::Arc, time::Duration}; use concurrency_manager::ConcurrencyManager; use crossbeam::channel::{unbounded, Receiver, RecvTimeoutError, Sender}; +use engine_rocks::RocksEngine as RocksDb; use grpcio::{ChannelBuilder, Environment}; use kvproto::{coprocessor, kvrpcpb::*, resource_usage_agent::ResourceUsageRecord, tikvpb::*}; use protobuf::Message; @@ -108,7 +109,14 @@ pub fn test_read_keys() { }); } -fn new_cluster(port: u16, env: Arc) -> (Cluster, TikvClient, Context) { +fn new_cluster( + port: u16, + env: Arc, +) -> ( + Cluster>, + TikvClient, + Context, +) { let (cluster, leader, ctx) = must_new_and_configure_cluster(|cluster| { cluster.cfg.resource_metering.receiver_address = format!("127.0.0.1:{}", port); cluster.cfg.resource_metering.precision = ReadableDuration::millis(100); diff --git a/tests/integrations/server/gc_worker.rs b/tests/integrations/server/gc_worker.rs index cfadde84405..238102df6b6 100644 --- a/tests/integrations/server/gc_worker.rs +++ b/tests/integrations/server/gc_worker.rs @@ -7,15 +7,17 @@ use grpcio::{ChannelBuilder, Environment}; use keys::data_key; use kvproto::{kvrpcpb::*, tikvpb::TikvClient}; use test_raftstore::*; +use test_raftstore_macro::test_case; use tikv::server::gc_worker::sync_gc; use tikv_util::HandyRwLock; use txn_types::Key; // Since v5.0 GC bypasses Raft, which means GC scans/deletes records with // `keys::DATA_PREFIX`. This case ensures it's performed correctly. 
-#[test] +#[test_case(test_raftstore::must_new_cluster_mul)] +#[test_case(test_raftstore_v2::must_new_cluster_mul)] fn test_gc_bypass_raft() { - let (cluster, leader, ctx) = must_new_cluster_mul(2); + let (cluster, leader, ctx) = new_cluster(2); cluster.pd_client.disable_default_operator(); let env = Arc::new(Environment::new(1)); @@ -25,7 +27,7 @@ fn test_gc_bypass_raft() { let pk = b"k1".to_vec(); let value = vec![b'x'; 300]; - let engine = cluster.engines.get(&leader_store).unwrap(); + let engine = cluster.get_engine(leader_store); for &start_ts in &[10, 20, 30, 40] { let commit_ts = start_ts + 5; @@ -37,11 +39,11 @@ fn test_gc_bypass_raft() { let key = Key::from_raw(b"k1").append_ts(start_ts.into()); let key = data_key(key.as_encoded()); - assert!(engine.kv.get_value(&key).unwrap().is_some()); + assert!(engine.get_value(&key).unwrap().is_some()); let key = Key::from_raw(b"k1").append_ts(commit_ts.into()); let key = data_key(key.as_encoded()); - assert!(engine.kv.get_value_cf(CF_WRITE, &key).unwrap().is_some()); + assert!(engine.get_value_cf(CF_WRITE, &key).unwrap().is_some()); } let node_ids = cluster.get_node_ids(); @@ -53,16 +55,16 @@ fn test_gc_bypass_raft() { region.set_end_key(b"k2".to_vec()); sync_gc(&gc_sched, region, 200.into()).unwrap(); - let engine = cluster.engines.get(&store_id).unwrap(); + let engine = cluster.get_engine(store_id); for &start_ts in &[10, 20, 30] { let commit_ts = start_ts + 5; let key = Key::from_raw(b"k1").append_ts(start_ts.into()); let key = data_key(key.as_encoded()); - assert!(engine.kv.get_value(&key).unwrap().is_none()); + assert!(engine.get_value(&key).unwrap().is_none()); let key = Key::from_raw(b"k1").append_ts(commit_ts.into()); let key = data_key(key.as_encoded()); - assert!(engine.kv.get_value_cf(CF_WRITE, &key).unwrap().is_none()); + assert!(engine.get_value_cf(CF_WRITE, &key).unwrap().is_none()); } } } diff --git a/tests/integrations/server/kv_service.rs b/tests/integrations/server/kv_service.rs index 
24b6a87bfa5..845ae2bc969 100644 --- a/tests/integrations/server/kv_service.rs +++ b/tests/integrations/server/kv_service.rs @@ -10,6 +10,7 @@ use std::{ use api_version::{ApiV1, ApiV1Ttl, ApiV2, KvFormat}; use concurrency_manager::ConcurrencyManager; +use engine_rocks::RocksEngine; use engine_traits::{ MiscExt, Peekable, RaftEngine, RaftEngineReadOnly, RaftLogBatch, SyncMutable, CF_DEFAULT, CF_LOCK, CF_RAFT, CF_WRITE, @@ -1383,7 +1384,8 @@ fn test_double_run_node() { let mut sim = cluster.sim.wl(); let node = sim.get_node(id).unwrap(); let pd_worker = LazyWorker::new("test-pd-worker"); - let simulate_trans = SimulateTransport::new(ChannelTransport::new()); + let simulate_trans = + SimulateTransport::<_, RocksEngine>::new(ChannelTransport::::new()); let tmp = Builder::new().prefix("test_cluster").tempdir().unwrap(); let snap_mgr = SnapManager::new(tmp.path().to_str().unwrap()); let coprocessor_host = CoprocessorHost::new(router, raftstore::coprocessor::Config::default()); @@ -2757,3 +2759,166 @@ fn test_pessimistic_lock_execution_tracking() { handle.join().unwrap(); } + +#[test_case(test_raftstore::must_new_cluster_and_kv_client)] +#[test_case(test_raftstore_v2::must_new_cluster_and_kv_client)] +fn test_mvcc_scan_memory_and_cf_locks() { + let (cluster, client, ctx) = new_cluster(); + + // Create both pessimistic and prewrite locks. + // The peer in memory limit is 512KiB, generate 1KiB key for pessimistic lock. + // So Writing 512 pessimistic locks may exceed the memory limit and later + // pessimistic locks would be written to the lock cf. 
+ let byte_slice: &[u8] = &[b'k'; 512]; + let start_ts = 11; + let prewrite_start_ts = start_ts - 1; + let num_keys = 1040; + let prewrite_primary_key = b"prewrite_primary"; + let val = b"value"; + let format_key = |i| format!("{:?}{:04}", byte_slice, i).as_bytes().to_vec(); + for i in 0..num_keys { + let key = format_key(i); + if i % 2 == 0 { + must_kv_pessimistic_lock(&client, ctx.clone(), key, start_ts); + } else { + let mut mutation = Mutation::default(); + mutation.set_op(Op::Put); + mutation.set_key(key); + mutation.set_value(val.to_vec()); + must_kv_prewrite_with( + &client, + ctx.clone(), + vec![mutation], + vec![], + prewrite_primary_key.to_vec(), + start_ts - 1, + 0, + false, + false, + ); + } + } + // Ensure the pessimistic locks are written to the memory. The first key should + // be written into the memory and the last key should be put to lock cf as + // memory limit is exceeded. + let engine = cluster.get_engine(1); + let cf_res = engine + .get_value_cf( + CF_LOCK, + keys::data_key(Key::from_raw(format_key(0).as_slice()).as_encoded()).as_slice(), + ) + .unwrap(); + assert!(cf_res.is_none()); + let cf_res = engine + .get_value_cf( + CF_LOCK, + keys::data_key(Key::from_raw(format_key(num_keys - 2).as_slice()).as_encoded()) + .as_slice(), + ) + .unwrap(); + assert!(cf_res.is_some()); + + // Scan lock, the pessimistic and prewrite results are returned. + // When limit is 0 or it's larger than num_keys, all keys should be returned. + // When limit is less than 512, in-memory pessimistic locks and prewrite locks + // should be returned. + // When limit is larger than 512, in-memory and lock cf pessimistic locks and + // prewrite locks should be returned. 
+ for scan_limit in [0, 128, 256, 512, num_keys, num_keys * 2] { + let scan_ts = 20; + let scan_lock_max_version = scan_ts; + let mut scan_lock_req = ScanLockRequest::default(); + scan_lock_req.set_context(ctx.clone()); + scan_lock_req.max_version = scan_lock_max_version; + scan_lock_req.limit = scan_limit as u32; + let scan_lock_resp = client.kv_scan_lock(&scan_lock_req).unwrap(); + assert!(!scan_lock_resp.has_region_error()); + let expected_key_num = if scan_limit == 0 || scan_limit >= num_keys { + num_keys + } else { + scan_limit + }; + assert_eq!(scan_lock_resp.locks.len(), expected_key_num); + + for (i, lock_info) in (0..expected_key_num).zip(scan_lock_resp.locks.iter()) { + let key = format_key(i); + if i % 2 == 0 { + assert_eq!(lock_info.lock_type, Op::PessimisticLock); + assert_eq!(lock_info.lock_version, start_ts); + assert_eq!(lock_info.key, key); + } else { + assert_eq!( + lock_info.lock_type, + Op::Put, + "i={:?} lock_info={:?} expected_key_num={:?}, scan_limit={:?}", + i, + lock_info, + expected_key_num, + scan_limit + ); + assert_eq!(lock_info.primary_lock, prewrite_primary_key); + assert_eq!(lock_info.lock_version, prewrite_start_ts); + assert_eq!(lock_info.key, key); + } + } + } + + // Scan with smaller ts returns empty result. + let mut scan_lock_req = ScanLockRequest::default(); + scan_lock_req.set_context(ctx.clone()); + scan_lock_req.max_version = prewrite_start_ts - 1; + let scan_lock_resp = client.kv_scan_lock(&scan_lock_req).unwrap(); + assert!(!scan_lock_resp.has_region_error()); + assert_eq!(scan_lock_resp.locks.len(), 0); + + // Roll back the prewrite locks. 
+ let rollback_start_version = prewrite_start_ts; + let mut rollback_req = BatchRollbackRequest::default(); + rollback_req.set_context(ctx.clone()); + rollback_req.start_version = rollback_start_version; + let keys = (0..num_keys) + .filter(|i| i % 2 != 0) + .map(|i| format_key(i)) + .collect(); + rollback_req.set_keys(keys); + let rollback_resp = client.kv_batch_rollback(&rollback_req).unwrap(); + assert!(!rollback_resp.has_region_error()); + assert!(!rollback_resp.has_error()); + + // Scan lock again after removing prewrite locks. + let mut scan_lock_req = ScanLockRequest::default(); + scan_lock_req.set_context(ctx.clone()); + scan_lock_req.max_version = start_ts + 1; + let scan_lock_resp = client.kv_scan_lock(&scan_lock_req).unwrap(); + assert!(!scan_lock_resp.has_region_error()); + assert_eq!(scan_lock_resp.locks.len(), num_keys / 2); + for (i, lock_info) in (0..num_keys / 2).zip(scan_lock_resp.locks.iter()) { + let key = format_key(i * 2); + assert_eq!(lock_info.lock_version, start_ts); + assert_eq!(lock_info.key, key); + assert_eq!(lock_info.lock_type, Op::PessimisticLock); + } + + // Pessimistic rollabck all the locks. Scan lock should return empty result. + let mut pessimsitic_rollback_req = PessimisticRollbackRequest::default(); + pessimsitic_rollback_req.start_version = start_ts; + pessimsitic_rollback_req.for_update_ts = start_ts; + pessimsitic_rollback_req.set_context(ctx.clone()); + let keys = (0..num_keys) + .filter(|i| i % 2 == 0) + .map(|i| format_key(i)) + .collect(); + pessimsitic_rollback_req.set_keys(keys); + let pessimistic_rollback_resp = client + .kv_pessimistic_rollback(&pessimsitic_rollback_req) + .unwrap(); + assert!(!pessimistic_rollback_resp.has_region_error()); + + // Scan lock again after all the cleanup. 
+ let mut scan_lock_req = ScanLockRequest::default(); + scan_lock_req.set_context(ctx); + scan_lock_req.max_version = start_ts + 1; + let scan_lock_resp = client.kv_scan_lock(&scan_lock_req).unwrap(); + assert!(!scan_lock_resp.has_region_error()); + assert_eq!(scan_lock_resp.locks.len(), 0); +} diff --git a/tests/integrations/server/lock_manager.rs b/tests/integrations/server/lock_manager.rs index 289b10303a8..2d8b8d326e3 100644 --- a/tests/integrations/server/lock_manager.rs +++ b/tests/integrations/server/lock_manager.rs @@ -10,6 +10,7 @@ use std::{ time::Duration, }; +use engine_rocks::RocksEngine; use grpcio::{ChannelBuilder, Environment}; use kvproto::{ kvrpcpb::*, @@ -69,7 +70,10 @@ fn deadlock(client: &TikvClient, ctx: Context, key1: &[u8], ts: u64) -> bool { resp.errors[0].has_deadlock() } -fn build_leader_client(cluster: &mut Cluster, key: &[u8]) -> (TikvClient, Context) { +fn build_leader_client( + cluster: &mut Cluster>, + key: &[u8], +) -> (TikvClient, Context) { let region_id = cluster.get_region_id(key); let leader = cluster.leader_of_region(region_id).unwrap(); let epoch = cluster.get_region_epoch(region_id); @@ -88,7 +92,11 @@ fn build_leader_client(cluster: &mut Cluster, key: &[u8]) -> (Tik } /// Creates a deadlock on the store containing key. -fn must_detect_deadlock(cluster: &mut Cluster, key: &[u8], ts: u64) { +fn must_detect_deadlock( + cluster: &mut Cluster>, + key: &[u8], + ts: u64, +) { // Sometimes, deadlocks can't be detected at once due to leader change, but it // will be detected. 
for _ in 0..5 { @@ -100,7 +108,10 @@ fn must_detect_deadlock(cluster: &mut Cluster, key: &[u8], ts: u6 panic!("failed to detect deadlock"); } -fn deadlock_detector_leader_must_be(cluster: &mut Cluster, store_id: u64) { +fn deadlock_detector_leader_must_be( + cluster: &mut Cluster>, + store_id: u64, +) { let leader_region = cluster.get_region(b""); assert_eq!( cluster @@ -115,7 +126,11 @@ fn deadlock_detector_leader_must_be(cluster: &mut Cluster, store_ .region_leader_must_be(leader_region.get_id(), leader_peer); } -fn must_transfer_leader(cluster: &mut Cluster, region_key: &[u8], store_id: u64) { +fn must_transfer_leader( + cluster: &mut Cluster>, + region_key: &[u8], + store_id: u64, +) { let region = cluster.get_region(region_key); let target_peer = find_peer_of_store(®ion, store_id); cluster.must_transfer_leader(region.get_id(), target_peer.clone()); @@ -130,7 +145,7 @@ fn must_transfer_leader(cluster: &mut Cluster, region_key: &[u8], /// REQUIRE: The source store must be the leader the region and the target store /// must not have this region. fn must_transfer_region( - cluster: &mut Cluster, + cluster: &mut Cluster>, region_key: &[u8], source_store_id: u64, target_store_id: u64, @@ -149,14 +164,18 @@ fn must_transfer_region( cluster.must_put(region_key, b"v"); } -fn must_split_region(cluster: &mut Cluster, region_key: &[u8], split_key: &[u8]) { +fn must_split_region( + cluster: &mut Cluster>, + region_key: &[u8], + split_key: &[u8], +) { let region = cluster.get_region(region_key); cluster.must_split(®ion, split_key); cluster.must_put(split_key, b"v"); } fn must_merge_region( - cluster: &mut Cluster, + cluster: &mut Cluster>, source_region_key: &[u8], target_region_key: &[u8], ) { @@ -179,7 +198,7 @@ fn find_peer_of_store(region: &Region, store_id: u64) -> Peer { /// Creates a cluster with only one region and store(1) is the leader of the /// region. 
-fn new_cluster_for_deadlock_test(count: usize) -> Cluster { +fn new_cluster_for_deadlock_test(count: usize) -> Cluster> { let mut cluster = new_server_cluster(0, count); cluster.cfg.pessimistic_txn.wait_for_lock_timeout = ReadableDuration::millis(500); cluster.cfg.pessimistic_txn.pipelined = false; diff --git a/tests/integrations/server/raft_client.rs b/tests/integrations/server/raft_client.rs index aad9ab7ceb1..2b51bb1f21b 100644 --- a/tests/integrations/server/raft_client.rs +++ b/tests/integrations/server/raft_client.rs @@ -21,8 +21,8 @@ use kvproto::{ use raft::eraftpb::Entry; use raftstore::errors::DiscardReason; use tikv::server::{ - self, load_statistics::ThreadLoadPool, raftkv::RaftRouterWrap, resolve, resolve::Callback, - Config, ConnectionBuilder, RaftClient, StoreAddrResolver, TestRaftStoreRouter, + load_statistics::ThreadLoadPool, raftkv::RaftRouterWrap, resolve, Config, ConnectionBuilder, + RaftClient, StoreAddrResolver, TestRaftStoreRouter, }; use tikv_kv::{FakeExtension, RaftExtension}; use tikv_util::{ @@ -32,24 +32,6 @@ use tikv_util::{ use super::*; -#[derive(Clone)] -pub struct StaticResolver { - port: u16, -} - -impl StaticResolver { - fn new(port: u16) -> StaticResolver { - StaticResolver { port } - } -} - -impl StoreAddrResolver for StaticResolver { - fn resolve(&self, _store_id: u64, cb: Callback) -> server::Result<()> { - cb(Ok(format!("localhost:{}", self.port))); - Ok(()) - } -} - fn get_raft_client(router: R, resolver: T) -> RaftClient where R: RaftExtension + Unpin + 'static, @@ -75,8 +57,16 @@ where RaftClient::new(0, builder) } -fn get_raft_client_by_port(port: u16) -> RaftClient { - get_raft_client(FakeExtension, StaticResolver::new(port)) +fn get_raft_client_by_port(port: u16) -> RaftClient { + get_raft_client( + FakeExtension, + resolve::MockStoreAddrResolver { + resolve_fn: Arc::new(move |_, cb| { + cb(Ok(format!("localhost:{}", port))); + Ok(()) + }), + }, + ) } #[derive(Clone)] @@ -177,7 +167,15 @@ fn 
test_raft_client_reconnect() { let (significant_msg_sender, _significant_msg_receiver) = mpsc::channel(); let router = TestRaftStoreRouter::new(tx, significant_msg_sender); let wrap = RaftRouterWrap::new(router); - let mut raft_client = get_raft_client(wrap, StaticResolver::new(port)); + let mut raft_client = get_raft_client( + wrap, + resolve::MockStoreAddrResolver { + resolve_fn: Arc::new(move |_, cb| { + cb(Ok(format!("localhost:{}", port))); + Ok(()) + }), + }, + ); (0..50).for_each(|_| raft_client.send(RaftMessage::default()).unwrap()); raft_client.flush(); diff --git a/tests/integrations/server/status_server.rs b/tests/integrations/server/status_server.rs index 3e1fbd4a9e8..a2921f77b95 100644 --- a/tests/integrations/server/status_server.rs +++ b/tests/integrations/server/status_server.rs @@ -45,7 +45,6 @@ fn test_region_meta_endpoint() { ConfigController::default(), Arc::new(SecurityConfig::default()), router, - std::env::temp_dir(), None, GrpcServiceManager::dummy(), ) diff --git a/tests/integrations/server_encryption.rs b/tests/integrations/server_encryption.rs index 7c88afde76a..041b15fd953 100644 --- a/tests/integrations/server_encryption.rs +++ b/tests/integrations/server_encryption.rs @@ -1,8 +1,9 @@ // Copyright 2020 TiKV Project Authors. Licensed under Apache-2.0. 
+use engine_rocks::RocksEngine; use test_raftstore::*; -fn test_snapshot_encryption(cluster: &mut Cluster) { +fn test_snapshot_encryption>(cluster: &mut Cluster) { configure_for_encryption(cluster); cluster.pd_client.disable_default_operator(); let r1 = cluster.run_conf_change(); diff --git a/tests/integrations/storage/test_raft_storage.rs b/tests/integrations/storage/test_raft_storage.rs index 684b7a261fb..1b3ba6dc43b 100644 --- a/tests/integrations/storage/test_raft_storage.rs +++ b/tests/integrations/storage/test_raft_storage.rs @@ -8,6 +8,7 @@ use std::{ use api_version::{ApiV1, KvFormat}; use collections::HashMap; +use engine_rocks::RocksEngine; use error_code::{raftstore::STALE_COMMAND, ErrorCodeExt}; use kvproto::kvrpcpb::Context; use test_raftstore::*; @@ -25,8 +26,8 @@ use tikv_util::HandyRwLock; use txn_types::{Key, Mutation, TimeStamp}; fn new_raft_storage() -> ( - Cluster, - SyncTestStorageApiV1, + Cluster>, + SyncTestStorageApiV1>, Context, ) { new_raft_storage_with_store_count::(1, "") @@ -234,7 +235,7 @@ fn write_test_data( } fn check_data( - cluster: &mut Cluster, + cluster: &mut Cluster>, storages: &HashMap>, test_data: &[(Vec, Vec)], ts: impl Into, diff --git a/tests/integrations/storage/test_raftkv.rs b/tests/integrations/storage/test_raftkv.rs index 1fb8075e10f..4129d5bc721 100644 --- a/tests/integrations/storage/test_raftkv.rs +++ b/tests/integrations/storage/test_raftkv.rs @@ -4,6 +4,7 @@ use std::{ thread, time, }; +use engine_rocks::RocksEngine as RocksDb; use engine_traits::{CfName, IterOptions, CF_DEFAULT}; use futures::executor::block_on; use kvproto::kvrpcpb::{Context, KeyRange}; @@ -323,7 +324,7 @@ fn test_invalid_read_index_when_no_leader() { true, ); request.mut_header().set_peer(follower.clone()); - let (cb, mut rx) = make_cb(&request); + let (cb, mut rx) = make_cb::(&request); cluster .sim .rl() diff --git a/tests/integrations/storage/test_region_info_accessor.rs b/tests/integrations/storage/test_region_info_accessor.rs index 
2df7238e1a9..344f9c6607e 100644 --- a/tests/integrations/storage/test_region_info_accessor.rs +++ b/tests/integrations/storage/test_region_info_accessor.rs @@ -3,12 +3,15 @@ use std::{sync::mpsc::channel, thread, time::Duration}; use collections::HashMap; +use engine_rocks::RocksEngine; use kvproto::metapb::Region; use raftstore::coprocessor::{RegionInfoAccessor, RegionInfoProvider}; use test_raftstore::*; use tikv_util::HandyRwLock; -fn prepare_cluster(cluster: &mut Cluster) -> Vec { +fn prepare_cluster>( + cluster: &mut Cluster, +) -> Vec { for i in 0..15 { let i = i + b'0'; let key = vec![b'k', i]; diff --git a/tests/integrations/storage/test_titan.rs b/tests/integrations/storage/test_titan.rs index 9c3eeec0c83..fc84d56fd00 100644 --- a/tests/integrations/storage/test_titan.rs +++ b/tests/integrations/storage/test_titan.rs @@ -159,7 +159,9 @@ fn test_delete_files_in_range_for_titan() { cfg.rocksdb.defaultcf.titan.min_gc_batch_size = ReadableSize(0); cfg.rocksdb.defaultcf.titan.discardable_ratio = 0.4; cfg.rocksdb.defaultcf.titan.min_blob_size = ReadableSize(0); - let resource = cfg.rocksdb.build_resources(Default::default()); + let resource = cfg + .rocksdb + .build_resources(Default::default(), cfg.storage.engine); let kv_db_opts = cfg.rocksdb.build_opt(&resource, cfg.storage.engine); let kv_cfs_opts = cfg.rocksdb.build_cf_opts( &cfg.rocksdb.build_cf_resources(cache), @@ -369,11 +371,12 @@ fn test_delete_files_in_range_for_titan() { build_sst_cf_file_list::( &mut cf_file, &engines.kv, - &engines.kv.snapshot(), + &engines.kv.snapshot(None), b"", b"{", u64::MAX, &limiter, + None, ) .unwrap(); let mut cf_file_write = CfFile::new( @@ -385,11 +388,12 @@ fn test_delete_files_in_range_for_titan() { build_sst_cf_file_list::( &mut cf_file_write, &engines.kv, - &engines.kv.snapshot(), + &engines.kv.snapshot(None), b"", b"{", u64::MAX, &limiter, + None, ) .unwrap();