diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index b2e1c37bc44..8add755c48d 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -20,7 +20,7 @@ To build TiKV you'll need to at least have the following installed: * `cmake` - Build tool (required for gRPC) * `awk` - Pattern scanning/processing language * [`protoc`](https://github.com/protocolbuffers/protobuf/releases) - Google protocol buffer compiler -* C++ compiler - gcc 5+ (required for gRPC) +* `C++ compiler` - gcc 5+ or clang (required for gRPC) If you are targeting platforms other than x86_64/aarch64 Linux or macOS, you'll also need: diff --git a/Cargo.lock b/Cargo.lock index 6cb187720b3..1975b75a856 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -105,8 +105,6 @@ dependencies = [ "log_wrappers", "match-template", "panic_hook", - "thiserror", - "tikv_alloc", "tikv_util", "txn_types", ] @@ -175,9 +173,9 @@ dependencies = [ [[package]] name = "async-compression" -version = "0.3.14" +version = "0.4.12" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "345fd392ab01f746c717b1357165b76f0b67a60192007b234058c9045fdcf695" +checksum = "fec134f64e2bc57411226dfc4e52dec859ddfc7e711fc5e07b612584f000e4aa" dependencies = [ "futures-core", "futures-io", @@ -199,9 +197,9 @@ dependencies = [ [[package]] name = "async-speed-limit" -version = "0.4.0" +version = "0.4.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "481ce9cb6a828f4679495f7376cb6779978d925dd9790b99b48d1bbde6d0f00b" +checksum = "97c1688bb8e4eb3dcd68a8b0e5a81deae887c67362bb482a902b785e83ac2edc" dependencies = [ "futures-core", "futures-io", @@ -232,13 +230,13 @@ dependencies = [ [[package]] name = "async-trait" -version = "0.1.58" +version = "0.1.69" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1e805d94e6b5001b651426cf4cd446b1ab5f319d27bab5c644f61de0a804360c" +checksum = "7b2d0f03b3640e3a630367e40c468cb7f309529c708ed1d88597047b0e7c6ef7" dependencies = [ "proc-macro2", "quote", - "syn 1.0.103", + "syn 2.0.79", ] [[package]] @@ -286,14 +284,8 @@ dependencies = [ "fail", "futures 0.3.15", "futures-util", - "grpcio", - "http", - "hyper", - "hyper-tls", "kvproto", - "lazy_static", "md5", - "prometheus", "rusoto_core", "rusoto_credential", "rusoto_kms", @@ -321,7 +313,6 @@ dependencies = [ "azure_storage_blobs", "base64 0.13.0", "cloud", - "fail", "futures 0.3.15", "futures-util", "kvproto", @@ -479,27 +470,21 @@ dependencies = [ "lazy_static", "log_wrappers", "online_config", - "pd_client", "prometheus", "raft", "raftstore", "rand 0.8.5", "resource_control", - "security", - "serde", - "serde_derive", "slog", "slog-global", "tempfile", "thiserror", - "tidb_query_common", "tikv", "tikv_alloc", "tikv_util", "tokio", "tokio-stream", "txn_types", - "yatp", ] [[package]] @@ -508,13 +493,12 @@ version = "0.1.0" dependencies = [ "async-compression", "async-trait", - "bytes", "chrono", "concurrency_manager", - "crossbeam", - "crossbeam-channel", + "crc64fast", "dashmap", "encryption", + "encryption_export", "engine_panic", "engine_rocks", "engine_test", @@ -524,10 +508,8 @@ dependencies = [ "fail", "file_system", "futures 0.3.15", - "futures-io", "grpcio", "hex 0.4.2", - "indexmap 1.6.2", "kvproto", "lazy_static", "log_wrappers", @@ -541,9 +523,7 @@ dependencies = [ "raft", "raftstore", "rand 0.8.5", - "regex", "resolved_ts", - "security", "slog", "slog-global", "tempdir", @@ -555,7 +535,6 @@ dependencies = [ "thiserror", "tidb_query_datatype", "tikv", - "tikv_alloc", "tikv_kv", "tikv_util", "tokio", @@ -567,7 +546,6 
@@ dependencies = [ "url", "uuid 0.8.2", "walkdir", - "yatp", ] [[package]] @@ -586,7 +564,6 @@ checksum = "a4a4ddaa51a5bc52a6948f74c06d20aaaddb71924eab79b8c97a8c556e942d6a" name = "batch-system" version = "0.1.0" dependencies = [ - "collections", "criterion", "crossbeam", "dashmap", @@ -599,7 +576,6 @@ dependencies = [ "prometheus", "resource_control", "serde", - "serde_derive", "slog", "slog-global", "tikv_alloc", @@ -690,7 +666,7 @@ dependencies = [ "regex", "rustc-hash", "shlex 1.3.0", - "syn 2.0.43", + "syn 2.0.79", ] [[package]] @@ -767,9 +743,9 @@ checksum = "7f30e7476521f6f8af1a1c4c0b8cc94f0bee37d91763d0ca2665f299b6cd8aec" [[package]] name = "bytemuck" -version = "1.9.1" +version = "1.16.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cdead85bdec19c194affaeeb670c0e41fe23de31459efd1c174d049269cf02cc" +checksum = "102087e286b4677862ea56cf8fc58bb2cdfa8725c40ffb80fe3a008eb7f2fc83" [[package]] name = "byteorder" @@ -845,30 +821,22 @@ checksum = "926013f2860c46252efceabb19f4a6b308197505082c609025aa6706c011d427" name = "causal_ts" version = "0.0.1" dependencies = [ - "api_version", "async-trait", "criterion", - "engine_rocks", - "engine_traits", "enum_dispatch", "error_code", - "fail", "futures 0.3.15", - "kvproto", "lazy_static", - "log_wrappers", "parking_lot 0.12.1", "pd_client", "prometheus", "prometheus-static-metric", - "raft", "serde", "serde_derive", "slog", "slog-global", "test_pd_client", "thiserror", - "tikv_alloc", "tikv_util", "tokio", "txn_types", @@ -899,7 +867,6 @@ dependencies = [ "engine_traits", "fail", "futures 0.3.15", - "futures-timer", "getset", "grpcio", "keys", @@ -918,7 +885,6 @@ dependencies = [ "semver 1.0.4", "slog", "slog-global", - "tempfile", "test_pd_client", "test_raftstore", "test_util", @@ -1018,17 +984,16 @@ dependencies = [ "async-trait", "derive_more", "error_code", - "fail", + "futures 0.3.15", "futures-io", "kvproto", "lazy_static", - "openssl", "prometheus", "protobuf", - "rusoto_core", "thiserror", "tikv_util", "url", + "uuid 0.8.2", ] [[package]] @@ -1148,9 +1113,9 @@ checksum = "ea221b5284a47e40033bf9b66f35f984ec0ea2931eb03505246cd27a963f981b" [[package]] name = "cpp_demangle" -version = "0.4.3" +version = "0.4.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7e8227005286ec39567949b33df9896bcadfa6051bccca2488129f108ca23119" +checksum = "96e58d342ad113c2b878f16d5d034c03be492ae460cdbc02b7f0f2284d310c7d" dependencies = [ "cfg-if 1.0.0", ] @@ -1309,8 +1274,8 @@ name = "crossbeam-skiplist" version = "0.1.3" dependencies = [ "crossbeam-epoch", + "crossbeam-skiplist 0.1.3 (registry+https://github.com/rust-lang/crates.io-index)", "crossbeam-utils", - "rand 0.8.5", ] [[package]] @@ -1334,7 +1299,6 @@ name = "crypto" version = "0.0.1" dependencies = [ "openssl", - "openssl-sys", "slog", "slog-global", ] @@ -1509,12 +1473,10 @@ version = "0.0.1" dependencies = [ "async-trait", "byteorder", - "bytes", "cloud", "crc32fast", "crossbeam", "crypto", - "derive_more", "error_code", "fail", "file_system", @@ -1528,6 +1490,7 @@ dependencies = [ "openssl", "prometheus", "protobuf", + "rand 0.8.5", "serde", "serde_derive", "slog", @@ -1535,7 +1498,6 @@ dependencies = [ "tempfile", "test_util", "thiserror", - "tikv_alloc", "tikv_util", "tokio", "toml", @@ -1546,17 +1508,13 @@ dependencies = [ name = "encryption_export" version = "0.0.1" dependencies = [ - "async-trait", "aws", "azure", "cloud", - "derive_more", "encryption", - "error_code", "file_system", "gcp", "kvproto", - "openssl", "protobuf", 
"rust-ini", "slog", @@ -1573,8 +1531,6 @@ dependencies = [ "engine_traits", "kvproto", "raft", - "tikv_alloc", - "tikv_util", "tracker", "txn_types", ] @@ -1605,7 +1561,6 @@ dependencies = [ "regex", "rocksdb", "serde", - "serde_derive", "slog", "slog-global", "slog_derive", @@ -1623,16 +1578,12 @@ name = "engine_rocks_helper" version = "0.1.0" dependencies = [ "engine_rocks", - "engine_test", "engine_traits", "fail", - "futures 0.3.15", "keys", "kvproto", "lazy_static", - "pd_client", "prometheus", - "protobuf", "raftstore", "slog", "slog-global", @@ -1712,7 +1663,6 @@ dependencies = [ name = "engine_test" version = "0.0.1" dependencies = [ - "collections", "encryption", "engine_panic", "engine_rocks", @@ -1720,8 +1670,6 @@ dependencies = [ "file_system", "raft_log_engine", "tempfile", - "tikv_alloc", - "tikv_util", ] [[package]] @@ -1781,37 +1729,19 @@ dependencies = [ "file_system", "keys", "kvproto", - "lazy_static", "log_wrappers", "protobuf", "raft", "serde", - "serde_derive", "slog", "slog-global", "thiserror", - "tikv_alloc", "tikv_util", "toml", "tracker", "txn_types", ] -[[package]] -name = "engine_traits_tests" -version = "0.0.1" -dependencies = [ - "encryption", - "encryption_export", - "engine_test", - "engine_traits", - "kvproto", - "panic_hook", - "tempfile", - "test_util", - "tikv_alloc", -] - [[package]] name = "enum_dispatch" version = "0.3.8" @@ -1880,11 +1810,9 @@ dependencies = [ name = "error_code" version = "0.0.1" dependencies = [ - "grpcio", "kvproto", "lazy_static", "raft", - "serde", "tikv_alloc", ] @@ -1909,9 +1837,9 @@ dependencies = [ "async-trait", "aws", "azure", + "chrono", "cloud", "encryption", - "engine_traits", "file_system", "futures 0.3.15", "futures-io", @@ -1919,11 +1847,12 @@ dependencies = [ "gcp", "kvproto", "lazy_static", - "matches", "openssl", "prometheus", "rand 0.8.5", "rust-ini", + "serde", + "serde_json", "slog", "slog-global", "structopt", @@ -1933,6 +1862,8 @@ dependencies = [ "tokio", "tokio-util", "url", + "uuid 0.8.2", + "walkdir", ] [[package]] @@ -1985,8 +1916,6 @@ dependencies = [ "prometheus-static-metric", "rand 0.8.5", "serde", - "slog", - "slog-global", "strum 0.20.0", "tempfile", "thread_local", @@ -2182,9 +2111,9 @@ dependencies = [ [[package]] name = "futures-channel" -version = "0.3.15" +version = "0.3.31" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e682a68b29a882df0545c143dc3646daefe80ba479bcdede94d5a703de2871e2" +checksum = "2dff15bf788c671c1934e366d07e30c1814a8ef514e1af724a602e8a2fbe1b10" dependencies = [ "futures-core", "futures-sink", @@ -2192,9 +2121,9 @@ dependencies = [ [[package]] name = "futures-core" -version = "0.3.15" +version = "0.3.31" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0402f765d8a89a26043b889b26ce3c4679d268fa6bb22cd7c6aad98340e179d1" +checksum = "05f29059c0c2090612e8d742178b0580d2dc940c837851ad723096f87af6663e" [[package]] name = "futures-executor" @@ -2210,9 +2139,9 @@ dependencies = [ [[package]] name = "futures-io" -version = "0.3.15" +version = "0.3.31" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "acc499defb3b348f8d8f3f66415835a9131856ff7714bf10dadfc4ec4bdb29a1" +checksum = "9e5c1b78ca4aae1ac06c48a526a655760685149f0d465d21f37abfe57ce075c6" [[package]] name = "futures-lite" @@ -2231,28 +2160,26 @@ dependencies = [ [[package]] name = "futures-macro" -version = "0.3.15" +version = "0.3.31" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"a4c40298486cdf52cc00cd6d6987892ba502c7656a16a4192a9992b1ccedd121" +checksum = "162ee34ebcb7c64a8abebc059ce0fee27c2262618d7b60ed8faf72fef13c3650" dependencies = [ - "autocfg", - "proc-macro-hack", "proc-macro2", "quote", - "syn 1.0.103", + "syn 2.0.79", ] [[package]] name = "futures-sink" -version = "0.3.15" +version = "0.3.31" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a57bead0ceff0d6dde8f465ecd96c9338121bb7717d3e7b108059531870c4282" +checksum = "e575fab7d1e0dcb8d0c7bcf9a63ee213816ab51902e6d244a95819acacf1d4f7" [[package]] name = "futures-task" -version = "0.3.15" +version = "0.3.31" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8a16bef9fc1a4dddb5bee51c989e3fbba26569cbb0e31f5b303c184e3dd33dae" +checksum = "f90f7dce0722e95104fcb095585910c0977252f286e354b5e3bd38902cd99988" [[package]] name = "futures-timer" @@ -2262,11 +2189,10 @@ checksum = "e64b03909df88034c26dc1547e8970b91f98bdb65165d6a4e9110d94263dbb2c" [[package]] name = "futures-util" -version = "0.3.15" +version = "0.3.31" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "feb5c238d27e2bf94ffdfd27b2c29e3df4a68c4193bb6427384259e2bf191967" +checksum = "9fa08315bb612088cc391249efdc3bc77536f16c91f6cf495e6fbe85b20a4a81" dependencies = [ - "autocfg", "futures 0.1.31", "futures-channel", "futures-core", @@ -2277,8 +2203,6 @@ dependencies = [ "memchr", "pin-project-lite", "pin-utils", - "proc-macro-hack", - "proc-macro-nested", "slab", ] @@ -2354,7 +2278,6 @@ dependencies = [ "base64 0.13.0", "cloud", "crc32c", - "crypto", "futures-util", "http", "hyper", @@ -2556,7 +2479,6 @@ dependencies = [ "ordered-float", "parking_lot 0.12.1", "prometheus", - "prometheus-static-metric", "slog", "slog-global", "tikv_util", @@ -2683,6 +2605,8 @@ dependencies = [ "engine_rocks", "engine_traits", "fail", + "in_memory_engine", + "keys", "kvproto", "lazy_static", "online_config", @@ -2690,10 +2614,10 @@ dependencies = [ "prometheus-static-metric", "raft", "raftstore", - "range_cache_memory_engine", "slog", "slog-global", "tempfile", + "test_util", "tikv_util", "txn_types", ] @@ -2775,6 +2699,51 @@ version = "1.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "cb56e1aa765b4b4f3aadfab769793b7087bb03a4ea4920644a6d238e2df5b9ed" +[[package]] +name = "in_memory_engine" +version = "0.0.1" +dependencies = [ + "bytes", + "collections", + "criterion", + "crossbeam", + "crossbeam-skiplist 0.1.3", + "dashmap", + "engine_rocks", + "engine_traits", + "fail", + "futures 0.3.15", + "hex 0.4.2", + "keys", + "kvproto", + "lazy_static", + "libc 0.2.151", + "log_wrappers", + "online_config", + "parking_lot 0.12.1", + "pd_client", + "prometheus", + "prometheus-static-metric", + "proptest", + "raftstore", + "rand 0.8.5", + "security", + "serde", + "serde_json", + "slog", + "slog-global", + "smallvec", + "strum 0.20.0", + "tempfile", + "test_pd", + "test_util", + "thiserror", + "tikv_util", + "tokio", + "txn_types", + "yatp", +] + [[package]] name = "indexmap" version = "1.6.2" @@ -2984,7 +2953,7 @@ dependencies = [ [[package]] name = "kvproto" version = "0.0.2" -source = "git+https://github.com/pingcap/kvproto.git#0ad602a429126e39bd4a87c687be64f9ae173c02" +source = "git+https://github.com/pingcap/kvproto.git#4a3e17f5e62dc3999e2c0f63293fdeffced80626" dependencies = [ "futures 0.3.15", "grpcio", @@ -3119,7 +3088,7 @@ dependencies = [ [[package]] name = "librocksdb_sys" version = "0.1.0" -source = 
"git+https://github.com/tikv/rust-rocksdb.git#224bed6ffa29ba3bbe9a91ef6bda7186200c59a8" +source = "git+https://github.com/tikv/rust-rocksdb.git#2bb1e4e32b9e45cf3fd8210766a9db38eacd5e4d" dependencies = [ "bindgen 0.65.1", "bzip2-sys", @@ -3138,7 +3107,7 @@ dependencies = [ [[package]] name = "libtitan_sys" version = "0.0.1" -source = "git+https://github.com/tikv/rust-rocksdb.git#224bed6ffa29ba3bbe9a91ef6bda7186200c59a8" +source = "git+https://github.com/tikv/rust-rocksdb.git#2bb1e4e32b9e45cf3fd8210766a9db38eacd5e4d" dependencies = [ "bzip2-sys", "cc", @@ -3717,13 +3686,13 @@ dependencies = [ [[package]] name = "num-derive" -version = "0.4.0" +version = "0.4.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9e6a0fd4f737c707bd9086cc16c925f294943eb62eb71499e9fd4cf71f8b9f4e" +checksum = "ed3955f1a9c7c0c15e092f9c887db08b1fc683305fdf6eb6684f22555355e202" dependencies = [ "proc-macro2", "quote", - "syn 2.0.43", + "syn 2.0.79", ] [[package]] @@ -3848,7 +3817,6 @@ dependencies = [ "chrono", "online_config_derive", "serde", - "serde_derive", "toml", ] @@ -4030,7 +3998,6 @@ dependencies = [ "grpcio", "kvproto", "lazy_static", - "log", "log_wrappers", "prometheus", "prometheus-static-metric", @@ -4231,14 +4198,14 @@ version = "0.3.20" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e30165d31df606f5726b090ec7592c308a0eaf61721ff64c9a3018e344a8753e" dependencies = [ - "portable-atomic 1.6.0", + "portable-atomic 1.9.0", ] [[package]] name = "portable-atomic" -version = "1.6.0" +version = "1.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7170ef9988bc169ba16dd36a7fa041e5c4cbeb6a35b76d4c03daded371eae7c0" +checksum = "cc9c68a3f6da06753e9335d63e27f6b9754dd1920d941135b7ea8224f141adb2" [[package]] name = "pprof" @@ -4276,7 +4243,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3b69d39aab54d069e7f2fe8cb970493e7834601ca2d8c65fd7bbd183578080d1" dependencies = [ "proc-macro2", - "syn 2.0.43", + "syn 2.0.79", ] [[package]] @@ -4313,23 +4280,11 @@ dependencies = [ "version_check 0.9.4", ] -[[package]] -name = "proc-macro-hack" -version = "0.5.19" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dbf0c48bc1d91375ae5c3cd81e3722dff1abcf81a30960240640d223f59fe0e5" - -[[package]] -name = "proc-macro-nested" -version = "0.1.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "369a6ed065f249a159e06c45752c780bda2fb53c995718f9e484d08daa9eb42e" - [[package]] name = "proc-macro2" -version = "1.0.76" +version = "1.0.87" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "95fc56cda0b5c3325f5fbbd7ff9fda9e02bb00bb3dac51252d2f1bfa1cb8cc8c" +checksum = "b3e4daa0dcf6feba26f985457cdf104d4b4256fc5a09547140f3631bb076b19a" dependencies = [ "unicode-ident", ] @@ -4545,6 +4500,7 @@ dependencies = [ "health_controller", "hex 0.4.2", "hyper", + "in_memory_engine", "itertools", "keys", "kvproto", @@ -4566,7 +4522,6 @@ dependencies = [ "raft_log_engine", "raftstore", "rand 0.8.5", - "range_cache_memory_engine", "regex", "resolved_ts", "resource_control", @@ -4712,9 +4667,9 @@ dependencies = [ [[package]] name = "quote" -version = "1.0.28" +version = "1.0.37" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1b9ab9c7eadfd8df19006f1cf1a4aed13540ed5cbc047010ece5826e10825488" +checksum = "b5b9d34b8991d19d98081b46eacdd8eb58c6f2b201139f7c5f643cc155a633af" dependencies = [ "proc-macro2", ] @@ -4753,7 +4708,7 @@ 
dependencies = [ "lz4-sys", "memmap2", "nix 0.26.2", - "num-derive 0.4.0", + "num-derive 0.4.2", "num-traits", "parking_lot 0.12.1", "prometheus", @@ -4787,14 +4742,8 @@ dependencies = [ "engine_traits", "file_system", "kvproto", - "lazy_static", - "num_cpus", - "online_config", - "protobuf", "raft", "raft-engine", - "serde", - "serde_derive", "slog", "slog-global", "tempfile", @@ -4826,24 +4775,19 @@ dependencies = [ "error_code", "fail", "file_system", - "fs2", "futures 0.3.15", "futures-util", "getset", - "grpcio-health", "health_controller", - "hybrid_engine", "into_other", "itertools", "keys", "kvproto", "lazy_static", - "log", "log_wrappers", "memory_trace_macros", "online_config", "openssl", - "ordered-float", "panic_hook", "parking_lot 0.12.1", "pd_client", @@ -4853,7 +4797,6 @@ dependencies = [ "raft", "raft-proto", "rand 0.8.5", - "range_cache_memory_engine", "resource_control", "resource_metering", "serde", @@ -4865,7 +4808,6 @@ dependencies = [ "smallvec", "sst_importer", "strum 0.20.0", - "strum_macros 0.24.3", "tempfile", "test_sst_importer", "thiserror", @@ -4914,7 +4856,6 @@ dependencies = [ "error_code", "fail", "file_system", - "fs2", "futures 0.3.15", "health_controller", "keys", @@ -4933,7 +4874,6 @@ dependencies = [ "service", "slog", "slog-global", - "smallvec", "sst_importer", "tempfile", "test_pd", @@ -5064,49 +5004,6 @@ dependencies = [ "rand_core 0.6.2", ] -[[package]] -name = "range_cache_memory_engine" -version = "0.0.1" -dependencies = [ - "bytes", - "collections", - "crossbeam", - "crossbeam-skiplist 0.1.3", - "dashmap", - "engine_rocks", - "engine_traits", - "fail", - "futures 0.3.15", - "hex 0.4.2", - "keys", - "kvproto", - "lazy_static", - "libc 0.2.151", - "log_wrappers", - "online_config", - "parking_lot 0.12.1", - "pd_client", - "prometheus", - "prometheus-static-metric", - "proptest", - "raftstore", - "rand 0.8.5", - "security", - "serde", - "serde_derive", - "serde_json", - "slog", - "slog-global", - "tempfile", - "test_pd", - "test_pd_client", - "test_util", - "thiserror", - "tikv_util", - "txn_types", - "yatp", -] - [[package]] name = "raw-cpuid" version = "10.2.0" @@ -5264,18 +5161,15 @@ version = "0.0.1" dependencies = [ "collections", "concurrency_manager", - "crossbeam", "engine_rocks", "engine_traits", "fail", "futures 0.3.15", "grpcio", - "hex 0.4.2", "kvproto", "lazy_static", "log_wrappers", "online_config", - "panic_hook", "pd_client", "prometheus", "protobuf", @@ -5300,10 +5194,8 @@ dependencies = [ name = "resource_control" version = "0.0.1" dependencies = [ - "byteorder", "collections", "crossbeam", - "crossbeam-skiplist 0.1.3", "dashmap", "fail", "file_system", @@ -5323,7 +5215,6 @@ dependencies = [ "slog-global", "strum 0.20.0", "test_pd", - "test_pd_client", "tikv_util", "tokio-timer", "yatp", @@ -5339,12 +5230,9 @@ dependencies = [ "grpcio", "kvproto", "lazy_static", - "libc 0.2.151", - "log", "online_config", "pdqselect", "pin-project", - "procinfo 0.4.2 (git+https://github.com/tikv/procinfo-rs?rev=7693954bd1dd86eb1709572fd7b62fd5f7ff2ea1)", "prometheus", "rand 0.8.5", "serde", @@ -5398,7 +5286,7 @@ dependencies = [ [[package]] name = "rocksdb" version = "0.3.0" -source = "git+https://github.com/tikv/rust-rocksdb.git#224bed6ffa29ba3bbe9a91ef6bda7186200c59a8" +source = "git+https://github.com/tikv/rust-rocksdb.git#2bb1e4e32b9e45cf3fd8210766a9db38eacd5e4d" dependencies = [ "libc 0.2.151", "librocksdb_sys", @@ -5407,7 +5295,7 @@ dependencies = [ [[package]] name = "rusoto_core" version = "0.46.0" -source = 
"git+https://github.com/tikv/rusoto?branch=gh1482-s3-addr-styles#cc733208600bdb15a13940d6930c1fbd4ab604f2" +source = "git+https://github.com/tikv/rusoto?branch=gh1482-s3-addr-styles#2b142c1792062a7a3a8317610d78dd141ab4223d" dependencies = [ "async-trait", "base64 0.13.0", @@ -5431,7 +5319,7 @@ dependencies = [ [[package]] name = "rusoto_credential" version = "0.46.0" -source = "git+https://github.com/tikv/rusoto?branch=gh1482-s3-addr-styles#cc733208600bdb15a13940d6930c1fbd4ab604f2" +source = "git+https://github.com/tikv/rusoto?branch=gh1482-s3-addr-styles#2b142c1792062a7a3a8317610d78dd141ab4223d" dependencies = [ "async-trait", "chrono", @@ -5448,7 +5336,7 @@ dependencies = [ [[package]] name = "rusoto_kms" version = "0.46.0" -source = "git+https://github.com/tikv/rusoto?branch=gh1482-s3-addr-styles#cc733208600bdb15a13940d6930c1fbd4ab604f2" +source = "git+https://github.com/tikv/rusoto?branch=gh1482-s3-addr-styles#2b142c1792062a7a3a8317610d78dd141ab4223d" dependencies = [ "async-trait", "bytes", @@ -5461,7 +5349,7 @@ dependencies = [ [[package]] name = "rusoto_mock" version = "0.46.0" -source = "git+https://github.com/tikv/rusoto?branch=gh1482-s3-addr-styles#cc733208600bdb15a13940d6930c1fbd4ab604f2" +source = "git+https://github.com/tikv/rusoto?branch=gh1482-s3-addr-styles#2b142c1792062a7a3a8317610d78dd141ab4223d" dependencies = [ "async-trait", "chrono", @@ -5475,7 +5363,7 @@ dependencies = [ [[package]] name = "rusoto_s3" version = "0.46.0" -source = "git+https://github.com/tikv/rusoto?branch=gh1482-s3-addr-styles#cc733208600bdb15a13940d6930c1fbd4ab604f2" +source = "git+https://github.com/tikv/rusoto?branch=gh1482-s3-addr-styles#2b142c1792062a7a3a8317610d78dd141ab4223d" dependencies = [ "async-trait", "bytes", @@ -5489,7 +5377,7 @@ dependencies = [ [[package]] name = "rusoto_signature" version = "0.46.0" -source = "git+https://github.com/tikv/rusoto?branch=gh1482-s3-addr-styles#cc733208600bdb15a13940d6930c1fbd4ab604f2" +source = "git+https://github.com/tikv/rusoto?branch=gh1482-s3-addr-styles#2b142c1792062a7a3a8317610d78dd141ab4223d" dependencies = [ "base64 0.13.0", "bytes", @@ -5511,7 +5399,7 @@ dependencies = [ [[package]] name = "rusoto_sts" version = "0.46.0" -source = "git+https://github.com/tikv/rusoto?branch=gh1482-s3-addr-styles#cc733208600bdb15a13940d6930c1fbd4ab604f2" +source = "git+https://github.com/tikv/rusoto?branch=gh1482-s3-addr-styles#2b142c1792062a7a3a8317610d78dd141ab4223d" dependencies = [ "async-trait", "bytes", @@ -5654,13 +5542,10 @@ dependencies = [ "collections", "encryption", "grpcio", - "kvproto", "log_wrappers", "serde", "serde_derive", - "serde_json", "tempfile", - "tikv_util", ] [[package]] @@ -5872,17 +5757,14 @@ dependencies = [ "error_code", "fail", "file_system", - "fs2", "futures 0.3.15", "grpcio", - "grpcio-health", "health_controller", - "hex 0.4.2", "hybrid_engine", + "in_memory_engine", "keys", "kvproto", "libc 0.2.151", - "log", "log_wrappers", "pd_client", "prometheus", @@ -5891,7 +5773,6 @@ dependencies = [ "raft_log_engine", "raftstore", "raftstore-v2", - "range_cache_memory_engine", "resolved_ts", "resource_control", "resource_metering", @@ -5907,8 +5788,6 @@ dependencies = [ "tikv_alloc", "tikv_util", "tokio", - "toml", - "txn_types", "yatp", ] @@ -6083,33 +5962,26 @@ name = "snap_recovery" version = "0.1.0" dependencies = [ "chrono", - "encryption", "encryption_export", "engine_rocks", "engine_traits", "futures 0.3.15", "grpcio", - "itertools", "keys", "kvproto", "lazy_static", - "log", "pd_client", "prometheus", "prometheus-static-metric", 
- "protobuf", "raft_log_engine", "raftstore", "slog", "slog-global", - "structopt", "tempfile", "thiserror", "tikv", - "tikv_alloc", "tikv_util", "tokio", - "toml", "txn_types", ] @@ -6156,10 +6028,12 @@ name = "sst_importer" version = "0.1.0" dependencies = [ "api_version", + "async-compression", "collections", "crc32fast", "dashmap", "encryption", + "encryption_export", "engine_rocks", "engine_test", "engine_traits", @@ -6189,6 +6063,7 @@ dependencies = [ "tikv_alloc", "tikv_util", "tokio", + "tokio-util", "txn_types", "uuid 0.8.2", ] @@ -6277,19 +6152,6 @@ dependencies = [ "syn 1.0.103", ] -[[package]] -name = "strum_macros" -version = "0.24.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1e385be0d24f186b4ce2f9982191e7101bb737312ad61c1f2f984f34bcf85d59" -dependencies = [ - "heck 0.4.1", - "proc-macro2", - "quote", - "rustversion", - "syn 1.0.103", -] - [[package]] name = "strum_macros" version = "0.25.0" @@ -6300,7 +6162,7 @@ dependencies = [ "proc-macro2", "quote", "rustversion", - "syn 2.0.43", + "syn 2.0.79", ] [[package]] @@ -6339,9 +6201,9 @@ dependencies = [ [[package]] name = "syn" -version = "2.0.43" +version = "2.0.79" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ee659fb5f3d355364e1f3e5bc10fb82068efbf824a1e9d1c9504244a6469ad53" +checksum = "89132cd0bf050864e1d38dc3bbc07a0eb8e7530af26344d3d2bbbef83499f590" dependencies = [ "proc-macro2", "quote", @@ -6475,12 +6337,10 @@ dependencies = [ "api_version", "backup", "collections", - "concurrency_manager", "crc64fast", "engine_rocks", "engine_traits", "external_storage", - "file_system", "futures 0.3.15", "futures-executor", "futures-util", @@ -6489,7 +6349,6 @@ dependencies = [ "protobuf", "raftstore", "rand 0.8.5", - "tempfile", "test_raftstore", "tidb_query_common", "tikv", @@ -6501,12 +6360,12 @@ dependencies = [ name = "test_coprocessor" version = "0.0.1" dependencies = [ - "api_version", + "codec", "collections", "concurrency_manager", - "engine_rocks", "futures 0.3.15", "kvproto", + "pd_client", "protobuf", "resource_metering", "test_storage", @@ -6553,7 +6412,6 @@ dependencies = [ "slog", "slog-global", "tikv_util", - "tokio", "tokio-timer", "txn_types", ] @@ -6570,16 +6428,15 @@ dependencies = [ "crossbeam", "encryption_export", "engine_rocks", - "engine_rocks_helper", "engine_test", "engine_traits", "fail", "file_system", "futures 0.3.15", "grpcio", - "grpcio-health", "health_controller", "hybrid_engine", + "in_memory_engine", "keys", "kvproto", "lazy_static", @@ -6589,7 +6446,6 @@ dependencies = [ "raft", "raftstore", "rand 0.8.5", - "range_cache_memory_engine", "resolved_ts", "resource_control", "resource_metering", @@ -6604,7 +6460,6 @@ dependencies = [ "tikv", "tikv_util", "tokio", - "tokio-timer", "txn_types", ] @@ -6613,17 +6468,13 @@ name = "test_raftstore-v2" version = "0.0.1" dependencies = [ "api_version", - "backtrace", "causal_ts", "collections", "concurrency_manager", - "crossbeam", "encryption_export", "engine_rocks", - "engine_rocks_helper", "engine_test", "engine_traits", - "fail", "file_system", "futures 0.3.15", "grpcio", @@ -6631,7 +6482,6 @@ dependencies = [ "health_controller", "keys", "kvproto", - "lazy_static", "log_wrappers", "pd_client", "protobuf", @@ -6654,7 +6504,6 @@ dependencies = [ "tikv", "tikv_util", "tokio", - "tokio-timer", "txn_types", ] @@ -6695,7 +6544,6 @@ dependencies = [ "engine_traits", "futures 0.3.15", "kvproto", - "pd_client", "raftstore", "test_raftstore", "tikv", @@ -6735,7 +6583,6 @@ dependencies = [ 
"batch-system", "byteorder", "causal_ts", - "cdc", "collections", "concurrency_manager", "crc64fast", @@ -6757,17 +6604,15 @@ dependencies = [ "grpcio-health", "health_controller", "hyper", + "in_memory_engine", "keys", "kvproto", - "libc 0.2.151", "log_wrappers", "more-asserts", "online_config", - "panic_hook", "paste", "pd_client", "perfcnt", - "procinfo 0.4.2 (git+https://github.com/tikv/procinfo-rs?rev=7693954bd1dd86eb1709572fd7b62fd5f7ff2ea1)", "profiler", "protobuf", "raft", @@ -6784,7 +6629,6 @@ dependencies = [ "service", "slog", "slog-global", - "sst_importer", "tempfile", "test_backup", "test_coprocessor", @@ -6796,11 +6640,9 @@ dependencies = [ "test_sst_importer", "test_storage", "test_util", - "tidb_query_aggr", "tidb_query_common", "tidb_query_datatype", "tidb_query_executors", - "tidb_query_expr", "tikv", "tikv_kv", "tikv_util", @@ -6884,7 +6726,6 @@ version = "0.0.1" dependencies = [ "anyhow", "api_version", - "async-trait", "byteorder", "derive_more", "error_code", @@ -6911,11 +6752,13 @@ dependencies = [ "bitflags 1.3.2", "boolinator", "bstr", + "bytemuck", "chrono", "chrono-tz", "codec", "collections", "crc32fast", + "criterion", "encoding_rs 0.8.29", "error_code", "hex 0.4.2", @@ -6969,7 +6812,6 @@ dependencies = [ "tikv_util", "tipb", "tipb_helper", - "yatp", ] [[package]] @@ -7011,15 +6853,13 @@ dependencies = [ [[package]] name = "tikv" -version = "8.2.0-alpha" +version = "8.5.0-alpha" dependencies = [ "anyhow", "api_version", "async-stream", "async-trait", "backtrace", - "batch-system", - "byteorder", "causal_ts", "chrono", "codec", @@ -7029,14 +6869,13 @@ dependencies = [ "crc32fast", "crc64fast", "crossbeam", - "crypto", "dashmap", + "encryption", "encryption_export", "engine_panic", "engine_rocks", "engine_test", "engine_traits", - "engine_traits_tests", "error_code", "example_coprocessor_plugin", "fail", @@ -7057,7 +6896,7 @@ dependencies = [ "hyper", "hyper-openssl", "hyper-tls", - "into_other", + "in_memory_engine", "itertools", "keyed_priority_queue", "keys", @@ -7068,7 +6907,6 @@ dependencies = [ "log", "log_wrappers", "match-template", - "memory_trace_macros", "mime", "more-asserts", "mur3", @@ -7085,7 +6923,6 @@ dependencies = [ "pin-project", "pnet_datalink", "pprof", - "procinfo 0.4.2 (git+https://github.com/tikv/procinfo-rs?rev=7693954bd1dd86eb1709572fd7b62fd5f7ff2ea1)", "prometheus", "prometheus-static-metric", "protobuf", @@ -7094,7 +6931,6 @@ dependencies = [ "raftstore", "raftstore-v2", "rand 0.7.3", - "range_cache_memory_engine", "regex", "reqwest", "resource_control", @@ -7116,10 +6952,8 @@ dependencies = [ "sync_wrapper", "sysinfo", "tempfile", - "test_sst_importer", "test_util", "thiserror", - "tidb_query_aggr", "tidb_query_common", "tidb_query_datatype", "tidb_query_executors", @@ -7195,12 +7029,9 @@ dependencies = [ name = "tikv_kv" version = "0.1.0" dependencies = [ - "backtrace", "collections", - "encryption", "engine_panic", "engine_rocks", - "engine_test", "engine_traits", "error_code", "fail", @@ -7238,12 +7069,12 @@ dependencies = [ "codec", "collections", "cpu-time", - "crc32fast", "crossbeam", "crossbeam-skiplist 0.1.3", "derive_more", "error_code", "fail", + "fs2", "futures 0.3.15", "futures-util", "gag", @@ -7259,7 +7090,6 @@ dependencies = [ "num-traits", "num_cpus", "online_config", - "openssl", "page_size", "panic_hook", "parking_lot_core 0.9.1", @@ -7291,7 +7121,6 @@ dependencies = [ "toml", "tracker", "url", - "utime", "yatp", ] @@ -7362,7 +7191,7 @@ checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20" 
[[package]] name = "tipb" version = "0.0.1" -source = "git+https://github.com/pingcap/tipb.git#87f5b80908ab561fb7875c88114488cad15727b1" +source = "git+https://github.com/pingcap/tipb.git#cf70966bef25e205cb845c19265301542d1238d1" dependencies = [ "futures 0.3.15", "grpcio", @@ -7488,9 +7317,9 @@ dependencies = [ [[package]] name = "toml_datetime" -version = "0.6.6" +version = "0.6.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4badfd56924ae69bcc9039335b2e017639ce3f9b001c393c1b2d1ef846ce2cbf" +checksum = "0dd7358ecb8fc2f8d014bf86f6f638ce72ba252a2c3a2572f2a795f1d23efb41" [[package]] name = "toml_edit" @@ -7548,7 +7377,7 @@ checksum = "34704c8d6ebcbc939824180af020566b01a7c01f80641264eba0999f6c2b6be7" dependencies = [ "proc-macro2", "quote", - "syn 2.0.43", + "syn 2.0.79", ] [[package]] @@ -7576,7 +7405,6 @@ dependencies = [ name = "tracker" version = "0.0.1" dependencies = [ - "collections", "crossbeam-utils", "kvproto", "lazy_static", @@ -7584,6 +7412,7 @@ dependencies = [ "pin-project", "prometheus", "slab", + "slog", ] [[package]] @@ -7658,9 +7487,9 @@ checksum = "eeba86d422ce181a719445e51872fa30f1f7413b62becb52e95ec91aa262d85c" [[package]] name = "unicode-bidi" -version = "0.3.15" +version = "0.3.17" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "08f95100a766bf4f8f28f90d77e0a5461bbdb219042e7679bebe79004fed8d75" +checksum = "5ab17db44d7388991a428b2ee655ce0c212e862eff1768a455c58f9aad6e7893" [[package]] name = "unicode-ident" @@ -7670,9 +7499,9 @@ checksum = "6ceab39d59e4c9499d4e5a8ee0e2735b891bb7308ac83dfb4e80cad195c9f6f3" [[package]] name = "unicode-normalization" -version = "0.1.23" +version = "0.1.24" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a56d1686db2308d901306f92a263857ef59ea39678a5458e7cb17f01415101f5" +checksum = "5033c97c4262335cded6d6fc3e5c18ab755e1a3dc96376350f3d8e9f009ad956" dependencies = [ "tinyvec", ] @@ -7707,17 +7536,6 @@ dependencies = [ "serde", ] -[[package]] -name = "utime" -version = "0.2.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "055058552ca15c566082fc61da433ae678f78986a6f16957e33162d1b218792a" -dependencies = [ - "kernel32-sys", - "libc 0.2.151", - "winapi 0.2.8", -] - [[package]] name = "uuid" version = "0.8.2" @@ -8283,7 +8101,7 @@ checksum = "9ce1b18ccd8e73a9321186f97e46f9f04b778851177567b1975109d26a08d2a6" dependencies = [ "proc-macro2", "quote", - "syn 2.0.43", + "syn 2.0.79", ] [[package]] @@ -8303,29 +8121,28 @@ dependencies = [ [[package]] name = "zstd" -version = "0.11.2+zstd.1.5.2" +version = "0.13.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "20cc960326ece64f010d2d2107537f26dc589a6573a316bd5b1dba685fa5fde4" +checksum = "fcf2b778a664581e31e389454a7072dab1647606d44f7feea22cd5abb9c9f3f9" dependencies = [ "zstd-safe", ] [[package]] name = "zstd-safe" -version = "5.0.2+zstd.1.5.2" +version = "7.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1d2a5585e04f9eea4b2a3d1eca508c4dee9592a89ef6f450c11719da0726f4db" +checksum = "54a3ab4db68cea366acc5c897c7b4d4d1b8994a9cd6e6f841f8964566a419059" dependencies = [ - "libc 0.2.151", "zstd-sys", ] [[package]] name = "zstd-sys" -version = "2.0.1+zstd.1.5.2" +version = "2.0.13+zstd.1.5.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9fd07cbbc53846d9145dbffdf6dd09a7a0aa52be46741825f5c97bdd4f73f12b" +checksum = "38ff0f21cfee8f97d94cef41359e0c89aa6113028ab0291aa8ca0038995a95aa" dependencies = 
[ "cc", - "libc 0.2.151", + "pkg-config", ] diff --git a/Cargo.toml b/Cargo.toml index 384e14231c7..386891d85b0 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "tikv" -version = "8.2.0-alpha" +version = "8.5.0-alpha" authors = ["The TiKV Authors"] description = "A distributed transactional key-value database powered by Rust and Raft" license = "Apache-2.0" @@ -35,7 +35,7 @@ testexport = [ "engine_traits/testexport", "engine_rocks/testexport", "engine_panic/testexport", - "hybrid_engine/testexport", + "encryption/testexport", ] test-engine-kv-rocksdb = ["engine_test/test-engine-kv-rocksdb"] test-engine-raft-raft-engine = ["engine_test/test-engine-raft-raft-engine"] @@ -67,8 +67,6 @@ api_version = { workspace = true } async-stream = "0.2" async-trait = "0.1" backtrace = "0.3" -batch-system = { workspace = true } -byteorder = "1.2" causal_ts = { workspace = true } chrono = { workspace = true } codec = { workspace = true } @@ -78,8 +76,8 @@ coprocessor_plugin_api = { workspace = true } crc32fast = "1.2" crc64fast = "0.1" crossbeam = { workspace = true } -crypto = { workspace = true } dashmap = "5" +encryption = { workspace = true } encryption_export = { workspace = true } engine_panic = { workspace = true } engine_rocks = { workspace = true } @@ -105,7 +103,7 @@ http = "0" hybrid_engine = { workspace = true } hyper = { version = "0.14", features = ["full"] } hyper-tls = "0.5" -into_other = { workspace = true } +in_memory_engine = { workspace = true } itertools = "0.10" keyed_priority_queue = "0.4" keys = { workspace = true } @@ -119,7 +117,6 @@ log = { version = "0.4", features = [ ] } log_wrappers = { workspace = true } match-template = "0.0.1" -memory_trace_macros = { workspace = true } mime = "0.3.13" more-asserts = "0.2" mur3 = "0.1" @@ -146,7 +143,6 @@ raft_log_engine = { workspace = true } raftstore = { workspace = true, features = ["engine_rocks"] } raftstore-v2 = { workspace = true } rand = "0.7.3" -range_cache_memory_engine = { workspace = true } regex = "1.3" resource_control = { workspace = true } resource_metering = { workspace = true } @@ -168,7 +164,6 @@ sync_wrapper = "0.1.1" sysinfo = "0.26" tempfile = "3.0" thiserror = "1.0" -tidb_query_aggr = { workspace = true } tidb_query_common = { workspace = true } tidb_query_datatype = { workspace = true } tidb_query_executors = { workspace = true } @@ -193,13 +188,11 @@ yatp = { workspace = true } [dev-dependencies] api_version = { workspace = true, features = ["testexport"] } engine_test = { workspace = true } -engine_traits_tests = { workspace = true } -example_coprocessor_plugin = { workspace = true } # should be a binary dependency +example_coprocessor_plugin = { workspace = true } # must, used for testing coprocessor plugin hyper-openssl = "0.9" panic_hook = { workspace = true } raftstore = { workspace = true, features = ["testexport"] } reqwest = { version = "0.11", features = ["blocking"] } -test_sst_importer = { workspace = true } test_util = { workspace = true } tokio = { version = "1.17", features = ["macros", "rt-multi-thread", "time"] } zipf = "6.1.0" @@ -237,9 +230,6 @@ cmake = { git = "https://github.com/rust-lang/cmake-rs" } backtrace = { git = 'https://github.com/hehechen/backtrace-rs', branch = "v0.3.61" } sysinfo = { git = "https://github.com/tikv/sysinfo", branch = "0.26-fix-cpu" } - -[target.'cfg(target_os = "linux")'.dependencies] -procinfo = { git = "https://github.com/tikv/procinfo-rs", rev = "7693954bd1dd86eb1709572fd7b62fd5f7ff2ea1" } # When you modify TiKV cooperatively with kvproto, 
this will be useful to submit the PR to TiKV and the PR to # kvproto at the same time. # After the PR to kvproto is merged, remember to comment this out and run `cargo update -p kvproto`. @@ -336,6 +326,27 @@ members = [ ] default-members = ["raftstore-proxy"] +[workspace.metadata.cargo-machete] +ignored = [ + "slog", + "slog-global", + "serde", + "prometheus", + "slog_derive", + "strum", + "error_code", + "fuzz-targets", + "libfuzzer-sys", +] +[package.metadata.cargo-machete] +ignored = [ + "encryption", + "engine_panic", + "hybrid_engine", + "hyper-tls", + "match-template", +] + [workspace.dependencies] api_version = { path = "components/api_version" } aws = { path = "components/cloud/aws" } @@ -365,7 +376,7 @@ encryption_export = { path = "components/encryption/export" } engine_panic = { path = "components/engine_panic" } engine_rocks = { path = "components/engine_rocks" } hybrid_engine = { path = "components/hybrid_engine" } -range_cache_memory_engine = { path = "components/range_cache_memory_engine" } +in_memory_engine = { path = "components/in_memory_engine" } engine_rocks_helper = { path = "components/engine_rocks_helper" } engine_test = { path = "components/engine_test", default-features = false } engine_traits = { path = "components/engine_traits" } diff --git a/OWNERS b/OWNERS index b0e73247005..f70f2cbf0ea 100644 --- a/OWNERS +++ b/OWNERS @@ -10,6 +10,7 @@ approvers: - cfzjywxk - Connor1996 - coocood + - crazycs520 - disksing - ekexium - gengliqi @@ -47,7 +48,9 @@ approvers: - zhangjinpeng87 - zhongzc - zhouqiang-cl + - zyguan reviewers: + - 3AceShowHand - 3pointer - CalvinNeo - ethercflow @@ -55,7 +58,7 @@ reviewers: - Fullstop000 - gozssky - haojinming - - hi-rustin + - hbisheng - HuSharp - jayzhan211 - Jibbow @@ -68,9 +71,9 @@ reviewers: - MrCroxx - nolouch - rleungx + - Rustin170506 - tier-cap - v01dstar - wjhuang2016 - wshwsh12 - Xuanwo - - zyguan diff --git a/OWNERS_ALIASES b/OWNERS_ALIASES index cd4a74373a9..49fdc25e4ee 100644 --- a/OWNERS_ALIASES +++ b/OWNERS_ALIASES @@ -1,14 +1,15 @@ # Sort the member alphabetically. aliases: sig-critical-approvers-config-components: - - easonn7 - - kevin-xianliu + - BenMeadowcroft + - yudongusa - zhangjinpeng87 sig-critical-approvers-config-src: - - easonn7 - - kevin-xianliu + - BenMeadowcroft - cfzjywxk + - yudongusa - zhangjinpeng87 sig-critical-approvers-config-cdc: - BenMeadowcroft - - kevin-xianliu + - flowbehappy + - yudongusa diff --git a/clippy.toml b/clippy.toml index 15e0f1f549c..b6c0b2dc320 100644 --- a/clippy.toml +++ b/clippy.toml @@ -51,5 +51,36 @@ reason = """ X509StoreRef::objects is unsound, see RUSTSEC-2020-0071 """ +# See more about RUSTSEC-2024-0357 in deny.toml. 
+[[disallowed-types]] +path = "openssl::bio::MemBio" +reason = """ +openssl::bio::MemBio::get_buf is unsound, see RUSTSEC-2024-0357 +""" +[[disallowed-types]] +path = "openssl::pkcs7::Pkcs7" +reason = """ +openssl::pkcs7::Pkcs7 may call openssl::bio::MemBio::get_buf, \ +see RUSTSEC-2024-0357 +""" +[[disallowed-types]] +path = "openssl::pkey::PKeyRef" +reason = """ +openssl::pkey::PKeyRef may call openssl::bio::MemBio::get_buf, \ +see RUSTSEC-2024-0357 +""" +[[disallowed-types]] +path = "openssl::cms::CmsContentInfoRef" +reason = """ +openssl::cms::CmsContentInfoRef may call openssl::bio::MemBio::get_buf, \ +see RUSTSEC-2024-0357 +""" +[[disallowed-types]] +path = "openssl::asn1::Asn1GeneralizedTimeRef" +reason = """ +openssl::asn1::Asn1GeneralizedTimeRef may call openssl::bio::MemBio::get_buf, \ +see RUSTSEC-2024-0357 +""" + avoid-breaking-exported-api = false upper-case-acronyms-aggressive = true diff --git a/cmd/tikv-ctl/Cargo.toml b/cmd/tikv-ctl/Cargo.toml index 50b41309448..fc5865eece2 100644 --- a/cmd/tikv-ctl/Cargo.toml +++ b/cmd/tikv-ctl/Cargo.toml @@ -33,17 +33,12 @@ nortcheck = ["engine_rocks/nortcheck"] [dependencies] api_version = { workspace = true } -backup = { workspace = true } -cdc = { workspace = true } clap = { workspace = true } collections = { workspace = true } -concurrency_manager = { workspace = true } -crossbeam = { workspace = true } crypto = { workspace = true } encryption_export = { workspace = true } engine_rocks = { workspace = true } engine_traits = { workspace = true } -error_code = { workspace = true } file_system = { workspace = true } futures = "0.3" gag = "1.0" @@ -51,11 +46,9 @@ grpcio = { workspace = true } hex = "0.4" keys = { workspace = true } kvproto = { workspace = true } -libc = "0.2" log = { version = "0.4", features = ["max_level_trace", "release_max_level_debug"] } log_wrappers = { workspace = true } pd_client = { workspace = true } -prometheus = { version = "0.13", features = ["nightly"] } protobuf = { version = "2.8", features = ["bytes"] } raft = { workspace = true } raft-engine = { workspace = true } @@ -69,9 +62,7 @@ server = { workspace = true } slog = { workspace = true } slog-global = { workspace = true } structopt = "0.3" -tempfile = "3.0" tikv = { workspace = true } -tikv_alloc = { workspace = true } tikv_util = { workspace = true } tokio = { version = "1.5", features = ["rt-multi-thread", "time"] } toml = "0.5" @@ -80,6 +71,3 @@ txn_types = { workspace = true } [build-dependencies] cc = "1.0" time = { workspace = true } - -[target.'cfg(unix)'.dependencies] -signal-hook = "0.3" diff --git a/cmd/tikv-server/Cargo.toml b/cmd/tikv-server/Cargo.toml index ccc6730892d..bdb8af9e4f5 100644 --- a/cmd/tikv-server/Cargo.toml +++ b/cmd/tikv-server/Cargo.toml @@ -34,12 +34,6 @@ pprof-fp = ["tikv/pprof-fp"] [dependencies] clap = { workspace = true } crypto = { workspace = true } -encryption_export = { workspace = true } -engine_traits = { workspace = true } -keys = { workspace = true } -kvproto = { workspace = true } -raft-engine = { workspace = true } -regex = "1" serde_json = { version = "1.0", features = ["preserve_order"] } server = { workspace = true } tikv = { workspace = true } diff --git a/components/api_version/Cargo.toml b/components/api_version/Cargo.toml index fd3f1c765e9..62a0bf3d9a8 100644 --- a/components/api_version/Cargo.toml +++ b/components/api_version/Cargo.toml @@ -15,8 +15,6 @@ engine_traits = { workspace = true } kvproto = { workspace = true } log_wrappers = { workspace = true } match-template = "0.0.1" -thiserror = "1.0" 
-tikv_alloc = { workspace = true }
 tikv_util = { workspace = true }
 txn_types = { workspace = true }
diff --git a/components/backup-stream/Cargo.toml b/components/backup-stream/Cargo.toml
index 3aa319db92e..2897533dc74 100644
--- a/components/backup-stream/Cargo.toml
+++ b/components/backup-stream/Cargo.toml
@@ -26,15 +26,14 @@ test = true
 harness = true
 
 [dependencies]
-async-compression = { version = "0.3.14", features = ["tokio", "zstd"] }
+async-compression = { version = "0.4.12", features = ["tokio", "zstd"] }
 async-trait = { version = "0.1" }
-bytes = "1"
 chrono = { workspace = true }
 concurrency_manager = { workspace = true }
-crossbeam = { workspace = true }
-crossbeam-channel = { workspace = true }
+crc64fast = "0.1"
 dashmap = "5"
 encryption = { workspace = true }
+encryption_export = { workspace = true }
 engine_rocks = { workspace = true }
 engine_traits = { workspace = true }
 error_code = { workspace = true }
@@ -42,11 +41,8 @@ external_storage = { workspace = true }
 fail = "0.5"
 file_system = { workspace = true }
 futures = "0.3"
-futures-io = "0.3"
 grpcio = { workspace = true }
 hex = "0.4"
-# Fixing ahash cyclic dep: https://github.com/tkaitchuck/ahash/issues/95
-indexmap = "=1.6.2"
 kvproto = { workspace = true }
 lazy_static = "1.4"
 log_wrappers = { workspace = true }
@@ -60,15 +56,12 @@ protobuf = { version = "2.8", features = ["bytes"] }
 raft = { workspace = true }
 raftstore = { workspace = true }
 rand = "0.8.0"
-regex = "1"
 resolved_ts = { workspace = true }
-security = { path = "../security" }
 slog = { workspace = true }
 slog-global = { workspace = true }
 thiserror = "1"
 tidb_query_datatype = { workspace = true }
 tikv = { workspace = true }
-tikv_alloc = { workspace = true }
 tikv_kv = { workspace = true }
 tikv_util = { workspace = true }
 tokio = { version = "1.5", features = ["rt-multi-thread", "macros", "time", "sync"] }
@@ -78,7 +71,6 @@ tracing = { workspace = true }
 tracing-active-tree = { workspace = true }
 txn_types = { workspace = true }
 uuid = "0.8"
-yatp = { workspace = true }
 
 [dev-dependencies]
 async-trait = "0.1"
diff --git a/components/backup-stream/src/endpoint.rs b/components/backup-stream/src/endpoint.rs
index badb02c749c..7fede7dad0c 100644
--- a/components/backup-stream/src/endpoint.rs
+++ b/components/backup-stream/src/endpoint.rs
@@ -10,7 +10,8 @@ use std::{
 };
 
 use concurrency_manager::ConcurrencyManager;
-use encryption::DataKeyManager;
+use dashmap::DashMap;
+use encryption::BackupEncryptionManager;
 use engine_traits::KvEngine;
 use error_code::ErrorCodeExt;
 use futures::{stream::AbortHandle, FutureExt, TryFutureExt};
@@ -25,7 +26,10 @@ use raftstore::{
     router::CdcHandle,
 };
 use resolved_ts::{resolve_by_raft, LeadershipResolver};
-use tikv::config::{BackupStreamConfig, ResolvedTsConfig};
+use tikv::{
+    config::{BackupStreamConfig, ResolvedTsConfig},
+    storage::txn::txn_status_cache::TxnStatusCache,
+};
 use tikv_util::{
     box_err,
     config::ReadableDuration,
@@ -60,7 +64,7 @@ use crate::{
     metadata::{store::MetaStore, MetadataClient, MetadataEvent, StreamTask},
     metrics::{self, TaskStatus},
     observer::BackupStreamObserver,
-    router::{self, ApplyEvents, Router, TaskSelector},
+    router::{self, ApplyEvents, FlushContext, Router, TaskSelector},
     subscription_manager::{RegionSubscriptionManager, ResolvedRegions},
     subscription_track::{Ref, RefMut, ResolveResult, SubscriptionTracer},
     try_send,
@@ -85,14 +89,12 @@ pub struct Endpoint {
     pub(crate) subs: SubscriptionTracer,
     pub(crate) concurrency_manager: ConcurrencyManager,
-    // Note: some of fields are public so test cases are able to access them.
+    // Note: some of the fields are public so test cases are able to access them.
     pub range_router: Router,
     observer: BackupStreamObserver,
     pool: Runtime,
     region_operator: Sender<ObserveOp>,
     failover_time: Option<Instant>,
-    // We holds the config before, even it is useless for now,
-    // however probably it would be useful in the future.
     config: BackupStreamConfig,
 
     pub checkpoint_mgr: CheckpointManager,
@@ -123,16 +125,16 @@ where
         pd_client: Arc<PDC>,
         concurrency_manager: ConcurrencyManager,
         resolver: BackupStreamResolver,
-        data_key_manager: Option<Arc<DataKeyManager>>,
+        backup_encryption_manager: BackupEncryptionManager,
+        txn_status_cache: Arc<TxnStatusCache>,
     ) -> Self {
         crate::metrics::STREAM_ENABLED.inc();
         let pool = create_tokio_runtime((config.num_threads / 2).max(1), "backup-stream")
             .expect("failed to create tokio runtime for backup stream worker.");
         let meta_client = MetadataClient::new(store, store_id);
-        let mut conf = router::Config::from(config.clone());
-        conf.data_key_manager = data_key_manager;
-        let range_router = Router::new(scheduler.clone(), conf);
+        let conf = router::Config::from(config.clone());
+        let range_router = Router::new(scheduler.clone(), conf, backup_encryption_manager.clone());
 
         // spawn a worker to watch task changes from etcd periodically.
         let meta_client_clone = meta_client.clone();
@@ -157,7 +159,7 @@
         };
         let initial_scan_throughput_quota = Limiter::new(limit);
         info!("the endpoint of stream backup started"; "path" => %config.temp_path);
-        let subs = SubscriptionTracer::default();
+        let subs = SubscriptionTracer(Arc::new(DashMap::new()), txn_status_cache.clone());
         let initial_scan_semaphore = Arc::new(Semaphore::new(config.initial_scan_concurrency));
 
         let (region_operator, op_loop) = RegionSubscriptionManager::start(
@@ -274,9 +276,7 @@
     fn on_fatal_error(&self, select: TaskSelector, err: Box<Error>) {
         err.report_fatal();
-        let tasks = self
-            .pool
-            .block_on(self.range_router.select_task(select.reference()));
+        let tasks = self.range_router.select_task(select.reference());
         warn!("fatal error reporting"; "selector" => ?select, "selected" => ?tasks, "err" => %err);
         for task in tasks {
             // Let's pause the task first.
@@ -323,8 +323,6 @@
                 if task.is_paused {
                     continue;
                 }
-                // We have meet task upon store start, we must in a failover.
-                scheduler.schedule(Task::MarkFailover(Instant::now()))?;
                 // move task to schedule
                 scheduler.schedule(Task::WatchTask(TaskOp::AddTask(task)))?;
             }
@@ -670,15 +668,16 @@
     pub fn on_register(&self, task: StreamTask) {
         let name = task.info.name.clone();
         let start_ts = task.info.start_ts;
-        self.load_task(task);
+        self.register_stream_task(task);
         metrics::STORE_CHECKPOINT_TS
             .with_label_values(&[name.as_str()])
             .set(start_ts as _);
     }
 
-    /// Load the task into memory: this would make the endpint start to observe.
-    fn load_task(&self, task: StreamTask) {
+    /// Load the task into memory: this would make the endpoint start to
+    /// observe.
+    fn register_stream_task(&self, task: StreamTask) {
         let cli = self.meta_client.clone();
         let range_router = self.range_router.clone();
@@ -688,23 +687,11 @@
         );
         let task_name = task.info.get_name().to_owned();
-        // clean the safepoint created at pause(if there is)
-        self.pool.spawn(root!("load_initial_task";
-            self.pd_client
-                .update_service_safe_point(
-                    self.pause_guard_id_for_task(task.info.get_name()),
-                    TimeStamp::zero(),
-                    Duration::new(0, 0),
-                )
-                .map(|r| {
-                    r.map_err(|err| Error::from(err).report("removing safe point for pausing"))
-                })
-        ));
+        self.clean_pause_guard_id_for_task(&task_name);
 
         self.pool.block_on(async move {
-            let task_clone = task.clone();
+            let task_name_clone = task.info.get_name().to_owned();
             let run = async move {
-                let task_name = task.info.get_name();
-                let ranges = cli.ranges_of_task(task_name).await?;
+                let ranges = cli.ranges_of_task(task.info.get_name()).await?;
                 fail::fail_point!("load_task::error_when_fetching_ranges", |_| {
                     Err(Error::Other("what range? no such thing, go away.".into()))
                 });
@@ -732,16 +719,31 @@
                     "finish register backup stream ranges";
                     "task" => ?task,
                 );
-                Result::Ok(())
+                Ok(())
             };
             if let Err(e) = run.await {
-                self.on_fatal_error_of_task(&task_clone.info.name, &Box::new(e))
+                self.on_fatal_error_of_task(&task_name_clone, &Box::new(e))
                     .await;
             }
         });
         metrics::update_task_status(TaskStatus::Running, &task_name);
     }
 
+    // Clean the safe point created at pause (if there is one).
+    fn clean_pause_guard_id_for_task(&self, task_name: &str) {
+        self.pool.spawn(root!("unregister_task";
+            self.pd_client
+                .update_service_safe_point(
+                    self.pause_guard_id_for_task(task_name),
+                    TimeStamp::zero(),
+                    Duration::new(0, 0),
+                )
+                .map(|r| {
+                    r.map_err(|err| Error::from(err).report("removing safe point for pausing"))
+                })
+        ));
+    }
+
     fn pause_guard_id_for_task(&self, task: &str) -> String {
         format!("{}-{}-pause-guard", task, self.store_id)
     }
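Registration and unregistration now share `clean_pause_guard_id_for_task`, which removes the pause-time service safe point by updating it to `TimeStamp::zero()` with a zero TTL. A runnable sketch of the guard-ID scheme (the format string mirrors `pause_guard_id_for_task` above; the task name and store ID are made up for illustration):

```rust
// Illustrative scaffolding only, not TiKV code; the format string is the one
// used by pause_guard_id_for_task in the hunk above.
fn pause_guard_id_for_task(task: &str, store_id: u64) -> String {
    format!("{}-{}-pause-guard", task, store_id)
}

fn main() {
    // e.g. store 5 pausing task "daily-backup" owns this service safe point:
    assert_eq!(
        pause_guard_id_for_task("daily-backup", 5),
        "daily-backup-5-pause-guard"
    );
}
```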
@@ -759,7 +761,7 @@
     pub fn on_resume(&self, task_name: String) {
         let task = self.pool.block_on(self.meta_client.get_task(&task_name));
         match task {
-            Ok(Some(stream_task)) => self.load_task(stream_task),
+            Ok(Some(stream_task)) => self.register_stream_task(stream_task),
             Ok(None) => {
                 info!("backup stream task not existed"; "task" => %task_name);
             }
@@ -776,9 +778,10 @@
         }
     }
 
-    pub fn on_unregister(&self, task: &str) -> Option<StreamBackupTaskInfo> {
-        let info = self.unload_task(task);
-        self.remove_metrics_after_unregister(task);
+    pub fn on_unregister(&self, task_name: &str) -> Option<StreamBackupTaskInfo> {
+        let info = self.unload_task(task_name);
+        self.clean_pause_guard_id_for_task(task_name);
+        self.remove_metrics_after_unregister(task_name);
         info
     }
@@ -803,7 +806,7 @@
         // so simply clear all info would be fine.
         self.observer.ranges.wl().clear();
         self.subs.clear();
-        self.pool.block_on(router.unregister_task(task))
+        router.unregister_task(task)
     }
 
     fn prepare_min_ts(&self) -> future![TimeStamp] {
@@ -821,19 +824,25 @@
         }
     }
 
-    fn do_flush(&self, task: String, mut resolved: ResolvedRegions) -> future![Result<()>] {
+    fn do_flush(&self, task: String, resolved: ResolvedRegions) -> future![Result<()>] {
         let router = self.range_router.clone();
         let store_id = self.store_id;
         let mut flush_ob = self.flush_observer();
         async move {
             let mut new_rts = resolved.global_checkpoint();
             fail::fail_point!("delay_on_flush");
-            flush_ob.before(resolved.take_resolve_result()).await;
+            flush_ob.before(resolved.resolve_results().to_vec()).await;
             if let Some(rewritten_rts) = flush_ob.rewrite_resolved_ts(&task).await {
                 info!("rewriting resolved ts"; "old" => %new_rts, "new" => %rewritten_rts);
                 new_rts = rewritten_rts.min(new_rts);
             }
-            if let Some(rts) = router.do_flush(&task, store_id, new_rts).await {
+            let cx = FlushContext {
+                task_name: &task,
+                store_id,
+                resolved_regions: &resolved,
+                resolved_ts: new_rts,
+            };
+            if let Some(rts) = router.do_flush(cx).await {
                 info!("flushing and refreshing checkpoint ts.";
                     "checkpoint_ts" => %rts,
                     "task" => %task,
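The hunk above replaces the positional `router.do_flush(&task, store_id, new_rts)` call with a single `FlushContext` argument, so additional flush inputs (here, the resolved regions) do not ripple through every call site. A condensed sketch of the new call shape; the wrapper function is illustrative, while `FlushContext`, `Router`, `ResolvedRegions`, and `TimeStamp` are the names used in this diff:

```rust
// Illustrative only: mirrors the FlushContext construction in do_flush above.
// Compiles only inside backup-stream, where Router and FlushContext live.
async fn flush_once(
    router: &Router,
    task: &str,
    store_id: u64,
    resolved: &ResolvedRegions,
    new_rts: TimeStamp,
) -> Option<TimeStamp> {
    let cx = FlushContext {
        task_name: task,
        store_id,
        resolved_regions: resolved,
        resolved_ts: new_rts,
    };
    // Returns the checkpoint ts actually persisted, if any.
    router.do_flush(cx).await
}
```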
@@ -850,9 +859,9 @@
     pub fn on_force_flush(&self, task: String) {
         self.pool.block_on(async move {
-            let info = self.range_router.get_task_info(&task).await;
+            let handler_res = self.range_router.get_task_handler(&task);
             // This should only happen in testing, it would be to unwrap...
-            let _ = info.unwrap().set_flushing_status_cas(false, true);
+            let _ = handler_res.unwrap().set_flushing_status_cas(false, true);
             let mts = self.prepare_min_ts().await;
             let sched = self.scheduler.clone();
             self.region_op(ObserveOp::ResolveRegions {
@@ -1441,44 +1450,3 @@ where
         self.run_task(task)
     }
 }
-
-#[cfg(test)]
-mod test {
-    use engine_rocks::RocksEngine;
-    use raftstore::coprocessor::region_info_accessor::MockRegionInfoProvider;
-    use tikv_util::worker::dummy_scheduler;
-
-    use crate::{
-        checkpoint_manager::tests::MockPdClient, endpoint, endpoint::Endpoint, metadata::test, Task,
-    };
-
-    #[tokio::test]
-    async fn test_start() {
-        let cli = test::test_meta_cli();
-        let (sched, mut rx) = dummy_scheduler();
-        let task = test::simple_task("simple_3");
-        cli.insert_task_with_range(&task, &[]).await.unwrap();
-
-        fail::cfg("failed_to_get_tasks", "1*return").unwrap();
-        Endpoint::<_, MockRegionInfoProvider, RocksEngine, MockPdClient>::start_and_watch_tasks(
-            cli, sched,
-        )
-        .await
-        .unwrap();
-        fail::remove("failed_to_get_tasks");
-
-        let _t1 = rx.recv().unwrap();
-        let t2 = rx.recv().unwrap();
-
-        match t2 {
-            Task::WatchTask(t) => match t {
-                endpoint::TaskOp::AddTask(t) => {
-                    assert_eq!(t.info, task.info);
-                    assert!(!t.is_paused);
-                }
-                _ => panic!("not match TaskOp type"),
-            },
-            _ => panic!("not match Task type {:?}", t2),
-        }
-    }
-}
diff --git a/components/backup-stream/src/event_loader.rs b/components/backup-stream/src/event_loader.rs
index 467b0bcaa92..a990ea2a82b 100644
--- a/components/backup-stream/src/event_loader.rs
+++ b/components/backup-stream/src/event_loader.rs
@@ -160,7 +160,7 @@ impl EventLoader {
         debug!("meet lock during initial scanning."; "key" => %utils::redact(&lock_at), "ts" => %lock.ts);
         if utils::should_track_lock(&lock) {
             resolver
-                .track_phase_one_lock(lock.ts, lock_at)
+                .track_phase_one_lock(lock.ts, lock_at, lock.generation)
                 .map_err(|_| Error::OutOfQuota {
                     region_id: self.region.id,
                 })?;
@@ -483,7 +483,10 @@
     use futures::executor::block_on;
     use kvproto::metapb::*;
-    use tikv::storage::{txn::tests::*, TestEngineBuilder};
+    use tikv::storage::{
+        txn::{tests::*, txn_status_cache::TxnStatusCache},
+        TestEngineBuilder,
+    };
     use tikv_kv::SnapContext;
     use tikv_util::memory::{MemoryQuota, OwnedAllocated};
     use txn_types::TimeStamp;
@@ -524,7 +527,7 @@
         });
         r.unwrap();
         let mut events = ApplyEvents::with_capacity(1024, 42);
-        let mut res = TwoPhaseResolver::new(42, None);
+        let mut res = TwoPhaseResolver::new(42, None, Arc::new(TxnStatusCache::new_for_test()));
         loader.emit_entries_to(&mut events, &mut res).unwrap();
         assert_ne!(events.len(), 0);
         assert_ne!(data_load, 0);
diff --git a/components/backup-stream/src/lib.rs b/components/backup-stream/src/lib.rs
index 0402e5d2ee3..6d7afa334c9 100644
--- a/components/backup-stream/src/lib.rs
+++ b/components/backup-stream/src/lib.rs
@@ -1,5 +1,6 @@
 // Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0.
 #![feature(slice_group_by)]
+#![feature(trait_alias)]
 #![feature(result_flattening)]
 #![feature(assert_matches)]
 #![feature(test)]
@@ -25,4 +26,4 @@ pub use checkpoint_manager::GetCheckpointResult;
 pub use endpoint::{
     BackupStreamResolver, Endpoint, ObserveOp, RegionCheckpointOperation, RegionSet, Task,
 };
-pub use service::Service;
+pub use service::BackupStreamGrpcService;
diff --git a/components/backup-stream/src/metrics.rs b/components/backup-stream/src/metrics.rs
index 3a2fc1d119d..7b03c482774 100644
--- a/components/backup-stream/src/metrics.rs
+++ b/components/backup-stream/src/metrics.rs
@@ -30,6 +30,8 @@ pub fn remove_task_status_metric(task: &str) -> Result<()> {
     TASK_STATUS.remove_label_values(&[task])
 }
 
+// When adding new metrics, remember to update the Grafana dashboard as well,
+// for example tikv_details.dashboard.py.
 lazy_static! {
     pub static ref INTERNAL_ACTOR_MESSAGE_HANDLE_DURATION: HistogramVec = register_histogram_vec!(
         "tikv_log_backup_interal_actor_acting_duration_sec",
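As the observer tests below show, `SubscriptionTracer` is no longer built via `Default`: it is a tuple struct over the subscription map plus the shared `TxnStatusCache`. A minimal sketch of the test-side construction (`TxnStatusCache::new_for_test` is the test constructor used throughout this diff; `SubscriptionTracer` is crate-internal, so this compiles only inside backup-stream):

```rust
use std::sync::Arc;

use dashmap::DashMap;
use tikv::storage::txn::txn_status_cache::TxnStatusCache;

// Assumed to live inside components/backup-stream, e.g. in a #[cfg(test)]
// module where subscription_track::SubscriptionTracer is visible.
fn tracer_for_test() -> SubscriptionTracer {
    SubscriptionTracer(
        Arc::new(DashMap::new()),
        Arc::new(TxnStatusCache::new_for_test()),
    )
}
```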
futures::executor::block_on; use kvproto::metapb::*; - use tikv::storage::{txn::tests::*, TestEngineBuilder}; + use tikv::storage::{ + txn::{tests::*, txn_status_cache::TxnStatusCache}, + TestEngineBuilder, + }; use tikv_kv::SnapContext; use tikv_util::memory::{MemoryQuota, OwnedAllocated}; use txn_types::TimeStamp; @@ -524,7 +527,7 @@ mod tests { }); r.unwrap(); let mut events = ApplyEvents::with_capacity(1024, 42); - let mut res = TwoPhaseResolver::new(42, None); + let mut res = TwoPhaseResolver::new(42, None, Arc::new(TxnStatusCache::new_for_test())); loader.emit_entries_to(&mut events, &mut res).unwrap(); assert_ne!(events.len(), 0); assert_ne!(data_load, 0); diff --git a/components/backup-stream/src/lib.rs b/components/backup-stream/src/lib.rs index 0402e5d2ee3..6d7afa334c9 100644 --- a/components/backup-stream/src/lib.rs +++ b/components/backup-stream/src/lib.rs @@ -1,5 +1,6 @@ // Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. #![feature(slice_group_by)] +#![feature(trait_alias)] #![feature(result_flattening)] #![feature(assert_matches)] #![feature(test)] @@ -25,4 +26,4 @@ pub use checkpoint_manager::GetCheckpointResult; pub use endpoint::{ BackupStreamResolver, Endpoint, ObserveOp, RegionCheckpointOperation, RegionSet, Task, }; -pub use service::Service; +pub use service::BackupStreamGrpcService; diff --git a/components/backup-stream/src/metrics.rs b/components/backup-stream/src/metrics.rs index 3a2fc1d119d..7b03c482774 100644 --- a/components/backup-stream/src/metrics.rs +++ b/components/backup-stream/src/metrics.rs @@ -30,6 +30,8 @@ pub fn remove_task_status_metric(task: &str) -> Result<()> { TASK_STATUS.remove_label_values(&[task]) } +// When adding new metrics, remember to update in the grafana dashboard, for +// example update the tikv_details.dashboard.py. lazy_static! { pub static ref INTERNAL_ACTOR_MESSAGE_HANDLE_DURATION: HistogramVec = register_histogram_vec!( "tikv_log_backup_interal_actor_acting_duration_sec", diff --git a/components/backup-stream/src/observer.rs b/components/backup-stream/src/observer.rs index 6a40a336fb8..071f09cf3d9 100644 --- a/components/backup-stream/src/observer.rs +++ b/components/backup-stream/src/observer.rs @@ -189,8 +189,9 @@ impl RegionChangeObserver for BackupStreamObserver { #[cfg(test)] mod tests { - use std::{assert_matches::assert_matches, time::Duration}; + use std::{assert_matches::assert_matches, sync::Arc, time::Duration}; + use dashmap::DashMap; use engine_panic::PanicEngine; use kvproto::metapb::Region; use raft::StateRole; @@ -198,6 +199,7 @@ mod tests { Cmd, CmdBatch, CmdObserveInfo, CmdObserver, ObserveHandle, ObserveLevel, ObserverContext, RegionChangeEvent, RegionChangeObserver, RegionChangeReason, RoleChange, RoleObserver, }; + use tikv::storage::txn::txn_status_cache::TxnStatusCache; use tikv_util::{worker::dummy_scheduler, HandyRwLock}; use super::BackupStreamObserver; @@ -220,7 +222,10 @@ mod tests { // Prepare: assuming a task wants the range of [0001, 0010]. let o = BackupStreamObserver::new(sched); - let subs = SubscriptionTracer::default(); + let subs = SubscriptionTracer( + Arc::new(DashMap::new()), + Arc::new(TxnStatusCache::new_for_test()), + ); assert!(o.ranges.wl().add((b"0001".to_vec(), b"0010".to_vec()))); // Test regions can be registered. @@ -245,7 +250,10 @@ mod tests { // Prepare: assuming a task wants the range of [0001, 0010]. 
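Two resolver changes are visible above: `TwoPhaseResolver::new` (from `subscription_track`) now takes a shared `TxnStatusCache`, and lock tracking forwards the lock's `generation`. A hedged sketch of the new call shape, with `region_id`, `key`, `ts`, and `generation` as placeholder values standing in for fields read off a parsed `Lock`:

```rust
use std::sync::Arc;

use tikv::storage::txn::txn_status_cache::TxnStatusCache;
use txn_types::TimeStamp;

// Sketch: build a resolver for one region with no memory quota (second
// argument) and a test-only status cache, then track a lock found during the
// initial scan, including its generation.
fn track_one_lock(region_id: u64, key: Vec<u8>, ts: TimeStamp, generation: u64) {
    let mut resolver =
        TwoPhaseResolver::new(region_id, None, Arc::new(TxnStatusCache::new_for_test()));
    resolver
        .track_phase_one_lock(ts, key, generation)
        .expect("lock tracking exceeded the memory quota");
}
```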
let o = BackupStreamObserver::new(sched); - let subs = SubscriptionTracer::default(); + let subs = SubscriptionTracer( + Arc::new(DashMap::new()), + Arc::new(TxnStatusCache::new_for_test()), + ); assert!(o.ranges.wl().add((b"0001".to_vec(), b"0010".to_vec()))); // Test regions can be registered. @@ -288,7 +296,7 @@ mod tests { // Test region out of range won't be added to observe list. let r = fake_region(43, b"0010", b"0042"); let mut ctx = ObserverContext::new(&r); - o.on_role_change(&mut ctx, &RoleChange::new(StateRole::Leader)); + o.on_role_change(&mut ctx, &RoleChange::new_for_test(StateRole::Leader)); let task = rx.recv_timeout(Duration::from_millis(20)); assert!(task.is_err(), "it is {:?}", task); assert!(!subs.is_observing(43)); @@ -303,7 +311,7 @@ mod tests { // Test give up subscripting when become follower. let r = fake_region(42, b"0008", b"0009"); let mut ctx = ObserverContext::new(&r); - o.on_role_change(&mut ctx, &RoleChange::new(StateRole::Follower)); + o.on_role_change(&mut ctx, &RoleChange::new_for_test(StateRole::Follower)); let task = rx.recv_timeout(Duration::from_millis(20)); assert_matches!( task, @@ -325,7 +333,7 @@ mod tests { RegionChangeEvent::Update(RegionChangeReason::Split), StateRole::Leader, ); - o.on_role_change(&mut ctx, &RoleChange::new(StateRole::Leader)); + o.on_role_change(&mut ctx, &RoleChange::new_for_test(StateRole::Leader)); let task = rx.recv_timeout(Duration::from_millis(20)); assert!(task.is_err(), "it is {:?}", task); } diff --git a/components/backup-stream/src/router.rs b/components/backup-stream/src/router.rs index d2a501ff995..ce16776d9be 100644 --- a/components/backup-stream/src/router.rs +++ b/components/backup-stream/src/router.rs @@ -13,15 +13,20 @@ use std::{ time::Duration, }; -use encryption::DataKeyManager; +use dashmap::DashMap; +use encryption::{BackupEncryptionManager, EncrypterReader, Iv, MultiMasterKeyBackend}; +use encryption_export::create_async_backend; use engine_traits::{CfName, CF_DEFAULT, CF_LOCK, CF_WRITE}; use external_storage::{create_storage, BackendConfig, ExternalStorage, UnpinReader}; +use file_system::Sha256Reader; use futures::io::Cursor; use kvproto::{ brpb::{ CompressionType, DataFileGroup, DataFileInfo, FileType, MetaVersion, Metadata, - StreamBackupTaskInfo, + StreamBackupTaskInfo, StreamBackupTaskSecurityConfig_oneof_encryption, }, + encryptionpb::{EncryptionMethod, FileEncryptionInfo, MasterKeyBased, PlainTextDataKey}, + metapb::RegionEpoch, raft_cmdpb::CmdType, }; use openssl::hash::{Hasher, MessageDigest}; @@ -44,7 +49,7 @@ use tokio::{ io::AsyncWriteExt, sync::{Mutex, RwLock}, }; -use tokio_util::compat::TokioAsyncReadCompatExt; +use tokio_util::compat::{FuturesAsyncReadCompatExt, TokioAsyncReadCompatExt}; use tracing::instrument; use tracing_active_tree::frame; use txn_types::{Key, Lock, TimeStamp, WriteRef}; @@ -56,8 +61,9 @@ use crate::{ errors::{ContextualResultExt, Error}, metadata::StreamTask, metrics::{HANDLE_KV_HISTOGRAM, SKIP_KV_COUNTER}, + subscription_manager::ResolvedRegions, subscription_track::TwoPhaseResolver, - tempfiles::{self, TempFilePool}, + tempfiles::{self, ForRead, TempFilePool}, try_send, utils::{self, CompressionWriter, FilesReader, SegmentMap, SlotMap, StopWatch}, }; @@ -132,7 +138,7 @@ impl<'a> TaskSelectorRef<'a> { } } -#[derive(Debug)] +#[derive(Debug, Clone)] pub struct ApplyEvent { pub key: Vec, pub value: Vec, @@ -140,7 +146,7 @@ pub struct ApplyEvent { pub cmd_type: CmdType, } -#[derive(Debug)] +#[derive(Debug, Clone)] pub struct ApplyEvents { events: Vec, region_id: u64, 
@@ -148,6 +154,14 @@ pub struct ApplyEvents { region_resolved_ts: u64, } +#[derive(Clone, Copy)] +pub struct FlushContext<'a> { + pub task_name: &'a str, + pub store_id: u64, + pub resolved_regions: &'a ResolvedRegions, + pub resolved_ts: TimeStamp, +} + impl ApplyEvents { /// Convert a [CmdBatch] to a vector of events. Ignoring admin / error /// commands. At the same time, advancing status of the `Resolver` by @@ -199,7 +213,7 @@ impl ApplyEvents { Ok(lock) => { if utils::should_track_lock(&lock) { resolver - .track_lock(lock.ts, key) + .track_lock(lock.ts, key, lock.generation) .map_err(|_| Error::OutOfQuota { region_id })?; } } @@ -326,11 +340,10 @@ pub struct Config { pub temp_file_size_limit: u64, pub temp_file_memory_quota: u64, pub max_flush_interval: Duration, - pub data_key_manager: Option>, } -impl From for Config { - fn from(value: tikv::config::BackupStreamConfig) -> Self { +impl From for Config { + fn from(value: BackupStreamConfig) -> Self { let prefix = PathBuf::from(value.temp_path); let temp_file_size_limit = value.file_size_limit.0; let temp_file_memory_quota = value.temp_file_memory_quota.0; @@ -340,15 +353,22 @@ impl From for Config { temp_file_size_limit, temp_file_memory_quota, max_flush_interval, - data_key_manager: None, } } } impl Router { /// Create a new router with the temporary folder. - pub fn new(scheduler: Scheduler, config: Config) -> Self { - Self(Arc::new(RouterInner::new(scheduler, config))) + pub fn new( + scheduler: Scheduler, + config: Config, + backup_encryption_manager: BackupEncryptionManager, + ) -> Self { + Self(Arc::new(RouterInner::new( + scheduler, + config, + backup_encryption_manager, + ))) } } @@ -375,7 +395,7 @@ pub struct RouterInner { /// which range a point belongs to. ranges: SyncRwLock, String>>, /// The temporary files associated to some task. - tasks: Mutex>>, + tasks: DashMap>, /// The temporary directory for all tasks. prefix: PathBuf, @@ -387,7 +407,9 @@ pub struct RouterInner { temp_file_memory_quota: AtomicU64, /// The max duration the local data can be pending. 
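The new `FlushContext` bundles what `do_flush` previously took as loose arguments, plus the resolved regions that `fill_region_info` (later in this file) needs for stamping region epochs into the backup metadata. It derives `Copy`, so one context can be reused across retries, as the tests below do. A sketch of its construction, where `resolved` and `new_rts` are placeholders for values computed earlier in the flush path:

```rust
// Sketch, mirroring the tests later in this file.
let cx = FlushContext {
    task_name: "my-task",        // placeholder task name
    store_id: 1,
    resolved_regions: &resolved, // &ResolvedRegions from resolving
    resolved_ts: new_rts,        // TimeStamp candidate for the checkpoint
};
// `do_flush` yields Some(checkpoint_ts) on success, None on failure.
let maybe_rts = router.do_flush(cx).await;
```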
max_flush_interval: SyncRwLock, - data_key_manager: Option>, + + /// Backup encryption manager + backup_encryption_manager: BackupEncryptionManager, } impl std::fmt::Debug for RouterInner { @@ -401,16 +423,20 @@ impl std::fmt::Debug for RouterInner { } } impl RouterInner { - pub fn new(scheduler: Scheduler, config: Config) -> Self { + pub fn new( + scheduler: Scheduler, + config: Config, + backup_encryption_manager: BackupEncryptionManager, + ) -> Self { RouterInner { ranges: SyncRwLock::new(SegmentMap::default()), - tasks: Mutex::new(HashMap::default()), + tasks: DashMap::new(), prefix: config.prefix, scheduler, temp_file_size_limit: AtomicU64::new(config.temp_file_size_limit), temp_file_memory_quota: AtomicU64::new(config.temp_file_memory_quota), max_flush_interval: SyncRwLock::new(config.max_flush_interval), - data_key_manager: config.data_key_manager, + backup_encryption_manager, } } @@ -420,9 +446,9 @@ impl RouterInner { .store(config.file_size_limit.0, Ordering::SeqCst); self.temp_file_memory_quota .store(config.temp_file_memory_quota.0, Ordering::SeqCst); - let tasks = self.tasks.blocking_lock(); - for task in tasks.values() { - task.temp_file_pool + for entry in self.tasks.iter() { + entry + .temp_file_pool .config() .cache_size .store(config.temp_file_memory_quota.0 as usize, Ordering::SeqCst); @@ -479,21 +505,87 @@ impl RouterInner { merged_file_size_limit: u64, ) -> Result<()> { let task_name = task.info.get_name().to_owned(); - // register task info let cfg = self.tempfile_config_for_task(&task); - let stream_task = - StreamTaskInfo::new(task, ranges.clone(), merged_file_size_limit, cfg).await?; - frame!(self.tasks.lock()) - .await - .insert(task_name.clone(), Arc::new(stream_task)); + let backup_encryption_manager = + self.build_backup_encryption_manager_for_task(&task).await?; + let stream_task = StreamTaskHandler::new( + task, + ranges.clone(), + merged_file_size_limit, + cfg, + backup_encryption_manager, + ) + .await?; + self.tasks.insert(task_name.clone(), Arc::new(stream_task)); - // register ragnes + // register ranges self.register_ranges(&task_name, ranges); Ok(()) } + async fn build_backup_encryption_manager_for_task( + &self, + task: &StreamTask, + ) -> Result { + if let Some(config) = task.info.security_config.as_ref() { + if let Some(encryption) = config.encryption.as_ref() { + match encryption { + StreamBackupTaskSecurityConfig_oneof_encryption::PlaintextDataKey(key) => { + // sanity-check that the key is valid + let opt_key = if !key.cipher_key.is_empty() + && (key.cipher_type != EncryptionMethod::Plaintext + && key.cipher_type != EncryptionMethod::Unknown) + { + Some(key.clone()) + } else { + None + }; + Ok(BackupEncryptionManager::new( + opt_key, + self.backup_encryption_manager + .master_key_based_file_encryption_method, + self.backup_encryption_manager + .multi_master_key_backend + .clone(), + self.backup_encryption_manager.tikv_data_key_manager.clone(), + )) + } + StreamBackupTaskSecurityConfig_oneof_encryption::MasterKeyConfig(config) => { + let multi_master_key_backend = if !config.master_keys.is_empty() { + let multi_master_key_backend = MultiMasterKeyBackend::new(); + multi_master_key_backend + .update_from_proto_if_needed( + config.master_keys.to_vec(), + create_async_backend, + ) + .await?; + multi_master_key_backend + } else { + error!( + "received master key config but found no master key inside, falling back to default" + ); + self.backup_encryption_manager + .multi_master_key_backend + .clone() + }; + Ok(BackupEncryptionManager::new( + None, +
config.encryption_type, + multi_master_key_backend, + self.backup_encryption_manager.tikv_data_key_manager.clone(), + )) + } + } + } else { + Ok(self.backup_encryption_manager.clone()) + } + } else { + Ok(self.backup_encryption_manager.clone()) + } + } + fn tempfile_config_for_task(&self, task: &StreamTask) -> tempfiles::Config { // Note: the scope of this config is per-task. That means, when there are // multi tasks, we may need to share the pool over tasks, or at least share the @@ -507,18 +599,17 @@ impl RouterInner { content_compression: task.info.get_compression_type(), minimal_swap_out_file_size: ReadableSize::mb(1).0 as _, write_buffer_size: ReadableSize::kb(4).0 as _, - encryption: self.data_key_manager.clone(), } } - pub async fn unregister_task(&self, task_name: &str) -> Option { - frame!(self.tasks.lock()).await.remove(task_name).map(|t| { + pub fn unregister_task(&self, task_name: &str) -> Option { + self.tasks.remove(task_name).map(|t| { info!( "backup stream unregister task"; "task" => task_name, ); self.unregister_ranges(task_name); - t.task.info.clone() + t.1.task.info.clone() }) } @@ -528,11 +619,11 @@ impl RouterInner { r.get_value_by_point(key).cloned() } - #[instrument(skip(self))] - pub async fn select_task(&self, selector: TaskSelectorRef<'_>) -> Vec { - let s = frame!(self.tasks.lock()).await; - s.iter() - .filter(|(name, info)| { + pub fn select_task(&self, selector: TaskSelectorRef<'_>) -> Vec { + self.tasks + .iter() + .filter(|entry| { + let (name, info) = entry.pair(); selector.matches( name.as_str(), info.ranges @@ -540,25 +631,24 @@ impl RouterInner { .map(|(s, e)| (s.as_slice(), e.as_slice())), ) }) - .map(|(name, _)| name.to_owned()) + .map(|entry| entry.key().to_owned()) .collect() } #[cfg(test)] - pub(crate) async fn must_mut_task_info(&self, task_name: &str, mutator: F) + pub(crate) fn must_mut_task_info(&self, task_name: &str, mutator: F) where - F: FnOnce(&mut StreamTaskInfo), + F: FnOnce(&mut StreamTaskHandler), { - let mut tasks = self.tasks.lock().await; - let t = tasks.remove(task_name); - let mut raw = Arc::try_unwrap(t.unwrap()).unwrap(); + let t = self.tasks.remove(task_name); + let mut raw = Arc::try_unwrap(t.unwrap().1).unwrap(); mutator(&mut raw); - tasks.insert(task_name.to_owned(), Arc::new(raw)); + self.tasks.insert(task_name.to_owned(), Arc::new(raw)); } #[instrument(skip(self))] - pub async fn get_task_info(&self, task_name: &str) -> Result> { - let task_info = match frame!(self.tasks.lock()).await.get(task_name) { + pub fn get_task_handler(&self, task_name: &str) -> Result> { + let task_handler = match self.tasks.get(task_name) { Some(t) => t.clone(), None => { info!("backup stream no task"; "task" => ?task_name); @@ -567,13 +657,13 @@ impl RouterInner { }); } }; - Ok(task_info) + Ok(task_handler) } #[instrument(skip_all, fields(task))] - async fn on_event(&self, task: String, events: ApplyEvents) -> Result<()> { - let task_info = self.get_task_info(&task).await?; - task_info.on_events(events).await?; + async fn on_events_by_task(&self, task: String, events: ApplyEvents) -> Result<()> { + let task_handler = self.get_task_handler(&task)?; + task_handler.on_events(events).await?; let file_size_limit = self.temp_file_size_limit.load(Ordering::SeqCst); // When this event make the size of temporary files exceeds the size limit, make @@ -582,16 +672,16 @@ impl RouterInner { debug!( "backup stream statics size"; "task" => ?task, - "next_size" => task_info.total_size(), + "next_size" => task_handler.total_size(), "size_limit" => file_size_limit, 
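One detail worth calling out from the `PlaintextDataKey` branch above: a user-supplied key is adopted only when it can actually encrypt, and is silently dropped otherwise. Expressed as a standalone predicate (a hypothetical helper; the real code inlines this check):

```rust
use kvproto::{brpb::CipherInfo, encryptionpb::EncryptionMethod};

// Hypothetical helper equivalent to the inline sanity check above: honor the
// plaintext data key only if a key is present and the method is a real cipher.
fn usable_plaintext_key(key: &CipherInfo) -> bool {
    !key.cipher_key.is_empty()
        && key.cipher_type != EncryptionMethod::Plaintext
        && key.cipher_type != EncryptionMethod::Unknown
}
```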
); - let cur_size = task_info.total_size(); - if cur_size > file_size_limit && !task_info.is_flushing() { + let cur_size = task_handler.total_size(); + if cur_size > file_size_limit && !task_handler.is_flushing() { info!("try flushing task"; "task" => %task, "size" => %cur_size); - if task_info.set_flushing_status_cas(false, true).is_ok() { + if task_handler.set_flushing_status_cas(false, true).is_ok() { if let Err(e) = self.scheduler.schedule(Task::Flush(task)) { error!("backup stream schedule task failed"; "error" => ?e); - task_info.set_flushing_status(false); + task_handler.set_flushing_status(false); } } } @@ -602,37 +692,33 @@ impl RouterInner { use futures::FutureExt; HANDLE_KV_HISTOGRAM.observe(kv.len() as _); let partitioned_events = kv.partition_by_range(&self.ranges.rl()); - let tasks = partitioned_events - .into_iter() - .map(|(task, events)| self.on_event(task.clone(), events).map(move |r| (task, r))); + let tasks = partitioned_events.into_iter().map(|(task, events)| { + self.on_events_by_task(task.clone(), events) + .map(move |r| (task, r)) + }); futures::future::join_all(tasks).await } - /// flush the specified task, once once success, return the min resolved ts + /// flush the specified task, once success, return the min resolved ts /// of this flush. returns `None` if failed. - #[instrument(skip(self, resolve_to))] - pub async fn do_flush( - &self, - task_name: &str, - store_id: u64, - resolve_to: TimeStamp, - ) -> Option { - let task = self.tasks.lock().await.get(task_name).cloned(); + #[instrument(skip(self, cx))] + pub async fn do_flush(&self, cx: FlushContext<'_>) -> Option { + let task = self.tasks.get(cx.task_name); match task { - Some(task_info) => { - let result = task_info.do_flush(store_id, resolve_to).await; + Some(task_handler) => { + let result = task_handler.do_flush(cx).await; // set false to flushing whether success or fail - task_info.set_flushing_status(false); + task_handler.set_flushing_status(false); if let Err(e) = result { e.report("failed to flush task."); warn!("backup steam do flush fail"; "err" => ?e); - if task_info.flush_failure_count() > FLUSH_FAILURE_BECOME_FATAL_THRESHOLD { + if task_handler.flush_failure_count() > FLUSH_FAILURE_BECOME_FATAL_THRESHOLD { // NOTE: Maybe we'd better record all errors and send them to the client? try_send!( self.scheduler, Task::FatalError( - TaskSelector::ByName(task_name.to_owned()), + TaskSelector::ByName(cx.task_name.to_owned()), Box::new(e) ) ); @@ -640,7 +726,7 @@ impl RouterInner { return None; } // if succeed in flushing, update flush_time. Or retry do_flush immediately. - task_info.update_flush_time(); + task_handler.update_flush_time(); result.ok().flatten() } _ => None, @@ -654,8 +740,7 @@ impl RouterInner { global_checkpoint: u64, store_id: u64, ) -> Result { - self.get_task_info(task_name) - .await? + self.get_task_handler(task_name)? .update_global_checkpoint(global_checkpoint, store_id) .await } @@ -665,7 +750,9 @@ impl RouterInner { pub async fn tick(&self) { let max_flush_interval = self.max_flush_interval.rl().to_owned(); - for (name, task_info) in self.tasks.lock().await.iter() { + for entry in self.tasks.iter() { + let name = entry.key(); + let task_info = entry.value(); if let Err(e) = self .scheduler .schedule(Task::UpdateGlobalCheckpoint(name.to_string())) @@ -745,6 +832,7 @@ impl TempFileKey { } /// The full name of the file owns the key. 
+ #[allow(clippy::redundant_closure_call)] fn temp_file_name(&self) -> String { let timestamp = (|| { fail::fail_point!("temp_file_name_timestamp", |t| t.map_or_else( @@ -824,12 +912,18 @@ impl TempFileKey { } } -pub struct StreamTaskInfo { +/// StreamTaskHandler acts on the events for the backup stream task. +/// It writes the key value pair changes from raft store to local temp files and +/// flushes it to the external storage. +pub struct StreamTaskHandler { pub(crate) task: StreamTask, /// support external storage. eg local/s3. pub(crate) storage: Arc, /// The listening range of the task. ranges: Vec<(Vec, Vec)>, + // Note: acquiring locks on files, flushing_files, flushing_meta_files at the same time might + // introduce deadlocks if ordering is not carefully maintained. Better think of another way + // to make it risk-free. /// The temporary file index. Both meta (m prefixed keys) and data (t /// prefixed keys). files: SlotMap, @@ -858,9 +952,12 @@ pub struct StreamTaskInfo { merged_file_size_limit: u64, /// The pool for holding the temporary files. temp_file_pool: Arc, + /// encryption manager to encrypt backup files uploaded + /// to external storage and local temp files. + backup_encryption_manager: BackupEncryptionManager, } -impl Drop for StreamTaskInfo { +impl Drop for StreamTaskHandler { fn drop(&mut self) { let (success, failed): (Vec<_>, Vec<_>) = self .flushing_files @@ -870,7 +967,7 @@ impl Drop for StreamTaskInfo { .map(|(_, f, _)| f.inner.path().to_owned()) .map(|p| self.temp_file_pool.remove(&p)) .partition(|r| *r); - info!("stream task info dropped[1/2], removing flushing_temp files"; "success" => %success.len(), "failure" => %failed.len()); + info!("stream task handler dropped[1/2], removing flushing_temp files"; "success" => %success.len(), "failure" => %failed.len()); let (success, failed): (Vec<_>, Vec<_>) = self .files .get_mut() @@ -878,13 +975,13 @@ impl Drop for StreamTaskInfo { .map(|(_, f)| f.into_inner().inner.path().to_owned()) .map(|p| self.temp_file_pool.remove(&p)) .partition(|r| *r); - info!("stream task info dropped[2/2], removing temp files"; "success" => %success.len(), "failure" => %failed.len()); + info!("stream task handler dropped[2/2], removing temp files"; "success" => %success.len(), "failure" => %failed.len()); } } -impl std::fmt::Debug for StreamTaskInfo { +impl std::fmt::Debug for StreamTaskHandler { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - f.debug_struct("StreamTaskInfo") + f.debug_struct("StreamTaskHandler") .field("task", &self.task.info.name) .field("min_resolved_ts", &self.min_resolved_ts) .field("total_size", &self.total_size) @@ -893,13 +990,14 @@ impl std::fmt::Debug for StreamTaskInfo { } } -impl StreamTaskInfo { +impl StreamTaskHandler { /// Create a new temporary file set at the `temp_dir`. 
pub async fn new( task: StreamTask, ranges: Vec<(Vec, Vec)>, merged_file_size_limit: u64, temp_pool_cfg: tempfiles::Config, + backup_encryption_manager: BackupEncryptionManager, ) -> Result { let temp_dir = &temp_pool_cfg.swap_files; tokio::fs::create_dir_all(temp_dir).await?; @@ -922,7 +1020,40 @@ impl StreamTaskInfo { flush_fail_count: AtomicUsize::new(0), global_checkpoint_ts: AtomicU64::new(start_ts), merged_file_size_limit, - temp_file_pool: Arc::new(TempFilePool::new(temp_pool_cfg)?), + temp_file_pool: Arc::new(TempFilePool::new( + temp_pool_cfg, + backup_encryption_manager.clone(), + )?), + backup_encryption_manager, + }) + } + + #[cfg(test)] + pub fn new_test_only( + task: StreamTask, + ranges: Vec<(Vec, Vec)>, + merged_file_size_limit: u64, + external_storage: Arc, + pool: Arc, + backup_encryption_manager: BackupEncryptionManager, + ) -> Result { + let start_ts = task.info.get_start_ts(); + Ok(Self { + task, + storage: external_storage, + ranges, + min_resolved_ts: TimeStamp::max(), + files: SlotMap::default(), + flushing_files: RwLock::default(), + flushing_meta_files: RwLock::default(), + last_flush_time: AtomicPtr::new(Box::into_raw(Box::new(Instant::now()))), + total_size: AtomicUsize::new(0), + flushing: AtomicBool::new(false), + flush_fail_count: AtomicUsize::new(0), + global_checkpoint_ts: AtomicU64::new(start_ts), + merged_file_size_limit, + temp_file_pool: pool, + backup_encryption_manager, }) } @@ -988,7 +1119,7 @@ impl StreamTaskInfo { /// Flush all template files and generate corresponding metadata. #[instrument(skip_all)] - pub async fn generate_metadata(&self, store_id: u64) -> Result { + pub async fn generate_backup_metadata(&self, store_id: u64) -> Result { let mut w = self.flushing_files.write().await; let mut wm = self.flushing_meta_files.write().await; // Let's flush all files first... @@ -1008,6 +1139,36 @@ impl StreamTaskInfo { Ok(metadata) } + fn fill_region_info(&self, cx: FlushContext<'_>, metas: &mut MetadataInfo) { + let mut rmap = HashMap::, &[u8], &[u8])>::new(); + for res in cx.resolved_regions.resolve_results() { + rmap.entry(res.region.id) + .and_modify(|(epoch, start, end)| { + epoch.push(res.region.get_region_epoch()); + if *start > res.region.start_key.as_slice() { + *start = &res.region.start_key; + } + if *end < res.region.end_key.as_slice() { + *end = &res.region.end_key; + } + }) + .or_insert({ + let r = &res.region; + (vec![r.get_region_epoch()], &r.start_key, &r.end_key) + }); + } + + for fg in metas.file_groups.iter_mut() { + for f in fg.data_files_info.iter_mut() { + if let Some((epoches, start_key, end_key)) = rmap.get(&(f.region_id as _)) { + f.set_region_epoch(epoches.iter().copied().cloned().collect::>().into()); + f.set_region_start_key(start_key.to_vec()); + f.set_region_end_key(end_key.to_vec()); + } + } + } + } + pub fn set_flushing_status_cas(&self, expect: bool, new: bool) -> result::Result { self.flushing .compare_exchange(expect, new, Ordering::SeqCst, Ordering::SeqCst) @@ -1090,11 +1251,10 @@ impl StreamTaskInfo { #[instrument(skip_all)] async fn merge_and_flush_log_files_to( - storage: Arc, + &self, files: &mut [(TempFileKey, DataFile, DataFileInfo)], metadata: &mut MetadataInfo, is_meta: bool, - shared_pool: Arc, ) -> Result<()> { let mut data_files_open = Vec::new(); let mut data_file_infos = Vec::new(); @@ -1109,16 +1269,19 @@ impl StreamTaskInfo { // and push it into merged_file_info(DataFileGroup). 
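For every data file merged into one external-storage object, `set_range_offset`/`set_range_length` record the byte slice that the (still compressed, possibly encrypted) file occupies, and `stat_length` advances the running offset. The layout rule in isolation, as a minimal self-contained sketch:

```rust
// Sketch of the offset bookkeeping used when merging files into one object:
// files are concatenated back to back, so file i occupies
// [offset_i, offset_i + len_i) and the final offset is the object's size.
fn assign_ranges(compressed_lens: &[u64]) -> (Vec<(u64, u64)>, u64) {
    let mut offset = 0u64;
    let ranges = compressed_lens
        .iter()
        .map(|&len| {
            let range = (offset, len); // (range_offset, range_length)
            offset += len;
            range
        })
        .collect();
    (ranges, offset)
}
```

The read-back test at the end of this file relies on exactly these offsets to cut each file out of the merged object before decrypting and decompressing it.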
file_info_clone.set_range_offset(stat_length); data_files_open.push({ - let file = shared_pool + // the file content is compressed. + let file_reader = self + .temp_file_pool .open_raw_for_read(data_file.inner.path()) .context(format_args!( "failed to open read file {:?}", data_file.inner.path() ))?; - let compress_length = file.len().await?; + let compress_length = file_reader.len().await?; stat_length += compress_length; file_info_clone.set_range_length(compress_length); - file + + file_reader }); data_file_infos.push(file_info_clone); @@ -1140,16 +1303,17 @@ impl StreamTaskInfo { merged_file_info.set_max_ts(max_ts); merged_file_info.set_min_ts(min_ts); merged_file_info.set_min_resolved_ts(min_resolved_ts.unwrap_or_default()); + let (unpin_reader, file_encryption_info_vec, hasher_vec) = self + .build_unpin_reader_with_encryption_if_needed(data_files_open) + .await?; - // to do: limiter to storage - let limiter = Limiter::builder(std::f64::INFINITY).build(); - - let files_reader = FilesReader::new(data_files_open); - - let reader = UnpinReader(Box::new(limiter.limit(files_reader.compat()))); let filepath = &merged_file_info.path; - let ret = storage.write(filepath, reader, stat_length).await; + // flush to external storage + let ret = self + .storage + .write(filepath, unpin_reader, stat_length) + .await; match ret { Ok(_) => { @@ -1168,43 +1332,46 @@ impl StreamTaskInfo { } } + // update data file metadata with encryption info and calculated file checksum + self.update_data_file_metadata( + &mut merged_file_info, + hasher_vec, + file_encryption_info_vec, + )?; + // push merged file into metadata metadata.push(merged_file_info); Ok(()) } #[instrument(skip_all)] - pub async fn flush_log(&self, metadata: &mut MetadataInfo) -> Result<()> { - let storage = self.storage.clone(); - self.merge_log(metadata, storage.clone(), &self.flushing_files, false) + pub async fn flush_log(&self, backup_metadata: &mut MetadataInfo) -> Result<()> { + self.merge_and_flush_log(backup_metadata, &self.flushing_files, false) .await?; - self.merge_log(metadata, storage.clone(), &self.flushing_meta_files, true) + self.merge_and_flush_log(backup_metadata, &self.flushing_meta_files, true) .await?; Ok(()) } #[instrument(skip_all)] - async fn merge_log( + async fn merge_and_flush_log( &self, metadata: &mut MetadataInfo, - storage: Arc, - files_lock: &RwLock>, - is_meta: bool, + files: &RwLock>, + is_tikv_meta_file: bool, ) -> Result<()> { - let mut files = files_lock.write().await; + let mut files_guard = files.write().await; let mut batch_size = 0; // file[batch_begin_index, i) is a batch let mut batch_begin_index = 0; // TODO: upload the merged file concurrently, // then collect merged_file_infos and push them into `metadata`. 
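`merge_and_flush_log` (above) batches the flushing files greedily: once the accumulated length of the current batch reaches `merged_file_size_limit`, it flushes `[batch_begin_index, i)` and starts a new batch, with a final flush for the tail. The same grouping rule as a self-contained helper (hypothetical; the real loop flushes each batch as soon as it is cut, where `lengths[i]` plays the role of `files_guard[i].2.length`):

```rust
// Accumulate files until the running size reaches `limit`, then cut a batch.
fn split_batches(lengths: &[u64], limit: u64) -> Vec<std::ops::Range<usize>> {
    let mut batches = Vec::new();
    let (mut begin, mut size) = (0usize, 0u64);
    for (i, &len) in lengths.iter().enumerate() {
        if size >= limit {
            batches.push(begin..i);
            begin = i;
            size = 0;
        }
        size += len;
    }
    if begin < lengths.len() {
        batches.push(begin..lengths.len()); // flush the tail batch
    }
    batches
}
```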
- for i in 0..files.len() { + for i in 0..files_guard.len() { if batch_size >= self.merged_file_size_limit { - Self::merge_and_flush_log_files_to( - storage.clone(), - &mut files[batch_begin_index..i], + self.merge_and_flush_log_files_to( + &mut files_guard[batch_begin_index..i], metadata, - is_meta, - self.temp_file_pool.clone(), + is_tikv_meta_file, ) .await?; @@ -1212,15 +1379,13 @@ impl StreamTaskInfo { batch_size = 0; } - batch_size += files[i].2.length; + batch_size += files_guard[i].2.length; } - if batch_begin_index < files.len() { - Self::merge_and_flush_log_files_to( - storage.clone(), - &mut files[batch_begin_index..], + if batch_begin_index < files_guard.len() { + self.merge_and_flush_log_files_to( + &mut files_guard[batch_begin_index..], metadata, - is_meta, - self.temp_file_pool.clone(), + is_tikv_meta_file, ) .await?; } @@ -1229,7 +1394,7 @@ impl StreamTaskInfo { } #[instrument(skip_all)] - pub async fn flush_meta(&self, metadata_info: MetadataInfo) -> Result<()> { + pub async fn flush_backup_metadata(&self, metadata_info: MetadataInfo) -> Result<()> { if !metadata_info.file_groups.is_empty() { let meta_path = metadata_info.path_to_meta(); let meta_buff = metadata_info.marshal_to()?; @@ -1258,11 +1423,7 @@ impl StreamTaskInfo { /// function, and we would use `max(resolved_ts_provided, /// resolved_ts_from_file)`. #[instrument(skip_all)] - pub async fn do_flush( - &self, - store_id: u64, - resolved_ts_provided: TimeStamp, - ) -> Result> { + pub async fn do_flush(&self, cx: FlushContext<'_>) -> Result> { // do nothing if not flushing status. let result: Result> = async move { if !self.is_flushing() { @@ -1271,11 +1432,11 @@ impl StreamTaskInfo { let begin = Instant::now_coarse(); let mut sw = StopWatch::by_now(); - // generate meta data and prepare to flush to storage - let mut metadata_info = self + // generate backup meta data and prepare to flush to storage + let mut backup_metadata = self .move_to_flushing_files() .await? - .generate_metadata(store_id) + .generate_backup_metadata(cx.store_id) .await?; fail::fail_point!("after_moving_to_flushing_files"); @@ -1284,22 +1445,25 @@ impl StreamTaskInfo { .observe(sw.lap().as_secs_f64()); // flush log file to storage. - self.flush_log(&mut metadata_info).await?; + self.flush_log(&mut backup_metadata).await?; // the field `min_resolved_ts` of metadata will be updated // only after flush is done. - metadata_info.min_resolved_ts = metadata_info + backup_metadata.min_resolved_ts = backup_metadata .min_resolved_ts - .max(Some(resolved_ts_provided.into_inner())); - let rts = metadata_info.min_resolved_ts; + .max(Some(cx.resolved_ts.into_inner())); + let rts = backup_metadata.min_resolved_ts; // compress length - let file_size_vec = metadata_info + let file_size_vec = backup_metadata .file_groups .iter() .map(|d| (d.length, d.data_files_info.len())) .collect::>(); + // flush meta file to storage. - self.flush_meta(metadata_info).await?; + self.fill_region_info(cx, &mut backup_metadata); + // flush backup metadata to external storage. 
+ self.flush_backup_metadata(backup_metadata).await?; crate::metrics::FLUSH_DURATION .with_label_values(&["save_files"]) .observe(sw.lap().as_secs_f64()); @@ -1368,6 +1532,169 @@ impl StreamTaskInfo { } Ok(false) } + + async fn build_unpin_reader_with_encryption_if_needed( + &self, + mut data_file_readers: Vec, + ) -> Result<( + UnpinReader<'_>, + Vec, + Vec>>, + )> { + // to do: limiter to storage + let limiter = Limiter::builder(f64::INFINITY).build(); + + // prioritize plaintext key passed from the user, it will override the default + // master key config if there is any + // + if let Some(cipher_info) = self.backup_encryption_manager.plaintext_data_key.as_ref() { + let mut encrypted_hashing_readers = Vec::new(); + let mut hasher_vec = Vec::new(); + let mut encryption_info_vec = Vec::new(); + + let data_key = &cipher_info.cipher_key; + for data_file_reader in data_file_readers.drain(..) { + let file_iv = Iv::new_ctr().map_err(|e| { + Error::Other(box_err!( + "failed to create IV for plaintext data key: {:?}", + e + )) + })?; + let encrypter_reader = EncrypterReader::new( + data_file_reader.compat(), + cipher_info.cipher_type, + &data_key[..], + file_iv, + ) + .map_err(|e| { + Error::Other(box_err!( + "failed to create encrypted reader for plaintext data key: {:?}", + e + )) + })?; + let (sha256_reader, hasher) = Sha256Reader::new(encrypter_reader.compat()) + .map_err(|e| { + Error::Other(box_err!( + "failed to create sha256 reader for plaintext data key: {:?}", + e + )) + })?; + + let mut encryption_info = FileEncryptionInfo::new(); + encryption_info.set_file_iv(file_iv.as_slice().to_vec()); + encryption_info.set_encryption_method(cipher_info.cipher_type); + encryption_info.set_plain_text_data_key(PlainTextDataKey::new()); + + encryption_info_vec.push(encryption_info); + hasher_vec.push(hasher); + encrypted_hashing_readers.push(sha256_reader); + } + + let files_reader = FilesReader::new(encrypted_hashing_readers); + let unpin_reader = UnpinReader(Box::new(limiter.limit(files_reader.compat()))); + Ok((unpin_reader, encryption_info_vec, hasher_vec)) + } else if self + .backup_encryption_manager + .is_master_key_backend_initialized() + .await + { + let mut encrypted_hashing_readers = Vec::new(); + let mut hasher_vec = Vec::new(); + let mut encryption_info_vec = Vec::new(); + + let data_key = self + .backup_encryption_manager + .generate_data_key() + .map_err(|e| Error::Other(box_err!("failed to generate data key: {:?}", e)))?; + + let encrypted_data_key = self + .backup_encryption_manager + .encrypt_data_key(&data_key) + .await + .map_err(|e| Error::Other(box_err!("failed to encrypt data key: {:?}", e)))?; + + // iterate data files readers and wrap with encryption + hashing + // + for data_file_reader in data_file_readers.drain(..) 
{ + let file_iv = Iv::new_ctr().map_err(|e| { + Error::Other(box_err!( + "failed to create IV for master key based data key: {:?}", + e + )) + })?; + let encrypter_reader = EncrypterReader::new( + data_file_reader.compat(), + self.backup_encryption_manager + .master_key_based_file_encryption_method, + &data_key, + file_iv, + ) + .map_err(|e| { + Error::Other(box_err!( + "failed to create encrypted reader for master key based data key: {:?}", + e + )) + })?; + let (sha256_reader, hasher) = Sha256Reader::new(encrypter_reader.compat()) + .map_err(|e| { + Error::Other(box_err!( + "failed to create sha256 reader for master key based data key: {:?}", + e + )) + })?; + + let mut master_key_based_info = MasterKeyBased::new(); + master_key_based_info + .set_data_key_encrypted_content(vec![encrypted_data_key.clone()].into()); + + let mut encryption_info = FileEncryptionInfo::new(); + encryption_info.set_master_key_based(master_key_based_info); + encryption_info.set_file_iv(file_iv.as_slice().to_vec()); + encryption_info.set_encryption_method( + self.backup_encryption_manager + .master_key_based_file_encryption_method, + ); + + encryption_info_vec.push(encryption_info); + hasher_vec.push(hasher); + encrypted_hashing_readers.push(sha256_reader); + } + + let files_reader = FilesReader::new(encrypted_hashing_readers); + let unpin_reader = UnpinReader(Box::new(limiter.limit(files_reader.compat()))); + Ok((unpin_reader, encryption_info_vec, hasher_vec)) + } else { + // no encryption + let files_reader = FilesReader::new(data_file_readers); + let unpin_reader = UnpinReader(Box::new(limiter.limit(files_reader.compat()))); + Ok((unpin_reader, vec![], vec![])) + } + } + + fn update_data_file_metadata( + &self, + data_file_meta: &mut DataFileGroup, + hasher_vec: Vec>>, + encryption_info_vec: Vec, + ) -> Result<()> { + // Iterate over the vectors and update the data_file_meta + for ((meta, hasher), mut encryption_info) in data_file_meta + .data_files_info + .iter_mut() + .zip(hasher_vec.into_iter()) + .zip(encryption_info_vec.into_iter()) + { + let checksum = hasher + .lock() + .unwrap() + .finish() + .map(|digest| digest.to_vec()) + .map_err(|e| Error::Other(box_err!("calculate checksum error: {:?}", e)))?; + encryption_info.set_checksum(checksum); + meta.set_file_encryption_info(encryption_info); + } + Ok(()) + } } /// A opened log file with some metadata. 
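`build_unpin_reader_with_encryption_if_needed` applies the same wrapping in both encrypted paths; they differ only in where the data key comes from (the user's plaintext key versus a freshly generated key sealed by the master-key backend). The per-file pipeline, sketched with the calls used above (a shape sketch only, not runnable on its own; `EncrypterReader`, `Sha256Reader`, `Iv`, and `FilesReader` are TiKV-internal types):

```rust
// Per-file wrapping order used above for each compressed temp-file reader:
//
//   reader
//     -> EncrypterReader::new(reader.compat(), method, &data_key, Iv::new_ctr()?)
//     -> Sha256Reader::new(encrypted.compat())  // yields (reader, Arc<Mutex<Hasher>>)
//
// All hashing readers are then concatenated and throttled for upload:
//
//   FilesReader::new(readers) -> limiter.limit(...) -> UnpinReader(Box::new(...))
//
// Each file gets a fresh random CTR IV, recorded in its FileEncryptionInfo,
// and the hasher handles are kept so update_data_file_metadata can store each
// file's SHA-256 checksum after the upload completes.
```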
@@ -1376,6 +1703,7 @@ struct DataFile { max_ts: TimeStamp, resolved_ts: TimeStamp, min_begin_ts: Option, + // checksum of plaintext kv file , calculated before compression sha256: Hasher, // TODO: use lz4 with async feature inner: tempfiles::ForWrite, @@ -1384,6 +1712,7 @@ struct DataFile { end_key: Vec, number_of_entries: usize, file_size: usize, + crc64xor: u64, } #[derive(Debug)] @@ -1466,6 +1795,7 @@ impl DataFile { file_size: 0, start_key: vec![], end_key: vec![], + crc64xor: 0, }) } @@ -1488,6 +1818,10 @@ impl DataFile { let mut total_size = 0; for mut event in events.events { + let mut digest = crc64fast::Digest::new(); + digest.write(&event.key); + digest.write(&event.value); + self.crc64xor ^= digest.sum64(); let encoded = EventEncoder::encode_event(&event.key, &event.value); let mut size = 0; for slice in encoded { @@ -1550,6 +1884,7 @@ impl DataFile { .map(|bytes| bytes.to_vec()) .map_err(|err| Error::Other(box_err!("openssl hasher failed to init: {}", err)))?, ); + meta.set_crc64xor(self.crc64xor); meta.set_number_of_entries(self.number_of_entries as _); meta.set_max_ts(self.max_ts.into_inner() as _); meta.set_min_ts(self.min_ts.into_inner() as _); @@ -1580,6 +1915,7 @@ impl std::fmt::Debug for DataFile { .field("min_ts", &self.min_ts) .field("max_ts", &self.max_ts) .field("resolved_ts", &self.resolved_ts) + .field("file_size", &self.file_size) .finish() } } @@ -1596,23 +1932,35 @@ struct TaskRange { #[cfg(test)] mod tests { - use std::{ffi::OsStr, io, time::Duration}; - - use external_storage::{ExternalData, NoopStorage}; - use futures::AsyncReadExt; - use kvproto::brpb::{Local, Noop, StorageBackend, StreamBackupTaskInfo}; + use std::{ffi::OsStr, io, io::Cursor, time::Duration}; + + use async_compression::tokio::bufread::ZstdDecoder; + use encryption::{DecrypterReader, FileConfig, MasterKeyConfig, MultiMasterKeyBackend}; + use external_storage::{BlobObject, ExternalData, NoopStorage}; + use futures::{future::LocalBoxFuture, stream::LocalBoxStream, AsyncReadExt}; + use kvproto::{ + brpb::{CipherInfo, Noop, StorageBackend, StreamBackupTaskInfo}, + encryptionpb::EncryptionMethod, + }; use online_config::{ConfigManager, OnlineConfig}; + use rand::Rng; use tempfile::TempDir; use tikv_util::{ - codec::number::NumberEncoder, + codec::{ + number::NumberEncoder, + stream_event::{EventIterator, Iterator}, + }, config::ReadableDuration, worker::{dummy_scheduler, ReceiverWrapper}, }; + use tokio::{fs::File, io::BufReader}; use txn_types::{Write, WriteType}; use super::*; use crate::{config::BackupStreamConfigManager, utils}; + static EMPTY_RESOLVE: ResolvedRegions = ResolvedRegions::new(TimeStamp::zero(), vec![]); + #[derive(Debug)] struct KvEventsBuilder { events: ApplyEvents, @@ -1625,7 +1973,6 @@ mod tests { content_compression: CompressionType::Zstd, minimal_swap_out_file_size: 0, write_buffer_size: 0, - encryption: None, } } @@ -1723,8 +2070,8 @@ mod tests { temp_file_size_limit: 1024, temp_file_memory_quota: 1024 * 2, max_flush_interval: Duration::from_secs(300), - data_key_manager: None, }, + BackupEncryptionManager::default(), ); // -----t1.start-----t1.end-----t2.start-----t2.end------ // --|------------|----------|------------|-----------|-- @@ -1749,15 +2096,6 @@ mod tests { result } - fn create_local_storage_backend(path: String) -> StorageBackend { - let mut local = Local::default(); - local.set_path(path); - - let mut sb = StorageBackend::default(); - sb.set_local(local); - sb - } - fn create_noop_storage_backend() -> StorageBackend { let nop = Noop::new(); let mut backend = 
StorageBackend::default(); @@ -1765,15 +2103,12 @@ mod tests { backend } - async fn task(name: String) -> Result<(StreamBackupTaskInfo, PathBuf)> { + async fn task_handler(name: String) -> Result<(StreamBackupTaskInfo, PathBuf)> { let mut stream_task = StreamBackupTaskInfo::default(); stream_task.set_name(name); let storage_path = std::env::temp_dir().join(format!("{}", uuid::Uuid::new_v4())); tokio::fs::create_dir_all(&storage_path).await?; - println!("storage={:?}", storage_path); - stream_task.set_storage(create_local_storage_backend( - storage_path.to_str().unwrap().to_string(), - )); + stream_task.set_storage(external_storage::make_local_backend(storage_path.as_path())); Ok((stream_task, storage_path)) } @@ -1834,22 +2169,22 @@ mod tests { temp_file_size_limit: 32, temp_file_memory_quota: 32 * 2, max_flush_interval: Duration::from_secs(300), - data_key_manager: None, }, + BackupEncryptionManager::default(), ); - let (stream_task, storage_path) = task("dummy".to_owned()).await.unwrap(); + let (stream_task, storage_path) = task_handler("dummy".to_owned()).await.unwrap(); must_register_table(&router, stream_task, 1).await; let start_ts = write_simple_data(&router).await; tokio::time::sleep(Duration::from_millis(200)).await; let end_ts = TimeStamp::physical_now(); - let files = router.tasks.lock().await.get("dummy").unwrap().clone(); - let mut meta = files + let task_handler = router.tasks.get("dummy").unwrap().clone(); + let mut meta = task_handler .move_to_flushing_files() .await .unwrap() - .generate_metadata(1) + .generate_backup_metadata(1) .await .unwrap(); @@ -1869,16 +2204,16 @@ mod tests { // in some case when flush failed to write files to storage. // we may run `generate_metadata` again with same files. - let mut another_meta = files + let mut another_meta = task_handler .move_to_flushing_files() .await .unwrap() - .generate_metadata(1) + .generate_backup_metadata(1) .await .unwrap(); - files.flush_log(&mut meta).await.unwrap(); - files.flush_log(&mut another_meta).await.unwrap(); + task_handler.flush_log(&mut meta).await.unwrap(); + task_handler.flush_log(&mut another_meta).await.unwrap(); // meta updated let files_num = meta .file_groups @@ -1899,8 +2234,8 @@ mod tests { } } - files.flush_meta(meta).await.unwrap(); - files.clear_flushing_files().await; + task_handler.flush_backup_metadata(meta).await.unwrap(); + task_handler.clear_flushing_files().await; drop(router); let cmds = collect_recv(rx); @@ -1915,7 +2250,6 @@ mod tests { for entry in walkdir::WalkDir::new(storage_path) { let entry = entry.unwrap(); let filename = entry.file_name(); - println!("walking {}", entry.path().display()); if entry.path().extension() == Some(OsStr::new("meta")) { meta_count += 1; } else if entry.path().extension() == Some(OsStr::new("log")) { @@ -1957,11 +2291,12 @@ mod tests { is_paused: false, }; let merged_file_size_limit = 0x10000; - let task = StreamTaskInfo::new( + let task_handler = StreamTaskHandler::new( stream_task, vec![(vec![], vec![])], merged_file_size_limit, make_tempfiles_cfg(tmp_dir.path()), + BackupEncryptionManager::default(), ) .await .unwrap(); @@ -1970,28 +2305,23 @@ mod tests { let region_count = merged_file_size_limit / (4 * 1024); // 2 merged log files for i in 1..=region_count { let kv_events = mock_build_large_kv_events(i as _, i as _, i as _); - task.on_events(kv_events).await.unwrap(); + task_handler.on_events(kv_events).await.unwrap(); } // do_flush - task.set_flushing_status(true); - task.do_flush(1, TimeStamp::new(1)).await.unwrap(); - 
assert_eq!(task.flush_failure_count(), 0); - assert_eq!(task.files.read().await.is_empty(), true); - assert_eq!(task.flushing_files.read().await.is_empty(), true); + task_handler.set_flushing_status(true); + let cx = FlushContext { + task_name: &task_handler.task.info.name, + store_id: 1, + resolved_regions: &EMPTY_RESOLVE, + resolved_ts: TimeStamp::new(1), + }; + task_handler.do_flush(cx).await.unwrap(); + assert_eq!(task_handler.flush_failure_count(), 0); + assert_eq!(task_handler.files.read().await.is_empty(), true); + assert_eq!(task_handler.flushing_files.read().await.is_empty(), true); // assert backup log files - let mut meta_count = 0; - let mut log_count = 0; - for entry in walkdir::WalkDir::new(tmp_dir.path()) { - let entry = entry.unwrap(); - if entry.path().extension() == Some(OsStr::new("meta")) { - meta_count += 1; - } else if entry.path().extension() == Some(OsStr::new("log")) { - log_count += 1; - } - } - assert_eq!(meta_count, 1); - assert_eq!(log_count, 2); + verify_on_disk_file(tmp_dir.path(), 2, 1); } struct ErrorStorage { @@ -2044,7 +2374,7 @@ mod tests { async fn write( &self, name: &str, - reader: UnpinReader, + reader: UnpinReader<'_>, content_length: u64, ) -> io::Result<()> { (self.error_on_write)()?; @@ -2058,6 +2388,17 @@ mod tests { fn read_part(&self, name: &str, off: u64, len: u64) -> ExternalData<'_> { self.inner.read_part(name, off, len) } + + fn iter_prefix( + &self, + _prefix: &str, + ) -> LocalBoxStream<'_, std::result::Result> { + unreachable!() + } + + fn delete(&self, _name: &str) -> LocalBoxFuture<'_, io::Result<()>> { + unreachable!() + } } fn build_kv_event(base: i32, count: i32) -> ApplyEvents { @@ -2084,30 +2425,29 @@ mod tests { temp_file_size_limit: 1, temp_file_memory_quota: 2, max_flush_interval: Duration::from_secs(300), - data_key_manager: None, }, + BackupEncryptionManager::default(), )); - let (task, _path) = task("error_prone".to_owned()).await?; + let cx = FlushContext { + task_name: "error_prone", + store_id: 42, + resolved_regions: &EMPTY_RESOLVE, + resolved_ts: TimeStamp::max(), + }; + let (task, _path) = task_handler("error_prone".to_owned()).await?; must_register_table(router.as_ref(), task, 1).await; - router - .must_mut_task_info("error_prone", |i| { - i.storage = Arc::new(ErrorStorage::with_first_time_error(i.storage.clone())) - }) - .await; + router.must_mut_task_info("error_prone", |i| { + i.storage = Arc::new(ErrorStorage::with_first_time_error(i.storage.clone())) + }); check_on_events_result(&router.on_events(build_kv_event(0, 10)).await); - assert!( - router - .do_flush("error_prone", 42, TimeStamp::max()) - .await - .is_none() - ); + assert!(router.do_flush(cx).await.is_none()); check_on_events_result(&router.on_events(build_kv_event(10, 10)).await); - let t = router.get_task_info("error_prone").await.unwrap(); - let _ = router.do_flush("error_prone", 42, TimeStamp::max()).await; + let t = router.get_task_handler("error_prone").unwrap(); + let _ = router.do_flush(cx).await; assert_eq!(t.total_size() > 0, true); t.set_flushing_status(true); - let _ = router.do_flush("error_prone", 42, TimeStamp::max()).await; + let _ = router.do_flush(cx).await; assert_eq!(t.total_size(), 0); Ok(()) } @@ -2123,8 +2463,8 @@ mod tests { temp_file_size_limit: 32, temp_file_memory_quota: 32 * 2, max_flush_interval: Duration::from_secs(300), - data_key_manager: None, }, + BackupEncryptionManager::default(), ); let mut stream_task = StreamBackupTaskInfo::default(); stream_task.set_name("nothing".to_string()); @@ -2141,10 +2481,16 @@ mod tests { ) 
.await .unwrap(); - let task = router.get_task_info("nothing").await.unwrap(); + let task = router.get_task_handler("nothing").unwrap(); task.set_flushing_status_cas(false, true).unwrap(); let ts = TimeStamp::compose(TimeStamp::physical_now(), 42); - let rts = router.do_flush("nothing", 1, ts).await.unwrap(); + let cx = FlushContext { + task_name: "nothing", + store_id: 1, + resolved_regions: &EMPTY_RESOLVE, + resolved_ts: ts, + }; + let rts = router.do_flush(cx).await.unwrap(); assert_eq!(ts.into_inner(), rts); } @@ -2159,21 +2505,19 @@ mod tests { temp_file_size_limit: 1, temp_file_memory_quota: 2, max_flush_interval: Duration::from_secs(300), - data_key_manager: None, }, + BackupEncryptionManager::default(), )); - let (task, _path) = task("cleanup_test".to_owned()).await?; + let (task, _path) = task_handler("cleanup_test".to_owned()).await?; must_register_table(&router, task, 1).await; write_simple_data(&router).await; let tempfiles = router - .get_task_info("cleanup_test") - .await + .get_task_handler("cleanup_test") .unwrap() .temp_file_pool .clone(); router - .get_task_info("cleanup_test") - .await? + .get_task_handler("cleanup_test")? .move_to_flushing_files() .await?; write_simple_data(&router).await; @@ -2216,24 +2560,23 @@ mod tests { temp_file_size_limit: 1, temp_file_memory_quota: 2, max_flush_interval: Duration::from_secs(300), - data_key_manager: None, }, + BackupEncryptionManager::default(), )); - let (task, _path) = task("flush_failure".to_owned()).await?; + let (task, _path) = task_handler("flush_failure".to_owned()).await?; must_register_table(router.as_ref(), task, 1).await; - router - .must_mut_task_info("flush_failure", |i| { - i.storage = Arc::new(ErrorStorage::with_always_error(i.storage.clone())) - }) - .await; + router.must_mut_task_info("flush_failure", |i| { + i.storage = Arc::new(ErrorStorage::with_always_error(i.storage.clone())) + }); + let cx = FlushContext { + task_name: "flush_failure", + store_id: 42, + resolved_regions: &EMPTY_RESOLVE, + resolved_ts: TimeStamp::zero(), + }; for i in 0..=FLUSH_FAILURE_BECOME_FATAL_THRESHOLD { check_on_events_result(&router.on_events(build_kv_event((i * 10) as _, 10)).await); - assert_eq!( - router - .do_flush("flush_failure", 42, TimeStamp::zero()) - .await, - None, - ); + assert_eq!(router.do_flush(cx).await, None,); } let messages = collect_recv(rx); assert!( @@ -2339,7 +2682,7 @@ mod tests { // create local storage let tmp_dir = tempfile::tempdir().unwrap(); let backend = external_storage::make_local_backend(tmp_dir.path()); - + let backup_encryption_manager = BackupEncryptionManager::default(); // build a StreamTaskInfo let mut task_info = StreamBackupTaskInfo::default(); task_info.set_storage(backend); @@ -2347,33 +2690,39 @@ mod tests { info: task_info, is_paused: false, }; - let task = StreamTaskInfo::new( + let task_handler = StreamTaskHandler::new( stream_task, vec![(vec![], vec![])], 0x100000, make_tempfiles_cfg(tmp_dir.path()), + backup_encryption_manager, ) .await .unwrap(); - task.global_checkpoint_ts.store(10001, Ordering::SeqCst); + task_handler + .global_checkpoint_ts + .store(10001, Ordering::SeqCst); // test no need to update global checkpoint let store_id = 3; let mut global_checkpoint = 10000; - let is_updated = task + let is_updated = task_handler .update_global_checkpoint(global_checkpoint, store_id) .await?; assert_eq!(is_updated, false); - assert_eq!(task.global_checkpoint_ts.load(Ordering::SeqCst), 10001); + assert_eq!( + task_handler.global_checkpoint_ts.load(Ordering::SeqCst), + 10001 + ); // 
test update global checkpoint global_checkpoint = 10002; - let is_updated = task + let is_updated = task_handler .update_global_checkpoint(global_checkpoint, store_id) .await?; assert_eq!(is_updated, true); assert_eq!( - task.global_checkpoint_ts.load(Ordering::SeqCst), + task_handler.global_checkpoint_ts.load(Ordering::SeqCst), global_checkpoint ); @@ -2392,7 +2741,7 @@ mod tests { } struct MockCheckContentStorage { - s: NoopStorage, + s: Box, } #[async_trait::async_trait] @@ -2408,7 +2757,7 @@ mod tests { async fn write( &self, _name: &str, - mut reader: UnpinReader, + mut reader: UnpinReader<'_>, content_length: u64, ) -> io::Result<()> { let mut data = Vec::new(); @@ -2425,25 +2774,41 @@ mod tests { } } - fn read(&self, name: &str) -> external_storage::ExternalData<'_> { + fn read(&self, name: &str) -> ExternalData<'_> { self.s.read(name) } - fn read_part(&self, name: &str, off: u64, len: u64) -> external_storage::ExternalData<'_> { + fn read_part(&self, name: &str, off: u64, len: u64) -> ExternalData<'_> { self.s.read_part(name, off, len) } + + /// Walk the prefix of the blob storage. + /// It returns the stream of items. + fn iter_prefix( + &self, + _prefix: &str, + ) -> LocalBoxStream<'_, std::result::Result> { + unreachable!() + } + + fn delete(&self, _name: &str) -> LocalBoxFuture<'_, io::Result<()>> { + unreachable!() + } } #[tokio::test] async fn test_est_len_in_flush() -> Result<()> { let noop_s = NoopStorage::default(); - let ms = MockCheckContentStorage { s: noop_s }; + let mock_external_storage = Arc::new(MockCheckContentStorage { + s: Box::new(noop_s), + }); let file_name = format!("{}", uuid::Uuid::new_v4()); let file_path = Path::new(&file_name); let tempfile = TempDir::new().unwrap(); let cfg = make_tempfiles_cfg(tempfile.path()); - let pool = Arc::new(TempFilePool::new(cfg).unwrap()); + let backup_encryption_manager = BackupEncryptionManager::default(); + let pool = Arc::new(TempFilePool::new(cfg, backup_encryption_manager.clone()).unwrap()); let mut f = pool.open_for_write(file_path).unwrap(); f.write_all(b"test-data").await?; f.done().await?; @@ -2455,14 +2820,24 @@ mod tests { let tmp_key = TempFileKey::of(&kv_event.events[0], 1); data_file.inner.done().await?; let mut files = vec![(tmp_key, data_file, info)]; - let result = StreamTaskInfo::merge_and_flush_log_files_to( - Arc::new(ms), - &mut files[0..], - &mut meta, - false, + + let stream_task = StreamTask { + info: StreamBackupTaskInfo::default(), + is_paused: false, + }; + let stream_task_handler = StreamTaskHandler::new_test_only( + stream_task, + vec![(vec![], vec![])], + 0x100000, + mock_external_storage, pool.clone(), + backup_encryption_manager, ) - .await; + .unwrap(); + + let result = stream_task_handler + .merge_and_flush_log_files_to(&mut files[0..], &mut meta, false) + .await; result.unwrap(); Ok(()) } @@ -2478,8 +2853,8 @@ mod tests { temp_file_size_limit: 1, temp_file_memory_quota: 2, max_flush_interval: cfg.max_flush_interval.0, - data_key_manager: None, }, + BackupEncryptionManager::default(), )); let mut cfg_manager = BackupStreamConfigManager::new(sched, cfg.clone()); @@ -2508,7 +2883,7 @@ mod tests { } #[test] - fn test_udpate_invalid_config() { + fn test_update_invalid_config() { let cfg = BackupStreamConfig::default(); let (sched, _) = dummy_scheduler(); let mut cfg_manager = BackupStreamConfigManager::new(sched, cfg.clone()); @@ -2535,17 +2910,15 @@ mod tests { temp_file_size_limit: 1000, temp_file_memory_quota: 2, max_flush_interval: Duration::from_secs(300), - data_key_manager: None, }, + 
BackupEncryptionManager::default(), )); - let (task, _path) = task("race".to_owned()).await?; + let (task, _path) = task_handler("race".to_owned()).await?; must_register_table(router.as_ref(), task, 1).await; - router - .must_mut_task_info("race", |i| { - i.storage = Arc::new(NoopStorage::default()); - }) - .await; + router.must_mut_task_info("race", |i| { + i.storage = Arc::new(NoopStorage::default()); + }); let mut b = KvEventsBuilder::new(42, 0); b.put_table(CF_DEFAULT, 1, b"k1", b"v1"); let events_before_flush = b.finish(); @@ -2562,7 +2935,7 @@ mod tests { let (fp_tx, fp_rx) = std::sync::mpsc::sync_channel(0); let fp_rx = std::sync::Mutex::new(fp_rx); - let t = router.get_task_info("race").await.unwrap(); + let t = router.get_task_handler("race").unwrap(); let _ = router.on_events(events_before_flush).await; // make generate temp files ***happen after*** moving files to flushing_files @@ -2587,13 +2960,19 @@ mod tests { }) .unwrap(); + let cx = FlushContext { + task_name: "race", + store_id: 42, + resolved_regions: &EMPTY_RESOLVE, + resolved_ts: TimeStamp::max(), + }; // set flush status to true, because we disabled the auto flush. t.set_flushing_status(true); let router_clone = router.clone(); let _ = tokio::join!( // do flush in another thread tokio::spawn(async move { - router_clone.do_flush("race", 42, TimeStamp::max()).await; + router_clone.do_flush(cx).await; }), router.on_events(events_after_flush) ); @@ -2604,10 +2983,283 @@ mod tests { // set flush status to true, because we disabled the auto flush. t.set_flushing_status(true); - let res = router.do_flush("race", 42, TimeStamp::max()).await; + let res = router.do_flush(cx).await; // this time flush should success. assert!(res.is_some()); assert_eq!(t.files.read().await.len(), 0,); Ok(()) } + + #[tokio::test] + async fn test_encryption_not_set() -> Result<()> { + test_encryption(BackupEncryptionManager::default()).await + } + + #[tokio::test] + async fn test_encryption_plaintext_data_key() -> Result<()> { + // set up plaintext data key + // + let data_key: [u8; 32] = rand::thread_rng().gen(); + let mut cipher = CipherInfo::new(); + cipher.set_cipher_key(data_key.to_vec()); + cipher.set_cipher_type(EncryptionMethod::Aes256Ctr); + + let multi_master_key_backends = MultiMasterKeyBackend::new(); + let backup_encryption_manager = BackupEncryptionManager::new( + Some(cipher), + EncryptionMethod::Aes256Ctr, + multi_master_key_backends, + None, + ); + + test_encryption(backup_encryption_manager).await + } + #[tokio::test] + async fn test_encryption_master_key_based() -> Result<()> { + // set up file backed master key + // + let hex_bytes = encryption::test_utils::generate_random_master_key(); + let (path, _dir) = encryption::test_utils::create_master_key_file_test_only(&hex_bytes); + let master_key_config = MasterKeyConfig::File { + config: FileConfig { + path: path.to_string_lossy().into_owned(), + }, + }; + let multi_master_key_backends = MultiMasterKeyBackend::new(); + multi_master_key_backends + .update_from_config_if_needed(vec![master_key_config], create_async_backend) + .await?; + + let backup_encryption_manager = BackupEncryptionManager::new( + None, + EncryptionMethod::Aes256Ctr, + multi_master_key_backends, + None, + ); + + test_encryption(backup_encryption_manager).await + } + + async fn test_encryption(backup_encryption_manager: BackupEncryptionManager) -> Result<()> { + // set up local file backend for external storage + // + let local_backend_file_path = tempfile::tempdir().unwrap(); + let backend = 
external_storage::make_local_backend(local_backend_file_path.path()); + let mut task_info = StreamBackupTaskInfo::default(); + task_info.set_storage(backend); + let stream_task = StreamTask { + info: task_info, + is_paused: false, + }; + let merged_file_size_limit = 0x10000000; + + // configure task handler with optional encryption + // + let task_handler = StreamTaskHandler::new( + stream_task, + vec![(vec![], vec![])], + merged_file_size_limit, + make_tempfiles_cfg(tempfile::tempdir().unwrap().path()), + backup_encryption_manager.clone(), + ) + .await + .unwrap(); + + // write some kv into the handler and flush it + // + let kv_events = build_kv_event(0, 1000000); + task_handler.on_events(kv_events.clone()).await?; + task_handler.set_flushing_status(true); + let start = Instant::now(); + let cx = FlushContext { + task_name: &task_handler.task.info.name, + store_id: 1, + resolved_regions: &EMPTY_RESOLVE, + resolved_ts: TimeStamp::new(1), + }; + task_handler.do_flush(cx).await?; + let duration = start.saturating_elapsed(); + println!("Time taken for do_flush: {:?}", duration); + + // verify_on_disk_file(local_backend_file_path.path(), 1, 1); + + // read meta file first to figure out the data file offset + // + let meta_file_paths = meta_file_names(local_backend_file_path.path()); + assert_eq!(meta_file_paths.len(), 1); + + let meta_vec = read_and_parse_meta_files(meta_file_paths); + assert_eq!(meta_vec.len(), 1); + + let meta = meta_vec.first().unwrap(); + let file_group = meta.file_groups.first().unwrap(); + + // read log file and parse to kv pairs + // + let log_file_paths = log_file_names(local_backend_file_path.path()); + assert_eq!(log_file_paths.len(), 1); + + let mut read_out_kv_pairs = read_and_parse_log_file( + log_file_paths.first().unwrap(), + file_group, + backup_encryption_manager, + ) + .await; + + // check whether kv pair matches + // + let mut expected_kv_pairs = events_to_kv_pair(&kv_events); + read_out_kv_pairs.sort(); + expected_kv_pairs.sort(); + assert_eq!(read_out_kv_pairs, expected_kv_pairs); + Ok(()) + } + + fn verify_on_disk_file(path: &Path, num_log: i32, num_backup_meta: i32) { + let mut meta_count = 0; + let mut log_count = 0; + for entry in walkdir::WalkDir::new(path) { + let entry = entry.unwrap(); + + if entry.path().extension() == Some(OsStr::new("meta")) { + meta_count += 1; + } else if entry.path().extension() == Some(OsStr::new("log")) { + log_count += 1; + } + } + assert_eq!(meta_count, num_backup_meta); + assert_eq!(log_count, num_log); + } + + fn log_file_names(path: &Path) -> Vec { + let mut log_files = Vec::new(); + for entry in walkdir::WalkDir::new(path) { + let entry = entry.unwrap(); + if entry.path().extension() == Some(OsStr::new("log")) { + log_files.push(entry.path().to_path_buf()); + } + } + log_files + } + + fn meta_file_names(path: &Path) -> Vec { + let mut meta_files = Vec::new(); + for entry in walkdir::WalkDir::new(path) { + let entry = entry.unwrap(); + if entry.path().extension() == Some(OsStr::new("meta")) { + meta_files.push(entry.path().to_path_buf()); + } + } + meta_files + } + + fn read_and_parse_meta_files(paths: Vec) -> Vec { + let mut meta_vec = Vec::new(); + for path in paths { + let content = std::fs::read(path).unwrap(); + let metadata = protobuf::parse_from_bytes::(&content).unwrap(); + meta_vec.push(metadata); + } + meta_vec + } + + async fn read_and_parse_log_file( + path: &Path, + file_group_meta: &DataFileGroup, + backup_encryption_manager: BackupEncryptionManager, + ) -> Vec<(Vec, Vec)> { + use 
tokio::io::AsyncReadExt; + + let mut log_file_bytes = Vec::new(); + + // read out the entire disk + // + let mut bytes_buf = Vec::new(); + BufReader::new(File::open(path).await.unwrap()) + .read_to_end(&mut bytes_buf) + .await + .unwrap(); + // find each file length and read it and append to the result + // + assert!(!file_group_meta.get_data_files_info().is_empty()); + for file_info in file_group_meta.get_data_files_info() { + let slice = &bytes_buf[file_info.range_offset as usize + ..(file_info.range_offset + file_info.range_length) as usize]; + let mut file_buf = Vec::new(); + if let Some(cipher_info) = backup_encryption_manager.plaintext_data_key.as_ref() { + let iv = Iv::from_slice(&file_info.file_encryption_info.as_ref().unwrap().file_iv) + .unwrap(); + let mut decrypter = DecrypterReader::new( + BufReader::new(Cursor::new(slice)).compat(), + file_info + .file_encryption_info + .as_ref() + .unwrap() + .encryption_method, + &cipher_info.cipher_key, + iv, + ) + .unwrap(); + let mut decrypted_buf = Vec::new(); + decrypter.read_to_end(&mut decrypted_buf).await.unwrap(); + let mut decoder = ZstdDecoder::new(BufReader::new(Cursor::new(decrypted_buf))); + decoder.read_to_end(&mut file_buf).await.unwrap(); + } else if backup_encryption_manager + .is_master_key_backend_initialized() + .await + { + let iv = Iv::from_slice(&file_info.file_encryption_info.as_ref().unwrap().file_iv) + .unwrap(); + let cipher_data_key = file_info + .file_encryption_info + .as_ref() + .unwrap() + .get_master_key_based() + .data_key_encrypted_content + .first() + .unwrap(); + let plaintext_data_key = backup_encryption_manager + .decrypt_data_key(cipher_data_key) + .await + .unwrap(); + let mut decrypter = DecrypterReader::new( + BufReader::new(Cursor::new(slice)).compat(), + file_info + .file_encryption_info + .as_ref() + .unwrap() + .encryption_method, + &plaintext_data_key, + iv, + ) + .unwrap(); + let mut decrypted_buf = Vec::new(); + decrypter.read_to_end(&mut decrypted_buf).await.unwrap(); + let mut decoder = ZstdDecoder::new(BufReader::new(Cursor::new(decrypted_buf))); + decoder.read_to_end(&mut file_buf).await.unwrap(); + } else { + let mut decode_reader = ZstdDecoder::new(BufReader::new(Cursor::new(slice))); + decode_reader.read_to_end(&mut file_buf).await.unwrap(); + } + + // parse to kv pair + // + let mut event_iter = EventIterator::new(&file_buf); + while event_iter.valid() { + event_iter.next().unwrap(); + let key = event_iter.key(); + let val = event_iter.value(); + log_file_bytes.push((key.to_vec(), val.to_vec())) + } + } + log_file_bytes + } + + fn events_to_kv_pair(apply_events: &ApplyEvents) -> Vec<(Vec, Vec)> { + let mut kv_pairs = Vec::new(); + for apply_event in &apply_events.events { + kv_pairs.push((apply_event.key.clone(), apply_event.value.clone())); + } + kv_pairs + } } diff --git a/components/backup-stream/src/service.rs b/components/backup-stream/src/service.rs index 43d4ede2f27..e639f44a731 100644 --- a/components/backup-stream/src/service.rs +++ b/components/backup-stream/src/service.rs @@ -13,11 +13,11 @@ use crate::{ }; #[derive(Clone)] -pub struct Service { +pub struct BackupStreamGrpcService { endpoint: Scheduler, } -impl Service { +impl BackupStreamGrpcService { pub fn new(endpoint: Scheduler) -> Self { Self { endpoint } } @@ -39,7 +39,7 @@ impl From for RegionIdentity { } } -impl LogBackup for Service { +impl LogBackup for BackupStreamGrpcService { fn get_last_flush_ts_of_region( &mut self, _ctx: RpcContext<'_>, @@ -92,11 +92,9 @@ impl LogBackup for Service { fn 
subscribe_flush_event( &mut self, - _ctx: grpcio::RpcContext<'_>, - _req: kvproto::logbackuppb::SubscribeFlushEventRequest, - #[allow(unused_variables)] sink: grpcio::ServerStreamingSink< - kvproto::logbackuppb::SubscribeFlushEventResponse, - >, + _ctx: RpcContext<'_>, + _req: SubscribeFlushEventRequest, + #[allow(unused_variables)] sink: grpcio::ServerStreamingSink, ) { #[cfg(test)] panic!("Service should not be used in an unit test"); diff --git a/components/backup-stream/src/subscription_manager.rs b/components/backup-stream/src/subscription_manager.rs index 232a292e571..e1803048d71 100644 --- a/components/backup-stream/src/subscription_manager.rs +++ b/components/backup-stream/src/subscription_manager.rs @@ -86,27 +86,23 @@ impl ResolvedRegions { /// Note: Maybe we can compute the global checkpoint internal and getting /// the interface clear. However we must take the `min_ts` or we cannot /// provide valid global checkpoint if there isn't any region checkpoint. - pub fn new(checkpoint: TimeStamp, checkpoints: Vec) -> Self { + pub const fn new(checkpoint: TimeStamp, checkpoints: Vec) -> Self { Self { items: checkpoints, checkpoint, } } - /// take the region checkpoints from the structure. - #[deprecated = "please use `take_resolve_result` instead."] - pub fn take_region_checkpoints(&mut self) -> Vec<(Region, TimeStamp)> { - std::mem::take(&mut self.items) - .into_iter() - .map(|x| (x.region, x.checkpoint)) - .collect() - } - /// take the resolve result from this struct. pub fn take_resolve_result(&mut self) -> Vec { std::mem::take(&mut self.items) } + /// Get the resolve results. + pub fn resolve_results(&self) -> &[ResolveResult] { + &self.items + } + /// get the global checkpoint. pub fn global_checkpoint(&self) -> TimeStamp { self.checkpoint @@ -141,8 +137,6 @@ trait InitialScan: Clone + Sync + Send + 'static { start_ts: TimeStamp, handle: ObserveHandle, ) -> Result; - - fn handle_fatal_error(&self, region: &Region, err: Error); } #[async_trait::async_trait] @@ -171,19 +165,6 @@ where let stat = self.do_initial_scan(region, h, start_ts, snap).await?; Ok(stat) } - - fn handle_fatal_error(&self, region: &Region, err: Error) { - try_send!( - self.scheduler, - Task::FatalError( - TaskSelector::ByRange( - region.get_start_key().to_owned(), - region.get_end_key().to_owned() - ), - Box::new(err), - ) - ); - } } impl ScanCmd { @@ -850,6 +831,8 @@ mod test { time::Duration, }; + use dashmap::DashMap; + use encryption::BackupEncryptionManager; use engine_test::{kv::KvTestEngine, raft::RaftTestEngine}; use kvproto::{ brpb::{Noop, StorageBackend, StreamBackupTaskInfo}, @@ -860,7 +843,10 @@ mod test { router::{CdcRaftRouter, ServerRaftStoreRouter}, RegionInfo, }; - use tikv::{config::BackupStreamConfig, storage::Statistics}; + use tikv::{ + config::BackupStreamConfig, + storage::{txn::txn_status_cache::TxnStatusCache, Statistics}, + }; use tikv_util::{box_err, info, memory::MemoryQuota, worker::dummy_scheduler}; use tokio::{sync::mpsc::Sender, task::JoinHandle}; use txn_types::TimeStamp; @@ -897,15 +883,11 @@ mod test { async fn do_initial_scan( &self, region: &Region, - start_ts: txn_types::TimeStamp, - handle: raftstore::coprocessor::ObserveHandle, - ) -> crate::errors::Result { + start_ts: TimeStamp, + handle: ObserveHandle, + ) -> crate::errors::Result { (self.0)(region, start_ts, handle) } - - fn handle_fatal_error(&self, region: &Region, err: crate::errors::Error) { - panic!("fatal {:?} {}", region, err) - } } #[test] @@ -918,7 +900,7 @@ mod test { use super::ScanCmd; use 
crate::{subscription_manager::spawn_executors, utils::FutureWaitGroup}; - fn should_finish_in(f: impl FnOnce() + Send + 'static, d: std::time::Duration) { + fn should_finish_in(f: impl FnOnce() + Send + 'static, d: Duration) { let (tx, rx) = futures::channel::oneshot::channel(); std::thread::spawn(move || { f(); @@ -1053,10 +1035,17 @@ mod test { let meta_cli = SlashEtcStore::default(); let meta_cli = MetadataClient::new(meta_cli, 1); let (scheduler, mut output) = dummy_scheduler(); - let subs = SubscriptionTracer::default(); + let subs = SubscriptionTracer( + Arc::new(DashMap::new()), + Arc::new(TxnStatusCache::new_for_test()), + ); let memory_manager = Arc::new(MemoryQuota::new(1024)); let (tx, mut rx) = tokio::sync::mpsc::channel(8); - let router = RouterInner::new(scheduler.clone(), BackupStreamConfig::default().into()); + let router = RouterInner::new( + scheduler.clone(), + BackupStreamConfig::default().into(), + BackupEncryptionManager::default(), + ); let mut task = StreamBackupTaskInfo::new(); task.set_name(task_name.to_owned()); task.set_storage({ diff --git a/components/backup-stream/src/subscription_track.rs b/components/backup-stream/src/subscription_track.rs index c1bd40197e7..e0b10418bd9 100644 --- a/components/backup-stream/src/subscription_track.rs +++ b/components/backup-stream/src/subscription_track.rs @@ -9,6 +9,7 @@ use dashmap::{ use kvproto::metapb::Region; use raftstore::coprocessor::*; use resolved_ts::{Resolver, TsSource, TxnLocks}; +use tikv::storage::txn::txn_status_cache::TxnStatusCache; use tikv_util::{ info, memory::{MemoryQuota, MemoryQuotaExceeded}, @@ -18,9 +19,18 @@ use txn_types::TimeStamp; use crate::{debug, metrics::TRACK_REGION, utils}; -/// A utility to tracing the regions being subscripted. -#[derive(Clone, Default, Debug)] -pub struct SubscriptionTracer(Arc>); +/// A utility to tracing the regions being subscribed. +#[derive(Clone)] +pub struct SubscriptionTracer( + pub Arc>, + pub Arc, +); + +impl std::fmt::Debug for SubscriptionTracer { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_tuple("SubscriptionTracer").field(&self.0).finish() + } +} /// The state of the subscription state machine: /// Initial state is `ABSENT`, the subscription isn't in the tracer. 
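The `ResolvedRegions` change above replaces the deprecated draining accessor with a complementary pair: a borrowing `resolve_results()` for readers such as the flush context, and a consuming `take_resolve_result()` for the single owner of the data. A minimal sketch of that pattern (item type reduced to `u64`; this is an illustration, not the real `ResolveResult`):

```rust
#[derive(Default)]
struct ResolvedRegions {
    items: Vec<u64>,
}

impl ResolvedRegions {
    /// Borrow the results, for callers that only need to read them.
    fn resolve_results(&self) -> &[u64] {
        &self.items
    }

    /// Hand the results off exactly once; the struct is left empty.
    fn take_resolve_result(&mut self) -> Vec<u64> {
        std::mem::take(&mut self.items)
    }
}

fn main() {
    let mut rr = ResolvedRegions { items: vec![1, 2, 3] };
    assert_eq!(rr.resolve_results().len(), 3); // non-destructive peek
    let owned = rr.take_resolve_result(); // destructive, at most once
    assert!(rr.resolve_results().is_empty());
    assert_eq!(owned, vec![1, 2, 3]);
}
```

The two-field `SubscriptionTracer` above likewise trades `#[derive(Debug)]` for a hand-written impl, because the added `TxnStatusCache` handle has no `Debug` of its own. A stand-in sketch of that skip-a-field idiom (the types here are placeholders, not the real ones):

```rust
use std::{collections::HashMap, fmt, sync::Arc};

struct TxnStatusCache; // placeholder: the real cache has no Debug impl

struct SubscriptionTracer(
    Arc<HashMap<u64, &'static str>>, // the map worth printing
    Arc<TxnStatusCache>,             // auxiliary handle, omitted from Debug
);

impl fmt::Debug for SubscriptionTracer {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        // Only field 0 is formatted; field 1 is deliberately skipped,
        // which is why the struct can no longer simply derive Debug.
        f.debug_tuple("SubscriptionTracer").field(&self.0).finish()
    }
}

fn main() {
    let t = SubscriptionTracer(
        Arc::new(HashMap::from([(1u64, "running")])),
        Arc::new(TxnStatusCache),
    );
    println!("{:?}", t);
}
```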
@@ -73,8 +83,13 @@ impl std::fmt::Debug for ActiveSubscription { } impl ActiveSubscription { - pub fn new(region: Region, handle: ObserveHandle, start_ts: Option) -> Self { - let resolver = TwoPhaseResolver::new(region.get_id(), start_ts); + pub fn new( + region: Region, + handle: ObserveHandle, + start_ts: Option, + txn_status_cache: Arc, + ) -> Self { + let resolver = TwoPhaseResolver::new(region.get_id(), start_ts, txn_status_cache); Self { handle, meta: region, @@ -100,7 +115,7 @@ impl ActiveSubscription { } } -#[derive(PartialEq, Eq)] +#[derive(PartialEq, Eq, Clone)] pub enum CheckpointType { MinTs, StartTsOfInitialScan, @@ -120,6 +135,7 @@ impl std::fmt::Debug for CheckpointType { } } +#[derive(Clone)] pub struct ResolveResult { pub region: Region, pub checkpoint: TimeStamp, @@ -226,7 +242,7 @@ impl SubscriptionTracer { let e = self.0.entry(region.id); match e { Entry::Occupied(o) => { - let sub = ActiveSubscription::new(region.clone(), handle, start_ts); + let sub = ActiveSubscription::new(region.clone(), handle, start_ts, self.1.clone()); let (_, s) = o.replace_entry(SubscribeState::Running(sub)); if !s.is_pending() { // If there is another subscription already (perhaps repeated Start), @@ -237,7 +253,7 @@ impl SubscriptionTracer { } Entry::Vacant(e) => { warn!("excepted state transform: absent -> running"; utils::slog_region(region)); - let sub = ActiveSubscription::new(region.clone(), handle, start_ts); + let sub = ActiveSubscription::new(region.clone(), handle, start_ts, self.1.clone()); e.insert(SubscribeState::Running(sub)); } } @@ -486,17 +502,18 @@ pub struct TwoPhaseResolver { } enum FutureLock { - Lock(Vec, TimeStamp), + Lock(Vec, TimeStamp, u64 /* generation */), Unlock(Vec), } impl std::fmt::Debug for FutureLock { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { match self { - Self::Lock(arg0, arg1) => f + Self::Lock(arg0, arg1, generation) => f .debug_tuple("Lock") .field(&format_args!("{}", utils::redact(arg0))) .field(arg1) + .field(generation) .finish(), Self::Unlock(arg0) => f .debug_tuple("Unlock") @@ -523,11 +540,12 @@ impl TwoPhaseResolver { &mut self, start_ts: TimeStamp, key: Vec, + generation: u64, ) -> Result<(), MemoryQuotaExceeded> { if !self.in_phase_one() { warn!("backup stream tracking lock as if in phase one"; "start_ts" => %start_ts, "key" => %utils::redact(&key)) } - self.resolver.track_lock(start_ts, key, None)?; + self.resolver.track_lock(start_ts, key, None, generation)?; Ok(()) } @@ -535,12 +553,14 @@ impl TwoPhaseResolver { &mut self, start_ts: TimeStamp, key: Vec, + generation: u64, ) -> Result<(), MemoryQuotaExceeded> { if self.in_phase_one() { - self.future_locks.push(FutureLock::Lock(key, start_ts)); + self.future_locks + .push(FutureLock::Lock(key, start_ts, generation)); return Ok(()); } - self.resolver.track_lock(start_ts, key, None)?; + self.resolver.track_lock(start_ts, key, None, generation)?; Ok(()) } @@ -555,9 +575,9 @@ impl TwoPhaseResolver { fn handle_future_lock(&mut self, lock: FutureLock) { match lock { - FutureLock::Lock(key, ts) => { + FutureLock::Lock(key, ts, generation) => { // TODO: handle memory quota exceed, for now, quota is set to usize::MAX. 
- self.resolver.track_lock(ts, key, None).unwrap(); + self.resolver.track_lock(ts, key, None, generation).unwrap(); } FutureLock::Unlock(key) => self.resolver.untrack_lock(&key, None), } @@ -579,11 +599,15 @@ impl TwoPhaseResolver { self.resolver.resolved_ts() } - pub fn new(region_id: u64, stable_ts: Option) -> Self { + pub fn new( + region_id: u64, + stable_ts: Option, + txn_status_cache: Arc, + ) -> Self { // TODO: limit the memory usage of the resolver. let memory_quota = Arc::new(MemoryQuota::new(std::usize::MAX)); Self { - resolver: Resolver::new(region_id, memory_quota), + resolver: Resolver::new(region_id, memory_quota, txn_status_cache), future_locks: Default::default(), stable_ts, } @@ -622,9 +646,11 @@ impl std::fmt::Debug for TwoPhaseResolver { mod test { use std::sync::Arc; + use dashmap::DashMap; use kvproto::metapb::{Region, RegionEpoch}; use raftstore::coprocessor::ObserveHandle; use resolved_ts::TxnLocks; + use tikv::storage::txn::txn_status_cache::TxnStatusCache; use txn_types::TimeStamp; use super::{SubscriptionTracer, TwoPhaseResolver}; @@ -634,14 +660,15 @@ mod test { fn test_two_phase_resolver() { let key = b"somewhere_over_the_rainbow"; let ts = TimeStamp::new; - let mut r = TwoPhaseResolver::new(42, Some(ts(42))); - r.track_phase_one_lock(ts(48), key.to_vec()).unwrap(); + let mut r = + TwoPhaseResolver::new(42, Some(ts(42)), Arc::new(TxnStatusCache::new_for_test())); + r.track_phase_one_lock(ts(48), key.to_vec(), 0).unwrap(); // When still in phase one, the resolver should not be advanced. r.untrack_lock(&key[..]); assert_eq!(r.resolve(ts(50)), ts(42)); // Even new lock tracked... - r.track_lock(ts(52), key.to_vec()).unwrap(); + r.track_lock(ts(52), key.to_vec(), 0).unwrap(); r.untrack_lock(&key[..]); assert_eq!(r.resolve(ts(53)), ts(42)); @@ -650,7 +677,7 @@ mod test { assert_eq!(r.resolve(ts(54)), ts(54)); // It should be able to track incremental locks. 
- r.track_lock(ts(55), key.to_vec()).unwrap(); + r.track_lock(ts(55), key.to_vec(), 0).unwrap(); assert_eq!(r.resolve(ts(56)), ts(55)); r.untrack_lock(&key[..]); assert_eq!(r.resolve(ts(57)), ts(57)); @@ -668,7 +695,10 @@ mod test { #[test] fn test_delay_remove() { - let subs = SubscriptionTracer::default(); + let subs = SubscriptionTracer( + Arc::new(DashMap::new()), + Arc::new(TxnStatusCache::new_for_test()), + ); let handle = ObserveHandle::new(); subs.register_region(®ion(1, 1, 1), handle, Some(TimeStamp::new(42))); assert!(subs.get_subscription_of(1).is_some()); @@ -679,7 +709,10 @@ mod test { #[test] fn test_cal_checkpoint() { - let subs = SubscriptionTracer::default(); + let subs = SubscriptionTracer( + Arc::new(DashMap::new()), + Arc::new(TxnStatusCache::new_for_test()), + ); subs.register_region( ®ion(1, 1, 1), ObserveHandle::new(), @@ -706,7 +739,7 @@ mod test { region4_sub .value_mut() .resolver - .track_lock(TimeStamp::new(128), b"Alpi".to_vec()) + .track_lock(TimeStamp::new(128), b"Alpi".to_vec(), 0) .unwrap(); subs.register_region(®ion(5, 8, 1), ObserveHandle::new(), None); subs.deregister_region_if(®ion(5, 8, 1), |_, _| true); diff --git a/components/backup-stream/src/tempfiles.rs b/components/backup-stream/src/tempfiles.rs index def8342a606..4e1e82676b6 100644 --- a/components/backup-stream/src/tempfiles.rs +++ b/components/backup-stream/src/tempfiles.rs @@ -28,7 +28,7 @@ use std::{ task::{ready, Context, Poll}, }; -use encryption::{DataKeyManager, DecrypterReader, EncrypterWriter, Iv}; +use encryption::{BackupEncryptionManager, DecrypterReader, EncrypterWriter, Iv}; use futures::{AsyncWriteExt, TryFutureExt}; use kvproto::{brpb::CompressionType, encryptionpb::EncryptionMethod}; use tikv_util::warn; @@ -59,14 +59,11 @@ pub struct Config { /// Prevent files with size less than this being swapped out. /// We perfer to swap larger files for reducing IOps. pub minimal_swap_out_file_size: usize, - /// The buffer size for writting swap files. + /// The buffer size for writing swap files. /// Even some of files has been swapped out, when new content appended, /// those content would be kept in memory before they reach a threshold. /// This would help us to reduce the I/O system calls. pub write_buffer_size: usize, - /// The encryption applied to swapped out files. - /// The in-memory content will be plaintext always. - pub encryption: Option>, } impl std::fmt::Debug for Config { @@ -80,10 +77,6 @@ impl std::fmt::Debug for Config { &self.minimal_swap_out_file_size, ) .field("write_buffer_size", &self.write_buffer_size) - .field( - "encryption", - &self.encryption.as_ref().map(|enc| enc.encryption_method()), - ) .finish() } } @@ -98,6 +91,7 @@ pub struct TempFilePool { cfg: Config, current: AtomicUsize, files: BlockMutex, + backup_encryption_manager: BackupEncryptionManager, #[cfg(test)] override_swapout: Option< @@ -201,10 +195,10 @@ struct FileSet { } impl TempFilePool { - pub fn new(cfg: Config) -> Result { + pub fn new(cfg: Config, backup_encryption_manager: BackupEncryptionManager) -> Result { if let Ok(true) = std::fs::metadata(&cfg.swap_files).map(|x| x.is_dir()) { warn!("find content in the swap file directory node. 
truncating them."; "dir" => %cfg.swap_files.display()); - if let Some(enc) = &cfg.encryption { + if let Some(enc) = backup_encryption_manager.opt_data_key_manager() { enc.remove_dir(&cfg.swap_files, None)?; } std::fs::remove_dir_all(&cfg.swap_files)?; @@ -215,6 +209,7 @@ impl TempFilePool { cfg, current: AtomicUsize::new(0usize), files: BlockMutex::default(), + backup_encryption_manager, #[cfg(test)] override_swapout: None, @@ -262,13 +257,14 @@ impl TempFilePool { } } - /// Open a file reference for reading. - /// Please notice that once a compression applied, this would yield the - /// compressed content (won't decompress them.) -- that is what "raw" - /// implies. - /// "But why there isn't a `open_for_read` which decompresses the content?" - /// "Because in our use case, we only need the raw content -- we just send - /// it to external storage." + /// Opens a file for reading in its raw format. + /// + /// This method returns the file content as-is, including any applied + /// compression. It does not perform decompression. + /// + /// Note: We don't provide an `open_for_read` method that decompresses the + /// content because our primary use case is to send the raw (potentially + /// compressed) data directly to external storage. pub fn open_raw_for_read(&self, p: &Path) -> Result { use std::io::{Error as IoErr, ErrorKind}; @@ -314,7 +310,7 @@ impl TempFilePool { } /// Remove a file from the pool. - /// If there are still some reference to the file, the deletion may be + /// If there is still some reference to the file, the deletion may be /// delayed until all reference to the file drop. pub fn remove(&self, p: &Path) -> bool { let mut files = self.files.lock().unwrap(); @@ -334,8 +330,8 @@ impl TempFilePool { self.current.load(Ordering::Acquire) } - /// Create a file for writting. - /// This function is synchronous so we can call it easier in the polling + /// Create a file for writing. + /// This function is synchronous, so we can call it easier in the polling /// context. (Anyway, it is really hard to call an async function in the /// polling context.) fn create_relative(&self, p: &Path) -> std::io::Result { @@ -346,7 +342,8 @@ impl TempFilePool { None => {} } let file = OsFile::from_std(SyncOsFile::create(&abs_path)?); - let pfile = match &self.cfg.encryption { + + let pfile = match &self.backup_encryption_manager.opt_data_key_manager() { Some(enc) => SwappedOut::Encrypted( enc.open_file_with_writer(&abs_path, file.compat(), true) .map_err(Self::convert_encrypt_error_to_io)? @@ -365,7 +362,7 @@ impl TempFilePool { let abs_path = self.cfg.swap_files.join(p); let file = SyncOsFile::open(&abs_path)?; let async_file = OsFile::from_std(file).compat(); - let decrypted_file = match &self.cfg.encryption { + let decrypted_file = match &self.backup_encryption_manager.opt_data_key_manager() { Some(enc) => enc .open_file_with_reader(&abs_path, async_file) .map_err(Self::convert_encrypt_error_to_io)? 
@@ -379,7 +376,7 @@ impl TempFilePool { fn delete_relative(&self, p: &Path) -> std::io::Result<()> { let abs_path = self.cfg.swap_files.join(p); - if let Some(enc) = &self.cfg.encryption { + if let Some(enc) = &self.backup_encryption_manager.opt_data_key_manager() { enc.delete_file(&abs_path.to_string_lossy(), None)?; } std::fs::remove_file(&abs_path)?; @@ -791,7 +788,7 @@ mod test { }; use async_compression::tokio::bufread::ZstdDecoder; - use encryption::DataKeyManager; + use encryption::{BackupEncryptionManager, DataKeyManager, MultiMasterKeyBackend}; use kvproto::{brpb::CompressionType, encryptionpb::EncryptionMethod}; use tempfile::{tempdir, TempDir}; use test_util::new_test_key_manager; @@ -831,12 +828,11 @@ mod test { content_compression: CompressionType::Unknown, minimal_swap_out_file_size: 8192, write_buffer_size: 4096, - encryption: None, }; m(&mut cfg); TestPool { tmpdir: Arc::new(tmp), - pool: Arc::new(TempFilePool::new(cfg).unwrap()), + pool: Arc::new(TempFilePool::new(cfg, BackupEncryptionManager::default()).unwrap()), } } @@ -1122,12 +1118,30 @@ mod test { fn test_encryption(enc: DataKeyManager) { let method = enc.encryption_method(); - let pool = test_pool_with_modify(|cfg| { - cfg.encryption = Some(Arc::new(enc)); - cfg.minimal_swap_out_file_size = 15; - cfg.write_buffer_size = 15; - cfg.cache_size = AtomicUsize::new(15); - }); + let tmp = tempdir().unwrap(); + let cfg = Config { + cache_size: AtomicUsize::new(15), + swap_files: tmp.path().to_owned(), + content_compression: CompressionType::Unknown, + minimal_swap_out_file_size: 15, + write_buffer_size: 15, + }; + let pool = TestPool { + tmpdir: Arc::new(tmp), + pool: Arc::new( + TempFilePool::new( + cfg, + BackupEncryptionManager::new( + None, + EncryptionMethod::Plaintext, + MultiMasterKeyBackend::default(), + Some(Arc::new(enc)), + ), + ) + .unwrap(), + ), + }; + let rt = rt_for_test(); let content_to_write: [&[u8]; 4] = [ b"Now let's test the encryption.", diff --git a/components/backup-stream/src/utils.rs b/components/backup-stream/src/utils.rs index f651ab6d308..9165f54053a 100644 --- a/components/backup-stream/src/utils.rs +++ b/components/backup-stream/src/utils.rs @@ -1012,7 +1012,7 @@ mod test { let (items, size) = super::with_record_read_throughput(|| { let mut items = vec![]; - let snap = engine.snapshot(None); + let snap = engine.snapshot(); snap.scan(CF_DEFAULT, b"", b"", false, |k, v| { items.push((k.to_owned(), v.to_owned())); Ok(true) diff --git a/components/backup-stream/tests/failpoints/mod.rs b/components/backup-stream/tests/failpoints/mod.rs index d1a9bc3a97b..2499a679e38 100644 --- a/components/backup-stream/tests/failpoints/mod.rs +++ b/components/backup-stream/tests/failpoints/mod.rs @@ -254,7 +254,7 @@ mod all { .unwrap(); suite.sync(); - suite.wait_with_router(move |r| block_on(r.get_task_info("retry_abort")).is_ok()); + suite.wait_with_router(move |r| r.get_task_handler("retry_abort").is_ok()); let items = run_async_test(suite.write_records(0, 128, 1)); suite.force_flush_files("retry_abort"); suite.wait_for_flush(); diff --git a/components/backup-stream/tests/integration/mod.rs b/components/backup-stream/tests/integration/mod.rs index 01a51ddc504..ed161c68047 100644 --- a/components/backup-stream/tests/integration/mod.rs +++ b/components/backup-stream/tests/integration/mod.rs @@ -7,19 +7,24 @@ mod suite; mod all { - use std::time::{Duration, Instant}; + use std::{ + os::unix::ffi::OsStrExt, + time::{Duration, Instant}, + }; use backup_stream::{ errors::Error, router::TaskSelector, 
GetCheckpointResult, RegionCheckpointOperation, RegionSet, Task, }; use futures::{Stream, StreamExt}; + use kvproto::metapb::RegionEpoch; use pd_client::PdClient; use test_raftstore::IsolationFilterFactory; use tikv::config::BackupStreamConfig; use tikv_util::{box_err, defer, info, HandyRwLock}; use tokio::time::timeout; use txn_types::{Key, TimeStamp}; + use walkdir::WalkDir; use super::suite::{ make_record_key, make_split_key_at_record, mutation, run_async_test, SuiteBuilder, @@ -44,6 +49,64 @@ mod all { suite.cluster.shutdown(); } + #[test] + fn region_boundaries() { + let mut suite = SuiteBuilder::new_named("region_boundaries") + .nodes(1) + .build(); + let round = run_async_test(async { + suite.must_split(&make_split_key_at_record(1, 42)); + suite.must_split(&make_split_key_at_record(1, 86)); + let round1 = suite.write_records(0, 128, 1).await; + suite.must_register_task(1, "region_boundaries"); + round1 + }); + suite.force_flush_files("region_boundaries"); + suite.wait_for_flush(); + suite.check_for_write_records(suite.flushed_files.path(), round.iter().map(Vec::as_slice)); + + let a_meta = WalkDir::new(suite.flushed_files.path().join("v1/backupmeta")) + .contents_first(true) + .into_iter() + .find(|v| { + v.as_ref() + .is_ok_and(|v| v.file_name().as_bytes().ends_with(b".meta")) + }) + .unwrap() + .unwrap(); + let mut a_meta_content = protobuf::parse_from_bytes::( + &std::fs::read(a_meta.path()).unwrap(), + ) + .unwrap(); + let dfs = a_meta_content.mut_file_groups()[0].mut_data_files_info(); + // Two regions, two CFs. + assert_eq!(dfs.len(), 6); + dfs.sort_by(|x1, x2| x1.start_key.cmp(&x2.start_key)); + let hnd_key = |hnd| make_split_key_at_record(1, hnd); + let epoch = |ver, conf_ver| { + let mut e = RegionEpoch::new(); + e.set_version(ver); + e.set_conf_ver(conf_ver); + e + }; + assert_eq!(dfs[0].region_start_key, b""); + assert_eq!(dfs[0].region_end_key, hnd_key(42)); + assert_eq!(dfs[0].region_epoch.len(), 1, "{:?}", dfs[0]); + assert_eq!(dfs[0].region_epoch[0], epoch(2, 1), "{:?}", dfs[0]); + + assert_eq!(dfs[2].region_start_key, hnd_key(42)); + assert_eq!(dfs[2].region_end_key, hnd_key(86)); + assert_eq!(dfs[2].region_epoch.len(), 1, "{:?}", dfs[2]); + assert_eq!(dfs[2].region_epoch[0], epoch(3, 1), "{:?}", dfs[2]); + + assert_eq!(dfs[4].region_start_key, hnd_key(86)); + assert_eq!(dfs[4].region_end_key, b""); + assert_eq!(dfs[4].region_epoch.len(), 1, "{:?}", dfs[4]); + assert_eq!(dfs[4].region_epoch[0], epoch(3, 1), "{:?}", dfs[4]); + + suite.cluster.shutdown(); + } + /// This test tests whether we can handle some weird transactions and their /// race with initial scanning. 
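The `region_boundaries` assertions above depend on one preprocessing step: sorting the data files by `start_key` so that region ranges line up deterministically before they are compared. A condensed, self-contained version of that sort-and-tile check (file records reduced to `(region_start_key, region_end_key)` pairs; an empty end key means "unbounded"):

```rust
fn main() {
    let mut dfs: Vec<(&[u8], &[u8])> = vec![
        (b"r42", b"r86"),
        (b"", b"r42"),
        (b"r86", b""),
    ];
    // Same ordering step as the test: sort by region start key.
    dfs.sort_by(|a, b| a.0.cmp(b.0));

    // The regions must tile the keyspace: the first starts at -inf,
    // each end key meets the next start key, the last ends at +inf.
    assert_eq!(dfs[0].0, b"");
    for w in dfs.windows(2) {
        assert_eq!(w[0].1, w[1].0);
    }
    assert_eq!(dfs.last().unwrap().1, b"");
}
```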
/// Generally, those transactions: diff --git a/components/backup-stream/tests/suite.rs b/components/backup-stream/tests/suite.rs index 73462b0119f..e3254f16acc 100644 --- a/components/backup-stream/tests/suite.rs +++ b/components/backup-stream/tests/suite.rs @@ -18,14 +18,15 @@ use backup_stream::{ }, observer::BackupStreamObserver, router::{Router, TaskSelector}, - utils, BackupStreamResolver, Endpoint, GetCheckpointResult, RegionCheckpointOperation, - RegionSet, Service, Task, + utils, BackupStreamGrpcService, BackupStreamResolver, Endpoint, GetCheckpointResult, + RegionCheckpointOperation, RegionSet, Task, }; -use engine_rocks::RocksEngine; +use encryption::{BackupEncryptionManager, MultiMasterKeyBackend}; use futures::{executor::block_on, AsyncWriteExt, Future, Stream, StreamExt}; use grpcio::{ChannelBuilder, Server, ServerBuilder}; use kvproto::{ brpb::{CompressionType, Local, Metadata, StorageBackend}, + encryptionpb::EncryptionMethod, kvrpcpb::*, logbackuppb::{SubscribeFlushEventRequest, SubscribeFlushEventResponse}, logbackuppb_grpc::{create_log_backup, LogBackupClient}, @@ -38,7 +39,10 @@ use tempfile::TempDir; use test_pd_client::TestPdClient; use test_raftstore::{new_server_cluster, Cluster, Config, ServerCluster}; use test_util::retry; -use tikv::config::{BackupStreamConfig, ResolvedTsConfig}; +use tikv::{ + config::{BackupStreamConfig, ResolvedTsConfig}, + storage::txn::txn_status_cache::TxnStatusCache, +}; use tikv_util::{ codec::{ number::NumberEncoder, @@ -264,7 +268,7 @@ impl MetaStore for ErrorStore { pub struct Suite { pub endpoints: HashMap>, pub meta_store: ErrorStore, - pub cluster: Cluster>, + pub cluster: Cluster, tikv_cli: HashMap, log_backup_cli: HashMap, obs: HashMap, @@ -303,7 +307,7 @@ impl Suite { let ob = BackupStreamObserver::new(worker.scheduler()); let ob2 = ob.clone(); - s.coprocessor_hooks + s.coprocessor_hosts .entry(id) .or_default() .push(Box::new(move |host| { @@ -357,7 +361,7 @@ impl Suite { .get(&id) .expect("must register endpoint first"); - let serv = Service::new(endpoint.scheduler()); + let serv = BackupStreamGrpcService::new(endpoint.scheduler()); let builder = ServerBuilder::new(self.env.clone()).register_service(create_log_backup(serv)); let mut server = builder.bind("127.0.0.1", 0).build().unwrap(); @@ -405,7 +409,13 @@ impl Suite { cluster.pd_client.clone(), cm, BackupStreamResolver::V1(resolver), - sim.encryption.clone(), + BackupEncryptionManager::new( + None, + EncryptionMethod::Plaintext, + MultiMasterKeyBackend::default(), + sim.encryption.clone(), + ), + Arc::new(TxnStatusCache::new_for_test()), ); worker.start(endpoint); } @@ -435,7 +445,7 @@ impl Suite { )) .unwrap(); let name = name.to_owned(); - self.wait_with_router(move |r| block_on(r.get_task_info(&name)).is_ok()) + self.wait_with_router(move |r| r.get_task_handler(&name).is_ok()) } /// This function tries to calculate the global checkpoint from the flush @@ -926,9 +936,9 @@ impl Suite { pub fn wait_for_flush(&self) { self.wait_with_router(move |r| { - let task_names = block_on(r.select_task(TaskSelector::All.reference())); + let task_names = r.select_task(TaskSelector::All.reference()); for task_name in task_names { - let tsk = block_on(r.get_task_info(&task_name)); + let tsk = r.get_task_handler(&task_name); if tsk.unwrap().is_flushing() { return false; } diff --git a/components/backup/Cargo.toml b/components/backup/Cargo.toml index af5e74d0eec..a76f019482f 100644 --- a/components/backup/Cargo.toml +++ b/components/backup/Cargo.toml @@ -51,26 +51,20 @@ kvproto = { workspace 
= true } lazy_static = "1.3" log_wrappers = { workspace = true } online_config = { workspace = true } -pd_client = { workspace = true } prometheus = { version = "0.13", default-features = false, features = ["nightly"] } raft = { workspace = true } raftstore = { workspace = true } resource_control = { workspace = true } -security = { workspace = true } -serde = "1.0" -serde_derive = "1.0" slog = { workspace = true } # better to not use slog-global, but pass in the logger slog-global = { workspace = true } thiserror = "1.0" -tidb_query_common = { workspace = true } tikv = { workspace = true } tikv_alloc = { workspace = true } tikv_util = { workspace = true } tokio = { version = "1.5", features = ["rt-multi-thread"] } tokio-stream = "0.1" txn_types = { workspace = true } -yatp = { workspace = true } [dev-dependencies] rand = "0.8" diff --git a/components/backup/src/endpoint.rs b/components/backup/src/endpoint.rs index 96a9216292b..681187ad0d6 100644 --- a/components/backup/src/endpoint.rs +++ b/components/backup/src/endpoint.rs @@ -1348,9 +1348,10 @@ pub mod tests { impl MockRegionInfoProvider { pub fn new(encode_key: bool) -> Self { MockRegionInfoProvider { - regions: Arc::new(Mutex::new(RegionCollector::new(Arc::new(RwLock::new( - HashSet::default(), - ))))), + regions: Arc::new(Mutex::new(RegionCollector::new( + Arc::new(RwLock::new(HashSet::default())), + Box::new(|| 0), + ))), cancel: None, need_encode_key: encode_key, } diff --git a/components/backup/src/metrics.rs b/components/backup/src/metrics.rs index a24a1593e9f..352092619bc 100644 --- a/components/backup/src/metrics.rs +++ b/components/backup/src/metrics.rs @@ -3,12 +3,9 @@ use lazy_static::*; use prometheus::*; +// When adding new metrics, remember to update in the grafana dashboard, for +// example update the tikv_details.dashboard.py. lazy_static! 
{ - pub static ref BACKUP_REQUEST_HISTOGRAM: Histogram = register_histogram!( - "tikv_backup_request_duration_seconds", - "Bucketed histogram of backup requests duration" - ) - .unwrap(); pub static ref BACKUP_RANGE_HISTOGRAM_VEC: HistogramVec = register_histogram_vec!( "tikv_backup_range_duration_seconds", "Bucketed histogram of backup range duration", diff --git a/components/batch-system/Cargo.toml b/components/batch-system/Cargo.toml index b945b8bb304..09607aa8fe2 100644 --- a/components/batch-system/Cargo.toml +++ b/components/batch-system/Cargo.toml @@ -9,7 +9,6 @@ default = ["test-runner"] test-runner = ["derive_more"] [dependencies] -collections = { workspace = true } crossbeam = { workspace = true } dashmap = "5.2" derive_more = { version = "0.99", optional = true } @@ -21,7 +20,6 @@ online_config = { workspace = true } prometheus = { version = "0.13", default-features = false, features = ["nightly"] } resource_control = { workspace = true } serde = { version = "1.0", features = ["derive"] } -serde_derive = "1.0" slog = { workspace = true } slog-global = { workspace = true } tikv_alloc = { workspace = true } diff --git a/components/causal_ts/Cargo.toml b/components/causal_ts/Cargo.toml index ac9f13b6d60..6928880f4ac 100644 --- a/components/causal_ts/Cargo.toml +++ b/components/causal_ts/Cargo.toml @@ -9,22 +9,15 @@ license = "Apache-2.0" testexport = [] [dependencies] -api_version = { workspace = true } async-trait = { version = "0.1" } -engine_rocks = { workspace = true } -engine_traits = { workspace = true } enum_dispatch = "0.3.8" error_code = { workspace = true } -fail = "0.5" futures = { version = "0.3" } -kvproto = { workspace = true } lazy_static = "1.3" -log_wrappers = { workspace = true } parking_lot = "0.12" pd_client = { workspace = true } prometheus = { version = "0.13", features = ["nightly"] } prometheus-static-metric = "0.5" -raft = { workspace = true } serde = "1.0" serde_derive = "1.0" slog = { workspace = true } @@ -33,7 +26,6 @@ slog-global = { workspace = true } # TODO: move it to `dev-dependencies` after we have a better way to handle it. 
test_pd_client = { workspace = true } thiserror = "1.0" -tikv_alloc = { workspace = true } tikv_util = { workspace = true } tokio = { version = "1", features = ["sync"] } txn_types = { workspace = true } diff --git a/components/cdc/Cargo.toml b/components/cdc/Cargo.toml index 33980a8cecb..592effa62bd 100644 --- a/components/cdc/Cargo.toml +++ b/components/cdc/Cargo.toml @@ -39,7 +39,6 @@ engine_rocks = { workspace = true } engine_traits = { workspace = true } fail = "0.5" futures = "0.3" -futures-timer = "3.0" getset = "0.1" grpcio = { workspace = true } keys = { workspace = true } @@ -69,7 +68,6 @@ txn_types = { workspace = true } criterion = "0.3" engine_rocks = { workspace = true } engine_traits = { workspace = true } -tempfile = "3.0" test_pd_client = { workspace = true } test_raftstore = { workspace = true } test_util = { workspace = true } diff --git a/components/cdc/src/channel.rs b/components/cdc/src/channel.rs index c49bec00547..98520dc7b02 100644 --- a/components/cdc/src/channel.rs +++ b/components/cdc/src/channel.rs @@ -22,13 +22,13 @@ use kvproto::cdcpb::{ChangeDataEvent, Event, ResolvedTs}; use protobuf::Message; use tikv_util::{ future::block_on_timeout, - impl_display_as_debug, + impl_display_as_debug, info, memory::{MemoryQuota, MemoryQuotaExceeded}, time::Instant, warn, }; -use crate::metrics::*; +use crate::{metrics::*, service::ConnId}; /// The maximum bytes of events can be batched into one `CdcEvent::Event`, 32KB. pub const CDC_EVENT_MAX_BYTES: usize = 32 * 1024; @@ -194,7 +194,7 @@ impl EventBatcher { } } -pub fn channel(buffer: usize, memory_quota: Arc) -> (Sink, Drain) { +pub fn channel(conn_id: ConnId, buffer: usize, memory_quota: Arc) -> (Sink, Drain) { let (unbounded_sender, unbounded_receiver) = unbounded(); let (bounded_sender, bounded_receiver) = bounded(buffer); ( @@ -207,14 +207,19 @@ pub fn channel(buffer: usize, memory_quota: Arc) -> (Sink, Drain) { unbounded_receiver, bounded_receiver, memory_quota, + conn_id, }, ) } #[derive(Clone, Debug, PartialEq)] pub enum SendError { + // Full is returned by the sender if the channel is full, this should only happen to the + // bounded sender. Full, + // Disconnected is returned by the sender if the channel is disconnected. Disconnected, + // Congested is returned if memory quota exceeded. 
Congested, } @@ -355,6 +360,7 @@ pub struct Drain { unbounded_receiver: UnboundedReceiver, bounded_receiver: Receiver, memory_quota: Arc, + conn_id: ConnId, } impl<'a> Drain { @@ -362,6 +368,7 @@ impl<'a> Drain { let observed = (&mut self.unbounded_receiver).map(|x| (x.created, x.event, x.size)); let scaned = (&mut self.bounded_receiver).filter_map(|x| { if x.truncated.load(Ordering::Acquire) { + self.memory_quota.free(x.size as _); return futures::future::ready(None); } futures::future::ready(Some((x.created, x.event, x.size))) @@ -420,14 +427,17 @@ impl Drop for Drain { self.bounded_receiver.close(); self.unbounded_receiver.close(); let start = Instant::now(); - let mut drain = Box::pin(async { + let mut total_bytes = 0; + let mut drain = Box::pin(async move { + let conn_id = self.conn_id; let memory_quota = self.memory_quota.clone(); - let mut total_bytes = 0; let mut drain = self.drain(); while let Some((_, bytes)) = drain.next().await { total_bytes += bytes; } memory_quota.free(total_bytes); + info!("drop Drain finished, free memory"; "conn_id" => ?conn_id, + "freed_bytes" => total_bytes, "inuse_bytes" => memory_quota.in_use()); }); block_on(&mut drain); let takes = start.saturating_elapsed(); @@ -463,7 +473,7 @@ mod tests { type Send = Box Result<(), SendError>>; fn new_test_channel(buffer: usize, capacity: usize, force_send: bool) -> (Send, Drain) { let memory_quota = Arc::new(MemoryQuota::new(capacity)); - let (mut tx, rx) = channel(buffer, memory_quota); + let (mut tx, rx) = channel(ConnId::default(), buffer, memory_quota); let mut flag = true; let send = move |event| { flag = !flag; @@ -476,6 +486,38 @@ mod tests { (Box::new(send), rx) } + #[test] + fn test_scanned_event() { + let mut e = Event::default(); + e.region_id = 233; + { + let memory_quota = Arc::new(MemoryQuota::new(1024)); + let (mut tx, mut rx) = channel(ConnId::default(), 10, memory_quota); + + let truncated = Arc::new(AtomicBool::new(false)); + let event = CdcEvent::Event(e.clone()); + let size = event.size() as usize; + let _ = block_on(tx.send_all(vec![event], truncated)); + + let memory_quota = rx.memory_quota.clone(); + let mut drain = rx.drain(); + assert_matches!(block_on(drain.next()), Some((CdcEvent::Event(_), _))); + assert_eq!(memory_quota.in_use(), size); + } + { + let memory_quota = Arc::new(MemoryQuota::new(1024)); + let (mut tx, mut rx) = channel(ConnId::default(), 10, memory_quota); + + let truncated = Arc::new(AtomicBool::new(true)); + let _ = block_on(tx.send_all(vec![CdcEvent::Event(e)], truncated)); + + let memory_quota = rx.memory_quota.clone(); + let mut drain = rx.drain(); + recv_timeout(&mut drain, Duration::from_millis(100)).unwrap_err(); + assert_eq!(memory_quota.in_use(), 0); + } + } + #[test] fn test_barrier() { let force_send = false; @@ -535,10 +577,10 @@ mod tests { #[test] fn test_congest() { - let mut e = kvproto::cdcpb::Event::default(); + let mut e = Event::default(); e.region_id = 1; let event = CdcEvent::Event(e.clone()); - assert!(event.size() != 0); + assert_ne!(event.size(), 0); // 1KB let max_pending_bytes = 1024; let buffer = max_pending_bytes / event.size(); @@ -552,10 +594,10 @@ mod tests { #[test] fn test_set_capacity() { - let mut e = kvproto::cdcpb::Event::default(); + let mut e = Event::default(); e.region_id = 1; let event = CdcEvent::Event(e.clone()); - assert!(event.size() != 0); + assert_ne!(event.size(), 0); // 1KB let max_pending_bytes = 1024; let buffer = max_pending_bytes / event.size(); @@ -603,15 +645,15 @@ mod tests { #[test] fn test_force_send() { - let mut 
e = kvproto::cdcpb::Event::default(); + let mut e = Event::default(); e.region_id = 1; let event = CdcEvent::Event(e.clone()); - assert!(event.size() != 0); + assert_ne!(event.size(), 0); // 1KB let max_pending_bytes = 1024; let buffer = max_pending_bytes / event.size(); let memory_quota = Arc::new(MemoryQuota::new(max_pending_bytes as _)); - let (tx, _rx) = channel(buffer as _, memory_quota); + let (tx, _rx) = channel(ConnId::default(), buffer as _, memory_quota); for _ in 0..buffer { tx.unbounded_send(CdcEvent::Event(e.clone()), false) .unwrap(); @@ -626,10 +668,10 @@ mod tests { #[test] fn test_channel_memory_leak() { - let mut e = kvproto::cdcpb::Event::default(); + let mut e = Event::default(); e.region_id = 1; let event = CdcEvent::Event(e.clone()); - assert!(event.size() != 0); + assert_ne!(event.size(), 0); // 1KB let max_pending_bytes = 1024; let buffer = max_pending_bytes / event.size() + 1; diff --git a/components/cdc/src/delegate.rs b/components/cdc/src/delegate.rs index 050e9419cb0..beff91f7b45 100644 --- a/components/cdc/src/delegate.rs +++ b/components/cdc/src/delegate.rs @@ -1,16 +1,20 @@ // Copyright 2020 TiKV Project Authors. Licensed under Apache-2.0. use std::{ - mem, + collections::btree_map::{BTreeMap, Entry as BTreeMapEntry}, + fmt, + ops::Bound, + result::Result as StdResult, string::String, sync::{ atomic::{AtomicBool, AtomicUsize, Ordering}, Arc, }, + time::Duration, }; use api_version::{ApiV2, KeyMode, KvFormat}; -use collections::{HashMap, HashMapEntry}; +use collections::HashMap; use crossbeam::atomic::AtomicCell; use kvproto::{ cdcpb::{ @@ -28,21 +32,22 @@ use raftstore::{ store::util::compare_region_epoch, Error as RaftStoreError, }; -use resolved_ts::{Resolver, TsSource, ON_DROP_WARN_HEAP_SIZE}; use tikv::storage::{txn::TxnEntry, Statistics}; use tikv_util::{ debug, info, memory::{HeapSize, MemoryQuota}, + time::Instant, warn, }; use txn_types::{Key, Lock, LockType, TimeStamp, WriteBatchFlags, WriteRef, WriteType}; use crate::{ channel::{CdcEvent, SendError, Sink, CDC_EVENT_MAX_BYTES}, + endpoint::Advance, initializer::KvEntry, metrics::*, old_value::{OldValueCache, OldValueCallback}, - service::ConnId, + service::{Conn, ConnId, FeatureGate, RequestId}, txn_source::TxnSource, Error, Result, }; @@ -70,7 +75,7 @@ pub enum DownstreamState { /// It's just created and rejects change events and resolved timestamps. Uninitialized, /// It has got a snapshot for incremental scan, and change events will be - /// accepted. However it still rejects resolved timestamps. + /// accepted. However, it still rejects resolved timestamps. Initializing, /// Incremental scan is finished so that resolved timestamps are acceptable /// now. @@ -120,26 +125,39 @@ impl DownstreamState { } } -#[derive(Clone)] pub struct Downstream { - // TODO: include cdc request. /// A unique identifier of the Downstream. - id: DownstreamId, - // The request ID set by CDC to identify events corresponding different requests. - req_id: u64, - conn_id: ConnId, - // The IP address of downstream. - peer: String, - region_epoch: RegionEpoch, + pub id: DownstreamId, + /// The IP address of downstream. + pub peer: String, + pub region_epoch: RegionEpoch, + /// The request ID set by CDC to identify events corresponding different + /// requests. 
+ pub req_id: RequestId, + pub conn_id: ConnId, + + pub kv_api: ChangeDataRequestKvApi, + pub filter_loop: bool, + pub observed_range: ObservedRange, + sink: Option, state: Arc>, - kv_api: ChangeDataRequestKvApi, - filter_loop: bool, - pub(crate) observed_range: ObservedRange, - - // When meet region errors like split or merge, we can cancel incremental scan draining - // by `scan_truncated`. pub(crate) scan_truncated: Arc, + + // Fields to handle ResolvedTs advancing. If `lock_heap` is none it means + // the downstream hasn't finished the incremental scanning. + lock_heap: Option>, + advanced_to: TimeStamp, +} + +impl fmt::Debug for Downstream { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.debug_struct("Downstream") + .field("id", &self.id) + .field("req_id", &self.req_id) + .field("conn_id", &self.conn_id) + .finish() + } } impl Downstream { @@ -150,7 +168,7 @@ impl Downstream { pub fn new( peer: String, region_epoch: RegionEpoch, - req_id: u64, + req_id: RequestId, conn_id: ConnId, kv_api: ChangeDataRequestKvApi, filter_loop: bool, @@ -158,17 +176,21 @@ impl Downstream { ) -> Downstream { Downstream { id: DownstreamId::new(), - req_id, - conn_id, peer, region_epoch, - sink: None, - state: Arc::new(AtomicCell::new(DownstreamState::default())), + req_id, + conn_id, kv_api, filter_loop, + observed_range, + sink: None, + state: Arc::new(AtomicCell::new(DownstreamState::default())), scan_truncated: Arc::new(AtomicBool::new(false)), + + lock_heap: None, + advanced_to: TimeStamp::zero(), } } @@ -176,10 +198,10 @@ impl Downstream { // because the sink can be also used by an incremental scan. We must ensure // no more events can be pushed to the sink after an `EventError` is sent. pub fn sink_event(&self, mut event: Event, force: bool) -> Result<()> { - event.set_request_id(self.req_id); + event.set_request_id(self.req_id.0); if self.sink.is_none() { info!("cdc drop event, no sink"; - "conn_id" => ?self.conn_id, "downstream_id" => ?self.id, "req_id" => self.req_id); + "conn_id" => ?self.conn_id, "downstream_id" => ?self.id, "req_id" => ?self.req_id); return Err(Error::Sink(SendError::Disconnected)); } let sink = self.sink.as_ref().unwrap(); @@ -187,13 +209,13 @@ impl Downstream { Ok(_) => Ok(()), Err(SendError::Disconnected) => { debug!("cdc send event failed, disconnected"; - "conn_id" => ?self.conn_id, "downstream_id" => ?self.id, "req_id" => self.req_id); + "conn_id" => ?self.conn_id, "downstream_id" => ?self.id, "req_id" => ?self.req_id); Err(Error::Sink(SendError::Disconnected)) } // TODO handle errors. Err(e @ SendError::Full) | Err(e @ SendError::Congested) => { info!("cdc send event failed, full"; - "conn_id" => ?self.conn_id, "downstream_id" => ?self.id, "req_id" => self.req_id); + "conn_id" => ?self.conn_id, "downstream_id" => ?self.id, "req_id" => ?self.req_id); Err(Error::Sink(e)) } } @@ -204,7 +226,7 @@ impl Downstream { /// `sink_error_event` is called. 
pub fn sink_error_event(&self, region_id: u64, err_event: EventError) -> Result<()> { info!("cdc downstream meets region error"; - "conn_id" => ?self.conn_id, "downstream_id" => ?self.id, "req_id" => self.req_id); + "conn_id" => ?self.conn_id, "downstream_id" => ?self.id, "req_id" => ?self.req_id); self.scan_truncated.store(true, Ordering::Release); let mut change_data_event = Event::default(); @@ -219,107 +241,79 @@ impl Downstream { self.sink = Some(sink); } - pub fn get_id(&self) -> DownstreamId { - self.id - } - - pub fn get_filter_loop(&self) -> bool { - self.filter_loop - } - pub fn get_state(&self) -> Arc> { self.state.clone() } - - pub fn get_conn_id(&self) -> ConnId { - self.conn_id - } - pub fn get_req_id(&self) -> u64 { - self.req_id - } } -struct Pending { - downstreams: Vec, - locks: Vec, - pending_bytes: usize, - memory_quota: Arc, +// In `PendingLock`, `key` is encoded. +pub enum PendingLock { + Track { key: Key, start_ts: MiniLock }, + Untrack { key: Key }, } -impl Pending { - fn new(memory_quota: Arc) -> Pending { - Pending { - downstreams: vec![], - locks: vec![], - pending_bytes: 0, - memory_quota, +impl HeapSize for PendingLock { + fn approximate_heap_size(&self) -> usize { + match self { + PendingLock::Track { key, .. } | PendingLock::Untrack { key } => { + key.approximate_heap_size() + } } } +} - fn push_pending_lock(&mut self, lock: PendingLock) -> Result<()> { - let bytes = lock.approximate_heap_size(); - self.memory_quota.alloc(bytes)?; - self.locks.push(lock); - self.pending_bytes += bytes; - CDC_PENDING_BYTES_GAUGE.add(bytes as i64); - Ok(()) - } +pub enum LockTracker { + Pending, + Preparing(Vec), + Prepared { + region: Region, + locks: BTreeMap, + }, +} - fn on_region_ready(&mut self, resolver: &mut Resolver) -> Result<()> { - fail::fail_point!("cdc_pending_on_region_ready", |_| Err( - Error::MemoryQuotaExceeded(tikv_util::memory::MemoryQuotaExceeded) - )); - // Must take locks, otherwise it may double free memory quota on drop. - for lock in mem::take(&mut self.locks) { - self.memory_quota.free(lock.approximate_heap_size()); - match lock { - PendingLock::Track { key, start_ts } => { - resolver.track_lock(start_ts, key, None)?; - } - PendingLock::Untrack { key } => resolver.untrack_lock(&key, None), +impl fmt::Debug for LockTracker { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + match self { + LockTracker::Pending => write!(f, "LockTracker::Pending"), + LockTracker::Preparing(ref locks) => { + write!(f, "LockTracker::Preparing({})", locks.len()) + } + LockTracker::Prepared { locks, .. } => { + write!(f, "LockTracker::Prepared({})", locks.len()) } } - Ok(()) } } -impl Drop for Pending { - fn drop(&mut self) { - CDC_PENDING_BYTES_GAUGE.sub(self.pending_bytes as i64); - let locks = mem::take(&mut self.locks); - if locks.is_empty() { - return; - } +/// `MiniLock` is like `Lock`, but only contains fields that CDC cares about. +#[derive(Eq, PartialEq, Debug)] +pub struct MiniLock { + pub ts: TimeStamp, + pub txn_source: u64, + pub generation: u64, +} - // Free memory quota used by pending locks and unlocks. 
- let mut bytes = 0; - let num_locks = locks.len(); - for lock in locks { - bytes += lock.approximate_heap_size(); - } - if bytes > ON_DROP_WARN_HEAP_SIZE { - warn!("cdc drop huge Pending"; - "bytes" => bytes, - "num_locks" => num_locks, - "memory_quota_in_use" => self.memory_quota.in_use(), - "memory_quota_capacity" => self.memory_quota.capacity(), - ); +impl MiniLock { + pub fn new(ts: T, txn_source: u64, generation: u64) -> Self + where + TimeStamp: From, + { + MiniLock { + ts: TimeStamp::from(ts), + txn_source, + generation, } - self.memory_quota.free(bytes); } -} -enum PendingLock { - Track { key: Vec, start_ts: TimeStamp }, - Untrack { key: Vec }, -} - -impl HeapSize for PendingLock { - fn approximate_heap_size(&self) -> usize { - match self { - PendingLock::Track { key, .. } | PendingLock::Untrack { key } => { - key.approximate_heap_size() - } + #[cfg(test)] + pub fn from_ts(ts: T) -> Self + where + TimeStamp: From, + { + MiniLock { + ts: TimeStamp::from(ts), + txn_source: 0, + generation: 0, } } } @@ -327,47 +321,209 @@ impl HeapSize for PendingLock { /// A CDC delegate of a raftstore region peer. /// /// It converts raft commands into CDC events and broadcast to downstreams. -/// It also track trancation on the fly in order to compute resolved ts. +/// It also tracks transactions on the fly in order to compute resolved ts. pub struct Delegate { - pub handle: ObserveHandle, pub region_id: u64, + pub handle: ObserveHandle, + memory_quota: Arc, - // None if the delegate is not initialized. - region: Option, - pub resolver: Option, - - // Downstreams after the delegate has been resolved. - resolved_downstreams: Vec, - pending: Option, + lock_tracker: LockTracker, + downstreams: Vec, txn_extra_op: Arc>, failed: bool, + + created: Instant, + last_lag_warn: Instant, +} + +impl Drop for Delegate { + fn drop(&mut self) { + match &self.lock_tracker { + LockTracker::Pending => {} + LockTracker::Preparing(locks) => { + let mut free_bytes = 0; + for lock in locks { + free_bytes += lock.approximate_heap_size(); + } + self.memory_quota.free(free_bytes); + CDC_PENDING_BYTES_GAUGE.sub(free_bytes as _); + } + LockTracker::Prepared { locks, .. } => { + let mut free_bytes = 0; + for lock in locks.keys() { + free_bytes += lock.approximate_heap_size(); + } + self.memory_quota.free(free_bytes); + CDC_PENDING_BYTES_GAUGE.sub(free_bytes as _); + } + } + } } impl Delegate { + fn push_lock(&mut self, key: Key, start_ts: MiniLock) -> Result { + let bytes = key.approximate_heap_size(); + let mut lock_count_modify = 0; + match &mut self.lock_tracker { + LockTracker::Pending => unreachable!(), + LockTracker::Preparing(locks) => { + self.memory_quota.alloc(bytes)?; + CDC_PENDING_BYTES_GAUGE.add(bytes as _); + locks.push(PendingLock::Track { key, start_ts }); + } + LockTracker::Prepared { locks, .. 
} => match locks.entry(key) { + BTreeMapEntry::Occupied(mut x) => { + assert_eq!(x.get().ts, start_ts.ts); + assert!(x.get().generation <= start_ts.generation); + x.get_mut().generation = start_ts.generation; + } + BTreeMapEntry::Vacant(x) => { + x.insert(start_ts); + self.memory_quota.alloc(bytes)?; + CDC_PENDING_BYTES_GAUGE.add(bytes as _); + lock_count_modify = 1; + } + }, + } + Ok(lock_count_modify) + } + + fn pop_lock(&mut self, key: Key) -> Result { + let mut lock_count_modify = 0; + match &mut self.lock_tracker { + LockTracker::Pending => unreachable!(), + LockTracker::Preparing(locks) => { + let bytes = key.approximate_heap_size(); + self.memory_quota.alloc(bytes)?; + CDC_PENDING_BYTES_GAUGE.add(bytes as _); + locks.push(PendingLock::Untrack { key }); + } + LockTracker::Prepared { locks, .. } => { + if let Some((key, _)) = locks.remove_entry(&key) { + let bytes = key.approximate_heap_size(); + self.memory_quota.free(bytes); + CDC_PENDING_BYTES_GAUGE.sub(bytes as _); + lock_count_modify = -1; + } + } + } + Ok(lock_count_modify) + } + + pub(crate) fn init_lock_tracker(&mut self) -> bool { + if matches!(self.lock_tracker, LockTracker::Pending) { + self.lock_tracker = LockTracker::Preparing(vec![]); + return true; + } + false + } + + fn finish_prepare_lock_tracker( + &mut self, + region: Region, + mut locks: BTreeMap, + ) -> Result<()> { + let delta_locks = match std::mem::replace(&mut self.lock_tracker, LockTracker::Pending) { + LockTracker::Preparing(locks) => locks, + _ => unreachable!(), + }; + + let mut free_bytes = 0usize; + for delta_lock in delta_locks { + free_bytes += delta_lock.approximate_heap_size(); + match delta_lock { + PendingLock::Track { key, start_ts } => match locks.entry(key) { + BTreeMapEntry::Vacant(x) => { + x.insert(start_ts); + } + BTreeMapEntry::Occupied(x) => { + assert_eq!(x.get().ts, start_ts.ts); + assert!(x.get().generation <= start_ts.generation); + } + }, + PendingLock::Untrack { key } => match locks.entry(key.clone()) { + BTreeMapEntry::Vacant(..) => { + warn!("untrack lock not found when try to finish prepare lock tracker"; + "key" => %key); + } + BTreeMapEntry::Occupied(x) => { + x.remove(); + } + }, + } + } + self.memory_quota.free(free_bytes); + CDC_PENDING_BYTES_GAUGE.sub(free_bytes as _); + + let mut alloc_bytes = 0usize; + for key in locks.keys() { + alloc_bytes += key.approximate_heap_size(); + } + self.memory_quota.alloc(alloc_bytes)?; + CDC_PENDING_BYTES_GAUGE.add(alloc_bytes as _); + + self.lock_tracker = LockTracker::Prepared { region, locks }; + Ok(()) + } + + pub(crate) fn finish_scan_locks( + &mut self, + region: Region, + locks: BTreeMap, + ) -> Result> { + fail::fail_point!("cdc_finish_scan_locks_memory_quota_exceed", |_| Err( + Error::MemoryQuotaExceeded(tikv_util::memory::MemoryQuotaExceeded) + )); + + info!("cdc region is ready"; "region_id" => self.region_id); + self.finish_prepare_lock_tracker(region, locks)?; + + let region = match &self.lock_tracker { + LockTracker::Prepared { region, .. } => region, + _ => unreachable!(), + }; + + // Check observed key range in region. + let mut failed_downstreams = Vec::new(); + for downstream in &mut self.downstreams { + downstream.observed_range.update_region_key_range(region); + if let Err(e) = Self::check_epoch_on_ready(downstream, region) { + failed_downstreams.push((&*downstream, e)); + } + } + + Ok(failed_downstreams) + } + /// Create a Delegate the given region. 
pub fn new( region_id: u64, - txn_extra_op: Arc>, memory_quota: Arc, + txn_extra_op: Arc>, ) -> Delegate { Delegate { region_id, handle: ObserveHandle::new(), - resolver: None, - region: None, - resolved_downstreams: Vec::new(), - pending: Some(Pending::new(memory_quota)), + memory_quota, + + lock_tracker: LockTracker::Pending, + downstreams: Vec::new(), txn_extra_op, failed: false, + + created: Instant::now_coarse(), + last_lag_warn: Instant::now_coarse(), } } /// Let downstream subscribe the delegate. /// Return error if subscribe fails and the `Delegate` won't be changed. - pub fn subscribe(&mut self, downstream: Downstream) -> Result<()> { - if self.region.is_some() { - // Check if the downstream is out dated. - self.check_epoch_on_ready(&downstream)?; + pub fn subscribe(&mut self, downstream: Downstream) -> StdResult<(), (Error, Downstream)> { + if let LockTracker::Prepared { ref region, .. } = &self.lock_tracker { + // Check if the downstream is outdated. + if let Err(e) = Self::check_epoch_on_ready(&downstream, region) { + return Err((e, downstream)); + } } self.add_downstream(downstream); Ok(()) @@ -378,17 +534,11 @@ impl Delegate { } pub fn downstreams(&self) -> &Vec { - self.pending - .as_ref() - .map(|p| &p.downstreams) - .unwrap_or(&self.resolved_downstreams) + &self.downstreams } pub fn downstreams_mut(&mut self) -> &mut Vec { - self.pending - .as_mut() - .map(|p| &mut p.downstreams) - .unwrap_or(&mut self.resolved_downstreams) + &mut self.downstreams } /// Let downstream unsubscribe the delegate. @@ -402,7 +552,7 @@ impl Delegate { warn!("cdc send unsubscribe failed"; "region_id" => region_id, "error" => ?err, "origin_error" => ?error_event, "downstream_id" => ?d.id, "downstream" => ?d.peer, - "request_id" => d.req_id, "conn_id" => ?d.conn_id); + "request_id" => ?d.req_id, "conn_id" => ?d.conn_id); } } d.state.store(DownstreamState::Stopped); @@ -437,19 +587,18 @@ impl Delegate { warn!("cdc send region error failed"; "region_id" => region_id, "error" => ?err, "origin_error" => ?error, "downstream_id" => ?downstream.id, "downstream" => ?downstream.peer, - "request_id" => downstream.req_id, "conn_id" => ?downstream.conn_id); + "request_id" => ?downstream.req_id, "conn_id" => ?downstream.conn_id); } else { info!("cdc send region error success"; "region_id" => region_id, "origin_error" => ?error, "downstream_id" => ?downstream.id, "downstream" => ?downstream.peer, - "request_id" => downstream.req_id, "conn_id" => ?downstream.conn_id); + "request_id" => ?downstream.req_id, "conn_id" => ?downstream.conn_id); } - Ok(()) }; - // TODO: In case we drop error messages, maybe we need a heartbeat mechanism - // to allow TiCDC detect region status. - let _ = self.broadcast(send); + for downstream in &self.downstreams { + send(downstream); + } } /// `txn_extra_op` returns a shared flag which is accessed in TiKV's @@ -462,74 +611,101 @@ impl Delegate { self.txn_extra_op.as_ref() } - fn broadcast(&self, send: F) -> Result<()> - where - F: Fn(&Downstream) -> Result<()>, - { - let downstreams = self.downstreams(); - assert!( - !downstreams.is_empty(), - "region {} miss downstream", - self.region_id - ); - for downstream in downstreams { - send(downstream)?; - } - Ok(()) - } - - /// Install a resolver. Return downstreams which fail because of the - /// region's internal changes. - pub fn on_region_ready( + /// Try advance and broadcast resolved ts. 
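`subscribe` now returns the rejected `Downstream` together with the error instead of requiring the caller to clone it up front. A small sketch of the same ownership-returning pattern, with `Sub` and `check` as hypothetical stand-ins:

```rust
struct Sub {
    id: u64,
}

fn check(s: &Sub) -> Result<(), String> {
    if s.id == 0 { Err("bad epoch".into()) } else { Ok(()) }
}

fn subscribe(subs: &mut Vec<Sub>, s: Sub) -> Result<(), (String, Sub)> {
    // On failure, hand ownership of `s` back so the caller can still use it
    // (e.g. to send an error event) without an up-front clone.
    if let Err(e) = check(&s) {
        return Err((e, s));
    }
    subs.push(s);
    Ok(())
}

fn main() {
    let mut subs = Vec::new();
    assert!(subscribe(&mut subs, Sub { id: 1 }).is_ok());
    let (err, rejected) = subscribe(&mut subs, Sub { id: 0 }).unwrap_err();
    assert_eq!(err, "bad epoch");
    assert_eq!(rejected.id, 0);
}
```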
+ pub(crate) fn on_min_ts( &mut self, - mut resolver: Resolver, - region: Region, - ) -> Result> { - assert!( - self.resolver.is_none(), - "region {} resolver should not be ready", - self.region_id, - ); + min_ts: TimeStamp, + current_ts: TimeStamp, + connections: &HashMap, + advance: &mut Advance, + ) { + let locks = match &self.lock_tracker { + LockTracker::Prepared { locks, .. } => locks, + _ => { + let now = Instant::now_coarse(); + let elapsed = now.duration_since(self.created); + if elapsed > WARN_LAG_THRESHOLD + && now.duration_since(self.last_lag_warn) > WARN_LAG_INTERVAL + { + warn!( + "cdc region scan locks too slow"; + "region_id" => self.region_id, + "elapsed" => ?elapsed, + "stage" => ?self.lock_tracker, + ); + self.last_lag_warn = now; + } + return; + } + }; - // Check observed key range in region. - for downstream in self.downstreams_mut() { - downstream.observed_range.update_region_key_range(®ion); - } + let mut handle_downstream = |downstream: &mut Downstream| -> Option { + if !downstream.state.load().ready_for_advancing_ts() { + advance.blocked_on_scan += 1; + return None; + } + advance.scan_finished += 1; - // Mark the delegate as initialized. - info!("cdc region is ready"; "region_id" => self.region_id); - // Downstreams in pending must be moved to resolved_downstreams - // immediately and must not return in the middle, otherwise the delegate - // loses downstreams. - let mut pending = self.pending.take().unwrap(); - self.resolved_downstreams = mem::take(&mut pending.downstreams); + if downstream.lock_heap.is_none() { + let mut lock_heap = BTreeMap::::new(); + for (_, lock) in locks.range(downstream.observed_range.to_range()) { + let lock_count = lock_heap.entry(lock.ts).or_default(); + *lock_count += 1; + } + downstream.lock_heap = Some(lock_heap); + } + + let lock_heap = downstream.lock_heap.as_ref().unwrap(); + let min_lock = lock_heap.keys().next().cloned().unwrap_or(min_ts); + let advanced_to = std::cmp::min(min_lock, min_ts); + if advanced_to > downstream.advanced_to { + downstream.advanced_to = advanced_to; + } else { + advance.blocked_on_locks += 1; + } + Some(downstream.advanced_to) + }; - pending.on_region_ready(&mut resolver)?; - self.resolver = Some(resolver); - self.region = Some(region); + let mut slow_downstreams = Vec::new(); + for d in &mut self.downstreams { + let advanced_to = match handle_downstream(d) { + Some(ts) => ts, + None => continue, + }; - let mut failed_downstreams = Vec::new(); - for downstream in self.downstreams() { - if let Err(e) = self.check_epoch_on_ready(downstream) { - failed_downstreams.push((downstream, e)); + let features = connections.get(&d.conn_id).unwrap().features(); + if features.contains(FeatureGate::STREAM_MULTIPLEXING) { + let k = (d.conn_id, d.req_id); + let v = advance.multiplexing.entry(k).or_default(); + v.push(self.region_id, advanced_to); + } else if features.contains(FeatureGate::BATCH_RESOLVED_TS) { + let v = advance.exclusive.entry(d.conn_id).or_default(); + v.push(self.region_id, advanced_to); + } else { + let k = (d.conn_id, self.region_id); + let v = (d.req_id, advanced_to); + advance.compat.insert(k, v); + } + + let lag = current_ts + .physical() + .saturating_sub(d.advanced_to.physical()); + if Duration::from_millis(lag) > WARN_LAG_THRESHOLD { + slow_downstreams.push(d.id); } } - Ok(failed_downstreams) - } - /// Try advance and broadcast resolved ts. 
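Each downstream now keeps its own `lock_heap`, an ordered `BTreeMap` from lock `start_ts` to a count, so the smallest tracked lock bounds how far that downstream's resolved ts may advance. A sketch of that bound, assuming plain `u64` timestamps:

```rust
use std::collections::BTreeMap;

// The smallest key in the "lock heap" (a BTreeMap used as an ordered
// multiset of lock start_ts values) caps the resolved ts.
fn resolve(lock_heap: &BTreeMap<u64, isize>, min_ts: u64) -> u64 {
    let min_lock = lock_heap.keys().next().copied().unwrap_or(min_ts);
    min_lock.min(min_ts)
}

fn main() {
    let mut heap = BTreeMap::new();
    *heap.entry(100u64).or_insert(0isize) += 2; // two locks at ts = 100
    *heap.entry(90u64).or_insert(0isize) += 1;
    assert_eq!(resolve(&heap, 120), 90); // blocked by the oldest lock
    heap.remove(&90);
    assert_eq!(resolve(&heap, 120), 100);
    heap.clear();
    assert_eq!(resolve(&heap, 120), 120); // no locks: advance to min_ts
}
```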
- pub fn on_min_ts(&mut self, min_ts: TimeStamp) -> Option { - if self.resolver.is_none() { - debug!("cdc region resolver not ready"; - "region_id" => self.region_id, "min_ts" => min_ts); - return None; + if !slow_downstreams.is_empty() { + let now = Instant::now_coarse(); + if now.duration_since(self.last_lag_warn) > WARN_LAG_INTERVAL { + warn!( + "cdc region downstreams are too slow"; + "region_id" => self.region_id, + "downstreams" => ?slow_downstreams, + ); + self.last_lag_warn = now; + } } - debug!("cdc try to advance ts"; "region_id" => self.region_id, "min_ts" => min_ts); - let resolver = self.resolver.as_mut().unwrap(); - let resolved_ts = resolver.resolve(min_ts, None, TsSource::Cdc); - debug!("cdc resolved ts updated"; - "region_id" => self.region_id, "resolved_ts" => resolved_ts); - Some(resolved_ts) } pub fn on_batch( @@ -552,19 +728,17 @@ impl Delegate { } = cmd; if response.get_header().has_error() { let err_header = response.mut_header().take_error(); - self.mark_failed(); return Err(Error::request(err_header)); } if !request.has_admin_request() { let flags = WriteBatchFlags::from_bits_truncate(request.get_header().get_flags()); - let is_one_pc = flags.contains(WriteBatchFlags::ONE_PC); self.sink_data( index, request.requests.into(), + flags, old_value_cb, old_value_cache, statistics, - is_one_pc, )?; } else { self.sink_admin(request.take_admin_request(), response.take_admin_response())?; @@ -575,7 +749,7 @@ impl Delegate { pub(crate) fn convert_to_grpc_events( region_id: u64, - request_id: u64, + request_id: RequestId, entries: Vec>, filter_loop: bool, observed_range: &ObservedRange, @@ -605,7 +779,7 @@ impl Delegate { } decode_default(default.1, &mut row, &mut _has_value); row.old_value = old_value.finalized().unwrap_or_default(); - row_size = row.key.len() + row.value.len(); + row_size = row.key.len() + row.value.len() + row.old_value.len(); } Some(KvEntry::TxnEntry(TxnEntry::Commit { default, @@ -633,7 +807,7 @@ impl Delegate { } set_event_row_type(&mut row, EventLogType::Committed); row.old_value = old_value.finalized().unwrap_or_default(); - row_size = row.key.len() + row.value.len(); + row_size = row.key.len() + row.value.len() + row.old_value.len(); } None => { // This type means scan has finished. @@ -641,10 +815,9 @@ impl Delegate { row_size = 0; } } - let lossy_ddl_filter = TxnSource::is_lossy_ddl_reorg_source_set(row.txn_source); - let cdc_write_filter = - TxnSource::is_cdc_write_source_set(row.txn_source) && filter_loop; - if lossy_ddl_filter || cdc_write_filter { + if TxnSource::is_lossy_ddl_reorg_source_set(row.txn_source) + || filter_loop && TxnSource::is_cdc_write_source_set(row.txn_source) + { continue; } if current_rows_size + row_size >= CDC_EVENT_MAX_BYTES { @@ -665,7 +838,7 @@ impl Delegate { }; CdcEvent::Event(Event { region_id, - request_id, + request_id: request_id.0, event: Some(Event_oneof_event::Entries(event_entries)), ..Default::default() }) @@ -678,12 +851,13 @@ impl Delegate { &mut self, index: u64, requests: Vec, + flags: WriteBatchFlags, old_value_cb: &OldValueCallback, old_value_cache: &mut OldValueCache, statistics: &mut Statistics, - is_one_pc: bool, ) -> Result<()> { debug_assert_eq!(self.txn_extra_op.load(), TxnExtraOp::ReadOldValue); + let mut read_old_value = |row: &mut EventRow, read_old_ts| -> Result<()> { let key = Key::from_raw(&row.key).append_ts(row.start_ts.into()); let old_value = old_value_cb(key, read_old_ts, old_value_cache, statistics)?; @@ -691,274 +865,220 @@ impl Delegate { Ok(()) }; - // map[key] -> (event, has_value). 
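`sink_data` now receives the raw `WriteBatchFlags` and derives the 1PC decision inside `RowsBuilder` instead of taking a separate boolean parameter. A sketch of the flag check, using a made-up bit value rather than TiKV's real encoding:

```rust
// Hypothetical bit position; the real ONE_PC bit lives in WriteBatchFlags.
const ONE_PC: u64 = 1 << 3;

struct RowsBuilder {
    is_one_pc: bool,
}

fn start_batch(flags: u64) -> RowsBuilder {
    // Derive the commit mode once per batch from the header flags.
    RowsBuilder { is_one_pc: flags & ONE_PC != 0 }
}

fn main() {
    assert!(start_batch(ONE_PC).is_one_pc);
    assert!(!start_batch(0).is_one_pc);
}
```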
- let mut txn_rows: HashMap, (EventRow, bool)> = HashMap::default(); - let mut raw_rows: Vec = Vec::new(); + let mut rows_builder = RowsBuilder::default(); + rows_builder.is_one_pc = flags.contains(WriteBatchFlags::ONE_PC); for mut req in requests { - let res = match req.get_cmd_type() { - CmdType::Put => self.sink_put( - req.take_put(), - is_one_pc, - &mut txn_rows, - &mut raw_rows, - &mut read_old_value, - ), - CmdType::Delete => self.sink_delete(req.take_delete()), - _ => { - debug!( - "skip other command"; - "region_id" => self.region_id, - "command" => ?req, - ); - Ok(()) + match req.get_cmd_type() { + CmdType::Put => { + self.sink_put(req.take_put(), &mut rows_builder, &mut read_old_value)? } + CmdType::Delete => self.sink_delete(req.take_delete(), &mut rows_builder)?, + _ => debug!("cdc skip other command"; + "region_id" => self.region_id, + "command" => ?req), }; - if res.is_err() { - self.mark_failed(); - return res; - } } - let mut rows = Vec::with_capacity(txn_rows.len()); - for (_, (v, has_value)) in txn_rows { - if v.r_type == EventLogType::Prewrite && v.op_type == EventRowOpType::Put && !has_value + let (raws, txns) = rows_builder.finish_build(); + self.sink_downstream_raw(raws, index)?; + self.sink_downstream_tidb(txns)?; + Ok(()) + } + + fn sink_downstream_raw(&mut self, entries: Vec, index: u64) -> Result<()> { + let mut downstreams = Vec::with_capacity(self.downstreams.len()); + for d in &mut self.downstreams { + if d.kv_api == ChangeDataRequestKvApi::RawKv && d.state.load().ready_for_change_events() { - // It's possible that a prewrite command only contains lock but without - // default. It's not documented by classic Percolator but introduced with - // Large-Transaction. Those prewrites are not complete, we must skip them. + downstreams.push(d); + } + } + if downstreams.is_empty() { + return Ok(()); + } + + for downstream in downstreams { + let filtered_entries: Vec<_> = entries + .iter() + .filter(|x| downstream.observed_range.contains_raw_key(&x.key)) + .cloned() + .collect(); + if filtered_entries.is_empty() { continue; } - rows.push(v); + let event = Event { + region_id: self.region_id, + index, + request_id: downstream.req_id.0, + event: Some(Event_oneof_event::Entries(EventEntries { + entries: filtered_entries.into(), + ..Default::default() + })), + ..Default::default() + }; + downstream.sink_event(event, false)?; } - self.sink_downstream(rows, index, ChangeDataRequestKvApi::TiDb)?; - self.sink_downstream(raw_rows, index, ChangeDataRequestKvApi::RawKv) + Ok(()) } - fn sink_downstream( - &mut self, - entries: Vec, - index: u64, - kv_api: ChangeDataRequestKvApi, - ) -> Result<()> { - if entries.is_empty() { + fn sink_downstream_tidb(&mut self, entries: Vec<(EventRow, isize)>) -> Result<()> { + let mut downstreams = Vec::with_capacity(self.downstreams.len()); + for d in &mut self.downstreams { + if d.kv_api == ChangeDataRequestKvApi::TiDb && d.state.load().ready_for_change_events() + { + downstreams.push(d); + } + } + if downstreams.is_empty() { return Ok(()); } - // Filter the entries which are lossy DDL events. - // We don't need to send them to downstream. - let entries = entries - .iter() - .filter(|x| !TxnSource::is_lossy_ddl_reorg_source_set(x.txn_source)) - .cloned() - .collect::>(); - - let downstreams = self.downstreams(); - assert!( - !downstreams.is_empty(), - "region {} miss downstream", - self.region_id - ); - - // Collect the change event cause by user write, which cdc write source is not - // set. 
For changefeed which only need the user write, - // send the `filtered_entries`, or else, send them all. - let mut filtered_entries = None; for downstream in downstreams { - if downstream.filter_loop { - let filtered = entries - .iter() - .filter(|x| !TxnSource::is_cdc_write_source_set(x.txn_source)) - .cloned() - .collect::>(); - if !filtered.is_empty() { - filtered_entries = Some(filtered); + let mut filtered_entries = Vec::with_capacity(entries.len()); + for (entry, lock_count_modify) in &entries { + if !downstream.observed_range.contains_raw_key(&entry.key) { + continue; } - break; - } - } - let region_id = self.region_id; - let send = move |downstream: &Downstream| { - // No ready downstream or a downstream that does not match the kv_api type, will - // be ignored. There will be one region that contains both Txn & Raw entries. - // The judgement here is for sending entries to downstreams with correct kv_api. - if !downstream.state.load().ready_for_change_events() || downstream.kv_api != kv_api { - return Ok(()); - } - if downstream.filter_loop && filtered_entries.is_none() { - return Ok(()); - } + if *lock_count_modify != 0 && downstream.lock_heap.is_some() { + let lock_heap = downstream.lock_heap.as_mut().unwrap(); + match lock_heap.entry(entry.start_ts.into()) { + BTreeMapEntry::Vacant(x) => { + x.insert(*lock_count_modify); + } + BTreeMapEntry::Occupied(mut x) => { + *x.get_mut() += *lock_count_modify; + assert!( + *x.get() >= 0, + "lock_count_modify should never be negative, start_ts: {}", + entry.start_ts + ); + if *x.get() == 0 { + x.remove(); + } + } + } + } - let entries_clone = if downstream.filter_loop { - downstream - .observed_range - .filter_entries(filtered_entries.clone().unwrap()) - } else { - downstream.observed_range.filter_entries(entries.clone()) - }; + if TxnSource::is_lossy_ddl_reorg_source_set(entry.txn_source) + || downstream.filter_loop + && TxnSource::is_cdc_write_source_set(entry.txn_source) + { + continue; + } - if entries_clone.is_empty() { - return Ok(()); + filtered_entries.push(entry.clone()); + } + if filtered_entries.is_empty() { + continue; } - let event = Event { - region_id, - request_id: downstream.get_req_id(), - index, + region_id: self.region_id, + request_id: downstream.req_id.0, event: Some(Event_oneof_event::Entries(EventEntries { - entries: entries_clone.into(), + entries: filtered_entries.into(), ..Default::default() })), ..Default::default() }; - - // Do not force send for real time change data events. 
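Per-downstream filtering replaces the shared `filtered_entries` precomputation: lossy DDL reorg events are always dropped, and looped-back CDC writes are dropped only when the downstream sets `filter_loop`. A sketch of that predicate, with made-up bit positions for the two `txn_source` flags:

```rust
// Hypothetical bit layout; the real flags are defined by TxnSource.
const LOSSY_DDL: u64 = 1 << 0;
const CDC_WRITE: u64 = 1 << 1;

fn keep(txn_source: u64, filter_loop: bool) -> bool {
    // Lossy DDL events never reach a downstream; CDC-written events only
    // reach downstreams that did not ask to filter the replication loop.
    !(txn_source & LOSSY_DDL != 0 || (filter_loop && txn_source & CDC_WRITE != 0))
}

fn main() {
    assert!(keep(0, true));
    assert!(!keep(LOSSY_DDL, false));
    assert!(!keep(CDC_WRITE, true));
    assert!(keep(CDC_WRITE, false)); // loop-back kept when not filtering
}
```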
- let force_send = false; - downstream.sink_event(event, force_send) - }; - match self.broadcast(send) { - Ok(()) => Ok(()), - Err(e) => { - self.mark_failed(); - Err(e) - } + downstream.sink_event(event, false)?; } + Ok(()) } fn sink_put( &mut self, put: PutRequest, - is_one_pc: bool, - txn_rows: &mut HashMap, (EventRow, bool)>, - raw_rows: &mut Vec, + rows_builder: &mut RowsBuilder, read_old_value: impl FnMut(&mut EventRow, TimeStamp) -> Result<()>, ) -> Result<()> { let key_mode = ApiV2::parse_key_mode(put.get_key()); if key_mode == KeyMode::Raw { - self.sink_raw_put(put, raw_rows) + self.sink_raw_put(put, rows_builder) } else { - self.sink_txn_put(put, is_one_pc, txn_rows, read_old_value) + self.sink_txn_put(put, read_old_value, rows_builder) } } - fn sink_raw_put(&mut self, mut put: PutRequest, rows: &mut Vec) -> Result<()> { + fn sink_raw_put(&mut self, mut put: PutRequest, rows: &mut RowsBuilder) -> Result<()> { let mut row = EventRow::default(); decode_rawkv(put.take_key(), put.take_value(), &mut row)?; - rows.push(row); + rows.raws.push(row); Ok(()) } fn sink_txn_put( &mut self, mut put: PutRequest, - is_one_pc: bool, - rows: &mut HashMap, (EventRow, bool)>, mut read_old_value: impl FnMut(&mut EventRow, TimeStamp) -> Result<()>, + rows: &mut RowsBuilder, ) -> Result<()> { match put.cf.as_str() { "write" => { - let (mut row, mut has_value) = (EventRow::default(), false); - if decode_write(put.take_key(), &put.value, &mut row, &mut has_value, true) { + let key = Key::from_encoded_slice(&put.key).truncate_ts().unwrap(); + let row = rows.txns_by_key.entry(key).or_default(); + if decode_write( + put.take_key(), + &put.value, + &mut row.v, + &mut row.has_value, + true, + ) { return Ok(()); } - let commit_ts = if is_one_pc { - set_event_row_type(&mut row, EventLogType::Committed); - let commit_ts = TimeStamp::from(row.commit_ts); - read_old_value(&mut row, commit_ts.prev())?; - Some(commit_ts) - } else { - // 2PC - if row.commit_ts == 0 { - None - } else { - Some(TimeStamp::from(row.commit_ts)) - } - }; - // validate commit_ts must be greater than the current resolved_ts - if let (Some(resolver), Some(commit_ts)) = (&self.resolver, commit_ts) { - let resolved_ts = resolver.resolved_ts(); - assert!( - commit_ts > resolved_ts, - "region {} commit_ts: {:?}, resolved_ts: {:?}", - self.region_id, - commit_ts, - resolved_ts - ); - } - - match rows.entry(row.key.clone()) { - HashMapEntry::Occupied(o) => { - let o = o.into_mut(); - mem::swap(&mut o.0.value, &mut row.value); - o.0 = row; - } - HashMapEntry::Vacant(v) => { - v.insert((row, has_value)); - } + if rows.is_one_pc { + set_event_row_type(&mut row.v, EventLogType::Committed); + let read_old_ts = TimeStamp::from(row.v.commit_ts).prev(); + read_old_value(&mut row.v, read_old_ts)?; } } "lock" => { - let (mut row, mut has_value) = (EventRow::default(), false); let lock = Lock::parse(put.get_value()).unwrap(); let for_update_ts = lock.for_update_ts; - if decode_lock(put.take_key(), lock, &mut row, &mut has_value) { + let txn_source = lock.txn_source; + let generation = lock.generation; + + let key = Key::from_encoded_slice(&put.key); + let row = rows.txns_by_key.entry(key.clone()).or_default(); + if decode_lock(put.take_key(), lock, &mut row.v, &mut row.has_value) { return Ok(()); } - let read_old_ts = std::cmp::max(for_update_ts, row.start_ts.into()); - read_old_value(&mut row, read_old_ts)?; - - // In order to compute resolved ts, we must track inflight txns. 
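For pipelined transactions the same key can be prewritten repeatedly with a growing `generation`, so tracking a lock either inserts a new entry (raising the lock count by one) or merely refreshes the generation of an existing entry. A sketch with a simplified `MiniLock` and plain byte-vector keys:

```rust
use std::collections::{btree_map::Entry, BTreeMap};

// Simplified MiniLock; the real one also carries txn_source.
#[derive(Debug)]
struct MiniLock {
    ts: u64,
    generation: u64,
}

// Returns how many locks were added: 1 for a new key, 0 for a refresh.
fn track(locks: &mut BTreeMap<Vec<u8>, MiniLock>, key: Vec<u8>, lock: MiniLock) -> isize {
    match locks.entry(key) {
        Entry::Occupied(mut o) => {
            let existing = o.get_mut();
            assert_eq!(existing.ts, lock.ts); // same transaction
            assert!(existing.generation <= lock.generation); // monotonic
            existing.generation = lock.generation;
            0
        }
        Entry::Vacant(v) => {
            v.insert(lock);
            1
        }
    }
}

fn main() {
    let mut locks = BTreeMap::new();
    assert_eq!(track(&mut locks, b"k".to_vec(), MiniLock { ts: 5, generation: 1 }), 1);
    // A pipelined re-prewrite of the same key only bumps the generation.
    assert_eq!(track(&mut locks, b"k".to_vec(), MiniLock { ts: 5, generation: 2 }), 0);
}
```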
- match self.resolver { - Some(ref mut resolver) => { - resolver.track_lock(row.start_ts.into(), row.key.clone(), None)?; - } - None => { - assert!(self.pending.is_some(), "region resolver not ready"); - let pending = self.pending.as_mut().unwrap(); - pending.push_pending_lock(PendingLock::Track { - key: row.key.clone(), - start_ts: row.start_ts.into(), - })?; - } - } + assert_eq!(row.lock_count_modify, 0); + let mini_lock = MiniLock::new(row.v.start_ts, txn_source, generation); + row.lock_count_modify = self.push_lock(key, mini_lock)?; - let occupied = rows.entry(row.key.clone()).or_default(); - if occupied.1 { - assert!(!has_value); - has_value = true; - mem::swap(&mut occupied.0.value, &mut row.value); - } - *occupied = (row, has_value); + let read_old_ts = std::cmp::max(for_update_ts, row.v.start_ts.into()); + read_old_value(&mut row.v, read_old_ts)?; } "" | "default" => { let key = Key::from_encoded(put.take_key()).truncate_ts().unwrap(); - let row = rows.entry(key.into_raw().unwrap()).or_default(); - decode_default(put.take_value(), &mut row.0, &mut row.1); + let row = rows.txns_by_key.entry(key).or_default(); + decode_default(put.take_value(), &mut row.v, &mut row.has_value); } other => panic!("invalid cf {}", other), } Ok(()) } - fn sink_delete(&mut self, mut delete: DeleteRequest) -> Result<()> { + fn sink_delete(&mut self, mut delete: DeleteRequest, rows: &mut RowsBuilder) -> Result<()> { + // RawKV (API v2, and only API v2 can use CDC) has no lock and will write to + // default cf only. match delete.cf.as_str() { "lock" => { - let raw_key = Key::from_encoded(delete.take_key()).into_raw().unwrap(); - match self.resolver { - Some(ref mut resolver) => resolver.untrack_lock(&raw_key, None), - None => { - assert!(self.pending.is_some(), "region resolver not ready"); - let pending = self.pending.as_mut().unwrap(); - pending.push_pending_lock(PendingLock::Untrack { key: raw_key })?; - } + let key = Key::from_encoded(delete.take_key()); + let lock_count_modify = self.pop_lock(key.clone())?; + if lock_count_modify != 0 { + // If lock_count_modify isn't 0 it means the deletion must come from a commit + // or rollback, instead of any `Unlock` operations. + let row = rows.txns_by_key.get_mut(&key).unwrap(); + assert_eq!(row.lock_count_modify, 0); + row.lock_count_modify = lock_count_modify; } } "" | "default" | "write" => {} - other => { - panic!("invalid cf {}", other); - } + other => panic!("invalid cf {}", other), } Ok(()) } @@ -983,7 +1103,6 @@ impl Delegate { } _ => return Ok(()), }; - self.mark_failed(); Err(Error::request(store_err.into())) } @@ -993,10 +1112,9 @@ impl Delegate { } fn remove_downstream(&mut self, id: DownstreamId) -> Option { - let downstreams = self.downstreams_mut(); - if let Some(index) = downstreams.iter().position(|x| x.id == id) { - let downstream = downstreams.swap_remove(index); - if downstreams.is_empty() { + if let Some(index) = self.downstreams.iter().position(|x| x.id == id) { + let downstream = self.downstreams.swap_remove(index); + if self.downstreams.is_empty() { // Stop observing when the last downstream is removed. Otherwise the observer // will keep pushing events to the delegate. 
self.stop_observing(); @@ -1006,8 +1124,7 @@ impl Delegate { None } - fn check_epoch_on_ready(&self, downstream: &Downstream) -> Result<()> { - let region = self.region.as_ref().unwrap(); + fn check_epoch_on_ready(downstream: &Downstream, region: &Region) -> Result<()> { if let Err(e) = compare_region_epoch( &downstream.region_epoch, region, @@ -1020,7 +1137,7 @@ impl Delegate { "region_id" => region.id, "downstream_id" => ?downstream.id, "conn_id" => ?downstream.conn_id, - "req_id" => downstream.req_id, + "req_id" => ?downstream.req_id, "err" => ?e ); // Downstream is outdated, mark stop. @@ -1039,6 +1156,45 @@ impl Delegate { } } +#[derive(Default)] +struct RowsBuilder { + // map[Key]->(row, has_value, lock_count_modify) + txns_by_key: HashMap, + + raws: Vec, + + is_one_pc: bool, +} + +#[derive(Default)] +struct RowInBuilding { + v: EventRow, + has_value: bool, + lock_count_modify: isize, +} + +impl RowsBuilder { + fn finish_build(self) -> (Vec, Vec<(EventRow, isize)>) { + let mut txns = Vec::with_capacity(self.txns_by_key.len()); + for RowInBuilding { + v, + has_value, + lock_count_modify, + } in self.txns_by_key.into_values() + { + if v.r_type == EventLogType::Prewrite && v.op_type == EventRowOpType::Put && !has_value + { + // It's possible that a prewrite command only contains lock but without + // default. It's not documented by classic Percolator but introduced with + // Large-Transaction. Those prewrites are not complete, we must skip them. + continue; + } + txns.push((v, lock_count_modify)); + } + (self.raws, txns) + } +} + fn set_event_row_type(row: &mut EventRow, ty: EventLogType) { row.r_type = ty; } @@ -1108,29 +1264,27 @@ fn decode_write( false } -fn decode_lock(key: Vec, lock: Lock, row: &mut EventRow, has_value: &mut bool) -> bool { +fn decode_lock(key: Vec, mut lock: Lock, row: &mut EventRow, has_value: &mut bool) -> bool { + let key = Key::from_encoded(key); let op_type = match lock.lock_type { LockType::Put => EventRowOpType::Put, LockType::Delete => EventRowOpType::Delete, other => { - debug!("cdc skip lock record"; - "type" => ?other, - "start_ts" => ?lock.ts, - "key" => &log_wrappers::Value::key(&key), - "for_update_ts" => ?lock.for_update_ts); + debug!("cdc skip lock record"; "lock" => ?other, "key" => %key); return true; } }; - let key = Key::from_encoded(key); + row.start_ts = lock.ts.into_inner(); + row.generation = lock.generation; row.key = key.into_raw().unwrap(); row.op_type = op_type as _; - // used for filter out the event. see `txn_source` field for more detail. row.txn_source = lock.txn_source; set_event_row_type(row, EventLogType::Prewrite); - if let Some(value) = lock.short_value { - row.value = value; + if let Some(value) = lock.short_value.take() { + assert!(!*has_value, "unexpected lock with value: {:?}", lock); *has_value = true; + row.value = value; } false @@ -1167,21 +1321,37 @@ fn decode_default(value: Vec, row: &mut EventRow, has_value: &mut bool) { } /// Observed key range. 
-#[derive(Clone, Default)] +#[derive(Clone)] pub struct ObservedRange { - start_key_encoded: Vec, - end_key_encoded: Vec, - start_key_raw: Vec, - end_key_raw: Vec, - pub(crate) all_key_covered: bool, + pub start_key_encoded: Key, + pub end_key_encoded: Key, + pub start_key_raw: Vec, + pub end_key_raw: Vec, + pub all_key_covered: bool, +} + +impl Default for ObservedRange { + fn default() -> Self { + ObservedRange { + start_key_encoded: Key::from_encoded(vec![]), + end_key_encoded: Key::from_encoded(vec![]), + start_key_raw: vec![], + end_key_raw: vec![], + all_key_covered: false, + } + } } impl ObservedRange { pub fn new(start_key_encoded: Vec, end_key_encoded: Vec) -> Result { - let start_key_raw = Key::from_encoded(start_key_encoded.clone()) + let start_key_encoded = Key::from_encoded(start_key_encoded); + let end_key_encoded = Key::from_encoded(end_key_encoded); + let start_key_raw = start_key_encoded + .clone() .into_raw() .map_err(|e| Error::Other(e.into()))?; - let end_key_raw = Key::from_encoded(end_key_encoded.clone()) + let end_key_raw = end_key_encoded + .clone() .into_raw() .map_err(|e| Error::Other(e.into()))?; Ok(ObservedRange { @@ -1196,9 +1366,10 @@ impl ObservedRange { #[allow(clippy::collapsible_if)] pub fn update_region_key_range(&mut self, region: &Region) { // Check observed key range in region. - if self.start_key_encoded <= region.start_key { + if self.start_key_encoded.as_encoded() <= ®ion.start_key { if self.end_key_encoded.is_empty() - || (region.end_key <= self.end_key_encoded && !region.end_key.is_empty()) + || (®ion.end_key <= self.end_key_encoded.as_encoded() + && !region.end_key.is_empty()) { // Observed range covers the region. self.all_key_covered = true; @@ -1217,7 +1388,15 @@ impl ObservedRange { } pub fn contains_encoded_key(&self, key: &[u8]) -> bool { - self.is_key_in_range(&self.start_key_encoded, &self.end_key_encoded, key) + self.is_key_in_range( + self.start_key_encoded.as_encoded(), + self.end_key_encoded.as_encoded(), + key, + ) + } + + pub fn contains_raw_key(&self, key: &[u8]) -> bool { + self.is_key_in_range(&self.start_key_raw, &self.end_key_raw, key) } pub fn filter_entries(&self, mut entries: Vec) -> Vec { @@ -1228,8 +1407,21 @@ impl ObservedRange { entries.retain(|e| self.is_key_in_range(&self.start_key_raw, &self.end_key_raw, &e.key)); entries } + + fn to_range(&self) -> (Bound<&Key>, Bound<&Key>) { + let start = Bound::Included(&self.start_key_encoded); + let end = if self.end_key_encoded.is_empty() { + Bound::Unbounded + } else { + Bound::Excluded(&self.end_key_encoded) + }; + (start, end) + } } +const WARN_LAG_THRESHOLD: Duration = Duration::from_secs(600); +const WARN_LAG_INTERVAL: Duration = Duration::from_secs(60); + #[cfg(test)] mod tests { use std::cell::Cell; @@ -1253,9 +1445,9 @@ mod tests { let region_epoch = region.get_region_epoch().clone(); let quota = Arc::new(MemoryQuota::new(usize::MAX)); - let (sink, mut drain) = crate::channel::channel(1, quota); + let (sink, mut drain) = channel(ConnId::default(), 1, quota.clone()); let rx = drain.drain(); - let request_id = 123; + let request_id = RequestId(123); let mut downstream = Downstream::new( String::new(), region_epoch, @@ -1266,32 +1458,24 @@ mod tests { ObservedRange::default(), ); downstream.set_sink(sink); - let memory_quota = Arc::new(MemoryQuota::new(usize::MAX)); - let mut delegate = Delegate::new(region_id, Default::default(), memory_quota); + + let mut delegate = Delegate::new(region_id, quota, Default::default()); delegate.subscribe(downstream).unwrap(); 
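`to_range` maps an `ObservedRange` onto `Bound`s for `BTreeMap::range`, treating an empty end key as "to the end of the keyspace". A sketch of that convention over plain byte keys:

```rust
use std::collections::BTreeMap;
use std::ops::Bound;

// Empty end key means the range is unbounded on the right.
fn to_range(start: &[u8], end: &[u8]) -> (Bound<Vec<u8>>, Bound<Vec<u8>>) {
    let upper = if end.is_empty() {
        Bound::Unbounded
    } else {
        Bound::Excluded(end.to_vec())
    };
    (Bound::Included(start.to_vec()), upper)
}

fn main() {
    let mut locks = BTreeMap::new();
    for k in [b"a", b"m", b"z"] {
        locks.insert(k.to_vec(), ());
    }
    assert_eq!(locks.range(to_range(b"a", b"n")).count(), 2); // a, m
    assert_eq!(locks.range(to_range(b"m", b"")).count(), 2); // m, z (unbounded)
}
```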
assert!(delegate.handle.is_observing()); - let memory_quota = Arc::new(MemoryQuota::new(std::usize::MAX)); - let resolver = Resolver::new(region_id, memory_quota); - assert!( - delegate - .on_region_ready(resolver, region) - .unwrap() - .is_empty() - ); - assert!(delegate.downstreams()[0].observed_range.all_key_covered); + + assert!(delegate.init_lock_tracker()); + let fails = delegate + .finish_scan_locks(region, Default::default()) + .unwrap(); + assert!(fails.is_empty()); + assert!(delegate.downstreams[0].observed_range.all_key_covered); let rx_wrap = Cell::new(Some(rx)); let receive_error = || { let (event, rx) = block_on(rx_wrap.replace(None).unwrap().into_future()); rx_wrap.set(Some(rx)); - let event = event.unwrap(); - assert!( - matches!(event.0, CdcEvent::Event(_)), - "unknown event {:?}", - event - ); - if let CdcEvent::Event(mut e) = event.0 { - assert_eq!(e.get_request_id(), request_id); + if let CdcEvent::Event(mut e) = event.unwrap().0 { + assert_eq!(e.get_request_id(), request_id.0); let event = e.event.take().unwrap(); match event { Event_oneof_event::Error(err) => err, @@ -1316,6 +1500,10 @@ mod tests { let err = receive_error(); assert!(err.has_region_not_found()); + delegate.stop(Error::Sink(SendError::Congested)); + let err = receive_error(); + assert!(err.has_congested()); + let mut err_header = ErrorHeader::default(); err_header.set_epoch_not_match(Default::default()); delegate.stop(Error::request(err_header)); @@ -1384,8 +1572,8 @@ mod tests { #[test] fn test_delegate_subscribe_unsubscribe() { - let new_downstream = |id: u64, region_version: u64| { - let peer = format!("{}", id); + let new_downstream = |id: RequestId, region_version: u64| { + let peer = format!("{:?}", id); let mut epoch = RegionEpoch::default(); epoch.set_conf_ver(region_version); epoch.set_version(region_version); @@ -1403,19 +1591,19 @@ mod tests { // Create a new delegate. let memory_quota = Arc::new(MemoryQuota::new(usize::MAX)); let txn_extra_op = Arc::new(AtomicCell::new(TxnExtraOp::Noop)); - let mut delegate = Delegate::new(1, txn_extra_op.clone(), memory_quota); + let mut delegate = Delegate::new(1, memory_quota, txn_extra_op.clone()); assert_eq!(txn_extra_op.load(), TxnExtraOp::Noop); assert!(delegate.handle.is_observing()); // Subscribe once. - let downstream1 = new_downstream(1, 1); + let downstream1 = new_downstream(RequestId(1), 1); let downstream1_id = downstream1.id; delegate.subscribe(downstream1).unwrap(); assert_eq!(txn_extra_op.load(), TxnExtraOp::ReadOldValue); assert!(delegate.handle.is_observing()); // Subscribe twice and then unsubscribe the second downstream. - let downstream2 = new_downstream(2, 1); + let downstream2 = new_downstream(RequestId(2), 1); let downstream2_id = downstream2.id; delegate.subscribe(downstream2).unwrap(); assert!(!delegate.unsubscribe(downstream2_id, None)); @@ -1423,14 +1611,14 @@ mod tests { assert!(delegate.handle.is_observing()); // `on_region_ready` when the delegate isn't resolved. 
- delegate.subscribe(new_downstream(1, 2)).unwrap(); + delegate.subscribe(new_downstream(RequestId(1), 2)).unwrap(); let mut region = Region::default(); region.mut_region_epoch().set_conf_ver(1); region.mut_region_epoch().set_version(1); { - let memory_quota = Arc::new(MemoryQuota::new(std::usize::MAX)); + assert!(delegate.init_lock_tracker()); let failures = delegate - .on_region_ready(Resolver::new(1, memory_quota), region) + .finish_scan_locks(region, Default::default()) .unwrap(); assert_eq!(failures.len(), 1); let id = failures[0].0.id; @@ -1441,7 +1629,9 @@ mod tests { assert!(delegate.handle.is_observing()); // Subscribe with an invalid epoch. - delegate.subscribe(new_downstream(1, 2)).unwrap_err(); + delegate + .subscribe(new_downstream(RequestId(1), 2)) + .unwrap_err(); assert_eq!(delegate.downstreams().len(), 1); // Unsubscribe all downstreams. @@ -1524,10 +1714,11 @@ mod tests { .unwrap(); let memory_quota = Arc::new(MemoryQuota::new(usize::MAX)); let txn_extra_op = Arc::new(AtomicCell::new(TxnExtraOp::Noop)); - let mut delegate = Delegate::new(1, txn_extra_op, memory_quota); + let mut delegate = Delegate::new(1, memory_quota, txn_extra_op); assert!(delegate.handle.is_observing()); + assert!(delegate.init_lock_tracker()); - let mut map = HashMap::default(); + let mut rows_builder = RowsBuilder::default(); for k in b'a'..=b'e' { let mut put = PutRequest::default(); put.key = Key::from_raw(&[k]).into_encoded(); @@ -1537,7 +1728,7 @@ mod tests { put.key.clone(), 1.into(), 10, - None, + Some(b"test".to_vec()), TimeStamp::zero(), 0, TimeStamp::zero(), @@ -1545,35 +1736,26 @@ mod tests { ) .to_bytes(); delegate - .sink_txn_put( - put, - false, - &mut map, - |_: &mut EventRow, _: TimeStamp| Ok(()), - ) + .sink_txn_put(put, |_, _| Ok(()), &mut rows_builder) .unwrap(); } - assert_eq!(map.len(), 5); + assert_eq!(rows_builder.txns_by_key.len(), 5); - let (sink, mut drain) = channel(1, Arc::new(MemoryQuota::new(1024))); - let downstream = Downstream { - id: DownstreamId::new(), - req_id: 1, - conn_id: ConnId::new(), - peer: String::new(), - region_epoch: RegionEpoch::default(), - sink: Some(sink), - state: Arc::new(AtomicCell::new(DownstreamState::Normal)), - scan_truncated: Arc::new(Default::default()), - kv_api: ChangeDataRequestKvApi::TiDb, - filter_loop: false, + let (sink, mut drain) = channel(ConnId::default(), 1, Arc::new(MemoryQuota::new(1024))); + let mut downstream = Downstream::new( + "peer".to_owned(), + RegionEpoch::default(), + RequestId(1), + ConnId::new(), + ChangeDataRequestKvApi::TiDb, + false, observed_range, - }; + ); + downstream.set_sink(sink); + downstream.get_state().store(DownstreamState::Normal); delegate.add_downstream(downstream); - let entries = map.values().map(|(r, _)| r).cloned().collect(); - delegate - .sink_downstream(entries, 1, ChangeDataRequestKvApi::TiDb) - .unwrap(); + let (_, entries) = rows_builder.finish_build(); + delegate.sink_downstream_tidb(entries).unwrap(); let (mut tx, mut rx) = futures::channel::mpsc::unbounded(); let runtime = tokio::runtime::Runtime::new().unwrap(); @@ -1595,10 +1777,11 @@ mod tests { .unwrap(); let memory_quota = Arc::new(MemoryQuota::new(usize::MAX)); let txn_extra_op = Arc::new(AtomicCell::new(TxnExtraOp::Noop)); - let mut delegate = Delegate::new(1, txn_extra_op, memory_quota); + let mut delegate = Delegate::new(1, memory_quota, txn_extra_op); assert!(delegate.handle.is_observing()); + assert!(delegate.init_lock_tracker()); - let mut map = HashMap::default(); + let mut rows_builder = RowsBuilder::default(); for k in 
b'a'..=b'e' { let mut put = PutRequest::default(); put.key = Key::from_raw(&[k]).into_encoded(); @@ -1608,7 +1791,7 @@ mod tests { put.key.clone(), 1.into(), 10, - None, + Some(b"test".to_vec()), TimeStamp::zero(), 0, TimeStamp::zero(), @@ -1620,35 +1803,26 @@ mod tests { } put.value = lock.to_bytes(); delegate - .sink_txn_put( - put, - false, - &mut map, - |_: &mut EventRow, _: TimeStamp| Ok(()), - ) + .sink_txn_put(put, |_, _| Ok(()), &mut rows_builder) .unwrap(); } - assert_eq!(map.len(), 5); + assert_eq!(rows_builder.txns_by_key.len(), 5); - let (sink, mut drain) = channel(1, Arc::new(MemoryQuota::new(1024))); - let downstream = Downstream { - id: DownstreamId::new(), - req_id: 1, - conn_id: ConnId::new(), - peer: String::new(), - region_epoch: RegionEpoch::default(), - sink: Some(sink), - state: Arc::new(AtomicCell::new(DownstreamState::Normal)), - scan_truncated: Arc::new(Default::default()), - kv_api: ChangeDataRequestKvApi::TiDb, + let (sink, mut drain) = channel(ConnId::default(), 1, Arc::new(MemoryQuota::new(1024))); + let mut downstream = Downstream::new( + "peer".to_owned(), + RegionEpoch::default(), + RequestId(1), + ConnId::new(), + ChangeDataRequestKvApi::TiDb, filter_loop, observed_range, - }; + ); + downstream.set_sink(sink); + downstream.get_state().store(DownstreamState::Normal); delegate.add_downstream(downstream); - let entries = map.values().map(|(r, _)| r).cloned().collect(); - delegate - .sink_downstream(entries, 1, ChangeDataRequestKvApi::TiDb) - .unwrap(); + let (_, entries) = rows_builder.finish_build(); + delegate.sink_downstream_tidb(entries).unwrap(); let (mut tx, mut rx) = futures::channel::mpsc::unbounded(); let runtime = tokio::runtime::Runtime::new().unwrap(); @@ -1726,4 +1900,66 @@ mod tests { } } } + + #[test] + fn test_lock_tracker() { + let quota = Arc::new(MemoryQuota::new(usize::MAX)); + let mut delegate = Delegate::new(1, quota.clone(), Default::default()); + assert!(delegate.init_lock_tracker()); + assert!(!delegate.init_lock_tracker()); + + let mut k1 = Vec::with_capacity(100); + k1.extend_from_slice(Key::from_raw(b"key1").as_encoded()); + let k1 = Key::from_encoded(k1); + assert_eq!(delegate.push_lock(k1, MiniLock::from_ts(100)).unwrap(), 0); + assert_eq!(quota.in_use(), 100); + + delegate.pop_lock(Key::from_raw(b"key1")).unwrap(); + assert_eq!(quota.in_use(), 117); + + let mut k2 = Vec::with_capacity(200); + k2.extend_from_slice(Key::from_raw(b"key2").as_encoded()); + let k2 = Key::from_encoded(k2); + assert_eq!(delegate.push_lock(k2, MiniLock::from_ts(100)).unwrap(), 0); + assert_eq!(quota.in_use(), 317); + + let mut scaned_locks = BTreeMap::default(); + scaned_locks.insert(Key::from_raw(b"key1"), MiniLock::from_ts(100)); + scaned_locks.insert(Key::from_raw(b"key2"), MiniLock::from_ts(100)); + scaned_locks.insert(Key::from_raw(b"key3"), MiniLock::from_ts(100)); + delegate + .finish_prepare_lock_tracker(Default::default(), scaned_locks) + .unwrap(); + assert_eq!(quota.in_use(), 34); + + delegate.pop_lock(Key::from_raw(b"key2")).unwrap(); + delegate.pop_lock(Key::from_raw(b"key3")).unwrap(); + assert_eq!(quota.in_use(), 0); + + let v = delegate + .push_lock(Key::from_raw(b"key1"), MiniLock::from_ts(300)) + .unwrap(); + assert_eq!(v, 1); + assert_eq!(quota.in_use(), 17); + let v = delegate + .push_lock(Key::from_raw(b"key1"), MiniLock::from_ts(300)) + .unwrap(); + assert_eq!(v, 0); + assert_eq!(quota.in_use(), 17); + } + + #[test] + fn test_lock_tracker_untrack_vacant() { + let quota = Arc::new(MemoryQuota::new(usize::MAX)); + let mut delegate = 
Delegate::new(1, quota.clone(), Default::default()); + assert!(delegate.init_lock_tracker()); + assert!(!delegate.init_lock_tracker()); + + delegate.pop_lock(Key::from_raw(b"key1")).unwrap(); + let mut scaned_locks = BTreeMap::default(); + scaned_locks.insert(Key::from_raw(b"key2"), MiniLock::from_ts(100)); + delegate + .finish_prepare_lock_tracker(Default::default(), scaned_locks) + .unwrap(); + } } diff --git a/components/cdc/src/endpoint.rs b/components/cdc/src/endpoint.rs index a436da351c1..972be77067e 100644 --- a/components/cdc/src/endpoint.rs +++ b/components/cdc/src/endpoint.rs @@ -1,12 +1,11 @@ // Copyright 2020 TiKV Project Authors. Licensed under Apache-2.0. use std::{ - cell::RefCell, cmp::{Ord, Ordering as CmpOrdering, PartialOrd, Reverse}, - collections::BinaryHeap, + collections::{BTreeMap, BinaryHeap}, fmt, sync::{ - atomic::{AtomicIsize, Ordering}, + atomic::{AtomicBool, AtomicIsize, Ordering}, Arc, Mutex as StdMutex, }, time::Duration, @@ -34,9 +33,9 @@ use pd_client::{Feature, PdClient}; use raftstore::{ coprocessor::{CmdBatch, ObserveId}, router::CdcHandle, - store::fsm::{store::StoreRegionMeta, ChangeObserver}, + store::fsm::store::StoreRegionMeta, }; -use resolved_ts::{resolve_by_raft, LeadershipResolver, Resolver}; +use resolved_ts::{resolve_by_raft, LeadershipResolver}; use security::SecurityManager; use tikv::{ config::{CdcConfig, ResolvedTsConfig}, @@ -57,40 +56,35 @@ use tokio::{ runtime::{Builder, Runtime}, sync::Semaphore, }; -use txn_types::{TimeStamp, TxnExtra, TxnExtraScheduler}; +use txn_types::{Key, TimeStamp, TxnExtra, TxnExtraScheduler}; use crate::{ channel::{CdcEvent, SendError}, - delegate::{on_init_downstream, Delegate, Downstream, DownstreamId, DownstreamState}, + delegate::{on_init_downstream, Delegate, Downstream, DownstreamId, DownstreamState, MiniLock}, initializer::Initializer, metrics::*, old_value::{OldValueCache, OldValueCallback}, - service::{validate_kv_api, Conn, ConnId, FeatureGate}, + service::{validate_kv_api, Conn, ConnId, FeatureGate, RequestId}, CdcObserver, Error, }; const FEATURE_RESOLVED_TS_STORE: Feature = Feature::require(5, 0, 0); const METRICS_FLUSH_INTERVAL: u64 = 1_000; // 1s -// 10 minutes, it's the default gc life time of TiDB -// and is long enough for most transactions. -const WARN_RESOLVED_TS_LAG_THRESHOLD: Duration = Duration::from_secs(600); -// Suppress repeat resolved ts lag warning. -const WARN_RESOLVED_TS_COUNT_THRESHOLD: usize = 10; pub enum Deregister { Conn(ConnId), Request { conn_id: ConnId, - request_id: u64, + request_id: RequestId, }, Region { conn_id: ConnId, - request_id: u64, + request_id: RequestId, region_id: u64, }, Downstream { conn_id: ConnId, - request_id: u64, + request_id: RequestId, region_id: u64, downstream_id: DownstreamId, err: Option, @@ -169,7 +163,6 @@ pub enum Task { Register { request: ChangeDataRequest, downstream: Downstream, - conn_id: ConnId, }, Deregister(Deregister), OpenConn { @@ -189,10 +182,10 @@ pub enum Task { min_ts: TimeStamp, current_ts: TimeStamp, }, - ResolverReady { + FinishScanLocks { observe_id: ObserveId, region: Region, - resolver: Resolver, + locks: BTreeMap, }, RegisterMinTsEvent { leader_resolver: LeadershipResolver, @@ -203,11 +196,13 @@ pub enum Task { // the downstream switches to Normal after the previous commands was sunk. 
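`InitDownstream` now carries an `observe_id` plus a shared `build_resolver` flag, so only the downstream whose delegate just switched its tracker out of `Pending` is asked to collect locks during its scan. A minimal sketch of that one-shot handshake using an `AtomicBool`, with the delegate state reduced to a plain boolean:

```rust
use std::sync::{
    atomic::{AtomicBool, Ordering},
    Arc,
};

fn main() {
    // Shared between the endpoint and the initializer task.
    let build_resolver = Arc::new(AtomicBool::new(false));

    // Endpoint side: only the init that flips the tracker out of Pending
    // asks its scan to also collect locks.
    let mut tracker_is_pending = true;
    if tracker_is_pending {
        tracker_is_pending = false; // Pending -> Preparing
        build_resolver.store(true, Ordering::Release);
    }

    // Initializer side: decide whether to build the lock set while scanning.
    assert!(build_resolver.load(Ordering::Acquire));
    assert!(!tracker_is_pending);
}
```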
InitDownstream { region_id: u64, + observe_id: ObserveId, downstream_id: DownstreamId, downstream_state: Arc>, + sink: crate::channel::Sink, + build_resolver: Arc, // `incremental_scan_barrier` will be sent into `sink` to ensure all delta changes // are delivered to the downstream. And then incremental scan can start. - sink: crate::channel::Sink, incremental_scan_barrier: CdcEvent, cb: InitCallback, }, @@ -225,14 +220,13 @@ impl fmt::Debug for Task { Task::Register { ref request, ref downstream, - ref conn_id, .. } => de .field("type", &"register") .field("register request", request) .field("request", request) - .field("id", &downstream.get_id()) - .field("conn_id", conn_id) + .field("id", &downstream.id) + .field("conn_id", &downstream.conn_id) .finish(), Task::Deregister(deregister) => de .field("type", &"deregister") @@ -265,12 +259,12 @@ impl fmt::Debug for Task { .field("current_ts", current_ts) .field("min_ts", min_ts) .finish(), - Task::ResolverReady { + Task::FinishScanLocks { ref observe_id, ref region, .. } => de - .field("type", &"resolver_ready") + .field("type", &"finish_scan_locks") .field("observe_id", &observe_id) .field("region_id", ®ion.get_id()) .finish(), @@ -279,11 +273,13 @@ impl fmt::Debug for Task { } Task::InitDownstream { ref region_id, + ref observe_id, ref downstream_id, .. } => de .field("type", &"init_downstream") .field("region_id", ®ion_id) + .field("observe_id", &observe_id) .field("downstream", &downstream_id) .finish(), Task::TxnExtra(_) => de.field("type", &"txn_extra").finish(), @@ -299,8 +295,8 @@ impl fmt::Debug for Task { } } -#[derive(Clone, Copy, Debug, PartialEq, Eq)] -struct ResolvedRegion { +#[derive(Clone, Copy, PartialEq, Eq, Debug)] +pub(crate) struct ResolvedRegion { region_id: u64, resolved_ts: TimeStamp, } @@ -317,13 +313,14 @@ impl Ord for ResolvedRegion { } } -struct ResolvedRegionHeap { +#[derive(Default, Debug)] +pub(crate) struct ResolvedRegionHeap { // BinaryHeap is max heap, so we reverse order to get a min heap. heap: BinaryHeap>, } impl ResolvedRegionHeap { - fn push(&mut self, region_id: u64, resolved_ts: TimeStamp) { + pub(crate) fn push(&mut self, region_id: u64, resolved_ts: TimeStamp) { self.heap.push(Reverse(ResolvedRegion { region_id, resolved_ts, @@ -350,14 +347,118 @@ impl ResolvedRegionHeap { fn is_empty(&self) -> bool { self.heap.is_empty() } +} - fn clear(&mut self) { - self.heap.clear(); - } +#[derive(Default, Debug)] +pub(crate) struct Advance { + // multiplexing means one region can be subscribed multiple times in one `Conn`, + // in which case progresses are grouped by (ConnId, request_id). + pub(crate) multiplexing: HashMap<(ConnId, RequestId), ResolvedRegionHeap>, + + // exclusive means one region can only be subscribed one time in one `Conn`, + // in which case progresses are grouped by ConnId. + pub(crate) exclusive: HashMap, + + // To be compatible with old TiCDC client before v4.0.8. + // TODO(qupeng): we can deprecate support for too old TiCDC clients. + // map[(ConnId, region_id)]->(request_id, ts). 
+ pub(crate) compat: HashMap<(ConnId, u64), (RequestId, TimeStamp)>, + + pub(crate) scan_finished: usize, + + pub(crate) blocked_on_scan: usize, + + pub(crate) blocked_on_locks: usize, + + min_resolved_ts: u64, + min_ts_region_id: u64, +} + +impl Advance { + fn emit_resolved_ts(&mut self, connections: &HashMap) { + let handle_send_result = |conn: &Conn, res: Result<(), SendError>| match res { + Ok(_) => {} + Err(SendError::Disconnected) => { + debug!("cdc send event failed, disconnected"; + "conn_id" => ?conn.get_id(), "downstream" => ?conn.get_peer()); + } + Err(SendError::Full) | Err(SendError::Congested) => { + info!("cdc send event failed, full"; + "conn_id" => ?conn.get_id(), "downstream" => ?conn.get_peer()); + } + }; + + let mut batch_min_resolved_ts = 0; + let mut batch_min_ts_region_id = 0; + let mut batch_send = |ts: u64, conn: &Conn, req_id: RequestId, regions: Vec| { + if batch_min_resolved_ts == 0 || batch_min_resolved_ts > ts { + batch_min_resolved_ts = ts; + if !regions.is_empty() { + batch_min_ts_region_id = regions[0]; + } + } + + let mut resolved_ts = ResolvedTs::default(); + resolved_ts.ts = ts; + resolved_ts.request_id = req_id.0; + *resolved_ts.mut_regions() = regions; + + let res = conn + .get_sink() + .unbounded_send(CdcEvent::ResolvedTs(resolved_ts), false); + handle_send_result(conn, res); + }; + + let mut compat_min_resolved_ts = 0; + let mut compat_min_ts_region_id = 0; + let mut compat_send = |ts: u64, conn: &Conn, region_id: u64, req_id: RequestId| { + if compat_min_resolved_ts == 0 || compat_min_resolved_ts > ts { + compat_min_resolved_ts = ts; + compat_min_ts_region_id = region_id; + } + + let event = Event { + region_id, + request_id: req_id.0, + event: Some(Event_oneof_event::ResolvedTs(ts)), + ..Default::default() + }; + let res = conn + .get_sink() + .unbounded_send(CdcEvent::Event(event), false); + handle_send_result(conn, res); + }; + + let multiplexing = std::mem::take(&mut self.multiplexing).into_iter(); + let exclusive = std::mem::take(&mut self.exclusive).into_iter(); + let unioned = multiplexing + .map(|((a, b), c)| (a, b, c)) + .chain(exclusive.map(|(a, c)| (a, RequestId(0), c))); + + for (conn_id, req_id, mut region_ts_heap) in unioned { + let conn = connections.get(&conn_id).unwrap(); + let mut batch_count = 8; + while !region_ts_heap.is_empty() { + let (ts, regions) = region_ts_heap.pop(batch_count); + if conn.features().contains(FeatureGate::BATCH_RESOLVED_TS) { + batch_send(ts.into_inner(), conn, req_id, Vec::from_iter(regions)); + } + batch_count *= 4; + } + } - fn reset_and_shrink_to(&mut self, min_capacity: usize) { - self.clear(); - self.heap.shrink_to(min_capacity); + for ((conn_id, region_id), (req_id, ts)) in std::mem::take(&mut self.compat) { + let conn = connections.get(&conn_id).unwrap(); + compat_send(ts.into_inner(), conn, region_id, req_id); + } + + if batch_min_resolved_ts > 0 { + self.min_resolved_ts = batch_min_resolved_ts; + self.min_ts_region_id = batch_min_ts_region_id; + } else { + self.min_resolved_ts = compat_min_resolved_ts; + self.min_ts_region_id = compat_min_ts_region_id; + } } } @@ -384,9 +485,8 @@ pub struct Endpoint { resolved_ts_config: ResolvedTsConfig, api_version: ApiVersion, - // Incremental scan + // Incremental scan stuffs. workers: Runtime, - // The total number of scan tasks including running and pending. 
scan_task_counter: Arc, scan_concurrency_semaphore: Arc, scan_speed_limiter: Limiter, @@ -396,7 +496,6 @@ pub struct Endpoint { sink_memory_quota: Arc, old_value_cache: OldValueCache, - resolved_region_heap: RefCell, causal_ts_provider: Option>, @@ -406,7 +505,6 @@ pub struct Endpoint { min_ts_region_id: u64, resolved_region_count: usize, unresolved_region_count: usize, - warn_resolved_ts_repeat_count: usize, } impl, E: KvEngine, S: StoreRegionMeta> Endpoint { @@ -476,41 +574,42 @@ impl, E: KvEngine, S: StoreRegionMeta> Endpoint, E: KvEngine, S: StoreRegionMeta> Endpoint, E: KvEngine, S: StoreRegionMeta> Endpoint conn, @@ -709,7 +804,7 @@ impl, E: KvEngine, S: StoreRegionMeta> Endpoint region_id, "conn_id" => ?conn_id, - "req_id" => request_id, + "req_id" => ?request_id, "downstream_id" => ?downstream_id); return; } @@ -747,11 +842,11 @@ impl, E: KvEngine, S: StoreRegionMeta> Endpoint self.config.incremental_scan_concurrency_limit as isize { + if scan_task_count >= self.config.incremental_scan_concurrency_limit as isize { debug!("cdc rejects registration, too many scan tasks"; "region_id" => region_id, "conn_id" => ?conn_id, - "req_id" => request_id, + "req_id" => ?request_id, "scan_task_count" => scan_task_count, "incremental_scan_concurrency_limit" => self.config.incremental_scan_concurrency_limit, ); @@ -786,7 +881,7 @@ impl, E: KvEngine, S: StoreRegionMeta> Endpoint region_id, "conn_id" => ?conn_id, - "req_id" => request_id, + "req_id" => ?request_id, "downstream_id" => ?downstream_id); return; } @@ -798,8 +893,8 @@ impl, E: KvEngine, S: StoreRegionMeta> Endpoint, E: KvEngine, S: StoreRegionMeta> Endpoint region_id, "conn_id" => ?conn.get_id(), - "req_id" => request_id, + "req_id" => ?request_id, "observe_id" => ?observe_id, "downstream_id" => ?downstream_id); + let observed_range = downstream.observed_range.clone(); let downstream_state = downstream.get_state(); - let checkpoint_ts = request.checkpoint_ts; let sched = self.scheduler.clone(); + let scan_truncated = downstream.scan_truncated.clone(); - let downstream_ = downstream.clone(); - if let Err(err) = delegate.subscribe(downstream) { + if let Err((err, downstream)) = delegate.subscribe(downstream) { let error_event = err.into_error_event(region_id); - let _ = downstream_.sink_error_event(region_id, error_event); + let _ = downstream.sink_error_event(region_id, error_event); conn.unsubscribe(request_id, region_id); if is_new_delegate { self.capture_regions.remove(®ion_id); @@ -839,43 +934,39 @@ impl, E: KvEngine, S: StoreRegionMeta> Endpoint { CDC_SCAN_TASKS.with_label_values(&["finish"]).inc(); } @@ -883,7 +974,7 @@ impl, E: KvEngine, S: StoreRegionMeta> Endpoint region_id, - "conn_id" => ?init.conn_id, "request_id" => init.request_id, + "conn_id" => ?init.conn_id, "request_id" => ?init.request_id, ); init.deregister_downstream(e) } @@ -909,7 +1000,7 @@ impl, E: KvEngine, S: StoreRegionMeta> Endpoint, E: KvEngine, S: StoreRegionMeta> Endpoint, + ) { let region_id = region.get_id(); - let mut deregisters = Vec::new(); - if let Some(delegate) = self.capture_regions.get_mut(®ion_id) { - if delegate.handle.id == observe_id { - match delegate.on_region_ready(resolver, region) { + match self.capture_regions.get_mut(®ion_id) { + None => { + debug!("cdc region not found on region ready (finish scan locks)"; + "region_id" => region.get_id()); + } + Some(delegate) => { + if delegate.handle.id != observe_id { + debug!("cdc stale region ready"; + "region_id" => region.get_id(), + "observe_id" => ?observe_id, + "current_id" => 
?delegate.handle.id); + return; + } + match delegate.finish_scan_locks(region, locks) { Ok(fails) => { + let mut deregisters = Vec::new(); for (downstream, e) in fails { deregisters.push(Deregister::Downstream { - conn_id: downstream.get_conn_id(), - request_id: downstream.get_req_id(), + conn_id: downstream.conn_id, + request_id: downstream.req_id, region_id, - downstream_id: downstream.get_id(), + downstream_id: downstream.id, err: Some(e), }); } + // Deregister downstreams if there is any downstream fails to subscribe. + for deregister in deregisters { + self.on_deregister(deregister); + } } - Err(e) => deregisters.push(Deregister::Delegate { + Err(e) => self.on_deregister(Deregister::Delegate { region_id, observe_id, err: e, }), } - } else { - debug!("cdc stale region ready"; - "region_id" => region.get_id(), - "observe_id" => ?observe_id, - "current_id" => ?delegate.handle.id); } - } else { - debug!("cdc region not found on region ready (finish building resolver)"; - "region_id" => region.get_id()); - } - - // Deregister downstreams if there is any downstream fails to subscribe. - for deregister in deregisters { - self.on_deregister(deregister); } } fn on_min_ts(&mut self, regions: Vec, min_ts: TimeStamp, current_ts: TimeStamp) { - // Reset resolved_regions to empty. - let mut resolved_regions = self.resolved_region_heap.borrow_mut(); - resolved_regions.clear(); - - let total_region_count = regions.len(); - self.min_resolved_ts = TimeStamp::max(); - let mut advance_ok = 0; - let mut advance_failed_none = 0; - let mut advance_failed_same = 0; - let mut advance_failed_stale = 0; - for region_id in regions { - if let Some(delegate) = self.capture_regions.get_mut(®ion_id) { - let old_resolved_ts = delegate - .resolver - .as_ref() - .map_or(TimeStamp::zero(), |r| r.resolved_ts()); - if old_resolved_ts > min_ts { - advance_failed_stale += 1; - } - if let Some(resolved_ts) = delegate.on_min_ts(min_ts) { - if resolved_ts < self.min_resolved_ts { - self.min_resolved_ts = resolved_ts; - self.min_ts_region_id = region_id; - } - resolved_regions.push(region_id, resolved_ts); - if resolved_ts == old_resolved_ts { - advance_failed_same += 1; - } else { - advance_ok += 1; - } - } else { - advance_failed_none += 1; - } - } - } self.current_ts = current_ts; - let lag_millis = min_ts - .physical() - .saturating_sub(self.min_resolved_ts.physical()); - if Duration::from_millis(lag_millis) > WARN_RESOLVED_TS_LAG_THRESHOLD { - self.warn_resolved_ts_repeat_count += 1; - if self.warn_resolved_ts_repeat_count >= WARN_RESOLVED_TS_COUNT_THRESHOLD { - self.warn_resolved_ts_repeat_count = 0; - warn!("cdc resolved ts lag too large"; - "min_resolved_ts" => self.min_resolved_ts, - "min_ts_region_id" => self.min_ts_region_id, - "min_ts" => min_ts, - "lag" => ?Duration::from_millis(lag_millis), - "ok" => advance_ok, - "none" => advance_failed_none, - "stale" => advance_failed_stale, - "same" => advance_failed_same); - } - } - self.resolved_region_count = resolved_regions.heap.len(); - self.unresolved_region_count = total_region_count - self.resolved_region_count; - - // Separate broadcasting outlier regions and normal regions, - // so 1) downstreams know where they should send resolve lock requests, - // and 2) resolved ts of normal regions does not fallback. - // - // Regions are separated exponentially to reduce resolved ts events and - // save CPU for both TiKV and TiCDC. 
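The exponential batching survives the move into `Advance::emit_resolved_ts`: regions are popped from a min-heap in batches of 8, 32, 128 and so on, so laggard regions get small, precise resolved-ts messages while the bulk is coalesced into a few large ones. A self-contained sketch over `(resolved_ts, region_id)` pairs:

```rust
use std::cmp::Reverse;
use std::collections::BinaryHeap;

fn main() {
    // Reverse turns the max-heap into a min-heap ordered by resolved_ts.
    let mut heap: BinaryHeap<Reverse<(u64, u64)>> =
        (1..=100u64).map(|r| Reverse((1000 + r, r))).collect();

    let mut batch = 8;
    while !heap.is_empty() {
        let mut regions = Vec::new();
        let mut min_ts = u64::MAX;
        for _ in 0..batch {
            match heap.pop() {
                Some(Reverse((ts, region))) => {
                    min_ts = min_ts.min(ts);
                    regions.push(region);
                }
                None => break,
            }
        }
        // One resolved-ts message per batch; slow regions come first.
        println!("broadcast ts={} to {} regions", min_ts, regions.len());
        batch *= 4; // 8, 32, 128, ...
    }
}
```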
- let mut batch_count = 8; - while !resolved_regions.is_empty() { - let (outlier_min_resolved_ts, outlier_regions) = resolved_regions.pop(batch_count); - self.broadcast_resolved_ts(outlier_min_resolved_ts, outlier_regions); - batch_count *= 4; - } - } - - fn broadcast_resolved_ts(&self, min_resolved_ts: TimeStamp, regions: HashSet) { - let send_cdc_event = |ts: u64, conn: &Conn, request_id: u64, regions: Vec| { - let mut resolved_ts = ResolvedTs::default(); - resolved_ts.ts = ts; - resolved_ts.request_id = request_id; - *resolved_ts.mut_regions() = regions; - - let force_send = false; - match conn - .get_sink() - .unbounded_send(CdcEvent::ResolvedTs(resolved_ts), force_send) - { - Ok(_) => (), - Err(SendError::Disconnected) => { - debug!("cdc send event failed, disconnected"; - "conn_id" => ?conn.get_id(), "downstream" => ?conn.get_peer()); - } - Err(SendError::Full) | Err(SendError::Congested) => { - info!("cdc send event failed, full"; - "conn_id" => ?conn.get_id(), "downstream" => ?conn.get_peer()); - } - } - }; + self.min_resolved_ts = current_ts; - // multiplexing is for STREAM_MULTIPLEXING enabled. - let mut multiplexing = HashMap::<(ConnId, u64), Vec>::default(); - // one_way is fro STREAM_MULTIPLEXING disabled. - let mut one_way = HashMap::, Vec)>::default(); - for region_id in ®ions { - let d = match self.capture_regions.get(region_id) { - Some(d) => d, - None => continue, - }; - for downstream in d.downstreams() { - if !downstream.get_state().load().ready_for_advancing_ts() { - continue; - } - let conn_id = downstream.get_conn_id(); - let features = self.connections.get(&conn_id).unwrap().features(); - if features.contains(FeatureGate::STREAM_MULTIPLEXING) { - multiplexing - .entry((conn_id, downstream.get_req_id())) - .or_insert_with(Default::default) - .push(*region_id); - } else { - let x = one_way.entry(conn_id).or_insert_with(Default::default); - x.0.push(downstream.get_req_id()); - x.1.push(*region_id); - } - } - } - - let min_resolved_ts = min_resolved_ts.into_inner(); - - for ((conn_id, request_id), regions) in multiplexing { - let conn = self.connections.get(&conn_id).unwrap(); - if conn.features().contains(FeatureGate::BATCH_RESOLVED_TS) { - send_cdc_event(min_resolved_ts, conn, request_id, regions); - } else { - for region_id in regions { - self.broadcast_resolved_ts_compact( - conn, - request_id, - region_id, - min_resolved_ts, - ); - } - } - } - for (conn_id, reqs_regions) in one_way { - let conn = self.connections.get(&conn_id).unwrap(); - if conn.features().contains(FeatureGate::BATCH_RESOLVED_TS) { - send_cdc_event(min_resolved_ts, conn, 0, reqs_regions.1); - } else { - for i in 0..reqs_regions.0.len() { - self.broadcast_resolved_ts_compact( - conn, - reqs_regions.0[i], - reqs_regions.1[i], - min_resolved_ts, - ); - } + let mut advance = Advance::default(); + for region_id in regions { + if let Some(d) = self.capture_regions.get_mut(®ion_id) { + d.on_min_ts(min_ts, current_ts, &self.connections, &mut advance); } } - } - fn broadcast_resolved_ts_compact( - &self, - conn: &Conn, - request_id: u64, - region_id: u64, - resolved_ts: u64, - ) { - let downstream_id = conn.get_downstream(request_id, region_id).unwrap(); - let delegate = self.capture_regions.get(®ion_id).unwrap(); - let downstream = delegate.downstream(downstream_id).unwrap(); - if !downstream.get_state().load().ready_for_advancing_ts() { - return; - } - let resolved_ts_event = Event { - region_id, - request_id, - event: Some(Event_oneof_event::ResolvedTs(resolved_ts)), - ..Default::default() - }; - let 
force_send = false; - let _ = downstream.sink_event(resolved_ts_event, force_send); + self.resolved_region_count = advance.scan_finished; + self.unresolved_region_count = advance.blocked_on_scan; + advance.emit_resolved_ts(&self.connections); + self.min_resolved_ts = advance.min_resolved_ts.into(); + self.min_ts_region_id = advance.min_ts_region_id; } fn register_min_ts_event(&self, mut leader_resolver: LeadershipResolver, event_time: Instant) { @@ -1180,7 +1115,6 @@ impl, E: KvEngine, S: StoreRegionMeta> Endpoint pd_client.get_tso().await.unwrap_or_default(), }; let mut min_ts = min_ts_pd; - let mut min_ts_min_lock = min_ts_pd; // Sync with concurrency manager so that it can work correctly when // optimizations like async commit are enabled. @@ -1191,7 +1125,6 @@ impl, E: KvEngine, S: StoreRegionMeta> Endpoint, E: KvEngine, S: StoreRegionMeta> Endpoint (), // Must schedule `RegisterMinTsEvent` event otherwise resolved ts can not // advance normally. - Err(err) => panic!("failed to regiester min ts event, error: {:?}", err), + Err(err) => panic!("failed to register min ts event, error: {:?}", err), } } else { // During shutdown, tso runtime drops future immediately, @@ -1241,13 +1174,6 @@ impl, E: KvEngine, S: StoreRegionMeta> Endpoint panic!("failed to schedule min ts event, error: {:?}", err), } } - let lag_millis = min_ts_pd.physical().saturating_sub(min_ts.physical()); - if Duration::from_millis(lag_millis) > WARN_RESOLVED_TS_LAG_THRESHOLD { - // TODO: Suppress repeat logs by using WARN_RESOLVED_TS_COUNT_THRESHOLD. - info!("cdc min_ts lag too large"; - "min_ts" => min_ts, "min_ts_pd" => min_ts_pd, - "min_ts_min_lock" => min_ts_min_lock); - } }; self.tso_worker.spawn(fut); } @@ -1284,13 +1210,12 @@ impl, E: KvEngine, S: StoreRegionMeta + Send> Runnable Task::Register { request, downstream, - conn_id, - } => self.on_register(request, downstream, conn_id), - Task::ResolverReady { + } => self.on_register(request, downstream), + Task::FinishScanLocks { observe_id, - resolver, region, - } => self.on_region_ready(observe_id, resolver, region), + locks, + } => self.finish_scan_locks(observe_id, region, locks), Task::Deregister(deregister) => self.on_deregister(deregister), Task::MultiBatch { multi, @@ -1310,25 +1235,39 @@ impl, E: KvEngine, S: StoreRegionMeta + Send> Runnable } => self.register_min_ts_event(leader_resolver, event_time), Task::InitDownstream { region_id, + observe_id, downstream_id, downstream_state, sink, + build_resolver, incremental_scan_barrier, cb, } => { + match self.capture_regions.get_mut(&region_id) { + Some(delegate) if delegate.handle.id == observe_id => { + if delegate.init_lock_tracker() { + build_resolver.store(true, Ordering::Release); + } + } + _ => return, + } if let Err(e) = sink.unbounded_send(incremental_scan_barrier, true) { error!("cdc failed to schedule barrier for delta before delta scan"; "region_id" => region_id, + "observe_id" => ?observe_id, + "downstream_id" => ?downstream_id, "error" => ?e); return; } if on_init_downstream(&downstream_state) { info!("cdc downstream starts to initialize"; "region_id" => region_id, + "observe_id" => ?observe_id, "downstream_id" => ?downstream_id); } else { - warn!("cdc downstream fails to initialize"; + warn!("cdc downstream fails to initialize: canceled"; "region_id" => region_id, + "observe_id" => ?observe_id, "downstream_id" => ?downstream_id); } cb(); @@ -1355,11 +1294,6 @@ impl, E: KvEngine, S: StoreRegionMeta + Send> Runnable for Endpoint { fn on_timeout(&mut self) { - // Reclaim resolved_region_heap memory.
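The rewritten `on_min_ts` above folds each delegate's result into an `Advance` value and emits resolved-ts events once at the end, instead of broadcasting per region. `Advance`'s real definition lives in the delegate module and is not part of this hunk; the sketch below is a minimal model whose field names (`scan_finished`, `blocked_on_scan`, `min_resolved_ts`, `min_ts_region_id`) and `emit_resolved_ts` method are inferred from the call sites above, with plain `u64` timestamps standing in for `TimeStamp`.

```rust
#[derive(Default)]
struct Advance {
    scan_finished: usize,
    blocked_on_scan: usize,
    min_resolved_ts: u64,
    min_ts_region_id: u64,
    resolved: Vec<(u64, u64)>, // (region_id, resolved_ts)
}

impl Advance {
    fn record(&mut self, region_id: u64, resolved_ts: Option<u64>) {
        match resolved_ts {
            Some(ts) => {
                self.scan_finished += 1;
                if self.resolved.is_empty() || ts < self.min_resolved_ts {
                    self.min_resolved_ts = ts;
                    self.min_ts_region_id = region_id;
                }
                self.resolved.push((region_id, ts));
            }
            // A region whose incremental scan is still running cannot
            // contribute a resolved ts yet.
            None => self.blocked_on_scan += 1,
        }
    }

    // Stand-in for Advance::emit_resolved_ts(&self.connections): one batched
    // event per sink instead of one broadcast per region.
    fn emit_resolved_ts(&self) {
        println!(
            "resolved {} regions, min_resolved_ts = {} (region {})",
            self.scan_finished, self.min_resolved_ts, self.min_ts_region_id
        );
    }
}

fn main() {
    let mut advance = Advance::default();
    advance.record(1, Some(100));
    advance.record(2, Some(90));
    advance.record(3, None); // still scanning
    assert_eq!((advance.scan_finished, advance.blocked_on_scan), (2, 1));
    advance.emit_resolved_ts();
}
```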
- self.resolved_region_heap - .borrow_mut() - .reset_and_shrink_to(self.capture_regions.len()); - CDC_ENDPOINT_PENDING_TASKS.set(self.scheduler.pending_tasks() as _); CDC_CAPTURED_REGION_COUNT.set(self.capture_regions.len() as i64); CDC_REGION_RESOLVE_STATUS_GAUGE_VEC @@ -1586,10 +1520,10 @@ mod tests { let mut suite = mock_endpoint(&cfg, None, ApiVersion::V1); suite.add_region(1, 100); let quota = Arc::new(MemoryQuota::new(usize::MAX)); - let (tx, mut rx) = channel::channel(1, quota); + let (tx, mut rx) = channel::channel(ConnId::default(), 1, quota); let mut rx = rx.drain(); - let conn = Conn::new(tx, String::new()); + let conn = Conn::new(ConnId::default(), tx, String::new()); let conn_id = conn.get_id(); suite.run(Task::OpenConn { conn }); suite.run(set_conn_version_task( @@ -1608,7 +1542,7 @@ mod tests { let downstream = Downstream::new( "".to_string(), region_epoch.clone(), - 1, + RequestId(1), conn_id, ChangeDataRequestKvApi::RawKv, false, @@ -1618,7 +1552,6 @@ mod tests { suite.run(Task::Register { request: req.clone(), downstream, - conn_id, }); let cdc_event = channel::recv_timeout(&mut rx, Duration::from_millis(500)) .unwrap() @@ -1644,7 +1577,7 @@ mod tests { let downstream = Downstream::new( "".to_string(), region_epoch.clone(), - 2, + RequestId(2), conn_id, ChangeDataRequestKvApi::TxnKv, false, @@ -1654,7 +1587,6 @@ mod tests { suite.run(Task::Register { request: req.clone(), downstream, - conn_id, }); let cdc_event = channel::recv_timeout(&mut rx, Duration::from_millis(500)) .unwrap() @@ -1681,7 +1613,7 @@ mod tests { let downstream = Downstream::new( "".to_string(), region_epoch, - 3, + RequestId(3), conn_id, ChangeDataRequestKvApi::TxnKv, false, @@ -1691,7 +1623,6 @@ mod tests { suite.run(Task::Register { request: req, downstream, - conn_id, }); let cdc_event = channel::recv_timeout(&mut rx, Duration::from_millis(500)) .unwrap() @@ -1869,14 +1800,14 @@ mod tests { #[test] fn test_raftstore_is_busy() { let quota = Arc::new(MemoryQuota::new(usize::MAX)); - let (tx, _rx) = channel::channel(1, quota); + let (tx, _rx) = channel::channel(ConnId::default(), 1, quota); let mut suite = mock_endpoint(&CdcConfig::default(), None, ApiVersion::V1); // Fill the channel. 
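The test updates around here thread an explicit `ConnId` into both `channel::channel` and `Conn::new`, so the sink half of the channel and the connection registry agree on one identifier instead of `Conn::new` minting its own. A minimal sketch of the allocation scheme, mirroring the `CONNECTION_ID_ALLOC` counter that `service.rs` uses (the surrounding usage is illustrative, not the real API):

```rust
use std::sync::atomic::{AtomicUsize, Ordering};

static CONNECTION_ID_ALLOC: AtomicUsize = AtomicUsize::new(0);

#[derive(Clone, Copy, Debug, Eq, PartialEq, Hash)]
struct ConnId(usize);

impl ConnId {
    fn new() -> ConnId {
        ConnId(CONNECTION_ID_ALLOC.fetch_add(1, Ordering::SeqCst))
    }
}

fn main() {
    // The id is allocated once and handed to both halves, so events sent
    // through the sink can be attributed to the right connection later.
    let conn_id = ConnId::new();
    let for_sink = conn_id;
    let for_conn_registry = conn_id;
    assert_eq!(for_sink, for_conn_registry);
    assert_ne!(ConnId::new(), conn_id); // every allocation is distinct
}
```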
suite.add_region(1 /* region id */, 1 /* cap */); suite.fill_raft_rx(1); - let conn = Conn::new(tx, String::new()); + let conn = Conn::new(ConnId::default(), tx, String::new()); let conn_id = conn.get_id(); suite.run(Task::OpenConn { conn }); suite.run(set_conn_version_task( @@ -1892,7 +1823,7 @@ mod tests { let downstream = Downstream::new( "".to_string(), region_epoch, - 0, + RequestId(0), conn_id, ChangeDataRequestKvApi::TiDb, false, @@ -1901,7 +1832,6 @@ mod tests { suite.run(Task::Register { request: req, downstream, - conn_id, }); assert_eq!(suite.endpoint.capture_regions.len(), 1); @@ -1925,10 +1855,10 @@ mod tests { let mut suite = mock_endpoint(&cfg, None, ApiVersion::V1); suite.add_region(1, 100); let quota = Arc::new(MemoryQuota::new(usize::MAX)); - let (tx, mut rx) = channel::channel(1, quota); + let (tx, mut rx) = channel::channel(ConnId::default(), 1, quota); let mut rx = rx.drain(); - let conn = Conn::new(tx, String::new()); + let conn = Conn::new(ConnId::default(), tx, String::new()); let conn_id = conn.get_id(); suite.run(Task::OpenConn { conn }); @@ -1945,7 +1875,7 @@ mod tests { let downstream = Downstream::new( "".to_string(), region_epoch.clone(), - 1, + RequestId(1), conn_id, ChangeDataRequestKvApi::TiDb, false, @@ -1954,7 +1884,6 @@ mod tests { suite.run(Task::Register { request: req.clone(), downstream, - conn_id, }); assert_eq!(suite.endpoint.capture_regions.len(), 1); suite @@ -1967,7 +1896,7 @@ mod tests { let downstream = Downstream::new( "".to_string(), region_epoch, - 1, + RequestId(1), conn_id, ChangeDataRequestKvApi::TiDb, false, @@ -1976,7 +1905,6 @@ mod tests { suite.run(Task::Register { request: req.clone(), downstream, - conn_id, }); let cdc_event = channel::recv_timeout(&mut rx, Duration::from_millis(500)) .unwrap() @@ -2012,7 +1940,7 @@ mod tests { let downstream = Downstream::new( "".to_string(), region_epoch.clone(), - 1, + RequestId(1), conn_id, ChangeDataRequestKvApi::TiDb, false, @@ -2022,7 +1950,6 @@ mod tests { suite.run(Task::Register { request: req.clone(), downstream, - conn_id, }); // Region 100 is inserted into capture_regions. assert_eq!(suite.endpoint.capture_regions.len(), 2); @@ -2045,7 +1972,7 @@ mod tests { let downstream = Downstream::new( "".to_string(), region_epoch, - 1, + RequestId(1), conn_id, ChangeDataRequestKvApi::TiDb, false, @@ -2054,7 +1981,6 @@ mod tests { suite.run(Task::Register { request: req, downstream, - conn_id, }); // Drop CaptureChange message, it should cause scan task failure. let timeout = Duration::from_millis(100); @@ -2062,9 +1988,9 @@ mod tests { assert_eq!(suite.endpoint.capture_regions.len(), 3); let task = suite.task_rx.recv_timeout(timeout).unwrap(); match task.unwrap() { - Task::Deregister(Deregister::Delegate { region_id, err, .. }) => { + Task::Deregister(Deregister::Downstream { region_id, err, .. 
}) => { assert_eq!(region_id, 101); - assert!(matches!(err, Error::Other(_)), "{:?}", err); + assert!(matches!(err, Some(Error::Other(_))), "{:?}", err); } other => panic!("unexpected task {:?}", other), } @@ -2092,10 +2018,10 @@ mod tests { suite.add_region(1, 100); let quota = Arc::new(MemoryQuota::new(usize::MAX)); - let (tx, mut rx) = channel::channel(1, quota); + let (tx, mut rx) = channel::channel(ConnId::default(), 1, quota); let mut rx = rx.drain(); - let conn = Conn::new(tx, String::new()); + let conn = Conn::new(ConnId::default(), tx, String::new()); let conn_id = conn.get_id(); suite.run(Task::OpenConn { conn }); @@ -2112,7 +2038,7 @@ mod tests { let downstream = Downstream::new( "".to_string(), region_epoch.clone(), - 1, + RequestId(1), conn_id, ChangeDataRequestKvApi::TiDb, false, @@ -2121,7 +2047,6 @@ mod tests { suite.run(Task::Register { request: req.clone(), downstream, - conn_id, }); assert_eq!(suite.endpoint.capture_regions.len(), 1); @@ -2130,7 +2055,7 @@ mod tests { let downstream = Downstream::new( "".to_string(), region_epoch, - 2, + RequestId(2), conn_id, ChangeDataRequestKvApi::TiDb, false, @@ -2139,7 +2064,6 @@ mod tests { suite.run(Task::Register { request: req.clone(), downstream, - conn_id, }); let cdc_event = channel::recv_timeout(&mut rx, Duration::from_millis(500)) .unwrap() @@ -2197,11 +2121,11 @@ mod tests { suite.add_region(1, 100); let quota = Arc::new(MemoryQuota::new(usize::MAX)); - let (tx, mut rx) = channel::channel(1, quota); + let (tx, mut rx) = channel::channel(ConnId::default(), 1, quota); let mut rx = rx.drain(); let mut region = Region::default(); region.set_id(1); - let conn = Conn::new(tx, String::new()); + let conn = Conn::new(ConnId::default(), tx, String::new()); let conn_id = conn.get_id(); suite.run(Task::OpenConn { conn }); @@ -2217,7 +2141,7 @@ mod tests { let downstream = Downstream::new( "".to_string(), region_epoch.clone(), - 0, + RequestId(0), conn_id, ChangeDataRequestKvApi::TiDb, false, @@ -2227,12 +2151,14 @@ mod tests { suite.run(Task::Register { request: req.clone(), downstream, - conn_id, }); - let memory_quota = Arc::new(MemoryQuota::new(std::usize::MAX)); - let resolver = Resolver::new(1, memory_quota); let observe_id = suite.endpoint.capture_regions[&1].handle.id; - suite.on_region_ready(observe_id, resolver, region.clone()); + suite + .capture_regions + .get_mut(&1) + .unwrap() + .init_lock_tracker(); + suite.finish_scan_locks(observe_id, region.clone(), Default::default()); suite.run(Task::MinTs { regions: vec![1], min_ts: TimeStamp::from(1), @@ -2253,7 +2179,7 @@ mod tests { let downstream = Downstream::new( "".to_string(), region_epoch.clone(), - 0, + RequestId(0), conn_id, ChangeDataRequestKvApi::TiDb, false, @@ -2264,13 +2190,15 @@ mod tests { suite.run(Task::Register { request: req.clone(), downstream, - conn_id, }); - let memory_quota = Arc::new(MemoryQuota::new(std::usize::MAX)); - let resolver = Resolver::new(2, memory_quota); region.set_id(2); let observe_id = suite.endpoint.capture_regions[&2].handle.id; - suite.on_region_ready(observe_id, resolver, region); + suite + .capture_regions + .get_mut(&2) + .unwrap() + .init_lock_tracker(); + suite.finish_scan_locks(observe_id, region, Default::default()); suite.run(Task::MinTs { regions: vec![1, 2], min_ts: TimeStamp::from(2), @@ -2289,11 +2217,11 @@ mod tests { // Register region 3 to another conn which does not support batch resolved ts.
let quota = Arc::new(MemoryQuota::new(usize::MAX)); - let (tx, mut rx2) = channel::channel(1, quota); + let (tx, mut rx2) = channel::channel(ConnId::default(), 1, quota); let mut rx2 = rx2.drain(); let mut region = Region::default(); region.set_id(3); - let conn = Conn::new(tx, String::new()); + let conn = Conn::new(ConnId::default(), tx, String::new()); let conn_id = conn.get_id(); suite.run(Task::OpenConn { conn }); suite.run(set_conn_version_task( @@ -2306,7 +2234,7 @@ mod tests { let downstream = Downstream::new( "".to_string(), region_epoch, - 3, + RequestId(3), conn_id, ChangeDataRequestKvApi::TiDb, false, @@ -2317,13 +2245,15 @@ mod tests { suite.run(Task::Register { request: req, downstream, - conn_id, }); - let memory_quota = Arc::new(MemoryQuota::new(std::usize::MAX)); - let resolver = Resolver::new(3, memory_quota); region.set_id(3); let observe_id = suite.endpoint.capture_regions[&3].handle.id; - suite.on_region_ready(observe_id, resolver, region); + suite + .capture_regions + .get_mut(&3) + .unwrap() + .init_lock_tracker(); + suite.finish_scan_locks(observe_id, region, Default::default()); suite.run(Task::MinTs { regions: vec![1, 2, 3], min_ts: TimeStamp::from(3), @@ -2364,10 +2294,10 @@ mod tests { let mut suite = mock_endpoint(&CdcConfig::default(), None, ApiVersion::V1); suite.add_region(1, 100); let quota = Arc::new(MemoryQuota::new(usize::MAX)); - let (tx, mut rx) = channel::channel(1, quota); + let (tx, mut rx) = channel::channel(ConnId::default(), 1, quota); let mut rx = rx.drain(); - let conn = Conn::new(tx, String::new()); + let conn = Conn::new(ConnId::default(), tx, String::new()); let conn_id = conn.get_id(); suite.run(Task::OpenConn { conn }); suite.run(set_conn_version_task( @@ -2383,17 +2313,16 @@ mod tests { let downstream = Downstream::new( "".to_string(), region_epoch.clone(), - 0, + RequestId(0), conn_id, ChangeDataRequestKvApi::TiDb, false, ObservedRange::default(), ); - let downstream_id = downstream.get_id(); + let downstream_id = downstream.id; suite.run(Task::Register { request: req.clone(), downstream, - conn_id, }); assert_eq!(suite.endpoint.capture_regions.len(), 1); @@ -2401,7 +2330,7 @@ mod tests { err_header.set_not_leader(Default::default()); let deregister = Deregister::Downstream { conn_id, - request_id: 0, + request_id: RequestId(0), region_id: 1, downstream_id, err: Some(Error::request(err_header.clone())), @@ -2427,23 +2356,22 @@ mod tests { let downstream = Downstream::new( "".to_string(), region_epoch.clone(), - 0, + RequestId(0), conn_id, ChangeDataRequestKvApi::TiDb, false, ObservedRange::default(), ); - let new_downstream_id = downstream.get_id(); + let new_downstream_id = downstream.id; suite.run(Task::Register { request: req.clone(), downstream, - conn_id, }); assert_eq!(suite.endpoint.capture_regions.len(), 1); let deregister = Deregister::Downstream { conn_id, - request_id: 0, + request_id: RequestId(0), region_id: 1, downstream_id, err: Some(Error::request(err_header.clone())), @@ -2454,7 +2382,7 @@ mod tests { let deregister = Deregister::Downstream { conn_id, - request_id: 0, + request_id: RequestId(0), region_id: 1, downstream_id: new_downstream_id, err: Some(Error::request(err_header.clone())), @@ -2481,7 +2409,7 @@ mod tests { let downstream = Downstream::new( "".to_string(), region_epoch, - 0, + RequestId(0), conn_id, ChangeDataRequestKvApi::TiDb, false, @@ -2490,7 +2418,6 @@ mod tests { suite.run(Task::Register { request: req, downstream, - conn_id, }); assert_eq!(suite.endpoint.capture_regions.len(), 1); let deregister = 
Deregister::Delegate { @@ -2520,9 +2447,9 @@ mod tests { let mut conn_rxs = vec![]; let quota = Arc::new(MemoryQuota::new(usize::MAX)); for region_ids in [vec![1, 2], vec![3]] { - let (tx, rx) = channel::channel(1, quota.clone()); + let (tx, rx) = channel::channel(ConnId::default(), 1, quota.clone()); conn_rxs.push(rx); - let conn = Conn::new(tx, String::new()); + let conn = Conn::new(ConnId::default(), tx, String::new()); let conn_id = conn.get_id(); suite.run(Task::OpenConn { conn }); let version = FeatureGate::batch_resolved_ts(); @@ -2538,7 +2465,7 @@ mod tests { let downstream = Downstream::new( "".to_string(), region_epoch.clone(), - 0, + RequestId(0), conn_id, ChangeDataRequestKvApi::TiDb, false, @@ -2548,14 +2475,16 @@ mod tests { suite.run(Task::Register { request: req.clone(), downstream, - conn_id, }); - let memory_quota = Arc::new(MemoryQuota::new(std::usize::MAX)); - let resolver = Resolver::new(region_id, memory_quota); let observe_id = suite.endpoint.capture_regions[&region_id].handle.id; let mut region = Region::default(); region.set_id(region_id); - suite.on_region_ready(observe_id, resolver, region); + suite + .capture_regions + .get_mut(&region_id) + .unwrap() + .init_lock_tracker(); + suite.finish_scan_locks(observe_id, region, Default::default()); } } @@ -2635,8 +2564,8 @@ mod tests { let quota = Arc::new(MemoryQuota::new(usize::MAX)); // Open conn a - let (tx1, _rx1) = channel::channel(1, quota.clone()); - let conn_a = Conn::new(tx1, String::new()); + let (tx1, _rx1) = channel::channel(ConnId::default(), 1, quota.clone()); + let conn_a = Conn::new(ConnId::default(), tx1, String::new()); let conn_id_a = conn_a.get_id(); suite.run(Task::OpenConn { conn: conn_a }); suite.run(set_conn_version_task( @@ -2645,9 +2574,9 @@ mod tests { )); // Open conn b - let (tx2, mut rx2) = channel::channel(1, quota); + let (tx2, mut rx2) = channel::channel(ConnId::default(), 1, quota); let mut rx2 = rx2.drain(); - let conn_b = Conn::new(tx2, String::new()); + let conn_b = Conn::new(ConnId::default(), tx2, String::new()); let conn_id_b = conn_b.get_id(); suite.run(Task::OpenConn { conn: conn_b }); suite.run(set_conn_version_task( @@ -2665,7 +2594,7 @@ mod tests { let downstream = Downstream::new( "".to_string(), region_epoch_2.clone(), - 0, + RequestId(0), conn_id_a, ChangeDataRequestKvApi::TiDb, false, @@ -2674,7 +2603,6 @@ mod tests { suite.run(Task::Register { request: req.clone(), downstream, - conn_id: conn_id_a, }); assert_eq!(suite.endpoint.capture_regions.len(), 1); let observe_id = suite.endpoint.capture_regions[&1].handle.id; @@ -2689,7 +2617,7 @@ mod tests { let downstream = Downstream::new( "".to_string(), region_epoch_1, - 0, + RequestId(0), conn_id_b, ChangeDataRequestKvApi::TiDb, false, @@ -2698,7 +2626,6 @@ mod tests { suite.run(Task::Register { request: req.clone(), downstream, - conn_id: conn_id_b, }); assert_eq!(suite.endpoint.capture_regions.len(), 1); @@ -2710,11 +2637,15 @@ mod tests { let mut region = Region::default(); region.id = 1; region.set_region_epoch(region_epoch_2); - let memory_quota = Arc::new(MemoryQuota::new(std::usize::MAX)); - suite.run(Task::ResolverReady { + suite + .capture_regions + .get_mut(&1) + .unwrap() + .init_lock_tracker(); + suite.run(Task::FinishScanLocks { observe_id, region: region.clone(), - resolver: Resolver::new(1, memory_quota), + locks: Default::default(), }); // Deregister delegate due to epoch not match for conn b.
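Request identifiers throughout these tests are now wrapped in the `RequestId` newtype (defined in `service.rs` later in this diff) rather than passed as bare `u64`s. A self-contained sketch of why the newtype pays off, using the same `DownstreamKey` shape as `service.rs`:

```rust
#[derive(Clone, Copy, Debug, Eq, PartialEq, Hash)]
pub struct RequestId(pub u64);

#[derive(PartialEq, Eq, Hash)]
struct DownstreamKey {
    request_id: RequestId,
    region_id: u64,
}

fn main() {
    let key = DownstreamKey {
        request_id: RequestId(1),
        region_id: 2,
    };
    // With two bare u64 fields, swapping the arguments compiled fine and
    // silently corrupted the downstream map; now it is a type error:
    // let bad = DownstreamKey { request_id: 2, region_id: 1 }; // rejected
    assert_eq!(key.request_id, RequestId(1));
}
```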
@@ -2780,14 +2711,6 @@ mod tests { assert_eq!(ts, 3.into()); assert_eq!(regions.len(), 1); assert!(regions.contains(&3)); - - heap1.reset_and_shrink_to(3); - assert_eq!(3, heap1.heap.capacity()); - assert!(heap1.heap.is_empty()); - - heap1.push(1, 1.into()); - heap1.clear(); - assert!(heap1.heap.is_empty()); } #[test] @@ -2799,10 +2722,10 @@ mod tests { }; let mut suite = mock_endpoint(&cfg, None, ApiVersion::V1); let quota = Arc::new(MemoryQuota::new(usize::MAX)); - let (tx, mut rx) = channel::channel(1, quota); + let (tx, mut rx) = channel::channel(ConnId::default(), 1, quota); let mut rx = rx.drain(); - let conn = Conn::new(tx, String::new()); + let conn = Conn::new(ConnId::default(), tx, String::new()); let conn_id = conn.get_id(); suite.run(Task::OpenConn { conn }); // Enable batch resolved ts in the test. @@ -2823,7 +2746,7 @@ mod tests { let downstream = Downstream::new( "".to_string(), region_epoch.clone(), - 0, + RequestId(0), conn_id, ChangeDataRequestKvApi::TiDb, false, @@ -2834,22 +2757,26 @@ mod tests { suite.run(Task::Register { request: req.clone(), downstream, - conn_id, }); - let memory_quota = Arc::new(MemoryQuota::new(std::usize::MAX)); - let mut resolver = Resolver::new(id, memory_quota); - resolver - .track_lock(TimeStamp::compose(0, id), vec![], None) - .unwrap(); + let mut locks = BTreeMap::<Key, MiniLock>::default(); + locks.insert( + Key::from_encoded(vec![]), + MiniLock::from_ts(TimeStamp::compose(0, id)), + ); let mut region = Region::default(); region.id = id; region.set_region_epoch(region_epoch); + suite + .capture_regions + .get_mut(&id) + .unwrap() + .init_lock_tracker(); let failed = suite .capture_regions .get_mut(&id) .unwrap() - .on_region_ready(resolver, region) + .finish_scan_locks(region, locks) .unwrap(); assert!(failed.is_empty()); } @@ -2892,10 +2819,10 @@ mod tests { let mut suite = mock_endpoint(&cfg, None, ApiVersion::V1); suite.add_region(1, 100); let quota = Arc::new(MemoryQuota::new(usize::MAX)); - let (tx, mut rx) = channel::channel(1, quota); + let (tx, mut rx) = channel::channel(ConnId::default(), 1, quota); let mut rx = rx.drain(); - let conn = Conn::new(tx, String::new()); + let conn = Conn::new(ConnId::default(), tx, String::new()); let conn_id = conn.get_id(); suite.run(Task::OpenConn { conn }); @@ -2912,7 +2839,7 @@ mod tests { let downstream = Downstream::new( "".to_string(), region_epoch.clone(), - 1, + RequestId(1), conn_id, ChangeDataRequestKvApi::TiDb, false, @@ -2921,7 +2848,6 @@ mod tests { suite.run(Task::Register { request: req.clone(), downstream, - conn_id, }); assert_eq!(suite.connections[&conn_id].downstreams_count(), 1); @@ -2930,7 +2856,7 @@ mod tests { let downstream = Downstream::new( "".to_string(), region_epoch.clone(), - 2, + RequestId(2), conn_id, ChangeDataRequestKvApi::TiDb, false, @@ -2939,7 +2865,6 @@ mod tests { suite.run(Task::Register { request: req.clone(), downstream, - conn_id, }); assert_eq!(suite.connections[&conn_id].downstreams_count(), 2); @@ -2948,7 +2873,7 @@ mod tests { let downstream = Downstream::new( "".to_string(), region_epoch.clone(), - 2, + RequestId(2), conn_id, ChangeDataRequestKvApi::TiDb, false, @@ -2957,7 +2882,6 @@ mod tests { suite.run(Task::Register { request: req.clone(), downstream, - conn_id, }); assert_eq!(suite.connections[&conn_id].downstreams_count(), 2); let cdc_event = channel::recv_timeout(&mut rx, Duration::from_millis(500)) .unwrap() @@ -2973,7 +2897,7 @@ mod tests { // Deregister a nonexistent downstream.
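`finish_scan_locks` now hands the delegate a plain `BTreeMap<Key, MiniLock>` instead of a fully built `Resolver`. The toy model below mirrors that shape; `MiniLock`'s real fields follow the `MiniLock::new(lock.ts, lock.txn_source, lock.generation)` call in the initializer hunk later in this diff, and the `u64` key/timestamp types here are simplifications of `Key`/`TimeStamp`:

```rust
use std::collections::BTreeMap;

#[derive(Debug, PartialEq)]
struct MiniLock {
    ts: u64, // lock start_ts; resolved ts cannot advance past the minimum of these
    txn_source: u64,
    generation: u64,
}

impl MiniLock {
    fn from_ts(ts: u64) -> Self {
        MiniLock { ts, txn_source: 0, generation: 0 }
    }
}

fn main() {
    let mut locks = BTreeMap::<Vec<u8>, MiniLock>::new();
    locks.insert(b"key1".to_vec(), MiniLock::from_ts(100));
    locks.insert(b"key2".to_vec(), MiniLock::from_ts(120));
    // The delegate's resolved ts is capped by the smallest tracked lock ts.
    let min = locks.values().map(|l| l.ts).min().unwrap();
    assert_eq!(min, 100);
}
```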
suite.run(Task::Deregister(Deregister::Downstream { conn_id, - request_id: 1, + request_id: RequestId(1), region_id: 1, downstream_id: DownstreamId::new(), err: None, @@ -2989,10 +2913,10 @@ mod tests { assert_eq!(suite.connections[&conn_id].downstreams_count(), 2); // Deregister an existing downstream. - let downstream_id = suite.capture_regions[&1].downstreams()[0].get_id(); + let downstream_id = suite.capture_regions[&1].downstreams()[0].id; suite.run(Task::Deregister(Deregister::Downstream { conn_id, - request_id: 1, + request_id: RequestId(1), region_id: 1, downstream_id, err: Some(Error::Rocks("test error".to_owned())), @@ -3013,7 +2937,7 @@ mod tests { let downstream = Downstream::new( "".to_string(), region_epoch.clone(), - 1, + RequestId(1), conn_id, ChangeDataRequestKvApi::TiDb, false, @@ -3022,7 +2946,6 @@ mod tests { suite.run(Task::Register { request: req.clone(), downstream, - conn_id, }); assert_eq!(suite.connections[&conn_id].downstreams_count(), 2); @@ -3053,7 +2976,7 @@ mod tests { let downstream = Downstream::new( "".to_string(), region_epoch.clone(), - i as _, + RequestId(i), conn_id, ChangeDataRequestKvApi::TiDb, false, @@ -3062,20 +2985,19 @@ mod tests { suite.run(Task::Register { request: req.clone(), downstream, - conn_id, }); - assert_eq!(suite.connections[&conn_id].downstreams_count(), i); + assert_eq!(suite.connections[&conn_id].downstreams_count(), i as usize); } // Deregister the request. suite.run(Task::Deregister(Deregister::Request { conn_id, - request_id: 1, + request_id: RequestId(1), })); assert_eq!(suite.connections[&conn_id].downstreams_count(), 1); suite.run(Task::Deregister(Deregister::Request { conn_id, - request_id: 2, + request_id: RequestId(2), })); assert_eq!(suite.connections[&conn_id].downstreams_count(), 0); assert_eq!(suite.capture_regions.len(), 0); @@ -3099,7 +3021,7 @@ mod tests { let downstream = Downstream::new( "".to_string(), region_epoch.clone(), - 1, + RequestId(1), conn_id, ChangeDataRequestKvApi::TiDb, false, @@ -3108,7 +3030,6 @@ mod tests { suite.run(Task::Register { request: req.clone(), downstream, - conn_id, }); assert_eq!(suite.connections[&conn_id].downstreams_count(), i as usize); } @@ -3116,7 +3037,7 @@ mod tests { // Deregister regions one by one in the request.
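`Deregister::Request`, exercised above, drops every region subscribed under one request id in one shot. A self-contained sketch of the retain-based removal, mirroring the pattern of `Conn::unsubscribe_request` in `service.rs` (toy types stand in for the real downstream map and ids):

```rust
use std::collections::HashMap;

// (request_id, region_id) -> downstream name; toy stand-in for DownstreamKey.
fn unsubscribe_request(
    downstreams: &mut HashMap<(u64, u64), &'static str>,
    request_id: u64,
) -> Vec<(u64, &'static str)> {
    let mut removed = Vec::new();
    downstreams.retain(|&(req, region), value| {
        if req == request_id {
            removed.push((region, *value));
            false // drop this subscription
        } else {
            true // keep subscriptions of other requests
        }
    });
    removed
}

fn main() {
    let mut m = HashMap::new();
    m.insert((1, 1), "d11");
    m.insert((1, 2), "d12");
    m.insert((2, 1), "d21");
    // Dropping request 1 removes both of its regions, leaves request 2 alone.
    assert_eq!(unsubscribe_request(&mut m, 1).len(), 2);
    assert_eq!(m.len(), 1);
}
```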
suite.run(Task::Deregister(Deregister::Region { conn_id, - request_id: 1, + request_id: RequestId(1), region_id: 1, })); assert_eq!(suite.connections[&conn_id].downstreams_count(), 1); @@ -3124,7 +3045,7 @@ mod tests { suite.run(Task::Deregister(Deregister::Region { conn_id, - request_id: 1, + request_id: RequestId(1), region_id: 2, })); assert_eq!(suite.connections[&conn_id].downstreams_count(), 0); @@ -3152,9 +3073,9 @@ mod tests { let mut suite = mock_endpoint(&cfg, None, ApiVersion::V1); suite.add_region(1, 100); let quota = Arc::new(MemoryQuota::new(usize::MAX)); - let (tx, _rx) = channel::channel(1, quota); + let (tx, _rx) = channel::channel(ConnId::default(), 1, quota); - let conn = Conn::new(tx, String::new()); + let conn = Conn::new(ConnId::default(), tx, String::new()); let conn_id = conn.get_id(); suite.run(Task::OpenConn { conn }); @@ -3168,7 +3089,7 @@ mod tests { let downstream = Downstream::new( "".to_string(), region_epoch.clone(), - 1, + RequestId(1), conn_id, ChangeDataRequestKvApi::TiDb, false, @@ -3177,7 +3098,6 @@ mod tests { suite.run(Task::Register { request: req, downstream, - conn_id, }); assert!(suite.connections.is_empty()); } diff --git a/components/cdc/src/errors.rs b/components/cdc/src/errors.rs index e7bd7605e7d..fa0458bc5c6 100644 --- a/components/cdc/src/errors.rs +++ b/components/cdc/src/errors.rs @@ -3,7 +3,7 @@ use std::{error, io::Error as IoError, result}; use engine_traits::Error as EngineTraitsError; -use kvproto::{cdcpb::Error as ErrorEvent, errorpb}; +use kvproto::{cdcpb, cdcpb::Error as ErrorEvent, errorpb}; use thiserror::Error; use tikv::storage::{ kv::{Error as KvError, ErrorInner as EngineErrorInner}, @@ -78,7 +78,7 @@ impl Error { ) } - pub fn extract_region_error(self) -> errorpb::Error { + fn extract_region_error(self) -> errorpb::Error { match self { Error::Kv(KvError(box EngineErrorInner::Request(e))) | Error::Txn(TxnError(box TxnErrorInner::Engine(KvError( @@ -99,18 +99,27 @@ impl Error { pub fn into_error_event(self, region_id: u64) -> ErrorEvent { let mut err_event = ErrorEvent::default(); - let mut err = self.extract_region_error(); - if err.has_not_leader() { - let not_leader = err.take_not_leader(); - err_event.set_not_leader(not_leader); - } else if err.has_epoch_not_match() { - let epoch_not_match = err.take_epoch_not_match(); - err_event.set_epoch_not_match(epoch_not_match); + if matches!( + self, + Error::Sink(SendError::Congested) | Error::MemoryQuotaExceeded(MemoryQuotaExceeded) + ) { + let mut congested = cdcpb::Congested::default(); + congested.set_region_id(region_id); + err_event.set_congested(congested); } else { - // TODO: Add more errors to the cdc protocol - let mut region_not_found = errorpb::RegionNotFound::default(); - region_not_found.set_region_id(region_id); - err_event.set_region_not_found(region_not_found); + let mut err = self.extract_region_error(); + if err.has_not_leader() { + let not_leader = err.take_not_leader(); + err_event.set_not_leader(not_leader); + } else if err.has_epoch_not_match() { + let epoch_not_match = err.take_epoch_not_match(); + err_event.set_epoch_not_match(epoch_not_match); + } else { + // TODO: Add more errors to the cdc protocol + let mut region_not_found = errorpb::RegionNotFound::default(); + region_not_found.set_region_id(region_id); + err_event.set_region_not_found(region_not_found); + } } err_event } diff --git a/components/cdc/src/initializer.rs b/components/cdc/src/initializer.rs index ed8b0f9f5ca..c38cb850f48 100644 --- a/components/cdc/src/initializer.rs +++ 
b/components/cdc/src/initializer.rs @@ -1,5 +1,6 @@ // Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. use std::{ + collections::BTreeMap, sync::{ atomic::{AtomicBool, Ordering}, Arc, @@ -22,27 +23,25 @@ use kvproto::{ metapb::{Region, RegionEpoch}, }; use raftstore::{ - coprocessor::ObserveId, + coprocessor::ObserveHandle, router::CdcHandle, store::{ fsm::ChangeObserver, msg::{Callback, ReadResponse}, }, }; -use resolved_ts::{Resolver, TsSource}; use tikv::storage::{ kv::Snapshot, - mvcc::{DeltaScanner, ScannerBuilder}, + mvcc::{DeltaScanner, MvccReader, ScannerBuilder}, raw::raw_mvcc::{RawMvccIterator, RawMvccSnapshot}, txn::{TxnEntry, TxnEntryScanner}, Statistics, }; -use tikv_kv::Iterator; +use tikv_kv::{Iterator, ScanMode}; use tikv_util::{ box_err, codec::number, debug, defer, error, info, - memory::MemoryQuota, sys::inspector::{self_thread_inspector, ThreadInspector}, time::{duration_to_sec, Instant, Limiter}, warn, @@ -50,19 +49,22 @@ use tikv_util::{ Either, }; use tokio::sync::Semaphore; -use txn_types::{Key, KvPair, Lock, LockType, OldValue, TimeStamp}; +use txn_types::{Key, KvPair, LockType, OldValue, TimeStamp}; use crate::{ channel::CdcEvent, - delegate::{post_init_downstream, Delegate, DownstreamId, DownstreamState, ObservedRange}, + delegate::{ + post_init_downstream, Delegate, DownstreamId, DownstreamState, MiniLock, ObservedRange, + }, endpoint::Deregister, metrics::*, old_value::{near_seek_old_value, OldValueCursors}, - service::ConnId, + service::{ConnId, RequestId}, Error, Result, Task, }; -struct ScanStat { +#[derive(Copy, Clone, Debug, Default)] +pub(crate) struct ScanStat { // Fetched bytes to the scanner. emit: usize, // Bytes from the device, `None` if not possible to get it. @@ -82,20 +84,29 @@ pub(crate) enum Scanner { } pub(crate) struct Initializer { - pub(crate) tablet: Option<E>, - pub(crate) sched: Scheduler<Task>, - pub(crate) sink: crate::channel::Sink, - - pub(crate) observed_range: ObservedRange, pub(crate) region_id: u64, + pub(crate) conn_id: ConnId, + pub(crate) request_id: RequestId, + pub(crate) checkpoint_ts: TimeStamp, pub(crate) region_epoch: RegionEpoch, - pub(crate) observe_id: ObserveId, + + // `build_resolver` can only be determined after snapshot is acquired. + // If a region is subscribed more than once, the downstream with the + // earliest snapshot will build the lock resolver. + // + // `build_resolver` won't be changed after it is set in `InitDownstream`.
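A minimal sketch of the handshake this comment describes, with a toy `LockTracker` standing in for `Delegate::init_lock_tracker`: the endpoint flips the shared flag only for the first downstream whose snapshot gets taken, and the scan task reads it later to decide whether to collect locks. The `LockTracker` internals are an assumption; only the flag protocol mirrors the diff.

```rust
use std::sync::{
    atomic::{AtomicBool, Ordering},
    Arc,
};

struct LockTracker {
    initialized: bool,
}

impl LockTracker {
    // Stand-in for Delegate::init_lock_tracker: returns true only on the
    // first call, matching how `InitDownstream` uses it in endpoint.rs.
    fn init(&mut self) -> bool {
        !std::mem::replace(&mut self.initialized, true)
    }
}

fn main() {
    let build_resolver = Arc::new(AtomicBool::new(false));
    let mut tracker = LockTracker { initialized: false };

    // First downstream of the region wins the right to scan locks.
    if tracker.init() {
        build_resolver.store(true, Ordering::Release);
    }
    // A second subscription of the same region does not scan locks again.
    assert!(!tracker.init());
    assert!(build_resolver.load(Ordering::Acquire));
}
```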
+ pub(crate) build_resolver: Arc<AtomicBool>, + + pub(crate) observed_range: ObservedRange, + pub(crate) observe_handle: ObserveHandle, + pub(crate) downstream_id: DownstreamId, pub(crate) downstream_state: Arc<AtomicCell<DownstreamState>>, pub(crate) scan_truncated: Arc<AtomicBool>, - pub(crate) conn_id: ConnId, - pub(crate) request_id: u64, - pub(crate) checkpoint_ts: TimeStamp, + + pub(crate) tablet: Option<E>, + pub(crate) sched: Scheduler<Task>, + pub(crate) sink: crate::channel::Sink, + + pub(crate) concurrency_semaphore: Arc<Semaphore>, pub(crate) scan_speed_limiter: Limiter, pub(crate) fetch_speed_limiter: Limiter, @@ -103,47 +114,60 @@ pub(crate) struct Initializer { pub(crate) max_scan_batch_bytes: usize, pub(crate) max_scan_batch_size: usize, - pub(crate) build_resolver: bool, pub(crate) ts_filter_ratio: f64, - pub(crate) kv_api: ChangeDataRequestKvApi, - pub(crate) filter_loop: bool, } impl Initializer { - pub(crate) async fn initialize<T: 'static + CdcHandle<E>>( - &mut self, - change_observer: ChangeObserver, - cdc_handle: T, - concurrency_semaphore: Arc<Semaphore>, - memory_quota: Arc<MemoryQuota>, - ) -> Result<()> { + pub(crate) async fn initialize<T>(&mut self, cdc_handle: T) -> Result<()> + where + T: 'static + CdcHandle<E>, + { fail_point!("cdc_before_initialize"); + let concurrency_semaphore = self.concurrency_semaphore.clone(); let _permit = concurrency_semaphore.acquire().await; + let region_id = self.region_id; + let downstream_id = self.downstream_id; + let observe_id = self.observe_handle.id; + // When there are a lot of pending incremental scan tasks, they may have been stopped; + // check the state here to accelerate task cancellation. + if self.downstream_state.load() == DownstreamState::Stopped { + info!("cdc async incremental scan canceled before start"; + "region_id" => region_id, + "downstream_id" => ?downstream_id, + "observe_id" => ?observe_id, + "conn_id" => ?self.conn_id); + return Err(Error::Other(box_err!("scan canceled"))); + } + // To avoid holding too many snapshots and holding them too long, // we need to acquire scan concurrency permit before taking snapshot. let sched = self.sched.clone(); - let region_id = self.region_id; let region_epoch = self.region_epoch.clone(); - let downstream_id = self.downstream_id; let downstream_state = self.downstream_state.clone(); let (cb, fut) = tikv_util::future::paired_future_callback(); let sink = self.sink.clone(); + let build_resolver = self.build_resolver.clone(); let (incremental_scan_barrier_cb, incremental_scan_barrier_fut) = tikv_util::future::paired_future_callback(); let barrier = CdcEvent::Barrier(Some(incremental_scan_barrier_cb)); if let Err(e) = cdc_handle.capture_change( self.region_id, region_epoch, - change_observer, + ChangeObserver::from_cdc(self.region_id, self.observe_handle.clone()), + // NOTE: raftstore handles requests serially within each region. + // That's why we can determine whether to build a lock resolver or not + // without checking and comparing snapshot sequence numbers.
Callback::read(Box::new(move |resp| { if let Err(e) = sched.schedule(Task::InitDownstream { region_id, + observe_id, downstream_id, downstream_state, sink, + build_resolver, incremental_scan_barrier: barrier, cb: Box::new(move || cb(resp)), }) { @@ -164,21 +188,24 @@ impl Initializer { } match fut.await { - Ok(resp) => self.on_change_cmd_response(resp, memory_quota).await, + Ok(resp) => self.on_change_cmd_response(resp).await, Err(e) => Err(Error::Other(box_err!(e))), } } - pub(crate) async fn on_change_cmd_response( + pub(crate) async fn on_change_cmd_response<S>( &mut self, - mut resp: ReadResponse, - memory_quota: Arc<MemoryQuota>, - ) -> Result<()> { + mut resp: ReadResponse<S>, + ) -> Result<()> + where + S: EngineSnapshot + 'static, + { if let Some(region_snapshot) = resp.snapshot { let region = region_snapshot.get_region().clone(); assert_eq!(self.region_id, region.get_id()); - self.async_incremental_scan(region_snapshot, region, memory_quota) + self.async_incremental_scan(region_snapshot, region) .await + .map(|_| ()) } else { assert!( resp.response.get_header().has_error(), @@ -190,26 +217,29 @@ impl Initializer { } } - pub(crate) async fn async_incremental_scan( + pub(crate) async fn async_incremental_scan<S>( &mut self, snap: S, region: Region, - memory_quota: Arc<MemoryQuota>, - ) -> Result<()> { + ) -> Result<ScanStat> + where + S: Snapshot + 'static, + { CDC_SCAN_TASKS.with_label_values(&["ongoing"]).inc(); defer!(CDC_SCAN_TASKS.with_label_values(&["ongoing"]).dec()); - let region_id = region.get_id(); + let region_id = self.region_id; let downstream_id = self.downstream_id; - let observe_id = self.observe_id; + let observe_id = self.observe_handle.id; let conn_id = self.conn_id; - let kv_api = self.kv_api; - let on_cancel = || -> Result<()> { - info!("cdc async incremental scan canceled"; + let on_cancel = || -> Result<ScanStat> { + info!( + "cdc async incremental scan canceled"; "region_id" => region_id, "downstream_id" => ?downstream_id, "observe_id" => ?observe_id, - "conn_id" => ?conn_id); + "conn_id" => ?conn_id, + ); Err(box_err!("scan canceled")) }; @@ -218,31 +248,70 @@ impl Initializer { } self.observed_range.update_region_key_range(&region); - debug!("cdc async incremental scan"; + + // Be compatible with old TiCDC clients, which won't give `observed_range`.
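The hunk just below clamps the subscribed range to the region span, treating an empty end key as unbounded. A self-contained sketch of that clamping logic, operating on raw byte slices instead of encoded `Key`s:

```rust
// Effective scan range = intersection of the subscribed range and the
// region span; an empty end key means "unbounded" on that side.
fn clamp(
    sub_start: &[u8],
    sub_end: &[u8],    // empty means unbounded, like observed_range.end_key
    region_start: &[u8],
    region_end: &[u8], // empty means unbounded
) -> (Vec<u8>, Vec<u8>) {
    let start = if sub_start <= region_start {
        region_start.to_vec()
    } else {
        sub_start.to_vec()
    };
    let end = if sub_end.is_empty() || (sub_end >= region_end && !region_end.is_empty()) {
        region_end.to_vec()
    } else {
        sub_end.to_vec()
    };
    (start, end)
}

fn main() {
    // Subscribing ["a", "") on region ["b", "d") scans exactly ["b", "d").
    assert_eq!(clamp(b"a", b"", b"b", b"d"), (b"b".to_vec(), b"d".to_vec()));
    // Subscribing ["b", "c") on region ["a", "") keeps the narrower range.
    assert_eq!(clamp(b"b", b"c", b"a", b""), (b"b".to_vec(), b"c".to_vec()));
}
```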
+ let (start_key, end_key): (Key, Key); + if self.observed_range.start_key_encoded.as_encoded() <= &region.start_key { + start_key = Key::from_encoded_slice(&region.start_key); + } else { + start_key = self.observed_range.start_key_encoded.clone(); + } + if self.observed_range.end_key_encoded.is_empty() + || self.observed_range.end_key_encoded.as_encoded() >= &region.end_key + && !region.end_key.is_empty() + { + end_key = Key::from_encoded_slice(&region.end_key); + } else { + end_key = self.observed_range.end_key_encoded.clone(); + } + + debug!( + "cdc async incremental scan"; "region_id" => region_id, "downstream_id" => ?downstream_id, - "observe_id" => ?self.observe_id, + "observe_id" => ?observe_id, + "conn_id" => ?conn_id, "all_key_covered" => ?self.observed_range.all_key_covered, - "start_key" => log_wrappers::Value::key(snap.lower_bound().unwrap_or_default()), - "end_key" => log_wrappers::Value::key(snap.upper_bound().unwrap_or_default())); + "start_key" => log_wrappers::Value::key(start_key.as_encoded()), + "end_key" => log_wrappers::Value::key(end_key.as_encoded()) + ); - let mut resolver = if self.build_resolver { - Some(Resolver::new(region_id, memory_quota)) - } else { - None + if self.build_resolver.load(Ordering::Acquire) { + // Scan and collect locks if build_resolver is true. The range + // should be the whole region span instead of the subscribed span, + // because those locks will be shared between multiple Downstreams. + let mut reader = MvccReader::new(snap.clone(), Some(ScanMode::Forward), false); + let (key_locks, has_remain) = + reader.scan_locks_from_storage(None, None, |_, _| true, 0)?; + assert!(!has_remain); + let mut locks = BTreeMap::<Key, MiniLock>::new(); + for (key, lock) in key_locks { + // As `decode_lock` does, only consider `Put` and `Delete` + if matches!(lock.lock_type, LockType::Put | LockType::Delete) { + let mini_lock = MiniLock::new(lock.ts, lock.txn_source, lock.generation); + locks.insert(key, mini_lock); + } + } + self.finish_scan_locks(region, locks); }; let (mut hint_min_ts, mut old_value_cursors) = (None, None); - let mut scanner = if kv_api == ChangeDataRequestKvApi::TiDb { - if self.ts_filter_is_helpful(&snap) { + let mut scanner = if self.kv_api == ChangeDataRequestKvApi::TiDb { + if self.ts_filter_is_helpful(&start_key, &end_key) { hint_min_ts = Some(self.checkpoint_ts); old_value_cursors = Some(OldValueCursors::new(&snap)); } + let upper_boundary = if end_key.as_encoded().is_empty() { + // Region upper boundary could be an empty slice.
+ None + } else { + Some(end_key) + }; // Time range: (checkpoint_ts, max] let txnkv_scanner = ScannerBuilder::new(snap, TimeStamp::max()) .fill_cache(false) - .range(None, None) + .range(Some(start_key), upper_boundary) .hint_min_ts(hint_min_ts) .build_delta_scanner(self.checkpoint_ts, TxnExtraOp::ReadOldValue) .unwrap(); @@ -273,8 +342,8 @@ impl Initializer { DownstreamState::Initializing | DownstreamState::Stopped )); + let mut scan_stat = ScanStat::default(); let scan_long_time = AtomicBool::new(false); - defer!(if scan_long_time.load(Ordering::SeqCst) { CDC_SCAN_LONG_DURATION_REGIONS.dec(); }); @@ -285,10 +354,11 @@ impl Initializer { && start.saturating_elapsed() > Duration::from_secs(60) { CDC_SCAN_LONG_DURATION_REGIONS.inc(); - scan_long_time.store(true, Ordering::SeqCst); - warn!("cdc incremental scan takes too long"; "region_id" => region_id, "conn_id" => ?self.conn_id, - "downstream_id" => ?self.downstream_id, "takes" => ?start.saturating_elapsed()); + warn!( + "cdc incremental scan takes too long"; "region_id" => region_id, "conn_id" => ?self.conn_id, + "downstream_id" => ?self.downstream_id, "takes" => ?start.saturating_elapsed() + ); } // When downstream_state is Stopped, it means the corresponding // delegate is stopped. The initialization can be safely canceled. @@ -296,8 +366,9 @@ impl Initializer { return on_cancel(); } let cursors = old_value_cursors.as_mut(); - let resolver = resolver.as_mut(); - let entries = self.scan_batch(&mut scanner, cursors, resolver).await?; + let entries = self + .scan_batch(&mut scanner, cursors, &mut scan_stat) + .await?; if let Some(None) = entries.last() { // If the last element is None, it means scanning is finished. done = true; @@ -315,19 +386,16 @@ impl Initializer { } let takes = start.saturating_elapsed(); info!("cdc async incremental scan finished"; - "region_id" => region.get_id(), - "conn_id" => ?self.conn_id, - "downstream_id" => ?self.downstream_id, + "region_id" => region_id, + "downstream_id" => ?downstream_id, + "observe_id" => ?observe_id, + "conn_id" => ?conn_id, "takes" => ?takes, ); - if let Some(resolver) = resolver { - self.finish_building_resolver(resolver, region); - } - CDC_SCAN_DURATION_HISTOGRAM.observe(takes.as_secs_f64()); CDC_SCAN_SINK_DURATION_HISTOGRAM.observe(duration_to_sec(sink_time)); - Ok(()) + Ok(scan_stat) } // It's extracted from `Initializer::scan_batch` to avoid becoming an @@ -414,40 +482,25 @@ impl Initializer { &self, scanner: &mut Scanner, old_value_cursors: Option<&mut OldValueCursors>, - resolver: Option<&mut Resolver>, + scan_stat: &mut ScanStat, ) -> Result<Vec<Option<KvEntry>>> { let mut entries = Vec::with_capacity(self.max_scan_batch_size); - let ScanStat { - emit, - disk_read, - perf_delta, - } = self.do_scan(scanner, old_value_cursors, &mut entries)?; + let delta = self.do_scan(scanner, old_value_cursors, &mut entries)?; + scan_stat.emit += delta.emit; + scan_stat.perf_delta += delta.perf_delta; + if let Some(disk_read) = delta.disk_read { + *scan_stat.disk_read.get_or_insert(0) += disk_read; + } - TLS_CDC_PERF_STATS.with(|x| *x.borrow_mut() += perf_delta); + TLS_CDC_PERF_STATS.with(|x| *x.borrow_mut() += delta.perf_delta); tls_flush_perf_stats(); - if let Some(bytes) = disk_read { + if let Some(bytes) = delta.disk_read { CDC_SCAN_DISK_READ_BYTES.inc_by(bytes as _); self.scan_speed_limiter.consume(bytes).await; } - CDC_SCAN_BYTES.inc_by(emit as _); - self.fetch_speed_limiter.consume(emit as _).await; - - if let Some(resolver) = resolver { - // Track the locks.
- for entry in entries.iter().flatten() { - if let KvEntry::TxnEntry(TxnEntry::Prewrite { ref lock, .. }) = entry { - let (encoded_key, value) = lock; - let key = Key::from_encoded_slice(encoded_key).into_raw().unwrap(); - let lock = Lock::parse(value)?; - match lock.lock_type { - LockType::Put | LockType::Delete => { - resolver.track_lock(lock.ts, key, None)?; - } - _ => (), - }; - } - } - } + CDC_SCAN_BYTES.inc_by(delta.emit as _); + self.fetch_speed_limiter.consume(delta.emit as _).await; + Ok(entries) } @@ -484,24 +537,22 @@ impl Initializer { Ok(()) } - fn finish_building_resolver(&self, mut resolver: Resolver, region: Region) { - let observe_id = self.observe_id; - let rts = resolver.resolve(TimeStamp::zero(), None, TsSource::Cdc); + fn finish_scan_locks(&self, region: Region, locks: BTreeMap<Key, MiniLock>) { + let observe_id = self.observe_handle.id; info!( - "cdc resolver initialized and schedule resolver ready"; + "cdc has scanned all incremental scan locks"; "region_id" => region.get_id(), "conn_id" => ?self.conn_id, "downstream_id" => ?self.downstream_id, - "resolved_ts" => rts, - "lock_count" => resolver.locks().len(), + "lock_count" => locks.len(), "observe_id" => ?observe_id, ); fail_point!("before_schedule_resolver_ready"); - if let Err(e) = self.sched.schedule(Task::ResolverReady { + if let Err(e) = self.sched.schedule(Task::FinishScanLocks { observe_id, - resolver, region, + locks, }) { error!("cdc schedule task failed"; "error" => ?e); } } @@ -509,7 +560,8 @@ impl Initializer { // Deregister downstream when the Initializer fails to initialize. pub(crate) fn deregister_downstream(&self, err: Error) { - let deregister = if self.build_resolver || err.has_region_error() { + let build_resolver = self.build_resolver.load(Ordering::Acquire); + let deregister = if build_resolver || err.has_region_error() { // Deregister delegate on the conditions, // * It fails to build a resolver. A delegate requires a resolver to advance // resolved ts. // * It meets region // error and can not serve.
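A compact model of the policy in `deregister_downstream` above, with a toy enum standing in for the real `Deregister`: a failed scan tears down the whole delegate only when the failing downstream was the lock-scanning one or the region itself reported an error; otherwise only that one subscription is dropped.

```rust
#[derive(Debug, PartialEq)]
enum Deregister {
    Delegate,   // region-level: every downstream of the region is dropped
    Downstream, // only the failed subscription is dropped
}

fn choose(build_resolver: bool, has_region_error: bool) -> Deregister {
    // The lock-scanning downstream is load-bearing for resolved ts, and a
    // region error means the region cannot be served at all, so both cases
    // deregister the whole delegate.
    if build_resolver || has_region_error {
        Deregister::Delegate
    } else {
        Deregister::Downstream
    }
}

fn main() {
    assert_eq!(choose(true, false), Deregister::Delegate);
    assert_eq!(choose(false, true), Deregister::Delegate);
    assert_eq!(choose(false, false), Deregister::Downstream);
}
```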
Deregister::Delegate { region_id: self.region_id, - observe_id: self.observe_id, + observe_id: self.observe_handle.id, err, } } else { @@ -535,13 +587,13 @@ impl Initializer { } } - fn ts_filter_is_helpful(&self, snap: &S) -> bool { + fn ts_filter_is_helpful(&self, start_key: &Key, end_key: &Key) -> bool { if self.ts_filter_ratio < f64::EPSILON { return false; } + let start_key = data_key(start_key.as_encoded()); + let end_key = data_end_key(end_key.as_encoded()); - let start_key = data_key(snap.lower_bound().unwrap_or_default()); - let end_key = data_end_key(snap.upper_bound().unwrap_or_default()); let range = Range::new(&start_key, &end_key); let tablet = match self.tablet.as_ref() { Some(t) => t, @@ -593,6 +645,7 @@ mod tests { collections::BTreeMap, fmt::Display, sync::{ + atomic::AtomicBool, mpsc::{channel, sync_channel, Receiver, RecvTimeoutError, Sender}, Arc, }, @@ -606,8 +659,7 @@ mod tests { cdcpb::{EventLogType, Event_oneof_event}, errorpb::Error as ErrorHeader, }; - use raftstore::{coprocessor::ObserveHandle, router::CdcRaftRouter, store::RegionSnapshot}; - use resolved_ts::TxnLocks; + use raftstore::{coprocessor::ObserveHandle, router::CdcRaftRouter}; use test_raftstore::MockRaftStoreRouter; use tikv::{ config::DbConfig, @@ -667,7 +719,7 @@ mod tests { ) { let (receiver_worker, rx) = new_receiver_worker(); let quota = Arc::new(MemoryQuota::new(usize::MAX)); - let (sink, drain) = crate::channel::channel(buffer, quota); + let (sink, drain) = crate::channel::channel(ConnId::default(), buffer, quota); let pool = Builder::new_multi_thread() .thread_name("test-initializer-worker") @@ -677,6 +729,19 @@ mod tests { .unwrap(); let downstream_state = Arc::new(AtomicCell::new(DownstreamState::Initializing)); let initializer = Initializer { + region_id: 1, + conn_id: ConnId::new(), + request_id: RequestId(0), + checkpoint_ts: 1.into(), + region_epoch: RegionEpoch::default(), + + build_resolver: Arc::new(AtomicBool::new(true)), + observed_range: ObservedRange::default(), + observe_handle: ObserveHandle::new(), + downstream_id: DownstreamId::new(), + downstream_state, + scan_truncated: Arc::new(Default::default()), + tablet: engine.or_else(|| { TestEngineBuilder::new() .build_without_cache() @@ -685,21 +750,13 @@ mod tests { }), sched: receiver_worker.scheduler(), sink, - observed_range: ObservedRange::default(), - region_id: 1, - region_epoch: RegionEpoch::default(), - observe_id: ObserveId::new(), - downstream_id: DownstreamId::new(), - downstream_state, - scan_truncated: Arc::new(Default::default()), - conn_id: ConnId::new(), - request_id: 0, - checkpoint_ts: 1.into(), + concurrency_semaphore: Arc::new(Semaphore::new(1)), + scan_speed_limiter: Limiter::new(scan_limit as _), fetch_speed_limiter: Limiter::new(fetch_limit as _), max_scan_batch_bytes: 1024 * 1024, max_scan_batch_size: 1024, - build_resolver: true, + ts_filter_ratio: 1.0, // always enable it. 
kv_api, filter_loop, } #[test] - fn test_initializer_build_resolver() { + fn test_initializer_scan_locks() { let mut engine = TestEngineBuilder::new().build_without_cache().unwrap(); - let mut expected_locks = BTreeMap::<TimeStamp, TxnLocks>::new(); + let mut expected_locks = BTreeMap::<Key, MiniLock>::new(); - // Only observe ["", "b\0x90"] + // Only observe ["k\x00", "k\x5a"] let observed_range = ObservedRange::new( - Key::from_raw(&[]).into_encoded(), + Key::from_raw(&[b'k', 0]).into_encoded(), Key::from_raw(&[b'k', 90]).into_encoded(), ) .unwrap(); - let mut total_bytes = 0; + // Pessimistic locks should not be tracked for i in 0..10 { - let k = &[b'k', i]; - total_bytes += k.len(); - let ts = TimeStamp::new(i as _); + let (k, ts) = (&[b'k', i], TimeStamp::new(i as _)); must_acquire_pessimistic_lock(&mut engine, k, k, ts, ts); } for i in 10..100 { - let (k, v) = (&[b'k', i], &[b'v', i]); - total_bytes += k.len(); - total_bytes += v.len(); - let ts = TimeStamp::new(i as _); + let (k, v, ts) = (&[b'k', i], &[b'v', i], TimeStamp::new(i as _)); must_prewrite_put(&mut engine, k, v, k, ts); - let txn_locks = expected_locks.entry(ts).or_insert_with(|| { - let mut txn_locks = TxnLocks::default(); - txn_locks.sample_lock = Some(k.to_vec().into()); - txn_locks - }); - txn_locks.lock_count += 1; + expected_locks.insert(Key::from_raw(k), MiniLock::from_ts(ts)); } let region = Region::default(); let snap = engine.snapshot(Default::default()).unwrap(); - // Buffer must be large enough to unblock async incremental scan. - let buffer = 1000; let (mut worker, pool, mut initializer, rx, mut drain) = mock_initializer( - total_bytes, - total_bytes, - buffer, + usize::MAX, + usize::MAX, + 1000, engine.kv_engine(), ChangeDataRequestKvApi::TiDb, false, ); initializer.observed_range = observed_range.clone(); + initializer.build_resolver.store(true, Ordering::Release); + initializer + .downstream_state + .store(DownstreamState::Initializing); + let check_result = || { let task = rx.recv().unwrap(); match task { - Task::ResolverReady { resolver, .. } => { - assert_eq!(resolver.locks(), &expected_locks); - } + Task::FinishScanLocks { locks, .. } => assert_eq!(locks, expected_locks), t => panic!("unexpected task {} received", t), } }; - // To not block test by barrier.
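The rewritten test expects pessimistic locks to be absent from the scanned lock set. A sketch of the filter the lock scan applies (the same `LockType` match as in `async_incremental_scan`; the enum here is a reduced stand-in for `txn_types::LockType`):

```rust
enum LockType {
    Put,
    Delete,
    Lock,
    Pessimistic,
}

fn tracked(lock_type: LockType) -> bool {
    // Only Put and Delete can change row data and therefore must gate
    // resolved ts; Lock records and pessimistic locks never turn into
    // data-changing commit events for CDC.
    matches!(lock_type, LockType::Put | LockType::Delete)
}

fn main() {
    assert!(tracked(LockType::Put));
    assert!(tracked(LockType::Delete));
    assert!(!tracked(LockType::Lock));
    assert!(!tracked(LockType::Pessimistic));
}
```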
+ pool.spawn(async move { let mut d = drain.drain(); while let Some((e, _)) = d.next().await { if let CdcEvent::Event(e) = e { for e in e.get_entries().get_entries() { - let key = Key::from_raw(&e.key).into_encoded(); - assert!(observed_range.contains_encoded_key(&key), "{:?}", e); + if e.r_type == EventLogType::Prewrite { + let key = Key::from_raw(&e.key).into_encoded(); + assert!(observed_range.contains_encoded_key(&key)); + } } } } }); - let memory_quota = Arc::new(MemoryQuota::new(usize::MAX)); - block_on(initializer.async_incremental_scan( - snap.clone(), - region.clone(), - memory_quota.clone(), - )) - .unwrap(); - check_result(); - - initializer - .downstream_state - .store(DownstreamState::Initializing); - initializer.max_scan_batch_bytes = total_bytes; - block_on(initializer.async_incremental_scan( - snap.clone(), - region.clone(), - memory_quota.clone(), - )) - .unwrap(); + block_on(initializer.async_incremental_scan(snap.clone(), region.clone())).unwrap(); check_result(); + initializer.build_resolver.store(false, Ordering::Release); initializer .downstream_state .store(DownstreamState::Initializing); - initializer.build_resolver = false; - block_on(initializer.async_incremental_scan( - snap.clone(), - region.clone(), - memory_quota.clone(), - )) - .unwrap(); - - let task = rx.recv_timeout(Duration::from_millis(100)); - match task { + block_on(initializer.async_incremental_scan(snap.clone(), region.clone())).unwrap(); + match rx.recv_timeout(Duration::from_millis(100)) { Ok(t) => panic!("unexpected task {} received", t), Err(RecvTimeoutError::Timeout) => (), Err(e) => panic!("unexpected err {:?}", e), @@ -819,28 +844,14 @@ mod tests { // Test cancellation. initializer.downstream_state.store(DownstreamState::Stopped); - block_on(initializer.async_incremental_scan(snap.clone(), region, memory_quota.clone())) - .unwrap_err(); - - // Cancel error should trigger a deregsiter. - let mut region = Region::default(); - region.set_id(initializer.region_id); - region.mut_peers().push(Default::default()); - let snapshot = Some(RegionSnapshot::from_snapshot(snap, Arc::new(region))); - let resp = ReadResponse { - snapshot, - response: Default::default(), - txn_extra_op: Default::default(), - }; - block_on(initializer.on_change_cmd_response(resp.clone(), memory_quota.clone())) - .unwrap_err(); + block_on(initializer.async_incremental_scan(snap.clone(), region.clone())).unwrap_err(); // Disconnect sink by dropping runtime (it also drops drain). 
- drop(pool); initializer .downstream_state .store(DownstreamState::Initializing); - block_on(initializer.on_change_cmd_response(resp, memory_quota)).unwrap_err(); + drop(pool); + block_on(initializer.async_incremental_scan(snap, region)).unwrap_err(); worker.stop(); } @@ -869,9 +880,8 @@ mod tests { filter_loop, ); let th = pool.spawn(async move { - let memory_quota = Arc::new(MemoryQuota::new(usize::MAX)); initializer - .async_incremental_scan(snap, Region::default(), memory_quota) + .async_incremental_scan(snap, Region::default()) .await .unwrap(); }); @@ -956,9 +966,8 @@ mod tests { let snap = engine.snapshot(Default::default()).unwrap(); let th = pool.spawn(async move { - let memory_qutoa = Arc::new(MemoryQuota::new(usize::MAX)); initializer - .async_incremental_scan(snap, Region::default(), memory_qutoa) + .async_incremental_scan(snap, Region::default()) .await .unwrap(); }); @@ -1009,7 +1018,7 @@ mod tests { fn test_initializer_deregister_downstream() { let total_bytes = 1; let buffer = 1; - let (mut worker, _pool, mut initializer, rx, _drain) = mock_initializer( + let (mut worker, _pool, initializer, rx, _drain) = mock_initializer( total_bytes, total_bytes, buffer, @@ -1019,7 +1028,7 @@ mod tests { ); // Errors reported by region should deregister region. - initializer.build_resolver = false; + initializer.build_resolver.store(false, Ordering::Release); initializer.deregister_downstream(Error::request(ErrorHeader::default())); let task = rx.recv_timeout(Duration::from_millis(100)); match task { @@ -1030,7 +1039,7 @@ mod tests { Err(e) => panic!("unexpected err {:?}", e), } - initializer.build_resolver = false; + initializer.build_resolver.store(false, Ordering::Release); initializer.deregister_downstream(Error::Other(box_err!("test"))); let task = rx.recv_timeout(Duration::from_millis(100)); match task { @@ -1042,7 +1051,7 @@ mod tests { } // Test deregister region when resolver fails to build. - initializer.build_resolver = true; + initializer.build_resolver.store(true, Ordering::Release); initializer.deregister_downstream(Error::Other(box_err!("test"))); let task = rx.recv_timeout(Duration::from_millis(100)); match task { @@ -1068,24 +1077,14 @@ mod tests { let (mut worker, pool, mut initializer, _rx, _drain) = mock_initializer(total_bytes, total_bytes, buffer, None, kv_api, false); - let change_cmd = ChangeObserver::from_cdc(1, ObserveHandle::new()); let raft_router = CdcRaftRouter(MockRaftStoreRouter::new()); - let concurrency_semaphore = Arc::new(Semaphore::new(1)); - let memory_quota = Arc::new(MemoryQuota::new(usize::MAX)); - initializer.downstream_state.store(DownstreamState::Stopped); - block_on(initializer.initialize( - change_cmd, - raft_router.clone(), - concurrency_semaphore.clone(), - memory_quota.clone(), - )) - .unwrap_err(); + block_on(initializer.initialize(raft_router.clone())).unwrap_err(); let (tx, rx) = sync_channel(1); - let concurrency_semaphore_ = concurrency_semaphore.clone(); + let concurrency_semaphore = initializer.concurrency_semaphore.clone(); pool.spawn(async move { - let _permit = concurrency_semaphore_.acquire().await; + let _permit = concurrency_semaphore.acquire().await; tx.send(()).unwrap(); tx.send(()).unwrap(); tx.send(()).unwrap(); @@ -1093,19 +1092,8 @@ mod tests { rx.recv_timeout(Duration::from_millis(200)).unwrap(); let (tx1, rx1) = sync_channel(1); - let change_cmd = ChangeObserver::from_cdc(1, ObserveHandle::new()); pool.spawn(async move { - // Migrated to 2021 migration. 
This let statement is probably not needed, see - https://doc.rust-lang.org/edition-guide/rust-2021/disjoint-capture-in-closures.html - let _ = ( - &initializer, - &change_cmd, - &raft_router, - &concurrency_semaphore, - ); - let res = initializer - .initialize(change_cmd, raft_router, concurrency_semaphore, memory_quota) - .await; + let res = initializer.initialize(raft_router).await; tx1.send(res).unwrap(); }); // Must timeout because there are not enough permits. @@ -1154,9 +1142,8 @@ mod tests { let snap = engine.snapshot(Default::default()).unwrap(); let th = pool.spawn(async move { - let memory_quota = Arc::new(MemoryQuota::new(usize::MAX)); initializer - .async_incremental_scan(snap, Region::default(), memory_quota) + .async_incremental_scan(snap, Region::default()) .await .unwrap(); }); @@ -1171,4 +1158,67 @@ mod tests { block_on(th).unwrap(); worker.stop(); } + + #[test] + fn test_initialize_scan_range() { + let mut cfg = DbConfig::default(); + cfg.writecf.disable_auto_compactions = true; + let mut engine = TestEngineBuilder::new().build_with_cfg(&cfg).unwrap(); + + // Must start with 'z', otherwise table property collector doesn't work. + let ka = Key::from_raw(b"zaaa").into_encoded(); + let km = Key::from_raw(b"zmmm").into_encoded(); + let ky = Key::from_raw(b"zyyy").into_encoded(); + let kz = Key::from_raw(b"zzzz").into_encoded(); + + // Incremental scan iterator shouldn't access the key because it's out of range. + must_prewrite_put(&mut engine, &ka, b"value", &ka, 200); + must_commit(&mut engine, &ka, 200, 210); + for cf in &[CF_WRITE, CF_DEFAULT] { + let kv = engine.kv_engine().unwrap(); + kv.flush_cf(cf, true).unwrap(); + } + + // Incremental scan iterator shouldn't access the key because it's skipped by ts + // filter. + must_prewrite_put(&mut engine, &km, b"value", &km, 100); + must_commit(&mut engine, &km, 100, 110); + for cf in &[CF_WRITE, CF_DEFAULT] { + let kv = engine.kv_engine().unwrap(); + kv.flush_cf(cf, true).unwrap(); + } + + must_prewrite_put(&mut engine, &ky, b"value", &ky, 200); + must_commit(&mut engine, &ky, 200, 210); + for cf in &[CF_WRITE, CF_DEFAULT] { + let kv = engine.kv_engine().unwrap(); + kv.flush_cf(cf, true).unwrap(); + } + + let (mut _worker, pool, mut initializer, _rx, mut drain) = mock_initializer( + usize::MAX, + usize::MAX, + 1000, + engine.kv_engine(), + ChangeDataRequestKvApi::TiDb, + false, + ); + + initializer.observed_range = ObservedRange::new(km, kz).unwrap(); + initializer.checkpoint_ts = 150.into(); + + let th = pool.spawn(async move { + let snap = engine.snapshot(Default::default()).unwrap(); + let region = Region::default(); + let scan_stat = initializer + .async_incremental_scan(snap, region) + .await + .unwrap(); + let block_reads = scan_stat.perf_delta.block_read_count; + let block_gets = scan_stat.perf_delta.block_cache_hit_count; + assert_eq!(block_reads + block_gets, 1); + }); + while block_on(drain.drain().next()).is_some() {} + block_on(th).unwrap(); + } } diff --git a/components/cdc/src/observer.rs b/components/cdc/src/observer.rs index 965a31ac7ff..fda4a456217 100644 --- a/components/cdc/src/observer.rs +++ b/components/cdc/src/observer.rs @@ -56,7 +56,7 @@ impl CdcObserver { .register_region_change_observer(100, BoxRegionChangeObserver::new(self.clone())); } - /// Subscribe an region, the observer will sink events of the region into + /// Subscribe a region, the observer will sink events of the region into /// its scheduler. /// /// Return previous ObserveId if there is one.
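`test_initialize_scan_range` above pins down how many SST blocks the scan may touch: keys outside the subscribed range are excluded by the iterator bounds, and SSTs whose newest commit predates `checkpoint_ts` are pruned via `hint_min_ts`. A toy model of that pruning decision (real pruning consults RocksDB table properties, not a single `max_ts` field):

```rust
struct SstProps {
    max_ts: u64, // newest commit timestamp recorded for the SST
}

fn may_contain_new_data(sst: &SstProps, checkpoint_ts: u64) -> bool {
    // The incremental scan covers the time range (checkpoint_ts, max], so an
    // SST whose newest commit is not newer than the checkpoint can be skipped.
    sst.max_ts > checkpoint_ts
}

fn main() {
    // The km key committed at ts 110 is pruned for checkpoint_ts = 150 ...
    assert!(!may_contain_new_data(&SstProps { max_ts: 110 }, 150));
    // ... while the ky key committed at ts 210 must still be read.
    assert!(may_contain_new_data(&SstProps { max_ts: 210 }, 150));
}
```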
@@ -121,8 +121,7 @@ impl CmdObserver for CdcObserver { // Create a snapshot here for preventing the old value was GC-ed. // TODO: only need it after enabling old value, may add a flag to indicate // whether to get it. - let snapshot = - RegionSnapshot::from_snapshot(Arc::new(engine.snapshot(None)), Arc::new(region)); + let snapshot = RegionSnapshot::from_snapshot(Arc::new(engine.snapshot()), Arc::new(region)); let get_old_value = move |key, query_ts, old_value_cache: &mut OldValueCache, @@ -264,7 +263,7 @@ mod tests { region.mut_peers().push(new_peer(3, 3)); let mut ctx = ObserverContext::new(®ion); - observer.on_role_change(&mut ctx, &RoleChange::new(StateRole::Follower)); + observer.on_role_change(&mut ctx, &RoleChange::new_for_test(StateRole::Follower)); rx.recv_timeout(Duration::from_millis(10)).unwrap_err(); let oid = ObserveId::new(); @@ -330,7 +329,7 @@ mod tests { }; // No event if it changes to leader. - observer.on_role_change(&mut ctx, &RoleChange::new(StateRole::Leader)); + observer.on_role_change(&mut ctx, &RoleChange::new_for_test(StateRole::Leader)); rx.recv_timeout(Duration::from_millis(10)).unwrap_err(); // unsubscribed fail if observer id is different. @@ -339,13 +338,13 @@ mod tests { // No event if it is unsubscribed. let oid_ = observer.unsubscribe_region(1, oid).unwrap(); assert_eq!(oid_, oid); - observer.on_role_change(&mut ctx, &RoleChange::new(StateRole::Follower)); + observer.on_role_change(&mut ctx, &RoleChange::new_for_test(StateRole::Follower)); rx.recv_timeout(Duration::from_millis(10)).unwrap_err(); // No event if it is unsubscribed. region.set_id(999); let mut ctx = ObserverContext::new(®ion); - observer.on_role_change(&mut ctx, &RoleChange::new(StateRole::Follower)); + observer.on_role_change(&mut ctx, &RoleChange::new_for_test(StateRole::Follower)); rx.recv_timeout(Duration::from_millis(10)).unwrap_err(); } } diff --git a/components/cdc/src/old_value.rs b/components/cdc/src/old_value.rs index 269a70d477e..1ab5a3a7510 100644 --- a/components/cdc/src/old_value.rs +++ b/components/cdc/src/old_value.rs @@ -310,7 +310,7 @@ mod tests { value: Option, ) -> Statistics { let key = key.clone().append_ts(ts.into()); - let snapshot = Arc::new(kv_engine.snapshot(None)); + let snapshot = Arc::new(kv_engine.snapshot()); let mut cursor = new_write_cursor_on_key(&snapshot, &key); let load_default = Either::Left(&snapshot); let mut stats = Statistics::default(); @@ -529,7 +529,7 @@ mod tests { must_commit(&mut engine, &key, 200, 201); } - let snapshot = Arc::new(kv_engine.snapshot(None)); + let snapshot = Arc::new(kv_engine.snapshot()); let mut cursor = new_old_value_cursor(&snapshot, CF_WRITE); let mut default_cursor = new_old_value_cursor(&snapshot, CF_DEFAULT); let mut load_default = |use_default_cursor: bool| { @@ -601,7 +601,7 @@ mod tests { } let key = format!("zkey-{:0>3}", 0).into_bytes(); - let snapshot = Arc::new(kv_engine.snapshot(None)); + let snapshot = Arc::new(kv_engine.snapshot()); let perf_instant = ReadPerfInstant::new(); let value = get_old_value( &snapshot, diff --git a/components/cdc/src/service.rs b/components/cdc/src/service.rs index 4cb89d16394..0ecd2eb606c 100644 --- a/components/cdc/src/service.rs +++ b/components/cdc/src/service.rs @@ -35,6 +35,9 @@ pub fn validate_kv_api(kv_api: ChangeDataRequestKvApi, api_version: ApiVersion) #[derive(Clone, Copy, Debug, Eq, PartialEq, Hash)] pub struct ConnId(usize); +#[derive(Clone, Copy, Debug, Eq, PartialEq, Hash)] +pub struct RequestId(pub u64); + impl ConnId { pub fn new() -> ConnId { 
ConnId(CONNECTION_ID_ALLOC.fetch_add(1, Ordering::SeqCst)) @@ -89,7 +92,7 @@ pub struct Conn { #[derive(PartialEq, Eq, Hash)] struct DownstreamKey { - request_id: u64, + request_id: RequestId, region_id: u64, } @@ -100,9 +103,9 @@ struct DownstreamValue { } impl Conn { - pub fn new(sink: Sink, peer: String) -> Conn { + pub fn new(conn_id: ConnId, sink: Sink, peer: String) -> Conn { Conn { - id: ConnId::new(), + id: conn_id, sink, downstreams: HashMap::default(), peer, @@ -143,7 +146,7 @@ impl Conn { &self.sink } - pub fn get_downstream(&self, request_id: u64, region_id: u64) -> Option { + pub fn get_downstream(&self, request_id: RequestId, region_id: u64) -> Option { let key = DownstreamKey { request_id, region_id, }; @@ -153,7 +156,7 @@ pub fn subscribe( &mut self, - request_id: u64, + request_id: RequestId, region_id: u64, downstream_id: DownstreamId, downstream_state: Arc>, @@ -174,7 +177,7 @@ } } - pub fn unsubscribe(&mut self, request_id: u64, region_id: u64) -> Option { + pub fn unsubscribe(&mut self, request_id: RequestId, region_id: u64) -> Option { let key = DownstreamKey { request_id, region_id, @@ -182,7 +185,7 @@ self.downstreams.remove(&key).map(|value| value.id) } - pub fn unsubscribe_request(&mut self, request_id: u64) -> Vec<(u64, DownstreamId)> { + pub fn unsubscribe_request(&mut self, request_id: RequestId) -> Vec<(u64, DownstreamId)> { let mut downstreams = Vec::new(); self.downstreams.retain(|key, value| -> bool { if key.request_id == request_id { @@ -196,7 +199,7 @@ pub fn iter_downstreams(&self, mut f: F) where - F: FnMut(u64, u64, DownstreamId, &Arc>), + F: FnMut(RequestId, u64, DownstreamId, &Arc>), { for (key, value) in &self.downstreams { f(key.request_id, key.region_id, value.id, &value.state); @@ -276,18 +279,15 @@ impl Service { peer: &str, ) -> semver::Version { let version_field = request.get_header().get_ticdc_version(); - match semver::Version::parse(version_field) { - Ok(v) => v, - Err(e) => { - warn!( - "empty or invalid TiCDC version, please upgrading TiCDC"; - "version" => version_field, - "downstream" => ?peer, "region_id" => request.region_id, - "error" => ?e, - ); - semver::Version::new(0, 0, 0) - } - } + semver::Version::parse(version_field).unwrap_or_else(|e| { + warn!( + "empty or invalid TiCDC version, please upgrade TiCDC"; + "version" => version_field, + "downstream" => ?peer, "region_id" => request.region_id, + "error" => ?e, + ); + semver::Version::new(0, 0, 0) + }) } fn set_conn_version( @@ -334,22 +334,23 @@ impl Service { request: ChangeDataRequest, conn_id: ConnId, ) -> Result<(), String> { - let observed_range = - match ObservedRange::new(request.start_key.clone(), request.end_key.clone()) { - Ok(observed_range) => observed_range, - Err(e) => { - warn!( - "cdc invalid observed start key or end key version"; - "downstream" => ?peer, "region_id" => request.region_id, - "error" => ?e, - ); - ObservedRange::default() - } - }; + let observed_range = ObservedRange::new(request.start_key.clone(), request.end_key.clone()) + .unwrap_or_else(|e| { + warn!( + "cdc invalid observed start key or end key version"; + "downstream" => ?peer, + "region_id" => request.region_id, + "request_id" => request.request_id, + "error" => ?e, + "start_key" => log_wrappers::Value::key(&request.start_key), + "end_key" => log_wrappers::Value::key(&request.end_key), + ); + ObservedRange::default() + }); let downstream = Downstream::new( peer.to_owned(), request.get_region_epoch().clone(), - request.request_id, + 
RequestId(request.request_id), conn_id, request.kv_api, request.filter_loop, @@ -358,7 +359,6 @@ let task = Task::Register { request, downstream, - conn_id, }; scheduler.schedule(task).map_err(|e| format!("{:?}", e)) } @@ -371,13 +371,13 @@ let task = if request.region_id != 0 { Task::Deregister(Deregister::Region { conn_id, - request_id: request.request_id, + request_id: RequestId(request.request_id), region_id: request.region_id, }) } else { Task::Deregister(Deregister::Request { conn_id, - request_id: request.request_id, + request_id: RequestId(request.request_id), }) }; scheduler.schedule(task).map_err(|e| format!("{:?}", e)) } @@ -405,10 +405,10 @@ event_feed_v2: bool, ) { sink.enhance_batch(true); + let conn_id = ConnId::new(); let (event_sink, mut event_drain) = - channel(CDC_CHANNLE_CAPACITY, self.memory_quota.clone()); - let conn = Conn::new(event_sink, ctx.peer()); - let conn_id = conn.get_id(); + channel(conn_id, CDC_CHANNLE_CAPACITY, self.memory_quota.clone()); + let conn = Conn::new(conn_id, event_sink, ctx.peer()); let mut explicit_features = vec![]; if event_feed_v2 { diff --git a/components/cdc/tests/failpoints/test_endpoint.rs b/components/cdc/tests/failpoints/test_endpoint.rs index 2e9103727e4..d6e3c904fd2 100644 --- a/components/cdc/tests/failpoints/test_endpoint.rs +++ b/components/cdc/tests/failpoints/test_endpoint.rs @@ -656,3 +656,70 @@ fn test_delegate_fail_during_incremental_scan() { recv_timeout(&mut recver, Duration::from_secs(1)).unwrap_err(); recv.replace(Some(recver)); } + +// The case shows it's possible that unordered Prewrite events on the same key +// can be sent to TiCDC clients. Generally it only happens when a region changes +// during a Pipelined-DML transaction. +// +// To ensure TiCDC can handle the situation, `generation` should be carried in +// Prewrite events. 
+#[test] +fn test_cdc_pipeline_dml() { + let mut cluster = new_server_cluster(0, 1); + configure_for_lease_read(&mut cluster.cfg, Some(100), Some(10)); + cluster.pd_client.disable_default_operator(); + let mut suite = TestSuiteBuilder::new().cluster(cluster).build(); + let region = suite.cluster.get_region(&[]); + let rid = region.id; + + let prewrite_tso = block_on(suite.cluster.pd_client.get_tso()).unwrap(); + let (k, v) = (b"key".to_vec(), vec![b'x'; 16]); + let mut mutation = Mutation::default(); + mutation.set_op(Op::Put); + mutation.key = k.clone(); + mutation.value = v; + suite.must_kv_flush(rid, vec![mutation], k.clone(), prewrite_tso, 1); + + fail::cfg("cdc_incremental_scan_start", "pause").unwrap(); + + let cf_tso = block_on(suite.cluster.pd_client.get_tso()).unwrap(); + let (mut req_tx, _, receive_event) = new_event_feed_v2(suite.get_region_cdc_client(rid)); + let mut req = suite.new_changedata_request(rid); + req.request_id = 1; + req.checkpoint_ts = cf_tso.into_inner(); + block_on(req_tx.send((req, WriteFlags::default()))).unwrap(); + sleep_ms(100); + + let (k, v) = (b"key".to_vec(), vec![b'y'; 16]); + let mut mutation = Mutation::default(); + mutation.set_op(Op::Put); + mutation.key = k.clone(); + mutation.value = v; + suite.must_kv_flush(rid, vec![mutation], k.clone(), prewrite_tso, 2); + + let events = receive_event(false).take_events().into_vec(); + for entry in events[0].get_entries().get_entries() { + assert_eq!(entry.r_type, EventLogType::Prewrite); + assert_eq!(entry.generation, 2); + assert_eq!(entry.value, vec![b'y'; 16]); + } + + let commit_tso = block_on(suite.cluster.pd_client.get_tso()).unwrap(); + suite.must_kv_commit(rid, vec![b"key".to_vec()], prewrite_tso, commit_tso); + + let events = receive_event(false).take_events().into_vec(); + for entry in events[0].get_entries().get_entries() { + assert_eq!(entry.r_type, EventLogType::Commit); + assert_eq!(entry.start_ts, prewrite_tso.into_inner()); + assert_eq!(entry.commit_ts, commit_tso.into_inner()); + } + + fail::remove("cdc_incremental_scan_start"); + + let events = receive_event(false).take_events().into_vec(); + let entries = events[0].get_entries().get_entries(); + assert_eq!(entries[0].r_type, EventLogType::Prewrite); + assert_eq!(entries[0].generation, 1); + assert_eq!(entries[0].value, vec![b'x'; 16]); + assert_eq!(entries[1].r_type, EventLogType::Initialized); +} diff --git a/components/cdc/tests/failpoints/test_memory_quota.rs b/components/cdc/tests/failpoints/test_memory_quota.rs index 5b564ba61ec..3d331deda3d 100644 --- a/components/cdc/tests/failpoints/test_memory_quota.rs +++ b/components/cdc/tests/failpoints/test_memory_quota.rs @@ -78,7 +78,7 @@ fn test_resolver_track_lock_memory_quota_exceeded() { match events.pop().unwrap().event.unwrap() { Event_oneof_event::Error(e) => { // Memory quota exceeded errors are translated into congested errors. - assert!(e.has_region_not_found(), "{:?}", e); + assert!(e.has_congested(), "{:?}", e); } other => panic!("unknown event {:?}", other), } @@ -118,31 +118,19 @@ fn test_pending_on_region_ready_memory_quota_exceeded() { fail::cfg("cdc_event_size", "return(0)").unwrap(); // Trigger memory quota exceeded error. 
- fail::cfg("cdc_pending_on_region_ready", "return").unwrap(); + fail::cfg("cdc_finish_scan_locks_memory_quota_exceed", "return").unwrap(); let req = suite.new_changedata_request(1); let (mut req_tx, _event_feed_wrap, receive_event) = new_event_feed(suite.get_region_cdc_client(1)); block_on(req_tx.send((req, WriteFlags::default()))).unwrap(); - let event = receive_event(false); - event.events.into_iter().for_each(|e| { - match e.event.unwrap() { - // Even if there is no write, - // it should always outputs an Initialized event. - Event_oneof_event::Entries(es) => { - assert!(es.entries.len() == 1, "{:?}", es); - let e = &es.entries[0]; - assert_eq!(e.get_type(), EventLogType::Initialized, "{:?}", es); - } - other => panic!("unknown event {:?}", other), - } - }); - // MemoryQuotaExceeded error is triggered on_region_ready. + + // MemoryQuotaExceeded error is triggered. let mut events = receive_event(false).events.to_vec(); assert_eq!(events.len(), 1, "{:?}", events); match events.pop().unwrap().event.unwrap() { Event_oneof_event::Error(e) => { // Unknown errors are translated into region_not_found. - assert!(e.has_region_not_found(), "{:?}", e); + assert!(e.has_congested(), "{:?}", e); } other => panic!("unknown event {:?}", other), } @@ -164,7 +152,8 @@ fn test_pending_on_region_ready_memory_quota_exceeded() { "find unexpected delegate" ); - fail::remove("cdc_incremental_scan_start"); + fail::remove("cdc_event_size"); + fail::remove("cdc_finish_scan_locks_memory_quota_exceed"); suite.stop(); } @@ -205,7 +194,7 @@ fn test_pending_push_lock_memory_quota_exceeded() { match events.pop().unwrap().event.unwrap() { Event_oneof_event::Error(e) => { // Unknown errors are translated into region_not_found. - assert!(e.has_region_not_found(), "{:?}", e); + assert!(e.has_congested(), "{:?}", e); } other => panic!("unknown event {:?}", other), } @@ -227,6 +216,7 @@ fn test_pending_push_lock_memory_quota_exceeded() { "find unexpected delegate" ); + fail::remove("cdc_event_size"); fail::remove("cdc_incremental_scan_start"); suite.stop(); } @@ -265,7 +255,7 @@ fn test_scan_lock_memory_quota_exceeded() { match events.pop().unwrap().event.unwrap() { Event_oneof_event::Error(e) => { // Unknown errors are translated into region_not_found. 
- assert!(e.has_region_not_found(), "{:?}", e); + assert!(e.has_congested(), "{:?}", e); } other => panic!("unknown event {:?}", other), } @@ -285,5 +275,6 @@ fn test_scan_lock_memory_quota_exceeded() { "find unexpected delegate" ); + fail::remove("cdc_event_size"); suite.stop(); } diff --git a/components/cdc/tests/failpoints/test_register.rs b/components/cdc/tests/failpoints/test_register.rs index 2b6be3744af..2128dff08e1 100644 --- a/components/cdc/tests/failpoints/test_register.rs +++ b/components/cdc/tests/failpoints/test_register.rs @@ -97,7 +97,7 @@ fn test_region_ready_after_deregister_impl() { .obs .get(&leader.get_store_id()) .unwrap() - .on_role_change(&mut context, &RoleChange::new(StateRole::Follower)); + .on_role_change(&mut context, &RoleChange::new_for_test(StateRole::Follower)); // Then CDC should not panic fail::remove(fp); diff --git a/components/cdc/tests/integrations/test_cdc.rs b/components/cdc/tests/integrations/test_cdc.rs index c1ac1706d52..c6ccfff0ad5 100644 --- a/components/cdc/tests/integrations/test_cdc.rs +++ b/components/cdc/tests/integrations/test_cdc.rs @@ -13,9 +13,9 @@ use raft::eraftpb::MessageType; use test_raftstore::*; use tikv::server::DEFAULT_CLUSTER_ID; use tikv_util::{config::ReadableDuration, HandyRwLock}; -use txn_types::{Key, Lock, LockType}; +use txn_types::{Key, Lock, LockType, TimeStamp}; -use crate::{new_event_feed, TestSuite, TestSuiteBuilder}; +use crate::{new_event_feed, new_event_feed_v2, TestSuite, TestSuiteBuilder}; #[test] fn test_cdc_basic() { @@ -934,12 +934,15 @@ fn test_cdc_batch_size_limit_impl() { let mut events = receive_event(false).events.to_vec(); assert_eq!(events.len(), 1, "{:?}", events); match events.pop().unwrap().event.unwrap() { - Event_oneof_event::Entries(es) => { - assert_eq!(es.entries.len(), 2); - let e = &es.entries[0]; + Event_oneof_event::Entries(mut es) => { + let mut entries = es.take_entries().into_vec(); + assert_eq!(entries.len(), 2); + entries.sort_by(|a, b| a.key.cmp(&b.key)); + + let e = &entries[0]; assert_eq!(e.get_type(), EventLogType::Prewrite, "{:?}", e.get_type()); assert_eq!(e.key, b"xk3", "{:?}", e.key); - let e = &es.entries[1]; + let e = &entries[1]; assert_eq!(e.get_type(), EventLogType::Prewrite, "{:?}", e.get_type()); assert_eq!(e.key, b"xk4", "{:?}", e.key); } @@ -1321,19 +1324,20 @@ fn test_cdc_1pc_impl() { let req = suite.new_changedata_request(1); let (mut req_tx, _, receive_event) = new_event_feed(suite.get_region_cdc_client(1)); block_on(req_tx.send((req, WriteFlags::default()))).unwrap(); + + // Wait until the region subscription is initialized. let event = receive_event(false); - event.events.into_iter().for_each(|e| { - match e.event.unwrap() { - // Even if there is no write, - // it should always outputs an Initialized event. 
+ event + .events + .into_iter() + .for_each(|e| match e.event.unwrap() { Event_oneof_event::Entries(es) => { assert!(es.entries.len() == 1, "{:?}", es); let e = &es.entries[0]; assert_eq!(e.get_type(), EventLogType::Initialized, "{:?}", es); } other => panic!("unknown event {:?}", other), - } - }); + }); let (k1, v1) = (b"xk1", b"v1"); let (k2, v2) = (b"xk2", &[0u8; 512]); @@ -1377,16 +1381,19 @@ fn test_cdc_1pc_impl() { if !events.is_empty() { assert_eq!(events.len(), 1); match events.pop().unwrap().event.unwrap() { - Event_oneof_event::Entries(entries) => { - assert_eq!(entries.entries.len(), 2); - let (e0, e1) = (&entries.entries[0], &entries.entries[1]); + Event_oneof_event::Entries(mut es) => { + let mut entries = es.take_entries().into_vec(); + assert_eq!(entries.len(), 2); + entries.sort_by(|a, b| a.key.cmp(&b.key)); + + let (e0, e1) = (&entries[0], &entries[1]); assert_eq!(e0.get_type(), EventLogType::Committed); - assert_eq!(e0.get_key(), k2); - assert_eq!(e0.get_value(), v2); + assert_eq!(e0.get_key(), k1); + assert_eq!(e0.get_value(), v1); assert!(e0.commit_ts > resolved_ts); assert_eq!(e1.get_type(), EventLogType::Committed); - assert_eq!(e1.get_key(), k1); - assert_eq!(e1.get_value(), v1); + assert_eq!(e1.get_key(), k2); + assert_eq!(e1.get_value(), v2); assert!(e1.commit_ts > resolved_ts); break; } @@ -1906,18 +1913,17 @@ fn test_cdc_extract_rollback_if_gc_fence_set_impl() { let req = suite.new_changedata_request(1); let (mut req_tx, _, receive_event) = new_event_feed(suite.get_region_cdc_client(1)); block_on(req_tx.send((req, WriteFlags::default()))).unwrap(); - let event = receive_event(false); - event - .events - .into_iter() - .for_each(|e| match e.event.unwrap() { + + for e in receive_event(false).events.into_vec() { + match e.event.unwrap() { Event_oneof_event::Entries(es) => { assert!(es.entries.len() == 1, "{:?}", es); let e = &es.entries[0]; assert_eq!(e.get_type(), EventLogType::Initialized, "{:?}", es); } other => panic!("unknown event {:?}", other), - }); + }; + } sleep_ms(1000); @@ -2208,7 +2214,7 @@ fn test_cdc_write_rollback_when_no_lock_impl() { let k1 = b"xk1".to_vec(); m1.set_op(Op::Put); m1.key = k1.clone(); - m1.value = b"v1".to_vec(); + m1.value = vec![b'x'; 16]; suite.must_kv_prewrite(1, vec![m1], k1.clone(), 10.into()); // Wait until resolved_ts advanced to 10 @@ -2732,3 +2738,171 @@ fn test_cdc_filter_key_range() { suite.stop(); } + +#[test] +fn test_cdc_partial_subscription() { + let mut cluster = new_server_cluster(0, 1); + configure_for_lease_read(&mut cluster.cfg, Some(100), Some(10)); + cluster.pd_client.disable_default_operator(); + let mut suite = TestSuiteBuilder::new().cluster(cluster).build(); + let region = suite.cluster.get_region(&[]); + let rid = region.id; + + let prewrite_tso = block_on(suite.cluster.pd_client.get_tso()).unwrap(); + let (k, v) = (b"key".to_vec(), vec![b'x'; 16]); + let mut mutation = Mutation::default(); + mutation.set_op(Op::Put); + mutation.key = k.clone(); + mutation.value = v; + suite.must_kv_prewrite(rid, vec![mutation], k.clone(), prewrite_tso); + + let cf_tso = block_on(suite.cluster.pd_client.get_tso()).unwrap(); + let (mut req_tx, _, receive_event) = new_event_feed_v2(suite.get_region_cdc_client(rid)); + let mut req = suite.new_changedata_request(rid); + req.request_id = 1; + req.checkpoint_ts = cf_tso.into_inner(); + req.set_start_key(Key::from_raw(b"x").into_encoded()); + req.set_end_key(Key::from_raw(b"z").into_encoded()); + block_on(req_tx.send((req, WriteFlags::default()))).unwrap(); + + let cdc_event = 
receive_event(false); + 'WaitInit: for event in cdc_event.get_events() { + for entry in event.get_entries().get_entries() { + match entry.get_type() { + EventLogType::Prewrite => {} + EventLogType::Initialized => break 'WaitInit, + _ => unreachable!(), + } + } + } + + for _ in 0..10 { + let cdc_event = receive_event(true); + if cdc_event.has_resolved_ts() { + let resolved_ts = cdc_event.get_resolved_ts(); + if resolved_ts.ts > prewrite_tso.into_inner() { + return; + } + } + } + panic!("resolved_ts should exceed prewrite_tso"); +} + +#[test] +fn test_cdc_rollback_prewrites_with_txn_source() { + let mut cluster = new_server_cluster(0, 1); + configure_for_lease_read(&mut cluster.cfg, Some(100), Some(10)); + cluster.pd_client.disable_default_operator(); + let mut suite = TestSuiteBuilder::new().cluster(cluster).build(); + let region = suite.cluster.get_region(&[]); + let rid = region.id; + let cf_tso = block_on(suite.cluster.pd_client.get_tso()).unwrap(); + + let (mut req_tx, _, receive_event) = new_event_feed_v2(suite.get_region_cdc_client(rid)); + let mut req = suite.new_changedata_request(rid); + req.request_id = 1; + req.checkpoint_ts = cf_tso.into_inner(); + req.filter_loop = true; + req.set_start_key(Key::from_raw(b"a").into_encoded()); + req.set_end_key(Key::from_raw(b"z").into_encoded()); + block_on(req_tx.send((req, WriteFlags::default()))).unwrap(); + + let cdc_event = receive_event(false); + 'WaitInit: for event in cdc_event.get_events() { + for entry in event.get_entries().get_entries() { + match entry.get_type() { + EventLogType::Prewrite => {} + EventLogType::Initialized => break 'WaitInit, + _ => unreachable!(), + } + } + } + + let start_tso = cf_tso.next(); + let k = b"key".to_vec(); + let v = vec![b'x'; 16 * 1024]; + let mut mutation = Mutation::default(); + mutation.set_op(Op::Put); + mutation.key = k.clone(); + mutation.value = v; + suite.must_kv_prewrite_with_source(rid, vec![mutation], k.clone(), start_tso, 1); + + loop { + let cdc_event = receive_event(true); + if cdc_event.has_resolved_ts() { + let resolved_ts = cdc_event.get_resolved_ts().get_ts(); + assert_eq!(resolved_ts, start_tso.into_inner()); + break; + } + } + + suite.must_kv_rollback(rid, vec![k.clone()], start_tso); + + // We can't receive the prewrite because it carries a txn_source, + // but we can receive the rollback. 
+ let mut rollbacked = false; + for _ in 0..5 { + let cdc_event = receive_event(true); + if !rollbacked { + for event in cdc_event.get_events() { + for entry in event.get_entries().get_entries() { + match entry.get_type() { + EventLogType::Rollback => rollbacked = true, + _ => unreachable!(), + } + } + } + } else { + let resolved_ts = cdc_event.get_resolved_ts().get_ts(); + if resolved_ts > 5 { + return; + } + } + } + panic!("resolved ts must be advanced correctly"); +} + +#[test] +fn test_cdc_pessimistic_lock_unlock() { + let mut cluster = new_server_cluster(0, 1); + configure_for_lease_read(&mut cluster.cfg, Some(100), Some(10)); + cluster.pd_client.disable_default_operator(); + let mut suite = TestSuiteBuilder::new().cluster(cluster).build(); + let region = suite.cluster.get_region(&[]); + let rid = region.id; + let cf_tso = block_on(suite.cluster.pd_client.get_tso()).unwrap(); + + let (mut req_tx, _, receive_event) = new_event_feed_v2(suite.get_region_cdc_client(rid)); + let mut req = suite.new_changedata_request(rid); + req.request_id = 1; + req.checkpoint_ts = cf_tso.into_inner(); + req.filter_loop = true; + req.set_start_key(Key::from_raw(b"a").into_encoded()); + req.set_end_key(Key::from_raw(b"z").into_encoded()); + block_on(req_tx.send((req, WriteFlags::default()))).unwrap(); + + let cdc_event = receive_event(false); + 'WaitInit: for event in cdc_event.get_events() { + for entry in event.get_entries().get_entries() { + match entry.get_type() { + EventLogType::Prewrite => {} + EventLogType::Initialized => break 'WaitInit, + _ => unreachable!(), + } + } + } + + let start_tso = cf_tso.next(); + let k = b"key".to_vec(); + let v = vec![b'x'; 16 * 1024]; + let mut mutation = Mutation::default(); + mutation.set_op(Op::PessimisticLock); + mutation.key = k.clone(); + mutation.value = v; + let for_update_tso = TimeStamp::from(start_tso.into_inner() + 10); + suite.must_acquire_pessimistic_lock(rid, vec![mutation], k.clone(), start_tso, for_update_tso); + std::thread::sleep(Duration::from_millis(500)); + + suite.must_release_pessimistic_lock(rid, k.clone(), start_tso, for_update_tso); + std::thread::sleep(Duration::from_millis(500)); +} diff --git a/components/cdc/tests/integrations/test_flow_control.rs b/components/cdc/tests/integrations/test_flow_control.rs index fdfd136d9c7..77edbcc7e2d 100644 --- a/components/cdc/tests/integrations/test_flow_control.rs +++ b/components/cdc/tests/integrations/test_flow_control.rs @@ -75,7 +75,7 @@ fn test_cdc_congest() { match events.pop().unwrap().event.unwrap() { Event_oneof_event::Error(e) => { // Memory quota exceeded errors are translated into congested errors. 
- assert!(e.has_region_not_found(), "{:?}", e); + assert!(e.has_congested(), "{:?}", e); } other => panic!("unknown event {:?}", other), } diff --git a/components/cdc/tests/mod.rs b/components/cdc/tests/mod.rs index b36cf46df93..1faea9391a8 100644 --- a/components/cdc/tests/mod.rs +++ b/components/cdc/tests/mod.rs @@ -134,7 +134,7 @@ fn create_event_feed( } pub struct TestSuiteBuilder { - cluster: Option>>, + cluster: Option>, memory_quota: Option, } @@ -147,10 +147,7 @@ impl TestSuiteBuilder { } #[must_use] - pub fn cluster( - mut self, - cluster: Cluster>, - ) -> TestSuiteBuilder { + pub fn cluster(mut self, cluster: Cluster) -> TestSuiteBuilder { self.cluster = Some(cluster); self } @@ -167,7 +164,7 @@ impl TestSuiteBuilder { pub fn build_with_cluster_runner(self, mut runner: F) -> TestSuite where - F: FnMut(&mut Cluster>), + F: FnMut(&mut Cluster), { init(); let memory_quota = self.memory_quota.unwrap_or(usize::MAX); @@ -201,7 +198,7 @@ impl TestSuiteBuilder { let scheduler = worker.scheduler(); let cdc_ob = cdc::CdcObserver::new(scheduler.clone()); obs.insert(id, cdc_ob.clone()); - sim.coprocessor_hooks.entry(id).or_default().push(Box::new( + sim.coprocessor_hosts.entry(id).or_default().push(Box::new( move |host: &mut CoprocessorHost| { cdc_ob.register_to(host); }, @@ -257,7 +254,7 @@ impl TestSuiteBuilder { } pub struct TestSuite { - pub cluster: Cluster>, + pub cluster: Cluster, pub endpoints: HashMap>, pub obs: HashMap, tikv_cli: HashMap, @@ -346,6 +343,51 @@ impl TestSuite { ); } + pub fn must_kv_flush( + &mut self, + region_id: u64, + muts: Vec, + pk: Vec, + ts: TimeStamp, + generation: u64, + ) { + self.must_kv_flush_with_source(region_id, muts, pk, ts, generation, 0); + } + + pub fn must_kv_flush_with_source( + &mut self, + region_id: u64, + muts: Vec, + pk: Vec, + ts: TimeStamp, + generation: u64, + txn_source: u64, + ) { + let mut flush_req = FlushRequest::default(); + let mut context = self.get_context(region_id); + context.set_txn_source(txn_source); + flush_req.set_context(context); + flush_req.set_mutations(muts.into_iter().collect()); + flush_req.primary_key = pk; + flush_req.start_ts = ts.into_inner(); + flush_req.generation = generation; + flush_req.lock_ttl = flush_req.start_ts + 1; + let flush_resp = self + .get_tikv_client(region_id) + .kv_flush(&flush_req) + .unwrap(); + assert!( + !flush_resp.has_region_error(), + "{:?}", + flush_resp.get_region_error() + ); + assert!( + flush_resp.errors.is_empty(), + "{:?}", + flush_resp.get_errors() + ); + } + pub fn must_kv_put(&mut self, region_id: u64, key: Vec, value: Vec) { let mut rawkv_req = RawPutRequest::default(); rawkv_req.set_context(self.get_context(region_id)); @@ -475,6 +517,26 @@ impl TestSuite { ); } + pub fn must_release_pessimistic_lock( + &mut self, + region_id: u64, + pk: Vec, + start_ts: TimeStamp, + for_update_ts: TimeStamp, + ) { + let mut req = PessimisticRollbackRequest::default(); + req.set_context(self.get_context(region_id)); + req.start_version = start_ts.into_inner(); + req.for_update_ts = for_update_ts.into_inner(); + req.set_keys(vec![pk].into_iter().collect()); + let resp = self + .get_tikv_client(region_id) + .kv_pessimistic_rollback(&req) + .unwrap(); + assert!(!resp.has_region_error(), "{:?}", resp.get_region_error()); + assert!(resp.errors.is_empty(), "{:?}", resp.get_errors()); + } + pub fn must_kv_pessimistic_prewrite( &mut self, region_id: u64, @@ -509,6 +571,27 @@ impl TestSuite { ); } + pub fn must_kv_txn_heartbeat( + &mut self, + region_id: u64, + pk: Vec, + ts: TimeStamp, + 
advise_lock_ttl: TimeStamp, + ) { + let mut heartbeat_req = TxnHeartBeatRequest::default(); + heartbeat_req.set_context(self.get_context(region_id)); + heartbeat_req.primary_lock = pk; + heartbeat_req.start_version = ts.into_inner(); + heartbeat_req.advise_lock_ttl = advise_lock_ttl.into_inner(); + let heartbeat_resp = self + .get_tikv_client(region_id) + .kv_txn_heart_beat(&heartbeat_req) + .unwrap(); + assert!(!heartbeat_resp.has_region_error()); + assert!(!heartbeat_resp.has_error()); + assert_eq!(heartbeat_resp.lock_ttl, advise_lock_ttl.into_inner()); + } + pub fn async_kv_commit( &mut self, region_id: u64, @@ -526,6 +609,23 @@ impl TestSuite { .unwrap() } + pub fn async_kv_txn_heartbeat( + &mut self, + region_id: u64, + pk: Vec, + ts: TimeStamp, + advise_lock_ttl: TimeStamp, + ) -> ClientUnaryReceiver { + let mut heartbeat_req = TxnHeartBeatRequest::default(); + heartbeat_req.set_context(self.get_context(region_id)); + heartbeat_req.primary_lock = pk; + heartbeat_req.start_version = ts.into_inner(); + heartbeat_req.advise_lock_ttl = advise_lock_ttl.into_inner(); + self.get_tikv_client(region_id) + .kv_txn_heart_beat_async(&heartbeat_req) + .unwrap() + } + pub fn get_context(&mut self, region_id: u64) -> Context { let epoch = self.cluster.get_region_epoch(region_id); let leader = self.cluster.leader_of_region(region_id).unwrap(); diff --git a/components/cloud/Cargo.toml b/components/cloud/Cargo.toml index 3a103679094..0036f67e0b8 100644 --- a/components/cloud/Cargo.toml +++ b/components/cloud/Cargo.toml @@ -9,16 +9,15 @@ license = "Apache-2.0" async-trait = "0.1" derive_more = "0.99.3" error_code = { workspace = true } +futures = "0.3" futures-io = "0.3" kvproto = { workspace = true } lazy_static = "1.3" -openssl = { workspace = true } prometheus = { version = "0.13", default-features = false, features = ["nightly"] } protobuf = { version = "2.8", features = ["bytes"] } -rusoto_core = "0.46.0" thiserror = "1.0" tikv_util = { workspace = true } url = "2.0" +uuid = { version = "0.8", features = ["v4"] } [dev-dependencies] -fail = "0.5" diff --git a/components/cloud/aws/Cargo.toml b/components/cloud/aws/Cargo.toml index 75cddac7cea..15c1fb11744 100644 --- a/components/cloud/aws/Cargo.toml +++ b/components/cloud/aws/Cargo.toml @@ -19,14 +19,8 @@ futures-util = { version = "0.3", default-features = false, features = ["io"] } # This is only a dependency to vendor openssl for rusoto. It's not clear exactly # how openssl is built for tikv, but it seems to be controlled by grpcio. This # makes `cargo test -p aws` link correctly. 
-grpcio = { workspace = true } -http = "0.2.0" -hyper = "0.14" -hyper-tls = "0.5" kvproto = { workspace = true } -lazy_static = "1.3" md5 = "0.7.0" -prometheus = { version = "0.13", default-features = false, features = ["nightly"] } rusoto_core = "0.46.0" rusoto_credential = "0.46.0" rusoto_kms = { version = "0.46.0", features = ["serialize_structs"] } diff --git a/components/cloud/aws/src/kms.rs b/components/cloud/aws/src/kms.rs index 87b4c48d568..560a27ed0b7 100644 --- a/components/cloud/aws/src/kms.rs +++ b/components/cloud/aws/src/kms.rs @@ -8,7 +8,7 @@ use cloud::{ kms::{Config, CryptographyType, DataKeyPair, EncryptedKey, KeyId, KmsProvider, PlainKey}, }; use rusoto_core::{request::DispatchSignedRequest, RusotoError}; -use rusoto_credential::ProvideAwsCredentials; +use rusoto_credential::{AwsCredentials, ProvideAwsCredentials, StaticProvider}; use rusoto_kms::{ DecryptError, DecryptRequest, GenerateDataKeyError, GenerateDataKeyRequest, Kms, KmsClient, }; @@ -62,17 +62,35 @@ impl AwsKms { }) } - fn new_with_dispatcher(config: Config, dispatcher: D) -> Result - where - D: DispatchSignedRequest + Send + Sync + 'static, - { - let credentials_provider = util::CredentialsProvider::new()?; - Self::new_with_creds_dispatcher(config, dispatcher, credentials_provider) - } - pub fn new(config: Config) -> Result { let dispatcher = util::new_http_client()?; - Self::new_with_dispatcher(config, dispatcher) + match config.aws.as_ref() { + Some(aws_config) => { + if let (Some(access_key), Some(secret_access_key)) = ( + aws_config.access_key.clone(), + aws_config.secret_access_key.clone(), + ) { + // Use provided AWS credentials + let credentials = AwsCredentials::new( + access_key, + secret_access_key, + None, // session token + None, // expiration + ); + let static_provider = StaticProvider::from(credentials); + Self::new_with_creds_dispatcher(config, dispatcher, static_provider) + } else { + // Fall back to default credentials provider + let provider = util::CredentialsProvider::new()?; + Self::new_with_creds_dispatcher(config, dispatcher, provider) + } + } + None => { + // No AWS config provided, use default credentials provider + let provider = util::CredentialsProvider::new()?; + Self::new_with_creds_dispatcher(config, dispatcher, provider) + } + } } } @@ -226,6 +244,7 @@ mod tests { }, azure: None, gcp: None, + aws: None, }; let dispatcher = @@ -271,6 +290,7 @@ mod tests { }, azure: None, gcp: None, + aws: None, }; // IncorrectKeyException diff --git a/components/cloud/aws/src/s3.rs b/components/cloud/aws/src/s3.rs index 71c890a61c3..1211e67ad6a 100644 --- a/components/cloud/aws/src/s3.rs +++ b/components/cloud/aws/src/s3.rs @@ -2,17 +2,22 @@ use std::{ error::Error as StdError, io, + pin::Pin, time::{Duration, SystemTime}, }; use async_trait::async_trait; use cloud::{ - blob::{none_to_empty, BlobConfig, BlobStorage, BucketConf, PutResource, StringNonEmpty}, + blob::{ + none_to_empty, BlobConfig, BlobObject, BlobStorage, BucketConf, DeletableStorage, + IterableStorage, PutResource, StringNonEmpty, + }, metrics::CLOUD_REQUEST_HISTOGRAM_VEC, }; use fail::fail_point; +use futures::stream::{self, Stream}; use futures_util::{ - future::FutureExt, + future::{FutureExt, LocalBoxFuture}, io::{AsyncRead, AsyncReadExt}, stream::TryStreamExt, }; @@ -29,6 +34,7 @@ use crate::util::{self, retry_and_count}; const CONNECTION_TIMEOUT: Duration = Duration::from_secs(900); pub const STORAGE_VENDOR_NAME_AWS: &str = "aws"; +const DEFAULT_SEP: char = '/'; #[derive(Clone)] pub struct AccessKeyPair { @@ -236,11 
+242,22 @@ impl S3Storage { fn maybe_prefix_key(&self, key: &str) -> String { if let Some(prefix) = &self.config.bucket.prefix { - return format!("{}/{}", *prefix, key); + return format!("{}{}{}", *prefix, DEFAULT_SEP, key); } key.to_owned() } + fn strip_prefix_if_needed(&self, key: String) -> String { + if let Some(prefix) = &self.config.bucket.prefix { + if key.starts_with(prefix.as_str()) { + return key[prefix.len()..] + .trim_start_matches(DEFAULT_SEP) + .to_owned(); + } + } + key + } + fn get_range(&self, name: &str, range: Option) -> cloud::blob::BlobStream<'_> { let key = self.maybe_prefix_key(name); let bucket = self.config.bucket.bucket.clone(); @@ -595,7 +612,7 @@ impl BlobStorage for S3Storage { async fn put( &self, name: &str, - mut reader: PutResource, + mut reader: PutResource<'_>, content_length: u64, ) -> io::Result<()> { let key = self.maybe_prefix_key(name); @@ -626,6 +643,103 @@ impl BlobStorage for S3Storage { } } +struct S3PrefixIter<'cli> { + cli: &'cli S3Storage, + finished: bool, + cont_token: Option, + prefix: String, +} + +impl<'cli> S3PrefixIter<'cli> { + async fn next_page(&mut self) -> io::Result>> { + if self.finished { + return Ok(None); + } + let mut input = ListObjectsV2Request::default(); + input.bucket = String::clone(&self.cli.config.bucket.bucket); + input.prefix = Some(self.cli.maybe_prefix_key(&self.prefix)); + input.continuation_token = self.cont_token.clone(); + let now = Instant::now(); + let res = retry_and_count( + || self.cli.client.list_objects_v2(input.clone()), + "get_one_page", + ) + .await + .map_err(|err| io::Error::new(io::ErrorKind::Other, err))?; + CLOUD_REQUEST_HISTOGRAM_VEC + .with_label_values(&["s3", "list_objects_v2"]) + .observe(now.saturating_elapsed().as_secs_f64()); + + self.finished = !res.is_truncated.ok_or_else(|| { + io::Error::new(io::ErrorKind::InvalidData, "no IsTruncated in response") + })? 
|| res.next_continuation_token.is_none(); + self.cont_token = res.next_continuation_token; + let data = res + .contents + .unwrap_or_default() + .into_iter() + .map(|data| BlobObject { + key: self + .cli + .strip_prefix_if_needed(data.key.unwrap_or_default()), + }) + .collect::>(); + Ok(Some(data)) + } +} + +impl DeletableStorage for S3Storage { + fn delete(&self, name: &str) -> LocalBoxFuture<'_, io::Result<()>> { + let key = self.maybe_prefix_key(name); + async move { + let now = Instant::now(); + let res = retry_and_count( + || { + self.client.delete_object(DeleteObjectRequest { + bucket: self.config.bucket.bucket.to_string(), + key: key.clone(), + ..Default::default() + }) + }, + "delete_object", + ) + .await; + CLOUD_REQUEST_HISTOGRAM_VEC + .with_label_values(&["s3", "delete_object"]) + .observe(now.saturating_elapsed().as_secs_f64()); + match res { + Ok(_) => Ok(()), + Err(e) => Err(io::Error::new( + io::ErrorKind::Other, + format!("failed to delete object {}", e), + )), + } + } + .boxed_local() + } +} + +impl IterableStorage for S3Storage { + fn iter_prefix( + &self, + prefix: &str, + ) -> Pin> + '_>> { + let walker = S3PrefixIter { + cli: self, + finished: false, + cont_token: None, + prefix: prefix.to_owned(), + }; + let s = stream::try_unfold(walker, |mut w| async move { + let res = w.next_page().await?; + io::Result::Ok(res.map(|v| (v, w))) + }) + .map_ok(|data| stream::iter(data.into_iter().map(Ok))) + .try_flatten(); + Box::pin(s) + } +} + #[cfg(test)] mod tests { use std::assert_matches::assert_matches; @@ -636,6 +750,128 @@ mod tests { use super::*; + fn make_list_bucket_result( + name: &str, + pfx: &str, + next_cont_token: Option<&str>, + is_truncated: bool, + max_keys: u64, + items: impl IntoIterator, + ) -> MockRequestDispatcher { + let items = items.into_iter().collect::>(); + let mut s = format!( + r#" + + + {} + {} + {} + {} + {} + {}"#, + name, + pfx, + next_cont_token.unwrap_or(""), + items.len(), + max_keys, + is_truncated + ); + for item in items { + s.push_str(&format!( + r#" + + {} + STANDARD + "#, + item + )); + } + s.push_str("\n"); + MockRequestDispatcher::with_status(200).with_body(&s) + } + + #[tokio::test] + async fn test_list_objects() { + const BUCKET: &str = "breeze"; + const PREFIX: &str = "/my/great/prefix"; + + let bucket_name = StringNonEmpty::required(BUCKET.to_string()).unwrap(); + let bucket = BucketConf::default(bucket_name); + let mut config = Config::default(bucket); + let multi_part_size = 2; + // multi_part_size doesn't affect listing; any small value works here. + config.multi_part_size = multi_part_size; + + let check_cont_tok = |cont: Option| { + move |r: &SignedRequest| { + assert_eq!( + r.params.get("continuation-token").and_then(|v| v.as_ref()), + cont.as_ref() + ); + } + }; + + let files = |pfx, max| { + let mut i = 0; + std::iter::repeat_with(move || { + i += 1; + format!("{}-{}", pfx, i) + }) + .take(max) + }; + + // Mock three pages of list results; the final 400 dispatcher fails the + // test if the walk doesn't stop after the last page. + let dispatcher = MultipleMockRequestDispatcher::new(vec![ + make_list_bucket_result(BUCKET, PREFIX, Some("foo"), true, 16, files("foo", 16)) + .with_request_checker(check_cont_tok(None)), + make_list_bucket_result(BUCKET, PREFIX, Some("bar"), true, 16, files("bar", 16)) + .with_request_checker(check_cont_tok(Some("foo".to_owned()))), + make_list_bucket_result(BUCKET, PREFIX, None, false, 16, files("quux", 8)) + .with_request_checker(check_cont_tok(Some("bar".to_owned()))), + MockRequestDispatcher::with_status(400).with_request_checker(|req| { 
+ panic!("Walk haven't stopped. The last request is {:?}", req) + }), + ]); + + let credentials_provider = StaticProvider::new_minimal(String::new(), String::new()); + let s = S3Storage::new_creds_dispatcher(config, dispatcher, credentials_provider).unwrap(); + assert_eq!( + s.iter_prefix(PREFIX) + .map_ok(|v| v.key) + .try_collect::>() + .await + .unwrap(), + files("foo", 16) + .chain(files("bar", 16)) + .chain(files("quux", 8)) + .collect::>() + ); + } + + #[test] + #[ignore] + fn test_somewhat() { + let mut bucket = BucketConf::default(StringNonEmpty::opt("astro".to_owned()).unwrap()); + bucket.endpoint = StringNonEmpty::opt("http://10.2.7.193:9000".to_owned()); + let s3 = Config::default(bucket); + let s3 = Config { + access_key_pair: Some(AccessKeyPair { + access_key: StringNonEmpty::opt("minioadmin".to_owned()).unwrap(), + secret_access_key: StringNonEmpty::opt("minioadmin".to_owned()).unwrap(), + session_token: None, + }), + force_path_style: true, + ..s3 + }; + + let storage = S3Storage::new(s3).unwrap(); + let s = storage.iter_prefix("tpcc-1000-incr-with-crc64/v1/backupmeta"); + let items = block_on_external_io(TryStreamExt::try_collect::>(s)); + println!("{:?}", items); + println!("{}", items.unwrap().len()); + } + #[test] fn test_s3_get_content_md5() { // base64 encode md5sum "helloworld" diff --git a/components/cloud/aws/src/util.rs b/components/cloud/aws/src/util.rs index a2dc1ca8c76..6ee27bb0c42 100644 --- a/components/cloud/aws/src/util.rs +++ b/components/cloud/aws/src/util.rs @@ -87,7 +87,7 @@ where retry_ext( action, RetryExt::default().with_fail_hook(move |err: &E| { - warn!("aws request meet error."; "err" => %err, "retry?" => %err.is_retryable(), "context" => %name, "uuid" => %id); + warn!("aws request fails"; "err" => %err, "retry?" => %err.is_retryable(), "context" => %name, "uuid" => %id); metrics::CLOUD_ERROR_VEC.with_label_values(&["aws", name]).inc(); }), ).await diff --git a/components/cloud/azure/Cargo.toml b/components/cloud/azure/Cargo.toml index 41a7a2821e4..a9aaf2244c6 100644 --- a/components/cloud/azure/Cargo.toml +++ b/components/cloud/azure/Cargo.toml @@ -5,9 +5,6 @@ edition = "2021" publish = false license = "Apache-2.0" -[features] -failpoints = ["fail/failpoints"] - [dependencies] async-trait = "0.1" # TODO: The azure sdk with the newest version needs the rustc v1.70, but current version of rustc in TiKV is v1.67. 
@@ -19,7 +16,6 @@ azure_storage = { git = "https://github.com/tikv/azure-sdk-for-rust", branch = " azure_storage_blobs = { git = "https://github.com/tikv/azure-sdk-for-rust", branch = "release-7.5-fips" } base64 = "0.13" cloud = { workspace = true } -fail = "0.5" futures = "0.3" futures-util = { version = "0.3", default-features = false, features = ["io"] } kvproto = { workspace = true } diff --git a/components/cloud/azure/src/azblob.rs b/components/cloud/azure/src/azblob.rs index 662c5643584..bdf30ae3490 100644 --- a/components/cloud/azure/src/azblob.rs +++ b/components/cloud/azure/src/azblob.rs @@ -15,10 +15,12 @@ use azure_identity::{ClientSecretCredential, TokenCredentialOptions}; use azure_storage::{prelude::*, ConnectionString, ConnectionStringBuilder}; use azure_storage_blobs::{blob::operations::PutBlockBlobBuilder, prelude::*}; use cloud::blob::{ - none_to_empty, BlobConfig, BlobStorage, BucketConf, PutResource, StringNonEmpty, + none_to_empty, unimplemented, BlobConfig, BlobObject, BlobStorage, BucketConf, + DeletableStorage, IterableStorage, PutResource, StringNonEmpty, }; use futures::TryFutureExt; use futures_util::{ + future::FutureExt, io::{AsyncRead, AsyncReadExt}, stream, stream::StreamExt, @@ -725,7 +727,7 @@ impl BlobStorage for AzureStorage { async fn put( &self, name: &str, - mut reader: PutResource, + mut reader: PutResource<'_>, content_length: u64, ) -> io::Result<()> { let name = self.maybe_prefix_key(name); @@ -745,6 +747,23 @@ impl BlobStorage for AzureStorage { } } +impl IterableStorage for AzureStorage { + fn iter_prefix( + &self, + _prefix: &str, + ) -> std::pin::Pin< + Box> + '_>, + > { + Box::pin(futures::future::err(unimplemented()).into_stream()) + } +} + +impl DeletableStorage for AzureStorage { + fn delete(&self, _name: &str) -> futures::prelude::future::LocalBoxFuture<'_, io::Result<()>> { + Box::pin(futures::future::err(unimplemented())) + } +} + #[cfg(test)] mod tests { use super::*; diff --git a/components/cloud/azure/src/kms.rs b/components/cloud/azure/src/kms.rs index f1afd021c1f..a7d72c2afa4 100644 --- a/components/cloud/azure/src/kms.rs +++ b/components/cloud/azure/src/kms.rs @@ -282,6 +282,7 @@ mod tests { }, azure: Some(err_azure_cfg.clone()), gcp: None, + aws: None, }; AzureKms::new(err_config.clone()).unwrap_err(); let azure_cfg = SubConfigAzure { @@ -326,6 +327,7 @@ mod tests { }, azure: Some(azure_cfg), gcp: None, + aws: None, }; if config.vendor != STORAGE_VENDOR_NAME_AZURE { AzureKms::new(config).unwrap(); diff --git a/components/cloud/gcp/Cargo.toml b/components/cloud/gcp/Cargo.toml index f6c774fee7e..f2829c0b573 100644 --- a/components/cloud/gcp/Cargo.toml +++ b/components/cloud/gcp/Cargo.toml @@ -10,7 +10,6 @@ async-trait = "0.1" base64 = "0.13.0" cloud = { workspace = true } crc32c = "0.6" -crypto = { workspace = true } futures-util = { version = "0.3", default-features = false, features = ["io"] } http = "0.2.0" hyper = "0.14" diff --git a/components/cloud/gcp/src/gcs.rs b/components/cloud/gcp/src/gcs.rs index bee9714e03d..60727a6a3de 100644 --- a/components/cloud/gcp/src/gcs.rs +++ b/components/cloud/gcp/src/gcs.rs @@ -3,20 +3,23 @@ use std::{fmt::Display, io}; use async_trait::async_trait; use cloud::{ - blob::{none_to_empty, BlobConfig, BlobStorage, BucketConf, PutResource, StringNonEmpty}, + blob::{ + none_to_empty, BlobConfig, BlobObject, BlobStorage, BucketConf, DeletableStorage, + IterableStorage, PutResource, StringNonEmpty, + }, metrics, }; use futures_util::{ - future::TryFutureExt, + future::{FutureExt, LocalBoxFuture, 
TryFutureExt}, io::{self as async_io, AsyncRead, Cursor}, - stream::{StreamExt, TryStreamExt}, + stream::{self, Stream, StreamExt, TryStreamExt}, }; use http::HeaderValue; use hyper::{Body, Request, Response}; pub use kvproto::brpb::Gcs as InputConfig; use tame_gcs::{ common::{PredefinedAcl, StorageClass}, - objects::{InsertObjectOptional, Metadata, Object}, + objects::{InsertObjectOptional, ListOptional, ListResponse, Metadata, Object}, types::{BucketName, ObjectId}, }; use tame_oauth::gcp::ServiceAccountInfo; @@ -27,9 +30,10 @@ use tikv_util::{ use crate::{ client::{status_code_error, GcpClient, RequestError}, - utils::retry, + utils::{self, retry}, }; +const DEFAULT_SEP: char = '/'; const GOOGLE_APIS: &str = "https://www.googleapis.com"; const HARDCODED_ENDPOINTS_SUFFIX: &[&str] = &["upload/storage/v1/", "storage/v1/"]; @@ -112,7 +116,7 @@ pub struct GcsStorage { client: GcpClient, } -trait ResultExt { +pub trait ResultExt { type Ok; // Maps the error of this result as an `std::io::Error` with `Other` error @@ -134,6 +138,36 @@ impl ResultExt for Result { } } +impl DeletableStorage for GcsStorage { + fn delete(&self, name: &str) -> LocalBoxFuture<'_, io::Result<()>> { + let name = name.to_owned(); + async move { + let key = self.maybe_prefix_key(&name); + let oid = ObjectId::new(self.config.bucket.bucket.to_string(), key) + .or_invalid_input(format_args!("invalid object id"))?; + let now = Instant::now(); + retry( + || async { + let req = Object::delete(&oid, None).map_err(RequestError::Gcs)?; + self.make_request( + req.map(|_: io::Empty| Body::empty()), + tame_gcs::Scopes::ReadWrite, + ) + .await + }, + "delete", + ) + .await?; + metrics::CLOUD_REQUEST_HISTOGRAM_VEC + .with_label_values(&["gcp", "delete"]) + .observe(now.saturating_elapsed_secs()); + + Ok(()) + } + .boxed_local() + } +} + impl GcsStorage { pub fn from_input(input: InputConfig) -> io::Result { Self::new(Config::from_input(input)?) @@ -147,7 +181,7 @@ impl GcsStorage { fn maybe_prefix_key(&self, key: &str) -> String { if let Some(prefix) = &self.config.bucket.prefix { - return format!("{}/{}", prefix, key); + return format!("{}{}{}", prefix, DEFAULT_SEP, key); } key.to_owned() } @@ -170,6 +204,17 @@ impl GcsStorage { self.client.make_request(req, scope).await } + fn strip_prefix_if_needed(&self, key: String) -> String { + if let Some(prefix) = &self.config.bucket.prefix { + if key.starts_with(prefix.as_str()) { + return key[prefix.len()..] + .trim_start_matches(DEFAULT_SEP) + .to_owned(); + } + } + key + } + fn error_to_async_read(kind: io::ErrorKind, e: E) -> cloud::blob::BlobStream<'static> where E: Into>, @@ -202,7 +247,9 @@ impl GcsStorage { if response.status().is_success() { Ok(response.into_body().map_err(|e| { io::Error::new( - io::ErrorKind::Other, + // Given that the status is success, if the content stream has been cut off, + // the network must have become unavailable, which should generally be retryable. 
+ io::ErrorKind::Interrupted, format!("download from GCS error: {}", e), ) })) } else { @@ -279,7 +326,12 @@ impl BlobStorage for GcsStorage { Box::new(self.config.clone()) as Box } - async fn put(&self, name: &str, reader: PutResource, content_length: u64) -> io::Result<()> { + async fn put( + &self, + name: &str, + reader: PutResource<'_>, + content_length: u64, + ) -> io::Result<()> { if content_length == 0 { // It is probably better to just write the empty file // However, currently going forward results in a body write aborted error @@ -345,6 +397,82 @@ impl BlobStorage for GcsStorage { } } +struct GcsPrefixIter<'cli> { + cli: &'cli GcsStorage, + page_token: Option, + prefix: String, + finished: bool, +} + +impl<'cli> GcsPrefixIter<'cli> { + async fn one_page(&mut self) -> io::Result>> { + if self.finished { + return Ok(None); + } + + let mut opt = ListOptional::default(); + let bucket = + BucketName::try_from(self.cli.config.bucket.bucket.to_string()).or_invalid_input( + format_args!("invalid bucket {}", self.cli.config.bucket.bucket), + )?; + let prefix = self.cli.maybe_prefix_key(&self.prefix); + opt.prefix = Some(&prefix); + opt.page_token = self.page_token.as_deref(); + let now = Instant::now(); + let req = Object::list(&bucket, Some(opt)).or_io_error(format_args!( + "failed to list with prefix {} page_token {:?}", + self.prefix, self.page_token + ))?; + let res = self + .cli + .make_request(req.map(|_e| Body::empty()), tame_gcs::Scopes::ReadOnly) + .await + .map_err(|err| io::Error::new(io::ErrorKind::Other, err))?; + let resp = utils::read_from_http_body::(res).await?; + metrics::CLOUD_REQUEST_HISTOGRAM_VEC + .with_label_values(&["gcp", "list"]) + .observe(now.saturating_elapsed_secs()); + + debug!("requesting paging GCP"; "prefix" => %self.prefix, "page_token" => self.page_token.as_deref(), + "response_size" => resp.objects.len(), "new_page_token" => resp.page_token.as_deref()); + // GCP returns an empty page token when returning the last page... + // We need to break there or we will enter an infinite loop... 
+ if resp.page_token.is_none() { + self.finished = true; + } + self.page_token = resp.page_token; + let items = resp + .objects + .into_iter() + .map(|v| BlobObject { + key: self.cli.strip_prefix_if_needed(v.name.unwrap_or_default()), + }) + .collect::>(); + Ok(Some(items)) + } +} + +impl IterableStorage for GcsStorage { + fn iter_prefix( + &self, + prefix: &str, + ) -> std::pin::Pin> + '_>> { + let walker = GcsPrefixIter { + cli: self, + page_token: None, + prefix: prefix.to_owned(), + finished: false, + }; + let s = stream::try_unfold(walker, |mut w| async move { + let res = w.one_page().await?; + io::Result::Ok(res.map(|v| (v, w))) + }) + .map_ok(|data| stream::iter(data.into_iter().map(Ok))) + .try_flatten(); + Box::pin(s) + } +} + #[cfg(test)] mod tests { extern crate test; diff --git a/components/cloud/gcp/src/kms.rs b/components/cloud/gcp/src/kms.rs index ec1c689adcd..3d4c84d06a1 100644 --- a/components/cloud/gcp/src/kms.rs +++ b/components/cloud/gcp/src/kms.rs @@ -358,6 +358,7 @@ mod tests { gcp: Some(SubConfigGcp { credential_file_path: None, }), + aws: None, }; _ = GcpKms::new(cfg).unwrap_err(); @@ -378,6 +379,7 @@ mod tests { gcp: Some(SubConfigGcp { credential_file_path: None, }), + aws: None, }; let res = GcpKms::new(cfg).unwrap(); diff --git a/components/cloud/gcp/src/lib.rs b/components/cloud/gcp/src/lib.rs index 4d81dd7189e..807f2456b6c 100644 --- a/components/cloud/gcp/src/lib.rs +++ b/components/cloud/gcp/src/lib.rs @@ -14,9 +14,11 @@ pub use kms::GcpKms; pub const STORAGE_VENDOR_NAME_GCP: &str = "gcp"; pub mod utils { - use std::future::Future; + use std::{future::Future, io}; use cloud::metrics; + use hyper::{body::Bytes, Body}; + use tame_gcs::ApiResponse; use tikv_util::stream::{retry_ext, RetryError, RetryExt}; pub async fn retry(action: G, name: &'static str) -> Result where @@ -29,4 +31,17 @@ pub mod utils { metrics::CLOUD_ERROR_VEC.with_label_values(&["gcp", name]).inc(); })).await } + + pub async fn read_from_http_body>( + b: http::Response, + ) -> io::Result { + use crate::gcs::ResultExt; + let (headers, body) = b.into_parts(); + let bytes = hyper::body::to_bytes(body).await.or_io_error(format_args!( + "cannot read bytes from http response {:?}", + headers + ))?; + let cached_resp = http::Response::from_parts(headers, bytes); + M::try_from_parts(cached_resp).or_invalid_input(format_args!("invalid response format")) + } } diff --git a/components/cloud/src/blob.rs b/components/cloud/src/blob.rs index a0b5c26953c..c4b369fbe93 100644 --- a/components/cloud/src/blob.rs +++ b/components/cloud/src/blob.rs @@ -1,8 +1,9 @@ // Copyright 2021 TiKV Project Authors. Licensed under Apache-2.0. -use std::{io, marker::Unpin, pin::Pin, task::Poll}; +use std::{fmt::Display, io, marker::Unpin, panic::Location, pin::Pin, task::Poll}; use async_trait::async_trait; +use futures::{future::LocalBoxFuture, stream::Stream}; use futures_io::AsyncRead; pub trait BlobConfig: 'static + Send + Sync { @@ -16,11 +17,11 @@ pub trait BlobConfig: 'static + Send + Sync { /// /// See the documentation of [external_storage::UnpinReader] for why those /// wrappers exists. 
-pub struct PutResource(pub Box); +pub struct PutResource<'a>(pub Box); pub type BlobStream<'a> = Box; -impl AsyncRead for PutResource { +impl<'a> AsyncRead for PutResource<'a> { fn poll_read( self: Pin<&mut Self>, cx: &mut std::task::Context<'_>, @@ -30,8 +31,8 @@ impl AsyncRead for PutResource { } } -impl From> for PutResource { - fn from(s: Box) -> Self { +impl<'a> From> for PutResource<'a> { + fn from(s: Box) -> Self { Self(s) } } @@ -43,7 +44,8 @@ pub trait BlobStorage: 'static + Send + Sync { fn config(&self) -> Box; /// Write all contents of the read to the given path. - async fn put(&self, name: &str, reader: PutResource, content_length: u64) -> io::Result<()>; + async fn put(&self, name: &str, reader: PutResource<'_>, content_length: u64) + -> io::Result<()>; /// Read all contents of the given path. fn get(&self, name: &str) -> BlobStream<'_>; @@ -52,6 +54,42 @@ pub trait BlobStorage: 'static + Send + Sync { fn get_part(&self, name: &str, off: u64, len: u64) -> BlobStream<'_>; } +pub trait DeletableStorage { + fn delete(&self, name: &str) -> LocalBoxFuture<'_, io::Result<()>>; +} + +#[track_caller] +pub fn unimplemented() -> io::Error { + io::Error::new( + io::ErrorKind::Unsupported, + format!( + "this method isn't supported, see details at {:?}", + Location::caller() + ), + ) +} + +#[derive(Debug)] +pub struct BlobObject { + pub key: String, +} + +impl Display for BlobObject { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "{}", self.key) + } +} + +/// A storage whose contents can be enumerated by prefix. +pub trait IterableStorage { + /// Walk the prefix of the blob storage. + /// It returns a stream of items. + fn iter_prefix( + &self, + prefix: &str, + ) -> Pin> + '_>>; +} + impl BlobConfig for dyn BlobStorage { fn name(&self) -> &'static str { self.config().name() @@ -68,7 +106,12 @@ impl BlobStorage for Box { (**self).config() } - async fn put(&self, name: &str, reader: PutResource, content_length: u64) -> io::Result<()> { + async fn put( + &self, + name: &str, + reader: PutResource<'_>, + content_length: u64, + ) -> io::Result<()> { let fut = (**self).put(name, reader, content_length); fut.await } diff --git a/components/cloud/src/kms.rs b/components/cloud/src/kms.rs index c5d41c96ea0..e87220ab867 100644 --- a/components/cloud/src/kms.rs +++ b/components/cloud/src/kms.rs @@ -41,6 +41,12 @@ pub struct SubConfigGcp { pub credential_file_path: Option, } +#[derive(Debug, Default, Clone)] +pub struct SubConfigAws { + pub access_key: Option, + pub secret_access_key: Option, +} + #[derive(Debug, Clone)] pub struct Config { pub key_id: KeyId, @@ -48,6 +54,7 @@ pub struct Config { pub vendor: String, pub azure: Option, pub gcp: Option, + pub aws: Option, } impl Config { @@ -61,6 +68,7 @@ impl Config { vendor: mk.vendor, azure: None, gcp: None, + aws: None, }) } diff --git a/components/crossbeam-skiplist/Cargo.toml b/components/crossbeam-skiplist/Cargo.toml index b3a4af33663..e75373677a8 100644 --- a/components/crossbeam-skiplist/Cargo.toml +++ b/components/crossbeam-skiplist/Cargo.toml @@ -30,11 +30,11 @@ alloc = ["crossbeam-epoch/alloc"] [dependencies] crossbeam-epoch = { workspace = true } +# Add the official version of `crossbeam-skiplist` to the dependency tree, so that +# our CI can keep track of upstream critical bug-fixes and security updates. 
+crossbeam-skiplist-offical = { version = "0.1.3", package = "crossbeam-skiplist" } crossbeam-utils = { workspace = true } -[dev-dependencies] -rand = "0.8" - # Rename test and example binary names to pass TiKV jemalloc check. [[test]] name = "crossbeam_skiplist_base" diff --git a/components/crossbeam-skiplist/src/lib.rs b/components/crossbeam-skiplist/src/lib.rs index 8320c9a1a68..222b735024a 100644 --- a/components/crossbeam-skiplist/src/lib.rs +++ b/components/crossbeam-skiplist/src/lib.rs @@ -257,3 +257,9 @@ pub mod set; #[cfg(feature = "std")] #[doc(inline)] pub use crate::{map::SkipMap, set::SkipSet}; + +// Prevent cargo machete warnings. +mod seal { + pub trait Sealed {} + impl Sealed for crossbeam_skiplist_offical::SkipList<(), ()> {} +} diff --git a/components/crypto/Cargo.toml b/components/crypto/Cargo.toml index 924e8e89e20..1f233510e0e 100644 --- a/components/crypto/Cargo.toml +++ b/components/crypto/Cargo.toml @@ -7,7 +7,6 @@ license = "Apache-2.0" [dependencies] openssl = { workspace = true } -openssl-sys = { workspace = true } slog = { workspace = true } # better to not use slog-global, but pass in the logger slog-global = { workspace = true } diff --git a/components/encryption/Cargo.toml b/components/encryption/Cargo.toml index 73a4f61bb95..cd63bf29154 100644 --- a/components/encryption/Cargo.toml +++ b/components/encryption/Cargo.toml @@ -10,16 +10,15 @@ failpoints = ["fail/failpoints"] # openssl/vendored is necssary in order to conditionally building SM4 encryption # support, as SM4 is disabled on various openssl distributions, such as Rocky Linux 9. sm4 = ["openssl/vendored"] +testexport = [] [dependencies] async-trait = "0.1" byteorder = "1.2" -bytes = "1.0" cloud = { workspace = true } crc32fast = "1.2" crossbeam = { workspace = true } crypto = { workspace = true } -derive_more = "0.99.3" error_code = { workspace = true } fail = "0.5" file_system = { workspace = true } @@ -32,6 +31,7 @@ online_config = { workspace = true } openssl = { workspace = true } prometheus = { version = "0.13", features = ["nightly"] } protobuf = { version = "2.8", features = ["bytes"] } +rand = "0.8.0" # For simplicity and compliance with FIPS 140 requirements for random number # generation, do not use the 'rand' crate in encryption-related code. 
# rand = "*" @@ -40,8 +40,8 @@ serde_derive = "1.0" slog = { workspace = true } # better to not use slog-global, but pass in the logger slog-global = { workspace = true } +tempfile = "3.1" thiserror = "1.0" -tikv_alloc = { workspace = true } tikv_util = { workspace = true } tokio = { version = "1.5", features = ["time", "rt"] } walkdir = "2" diff --git a/components/encryption/export/Cargo.toml b/components/encryption/export/Cargo.toml index 90323356c79..14fe508621b 100644 --- a/components/encryption/export/Cargo.toml +++ b/components/encryption/export/Cargo.toml @@ -9,17 +9,13 @@ license = "Apache-2.0" sm4 = ["encryption/sm4"] [dependencies] -async-trait = "0.1" aws = { workspace = true } azure = { workspace = true } cloud = { workspace = true } -derive_more = "0.99.3" encryption = { workspace = true } -error_code = { workspace = true } file_system = { workspace = true } gcp = { workspace = true } kvproto = { workspace = true } -openssl = { workspace = true } protobuf = { version = "2.8", features = ["bytes"] } slog = { workspace = true } # better to not use slog-global, but pass in the logger diff --git a/components/encryption/export/examples/ecli.rs b/components/encryption/export/examples/ecli.rs index 140b69c06aa..106b22ded75 100644 --- a/components/encryption/export/examples/ecli.rs +++ b/components/encryption/export/examples/ecli.rs @@ -4,7 +4,7 @@ use std::io::{Read, Write}; use azure::STORAGE_VENDOR_NAME_AZURE; pub use cloud::kms::Config as CloudConfig; -use encryption::GcpConfig; +use encryption::{GcpConfig, KmsBackend}; use encryption_export::{create_cloud_backend, AzureConfig, Backend, Error, KmsConfig, Result}; use file_system::{File, OpenOptions}; use gcp::STORAGE_VENDOR_NAME_GCP; @@ -99,7 +99,7 @@ struct SubCommandGcp { fn create_aws_backend( cmd: &SubCommandAws, credential_file: Option<&String>, -) -> Result> { +) -> Result> { let mut config = KmsConfig::default(); if let Some(credential_file) = credential_file { @@ -122,7 +122,7 @@ fn create_aws_backend( fn create_azure_backend( cmd: &SubCommandAzure, credential_file: Option<&String>, -) -> Result> { +) -> Result> { let mut config = KmsConfig::default(); config.vendor = STORAGE_VENDOR_NAME_AZURE.to_owned(); @@ -146,7 +146,7 @@ fn create_azure_backend( fn create_gcp_backend( cmd: &SubCommandGcp, credential_file: Option<&String>, -) -> Result> { +) -> Result> { let mut config = KmsConfig::default(); config.gcp = Some(GcpConfig { credential_file_path: credential_file diff --git a/components/encryption/export/src/lib.rs b/components/encryption/export/src/lib.rs index 365714063e6..1b7d23d5042 100644 --- a/components/encryption/export/src/lib.rs +++ b/components/encryption/export/src/lib.rs @@ -3,11 +3,10 @@ use std::path::Path; use aws::{AwsKms, STORAGE_VENDOR_NAME_AWS}; use azure::{AzureKms, STORAGE_VENDOR_NAME_AZURE}; -use cloud::kms::Config as CloudConfig; pub use encryption::{ - clean_up_dir, clean_up_trash, trash_dir_all, AzureConfig, Backend, DataKeyImporter, - DataKeyManager, DataKeyManagerArgs, DecrypterReader, EncryptionConfig, Error, FileConfig, Iv, - KmsBackend, KmsConfig, MasterKeyConfig, Result, + clean_up_dir, clean_up_trash, trash_dir_all, AsyncBackend, AzureConfig, Backend, + DataKeyImporter, DataKeyManager, DataKeyManagerArgs, DecrypterReader, EncryptionConfig, Error, + FileConfig, Iv, KmsBackend, KmsConfig, MasterKeyConfig, Result, }; use encryption::{cloud_convert_error, FileBackend, PlaintextBackend}; use gcp::{GcpKms, STORAGE_VENDOR_NAME_GCP}; @@ -27,6 +26,15 @@ pub fn data_key_manager_from_config( 
DataKeyManager::new(master_key, previous_master_key, args) } +pub fn create_async_backend(config: &MasterKeyConfig) -> Result> { + let result = create_async_backend_inner(config); + if let Err(e) = result { + error!("failed to access master key, {}", e); + return Err(e); + }; + result +} + pub fn create_backend(config: &MasterKeyConfig) -> Result> { let result = create_backend_inner(config); if let Err(e) = result { @@ -36,41 +44,41 @@ pub fn create_backend(config: &MasterKeyConfig) -> Result> { result } -pub fn create_cloud_backend(config: &KmsConfig) -> Result> { - info!("Encryption init aws backend"; +pub fn create_cloud_backend(config: &KmsConfig) -> Result> { + info!("Encryption init KMS backend"; "region" => &config.region, "endpoint" => &config.endpoint, "key_id" => &config.key_id, "vendor" => &config.vendor, ); + let cloud_config = config.to_cloud_config()?; match config.vendor.as_str() { STORAGE_VENDOR_NAME_AWS | "" => { - let conf = CloudConfig::from_proto(config.clone().into_proto()) - .map_err(cloud_convert_error("aws from proto".to_owned()))?; - let kms_provider = - Box::new(AwsKms::new(conf).map_err(cloud_convert_error("new AWS KMS".to_owned()))?); - Ok(Box::new(KmsBackend::new(kms_provider)?) as Box) + let kms_provider = Box::new( + AwsKms::new(cloud_config).map_err(cloud_convert_error("new AWS KMS".to_owned()))?, + ); + Ok(Box::new(KmsBackend::new(kms_provider)?)) } STORAGE_VENDOR_NAME_AZURE => { - if config.azure.is_none() { + // sanity check + if cloud_config.azure.is_none() { return Err(Error::Other(box_err!( "invalid configurations for Azure KMS" ))); } - let (mk, azure_kms_cfg) = config.clone().convert_to_azure_kms_config(); - let conf = CloudConfig::from_azure_kms_config(mk, azure_kms_cfg) - .map_err(cloud_convert_error("azure from proto".to_owned()))?; - let keyvault_provider = Box::new( - AzureKms::new(conf).map_err(cloud_convert_error("new Azure KMS".to_owned()))?, + let kms_provider = Box::new( + AzureKms::new(cloud_config) + .map_err(cloud_convert_error("new Azure KMS".to_owned()))?, ); - Ok(Box::new(KmsBackend::new(keyvault_provider)?)) + Ok(Box::new(KmsBackend::new(kms_provider)?)) } STORAGE_VENDOR_NAME_GCP => { - let (mk, gcp_cfg) = config.clone().convert_to_gcp_config(); - let conf = CloudConfig::from_gcp_kms_config(mk, gcp_cfg) - .map_err(cloud_convert_error("gcp from proto".to_owned()))?; + // sanity check + if cloud_config.gcp.is_none() { + return Err(Error::Other(box_err!("invalid configurations for GCP KMS"))); + } let kms_provider = - GcpKms::new(conf).map_err(cloud_convert_error("new GCP KMS".to_owned()))?; + GcpKms::new(cloud_config).map_err(cloud_convert_error("new GCP KMS".to_owned()))?; Ok(Box::new(KmsBackend::new(Box::new(kms_provider))?)) } provider => Err(Error::Other(box_err!("provider not found {}", provider))), @@ -83,7 +91,17 @@ fn create_backend_inner(config: &MasterKeyConfig) -> Result> { MasterKeyConfig::File { config } => { Box::new(FileBackend::new(Path::new(&config.path))?) as _ } - MasterKeyConfig::Kms { config } => return create_cloud_backend(config), + MasterKeyConfig::Kms { config } => create_cloud_backend(config)? as Box, + }) +} + +fn create_async_backend_inner(config: &MasterKeyConfig) -> Result> { + Ok(match config { + MasterKeyConfig::Plaintext => Box::new(PlaintextBackend {}) as _, + MasterKeyConfig::File { config } => { + Box::new(FileBackend::new(Path::new(&config.path))?) as _ + } + MasterKeyConfig::Kms { config } => create_cloud_backend(config)? 
as Box, }) } @@ -108,6 +126,7 @@ mod tests { ..AzureConfig::default() }), gcp: None, + aws: None, }; let invalid_config = KmsConfig { azure: None, diff --git a/components/encryption/src/backup/backup_encryption.rs b/components/encryption/src/backup/backup_encryption.rs new file mode 100644 index 00000000000..fc2a53654ea --- /dev/null +++ b/components/encryption/src/backup/backup_encryption.rs @@ -0,0 +1,83 @@ +// Copyright 2024 TiKV Project Authors. Licensed under Apache-2.0. + +use std::sync::Arc; + +use kvproto::{ + brpb::CipherInfo, + encryptionpb::{EncryptedContent, EncryptionMethod}, +}; + +use crate::{DataKeyManager, Error, MultiMasterKeyBackend}; + +/// BackupEncryptionManager handles encryption operations for backup processes. +#[derive(Clone)] +pub struct BackupEncryptionManager { + // Plaintext data key directly passed from user in stream back request, + // only used to encrypt log backup files uploaded to external storage, + // Not recommended in production. + pub plaintext_data_key: Option, + // encryption method used to encrypt log backup files for master key based approach + pub master_key_based_file_encryption_method: EncryptionMethod, + // backend that can contain multiple master keys, initialized as zero key inside + pub multi_master_key_backend: MultiMasterKeyBackend, + // used to encrypt local temp files, configured by TiKV + pub tikv_data_key_manager: Option>, +} +impl BackupEncryptionManager { + pub fn new( + plaintext_data_key: Option, + master_key_based_file_encryption_method: EncryptionMethod, + multi_master_key_backend: MultiMasterKeyBackend, + tikv_data_key_manager: Option>, + ) -> Self { + BackupEncryptionManager { + plaintext_data_key, + master_key_based_file_encryption_method, + multi_master_key_backend, + tikv_data_key_manager, + } + } + + pub fn default() -> Self { + BackupEncryptionManager { + plaintext_data_key: None, + master_key_based_file_encryption_method: EncryptionMethod::default(), + multi_master_key_backend: MultiMasterKeyBackend::new(), + tikv_data_key_manager: None, + } + } + + pub fn opt_data_key_manager(&self) -> Option> { + self.tikv_data_key_manager.clone() + } + + pub async fn encrypt_data_key( + &self, + plaintext_data_key: &[u8], + ) -> Result { + self.multi_master_key_backend + .encrypt(plaintext_data_key) + .await + } + + pub async fn decrypt_data_key( + &self, + encrypted_content: &EncryptedContent, + ) -> Result, Error> { + self.multi_master_key_backend + .decrypt(encrypted_content) + .await + } + + // check if master key backend ready to encrypt and decrypt keys. + pub async fn is_master_key_backend_initialized(&self) -> bool { + self.master_key_based_file_encryption_method != EncryptionMethod::Unknown + && self.master_key_based_file_encryption_method != EncryptionMethod::Plaintext + && self.multi_master_key_backend.is_initialized().await + } + + pub fn generate_data_key(&self) -> Result, Error> { + self.multi_master_key_backend + .generate_data_key(self.master_key_based_file_encryption_method) + } +} diff --git a/components/hybrid_engine/tests/failpoints/mod.rs b/components/encryption/src/backup/mod.rs similarity index 71% rename from components/hybrid_engine/tests/failpoints/mod.rs rename to components/encryption/src/backup/mod.rs index 25c77e4c418..6f7d0290826 100644 --- a/components/hybrid_engine/tests/failpoints/mod.rs +++ b/components/encryption/src/backup/mod.rs @@ -1,3 +1,2 @@ // Copyright 2024 TiKV Project Authors. Licensed under Apache-2.0. 
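For the new `BackupEncryptionManager` above, here is a sketch of how a log-backup caller might wire it up for the master-key-based path. It assumes the `encryption` and `kvproto` exports introduced elsewhere in this diff, is not a standalone program, and uses illustrative argument values only:

```rust
// Not standalone: relies on the encryption/kvproto items added in this diff.
use encryption::{BackupEncryptionManager, MultiMasterKeyBackend};
use kvproto::encryptionpb::EncryptionMethod;

async fn build_manager() -> BackupEncryptionManager {
    let mgr = BackupEncryptionManager::new(
        None,                        // plaintext_data_key: discouraged in production
        EncryptionMethod::Aes256Ctr, // method for master-key-based file encryption
        MultiMasterKeyBackend::new(),
        None,                        // tikv_data_key_manager: unused in this sketch
    );
    // Still unusable for key encryption: the backend has no master key
    // configs yet, so readiness reports false.
    assert!(!mgr.is_master_key_backend_initialized().await);
    mgr
}
```

`is_master_key_backend_initialized` stays false until master key configs are fed to the backend, which is why callers must run `update_from_proto_if_needed` (shown later in this diff) before encrypting data keys.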
- -mod test_write_batch; +pub mod backup_encryption; diff --git a/components/encryption/src/config.rs b/components/encryption/src/config.rs index 4c5805248e8..2b8ed30a73d 100644 --- a/components/encryption/src/config.rs +++ b/components/encryption/src/config.rs @@ -1,10 +1,19 @@ // Copyright 2020 TiKV Project Authors. Licensed under Apache-2.0. -use cloud::kms::{SubConfigAzure, SubConfigGcp}; -use kvproto::encryptionpb::{EncryptionMethod, MasterKeyKms}; +use std::result; + +use cloud::{ + kms::{SubConfigAws, SubConfigAzure, SubConfigGcp}, + Config as CloudConfig, +}; +use kvproto::encryptionpb::{EncryptionMethod, MasterKey, MasterKeyKms, MasterKey_oneof_backend}; use online_config::OnlineConfig; use serde_derive::{Deserialize, Serialize}; -use tikv_util::config::ReadableDuration; +use tikv_util::{box_err, config::ReadableDuration}; + +use crate::Error; + +pub type Result = result::Result; #[derive(Clone, Debug, Serialize, Deserialize, PartialEq, OnlineConfig)] #[serde(default)] @@ -40,7 +49,7 @@ impl Default for EncryptionConfig { } } -#[derive(Clone, Default, Debug, Serialize, Deserialize, PartialEq)] +#[derive(Clone, Default, Debug, Serialize, Deserialize, PartialEq, Eq, Hash)] #[serde(default)] #[serde(rename_all = "kebab-case")] pub struct FileConfig { @@ -48,7 +57,7 @@ pub struct FileConfig { } // TODO: the representation of Azure KMS to users needs to be discussed. -#[derive(Clone, Default, Serialize, Deserialize, PartialEq)] +#[derive(Clone, Default, Serialize, Deserialize, PartialEq, Eq, Hash)] #[serde(default)] #[serde(rename_all = "kebab-case")] pub struct AzureConfig { @@ -85,7 +94,7 @@ impl std::fmt::Debug for AzureConfig { } // TODO: the representation of GCP KMS to users needs to be discussed. -#[derive(Clone, Default, Debug, Serialize, Deserialize, PartialEq)] +#[derive(Clone, Default, Debug, Serialize, Deserialize, PartialEq, Eq, Hash)] #[serde(default)] #[serde(rename_all = "kebab-case")] pub struct GcpConfig { @@ -95,7 +104,17 @@ pub struct GcpConfig { pub credential_file_path: Option, } -#[derive(Clone, Default, Debug, Serialize, Deserialize, PartialEq, OnlineConfig)] +#[derive(Clone, Default, Debug, Serialize, Deserialize, PartialEq, Eq, Hash)] +#[serde(default)] +#[serde(rename_all = "kebab-case")] +pub struct AwsConfig { + /// optional since can read from env if not directly passed from user. + pub access_key: Option, + /// optional since can read from env if not directly passed from user. + pub secret_access_key: Option, +} + +#[derive(Clone, Default, Debug, Serialize, Deserialize, PartialEq, OnlineConfig, Eq, Hash)] #[serde(default)] #[serde(rename_all = "kebab-case")] pub struct KmsConfig { @@ -109,60 +128,97 @@ pub struct KmsConfig { // Gcp Kms configuration. #[online_config(skip)] pub gcp: Option, + // optional aws Kms configuration. 
+ #[online_config(skip)] + pub aws: Option, } +// Note: could merge SubConfigAzure and SubConfigGcp into KmsConfig impl KmsConfig { - pub fn into_proto(self) -> MasterKeyKms { - MasterKeyKms { - key_id: self.key_id, - region: self.region, - endpoint: self.endpoint, - vendor: self.vendor, - ..MasterKeyKms::default() + pub fn from_proto(proto: &MasterKeyKms) -> Self { + let mut config = KmsConfig { + key_id: proto.key_id.clone(), + region: proto.region.clone(), + endpoint: proto.endpoint.clone(), + vendor: proto.vendor.clone(), + azure: None, + gcp: None, + aws: None, + }; + if proto.has_azure_kms() { + let azure_config_proto = proto.azure_kms.as_ref().unwrap(); + let azure_config = AzureConfig { + tenant_id: azure_config_proto.tenant_id.clone(), + client_id: azure_config_proto.client_id.clone(), + keyvault_url: azure_config_proto.key_vault_url.clone(), + hsm_name: azure_config_proto.hsm_name.clone(), + hsm_url: azure_config_proto.hsm_url.clone(), + client_certificate: string_to_option(azure_config_proto.client_certificate.clone()), + client_certificate_path: string_to_option( + azure_config_proto.client_certificate_path.clone(), + ), + client_certificate_password: azure_config_proto.client_certificate_password.clone(), + client_secret: string_to_option(azure_config_proto.client_secret.clone()), + }; + config.azure = Some(azure_config); + } + if proto.has_gcp_kms() { + let gcp_config_proto = proto.gcp_kms.as_ref().unwrap(); + let gcp_config = GcpConfig { + credential_file_path: string_to_option(gcp_config_proto.credential.clone()), + }; + config.gcp = Some(gcp_config); + } + if proto.has_aws_kms() { + let aws_config_proto = proto.aws_kms.as_ref().unwrap(); + let aws_config = AwsConfig { + access_key: string_to_option(aws_config_proto.access_key.clone()), + secret_access_key: string_to_option(aws_config_proto.secret_access_key.clone()), + }; + config.aws = Some(aws_config) } + config } - - pub fn convert_to_azure_kms_config(self) -> (MasterKeyKms, SubConfigAzure) { - let azure_kms_cfg = { - let cfg = self.azure.unwrap(); - SubConfigAzure { - tenant_id: cfg.tenant_id, - client_id: cfg.client_id, - keyvault_url: cfg.keyvault_url, - hsm_name: cfg.hsm_name, - hsm_url: cfg.hsm_url, - client_certificate: cfg.client_certificate, - client_certificate_path: cfg.client_certificate_path, - client_certificate_password: cfg.client_certificate_password, - client_secret: cfg.client_secret, - } - }; - let mk = MasterKeyKms { - key_id: self.key_id, - region: self.region, - endpoint: self.endpoint, - vendor: self.vendor, - ..MasterKeyKms::default() - }; - (mk, azure_kms_cfg) + pub fn to_cloud_config(&self) -> Result { + Ok(CloudConfig { + key_id: cloud::kms::KeyId::new(self.key_id.clone()) + .map_err(|_| Error::Other(box_err!("key id should not be empty")))?, + location: cloud::kms::Location { + region: self.region.clone(), + endpoint: self.endpoint.clone(), + }, + vendor: self.vendor.clone(), + azure: self.azure.as_ref().map(|azure| SubConfigAzure { + tenant_id: azure.tenant_id.clone(), + client_id: azure.client_id.clone(), + keyvault_url: azure.keyvault_url.clone(), + hsm_name: azure.hsm_name.clone(), + hsm_url: azure.hsm_url.clone(), + client_certificate: azure.client_certificate.clone(), + client_certificate_path: azure.client_certificate_path.clone(), + client_certificate_password: azure.client_certificate_password.clone(), + client_secret: azure.client_secret.clone(), + }), + gcp: self.gcp.as_ref().map(|gcp| SubConfigGcp { + credential_file_path: gcp.credential_file_path.clone(), + }), + aws: 
self.aws.as_ref().map(|aws| SubConfigAws { + access_key: aws.access_key.clone(), + secret_access_key: aws.secret_access_key.clone(), + }), + }) } +} - pub fn convert_to_gcp_config(self) -> (MasterKeyKms, SubConfigGcp) { - let gcp_cfg = SubConfigGcp { - credential_file_path: self.gcp.unwrap().credential_file_path, - }; - let mk = MasterKeyKms { - key_id: self.key_id, - region: self.region, - endpoint: self.endpoint, - vendor: self.vendor, - ..MasterKeyKms::default() - }; - (mk, gcp_cfg) +fn string_to_option(string: String) -> Option { + if string.is_empty() { + None + } else { + Some(string) } } -#[derive(Clone, Debug, Serialize, Deserialize, PartialEq)] +#[derive(Clone, Debug, Serialize, Deserialize, PartialEq, Eq, Hash)] #[serde(rename_all = "kebab-case", tag = "type")] #[derive(Default)] pub enum MasterKeyConfig { @@ -186,6 +242,26 @@ pub enum MasterKeyConfig { }, } +impl MasterKeyConfig { + pub fn from_proto(proto: &MasterKey) -> Option { + if let Some(backend) = &proto.backend { + match backend { + MasterKey_oneof_backend::Plaintext(_) => Some(MasterKeyConfig::Plaintext), + MasterKey_oneof_backend::File(key_file) => Some(MasterKeyConfig::File { + config: FileConfig { + path: key_file.path.clone(), + }, + }), + MasterKey_oneof_backend::Kms(kms) => Some(MasterKeyConfig::Kms { + config: KmsConfig::from_proto(kms), + }), + } + } else { + None + } + } +} + mod encryption_method_serde { use std::fmt; @@ -253,6 +329,8 @@ mod encryption_method_serde { #[cfg(test)] mod tests { + use kvproto::encryptionpb; + use super::*; #[test] @@ -268,12 +346,31 @@ mod tests { vendor: "".to_owned(), azure: None, gcp: None, + aws: None, }, }, previous_master_key: MasterKeyConfig::Plaintext, enable_file_dictionary_log: true, file_dictionary_rewrite_threshold: 1000000, }; + + let kms_config_aws = EncryptionConfig { + master_key: MasterKeyConfig::Kms { + config: KmsConfig { + key_id: "key_id".to_owned(), + region: "region".to_owned(), + endpoint: "endpoint".to_owned(), + vendor: "aws".to_owned(), + azure: None, + gcp: None, + aws: Some(AwsConfig { + access_key: Some("foo".into()), + secret_access_key: Some("bar".into()), + }), + }, + }, + ..kms_config.clone() + }; let kms_config_azure = EncryptionConfig { master_key: MasterKeyConfig::Kms { config: KmsConfig { @@ -290,6 +387,7 @@ mod tests { ..AzureConfig::default() }), gcp: None, + aws: None, }, }, ..kms_config.clone() @@ -306,24 +404,31 @@ mod tests { gcp: Some(GcpConfig { credential_file_path: Some("/tmp/credential.json".into()), }), + aws: None, }, }, ..kms_config.clone() }; // KMS with default(aws). 
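`string_to_option` above encodes a small but load-bearing convention: protobuf string fields are never absent, only empty, so emptiness is lifted to `None` when building the typed config. A tiny usage demo:

```rust
// Mirrors string_to_option above: proto string fields are present-but-empty
// rather than absent, so an empty string maps to None in the typed config.
fn string_to_option(s: String) -> Option<String> {
    if s.is_empty() { None } else { Some(s) }
}

fn main() {
    assert_eq!(string_to_option(String::new()), None);
    assert_eq!(
        string_to_option("aws_access".to_owned()),
        Some("aws_access".to_owned())
    );
}
```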
- let kms_str = r#" + let kms_str_aws = r#" data-encryption-method = "aes128-ctr" data-key-rotation-period = "14d" enable-file-dictionary-log = true file-dictionary-rewrite-threshold = 1000000 [previous-master-key] type = "plaintext" + [master-key] type = "kms" key-id = "key_id" region = "region" endpoint = "endpoint" + vendor = "aws" + + [master-key.aws] + access-key = "foo" + secret-access-key = "bar" "#; // KMS with azure let kms_str_azure = r#" @@ -367,7 +472,7 @@ mod tests { credential-file-path = '/tmp/credential.json' "#; for (kms_cfg, kms_str) in [ - (kms_config, kms_str), + (kms_config_aws, kms_str_aws), (kms_config_azure, kms_str_azure), (kms_config_gcp, kms_str_gcp), ] { @@ -380,4 +485,178 @@ mod tests { ); } } + + #[test] + fn test_from_proto() { + // Test case 1: Basic KMS config without vendor-specific details + let proto = MasterKeyKms { + key_id: "test_key".to_string(), + region: "test_region".to_string(), + endpoint: "test_endpoint".to_string(), + vendor: "test_vendor".to_string(), + ..Default::default() + }; + let config = KmsConfig::from_proto(&proto); + assert_eq!(config.key_id, "test_key"); + assert_eq!(config.region, "test_region"); + assert_eq!(config.endpoint, "test_endpoint"); + assert_eq!(config.vendor, "test_vendor"); + assert!(config.azure.is_none()); + assert!(config.gcp.is_none()); + assert!(config.aws.is_none()); + // Test case 2: KMS config with Azure details + let mut proto_azure = proto.clone(); + proto_azure.azure_kms = Some(encryptionpb::AzureKms { + tenant_id: "azure_tenant".to_string(), + client_id: "azure_client".to_string(), + key_vault_url: "azure_vault".to_string(), + hsm_name: "azure_hsm".to_string(), + hsm_url: "azure_hsm_url".to_string(), + client_certificate: "azure_cert".to_string(), + client_certificate_path: "azure_cert_path".to_string(), + client_certificate_password: "azure_password".to_string(), + client_secret: "azure_secret".to_string(), + ..Default::default() + }) + .into(); + let config_azure = KmsConfig::from_proto(&proto_azure); + assert!(config_azure.azure.is_some()); + let azure_config = config_azure.azure.unwrap(); + assert_eq!(azure_config.tenant_id, "azure_tenant"); + assert_eq!(azure_config.client_id, "azure_client"); + assert_eq!(azure_config.keyvault_url, "azure_vault"); + assert_eq!(azure_config.hsm_name, "azure_hsm"); + assert_eq!(azure_config.hsm_url, "azure_hsm_url"); + assert_eq!( + azure_config.client_certificate, + Some("azure_cert".to_string()) + ); + assert_eq!( + azure_config.client_certificate_path, + Some("azure_cert_path".to_string()) + ); + assert_eq!(azure_config.client_certificate_password, "azure_password"); + assert_eq!(azure_config.client_secret, Some("azure_secret".to_string())); + + // Test case 3: KMS config with GCP details + let mut proto_gcp = proto.clone(); + proto_gcp.gcp_kms = Some(encryptionpb::GcpKms { + credential: "gcp_credential".to_string(), + ..Default::default() + }) + .into(); + let config_gcp = KmsConfig::from_proto(&proto_gcp); + assert!(config_gcp.gcp.is_some()); + let gcp_config = config_gcp.gcp.unwrap(); + assert_eq!( + gcp_config.credential_file_path, + Some("gcp_credential".to_string()) + ); + + // Test case 4: KMS config with AWS details + let mut proto_aws = proto.clone(); + proto_aws.aws_kms = Some(encryptionpb::AwsKms { + access_key: "aws_access".to_string(), + secret_access_key: "aws_secret".to_string(), + ..Default::default() + }) + .into(); + let config_aws = KmsConfig::from_proto(&proto_aws); + assert!(config_aws.aws.is_some()); + let aws_config = config_aws.aws.unwrap(); + 
assert_eq!(aws_config.access_key, Some("aws_access".to_string())); + assert_eq!(aws_config.secret_access_key, Some("aws_secret".to_string())); + } + + #[test] + fn test_to_cloud_config() { + // Test case 1: Basic KMS config without vendor-specific details + let kms_config = KmsConfig { + key_id: "test_key".to_string(), + region: "test_region".to_string(), + endpoint: "test_endpoint".to_string(), + vendor: "test_vendor".to_string(), + azure: None, + gcp: None, + aws: None, + }; + let cloud_config = kms_config.to_cloud_config().unwrap(); + assert_eq!(cloud_config.key_id.as_str(), "test_key"); + assert_eq!(cloud_config.location.region, "test_region"); + assert_eq!(cloud_config.location.endpoint, "test_endpoint"); + assert_eq!(cloud_config.vendor, "test_vendor"); + assert!(cloud_config.azure.is_none()); + assert!(cloud_config.gcp.is_none()); + assert!(cloud_config.aws.is_none()); + + // Test case 2: KMS config with Azure details + let kms_config_azure = KmsConfig { + azure: Some(AzureConfig { + tenant_id: "azure_tenant".to_string(), + client_id: "azure_client".to_string(), + keyvault_url: "azure_vault".to_string(), + hsm_name: "azure_hsm".to_string(), + hsm_url: "azure_hsm_url".to_string(), + client_certificate: Some("azure_cert".to_string()), + client_certificate_path: Some("azure_cert_path".to_string()), + client_certificate_password: "azure_password".to_string(), + client_secret: Some("azure_secret".to_string()), + }), + ..kms_config.clone() + }; + let cloud_config_azure = kms_config_azure.to_cloud_config().unwrap(); + assert!(cloud_config_azure.azure.is_some()); + let azure_config = cloud_config_azure.azure.unwrap(); + assert_eq!(azure_config.tenant_id, "azure_tenant"); + assert_eq!(azure_config.client_id, "azure_client"); + assert_eq!(azure_config.keyvault_url, "azure_vault"); + assert_eq!(azure_config.hsm_name, "azure_hsm"); + assert_eq!(azure_config.hsm_url, "azure_hsm_url"); + assert_eq!( + azure_config.client_certificate, + Some("azure_cert".to_string()) + ); + assert_eq!( + azure_config.client_certificate_path, + Some("azure_cert_path".to_string()) + ); + assert_eq!(azure_config.client_certificate_password, "azure_password"); + assert_eq!(azure_config.client_secret, Some("azure_secret".to_string())); + + // Test case 3: KMS config with GCP details + let kms_config_gcp = KmsConfig { + gcp: Some(GcpConfig { + credential_file_path: Some("gcp_credential".to_string()), + }), + ..kms_config.clone() + }; + let cloud_config_gcp = kms_config_gcp.to_cloud_config().unwrap(); + assert!(cloud_config_gcp.gcp.is_some()); + let gcp_config = cloud_config_gcp.gcp.unwrap(); + assert_eq!( + gcp_config.credential_file_path, + Some("gcp_credential".to_string()) + ); + + // Test case 4: KMS config with AWS details + let kms_config_aws = KmsConfig { + aws: Some(AwsConfig { + access_key: Some("aws_access".to_string()), + secret_access_key: Some("aws_secret".to_string()), + }), + ..kms_config.clone() + }; + let cloud_config_aws = kms_config_aws.to_cloud_config().unwrap(); + assert!(cloud_config_aws.aws.is_some()); + let aws_config = cloud_config_aws.aws.unwrap(); + assert_eq!(aws_config.access_key, Some("aws_access".to_string())); + assert_eq!(aws_config.secret_access_key, Some("aws_secret".to_string())); + + // Test case 5: KMS config with empty key_id (should return an error) + let kms_config_empty_key = KmsConfig { + key_id: "".to_string(), + ..kms_config.clone() + }; + kms_config_empty_key.to_cloud_config().unwrap_err(); + } } diff --git a/components/encryption/src/lib.rs 
b/components/encryption/src/lib.rs index 6c145c5e9be..086c2c0ae8f 100644 --- a/components/encryption/src/lib.rs +++ b/components/encryption/src/lib.rs @@ -3,6 +3,7 @@ #![feature(let_chains)] #![feature(noop_waker)] +mod backup; mod config; mod crypter; mod encrypted_file; @@ -11,11 +12,16 @@ mod file_dict_file; mod io; mod manager; mod master_key; +#[cfg(any(test, feature = "testexport"))] +pub use master_key::fake; mod metrics; +pub mod test_utils; + use std::{io::ErrorKind, path::Path}; pub use self::{ + backup::backup_encryption::*, config::*, crypter::{verify_encryption_config, AesGcmCrypter, FileEncryptionInfo, Iv}, encrypted_file::EncryptedFile, @@ -25,7 +31,9 @@ pub use self::{ create_aes_ctr_crypter, DecrypterReader, DecrypterWriter, EncrypterReader, EncrypterWriter, }, manager::{DataKeyImporter, DataKeyManager, DataKeyManagerArgs}, - master_key::{Backend, FileBackend, KmsBackend, PlaintextBackend}, + master_key::{ + AsyncBackend, Backend, FileBackend, KmsBackend, MultiMasterKeyBackend, PlaintextBackend, + }, }; const TRASH_PREFIX: &str = "TRASH-"; diff --git a/components/encryption/src/manager/mod.rs b/components/encryption/src/manager/mod.rs index 79071b3610a..9250ed0714c 100644 --- a/components/encryption/src/manager/mod.rs +++ b/components/encryption/src/manager/mod.rs @@ -949,6 +949,19 @@ impl DataKeyManager { } Ok(()) } + + // same logic in raft_log_engine/src/engine#rename + pub fn rename_file(&self, src_name: &PathBuf, dst_name: &PathBuf) -> IoResult<()> { + let src_str = src_name.to_str().unwrap(); + let dst_str = dst_name.to_str().unwrap(); + self.link_file(src_str, dst_str)?; + let r = file_system::rename(src_name, dst_name); + let del_file = if r.is_ok() { src_str } else { dst_str }; + if let Err(e) = self.delete_file(del_file, None) { + warn!("fail to remove encryption metadata after renaming file"; "err" => ?e); + } + r + } } /// An RAII-style importer of data keys. 
It automatically creates data key that diff --git a/components/encryption/src/master_key/file.rs b/components/encryption/src/master_key/file.rs index 1b24a95e497..e28aecdcc0e 100644 --- a/components/encryption/src/master_key/file.rs +++ b/components/encryption/src/master_key/file.rs @@ -2,11 +2,12 @@ use std::{io::Read, path::Path}; +use async_trait::async_trait; use file_system::File; use kvproto::encryptionpb::EncryptedContent; use tikv_util::box_err; -use super::{Backend, MemAesGcmBackend}; +use super::{AsyncBackend, Backend, MemAesGcmBackend}; use crate::{AesGcmCrypter, Error, Iv, Result}; #[derive(Debug)] @@ -61,26 +62,25 @@ impl Backend for FileBackend { true } } +#[async_trait] +impl AsyncBackend for FileBackend { + async fn encrypt_async(&self, plaintext: &[u8]) -> Result { + self.encrypt(plaintext) + } + + async fn decrypt_async(&self, content: &EncryptedContent) -> Result> { + self.decrypt(content) + } +} #[cfg(test)] mod tests { - use std::{fs::File, io::Write, path::PathBuf}; - use hex::FromHex; use matches::assert_matches; - use tempfile::TempDir; use super::{super::metadata::MetadataKey, *}; use crate::*; - fn create_key_file(val: &str) -> (PathBuf, TempDir) { - let tmp_dir = TempDir::new().unwrap(); - let path = tmp_dir.path().join("key"); - let mut file = File::create(path.clone()).unwrap(); - file.write_all(format!("{}\n", val).as_bytes()).unwrap(); - (path, tmp_dir) - } - #[test] fn test_file_backend_ase_256_gcm() { // See more http://csrc.nist.gov/groups/STM/cavp/documents/mac/gcmtestvectors.zip @@ -90,8 +90,9 @@ mod tests { .unwrap(); let iv = Vec::from_hex("cafabd9672ca6c79a2fbdc22").unwrap(); - let (key_path, _tmp_key_dir) = - create_key_file("c3d99825f2181f4808acd2068eac7441a65bd428f14d2aab43fefc0129091139"); + let (key_path, _tmp_key_dir) = test_utils::create_master_key_file_test_only( + "c3d99825f2181f4808acd2068eac7441a65bd428f14d2aab43fefc0129091139", + ); let backend = FileBackend::new(&key_path).unwrap(); let iv = Iv::from_slice(iv.as_slice()).unwrap(); @@ -105,8 +106,9 @@ mod tests { fn test_file_backend_authenticate() { let pt = vec![1u8, 2, 3]; - let (key_path, _tmp_key_dir) = - create_key_file("c3d99825f2181f4808acd2068eac7441a65bd428f14d2aab43fefc0129091139"); + let (key_path, _tmp_key_dir) = test_utils::create_master_key_file_test_only( + "c3d99825f2181f4808acd2068eac7441a65bd428f14d2aab43fefc0129091139", + ); let backend = FileBackend::new(&key_path).unwrap(); let encrypted_content = backend.encrypt(&pt).unwrap(); diff --git a/components/encryption/src/master_key/kms.rs b/components/encryption/src/master_key/kms.rs index db3c62194fd..ad89d217f92 100644 --- a/components/encryption/src/master_key/kms.rs +++ b/components/encryption/src/master_key/kms.rs @@ -2,6 +2,7 @@ use std::{sync::Mutex, time::Duration}; +use async_trait::async_trait; use cloud::kms::{CryptographyType, DataKeyPair, EncryptedKey, KmsProvider, PlainKey}; use kvproto::encryptionpb::EncryptedContent; use tikv_util::{ @@ -11,7 +12,7 @@ use tikv_util::{ }; use tokio::runtime::{Builder, Runtime}; -use super::{metadata::MetadataKey, Backend, MemAesGcmBackend}; +use super::{metadata::MetadataKey, AsyncBackend, Backend, MemAesGcmBackend}; use crate::{crypter::Iv, errors::cloud_convert_error, Error, Result}; #[derive(Debug)] @@ -38,7 +39,7 @@ impl State { #[derive(Debug)] pub struct KmsBackend { timeout_duration: Duration, - state: Mutex>, + state: tokio::sync::Mutex>, kms_provider: Box, // This mutex allows the decrypt_content API to be reference based runtime: Mutex, @@ -57,21 +58,25 @@ impl 
KmsBackend { Ok(KmsBackend { timeout_duration: Duration::from_secs(10), - state: Mutex::new(None), + state: tokio::sync::Mutex::new(None), runtime, kms_provider, }) } - fn encrypt_content(&self, plaintext: &[u8], iv: Iv) -> Result { - let mut opt_state = self.state.lock().unwrap(); + pub fn encrypt_content(&self, plaintext: &[u8], iv: Iv) -> Result { + let runtime = self.runtime.lock().unwrap(); + runtime.block_on(self.encrypt_content_async(plaintext, iv)) + } + + async fn encrypt_content_async(&self, plaintext: &[u8], iv: Iv) -> Result { + let mut opt_state = self.state.lock().await; if opt_state.is_none() { - let runtime = self.runtime.lock().unwrap(); - let data_key = runtime - .block_on(retry(|| { - with_timeout(self.timeout_duration, self.kms_provider.generate_data_key()) - })) - .map_err(cloud_convert_error("get data key failed".into()))?; + let data_key = retry(|| { + with_timeout(self.timeout_duration, self.kms_provider.generate_data_key()) + }) + .await + .map_err(cloud_convert_error("get data key failed".into()))?; *opt_state = Some(State::new_from_datakey(DataKeyPair { plaintext: PlainKey::new(data_key.plaintext.clone(), CryptographyType::AesGcm256) .map_err(cloud_convert_error("invalid plain key".into()))?, @@ -96,9 +101,14 @@ impl KmsBackend { Ok(content) } + pub fn decrypt_content(&self, content: &EncryptedContent) -> Result> { + let runtime = self.runtime.lock().unwrap(); + runtime.block_on(self.decrypt_content_async(content)) + } + // On decrypt failure, the rule is to return WrongMasterKey error in case it is // possible that a wrong master key has been used, or other error otherwise. - fn decrypt_content(&self, content: &EncryptedContent) -> Result> { + async fn decrypt_content_async(&self, content: &EncryptedContent) -> Result> { let vendor_name = self.kms_provider.name(); match content.metadata.get(MetadataKey::KmsVendor.as_str()) { Some(val) if val.as_slice() == vendor_name.as_bytes() => (), @@ -126,22 +136,25 @@ impl KmsBackend { }; { - let mut opt_state = self.state.lock().unwrap(); + let mut opt_state = self.state.lock().await; if let Some(state) = &*opt_state { if state.cached(&ciphertext_key) { return state.encryption_backend.decrypt_content(content); } } { - let runtime = self.runtime.lock().unwrap(); - let plaintext = runtime - .block_on(retry(|| { - with_timeout( - self.timeout_duration, - self.kms_provider.decrypt_data_key(&ciphertext_key), - ) - })) - .map_err(cloud_convert_error("decrypt encrypted key failed".into()))?; + let plaintext = retry(|| { + with_timeout( + self.timeout_duration, + self.kms_provider.decrypt_data_key(&ciphertext_key), + ) + }) + .await + .map_err(|e| { + Error::WrongMasterKey(box_err!(cloud_convert_error( + "decrypt encrypted key failed".into(), + )(e))) + })?; let data_key = DataKeyPair { encrypted: ciphertext_key, plaintext: PlainKey::new(plaintext, CryptographyType::AesGcm256) @@ -154,6 +167,16 @@ impl KmsBackend { } } } + + // Used to clear the cached state to ensure that the next + // backend.decrypt_content() invocation bypasses the cache and invokes the + // KmsProvider::decrypt_data_key() function. 
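The refactor above keeps the blocking `Backend` trait intact while moving the real work into `async` methods: each sync entry point locks the shared tokio `Runtime` and `block_on`s its async counterpart. A minimal stand-in (hypothetical `SyncFacade` names) showing the shape of that pattern:

```rust
use std::sync::Mutex;
use tokio::runtime::{Builder, Runtime};

// One tokio Runtime behind a Mutex lets blocking trait methods delegate to
// the async core, as encrypt_content/decrypt_content now do above.
struct SyncFacade {
    runtime: Mutex<Runtime>,
}

impl SyncFacade {
    fn new() -> std::io::Result<Self> {
        let runtime = Builder::new_current_thread().enable_all().build()?;
        Ok(SyncFacade {
            runtime: Mutex::new(runtime),
        })
    }

    async fn work_async(&self, x: u32) -> u32 {
        x + 1 // placeholder for the real async KMS round trip
    }

    // Blocking entry point kept for the synchronous trait surface.
    fn work(&self, x: u32) -> u32 {
        let rt = self.runtime.lock().unwrap();
        rt.block_on(self.work_async(x))
    }
}

fn main() {
    let facade = SyncFacade::new().unwrap();
    assert_eq!(facade.work(41), 42);
}
```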
+ #[cfg(any(test, feature = "testexport"))] + pub fn clear_state(&mut self) { + let runtime = self.runtime.lock().unwrap(); + let mut opt_state = runtime.block_on(self.state.lock()); + *opt_state = None; + } } impl Backend for KmsBackend { @@ -169,11 +192,26 @@ impl Backend for KmsBackend { true } } +#[async_trait] +impl AsyncBackend for KmsBackend { + async fn encrypt_async(&self, plaintext: &[u8]) -> Result { + self.encrypt_content_async(plaintext, Iv::new_gcm()?).await + } + + async fn decrypt_async(&self, content: &EncryptedContent) -> Result> { + self.decrypt_content_async(content).await + } +} -#[cfg(test)] -mod fake { +#[cfg(any(test, feature = "testexport"))] +pub mod fake { use async_trait::async_trait; - use cloud::{error::Result, kms::KmsProvider}; + use cloud::{ + error::{Error as CloudError, KmsError, Result}, + kms::KmsProvider, + }; + use fail::fail_point; + use hex::FromHex; use super::*; @@ -183,19 +221,33 @@ mod fake { #[derive(Debug)] pub struct FakeKms { plaintext_key: PlainKey, + should_decrypt_data_key_fail: bool, } impl FakeKms { - pub fn new(plaintext_key: Vec) -> Self { + pub fn new(plaintext_key: Vec, should_decrypt_data_key_fail: bool) -> Self { Self { plaintext_key: PlainKey::new(plaintext_key, CryptographyType::AesGcm256).unwrap(), + should_decrypt_data_key_fail, } } } + fn check_fail_point(fail_point_name: &str) -> Result<()> { + fail_point!(fail_point_name, |val| { + val.and_then(|x| x.parse::().ok()) + .filter(|&fail| fail) + .map(|_| Err(CloudError::ApiTimeout(box_err!("api timeout")))) + .unwrap_or(Ok(())) + }); + Ok(()) + } + #[async_trait] impl KmsProvider for FakeKms { async fn generate_data_key(&self) -> Result { + check_fail_point("kms_api_timeout_encrypt")?; + Ok(DataKeyPair { encrypted: EncryptedKey::new(FAKE_DATA_KEY_ENCRYPTED.to_vec())?, plaintext: PlainKey::new(self.plaintext_key.clone(), CryptographyType::AesGcm256) @@ -204,21 +256,55 @@ mod fake { } async fn decrypt_data_key(&self, _ciphertext: &EncryptedKey) -> Result> { - Ok(vec![1u8, 32]) + check_fail_point("kms_api_timeout_decrypt")?; + + if self.should_decrypt_data_key_fail { + Err(CloudError::KmsError(KmsError::WrongMasterKey(box_err!( + "wrong master key" + )))) + } else { + Ok(hex::decode(PLAINKEY_HEX).unwrap()) + } } fn name(&self) -> &str { FAKE_VENDOR_NAME } } + + // See more http://csrc.nist.gov/groups/STM/cavp/documents/mac/gcmtestvectors.zip + const PLAIN_TEXT_HEX: &str = "25431587e9ecffc7c37f8d6d52a9bc3310651d46fb0e3bad2726c8f2db653749"; + const CIPHER_TEXT_HEX: &str = + "84e5f23f95648fa247cb28eef53abec947dbf05ac953734618111583840bd980"; + const PLAINKEY_HEX: &str = "c3d99825f2181f4808acd2068eac7441a65bd428f14d2aab43fefc0129091139"; + const IV_HEX: &str = "cafabd9672ca6c79a2fbdc22"; + + pub fn prepare_data_for_encrypt() -> (Iv, Vec, Vec, Vec) { + let iv = Vec::from_hex(IV_HEX).unwrap(); + let iv = Iv::from_slice(iv.as_slice()).unwrap(); + let pt = Vec::from_hex(PLAIN_TEXT_HEX).unwrap(); + let plainkey = Vec::from_hex(PLAINKEY_HEX).unwrap(); + let ct = Vec::from_hex(CIPHER_TEXT_HEX).unwrap(); + (iv, pt, plainkey, ct) + } + + pub fn prepare_kms_backend( + plainkey: Vec, + should_decrypt_data_key_fail: bool, + ) -> KmsBackend { + KmsBackend::new(Box::new(FakeKms::new( + plainkey, + should_decrypt_data_key_fail, + ))) + .unwrap() + } } #[cfg(test)] mod tests { - use hex::FromHex; use matches::assert_matches; - use super::{fake::FakeKms, *}; + use super::{fake::*, *}; #[test] fn test_state() { @@ -243,19 +329,9 @@ mod tests { #[test] fn test_kms_backend() { - // See more 
http://csrc.nist.gov/groups/STM/cavp/documents/mac/gcmtestvectors.zip - let pt = Vec::from_hex("25431587e9ecffc7c37f8d6d52a9bc3310651d46fb0e3bad2726c8f2db653749") - .unwrap(); - let ct = Vec::from_hex("84e5f23f95648fa247cb28eef53abec947dbf05ac953734618111583840bd980") - .unwrap(); - let plainkey = - Vec::from_hex("c3d99825f2181f4808acd2068eac7441a65bd428f14d2aab43fefc0129091139") - .unwrap(); - - let iv = Vec::from_hex("cafabd9672ca6c79a2fbdc22").unwrap(); - - let backend = KmsBackend::new(Box::new(FakeKms::new(plainkey))).unwrap(); - let iv = Iv::from_slice(iv.as_slice()).unwrap(); + let (iv, pt, plainkey, ct) = prepare_data_for_encrypt(); + let backend = prepare_kms_backend(plainkey, false); + let encrypted_content = backend.encrypt_content(&pt, iv).unwrap(); assert_eq!(encrypted_content.get_content(), ct.as_slice()); let plaintext = backend.decrypt_content(&encrypted_content).unwrap(); @@ -293,4 +369,16 @@ mod tests { Error::Other(_) ); } + + #[test] + fn test_kms_backend_wrong_key() { + let (iv, pt, plainkey, ..) = prepare_data_for_encrypt(); + let mut backend = prepare_kms_backend(plainkey, true); + + let encrypted_content = backend.encrypt_content(&pt, iv).unwrap(); + + backend.clear_state(); + let err = backend.decrypt_content(&encrypted_content).unwrap_err(); + assert_matches!(err, Error::WrongMasterKey(_)); + } } diff --git a/components/encryption/src/master_key/mod.rs b/components/encryption/src/master_key/mod.rs index a674cd3a685..ab17ecdd0f0 100644 --- a/components/encryption/src/master_key/mod.rs +++ b/components/encryption/src/master_key/mod.rs @@ -1,9 +1,13 @@ // Copyright 2020 TiKV Project Authors. Licensed under Apache-2.0. -use kvproto::encryptionpb::EncryptedContent; -use tikv_util::box_err; +use std::sync::Arc; -use crate::{Error, Result}; +use async_trait::async_trait; +use kvproto::encryptionpb::{EncryptedContent, EncryptionMethod, MasterKey}; +use tikv_util::{box_err, error}; +use tokio::sync::RwLock; + +use crate::{manager::generate_data_key, Error, MasterKeyConfig, Result}; /// Provide API to encrypt/decrypt key dictionary content. /// @@ -18,6 +22,12 @@ pub trait Backend: Sync + Send + std::fmt::Debug + 'static { fn is_secure(&self) -> bool; } +#[async_trait] +pub trait AsyncBackend: Sync + Send + std::fmt::Debug + 'static { + async fn encrypt_async(&self, plaintext: &[u8]) -> Result; + async fn decrypt_async(&self, ciphertext: &EncryptedContent) -> Result>; +} + mod mem; use self::mem::MemAesGcmBackend; @@ -28,6 +38,8 @@ mod metadata; use self::metadata::*; mod kms; +#[cfg(any(test, feature = "testexport"))] +pub use self::kms::fake; pub use self::kms::KmsBackend; #[derive(Default, Debug, Clone)] @@ -68,6 +80,162 @@ impl Backend for PlaintextBackend { } } +#[async_trait] +impl AsyncBackend for PlaintextBackend { + async fn encrypt_async(&self, plaintext: &[u8]) -> Result { + self.encrypt(plaintext) + } + + async fn decrypt_async(&self, ciphertext: &EncryptedContent) -> Result> { + self.decrypt(ciphertext) + } +} + +/// Used for restore where multiple master keys are provided. +/// It will iterate the master key list and do the encryption/decryption. +/// If any master key succeeds, the request succeeds, if none succeeds, the +/// request will fail with a combined error of each master key error. 
+#[derive(Default, Debug, Clone)] +pub struct MultiMasterKeyBackend { + inner: Arc>, +} + +#[derive(Default, Debug)] +struct MultiMasterKeyBackendInner { + backends: Option>>, + pub(crate) configs: Option>, +} +impl MultiMasterKeyBackend { + pub fn new() -> Self { + MultiMasterKeyBackend { + inner: Arc::new(RwLock::new(MultiMasterKeyBackendInner { + backends: None, + configs: None, + })), + } + } + + pub async fn update_from_proto_if_needed( + &self, + master_keys_proto: Vec, + create_backend_fn: F, + ) -> Result<()> + where + F: Fn(&MasterKeyConfig) -> Result>, + { + if master_keys_proto.is_empty() { + return Ok(()); + } + let mut master_keys_config = Vec::new(); + for proto in master_keys_proto { + let opt_master_key_config = MasterKeyConfig::from_proto(&proto); + // sanity check + if opt_master_key_config.is_none() { + return Err(log_and_error( + "internal error: master key config should not be empty", + )); + } + master_keys_config.push(opt_master_key_config.unwrap()); + } + + self.update_from_config_if_needed(master_keys_config, create_backend_fn) + .await + } + + pub async fn update_from_config_if_needed( + &self, + master_keys_configs: Vec, + create_backend_fn: F, + ) -> Result<()> + where + F: Fn(&MasterKeyConfig) -> Result>, + { + if master_keys_configs.is_empty() { + return Ok(()); + } + + let mut write_guard = self.inner.write().await; + if write_guard.configs.as_ref() != Some(&master_keys_configs) { + write_guard.backends = Some(create_master_key_backends( + &master_keys_configs, + create_backend_fn, + )?); + write_guard.configs = Some(master_keys_configs); + } + Ok(()) + } + + pub async fn encrypt(&self, plaintext: &[u8]) -> Result { + let read_guard = self.inner.read().await; + if read_guard.backends.is_none() { + return Err(log_and_error( + "internal error: multi master key backend not initialized when encrypting", + )); + } + let mut errors = Vec::new(); + + for master_key_backend in read_guard.backends.as_ref().unwrap() { + match master_key_backend.encrypt_async(plaintext).await { + Ok(res) => return Ok(res), + Err(e) => errors.push(format!("Backend failed to encrypt with error: {}", e)), + } + } + + let combined_error = format!("failed to encrypt content: {}", errors.join("; ")); + Err(log_and_error(&combined_error)) + } + pub async fn decrypt(&self, ciphertext: &EncryptedContent) -> Result> { + let read_guard = self.inner.read().await; + if read_guard.backends.is_none() { + return Err(log_and_error( + "internal error: multi master key backend not initialized when decrypting", + )); + } + let mut errors = Vec::new(); + + for master_key_backend in read_guard.backends.as_ref().unwrap() { + match master_key_backend.decrypt_async(ciphertext).await { + Ok(res) => return Ok(res), + Err(e) => errors.push(format!("Backend failed to decrypt with error: {}", e)), + } + } + + let combined_error = format!("failed to decrypt content: {}", errors.join("; ")); + Err(log_and_error(&combined_error)) + } + + pub fn generate_data_key(&self, method: EncryptionMethod) -> Result> { + let (_id, key) = generate_data_key(method)?; + Ok(key) + } + + pub async fn is_initialized(&self) -> bool { + let read_guard = self.inner.read().await; + // configs and backends are updated together, should always be both empty or not + // empty. 
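`MultiMasterKeyBackend::encrypt`/`decrypt` above both follow a first-success-wins loop: try each configured backend in order, return on the first success, and otherwise fail with every backend's error joined into one message. A self-contained sketch of that fallback shape, using plain closures in place of `AsyncBackend`s:

```rust
// First-success-wins fallback: short-circuit on success, otherwise surface
// one combined error, mirroring MultiMasterKeyBackend::encrypt/decrypt.
fn try_each<T>(
    backends: &[Box<dyn Fn(&[u8]) -> Result<T, String>>],
    input: &[u8],
) -> Result<T, String> {
    let mut errors = Vec::new();
    for backend in backends {
        match backend(input) {
            Ok(v) => return Ok(v),
            Err(e) => errors.push(format!("backend failed: {}", e)),
        }
    }
    Err(format!("all backends failed: {}", errors.join("; ")))
}

fn main() {
    let backends: Vec<Box<dyn Fn(&[u8]) -> Result<usize, String>>> = vec![
        Box::new(|_| Err("wrong master key".to_owned())),
        Box::new(|data| Ok(data.len())),
    ];
    // The second backend succeeds, so the combined error path is not taken.
    assert_eq!(try_each(&backends, b"payload"), Ok(7));
}
```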
+ read_guard.configs.is_some() && read_guard.backends.is_some() + } +} + +fn create_master_key_backends( + master_keys_config: &Vec, + create_backend_fn: F, +) -> Result>> +where + F: Fn(&MasterKeyConfig) -> Result>, +{ + let mut backends = Vec::new(); + for master_key_config in master_keys_config { + backends.push(create_backend_fn(master_key_config)?); + } + Ok(backends) +} + +fn log_and_error(err_msg: &str) -> Error { + error!("{}", err_msg); + Error::Other(box_err!(err_msg)) +} + #[cfg(test)] pub mod tests { use std::{collections::HashMap, sync::Mutex}; @@ -154,4 +322,145 @@ pub mod tests { true } } + + #[derive(Clone)] + struct MockAsyncBackend { + encrypt_result: Arc Result + Send + Sync>, + decrypt_result: Arc Result> + Send + Sync>, + } + + #[async_trait] + impl AsyncBackend for MockAsyncBackend { + async fn encrypt_async(&self, _plaintext: &[u8]) -> Result { + (self.encrypt_result)() + } + + async fn decrypt_async(&self, _ciphertext: &EncryptedContent) -> Result> { + (self.decrypt_result)() + } + } + + impl std::fmt::Debug for MockAsyncBackend { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("MockAsyncBackend").finish() + } + } + + fn create_mock_backend(encrypt_result: E, decrypt_result: D) -> Box + where + E: Fn() -> Result + Send + Sync + 'static, + D: Fn() -> Result> + Send + Sync + 'static, + { + Box::new(MockAsyncBackend { + encrypt_result: Arc::new(encrypt_result), + decrypt_result: Arc::new(decrypt_result), + }) + } + + // In your tests: + #[tokio::test] + async fn test_multi_master_key_backend_encrypt_decrypt_failure() { + let backend = MultiMasterKeyBackend::new(); + + let configs = vec![MasterKeyConfig::File { + config: FileConfig { + path: "test".to_string(), + }, + }]; + backend + .update_from_config_if_needed(configs, |_| { + Ok(create_mock_backend( + || { + Err(Error::Other(Box::new(std::io::Error::new( + ErrorKind::Other, + "Encrypt error", + )))) + }, + || { + Err(Error::Other(Box::new(std::io::Error::new( + ErrorKind::Other, + "Decrypt error", + )))) + }, + )) + }) + .await + .unwrap(); + + let encrypt_result = backend.encrypt(&[4, 5, 6]).await; + assert!(encrypt_result.is_err(), "Encryption should have failed"); + if let Err(e) = encrypt_result { + assert!( + e.to_string().contains("Encrypt error"), + "Unexpected error message: {}", + e + ); + } + + let decrypt_result = backend.decrypt(&EncryptedContent::default()).await; + assert!(decrypt_result.is_err(), "Decryption should have failed"); + if let Err(e) = decrypt_result { + assert!( + e.to_string().contains("Decrypt error"), + "Unexpected error message: {}", + e + ); + } + } + + #[tokio::test] + async fn test_multi_master_key_backend_inner_with_multiple_backends() { + let mut inner = MultiMasterKeyBackendInner::default(); + + let configs = vec![ + MasterKeyConfig::File { + config: FileConfig { + path: "test1".to_string(), + }, + }, + MasterKeyConfig::File { + config: FileConfig { + path: "test2".to_string(), + }, + }, + MasterKeyConfig::File { + config: FileConfig { + path: "test3".to_string(), + }, + }, + ]; + let backends: Vec> = vec![ + create_mock_backend( + || { + Err(Error::Other(Box::new(std::io::Error::new( + ErrorKind::Other, + "Encrypt error 1", + )))) + }, + || Ok(vec![1, 2, 3]), + ), + create_mock_backend( + || Ok(EncryptedContent::default()), + || { + Err(Error::Other(Box::new(std::io::Error::new( + ErrorKind::Other, + "Decrypt error 2", + )))) + }, + ), + create_mock_backend(|| Ok(EncryptedContent::default()), || Ok(vec![7, 8, 9])), + ]; + + inner.configs 
= Some(configs); + inner.backends = Some(backends); + + let backend = MultiMasterKeyBackend { + inner: Arc::new(RwLock::new(inner)), + }; + + backend.encrypt(&[10, 11, 12]).await.unwrap(); + + let decrypt_result = backend.decrypt(&EncryptedContent::default()).await.unwrap(); + assert_eq!(decrypt_result, vec![1, 2, 3]); + } } diff --git a/components/encryption/src/test_utils.rs b/components/encryption/src/test_utils.rs new file mode 100644 index 00000000000..723f96cc176 --- /dev/null +++ b/components/encryption/src/test_utils.rs @@ -0,0 +1,18 @@ +// Copyright 2024 TiKV Project Authors. Licensed under Apache-2.0. + +use std::{fs::File, io::Write, path::PathBuf}; + +use rand::Rng; +use tempfile::TempDir; +pub fn create_master_key_file_test_only(val: &str) -> (PathBuf, TempDir) { + let tmp_dir = TempDir::new().unwrap(); + let path = tmp_dir.path().join("master_key"); + let mut file = File::create(path.clone()).unwrap(); + file.write_all(format!("{}\n", val).as_bytes()).unwrap(); + (path, tmp_dir) +} + +pub fn generate_random_master_key() -> String { + let master_key: [u8; 32] = rand::thread_rng().gen(); + hex::encode(master_key) +} diff --git a/components/engine_panic/Cargo.toml b/components/engine_panic/Cargo.toml index d9e3f3a0121..3ee5d82ad14 100644 --- a/components/engine_panic/Cargo.toml +++ b/components/engine_panic/Cargo.toml @@ -10,12 +10,9 @@ license = "Apache-2.0" testexport = [] [dependencies] -encryption = { workspace = true } engine_traits = { workspace = true } kvproto = { workspace = true } +encryption = { workspace = true } raft = { workspace = true } -tikv_alloc = { workspace = true } -# FIXME: Remove this dep from the engine_traits interface -tikv_util = { workspace = true } tracker = { workspace = true } txn_types = { workspace = true } diff --git a/components/engine_panic/src/engine.rs b/components/engine_panic/src/engine.rs index 60038e3fcf8..950165b7bb2 100644 --- a/components/engine_panic/src/engine.rs +++ b/components/engine_panic/src/engine.rs @@ -2,7 +2,7 @@ use engine_traits::{ IterMetricsCollector, IterOptions, Iterable, Iterator, KvEngine, MetricsExt, Peekable, - ReadOptions, Result, SnapshotContext, SyncMutable, WriteOptions, + ReadOptions, Result, SyncMutable, WriteOptions, }; use crate::{db_vector::PanicDbVector, snapshot::PanicSnapshot, write_batch::PanicWriteBatch}; @@ -13,7 +13,7 @@ pub struct PanicEngine; impl KvEngine for PanicEngine { type Snapshot = PanicSnapshot; - fn snapshot(&self, _: Option) -> Self::Snapshot { + fn snapshot(&self) -> Self::Snapshot { panic!() } fn sync(&self) -> Result<()> { diff --git a/components/engine_panic/src/lib.rs b/components/engine_panic/src/lib.rs index 0b0c107f873..93555f5ba5f 100644 --- a/components/engine_panic/src/lib.rs +++ b/components/engine_panic/src/lib.rs @@ -46,6 +46,5 @@ pub use crate::flow_control_factors::*; pub mod table_properties; pub use crate::table_properties::*; pub mod checkpoint; -pub mod range_cache_engine; mod raft_engine; diff --git a/components/engine_panic/src/range_cache_engine.rs b/components/engine_panic/src/range_cache_engine.rs deleted file mode 100644 index 5ef60d9e65d..00000000000 --- a/components/engine_panic/src/range_cache_engine.rs +++ /dev/null @@ -1,15 +0,0 @@ -// Copyright 2024 TiKV Project Authors. Licensed under Apache-2.0. 
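The new `test_utils` module above generates throwaway master keys for tests. A standalone sketch of the same helper (assumes the `rand` 0.8 and `hex` crates): 32 random bytes hex-encode to the 64-character form the key-file tests expect.

```rust
use rand::Rng;

// Same shape as test_utils::generate_random_master_key above: 32 random
// bytes, hex-encoded, matching the 64-hex-char key files used in the tests.
fn generate_random_master_key() -> String {
    let master_key: [u8; 32] = rand::thread_rng().gen();
    hex::encode(master_key)
}

fn main() {
    let key = generate_random_master_key();
    assert_eq!(key.len(), 64);
    println!("{}", key);
}
```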
- -use engine_traits::RangeCacheEngineExt; - -use crate::PanicEngine; - -impl RangeCacheEngineExt for PanicEngine { - fn range_cache_engine_enabled(&self) -> bool { - panic!() - } - - fn evict_range(&self, range: &engine_traits::CacheRange) { - panic!() - } -} diff --git a/components/engine_panic/src/sst.rs b/components/engine_panic/src/sst.rs index 59c23e67636..ccdf4fef270 100644 --- a/components/engine_panic/src/sst.rs +++ b/components/engine_panic/src/sst.rs @@ -4,8 +4,8 @@ use std::{marker::PhantomData, path::PathBuf, sync::Arc}; use ::encryption::DataKeyManager; use engine_traits::{ - CfName, ExternalSstFileInfo, IterOptions, Iterable, Iterator, RefIterable, Result, - SstCompressionType, SstExt, SstReader, SstWriter, SstWriterBuilder, + CfName, ExternalSstFileInfo, ExternalSstFileReader, IterOptions, Iterable, Iterator, + RefIterable, Result, SstCompressionType, SstExt, SstReader, SstWriter, SstWriterBuilder, }; use crate::engine::PanicEngine; @@ -155,6 +155,12 @@ impl ExternalSstFileInfo for PanicExternalSstFileInfo { pub struct PanicExternalSstFileReader; +impl ExternalSstFileReader for PanicExternalSstFileReader { + fn reset(&mut self) -> Result<()> { + panic!() + } +} + impl std::io::Read for PanicExternalSstFileReader { fn read(&mut self, buf: &mut [u8]) -> std::io::Result { panic!() diff --git a/components/engine_rocks/Cargo.toml b/components/engine_rocks/Cargo.toml index c1b6aef7374..d3893da020a 100644 --- a/components/engine_rocks/Cargo.toml +++ b/components/engine_rocks/Cargo.toml @@ -45,7 +45,6 @@ protobuf = "2" raft = { workspace = true } regex = "1" serde = "1.0" -serde_derive = "1.0" slog = { workspace = true } slog-global = { workspace = true } slog_derive = "0.2" @@ -62,6 +61,6 @@ package = "rocksdb" features = ["encryption"] [dev-dependencies] -proptest = "1.0.0" rand = "0.8" toml = "0.5" +proptest = "1.0.0" diff --git a/components/engine_rocks/src/engine.rs b/components/engine_rocks/src/engine.rs index 19f258f659b..e393f58721c 100644 --- a/components/engine_rocks/src/engine.rs +++ b/components/engine_rocks/src/engine.rs @@ -2,9 +2,7 @@ use std::{any::Any, sync::Arc}; -use engine_traits::{ - IterOptions, Iterable, KvEngine, Peekable, ReadOptions, Result, SnapshotContext, SyncMutable, -}; +use engine_traits::{IterOptions, Iterable, KvEngine, Peekable, ReadOptions, Result, SyncMutable}; use rocksdb::{DBIterator, Writable, DB}; use crate::{ @@ -184,7 +182,7 @@ impl RocksEngine { impl KvEngine for RocksEngine { type Snapshot = RocksSnapshot; - fn snapshot(&self, _: Option) -> RocksSnapshot { + fn snapshot(&self) -> RocksSnapshot { RocksSnapshot::new(self.db.clone()) } @@ -297,7 +295,7 @@ mod tests { engine.put_msg(key, &r).unwrap(); engine.put_msg_cf(cf, key, &r).unwrap(); - let snap = engine.snapshot(None); + let snap = engine.snapshot(); let mut r1: Region = engine.get_msg(key).unwrap().unwrap(); assert_eq!(r, r1); diff --git a/components/engine_rocks/src/engine_iterator.rs b/components/engine_rocks/src/engine_iterator.rs index 468f648bfdb..5db3e21a03d 100644 --- a/components/engine_rocks/src/engine_iterator.rs +++ b/components/engine_rocks/src/engine_iterator.rs @@ -15,10 +15,6 @@ impl RocksEngineIterator { pub fn from_raw(iter: DBIterator>) -> RocksEngineIterator { RocksEngineIterator(iter) } - - pub fn sequence(&self) -> Option { - self.0.sequence() - } } pub struct RocksIterMetricsCollector; diff --git a/components/engine_rocks/src/lib.rs b/components/engine_rocks/src/lib.rs index 86afd386c7c..28c7c97d0a8 100644 --- a/components/engine_rocks/src/lib.rs +++ 
b/components/engine_rocks/src/lib.rs @@ -46,7 +46,6 @@ mod logger; pub use crate::logger::*; mod misc; pub use crate::misc::*; -pub mod range_cache_engine; pub mod range_properties; mod snapshot; pub use crate::snapshot::*; diff --git a/components/engine_rocks/src/misc.rs b/components/engine_rocks/src/misc.rs index 419278be6d7..b50002b61a5 100644 --- a/components/engine_rocks/src/misc.rs +++ b/components/engine_rocks/src/misc.rs @@ -747,6 +747,7 @@ mod tests { .unwrap_or_else(|err| panic!("{:?}", err)); // Create prefix bloom filter for memtable. cf_opts.set_memtable_prefix_bloom_size_ratio(0.1_f64); + cf_opts.set_level_compaction_dynamic_level_bytes(false); let cf = "default"; let db = new_engine_opt(path_str, opts, vec![(cf, cf_opts)]).unwrap(); let mut wb = db.write_batch(); diff --git a/components/engine_rocks/src/properties.rs b/components/engine_rocks/src/properties.rs index b9032e53f8f..01ca0447a06 100644 --- a/components/engine_rocks/src/properties.rs +++ b/components/engine_rocks/src/properties.rs @@ -567,6 +567,7 @@ pub fn get_range_stats( num_entries, num_versions: props.num_versions, num_rows: props.num_rows, + num_deletes: props.num_deletes, }) } diff --git a/components/engine_rocks/src/range_cache_engine.rs b/components/engine_rocks/src/range_cache_engine.rs deleted file mode 100644 index 1275b00a6fc..00000000000 --- a/components/engine_rocks/src/range_cache_engine.rs +++ /dev/null @@ -1,14 +0,0 @@ -// Copyright 2024 TiKV Project Authors. Licensed under Apache-2.0. - -use engine_traits::RangeCacheEngineExt; - -use crate::RocksEngine; - -impl RangeCacheEngineExt for RocksEngine { - fn range_cache_engine_enabled(&self) -> bool { - false - } - - #[inline] - fn evict_range(&self, _: &engine_traits::CacheRange) {} -} diff --git a/components/engine_rocks/src/rocks_metrics.rs b/components/engine_rocks/src/rocks_metrics.rs index 1e7f55ffe63..438b108bb85 100644 --- a/components/engine_rocks/src/rocks_metrics.rs +++ b/components/engine_rocks/src/rocks_metrics.rs @@ -162,12 +162,6 @@ pub fn flush_engine_ticker_metrics(t: TickerType, value: u64, name: &str) { .block_cache_index_bytes_insert .inc_by(value); } - TickerType::BlockCacheIndexBytesEvict => { - STORE_ENGINE_CACHE_EFFICIENCY - .get(name_enum) - .block_cache_index_bytes_evict - .inc_by(value); - } TickerType::BlockCacheFilterMiss => { STORE_ENGINE_CACHE_EFFICIENCY .get(name_enum) @@ -192,12 +186,6 @@ pub fn flush_engine_ticker_metrics(t: TickerType, value: u64, name: &str) { .block_cache_filter_bytes_insert .inc_by(value); } - TickerType::BlockCacheFilterBytesEvict => { - STORE_ENGINE_CACHE_EFFICIENCY - .get(name_enum) - .block_cache_filter_bytes_evict - .inc_by(value); - } TickerType::BlockCacheDataMiss => { STORE_ENGINE_CACHE_EFFICIENCY .get(name_enum) @@ -357,12 +345,6 @@ pub fn flush_engine_ticker_metrics(t: TickerType, value: u64, name: &str) { .iter_bytes_read .inc_by(value); } - TickerType::NoFileCloses => { - STORE_ENGINE_FILE_STATUS - .get(name_enum) - .no_file_closes - .inc_by(value); - } TickerType::NoFileOpens => { STORE_ENGINE_FILE_STATUS .get(name_enum) @@ -411,12 +393,6 @@ pub fn flush_engine_ticker_metrics(t: TickerType, value: u64, name: &str) { .write_done_by_other .inc_by(value); } - TickerType::WriteTimedout => { - STORE_ENGINE_WRITE_SERVED - .get(name_enum) - .write_timeout - .inc_by(value); - } TickerType::WriteWithWal => { STORE_ENGINE_WRITE_SERVED .get(name_enum) @@ -677,46 +653,6 @@ pub fn flush_engine_histogram_metrics(t: HistType, value: HistogramData, name: & value ); } - HistType::StallL0SlowdownCount 
=> { - engine_histogram_metrics!( - STORE_ENGINE_STALL_L0_SLOWDOWN_COUNT_VEC, - "stall_l0_slowdown_count", - name, - value - ); - } - HistType::StallMemtableCompactionCount => { - engine_histogram_metrics!( - STORE_ENGINE_STALL_MEMTABLE_COMPACTION_COUNT_VEC, - "stall_memtable_compaction_count", - name, - value - ); - } - HistType::StallL0NumFilesCount => { - engine_histogram_metrics!( - STORE_ENGINE_STALL_L0_NUM_FILES_COUNT_VEC, - "stall_l0_num_files_count", - name, - value - ); - } - HistType::HardRateLimitDelayCount => { - engine_histogram_metrics!( - STORE_ENGINE_HARD_RATE_LIMIT_DELAY_VEC, - "hard_rate_limit_delay", - name, - value - ); - } - HistType::SoftRateLimitDelayCount => { - engine_histogram_metrics!( - STORE_ENGINE_SOFT_RATE_LIMIT_DELAY_VEC, - "soft_rate_limit_delay", - name, - value - ); - } HistType::NumFilesInSingleCompaction => { engine_histogram_metrics!( STORE_ENGINE_NUM_FILES_IN_SINGLE_COMPACTION_VEC, @@ -1588,31 +1524,6 @@ lazy_static! { "Histogram of WAL file sync micros", &["db", "type"] ).unwrap(); - pub static ref STORE_ENGINE_STALL_L0_SLOWDOWN_COUNT_VEC: GaugeVec = register_gauge_vec!( - "tikv_engine_stall_l0_slowdown_count", - "Histogram of stall l0 slowdown count", - &["db", "type"] - ).unwrap(); - pub static ref STORE_ENGINE_STALL_MEMTABLE_COMPACTION_COUNT_VEC: GaugeVec = register_gauge_vec!( - "tikv_engine_stall_memtable_compaction_count", - "Histogram of stall memtable compaction count", - &["db", "type"] - ).unwrap(); - pub static ref STORE_ENGINE_STALL_L0_NUM_FILES_COUNT_VEC: GaugeVec = register_gauge_vec!( - "tikv_engine_stall_l0_num_files_count", - "Histogram of stall l0 num files count", - &["db", "type"] - ).unwrap(); - pub static ref STORE_ENGINE_HARD_RATE_LIMIT_DELAY_VEC: GaugeVec = register_gauge_vec!( - "tikv_engine_hard_rate_limit_delay_count", - "Histogram of hard rate limit delay count", - &["db", "type"] - ).unwrap(); - pub static ref STORE_ENGINE_SOFT_RATE_LIMIT_DELAY_VEC: GaugeVec = register_gauge_vec!( - "tikv_engine_soft_rate_limit_delay_count", - "Histogram of soft rate limit delay count", - &["db", "type"] - ).unwrap(); pub static ref STORE_ENGINE_NUM_FILES_IN_SINGLE_COMPACTION_VEC: GaugeVec = register_gauge_vec!( "tikv_engine_num_files_in_single_compaction", "Histogram of number of files in single compaction", diff --git a/components/engine_rocks/src/rocks_metrics_defs.rs b/components/engine_rocks/src/rocks_metrics_defs.rs index 5bbc6245c72..2b70ff985c5 100644 --- a/components/engine_rocks/src/rocks_metrics_defs.rs +++ b/components/engine_rocks/src/rocks_metrics_defs.rs @@ -64,12 +64,10 @@ pub const ENGINE_TICKER_TYPES: &[TickerType] = &[ TickerType::BlockCacheIndexHit, TickerType::BlockCacheIndexAdd, TickerType::BlockCacheIndexBytesInsert, - TickerType::BlockCacheIndexBytesEvict, TickerType::BlockCacheFilterMiss, TickerType::BlockCacheFilterHit, TickerType::BlockCacheFilterAdd, TickerType::BlockCacheFilterBytesInsert, - TickerType::BlockCacheFilterBytesEvict, TickerType::BlockCacheDataMiss, TickerType::BlockCacheDataHit, TickerType::BlockCacheDataAdd, @@ -97,7 +95,6 @@ pub const ENGINE_TICKER_TYPES: &[TickerType] = &[ TickerType::NumberDbNextFound, TickerType::NumberDbPrevFound, TickerType::IterBytesRead, - TickerType::NoFileCloses, TickerType::NoFileOpens, TickerType::NoFileErrors, TickerType::StallMicros, @@ -107,7 +104,6 @@ pub const ENGINE_TICKER_TYPES: &[TickerType] = &[ TickerType::WalFileBytes, TickerType::WriteDoneBySelf, TickerType::WriteDoneByOther, - TickerType::WriteTimedout, TickerType::WriteWithWal, 
TickerType::CompactReadBytes, TickerType::CompactWriteBytes, @@ -157,11 +153,6 @@ pub const ENGINE_HIST_TYPES: &[HistType] = &[ HistType::CompactionOutfileSyncMicros, HistType::WalFileSyncMicros, HistType::ManifestFileSyncMicros, - HistType::StallL0SlowdownCount, - HistType::StallMemtableCompactionCount, - HistType::StallL0NumFilesCount, - HistType::HardRateLimitDelayCount, - HistType::SoftRateLimitDelayCount, HistType::NumFilesInSingleCompaction, HistType::DbSeek, HistType::WriteStall, diff --git a/components/engine_rocks/src/sst.rs b/components/engine_rocks/src/sst.rs index 1030b7aa17f..d72456cc572 100644 --- a/components/engine_rocks/src/sst.rs +++ b/components/engine_rocks/src/sst.rs @@ -4,8 +4,8 @@ use std::{path::PathBuf, sync::Arc}; use ::encryption::DataKeyManager; use engine_traits::{ - Error, ExternalSstFileInfo, IterOptions, Iterator, RefIterable, Result, SstCompressionType, - SstExt, SstReader, SstWriter, SstWriterBuilder, CF_DEFAULT, + Error, ExternalSstFileInfo, ExternalSstFileReader, IterOptions, Iterator, RefIterable, Result, + SstCompressionType, SstExt, SstReader, SstWriter, SstWriterBuilder, CF_DEFAULT, }; use fail::fail_point; use file_system::get_io_rate_limiter; @@ -243,9 +243,31 @@ pub struct RocksSstWriter { env: Option<Arc<Env>>, } +pub struct ResettableSequentialFile { + env: Arc<Env>, + path: String, + state: SequentialFile, +} + +impl std::io::Read for ResettableSequentialFile { + fn read(&mut self, buf: &mut [u8]) -> std::io::Result<usize> { + self.state.read(buf) + } +} + +impl ExternalSstFileReader for ResettableSequentialFile { + fn reset(&mut self) -> Result<()> { + self.state = self + .env + .new_sequential_file(&self.path, EnvOptions::new()) + .map_err(r2e)?; + Ok(()) + } +} + impl SstWriter for RocksSstWriter { type ExternalSstFileInfo = RocksExternalSstFileInfo; - type ExternalSstFileReader = SequentialFile; + type ExternalSstFileReader = ResettableSequentialFile; fn put(&mut self, key: &[u8], val: &[u8]) -> Result<()> { self.writer.put(key, val).map_err(r2e) @@ -279,7 +301,12 @@ impl SstWriter for RocksSstWriter { let seq_file = env .new_sequential_file(path, EnvOptions::new()) .map_err(r2e)?; - Ok((RocksExternalSstFileInfo(sst_info), seq_file)) + let reset_file = ResettableSequentialFile { + env, + path: path.to_owned(), + state: seq_file, + }; + Ok((RocksExternalSstFileInfo(sst_info), reset_file)) } } @@ -401,5 +428,10 @@ mod tests { assert_eq!(buf.len() as u64, sst_file.file_size()); // There must not be a file in disk.
std::fs::metadata(p).unwrap_err(); + + let mut buf2 = vec![]; + reader.reset().unwrap(); + reader.read_to_end(&mut buf2).unwrap(); + assert_eq!(buf, buf2); } } diff --git a/components/engine_rocks/src/util.rs b/components/engine_rocks/src/util.rs index e4991419eed..e51e1178854 100644 --- a/components/engine_rocks/src/util.rs +++ b/components/engine_rocks/src/util.rs @@ -441,17 +441,16 @@ impl CompactionFilter for StackingComp &mut self, level: usize, key: &[u8], - seqno: u64, value: &[u8], value_type: CompactionFilterValueType, ) -> CompactionFilterDecision { if let Some(outer) = self.outer.as_mut() - && let r = outer.unsafe_filter(level, key, seqno, value, value_type) + && let r = outer.unsafe_filter(level, key, value, value_type) && !matches!(r, CompactionFilterDecision::Keep) { r } else if let Some(inner) = self.inner.as_mut() { - inner.unsafe_filter(level, key, seqno, value, value_type) + inner.unsafe_filter(level, key, value, value_type) } else { CompactionFilterDecision::Keep } @@ -501,7 +500,6 @@ impl CompactionFilter for RangeCompactionFilter { &mut self, _level: usize, key: &[u8], - _seqno: u64, _value: &[u8], _value_type: CompactionFilterValueType, ) -> CompactionFilterDecision { @@ -553,43 +551,52 @@ mod tests { // create db when db not exist let mut cfs_opts = vec![(CF_DEFAULT, RocksCfOptions::default())]; let mut opts = RocksCfOptions::default(); - opts.set_level_compaction_dynamic_level_bytes(true); - cfs_opts.push(("cf_dynamic_level_bytes", opts.clone())); + opts.set_level_compaction_dynamic_level_bytes(false); + cfs_opts.push(("cf_dynamic_level_bytes_disabled", opts.clone())); let db = new_engine_opt(path_str, RocksDbOptions::default(), cfs_opts).unwrap(); - column_families_must_eq(path_str, vec![CF_DEFAULT, "cf_dynamic_level_bytes"]); + column_families_must_eq( + path_str, + vec![CF_DEFAULT, "cf_dynamic_level_bytes_disabled"], + ); check_dynamic_level_bytes(&db); drop(db); // add cf1. let cfs_opts = vec![ (CF_DEFAULT, opts.clone()), - ("cf_dynamic_level_bytes", opts.clone()), + ("cf_dynamic_level_bytes_disabled", opts.clone()), ("cf1", opts.clone()), ]; let db = new_engine_opt(path_str, RocksDbOptions::default(), cfs_opts).unwrap(); - column_families_must_eq(path_str, vec![CF_DEFAULT, "cf_dynamic_level_bytes", "cf1"]); + column_families_must_eq( + path_str, + vec![CF_DEFAULT, "cf_dynamic_level_bytes_disabled", "cf1"], + ); check_dynamic_level_bytes(&db); - for cf in &[CF_DEFAULT, "cf_dynamic_level_bytes", "cf1"] { + for cf in &[CF_DEFAULT, "cf_dynamic_level_bytes_disabled", "cf1"] { db.put_cf(cf, b"k", b"v").unwrap(); } drop(db); // change order should not cause data corruption. let cfs_opts = vec![ - ("cf_dynamic_level_bytes", opts.clone()), + ("cf_dynamic_level_bytes_disabled", opts.clone()), ("cf1", opts.clone()), (CF_DEFAULT, opts), ]; let db = new_engine_opt(path_str, RocksDbOptions::default(), cfs_opts).unwrap(); - column_families_must_eq(path_str, vec![CF_DEFAULT, "cf_dynamic_level_bytes", "cf1"]); + column_families_must_eq( + path_str, + vec![CF_DEFAULT, "cf_dynamic_level_bytes_disabled", "cf1"], + ); check_dynamic_level_bytes(&db); - for cf in &[CF_DEFAULT, "cf_dynamic_level_bytes", "cf1"] { + for cf in &[CF_DEFAULT, "cf_dynamic_level_bytes_disabled", "cf1"] { assert_eq!(db.get_value_cf(cf, b"k").unwrap().unwrap(), b"v"); } drop(db); // drop cf1. 
- let cfs = vec![CF_DEFAULT, "cf_dynamic_level_bytes"]; + let cfs = vec![CF_DEFAULT, "cf_dynamic_level_bytes_disabled"]; let db = new_engine(path_str, &cfs).unwrap(); column_families_must_eq(path_str, cfs); check_dynamic_level_bytes(&db); @@ -617,9 +624,11 @@ mod tests { fn check_dynamic_level_bytes(db: &RocksEngine) { let tmp_cf_opts = db.get_options_cf(CF_DEFAULT).unwrap(); - assert!(!tmp_cf_opts.get_level_compaction_dynamic_level_bytes()); - let tmp_cf_opts = db.get_options_cf("cf_dynamic_level_bytes").unwrap(); assert!(tmp_cf_opts.get_level_compaction_dynamic_level_bytes()); + let tmp_cf_opts = db + .get_options_cf("cf_dynamic_level_bytes_disabled") + .unwrap(); + assert!(!tmp_cf_opts.get_level_compaction_dynamic_level_bytes()); } #[test] diff --git a/components/engine_rocks_helper/Cargo.toml b/components/engine_rocks_helper/Cargo.toml index 31355157a1a..d95d1b3d34f 100644 --- a/components/engine_rocks_helper/Cargo.toml +++ b/components/engine_rocks_helper/Cargo.toml @@ -12,18 +12,14 @@ failpoints = ["fail/failpoints"] engine_rocks = { workspace = true } engine_traits = { workspace = true } fail = "0.5" -futures = "0.3" keys = { workspace = true } lazy_static = "1.4.0" -pd_client = { workspace = true } prometheus = { version = "0.13", features = ["nightly"] } -protobuf = "2.8" raftstore = { workspace = true } slog = { workspace = true } slog-global = { workspace = true } tikv_util = { workspace = true } [dev-dependencies] -engine_test = { workspace = true } kvproto = { workspace = true } tempfile = "3.0" diff --git a/components/engine_test/Cargo.toml b/components/engine_test/Cargo.toml index 3ac42ba73ef..f5bb800078b 100644 --- a/components/engine_test/Cargo.toml +++ b/components/engine_test/Cargo.toml @@ -25,7 +25,6 @@ test-engines-panic = [ ] [dependencies] -collections = { workspace = true } encryption = { workspace = true } engine_panic = { workspace = true } engine_rocks = { workspace = true } @@ -33,6 +32,3 @@ engine_traits = { workspace = true } file_system = { workspace = true } raft_log_engine = { workspace = true } tempfile = "3.0" -tikv_alloc = { workspace = true } -# FIXME: Remove this dep from the engine_traits interface -tikv_util = { workspace = true } diff --git a/components/engine_traits/Cargo.toml b/components/engine_traits/Cargo.toml index b31ff4ab03f..2e86822ceac 100644 --- a/components/engine_traits/Cargo.toml +++ b/components/engine_traits/Cargo.toml @@ -11,25 +11,22 @@ testexport = [] [dependencies] collections = { workspace = true } -encryption = { workspace = true } error_code = { workspace = true } fail = "0.5" file_system = { workspace = true } keys = { workspace = true } kvproto = { workspace = true } -lazy_static = "1.0" log_wrappers = { workspace = true } protobuf = "2" raft = { workspace = true } +encryption = { workspace = true } serde = "1.0" slog = { workspace = true } slog-global = { workspace = true } thiserror = "1.0" -tikv_alloc = { workspace = true } tikv_util = { workspace = true } tracker = { workspace = true } txn_types = { workspace = true } [dev-dependencies] -serde_derive = "1.0" toml = "0.5" diff --git a/components/engine_traits/src/engine.rs b/components/engine_traits/src/engine.rs index 7e20201e231..cc90f2ce075 100644 --- a/components/engine_traits/src/engine.rs +++ b/components/engine_traits/src/engine.rs @@ -33,17 +33,13 @@ pub trait KvEngine: + Debug + Unpin + Checkpointable - + RangeCacheEngineExt + 'static { /// A consistent read-only snapshot of the database type Snapshot: Snapshot; /// Create a snapshot - /// - /// SnapCtx will only 
be used by some type of trait implementors (ex: - /// HybridEngine) - fn snapshot(&self, snap_ctx: Option<SnapshotContext>) -> Self::Snapshot; + fn snapshot(&self) -> Self::Snapshot; /// Syncs any writes to disk fn sync(&self) -> Result<()>; @@ -82,16 +78,3 @@ pub trait KvEngine: #[cfg(feature = "testexport")] fn inner_refcount(&self) -> usize; } - -#[derive(Debug, Clone)] -pub struct SnapshotContext { - pub range: Option<CacheRange>, - pub read_ts: u64, -} - -impl SnapshotContext { - pub fn set_range(&mut self, range: CacheRange) { - assert!(self.range.is_none()); - self.range = Some(range); - } -} diff --git a/components/engine_traits/src/errors.rs b/components/engine_traits/src/errors.rs index 574a950dd59..6df2ef5a992 100644 --- a/components/engine_traits/src/errors.rs +++ b/components/engine_traits/src/errors.rs @@ -149,7 +149,7 @@ pub enum Error { EntriesUnavailable, #[error("The entries of region is compacted")] EntriesCompacted, - #[error("Iterator of RangeCacheSnapshot is only supported with boundary set")] + #[error("Iterator of RegionCacheSnapshot is only supported with boundary set")] BoundaryNotSet, } diff --git a/components/engine_traits/src/flush.rs b/components/engine_traits/src/flush.rs index 46b1877a703..2a98c69bc8b 100644 --- a/components/engine_traits/src/flush.rs +++ b/components/engine_traits/src/flush.rs @@ -217,7 +217,7 @@ impl PersistenceListener { }) })(); // The correctness relies on the assumption that there will be only one - // thread writting to the DB and increasing apply index. + // thread writing to the DB and increasing apply index. // Apply index will be set within DB lock, so it's correct even with manual // flush. let offset = data_cf_offset(&cf); diff --git a/components/engine_traits/src/lib.rs b/components/engine_traits/src/lib.rs index 1fce6e169df..0c3131f92f3 100644 --- a/components/engine_traits/src/lib.rs +++ b/components/engine_traits/src/lib.rs @@ -311,8 +311,8 @@ mod table_properties; pub use crate::table_properties::*; mod checkpoint; pub use crate::checkpoint::*; -mod range_cache_engine; -pub use crate::range_cache_engine::*; +mod region_cache_engine; +pub use crate::region_cache_engine::*; // These modules contain more general traits, some of which may be implemented // by multiple types. diff --git a/components/engine_traits/src/misc.rs b/components/engine_traits/src/misc.rs index fe5a3c0f696..5e1b1347751 100644 --- a/components/engine_traits/src/misc.rs +++ b/components/engine_traits/src/misc.rs @@ -57,12 +57,25 @@ pub trait StatisticsReporter { #[derive(Default)] pub struct RangeStats { - // The number of entries + // The number of entries in write cf. pub num_entries: u64, // The number of MVCC versions of all rows (num_entries - tombstones). pub num_versions: u64, // The number of rows. pub num_rows: u64, + // The number of MVCC deletes of all rows. + pub num_deletes: u64, +} + +impl RangeStats { + /// The number of redundant keys in the range. + /// It's calculated by `num_entries - num_rows + num_deletes`. + pub fn redundant_keys(&self) -> u64 { + // Consider the number of `mvcc_deletes` as the number of redundant keys.
+ self.num_entries + .saturating_sub(self.num_rows) + .saturating_add(self.num_deletes) + } } pub trait MiscExt: CfNamesExt + FlowControlFactorsExt + WriteBatchExt { diff --git a/components/engine_traits/src/range_cache_engine.rs b/components/engine_traits/src/range_cache_engine.rs deleted file mode 100644 index 8b5c10bd354..00000000000 --- a/components/engine_traits/src/range_cache_engine.rs +++ /dev/null @@ -1,257 +0,0 @@ -// Copyright 2023 TiKV Project Authors. Licensed under Apache-2.0. - -use std::{ - cmp, - fmt::{self, Debug}, - result, -}; - -use keys::{enc_end_key, enc_start_key}; -use kvproto::metapb; - -use crate::{Iterable, KvEngine, Snapshot, WriteBatchExt}; - -#[derive(Debug, PartialEq)] -pub enum FailedReason { - NotCached, - TooOldRead, -} - -/// RangeCacheEngine works as a range cache caching some ranges (in Memory or -/// NVME for instance) to improve the read performance. -pub trait RangeCacheEngine: - WriteBatchExt + Iterable + Debug + Clone + Unpin + Send + Sync + 'static -{ - type Snapshot: Snapshot; - - // If None is returned, the RangeCacheEngine is currently not readable for this - // region or read_ts. - // Sequence number is shared between RangeCacheEngine and disk KvEnigne to - // provide atomic write - fn snapshot( - &self, - range: CacheRange, - read_ts: u64, - seq_num: u64, - ) -> result::Result; - - type DiskEngine: KvEngine; - fn set_disk_engine(&mut self, disk_engine: Self::DiskEngine); - - // return the range containing the key - fn get_range_for_key(&self, key: &[u8]) -> Option; - - type RangeHintService: RangeHintService; - fn start_hint_service(&self, range_hint_service: Self::RangeHintService); - - fn enabled(&self) -> bool { - false - } - - fn evict_range(&self, range: &CacheRange); -} - -pub trait RangeCacheEngineExt { - fn range_cache_engine_enabled(&self) -> bool; - - // TODO(SpadeA): try to find a better way to reduce coupling degree of range - // cache engine and kv engine - fn evict_range(&self, range: &CacheRange); -} - -/// A service that should run in the background to retrieve and apply cache -/// hints. -/// -/// TODO (afeinberg): Presently, this is only a marker trait with a single -/// implementation. Methods and/or associated types will be added to this trait -/// as it continues to evolve to handle eviction, using stats. -pub trait RangeHintService: Send + Sync {} - -#[derive(Clone, Eq)] -pub struct CacheRange { - pub start: Vec, - pub end: Vec, - // Note: tag may not be accurate due decouple of region split and range split. It's only for - // debug purpose. 
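// A small worked example (illustrative only) of the `redundant_keys` formula
// above: the body computes num_entries - num_rows + num_deletes with
// saturating arithmetic, so 100 write-cf entries covering 90 rows with 5 MVCC
// deletes yield 15 redundant keys. `RangeStats` and its fields are the ones
// added in this patch; the crate-root re-export is assumed.
#[test]
fn redundant_keys_sketch() {
    let stats = engine_traits::RangeStats {
        num_entries: 100,
        num_versions: 95,
        num_rows: 90,
        num_deletes: 5,
    };
    assert_eq!(stats.redundant_keys(), 15);
}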
- pub tag: String, -} - -impl PartialEq for CacheRange { - fn eq(&self, other: &Self) -> bool { - self.start == other.start && self.end == other.end - } -} - -impl Debug for CacheRange { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> std::fmt::Result { - f.debug_struct("CacheRange") - .field("tag", &self.tag) - .field("range_start", &log_wrappers::Value(&self.start)) - .field("range_end", &log_wrappers::Value(&self.end)) - .finish() - } -} - -impl CacheRange { - pub fn new(start: Vec, end: Vec) -> Self { - Self { - start, - end, - tag: "".to_owned(), - } - } - - pub fn from_region(region: &metapb::Region) -> Self { - Self { - start: enc_start_key(region), - end: enc_end_key(region), - tag: format!("[region_id={}]", region.get_id()), - } - } -} - -impl PartialOrd for CacheRange { - fn partial_cmp(&self, other: &Self) -> Option { - if self.end <= other.start { - return Some(cmp::Ordering::Less); - } - - if other.end <= self.start { - return Some(cmp::Ordering::Greater); - } - - if self == other { - return Some(cmp::Ordering::Equal); - } - - None - } -} - -impl Ord for CacheRange { - fn cmp(&self, other: &Self) -> cmp::Ordering { - let c = self.start.cmp(&other.start); - if !c.is_eq() { - return c; - } - self.end.cmp(&other.end) - } -} - -impl CacheRange { - // todo: need to consider ""? - pub fn contains_range(&self, other: &CacheRange) -> bool { - self.start <= other.start && self.end >= other.end - } - - pub fn contains_key(&self, key: &[u8]) -> bool { - self.start.as_slice() <= key && key < self.end.as_slice() - } - - // Note: overlaps also includes "contains" - pub fn overlaps(&self, other: &CacheRange) -> bool { - self.start < other.end && other.start < self.end - } - - pub fn split_off(&self, range: &CacheRange) -> (Option, Option) { - assert!(self.contains_range(range)); - let left = if self.start != range.start { - Some(CacheRange { - start: self.start.clone(), - end: range.start.clone(), - tag: "".to_owned(), - }) - } else { - None - }; - let right = if self.end != range.end { - Some(CacheRange { - start: range.end.clone(), - end: self.end.clone(), - tag: "".to_owned(), - }) - } else { - None - }; - - (left, right) - } -} - -#[cfg(test)] -mod tests { - use std::cmp::Ordering; - - use crate::CacheRange; - - #[test] - fn test_cache_range_eq() { - let r1 = CacheRange::new(b"k1".to_vec(), b"k2".to_vec()); - let mut r2 = CacheRange::new(b"k1".to_vec(), b"k2".to_vec()); - r2.tag = "Something".to_string(); - assert_eq!(r1, r2); - } - - #[test] - fn test_cache_range_partial_cmp() { - let r1 = CacheRange::new(b"k1".to_vec(), b"k2".to_vec()); - let r2 = CacheRange::new(b"k2".to_vec(), b"k3".to_vec()); - let r3 = CacheRange::new(b"k2".to_vec(), b"k4".to_vec()); - assert_eq!(r1.partial_cmp(&r2).unwrap(), Ordering::Less); - assert_eq!(r2.partial_cmp(&r1).unwrap(), Ordering::Greater); - assert!(r2.partial_cmp(&r3).is_none()); - } - - #[test] - fn test_split_off() { - let r1 = CacheRange::new(b"k1".to_vec(), b"k6".to_vec()); - let r2 = CacheRange::new(b"k2".to_vec(), b"k4".to_vec()); - - let r3 = CacheRange::new(b"k1".to_vec(), b"k2".to_vec()); - let r4 = CacheRange::new(b"k4".to_vec(), b"k6".to_vec()); - - let (left, right) = r1.split_off(&r1); - assert!(left.is_none() && right.is_none()); - let (left, right) = r1.split_off(&r2); - assert_eq!(left.unwrap(), r3); - assert_eq!(right.unwrap(), r4); - } - - #[test] - fn test_overlap() { - let r1 = CacheRange::new(b"k1".to_vec(), b"k6".to_vec()); - let r2 = CacheRange::new(b"k2".to_vec(), b"k4".to_vec()); - assert!(r1.overlaps(&r2)); - 
assert!(r2.overlaps(&r1)); - - let r1 = CacheRange::new(b"k1".to_vec(), b"k6".to_vec()); - let r2 = CacheRange::new(b"k2".to_vec(), b"k7".to_vec()); - assert!(r1.overlaps(&r2)); - assert!(r2.overlaps(&r1)); - - let r1 = CacheRange::new(b"k1".to_vec(), b"k6".to_vec()); - let r2 = CacheRange::new(b"k1".to_vec(), b"k4".to_vec()); - assert!(r1.overlaps(&r2)); - assert!(r2.overlaps(&r1)); - - let r1 = CacheRange::new(b"k1".to_vec(), b"k6".to_vec()); - let r2 = CacheRange::new(b"k2".to_vec(), b"k6".to_vec()); - assert!(r1.overlaps(&r2)); - assert!(r2.overlaps(&r1)); - - let r1 = CacheRange::new(b"k1".to_vec(), b"k6".to_vec()); - let r2 = CacheRange::new(b"k1".to_vec(), b"k6".to_vec()); - assert!(r1.overlaps(&r2)); - assert!(r2.overlaps(&r1)); - - let r1 = CacheRange::new(b"k1".to_vec(), b"k2".to_vec()); - let r2 = CacheRange::new(b"k2".to_vec(), b"k3".to_vec()); - assert!(!r1.overlaps(&r2)); - assert!(!r2.overlaps(&r1)); - - let r1 = CacheRange::new(b"k1".to_vec(), b"k2".to_vec()); - let r2 = CacheRange::new(b"k3".to_vec(), b"k4".to_vec()); - assert!(!r1.overlaps(&r2)); - assert!(!r2.overlaps(&r1)); - } -} diff --git a/components/engine_traits/src/region_cache_engine.rs b/components/engine_traits/src/region_cache_engine.rs new file mode 100644 index 00000000000..757629d3fdd --- /dev/null +++ b/components/engine_traits/src/region_cache_engine.rs @@ -0,0 +1,258 @@ +// Copyright 2023 TiKV Project Authors. Licensed under Apache-2.0. + +use std::{ + fmt::{self, Debug}, + result, +}; + +use keys::{enc_end_key, enc_start_key}; +use kvproto::metapb::Region; + +use crate::{Iterable, KvEngine, Snapshot, WriteBatchExt}; + +#[derive(Debug, PartialEq)] +pub enum FailedReason { + NotCached, + TooOldRead, + // We always get rocksdb's snapshot first and then IME's snapshot, and the + // epoch is first checked by raftstore when getting rocksdb's snapshot. + // But because we update IME's epoch in the apply batch, and only update the + // raft local reader's epoch after ApplyRes is returned, IME's region epoch + // may be newer than raftstore's, so we still need to check the epoch again + // in the IME snapshot. + EpochNotMatch, +} + +#[derive(Debug, PartialEq)] +pub enum RegionEvent { + Split { + source: CacheRegion, + new_regions: Vec<CacheRegion>, + }, + TryLoad { + region: CacheRegion, + for_manual_range: bool, + }, + Eviction { + region: CacheRegion, + reason: EvictReason, + }, + // Range eviction triggered by delete_range; we should evict all cached + // regions that overlap with this range. + EvictByRange { + range: CacheRegion, + reason: EvictReason, + }, +} + +#[derive(Debug, Clone, Copy, PartialEq)] +pub enum EvictReason { + LoadFailed, + LoadFailedWithoutStart, + MemoryLimitReached, + BecomeFollower, + AutoEvict, + DeleteRange, + Merge, + Disabled, + ApplySnapshot, + Flashback, + Manual, + PeerDestroy, +} + +/// RegionCacheEngine works as a region cache, caching some regions (in memory +/// or on NVMe, for instance) to improve read performance. +pub trait RegionCacheEngine: + RegionCacheEngineExt + WriteBatchExt + Iterable + Debug + Clone + Unpin + Send + Sync + 'static +{ + type Snapshot: Snapshot; + + // If a FailedReason is returned, the RegionCacheEngine is currently not readable for this + // region or read_ts.
+ // The sequence number is shared between the RegionCacheEngine and the disk + // KvEngine to provide atomic writes. + fn snapshot( + &self, + region: CacheRegion, + read_ts: u64, + seq_num: u64, + ) -> result::Result<Self::Snapshot, FailedReason>; + + type DiskEngine: KvEngine; + fn set_disk_engine(&mut self, disk_engine: Self::DiskEngine); + + // Return the region containing the key. + fn get_region_for_key(&self, key: &[u8]) -> Option<CacheRegion>; + + type RangeHintService: RangeHintService; + fn start_hint_service(&self, range_hint_service: Self::RangeHintService); + + fn enabled(&self) -> bool { + false + } +} + +pub trait RegionCacheEngineExt { + // TODO(SpadeA): try to find a better way to reduce coupling degree of + // region cache engine and kv engine + fn on_region_event(&self, event: RegionEvent); + + fn region_cached(&self, region: &Region) -> bool; + + fn load_region(&self, region: &Region); +} + +/// A service that should run in the background to retrieve and apply cache +/// hints. +/// +/// TODO (afeinberg): Presently, this is only a marker trait with a single +/// implementation. Methods and/or associated types will be added to this trait +/// as it continues to evolve to handle eviction, using stats. +pub trait RangeHintService: Send + Sync {} + +#[derive(Clone, Eq, PartialEq, Hash)] +pub struct CacheRegion { + // Target region id. + pub id: u64, + // The version of the target region epoch. We only track the version, not + // conf_version, because conf_version does not change the applied data. + pub epoch_version: u64, + // Data start key of the region range; equals data_key(region.start_key). + pub start: Vec<u8>, + // Data end key of the region range; equals data_end_key(region.end_key). + pub end: Vec<u8>, +} + +impl Debug for CacheRegion { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("CacheRegion") + .field("id", &self.id) + .field("epoch", &self.epoch_version) + .field("range_start", &log_wrappers::Value(&self.start)) + .field("range_end", &log_wrappers::Value(&self.end)) + .finish() + } +} + +impl CacheRegion { + pub fn new<T1: Into<Vec<u8>>, T2: Into<Vec<u8>>>( + id: u64, + epoch_version: u64, + start: T1, + end: T2, + ) -> Self { + Self { + id, + epoch_version, + start: start.into(), + end: end.into(), + } + } + + pub fn from_region(region: &Region) -> Self { + Self { + start: enc_start_key(region), + end: enc_end_key(region), + id: region.id, + epoch_version: region.get_region_epoch().version, + } + } +} + +impl CacheRegion { + pub fn contains_range(&self, other: &CacheRegion) -> bool { + self.start <= other.start && self.end >= other.end + } + + pub fn contains_key(&self, key: &[u8]) -> bool { + self.start.as_slice() <= key && key < self.end.as_slice() + } + + // Note: overlaps also includes "contains". + pub fn overlaps(&self, other: &CacheRegion) -> bool { + self.start < other.end && other.start < self.end + } + + pub fn union(&self, other: &CacheRegion) -> Option<CacheRegion> { + if self.overlaps(other) { + Some(CacheRegion { + id: 0, + epoch_version: 0, + start: std::cmp::min(&self.start, &other.start).clone(), + end: std::cmp::max(&self.end, &other.end).clone(), + }) + } else { + None + } + } + + pub fn difference(&self, other: &CacheRegion) -> (Option<CacheRegion>, Option<CacheRegion>) { + if !self.overlaps(other) { + return (None, None); + } + let left = if self.start < other.start { + Some(CacheRegion { + id: 0, + epoch_version: 0, + start: self.start.clone(), + end: other.start.clone(), + }) + } else { + None + }; + let right = if self.end > other.end { + Some(CacheRegion { + id: 0, + epoch_version: 0, + start: other.end.clone(), + end:
self.end.clone(), + }) + } else { + None + }; + (left, right) + } +} + +#[cfg(test)] +mod tests { + use super::CacheRegion; + + #[test] + fn test_overlap() { + let r1 = CacheRegion::new(1, 0, b"k1".to_vec(), b"k6".to_vec()); + let r2 = CacheRegion::new(2, 0, b"k2".to_vec(), b"k4".to_vec()); + assert!(r1.overlaps(&r2)); + assert!(r2.overlaps(&r1)); + + let r1 = CacheRegion::new(1, 0, b"k1".to_vec(), b"k6".to_vec()); + let r2 = CacheRegion::new(2, 0, b"k2".to_vec(), b"k7".to_vec()); + assert!(r1.overlaps(&r2)); + assert!(r2.overlaps(&r1)); + + let r1 = CacheRegion::new(1, 0, b"k1".to_vec(), b"k6".to_vec()); + let r2 = CacheRegion::new(2, 0, b"k1".to_vec(), b"k4".to_vec()); + assert!(r1.overlaps(&r2)); + assert!(r2.overlaps(&r1)); + + let r1 = CacheRegion::new(1, 0, b"k1".to_vec(), b"k6".to_vec()); + let r2 = CacheRegion::new(2, 0, b"k2".to_vec(), b"k6".to_vec()); + assert!(r1.overlaps(&r2)); + assert!(r2.overlaps(&r1)); + + let r1 = CacheRegion::new(1, 0, b"k1".to_vec(), b"k6".to_vec()); + let r2 = CacheRegion::new(2, 0, b"k1".to_vec(), b"k6".to_vec()); + assert!(r1.overlaps(&r2)); + assert!(r2.overlaps(&r1)); + + let r1 = CacheRegion::new(1, 0, b"k1".to_vec(), b"k2".to_vec()); + let r2 = CacheRegion::new(2, 0, b"k2".to_vec(), b"k3".to_vec()); + assert!(!r1.overlaps(&r2)); + assert!(!r2.overlaps(&r1)); + + let r1 = CacheRegion::new(1, 0, b"k1".to_vec(), b"k2".to_vec()); + let r2 = CacheRegion::new(2, 0, b"k3".to_vec(), b"k4".to_vec()); + assert!(!r1.overlaps(&r2)); + assert!(!r2.overlaps(&r1)); + } +} diff --git a/components/engine_traits/src/snapshot.rs b/components/engine_traits/src/snapshot.rs index 6ab2bb78af1..e8f61fc9d68 100644 --- a/components/engine_traits/src/snapshot.rs +++ b/components/engine_traits/src/snapshot.rs @@ -13,4 +13,9 @@ where Self: 'static + Peekable + Iterable + CfNamesExt + SnapshotMiscExt + Send + Sync + Sized + Debug, { + /// Whether the snapshot acquired hit the in memory engine. It always + /// returns false if the in memory engine is disabled. + fn in_memory_engine_hit(&self) -> bool { + false + } } diff --git a/components/engine_traits/src/sst.rs b/components/engine_traits/src/sst.rs index 036c8999e3f..991c925d1c4 100644 --- a/components/engine_traits/src/sst.rs +++ b/components/engine_traits/src/sst.rs @@ -1,6 +1,6 @@ // Copyright 2019 TiKV Project Authors. Licensed under Apache-2.0. -use std::{path::PathBuf, sync::Arc}; +use std::{path::PathBuf, str::FromStr, sync::Arc}; use encryption::DataKeyManager; use kvproto::import_sstpb::SstMeta; @@ -30,7 +30,7 @@ pub trait SstReader: RefIterable + Sized + Send { /// SstWriter is used to create sst files that can be added to database later. pub trait SstWriter: Send { type ExternalSstFileInfo: ExternalSstFileInfo; - type ExternalSstFileReader: std::io::Read + Send; + type ExternalSstFileReader: ExternalSstFileReader; /// Add key, value to currently opened file /// REQUIRES: key is after any previously added key according to comparator. 
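// A sketch of the re-read pattern that the new `ExternalSstFileReader` bound
// on the associated type above enables (the trait itself is declared in the
// following hunk): read the SST bytes once, `reset()` back to the start, and
// read them again, e.g. to retry an upload without rebuilding the file. The
// io::Error mapping is only for the sketch; callers in TiKV would use the
// engine's own error type.
fn read_twice<R: engine_traits::ExternalSstFileReader>(
    mut reader: R,
) -> std::io::Result<(Vec<u8>, Vec<u8>)> {
    use std::io::Read;

    let mut first = Vec::new();
    reader.read_to_end(&mut first)?;
    // Rewind to the beginning of the stream; for RocksDB this reopens the
    // sequential file, as implemented earlier in this patch.
    reader
        .reset()
        .map_err(|e| std::io::Error::new(std::io::ErrorKind::Other, format!("{}", e)))?;
    let mut second = Vec::new();
    reader.read_to_end(&mut second)?;
    Ok((first, second))
}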
@@ -50,6 +50,10 @@ pub trait SstWriter: Send { fn finish_read(self) -> Result<(Self::ExternalSstFileInfo, Self::ExternalSstFileReader)>; } +pub trait ExternalSstFileReader: std::io::Read + Send { + fn reset(&mut self) -> Result<()>; +} + // compression type used for write sst file #[derive(Copy, Clone)] pub enum SstCompressionType { @@ -58,6 +62,19 @@ pub enum SstCompressionType { Zstd, } +impl FromStr for SstCompressionType { + type Err = String; + + fn from_str(s: &str) -> std::result::Result<Self, Self::Err> { + match s.to_ascii_lowercase().as_str() { + "lz4" => Ok(Self::Lz4), + "snappy" => Ok(Self::Snappy), + "zstd" => Ok(Self::Zstd), + otherwise => Err(format!("{} isn't a valid compression method", otherwise)), + } + } +} + /// A builder builds a SstWriter. pub trait SstWriterBuilder where diff --git a/components/engine_traits/src/write_batch.rs b/components/engine_traits/src/write_batch.rs index e04cb498fb6..222f7045b6c 100644 --- a/components/engine_traits/src/write_batch.rs +++ b/components/engine_traits/src/write_batch.rs @@ -1,6 +1,6 @@ // Copyright 2019 TiKV Project Authors. Licensed under Apache-2.0. -use crate::{errors::Result, options::WriteOptions, CacheRange}; +use crate::{errors::Result, options::WriteOptions}; /// Engines that can create write batches pub trait WriteBatchExt: Sized { @@ -39,10 +39,16 @@ pub trait Mutable: Send { /// Delete a range of key/values in a given column family fn delete_range_cf(&mut self, cf: &str, begin_key: &[u8], end_key: &[u8]) -> Result<()>; - fn put_msg<M: protobuf::Message>(&mut self, key: &[u8], m: &M) -> Result<()> { + fn put_msg<M: protobuf::Message>(&mut self, key: &[u8], m: &M) -> Result<()> + where + Self: Sized, + { self.put(key, &m.write_to_bytes()?) } - fn put_msg_cf<M: protobuf::Message>(&mut self, cf: &str, key: &[u8], m: &M) -> Result<()> { + fn put_msg_cf<M: protobuf::Message>(&mut self, cf: &str, key: &[u8], m: &M) -> Result<()> + where + Self: Sized, + { self.put_cf(cf, key, &m.write_to_bytes()?) } } @@ -73,8 +79,13 @@ pub trait WriteBatch: Mutable { /// Commit the WriteBatch to disk with the given options fn write_opt(&mut self, opts: &WriteOptions) -> Result<u64>; - // TODO: it should be `FnOnce`. - fn write_callback_opt(&mut self, opts: &WriteOptions, mut cb: impl FnMut(u64)) -> Result<u64> { + /// Commit the WriteBatch to disk with the given options and call the + /// callback. The callback may be called multiple times with the sequence + /// number of the write. + fn write_callback_opt(&mut self, opts: &WriteOptions, mut cb: impl FnMut(u64)) -> Result<u64> + where + Self: Sized, + { let seq = self.write_opt(opts)?; cb(seq); Ok(seq) @@ -122,9 +133,7 @@ pub trait WriteBatch: Mutable { fn rollback_to_save_point(&mut self) -> Result<()>; /// Merge another WriteBatch to itself - fn merge(&mut self, src: Self) -> Result<()>; - - /// It declares that the following consecutive write will be within this - /// range.
- fn prepare_for_range(&mut self, _: CacheRange) {} + fn merge(&mut self, src: Self) -> Result<()> + where + Self: Sized; } diff --git a/components/engine_traits_tests/Cargo.toml b/components/engine_traits_tests/Cargo.toml index a68043db896..07765ca4430 100644 --- a/components/engine_traits_tests/Cargo.toml +++ b/components/engine_traits_tests/Cargo.toml @@ -26,12 +26,9 @@ test-engines-panic = [ ] [dependencies] -encryption = { workspace = true } encryption_export = { workspace = true } engine_test = { workspace = true } engine_traits = { workspace = true } -kvproto = { workspace = true } panic_hook = { workspace = true } tempfile = "3.0" test_util = { workspace = true } -tikv_alloc = { workspace = true } diff --git a/components/engine_traits_tests/src/iterator.rs b/components/engine_traits_tests/src/iterator.rs index fee6cda6f02..714ca4cb0b4 100644 --- a/components/engine_traits_tests/src/iterator.rs +++ b/components/engine_traits_tests/src/iterator.rs @@ -41,9 +41,7 @@ fn iter_empty_engine() { #[test] fn iter_empty_snapshot() { let db = default_engine(); - iter_empty(&db.engine, |e| { - e.snapshot(None).iterator(CF_DEFAULT).unwrap() - }); + iter_empty(&db.engine, |e| e.snapshot().iterator(CF_DEFAULT).unwrap()); } fn iter_forward(e: &E, i: IF) @@ -101,9 +99,7 @@ fn iter_forward_engine() { #[test] fn iter_forward_snapshot() { let db = default_engine(); - iter_forward(&db.engine, |e| { - e.snapshot(None).iterator(CF_DEFAULT).unwrap() - }); + iter_forward(&db.engine, |e| e.snapshot().iterator(CF_DEFAULT).unwrap()); } fn iter_reverse(e: &E, i: IF) @@ -161,9 +157,7 @@ fn iter_reverse_engine() { #[test] fn iter_reverse_snapshot() { let db = default_engine(); - iter_reverse(&db.engine, |e| { - e.snapshot(None).iterator(CF_DEFAULT).unwrap() - }); + iter_reverse(&db.engine, |e| e.snapshot().iterator(CF_DEFAULT).unwrap()); } fn seek_to_key_then_forward(e: &E, i: IF) @@ -204,9 +198,7 @@ fn seek_to_key_then_forward_engine() { #[test] fn seek_to_key_then_forward_snapshot() { let db = default_engine(); - seek_to_key_then_forward(&db.engine, |e| { - e.snapshot(None).iterator(CF_DEFAULT).unwrap() - }); + seek_to_key_then_forward(&db.engine, |e| e.snapshot().iterator(CF_DEFAULT).unwrap()); } fn seek_to_key_then_reverse(e: &E, i: IF) @@ -247,9 +239,7 @@ fn seek_to_key_then_reverse_engine() { #[test] fn seek_to_key_then_reverse_snapshot() { let db = default_engine(); - seek_to_key_then_reverse(&db.engine, |e| { - e.snapshot(None).iterator(CF_DEFAULT).unwrap() - }); + seek_to_key_then_reverse(&db.engine, |e| e.snapshot().iterator(CF_DEFAULT).unwrap()); } fn iter_forward_then_reverse(e: &E, i: IF) @@ -310,9 +300,7 @@ fn iter_forward_then_reverse_engine() { #[test] fn iter_forward_then_reverse_snapshot() { let db = default_engine(); - iter_forward_then_reverse(&db.engine, |e| { - e.snapshot(None).iterator(CF_DEFAULT).unwrap() - }); + iter_forward_then_reverse(&db.engine, |e| e.snapshot().iterator(CF_DEFAULT).unwrap()); } fn iter_reverse_then_forward(e: &E, i: IF) @@ -373,9 +361,7 @@ fn iter_reverse_then_forward_engine() { #[test] fn iter_reverse_then_forward_snapshot() { let db = default_engine(); - iter_reverse_then_forward(&db.engine, |e| { - e.snapshot(None).iterator(CF_DEFAULT).unwrap() - }); + iter_reverse_then_forward(&db.engine, |e| e.snapshot().iterator(CF_DEFAULT).unwrap()); } // When seek finds an exact key then seek_for_prev behaves just like seek @@ -419,9 +405,7 @@ fn seek_for_prev_engine() { #[test] fn seek_for_prev_snapshot() { let db = default_engine(); - seek_for_prev(&db.engine, |e| { - 
e.snapshot(None).iterator(CF_DEFAULT).unwrap() - }); + seek_for_prev(&db.engine, |e| e.snapshot().iterator(CF_DEFAULT).unwrap()); } // When Seek::Key doesn't find an exact match, @@ -456,9 +440,7 @@ fn seek_key_miss_engine() { #[test] fn seek_key_miss_snapshot() { let db = default_engine(); - seek_key_miss(&db.engine, |e| { - e.snapshot(None).iterator(CF_DEFAULT).unwrap() - }); + seek_key_miss(&db.engine, |e| e.snapshot().iterator(CF_DEFAULT).unwrap()); } fn seek_key_prev_miss(e: &E, i: IF) @@ -490,7 +472,5 @@ fn seek_key_prev_miss_engine() { #[test] fn seek_key_prev_miss_snapshot() { let db = default_engine(); - seek_key_prev_miss(&db.engine, |e| { - e.snapshot(None).iterator(CF_DEFAULT).unwrap() - }); + seek_key_prev_miss(&db.engine, |e| e.snapshot().iterator(CF_DEFAULT).unwrap()); } diff --git a/components/engine_traits_tests/src/read_consistency.rs b/components/engine_traits_tests/src/read_consistency.rs index 35d0262fbcb..8c7ab50657f 100644 --- a/components/engine_traits_tests/src/read_consistency.rs +++ b/components/engine_traits_tests/src/read_consistency.rs @@ -12,7 +12,7 @@ fn snapshot_with_writes() { db.engine.put(b"a", b"aa").unwrap(); - let snapshot = db.engine.snapshot(None); + let snapshot = db.engine.snapshot(); assert_eq!(snapshot.get_value(b"a").unwrap().unwrap(), b"aa"); @@ -77,7 +77,5 @@ fn iterator_with_writes_engine() { #[test] fn iterator_with_writes_snapshot() { let db = default_engine(); - iterator_with_writes(&db.engine, |e| { - e.snapshot(None).iterator(CF_DEFAULT).unwrap() - }); + iterator_with_writes(&db.engine, |e| e.snapshot().iterator(CF_DEFAULT).unwrap()); } diff --git a/components/engine_traits_tests/src/snapshot_basic.rs b/components/engine_traits_tests/src/snapshot_basic.rs index 83248abfb6e..c0f93480830 100644 --- a/components/engine_traits_tests/src/snapshot_basic.rs +++ b/components/engine_traits_tests/src/snapshot_basic.rs @@ -10,7 +10,7 @@ fn snapshot_get_value() { db.engine.put(b"a", b"aa").unwrap(); - let snap = db.engine.snapshot(None); + let snap = db.engine.snapshot(); let value = snap.get_value(b"a").unwrap(); let value = value.unwrap(); @@ -26,7 +26,7 @@ fn snapshot_get_value_after_put() { db.engine.put(b"a", b"aa").unwrap(); - let snap = db.engine.snapshot(None); + let snap = db.engine.snapshot(); db.engine.put(b"a", b"aaa").unwrap(); @@ -41,7 +41,7 @@ fn snapshot_get_value_cf() { db.engine.put_cf(CF_WRITE, b"a", b"aa").unwrap(); - let snap = db.engine.snapshot(None); + let snap = db.engine.snapshot(); let value = snap.get_value_cf(CF_WRITE, b"a").unwrap(); let value = value.unwrap(); @@ -57,7 +57,7 @@ fn snapshot_get_value_cf_after_put() { db.engine.put_cf(CF_WRITE, b"a", b"aa").unwrap(); - let snap = db.engine.snapshot(None); + let snap = db.engine.snapshot(); db.engine.put_cf(CF_WRITE, b"a", b"aaa").unwrap(); diff --git a/components/error_code/Cargo.toml b/components/error_code/Cargo.toml index 0be4d7fa58c..4939fae26fa 100644 --- a/components/error_code/Cargo.toml +++ b/components/error_code/Cargo.toml @@ -14,9 +14,7 @@ name = "error_code_gen" path = "bin.rs" [dependencies] -grpcio = { workspace = true } kvproto = { workspace = true } lazy_static = "1.3" raft = { workspace = true } -serde = { version = "1.0", features = ["derive"] } tikv_alloc = { workspace = true } diff --git a/components/external_storage/Cargo.toml b/components/external_storage/Cargo.toml index 52a06cdb9d2..39e483cb790 100644 --- a/components/external_storage/Cargo.toml +++ b/components/external_storage/Cargo.toml @@ -6,13 +6,13 @@ publish = false license = "Apache-2.0" 
[dependencies] -async-compression = { version = "0.3.14", features = ["futures-io", "zstd"] } +async-compression = { version = "0.4.12", features = ["futures-io", "zstd"] } async-trait = "0.1" aws = { workspace = true } azure = { workspace = true } +chrono = { workspace = true } cloud = { workspace = true } encryption = { workspace = true } -engine_traits = { workspace = true } file_system = { workspace = true } futures = "0.3" futures-io = "0.3" @@ -23,6 +23,8 @@ lazy_static = "1.3" openssl = { workspace = true } prometheus = { version = "0.13", default-features = false, features = ["nightly", "push"] } rand = "0.8" +serde = { version = "1.0", features = ["derive"] } +serde_json = "1.0" slog = { workspace = true } # better to not use slog-global, but pass in the logger slog-global = { workspace = true } @@ -31,9 +33,10 @@ tikv_util = { workspace = true } tokio = { version = "1.5", features = ["time", "fs", "process"] } tokio-util = { version = "0.7", features = ["compat"] } url = "2.0" +uuid = { version = "0.8", features = ["v4", "serde"] } +walkdir = "2" [dev-dependencies] -matches = "0.1.8" rust-ini = "0.14.0" structopt = "0.3" tempfile = "3.1" diff --git a/components/external_storage/src/export.rs b/components/external_storage/src/export.rs index 7d34f8aed08..127bace7f52 100644 --- a/components/external_storage/src/export.rs +++ b/components/external_storage/src/export.rs @@ -5,8 +5,10 @@ use std::{io, path::Path, sync::Arc}; use async_trait::async_trait; pub use aws::{Config as S3Config, S3Storage}; pub use azure::{AzureStorage, Config as AzureConfig}; -use cloud::blob::{BlobStorage, PutResource}; +pub use cloud::blob::BlobObject; +use cloud::blob::{BlobStorage, DeletableStorage, IterableStorage, PutResource}; use encryption::DataKeyManager; +use futures_util::{future::LocalBoxFuture, stream::LocalBoxStream}; use gcp::GcsStorage; use kvproto::brpb::{ AzureBlobStorage, Gcs, Noop, StorageBackend, StorageBackend_oneof_backend as Backend, S3, @@ -15,10 +17,9 @@ use tikv_util::time::{Instant, Limiter}; use crate::{ compression_reader_dispatcher, encrypt_wrap_reader, read_external_storage_into_file, - record_storage_create, BackendConfig, ExternalData, ExternalStorage, HdfsStorage, LocalStorage, - NoopStorage, RestoreConfig, UnpinReader, + record_storage_create, wrap_with_checksum_reader_if_needed, BackendConfig, ExternalData, + ExternalStorage, HdfsStorage, LocalStorage, NoopStorage, RestoreConfig, UnpinReader, }; - pub fn create_storage( storage_backend: &StorageBackend, config: BackendConfig, @@ -45,8 +46,10 @@ fn bad_backend(backend: Backend) -> io::Error { bad_storage_backend(&storage_backend) } -fn blob_store(store: Blob) -> Box { - Box::new(BlobStore::new(store)) as Box +fn blob_store( + store: Blob, +) -> Box { + Box::new(Compat::new(store)) as Box } fn create_backend( @@ -121,35 +124,44 @@ pub fn make_azblob_backend(config: AzureBlobStorage) -> StorageBackend { backend } -pub struct BlobStore(Blob); +pub struct Compat(Blob); -impl BlobStore { +impl Compat { pub fn new(inner: Blob) -> Self { - BlobStore(inner) + Compat(inner) + } + + pub fn into_inner(self) -> Blob { + self.0 } } -impl std::ops::Deref for BlobStore { +impl std::ops::Deref for Compat { type Target = Blob; fn deref(&self) -> &Self::Target { &self.0 } } -pub struct EncryptedExternalStorage { +pub struct AutoEncryptLocalRestoredFileExternalStorage { pub key_manager: Arc, pub storage: S, } #[async_trait] -impl ExternalStorage for EncryptedExternalStorage { +impl ExternalStorage for 
AutoEncryptLocalRestoredFileExternalStorage { fn name(&self) -> &'static str { self.storage.name() } fn url(&self) -> io::Result { self.storage.url() } - async fn write(&self, name: &str, reader: UnpinReader, content_length: u64) -> io::Result<()> { + async fn write( + &self, + name: &str, + reader: UnpinReader<'_>, + content_length: u64, + ) -> io::Result<()> { self.storage.write(name, reader, content_length).await } fn read(&self, name: &str) -> ExternalData<'_> { @@ -169,44 +181,70 @@ impl ExternalStorage for EncryptedExternalStorage { let RestoreConfig { range, compression_type, - expected_sha256, + expected_plaintext_file_checksum: expected_sha256, file_crypter, + opt_encrypted_file_checksum, } = restore_config; - let reader = { + let (mut reader, opt_hasher) = { let inner = if let Some((off, len)) = range { self.read_part(storage_name, off, len) } else { self.read(storage_name) }; - compression_reader_dispatcher(compression_type, inner)? + // wrap with checksum reader if needed + // + let (checksum_reader, opt_hasher) = + wrap_with_checksum_reader_if_needed(opt_encrypted_file_checksum.is_some(), inner)?; + + // wrap with decrypter if needed + // + let encrypted_reader = encrypt_wrap_reader(file_crypter, checksum_reader)?; + + ( + compression_reader_dispatcher(compression_type, encrypted_reader)?, + opt_hasher, + ) }; let file_writer = self.key_manager.create_file_for_write(&restore_name)?; let min_read_speed: usize = 8192; - let mut input = encrypt_wrap_reader(file_crypter, reader)?; - read_external_storage_into_file( - &mut input, + &mut reader, file_writer, speed_limiter, expected_length, expected_sha256, min_read_speed, + opt_encrypted_file_checksum, + opt_hasher, ) .await } + + fn iter_prefix(&self, prefix: &str) -> LocalBoxStream<'_, io::Result> { + self.storage.iter_prefix(prefix) + } + + fn delete(&self, name: &str) -> LocalBoxFuture<'_, io::Result<()>> { + self.storage.delete(name) + } } #[async_trait] -impl ExternalStorage for BlobStore { +impl ExternalStorage for Compat { fn name(&self) -> &'static str { (**self).config().name() } fn url(&self) -> io::Result { (**self).config().url() } - async fn write(&self, name: &str, reader: UnpinReader, content_length: u64) -> io::Result<()> { + async fn write( + &self, + name: &str, + reader: UnpinReader<'_>, + content_length: u64, + ) -> io::Result<()> { (**self) .put(name, PutResource(reader.0), content_length) .await @@ -219,6 +257,19 @@ impl ExternalStorage for BlobStore { fn read_part(&self, name: &str, off: u64, len: u64) -> ExternalData<'_> { (**self).get_part(name, off, len) } + + /// Walk the prefix of the blob storage. + /// It returns the stream of items. 
+ fn iter_prefix( + &self, + prefix: &str, + ) -> LocalBoxStream<'_, std::result::Result> { + (**self).iter_prefix(prefix) + } + + fn delete(&self, name: &str) -> LocalBoxFuture<'_, io::Result<()>> { + (**self).delete(name) + } } #[cfg(test)] diff --git a/components/external_storage/src/hdfs.rs b/components/external_storage/src/hdfs.rs index 17556490320..05934463091 100644 --- a/components/external_storage/src/hdfs.rs +++ b/components/external_storage/src/hdfs.rs @@ -3,6 +3,11 @@ use std::{io, path, process::Stdio}; use async_trait::async_trait; +use cloud::blob::BlobObject; +use futures_util::{ + future::{FutureExt, LocalBoxFuture}, + stream::LocalBoxStream, +}; use tokio::{io as async_io, process::Command}; use tokio_util::compat::FuturesAsyncReadCompatExt; use url::Url; @@ -76,7 +81,12 @@ impl ExternalStorage for HdfsStorage { Ok(self.remote.clone()) } - async fn write(&self, name: &str, reader: UnpinReader, _content_length: u64) -> io::Result<()> { + async fn write( + &self, + name: &str, + reader: UnpinReader<'_>, + _content_length: u64, + ) -> io::Result<()> { if name.contains(path::MAIN_SEPARATOR) { return Err(io::Error::new( io::ErrorKind::Other, @@ -138,6 +148,19 @@ impl ExternalStorage for HdfsStorage { fn read_part(&self, _name: &str, _off: u64, _len: u64) -> ExternalData<'_> { unimplemented!("currently only HDFS export is implemented") } + + /// Walk the prefix of the blob storage. + /// It returns the stream of items. + fn iter_prefix( + &self, + _prefix: &str, + ) -> LocalBoxStream<'_, std::result::Result> { + Box::pin(futures::future::err(crate::unimplemented()).into_stream()) + } + + fn delete(&self, _name: &str) -> LocalBoxFuture<'_, io::Result<()>> { + Box::pin(futures::future::err(crate::unimplemented())) + } } #[cfg(test)] diff --git a/components/external_storage/src/lib.rs b/components/external_storage/src/lib.rs index 05dbf6f965d..c9b3aafe586 100644 --- a/components/external_storage/src/lib.rs +++ b/components/external_storage/src/lib.rs @@ -9,19 +9,21 @@ extern crate slog_global; extern crate tikv_alloc; use std::{ + any::Any, io::{self, Write}, marker::Unpin, - sync::Arc, + panic::Location, + sync::{Arc, Mutex}, time::Duration, }; use async_compression::futures::bufread::ZstdDecoder; use async_trait::async_trait; use encryption::{DecrypterReader, FileEncryptionInfo, Iv}; -use file_system::File; +use file_system::{File, Sha256Reader}; use futures::io::BufReader; use futures_io::AsyncRead; -use futures_util::AsyncReadExt; +use futures_util::{future::LocalBoxFuture, stream::LocalBoxStream, AsyncReadExt}; use kvproto::brpb::CompressionType; use openssl::hash::{Hasher, MessageDigest}; use tikv_util::{ @@ -30,11 +32,15 @@ use tikv_util::{ time::{Instant, Limiter}, }; use tokio::time::timeout; +use tokio_util::compat::{FuturesAsyncReadCompatExt, TokioAsyncReadCompatExt}; +use url::Url; mod hdfs; +pub use cloud::blob::{BlobObject, IterableStorage}; pub use hdfs::{HdfsConfig, HdfsStorage}; pub mod local; pub use local::LocalStorage; +pub mod locking; mod noop; pub use noop::NoopStorage; mod metrics; @@ -52,7 +58,13 @@ pub fn record_storage_create(start: Instant, storage: &dyn ExternalStorage) { /// This wrapper would remove the lifetime at the argument of the generated /// async function in order to make rustc happy. (And reduce the length of /// signature of write.) 
see https://github.com/rust-lang/rust/issues/63033 -pub struct UnpinReader(pub Box<dyn AsyncRead + Unpin + Send>); +pub struct UnpinReader<'a>(pub Box<dyn AsyncRead + Unpin + Send + 'a>); + +impl<'a, R: AsyncRead + Unpin + Send + 'a> From<R> for UnpinReader<'a> { + fn from(r: R) -> Self { + UnpinReader(Box::new(r)) + } +} pub type ExternalData<'a> = Box<dyn AsyncRead + Unpin + 'a>; @@ -66,8 +78,9 @@ pub struct BackendConfig { pub struct RestoreConfig { pub range: Option<(u64, u64)>, pub compression_type: Option<CompressionType>, - pub expected_sha256: Option<Vec<u8>>, + pub expected_plaintext_file_checksum: Option<Vec<u8>>, pub file_crypter: Option<FileEncryptionInfo>, + pub opt_encrypted_file_checksum: Option<Vec<u8>>, } /// a reader dispatcher for different compression type. @@ -96,13 +109,18 @@ pub fn compression_reader_dispatcher( /// An abstraction of an external storage. // TODO: these should all be returning a future (i.e. async fn). #[async_trait] -pub trait ExternalStorage: 'static + Send + Sync { +pub trait ExternalStorage: 'static + Send + Sync + Any { fn name(&self) -> &'static str; - fn url(&self) -> io::Result<url::Url>; + fn url(&self) -> io::Result<Url>; /// Write all contents of the read to the given path. - async fn write(&self, name: &str, reader: UnpinReader, content_length: u64) -> io::Result<()>; + async fn write( + &self, + name: &str, + reader: UnpinReader<'_>, + content_length: u64, + ) -> io::Result<()>; /// Read all contents of the given path. fn read(&self, name: &str) -> ExternalData<'_>; @@ -122,18 +140,31 @@ pub trait ExternalStorage: 'static + Send + Sync { let RestoreConfig { range, compression_type, - expected_sha256, + expected_plaintext_file_checksum: expected_sha256, file_crypter, + opt_encrypted_file_checksum, } = restore_config; - let reader = { + let (reader, opt_hasher) = { let inner = if let Some((off, len)) = range { self.read_part(storage_name, off, len) } else { self.read(storage_name) }; - compression_reader_dispatcher(compression_type, inner)? + // wrap with checksum reader if needed + // + let (checksum_reader, opt_hasher) = + wrap_with_checksum_reader_if_needed(opt_encrypted_file_checksum.is_some(), inner)?; + + // wrap with decrypter if needed + // + let encrypted_reader = encrypt_wrap_reader(file_crypter, checksum_reader)?; + + ( + compression_reader_dispatcher(compression_type, encrypted_reader)?, + opt_hasher, + ) }; let output = File::create(restore_name)?; // the minimum speed of reading data, in bytes/second. @@ -141,18 +172,38 @@ pub trait ExternalStorage: 'static + Send + Sync { // a "TimedOut" error. // (at 8 KB/s for a 2 MB buffer, this means we timeout after 4m16s.) let min_read_speed: usize = 8192; - let input = encrypt_wrap_reader(file_crypter, reader)?; - read_external_storage_into_file( - input, + reader, output, speed_limiter, expected_length, expected_sha256, min_read_speed, + opt_encrypted_file_checksum, + opt_hasher, ) .await } + + /// Walk the prefix of the blob storage. + /// It returns the stream of items.
 
 /// a reader dispatcher for different compression type.
@@ -96,13 +109,18 @@ pub fn compression_reader_dispatcher(
 /// An abstraction of an external storage.
 // TODO: these should all be returning a future (i.e. async fn).
 #[async_trait]
-pub trait ExternalStorage: 'static + Send + Sync {
+pub trait ExternalStorage: 'static + Send + Sync + Any {
     fn name(&self) -> &'static str;
 
-    fn url(&self) -> io::Result<url::Url>;
+    fn url(&self) -> io::Result<Url>;
 
     /// Write all contents of the read to the given path.
-    async fn write(&self, name: &str, reader: UnpinReader, content_length: u64) -> io::Result<()>;
+    async fn write(
+        &self,
+        name: &str,
+        reader: UnpinReader<'_>,
+        content_length: u64,
+    ) -> io::Result<()>;
 
     /// Read all contents of the given path.
     fn read(&self, name: &str) -> ExternalData<'_>;
@@ -122,18 +140,31 @@ pub trait ExternalStorage: 'static + Send + Sync {
         let RestoreConfig {
             range,
             compression_type,
-            expected_sha256,
+            expected_plaintext_file_checksum: expected_sha256,
             file_crypter,
+            opt_encrypted_file_checksum,
         } = restore_config;
 
-        let reader = {
+        let (reader, opt_hasher) = {
             let inner = if let Some((off, len)) = range {
                 self.read_part(storage_name, off, len)
             } else {
                 self.read(storage_name)
             };
 
-            compression_reader_dispatcher(compression_type, inner)?
+            // wrap with checksum reader if needed
+            //
+            let (checksum_reader, opt_hasher) =
+                wrap_with_checksum_reader_if_needed(opt_encrypted_file_checksum.is_some(), inner)?;
+
+            // wrap with decrypter if needed
+            //
+            let encrypted_reader = encrypt_wrap_reader(file_crypter, checksum_reader)?;
+
+            (
+                compression_reader_dispatcher(compression_type, encrypted_reader)?,
+                opt_hasher,
+            )
         };
         let output = File::create(restore_name)?;
         // the minimum speed of reading data, in bytes/second.
@@ -141,18 +172,38 @@ pub trait ExternalStorage: 'static + Send + Sync {
         // a "TimedOut" error.
         // (at 8 KB/s for a 2 MB buffer, this means we timeout after 4m16s.)
         let min_read_speed: usize = 8192;
-        let input = encrypt_wrap_reader(file_crypter, reader)?;
-
         read_external_storage_into_file(
-            input,
+            reader,
             output,
             speed_limiter,
             expected_length,
             expected_sha256,
             min_read_speed,
+            opt_encrypted_file_checksum,
+            opt_hasher,
         )
         .await
     }
+
+    /// Walk the prefix of the blob storage.
+    /// It returns the stream of items.
+    fn iter_prefix(
+        &self,
+        prefix: &str,
+    ) -> LocalBoxStream<'_, std::result::Result<BlobObject, io::Error>>;
+
+    fn delete(&self, name: &str) -> LocalBoxFuture<'_, io::Result<()>>;
+}
+
+#[track_caller]
+pub fn unimplemented() -> io::Error {
+    io::Error::new(
+        io::ErrorKind::Unsupported,
+        format!(
+            "this method isn't supported, check more details at {:?}",
+            Location::caller()
+        ),
+    )
 }
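A short usage sketch for the two new trait methods (not part of the patch): list everything under a prefix, then delete it. `purge_prefix` is an illustrative helper over any `&dyn ExternalStorage`:

```rust
use futures_util::stream::TryStreamExt;

/// Delete every object whose key starts with `prefix`.
async fn purge_prefix(
    storage: &dyn external_storage::ExternalStorage,
    prefix: &str,
) -> std::io::Result<()> {
    // Collect first: some backends may dislike deleting while a listing
    // stream is still open.
    let objects: Vec<_> = storage.iter_prefix(prefix).try_collect().await?;
    for obj in objects {
        storage.delete(&obj.key).await?;
    }
    Ok(())
}
```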
 
 #[async_trait]
@@ -161,11 +212,16 @@ impl ExternalStorage for Arc<dyn ExternalStorage> {
         (**self).name()
     }
 
-    fn url(&self) -> io::Result<url::Url> {
+    fn url(&self) -> io::Result<Url> {
         (**self).url()
     }
 
-    async fn write(&self, name: &str, reader: UnpinReader, content_length: u64) -> io::Result<()> {
+    async fn write(
+        &self,
+        name: &str,
+        reader: UnpinReader<'_>,
+        content_length: u64,
+    ) -> io::Result<()> {
         (**self).write(name, reader, content_length).await
     }
 
@@ -195,6 +251,17 @@
         )
         .await
     }
+
+    fn iter_prefix(
+        &self,
+        prefix: &str,
+    ) -> LocalBoxStream<'_, std::result::Result<BlobObject, io::Error>> {
+        (**self).iter_prefix(prefix)
+    }
+
+    fn delete(&self, name: &str) -> LocalBoxFuture<'_, io::Result<()>> {
+        (**self).delete(name)
+    }
 }
 
 #[async_trait]
@@ -203,11 +270,16 @@ impl ExternalStorage for Box<dyn ExternalStorage> {
         self.as_ref().name()
     }
 
-    fn url(&self) -> io::Result<url::Url> {
+    fn url(&self) -> io::Result<Url> {
         self.as_ref().url()
     }
 
-    async fn write(&self, name: &str, reader: UnpinReader, content_length: u64) -> io::Result<()> {
+    async fn write(
+        &self,
+        name: &str,
+        reader: UnpinReader<'_>,
+        content_length: u64,
+    ) -> io::Result<()> {
         self.as_ref().write(name, reader, content_length).await
     }
 
@@ -237,6 +309,17 @@
         )
         .await
     }
+
+    fn iter_prefix(
+        &self,
+        prefix: &str,
+    ) -> LocalBoxStream<'_, std::result::Result<BlobObject, io::Error>> {
+        self.as_ref().iter_prefix(prefix)
+    }
+
+    fn delete(&self, name: &str) -> LocalBoxFuture<'_, io::Result<()>> {
+        self.as_ref().delete(name)
+    }
 }
 
 /// Wrap the reader with file_crypter.
@@ -263,8 +346,10 @@ pub async fn read_external_storage_into_file(
     mut output: Out,
     speed_limiter: &Limiter,
     expected_length: u64,
-    expected_sha256: Option<Vec<u8>>,
+    expected_plaintext_file_checksum: Option<Vec<u8>>,
     min_read_speed: usize,
+    opt_expected_encrypted_file_checksum: Option<Vec<u8>>,
+    opt_encrypted_file_hasher: Option<Arc<Mutex<Hasher>>>,
 ) -> io::Result<()>
 where
     In: AsyncRead + Unpin,
@@ -275,12 +360,8 @@ where
     // do the I/O copy from external_storage to the local file.
     let mut buffer = vec![0u8; READ_BUF_SIZE];
     let mut file_length = 0;
-    let mut hasher = Hasher::new(MessageDigest::sha256()).map_err(|err| {
-        io::Error::new(
-            io::ErrorKind::Other,
-            format!("openssl hasher failed to init: {}", err),
-        )
-    })?;
+    let mut hasher = build_hasher()?;
+
     let mut yield_checker =
         RescheduleChecker::new(tokio::task::yield_now, Duration::from_millis(10));
     loop {
@@ -294,13 +375,8 @@ where
         }
         speed_limiter.consume(bytes_read).await;
         output.write_all(&buffer[..bytes_read])?;
-        if expected_sha256.is_some() {
-            hasher.update(&buffer[..bytes_read]).map_err(|err| {
-                io::Error::new(
-                    io::ErrorKind::Other,
-                    format!("openssl hasher update failed: {}", err),
-                )
-            })?;
+        if expected_plaintext_file_checksum.is_some() {
+            update_hasher(&mut hasher, &buffer[..bytes_read])?;
         }
         file_length += bytes_read as u64;
         yield_checker.check().await;
@@ -316,21 +392,18 @@ where
         ));
     }
 
-    if let Some(expected_s) = expected_sha256 {
-        let cal_sha256 = hasher.finish().map_or_else(
-            |err| {
-                Err(io::Error::new(
-                    io::ErrorKind::Other,
-                    format!("openssl hasher finish failed: {}", err),
-                ))
-            },
-            |bytes| Ok(bytes.to_vec()),
-        )?;
+    calc_and_compare_checksums(
+        opt_expected_encrypted_file_checksum,
+        opt_encrypted_file_hasher,
+    )?;
+
+    if let Some(expected_s) = expected_plaintext_file_checksum {
+        let cal_sha256 = finish_hasher(hasher)?;
         if !expected_s.eq(&cal_sha256) {
             return Err(io::Error::new(
                 io::ErrorKind::InvalidData,
                 format!(
-                    "sha256 not match, expect: {:?}, calculate: {:?}",
+                    "plaintext file checksums do not match, expect: {:?}, calculate: {:?}",
                     expected_s, cal_sha256,
                 ),
             ));
         }
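The "4m16s" figure in the comment above checks out: assuming `READ_BUF_SIZE` is the 2 MB the comment implies, one full buffer at the 8192 B/s floor takes 2 * 1024 * 1024 / 8192 = 256 s. A compile-time sketch of that arithmetic (constants assumed, not taken from this file):

```rust
const READ_BUF_SIZE: usize = 2 * 1024 * 1024; // assumed 2 MiB, as the comment implies
const MIN_READ_SPEED: usize = 8192; // bytes/second

// One full buffer at the minimum speed: 256 s == 4 min 16 s.
const _: () = assert!(READ_BUF_SIZE / MIN_READ_SPEED == 256);
```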
@@ -346,8 +419,10 @@ pub async fn read_external_storage_info_buff(
     reader: &mut (dyn AsyncRead + Unpin + Send),
     speed_limiter: &Limiter,
     expected_length: u64,
-    expected_sha256: Option<Vec<u8>>,
+    opt_expected_checksum: Option<Vec<u8>>,
     min_read_speed: usize,
+    opt_expected_encrypted_file_checksum: Option<Vec<u8>>,
+    opt_encrypted_file_hasher: Option<Arc<Mutex<Hasher>>>,
 ) -> io::Result<Vec<u8>> {
     // the minimum speed of reading data, in bytes/second.
     // if reading speed is slower than this rate, we will stop with
@@ -387,36 +462,27 @@ pub async fn read_external_storage_info_buff(
             ),
         ));
     }
+
+    // check encrypted file checksum
+    //
+    calc_and_compare_checksums(
+        opt_expected_encrypted_file_checksum,
+        opt_encrypted_file_hasher,
+    )?;
+
     // check sha256 of file
-    if let Some(sha256) = expected_sha256 {
-        let mut hasher = Hasher::new(MessageDigest::sha256()).map_err(|err| {
-            io::Error::new(
-                io::ErrorKind::Other,
-                format!("openssl hasher failed to init: {}", err),
-            )
-        })?;
-        hasher.update(&output).map_err(|err| {
-            io::Error::new(
-                io::ErrorKind::Other,
-                format!("openssl hasher update failed: {}", err),
-            )
-        })?;
-
-        let cal_sha256 = hasher.finish().map_or_else(
-            |err| {
-                Err(io::Error::new(
-                    io::ErrorKind::Other,
-                    format!("openssl hasher finish failed: {}", err),
-                ))
-            },
-            |bytes| Ok(bytes.to_vec()),
-        )?;
-        if !sha256.eq(&cal_sha256) {
+    if let Some(expected_checksum) = opt_expected_checksum {
+        let mut hasher = build_hasher()?;
+
+        update_hasher(&mut hasher, &output)?;
+
+        let cal_sha256 = finish_hasher(hasher)?;
+        if !expected_checksum.eq(&cal_sha256) {
             return Err(io::Error::new(
                 io::ErrorKind::InvalidData,
                 format!(
                     "sha256 not match, expect: {:?}, calculate: {:?}",
-                    sha256, cal_sha256,
+                    expected_checksum, cal_sha256,
                 ),
             ));
         }
@@ -424,3 +490,75 @@ pub async fn read_external_storage_info_buff(
 
     Ok(output)
 }
+
+fn build_hasher() -> Result<Hasher, io::Error> {
+    Hasher::new(MessageDigest::sha256()).map_err(|err| {
+        io::Error::new(
+            io::ErrorKind::Other,
+            format!("openssl hasher failed to init: {}", err),
+        )
+    })
+}
+
+fn update_hasher(hasher: &mut Hasher, data: &[u8]) -> Result<(), io::Error> {
+    hasher.update(data).map_err(|err| {
+        io::Error::new(
+            io::ErrorKind::Other,
+            format!("openssl hasher update failed: {}", err),
+        )
+    })
+}
+
+fn finish_hasher(mut hasher: Hasher) -> Result<Vec<u8>, io::Error> {
+    hasher.finish().map_or_else(
+        |err| {
+            Err(io::Error::new(
+                io::ErrorKind::Other,
+                format!("openssl hasher finish failed: {}", err),
+            ))
+        },
+        |bytes| Ok(bytes.to_vec()),
+    )
+}
+
+fn calc_and_compare_checksums(
+    opt_expected_encrypted_file_checksum: Option<Vec<u8>>,
+    opt_encrypted_file_hasher: Option<Arc<Mutex<Hasher>>>,
+) -> Result<(), io::Error> {
+    if let Some(expected_encrypted_checksum) = opt_expected_encrypted_file_checksum {
+        if let Some(hasher) = opt_encrypted_file_hasher {
+            let calc_checksum = hasher.lock().unwrap().finish().map_or_else(
+                |err| {
+                    Err(io::Error::new(
+                        io::ErrorKind::Other,
+                        format!("openssl hasher finish failed: {}", err),
+                    ))
+                },
+                |bytes| Ok(bytes.to_vec()),
+            )?;
+
+            if !expected_encrypted_checksum.eq(&calc_checksum) {
+                return Err(io::Error::new(
+                    io::ErrorKind::InvalidData,
+                    format!(
+                        "encrypted file checksums do not match, expected: {:?}, calculated: {:?}",
+                        expected_encrypted_checksum, calc_checksum,
+                    ),
+                ));
+            }
+        }
+    }
+    Ok(())
+}
+
+pub fn wrap_with_checksum_reader_if_needed(
+    contains_checksum: bool,
+    encrypted_reader: ExternalData<'_>,
+) -> Result<(ExternalData<'_>, Option<Arc<Mutex<Hasher>>>), io::Error> {
+    if contains_checksum {
+        let (checksum_reader, hasher) = Sha256Reader::new(encrypted_reader.compat())?;
+        Ok((Box::new(checksum_reader.compat()), Some(hasher)))
+    } else {
+        Ok((encrypted_reader, None))
+    }
+}
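The compat dance in `wrap_with_checksum_reader_if_needed` (futures reader -> tokio reader -> `Sha256Reader` -> back to futures) is easiest to see end to end. An illustrative sketch assuming only this crate's public API; `checksum_of` is a hypothetical helper:

```rust
use std::io;

use futures_util::io::AsyncReadExt;

// Stream bytes through the checksum wrapper, then take the digest out of the
// shared hasher. `data` stands in for a real blob reader.
async fn checksum_of(data: &[u8]) -> io::Result<Vec<u8>> {
    let reader: external_storage::ExternalData<'_> = Box::new(data);
    let (mut wrapped, hasher) =
        external_storage::wrap_with_checksum_reader_if_needed(true, reader)?;

    let mut sink = Vec::new();
    wrapped.read_to_end(&mut sink).await?; // the hasher sees every byte we read

    let hasher = hasher.expect("requested a checksum, so a hasher is present");
    let mut guard = hasher.lock().unwrap();
    guard
        .finish()
        .map(|digest| digest.to_vec())
        .map_err(|e| io::Error::new(io::ErrorKind::Other, e))
}
```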
diff --git a/components/external_storage/src/local.rs b/components/external_storage/src/local.rs
index 0bf6be65107..4415f22d2be 100644
--- a/components/external_storage/src/local.rs
+++ b/components/external_storage/src/local.rs
@@ -3,17 +3,23 @@
 use std::{
     fs::File as StdFile,
     io::{self, BufReader, Read, Seek},
+    os::unix::ffi::OsStrExt,
     path::{Path, PathBuf},
     sync::Arc,
 };
 
 use async_trait::async_trait;
-use futures::io::AllowStdIo;
-use futures_util::stream::TryStreamExt;
+use cloud::blob::BlobObject;
+use futures::{io::AllowStdIo, prelude::Stream};
+use futures_util::{
+    future::{FutureExt, LocalBoxFuture},
+    stream::TryStreamExt,
+};
 use rand::Rng;
 use tikv_util::stream::error_stream;
 use tokio::fs::{self, File};
 use tokio_util::compat::FuturesAsyncReadCompatExt;
+use walkdir::WalkDir;
 
 use super::ExternalStorage;
 use crate::UnpinReader;
@@ -64,7 +70,12 @@ impl ExternalStorage for LocalStorage {
         Ok(url_for(self.base.as_path()))
     }
 
-    async fn write(&self, name: &str, reader: UnpinReader, _content_length: u64) -> io::Result<()> {
+    async fn write(
+        &self,
+        name: &str,
+        reader: UnpinReader<'_>,
+        _content_length: u64,
+    ) -> io::Result<()> {
         let p = Path::new(name);
         if p.is_absolute() {
             return Err(io::Error::new(
@@ -145,6 +156,77 @@ impl ExternalStorage for LocalStorage {
         let take = reader.take(len);
         Box::new(AllowStdIo::new(take)) as _
     }
+
+    fn iter_prefix(
+        &self,
+        prefix: &str,
+    ) -> std::pin::Pin<Box<dyn Stream<Item = io::Result<BlobObject>> + '_>> {
+        let p = Path::new(prefix);
+        let (dir_name, require_prefix) = if self.base.join(p).is_dir() {
+            // Fast path: when we are going to enumerate the content of a directory,
+            // just walk through this dir.
+            (p.to_owned(), None)
+        } else {
+            let dir = p.parent().unwrap_or_else(|| Path::new("")).to_owned();
+            let qualified_prefix = self.base.join(prefix).to_owned();
+            (dir, Some(qualified_prefix))
+        };
+
+        Box::pin(
+            futures::stream::iter(
+                WalkDir::new(self.base.join(dir_name))
+                    .follow_links(false)
+                    .into_iter()
+                    .filter(move |v| {
+                        let require_prefix = require_prefix.as_ref();
+                        v.as_ref()
+                            .map(|d| {
+                                let is_file = d.file_type().is_file();
+                                let target_file_name = match require_prefix {
+                                    None => true,
+                                    // We need to compare by bytes instead of using
+                                    // Path::starts_with, because the prefix may stop in
+                                    // the middle of a file name component.
+                                    Some(pfx) => d
+                                        .path()
+                                        .as_os_str()
+                                        .as_bytes()
+                                        .starts_with(pfx.as_os_str().as_bytes()),
+                                };
+                                is_file && target_file_name
+                            })
+                            .unwrap_or(false)
+                    }),
+            )
+            .map_err(|err| {
+                let kind = err
+                    .io_error()
+                    .map(|err| err.kind())
+                    .unwrap_or(io::ErrorKind::Other);
+                io::Error::new(kind, err)
+            })
+            .and_then(|v| {
+                let rel = v.path().strip_prefix(&self.base);
+                match rel {
+                    Err(_) => futures::future::err(io::Error::new(
+                        io::ErrorKind::Other,
+                        format!(
+                            "unknown: found an entry that does not match the base... it is {}, our base is {}",
+                            v.path().display(),
+                            self.base.display()
+                        ),
+                    )),
+                    Ok(item) => futures::future::ok(BlobObject {
+                        key: item.to_string_lossy().into_owned(),
+                    }),
+                }
+            }),
+        )
+    }
+
+    fn delete(&self, name: &str) -> LocalBoxFuture<'_, io::Result<()>> {
+        let path = self.base.join(name);
+        async move {
+            match fs::remove_file(&path).await {
+                Err(err) if err.kind() != io::ErrorKind::NotFound => return Err(err),
+                _ => {}
+            };
+            // sync the inode of the base directory.
+            self.base_dir.sync_all().await
+        }
+        .boxed_local()
+    }
 }
 
 #[cfg(test)]
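A small illustration (not in the patch) of the byte-wise matching above: a prefix that stops in the middle of a file name still matches, which `Path::starts_with` would reject. Assumes the crate's existing test dependencies (`tokio`, `tempfile`, `futures`):

```rust
use external_storage::{ExternalStorage, LocalStorage};
use futures_util::stream::TryStreamExt;

#[tokio::test]
async fn prefix_matches_partial_file_names() -> std::io::Result<()> {
    let dir = tempfile::tempdir()?;
    let ls = LocalStorage::new(dir.path())?;
    ls.write("backup_001.sst", futures::io::empty().into(), 0).await?;
    ls.write("backup_002.sst", futures::io::empty().into(), 0).await?;
    ls.write("other.sst", futures::io::empty().into(), 0).await?;

    // "backup_" is not a directory, so the slow path compares raw bytes.
    let keys: Vec<_> = ls.iter_prefix("backup_").try_collect().await?;
    assert_eq!(keys.len(), 2);
    Ok(())
}
```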
diff --git a/components/external_storage/src/locking.rs b/components/external_storage/src/locking.rs
new file mode 100644
index 00000000000..b44c974c3ec
--- /dev/null
+++ b/components/external_storage/src/locking.rs
@@ -0,0 +1,416 @@
+//! This mod allows you to create a "lock" in the external storage.
+//!
+//! In a storage where PUT and LIST are strongly consistent (both AWS S3 and
+//! GCP GCS support this), an atomic lock can be acquired: that is, if many
+//! clients race to lock one file, at most one of them can eventually
+//! acquire the lock.
+//!
+//! The atomicity is implemented by a "write-and-verify" protocol, that is:
+//!
+//! - Before writing a file `f`, we will write an intention file
+//!   `f.INTENT.{txn_id}`, where `txn_id` is a unique ID generated for each
+//!   write.
+//! - Then, it double checks whether there are other intention files. If there
+//!   are, other clients must be trying to lock this file, so we delete our
+//!   intention file and return failure immediately.
+//!
+//! For now, there is no internal retry when locking fails. We may encounter
+//! livelocks when there are too many clients racing for the same lock.
+
+use std::io;
+
+use chrono::Utc;
+use futures_util::{
+    future::{ok, FutureExt, LocalBoxFuture},
+    io::AsyncReadExt,
+    stream::TryStreamExt,
+};
+use tikv_util::sys::{
+    hostname,
+    thread::{process_id, Pid},
+};
+use uuid::Uuid;
+
+use crate::{ExternalStorage, UnpinReader};
+
+#[derive(serde::Serialize, serde::Deserialize)]
+struct LockMeta {
+    locked_at: chrono::DateTime<Utc>,
+    locker_host: String,
+    locker_pid: Pid,
+    txn_id: Uuid,
+    hint: String,
+}
+
+impl LockMeta {
+    fn new(txn_id: Uuid, hint: String) -> Self {
+        Self {
+            locked_at: Utc::now(),
+            locker_host: hostname().unwrap_or_else(|| "an_unknown_tikv_node".to_owned()),
+            locker_pid: process_id(),
+            txn_id,
+            hint,
+        }
+    }
+
+    fn to_json(&self) -> io::Result<Vec<u8>> {
+        serde_json::ser::to_vec(self).map_err(|err| io::Error::new(io::ErrorKind::InvalidData, err))
+    }
+}
+
+#[derive(Debug)]
+pub struct RemoteLock {
+    txn_id: uuid::Uuid,
+    path: String,
+}
+
+impl RemoteLock {
+    /// Unlock this lock.
+    ///
+    /// If the lock was modified, this may return an error.
+    pub async fn unlock(&self, st: &dyn ExternalStorage) -> io::Result<()> {
+        let mut buf = vec![];
+        st.read(&self.path).read_to_end(&mut buf).await?;
+        let meta = serde_json::from_slice::<LockMeta>(&buf)?;
+        if meta.txn_id != self.txn_id {
+            return Err(io::Error::new(
+                io::ErrorKind::InvalidData,
+                format!(
+                    "TXN ID mismatch, remote {} ours {}",
+                    meta.txn_id, self.txn_id
+                ),
+            ));
+        }
+
+        st.delete(&self.path).await
+    }
+}
+
+/// The [`ExclusiveWriteTxn`] instance for putting a read lock for a path.
+struct PutRLock {
+    basic_path: String,
+    lock_path: String,
+    hint: String,
+}
+
+impl PutRLock {
+    fn new(path: &str, hint: String) -> Self {
+        Self {
+            basic_path: path.to_string(),
+            lock_path: format!("{}.READ.{:016x}", path, rand::random::<u64>()),
+            hint,
+        }
+    }
+}
+
+impl ExclusiveWriteTxn for PutRLock {
+    fn path(&self) -> &str {
+        &self.lock_path
+    }
+
+    fn content(&self, cx: ExclusiveWriteCtx<'_>) -> io::Result<Vec<u8>> {
+        LockMeta::new(cx.txn_id(), self.hint.clone()).to_json()
+    }
+
+    fn verify<'cx: 'ret, 's: 'ret, 'ret>(
+        &'s self,
+        cx: ExclusiveWriteCtx<'cx>,
+    ) -> LocalBoxFuture<'ret, io::Result<()>> {
+        // We need to capture `cx` here, or rustc complains that we are returning a
+        // future referencing a local variable. (Yes indeed.)
+        async move {
+            cx.check_files_of_prefix(&format!("{}.WRIT", self.basic_path), requirements::nothing)
+                .await
+        }
+        .boxed_local()
+    }
+}
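For orientation, the on-storage layout this module produces, derived from the `format!` calls in this file (the hex digits below are illustrative; they are random or txn-dependent):

```rust
// For a lock at "my_lock":
//
//   my_lock.WRIT                        <- write lock (at most one)
//   my_lock.READ.00a1b2c3d4e5f607       <- read locks (one per reader)
//   <lock path>.INTENT.<32 hex digits>  <- transient marker while locking
//
// A read lock only verifies that nothing exists under "my_lock.WRIT";
// a write lock verifies that nothing at all exists under "my_lock",
// except its own intent file.
```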
+
+/// The [`ExclusiveWriteTxn`] instance for putting a write lock for a path.
+struct PutWLock {
+    basic_path: String,
+    lock_path: String,
+    hint: String,
+}
+
+impl PutWLock {
+    fn new(path: &str, hint: String) -> Self {
+        Self {
+            basic_path: path.to_owned(),
+            lock_path: format!("{}.WRIT", path),
+            hint,
+        }
+    }
+}
+
+impl ExclusiveWriteTxn for PutWLock {
+    fn path(&self) -> &str {
+        &self.lock_path
+    }
+
+    fn content(&self, cx: ExclusiveWriteCtx<'_>) -> io::Result<Vec<u8>> {
+        LockMeta::new(cx.txn_id(), self.hint.clone()).to_json()
+    }
+
+    fn verify<'cx: 'ret, 's: 'ret, 'ret>(
+        &'s self,
+        cx: ExclusiveWriteCtx<'cx>,
+    ) -> LocalBoxFuture<'ret, io::Result<()>> {
+        async move {
+            cx.check_files_of_prefix(&self.basic_path, requirements::only(&cx.intent_file_name()))
+                .await
+        }
+        .boxed_local()
+    }
+}
+
+/// LockExt allows you to create locks at some path.
+#[allow(async_fn_in_trait)]
+pub trait LockExt {
+    /// Create a read lock at the given path.
+    /// If there are write locks at that path, this will fail.
+    ///
+    /// The hint will be saved as readable text in the lock file.
+    /// Will generate a lock file at `$path.READ.{random_hex_u64}`.
+    async fn lock_for_read(&self, path: &str, hint: String) -> io::Result<RemoteLock>;
+
+    /// Create a write lock at the given path.
+    /// If there is any read lock or write lock at that path, this will fail.
+    ///
+    /// The hint will be saved as readable text in the lock file.
+    /// Will generate a lock file at `$path.WRIT`.
+    async fn lock_for_write(&self, path: &str, hint: String) -> io::Result<RemoteLock>;
+}
+
+impl<S: ExclusiveWriteExt + ?Sized> LockExt for S {
+    async fn lock_for_read(&self, path: &str, hint: String) -> io::Result<RemoteLock> {
+        let w = PutRLock::new(path, hint);
+        let path = w.lock_path.clone();
+        let id = self.exclusive_write(&w).await?;
+        Ok(RemoteLock { txn_id: id, path })
+    }
+
+    async fn lock_for_write(&self, path: &str, hint: String) -> io::Result<RemoteLock> {
+        let w = PutWLock::new(path, hint);
+        let path = w.lock_path.clone();
+        let id = self.exclusive_write(&w).await?;
+        Ok(RemoteLock { txn_id: id, path })
+    }
+}
+
+#[derive(Clone, Copy)]
+pub struct ExclusiveWriteCtx<'a> {
+    file: &'a str,
+    txn_id: uuid::Uuid,
+    storage: &'a dyn ExternalStorage,
+}
+
+pub mod requirements {
+    use std::io;
+
+    pub fn only(expect: &str) -> impl (Fn(&str) -> io::Result<()>) + '_ {
+        move |v| {
+            if v != expect {
+                Err(io::Error::new(
+                    io::ErrorKind::AlreadyExists,
+                    format!("there is a file {}", v),
+                ))
+            } else {
+                Ok(())
+            }
+        }
+    }
+
+    pub fn nothing(v: &str) -> io::Result<()> {
+        Err(io::Error::new(
+            io::ErrorKind::AlreadyExists,
+            format!("there is a file {}", v),
+        ))
+    }
+}
+
+impl<'a> ExclusiveWriteCtx<'a> {
+    pub fn txn_id(&self) -> uuid::Uuid {
+        self.txn_id
+    }
+
+    pub async fn check_files_of_prefix(
+        &self,
+        prefix: &str,
+        mut requires: impl FnMut(&str) -> io::Result<()>,
+    ) -> io::Result<()> {
+        self.storage
+            .iter_prefix(prefix)
+            .try_for_each(|v| futures::future::ready(requires(&v.key)))
+            .await
+    }
+
+    pub async fn verify_only_my_intent(&self) -> io::Result<()> {
+        self.check_files_of_prefix(self.file, requirements::only(&self.intent_file_name()))
+            .await
+    }
+
+    pub fn intent_file_name(&self) -> String {
+        format!("{}.INTENT.{:032X}", self.file, self.txn_id)
+    }
+}
+
+#[allow(async_fn_in_trait)]
+pub trait ExclusiveWriteTxn {
+    fn path(&self) -> &str;
+    fn content(&self, cx: ExclusiveWriteCtx<'_>) -> io::Result<Vec<u8>>;
+    fn verify<'cx: 'ret, 's: 'ret, 'ret>(
+        &'s self,
+        _cx: ExclusiveWriteCtx<'cx>,
+    ) -> LocalBoxFuture<'ret, io::Result<()>> {
+        ok(()).boxed_local()
+    }
+}
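Any one-shot "write if not exists" operation can be phrased as an [`ExclusiveWriteTxn`], since `exclusive_write` already refuses to proceed when anything other than its own intent file exists under the target path. A hedged sketch; `PutIfAbsent` and `claim` are illustrative names, not part of this patch:

```rust
use std::io;

use external_storage::{
    locking::{ExclusiveWriteCtx, ExclusiveWriteExt, ExclusiveWriteTxn},
    ExternalStorage,
};

/// Atomically create `path` with `body`, failing if it already exists.
/// The default `verify` plus the built-in intent check already give the
/// "write if not exists" semantics.
struct PutIfAbsent {
    path: String,
    body: Vec<u8>,
}

impl ExclusiveWriteTxn for PutIfAbsent {
    fn path(&self) -> &str {
        &self.path
    }

    fn content(&self, _cx: ExclusiveWriteCtx<'_>) -> io::Result<Vec<u8>> {
        Ok(self.body.clone())
    }
}

async fn claim(storage: &dyn ExternalStorage) -> io::Result<uuid::Uuid> {
    storage
        .exclusive_write(&PutIfAbsent {
            path: "OWNER".to_owned(),
            body: b"hello".to_vec(),
        })
        .await
}
```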
+
+/// A storage that supports atomically writing a file if it does not already
+/// exist.
+pub trait ExclusiveWriteExt {
+    fn exclusive_write<'s: 'ret, 'txn: 'ret, 'ret>(
+        &'s self,
+        w: &'txn dyn ExclusiveWriteTxn,
+    ) -> LocalBoxFuture<'ret, io::Result<Uuid>>;
+}
+
+// In fact this can be implemented for all types that are `ExternalStorage`.
+//
+// But we cannot replace this implementation with:
+// ```no-run
+// impl<T: ExternalStorage + ?Sized> ExclusiveWriteExt for T
+// ```
+// Because when T is a blob storage, &T isn't one: that means we cannot
+// downcast a `&T` (i.e. we have a `&T`, but we cannot cast it to a `&dyn
+// ExternalStorage`, even if T itself implements `ExternalStorage`)! So we
+// cannot construct the `ExclusiveWriteCtx` here.
+//
+// We could remove the `?Sized` so that `&T` can be dereferenced and then
+// downcast. But then `dyn ExternalStorage`, the trait object of our universal
+// storage interface, would no longer be an `ExclusiveWriteExt` -- it is
+// simply `!Sized`.
+//
+// Writing a blanket implementation for those types is also not helpful,
+// because `ExternalStorage` requires `'static`... (Which was required by
+// `async_trait`...)
+//
+// Can we make the `ExclusiveWriteCtx` contain a `&T` instead of a `&dyn ...`?
+// Perhaps. I have tried it. Eventually blocked by something like "cyclic
+// dependencies" in the query system. I have no idea how to solve it. I gave
+// up.
+//
+// Hence, as a workaround, we directly implement this extension for the trait
+// object of the universal external storage interface.
+impl ExclusiveWriteExt for dyn ExternalStorage {
+    fn exclusive_write<'s: 'ret, 'txn: 'ret, 'ret>(
+        &'s self,
+        w: &'txn dyn ExclusiveWriteTxn,
+    ) -> LocalBoxFuture<'ret, io::Result<Uuid>> {
+        async move {
+            let txn_id = Uuid::new_v4();
+            let cx = ExclusiveWriteCtx {
+                file: w.path(),
+                txn_id,
+                storage: self,
+            };
+            futures::future::try_join(cx.verify_only_my_intent(), w.verify(cx)).await?;
+            let target = cx.intent_file_name();
+            self.write(&target, UnpinReader(Box::new(futures::io::empty())), 0)
+                .await?;
+
+            let result = async {
+                futures::future::try_join(cx.verify_only_my_intent(), w.verify(cx)).await?;
+                let content = w.content(cx)?;
+                self.write(
+                    w.path(),
+                    UnpinReader(Box::new(futures::io::Cursor::new(&content))),
+                    content.len() as _,
+                )
+                .await?;
+                io::Result::Ok(txn_id)
+            }
+            .await;
+
+            let _ = self.delete(&target).await;
+            result
+        }
+        .boxed_local()
+    }
+}
+
+#[cfg(test)]
+mod test {
+    use uuid::Uuid;
+
+    use super::LockExt;
+    use crate::{ExternalStorage, LocalStorage};
+
+    #[tokio::test]
+    async fn test_read_blocks_write() {
+        let temp_dir = tempfile::tempdir().unwrap();
+        let path = temp_dir.path();
+        let ls = LocalStorage::new(path).unwrap();
+        let ls = &ls as &dyn ExternalStorage;
+
+        let res = ls
+            .lock_for_read("my_lock", String::from("testing lock"))
+            .await;
+        let l1 = res.unwrap();
+        let res = ls
+            .lock_for_read("my_lock", String::from("testing lock"))
+            .await;
+        let l2 = res.unwrap();
+
+        let res = ls
+            .lock_for_write("my_lock", String::from("testing lock"))
+            .await;
+        res.unwrap_err();
+
+        l1.unlock(ls).await.unwrap();
+        l2.unlock(ls).await.unwrap();
+        let res = ls
+            .lock_for_write("my_lock", String::from("testing lock"))
+            .await;
+        res.unwrap();
+    }
+
+    #[tokio::test]
+    async fn test_write_blocks_read() {
+        let temp_dir = tempfile::tempdir().unwrap();
+        let path = temp_dir.path();
+        let ls = LocalStorage::new(path).unwrap();
+        let ls = &ls as &dyn ExternalStorage;
+
+        let res = ls
+            .lock_for_write("my_lock", String::from("testing lock"))
+            .await;
+        let l1 = res.unwrap();
+
+        let res = ls
+            .lock_for_read("my_lock", String::from("testing lock"))
+            .await;
+        res.unwrap_err();
+
+        l1.unlock(ls).await.unwrap();
+        let res = ls
+            .lock_for_read("my_lock", String::from("testing lock"))
+            .await;
+        res.unwrap();
+    }
+
+    #[tokio::test]
+    async fn test_dont_unlock_others() {
+        let temp_dir = tempfile::tempdir().unwrap();
+        let path = temp_dir.path();
+        let ls = LocalStorage::new(path).unwrap();
+        let ls = &ls as &dyn ExternalStorage;
+
+        let mut l1 = ls
+            .lock_for_read("my_lock", String::from("test"))
+            .await
+            .unwrap();
+        l1.txn_id = Uuid::new_v4();
+
+        l1.unlock(ls).await.unwrap_err();
+    }
+}
diff --git a/components/external_storage/src/noop.rs b/components/external_storage/src/noop.rs
index 50e9c43c7bc..3907d3d77ff 100644
--- a/components/external_storage/src/noop.rs
+++ b/components/external_storage/src/noop.rs
@@ -1,6 +1,11 @@
 // Copyright 2019 TiKV Project Authors. Licensed under Apache-2.0.
 
 use async_trait::async_trait;
+use cloud::blob::BlobObject;
+use futures_util::{
+    future::{self, LocalBoxFuture},
+    stream::{self, LocalBoxStream},
+};
 use tokio::io;
 use tokio_util::compat::{FuturesAsyncReadCompatExt, TokioAsyncReadCompatExt};
 
@@ -33,7 +38,7 @@ impl ExternalStorage for NoopStorage {
     async fn write(
         &self,
         _name: &str,
-        reader: UnpinReader,
+        reader: UnpinReader<'_>,
         _content_length: u64,
     ) -> io::Result<()> {
         // we must still process the entire reader to run the SHA-256 hasher.
@@ -48,6 +53,14 @@ impl ExternalStorage for NoopStorage {
     fn read_part(&self, _name: &str, _off: u64, _len: u64) -> ExternalData<'_> {
         Box::new(io::empty().compat())
    }
+
+    fn iter_prefix(&self, _prefix: &str) -> LocalBoxStream<'_, io::Result<BlobObject>> {
+        Box::pin(stream::empty())
+    }
+
+    fn delete(&self, _name: &str) -> LocalBoxFuture<'_, io::Result<()>> {
+        Box::pin(future::ok(()))
+    }
 }
 
 #[cfg(test)]
diff --git a/components/file_system/Cargo.toml b/components/file_system/Cargo.toml
index 3515e64064e..bcfa7221825 100644
--- a/components/file_system/Cargo.toml
+++ b/components/file_system/Cargo.toml
@@ -22,8 +22,6 @@ prometheus = { version = "0.13", features = ["nightly"] }
 prometheus-static-metric = "0.5"
 rand = "0.8"
 serde = "1.0"
-slog = { workspace = true }
-slog-global = { workspace = true }
 strum = { version = "0.20", features = ["derive"] }
 tikv_alloc = { workspace = true }
 tikv_util = { workspace = true }
diff --git a/components/file_system/src/lib.rs b/components/file_system/src/lib.rs
index 1c5577f361a..48a7e59d447 100644
--- a/components/file_system/src/lib.rs
+++ b/components/file_system/src/lib.rs
@@ -29,8 +29,10 @@ pub use std::{
 use std::{
     io::{self, ErrorKind, Read, Write},
     path::Path,
+    pin::Pin,
     str::FromStr,
     sync::{Arc, Mutex},
+    task::{ready, Context, Poll},
 };
 
 pub use file::{File, OpenOptions};
@@ -50,6 +52,7 @@ pub use rate_limiter::{
 };
 use serde::{Deserialize, Deserializer, Serialize, Serializer};
 use strum::{EnumCount, EnumIter};
+use tokio::io::{AsyncRead, ReadBuf};
 
 #[derive(Clone, Copy, Debug, PartialEq)]
 pub enum IoOp {
@@ -431,6 +434,34 @@ impl<R: Read> Read for Sha256Reader<R> {
     }
 }
 
+impl<R: AsyncRead + Unpin> AsyncRead for Sha256Reader<R> {
+    fn poll_read(
+        mut self: Pin<&mut Self>,
+        cx: &mut Context<'_>,
+        buf: &mut ReadBuf<'_>,
+    ) -> Poll<io::Result<()>> {
+        let initial_filled_len = buf.filled().len();
+        ready!(Pin::new(&mut self.reader).poll_read(cx, buf))?;
+
+        let filled_len = buf.filled().len();
+        if initial_filled_len == filled_len {
+            return Poll::Ready(Ok(()));
+        }
+        let new_data = &buf.filled()[initial_filled_len..filled_len];
+
+        // Update the hasher with the read data
+        let mut hasher = self
+            .hasher
+            .lock()
+            .expect("failed to lock hasher in Sha256Reader async read");
if let Err(e) = hasher.update(new_data) { + return Poll::Ready(Err(io::Error::new(ErrorKind::Other, e))); + } + + Poll::Ready(Ok(())) + } +} + pub const SPACE_PLACEHOLDER_FILE: &str = "space_placeholder_file"; /// Create a file with hole, to reserve space for TiKV. diff --git a/components/health_controller/Cargo.toml b/components/health_controller/Cargo.toml index 064ba91611d..829e6f71f2e 100644 --- a/components/health_controller/Cargo.toml +++ b/components/health_controller/Cargo.toml @@ -11,7 +11,6 @@ kvproto = { workspace = true } ordered-float = "2.6" parking_lot = "0.12.1" prometheus = { version = "0.13", features = ["nightly"] } -prometheus-static-metric = "0.5" slog = { workspace = true } slog-global = { workspace = true } tikv_util = { workspace = true } diff --git a/components/hybrid_engine/Cargo.toml b/components/hybrid_engine/Cargo.toml index 682e9ba9036..3a23f9927da 100644 --- a/components/hybrid_engine/Cargo.toml +++ b/components/hybrid_engine/Cargo.toml @@ -5,33 +5,28 @@ edition = "2021" publish = false license = "Apache-2.0" -[features] -testexport = [] -failpoints = ["fail/failpoints"] - -[[test]] -name = "failpoints" -path = "tests/failpoints/mod.rs" -required-features = ["failpoints"] - [dependencies] -crossbeam = { workspace = true } -engine_rocks = { workspace = true } engine_traits = { workspace = true } -fail = "0.5" -kvproto = { workspace = true } -lazy_static = "1.4.0" +txn_types = { workspace = true } +tikv_util = { workspace = true } +engine_rocks = { workspace = true } online_config = { workspace = true } -prometheus = { version = "0.13", default-features = false, features = ["nightly"] } -prometheus-static-metric = "0.5" -raft = { workspace = true } -raftstore = { workspace = true } -range_cache_memory_engine = { workspace = true } +in_memory_engine = { workspace = true } slog = { workspace = true } slog-global = { workspace = true } tempfile = "3.0" -tikv_util = { workspace = true } -txn_types = { workspace = true } +prometheus = { version = "0.13", default-features = false, features = [ + "nightly", +] } +prometheus-static-metric = "0.5" +lazy_static = "1.4.0" +crossbeam = { workspace = true } +raftstore = { workspace = true } +raft = { workspace = true } +kvproto = { workspace = true } +keys = { workspace = true } [dev-dependencies] tempfile = "3.0" +test_util = { workspace = true } +fail = { version = "0.5", features = ["failpoints"] } diff --git a/components/hybrid_engine/src/cf_names.rs b/components/hybrid_engine/src/cf_names.rs deleted file mode 100644 index 3393f720973..00000000000 --- a/components/hybrid_engine/src/cf_names.rs +++ /dev/null @@ -1,15 +0,0 @@ -// Copyright 2023 TiKV Project Authors. Licensed under Apache-2.0. - -use engine_traits::{CfNamesExt, KvEngine, RangeCacheEngine}; - -use crate::engine::HybridEngine; - -impl CfNamesExt for HybridEngine -where - EK: KvEngine, - EC: RangeCacheEngine, -{ - fn cf_names(&self) -> Vec<&str> { - self.disk_engine().cf_names() - } -} diff --git a/components/hybrid_engine/src/cf_options.rs b/components/hybrid_engine/src/cf_options.rs deleted file mode 100644 index 84ec83272f1..00000000000 --- a/components/hybrid_engine/src/cf_options.rs +++ /dev/null @@ -1,21 +0,0 @@ -// Copyright 2023 TiKV Project Authors. Licensed under Apache-2.0. 
- -use engine_traits::{CfOptionsExt, KvEngine, RangeCacheEngine, Result}; - -use crate::engine::HybridEngine; - -impl CfOptionsExt for HybridEngine -where - EK: KvEngine, - EC: RangeCacheEngine, -{ - type CfOptions = EK::CfOptions; - - fn get_options_cf(&self, cf: &str) -> Result { - self.disk_engine().get_options_cf(cf) - } - - fn set_options_cf(&self, cf: &str, options: &[(&str, &str)]) -> Result<()> { - self.disk_engine().set_options_cf(cf, options) - } -} diff --git a/components/hybrid_engine/src/checkpoint.rs b/components/hybrid_engine/src/checkpoint.rs deleted file mode 100644 index d1a12ca0d7e..00000000000 --- a/components/hybrid_engine/src/checkpoint.rs +++ /dev/null @@ -1,22 +0,0 @@ -// Copyright 2023 TiKV Project Authors. Licensed under Apache-2.0. - -use engine_traits::{Checkpointable, KvEngine, RangeCacheEngine, Result}; - -use crate::engine::HybridEngine; - -impl Checkpointable for HybridEngine -where - EK: KvEngine, - EC: RangeCacheEngine, -{ - type Checkpointer = EK::Checkpointer; - - fn new_checkpointer(&self) -> Result { - self.disk_engine().new_checkpointer() - } - - fn merge(&self, dbs: &[&Self]) -> Result<()> { - let disk_dbs: Vec<_> = dbs.iter().map(|&db| db.disk_engine()).collect(); - self.disk_engine().merge(&disk_dbs) - } -} diff --git a/components/hybrid_engine/src/compact.rs b/components/hybrid_engine/src/compact.rs deleted file mode 100644 index 27760148b20..00000000000 --- a/components/hybrid_engine/src/compact.rs +++ /dev/null @@ -1,65 +0,0 @@ -// Copyright 2023 TiKV Project Authors. Licensed under Apache-2.0. - -use engine_traits::{CompactExt, KvEngine, ManualCompactionOptions, RangeCacheEngine, Result}; - -use crate::engine::HybridEngine; - -impl CompactExt for HybridEngine -where - EK: KvEngine, - EC: RangeCacheEngine, -{ - type CompactedEvent = EK::CompactedEvent; - - fn auto_compactions_is_disabled(&self) -> Result { - self.disk_engine().auto_compactions_is_disabled() - } - - fn compact_range_cf( - &self, - cf: &str, - start_key: Option<&[u8]>, - end_key: Option<&[u8]>, - compaction_option: ManualCompactionOptions, - ) -> Result<()> { - self.disk_engine() - .compact_range_cf(cf, start_key, end_key, compaction_option) - } - - fn compact_files_in_range_cf( - &self, - cf: &str, - start: Option<&[u8]>, - end: Option<&[u8]>, - output_level: Option, - ) -> Result<()> { - self.disk_engine() - .compact_files_in_range_cf(cf, start, end, output_level) - } - - fn compact_files_in_range( - &self, - start: Option<&[u8]>, - end: Option<&[u8]>, - output_level: Option, - ) -> Result<()> { - self.disk_engine() - .compact_files_in_range(start, end, output_level) - } - - fn compact_files_cf( - &self, - cf: &str, - files: Vec, - output_level: Option, - max_subcompactions: u32, - exclude_l0: bool, - ) -> Result<()> { - self.disk_engine() - .compact_files_cf(cf, files, output_level, max_subcompactions, exclude_l0) - } - - fn check_in_range(&self, start: Option<&[u8]>, end: Option<&[u8]>) -> Result<()> { - self.disk_engine().check_in_range(start, end) - } -} diff --git a/components/hybrid_engine/src/db_options.rs b/components/hybrid_engine/src/db_options.rs deleted file mode 100644 index 7a6f3dc5ce5..00000000000 --- a/components/hybrid_engine/src/db_options.rs +++ /dev/null @@ -1,21 +0,0 @@ -// Copyright 2023 TiKV Project Authors. Licensed under Apache-2.0. 
- -use engine_traits::{DbOptionsExt, KvEngine, RangeCacheEngine, Result}; - -use crate::engine::HybridEngine; - -impl DbOptionsExt for HybridEngine -where - EK: KvEngine, - EC: RangeCacheEngine, -{ - type DbOptions = EK::DbOptions; - - fn get_db_options(&self) -> Self::DbOptions { - self.disk_engine().get_db_options() - } - - fn set_db_options(&self, options: &[(&str, &str)]) -> Result<()> { - self.disk_engine().set_db_options(options) - } -} diff --git a/components/hybrid_engine/src/db_vector.rs b/components/hybrid_engine/src/db_vector.rs index 821de1ac416..1b286e182e7 100644 --- a/components/hybrid_engine/src/db_vector.rs +++ b/components/hybrid_engine/src/db_vector.rs @@ -5,13 +5,13 @@ use std::{ ops::Deref, }; -use engine_traits::{DbVector, KvEngine, Peekable, RangeCacheEngine, ReadOptions, Result}; +use engine_traits::{DbVector, KvEngine, Peekable, ReadOptions, RegionCacheEngine, Result}; use tikv_util::Either; pub struct HybridDbVector where EK: KvEngine, - EC: RangeCacheEngine, + EC: RegionCacheEngine, { db_vec: Either<::DbVector, ::DbVector>, } @@ -19,16 +19,16 @@ where impl DbVector for HybridDbVector where EK: KvEngine, - EC: RangeCacheEngine, + EC: RegionCacheEngine, { } impl HybridDbVector where EK: KvEngine, - EC: RangeCacheEngine, + EC: RegionCacheEngine, { - pub fn try_from_disk_snap( + pub(crate) fn try_from_disk_snap( snap: &EK::Snapshot, opts: &ReadOptions, cf: &str, @@ -41,7 +41,7 @@ where })) } - pub fn try_from_cache_snap( + pub(crate) fn try_from_cache_snap( snap: &EC::Snapshot, opts: &ReadOptions, cf: &str, @@ -58,7 +58,7 @@ where impl Deref for HybridDbVector where EK: KvEngine, - EC: RangeCacheEngine, + EC: RegionCacheEngine, { type Target = [u8]; @@ -73,7 +73,7 @@ where impl Debug for HybridDbVector where EK: KvEngine, - EC: RangeCacheEngine, + EC: RegionCacheEngine, { fn fmt(&self, formatter: &mut Formatter<'_>) -> fmt::Result { write!(formatter, "{:?}", &**self) @@ -83,7 +83,7 @@ where impl<'a, EK, EC> PartialEq<&'a [u8]> for HybridDbVector where EK: KvEngine, - EC: RangeCacheEngine, + EC: RegionCacheEngine, { fn eq(&self, rhs: &&[u8]) -> bool { **rhs == **self diff --git a/components/hybrid_engine/src/engine.rs b/components/hybrid_engine/src/engine.rs index e3591098cb4..d7f5caf038e 100644 --- a/components/hybrid_engine/src/engine.rs +++ b/components/hybrid_engine/src/engine.rs @@ -1,16 +1,6 @@ // Copyright 2023 TiKV Project Authors. Licensed under Apache-2.0. -use engine_traits::{ - FailedReason, KvEngine, Mutable, Peekable, RangeCacheEngine, ReadOptions, Result, - SnapshotContext, SnapshotMiscExt, SyncMutable, WriteBatch, WriteBatchExt, -}; - -use crate::{ - metrics::{ - RANGE_CACHEN_SNAPSHOT_ACQUIRE_FAILED_REASON_COUNT_STAIC, SNAPSHOT_TYPE_COUNT_STATIC, - }, - snapshot::HybridEngineSnapshot, -}; +use engine_traits::{KvEngine, RegionCacheEngine}; /// This engine is structured with both a disk engine and an region cache /// engine. 
The disk engine houses the complete database data, whereas the @@ -22,214 +12,80 @@ use crate::{ pub struct HybridEngine where EK: KvEngine, - EC: RangeCacheEngine, + EC: RegionCacheEngine, { + #[allow(dead_code)] disk_engine: EK, - range_cache_engine: EC, -} - -impl HybridEngine -where - EK: KvEngine, - EC: RangeCacheEngine, -{ - pub fn disk_engine(&self) -> &EK { - &self.disk_engine - } - - pub fn mut_disk_engine(&mut self) -> &mut EK { - &mut self.disk_engine - } - - pub fn range_cache_engine(&self) -> &EC { - &self.range_cache_engine - } - - pub fn mut_range_cache_engine(&mut self) -> &mut EC { - &mut self.range_cache_engine - } + region_cache_engine: EC, } impl HybridEngine where EK: KvEngine, - EC: RangeCacheEngine, + EC: RegionCacheEngine, { - pub fn new(disk_engine: EK, range_cache_engine: EC) -> Self { + pub fn new(disk_engine: EK, region_cache_engine: EC) -> Self { Self { disk_engine, - range_cache_engine, + region_cache_engine, } } -} - -// todo: implement KvEngine methods as well as it's super traits. -impl KvEngine for HybridEngine -where - EK: KvEngine, - EC: RangeCacheEngine, - HybridEngine: WriteBatchExt, -{ - type Snapshot = HybridEngineSnapshot; - fn snapshot(&self, ctx: Option) -> Self::Snapshot { - let disk_snap = self.disk_engine.snapshot(ctx.clone()); - let range_cache_snap = if !self.range_cache_engine.enabled() { + #[cfg(test)] + pub(crate) fn new_snapshot( + &self, + ctx: Option, + ) -> crate::HybridEngineSnapshot { + use engine_traits::SnapshotMiscExt; + let disk_snap = self.disk_engine.snapshot(); + let region_cache_snap = if !self.region_cache_engine.enabled() { None } else if let Some(ctx) = ctx { - match self.range_cache_engine.snapshot( - ctx.range.unwrap(), - ctx.read_ts, - disk_snap.sequence_number(), - ) { - Ok(snap) => { - SNAPSHOT_TYPE_COUNT_STATIC.range_cache_engine.inc(); - Some(snap) - } - Err(FailedReason::TooOldRead) => { - RANGE_CACHEN_SNAPSHOT_ACQUIRE_FAILED_REASON_COUNT_STAIC - .too_old_read - .inc(); - None - } - Err(FailedReason::NotCached) => { - RANGE_CACHEN_SNAPSHOT_ACQUIRE_FAILED_REASON_COUNT_STAIC - .not_cached - .inc(); - None - } - } + self.region_cache_engine + .snapshot( + ctx.region.unwrap(), + ctx.read_ts, + disk_snap.sequence_number(), + ) + .ok() } else { - RANGE_CACHEN_SNAPSHOT_ACQUIRE_FAILED_REASON_COUNT_STAIC - .no_read_ts - .inc(); None }; - if range_cache_snap.is_none() { - SNAPSHOT_TYPE_COUNT_STATIC.rocksdb.inc(); - } - HybridEngineSnapshot::new(disk_snap, range_cache_snap) + crate::HybridEngineSnapshot::new(disk_snap, region_cache_snap) } - fn sync(&self) -> engine_traits::Result<()> { - self.disk_engine.sync() - } - - fn bad_downcast(&self) -> &T { - self.disk_engine.bad_downcast() - } - - #[cfg(feature = "testexport")] - fn inner_refcount(&self) -> usize { - self.disk_engine.inner_refcount() - } -} - -impl Peekable for HybridEngine -where - EK: KvEngine, - EC: RangeCacheEngine, -{ - type DbVector = EK::DbVector; - - // region cache engine only supports peekable trait in the snapshot of it - fn get_value_opt(&self, opts: &ReadOptions, key: &[u8]) -> Result> { - self.disk_engine.get_value_opt(opts, key) + #[cfg(test)] + pub(crate) fn disk_engine(&self) -> &EK { + &self.disk_engine } - // region cache engine only supports peekable trait in the snapshot of it - fn get_value_cf_opt( - &self, - opts: &ReadOptions, - cf: &str, - key: &[u8], - ) -> Result> { - self.disk_engine.get_value_cf_opt(opts, cf, key) + pub fn region_cache_engine(&self) -> &EC { + &self.region_cache_engine } } -impl SyncMutable for HybridEngine -where - EK: 
KvEngine, - EC: RangeCacheEngine, - HybridEngine: WriteBatchExt, -{ - fn put(&self, key: &[u8], value: &[u8]) -> Result<()> { - let mut batch = self.write_batch(); - if let Some(range) = self.range_cache_engine.get_range_for_key(key) { - batch.prepare_for_range(range); - } - batch.put(key, value)?; - let _ = batch.write()?; - Ok(()) - } - - fn put_cf(&self, cf: &str, key: &[u8], value: &[u8]) -> Result<()> { - let mut batch = self.write_batch(); - if let Some(range) = self.range_cache_engine.get_range_for_key(key) { - batch.prepare_for_range(range); - } - batch.put_cf(cf, key, value)?; - let _ = batch.write()?; - Ok(()) - } - - fn delete(&self, key: &[u8]) -> Result<()> { - let mut batch = self.write_batch(); - if let Some(range) = self.range_cache_engine.get_range_for_key(key) { - batch.prepare_for_range(range); - } - batch.delete(key)?; - let _ = batch.write()?; - Ok(()) - } - - fn delete_cf(&self, cf: &str, key: &[u8]) -> Result<()> { - let mut batch = self.write_batch(); - if let Some(range) = self.range_cache_engine.get_range_for_key(key) { - batch.prepare_for_range(range); - } - batch.delete_cf(cf, key)?; - let _ = batch.write()?; - Ok(()) - } - - fn delete_range(&self, begin_key: &[u8], end_key: &[u8]) -> Result<()> { - let mut batch = self.write_batch(); - if let Some(range) = self.range_cache_engine.get_range_for_key(begin_key) { - batch.prepare_for_range(range); - } - batch.delete_range(begin_key, end_key)?; - let _ = batch.write()?; - Ok(()) - } - - fn delete_range_cf(&self, cf: &str, begin_key: &[u8], end_key: &[u8]) -> Result<()> { - let mut batch = self.write_batch(); - if let Some(range) = self.range_cache_engine.get_range_for_key(begin_key) { - batch.prepare_for_range(range); - } - batch.delete_range_cf(cf, begin_key, end_key)?; - let _ = batch.write()?; - Ok(()) - } +#[cfg(test)] +#[derive(Debug, Clone)] +pub struct SnapshotContext { + pub region: Option, + pub read_ts: u64, } #[cfg(test)] mod tests { - use std::sync::Arc; use engine_rocks::util::new_engine; - use engine_traits::{CacheRange, KvEngine, SnapshotContext, CF_DEFAULT, CF_LOCK, CF_WRITE}; - use online_config::{ConfigChange, ConfigManager, ConfigValue}; - use range_cache_memory_engine::{ - config::RangeCacheConfigManager, RangeCacheEngineConfig, RangeCacheEngineContext, - RangeCacheMemoryEngine, + use engine_traits::{CacheRegion, CF_DEFAULT, CF_LOCK, CF_WRITE}; + use in_memory_engine::{ + config::InMemoryEngineConfigManager, test_util::new_region, InMemoryEngineConfig, + InMemoryEngineContext, RegionCacheMemoryEngine, }; + use online_config::{ConfigChange, ConfigManager, ConfigValue}; use tempfile::Builder; use tikv_util::config::VersionTrack; + use super::*; use crate::HybridEngine; #[test] @@ -240,39 +96,40 @@ mod tests { &[CF_DEFAULT, CF_LOCK, CF_WRITE], ) .unwrap(); - let config = Arc::new(VersionTrack::new(RangeCacheEngineConfig::config_for_test())); + let config = Arc::new(VersionTrack::new(InMemoryEngineConfig::config_for_test())); let memory_engine = - RangeCacheMemoryEngine::new(RangeCacheEngineContext::new_for_tests(config.clone())); + RegionCacheMemoryEngine::new(InMemoryEngineContext::new_for_tests(config.clone())); - let range = CacheRange::new(b"k00".to_vec(), b"k10".to_vec()); - memory_engine.new_range(range.clone()); - { - let mut core = memory_engine.core().write(); - core.mut_range_manager().set_safe_point(&range, 10); - } + let region = new_region(1, b"k00", b"k10"); + let range = CacheRegion::from_region(®ion); + memory_engine.new_region(region.clone()); + memory_engine + .core() + 
.region_manager() + .set_safe_point(region.id, 10); let hybrid_engine = HybridEngine::new(disk_engine, memory_engine.clone()); - let s = hybrid_engine.snapshot(None); - assert!(!s.range_cache_snapshot_available()); + let s = hybrid_engine.new_snapshot(None); + assert!(!s.region_cache_snapshot_available()); let mut snap_ctx = SnapshotContext { read_ts: 15, - range: Some(range.clone()), + region: Some(range.clone()), }; - let s = hybrid_engine.snapshot(Some(snap_ctx.clone())); - assert!(s.range_cache_snapshot_available()); + let s = hybrid_engine.new_snapshot(Some(snap_ctx.clone())); + assert!(s.region_cache_snapshot_available()); snap_ctx.read_ts = 5; - let s = hybrid_engine.snapshot(Some(snap_ctx.clone())); - assert!(!s.range_cache_snapshot_available()); + let s = hybrid_engine.new_snapshot(Some(snap_ctx.clone())); + assert!(!s.region_cache_snapshot_available()); - let mut config_manager = RangeCacheConfigManager(config.clone()); + let mut config_manager = InMemoryEngineConfigManager(config.clone()); let mut config_change = ConfigChange::new(); - config_change.insert(String::from("enabled"), ConfigValue::Bool(false)); + config_change.insert(String::from("enable"), ConfigValue::Bool(false)); config_manager.dispatch(config_change).unwrap(); - assert!(!config.value().enabled); + assert!(!config.value().enable); snap_ctx.read_ts = 15; - let s = hybrid_engine.snapshot(Some(snap_ctx)); - assert!(!s.range_cache_snapshot_available()); + let s = hybrid_engine.new_snapshot(Some(snap_ctx)); + assert!(!s.region_cache_snapshot_available()); } } diff --git a/components/hybrid_engine/src/engine_iterator.rs b/components/hybrid_engine/src/engine_iterator.rs index 9f4f60ee7de..6f55b394ca6 100644 --- a/components/hybrid_engine/src/engine_iterator.rs +++ b/components/hybrid_engine/src/engine_iterator.rs @@ -1,14 +1,14 @@ // Copyright 2023 TiKV Project Authors. Licensed under Apache-2.0. 
use engine_traits::{ - IterMetricsCollector, Iterable, Iterator, KvEngine, MetricsExt, RangeCacheEngine, Result, + IterMetricsCollector, Iterable, Iterator, KvEngine, MetricsExt, RegionCacheEngine, Result, }; use tikv_util::Either; pub struct HybridEngineIterator where EK: KvEngine, - EC: RangeCacheEngine, + EC: RegionCacheEngine, { iter: Either<::Iterator, ::Iterator>, } @@ -16,15 +16,15 @@ where impl HybridEngineIterator where EK: KvEngine, - EC: RangeCacheEngine, + EC: RegionCacheEngine, { - pub fn disk_engine_iterator(iter: ::Iterator) -> Self { + pub(crate) fn disk_engine_iterator(iter: ::Iterator) -> Self { Self { iter: Either::Left(iter), } } - pub fn range_cache_engine_iterator(iter: ::Iterator) -> Self { + pub(crate) fn region_cache_engine_iterator(iter: ::Iterator) -> Self { Self { iter: Either::Right(iter), } @@ -34,7 +34,7 @@ where impl Iterator for HybridEngineIterator where EK: KvEngine, - EC: RangeCacheEngine, + EC: RegionCacheEngine, { fn seek(&mut self, key: &[u8]) -> Result { match self.iter { @@ -103,7 +103,7 @@ where pub struct HybridEngineIterMetricsCollector where EK: KvEngine, - EC: RangeCacheEngine, + EC: RegionCacheEngine, { collector: Either< <::Iterator as MetricsExt>::Collector, @@ -114,7 +114,7 @@ where impl IterMetricsCollector for HybridEngineIterMetricsCollector where EK: KvEngine, - EC: RangeCacheEngine, + EC: RegionCacheEngine, { fn internal_delete_skipped_count(&self) -> u64 { match &self.collector { @@ -134,7 +134,7 @@ where impl MetricsExt for HybridEngineIterator where EK: KvEngine, - EC: RangeCacheEngine, + EC: RegionCacheEngine, { type Collector = HybridEngineIterMetricsCollector; diff --git a/components/hybrid_engine/src/flow_control_factors.rs b/components/hybrid_engine/src/flow_control_factors.rs deleted file mode 100644 index 2634ffa1ccc..00000000000 --- a/components/hybrid_engine/src/flow_control_factors.rs +++ /dev/null @@ -1,23 +0,0 @@ -// Copyright 2023 TiKV Project Authors. Licensed under Apache-2.0. - -use engine_traits::{FlowControlFactorsExt, KvEngine, RangeCacheEngine, Result}; - -use crate::engine::HybridEngine; - -impl FlowControlFactorsExt for HybridEngine -where - EK: KvEngine, - EC: RangeCacheEngine, -{ - fn get_cf_num_files_at_level(&self, cf: &str, level: usize) -> Result> { - self.disk_engine().get_cf_num_files_at_level(cf, level) - } - - fn get_cf_num_immutable_mem_table(&self, cf: &str) -> Result> { - self.disk_engine().get_cf_num_immutable_mem_table(cf) - } - - fn get_cf_pending_compaction_bytes(&self, cf: &str) -> Result> { - self.disk_engine().get_cf_pending_compaction_bytes(cf) - } -} diff --git a/components/hybrid_engine/src/hybrid_metrics.rs b/components/hybrid_engine/src/hybrid_metrics.rs deleted file mode 100644 index 2be75f95ead..00000000000 --- a/components/hybrid_engine/src/hybrid_metrics.rs +++ /dev/null @@ -1,25 +0,0 @@ -// Copyright 2023 TiKV Project Authors. Licensed under Apache-2.0. 
- -use engine_traits::{KvEngine, RangeCacheEngine, StatisticsReporter}; - -use crate::engine::HybridEngine; - -pub struct HybridEngineStatisticsReporter {} - -impl StatisticsReporter> for HybridEngineStatisticsReporter -where - EK: KvEngine, - EC: RangeCacheEngine, -{ - fn new(name: &str) -> Self { - unimplemented!() - } - - fn collect(&mut self, engine: &HybridEngine) { - unimplemented!() - } - - fn flush(&mut self) { - unimplemented!() - } -} diff --git a/components/hybrid_engine/src/import.rs b/components/hybrid_engine/src/import.rs deleted file mode 100644 index 3347ae41771..00000000000 --- a/components/hybrid_engine/src/import.rs +++ /dev/null @@ -1,17 +0,0 @@ -// Copyright 2023 TiKV Project Authors. Licensed under Apache-2.0. - -use engine_traits::{ImportExt, KvEngine, RangeCacheEngine}; - -use crate::engine::HybridEngine; - -impl ImportExt for HybridEngine -where - EK: KvEngine, - EC: RangeCacheEngine, -{ - type IngestExternalFileOptions = EK::IngestExternalFileOptions; - - fn ingest_external_file_cf(&self, cf: &str, files: &[&str]) -> engine_traits::Result<()> { - self.disk_engine().ingest_external_file_cf(cf, files) - } -} diff --git a/components/hybrid_engine/src/iterable.rs b/components/hybrid_engine/src/iterable.rs deleted file mode 100644 index 4e7a54faa07..00000000000 --- a/components/hybrid_engine/src/iterable.rs +++ /dev/null @@ -1,19 +0,0 @@ -// Copyright 2023 TiKV Project Authors. Licensed under Apache-2.0. - -use engine_traits::{IterOptions, Iterable, KvEngine, RangeCacheEngine, Result}; - -use crate::engine::HybridEngine; - -impl Iterable for HybridEngine -where - EK: KvEngine, - EC: RangeCacheEngine, -{ - type Iterator = EK::Iterator; - - fn iterator_opt(&self, cf: &str, opts: IterOptions) -> Result { - // Iterator of region cache engine should only be created from the - // snapshot of it - self.disk_engine().iterator_opt(cf, opts) - } -} diff --git a/components/hybrid_engine/src/lib.rs b/components/hybrid_engine/src/lib.rs index ba30a4f4b26..8db02de65d3 100644 --- a/components/hybrid_engine/src/lib.rs +++ b/components/hybrid_engine/src/lib.rs @@ -1,33 +1,17 @@ // Copyright 2023 TiKV Project Authors. Licensed under Apache-2.0. -#![allow(dead_code)] -#![allow(unused_variables)] +// TODO: HybridEngine has became a very thin shim, consider merging +// `HybridEngineSnapshot` and `HybridEngineIterator` into in_memory_engine +// crate. + #![feature(let_chains)] -mod cf_names; -mod cf_options; -mod checkpoint; -mod compact; -mod db_options; mod db_vector; mod engine; mod engine_iterator; -mod flow_control_factors; -mod hybrid_metrics; -mod import; -mod iterable; mod metrics; -mod misc; -mod mvcc_properties; pub mod observer; -mod perf_context; -mod range_cache_engine; -mod range_properties; mod snapshot; -mod sst; -mod table_properties; -mod ttl_properties; pub mod util; -mod write_batch; pub use engine::HybridEngine; pub use snapshot::HybridEngineSnapshot; diff --git a/components/hybrid_engine/src/metrics.rs b/components/hybrid_engine/src/metrics.rs index 9c31cead88e..32e49a223ca 100644 --- a/components/hybrid_engine/src/metrics.rs +++ b/components/hybrid_engine/src/metrics.rs @@ -5,9 +5,11 @@ use prometheus::{register_int_counter_vec, IntCounterVec}; use prometheus_static_metric::{auto_flush_from, make_auto_flush_static_metric}; make_auto_flush_static_metric! { + // We may acquire ime snapshot even not in coprocessor request. We count it as wasted. 
pub label_enum SnapshotType { rocksdb, - range_cache_engine, + in_memory_engine, + wasted, } pub struct SnapshotTypeCountVec: LocalIntCounter { @@ -18,6 +20,7 @@ make_auto_flush_static_metric! { no_read_ts, not_cached, too_old_read, + epoch_not_match, } pub struct FailedReasonCountVec: LocalIntCounter { @@ -32,10 +35,10 @@ lazy_static! { &["type"], ) .unwrap(); - pub static ref RANGE_CACHEN_SNAPSHOT_ACQUIRE_FAILED_REASON_COUNT_VEC: IntCounterVec = + pub static ref IN_MEMORY_ENGINE_SNAPSHOT_ACQUIRE_FAILED_REASON_COUNT_VEC: IntCounterVec = register_int_counter_vec!( - "tikv_range_cache_snapshot_acquire_failed_reason_count", - "The reasons for why range cache snapshot is not acquired", + "tikv_in_memory_engine_snapshot_acquire_failed_reason_count", + "The reasons for why region cache snapshot is not acquired", &["type"], ) .unwrap(); @@ -44,8 +47,8 @@ lazy_static! { lazy_static! { pub static ref SNAPSHOT_TYPE_COUNT_STATIC: SnapshotTypeCountVec = auto_flush_from!(SNAPSHOT_TYPE_COUNT_VEC, SnapshotTypeCountVec); - pub static ref RANGE_CACHEN_SNAPSHOT_ACQUIRE_FAILED_REASON_COUNT_STAIC: FailedReasonCountVec = auto_flush_from!( - RANGE_CACHEN_SNAPSHOT_ACQUIRE_FAILED_REASON_COUNT_VEC, + pub static ref IN_MEMORY_ENGINE_SNAPSHOT_ACQUIRE_FAILED_REASON_COUNT_STAIC: FailedReasonCountVec = auto_flush_from!( + IN_MEMORY_ENGINE_SNAPSHOT_ACQUIRE_FAILED_REASON_COUNT_VEC, FailedReasonCountVec ); } diff --git a/components/hybrid_engine/src/misc.rs b/components/hybrid_engine/src/misc.rs deleted file mode 100644 index 777637d8d77..00000000000 --- a/components/hybrid_engine/src/misc.rs +++ /dev/null @@ -1,211 +0,0 @@ -// Copyright 2023 TiKV Project Authors. Licensed under Apache-2.0. - -use engine_traits::{CacheRange, KvEngine, MiscExt, RangeCacheEngine, Result, WriteBatchExt}; - -use crate::{engine::HybridEngine, hybrid_metrics::HybridEngineStatisticsReporter}; - -impl MiscExt for HybridEngine -where - EK: KvEngine, - EC: RangeCacheEngine, - HybridEngine: WriteBatchExt, -{ - type StatisticsReporter = HybridEngineStatisticsReporter; - - fn flush_cf(&self, cf: &str, wait: bool) -> Result<()> { - self.disk_engine().flush_cf(cf, wait) - } - - fn flush_cfs(&self, cfs: &[&str], wait: bool) -> Result<()> { - self.disk_engine().flush_cfs(cfs, wait) - } - - fn flush_oldest_cf( - &self, - wait: bool, - threshold: Option, - ) -> Result { - self.disk_engine().flush_oldest_cf(wait, threshold) - } - - fn delete_ranges_cf( - &self, - wopts: &engine_traits::WriteOptions, - cf: &str, - strategy: engine_traits::DeleteStrategy, - ranges: &[engine_traits::Range<'_>], - ) -> Result { - for r in ranges { - self.range_cache_engine() - .evict_range(&CacheRange::new(r.start_key.to_vec(), r.end_key.to_vec())); - } - self.disk_engine() - .delete_ranges_cf(wopts, cf, strategy, ranges) - } - - fn get_approximate_memtable_stats_cf( - &self, - cf: &str, - range: &engine_traits::Range<'_>, - ) -> Result<(u64, u64)> { - self.disk_engine() - .get_approximate_memtable_stats_cf(cf, range) - } - - fn ingest_maybe_slowdown_writes(&self, cf: &str) -> Result { - self.disk_engine().ingest_maybe_slowdown_writes(cf) - } - - fn get_sst_key_ranges(&self, cf: &str, level: usize) -> Result, Vec)>> { - self.disk_engine().get_sst_key_ranges(cf, level) - } - - fn get_engine_used_size(&self) -> Result { - self.disk_engine().get_engine_used_size() - } - - fn path(&self) -> &str { - self.disk_engine().path() - } - - fn sync_wal(&self) -> Result<()> { - self.disk_engine().sync_wal() - } - - fn disable_manual_compaction(&self) -> Result<()> { - 
self.disk_engine().disable_manual_compaction() - } - - fn enable_manual_compaction(&self) -> Result<()> { - self.disk_engine().enable_manual_compaction() - } - - fn pause_background_work(&self) -> Result<()> { - self.disk_engine().pause_background_work() - } - - fn continue_background_work(&self) -> Result<()> { - self.disk_engine().continue_background_work() - } - - fn exists(path: &str) -> bool { - EK::exists(path) - } - - fn locked(path: &str) -> Result { - EK::locked(path) - } - - fn dump_stats(&self) -> Result { - self.disk_engine().dump_stats() - } - - fn get_latest_sequence_number(&self) -> u64 { - self.disk_engine().get_latest_sequence_number() - } - - fn get_oldest_snapshot_sequence_number(&self) -> Option { - self.disk_engine().get_oldest_snapshot_sequence_number() - } - - fn get_total_sst_files_size_cf(&self, cf: &str) -> Result> { - self.disk_engine().get_total_sst_files_size_cf(cf) - } - - fn get_num_keys(&self) -> Result { - self.disk_engine().get_num_keys() - } - - fn get_range_stats( - &self, - cf: &str, - start: &[u8], - end: &[u8], - ) -> Result> { - self.disk_engine().get_range_stats(cf, start, end) - } - - fn is_stalled_or_stopped(&self) -> bool { - self.disk_engine().is_stalled_or_stopped() - } - - fn get_active_memtable_stats_cf( - &self, - cf: &str, - ) -> Result> { - self.disk_engine().get_active_memtable_stats_cf(cf) - } - - fn get_accumulated_flush_count_cf(cf: &str) -> Result { - EK::get_accumulated_flush_count_cf(cf) - } - - type DiskEngine = EK::DiskEngine; - fn get_disk_engine(&self) -> &Self::DiskEngine { - self.disk_engine().get_disk_engine() - } -} - -#[cfg(test)] -mod tests { - use engine_traits::{ - CacheRange, DeleteStrategy, MiscExt, Mutable, Range, RangeCacheEngine, WriteBatch, - WriteBatchExt, WriteOptions, CF_DEFAULT, - }; - use range_cache_memory_engine::RangeCacheEngineConfig; - - use crate::util::hybrid_engine_for_tests; - - #[test] - fn test_delete_range() { - let range1 = CacheRange::new(b"k00".to_vec(), b"k10".to_vec()); - let range2 = CacheRange::new(b"k20".to_vec(), b"k30".to_vec()); - let range3 = CacheRange::new(b"k40".to_vec(), b"k50".to_vec()); - let range1_clone = range1.clone(); - let range2_clone = range2.clone(); - let range3_clone = range3.clone(); - let (_path, hybrid_engine) = hybrid_engine_for_tests( - "temp", - RangeCacheEngineConfig::config_for_test(), - move |memory_engine| { - memory_engine.new_range(range1_clone); - memory_engine.new_range(range2_clone); - memory_engine.new_range(range3_clone); - }, - ) - .unwrap(); - let mut write_batch = hybrid_engine.write_batch(); - write_batch.prepare_for_range(range1.clone()); - write_batch.put(b"k02", b"val").unwrap(); - write_batch.put(b"k03", b"val").unwrap(); - write_batch.prepare_for_range(range2.clone()); - write_batch.put(b"k22", b"val").unwrap(); - write_batch.put(b"k23", b"val").unwrap(); - write_batch.prepare_for_range(range3.clone()); - write_batch.put(b"k42", b"val").unwrap(); - write_batch.put(b"k42", b"val").unwrap(); - write_batch.write().unwrap(); - - hybrid_engine - .delete_ranges_cf( - &WriteOptions::default(), - CF_DEFAULT, - DeleteStrategy::DeleteByRange, - &[Range::new(b"k00", b"k15"), Range::new(b"k22", b"k27")], - ) - .unwrap(); - - hybrid_engine - .range_cache_engine() - .snapshot(range1.clone(), 1000, 1000) - .unwrap_err(); - hybrid_engine - .range_cache_engine() - .snapshot(range2.clone(), 1000, 1000) - .unwrap_err(); - hybrid_engine - .range_cache_engine() - .snapshot(range3.clone(), 1000, 1000) - .unwrap(); - } -} diff --git 
a/components/hybrid_engine/src/mvcc_properties.rs b/components/hybrid_engine/src/mvcc_properties.rs deleted file mode 100644 index 51a2434bad2..00000000000 --- a/components/hybrid_engine/src/mvcc_properties.rs +++ /dev/null @@ -1,23 +0,0 @@ -// Copyright 2023 TiKV Project Authors. Licensed under Apache-2.0. - -use engine_traits::{KvEngine, MvccProperties, MvccPropertiesExt, RangeCacheEngine}; -use txn_types::TimeStamp; - -use crate::engine::HybridEngine; - -impl MvccPropertiesExt for HybridEngine -where - EK: KvEngine, - EC: RangeCacheEngine, -{ - fn get_mvcc_properties_cf( - &self, - cf: &str, - safe_point: TimeStamp, - start_key: &[u8], - end_key: &[u8], - ) -> Option { - self.disk_engine() - .get_mvcc_properties_cf(cf, safe_point, start_key, end_key) - } -} diff --git a/components/hybrid_engine/src/observer.rs b/components/hybrid_engine/src/observer.rs deleted file mode 100644 index acf8eb990bf..00000000000 --- a/components/hybrid_engine/src/observer.rs +++ /dev/null @@ -1,288 +0,0 @@ -// Copyright 2024 TiKV Project Authors. Licensed under Apache-2.0. - -use std::sync::{Arc, Mutex}; - -use engine_traits::{CacheRange, KvEngine, RangeCacheEngineExt}; -use kvproto::{metapb::Region, raft_cmdpb::AdminCmdType, raft_serverpb::RaftApplyState}; -use raft::StateRole; -use raftstore::coprocessor::{ - AdminObserver, ApplyCtxInfo, BoxAdminObserver, BoxCmdObserver, BoxQueryObserver, - BoxRoleObserver, Cmd, CmdBatch, CmdObserver, Coprocessor, CoprocessorHost, ObserveLevel, - ObserverContext, QueryObserver, RegionState, RoleObserver, -}; - -#[derive(Clone)] -pub struct Observer { - pending_evict: Arc>>, - cache_engine: Arc, -} - -impl Observer { - pub fn new(cache_engine: Arc) -> Self { - Observer { - pending_evict: Arc::default(), - cache_engine, - } - } - - pub fn register_to(&self, coprocessor_host: &mut CoprocessorHost) { - // This observer does not need high priority, use the default 100. - let priority = 100; - coprocessor_host - .registry - .register_cmd_observer(priority, BoxCmdObserver::new(self.clone())); - // Evict cache when a peer applies ingest sst. - coprocessor_host - .registry - .register_query_observer(priority, BoxQueryObserver::new(self.clone())); - // Evict cache when a peer applies region merge. - coprocessor_host - .registry - .register_admin_observer(priority, BoxAdminObserver::new(self.clone())); - // Evict cache when a leader steps down. - coprocessor_host - .registry - .register_role_observer(priority, BoxRoleObserver::new(self.clone())); - - // NB: We do not evict the cache when applying a snapshot because - // the peer must be in the follower role during this process. - // The cache is already evicted when the leader steps down. - } - - fn post_exec_cmd( - &self, - ctx: &mut ObserverContext<'_>, - cmd: &Cmd, - state: &RegionState, - apply: &mut ApplyCtxInfo<'_>, - ) { - if !self.cache_engine.range_cache_engine_enabled() { - return; - } - // Evict caches for successfully executed ingest commands and admin - // commands that change region range. - // - // NB: We do not evict the cache for region splits, as the split ranges - // still contain the latest data and hot regions are often split. - // Evicting the cache for region splits is not worthwhile and may cause - // performance regression due to frequent loading and evicting of - // hot regions. 
- if apply.pending_handle_ssts.is_some() - || (state.modified_region.is_some() - && matches!( - cmd.request.get_admin_request().get_cmd_type(), - AdminCmdType::PrepareMerge | AdminCmdType::CommitMerge - )) - { - let range = CacheRange::from_region(ctx.region()); - tikv_util::info!( - "evict range due to apply commands"; - "region_id" => ctx.region().get_id(), - "is_ingest_sst" => apply.pending_handle_ssts.is_some(), - "admin_command" => ?cmd.request.get_admin_request().get_cmd_type(), - "range" => ?range, - ); - self.pending_evict.lock().unwrap().push(range); - } - } - - fn on_flush_cmd(&self) { - if !self.cache_engine.range_cache_engine_enabled() { - return; - } - - let ranges = { - let mut ranges = self.pending_evict.lock().unwrap(); - std::mem::take(&mut *ranges) - }; - for range in ranges { - self.cache_engine.evict_range(&range); - } - } - - fn evict_range_on_leader_steps_down(&self, region: &Region) { - if !self.cache_engine.range_cache_engine_enabled() { - return; - } - - let range = CacheRange::from_region(region); - tikv_util::info!( - "evict range due to leader step down"; - "region_id" => region.get_id(), - "range" => ?range, - ); - self.pending_evict.lock().unwrap().push(range); - } -} - -impl Coprocessor for Observer {} - -impl QueryObserver for Observer { - fn post_exec_query( - &self, - ctx: &mut ObserverContext<'_>, - cmd: &Cmd, - _: &RaftApplyState, - state: &RegionState, - apply: &mut ApplyCtxInfo<'_>, - ) -> bool { - self.post_exec_cmd(ctx, cmd, state, apply); - // This observer does not require persisting the cmd to engine - // immediately, so return false. - false - } -} - -impl AdminObserver for Observer { - fn post_exec_admin( - &self, - ctx: &mut ObserverContext<'_>, - cmd: &Cmd, - _: &RaftApplyState, - state: &RegionState, - apply: &mut ApplyCtxInfo<'_>, - ) -> bool { - self.post_exec_cmd(ctx, cmd, state, apply); - // This observer does not require persisting the cmd to engine - // immediately, so return false. 
- false - } -} - -impl RoleObserver for Observer { - fn on_role_change( - &self, - ctx: &mut ObserverContext<'_>, - change: &raftstore::coprocessor::RoleChange, - ) { - if let StateRole::Follower = change.state - && change.initialized - { - self.evict_range_on_leader_steps_down(ctx.region()) - } - } -} - -impl CmdObserver for Observer { - fn on_flush_applied_cmd_batch( - &self, - _max_level: ObserveLevel, - _cmd_batches: &mut Vec, - _engine: &E, - ) { - self.on_flush_cmd(); - } - fn on_applied_current_term(&self, role: StateRole, region: &Region) {} -} - -#[cfg(test)] -mod tests { - use std::sync::atomic::{AtomicBool, Ordering}; - - use engine_traits::SstMetaInfo; - use kvproto::{ - import_sstpb::SstMeta, - metapb::Peer, - raft_cmdpb::{RaftCmdRequest, RaftCmdResponse}, - }; - - use super::*; - - #[derive(Default)] - struct MockRangeCacheEngine { - enabled: AtomicBool, - evicted_ranges: Arc>>, - } - impl RangeCacheEngineExt for MockRangeCacheEngine { - fn range_cache_engine_enabled(&self) -> bool { - self.enabled.load(Ordering::Relaxed) - } - fn evict_range(&self, range: &CacheRange) { - self.evicted_ranges.lock().unwrap().push(range.clone()); - } - } - - fn new_admin_request_batch_split() -> RaftCmdRequest { - let mut request = RaftCmdRequest::default(); - request - .mut_admin_request() - .set_cmd_type(AdminCmdType::BatchSplit); - request - } - - #[test] - fn test_do_not_evict_range_region_split() { - let cache_engine = Arc::new(MockRangeCacheEngine::default()); - let observer = Observer::new(cache_engine.clone()); - - let mut region = Region::default(); - region.set_id(1); - region.mut_peers().push(Peer::default()); - let mut ctx = ObserverContext::new(®ion); - - let mut pending_handle_ssts = None; - let mut delete_ssts = Vec::new(); - let mut pending_delete_ssts = Vec::new(); - - let mut apply = ApplyCtxInfo { - pending_handle_ssts: &mut pending_handle_ssts, - delete_ssts: &mut delete_ssts, - pending_delete_ssts: &mut pending_delete_ssts, - }; - let request = new_admin_request_batch_split(); - let response = RaftCmdResponse::default(); - let cmd = Cmd::new(0, 0, request, response); - - // Must not evict range for region split. - // - // Enable range cache engine. - cache_engine.enabled.store(true, Ordering::Relaxed); - observer.post_exec_cmd(&mut ctx, &cmd, &RegionState::default(), &mut apply); - observer.on_flush_cmd(); - let expected = CacheRange::from_region(®ion); - assert!(&cache_engine.evicted_ranges.lock().unwrap().is_empty()); - } - - #[test] - fn test_evict_range_ingest_sst() { - let cache_engine = Arc::new(MockRangeCacheEngine::default()); - let observer = Observer::new(cache_engine.clone()); - - let mut region = Region::default(); - region.set_id(1); - region.mut_peers().push(Peer::default()); - let mut ctx = ObserverContext::new(®ion); - - let mut meta = SstMeta::default(); - meta.set_region_id(1); - let meta = SstMetaInfo { - total_bytes: 0, - total_kvs: 0, - meta, - }; - let mut pending_handle_ssts = Some(vec![meta]); - let mut delete_ssts = Vec::new(); - let mut pending_delete_ssts = Vec::new(); - - let mut apply = ApplyCtxInfo { - pending_handle_ssts: &mut pending_handle_ssts, - delete_ssts: &mut delete_ssts, - pending_delete_ssts: &mut pending_delete_ssts, - }; - let request = RaftCmdRequest::default(); - let response = RaftCmdResponse::default(); - let cmd = Cmd::new(0, 0, request, response); - - // Must not evict range when range cache engine is disabled. 
-        observer.post_exec_cmd(&mut ctx, &cmd, &RegionState::default(), &mut apply);
-        observer.on_flush_cmd();
-        assert!(cache_engine.evicted_ranges.lock().unwrap().is_empty());
-
-        // Enable range cache engine.
-        cache_engine.enabled.store(true, Ordering::Relaxed);
-        observer.post_exec_cmd(&mut ctx, &cmd, &RegionState::default(), &mut apply);
-        observer.on_flush_cmd();
-        let expected = CacheRange::from_region(&region);
-        assert_eq!(&cache_engine.evicted_ranges.lock().unwrap()[0], &expected);
-    }
-}
diff --git a/components/hybrid_engine/src/observer/load_eviction.rs b/components/hybrid_engine/src/observer/load_eviction.rs
new file mode 100644
index 00000000000..5acfebe1650
--- /dev/null
+++ b/components/hybrid_engine/src/observer/load_eviction.rs
@@ -0,0 +1,409 @@
+// Copyright 2024 TiKV Project Authors. Licensed under Apache-2.0.
+
+use std::sync::Arc;
+
+use engine_traits::{CacheRegion, EvictReason, KvEngine, RegionCacheEngineExt, RegionEvent};
+use kvproto::{
+    metapb::Region,
+    raft_cmdpb::AdminCmdType,
+    raft_serverpb::{ExtraMessage, ExtraMessageType, RaftApplyState},
+};
+use raft::StateRole;
+use raftstore::coprocessor::{
+    dispatcher::{BoxDestroyPeerObserver, BoxExtraMessageObserver},
+    AdminObserver, ApplyCtxInfo, ApplySnapshotObserver, BoxAdminObserver, BoxApplySnapshotObserver,
+    BoxQueryObserver, BoxRoleObserver, Cmd, Coprocessor, CoprocessorHost, DestroyPeerObserver,
+    ExtraMessageObserver, ObserverContext, QueryObserver, RegionState, RoleObserver,
+};
+use tikv_util::info;
+
+#[derive(Clone)]
+pub struct LoadEvictionObserver {
+    cache_engine: Arc<dyn RegionCacheEngineExt + Send + Sync>,
+}
+
+impl LoadEvictionObserver {
+    pub fn new(cache_engine: Arc<dyn RegionCacheEngineExt + Send + Sync>) -> Self {
+        LoadEvictionObserver { cache_engine }
+    }
+
+    pub fn register_to<E: KvEngine>(&self, coprocessor_host: &mut CoprocessorHost<E>) {
+        // This observer does not need high priority; use the default 100.
+        let priority = 100;
+        // Evict cache when a peer applies ingest sst.
+        coprocessor_host
+            .registry
+            .register_query_observer(priority, BoxQueryObserver::new(self.clone()));
+        // Evict cache when a peer applies region merge.
+        coprocessor_host
+            .registry
+            .register_admin_observer(priority, BoxAdminObserver::new(self.clone()));
+        // Evict cache when a peer applies snapshot.
+        // Applying snapshot changes the data in rocksdb but not IME,
+        // so we trigger region eviction to keep the two engines consistent.
+        coprocessor_host.registry.register_apply_snapshot_observer(
+            priority,
+            BoxApplySnapshotObserver::new(self.clone()),
+        );
+        // Evict cache when a leader steps down.
+        coprocessor_host
+            .registry
+            .register_role_observer(priority, BoxRoleObserver::new(self.clone()));
+        // Pre-load the region on leader transfer.
+        coprocessor_host
+            .registry
+            .register_extra_message_observer(priority, BoxExtraMessageObserver::new(self.clone()));
+        // Evict the cached region when the peer is destroyed.
+        coprocessor_host
+            .registry
+            .register_destroy_peer_observer(priority, BoxDestroyPeerObserver::new(self.clone()));
+    }
+
+    fn post_exec_cmd(
+        &self,
+        ctx: &mut ObserverContext<'_>,
+        cmd: &Cmd,
+        state: &RegionState,
+        apply: &mut ApplyCtxInfo<'_>,
+    ) {
+        // Evict caches for successfully executed ingest commands and admin
+        // commands that change region range.
+        //
+        // NB: We do not evict the cache for region splits, as the split ranges
+        // still contain the latest data and hot regions are often split.
+        // Evicting the cache for region splits is not worthwhile and may cause
+        // performance regression due to frequent loading and evicting of
+        // hot regions.
+        if apply.pending_handle_ssts.is_some()
+            || (state.modified_region.is_some()
+                && matches!(
+                    cmd.request.get_admin_request().get_cmd_type(),
+                    AdminCmdType::PrepareMerge | AdminCmdType::CommitMerge
+                ))
+        {
+            let cache_region = CacheRegion::from_region(ctx.region());
+            info!(
+                "ime evict range due to apply commands";
+                "region" => ?cache_region,
+                "is_ingest_sst" => apply.pending_handle_ssts.is_some(),
+                "admin_command" => ?cmd.request.get_admin_request().get_cmd_type(),
+            );
+            self.evict_region(cache_region, EvictReason::Merge)
+        }
+        // If there are new_regions, this must be a split event.
+        if !state.new_regions.is_empty() {
+            let cmd_type = cmd.request.get_admin_request().get_cmd_type();
+            assert!(cmd_type == AdminCmdType::BatchSplit || cmd_type == AdminCmdType::Split);
+            info!(
+                "ime handle region split";
+                "region_id" => ctx.region().get_id(),
+                "admin_command" => ?cmd.request.get_admin_request().get_cmd_type(),
+                "region" => ?state.modified_region.as_ref().unwrap(),
+                "new_regions" => ?state.new_regions,
+            );
+
+            self.split_region(
+                CacheRegion::from_region(ctx.region()),
+                state
+                    .new_regions
+                    .iter()
+                    .map(CacheRegion::from_region)
+                    .collect(),
+            );
+        }
+    }
+
+    fn split_region(&self, source: CacheRegion, new_regions: Vec<CacheRegion>) {
+        self.cache_engine.on_region_event(RegionEvent::Split {
+            source,
+            new_regions,
+        });
+    }
+
+    // Try to load a region. It will be loaded only if it overlaps with a
+    // manual load range.
+    fn try_load_region(&self, region: CacheRegion) {
+        self.cache_engine.on_region_event(RegionEvent::TryLoad {
+            region,
+            for_manual_range: true,
+        });
+    }
+
+    fn evict_region(&self, region: CacheRegion, reason: EvictReason) {
+        self.cache_engine
+            .on_region_event(RegionEvent::Eviction { region, reason });
+    }
+}
+
+impl Coprocessor for LoadEvictionObserver {}
+
+impl QueryObserver for LoadEvictionObserver {
+    fn pre_exec_query(
+        &self,
+        _: &mut ObserverContext<'_>,
+        reqs: &[kvproto::raft_cmdpb::Request],
+        _: u64,
+        _: u64,
+    ) -> bool {
+        reqs.iter().for_each(|r| {
+            if r.has_delete_range() {
+                self.cache_engine
+                    .on_region_event(RegionEvent::EvictByRange {
+                        range: CacheRegion::new(
+                            0,
+                            0,
+                            keys::data_key(r.get_delete_range().get_start_key()),
+                            keys::data_key(r.get_delete_range().get_end_key()),
+                        ),
+                        reason: EvictReason::DeleteRange,
+                    })
+            }
+        });
+
+        false
+    }
+
+    fn post_exec_query(
+        &self,
+        ctx: &mut ObserverContext<'_>,
+        cmd: &Cmd,
+        _: &RaftApplyState,
+        state: &RegionState,
+        apply: &mut ApplyCtxInfo<'_>,
+    ) -> bool {
+        self.post_exec_cmd(ctx, cmd, state, apply);
+        // This observer does not require persisting the cmd to engine
+        // immediately, so return false.
+        false
+    }
+}
+
+impl AdminObserver for LoadEvictionObserver {
+    fn pre_exec_admin(
+        &self,
+        ctx: &mut ObserverContext<'_>,
+        req: &kvproto::raft_cmdpb::AdminRequest,
+        _: u64,
+        _: u64,
+    ) -> bool {
+        if req.cmd_type == AdminCmdType::PrepareFlashback {
+            let cache_region = CacheRegion::from_region(ctx.region());
+            self.evict_region(cache_region, EvictReason::Flashback);
+        }
+
+        false
+    }
+
+    fn post_exec_admin(
+        &self,
+        ctx: &mut ObserverContext<'_>,
+        cmd: &Cmd,
+        _: &RaftApplyState,
+        state: &RegionState,
+        apply: &mut ApplyCtxInfo<'_>,
+    ) -> bool {
+        self.post_exec_cmd(ctx, cmd, state, apply);
+        // This observer does not require persisting the cmd to engine
+        // immediately, so return false.
+        false
+    }
+
+    fn pre_transfer_leader(
+        &self,
+        ctx: &mut ObserverContext<'_>,
+        _tr: &kvproto::raft_cmdpb::TransferLeaderRequest,
+    ) -> raftstore::coprocessor::Result<Option<ExtraMessage>> {
+        if !self.cache_engine.region_cached(ctx.region()) {
+            return Ok(None);
+        }
+        let mut msg = ExtraMessage::new();
+        msg.set_type(ExtraMessageType::MsgPreLoadRegionRequest);
+        Ok(Some(msg))
+    }
+}
+
+impl ApplySnapshotObserver for LoadEvictionObserver {
+    fn post_apply_snapshot(
+        &self,
+        ctx: &mut ObserverContext<'_>,
+        _: u64,
+        _: &raftstore::store::SnapKey,
+        _: Option<&raftstore::store::Snapshot>,
+    ) {
+        // While currently we evict the cached region after the leader steps
+        // down, a region may still be loaded on a non-leader peer, e.g., one
+        // pre-loaded as the target of a leader transfer, so we must also
+        // evict here.
+        let cache_region = CacheRegion::from_region(ctx.region());
+        self.evict_region(cache_region, EvictReason::ApplySnapshot)
+    }
+}
+
+impl RoleObserver for LoadEvictionObserver {
+    fn on_role_change(
+        &self,
+        ctx: &mut ObserverContext<'_>,
+        change: &raftstore::coprocessor::RoleChange,
+    ) {
+        if let StateRole::Leader = change.state {
+            // Currently, it is only used by the manual load.
+            let cache_region = CacheRegion::from_region(ctx.region());
+            info!(
+                "ime try to load region due to became leader";
+                "region" => ?cache_region,
+            );
+            self.try_load_region(cache_region);
+        } else if let StateRole::Follower = change.state
+            && change.initialized
+        {
+            let cache_region = CacheRegion::from_region(ctx.region());
+            info!(
+                "ime try to evict region due to became follower";
+                "region" => ?cache_region,
+            );
+            self.evict_region(cache_region, EvictReason::BecomeFollower);
+        }
+    }
+}
+
+impl ExtraMessageObserver for LoadEvictionObserver {
+    fn on_extra_message(&self, r: &Region, extra_msg: &ExtraMessage) {
+        if extra_msg.get_type() == ExtraMessageType::MsgPreLoadRegionRequest {
+            self.cache_engine.load_region(r);
+        }
+    }
+}
+
+impl DestroyPeerObserver for LoadEvictionObserver {
+    fn on_destroy_peer(&self, r: &Region) {
+        self.cache_engine.on_region_event(RegionEvent::Eviction {
+            region: CacheRegion::from_region(r),
+            reason: EvictReason::PeerDestroy,
+        });
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use std::sync::Mutex;
+
+    use engine_traits::{RegionEvent, SstMetaInfo};
+    use kvproto::{
+        import_sstpb::SstMeta,
+        metapb::{Peer, Region},
+        raft_cmdpb::{RaftCmdRequest, RaftCmdResponse},
+    };
+    use raftstore::coprocessor::RoleChange;
+
+    use super::*;
+
+    #[derive(Default)]
+    struct MockRegionCacheEngine {
+        region_events: Arc<Mutex<Vec<RegionEvent>>>,
+    }
+    impl RegionCacheEngineExt for MockRegionCacheEngine {
+        fn on_region_event(&self, event: RegionEvent) {
+            self.region_events.lock().unwrap().push(event);
+        }
+
+        fn region_cached(&self, _: &Region) -> bool {
+            unreachable!()
+        }
+
+        fn load_region(&self, _: &Region) {
+            unreachable!()
+        }
+    }
+
+    fn new_admin_request_batch_split() -> RaftCmdRequest {
+        let mut request = RaftCmdRequest::default();
+        request
+            .mut_admin_request()
+            .set_cmd_type(AdminCmdType::BatchSplit);
+        request
+    }
+
+    #[test]
+    fn test_do_not_evict_region_region_split() {
+        let cache_engine = Arc::new(MockRegionCacheEngine::default());
+        let observer = LoadEvictionObserver::new(cache_engine.clone());
+
+        let mut region = Region::default();
+        region.set_id(1);
+        region.mut_peers().push(Peer::default());
+        let mut ctx = ObserverContext::new(&region);
+
+        let mut pending_handle_ssts = None;
+        let mut delete_ssts = Vec::new();
+        let mut pending_delete_ssts = Vec::new();
+
+        let mut apply = ApplyCtxInfo {
+            pending_handle_ssts: &mut
pending_handle_ssts, + delete_ssts: &mut delete_ssts, + pending_delete_ssts: &mut pending_delete_ssts, + }; + let request = new_admin_request_batch_split(); + let response = RaftCmdResponse::default(); + let cmd = Cmd::new(0, 0, request, response); + + // Must not evict range for region split. + observer.post_exec_cmd(&mut ctx, &cmd, &RegionState::default(), &mut apply); + assert!(&cache_engine.region_events.lock().unwrap().is_empty()); + } + + #[test] + fn test_evict_region_ingest_sst() { + let cache_engine = Arc::new(MockRegionCacheEngine::default()); + let observer = LoadEvictionObserver::new(cache_engine.clone()); + + let mut region = Region::default(); + region.set_id(1); + region.mut_peers().push(Peer::default()); + let mut ctx = ObserverContext::new(®ion); + + let mut meta = SstMeta::default(); + meta.set_region_id(1); + let meta = SstMetaInfo { + total_bytes: 0, + total_kvs: 0, + meta, + }; + let mut pending_handle_ssts = Some(vec![meta]); + let mut delete_ssts = Vec::new(); + let mut pending_delete_ssts = Vec::new(); + + let mut apply = ApplyCtxInfo { + pending_handle_ssts: &mut pending_handle_ssts, + delete_ssts: &mut delete_ssts, + pending_delete_ssts: &mut pending_delete_ssts, + }; + let request = RaftCmdRequest::default(); + let response = RaftCmdResponse::default(); + let cmd = Cmd::new(0, 0, request, response); + + observer.post_exec_cmd(&mut ctx, &cmd, &RegionState::default(), &mut apply); + let cached_region = CacheRegion::from_region(®ion); + let expected = RegionEvent::Eviction { + region: cached_region, + reason: EvictReason::Merge, + }; + assert_eq!(&cache_engine.region_events.lock().unwrap()[0], &expected); + } + + #[test] + fn test_load_region_became_leader() { + let cache_engine = Arc::new(MockRegionCacheEngine::default()); + let observer = LoadEvictionObserver::new(cache_engine.clone()); + + let mut region = Region::default(); + region.set_id(1); + region.mut_peers().push(Peer::default()); + let mut ctx = ObserverContext::new(®ion); + let role_change = RoleChange::new_for_test(StateRole::Leader); + observer.on_role_change(&mut ctx, &role_change); + let cached_region = CacheRegion::from_region(®ion); + let expected = RegionEvent::TryLoad { + region: cached_region, + for_manual_range: true, + }; + assert_eq!(&cache_engine.region_events.lock().unwrap()[0], &expected); + } +} diff --git a/components/hybrid_engine/src/observer/mod.rs b/components/hybrid_engine/src/observer/mod.rs new file mode 100644 index 00000000000..5144395357d --- /dev/null +++ b/components/hybrid_engine/src/observer/mod.rs @@ -0,0 +1,11 @@ +// Copyright 2024 TiKV Project Authors. Licensed under Apache-2.0. + +mod load_eviction; +mod snapshot; +#[cfg(test)] +mod test_write_batch; +mod write_batch; + +pub use load_eviction::LoadEvictionObserver; +pub use snapshot::{HybridSnapshotObserver, RegionCacheSnapshotPin}; +pub use write_batch::RegionCacheWriteBatchObserver; diff --git a/components/hybrid_engine/src/observer/snapshot.rs b/components/hybrid_engine/src/observer/snapshot.rs new file mode 100644 index 00000000000..edf8d9729f9 --- /dev/null +++ b/components/hybrid_engine/src/observer/snapshot.rs @@ -0,0 +1,102 @@ +// Copyright 2024 TiKV Project Authors. Licensed under Apache-2.0. 
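Everything the LoadEvictionObserver above reacts to (ingest SST, merge, split, snapshot apply, role change, delete-range, peer destroy) is funneled into a single RegionEvent handed to RegionCacheEngineExt::on_region_event. A minimal standalone sketch of that dispatch shape, using simplified stand-in types (u64 region ids instead of CacheRegion, and only two of the event variants from the patch):

// Simplified stand-ins for the engine_traits types used by the observer;
// the real RegionEvent variants carry CacheRegion payloads.
#[derive(Debug)]
enum EvictReason {
    PeerDestroy,
}

#[derive(Debug)]
enum RegionEvent {
    TryLoad { region_id: u64, for_manual_range: bool },
    Eviction { region_id: u64, reason: EvictReason },
}

trait RegionCacheEngineExt {
    fn on_region_event(&self, event: RegionEvent);
}

struct LoggingCacheEngine;

impl RegionCacheEngineExt for LoggingCacheEngine {
    fn on_region_event(&self, event: RegionEvent) {
        // A real engine mutates its region map; the sketch just logs.
        println!("cache engine got {:?}", event);
    }
}

fn main() {
    let cache = LoggingCacheEngine;
    // Became leader: try a load (honored only for manual load ranges).
    cache.on_region_event(RegionEvent::TryLoad {
        region_id: 1,
        for_manual_range: true,
    });
    // Peer destroyed: always evict.
    cache.on_region_event(RegionEvent::Eviction {
        region_id: 1,
        reason: EvictReason::PeerDestroy,
    });
}

Keeping the observer free of cache-mutation logic means the load/eviction policy lives entirely in the engine's event handler, which is what the mock in the tests above exploits.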
+
+use std::result;
+
+use engine_traits::{CacheRegion, FailedReason, KvEngine, RegionCacheEngine};
+use in_memory_engine::{RegionCacheMemoryEngine, RegionCacheSnapshot};
+use kvproto::metapb::Region;
+use raftstore::coprocessor::{
+    dispatcher::BoxSnapshotObserver, CoprocessorHost, ObservedSnapshot, SnapshotObserver,
+};
+
+use crate::metrics::{
+    IN_MEMORY_ENGINE_SNAPSHOT_ACQUIRE_FAILED_REASON_COUNT_STAIC, SNAPSHOT_TYPE_COUNT_STATIC,
+};
+
+/// RegionCacheSnapshotPin pins the data of a RegionCacheMemoryEngine while a
+/// snapshot is taken. It prevents the data from being evicted or deleted from
+/// the cache.
+// TODO: Remove it; theoretically it can be removed if we don't need an
+// in-memory engine snapshot when a region is removed or split.
+pub struct RegionCacheSnapshotPin {
+    pub snap: Option<result::Result<RegionCacheSnapshot, FailedReason>>,
+}
+
+impl Drop for RegionCacheSnapshotPin {
+    fn drop(&mut self) {
+        if matches!(self.snap, Some(Ok(_))) {
+            // The ime snapshot was acquired successfully but not used in a
+            // coprocessor request.
+            SNAPSHOT_TYPE_COUNT_STATIC.wasted.inc();
+        }
+    }
+}
+
+impl RegionCacheSnapshotPin {
+    pub fn take(&mut self) -> Option<RegionCacheSnapshot> {
+        if let Some(snap_result) = self.snap.take() {
+            match snap_result {
+                Ok(snap) => {
+                    SNAPSHOT_TYPE_COUNT_STATIC.in_memory_engine.inc();
+                    Some(snap)
+                }
+                Err(FailedReason::TooOldRead) => {
+                    IN_MEMORY_ENGINE_SNAPSHOT_ACQUIRE_FAILED_REASON_COUNT_STAIC
+                        .too_old_read
+                        .inc();
+                    SNAPSHOT_TYPE_COUNT_STATIC.rocksdb.inc();
+                    None
+                }
+                Err(FailedReason::NotCached) => {
+                    IN_MEMORY_ENGINE_SNAPSHOT_ACQUIRE_FAILED_REASON_COUNT_STAIC
+                        .not_cached
+                        .inc();
+                    SNAPSHOT_TYPE_COUNT_STATIC.rocksdb.inc();
+                    None
+                }
+                Err(FailedReason::EpochNotMatch) => {
+                    IN_MEMORY_ENGINE_SNAPSHOT_ACQUIRE_FAILED_REASON_COUNT_STAIC
+                        .epoch_not_match
+                        .inc();
+                    SNAPSHOT_TYPE_COUNT_STATIC.rocksdb.inc();
+                    None
+                }
+            }
+        } else {
+            None
+        }
+    }
+}
+
+impl ObservedSnapshot for RegionCacheSnapshotPin {}
+
+#[derive(Clone)]
+pub struct HybridSnapshotObserver {
+    cache_engine: RegionCacheMemoryEngine,
+}
+
+impl HybridSnapshotObserver {
+    pub fn new(cache_engine: RegionCacheMemoryEngine) -> Self {
+        HybridSnapshotObserver { cache_engine }
+    }
+
+    pub fn register_to<EK: KvEngine>(&self, coprocessor_host: &mut CoprocessorHost<EK>) {
+        coprocessor_host
+            .registry
+            .register_snapshot_observer(BoxSnapshotObserver::new(self.clone()));
+    }
+}
+
+impl SnapshotObserver for HybridSnapshotObserver {
+    fn on_snapshot(
+        &self,
+        region: &Region,
+        read_ts: u64,
+        sequence_number: u64,
+    ) -> Box<dyn ObservedSnapshot> {
+        // Taking a snapshot to pin data in the cache engine, which prevents
+        // the data from being evicted or deleted from the cache.
+        // The data should be released when the snapshot is dropped.
+        let region = CacheRegion::from_region(region);
+        let snap = Some(self.cache_engine.snapshot(region, read_ts, sequence_number));
+        Box::new(RegionCacheSnapshotPin { snap })
+    }
+}
diff --git a/components/hybrid_engine/src/observer/test_write_batch.rs b/components/hybrid_engine/src/observer/test_write_batch.rs
new file mode 100644
index 00000000000..faa1cbf30d1
--- /dev/null
+++ b/components/hybrid_engine/src/observer/test_write_batch.rs
@@ -0,0 +1,278 @@
+// Copyright 2024 TiKV Project Authors. Licensed under Apache-2.0.
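The Drop impl of RegionCacheSnapshotPin above is what feeds the new `wasted` label: a pin dropped while still holding an Ok snapshot means the acquisition did work that no read ever consumed. A minimal sketch of the same RAII accounting idea, with plain atomics standing in for the Prometheus counters:

use std::sync::atomic::{AtomicUsize, Ordering};

static USED: AtomicUsize = AtomicUsize::new(0);
static WASTED: AtomicUsize = AtomicUsize::new(0);

struct SnapshotPin {
    snap: Option<String>, // stand-in for a real cache snapshot
}

impl SnapshotPin {
    fn take(&mut self) -> Option<String> {
        let s = self.snap.take();
        if s.is_some() {
            USED.fetch_add(1, Ordering::Relaxed);
        }
        s
    }
}

impl Drop for SnapshotPin {
    fn drop(&mut self) {
        // Acquired but never consumed: count the acquisition as wasted.
        if self.snap.is_some() {
            WASTED.fetch_add(1, Ordering::Relaxed);
        }
    }
}

fn main() {
    let mut consumed = SnapshotPin { snap: Some("snap-a".into()) };
    assert!(consumed.take().is_some());
    drop(consumed); // taken, so not wasted

    let untouched = SnapshotPin { snap: Some("snap-b".into()) };
    drop(untouched); // never taken, counted as wasted

    assert_eq!(USED.load(Ordering::Relaxed), 1);
    assert_eq!(WASTED.load(Ordering::Relaxed), 1);
}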
+
+use std::{sync::mpsc::sync_channel, time::Duration};
+
+use crossbeam::epoch;
+use engine_traits::{CacheRegion, Mutable, Peekable, RegionCacheEngine, WriteBatch, WriteBatchExt};
+use in_memory_engine::{
+    decode_key, test_util::new_region, InMemoryEngineConfig, InternalKey, RegionCacheStatus,
+    ValueType,
+};
+use raftstore::coprocessor::{WriteBatchObserver, WriteBatchWrapper};
+
+use super::RegionCacheWriteBatchObserver;
+use crate::{engine::SnapshotContext, util::hybrid_engine_for_tests};
+
+#[test]
+fn test_sequence_number_unique() {
+    let (_path, hybrid_engine) =
+        hybrid_engine_for_tests("temp", InMemoryEngineConfig::config_for_test(), |_| {}).unwrap();
+
+    let (tx, rx) = sync_channel(0);
+    fail::cfg_callback("ime_on_completes_batch_loading", move || {
+        fail::cfg("ime_on_start_loading_region", "pause").unwrap();
+        tx.send(true).unwrap();
+    })
+    .unwrap();
+
+    let engine = hybrid_engine.region_cache_engine().clone();
+    let observer = RegionCacheWriteBatchObserver::new(engine.clone());
+
+    // First write some data; it should be handled by batch loading.
+    let mut wb = WriteBatchWrapper::new(
+        hybrid_engine.disk_engine().write_batch(),
+        Some(observer.create_observable_write_batch()),
+    );
+
+    wb.put(b"zk5", b"val").unwrap(); // seq 1
+    wb.put(b"zk7", b"val").unwrap(); // seq 2
+
+    let r = new_region(1, b"k", b"k5");
+    engine.new_region(r.clone());
+    wb.write().unwrap();
+
+    // Mock that we have a loading range, and there are some keys written in
+    // it during the load.
+    let r2 = new_region(2, b"k5", b"k7");
+    let r3 = new_region(3, b"k7", b"k9");
+    let cache_region2 = CacheRegion::from_region(&r2);
+    let cache_region3 = CacheRegion::from_region(&r3);
+    engine.load_region(cache_region2.clone()).unwrap();
+    engine.load_region(cache_region3.clone()).unwrap();
+
+    // The sequence number of a write batch should increase one by one;
+    // otherwise, if a delete and a put of the same key occur in the same
+    // write batch, the delete will be hidden by the put even if the delete
+    // is performed after the put.
+    // While we block the batch loading of region3, its new KVs are still
+    // directly written into the skiplist.
+    let mut wb = WriteBatchWrapper::new(
+        hybrid_engine.disk_engine().write_batch(),
+        Some(observer.create_observable_write_batch()),
+    );
+    wb.prepare_for_region(&r);
+    wb.put(b"zk", b"val").unwrap(); // seq 3
+    wb.delete(b"zk").unwrap(); // seq 4
+    wb.put(b"zk2", b"val").unwrap(); // seq 5
+
+    wb.prepare_for_region(&r2);
+    wb.put(b"zk6", b"val").unwrap(); // seq 6
+    wb.delete(b"zk5").unwrap(); // seq 7
+    wb.put(b"zk5", b"val2").unwrap(); // seq 8
+
+    wb.prepare_for_region(&r3);
+    wb.put(b"zk8", b"val").unwrap(); // seq 9
+    wb.put(b"zk7", b"val2").unwrap(); // seq 10
+
+    rx.recv().unwrap();
+    wb.write().unwrap();
+
+    let mut iter = engine.core().engine().cf_handle("default").iterator();
+    let guard = &epoch::pin();
+
+    let mut first = true;
+
+    for (k, sequence, v_type) in [
+        (b"zk".to_vec(), 4, ValueType::Deletion),
+        (b"zk".to_vec(), 3, ValueType::Value),
+        (b"zk2".to_vec(), 5, ValueType::Value),
+        (b"zk5".to_vec(), 8, ValueType::Value),
+        (b"zk5".to_vec(), 7, ValueType::Deletion),
+        // NOTE: for batch loading, we always use the current seq number
+        // to write all the keys.
+        (b"zk5".to_vec(), 2, ValueType::Value),
+        (b"zk6".to_vec(), 6, ValueType::Value),
+        (b"zk7".to_vec(), 10, ValueType::Value),
+        // "zk7" with seq 2 is blocked by the paused batch loading, so it is
+        // invisible here.
+ (b"zk8".to_vec(), 9, ValueType::Value), + ] { + if first { + iter.seek_to_first(guard); + first = false; + } else { + iter.next(guard); + } + + let expected_key = InternalKey { + user_key: k.as_slice(), + v_type, + sequence, + }; + let key = iter.key(); + let got_key = decode_key(key.as_bytes()); + assert_eq!(expected_key, got_key); + } +} + +#[test] +fn test_write_to_both_engines() { + let region = new_region(1, b"", b"z"); + let region_clone = region.clone(); + let (_path, hybrid_engine) = hybrid_engine_for_tests( + "temp", + InMemoryEngineConfig::config_for_test(), + move |memory_engine| { + let id = region_clone.id; + memory_engine.new_region(region_clone); + memory_engine.core().region_manager().set_safe_point(id, 5); + }, + ) + .unwrap(); + let engine = hybrid_engine.region_cache_engine().clone(); + let observer = RegionCacheWriteBatchObserver::new(engine.clone()); + + let cache_region = CacheRegion::from_region(®ion); + let mut ob_wb = observer.new_observable_write_batch(); + ob_wb.cache_write_batch.prepare_for_region(®ion); + ob_wb + .cache_write_batch + .set_region_cache_status(RegionCacheStatus::Cached); + let mut write_batch = WriteBatchWrapper::new( + hybrid_engine.disk_engine().write_batch(), + Some(Box::new(ob_wb)), + ); + write_batch.put(b"zhello", b"world").unwrap(); + let seq = write_batch.write().unwrap(); + assert!(seq > 0); + let actual: &[u8] = &hybrid_engine + .disk_engine() + .get_value(b"zhello") + .unwrap() + .unwrap(); + assert_eq!(b"world", &actual); + let ctx = SnapshotContext { + region: Some(cache_region.clone()), + read_ts: 10, + }; + let snap = hybrid_engine.new_snapshot(Some(ctx)); + let actual: &[u8] = &snap.get_value(b"zhello").unwrap().unwrap(); + assert_eq!(b"world", &actual); + let actual: &[u8] = &snap.disk_snap().get_value(b"zhello").unwrap().unwrap(); + assert_eq!(b"world", &actual); + let actual: &[u8] = &snap + .region_cache_snap() + .unwrap() + .get_value(b"zhello") + .unwrap() + .unwrap(); + assert_eq!(b"world", &actual); +} + +#[test] +fn test_set_sequence_number() { + let (_path, hybrid_engine) = hybrid_engine_for_tests( + "temp", + InMemoryEngineConfig::config_for_test(), + |memory_engine| { + let region = new_region(1, b"k00", b"k10"); + memory_engine.new_region(region); + memory_engine.core().region_manager().set_safe_point(1, 10); + }, + ) + .unwrap(); + + let engine = hybrid_engine.region_cache_engine().clone(); + let observer = RegionCacheWriteBatchObserver::new(engine.clone()); + let mut write_batch = observer.new_observable_write_batch(); + + write_batch + .cache_write_batch + .set_sequence_number(0) + .unwrap(); // First call ok. + assert!( + write_batch + .cache_write_batch + .set_sequence_number(0) + .is_err() + ); // Second call err. 
+} + +#[test] +fn test_delete_range() { + let region1 = new_region(1, b"k00", b"k10"); + let region2 = new_region(2, b"k20", b"k30"); + let cache_region1 = CacheRegion::from_region(®ion1); + let cache_region2 = CacheRegion::from_region(®ion2); + + let region1_clone = region1.clone(); + let region2_clone = region2.clone(); + let (_path, hybrid_engine) = hybrid_engine_for_tests( + "temp", + InMemoryEngineConfig::config_for_test(), + move |memory_engine| { + memory_engine.new_region(region1_clone); + memory_engine.new_region(region2_clone); + }, + ) + .unwrap(); + + let engine = hybrid_engine.region_cache_engine().clone(); + let observer = RegionCacheWriteBatchObserver::new(engine.clone()); + + let mut wb = WriteBatchWrapper::new( + hybrid_engine.disk_engine().write_batch(), + Some(observer.create_observable_write_batch()), + ); + wb.prepare_for_region(®ion1); + wb.put(b"zk05", b"val").unwrap(); + wb.put(b"zk08", b"val2").unwrap(); + wb.prepare_for_region(®ion2); + wb.put(b"zk25", b"val3").unwrap(); + wb.put(b"zk27", b"val4").unwrap(); + wb.write().unwrap(); + + hybrid_engine + .region_cache_engine() + .snapshot(cache_region1.clone(), 1000, 1000) + .unwrap(); + hybrid_engine + .region_cache_engine() + .snapshot(cache_region2.clone(), 1000, 1000) + .unwrap(); + assert_eq!( + 4, + hybrid_engine + .region_cache_engine() + .core() + .engine() + .cf_handle("default") + .len() + ); + + let mut wb = WriteBatchWrapper::new( + hybrid_engine.disk_engine().write_batch(), + Some(observer.create_observable_write_batch()), + ); + // all ranges overlapped with it will be evicted + wb.prepare_for_region(®ion1); + wb.delete_range(b"zk05", b"zk08").unwrap(); + wb.prepare_for_region(®ion2); + wb.delete_range(b"zk20", b"zk21").unwrap(); + wb.write().unwrap(); + + hybrid_engine + .region_cache_engine() + .snapshot(cache_region1.clone(), 1000, 1000) + .unwrap_err(); + hybrid_engine + .region_cache_engine() + .snapshot(cache_region2.clone(), 1000, 1000) + .unwrap_err(); + let m_engine = hybrid_engine.region_cache_engine(); + + test_util::eventually( + Duration::from_millis(100), + Duration::from_millis(2000), + || m_engine.core().engine().cf_handle("default").is_empty(), + ); +} diff --git a/components/hybrid_engine/src/observer/write_batch.rs b/components/hybrid_engine/src/observer/write_batch.rs new file mode 100644 index 00000000000..854dc6a2eb7 --- /dev/null +++ b/components/hybrid_engine/src/observer/write_batch.rs @@ -0,0 +1,141 @@ +// Copyright 2024 TiKV Project Authors. Licensed under Apache-2.0. 
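test_delete_range above waits for the background eviction with test_util::eventually instead of the hand-rolled sleep loop used by the deleted write_batch.rs test further down. A sketch of a poll-until-true helper with the semantics that call assumes (probe every `interval`, fail after `timeout`); the real helper lives in TiKV's test_util crate and may differ in detail:

use std::{
    thread,
    time::{Duration, Instant},
};

// Hypothetical re-implementation of a poll-until-true test helper.
fn eventually<F: FnMut() -> bool>(interval: Duration, timeout: Duration, mut cond: F) {
    let start = Instant::now();
    loop {
        if cond() {
            return;
        }
        if start.elapsed() >= timeout {
            panic!("condition not met within {:?}", timeout);
        }
        thread::sleep(interval);
    }
}

fn main() {
    let start = Instant::now();
    // Becomes true after ~300ms, well before the 2s deadline.
    eventually(Duration::from_millis(100), Duration::from_millis(2000), || {
        start.elapsed() > Duration::from_millis(300)
    });
}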
+ +use engine_traits::{is_data_cf, KvEngine, Mutable, Result, WriteBatch, WriteOptions}; +use in_memory_engine::{RegionCacheMemoryEngine, RegionCacheWriteBatch}; +use kvproto::metapb; +use raftstore::coprocessor::{ + dispatcher::BoxWriteBatchObserver, Coprocessor, CoprocessorHost, ObservableWriteBatch, + WriteBatchObserver, +}; + +#[derive(Clone)] +pub struct RegionCacheWriteBatchObserver { + cache_engine: RegionCacheMemoryEngine, +} + +impl RegionCacheWriteBatchObserver { + pub fn new(cache_engine: RegionCacheMemoryEngine) -> Self { + RegionCacheWriteBatchObserver { cache_engine } + } + + pub fn register_to(&self, coprocessor_host: &mut CoprocessorHost) { + coprocessor_host + .registry + .register_write_batch_observer(BoxWriteBatchObserver::new(self.clone())); + } + + pub(crate) fn new_observable_write_batch(&self) -> HybridObservableWriteBatch { + HybridObservableWriteBatch { + cache_write_batch: RegionCacheWriteBatch::from(&self.cache_engine), + } + } +} + +impl Coprocessor for RegionCacheWriteBatchObserver {} + +impl WriteBatchObserver for RegionCacheWriteBatchObserver { + fn create_observable_write_batch(&self) -> Box { + Box::new(self.new_observable_write_batch()) + } +} + +pub(crate) struct HybridObservableWriteBatch { + pub(crate) cache_write_batch: RegionCacheWriteBatch, +} + +impl ObservableWriteBatch for HybridObservableWriteBatch { + fn prepare_for_region(&mut self, region: &metapb::Region) { + self.cache_write_batch.prepare_for_region(region); + } + fn write_opt_seq(&mut self, opts: &WriteOptions, seq_num: u64) { + self.cache_write_batch.set_sequence_number(seq_num).unwrap(); + self.cache_write_batch.write_opt(opts).unwrap(); + } + fn post_write(&mut self) { + self.cache_write_batch.maybe_compact_lock_cf(); + } +} + +/// Implements the `WriteBatch` trait for `HybridObservableWriteBatch`. +/// +/// The following methods are not implemented because they are not used +/// through the interface `Box`. +/// +/// - `write`, `write_opt`, `write_callback_opt`, `merge` +/// +/// Implements the remaining methods of the `WriteBatch` trait by delegating +/// the calls to the `cache_write_batch` field. 
+impl WriteBatch for HybridObservableWriteBatch { + fn write(&mut self) -> Result { + unimplemented!("write") + } + fn write_opt(&mut self, _: &WriteOptions) -> Result { + unimplemented!("write_opt") + } + fn write_callback_opt(&mut self, _: &WriteOptions, _: impl FnMut(u64)) -> Result + where + Self: Sized, + { + unimplemented!("write_callback_opt") + } + fn merge(&mut self, _: Self) -> Result<()> + where + Self: Sized, + { + unimplemented!("merge") + } + + fn data_size(&self) -> usize { + self.cache_write_batch.data_size() + } + fn count(&self) -> usize { + self.cache_write_batch.count() + } + fn is_empty(&self) -> bool { + self.cache_write_batch.is_empty() + } + fn should_write_to_engine(&self) -> bool { + self.cache_write_batch.should_write_to_engine() + } + fn clear(&mut self) { + self.cache_write_batch.clear() + } + fn set_save_point(&mut self) { + self.cache_write_batch.set_save_point() + } + fn pop_save_point(&mut self) -> Result<()> { + self.cache_write_batch.pop_save_point() + } + fn rollback_to_save_point(&mut self) -> Result<()> { + self.cache_write_batch.rollback_to_save_point() + } +} + +impl Mutable for HybridObservableWriteBatch { + fn put(&mut self, key: &[u8], value: &[u8]) -> Result<()> { + self.cache_write_batch.put(key, value) + } + fn put_cf(&mut self, cf: &str, key: &[u8], value: &[u8]) -> Result<()> { + if is_data_cf(cf) { + self.cache_write_batch.put_cf(cf, key, value)?; + } + Ok(()) + } + fn delete(&mut self, key: &[u8]) -> Result<()> { + self.cache_write_batch.delete(key) + } + fn delete_cf(&mut self, cf: &str, key: &[u8]) -> Result<()> { + self.cache_write_batch.delete_cf(cf, key) + } + fn delete_range(&mut self, begin_key: &[u8], end_key: &[u8]) -> Result<()> { + // delete_range in in memory engine means eviction -- all ranges overlapped + // with [begin_key, end_key] will be evicted. + self.cache_write_batch.delete_range(begin_key, end_key) + } + fn delete_range_cf(&mut self, cf: &str, begin_key: &[u8], end_key: &[u8]) -> Result<()> { + // delete_range in in memory engine means eviction -- all ranges overlapped + // with [begin_key, end_key] will be evicted. + self.cache_write_batch + .delete_range_cf(cf, begin_key, end_key) + } +} diff --git a/components/hybrid_engine/src/perf_context.rs b/components/hybrid_engine/src/perf_context.rs deleted file mode 100644 index 86b22958b0e..00000000000 --- a/components/hybrid_engine/src/perf_context.rs +++ /dev/null @@ -1,20 +0,0 @@ -// Copyright 2023 TiKV Project Authors. Licensed under Apache-2.0. - -use engine_traits::{KvEngine, PerfContextExt, PerfContextKind, RangeCacheEngine}; - -use crate::engine::HybridEngine; - -impl PerfContextExt for HybridEngine -where - EK: KvEngine, - EC: RangeCacheEngine, -{ - type PerfContext = EK::PerfContext; - - fn get_perf_context( - level: engine_traits::PerfLevel, - kind: PerfContextKind, - ) -> Self::PerfContext { - EK::get_perf_context(level, kind) - } -} diff --git a/components/hybrid_engine/src/range_cache_engine.rs b/components/hybrid_engine/src/range_cache_engine.rs deleted file mode 100644 index 2c87abdcada..00000000000 --- a/components/hybrid_engine/src/range_cache_engine.rs +++ /dev/null @@ -1,20 +0,0 @@ -// Copyright 2024 TiKV Project Authors. Licensed under Apache-2.0. 
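HybridObservableWriteBatch above carries only the cache half of a write; pairing it with the disk write batch is the job of raftstore's WriteBatchWrapper, which this patch uses but does not show. A simplified sketch of that pairing with hypothetical stand-in batch types: mutations are mirrored into both sides, and the cache side is committed via the equivalent of write_opt_seq under the sequence number returned by the disk commit:

// Hypothetical simplified stand-ins for the disk and cache write batches.
#[derive(Default)]
struct DiskBatch {
    ops: Vec<(Vec<u8>, Vec<u8>)>,
    seq: u64,
}
#[derive(Default)]
struct CacheBatch {
    ops: Vec<(Vec<u8>, Vec<u8>)>,
}

impl DiskBatch {
    fn put(&mut self, k: &[u8], v: &[u8]) {
        self.ops.push((k.to_vec(), v.to_vec()));
    }
    fn write(&mut self) -> u64 {
        self.seq += 1; // pretend to commit and hand back the new seqno
        self.seq
    }
}
impl CacheBatch {
    fn put(&mut self, k: &[u8], v: &[u8]) {
        self.ops.push((k.to_vec(), v.to_vec()));
    }
    fn write_with_seq(&mut self, seq: u64) {
        println!("cache commits {} ops at seq {}", self.ops.len(), seq);
    }
}

// The wrapper mirrors mutations into both batches, then commits the cache
// side with the sequence number assigned by the disk commit.
struct WrapperSketch {
    disk: DiskBatch,
    cache: Option<CacheBatch>,
}

impl WrapperSketch {
    fn put(&mut self, k: &[u8], v: &[u8]) {
        self.disk.put(k, v);
        if let Some(c) = self.cache.as_mut() {
            c.put(k, v);
        }
    }
    fn write(&mut self) -> u64 {
        let seq = self.disk.write();
        if let Some(c) = self.cache.as_mut() {
            c.write_with_seq(seq);
        }
        seq
    }
}

fn main() {
    let mut wb = WrapperSketch {
        disk: DiskBatch::default(),
        cache: Some(CacheBatch::default()),
    };
    wb.put(b"zk1", b"v1");
    assert_eq!(wb.write(), 1);
}

This is also why the impl below deliberately leaves write, write_opt, write_callback_opt, and merge unimplemented: the cache batch must never commit on its own, only under a disk-assigned sequence number.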
- -use engine_traits::{CacheRange, KvEngine, RangeCacheEngine, RangeCacheEngineExt}; - -use crate::HybridEngine; - -impl RangeCacheEngineExt for HybridEngine -where - EK: KvEngine, - EC: RangeCacheEngine, -{ - fn range_cache_engine_enabled(&self) -> bool { - true - } - - #[inline] - fn evict_range(&self, range: &CacheRange) { - self.range_cache_engine().evict_range(range); - } -} diff --git a/components/hybrid_engine/src/range_properties.rs b/components/hybrid_engine/src/range_properties.rs deleted file mode 100644 index 14deb77ec52..00000000000 --- a/components/hybrid_engine/src/range_properties.rs +++ /dev/null @@ -1,60 +0,0 @@ -// Copyright 2023 TiKV Project Authors. Licensed under Apache-2.0. - -use engine_traits::{KvEngine, Range, RangeCacheEngine, RangePropertiesExt, Result}; - -use crate::engine::HybridEngine; - -impl RangePropertiesExt for HybridEngine -where - EK: KvEngine, - EC: RangeCacheEngine, -{ - fn get_range_approximate_keys(&self, range: Range<'_>, large_threshold: u64) -> Result { - self.disk_engine() - .get_range_approximate_keys(range, large_threshold) - } - - fn get_range_approximate_keys_cf( - &self, - cfname: &str, - range: Range<'_>, - large_threshold: u64, - ) -> Result { - self.disk_engine() - .get_range_approximate_keys_cf(cfname, range, large_threshold) - } - - fn get_range_approximate_size(&self, range: Range<'_>, large_threshold: u64) -> Result { - self.disk_engine() - .get_range_approximate_size(range, large_threshold) - } - - fn get_range_approximate_size_cf( - &self, - cfname: &str, - range: Range<'_>, - large_threshold: u64, - ) -> Result { - self.disk_engine() - .get_range_approximate_size_cf(cfname, range, large_threshold) - } - - fn get_range_approximate_split_keys( - &self, - range: Range<'_>, - key_count: usize, - ) -> Result>> { - self.disk_engine() - .get_range_approximate_split_keys(range, key_count) - } - - fn get_range_approximate_split_keys_cf( - &self, - cfname: &str, - range: Range<'_>, - key_count: usize, - ) -> Result>> { - self.disk_engine() - .get_range_approximate_split_keys_cf(cfname, range, key_count) - } -} diff --git a/components/hybrid_engine/src/snapshot.rs b/components/hybrid_engine/src/snapshot.rs index a4ed06bd91a..01587de3fa9 100644 --- a/components/hybrid_engine/src/snapshot.rs +++ b/components/hybrid_engine/src/snapshot.rs @@ -1,41 +1,49 @@ // Copyright 2023 TiKV Project Authors. Licensed under Apache-2.0. 
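perf_context.rs, range_cache_engine.rs, and range_properties.rs above (like mvcc_properties.rs, sst.rs, table_properties.rs, and ttl_properties.rs elsewhere in this patch) were pure pass-through impls, so deleting them removes a whole layer of delegation boilerplate. The shape being deleted, reduced to a one-method sketch with stand-in traits:

// Stand-in trait; the real ones are engine_traits::RangePropertiesExt etc.
trait ApproxSize {
    fn approximate_size(&self, start: &[u8], end: &[u8]) -> u64;
}

struct Disk;
impl ApproxSize for Disk {
    fn approximate_size(&self, _start: &[u8], _end: &[u8]) -> u64 {
        42 // a real engine would consult its SST properties
    }
}

// The hybrid engine implemented every such trait by delegating to disk,
// because range and size statistics only make sense for persistent data.
struct Hybrid {
    disk: Disk,
}
impl ApproxSize for Hybrid {
    fn approximate_size(&self, start: &[u8], end: &[u8]) -> u64 {
        self.disk.approximate_size(start, end)
    }
}

fn main() {
    let h = Hybrid { disk: Disk };
    assert_eq!(h.approximate_size(b"a", b"z"), 42);
}

With the observer-based design, HybridEngine no longer needs to reimplement an engine trait just to forward it to the disk engine.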
-use std::fmt::{self, Debug, Formatter}; +use std::{ + any::Any, + fmt::{self, Debug, Formatter}, +}; use engine_traits::{ - is_data_cf, CfNamesExt, IterOptions, Iterable, KvEngine, Peekable, RangeCacheEngine, - ReadOptions, Result, Snapshot, SnapshotMiscExt, CF_DEFAULT, + is_data_cf, CfNamesExt, IterOptions, Iterable, KvEngine, Peekable, ReadOptions, + RegionCacheEngine, Result, Snapshot, SnapshotMiscExt, CF_DEFAULT, }; +use in_memory_engine::RegionCacheMemoryEngine; +use raftstore::coprocessor::ObservedSnapshot; -use crate::{db_vector::HybridDbVector, engine_iterator::HybridEngineIterator}; +use crate::{ + db_vector::HybridDbVector, engine_iterator::HybridEngineIterator, + observer::RegionCacheSnapshotPin, +}; pub struct HybridEngineSnapshot where EK: KvEngine, - EC: RangeCacheEngine, + EC: RegionCacheEngine, { disk_snap: EK::Snapshot, - range_cache_snap: Option, + region_cache_snap: Option, } impl HybridEngineSnapshot where EK: KvEngine, - EC: RangeCacheEngine, + EC: RegionCacheEngine, { - pub fn new(disk_snap: EK::Snapshot, range_cache_snap: Option) -> Self { + pub fn new(disk_snap: EK::Snapshot, region_cache_snap: Option) -> Self { HybridEngineSnapshot { disk_snap, - range_cache_snap, + region_cache_snap, } } - pub fn range_cache_snapshot_available(&self) -> bool { - self.range_cache_snap.is_some() + pub fn region_cache_snapshot_available(&self) -> bool { + self.region_cache_snap.is_some() } - pub fn range_cache_snap(&self) -> Option<&EC::Snapshot> { - self.range_cache_snap.as_ref() + pub fn region_cache_snap(&self) -> Option<&EC::Snapshot> { + self.region_cache_snap.as_ref() } pub fn disk_snap(&self) -> &EK::Snapshot { @@ -43,17 +51,42 @@ where } } +impl HybridEngineSnapshot +where + EK: KvEngine, +{ + pub fn from_observed_snapshot( + disk_snap: EK::Snapshot, + snap_pin: Option>, + ) -> Self { + let mut region_cache_snap = None; + if let Some(snap_pin) = snap_pin { + let snap_any: Box = snap_pin; + let mut region_cache_snap_pin: Box = + snap_any.downcast().unwrap(); + region_cache_snap = region_cache_snap_pin.take(); + } + HybridEngineSnapshot { + disk_snap, + region_cache_snap, + } + } +} + impl Snapshot for HybridEngineSnapshot where EK: KvEngine, - EC: RangeCacheEngine, + EC: RegionCacheEngine, { + fn in_memory_engine_hit(&self) -> bool { + self.region_cache_snap.is_some() + } } impl Debug for HybridEngineSnapshot where EK: KvEngine, - EC: RangeCacheEngine, + EC: RegionCacheEngine, { fn fmt(&self, fmt: &mut Formatter<'_>) -> fmt::Result { write!(fmt, "Hybrid Engine Snapshot Impl") @@ -63,15 +96,15 @@ where impl Iterable for HybridEngineSnapshot where EK: KvEngine, - EC: RangeCacheEngine, + EC: RegionCacheEngine, { type Iterator = HybridEngineIterator; fn iterator_opt(&self, cf: &str, opts: IterOptions) -> Result { - Ok(match self.range_cache_snap() { - Some(range_cache_snap) if is_data_cf(cf) => { - HybridEngineIterator::range_cache_engine_iterator( - range_cache_snap.iterator_opt(cf, opts)?, + Ok(match self.region_cache_snap() { + Some(region_cache_snap) if is_data_cf(cf) => { + HybridEngineIterator::region_cache_engine_iterator( + region_cache_snap.iterator_opt(cf, opts)?, ) } _ => HybridEngineIterator::disk_engine_iterator(self.disk_snap.iterator_opt(cf, opts)?), @@ -82,7 +115,7 @@ where impl Peekable for HybridEngineSnapshot where EK: KvEngine, - EC: RangeCacheEngine, + EC: RegionCacheEngine, { type DbVector = HybridDbVector; @@ -96,9 +129,9 @@ where cf: &str, key: &[u8], ) -> Result> { - match self.range_cache_snap() { - Some(range_cache_snap) if is_data_cf(cf) => { - 
Self::DbVector::try_from_cache_snap(range_cache_snap, opts, cf, key) + match self.region_cache_snap() { + Some(region_cache_snap) if is_data_cf(cf) => { + Self::DbVector::try_from_cache_snap(region_cache_snap, opts, cf, key) } _ => Self::DbVector::try_from_disk_snap(&self.disk_snap, opts, cf, key), } @@ -108,7 +141,7 @@ where impl CfNamesExt for HybridEngineSnapshot where EK: KvEngine, - EC: RangeCacheEngine, + EC: RegionCacheEngine, { fn cf_names(&self) -> Vec<&str> { self.disk_snap.cf_names() @@ -118,7 +151,7 @@ where impl SnapshotMiscExt for HybridEngineSnapshot where EK: KvEngine, - EC: RangeCacheEngine, + EC: RegionCacheEngine, { fn sequence_number(&self) -> u64 { self.disk_snap.sequence_number() @@ -127,59 +160,66 @@ where #[cfg(test)] mod tests { - use engine_traits::{ - CacheRange, IterOptions, Iterable, Iterator, KvEngine, Mutable, SnapshotContext, - WriteBatch, WriteBatchExt, CF_DEFAULT, + CacheRegion, IterOptions, Iterable, Iterator, Mutable, WriteBatch, WriteBatchExt, + CF_DEFAULT, }; - use range_cache_memory_engine::{RangeCacheEngineConfig, RangeCacheStatus}; + use in_memory_engine::{test_util::new_region, InMemoryEngineConfig, RegionCacheStatus}; + use raftstore::coprocessor::WriteBatchWrapper; - use crate::util::hybrid_engine_for_tests; + use crate::{ + engine::SnapshotContext, observer::RegionCacheWriteBatchObserver, + util::hybrid_engine_for_tests, + }; #[test] fn test_iterator() { - let range = CacheRange::new(b"".to_vec(), b"z".to_vec()); + let region = new_region(1, b"", b"z"); + let cache_region = CacheRegion::from_region(®ion); let mut iter_opt = IterOptions::default(); - iter_opt.set_upper_bound(&range.end, 0); - iter_opt.set_lower_bound(&range.start, 0); + iter_opt.set_upper_bound(&cache_region.end, 0); + iter_opt.set_lower_bound(&cache_region.start, 0); - let range_clone = range.clone(); + let region_clone = region.clone(); let (_path, hybrid_engine) = hybrid_engine_for_tests( "temp", - RangeCacheEngineConfig::config_for_test(), + InMemoryEngineConfig::config_for_test(), move |memory_engine| { - memory_engine.new_range(range_clone.clone()); - { - let mut core = memory_engine.core().write(); - core.mut_range_manager().set_safe_point(&range_clone, 5); - } + memory_engine.new_region(region_clone); + memory_engine.core().region_manager().set_safe_point(1, 5); }, ) .unwrap(); - let snap = hybrid_engine.snapshot(None); + let snap = hybrid_engine.new_snapshot(None); { let mut iter = snap.iterator_opt(CF_DEFAULT, iter_opt.clone()).unwrap(); assert!(!iter.seek_to_first().unwrap()); } - let mut write_batch = hybrid_engine.write_batch(); - write_batch.prepare_for_range(range.clone()); - write_batch + let engine = hybrid_engine.region_cache_engine().clone(); + let observer = RegionCacheWriteBatchObserver::new(engine.clone()); + let mut ob_wb = observer.new_observable_write_batch(); + ob_wb.cache_write_batch.prepare_for_region(®ion); + ob_wb .cache_write_batch - .set_range_cache_status(RangeCacheStatus::Cached); - write_batch.put(b"hello", b"world").unwrap(); + .set_region_cache_status(RegionCacheStatus::Cached); + let mut write_batch = WriteBatchWrapper::new( + hybrid_engine.disk_engine().write_batch(), + Some(Box::new(ob_wb)), + ); + write_batch.put(b"zhello", b"world").unwrap(); let seq = write_batch.write().unwrap(); assert!(seq > 0); let ctx = SnapshotContext { - range: Some(range.clone()), + region: Some(cache_region.clone()), read_ts: 10, }; - let snap = hybrid_engine.snapshot(Some(ctx)); + let snap = hybrid_engine.new_snapshot(Some(ctx)); { let mut iter = 
snap.iterator_opt(CF_DEFAULT, iter_opt).unwrap(); assert!(iter.seek_to_first().unwrap()); let actual_key = iter.key(); let actual_value = iter.value(); - assert_eq!(actual_key, b"hello"); + assert_eq!(actual_key, b"zhello"); assert_eq!(actual_value, b"world"); } } diff --git a/components/hybrid_engine/src/sst.rs b/components/hybrid_engine/src/sst.rs deleted file mode 100644 index 6b7b8ab0171..00000000000 --- a/components/hybrid_engine/src/sst.rs +++ /dev/null @@ -1,53 +0,0 @@ -// Copyright 2023 TiKV Project Authors. Licensed under Apache-2.0. - -use engine_traits::{ - KvEngine, RangeCacheEngine, Result, SstCompressionType, SstExt, SstWriterBuilder, -}; - -use crate::engine::HybridEngine; - -pub struct HybridEngineSstWriteBuilder(EK::SstWriterBuilder); - -impl SstExt for HybridEngine -where - EK: KvEngine, - EC: RangeCacheEngine, -{ - type SstReader = EK::SstReader; - type SstWriter = EK::SstWriter; - type SstWriterBuilder = HybridEngineSstWriteBuilder; -} - -impl SstWriterBuilder> for HybridEngineSstWriteBuilder -where - EK: KvEngine, - EC: RangeCacheEngine, -{ - fn new() -> Self { - HybridEngineSstWriteBuilder(EK::SstWriterBuilder::new()) - } - - fn set_db(self, db: &HybridEngine) -> Self { - HybridEngineSstWriteBuilder(self.0.set_db(db.disk_engine())) - } - - fn set_cf(self, cf: &str) -> Self { - HybridEngineSstWriteBuilder(self.0.set_cf(cf)) - } - - fn set_in_memory(self, in_memory: bool) -> Self { - HybridEngineSstWriteBuilder(self.0.set_in_memory(in_memory)) - } - - fn set_compression_type(self, compression: Option) -> Self { - HybridEngineSstWriteBuilder(self.0.set_compression_type(compression)) - } - - fn set_compression_level(self, level: i32) -> Self { - HybridEngineSstWriteBuilder(self.0.set_compression_level(level)) - } - - fn build(self, path: &str) -> Result< as SstExt>::SstWriter> { - self.0.build(path) - } -} diff --git a/components/hybrid_engine/src/table_properties.rs b/components/hybrid_engine/src/table_properties.rs deleted file mode 100644 index 0d5c2c5fd39..00000000000 --- a/components/hybrid_engine/src/table_properties.rs +++ /dev/null @@ -1,21 +0,0 @@ -// Copyright 2023 TiKV Project Authors. Licensed under Apache-2.0. - -use engine_traits::{KvEngine, Range, RangeCacheEngine, Result, TablePropertiesExt}; - -use crate::engine::HybridEngine; - -impl TablePropertiesExt for HybridEngine -where - EK: KvEngine, - EC: RangeCacheEngine, -{ - type TablePropertiesCollection = EK::TablePropertiesCollection; - - fn table_properties_collection( - &self, - cf: &str, - ranges: &[Range<'_>], - ) -> Result { - self.disk_engine().table_properties_collection(cf, ranges) - } -} diff --git a/components/hybrid_engine/src/ttl_properties.rs b/components/hybrid_engine/src/ttl_properties.rs deleted file mode 100644 index 47e362bccf7..00000000000 --- a/components/hybrid_engine/src/ttl_properties.rs +++ /dev/null @@ -1,21 +0,0 @@ -// Copyright 2023 TiKV Project Authors. Licensed under Apache-2.0. 
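The snapshot read path shown above consults the pinned region cache snapshot only for data CFs and falls back to the disk snapshot otherwise. A runnable sketch of that dispatch; the CF names mirror engine_traits::DATA_CFS (default, lock, write), so e.g. the raft CF always reads from disk:

fn is_data_cf(cf: &str) -> bool {
    // engine_traits::DATA_CFS is [CF_DEFAULT, CF_LOCK, CF_WRITE].
    matches!(cf, "default" | "lock" | "write")
}

struct CacheSnap;
struct DiskSnap;

impl CacheSnap {
    fn get(&self, key: &[u8]) -> Vec<u8> {
        let mut v = b"cache:".to_vec();
        v.extend_from_slice(key);
        v
    }
}
impl DiskSnap {
    fn get(&self, key: &[u8]) -> Vec<u8> {
        let mut v = b"disk:".to_vec();
        v.extend_from_slice(key);
        v
    }
}

// Mirrors HybridEngineSnapshot: the cache snapshot is consulted only when it
// was successfully pinned and the CF holds user data.
struct HybridSnap {
    disk: DiskSnap,
    cache: Option<CacheSnap>,
}

impl HybridSnap {
    fn get(&self, cf: &str, key: &[u8]) -> Vec<u8> {
        match &self.cache {
            Some(cache) if is_data_cf(cf) => cache.get(key),
            _ => self.disk.get(key),
        }
    }
}

fn main() {
    let snap = HybridSnap { disk: DiskSnap, cache: Some(CacheSnap) };
    assert_eq!(snap.get("default", b"k"), b"cache:k");
    assert_eq!(snap.get("raft", b"k"), b"disk:k");
}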
- -use engine_traits::{KvEngine, RangeCacheEngine, Result, TtlProperties, TtlPropertiesExt}; - -use crate::engine::HybridEngine; - -impl TtlPropertiesExt for HybridEngine -where - EK: KvEngine, - EC: RangeCacheEngine, -{ - fn get_range_ttl_properties_cf( - &self, - cf: &str, - start_key: &[u8], - end_key: &[u8], - ) -> Result> { - self.disk_engine() - .get_range_ttl_properties_cf(cf, start_key, end_key) - } -} diff --git a/components/hybrid_engine/src/util.rs b/components/hybrid_engine/src/util.rs index 4d8a614a36d..2a5828a7658 100644 --- a/components/hybrid_engine/src/util.rs +++ b/components/hybrid_engine/src/util.rs @@ -3,10 +3,8 @@ use std::sync::Arc; use engine_rocks::{util::new_engine, RocksEngine}; -use engine_traits::{RangeCacheEngine, Result, CF_DEFAULT, CF_LOCK, CF_WRITE}; -use range_cache_memory_engine::{ - RangeCacheEngineConfig, RangeCacheEngineContext, RangeCacheMemoryEngine, -}; +use engine_traits::{RegionCacheEngine, Result, CF_DEFAULT, CF_LOCK, CF_WRITE}; +use in_memory_engine::{InMemoryEngineConfig, InMemoryEngineContext, RegionCacheMemoryEngine}; use tempfile::{Builder, TempDir}; use tikv_util::config::VersionTrack; @@ -19,31 +17,29 @@ use crate::HybridEngine; /// /// ``` /// use hybrid_engine::util::hybrid_engine_for_tests; -/// let (_path, _hybrid_engine) = hybrid_engine_for_tests("temp", |memory_engine| { -/// let range = engine_traits::CacheRange::new(b"k00".to_vec(), b"k10".to_vec()); -/// memory_engine.new_range(range.clone()); -/// { -/// let mut core = memory_engine.core().write().unwrap(); -/// core.mut_range_manager().set_range_readable(&range, true); -/// core.mut_range_manager().set_safe_ts(&range, 10); -/// } +/// use in_memory_engine::{test_util::new_region, InMemoryEngineConfig}; +/// let mut config = InMemoryEngineConfig::default(); +/// config.enable = true; +/// let (_path, _hybrid_engine) = hybrid_engine_for_tests("temp", config, |memory_engine| { +/// let region = new_region(1, b"", b"z"); +/// memory_engine.new_region(region); /// }) /// .unwrap(); /// ``` pub fn hybrid_engine_for_tests( prefix: &str, - config: RangeCacheEngineConfig, + config: InMemoryEngineConfig, configure_memory_engine_fn: F, -) -> Result<(TempDir, HybridEngine)> +) -> Result<(TempDir, HybridEngine)> where - F: FnOnce(&RangeCacheMemoryEngine), + F: FnOnce(&RegionCacheMemoryEngine), { let path = Builder::new().prefix(prefix).tempdir()?; let disk_engine = new_engine( path.path().to_str().unwrap(), &[CF_DEFAULT, CF_LOCK, CF_WRITE], )?; - let mut memory_engine = RangeCacheMemoryEngine::new(RangeCacheEngineContext::new_for_tests( + let mut memory_engine = RegionCacheMemoryEngine::new(InMemoryEngineContext::new_for_tests( Arc::new(VersionTrack::new(config)), )); memory_engine.set_disk_engine(disk_engine.clone()); diff --git a/components/hybrid_engine/src/write_batch.rs b/components/hybrid_engine/src/write_batch.rs deleted file mode 100644 index 136f99c0ce5..00000000000 --- a/components/hybrid_engine/src/write_batch.rs +++ /dev/null @@ -1,311 +0,0 @@ -// Copyright 2023 TiKV Project Authors. Licensed under Apache-2.0. 
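hybrid_engine_for_tests above now threads InMemoryEngineConfig through tikv_util's VersionTrack, the same dynamic-config plumbing that production code uses. A simplified re-implementation of the version-tracked cell idea (not the real tikv_util API, whose signatures differ):

use std::sync::{Arc, RwLock};

// Simplified version-tracked config cell: readers can cheaply detect
// staleness by comparing the version they last saw.
struct VersionTrackSketch<T> {
    inner: RwLock<(T, u64)>, // (value, version)
}

impl<T: Clone> VersionTrackSketch<T> {
    fn new(value: T) -> Self {
        VersionTrackSketch {
            inner: RwLock::new((value, 1)),
        }
    }
    fn value(&self) -> T {
        self.inner.read().unwrap().0.clone()
    }
    fn update(&self, f: impl FnOnce(&mut T)) {
        let mut guard = self.inner.write().unwrap();
        f(&mut guard.0);
        guard.1 += 1;
    }
}

fn main() {
    // Stand-in for InMemoryEngineConfig with just the `enable` knob.
    #[derive(Clone)]
    struct Config {
        enable: bool,
    }

    let cfg = Arc::new(VersionTrackSketch::new(Config { enable: false }));
    cfg.update(|c| c.enable = true);
    assert!(cfg.value().enable);
}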
- -use std::sync::atomic::{AtomicBool, Ordering}; - -use engine_traits::{ - is_data_cf, CacheRange, KvEngine, Mutable, Result, WriteBatch, WriteBatchExt, WriteOptions, -}; -use range_cache_memory_engine::{RangeCacheMemoryEngine, RangeCacheWriteBatch}; - -use crate::engine::HybridEngine; - -pub struct HybridEngineWriteBatch { - disk_write_batch: EK::WriteBatch, - pub(crate) cache_write_batch: RangeCacheWriteBatch, -} - -impl WriteBatchExt for HybridEngine -where - EK: KvEngine, -{ - type WriteBatch = HybridEngineWriteBatch; - const WRITE_BATCH_MAX_KEYS: usize = EK::WRITE_BATCH_MAX_KEYS; - - fn write_batch(&self) -> Self::WriteBatch { - HybridEngineWriteBatch { - disk_write_batch: self.disk_engine().write_batch(), - cache_write_batch: self.range_cache_engine().write_batch(), - } - } - - fn write_batch_with_cap(&self, cap: usize) -> Self::WriteBatch { - HybridEngineWriteBatch { - disk_write_batch: self.disk_engine().write_batch_with_cap(cap), - cache_write_batch: self.range_cache_engine().write_batch_with_cap(cap), - } - } -} - -impl WriteBatch for HybridEngineWriteBatch { - fn write_opt(&mut self, opts: &WriteOptions) -> Result { - self.write_callback_opt(opts, |_| ()) - } - - fn write_callback_opt(&mut self, opts: &WriteOptions, mut cb: impl FnMut(u64)) -> Result { - let called = AtomicBool::new(false); - let res = self - .disk_write_batch - .write_callback_opt(opts, |s| { - if !called.fetch_or(true, Ordering::SeqCst) { - self.cache_write_batch.set_sequence_number(s).unwrap(); - self.cache_write_batch.write_opt(opts).unwrap(); - } - }) - .map(|s| { - cb(s); - s - }); - self.cache_write_batch.maybe_compact_lock_cf(); - res - } - - fn data_size(&self) -> usize { - self.disk_write_batch.data_size() - } - - fn count(&self) -> usize { - self.disk_write_batch.count() - } - - fn is_empty(&self) -> bool { - self.disk_write_batch.is_empty() - } - - fn should_write_to_engine(&self) -> bool { - self.disk_write_batch.should_write_to_engine() - } - - fn clear(&mut self) { - self.disk_write_batch.clear(); - self.cache_write_batch.clear() - } - - fn set_save_point(&mut self) { - self.disk_write_batch.set_save_point(); - self.cache_write_batch.set_save_point() - } - - fn pop_save_point(&mut self) -> Result<()> { - self.disk_write_batch.pop_save_point()?; - self.cache_write_batch.pop_save_point() - } - - fn rollback_to_save_point(&mut self) -> Result<()> { - self.disk_write_batch.rollback_to_save_point()?; - self.cache_write_batch.rollback_to_save_point() - } - - fn merge(&mut self, other: Self) -> Result<()> { - self.disk_write_batch.merge(other.disk_write_batch)?; - self.cache_write_batch.merge(other.cache_write_batch) - } - - fn prepare_for_range(&mut self, range: CacheRange) { - self.cache_write_batch.prepare_for_range(range); - } -} - -impl Mutable for HybridEngineWriteBatch { - fn put(&mut self, key: &[u8], value: &[u8]) -> Result<()> { - self.disk_write_batch.put(key, value)?; - self.cache_write_batch.put(key, value) - } - - fn put_cf(&mut self, cf: &str, key: &[u8], value: &[u8]) -> Result<()> { - self.disk_write_batch.put_cf(cf, key, value)?; - if is_data_cf(cf) { - self.cache_write_batch.put_cf(cf, key, value)?; - } - Ok(()) - } - - fn delete(&mut self, key: &[u8]) -> Result<()> { - self.disk_write_batch.delete(key)?; - self.cache_write_batch.delete(key) - } - - fn delete_cf(&mut self, cf: &str, key: &[u8]) -> Result<()> { - self.disk_write_batch.delete_cf(cf, key)?; - self.cache_write_batch.delete_cf(cf, key) - } - - fn delete_range(&mut self, begin_key: &[u8], end_key: &[u8]) -> Result<()> { - 
self.disk_write_batch.delete_range(begin_key, end_key)?; - // delete_range in range cache engine means eviction -- all ranges overlapped - // with [begin_key, end_key] will be evicted. - self.cache_write_batch.delete_range(begin_key, end_key) - } - - fn delete_range_cf(&mut self, cf: &str, begin_key: &[u8], end_key: &[u8]) -> Result<()> { - self.disk_write_batch - .delete_range_cf(cf, begin_key, end_key)?; - // delete_range in range cache engine means eviction -- all ranges overlapped - // with [begin_key, end_key] will be evicted. - self.cache_write_batch - .delete_range_cf(cf, begin_key, end_key) - } -} - -#[cfg(test)] -mod tests { - - use std::time::Duration; - - use engine_traits::{ - CacheRange, KvEngine, Mutable, Peekable, RangeCacheEngine, SnapshotContext, WriteBatch, - WriteBatchExt, - }; - use range_cache_memory_engine::{RangeCacheEngineConfig, RangeCacheStatus}; - - use crate::util::hybrid_engine_for_tests; - - #[test] - fn test_write_to_both_engines() { - let range = CacheRange::new(b"".to_vec(), b"z".to_vec()); - let range_clone = range.clone(); - let (_path, hybrid_engine) = hybrid_engine_for_tests( - "temp", - RangeCacheEngineConfig::config_for_test(), - move |memory_engine| { - memory_engine.new_range(range_clone.clone()); - { - let mut core = memory_engine.core().write(); - core.mut_range_manager().set_safe_point(&range_clone, 5); - } - }, - ) - .unwrap(); - let mut write_batch = hybrid_engine.write_batch(); - write_batch.prepare_for_range(range.clone()); - write_batch - .cache_write_batch - .set_range_cache_status(RangeCacheStatus::Cached); - write_batch.put(b"hello", b"world").unwrap(); - let seq = write_batch.write().unwrap(); - assert!(seq > 0); - let actual: &[u8] = &hybrid_engine.get_value(b"hello").unwrap().unwrap(); - assert_eq!(b"world", &actual); - let ctx = SnapshotContext { - range: Some(range.clone()), - read_ts: 10, - }; - let snap = hybrid_engine.snapshot(Some(ctx)); - let actual: &[u8] = &snap.get_value(b"hello").unwrap().unwrap(); - assert_eq!(b"world", &actual); - let actual: &[u8] = &snap.disk_snap().get_value(b"hello").unwrap().unwrap(); - assert_eq!(b"world", &actual); - let actual: &[u8] = &snap - .range_cache_snap() - .unwrap() - .get_value(b"hello") - .unwrap() - .unwrap(); - assert_eq!(b"world", &actual); - } - - #[test] - fn test_range_cache_memory_engine() { - let (_path, hybrid_engine) = hybrid_engine_for_tests( - "temp", - RangeCacheEngineConfig::config_for_test(), - |memory_engine| { - let range = CacheRange::new(b"k00".to_vec(), b"k10".to_vec()); - memory_engine.new_range(range.clone()); - { - let mut core = memory_engine.core().write(); - core.mut_range_manager().set_safe_point(&range, 10); - } - }, - ) - .unwrap(); - - let mut write_batch = hybrid_engine.write_batch(); - write_batch - .cache_write_batch - .set_sequence_number(0) - .unwrap(); // First call ok. - assert!( - write_batch - .cache_write_batch - .set_sequence_number(0) - .is_err() - ); // Second call err. 
- } - - #[test] - fn test_delete_range() { - let range1 = CacheRange::new(b"k00".to_vec(), b"k10".to_vec()); - let range2 = CacheRange::new(b"k20".to_vec(), b"k30".to_vec()); - - let range1_clone = range1.clone(); - let range2_clone = range2.clone(); - let (_path, hybrid_engine) = hybrid_engine_for_tests( - "temp", - RangeCacheEngineConfig::config_for_test(), - move |memory_engine| { - memory_engine.new_range(range1_clone); - memory_engine.new_range(range2_clone); - }, - ) - .unwrap(); - - let mut wb = hybrid_engine.write_batch(); - wb.prepare_for_range(range1.clone()); - wb.put(b"k05", b"val").unwrap(); - wb.put(b"k08", b"val2").unwrap(); - wb.prepare_for_range(range2.clone()); - wb.put(b"k25", b"val3").unwrap(); - wb.put(b"k27", b"val4").unwrap(); - wb.write().unwrap(); - - hybrid_engine - .range_cache_engine() - .snapshot(range1.clone(), 1000, 1000) - .unwrap(); - hybrid_engine - .range_cache_engine() - .snapshot(range2.clone(), 1000, 1000) - .unwrap(); - assert_eq!( - 4, - hybrid_engine - .range_cache_engine() - .core() - .read() - .engine() - .cf_handle("default") - .len() - ); - - let mut wb = hybrid_engine.write_batch(); - // all ranges overlapped with it will be evicted - wb.delete_range(b"k05", b"k21").unwrap(); - wb.write().unwrap(); - - hybrid_engine - .range_cache_engine() - .snapshot(range1, 1000, 1000) - .unwrap_err(); - hybrid_engine - .range_cache_engine() - .snapshot(range2, 1000, 1000) - .unwrap_err(); - let m_engine = hybrid_engine.range_cache_engine(); - - let mut times = 0; - while times < 10 { - if m_engine - .core() - .read() - .engine() - .cf_handle("default") - .is_empty() - { - return; - } - times += 1; - std::thread::sleep(Duration::from_millis(200)); - } - panic!("data is not empty"); - } -} diff --git a/components/hybrid_engine/tests/failpoints/test_write_batch.rs b/components/hybrid_engine/tests/failpoints/test_write_batch.rs deleted file mode 100644 index 2a28a80dc9b..00000000000 --- a/components/hybrid_engine/tests/failpoints/test_write_batch.rs +++ /dev/null @@ -1,94 +0,0 @@ -// Copyright 2024 TiKV Project Authors. Licensed under Apache-2.0. - -use std::sync::mpsc::sync_channel; - -use crossbeam::epoch; -use engine_traits::{CacheRange, Mutable, WriteBatch, WriteBatchExt}; -use hybrid_engine::util::hybrid_engine_for_tests; -use range_cache_memory_engine::{decode_key, InternalKey, RangeCacheEngineConfig, ValueType}; - -#[test] -fn test_sequence_number_unique() { - let (_path, hybrid_engine) = - hybrid_engine_for_tests("temp", RangeCacheEngineConfig::config_for_test(), |_| {}).unwrap(); - - let (tx, rx) = sync_channel(0); - fail::cfg_callback("pending_range_completes_loading", move || { - fail::cfg("on_snapshot_load_finished", "pause").unwrap(); - tx.send(true).unwrap(); - }) - .unwrap(); - - let engine = hybrid_engine.range_cache_engine().clone(); - let r = CacheRange::new(b"k".to_vec(), b"k5".to_vec()); - engine.new_range(r.clone()); - - // Mock that we have a loading range, and there are some keys written in it - // during the load - let r2 = CacheRange::new(b"k5".to_vec(), b"k7".to_vec()); - let r3 = CacheRange::new(b"k7".to_vec(), b"k9".to_vec()); - - engine.load_range(r2.clone()).unwrap(); - engine.load_range(r3.clone()).unwrap(); - - // The sequence number of write batch should be increased one by one, otherwise - // if a delete and a put of the same key occurs in the same write batch, - // the delete will be hidden by the put even the delete is performed - // after the put. 
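The invariant described in the deleted comment above is easiest to see against the internal-key ordering itself. A minimal standalone sketch (plain Rust, not TiKV code; the `Mutation` type and the tuple layout are illustrative assumptions) of why two mutations on the same user key must carry distinct sequence numbers: use std::cmp::Reverse; #[derive(Clone, Copy, Debug, PartialEq)] enum Mutation { Put, Delete } fn main() { // Internal keys sort by (user_key asc, seq desc); a point read returns the // first entry for the user key, i.e. the one with the highest seqno. let mut entries = vec![ (b"k".to_vec(), Reverse(6u64), Mutation::Put), // written first (b"k".to_vec(), Reverse(7u64), Mutation::Delete), // written second ]; entries.sort_by_key(|e| (e.0.clone(), e.1)); // With strictly increasing seqnos, the later delete shadows the put. assert_eq!(entries[0].2, Mutation::Delete); // Had both mutations shared one seqno, their relative order would be // unspecified and the delete could be hidden behind the put. }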
- let mut wb = hybrid_engine.write_batch(); - wb.prepare_for_range(r.clone()); - wb.put(b"k", b"val").unwrap(); // seq 6 - wb.delete(b"k").unwrap(); // seq 7 - wb.put(b"k2", b"val").unwrap(); // seq 8 - - wb.prepare_for_range(r2.clone()); - wb.put(b"k6", b"val").unwrap(); // seq 3 - wb.put(b"k5", b"val").unwrap(); // seq 4 - wb.delete(b"k5").unwrap(); // seq 5 - - wb.prepare_for_range(r3.clone()); - wb.put(b"k8", b"val").unwrap(); // seq 1 - wb.put(b"k7", b"val").unwrap(); // seq 2 - - rx.recv().unwrap(); - wb.write().unwrap(); - - // For sequence number increment, the loading range get increment first, the - // loading range that completes the loading before consuming the write batch get - // increment second, and the cached range get increment last. - let mut iter = engine - .core() - .read() - .engine() - .cf_handle("default") - .iterator(); - let guard = &epoch::pin(); - - let mut first = true; - - for (k, seq, ty) in [ - (b"k".to_vec(), 7, ValueType::Deletion), - (b"k".to_vec(), 6, ValueType::Value), - (b"k2".to_vec(), 8, ValueType::Value), - (b"k5".to_vec(), 5, ValueType::Deletion), - (b"k5".to_vec(), 4, ValueType::Value), - (b"k6".to_vec(), 3, ValueType::Value), - ] { - if first { - iter.seek_to_first(guard); - first = false; - } else { - iter.next(guard); - } - - let key = iter.key(); - let InternalKey { - user_key, - sequence, - v_type, - } = decode_key(key.as_bytes()); - assert_eq!(sequence, seq); - assert_eq!(user_key, &k); - assert_eq!(v_type, ty); - } -} diff --git a/components/range_cache_memory_engine/Cargo.toml b/components/in_memory_engine/Cargo.toml similarity index 83% rename from components/range_cache_memory_engine/Cargo.toml rename to components/in_memory_engine/Cargo.toml index 45e94d82e07..a29ad0d6daf 100644 --- a/components/range_cache_memory_engine/Cargo.toml +++ b/components/in_memory_engine/Cargo.toml @@ -1,5 +1,5 @@ [package] -name = "range_cache_memory_engine" +name = "in_memory_engine" version = "0.0.1" edition = "2021" publish = false @@ -14,43 +14,50 @@ name = "failpoints" path = "tests/failpoints/mod.rs" required-features = ["failpoints"] +[[bench]] +name = "load_region" +path = "benches/load_region.rs" +harness = false + [dependencies] -bytes = "1.0" +engine_traits = { workspace = true } collections = { workspace = true } -crossbeam = { workspace = true } crossbeam-skiplist = { workspace = true } -dashmap = "5.1" -engine_rocks = { workspace = true } -engine_traits = { workspace = true } -fail = "0.5" +bytes = "1.0" +crossbeam = { workspace = true } futures = { version = "0.3", features = ["compat"] } -hex = "0.4" -keys = { workspace = true } +tikv_util = { workspace = true } +txn_types = { workspace = true } kvproto = { workspace = true } -lazy_static = "1.4.0" -libc = "0.2" log_wrappers = { workspace = true } -online_config = { workspace = true } -parking_lot = "0.12" pd_client = { workspace = true } -prometheus = { version = "0.13", default-features = false, features = ["nightly"] } -prometheus-static-metric = "0.5" raftstore = { workspace = true } -rand = "0.8" +dashmap = "5.1" security = { workspace = true } serde = "1.0" -serde_derive = "1.0" serde_json = "1.0" -slog = { workspace = true } slog-global = { workspace = true } -thiserror = "1.0" -tikv_util = { workspace = true } -txn_types = { workspace = true } +slog = { workspace = true } +strum = { version = "0.20", features = ["derive"] } +engine_rocks = { workspace = true } +fail = "0.5" yatp = { workspace = true } +parking_lot = "0.12" +keys = { workspace = true } +prometheus = { version = "0.13", 
default-features = false, features = ["nightly"] } +prometheus-static-metric = "0.5" +lazy_static = "1.4.0" +hex = "0.4" +thiserror = "1.0" +online_config = { workspace = true } +libc = "0.2" +rand = "0.8" +tokio = { version = "1.5", features = ["rt-multi-thread"] } +smallvec = "1.4" [dev-dependencies] -proptest = "1.0.0" +criterion = "0.3" tempfile = "3.0" test_pd = { workspace = true } -test_pd_client = { workspace = true } test_util = { workspace = true } +proptest = "1.0.0" diff --git a/components/in_memory_engine/benches/load_region.rs b/components/in_memory_engine/benches/load_region.rs new file mode 100644 index 00000000000..77315ffc3cb --- /dev/null +++ b/components/in_memory_engine/benches/load_region.rs @@ -0,0 +1,149 @@ +// Copyright 2024 TiKV Project Authors. Licensed under Apache-2.0. + +#![feature(test)] + +use std::sync::Arc; + +use criterion::*; +use engine_rocks::{util::new_engine, RocksEngine}; +use engine_traits::{ + CacheRegion, KvEngine, Mutable, RegionCacheEngine, WriteBatch, WriteBatchExt, CF_DEFAULT, + CF_WRITE, DATA_CFS, +}; +use futures::future::ready; +use in_memory_engine::{BackgroundRunner, *}; +use keys::{DATA_MAX_KEY, DATA_MIN_KEY}; +use pd_client::PdClient; +use raftstore::coprocessor::config::SPLIT_SIZE; +use rand::{thread_rng, RngCore}; +use tikv_util::config::VersionTrack; +use txn_types::{Key, TimeStamp, Write, WriteType}; + +/// Benches the performance of background region load +fn bench_load_region(c: &mut Criterion) { + for value_size in [32, 128, 512, 4096] { + bench_with_args(c, 128, value_size, SPLIT_SIZE.0 as usize, 100); + } +} + +fn bench_with_args( + c: &mut Criterion, + key_size: usize, + value_size: usize, + region_size: usize, + mvcc_amp_thres: usize, +) { + use std::time::Duration; + + let rocks_engine = prepare_data(key_size, value_size, region_size, mvcc_amp_thres); + let mut group = c.benchmark_group("load_region"); + // The bench is slow and the workload is not useful.
+ group.warm_up_time(Duration::from_millis(1)).sample_size(10); + group.bench_function(format!("value size {}", value_size), |b| { + b.iter_with_large_drop(|| { + load_region(&rocks_engine); + }) + }); +} + +fn prepare_data( + key_size: usize, + value_size: usize, + region_size: usize, + mvcc_amp_thres: usize, +) -> RocksEngine { + let path = tempfile::Builder::new() + .prefix("bench_load") + .tempdir() + .unwrap(); + let path_str = path.path().to_str().unwrap(); + let rocks_engine = new_engine(path_str, DATA_CFS).unwrap(); + + // prepare for test data + let mut r = thread_rng(); + let mut wb = rocks_engine.write_batch(); + let mut ts_count = 1; + let count = (region_size + key_size + value_size - 1) / (key_size + value_size); + let mut key = vec![0u8; key_size]; + r.fill_bytes(&mut key[..key_size - 8]); + let mut key_version_count = 0; + let mut mvcc_version = r.next_u32() as usize % (mvcc_amp_thres * 2) + 1; + for _i in 0..count { + if key_version_count >= mvcc_version { + r.fill_bytes(&mut key[..key_size - 8]); + mvcc_version = r.next_u32() as usize % (mvcc_amp_thres * 2) + 1; + key_version_count = 0; + } + + let ts = TimeStamp::new(ts_count); + let k = keys::data_key(Key::from_raw(&key).append_ts(ts).as_encoded()); + let mut value = vec![0u8; value_size]; + r.fill_bytes(&mut value); + + let v = if value_size <= 256 { + Some(value) + } else { + wb.put_cf(CF_DEFAULT, &k, &value).unwrap(); + None + }; + let w = Write::new(WriteType::Put, ts, v); + wb.put_cf(CF_WRITE, &k, &w.as_ref().to_bytes()).unwrap(); + + key_version_count += 1; + ts_count += 1; + } + + wb.write().unwrap(); + + rocks_engine +} + +fn load_region(rocks_engine: &RocksEngine) { + let mut engine = RegionCacheMemoryEngine::new(InMemoryEngineContext::new_for_tests(Arc::new( + VersionTrack::new(InMemoryEngineConfig::config_for_test()), + ))); + engine.set_disk_engine(rocks_engine.clone()); + let memory_controller = engine.memory_controller(); + + // do load + let config = Arc::new(VersionTrack::new(InMemoryEngineConfig::config_for_test())); + let (worker, _) = BackgroundRunner::new( + engine.core().clone(), + memory_controller.clone(), + None, + config, + Arc::new(MockTsPdClient::new()), + None, + ); + + let region = CacheRegion::new(1, 1, DATA_MIN_KEY, DATA_MAX_KEY); + engine.load_region(region.clone()).unwrap(); + // update region state to loading to avoid background load. + engine.must_set_region_state(1, RegionState::Loading); + + let snapshot = Arc::new(rocks_engine.snapshot()); + worker.run_load_region(region, snapshot); +} + +struct MockTsPdClient { + ts: TimeStamp, +} + +impl MockTsPdClient { + fn new() -> Self { + // use now to build a big enough timestamp to ensure gc can run. + let now = TimeStamp::physical_now(); + Self { + ts: TimeStamp::compose(now, 0), + } + } +} + +impl PdClient for MockTsPdClient { + fn get_tso(&self) -> pd_client::PdFuture { + Box::pin(ready(Ok(self.ts))) + } +} + +criterion_group!(benches, bench_load_region); +criterion_main!(benches); diff --git a/components/in_memory_engine/src/background.rs b/components/in_memory_engine/src/background.rs new file mode 100644 index 00000000000..23b593b60d0 --- /dev/null +++ b/components/in_memory_engine/src/background.rs @@ -0,0 +1,3324 @@ +// Copyright 2024 TiKV Project Authors. Licensed under Apache-2.0. 
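For reference, the benchmark above would typically be invoked with `cargo bench -p in_memory_engine --bench load_region` (a hypothetical invocation, assuming the `[[bench]]` entry added to Cargo.toml above); the tiny `warm_up_time` and small `sample_size` keep the total run time bounded, since each iteration loads a full region from RocksDB into the memory engine.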
+ +use std::{borrow::Cow, fmt, sync::Arc, time::Duration}; + +use bytes::Bytes; +use crossbeam::{ + channel::{bounded, tick, Sender}, + epoch, select, +}; +use engine_rocks::{RocksEngine, RocksSnapshot}; +use engine_traits::{ + CacheRegion, EvictReason, IterOptions, Iterable, Iterator, MiscExt, RangeHintService, + SnapshotMiscExt, CF_DEFAULT, CF_WRITE, DATA_CFS, +}; +use fail::fail_point; +use keys::{origin_end_key, origin_key}; +use pd_client::{PdClient, RpcClient}; +use raftstore::{ + coprocessor::RegionInfoProvider, + store::{CasualMessage, CasualRouter}, +}; +use slog_global::{error, info, warn}; +use strum::EnumCount; +use tikv_util::{ + config::{ReadableSize, VersionTrack}, + future::block_on_timeout, + keybuilder::KeyBuilder, + time::Instant, + worker::{Builder, Runnable, RunnableWithTimer, ScheduleError, Scheduler, Worker}, +}; +use tokio::sync::mpsc; +use txn_types::{Key, TimeStamp, WriteRef, WriteType}; +use yatp::Remote; + +use crate::{ + cross_check::CrossChecker, + engine::{RegionCacheMemoryEngineCore, SkiplistHandle}, + keys::{ + decode_key, encode_key, encode_key_for_boundary_with_mvcc, encoding_for_filter, + InternalBytes, InternalKey, ValueType, + }, + memory_controller::{MemoryController, MemoryUsage}, + metrics::{ + IN_MEMORY_ENGINE_CACHE_COUNT, IN_MEMORY_ENGINE_GC_FILTERED_STATIC, + IN_MEMORY_ENGINE_GC_TIME_HISTOGRAM, IN_MEMORY_ENGINE_LOAD_TIME_HISTOGRAM, + IN_MEMORY_ENGINE_MEMORY_USAGE, IN_MEMORY_ENGINE_NEWEST_SAFE_POINT, + IN_MEMORY_ENGINE_OLDEST_SAFE_POINT, SAFE_POINT_GAP, + }, + region_label::{ + LabelRule, RegionLabelChangedCallback, RegionLabelRulesManager, RegionLabelServiceBuilder, + }, + region_manager::{AsyncFnOnce, CacheRegionMeta, RegionState}, + region_stats::{RegionStatsManager, DEFAULT_EVICT_MIN_DURATION}, + write_batch::RegionCacheWriteBatchEntry, + InMemoryEngineConfig, RegionCacheMemoryEngine, +}; + +// 5 seconds should be long enough for getting a TSO from PD. +const TIMTOUT_FOR_TSO: Duration = Duration::from_secs(5); + +/// Try to extract the key and `u64` timestamp from `encoded_key`. +/// +/// See also: [`txn_types::Key::split_on_ts_for`] +pub(crate) fn split_ts(key: &[u8]) -> Result<(&[u8], u64), String> { + match Key::split_on_ts_for(key) { + Ok((key, ts)) => Ok((key, ts.into_inner())), + Err(_) => Err(format!( + "invalid write cf key: {}", + log_wrappers::Value(key) + )), + } +} + +pub(crate) fn parse_write(value: &[u8]) -> Result, String> { + match WriteRef::parse(value) { + Ok(write) => Ok(write), + Err(_) => Err(format!( + "invalid write cf value: {}", + log_wrappers::Value(value) + )), + } +} + +pub enum BackgroundTask { + Gc(GcTask), + LoadRegion(CacheRegion, Arc), + MemoryCheckAndEvict, + DeleteRegions(Vec), + TopRegionsLoadEvict, + CleanLockTombstone(u64), + TurnOnCrossCheck( + ( + RegionCacheMemoryEngine, + RocksEngine, + Arc, + Duration, + Box Option + Send>, + ), + ), + SetRocksEngine(RocksEngine), + CheckLoadPendingRegions(Scheduler), +} + +impl fmt::Display for BackgroundTask { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + match &self { + BackgroundTask::Gc(t) => t.fmt(f), + BackgroundTask::LoadRegion(..) 
=> f.debug_struct("LoadTask").finish(), + BackgroundTask::MemoryCheckAndEvict => f.debug_struct("MemoryCheckAndEvict").finish(), + BackgroundTask::DeleteRegions(r) => { + f.debug_struct("DeleteRegions").field("region", r).finish() + } + BackgroundTask::TopRegionsLoadEvict => f.debug_struct("CheckTopRegions").finish(), + BackgroundTask::CleanLockTombstone(r) => f + .debug_struct("CleanLockTombstone") + .field("seqno", r) + .finish(), + BackgroundTask::TurnOnCrossCheck(_) => f.debug_struct("TurnOnCrossCheck").finish(), + BackgroundTask::SetRocksEngine(_) => f.debug_struct("SetDiskEngine").finish(), + BackgroundTask::CheckLoadPendingRegions(_) => { + f.debug_struct("CheckLoadPendingRegions").finish() + } + } + } +} + +impl fmt::Debug for BackgroundTask { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "{}", self) + } +} + +#[derive(Debug)] +pub struct GcTask { + pub safe_point: u64, +} + +impl fmt::Display for GcTask { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.debug_struct("GcTask") + .field("safe_point", &self.safe_point) + .finish() + } +} + +// BgWorkManager manages worker init, stop, and task scheduling. When +// created, it starts a worker which receives tasks such as gc tasks, range +// delete tasks, region snapshot loads and so on, and starts a thread that +// periodically schedules gc tasks. +pub struct BgWorkManager { + worker: Worker, + scheduler: Scheduler, + delete_region_scheduler: Scheduler, + tick_stopper: Option<(Worker, Sender)>, + core: Arc, + region_info_provider: Option>, +} + +impl Drop for BgWorkManager { + fn drop(&mut self) { + let (ticker, tx) = self.tick_stopper.take().unwrap(); + let _ = tx.send(true); + ticker.stop(); + self.worker.stop(); + } +} + +pub struct PdRangeHintService(Arc); + +impl RangeHintService for PdRangeHintService {} + +impl From> for PdRangeHintService { + fn from(pd_client: Arc) -> Self { + PdRangeHintService(pd_client) + } +} + +const CACHE_LABEL_RULE_KEY: &str = "cache"; +const CACHE_LABEL_RULE_ALWAYS: &str = "always"; + +/// This implementation starts a background task to pull down region label +/// rules from PD. +impl PdRangeHintService { + /// Spawn a background task on `remote` to continuously watch for region + /// label rules that contain the label `cache`; if a newly added rule sets + /// `cache` to `always`, request loading the rule's key ranges using + /// `range_manager_load_cb`. + /// + /// TODO (afeinberg): Add support for evicting key ranges when the `cache` + /// label is removed or no longer set to always. + pub fn start(&self, remote: Remote, range_manager_load_cb: F) + where + F: Fn(&CacheRegion, bool) + Send + Sync + 'static, + { + let pd_client = self.0.clone(); + let region_label_changed_cb: RegionLabelChangedCallback = Arc::new( + move |label_rule: &LabelRule, is_add: bool| { + if !label_rule + .labels + .iter() + .any(|e| e.key == CACHE_LABEL_RULE_KEY && e.value == CACHE_LABEL_RULE_ALWAYS) + { + // not related to caching, skip.
+ return; + } + for key_range in &label_rule.data { + match CacheRegion::try_from(key_range) { + Ok(cache_region) => { + info!("ime requested to cache range"; "range" => ?&cache_region); + range_manager_load_cb(&cache_region, is_add); + } + Err(e) => { + error!("ime unable to convert key_range rule to cache range"; "error" => ?e); + } + } + } + }, + ); + let mut region_label_svc = RegionLabelServiceBuilder::new( + Arc::new(RegionLabelRulesManager { + region_label_change_cb: Some(region_label_changed_cb), + ..RegionLabelRulesManager::default() + }), + pd_client, + ) + .rule_filter_fn(|label_rule| { + label_rule + .labels + .iter() + .any(|e| e.key == CACHE_LABEL_RULE_KEY) + }) + .build() + .unwrap(); + remote.spawn(async move { region_label_svc.watch_region_labels().await }) + } +} + +impl BgWorkManager { + pub fn new( + core: Arc, + pd_client: Arc, + config: Arc>, + memory_controller: Arc, + region_info_provider: Option>, + raft_casual_router: Option>>, + ) -> Self { + let worker = Worker::new("ime-bg"); + let (runner, delete_range_scheduler) = BackgroundRunner::new( + core.clone(), + memory_controller, + region_info_provider.clone(), + config.clone(), + pd_client.clone(), + raft_casual_router, + ); + let scheduler = worker.start_with_timer("ime-bg-runner", runner); + + let (ticker, tx) = BgWorkManager::start_tick(scheduler.clone(), pd_client, config.clone()); + + Self { + worker, + scheduler, + delete_region_scheduler: delete_range_scheduler, + tick_stopper: Some((ticker, tx)), + core, + region_info_provider, + } + } + + pub fn schedule_task(&self, task: BackgroundTask) -> Result<(), ScheduleError> { + match task { + task @ BackgroundTask::DeleteRegions(_) => { + self.delete_region_scheduler.schedule_force(task) + } + task => self.scheduler.schedule_force(task), + } + } + + pub(crate) fn background_scheduler(&self) -> &Scheduler { + &self.scheduler + } + + pub fn start_bg_hint_service(&self, range_hint_service: PdRangeHintService) { + let core = self.core.clone(); + let region_info_provider = self.region_info_provider.clone(); + range_hint_service.start( + self.worker.remote(), + move |range: &CacheRegion, is_add: bool| { + let region_manager = core.region_manager(); + if !is_add { + region_manager + .regions_map() + .write() + .remove_manual_load_range(range.clone()); + region_manager.evict_region(range, EvictReason::Manual, None); + return; + } + + region_manager + .regions_map() + .write() + .add_manual_load_range(range.clone()); + + let Some(ref info_provider) = region_info_provider else { + warn!("ime region info provider is none, skip manual load range."); + return; + }; + + let start = origin_key(&range.start); + let end = origin_end_key(&range.end); + let regions = match info_provider.get_regions_in_range(start, end) { + Ok(r) => r, + Err(e) => { + warn!( + "ime get regions in range failed"; "err" => ?e, + "start" => ?log_wrappers::Value(start), + "end" => ?log_wrappers::Value(end) + ); + return; + } + }; + + let total = regions.len(); + let mut failed = 0; + for r in regions { + // TODO: Only load region leaders. 
+ let cache_region = CacheRegion::from_region(&r); + if let Err(e) = region_manager.load_region(cache_region) { + failed += 1; + warn!("ime load region failed"; "err" => ?e, "region" => ?r); + } + } + info!( + "ime manual load summary"; + "range" => ?range, + "success" => total - failed, + "failed" => failed, + ); + }, + ); + } + + fn start_tick( + scheduler: Scheduler, + pd_client: Arc, + config: Arc>, + ) -> (Worker, Sender) { + let (tx, rx) = bounded(0); + // TODO: Instead of spawning a new thread, we should run this task + // in a shared background thread. + let ticker = Builder::new("ime-ticker").thread_count(1).create(); + // The interval here is somewhat arbitrary, as long as it is less than + // intervals in the loop, it should be fine, because it spawns a + // blocking task. + // TODO: Spawn non-blocking tasks and make full use of the ticker. + let interval = Duration::from_millis(100); + let check_load_pending_interval = (|| { + fail_point!("ime_background_check_load_pending_interval", |t| { + let t = t.unwrap().parse::().unwrap(); + Duration::from_millis(t) + }); + Duration::from_secs(5) + })(); + ticker.spawn_interval_task(interval, move || { + let mut gc_run_interval = config.value().gc_run_interval.0; + let mut gc_ticker = tick(gc_run_interval); + let mut load_evict_interval = config.value().load_evict_interval.0; + let mut load_evict_ticker = tick(load_evict_interval); + let mut tso_timeout = std::cmp::min(gc_run_interval, TIMTOUT_FOR_TSO); + let check_pending_region_ticker = tick(check_load_pending_interval); + 'LOOP: loop { + select! { + recv(gc_ticker) -> _ => { + let now = match block_on_timeout(pd_client.get_tso(), tso_timeout) { + Ok(Ok(ts)) => ts, + err => { + error!( + "ime schedule gc failed "; + "timeout_duration" => ?tso_timeout, + "error" => ?err, + ); + continue 'LOOP; + } + }; + let safe_point = now.physical() - gc_run_interval.as_millis() as u64; + let safe_point = TimeStamp::compose(safe_point, 0).into_inner(); + if let Err(e) = scheduler.schedule(BackgroundTask::Gc(GcTask {safe_point})) { + error!( + "ime schedule gc failed"; + "err" => ?e, + ); + } + let cur_gc_run_interval = config.value().gc_run_interval.0; + if cur_gc_run_interval != gc_run_interval { + tso_timeout = std::cmp::min(gc_run_interval, TIMTOUT_FOR_TSO); + info!( + "ime gc-run-interval changed"; + "from" => ?gc_run_interval, + "to" => ?cur_gc_run_interval, + ); + gc_run_interval = cur_gc_run_interval; + gc_ticker = tick(gc_run_interval); + } + }, + recv(load_evict_ticker) -> _ => { + if let Err(e) = scheduler.schedule(BackgroundTask::TopRegionsLoadEvict) { + error!( + "ime schedule load evict failed"; + "err" => ?e, + ); + } + let cur_load_evict_interval = config.value().load_evict_interval.0; + if cur_load_evict_interval != load_evict_interval { + info!( + "ime load-evict-interval changed"; + "from" => ?load_evict_interval, + "to" => ?cur_load_evict_interval, + ); + load_evict_interval = cur_load_evict_interval; + load_evict_ticker = tick(load_evict_interval); + } + }, + recv(check_pending_region_ticker) -> _ => { + let s = scheduler.clone(); + if let Err(e) = scheduler.schedule(BackgroundTask::CheckLoadPendingRegions(s)) { + error!( + "ime schedule check pending regions failed"; + "err" => ?e, + ); + } + } + recv(rx) -> r => { + if let Err(e) = r { + error!( + "ime receive error in gc ticker"; + "err" => ?e, + ); + } + return; + }, + } + } + }); + (ticker, tx) + } +} + +#[derive(Clone)] +struct BackgroundRunnerCore { + engine: Arc, + memory_controller: Arc, + region_stats_manager: Option, +} 
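The ticker loop in `start_tick` above combines crossbeam's `tick` channels with `select!` so that each interval can be re-read from the config and the ticker rebuilt in place. A minimal sketch of that pattern (plain Rust against the real `crossbeam` API; `spawn_ticker` and its single fixed job are illustrative assumptions, not TiKV code): use std::{thread, time::Duration}; use crossbeam::{ channel::{bounded, tick, Sender}, select, }; // Runs `job` every `interval` on a dedicated thread; sending to (or dropping) // the returned Sender stops the loop, mirroring the (Worker, Sender) pair // kept in `BgWorkManager::tick_stopper`. fn spawn_ticker(interval: Duration, job: impl Fn() + Send + 'static) -> Sender<bool> { let (tx, rx) = bounded(0); thread::spawn(move || { let mut ticker = tick(interval); loop { select! { recv(ticker) -> _ => { job(); // On a config change, `start_tick` rebuilds the ticker here // with the new interval; `tick` returns a fresh // Receiver<Instant> that fires on the new schedule. ticker = tick(interval); } // Both an explicit send and a dropped sender wake this arm. recv(rx) -> _ => return, } } }); tx }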
+ +impl BackgroundRunnerCore { + /// Returns the regions that are eligible for garbage collection. + /// + /// Returns an empty vector if there are no regions cached or the previous gc + /// is not finished. + fn regions_for_gc(&self) -> Vec { + // Another gc task is running, skip. + if !self.engine.region_manager().try_set_regions_in_gc(true) { + return vec![]; + } + + let regions_map = self.engine.region_manager().regions_map.read(); + regions_map + .regions() + .values() + .filter_map(|m| { + if m.get_state() == RegionState::Active { + Some(m.get_region().clone()) + } else { + None + } + }) + .collect() + } + + pub(crate) fn gc_region( + &self, + region: &CacheRegion, + safe_point: u64, + oldest_seqno: u64, + ) -> FilterMetrics { + let mut gc_region = Cow::Borrowed(region); + let safe_point = { + let region_manager = self.engine.region_manager(); + // We should also consider the ongoing snapshot of the historical regions + // (regions that have been evicted). + let historical_safe_point = region_manager + .get_history_regions_min_ts(region) + .unwrap_or(u64::MAX); + + let mut regions_map = region_manager.regions_map.write(); + let Some(region_meta) = regions_map.mut_region_meta(region.id) else { + return FilterMetrics::default(); + }; + + if region_meta.get_state() != RegionState::Active + || !region.contains_range(region_meta.get_region()) + { + return FilterMetrics::default(); + } + + // Check if the region epoch version has changed. + if region.epoch_version != region_meta.get_region().epoch_version { + gc_region = Cow::Owned(region_meta.get_region().clone()); + } + + let min_snapshot = region_meta + .region_snapshot_list() + .lock() + .unwrap() + .min_snapshot_ts() + .unwrap_or(u64::MAX); + let safe_point = safe_point.min(min_snapshot).min(historical_safe_point); + if safe_point <= region_meta.safe_point() { + info!( + "ime safe point not large enough"; + "prev" => region_meta.safe_point(), + "current" => safe_point, + ); + return FilterMetrics::default(); + } + + // todo: change it to debug!
+ info!( + "ime safe point update"; + "prev" => region_meta.safe_point(), + "current" => safe_point, + "region" => ?region, + ); + region_meta.set_safe_point(safe_point); + region_meta.set_in_gc(true); + safe_point + }; + + let start = Instant::now(); + let skiplist_engine = self.engine.engine(); + let mut filter = Filter::new( + safe_point, + oldest_seqno, + skiplist_engine.cf_handle(CF_DEFAULT), + skiplist_engine.cf_handle(CF_WRITE), + ); + filter.filter_keys_in_region(&gc_region); + self.engine + .region_manager() + .on_gc_region_finished(&gc_region); + + let duration = start.saturating_elapsed(); + IN_MEMORY_ENGINE_GC_TIME_HISTOGRAM.observe(duration.as_secs_f64()); + info!( + "ime region gc complete"; + "region" => ?gc_region.as_ref(), + "gc_duration" => ?duration, + "total_version" => filter.metrics.total, + "filtered_version" => filter.metrics.filtered, + "below_safe_point_unique_keys" => filter.metrics.unique_key, + "below_safe_point_version" => filter.metrics.versions, + "below_safe_point_delete_version" => filter.metrics.delete_versions, + "current_safe_point" => safe_point, + ); + + let mut metrics = std::mem::take(&mut filter.metrics); + if filter.cached_mvcc_delete_key.is_some() { + metrics.filtered += 1; + } + if filter.cached_skiplist_delete_key.is_some() { + metrics.filtered += 1; + } + metrics + } + + fn on_gc_finished(&self) { + let success = self.engine.region_manager().try_set_regions_in_gc(false); + assert!(success); + } + + // if `false` is returned, the load is canceled + fn on_snapshot_load_finished( + &self, + region: &CacheRegion, + delete_range_scheduler: &Scheduler, + safe_point: u64, + ) -> bool { + fail::fail_point!("ime_on_snapshot_load_finished"); + fail::fail_point!("ime_on_snapshot_load_finished2"); + // We still need to check whether the snapshot is canceled during the load + let mut regions_map = self.engine.region_manager().regions_map.write(); + let region_meta = regions_map.mut_region_meta(region.id).unwrap(); + let mut remove_regions = vec![]; + let mut on_region_meta = |meta: &mut CacheRegionMeta| { + assert!( + meta.get_state() == RegionState::Loading + || meta.get_state() == RegionState::LoadingCanceled, + "region meta: {:?}", + meta, + ); + if meta.get_state() == RegionState::Loading { + meta.set_state(RegionState::Active); + meta.set_safe_point(safe_point); + } else { + assert_eq!(meta.get_state(), RegionState::LoadingCanceled); + meta.mark_evict(RegionState::Evicting, EvictReason::LoadFailed, None); + remove_regions.push(meta.get_region().clone()); + } + }; + + if region_meta.get_region().epoch_version == region.epoch_version { + on_region_meta(region_meta); + } else { + // epoch version changed, should use scan to find all overlapped regions + regions_map.iter_overlapped_regions_mut(region, |meta| { + assert!(region.contains_range(meta.get_region())); + on_region_meta(meta); + }); + } + drop(regions_map); + + if !remove_regions.is_empty() { + fail::fail_point!("ime_snapshot_load_canceled"); + + if let Err(e) = + delete_range_scheduler.schedule_force(BackgroundTask::DeleteRegions(remove_regions)) + { + error!( + "ime schedule delete regions failed"; + "err" => ?e, + ); + assert!(tikv_util::thread_group::is_shutdown(!cfg!(test))); + } + + return false; + } + + fail::fail_point!("ime_on_completes_batch_loading"); + true + } + + fn on_snapshot_load_failed( + &self, + region: &CacheRegion, + delete_range_scheduler: &Scheduler, + started: bool, + ) { + let mut regions_map = self.engine.region_manager().regions_map.write(); + let region_meta = 
regions_map.mut_region_meta(region.id).unwrap(); + let mut remove_regions = vec![]; + let mut mark_region_evicted = |meta: &mut CacheRegionMeta| { + assert!( + meta.get_state() == RegionState::Loading + || meta.get_state() == RegionState::LoadingCanceled + ); + let reason = if started { + EvictReason::LoadFailed + } else { + EvictReason::LoadFailedWithoutStart + }; + meta.mark_evict(RegionState::Evicting, reason, None); + remove_regions.push(meta.get_region().clone()); + }; + + if region_meta.get_region().epoch_version == region.epoch_version { + mark_region_evicted(region_meta); + } else { + // epoch version changed, should use scan to find all overlapped regions + regions_map.iter_overlapped_regions_mut(region, |meta| { + assert!(region.contains_range(meta.get_region())); + mark_region_evicted(meta); + }); + } + + if let Err(e) = + delete_range_scheduler.schedule_force(BackgroundTask::DeleteRegions(remove_regions)) + { + error!( + "ime schedule delete regions failed"; + "err" => ?e, + ); + assert!(tikv_util::thread_group::is_shutdown(!cfg!(test))); + } + } + + /// Periodically load top regions. + /// + /// If the evict threshold is exceeded, evict (some) regions no longer + /// considered top. + /// + /// See: [`RegionStatsManager::collect_changes_regions`] for + /// algorithm details. + async fn top_regions_load_evict(&self, delete_range_scheduler: &Scheduler) { + let region_stats_manager = match &self.region_stats_manager { + Some(m) => m, + None => { + return; + } + }; + if !region_stats_manager.ready_for_auto_load_and_evict() { + return; + } + + let (current_region_count, cached_regions) = { + let region_map = self.engine.region_manager().regions_map().read(); + (region_map.regions().len(), region_map.cached_regions()) + }; + let (regions_to_load, regions_to_evict) = region_stats_manager + .collect_regions_to_load_and_evict( + current_region_count, + cached_regions, + &self.memory_controller, + ); + + let evict_count = regions_to_evict.len(); + let mut regions_to_delete = Vec::with_capacity(evict_count); + info!( + "ime load_evict"; + "regions_to_load" => ?&regions_to_load, + "regions_to_evict" => ?&regions_to_evict, + ); + let (tx, mut rx) = mpsc::channel(evict_count + 1); + for evict_region in regions_to_evict { + let cache_region = CacheRegion::from_region(&evict_region); + let tx_clone = tx.clone(); + // Bound is set to 1 so that the sender side will not be blocked + let deletable_regions = self.engine.region_manager().evict_region( + &cache_region, + EvictReason::AutoEvict, + Some(Box::new(move || { + Box::pin(async move { + let _ = tx_clone.send(()).await; + }) + })), + ); + info!( + "ime load_evict: auto evict"; + "region_to_evict" => ?&cache_region, + "evicted_regions" => ?&deletable_regions, + ); + regions_to_delete.extend(deletable_regions); + } + + if !regions_to_delete.is_empty() { + if let Err(e) = delete_range_scheduler + .schedule_force(BackgroundTask::DeleteRegions(regions_to_delete)) + { + error!( + "ime schedule delete range failed"; + "err" => ?e, + ); + assert!(tikv_util::thread_group::is_shutdown(!cfg!(test))); + } + } + for _ in 0..evict_count { + if rx.recv().await.is_none() { + break; + } + } + if !self.memory_controller.reached_stop_load_threshold() { + let expected_new_count = self + .memory_controller + .evict_threshold() + .saturating_sub(self.memory_controller.mem_usage()) + / region_stats_manager.expected_region_size(); + let expected_new_count = usize::max(expected_new_count, 1); + let mut regions_map = self.engine.region_manager().regions_map.write(); + for
region in regions_to_load.into_iter().take(expected_new_count) { + let cache_region = CacheRegion::from_region(&region); + if let Err(e) = regions_map.load_region(cache_region) { + warn!("ime error loading region"; "cache_region" => ?region, "err" => ?e); + } + } + } + region_stats_manager.complete_auto_load_and_evict(); + info!("ime load_evict complete"); + } +} + +// Flush epoch and pin enough times to make the delayed operations be executed +#[cfg(test)] +pub(crate) fn flush_epoch() { + { + let guard = &epoch::pin(); + guard.flush(); + } + // Local epoch tries to advance the global epoch every 128 pins. When the global + // epoch advances, the operations (here, deletes) in the older epoch can be + // executed. + for _ in 0..128 { + let _ = &epoch::pin(); + } +} + +pub struct BackgroundRunner { + core: BackgroundRunnerCore, + + config: Arc>, + pd_client: Arc, + + // We have the following separate workers so that each type of task would not block the + // others + region_load_remote: Remote, + region_load_worker: Worker, + + delete_range_scheduler: Scheduler, + delete_range_worker: Worker, + + gc_region_remote: Remote, + gc_region_worker: Worker, + + // Region load and eviction worker. + // TODO: this can be consolidated, possibly with the GC worker. + load_evict_remote: Remote, + load_evict_worker: Worker, + + lock_cleanup_remote: Remote, + lock_cleanup_worker: Worker, + + cross_check_worker: Option, + + // The last sequence number for the lock cf tombstone cleanup + last_seqno: u64, + // RocksEngine is used to get the oldest snapshot sequence number. + rocks_engine: Option, + raft_casual_router: Option>>, +} + +impl Drop for BackgroundRunner { + fn drop(&mut self) { + self.region_load_worker.stop(); + self.delete_range_worker.stop(); + self.gc_region_worker.stop(); + self.load_evict_worker.stop(); + self.lock_cleanup_worker.stop(); + if let Some(cross_check_worker) = self.cross_check_worker.take() { + cross_check_worker.stop() + }; + } +} + +impl BackgroundRunner { + pub fn new( + engine: Arc, + memory_controller: Arc, + region_info_provider: Option>, + config: Arc>, + pd_client: Arc, + raft_casual_router: Option>>, + ) -> (Self, Scheduler) { + let region_load_worker = Builder::new("ime-load") + // Range load now is implemented sequentially, so we must use exactly one thread to handle it. + // todo(SpadeA): if the load speed is a bottleneck, we may consider using multiple threads to load ranges. + .thread_count(1) + .create(); + let region_load_remote = region_load_worker.remote(); + + let delete_range_worker = Worker::new("ime-delete"); + let delete_range_runner = DeleteRangeRunner::new(engine.clone()); + let delete_range_scheduler = + delete_range_worker.start_with_timer("ime-delete-runner", delete_range_runner); + + let lock_cleanup_worker = Worker::new("ime-lock-cleanup"); + let lock_cleanup_remote = lock_cleanup_worker.remote(); + + let gc_region_worker = Builder::new("ime-gc") + // Gc must also use exactly one thread to handle it.
+ .thread_count(1) + .create(); + let gc_region_remote = gc_region_worker.remote(); + + let load_evict_worker = Worker::new("ime-evict"); + let load_evict_remote = load_evict_worker.remote(); + + let region_stats_manager = region_info_provider.map(|region_info_provider| { + RegionStatsManager::new( + config.clone(), + DEFAULT_EVICT_MIN_DURATION, + region_info_provider, + ) + }); + ( + Self { + core: BackgroundRunnerCore { + engine, + memory_controller, + region_stats_manager, + }, + pd_client, + config, + region_load_worker, + region_load_remote, + delete_range_worker, + delete_range_scheduler: delete_range_scheduler.clone(), + gc_region_worker, + gc_region_remote, + load_evict_worker, + load_evict_remote, + lock_cleanup_remote, + lock_cleanup_worker, + cross_check_worker: None, + last_seqno: 0, + rocks_engine: None, + raft_casual_router, + }, + delete_range_scheduler, + ) + } + + // used for benchmark. + pub fn run_load_region(&self, region: CacheRegion, snapshot: Arc) { + Self::do_load_region( + region, + snapshot, + self.core.clone(), + self.delete_range_scheduler.clone(), + self.pd_client.clone(), + self.config.value().gc_run_interval.0, + ) + } + + fn do_load_region( + region: CacheRegion, + snapshot: Arc, + core: BackgroundRunnerCore, + delete_range_scheduler: Scheduler, + pd_client: Arc, + gc_run_interval: Duration, + ) { + fail::fail_point!("ime_before_start_loading_region"); + fail::fail_point!("ime_on_start_loading_region"); + let mut is_canceled = false; + { + let regions_map = core.engine.region_manager().regions_map.read(); + let region_meta = regions_map.region_meta(region.id).unwrap(); + // if loading is canceled, we skip the batch load. + // NOTE: here we don't check the region epoch version change. + // We will handle possible region split and partial cancelation + // in `on_snapshot_load_canceled` and `on_snapshot_load_finished`. + if region_meta.get_state() != RegionState::Loading { + assert_eq!(region_meta.get_state(), RegionState::LoadingCanceled); + is_canceled = true; + } + } + let skiplist_engine = core.engine.engine.clone(); + + if core.memory_controller.reached_stop_load_threshold() { + // We are running out of memory, so cancel the load. + is_canceled = true; + } + + if is_canceled { + info!( + "ime snapshot load canceled"; + "region" => ?region, + ); + core.on_snapshot_load_failed(&region, &delete_range_scheduler, false); + return; + } + + info!("ime loading region"; "region" => ?&region); + let start = Instant::now(); + let iter_opt = IterOptions::new( + Some(KeyBuilder::from_slice(&region.start, 0, 0)), + Some(KeyBuilder::from_slice(&region.end, 0, 0)), + false, + ); + + let safe_point = 'load_snapshot: { + for &cf in DATA_CFS { + let handle = skiplist_engine.cf_handle(cf); + let seq = snapshot.sequence_number(); + let guard = &epoch::pin(); + match snapshot.iterator_opt(cf, iter_opt.clone()) { + Ok(mut iter) => { + iter.seek_to_first().unwrap(); + while iter.valid().unwrap() { + // use the sequence number from the RocksDB snapshot here, as + // the kv is clearly visible + let mut encoded_key = encode_key(iter.key(), seq, ValueType::Value); + let mut val = InternalBytes::from_vec(iter.value().to_vec()); + + let mem_size = RegionCacheWriteBatchEntry::calc_put_entry_size( + iter.key(), + val.as_bytes(), + ); + + // todo(SpadeA): we can batch acquire the memory size + // here.
+ if let MemoryUsage::CapacityReached(n) = + core.memory_controller.acquire(mem_size) + { + warn!( + "ime stop loading snapshot due to memory reaching capacity"; + "region" => ?region, + "memory_usage(MB)" => ReadableSize(n as u64).as_mb_f64(), + ); + break 'load_snapshot None; + } + + encoded_key.set_memory_controller(core.memory_controller.clone()); + val.set_memory_controller(core.memory_controller.clone()); + handle.insert(encoded_key, val, guard); + iter.next().unwrap(); + } + } + Err(e) => { + error!("ime creating rocksdb iterator failed"; "cf" => cf, "err" => %e); + break 'load_snapshot None; + } + } + } + // gc the range + let tso_timeout = std::cmp::min(gc_run_interval, TIMTOUT_FOR_TSO); + let now = match block_on_timeout(pd_client.get_tso(), tso_timeout) { + Ok(Ok(ts)) => ts, + err => { + error!( + "ime get timestamp failed, skip gc loaded region"; + "timeout_duration" => ?tso_timeout, + "error" => ?err, + ); + // Getting the timestamp failed, so don't do gc. + break 'load_snapshot Some(0); + } + }; + + let safe_point = (|| { + fail::fail_point!("ime_safe_point_in_loading", |t| { + t.unwrap().parse::().unwrap() + }); + + let safe_point = now + .physical() + .saturating_sub(gc_run_interval.as_millis() as u64); + TimeStamp::compose(safe_point, 0).into_inner() + })(); + + let mut filter = Filter::new( + safe_point, + u64::MAX, + skiplist_engine.cf_handle(CF_DEFAULT), + skiplist_engine.cf_handle(CF_WRITE), + ); + filter.filter_keys_in_region(&region); + + Some(safe_point) + }; + + if let Some(safe_point) = safe_point { + if core.on_snapshot_load_finished(&region, &delete_range_scheduler, safe_point) { + let duration = start.saturating_elapsed(); + IN_MEMORY_ENGINE_LOAD_TIME_HISTOGRAM.observe(duration.as_secs_f64()); + info!( + "ime loading region finished"; + "region" => ?region, + "duration(sec)" => ?duration, + ); + } else { + info!("ime loading region canceled"; "region" => ?region); + } + } else { + info!( + "ime snapshot load failed"; + "region" => ?region, + ); + core.on_snapshot_load_failed(&region, &delete_range_scheduler, true); + } + } +} + +impl Runnable for BackgroundRunner { + type Task = BackgroundTask; + + fn run(&mut self, task: Self::Task) { + match task { + BackgroundTask::SetRocksEngine(rocks_engine) => { + self.rocks_engine = Some(rocks_engine); + fail::fail_point!("ime_set_rocks_engine"); + } + BackgroundTask::Gc(t) => { + let seqno = (|| { + fail::fail_point!("ime_gc_oldest_seqno", |t| { + Some(t.unwrap().parse::().unwrap()) + }); + + let Some(ref rocks_engine) = self.rocks_engine else { + return None; + }; + let latest_seqno = rocks_engine.get_latest_sequence_number(); + Some( + rocks_engine + .get_oldest_snapshot_sequence_number() + .unwrap_or(latest_seqno), + ) + })(); + + let Some(seqno) = seqno else { + return; + }; + + info!( + "ime start a new round of gc"; + "safe_point" => t.safe_point, + "oldest_sequence" => seqno, + ); + let core = self.core.clone(); + let regions = core.regions_for_gc(); + if !regions.is_empty() { + let f = async move { + let mut metrics = FilterMetrics::default(); + for region in &regions { + let m = core.gc_region(region, t.safe_point, seqno); + metrics.merge(&m); + } + core.on_gc_finished(); + metrics.flush(); + fail::fail_point!("in_memory_engine_gc_finish"); + }; + self.gc_region_remote.spawn(f); + } else { + core.on_gc_finished(); + } + } + BackgroundTask::LoadRegion(region, snapshot) => { + let core = self.core.clone(); + let delete_range_scheduler = self.delete_range_scheduler.clone(); + let pd_client = self.pd_client.clone(); + let gc_run_interval
= self.config.value().gc_run_interval.0; + let f = async move { + Self::do_load_region( + region, + snapshot, + core, + delete_range_scheduler, + pd_client, + gc_run_interval, + ); + }; + self.region_load_remote.spawn(f); + } + BackgroundTask::MemoryCheckAndEvict => { + let mem_usage_before_check = self.core.memory_controller.mem_usage(); + info!( + "ime start memory usage check and evict"; + "mem_usage(MB)" => ReadableSize(mem_usage_before_check as u64).as_mb() + ); + if mem_usage_before_check > self.core.memory_controller.evict_threshold() { + let delete_range_scheduler = self.delete_range_scheduler.clone(); + let core = self.core.clone(); + let task = async move { + if let Some(region_stats_manager) = &core.region_stats_manager { + let cached_region_ids = core + .engine + .region_manager + .regions_map + .read() + .cached_regions(); + + let evict_fn = |evict_region: &CacheRegion, + evict_reason: EvictReason, + cb: Option>| + -> Vec { + core.engine.region_manager.evict_region( + evict_region, + evict_reason, + cb, + ) + }; + + region_stats_manager + .evict_on_evict_threshold_reached( + evict_fn, + &delete_range_scheduler, + cached_region_ids, + &core.memory_controller, + ) + .await; + } + core.memory_controller.set_memory_checking(false); + let mem_usage = core.memory_controller.mem_usage(); + info!( + "ime memory usage check and evict completes"; + "mem_usage(MB)" => ReadableSize(mem_usage as u64).as_mb(), + "mem_usage_before_check(MB)" => ReadableSize(mem_usage_before_check as u64).as_mb() + ); + }; + self.load_evict_remote.spawn(task); + } else { + self.core.memory_controller.set_memory_checking(false); + } + } + // DeleteRange task is executed by `DeleteRangeRunner` with a different scheduler so + // that the task will not be scheduled to here. 
+ BackgroundTask::DeleteRegions(_) => unreachable!(), + BackgroundTask::TopRegionsLoadEvict => { + let delete_range_scheduler = self.delete_range_scheduler.clone(); + let core = self.core.clone(); + let task = + async move { core.top_regions_load_evict(&delete_range_scheduler).await }; + self.load_evict_remote.spawn(task); + } + BackgroundTask::CleanLockTombstone(snapshot_seqno) => { + if snapshot_seqno < self.last_seqno { + return; + } + self.last_seqno = snapshot_seqno; + let core = self.core.clone(); + + let f = async move { + info!( + "ime begin to cleanup tombstones in lock cf"; + "seqno" => snapshot_seqno, + ); + + let mut last_user_key = vec![]; + let mut remove_rest = false; + let mut cached_to_remove: Option> = None; + + let mut removed = 0; + let mut total = 0; + let now = Instant::now(); + let lock_handle = core.engine.engine().cf_handle("lock"); + let guard = &epoch::pin(); + let mut iter = lock_handle.iterator(); + iter.seek_to_first(guard); + while iter.valid() { + total += 1; + let InternalKey { + user_key, + v_type, + sequence, + } = decode_key(iter.key().as_bytes()); + if user_key != last_user_key { + if let Some(remove) = cached_to_remove.take() { + removed += 1; + lock_handle.remove(&InternalBytes::from_vec(remove), guard); + } + last_user_key = user_key.to_vec(); + if sequence >= snapshot_seqno { + remove_rest = false; + } else { + remove_rest = true; + if v_type == ValueType::Deletion { + cached_to_remove = Some(iter.key().as_bytes().to_vec()); + } + } + } else if remove_rest { + assert!(sequence < snapshot_seqno); + removed += 1; + lock_handle.remove(iter.key(), guard); + } else if sequence < snapshot_seqno { + remove_rest = true; + if v_type == ValueType::Deletion { + assert!(cached_to_remove.is_none()); + cached_to_remove = Some(iter.key().as_bytes().to_vec()); + } + } + + iter.next(guard); + } + if let Some(remove) = cached_to_remove.take() { + removed += 1; + lock_handle.remove(&InternalBytes::from_vec(remove), guard); + } + + info!( + "ime cleanup tombstones in lock cf"; + "seqno" => snapshot_seqno, + "total" => total, + "removed" => removed, + "duration" => ?now.saturating_elapsed(), + "current_count" => lock_handle.len(), + ); + + fail::fail_point!("ime_clean_lock_tombstone_done"); + }; + + self.lock_cleanup_remote.spawn(f); + } + BackgroundTask::TurnOnCrossCheck(( + engine, + rocks_engine, + pd_client, + check_interval, + get_tikv_safe_point, + )) => { + let cross_check_worker = Worker::new("cross-check-worker"); + let cross_check_runner = CrossChecker::new( + pd_client, + engine, + rocks_engine, + check_interval, + get_tikv_safe_point, + ); + let _ = + cross_check_worker.start_with_timer("cross-check-runner", cross_check_runner); + self.cross_check_worker = Some(cross_check_worker); + } + BackgroundTask::CheckLoadPendingRegions(s) => { + if let Some(router) = &self.raft_casual_router + && let Some(e) = &self.rocks_engine + { + let pending_regions: Vec<_> = self + .core + .engine + .region_manager() + .regions_map() + .read() + .regions() + .values() + .filter_map(|meta| { + if meta.get_state() == RegionState::Pending { + Some(meta.get_region().id) + } else { + None + } + }) + .collect(); + + for region_id in pending_regions { + let scheduler = s.clone(); + let rocks_engine = e.clone(); + let ime_engine = self.core.engine.clone(); + if let Err(e) = router.send( + region_id, + CasualMessage::InMemoryEngineLoadRegion { + region_id, + trigger_load_cb: Box::new(move |r| { + let cache_region = CacheRegion::from_region(r); + _ = ime_engine.prepare_for_apply( + 
&cache_region, + Some(&rocks_engine), + &scheduler, + false, + r.is_in_flashback, + ); + }), + }, + ) { + warn!("ime send load pending cache region msg failed"; "err" => ?e); + } + } + } + } + } + } +} + +impl RunnableWithTimer for BackgroundRunner { + fn on_timeout(&mut self) { + let mem_usage = self.core.memory_controller.mem_usage(); + IN_MEMORY_ENGINE_MEMORY_USAGE.set(mem_usage as i64); + + let mut count_by_state = [0; RegionState::COUNT]; + let mut oldest_safe_point = u64::MAX; + let mut newest_safe_point = u64::MIN; + { + let regions_map = self.core.engine.region_manager().regions_map.read(); + for m in regions_map.regions().values() { + count_by_state[m.get_state() as usize] += 1; + if m.get_state() == RegionState::Active && m.safe_point() != 0 { + oldest_safe_point = u64::min(oldest_safe_point, m.safe_point()); + newest_safe_point = u64::max(newest_safe_point, m.safe_point()); + } + } + } + + if oldest_safe_point != u64::MAX { + IN_MEMORY_ENGINE_OLDEST_SAFE_POINT.set(oldest_safe_point as i64); + IN_MEMORY_ENGINE_NEWEST_SAFE_POINT.set(newest_safe_point as i64); + if let Ok(Ok(tikv_safe_point)) = + block_on_timeout(self.pd_client.get_gc_safe_point(), Duration::from_secs(5)) + { + if tikv_safe_point > oldest_safe_point { + warn!( + "ime oldest auto gc safe point is older than tikv's auto gc safe point"; + "tikv_safe_point" => tikv_safe_point, + "ime_oldest_safe_point" => oldest_safe_point, + ); + } + + let gap = + TimeStamp::new(oldest_safe_point.saturating_sub(tikv_safe_point)).physical(); + // If the gap is too large (more than a year), it means the tikv safe point is + // not initialized, so we do not update the metrics now. + if gap < Duration::from_secs(365 * 24 * 3600).as_millis() as u64 { + SAFE_POINT_GAP.set(oldest_safe_point as i64 - tikv_safe_point as i64); + } + } + } + + for (i, count) in count_by_state.into_iter().enumerate() { + let state = RegionState::from_usize(i); + IN_MEMORY_ENGINE_CACHE_COUNT + .with_label_values(&[state.as_str()]) + .set(count); + } + } + + fn get_interval(&self) -> Duration { + Duration::from_secs(10) + } +} + +pub struct DeleteRangeRunner { + engine: Arc, + // It is possible that when `DeleteRangeRunner` begins to delete a range, the range is being + // written by apply threads. In that case, we have to delay the delete range task to avoid a race + // condition between them. Periodically, these delayed ranges will be checked to see if they are + // ready to be deleted.
+ delay_regions: Vec, +} + +impl DeleteRangeRunner { + fn new(engine: Arc) -> Self { + Self { + engine, + delay_regions: vec![], + } + } + + fn delete_regions(&mut self, regions: &[CacheRegion]) { + let skiplist_engine = self.engine.engine(); + for r in regions { + skiplist_engine.delete_range(r); + } + self.engine.region_manager().on_delete_regions(regions); + + fail::fail_point!("ime_delete_range_done"); + + #[cfg(test)] + flush_epoch(); + } +} + +impl Runnable for DeleteRangeRunner { + type Task = BackgroundTask; + fn run(&mut self, task: Self::Task) { + match task { + BackgroundTask::DeleteRegions(regions) => { + fail::fail_point!("ime_on_delete_range"); + let (mut regions_to_delay, regions_to_delete) = { + let region_manager = self.engine.region_manager(); + let regions_map = region_manager.regions_map.read(); + let mut regions_to_delay = vec![]; + let mut regions_to_delete = vec![]; + for r in regions { + let region_meta = regions_map.region_meta(r.id).unwrap(); + assert_eq!(region_meta.get_region().epoch_version, r.epoch_version); + assert_eq!(region_meta.get_state(), RegionState::Evicting); + // If the region is currently being written into, its deletion has to + // be delayed. See the comment on `delay_regions`. + if region_meta.is_in_gc() || region_meta.is_written() { + regions_to_delay.push(r); + } else { + regions_to_delete.push(r); + } + } + (regions_to_delay, regions_to_delete) + }; + self.delay_regions.append(&mut regions_to_delay); + if !regions_to_delete.is_empty() { + self.delete_regions(&regions_to_delete); + } + } + _ => unreachable!(), + } + } +} + +impl RunnableWithTimer for DeleteRangeRunner { + fn on_timeout(&mut self) { + if self.delay_regions.is_empty() { + return; + } + let regions = std::mem::take(&mut self.delay_regions); + self.run(BackgroundTask::DeleteRegions(regions)); + } + + fn get_interval(&self) -> Duration { + Duration::from_millis(500) + } +} + +#[derive(Default)] +struct FilterMetrics { + total: usize, + versions: usize, + delete_versions: usize, + filtered: usize, + unique_key: usize, + mvcc_rollback_and_locks: usize, +} + +impl FilterMetrics { + fn merge(&mut self, other: &FilterMetrics) { + self.total += other.total; + self.versions += other.versions; + self.delete_versions += other.delete_versions; + self.filtered += other.filtered; + self.unique_key += other.unique_key; + self.mvcc_rollback_and_locks += other.mvcc_rollback_and_locks; + } + + fn flush(&self) { + IN_MEMORY_ENGINE_GC_FILTERED_STATIC + .total + .inc_by(self.total as u64); + IN_MEMORY_ENGINE_GC_FILTERED_STATIC + .below_safe_point_total + .inc_by(self.versions as u64); + IN_MEMORY_ENGINE_GC_FILTERED_STATIC + .filtered + .inc_by(self.filtered as u64); + IN_MEMORY_ENGINE_GC_FILTERED_STATIC + .below_safe_point_unique + .inc_by(self.unique_key as u64); + } +} + +struct Filter { + safe_point: u64, + oldest_seqno: u64, + mvcc_key_prefix: Vec, + remove_older: bool, + + default_cf_handle: SkiplistHandle, + write_cf_handle: SkiplistHandle, + + // When deleting some keys, the latest one should be deleted last to avoid the older + // version appearing.
+ cached_mvcc_delete_key: Option<Vec<u8>>, + cached_skiplist_delete_key: Option<Vec<u8>>, + + metrics: FilterMetrics, + + last_user_key: Vec<u8>, +} + +impl Drop for Filter { + fn drop(&mut self) { + if let Some(cached_delete_key) = self.cached_mvcc_delete_key.take() { + let guard = &epoch::pin(); + self.write_cf_handle + .remove(&InternalBytes::from_vec(cached_delete_key), guard); + } + if let Some(cached_delete_key) = self.cached_skiplist_delete_key.take() { + let guard = &epoch::pin(); + self.write_cf_handle + .remove(&InternalBytes::from_vec(cached_delete_key), guard); + } + } +} + +impl Filter { + fn new( + safe_point: u64, + oldest_seqno: u64, + default_cf_handle: SkiplistHandle, + write_cf_handle: SkiplistHandle, + ) -> Self { + Self { + safe_point, + oldest_seqno, + default_cf_handle, + write_cf_handle, + mvcc_key_prefix: vec![], + cached_mvcc_delete_key: None, + cached_skiplist_delete_key: None, + remove_older: false, + metrics: FilterMetrics::default(), + last_user_key: vec![], + } + } + + fn filter_keys_in_region(&mut self, region: &CacheRegion) { + let mut iter = self.write_cf_handle.iterator(); + let guard = &epoch::pin(); + let (start_key, end_key) = encode_key_for_boundary_with_mvcc(region); + iter.seek(&start_key, guard); + while iter.valid() && iter.key() < &end_key { + let k = iter.key(); + let v = iter.value(); + if let Err(e) = self.filter_key(k.as_bytes(), v.as_bytes()) { + warn!( + "ime something went wrong in GC"; + "error" => ?e, + ); + } + iter.next(guard); + } + } + + fn filter_key(&mut self, key: &Bytes, value: &Bytes) -> Result<(), String> { + self.metrics.total += 1; + let InternalKey { + user_key, + v_type, + sequence, + } = decode_key(key); + + if sequence > self.oldest_seqno { + // skip keys that may still be read by some snapshots + return Ok(()); + } + + let (mvcc_key_prefix, commit_ts) = split_ts(user_key)?; + if commit_ts > self.safe_point { + return Ok(()); + } + + // Just like what the rocksdb compaction filter does, we do not handle internal + // keys (representing different MVCC versions of the same user key) that have + // been marked as tombstones. However, these keys need to be deleted. Since they + // are below the safe point, we can safely delete them directly now. + // For each user key, we cache the first ValueType::Deletion and delete all the + // older internal keys of the same user key. The cached ValueType::Deletion is + // deleted last to avoid making these older keys visible. + if v_type == ValueType::Deletion { + if let Some(cache_skiplist_delete_key) = self.cached_skiplist_delete_key.take() { + self.metrics.filtered += 1; + // Reaching here in two cases: + // 1. There are two ValueType::Deletion in the same user key. + // 2. Two consecutive ValueType::Deletion of different user keys. + // In either case, we can delete the previous one directly. + let guard = &epoch::pin(); + self.write_cf_handle + .remove(&InternalBytes::from_vec(cache_skiplist_delete_key), guard) + } + self.cached_skiplist_delete_key = Some(key.to_vec()); + return Ok(()); + } else if let Some(ref cache_skiplist_delete_key) = self.cached_skiplist_delete_key { + let InternalKey { + user_key: cache_skiplist_delete_user_key, + ..
+ } = decode_key(cache_skiplist_delete_key); + let guard = &epoch::pin(); + if cache_skiplist_delete_user_key == user_key { + self.metrics.filtered += 1; + self.write_cf_handle + .remove(&InternalBytes::from_bytes(key.clone()), guard); + return Ok(()); + } else { + self.metrics.filtered += 1; + self.write_cf_handle.remove( + &InternalBytes::from_vec(self.cached_skiplist_delete_key.take().unwrap()), + guard, + ) + } + } + + let guard = &epoch::pin(); + // Also, we only handle the same user_key once (user_key here refers to the key + // with MVCC version but without sequence number). + if user_key != self.last_user_key { + self.last_user_key = user_key.to_vec(); + } else { + self.metrics.filtered += 1; + self.write_cf_handle + .remove(&InternalBytes::from_bytes(key.clone()), guard); + return Ok(()); + } + + self.metrics.versions += 1; + if self.mvcc_key_prefix != mvcc_key_prefix { + self.metrics.unique_key += 1; + self.mvcc_key_prefix.clear(); + self.mvcc_key_prefix.extend_from_slice(mvcc_key_prefix); + self.remove_older = false; + if let Some(cached_delete_key) = self.cached_mvcc_delete_key.take() { + self.metrics.filtered += 1; + self.write_cf_handle + .remove(&InternalBytes::from_vec(cached_delete_key), guard); + } + } + + let mut filtered = self.remove_older; + let write = parse_write(value)?; + if !self.remove_older { + match write.write_type { + WriteType::Rollback | WriteType::Lock => { + self.metrics.mvcc_rollback_and_locks += 1; + filtered = true; + } + WriteType::Put => self.remove_older = true, + WriteType::Delete => { + self.metrics.delete_versions += 1; + self.remove_older = true; + + // The first mvcc version below the safe point is an mvcc delete. We delay + // its removal until all following versions of the same user key have been + // deleted, to avoid exposing an older version. + self.cached_mvcc_delete_key = Some(key.to_vec()); + } + } + } + + if !filtered { + return Ok(()); + } + self.metrics.filtered += 1; + self.write_cf_handle + .remove(&InternalBytes::from_bytes(key.clone()), guard); + self.handle_filtered_write(write, guard); + + Ok(()) + } + + fn handle_filtered_write(&mut self, write: WriteRef<'_>, guard: &epoch::Guard) { + if write.short_value.is_none() && write.write_type == WriteType::Put { + // todo(SpadeA): We don't know the sequence number of the key in the skiplist, + // so we cannot delete it directly. Instead, we encode a key with the MAX + // sequence number so we can find the mvcc key (with its sequence number) in + // the skiplist by using get_with_key, and delete it with the resulting key. + // This involves more than one seek (both get and remove involve a seek). Maybe + // we can provide an API to delete the mvcc keys with all sequence numbers.
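+ // Seeking with the MAX sequence number positions the iterator at the first + // internal key of this user key; the loop below then removes every entry + // sharing that user key.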
+ let default_key = encoding_for_filter(&self.mvcc_key_prefix, write.start_ts); + let mut iter = self.default_cf_handle.iterator(); + iter.seek(&default_key, guard); + while iter.valid() && iter.key().same_user_key_with(&default_key) { + self.default_cf_handle.remove(iter.key(), guard); + iter.next(guard); + } + } + } +} + +#[cfg(test)] +pub mod tests { + use std::{ + sync::{ + mpsc::{channel, Sender}, + Arc, Mutex, + }, + time::Duration, + }; + + use crossbeam::epoch; + use engine_rocks::util::new_engine; + use engine_traits::{ + CacheRegion, IterOptions, Iterable, Iterator, RegionCacheEngine, RegionCacheEngineExt, + RegionEvent, SyncMutable, CF_DEFAULT, CF_LOCK, CF_WRITE, DATA_CFS, + }; + use futures::future::ready; + use keys::{data_key, DATA_MAX_KEY, DATA_MIN_KEY}; + use kvproto::metapb::Region; + use online_config::{ConfigChange, ConfigManager, ConfigValue}; + use pd_client::PdClient; + use tempfile::Builder; + use tikv_util::{ + config::{ReadableDuration, ReadableSize, VersionTrack}, + worker::dummy_scheduler, + }; + use txn_types::{Key, TimeStamp, Write, WriteType}; + + use super::*; + use crate::{ + background::BackgroundRunner, + config::InMemoryEngineConfigManager, + engine::{SkiplistEngine, SkiplistHandle}, + keys::{ + construct_key, construct_region_key, construct_value, encode_key, encode_seek_key, + encoding_for_filter, InternalBytes, ValueType, + }, + memory_controller::MemoryController, + region_label::{ + region_label_meta_client, + tests::{add_region_label_rule, new_region_label_rule, new_test_server_and_client}, + }, + region_manager::RegionState::*, + test_util::{new_region, put_data, put_data_with_overwrite}, + write_batch::RegionCacheWriteBatchEntry, + InMemoryEngineConfig, InMemoryEngineContext, RegionCacheMemoryEngine, + }; + + fn delete_data( + key: &[u8], + ts: u64, + seq_num: u64, + write_cf: &SkiplistHandle, + mem_controller: Arc<MemoryController>, + ) { + let key = data_key(key); + let raw_write_k = Key::from_raw(&key) + .append_ts(TimeStamp::new(ts)) + .into_encoded(); + let mut write_k = encode_key(&raw_write_k, seq_num, ValueType::Value); + write_k.set_memory_controller(mem_controller.clone()); + let write_v = Write::new(WriteType::Delete, TimeStamp::new(ts), None); + let mut val = InternalBytes::from_vec(write_v.as_ref().to_bytes()); + val.set_memory_controller(mem_controller.clone()); + let guard = &epoch::pin(); + let _ = mem_controller.acquire(RegionCacheWriteBatchEntry::calc_put_entry_size( + &raw_write_k, + val.as_bytes(), + )); + write_cf.insert(write_k, val, guard); + } + + fn rollback_data( + key: &[u8], + ts: u64, + seq_num: u64, + write_cf: &SkiplistHandle, + mem_controller: Arc<MemoryController>, + ) { + let key = data_key(key); + let raw_write_k = Key::from_raw(&key) + .append_ts(TimeStamp::new(ts)) + .into_encoded(); + let mut write_k = encode_key(&raw_write_k, seq_num, ValueType::Value); + write_k.set_memory_controller(mem_controller.clone()); + let write_v = Write::new(WriteType::Rollback, TimeStamp::new(ts), None); + let mut val = InternalBytes::from_vec(write_v.as_ref().to_bytes()); + val.set_memory_controller(mem_controller.clone()); + let guard = &epoch::pin(); + let _ = mem_controller.acquire(RegionCacheWriteBatchEntry::calc_put_entry_size( + &raw_write_k, + val.as_bytes(), + )); + write_cf.insert(write_k, val, guard); + } + + fn element_count(sklist: &SkiplistHandle) -> u64 { + let guard = &epoch::pin(); + let mut count = 0; + let mut iter = sklist.iterator(); + iter.seek_to_first(guard); + while iter.valid() { + count += 1; + iter.next(guard); + } + count + } + + // We
should not use skiplist.get directly as we only care about keys without + // sequence number suffix + fn key_exist(sl: &SkiplistHandle, key: &InternalBytes, guard: &epoch::Guard) -> bool { + let mut iter = sl.iterator(); + iter.seek(key, guard); + if iter.valid() && iter.key().same_user_key_with(key) { + return true; + } + false + } + + // We should not use skiplist.get directly as we only care about keys without + // sequence number suffix + fn get_value( + sl: &SkiplistHandle, + key: &InternalBytes, + guard: &epoch::Guard, + ) -> Option<Vec<u8>> { + let mut iter = sl.iterator(); + iter.seek(key, guard); + if iter.valid() && iter.key().same_user_key_with(key) { + return Some(iter.value().as_slice().to_vec()); + } + None + } + + fn dummy_controller(skip_engine: SkiplistEngine) -> Arc<MemoryController> { + let mut config = InMemoryEngineConfig::config_for_test(); + config.evict_threshold = Some(ReadableSize(u64::MAX)); + config.capacity = Some(ReadableSize(u64::MAX)); + let config = Arc::new(VersionTrack::new(config)); + Arc::new(MemoryController::new(config, skip_engine)) + } + + fn encode_raw_key_for_filter(key: &[u8], ts: TimeStamp) -> InternalBytes { + let key = data_key(key); + let key = Key::from_raw(&key); + encoding_for_filter(key.as_encoded(), ts) + } + + struct MockPdClient {} + impl PdClient for MockPdClient { + fn get_tso(&self) -> pd_client::PdFuture<TimeStamp> { + Box::pin(ready(Ok(TimeStamp::compose(TimeStamp::physical_now(), 0)))) + } + } + + #[test] + fn test_filter() { + let skiplist_engine = SkiplistEngine::new(); + let write = skiplist_engine.cf_handle(CF_WRITE); + let default = skiplist_engine.cf_handle(CF_DEFAULT); + + let memory_controller = dummy_controller(skiplist_engine.clone()); + + put_data( + b"key1", + b"value1", + 10, + 15, + 10, + false, + &default, + &write, + memory_controller.clone(), + ); + put_data( + b"key2", + b"value21", + 10, + 15, + 12, + false, + &default, + &write, + memory_controller.clone(), + ); + put_data( + b"key2", + b"value22", + 20, + 25, + 14, + false, + &default, + &write, + memory_controller.clone(), + ); + // mock repeated apply + put_data( + b"key2", + b"value22", + 20, + 25, + 15, + false, + &default, + &write, + memory_controller.clone(), + ); + put_data( + b"key2", + b"value23", + 30, + 35, + 16, + false, + &default, + &write, + memory_controller.clone(), + ); + put_data( + b"key3", + b"value31", + 20, + 25, + 18, + false, + &default, + &write, + memory_controller.clone(), + ); + put_data( + b"key3", + b"value32", + 30, + 35, + 20, + false, + &default, + &write, + memory_controller.clone(), + ); + delete_data(b"key3", 40, 22, &write, memory_controller.clone()); + assert_eq!(7, element_count(&default)); + assert_eq!(8, element_count(&write)); + + let mut filter = Filter::new(50, 100, default.clone(), write.clone()); + let mut count = 0; + let mut iter = write.iterator(); + let guard = &epoch::pin(); + iter.seek_to_first(guard); + while iter.valid() { + let k = iter.key(); + let v = iter.value(); + filter.filter_key(k.as_bytes(), v.as_bytes()).unwrap(); + count += 1; + iter.next(guard); + } + assert_eq!(count, 8); + assert_eq!(5, filter.metrics.filtered); + drop(filter); + + assert_eq!(2, element_count(&write)); + assert_eq!(2, element_count(&default)); + + let key = encode_raw_key_for_filter(b"key1", TimeStamp::new(15)); + assert!(key_exist(&write, &key, guard)); + + let key = encode_raw_key_for_filter(b"key2", TimeStamp::new(35)); + assert!(key_exist(&write, &key, guard)); + + let key = encode_raw_key_for_filter(b"key3", TimeStamp::new(35)); + assert!(!key_exist(&write, &key,
guard)); + + let key = encode_raw_key_for_filter(b"key1", TimeStamp::new(10)); + assert!(key_exist(&default, &key, guard)); + + let key = encode_raw_key_for_filter(b"key2", TimeStamp::new(30)); + assert!(key_exist(&default, &key, guard)); + + let key = encode_raw_key_for_filter(b"key3", TimeStamp::new(30)); + assert!(!key_exist(&default, &key, guard)); + } + + #[test] + fn test_filter_with_delete() { + let engine = RegionCacheMemoryEngine::new(InMemoryEngineContext::new_for_tests(Arc::new( + VersionTrack::new(InMemoryEngineConfig::config_for_test()), + ))); + let memory_controller = engine.memory_controller(); + let region = new_region(1, b"", b"z"); + let cache_region = CacheRegion::from_region(&region); + engine.new_region(region.clone()); + + let skiplist_engine = engine.core.engine(); + let write = skiplist_engine.cf_handle(CF_WRITE); + let default = skiplist_engine.cf_handle(CF_DEFAULT); + + put_data( + b"key1", + b"value11", + 10, + 15, + 10, + false, + &default, + &write, + memory_controller.clone(), + ); + + // Delete the above key + let guard = &epoch::pin(); + let raw_write_k = Key::from_raw(&data_key(b"key1")) + .append_ts(TimeStamp::new(15)) + .into_encoded(); + let mut write_k = encode_key(&raw_write_k, 15, ValueType::Deletion); + write_k.set_memory_controller(memory_controller.clone()); + let mut val = InternalBytes::from_vec(b"".to_vec()); + val.set_memory_controller(memory_controller.clone()); + write.insert(write_k, val, guard); + + put_data( + b"key2", + b"value22", + 20, + 25, + 14, + false, + &default, + &write, + memory_controller.clone(), + ); + + // Delete the above key + let raw_write_k = Key::from_raw(&data_key(b"key2")) + .append_ts(TimeStamp::new(25)) + .into_encoded(); + let mut write_k = encode_key(&raw_write_k, 15, ValueType::Deletion); + write_k.set_memory_controller(memory_controller.clone()); + let mut val = InternalBytes::from_vec(b"".to_vec()); + val.set_memory_controller(memory_controller.clone()); + write.insert(write_k, val, guard); + + put_data( + b"key2", + b"value23", + 30, + 35, + 16, + false, + &default, + &write, + memory_controller.clone(), + ); + delete_data(b"key2", 40, 18, &write, memory_controller.clone()); + + let snap = engine + .snapshot(cache_region.clone(), u64::MAX, u64::MAX) + .unwrap(); + let mut iter_opts = IterOptions::default(); + iter_opts.set_lower_bound(&cache_region.start, 0); + iter_opts.set_upper_bound(&cache_region.end, 0); + + let config = Arc::new(VersionTrack::new(InMemoryEngineConfig::config_for_test())); + let (worker, _) = BackgroundRunner::new( + engine.core.clone(), + memory_controller.clone(), + None, + config, + Arc::new(MockPdClient {}), + None, + ); + worker.core.gc_region(&cache_region, 40, 100); + + let mut iter = snap.iterator_opt("write", iter_opts).unwrap(); + iter.seek_to_first().unwrap(); + assert!(!iter.valid().unwrap()); + + let mut iter = write.iterator(); + iter.seek_to_first(guard); + assert!(!iter.valid()); + } + + #[test] + fn test_gc() { + let engine = RegionCacheMemoryEngine::new(InMemoryEngineContext::new_for_tests(Arc::new( + VersionTrack::new(InMemoryEngineConfig::config_for_test()), + ))); + let memory_controller = engine.memory_controller(); + let region = new_region(1, b"", b"z"); + engine.new_region(region.clone()); + + let skiplist_engine = engine.core.engine(); + let write = skiplist_engine.cf_handle(CF_WRITE); + let default = skiplist_engine.cf_handle(CF_DEFAULT); + + let encode_key = |key, ts| { + let data_key = data_key(key); + let key = Key::from_raw(&data_key); +
encoding_for_filter(key.as_encoded(), ts) + }; + + put_data( + b"key1", + b"value1", + 10, + 11, + 10, + false, + &default, + &write, + memory_controller.clone(), + ); + put_data( + b"key1", + b"value2", + 12, + 13, + 12, + false, + &default, + &write, + memory_controller.clone(), + ); + put_data( + b"key1", + b"value3", + 14, + 15, + 14, + false, + &default, + &write, + memory_controller.clone(), + ); + assert_eq!(3, element_count(&default)); + assert_eq!(3, element_count(&write)); + + let config = Arc::new(VersionTrack::new(InMemoryEngineConfig::config_for_test())); + let (worker, _) = BackgroundRunner::new( + engine.core.clone(), + memory_controller.clone(), + None, + config, + Arc::new(MockPdClient {}), + None, + ); + + let cache_region = CacheRegion::from_region(&region); + // gc should not handle keys with a larger seqno than the oldest seqno + worker.core.gc_region(&cache_region, 13, 10); + assert_eq!(3, element_count(&default)); + assert_eq!(3, element_count(&write)); + + // gc will not remove the latest mvcc put below the safe point + worker.core.gc_region(&cache_region, 14, 100); + assert_eq!(2, element_count(&default)); + assert_eq!(2, element_count(&write)); + + worker.core.gc_region(&cache_region, 16, 100); + assert_eq!(1, element_count(&default)); + assert_eq!(1, element_count(&write)); + + // rollback will not make the first older version be filtered + rollback_data(b"key1", 17, 16, &write, memory_controller.clone()); + worker.core.gc_region(&cache_region, 17, 100); + assert_eq!(1, element_count(&default)); + assert_eq!(1, element_count(&write)); + let key = encode_key(b"key1", TimeStamp::new(15)); + let guard = &epoch::pin(); + assert!(key_exist(&write, &key, guard)); + let key = encode_key(b"key1", TimeStamp::new(14)); + assert!(key_exist(&default, &key, guard)); + + // unlike in WriteCompactionFilter, the latest mvcc delete below the safe point + // will be filtered + delete_data(b"key1", 19, 18, &write, memory_controller.clone()); + worker.core.gc_region(&cache_region, 19, 100); + assert_eq!(0, element_count(&write)); + assert_eq!(0, element_count(&default)); + } + + // The GC of one region should not impact other regions + #[test] + fn test_gc_one_region() { + let config = InMemoryEngineConfig::config_for_test(); + let engine = RegionCacheMemoryEngine::new(InMemoryEngineContext::new_for_tests(Arc::new( + VersionTrack::new(config), + ))); + let memory_controller = engine.memory_controller(); + let (write, default, region1, region2) = { + let region1 = CacheRegion::new(1, 0, b"zk00", b"zk10"); + engine.core.region_manager().new_region(region1.clone()); + + let region2 = CacheRegion::new(2, 0, b"zk30", b"zk40"); + engine.core.region_manager().new_region(region2.clone()); + + let engine = engine.core.engine(); + ( + engine.cf_handle(CF_WRITE), + engine.cf_handle(CF_DEFAULT), + region1, + region2, + ) + }; + + put_data( + b"k05", + b"val1", + 10, + 11, + 10, + false, + &default, + &write, + memory_controller.clone(), + ); + + put_data( + b"k05", + b"val2", + 12, + 13, + 14, + false, + &default, + &write, + memory_controller.clone(), + ); + + put_data( + b"k05", + b"val1", + 14, + 15, + 18, + false, + &default, + &write, + memory_controller.clone(), + ); + + put_data( + b"k35", + b"val1", + 10, + 11, + 12, + false, + &default, + &write, + memory_controller.clone(), + ); + + put_data( + b"k35", + b"val2", + 12, + 13, + 16, + false, + &default, + &write, + memory_controller.clone(), + ); + + put_data( + b"k35", + b"val1", + 14, + 15, + 20, + false, + &default, + &write, +
memory_controller.clone(), + ); + + let encode_key = |key, commit_ts, seq_num| -> InternalBytes { + let data_key = data_key(key); + let raw_write_k = Key::from_raw(&data_key) + .append_ts(TimeStamp::new(commit_ts)) + .into_encoded(); + encode_key(&raw_write_k, seq_num, ValueType::Value) + }; + + let verify = |key, mvcc, seq, handle: &SkiplistHandle| { + let guard = &epoch::pin(); + let key = encode_key(key, mvcc, seq); + let mut iter = handle.iterator(); + iter.seek(&key, guard); + assert_eq!(iter.key(), &key); + iter.next(guard); + assert!(!iter.valid() || !iter.key().same_user_key_with(&key)); + }; + + assert_eq!(6, element_count(&default)); + assert_eq!(6, element_count(&write)); + + let config = Arc::new(VersionTrack::new(InMemoryEngineConfig::config_for_test())); + let (worker, _) = BackgroundRunner::new( + engine.core.clone(), + memory_controller.clone(), + None, + config, + Arc::new(MockPdClient {}), + None, + ); + let filter = worker.core.gc_region(&region1, 100, 100); + assert_eq!(2, filter.filtered); + + verify(b"k05", 15, 18, &write); + verify(b"k05", 14, 19, &default); + + assert_eq!(4, element_count(&default)); + assert_eq!(4, element_count(&write)); + + let config = Arc::new(VersionTrack::new(InMemoryEngineConfig::config_for_test())); + let (worker, _) = BackgroundRunner::new( + engine.core.clone(), + memory_controller.clone(), + None, + config, + Arc::new(MockPdClient {}), + None, + ); + worker.core.gc_region(&region2, 100, 100); + assert_eq!(2, filter.filtered); + + verify(b"k35", 15, 20, &write); + verify(b"k35", 14, 21, &default); + + assert_eq!(2, element_count(&default)); + assert_eq!(2, element_count(&write)); + } + + // Test the case where the target region is split after the scan has picked it + // as needing gc and before the gc task actually starts. + #[test] + fn test_gc_after_region_split() { + let config = InMemoryEngineConfig::config_for_test(); + let engine = RegionCacheMemoryEngine::new(InMemoryEngineContext::new_for_tests(Arc::new( + VersionTrack::new(config), + ))); + let memory_controller = engine.memory_controller(); + let (write, default, region) = { + let region = CacheRegion::new(1, 0, b"zk00", b"zk20"); + engine.core.region_manager().new_region(region.clone()); + + let engine = engine.core.engine(); + ( + engine.cf_handle(CF_WRITE), + engine.cf_handle(CF_DEFAULT), + region, + ) + }; + + let test_data = [ + (b"k05".as_slice(), b"val1".as_slice(), 10, 11, 10), + (b"k05", b"val2", 12, 13, 14), + (b"k05", b"val1", 14, 15, 18), + (b"k15", b"val1", 10, 11, 10), + (b"k15", b"val2", 12, 13, 14), + (b"k15", b"val1", 14, 15, 18), + ]; + + for d in test_data { + put_data( + d.0, + d.1, + d.2, + d.3, + d.4, + false, + &default, + &write, + memory_controller.clone(), + ); + } + + assert_eq!(6, element_count(&default)); + assert_eq!(6, element_count(&write)); + + let config = Arc::new(VersionTrack::new(InMemoryEngineConfig::config_for_test())); + let (worker, _) = BackgroundRunner::new( + engine.core.clone(), + memory_controller.clone(), + None, + config, + Arc::new(MockPdClient {}), + None, + ); + + // triggers region split. + let new_regions = vec![ + CacheRegion::new(1, 2, "zk00", "zk10"), + CacheRegion::new(2, 2, "zk10", "zk20"), + ]; + engine.on_region_event(RegionEvent::Split { + source: region.clone(), + new_regions, + }); + + // still use the original region to do gc, which should only gc the region with + // the same id.
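+ // After the split, the region with id 1 only covers [zk00, zk10), so only k05's + // two stale versions are filtered (hence the assertion of 2 below).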
+ let filter = worker.core.gc_region(&region, 100, 100); + assert_eq!(2, filter.filtered); + } + + #[test] + fn test_gc_for_overwrite_write() { + let engine = RegionCacheMemoryEngine::new(InMemoryEngineContext::new_for_tests(Arc::new( + VersionTrack::new(InMemoryEngineConfig::config_for_test()), + ))); + let memory_controller = engine.memory_controller(); + let region = new_region(1, b"", b"z"); + engine.new_region(region.clone()); + let skiplist_engine = engine.core.engine(); + let write = skiplist_engine.cf_handle(CF_WRITE); + let default = skiplist_engine.cf_handle(CF_DEFAULT); + + put_data_with_overwrite( + b"key1", + b"value1", + 10, + 11, + 100, + 101, + false, + &default, + &write, + memory_controller.clone(), + ); + + assert_eq!(1, element_count(&default)); + assert_eq!(2, element_count(&write)); + + let config = Arc::new(VersionTrack::new(InMemoryEngineConfig::config_for_test())); + let (worker, _) = BackgroundRunner::new( + engine.core.clone(), + memory_controller.clone(), + None, + config, + Arc::new(MockPdClient {}), + None, + ); + + let filter = worker + .core + .gc_region(&CacheRegion::from_region(&region), 20, 200); + assert_eq!(1, filter.filtered); + assert_eq!(1, element_count(&default)); + assert_eq!(1, element_count(&write)); + } + + #[test] + fn test_snapshot_block_gc() { + let engine = RegionCacheMemoryEngine::new(InMemoryEngineContext::new_for_tests(Arc::new( + VersionTrack::new(InMemoryEngineConfig::config_for_test()), + ))); + let memory_controller = engine.memory_controller(); + let region = new_region(1, b"", b"z"); + engine.new_region(region.clone()); + let skiplist_engine = engine.core.engine(); + let write = skiplist_engine.cf_handle(CF_WRITE); + let default = skiplist_engine.cf_handle(CF_DEFAULT); + + put_data( + b"key1", + b"value1", + 10, + 11, + 10, + false, + &default, + &write, + memory_controller.clone(), + ); + put_data( + b"key2", + b"value21", + 10, + 11, + 12, + false, + &default, + &write, + memory_controller.clone(), + ); + put_data( + b"key2", + b"value22", + 15, + 16, + 14, + false, + &default, + &write, + memory_controller.clone(), + ); + put_data( + b"key2", + b"value23", + 20, + 21, + 16, + false, + &default, + &write, + memory_controller.clone(), + ); + put_data( + b"key3", + b"value31", + 5, + 6, + 18, + false, + &default, + &write, + memory_controller.clone(), + ); + put_data( + b"key3", + b"value32", + 10, + 11, + 20, + false, + &default, + &write, + memory_controller.clone(), + ); + assert_eq!(6, element_count(&default)); + assert_eq!(6, element_count(&write)); + + let config = Arc::new(VersionTrack::new(InMemoryEngineConfig::config_for_test())); + let (worker, _) = BackgroundRunner::new( + engine.core.clone(), + memory_controller, + None, + config, + Arc::new(MockPdClient {}), + None, + ); + let cache_region = CacheRegion::from_region(&region); + let s1 = engine.snapshot(cache_region.clone(), 10, u64::MAX); + let s2 = engine.snapshot(cache_region.clone(), 11, u64::MAX); + let s3 = engine.snapshot(cache_region.clone(), 20, u64::MAX); + + // nothing will be removed as all old versions are still visible to some snapshot + let filter = worker.core.gc_region(&cache_region, 30, 100); + assert_eq!(0, filter.filtered); + assert_eq!(6, element_count(&default)); + assert_eq!(6, element_count(&write)); + + drop(s1); + let filter = worker.core.gc_region(&cache_region, 30, 100); + assert_eq!(1, filter.filtered); + assert_eq!(5, element_count(&default)); + assert_eq!(5, element_count(&write)); + + drop(s2); + let filter = worker.core.gc_region(&cache_region, 30, 100); + assert_eq!(1,
filter.filtered); + assert_eq!(4, element_count(&default)); + assert_eq!(4, element_count(&write)); + + drop(s3); + let filter = worker.core.gc_region(&cache_region, 30, 100); + assert_eq!(1, filter.filtered); + assert_eq!(3, element_count(&default)); + assert_eq!(3, element_count(&write)); + } + + #[test] + fn test_gc_region_contained_in_historical_region() { + let engine = RegionCacheMemoryEngine::new(InMemoryEngineContext::new_for_tests(Arc::new( + VersionTrack::new(InMemoryEngineConfig::config_for_test()), + ))); + let memory_controller = engine.memory_controller(); + let region = new_region(1, b"", b"z"); + engine.new_region(region.clone()); + let skiplist_engine = engine.core.engine(); + let write = skiplist_engine.cf_handle(CF_WRITE); + let default = skiplist_engine.cf_handle(CF_DEFAULT); + + put_data( + b"key1", + b"value1", + 9, + 10, + 10, + false, + &default, + &write, + memory_controller.clone(), + ); + put_data( + b"key1", + b"value2", + 11, + 12, + 11, + false, + &default, + &write, + memory_controller.clone(), + ); + put_data( + b"key1", + b"value3", + 30, + 31, + 20, + false, + &default, + &write, + memory_controller.clone(), + ); + + put_data( + b"key9", + b"value4", + 13, + 14, + 12, + false, + &default, + &write, + memory_controller.clone(), + ); + put_data( + b"key9", + b"value5", + 14, + 15, + 13, + false, + &default, + &write, + memory_controller.clone(), + ); + put_data( + b"key9", + b"value6", + 30, + 31, + 21, + false, + &default, + &write, + memory_controller.clone(), + ); + + let cache_region = CacheRegion::from_region(&region); + let snap1 = engine.snapshot(cache_region.clone(), 20, 1000).unwrap(); + let snap2 = engine.snapshot(cache_region.clone(), 22, 1000).unwrap(); + let _snap3 = engine.snapshot(cache_region.clone(), 60, 1000).unwrap(); + + let new_regions = vec![ + CacheRegion::new(1, 1, "z", "zkey5"), + CacheRegion::new(2, 1, "zkey5", "zkey8"), + CacheRegion::new(3, 1, "zkey8", cache_region.end.clone()), + ]; + let region2 = new_regions[1].clone(); + engine.on_region_event(RegionEvent::Split { + source: cache_region.clone(), + new_regions, + }); + assert_eq!( + engine + .core + .region_manager() + .regions_map + .read() + .regions() + .len(), + 3 + ); + + engine.evict_region(&region2, EvictReason::AutoEvict, None); + assert_eq!(6, element_count(&default)); + assert_eq!(6, element_count(&write)); + + let config = Arc::new(VersionTrack::new(InMemoryEngineConfig::config_for_test())); + let (worker, _) = BackgroundRunner::new( + engine.core.clone(), + memory_controller, + None, + config, + Arc::new(MockPdClient {}), + None, + ); + + let regions: Vec<_> = engine + .core + .region_manager() + .regions_map + .read() + .regions() + .values() + .filter_map(|m| { + if m.get_state() == RegionState::Active { + Some(m.get_region().clone()) + } else { + None + } + }) + .collect(); + assert_eq!(regions.len(), 2); + let mut filter = FilterMetrics::default(); + for r in &regions { + filter.merge(&worker.core.gc_region(r, 50, 1000)); + } + assert_eq!(2, filter.filtered); + assert_eq!(4, element_count(&default)); + assert_eq!(4, element_count(&write)); + + drop(snap1); + let mut filter = FilterMetrics::default(); + for r in &regions { + filter.merge(&worker.core.gc_region(r, 50, 1000)); + } + assert_eq!(0, filter.filtered); + assert_eq!(4, element_count(&default)); + assert_eq!(4, element_count(&write)); + + drop(snap2); + let mut filter = FilterMetrics::default(); + for r in &regions { + filter.merge(&worker.core.gc_region(r, 50, 1000)); + } + assert_eq!(2, filter.filtered); +
assert_eq!(2, element_count(&default)); + assert_eq!(2, element_count(&write)); + } + + #[test] + fn test_background_worker_load() { + let mut engine = RegionCacheMemoryEngine::new(InMemoryEngineContext::new_for_tests( + Arc::new(VersionTrack::new(InMemoryEngineConfig::config_for_test())), + )); + let path = Builder::new().prefix("test_load").tempdir().unwrap(); + let path_str = path.path().to_str().unwrap(); + let rocks_engine = new_engine(path_str, DATA_CFS).unwrap(); + engine.set_disk_engine(rocks_engine.clone()); + + for i in 10..20 { + let key = construct_key(i, 1); + let key = data_key(&key); + let value = construct_value(i, i); + rocks_engine + .put_cf(CF_DEFAULT, &key, value.as_bytes()) + .unwrap(); + rocks_engine + .put_cf(CF_WRITE, &key, value.as_bytes()) + .unwrap(); + } + + let k = format!("zk{:08}", 15).into_bytes(); + let region1 = CacheRegion::new(1, 0, DATA_MIN_KEY, k.clone()); + let region2 = CacheRegion::new(2, 0, k, DATA_MAX_KEY); + engine + .core + .region_manager() + .load_region(region1.clone()) + .unwrap(); + engine + .core + .region_manager() + .load_region(region2.clone()) + .unwrap(); + engine.prepare_for_apply(&region1, false); + engine.prepare_for_apply(&region2, false); + + // concurrent write to rocksdb, but the key will not be loaded in the memory + // engine + let key = construct_key(20, 1); + let key20 = data_key(&key); + let value = construct_value(20, 20); + rocks_engine + .put_cf(CF_DEFAULT, &key20, value.as_bytes()) + .unwrap(); + rocks_engine + .put_cf(CF_WRITE, &key20, value.as_bytes()) + .unwrap(); + + let skiplist_engine = engine.core.engine(); + let write = skiplist_engine.cf_handle(CF_WRITE); + let default = skiplist_engine.cf_handle(CF_DEFAULT); + + // wait for background load + std::thread::sleep(Duration::from_secs(1)); + + let _ = engine + .snapshot(region1.clone(), u64::MAX, u64::MAX) + .unwrap(); + let _ = engine + .snapshot(region2.clone(), u64::MAX, u64::MAX) + .unwrap(); + + let guard = &epoch::pin(); + for i in 10..20 { + let key = construct_key(i, 1); + let key = data_key(&key); + let value = construct_value(i, i); + let key = encode_seek_key(&key, u64::MAX); + assert_eq!( + get_value(&write, &key, guard).unwrap().as_slice(), + value.as_bytes() + ); + assert_eq!( + get_value(&default, &key, guard).unwrap().as_slice(), + value.as_bytes() + ); + } + + let key20 = encode_seek_key(&key20, u64::MAX); + assert!(!key_exist(&write, &key20, guard)); + assert!(!key_exist(&default, &key20, guard)); + } + + #[test] + fn test_regions_for_gc() { + let engine = RegionCacheMemoryEngine::new(InMemoryEngineContext::new_for_tests(Arc::new( + VersionTrack::new(InMemoryEngineConfig::config_for_test()), + ))); + let memory_controller = engine.memory_controller(); + let r1 = new_region(1, b"a", b"b"); + let r2 = new_region(2, b"b", b"c"); + engine.new_region(r1); + engine.new_region(r2); + + let config = Arc::new(VersionTrack::new(InMemoryEngineConfig::config_for_test())); + let (runner, _) = BackgroundRunner::new( + engine.core.clone(), + memory_controller, + None, + config, + Arc::new(MockPdClient {}), + None, + ); + let regions = runner.core.regions_for_gc(); + assert_eq!(2, regions.len()); + + // until the previous gc is finished, no regions will be returned + assert!(runner.core.regions_for_gc().is_empty()); + runner.core.on_gc_finished(); + + let regions = runner.core.regions_for_gc(); + assert_eq!(2, regions.len()); + } + + #[derive(Default)] + struct MockRegionInfoProvider { + regions: Mutex<Vec<Region>>, + } + + impl MockRegionInfoProvider { + fn add_region(&self,
region: Region) { + self.regions.lock().unwrap().push(region); + } + } + + impl RegionInfoProvider for MockRegionInfoProvider { + fn get_regions_in_range( + &self, + start: &[u8], + end: &[u8], + ) -> raftstore::coprocessor::Result<Vec<Region>> { + let regions: Vec<_> = self + .regions + .lock() + .unwrap() + .iter() + .filter(|r| { + (r.end_key.is_empty() || r.end_key.as_slice() > start) + && (end.is_empty() || end > r.start_key.as_slice()) + }) + .cloned() + .collect(); + Ok(regions) + } + } + + // Test creating and loading a cache hint using a region label rule: + // 1. Insert some data into the rocks engine, which is set as the disk engine for + // the memory engine. + // 2. Use the test pd client server to create a label rule for a portion of the + // data. + // 3. Wait until the data is loaded. + // 4. Verify that only the labeled key range has been loaded. + #[test] + fn test_load_from_pd_hint_service() { + let region_info_provider = Arc::new(MockRegionInfoProvider::default()); + + let mut engine = RegionCacheMemoryEngine::with_region_info_provider( + InMemoryEngineContext::new_for_tests(Arc::new(VersionTrack::new( + InMemoryEngineConfig::config_for_test(), + ))), + Some(region_info_provider.clone()), + None, + ); + let path = Builder::new() + .prefix("test_load_from_pd_hint_service") + .tempdir() + .unwrap(); + let path_str = path.path().to_str().unwrap(); + let rocks_engine = new_engine(path_str, DATA_CFS).unwrap(); + engine.set_disk_engine(rocks_engine.clone()); + + for i in 10..20 { + let key = construct_key(i, 1); + let value = construct_value(i, i); + rocks_engine + .put_cf(CF_DEFAULT, &key, value.as_bytes()) + .unwrap(); + rocks_engine + .put_cf(CF_WRITE, &key, value.as_bytes()) + .unwrap(); + } + let region = new_region(1, format!("k{:08}", 10), format!("k{:08}", 15)); + region_info_provider.add_region(region.clone()); + + let (mut pd_server, pd_client) = new_test_server_and_client(ReadableDuration::millis(100)); + let cluster_id = pd_client.get_cluster_id().unwrap(); + let pd_client = Arc::new(pd_client); + engine.start_hint_service(PdRangeHintService::from(pd_client.clone())); + let meta_client = region_label_meta_client(pd_client.clone()); + let label_rule = new_region_label_rule( + "cache/0", + &hex::encode(format!("k{:08}", 0).into_bytes()), + &hex::encode(format!("k{:08}", 20).into_bytes()), + ); + add_region_label_rule(meta_client, cluster_id, &label_rule); + + // Wait for the watch to fire. + test_util::eventually( + Duration::from_millis(10), + Duration::from_millis(200), + || { + !engine + .core + .region_manager() + .regions_map + .read() + .regions() + .is_empty() + }, + ); + let cache_region = CacheRegion::from_region(&region); + engine.prepare_for_apply(&cache_region, false); + + // Wait for the range to be loaded.
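+ // `eventually` polls the region map until region 1 becomes Active, i.e. the + // background load has finished.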
+ test_util::eventually( + Duration::from_millis(50), + Duration::from_millis(1000), + || { + let regions_map = engine.core.region_manager().regions_map.read(); + regions_map.region_meta(1).unwrap().get_state() == RegionState::Active + }, + ); + let _ = engine.snapshot(cache_region, u64::MAX, u64::MAX).unwrap(); + + let skiplist_engine = engine.core.engine(); + let write = skiplist_engine.cf_handle(CF_WRITE); + let default = skiplist_engine.cf_handle(CF_DEFAULT); + + let guard = &epoch::pin(); + for i in 10..15 { + let key = construct_key(i, 1); + let value = construct_value(i, i); + let key = encode_seek_key(&key, u64::MAX); + assert_eq!( + get_value(&write, &key, guard).unwrap().as_slice(), + value.as_bytes() + ); + assert_eq!( + get_value(&default, &key, guard).unwrap().as_slice(), + value.as_bytes() + ); + } + for i in 15..=20 { + let key = construct_key(i, 1); + let key = data_key(&key); + let key = encode_seek_key(&key, u64::MAX); + assert!(!key_exist(&write, &key, guard)); + assert!(!key_exist(&default, &key, guard)); + } + + pd_server.stop(); + } + + fn verify_load( + region: &Region, + engine: &RegionCacheMemoryEngine, + exist: bool, + expect_count: usize, + ) { + if exist { + let read_ts = TimeStamp::compose(TimeStamp::physical_now(), 0).into_inner(); + let snap = engine + .snapshot(CacheRegion::from_region(region), read_ts, u64::MAX) + .unwrap(); + let mut count = 0; + let range = CacheRegion::from_region(region); + for cf in DATA_CFS { + let mut iter = IterOptions::default(); + iter.set_lower_bound(&range.start, 0); + iter.set_upper_bound(&range.end, 0); + let mut iter = snap.iterator_opt(cf, iter).unwrap(); + let _ = iter.seek_to_first(); + while iter.valid().unwrap() { + let _ = iter.next(); + count += 1; + } + } + assert_eq!(count, expect_count); + } else { + engine + .snapshot(CacheRegion::from_region(region), 10, 10) + .unwrap_err(); + } + } + + #[test] + fn test_snapshot_load_reaching_stop_limit() { + let mut config = InMemoryEngineConfig::config_for_test(); + config.stop_load_threshold = Some(ReadableSize(500)); + config.evict_threshold = Some(ReadableSize(1000)); + config.capacity = Some(ReadableSize(1500)); + let config = Arc::new(VersionTrack::new(config)); + let mut engine = RegionCacheMemoryEngine::new(InMemoryEngineContext::new_for_tests(config)); + let path = Builder::new() + .prefix("test_snapshot_load_reaching_limit") + .tempdir() + .unwrap(); + let path_str = path.path().to_str().unwrap(); + let rocks_engine = new_engine(path_str, DATA_CFS).unwrap(); + engine.set_disk_engine(rocks_engine.clone()); + let mem_controller = engine.memory_controller(); + + let region1 = new_region(1, construct_region_key(1), construct_region_key(3)); + // Memory for one put is 17(key) + 3(val) + 8(Seqno) + 16(Memory controller in + // key and val) + 96(Node overhead) = 140 + let key = construct_key(1, 10); + rocks_engine.put_cf(CF_DEFAULT, &key, b"val").unwrap(); + rocks_engine.put_cf(CF_LOCK, &key, b"val").unwrap(); + rocks_engine.put_cf(CF_WRITE, &key, b"val").unwrap(); + + let key = construct_key(2, 10); + rocks_engine.put_cf(CF_DEFAULT, &key, b"val").unwrap(); + rocks_engine.put_cf(CF_LOCK, &key, b"val").unwrap(); + rocks_engine.put_cf(CF_WRITE, &key, b"val").unwrap(); + // After loading range1, the memory usage should be 140*6=840 + + let region2 = new_region(2, construct_region_key(3), construct_region_key(5)); + let key = construct_key(3, 10); + rocks_engine.put_cf(CF_DEFAULT, &key, b"val").unwrap(); + rocks_engine.put_cf(CF_LOCK, &key, b"val").unwrap(); + 
rocks_engine.put_cf(CF_WRITE, &key, b"val").unwrap(); + + for r in [&region1, &region2] { + let cache_region = CacheRegion::from_region(r); + engine.load_region(cache_region.clone()).unwrap(); + engine.prepare_for_apply(&cache_region, false); + } + + // ensure all ranges are finished + test_util::eventually(Duration::from_millis(100), Duration::from_secs(2), || { + !engine + .core + .region_manager() + .regions_map() + .read() + .regions() + .values() + .any(|m| matches!(m.get_state(), Pending | Loading)) + }); + + verify_load(&region1, &engine, true, 6); + verify_load(&region2, &engine, false, 0); + assert_eq!(mem_controller.mem_usage(), 846); + } + + #[test] + fn test_snapshot_load_reaching_capacity() { + let mut config = InMemoryEngineConfig::config_for_test(); + config.stop_load_threshold = Some(ReadableSize(1000)); + config.evict_threshold = Some(ReadableSize(1000)); + config.capacity = Some(ReadableSize(1500)); + let config = Arc::new(VersionTrack::new(config)); + let mut engine = RegionCacheMemoryEngine::new(InMemoryEngineContext::new_for_tests(config)); + let path = Builder::new() + .prefix("test_snapshot_load_reaching_limit") + .tempdir() + .unwrap(); + let path_str = path.path().to_str().unwrap(); + let rocks_engine = new_engine(path_str, DATA_CFS).unwrap(); + engine.set_disk_engine(rocks_engine.clone()); + let mem_controller = engine.memory_controller(); + + let region1 = new_region(1, construct_region_key(1), construct_region_key(3)); + // Memory for one put is 17(key) + 3(val) + 8(Seqno) + 16(Memory controller in + // key and val) + 96(Node overhead) = 140 + let key = construct_key(1, 10); + rocks_engine.put_cf(CF_DEFAULT, &key, b"val").unwrap(); + rocks_engine.put_cf(CF_LOCK, &key, b"val").unwrap(); + rocks_engine.put_cf(CF_WRITE, &key, b"val").unwrap(); + + let key = construct_key(2, 10); + rocks_engine.put_cf(CF_DEFAULT, &key, b"val").unwrap(); + rocks_engine.put_cf(CF_LOCK, &key, b"val").unwrap(); + rocks_engine.put_cf(CF_WRITE, &key, b"val").unwrap(); + // After loading range1, the memory usage should be 140*6=840 + + let region2 = new_region(2, construct_region_key(3), construct_region_key(5)); + let key = construct_key(3, 10); + rocks_engine.put_cf(CF_DEFAULT, &key, b"val").unwrap(); + rocks_engine.put_cf(CF_LOCK, &key, b"val").unwrap(); + rocks_engine.put_cf(CF_WRITE, &key, b"val").unwrap(); + + let key = construct_key(4, 10); + rocks_engine.put_cf(CF_DEFAULT, &key, b"val").unwrap(); + rocks_engine.put_cf(CF_LOCK, &key, b"val").unwrap(); + rocks_engine.put_cf(CF_WRITE, &key, b"val").unwrap(); + // 840*2 > capacity 1500, so the load will fail and the loaded keys should be + // removed + + let region3 = new_region(3, construct_region_key(5), construct_region_key(6)); + let key = construct_key(5, 10); + rocks_engine.put_cf(CF_DEFAULT, &key, b"val").unwrap(); + rocks_engine.put_cf(CF_LOCK, &key, b"val").unwrap(); + rocks_engine.put_cf(CF_WRITE, &key, b"val").unwrap(); + let key = construct_key(6, 10); + rocks_engine.put_cf(CF_DEFAULT, &key, b"val").unwrap(); + rocks_engine.put_cf(CF_LOCK, &key, b"val").unwrap(); + rocks_engine.put_cf(CF_WRITE, &key, b"val").unwrap(); + + for r in [&region1, &region2, &region3] { + let cache_region = CacheRegion::from_region(r); + engine.load_region(cache_region.clone()).unwrap(); + engine.prepare_for_apply(&cache_region, false); + } + + // ensure all ranges are finished + test_util::eventually(Duration::from_millis(100), Duration::from_secs(2), || { + let regions_map = engine.core.region_manager().regions_map.read(); + !regions_map + .regions() + .values() +
.any(|m| matches!(m.get_state(), Pending | Loading)) + }); + + verify_load(&region1, &engine, true, 6); + verify_load(&region2, &engine, false, 0); + verify_load(&region3, &engine, false, 3); + assert_eq!(mem_controller.mem_usage(), 1551); + } + + #[test] + fn test_memory_config_change() { + let mut config = InMemoryEngineConfig::config_for_test(); + config.evict_threshold = Some(ReadableSize(1000)); + config.capacity = Some(ReadableSize(1500)); + let config = Arc::new(VersionTrack::new(config)); + let mut engine = + RegionCacheMemoryEngine::new(InMemoryEngineContext::new_for_tests(config.clone())); + let path = Builder::new() + .prefix("test_snapshot_load_reaching_limit") + .tempdir() + .unwrap(); + let path_str = path.path().to_str().unwrap(); + let rocks_engine = new_engine(path_str, DATA_CFS).unwrap(); + engine.set_disk_engine(rocks_engine.clone()); + let mem_controller = engine.memory_controller(); + + let region1 = new_region(1, construct_region_key(1), construct_region_key(3)); + let cache_region1 = CacheRegion::from_region(&region1); + // Memory for one put is 17(key) + 3(val) + 8(Seqno) + 16(Memory controller in + // key and val) + 96(Node overhead) = 140 + let key = construct_key(1, 10); + rocks_engine.put_cf(CF_DEFAULT, &key, b"val").unwrap(); + rocks_engine.put_cf(CF_LOCK, &key, b"val").unwrap(); + rocks_engine.put_cf(CF_WRITE, &key, b"val").unwrap(); + + let key = construct_key(2, 10); + rocks_engine.put_cf(CF_DEFAULT, &key, b"val").unwrap(); + rocks_engine.put_cf(CF_LOCK, &key, b"val").unwrap(); + rocks_engine.put_cf(CF_WRITE, &key, b"val").unwrap(); + // After loading range1, the memory usage should be 140*6=840 + engine.load_region(cache_region1.clone()).unwrap(); + engine.prepare_for_apply(&cache_region1, false); + + let region2 = new_region(2, construct_region_key(3), construct_region_key(5)); + let key = construct_key(3, 10); + rocks_engine.put_cf(CF_DEFAULT, &key, b"val").unwrap(); + rocks_engine.put_cf(CF_LOCK, &key, b"val").unwrap(); + rocks_engine.put_cf(CF_WRITE, &key, b"val").unwrap(); + + let key = construct_key(4, 10); + rocks_engine.put_cf(CF_DEFAULT, &key, b"val").unwrap(); + rocks_engine.put_cf(CF_LOCK, &key, b"val").unwrap(); + rocks_engine.put_cf(CF_WRITE, &key, b"val").unwrap(); + // 840*2 > capacity 1500, so the load would fail and the loaded keys would be + // removed. However, we now change the memory quota to 2000, so region2 can be + // cached.
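+ // A minimal sketch of an online capacity change: build a `ConfigChange` carrying + // the new value and dispatch it through the config manager, which updates the + // shared `VersionTrack` config read by the memory controller.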
+ let mut config_manager = InMemoryEngineConfigManager(config.clone()); + let mut config_change = ConfigChange::new(); + config_change.insert(String::from("capacity"), ConfigValue::Size(2000)); + config_manager.dispatch(config_change).unwrap(); + assert_eq!(config.value().capacity(), 2000); + + let cache_region2 = CacheRegion::from_region(&region2); + engine.load_region(cache_region2.clone()).unwrap(); + engine.prepare_for_apply(&cache_region2, false); + + // ensure all ranges are finished + test_util::eventually(Duration::from_millis(100), Duration::from_secs(2), || { + let regions_map = engine.core.region_manager().regions_map.read(); + !regions_map + .regions() + .values() + .any(|m| matches!(m.get_state(), Pending | Loading)) + }); + + let verify = |r: &Region, exist, expect_count| { + if exist { + let read_ts = TimeStamp::compose(TimeStamp::physical_now(), 0).into_inner(); + let snap = engine + .snapshot(CacheRegion::from_region(r), read_ts, u64::MAX) + .unwrap(); + let mut count = 0; + let range = CacheRegion::from_region(r); + for cf in DATA_CFS { + let mut iter = IterOptions::default(); + iter.set_lower_bound(&range.start, 0); + iter.set_upper_bound(&range.end, 0); + let mut iter = snap.iterator_opt(cf, iter).unwrap(); + let _ = iter.seek_to_first(); + while iter.valid().unwrap() { + let _ = iter.next(); + count += 1; + } + } + assert_eq!(count, expect_count); + } else { + engine + .snapshot(CacheRegion::from_region(r), 10, 10) + .unwrap_err(); + } + }; + verify(&region1, true, 6); + verify(&region2, true, 6); + assert_eq!(mem_controller.mem_usage(), 1692); + } + + #[test] + fn test_gc_use_pd_tso() { + struct MockPdClient { + tx: Mutex<Sender<()>>, + } + impl PdClient for MockPdClient { + fn get_tso(&self) -> pd_client::PdFuture<TimeStamp> { + self.tx.lock().unwrap().send(()).unwrap(); + Box::pin(ready(Ok(TimeStamp::compose(TimeStamp::physical_now(), 0)))) + } + } + + let mut config = InMemoryEngineConfig::config_for_test(); + config.gc_run_interval = ReadableDuration(Duration::from_millis(100)); + config.load_evict_interval = ReadableDuration(Duration::from_millis(200)); + let config = Arc::new(VersionTrack::new(config)); + let start_time = TimeStamp::compose(TimeStamp::physical_now(), 0); + let (tx, pd_client_rx) = channel(); + let pd_client = Arc::new(MockPdClient { tx: Mutex::new(tx) }); + let (scheduler, mut rx) = dummy_scheduler(); + let (ticker, stop) = BgWorkManager::start_tick(scheduler, pd_client, config.clone()); + + let Some(BackgroundTask::Gc(GcTask { safe_point })) = rx + .recv_timeout(10 * config.value().gc_run_interval.0) + .unwrap() + else { + panic!("must be a GcTask"); + }; + let safe_point = TimeStamp::from(safe_point); + // Make sure it is a reasonable timestamp. + assert!(safe_point >= start_time, "{safe_point}, {start_time}"); + let now = TimeStamp::compose(TimeStamp::physical_now(), 0); + assert!(safe_point < now, "{safe_point}, {now}"); + // Must get ts from PD.
+ pd_client_rx.try_recv().unwrap(); + + stop.send(true).unwrap(); + ticker.stop(); + } +} diff --git a/components/in_memory_engine/src/config.rs b/components/in_memory_engine/src/config.rs new file mode 100644 index 00000000000..e0a56abfab2 --- /dev/null +++ b/components/in_memory_engine/src/config.rs @@ -0,0 +1,210 @@ +use std::{error::Error, sync::Arc, time::Duration}; + +use online_config::{ConfigChange, ConfigManager, OnlineConfig}; +use serde::{Deserialize, Serialize}; +use tikv_util::{ + config::{ReadableDuration, ReadableSize, VersionTrack}, + info, +}; + +const DEFAULT_GC_RUN_INTERVAL: Duration = Duration::from_secs(180); +// The minimum interval for GC runs is 10 seconds. A shorter interval is not +// meaningful because the GC process is CPU intensive and may not complete in +// 10 seconds. +const MIN_GC_RUN_INTERVAL: Duration = Duration::from_secs(10); +// The maximum interval for GC runs is 10 minutes, which equals the minimum +// value of TiDB GC lifetime. +const MAX_GC_RUN_INTERVAL: Duration = Duration::from_secs(600); + +#[derive(Clone, Serialize, Deserialize, Debug, PartialEq, OnlineConfig)] +#[serde(default, rename_all = "kebab-case")] +pub struct InMemoryEngineConfig { + /// Determines whether to enable the in memory engine feature. + pub enable: bool, + /// The maximum memory usage of the engine. + pub capacity: Option<ReadableSize>, + /// When memory usage reaches this amount, we start to pick some regions to + /// evict. + pub evict_threshold: Option<ReadableSize>, + /// When memory usage reaches this amount, we stop loading regions. + // TODO(SpadeA): ultimately we only expose one memory limit to the user. + // When memory usage reaches this amount, no further load will be + // performed. + pub stop_load_threshold: Option<ReadableSize>, + /// Determines the oldest timestamp (approximately, now - gc_run_interval) + /// of the read request the in memory engine can serve. + pub gc_run_interval: ReadableDuration, + pub load_evict_interval: ReadableDuration, + /// Used when getting top regions, to filter out those with low mvcc + /// amplification. Here, we define mvcc amplification as + /// '(next + prev) / processed_keys'. + pub mvcc_amplification_threshold: usize, + /// Cross check is only for test usage and should not be turned on in a + /// production environment. Interval 0 means it is turned off, which is + /// the default value. + #[online_config(skip)] + pub cross_check_interval: ReadableDuration, + + // It's always set to the region split size and should not be modified manually. + #[online_config(skip)] + #[serde(skip)] + #[doc(hidden)] + pub expected_region_size: ReadableSize, +} + +impl Default for InMemoryEngineConfig { + fn default() -> Self { + Self { + enable: false, + gc_run_interval: ReadableDuration(DEFAULT_GC_RUN_INTERVAL), + stop_load_threshold: None, + // Each load/evict operation should run within five minutes.
+ load_evict_interval: ReadableDuration(Duration::from_secs(300)), + evict_threshold: None, + capacity: None, + mvcc_amplification_threshold: 100, + cross_check_interval: ReadableDuration(Duration::from_secs(0)), + expected_region_size: raftstore::coprocessor::config::SPLIT_SIZE, + } + } +} + +impl InMemoryEngineConfig { + pub fn validate(&mut self) -> Result<(), Box<dyn Error>> { + if !self.enable { + return Ok(()); + } + + if self.evict_threshold.is_none() || self.capacity.is_none() { + return Err("evict-threshold or capacity not set".into()); + } + + if self.stop_load_threshold.is_none() { + self.stop_load_threshold = self.evict_threshold; + } + + if self.stop_load_threshold.as_ref().unwrap() > self.evict_threshold.as_ref().unwrap() { + return Err(format!( + "stop-load-threshold {:?} is larger than evict-threshold {:?}", + self.stop_load_threshold.as_ref().unwrap(), + self.evict_threshold.as_ref().unwrap() + ) + .into()); + } + + if self.evict_threshold.as_ref().unwrap() >= self.capacity.as_ref().unwrap() { + return Err(format!( + "evict-threshold {:?} is larger than or equal to capacity {:?}", + self.evict_threshold.as_ref().unwrap(), + self.capacity.as_ref().unwrap() + ) + .into()); + } + + // The GC interval should be in the range + // [MIN_GC_RUN_INTERVAL, MAX_GC_RUN_INTERVAL]. + if self.gc_run_interval.0 < MIN_GC_RUN_INTERVAL + || self.gc_run_interval.0 > MAX_GC_RUN_INTERVAL + { + return Err(format!( + "gc-run-interval {:?} should be in the range [{:?}, {:?}]", + self.gc_run_interval, MIN_GC_RUN_INTERVAL, MAX_GC_RUN_INTERVAL + ) + .into()); + } + + Ok(()) + } + + pub fn stop_load_threshold(&self) -> usize { + self.stop_load_threshold.map_or(0, |r| r.0 as usize) + } + + pub fn evict_threshold(&self) -> usize { + self.evict_threshold.map_or(0, |r| r.0 as usize) + } + + pub fn capacity(&self) -> usize { + self.capacity.map_or(0, |r| r.0 as usize) + } + + pub fn config_for_test() -> InMemoryEngineConfig { + InMemoryEngineConfig { + enable: true, + gc_run_interval: ReadableDuration(Duration::from_secs(180)), + load_evict_interval: ReadableDuration(Duration::from_secs(300)), + stop_load_threshold: Some(ReadableSize::gb(1)), + evict_threshold: Some(ReadableSize::gb(1)), + capacity: Some(ReadableSize::gb(2)), + expected_region_size: ReadableSize::mb(20), + mvcc_amplification_threshold: 10, + cross_check_interval: ReadableDuration(Duration::from_secs(0)), + } + } +} + +#[derive(Clone)] +pub struct InMemoryEngineConfigManager(pub Arc<VersionTrack<InMemoryEngineConfig>>); + +impl InMemoryEngineConfigManager { + pub fn new(config: Arc<VersionTrack<InMemoryEngineConfig>>) -> Self { + Self(config) + } +} + +impl ConfigManager for InMemoryEngineConfigManager { + fn dispatch( + &mut self, + change: ConfigChange, + ) -> std::result::Result<(), Box<dyn Error>> { + { + let change = change.clone(); + self.0 + .update(move |cfg: &mut InMemoryEngineConfig| cfg.update(change))?; + } + info!("ime config changed"; "change" => ?change); + Ok(()) + } +} + +impl std::ops::Deref for InMemoryEngineConfigManager { + type Target = Arc<VersionTrack<InMemoryEngineConfig>>; + + fn deref(&self) -> &Self::Target { + &self.0 + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_validate() { + let mut cfg = InMemoryEngineConfig::default(); + cfg.validate().unwrap(); + + cfg.enable = true; + assert!(cfg.validate().is_err()); + + cfg.capacity = Some(ReadableSize::gb(2)); + cfg.evict_threshold = Some(ReadableSize::gb(1)); + cfg.stop_load_threshold = Some(ReadableSize::gb(1)); + cfg.validate().unwrap(); + + // Error if less than MIN_GC_RUN_INTERVAL.
+ cfg.gc_run_interval = ReadableDuration(Duration::ZERO); + assert!(cfg.validate().is_err()); + cfg.gc_run_interval = ReadableDuration(Duration::from_secs(9)); + assert!(cfg.validate().is_err()); + + // Error if larger than MAX_GC_RUN_INTERVAL. + cfg.gc_run_interval = ReadableDuration(Duration::from_secs(601)); + assert!(cfg.validate().is_err()); + cfg.gc_run_interval = ReadableDuration(Duration::MAX); + assert!(cfg.validate().is_err()); + + cfg.gc_run_interval = ReadableDuration(Duration::from_secs(180)); + cfg.validate().unwrap(); + } +} diff --git a/components/in_memory_engine/src/cross_check.rs b/components/in_memory_engine/src/cross_check.rs new file mode 100644 index 00000000000..36165af1a77 --- /dev/null +++ b/components/in_memory_engine/src/cross_check.rs @@ -0,0 +1,1654 @@ +// Copyright 2024 TiKV Project Authors. Licensed under Apache-2.0. + +use std::{fmt::Display, sync::Arc, time::Duration}; + +use engine_rocks::{RocksEngine, RocksEngineIterator, RocksSnapshot}; +use engine_traits::{ + iter_option, CacheRegion, Iterable, Iterator, KvEngine, Peekable, RegionCacheEngine, + SnapshotMiscExt, CF_LOCK, CF_WRITE, +}; +use pd_client::PdClient; +use slog_global::{error, info, warn}; +use tikv_util::{ + future::block_on_timeout, + time::Instant, + worker::{Runnable, RunnableWithTimer}, +}; +use txn_types::{Key, TimeStamp, WriteRef, WriteType}; + +use crate::{ + background::{parse_write, split_ts}, + read::{RegionCacheIterator, RegionCacheSnapshot}, + RegionCacheMemoryEngine, RegionState, +}; + +// Reasons why the cross check stops. +#[derive(Debug)] +pub enum StopReason { + RegionMetaChanged, + KeyGcInRocksDB, + TiKVSafepointGetFailed, +} + +type Result<T> = std::result::Result<T, StopReason>; + +#[derive(Debug)] +pub(crate) enum CrossCheckTask { + CrossCheck, +} + +impl Display for CrossCheckTask { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + CrossCheckTask::CrossCheck => f.debug_struct("CrossCheck").finish(), + } + } +} + +// Checks the data consistency in the mvcc snapshot semantics between the +// in-memory engine and rocksdb. It compares keys one by one in the in-memory +// engine with keys in rocksdb, and for those keys that are missing or +// redundant in the in-memory engine, checks their validity.
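+// Any mismatch that cannot be explained by a pending region state change or by +// RocksDB GC is treated as a data inconsistency and triggers a panic.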
+// Checks the data consistency in the mvcc snapshot semantics between the
+// in-memory engine and rocksdb. It compares keys one by one in the in-memory
+// engine with keys in rocksdb, and for keys that are missing or redundant in
+// the in-memory engine, checks their validity.
+pub(crate) struct CrossChecker {
+    pd_client: Arc<dyn PdClient>,
+    memory_engine: RegionCacheMemoryEngine,
+    rocks_engine: RocksEngine,
+    interval: Duration,
+    get_tikv_safe_point: Box<dyn Fn() -> Option<u64> + Send>,
+}
+
+impl CrossChecker {
+    pub(crate) fn new(
+        pd_client: Arc<dyn PdClient>,
+        memory_engine: RegionCacheMemoryEngine,
+        rocks_engine: RocksEngine,
+        interval: Duration,
+        get_tikv_safe_point: Box<dyn Fn() -> Option<u64> + Send>,
+    ) -> CrossChecker {
+        CrossChecker {
+            pd_client,
+            memory_engine,
+            rocks_engine,
+            interval,
+            get_tikv_safe_point,
+        }
+    }
+
+    fn cross_check_region(
+        &self,
+        region_snap: &RegionCacheSnapshot,
+        rocks_snap: &RocksSnapshot,
+    ) -> Result<()> {
+        info!(
+            "ime cross check region";
+            "region" => ?region_snap.snapshot_meta().region,
+        );
+        let opts = iter_option(
+            &region_snap.snapshot_meta().region.start,
+            &region_snap.snapshot_meta().region.end,
+            false,
+        );
+        let mut safe_point = {
+            let region_maps = self
+                .memory_engine
+                .core()
+                .region_manager()
+                .regions_map()
+                .read();
+            let Some(s) = region_maps.region_meta(region_snap.snapshot_meta().region.id) else {
+                return Ok(());
+            };
+            s.safe_point()
+        };
+
+        for cf in &[CF_LOCK, CF_WRITE] {
+            let mut mem_iter = region_snap.iterator_opt(cf, opts.clone()).unwrap();
+            let mut disk_iter = rocks_snap.iterator_opt(cf, opts.clone()).unwrap();
+
+            let mem_valid = mem_iter.seek_to_first().unwrap();
+            let disk_valid = disk_iter.seek_to_first().unwrap();
+            if !mem_valid {
+                // There's no key in IME, so we should check that the rocks engine has no key
+                // visible to the user with read_ts `safe_point`.
+                let mut last_disk_user_key = vec![];
+                let mut last_disk_user_key_delete = false;
+                let mut prev_key_info = KeyCheckingInfo::default();
+                CrossChecker::check_remain_disk_key(
+                    cf,
+                    &region_snap.snapshot_meta().region,
+                    &mut safe_point,
+                    &mem_iter,
+                    &mut disk_iter,
+                    &mut prev_key_info,
+                    &mut last_disk_user_key,
+                    &mut last_disk_user_key_delete,
+                    &self.memory_engine,
+                )?;
+                continue;
+            }
+
+            if !disk_valid {
+                if *cf != CF_LOCK {
+                    let Some(tikv_safe_point) = (self.get_tikv_safe_point)() else {
+                        return Err(StopReason::TiKVSafepointGetFailed);
+                    };
+                    // The keys in rocksdb may have been gced, so keys in ime should have mvcc
+                    // versions less than or equal to the safe point of tikv.
+ loop { + let (_, mvcc) = split_ts(mem_iter.key()).unwrap(); + if mvcc > tikv_safe_point { + break; + } + mem_iter.next().unwrap(); + if !mem_iter.valid().unwrap() { + return Err(StopReason::KeyGcInRocksDB); + } + } + } + panic!( + "ime cross check fail(key should not exist): {:?} cf not match when seek_to_first; + cache_region={:?}; cache_key={:?}; sequence_numer={};", + cf, + region_snap.snapshot_meta().region, + log_wrappers::Value(mem_iter.key()), + mem_iter.sequence_number, + ); + } + + let check_default = |iter: &RegionCacheIterator| { + let write = WriteRef::parse(iter.value()).unwrap(); + if write.write_type == WriteType::Put && write.short_value.is_none() { + let start_ts = write.start_ts; + let (user_key, _) = split_ts(iter.key()).unwrap(); + let default_key = Key::from_encoded(user_key.to_vec()).append_ts(start_ts); + if let Ok(Some(_)) = region_snap.get_value(default_key.as_encoded()) { + } else { + // check again + if let Ok(Some(_)) = region_snap.get_value_cf(CF_WRITE, iter.key()) { + panic!( + "ime cross check fail(key should exist): default not found; + cache_region={:?}; default_key={:?}, write_key={:?}, start_ts={}; sequence_numer={};", + region_snap.snapshot_meta().region, + log_wrappers::Value(default_key.as_encoded()), + log_wrappers::Value(iter.key()), + start_ts, + iter.sequence_number, + ); + } + } + } + }; + + let mut last_disk_user_key = vec![]; + // We can have this state: + // Safe point: 6 + // IME: [k2-7] + // Rocks: k1-5-delete, [k1-3], k2-7 + // where k1-5-delete and k1-3 are filtered which is legal as k1-5 is a delete + // type. At some time, rocksdb iterator points to k1-3 while IME iterator points + // to k2-7 and we need last_disk_user_key_delete being true to verify the + // legality. + let mut last_disk_user_key_delete = false; + + let mut cur_key_info = KeyCheckingInfo { + user_key: vec![], + // Used to record mvcc versions of same user keys. So if safe point changed, we + // can found the last_mvcc_before_safe_point_of_cur_user_key and + // last_mvcc_before_safe_point_of_last_user_key + mvcc_recordings: vec![], + // We can have intermediate state: + // Safe point: 6 + // IME: k1-7, k1-5, k1-2 + // Rocks: k1-7, k1-5, k1-3, k1-2 + // where k1-3 is GCed but k1-2 is not. It's safe because safe point is 6 and we + // have k1-5 so both k1-3 and k1-2 are not visible. + // So we record last_mvcc_before_safe_point_of_cur_user_key = 5 and we reject + // any version of this user key with mvcc between 5 and safe point 6. + last_mvcc_version_before_safe_point: 0, + }; + + let mut prev_key_info = KeyCheckingInfo { + user_key: vec![], + mvcc_recordings: vec![], + // We can have this sate: + // Safe point: 6 + // IME: k1-7, k1-5, [k2-7] + // Rocks: k1-7, k1-5, [k1-3], k1-2, k2-7 + // where k1-3 and k1-2 are filtered which is valid. At some time, rocksdb + // iterator points to k1-3 and IME iterator points to k2-7. We need + // to record last_mvcc_before_safe_point_of_last_user_key = 5 and + // reject any version of user key k1 (which is the last user key of + // IME) with mvcc between 5 and 6. 
+ last_mvcc_version_before_safe_point: 0, + }; + + if *cf == CF_WRITE { + let write = match parse_write(mem_iter.value()) { + Ok(write) => write, + Err(e) => { + panic!( + "ime cross check fail(parse error); + cache_region={:?}; cache_key={:?}, cache_val={:?}; sequence_numer={}; Error={:?}", + region_snap.snapshot_meta().region, + log_wrappers::Value(mem_iter.key()), + log_wrappers::Value(mem_iter.value()), + mem_iter.sequence_number, + e, + ); + } + }; + let (user_key, ts) = split_ts(mem_iter.key()).unwrap(); + + if write.write_type != WriteType::Lock && write.write_type != WriteType::Rollback { + cur_key_info.mvcc_recordings.push(ts); + } + + cur_key_info.user_key = user_key.to_vec(); + } + + CrossChecker::check_with_key_in_disk_iter( + cf, + &mem_iter, + &mut disk_iter, + false, + &mut safe_point, + &self.memory_engine, + ®ion_snap.snapshot_meta().region, + &mut prev_key_info, + &mut cur_key_info, + &mut last_disk_user_key_delete, + &mut last_disk_user_key, + &self.get_tikv_safe_point, + )?; + + if *cf == CF_WRITE { + check_default(&mem_iter); + } + + while mem_iter.next().unwrap() { + if *cf == CF_WRITE { + let (user_key, ts) = split_ts(mem_iter.key()).unwrap(); + let write = match parse_write(mem_iter.value()) { + Ok(write) => write, + Err(e) => { + panic!( + "ime cross check fail(parse error); + cache_region={:?}; cache_key={:?}, cache_val={:?}; sequence_numer={}; Error={:?}", + region_snap.snapshot_meta().region, + log_wrappers::Value(mem_iter.key()), + log_wrappers::Value(mem_iter.value()), + mem_iter.sequence_number, + e, + ); + } + }; + + if cur_key_info.user_key != user_key { + prev_key_info = cur_key_info; + cur_key_info = KeyCheckingInfo { + user_key: user_key.to_vec(), + mvcc_recordings: vec![], + last_mvcc_version_before_safe_point: 0, + }; + } + + if write.write_type != WriteType::Lock + && write.write_type != WriteType::Rollback + { + cur_key_info.mvcc_recordings.push(ts); + } + } + + CrossChecker::check_with_key_in_disk_iter( + cf, + &mem_iter, + &mut disk_iter, + true, + &mut safe_point, + &self.memory_engine, + ®ion_snap.snapshot_meta().region, + &mut prev_key_info, + &mut cur_key_info, + &mut last_disk_user_key_delete, + &mut last_disk_user_key, + &self.get_tikv_safe_point, + )?; + + if *cf == CF_WRITE { + check_default(&mem_iter); + } + } + prev_key_info = cur_key_info; + disk_iter.next().unwrap(); + CrossChecker::check_remain_disk_key( + cf, + ®ion_snap.snapshot_meta().region, + &mut safe_point, + &mem_iter, + &mut disk_iter, + &mut prev_key_info, + &mut last_disk_user_key, + &mut last_disk_user_key_delete, + &self.memory_engine, + )?; + } + + info!( + "ime cross check range done"; + "region" => ?region_snap.snapshot_meta().region, + ); + + Ok(()) + } + + // In-memory engine may have GCed some versions, so we should call next of + // disk_iter for some times to get aligned with mem_iter. + // After each call of disk_iter, we will check whether the key missed in the + // in-memory engine will not make it compromise data consistency. + // `next_first` denotes whether disk_iter should call next before comparison. 
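+    //
+    // Note on `next_first`: it is false only for the very first comparison,
+    // right after both iterators have been seeked to the region start; every
+    // later call (inside the `while mem_iter.next()` loop) passes true,
+    // because mem_iter has just advanced to a new key and disk_iter must be
+    // advanced at least once to catch up before the two are compared.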
+    #[allow(clippy::collapsible_if)]
+    fn check_with_key_in_disk_iter(
+        cf: &str,
+        mem_iter: &RegionCacheIterator,
+        disk_iter: &mut RocksEngineIterator,
+        next_first: bool,
+        safe_point: &mut u64,
+        engine: &RegionCacheMemoryEngine,
+        cached_region: &CacheRegion,
+        prev_key_info: &mut KeyCheckingInfo,
+        cur_key_info: &mut KeyCheckingInfo,
+        last_disk_user_key_delete: &mut bool,
+        last_disk_user_key: &mut Vec<u8>,
+        get_tikv_safe_point: &(dyn Fn() -> Option<u64> + Send),
+    ) -> Result<()> {
+        let read_ts = mem_iter.snapshot_read_ts;
+        let mem_key = mem_iter.key();
+        if next_first {
+            if !disk_iter.next().unwrap() {
+                if cf != CF_LOCK {
+                    let (_, mem_mvcc) = split_ts(mem_key).unwrap();
+                    // The keys in rocksdb may have been gced. Check the mvcc version of
+                    // `mem_key`, and if it is less than or equal to the safe point of tikv,
+                    // for simplicity, break the check in such cases.
+                    if mem_mvcc <= get_tikv_safe_point().unwrap() {
+                        return Err(StopReason::KeyGcInRocksDB);
+                    }
+                }
+                panic!(
+                    "ime cross check fail(key should not exist): disk iterator next failed;
+                    cache_region={:?}; cache_key={:?}; sequence_numer={}; cf={:?}",
+                    cached_region,
+                    log_wrappers::Value(mem_key),
+                    mem_iter.sequence_number,
+                    cf,
+                );
+            }
+        }
+
+        loop {
+            let disk_key = disk_iter.key();
+            if cf == CF_LOCK {
+                // CF_LOCK should always have the same view
+                if disk_key != mem_key {
+                    panic!(
+                        "ime cross check fail(key not equal): lock cf not match;
+                        cache_region={:?}; cache_key={:?}, disk_key={:?}; sequence_numer={};",
+                        cached_region,
+                        log_wrappers::Value(mem_key),
+                        log_wrappers::Value(disk_key),
+                        mem_iter.sequence_number,
+                    );
+                }
+                if mem_iter.value() != disk_iter.value() {
+                    panic!(
+                        "ime cross check fail(value not equal): lock cf not match;
+                        cache_region={:?}; key={:?}, mem_value={:?} disk_key={:?};",
+                        cached_region,
+                        log_wrappers::Value(mem_key),
+                        log_wrappers::Value(mem_iter.value()),
+                        log_wrappers::Value(disk_iter.value()),
+                    );
+                }
+                break;
+            }
+
+            if disk_key == mem_key {
+                if mem_iter.value() != disk_iter.value() {
+                    panic!(
+                        "ime cross check fail(value not equal): write cf not match;
+                        cache_region={:?}; key={:?}, mem_value={:?} disk_key={:?};",
+                        cached_region,
+                        log_wrappers::Value(mem_key),
+                        log_wrappers::Value(mem_iter.value()),
+                        log_wrappers::Value(disk_iter.value()),
+                    );
+                }
+                break;
+            }
+
+            let (mem_user_key, mem_mvcc) = split_ts(mem_key).unwrap();
+            let (disk_user_key, disk_mvcc) = split_ts(disk_key).unwrap();
+
+            let write = match parse_write(disk_iter.value()) {
+                Ok(write) => write,
+                Err(e) => {
+                    panic!(
+                        "ime cross check fail(parse error);
+                        cache_region={:?}; cache_key={:?}, cache_val={:?}; sequence_numer={}; Error={:?}",
+                        cached_region,
+                        log_wrappers::Value(mem_iter.key()),
+                        log_wrappers::Value(mem_iter.value()),
+                        mem_iter.sequence_number,
+                        e,
+                    );
+                }
+            };
+
+            if mem_user_key == disk_user_key {
+                if disk_mvcc > mem_mvcc {
+                    if write.write_type == WriteType::Rollback
+                        || write.write_type == WriteType::Lock
+                    {
+                        info!(
+                            "ime cross check: meet gced rollback or lock";
+                            "cache_key" => log_wrappers::Value(mem_key),
+                            "disk_key" => log_wrappers::Value(disk_key),
+                            "cache_region" => ?cached_region,
+                            "seqno" => mem_iter.sequence_number,
+                            "cf" => ?cf,
+                        );
+                    } else {
+                        // [k1-10, k1-8, k1-5(mvcc delete), k1-4, k1-3]
+                        // safe_point: 6
+                        // If we gc this range, we will filter k1-5, k1-4, and
+                        // k1-3, but with k1-5 deleted last, so we may see an
+                        // intermediate state:
+                        // [k1-10, k1-8, k1-5(mvcc delete), k1-3] where k1-4 is
+                        // filtered, so we have a lower mvcc key k1-3
+                        // and a higher mvcc key k1-5. So we should use
+                        // the safe_point to compare the mvcc version.
+
+                        if disk_mvcc >= *safe_point {
+                            if disk_mvcc < read_ts {
+                                // get safe point again as it may have been updated
+                                *safe_point = {
+                                    let region_maps =
+                                        engine.core().region_manager().regions_map().read();
+                                    let meta = region_maps.region_meta(cached_region.id).unwrap();
+                                    // region might have split
+                                    if meta.get_region() != cached_region {
+                                        return Err(StopReason::RegionMetaChanged);
+                                    }
+                                    assert!(meta.safe_point() >= *safe_point);
+                                    meta.safe_point()
+                                };
+                            }
+                            // check again
+                            if disk_mvcc >= *safe_point {
+                                if write.write_type == WriteType::Put || disk_mvcc > *safe_point {
+                                    panic!(
+                                        "ime cross check fail(key should exist): miss valid mvcc version(larger than safe point);
+                                        cache_region={:?}; cache_key={:?}, disk_key={:?}; sequence_numer={}; read_ts={}, safe_point={}; cur_key_info={:?}",
+                                        cached_region,
+                                        log_wrappers::Value(mem_key),
+                                        log_wrappers::Value(disk_key),
+                                        mem_iter.sequence_number,
+                                        read_ts,
+                                        *safe_point,
+                                        cur_key_info,
+                                    );
+                                }
+                            }
+                        }
+
+                        cur_key_info.update_last_mvcc_version_before_safe_point(*safe_point);
+                        // We record the largest mvcc version below safe_point for each user_key,
+                        // and there should not be any version between it and safe_point.
+                        // So, for [k1-10, k1-8, k1-5, k1-4, k1-3]
+                        // safe_point: 6
+                        // If we see [k1-10, k1-8, k1-4, k1-3] in the in-memory engine, we
+                        // record the last_mvcc_version_before_safe_point to be 4. When we then
+                        // see k1-5 in rocksdb, version 5 lies between 6 and 4, which
+                        // means we have GCed a version that should not have been GCed.
+                        if disk_mvcc < *safe_point
+                            && disk_mvcc > cur_key_info.last_mvcc_version_before_safe_point
+                            && (write.write_type != WriteType::Rollback
+                                && write.write_type != WriteType::Lock)
+                        {
+                            panic!(
+                                "ime cross check fail(key should exist): miss valid mvcc version(less than safe point);
+                                cache_region={:?}; cache_key={:?}, disk_key={:?}; sequence_numer={}; read_ts={}, safe_point={}; cur_key_info={:?}",
+                                cached_region,
+                                log_wrappers::Value(mem_key),
+                                log_wrappers::Value(disk_key),
+                                mem_iter.sequence_number,
+                                read_ts,
+                                *safe_point,
+                                cur_key_info,
+                            );
+                        }
+                    }
+                }
+            } else {
+                if disk_mvcc > *safe_point {
+                    *safe_point = {
+                        let region_maps = engine.core().region_manager().regions_map().read();
+                        let meta = region_maps.region_meta(cached_region.id).unwrap();
+                        // region might have split
+                        if meta.get_region() != cached_region {
+                            return Err(StopReason::RegionMetaChanged);
+                        }
+                        assert!(meta.safe_point() >= *safe_point);
+                        meta.safe_point()
+                    };
+                    if disk_mvcc > *safe_point {
+                        panic!(
+                            "ime cross check fail(key should exist): keys newer than safe_point have been gced;
+                            cache_region={:?}; disk_key={:?}; sequence_numer={}; read_ts={}, safe_point={}",
+                            cached_region,
+                            log_wrappers::Value(disk_key),
+                            mem_iter.sequence_number,
+                            read_ts,
+                            *safe_point,
+                        );
+                    }
+                }
+
+                CrossChecker::check_duplicated_mvcc_version_for_last_user_key(
+                    cf,
+                    cached_region,
+                    mem_iter,
+                    &write,
+                    safe_point,
+                    disk_key,
+                    disk_mvcc,
+                    disk_user_key,
+                    prev_key_info,
+                    last_disk_user_key,
+                    last_disk_user_key_delete,
+                    engine,
+                )?;
+            }
+
+            if disk_key > mem_key {
+                // The keys in rocksdb may have been gced. Check the mvcc version of `mem_key`,
+                // and if it is less than or equal to the safe point of tikv, for simplicity,
+                // break the check in such cases.
+ if mem_mvcc <= get_tikv_safe_point().unwrap() { + return Err(StopReason::KeyGcInRocksDB); + } + + panic!( + "ime cross check fail(key should not exist): write cf not match; + cache_region={:?}; cache_key={:?}, disk_key={:?}; sequence_numer={}; read_ts={}, safe_point={}", + cached_region, + log_wrappers::Value(mem_key), + log_wrappers::Value(disk_key), + mem_iter.sequence_number, + read_ts, + *safe_point, + ); + } + + assert!(disk_iter.next().unwrap()); + } + + Ok(()) + } + + // IME iterator has reached to end, now check the validity of the remaining keys + // in rocksdb iterator. + fn check_remain_disk_key( + cf: &&str, + cached_region: &CacheRegion, + safe_point: &mut u64, + mem_iter: &RegionCacheIterator, + disk_iter: &mut RocksEngineIterator, + prev_key_info: &mut KeyCheckingInfo, + last_disk_user_key: &mut Vec, + last_disk_user_key_delete: &mut bool, + engine: &RegionCacheMemoryEngine, + ) -> Result<()> { + while disk_iter.valid().unwrap() { + // IME and rocks enigne should have the same data view for CF_LOCK + if *cf == CF_LOCK { + panic!( + "ime cross check fail(key should exist): lock cf not match; + cache_region={:?}; disk_key={:?}; sequence_numer={};", + cached_region, + log_wrappers::Value(disk_iter.key()), + mem_iter.sequence_number, + ); + } + + let (disk_user_key, disk_mvcc) = split_ts(disk_iter.key()).unwrap(); + // We cannot miss any types of write if the mvcc version is larger + // than `safe_point` of the relevant region. But the safe + // point can be updated during the cross check. Fetch it + // and check again. + if disk_mvcc > *safe_point { + *safe_point = { + let region_maps = engine.core().region_manager().regions_map().read(); + let meta = region_maps.region_meta(cached_region.id).unwrap(); + // region might have split + if meta.get_region() != cached_region { + return Err(StopReason::RegionMetaChanged); + } + assert!(meta.safe_point() >= *safe_point); + meta.safe_point() + }; + if disk_mvcc > *safe_point { + panic!( + "ime cross check fail(key should exist): write cf not match; + cache_region={:?}; disk_key={:?}, disk_mvcc={}; sequence_numer={}; prev_key_info={:?}", + cached_region, + log_wrappers::Value(disk_iter.key()), + disk_mvcc, + mem_iter.sequence_number, + prev_key_info, + ); + } + } + let write = match parse_write(disk_iter.value()) { + Ok(write) => write, + Err(e) => { + panic!( + "ime cross check fail(parse error); + cache_region={:?}; cache_key={:?}, cache_val={:?}; sequence_numer={}; Error={:?}", + cached_region, + log_wrappers::Value(mem_iter.key()), + log_wrappers::Value(mem_iter.value()), + mem_iter.sequence_number, + e, + ); + } + }; + + CrossChecker::check_duplicated_mvcc_version_for_last_user_key( + cf, + cached_region, + mem_iter, + &write, + safe_point, + disk_iter.key(), + disk_mvcc, + disk_user_key, + prev_key_info, + last_disk_user_key, + last_disk_user_key_delete, + engine, + )?; + + disk_iter.next().unwrap(); + } + + Ok(()) + } + + // mem_iter has pointed to the next user key whereas disk_iter still has some + // versions. 
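+    //
+    // In both branches below, a disk-only version of the last user key is
+    // acceptable in exactly two shapes: either a version at or below the safe
+    // point was recorded for that key (then nothing may sit between that
+    // version and the safe point), or every version below the safe point was
+    // GCed, in which case the newest disk version of the key must be a
+    // WriteType::Delete; `last_disk_user_key_delete` carries that evidence
+    // across iterations.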
+ #[allow(clippy::collapsible_else_if)] + fn check_duplicated_mvcc_version_for_last_user_key( + cf: &str, + cached_region: &CacheRegion, + mem_iter: &RegionCacheIterator, + write: &WriteRef<'_>, + safe_point: &mut u64, + disk_key: &[u8], + disk_mvcc: u64, + disk_user_key: &[u8], + prev_key_info: &mut KeyCheckingInfo, + last_disk_user_key: &mut Vec, + last_disk_user_key_delete: &mut bool, + engine: &RegionCacheMemoryEngine, + ) -> Result<()> { + if write.write_type == WriteType::Rollback || write.write_type == WriteType::Lock { + info!( + "ime meet gced rollback or lock"; + "disk_key" => log_wrappers::Value(disk_key), + "cache_region" => ?cached_region, + "seqno" => mem_iter.sequence_number, + "cf" => ?cf, + ); + return Ok(()); + } + + if disk_user_key == prev_key_info.user_key { + prev_key_info.update_last_mvcc_version_before_safe_point(*safe_point); + // It means all versions below safe point are GCed which means the + // latest write below safe point is mvcc delete. + // IME: k1-9, [k2-9] + // Rocks:k1-9, k1-5, [k1-3], k2-9 + // Safe point: 6 + // In this case, k1-5 must be MVCC delete. + // So when disk points to k1-5 we set last_disk_user_key_delete be + // true so that when we check k1-3 we can know it is deleted + // legally. + if prev_key_info.last_mvcc_version_before_safe_point == 0 { + *safe_point = { + let region_maps = engine.core().region_manager().regions_map().read(); + let meta = region_maps.region_meta(cached_region.id).unwrap(); + // region might have split + if meta.get_region() != cached_region { + return Err(StopReason::RegionMetaChanged); + } + assert!(meta.safe_point() >= *safe_point); + meta.safe_point() + }; + prev_key_info.update_last_mvcc_version_before_safe_point(*safe_point); + } + if prev_key_info.last_mvcc_version_before_safe_point == 0 { + if disk_user_key != last_disk_user_key { + *last_disk_user_key = disk_user_key.to_vec(); + *last_disk_user_key_delete = false; + } + if !*last_disk_user_key_delete { + if write.write_type == WriteType::Delete { + *last_disk_user_key_delete = true; + } else { + panic!( + "ime cross check fail(key should exist): miss valid mvcc version; + cache_region={:?}; disk_key={:?}; sequence_numer={}; read_ts={}, safe_point={}; prev_key_info={:?}", + cached_region, + log_wrappers::Value(disk_key), + mem_iter.sequence_number, + mem_iter.snapshot_read_ts, + safe_point, + prev_key_info, + ); + } + } + } else { + if disk_mvcc > prev_key_info.last_mvcc_version_before_safe_point { + if write.write_type == WriteType::Rollback + || write.write_type == WriteType::Lock + { + info!( + "ime meet gced rollback or lock"; + "disk_key" => log_wrappers::Value(disk_key), + "cache_region" => ?cached_region, + "seqno" => mem_iter.sequence_number, + "cf" => ?cf, + ); + } else { + panic!( + "ime cross check fail(key should exist): miss valid mvcc version; + cache_region={:?}; disk_key={:?}; sequence_numer={}; read_ts={}, safe_point={}", + cached_region, + log_wrappers::Value(disk_key), + mem_iter.sequence_number, + mem_iter.snapshot_read_ts, + safe_point, + ); + } + } else { + // It's ok + } + } + } else { + // IME: k2-9 + // Rocks: k1-5, k1-3, k2-9 + // Safe point: 6 + // In this case, k1-5 must be MVCC delete. + // So when disk points to k1-5 we set last_disk_user_key_delete be true so that + // when we check k1-3 we can know it is deleted legally. 
+ if disk_user_key != last_disk_user_key { + *last_disk_user_key = disk_user_key.to_vec(); + *last_disk_user_key_delete = false; + } + if !*last_disk_user_key_delete { + if write.write_type == WriteType::Delete { + *last_disk_user_key_delete = true; + } else { + panic!( + "ime cross check fail(key should exist): miss valid mvcc version; + cache_region={:?}; disk_key={:?}; sequence_numer={}; read_ts={}, safe_point={}", + cached_region, + log_wrappers::Value(disk_key), + mem_iter.sequence_number, + mem_iter.snapshot_read_ts, + safe_point, + ); + } + } + } + + Ok(()) + } +} + +impl Runnable for CrossChecker { + type Task = CrossCheckTask; + + fn run(&mut self, _: Self::Task) { + let active_regions: Vec<_> = { + let regions_map = self + .memory_engine + .core() + .region_manager() + .regions_map() + .read(); + regions_map + .regions() + .iter() + .filter_map(|(_, meta)| { + if meta.get_state() == RegionState::Active { + Some(meta.get_region().clone()) + } else { + None + } + }) + .collect() + }; + + let snap = self.rocks_engine.snapshot(); + + let tso_timeout = Duration::from_secs(5); + let now = match block_on_timeout(self.pd_client.get_tso(), tso_timeout) { + Ok(Ok(ts)) => ts, + err => { + error!( + "ime schedule gc failed "; + "timeout_duration" => ?tso_timeout, + "error" => ?err, + ); + return; + } + }; + + // Check the snapshot with read_ts one minute ago + let read_ts = now.physical() - Duration::from_secs(60).as_millis() as u64; + let read_ts = TimeStamp::compose(read_ts, 0).into_inner(); + + let ranges_to_audit: Vec<_> = active_regions + .iter() + .filter_map(|range| { + match self + .memory_engine + .snapshot(range.clone(), read_ts, snap.sequence_number()) + { + Ok(range_snap) => Some(range_snap), + Err(_) => { + warn!( + "ime failed to get snap in cross check"; + "range" => ?range, + ); + None + } + } + }) + .collect(); + + if ranges_to_audit.is_empty() { + return; + } + + let now = Instant::now(); + + ranges_to_audit.into_iter().for_each(|r| { + if let Err(e) = self.cross_check_region(&r, &snap) { + info!( + "ime cross check stopped"; + "reason" => ?e, + "region" => ?r.snapshot_meta().region, + ); + } + }); + info!( + "ime cross check finished"; + "duration" => ?now.saturating_elapsed(), + ); + } +} + +impl RunnableWithTimer for CrossChecker { + fn get_interval(&self) -> Duration { + self.interval + } + + fn on_timeout(&mut self) { + self.run(CrossCheckTask::CrossCheck); + } +} + +#[derive(Default)] +struct KeyCheckingInfo { + user_key: Vec, + mvcc_recordings: Vec, + last_mvcc_version_before_safe_point: u64, +} + +impl KeyCheckingInfo { + fn update_last_mvcc_version_before_safe_point(&mut self, safe_point: u64) { + self.last_mvcc_version_before_safe_point = *self + .mvcc_recordings + .iter() + .find(|&mvcc| mvcc <= &safe_point) + .unwrap_or(&0); + } +} + +impl std::fmt::Debug for KeyCheckingInfo { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("KeyCheckingInfo") + .field("user_key", &log_wrappers::Value(&self.user_key)) + .field("mvcc_recordings", &self.mvcc_recordings) + .field( + "last_mvcc_version_before_safe_point", + &self.last_mvcc_version_before_safe_point, + ) + .finish() + } +} + +#[cfg(test)] +mod tests { + use std::{sync::Arc, time::Duration}; + + use engine_rocks::{util::new_engine_opt, RocksDbOptions, RocksWriteBatchVec}; + use engine_traits::{ + CacheRegion, KvEngine, Mutable, RegionCacheEngine, WriteBatch, WriteBatchExt, CF_DEFAULT, + CF_LOCK, CF_WRITE, + }; + use futures::future::ready; + use keys::data_key; + use 
kvproto::metapb::{Region, RegionEpoch}; + use pd_client::PdClient; + use raftstore::{ + coprocessor::{RegionInfoCallback, RegionInfoProvider}, + RegionInfo, SeekRegionCallback, + }; + use tempfile::Builder; + use tikv_util::{config::VersionTrack, store::new_peer}; + use txn_types::{Key, TimeStamp, Write, WriteType}; + + use super::Result; + use crate::{ + cross_check::CrossChecker, InMemoryEngineConfig, InMemoryEngineContext, + RegionCacheMemoryEngine, RegionCacheWriteBatch, + }; + + #[derive(Clone)] + struct MockRegionInfoProvider; + impl RegionInfoProvider for MockRegionInfoProvider { + fn seek_region( + &self, + _: &[u8], + _: SeekRegionCallback, + ) -> raftstore::coprocessor::Result<()> { + Ok(()) + } + fn find_region_by_id( + &self, + _: u64, + _: RegionInfoCallback>, + ) -> raftstore::coprocessor::Result<()> { + Ok(()) + } + fn get_regions_in_range( + &self, + _start_key: &[u8], + _end_key: &[u8], + ) -> raftstore::coprocessor::Result> { + Ok(vec![]) + } + } + + fn cross_check( + prepare_data: F, + get_tikv_safe_point: Option Option + Send>>, + ) -> Result<()> + where + F: FnOnce(&mut RegionCacheWriteBatch, &mut RocksWriteBatchVec), + { + let mut engine = RegionCacheMemoryEngine::with_region_info_provider( + InMemoryEngineContext::new_for_tests(Arc::new(VersionTrack::new( + InMemoryEngineConfig::config_for_test(), + ))), + Some(Arc::new(MockRegionInfoProvider {})), + None, + ); + let mut region = Region::default(); + region.set_peers(vec![new_peer(1, 1)].into()); + region.set_id(1); + region.set_end_key(b"z".to_vec()); + let mut epoch = RegionEpoch::default(); + epoch.conf_ver = 1; + epoch.version = 1; + region.set_region_epoch(epoch); + let cache_region = CacheRegion::from_region(®ion); + engine.new_region(region.clone()); + + let path = Builder::new().prefix("temp").tempdir().unwrap(); + let db_opts = RocksDbOptions::default(); + let cf_opts = [CF_DEFAULT, CF_LOCK, CF_WRITE] + .iter() + .map(|name| (*name, Default::default())) + .collect(); + let rocks_engine = new_engine_opt(path.path().to_str().unwrap(), db_opts, cf_opts).unwrap(); + + engine.set_disk_engine(rocks_engine.clone()); + engine + .core() + .region_manager() + .regions_map() + .write() + .mut_region_meta(region.id) + .unwrap() + .set_safe_point(6); + + struct MockPdClient {} + impl PdClient for MockPdClient { + fn get_tso(&self) -> pd_client::PdFuture { + Box::pin(ready(Ok(TimeStamp::compose(TimeStamp::physical_now(), 0)))) + } + } + + let cross_checker = CrossChecker::new( + Arc::new(MockPdClient {}), + engine.clone(), + rocks_engine.clone(), + Duration::from_secs(100000), + get_tikv_safe_point.unwrap_or(Box::new(|| None)), + ); + + { + let mut wb = engine.write_batch(); + wb.prepare_for_region(®ion); + let mut disk_wb = rocks_engine.write_batch(); + + prepare_data(&mut wb, &mut disk_wb); + + wb.set_sequence_number(1000).unwrap(); + wb.write().unwrap(); + disk_wb.write().unwrap(); + + let snap = engine.snapshot(cache_region.clone(), 10, 10000).unwrap(); + let disk_snap = rocks_engine.snapshot(); + + cross_checker.cross_check_region(&snap, &disk_snap) + } + } + + fn write_key(k: &[u8], ts: u64, ty: WriteType) -> (Vec, Vec) { + let data_key = data_key(k); + let raw_write_k = Key::from_raw(&data_key).append_ts(ts.into()); + let val = Write::new(ty, ts.into(), Some(vec![])).as_ref().to_bytes(); + (raw_write_k.into_encoded(), val) + } + + #[test] + fn test_cross_check() { + // Safe point: 6 + // IME: + // Disk: k1-4-r, + cross_check( + |_wb, disk_wb| { + let (k, v) = write_key(b"k-1", 4, WriteType::Rollback); + 
disk_wb.put_cf(CF_WRITE, &k, &v).unwrap(); + }, + None, + ) + .unwrap(); + + // Safe point: 6 + // IME: + // Disk: k1-4-d, + cross_check( + |_wb, disk_wb| { + let (k, v) = write_key(b"k-1", 4, WriteType::Delete); + disk_wb.put_cf(CF_WRITE, &k, &v).unwrap(); + }, + None, + ) + .unwrap(); + + // Safe point: 6 + // IME: + // Disk: k1-4-d, k1-3 + cross_check( + |_wb, disk_wb| { + let (k, v) = write_key(b"k-1", 4, WriteType::Delete); + disk_wb.put_cf(CF_WRITE, &k, &v).unwrap(); + + let (k, v) = write_key(b"k-1", 3, WriteType::Put); + disk_wb.put_cf(CF_WRITE, &k, &v).unwrap(); + }, + None, + ) + .unwrap(); + + // Safe point: 6 + // IME: + // Disk: k1-5-r, k1-4-d, k1-3 + cross_check( + |_wb, disk_wb| { + let (k, v) = write_key(b"k-1", 5, WriteType::Rollback); + disk_wb.put_cf(CF_WRITE, &k, &v).unwrap(); + + let (k, v) = write_key(b"k-1", 4, WriteType::Delete); + disk_wb.put_cf(CF_WRITE, &k, &v).unwrap(); + + let (k, v) = write_key(b"k-1", 3, WriteType::Put); + disk_wb.put_cf(CF_WRITE, &k, &v).unwrap(); + }, + None, + ) + .unwrap(); + + // Safe point: 6 + // IME: k1-9, k1-5, + // Disk: k1-9, k1-5, k1-4, k1-2 + cross_check( + |wb, disk_wb| { + let (k, v) = write_key(b"k-1", 9, WriteType::Put); + wb.put_cf(CF_WRITE, &k, &v).unwrap(); + disk_wb.put_cf(CF_WRITE, &k, &v).unwrap(); + + let (k, v) = write_key(b"k-1", 5, WriteType::Put); + wb.put_cf(CF_WRITE, &k, &v).unwrap(); + disk_wb.put_cf(CF_WRITE, &k, &v).unwrap(); + + let (k, v) = write_key(b"k-1", 4, WriteType::Put); + disk_wb.put_cf(CF_WRITE, &k, &v).unwrap(); + + let (k, v) = write_key(b"k-1", 2, WriteType::Put); + disk_wb.put_cf(CF_WRITE, &k, &v).unwrap(); + }, + None, + ) + .unwrap(); + + // Safe point: 6 + // IME: k2-5, + // Disk: k2-5, k2-4, k2-2 + cross_check( + |wb, disk_wb| { + let (k, v) = write_key(b"k-2", 5, WriteType::Put); + wb.put_cf(CF_WRITE, &k, &v).unwrap(); + disk_wb.put_cf(CF_WRITE, &k, &v).unwrap(); + + let (k, v) = write_key(b"k-2", 4, WriteType::Put); + disk_wb.put_cf(CF_WRITE, &k, &v).unwrap(); + + let (k, v) = write_key(b"k-2", 2, WriteType::Put); + disk_wb.put_cf(CF_WRITE, &k, &v).unwrap(); + }, + None, + ) + .unwrap(); + + // Safe point: 6 + // IME: k1-5, k2-4, k3-4, k4-4 + // Disk: k1-5, k1-3, k2-4, k2-2, k3-4, k3-2, k4-4, k4-2 + cross_check( + |wb, disk_wb| { + let (k, v) = write_key(b"k-1", 5, WriteType::Put); + wb.put_cf(CF_WRITE, &k, &v).unwrap(); + disk_wb.put_cf(CF_WRITE, &k, &v).unwrap(); + + let (k, v) = write_key(b"k-1", 3, WriteType::Put); + disk_wb.put_cf(CF_WRITE, &k, &v).unwrap(); + + let (k, v) = write_key(b"k-2", 4, WriteType::Put); + wb.put_cf(CF_WRITE, &k, &v).unwrap(); + disk_wb.put_cf(CF_WRITE, &k, &v).unwrap(); + + let (k, v) = write_key(b"k-2", 2, WriteType::Put); + disk_wb.put_cf(CF_WRITE, &k, &v).unwrap(); + + let (k, v) = write_key(b"k-3", 4, WriteType::Put); + wb.put_cf(CF_WRITE, &k, &v).unwrap(); + disk_wb.put_cf(CF_WRITE, &k, &v).unwrap(); + + let (k, v) = write_key(b"k-3", 2, WriteType::Put); + disk_wb.put_cf(CF_WRITE, &k, &v).unwrap(); + + let (k, v) = write_key(b"k-4", 4, WriteType::Put); + wb.put_cf(CF_WRITE, &k, &v).unwrap(); + disk_wb.put_cf(CF_WRITE, &k, &v).unwrap(); + + let (k, v) = write_key(b"k-4", 2, WriteType::Put); + disk_wb.put_cf(CF_WRITE, &k, &v).unwrap(); + }, + None, + ) + .unwrap(); + + // Safe point: 6 + // IME: k1-9, k1-5, k2-7 + // Disk: k1-9, k1-5, k1-4, k1-2, k2-7, + cross_check( + |wb, disk_wb| { + let (k, v) = write_key(b"k-1", 9, WriteType::Put); + wb.put_cf(CF_WRITE, &k, &v).unwrap(); + disk_wb.put_cf(CF_WRITE, &k, &v).unwrap(); + + let (k, v) = write_key(b"k-1", 
5, WriteType::Put); + wb.put_cf(CF_WRITE, &k, &v).unwrap(); + disk_wb.put_cf(CF_WRITE, &k, &v).unwrap(); + + let (k, v) = write_key(b"k-1", 4, WriteType::Put); + disk_wb.put_cf(CF_WRITE, &k, &v).unwrap(); + + let (k, v) = write_key(b"k-1", 2, WriteType::Put); + disk_wb.put_cf(CF_WRITE, &k, &v).unwrap(); + + let (k, v) = write_key(b"k-2", 7, WriteType::Put); + wb.put_cf(CF_WRITE, &k, &v).unwrap(); + disk_wb.put_cf(CF_WRITE, &k, &v).unwrap(); + }, + None, + ) + .unwrap(); + + // Temporary state in GC: k1-4 is filtered + // Safe point: 6 + // IME: k1-9, k1-5-d, k1-2 k2-7 + // Disk: k1-9, k1-5-d, k1-4, k1-2, k2-7, + cross_check( + |wb, disk_wb| { + let (k, v) = write_key(b"k-1", 9, WriteType::Put); + wb.put_cf(CF_WRITE, &k, &v).unwrap(); + disk_wb.put_cf(CF_WRITE, &k, &v).unwrap(); + + let (k, v) = write_key(b"k-1", 5, WriteType::Delete); + wb.put_cf(CF_WRITE, &k, &v).unwrap(); + disk_wb.put_cf(CF_WRITE, &k, &v).unwrap(); + + let (k, v) = write_key(b"k-1", 4, WriteType::Put); + disk_wb.put_cf(CF_WRITE, &k, &v).unwrap(); + + let (k, v) = write_key(b"k-1", 2, WriteType::Put); + wb.put_cf(CF_WRITE, &k, &v).unwrap(); + disk_wb.put_cf(CF_WRITE, &k, &v).unwrap(); + + let (k, v) = write_key(b"k-2", 7, WriteType::Put); + wb.put_cf(CF_WRITE, &k, &v).unwrap(); + disk_wb.put_cf(CF_WRITE, &k, &v).unwrap(); + }, + None, + ) + .unwrap(); + + // Safe point: 6 + // IME: k1-9, k2-7 + // Disk: k1-9, k1-5-d, k1-4, k1-2, k2-7, + cross_check( + |wb, disk_wb| { + let (k, v) = write_key(b"k-1", 9, WriteType::Put); + wb.put_cf(CF_WRITE, &k, &v).unwrap(); + disk_wb.put_cf(CF_WRITE, &k, &v).unwrap(); + + let (k, v) = write_key(b"k-1", 5, WriteType::Delete); + disk_wb.put_cf(CF_WRITE, &k, &v).unwrap(); + + let (k, v) = write_key(b"k-1", 4, WriteType::Put); + disk_wb.put_cf(CF_WRITE, &k, &v).unwrap(); + + let (k, v) = write_key(b"k-1", 2, WriteType::Put); + disk_wb.put_cf(CF_WRITE, &k, &v).unwrap(); + + let (k, v) = write_key(b"k-2", 7, WriteType::Put); + wb.put_cf(CF_WRITE, &k, &v).unwrap(); + disk_wb.put_cf(CF_WRITE, &k, &v).unwrap(); + }, + None, + ) + .unwrap(); + + // Safe point: 6 + // IME: k1-9, k1-5, k3-7 + // Disk: k1-9, k1-5, k1-4, k1-2, k2-4-d, k2-3, k3-7 + cross_check( + |wb, disk_wb| { + let (k, v) = write_key(b"k-1", 9, WriteType::Put); + wb.put_cf(CF_WRITE, &k, &v).unwrap(); + disk_wb.put_cf(CF_WRITE, &k, &v).unwrap(); + + let (k, v) = write_key(b"k-1", 5, WriteType::Put); + wb.put_cf(CF_WRITE, &k, &v).unwrap(); + disk_wb.put_cf(CF_WRITE, &k, &v).unwrap(); + + let (k, v) = write_key(b"k-1", 4, WriteType::Put); + disk_wb.put_cf(CF_WRITE, &k, &v).unwrap(); + + let (k, v) = write_key(b"k-1", 2, WriteType::Put); + disk_wb.put_cf(CF_WRITE, &k, &v).unwrap(); + + let (k, v) = write_key(b"k-2", 4, WriteType::Delete); + disk_wb.put_cf(CF_WRITE, &k, &v).unwrap(); + + let (k, v) = write_key(b"k-2", 3, WriteType::Put); + disk_wb.put_cf(CF_WRITE, &k, &v).unwrap(); + + let (k, v) = write_key(b"k-3", 7, WriteType::Put); + wb.put_cf(CF_WRITE, &k, &v).unwrap(); + disk_wb.put_cf(CF_WRITE, &k, &v).unwrap(); + }, + None, + ) + .unwrap(); + + // Safe point: 6 + // IME: k1-9, k2-4-d k2-1 k3-7 + // Disk: k1-9, k1-5-d, k1-4, k1-2, k2-4-d, k2-3, k2-1 k3-7 + cross_check( + |wb, disk_wb| { + let (k, v) = write_key(b"k-1", 9, WriteType::Put); + wb.put_cf(CF_WRITE, &k, &v).unwrap(); + disk_wb.put_cf(CF_WRITE, &k, &v).unwrap(); + + let (k, v) = write_key(b"k-1", 5, WriteType::Delete); + disk_wb.put_cf(CF_WRITE, &k, &v).unwrap(); + + let (k, v) = write_key(b"k-1", 4, WriteType::Put); + disk_wb.put_cf(CF_WRITE, &k, &v).unwrap(); + + let 
(k, v) = write_key(b"k-1", 2, WriteType::Put); + disk_wb.put_cf(CF_WRITE, &k, &v).unwrap(); + + let (k, v) = write_key(b"k-2", 4, WriteType::Delete); + wb.put_cf(CF_WRITE, &k, &v).unwrap(); + disk_wb.put_cf(CF_WRITE, &k, &v).unwrap(); + + let (k, v) = write_key(b"k-2", 3, WriteType::Put); + disk_wb.put_cf(CF_WRITE, &k, &v).unwrap(); + + let (k, v) = write_key(b"k-2", 2, WriteType::Put); + wb.put_cf(CF_WRITE, &k, &v).unwrap(); + disk_wb.put_cf(CF_WRITE, &k, &v).unwrap(); + + let (k, v) = write_key(b"k-3", 7, WriteType::Put); + wb.put_cf(CF_WRITE, &k, &v).unwrap(); + disk_wb.put_cf(CF_WRITE, &k, &v).unwrap(); + }, + None, + ) + .unwrap(); + + // Safe point: 6 + // IME: k1-9, k3-7 + // Disk: k1-9, k1-5-d, k1-4, k1-2, k2-4-d, k2-3, k3-7 + cross_check( + |wb, disk_wb| { + let (k, v) = write_key(b"k-1", 9, WriteType::Put); + wb.put_cf(CF_WRITE, &k, &v).unwrap(); + disk_wb.put_cf(CF_WRITE, &k, &v).unwrap(); + + let (k, v) = write_key(b"k-1", 5, WriteType::Put); + wb.put_cf(CF_WRITE, &k, &v).unwrap(); + disk_wb.put_cf(CF_WRITE, &k, &v).unwrap(); + + let (k, v) = write_key(b"k-1", 4, WriteType::Put); + disk_wb.put_cf(CF_WRITE, &k, &v).unwrap(); + + let (k, v) = write_key(b"k-1", 2, WriteType::Put); + disk_wb.put_cf(CF_WRITE, &k, &v).unwrap(); + + let (k, v) = write_key(b"k-2", 4, WriteType::Delete); + disk_wb.put_cf(CF_WRITE, &k, &v).unwrap(); + + let (k, v) = write_key(b"k-2", 3, WriteType::Put); + disk_wb.put_cf(CF_WRITE, &k, &v).unwrap(); + + let (k, v) = write_key(b"k-3", 7, WriteType::Put); + wb.put_cf(CF_WRITE, &k, &v).unwrap(); + disk_wb.put_cf(CF_WRITE, &k, &v).unwrap(); + }, + None, + ) + .unwrap(); + } + + #[test] + fn test_keys_are_gced_in_rocksdb() { + // TiKV Safe point: 6 + // IME: k1-4-d, k1-3 + // Disk: + cross_check( + |wb, _| { + let (k, v) = write_key(b"k-1", 4, WriteType::Delete); + wb.put_cf(CF_WRITE, &k, &v).unwrap(); + + let (k, v) = write_key(b"k-1", 3, WriteType::Put); + wb.put_cf(CF_WRITE, &k, &v).unwrap(); + }, + Some(Box::new(|| Some(6))), + ) + .unwrap_err(); + + // TiKV Safe point: 6 + // IME: k1-6, k1-4-d, k1-3 + // Disk: k1-6 + cross_check( + |wb, disk_wb| { + let (k, v) = write_key(b"k-1", 6, WriteType::Put); + wb.put_cf(CF_WRITE, &k, &v).unwrap(); + disk_wb.put_cf(CF_WRITE, &k, &v).unwrap(); + + let (k, v) = write_key(b"k-1", 4, WriteType::Delete); + wb.put_cf(CF_WRITE, &k, &v).unwrap(); + + let (k, v) = write_key(b"k-1", 3, WriteType::Put); + wb.put_cf(CF_WRITE, &k, &v).unwrap(); + }, + Some(Box::new(|| Some(6))), + ) + .unwrap_err(); + } + + #[test] + #[should_panic] + fn test_cross_check_panic1() { + // Safe point: 6 + // IME: k1-9, k1-5-r, k2-7 + // Disk: k1-9, k1-5-r, k1-4, k1-2, k2-7, + cross_check( + |wb, disk_wb| { + let (k, v) = write_key(b"k-1", 9, WriteType::Put); + wb.put_cf(CF_WRITE, &k, &v).unwrap(); + disk_wb.put_cf(CF_WRITE, &k, &v).unwrap(); + + let (k, v) = write_key(b"k-1", 5, WriteType::Rollback); + wb.put_cf(CF_WRITE, &k, &v).unwrap(); + disk_wb.put_cf(CF_WRITE, &k, &v).unwrap(); + + let (k, v) = write_key(b"k-1", 4, WriteType::Put); + disk_wb.put_cf(CF_WRITE, &k, &v).unwrap(); + + let (k, v) = write_key(b"k-1", 2, WriteType::Put); + disk_wb.put_cf(CF_WRITE, &k, &v).unwrap(); + + let (k, v) = write_key(b"k-2", 7, WriteType::Put); + wb.put_cf(CF_WRITE, &k, &v).unwrap(); + disk_wb.put_cf(CF_WRITE, &k, &v).unwrap(); + }, + None, + ) + .unwrap(); + } + + #[test] + #[should_panic] + fn test_cross_check_panic2() { + // Safe point: 6 + // IME: k1-9, k1-4, k2-7 + // Disk: k1-9, k1-5, k1-4, k1-2, k2-7, + cross_check( + |wb, disk_wb| { + let (k, v) = 
write_key(b"k-1", 9, WriteType::Put); + wb.put_cf(CF_WRITE, &k, &v).unwrap(); + disk_wb.put_cf(CF_WRITE, &k, &v).unwrap(); + + let (k, v) = write_key(b"k-1", 5, WriteType::Put); + disk_wb.put_cf(CF_WRITE, &k, &v).unwrap(); + + let (k, v) = write_key(b"k-1", 4, WriteType::Put); + wb.put_cf(CF_WRITE, &k, &v).unwrap(); + disk_wb.put_cf(CF_WRITE, &k, &v).unwrap(); + + let (k, v) = write_key(b"k-1", 2, WriteType::Put); + disk_wb.put_cf(CF_WRITE, &k, &v).unwrap(); + + let (k, v) = write_key(b"k-2", 7, WriteType::Put); + wb.put_cf(CF_WRITE, &k, &v).unwrap(); + disk_wb.put_cf(CF_WRITE, &k, &v).unwrap(); + }, + None, + ) + .unwrap(); + } + + #[test] + #[should_panic] + fn test_cross_check_panic2_2() { + // Safe point: 6 + // IME: k1-9, + // Disk: k1-9, k1-5, k1-4, k1-2, k-2-7 + cross_check( + |wb, disk_wb| { + let (k, v) = write_key(b"k-1", 9, WriteType::Put); + wb.put_cf(CF_WRITE, &k, &v).unwrap(); + disk_wb.put_cf(CF_WRITE, &k, &v).unwrap(); + + let (k, v) = write_key(b"k-1", 5, WriteType::Put); + disk_wb.put_cf(CF_WRITE, &k, &v).unwrap(); + + let (k, v) = write_key(b"k-1", 4, WriteType::Put); + disk_wb.put_cf(CF_WRITE, &k, &v).unwrap(); + + let (k, v) = write_key(b"k-1", 2, WriteType::Put); + disk_wb.put_cf(CF_WRITE, &k, &v).unwrap(); + + let (k, v) = write_key(b"k-2", 7, WriteType::Put); + disk_wb.put_cf(CF_WRITE, &k, &v).unwrap(); + }, + None, + ) + .unwrap(); + } + + #[test] + #[should_panic] + fn test_cross_check_panic3_1() { + // Safe point: 6 + // IME: k2-7 + // Disk: k1-9, k2-7, + cross_check( + |wb, disk_wb| { + let (k, v) = write_key(b"k-1", 9, WriteType::Put); + disk_wb.put_cf(CF_WRITE, &k, &v).unwrap(); + + let (k, v) = write_key(b"k-2", 7, WriteType::Put); + wb.put_cf(CF_WRITE, &k, &v).unwrap(); + disk_wb.put_cf(CF_WRITE, &k, &v).unwrap(); + }, + None, + ) + .unwrap(); + } + + #[test] + #[should_panic] + fn test_cross_check_panic3_2() { + // Safe point: 6 + // IME: + // Disk: k1-9, + cross_check( + |_wb, disk_wb| { + let (k, v) = write_key(b"k-1", 9, WriteType::Put); + disk_wb.put_cf(CF_WRITE, &k, &v).unwrap(); + }, + None, + ) + .unwrap(); + } + + #[test] + #[should_panic] + fn test_cross_check_panic3_3() { + // Safe point: 6 + // IME: + // Disk: k1-4, + cross_check( + |_wb, disk_wb| { + let (k, v) = write_key(b"k-1", 4, WriteType::Put); + disk_wb.put_cf(CF_WRITE, &k, &v).unwrap(); + }, + None, + ) + .unwrap(); + } + + #[test] + #[should_panic] + fn test_cross_check_panic3_4() { + // Safe point: 6 + // IME: + // Disk: k1-4-r, k1-3 + cross_check( + |_wb, disk_wb| { + let (k, v) = write_key(b"k-1", 4, WriteType::Rollback); + disk_wb.put_cf(CF_WRITE, &k, &v).unwrap(); + + let (k, v) = write_key(b"k-1", 3, WriteType::Put); + disk_wb.put_cf(CF_WRITE, &k, &v).unwrap(); + }, + None, + ) + .unwrap(); + } + + #[test] + #[should_panic] + fn test_cross_check_panic4_1() { + // Safe point: 6 + // IME: k1-4 + // Disk: + cross_check( + |wb, _disk_wb| { + let (k, v) = write_key(b"k-1", 4, WriteType::Rollback); + wb.put_cf(CF_WRITE, &k, &v).unwrap(); + }, + None, + ) + .unwrap(); + } + + #[test] + #[should_panic] + fn test_cross_check_panic4_2() { + // Safe point: 6 + // IME: k1-7, k2-4 + // Disk: k1-7 + cross_check( + |wb, _disk_wb| { + let (k, v) = write_key(b"k-1", 4, WriteType::Rollback); + wb.put_cf(CF_WRITE, &k, &v).unwrap(); + }, + None, + ) + .unwrap(); + } + + #[test] + #[should_panic] + fn test_cross_check_panic5() { + // Safe point: 6 + // IME: k2-7 + // Disk: k1-9, k2-7, + cross_check( + |wb, disk_wb| { + let (k, v) = write_key(b"k-1", 9, WriteType::Put); + disk_wb.put_cf(CF_WRITE, &k, 
&v).unwrap();
+
+                let (k, v) = write_key(b"k-2", 7, WriteType::Put);
+                wb.put_cf(CF_WRITE, &k, &v).unwrap();
+                disk_wb.put_cf(CF_WRITE, &k, &v).unwrap();
+            },
+            None,
+        )
+        .unwrap();
+    }
+}
diff --git a/components/in_memory_engine/src/engine.rs b/components/in_memory_engine/src/engine.rs
new file mode 100644
index 00000000000..24c5cca5eeb
--- /dev/null
+++ b/components/in_memory_engine/src/engine.rs
@@ -0,0 +1,930 @@
+// Copyright 2023 TiKV Project Authors. Licensed under Apache-2.0.
+
+use std::{
+    fmt::{self, Debug},
+    ops::Bound,
+    result,
+    sync::{atomic::AtomicU64, Arc},
+};
+
+use crossbeam::epoch::{self, default_collector, Guard};
+use crossbeam_skiplist::{
+    base::{Entry, OwnedIter},
+    SkipList,
+};
+use engine_rocks::RocksEngine;
+use engine_traits::{
+    CacheRegion, EvictReason, FailedReason, IterOptions, Iterable, KvEngine, RegionCacheEngine,
+    RegionCacheEngineExt, RegionEvent, Result, CF_DEFAULT, CF_LOCK, CF_WRITE, DATA_CFS,
+};
+use fail::fail_point;
+use kvproto::metapb::Region;
+use pd_client::PdClient;
+use raftstore::{coprocessor::RegionInfoProvider, store::CasualRouter};
+use slog_global::error;
+use tikv_util::{config::VersionTrack, info, warn, worker::Scheduler};
+
+use crate::{
+    background::{BackgroundTask, BgWorkManager, PdRangeHintService},
+    keys::{
+        encode_key_for_boundary_with_mvcc, encode_key_for_boundary_without_mvcc, InternalBytes,
+    },
+    memory_controller::MemoryController,
+    read::{RegionCacheIterator, RegionCacheSnapshot},
+    region_manager::{
+        AsyncFnOnce, LoadFailedReason, RegionCacheStatus, RegionManager, RegionState,
+    },
+    statistics::Statistics,
+    InMemoryEngineConfig, InMemoryEngineContext,
+};
+
+pub(crate) const CF_DEFAULT_USIZE: usize = 0;
+pub(crate) const CF_LOCK_USIZE: usize = 1;
+pub(crate) const CF_WRITE_USIZE: usize = 2;
+
+pub(crate) fn cf_to_id(cf: &str) -> usize {
+    match cf {
+        CF_DEFAULT => CF_DEFAULT_USIZE,
+        CF_LOCK => CF_LOCK_USIZE,
+        CF_WRITE => CF_WRITE_USIZE,
+        _ => panic!("unrecognized cf {}", cf),
+    }
+}
+
+pub(crate) fn id_to_cf(id: usize) -> &'static str {
+    match id {
+        CF_DEFAULT_USIZE => CF_DEFAULT,
+        CF_LOCK_USIZE => CF_LOCK,
+        CF_WRITE_USIZE => CF_WRITE,
+        _ => panic!("unrecognized id {}", id),
+    }
+}
+
+#[inline]
+pub(crate) fn is_lock_cf(cf: usize) -> bool {
+    cf == CF_LOCK_USIZE
+}
+
+// A wrapper for the skiplist that provides some checks and a clean-up worker
+#[derive(Clone)]
+pub struct SkiplistHandle(Arc<SkipList<InternalBytes, InternalBytes>>);
+
+impl SkiplistHandle {
+    pub fn get<'a: 'g, 'g>(
+        &'a self,
+        key: &InternalBytes,
+        guard: &'g Guard,
+    ) -> Option<Entry<'g, InternalBytes, InternalBytes>> {
+        self.0.get(key, guard)
+    }
+
+    pub fn get_with_user_key<'a: 'g, 'g>(
+        &'a self,
+        key: &InternalBytes,
+        guard: &'g Guard,
+    ) -> Option<Entry<'g, InternalBytes, InternalBytes>> {
+        let n = self.0.lower_bound(Bound::Included(key), guard)?;
+        if n.key().same_user_key_with(key) {
+            Some(n)
+        } else {
+            None
+        }
+    }
+
+    pub fn insert(&self, key: InternalBytes, value: InternalBytes, guard: &Guard) {
+        assert!(key.memory_controller_set() && value.memory_controller_set());
+        self.0.insert(key, value, guard).release(guard);
+    }
+
+    pub fn remove(&self, key: &InternalBytes, guard: &Guard) {
+        if let Some(entry) = self.0.remove(key, guard) {
+            entry.release(guard);
+        }
+    }
+
+    pub fn iterator(
+        &self,
+    ) -> OwnedIter<Arc<SkipList<InternalBytes, InternalBytes>>, InternalBytes, InternalBytes> {
+        self.0.owned_iter()
+    }
+
+    pub fn len(&self) -> usize {
+        self.0.len()
+    }
+
+    pub fn is_empty(&self) -> bool {
+        self.len() == 0
+    }
+}
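+
+// A minimal usage sketch of the handle API (hypothetical caller; real writers
+// go through the region cache write batch, which attaches the memory
+// controller to both key and value before calling `insert` -- the assertion
+// in `insert` enforces this):
+//
+//   let guard = &epoch::pin();
+//   let handle = skiplist_engine.cf_handle(CF_WRITE);
+//   handle.insert(encoded_key, encoded_value, guard);
+//   assert!(handle.get(&lookup_key, guard).is_some());
+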
+/// A single global set of skiplists shared by all cached regions
+#[derive(Clone)]
+pub struct SkiplistEngine {
+    pub(crate) data: [Arc<SkipList<InternalBytes, InternalBytes>>; 3],
+}
+
+impl Default for SkiplistEngine {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+impl SkiplistEngine {
+    pub fn new() -> Self {
+        let collector = default_collector().clone();
+        SkiplistEngine {
+            data: [
+                Arc::new(SkipList::new(collector.clone())),
+                Arc::new(SkipList::new(collector.clone())),
+                Arc::new(SkipList::new(collector)),
+            ],
+        }
+    }
+
+    pub fn cf_handle(&self, cf: &str) -> SkiplistHandle {
+        SkiplistHandle(self.data[cf_to_id(cf)].clone())
+    }
+
+    pub fn node_count(&self) -> usize {
+        let mut count = 0;
+        self.data.iter().for_each(|s| count += s.len());
+        count
+    }
+
+    pub(crate) fn delete_range_cf(&self, cf: &str, region: &CacheRegion) {
+        let (start, end) = if cf == CF_LOCK {
+            encode_key_for_boundary_without_mvcc(region)
+        } else {
+            encode_key_for_boundary_with_mvcc(region)
+        };
+
+        let handle = self.cf_handle(cf);
+        let mut iter = handle.iterator();
+        let guard = &epoch::pin();
+        iter.seek(&start, guard);
+        while iter.valid() && iter.key() < &end {
+            handle.remove(iter.key(), guard);
+            iter.next(guard);
+        }
+        // The guard buffers up to 8 deferred drops; flush here to clear the buffer.
+        guard.flush();
+    }
+
+    pub(crate) fn delete_range(&self, region: &CacheRegion) {
+        DATA_CFS.iter().for_each(|&cf| {
+            self.delete_range_cf(cf, region);
+        });
+    }
+}
+
+impl Debug for SkiplistEngine {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        write!(f, "Region Memory Engine")
+    }
+}
+
+pub struct RegionCacheMemoryEngineCore {
+    pub(crate) engine: SkiplistEngine,
+    pub(crate) region_manager: RegionManager,
+}
+
+impl Default for RegionCacheMemoryEngineCore {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+impl RegionCacheMemoryEngineCore {
+    pub fn new() -> RegionCacheMemoryEngineCore {
+        RegionCacheMemoryEngineCore {
+            engine: SkiplistEngine::new(),
+            region_manager: RegionManager::default(),
+        }
+    }
+
+    pub fn engine(&self) -> SkiplistEngine {
+        self.engine.clone()
+    }
+
+    pub fn region_manager(&self) -> &RegionManager {
+        &self.region_manager
+    }
+
+    // It handles the pending range and checks whether to buffer writes for
+    // this range.
+    pub(crate) fn prepare_for_apply(
+        &self,
+        region: &CacheRegion,
+        rocks_engine: Option<&RocksEngine>,
+        scheduler: &Scheduler<BackgroundTask>,
+        should_set_in_written: bool,
+        in_flashback: bool,
+    ) -> RegionCacheStatus {
+        if !self.region_manager.is_active() {
+            return RegionCacheStatus::NotInCache;
+        }
+
+        // fast path, only need to hold the read lock.
+        {
+            let regions_map = self.region_manager.regions_map.read();
+            let Some(region_meta) = regions_map.region_meta(region.id) else {
+                return RegionCacheStatus::NotInCache;
+            };
+            let state = region_meta.get_state();
+            if state == RegionState::Active {
+                if should_set_in_written {
+                    region_meta.set_being_written();
+                }
+                return RegionCacheStatus::Cached;
+            } else if state.is_evict() {
+                return RegionCacheStatus::NotInCache;
+            } else if state == RegionState::Loading {
+                if should_set_in_written {
+                    region_meta.set_being_written();
+                }
+                return RegionCacheStatus::Loading;
+            }
+        }
+
+        // slow path, handle pending region
+        let mut regions_map = self.region_manager.regions_map.write();
+        let cached_count = regions_map.regions().len();
+        let Some(mut region_meta) = regions_map.mut_region_meta(region.id) else {
+            return RegionCacheStatus::NotInCache;
+        };
+
+        if region_meta.get_region().epoch_version < region.epoch_version || in_flashback {
+            let meta = regions_map.remove_region(region.id);
+            assert_eq!(meta.get_state(), RegionState::Pending);
+            // try to update the outdated region.
+            if !in_flashback && meta.can_be_updated_to(region) {
+                info!("ime update outdated pending region";
+                    "current_meta" => ?meta,
+                    "new_region" => ?region);
+                // the new region's range is smaller than the removed region, so it is
+                // impossible for it to overlap with other existing regions.
+                regions_map.load_region(region.clone()).unwrap();
+                region_meta = regions_map.mut_region_meta(region.id).unwrap();
+            } else {
+                fail_point!("ime_fail_to_schedule_load");
+                info!("ime remove outdated pending region";
+                    "pending_region" => ?meta.get_region(),
+                    "new_region" => ?region);
+                return RegionCacheStatus::NotInCache;
+            }
+        }
+
+        let mut region_state = region_meta.get_state();
+        let schedule_load = region_state == RegionState::Pending;
+        if schedule_load {
+            region_meta.set_state(RegionState::Loading);
+            info!(
+                "ime region to load";
+                "region" => ?region,
+                "cached" => cached_count,
+            );
+            region_state = RegionState::Loading;
+        }
+
+        let mut result = RegionCacheStatus::NotInCache;
+        if region_state == RegionState::Loading || region_state == RegionState::Active {
+            if should_set_in_written {
+                region_meta.set_being_written();
+            }
+            if region_state == RegionState::Active {
+                result = RegionCacheStatus::Cached;
+            } else {
+                result = RegionCacheStatus::Loading;
+            }
+        }
+        drop(regions_map);
+
+        // get the snapshot and schedule the loading task last to avoid locking IME
+        // for too long.
+        if schedule_load {
+            let rocks_snap = Arc::new(rocks_engine.unwrap().snapshot());
+            if let Err(e) =
+                scheduler.schedule(BackgroundTask::LoadRegion(region.clone(), rocks_snap))
+            {
+                error!(
+                    "ime schedule region load failed";
+                    "err" => ?e,
+                    "region" => ?region,
+                );
+                assert!(tikv_util::thread_group::is_shutdown(!cfg!(test)));
+            }
+        }
+
+        result
+    }
+}
+
+/// The RegionCacheMemoryEngine serves as a region cache, storing hot regions in
+/// the leaders' store. Incoming writes that are written to the disk engine
+/// (currently RocksDB) are also written to the RegionCacheMemoryEngine, leading
+/// to a mirrored data set in the cached regions with the disk engine.
+///
+/// A load/evict unit manages the memory, deciding which regions should be
+/// evicted when the memory used by the RegionCacheMemoryEngine reaches a
+/// certain limit, and determining which regions should be loaded when there is
+/// spare memory capacity.
+///
+/// The safe point lifetime differs between the RegionCacheMemoryEngine and the
+/// disk engine, often being much shorter in the RegionCacheMemoryEngine. This
+/// means that the RegionCacheMemoryEngine may filter out some keys that still
+/// exist in the disk engine, thereby improving read performance as fewer
+/// duplicated keys will be read. If there's a need to read keys that may have
+/// been filtered by the RegionCacheMemoryEngine (as indicated by read_ts and
+/// safe_point of the cached region), we resort to using the disk engine's
+/// snapshot instead.
+#[derive(Clone)]
+pub struct RegionCacheMemoryEngine {
+    bg_work_manager: Arc<BgWorkManager>,
+    pub(crate) core: Arc<RegionCacheMemoryEngineCore>,
+    pub(crate) rocks_engine: Option<RocksEngine>,
+    memory_controller: Arc<MemoryController>,
+    statistics: Arc<Statistics>,
+    config: Arc<VersionTrack<InMemoryEngineConfig>>,
+
+    // The increment amount of tombstones in the lock cf.
+    // When reaching the threshold, a CleanLockTombstone task will be scheduled to
+    // clean lock cf tombstones.
+    pub(crate) lock_modification_bytes: Arc<AtomicU64>,
+}
+
+impl RegionCacheMemoryEngine {
+    pub fn new(in_memory_engine_context: InMemoryEngineContext) -> Self {
+        RegionCacheMemoryEngine::with_region_info_provider(in_memory_engine_context, None, None)
+    }
+
+    pub fn with_region_info_provider(
+        in_memory_engine_context: InMemoryEngineContext,
+        region_info_provider: Option<Arc<dyn RegionInfoProvider>>,
+        raft_casual_router: Option<Box<dyn CasualRouter<RocksEngine>>>,
+    ) -> Self {
+        let core = Arc::new(RegionCacheMemoryEngineCore::new());
+        let skiplist_engine = core.engine().clone();
+
+        let InMemoryEngineContext {
+            config,
+            statistics,
+            pd_client,
+        } = in_memory_engine_context;
+        assert!(config.value().enable);
+        let memory_controller = Arc::new(MemoryController::new(config.clone(), skiplist_engine));
+
+        let bg_work_manager = Arc::new(BgWorkManager::new(
+            core.clone(),
+            pd_client,
+            config.clone(),
+            memory_controller.clone(),
+            region_info_provider,
+            raft_casual_router,
+        ));
+
+        Self {
+            core,
+            rocks_engine: None,
+            bg_work_manager,
+            memory_controller,
+            statistics,
+            config,
+            lock_modification_bytes: Arc::default(),
+        }
+    }
+
+    pub fn new_region(&self, region: Region) {
+        let cache_region = CacheRegion::from_region(&region);
+        self.core.region_manager.new_region(cache_region);
+    }
+
+    pub fn load_region(&self, cache_region: CacheRegion) -> result::Result<(), LoadFailedReason> {
+        self.core.region_manager().load_region(cache_region)
+    }
+
+    // Used for benchmarks.
+    pub fn must_set_region_state(&self, id: u64, state: RegionState) {
+        let mut regions_map = self.core.region_manager().regions_map().write();
+        let meta = regions_map.mut_region_meta(id).unwrap();
+        meta.set_state(state);
+    }
+
+    /// Evict a region from the in-memory engine. After this call, the region
+    /// will not be readable, but the data of the region may not be deleted
+    /// immediately due to some ongoing snapshots.
+    pub fn evict_region(
+        &self,
+        region: &CacheRegion,
+        evict_reason: EvictReason,
+        cb: Option<Box<dyn AsyncFnOnce + Send + Sync>>,
+    ) {
+        let deletable_regions = self
+            .core
+            .region_manager
+            .evict_region(region, evict_reason, cb);
+        if !deletable_regions.is_empty() {
+            // These regions can be deleted directly.
+            if let Err(e) = self
+                .bg_worker_manager()
+                .schedule_task(BackgroundTask::DeleteRegions(deletable_regions))
+            {
+                error!(
+                    "ime schedule delete region failed";
+                    "err" => ?e,
+                );
+                assert!(tikv_util::thread_group::is_shutdown(!cfg!(test)));
+            }
+        }
+    }
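+
+    // A minimal eviction sketch (hypothetical caller; the concrete
+    // `EvictReason` variants are defined in engine_traits):
+    //
+    //   engine.evict_region(&cache_region, evict_reason, None);
+    //
+    // New snapshots on the region fail immediately afterwards, while the data
+    // itself is removed in the background once ongoing snapshots are dropped.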
+    // It handles the pending region and checks whether to buffer writes for
+    // this region.
+    pub(crate) fn prepare_for_apply(
+        &self,
+        region: &CacheRegion,
+        in_flashback: bool,
+    ) -> RegionCacheStatus {
+        self.core.prepare_for_apply(
+            region,
+            self.rocks_engine.as_ref(),
+            self.bg_work_manager.background_scheduler(),
+            true,
+            in_flashback,
+        )
+    }
+
+    pub fn bg_worker_manager(&self) -> &BgWorkManager {
+        &self.bg_work_manager
+    }
+
+    pub fn memory_controller(&self) -> Arc<MemoryController> {
+        self.memory_controller.clone()
+    }
+
+    pub fn statistics(&self) -> Arc<Statistics> {
+        self.statistics.clone()
+    }
+
+    pub fn start_cross_check(
+        &self,
+        rocks_engine: RocksEngine,
+        pd_client: Arc<dyn PdClient>,
+        get_tikv_safe_point: Box<dyn Fn() -> Option<u64> + Send>,
+    ) {
+        let cross_check_interval = self.config.value().cross_check_interval;
+        if !cross_check_interval.is_zero() {
+            if let Err(e) =
+                self.bg_worker_manager()
+                    .schedule_task(BackgroundTask::TurnOnCrossCheck((
+                        self.clone(),
+                        rocks_engine,
+                        pd_client,
+                        cross_check_interval.0,
+                        get_tikv_safe_point,
+                    )))
+            {
+                error!(
+                    "schedule TurnOnCrossCheck failed";
+                    "err" => ?e,
+                );
+                assert!(tikv_util::thread_group::is_shutdown(!cfg!(test)));
+            }
+        }
+    }
+}
+
+impl RegionCacheMemoryEngine {
+    pub fn core(&self) -> &Arc<RegionCacheMemoryEngineCore> {
+        &self.core
+    }
+}
+
+impl Debug for RegionCacheMemoryEngine {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        write!(f, "Region Cache Memory Engine")
+    }
+}
+
+impl RegionCacheEngine for RegionCacheMemoryEngine {
+    type Snapshot = RegionCacheSnapshot;
+
+    fn snapshot(
+        &self,
+        region: CacheRegion,
+        read_ts: u64,
+        seq_num: u64,
+    ) -> result::Result<Self::Snapshot, FailedReason> {
+        RegionCacheSnapshot::new(self.clone(), region, read_ts, seq_num)
+    }
+
+    type DiskEngine = RocksEngine;
+    fn set_disk_engine(&mut self, disk_engine: Self::DiskEngine) {
+        self.rocks_engine = Some(disk_engine.clone());
+        if let Err(e) = self
+            .bg_worker_manager()
+            .schedule_task(BackgroundTask::SetRocksEngine(disk_engine))
+        {
+            error!(
+                "ime schedule set rocks_engine failed";
+                "err" => ?e,
+            );
+            assert!(tikv_util::thread_group::is_shutdown(!cfg!(test)));
+        }
+    }
+
+    type RangeHintService = PdRangeHintService;
+    fn start_hint_service(&self, range_hint_service: Self::RangeHintService) {
+        self.bg_worker_manager()
+            .start_bg_hint_service(range_hint_service)
+    }
+
+    fn get_region_for_key(&self, key: &[u8]) -> Option<CacheRegion> {
+        self.core.region_manager().get_region_for_key(key)
+    }
+
+    fn enabled(&self) -> bool {
+        self.config.value().enable
+    }
+}
+
+impl RegionCacheEngineExt for RegionCacheMemoryEngine {
+    fn on_region_event(&self, event: RegionEvent) {
+        match event {
+            RegionEvent::Eviction { region, reason } => {
+                self.evict_region(&region, reason, None);
+            }
+            RegionEvent::TryLoad {
+                region,
+                for_manual_range,
+            } => {
+                if for_manual_range {
+                    if self
+                        .core
+                        .region_manager()
+                        .regions_map()
+                        .read()
+                        .overlap_with_manual_load_range(&region)
+                    {
+                        info!(
+                            "ime try to load region in manual load range";
+                            "region" => ?region,
+                        );
+                        if let Err(e) = self.load_region(region.clone()) {
+                            warn!(
+                                "ime load region failed";
+                                "err" => ?e,
+                                "region" => ?region,
+                            );
+                        }
+                    }
+                } else if let Err(e) = self.core.region_manager().load_region(region.clone()) {
+                    warn!(
+                        "ime load region failed";
+                        "error" => ?e,
+                        "region" => ?region,
+                    );
+                }
+            }
+            RegionEvent::Split {
+                source,
+                new_regions,
+            } => {
+                self.core.region_manager.split_region(&source, new_regions);
+            }
+            RegionEvent::EvictByRange { range, reason } => {
+                let mut regions = vec![];
+                {
+                    let regions_map = self.core.region_manager.regions_map.read();
+                    regions_map.iter_overlapped_regions(&range, |meta| {
+                        assert!(meta.get_region().overlaps(&range));
+                        regions.push(meta.get_region().clone());
+                        true
+                    });
+                }
+
+                for r in regions {
+                    self.evict_region(&r, reason, None);
+                }
+            }
+        }
+    }
+
+    fn region_cached(&self, region: &Region) -> bool {
+        let regions_map = self.core.region_manager().regions_map().read();
+        if let Some(meta) = regions_map.region_meta(region.get_id()) {
+            matches!(meta.get_state(), RegionState::Active | RegionState::Loading)
+        } else {
+            false
+        }
+    }
+
+    fn load_region(&self, region: &Region) {
+        self.on_region_event(RegionEvent::TryLoad {
+            region: CacheRegion::from_region(region),
+            for_manual_range: false,
+        });
+    }
+}
+
+impl Iterable for RegionCacheMemoryEngine {
+    type Iterator = RegionCacheIterator;
+
+    fn iterator_opt(&self, _: &str, _: IterOptions) -> Result<Self::Iterator> {
+        // Iterators must be created from a snapshot, not from the engine directly.
+        panic!("iterator_opt is not supported on RegionCacheMemoryEngine directly")
+    }
+}
+
+#[cfg(test)]
+pub mod tests {
+    use std::{sync::Arc, time::Duration};
+
+    use crossbeam::epoch;
+    use engine_rocks::util::new_engine;
+    use engine_traits::{
+        CacheRegion, EvictReason, Mutable, RegionCacheEngine, RegionCacheEngineExt, RegionEvent,
+        WriteBatch, WriteBatchExt, CF_DEFAULT, CF_LOCK, CF_WRITE, DATA_CFS,
+    };
+    use tikv_util::config::{ReadableDuration, VersionTrack};
+    use tokio::{
+        runtime::Builder,
+        sync::{mpsc, Mutex},
+        time::timeout,
+    };
+
+    use super::SkiplistEngine;
+    use crate::{
+        keys::{construct_key, construct_user_key, encode_key},
+        memory_controller::MemoryController,
+        region_manager::{CacheRegionMeta, RegionManager, RegionState::*},
+        test_util::new_region,
+        InMemoryEngineConfig, InMemoryEngineContext, InternalBytes, RegionCacheMemoryEngine,
+        ValueType,
+    };
+
+    fn count_region(mgr: &RegionManager, mut f: impl FnMut(&CacheRegionMeta) -> bool) -> usize {
+        let regions_map = mgr.regions_map.read();
+        regions_map.regions().values().filter(|m| f(m)).count()
+    }
+    #[test]
+    fn test_region_overlap_with_outdated_epoch() {
+        let engine = RegionCacheMemoryEngine::new(InMemoryEngineContext::new_for_tests(Arc::new(
+            VersionTrack::new(InMemoryEngineConfig::config_for_test()),
+        )));
+        let region1 = new_region(1, b"k1", b"k3");
+        let cache_region1 = CacheRegion::from_region(&region1);
+        engine.load_region(cache_region1).unwrap();
+
+        let mut region2 = new_region(1, b"k1", b"k5");
+        region2.mut_region_epoch().version = 2;
+        engine.prepare_for_apply(&CacheRegion::from_region(&region2), false);
+        assert_eq!(
+            count_region(engine.core.region_manager(), |m| {
+                matches!(m.get_state(), Pending | Loading)
+            }),
+            0
+        );
+
+        let region1 = new_region(1, b"k1", b"k3");
+        let cache_region1 = CacheRegion::from_region(&region1);
+        engine.load_region(cache_region1).unwrap();
+
+        let mut region2 = new_region(1, b"k2", b"k5");
+        region2.mut_region_epoch().version = 2;
+        engine.prepare_for_apply(&CacheRegion::from_region(&region2), false);
+        assert_eq!(
+            count_region(engine.core.region_manager(), |m| {
+                matches!(m.get_state(), Pending | Loading)
+            }),
+            0
+        );
+    }
+
+    #[test]
+    fn test_delete_range() {
+        let delete_range_cf = |cf| {
+            let skiplist = SkiplistEngine::default();
+            let handle = skiplist.cf_handle(cf);
+
+            let config = Arc::new(VersionTrack::new(InMemoryEngineConfig::config_for_test()));
+            let mem_controller = Arc::new(MemoryController::new(config.clone(), skiplist.clone()));
+
+            let guard = &epoch::pin();
+
+            let insert_kv = |k, mvcc, v: &[u8], seq| {
+                let user_key = construct_key(k, mvcc);
+                let mut key =
encode_key(&user_key, seq, ValueType::Value); + let mut val = InternalBytes::from_vec(v.to_vec()); + key.set_memory_controller(mem_controller.clone()); + val.set_memory_controller(mem_controller.clone()); + handle.insert(key, val, guard); + }; + + insert_kv(0, 1, b"val", 100); + insert_kv(1, 2, b"val", 101); + insert_kv(1, 3, b"val", 102); + insert_kv(2, 2, b"val", 103); + insert_kv(9, 2, b"val", 104); + insert_kv(10, 2, b"val", 105); + + let start = construct_user_key(1); + let end = construct_user_key(10); + let range = CacheRegion::new(1, 0, start, end); + skiplist.delete_range(&range); + + let mut iter = handle.iterator(); + iter.seek_to_first(guard); + let expect = construct_key(0, 1); + let expect = encode_key(&expect, 100, ValueType::Value); + assert_eq!(iter.key(), &expect); + iter.next(guard); + + let expect = construct_key(10, 2); + let expect = encode_key(&expect, 105, ValueType::Value); + assert_eq!(iter.key(), &expect); + iter.next(guard); + assert!(!iter.valid()); + }; + delete_range_cf(CF_DEFAULT); + delete_range_cf(CF_WRITE); + } + + #[test] + fn test_delete_range_for_lock_cf() { + let skiplist = SkiplistEngine::default(); + let lock_handle = skiplist.cf_handle(CF_LOCK); + + let config = Arc::new(VersionTrack::new(InMemoryEngineConfig::config_for_test())); + let mem_controller = Arc::new(MemoryController::new(config.clone(), skiplist.clone())); + + let guard = &epoch::pin(); + + let insert_kv = |k, v: &[u8], seq| { + let mut key = encode_key(k, seq, ValueType::Value); + let mut val = InternalBytes::from_vec(v.to_vec()); + key.set_memory_controller(mem_controller.clone()); + val.set_memory_controller(mem_controller.clone()); + lock_handle.insert(key, val, guard); + }; + + insert_kv(b"k", b"val", 100); + insert_kv(b"k1", b"val1", 101); + insert_kv(b"k2", b"val2", 102); + insert_kv(b"k3", b"val3", 103); + insert_kv(b"k4", b"val4", 104); + + let range = CacheRegion::new(1, 0, b"k1".to_vec(), b"k4".to_vec()); + skiplist.delete_range(&range); + + let mut iter = lock_handle.iterator(); + iter.seek_to_first(guard); + let expect = encode_key(b"k", 100, ValueType::Value); + assert_eq!(iter.key(), &expect); + + iter.next(guard); + let expect = encode_key(b"k4", 104, ValueType::Value); + assert_eq!(iter.key(), &expect); + + iter.next(guard); + assert!(!iter.valid()); + } + + #[test] + fn test_is_active() { + let mut engine = RegionCacheMemoryEngine::new(InMemoryEngineContext::new_for_tests( + Arc::new(VersionTrack::new(InMemoryEngineConfig::config_for_test())), + )); + let path = tempfile::Builder::new() + .prefix("test_is_active") + .tempdir() + .unwrap(); + let path_str = path.path().to_str().unwrap(); + let rocks_engine = new_engine(path_str, DATA_CFS).unwrap(); + engine.set_disk_engine(rocks_engine.clone()); + + let region = new_region(1, b"k00", b"k30"); + let cache_region = CacheRegion::from_region(®ion); + engine.load_region(cache_region.clone()).unwrap(); + assert!(engine.core.region_manager.is_active()); + + let mut wb = engine.write_batch(); + wb.prepare_for_region(®ion); + wb.put(b"zk00", b"v1").unwrap(); + wb.put(b"zk10", b"v1").unwrap(); + wb.put(b"zk20", b"v1").unwrap(); + wb.set_sequence_number(1).unwrap(); + wb.write().unwrap(); + + test_util::eventually( + Duration::from_millis(10), + Duration::from_millis(1000), + || { + let regions_map = engine.core.region_manager().regions_map().read(); + regions_map.region_meta(1).unwrap().get_state() == Active + }, + ); + + let mut wb = engine.write_batch(); + wb.prepare_for_region(®ion); + wb.put(b"zk10", b"v2").unwrap(); + 
wb.set_sequence_number(10).unwrap(); + + // trigger split and eviction during write. + let new_regions = vec![ + CacheRegion::new(1, 1, "zk00", "zk10"), + CacheRegion::new(2, 1, "zk10", "zk20"), + CacheRegion::new(3, 1, "zk20", "zk30"), + ]; + engine.on_region_event(RegionEvent::Split { + source: cache_region.clone(), + new_regions: new_regions.clone(), + }); + + engine.on_region_event(RegionEvent::Eviction { + region: new_regions[0].clone(), + reason: EvictReason::AutoEvict, + }); + + // trigger split again + let split_regions = vec![ + CacheRegion::new(2, 2, "zk10", "zk13"), + CacheRegion::new(4, 2, "zk13", "zk16"), + CacheRegion::new(5, 2, "zk16", "zk20"), + ]; + engine.on_region_event(RegionEvent::Split { + source: new_regions[1].clone(), + new_regions: split_regions.clone(), + }); + + { + let regions_map = engine.core.region_manager.regions_map.read(); + assert!(regions_map.regions().values().all(|m| m.is_written())); + } + wb.write().unwrap(); + { + let regions_map = engine.core.region_manager.regions_map.read(); + assert!(regions_map.regions().values().all(|m| !m.is_written())); + } + + engine.on_region_event(RegionEvent::Eviction { + region: cache_region, + reason: EvictReason::AutoEvict, + }); + + // engine should become inactive after all regions are evicted. + test_util::eventually( + Duration::from_millis(10), + Duration::from_millis(1000), + || !engine.core.region_manager.is_active(), + ); + } + + #[test] + fn test_cb_on_eviction_with_on_going_snapshot() { + let mut config = InMemoryEngineConfig::config_for_test(); + config.gc_run_interval = ReadableDuration(Duration::from_secs(1)); + let engine = RegionCacheMemoryEngine::new(InMemoryEngineContext::new_for_tests(Arc::new( + VersionTrack::new(config), + ))); + + let region = new_region(1, b"", b"z"); + let cache_region = CacheRegion::from_region(®ion); + engine.new_region(region.clone()); + + let mut wb = engine.write_batch(); + wb.prepare_for_region(®ion); + wb.set_sequence_number(10).unwrap(); + wb.put(b"a", b"val1").unwrap(); + wb.put(b"b", b"val2").unwrap(); + wb.put(b"c", b"val3").unwrap(); + wb.write().unwrap(); + + let snap = engine.snapshot(cache_region.clone(), 100, 100).unwrap(); + + let (tx, rx) = mpsc::channel(1); + engine.evict_region( + &cache_region, + EvictReason::BecomeFollower, + Some(Box::new(move || { + Box::pin(async move { + let _ = tx.send(()).await; + }) + })), + ); + + let rt = Builder::new_current_thread().enable_all().build().unwrap(); + let rx = Arc::new(Mutex::new(rx)); + let rx_clone = rx.clone(); + rt.block_on(async move { + timeout(Duration::from_secs(1), rx_clone.lock().await.recv()) + .await + .unwrap_err() + }); + drop(snap); + rt.block_on(async move { rx.lock().await.recv().await.unwrap() }); + + { + let regions_map = engine.core().region_manager().regions_map().read(); + assert!(regions_map.region_meta(1).is_none()); + } + } +} diff --git a/components/range_cache_memory_engine/src/keys.rs b/components/in_memory_engine/src/keys.rs similarity index 88% rename from components/range_cache_memory_engine/src/keys.rs rename to components/in_memory_engine/src/keys.rs index e7705cdf3e0..90845af7f54 100644 --- a/components/range_cache_memory_engine/src/keys.rs +++ b/components/in_memory_engine/src/keys.rs @@ -3,11 +3,12 @@ use core::slice::SlicePattern; use std::{ cmp::{self, Ordering}, + fmt, sync::Arc, }; use bytes::{BufMut, Bytes}; -use engine_traits::CacheRange; +use engine_traits::CacheRegion; use txn_types::{Key, TimeStamp}; use crate::{memory_controller::MemoryController, 
write_batch::MEM_CONTROLLER_OVERHEAD};
@@ -115,14 +116,7 @@ impl Ord for InternalBytes {
                 .unwrap(),
         );
 
-        #[allow(clippy::comparison_chain)]
-        if n1 < n2 {
-            Ordering::Greater
-        } else if n1 > n2 {
-            Ordering::Less
-        } else {
-            Ordering::Equal
-        }
+        n2.cmp(&n1)
     }
 }
 
@@ -154,6 +148,7 @@ impl TryFrom<u8> for ValueType {
     }
 }
 
+#[derive(PartialEq)]
 pub struct InternalKey<'a> {
     // key with mvcc version in memory comparable format
     pub user_key: &'a [u8],
@@ -161,6 +156,18 @@ pub struct InternalKey<'a> {
     pub sequence: u64,
 }
 
+impl<'a> fmt::Debug for InternalKey<'a> {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        write!(
+            f,
+            "(key: {:?}, type: {:?}, seq: {})",
+            log_wrappers::Value(self.user_key),
+            self.v_type,
+            self.sequence
+        )
+    }
+}
+
 // The size of sequence number suffix
 pub const ENC_KEY_SEQ_LENGTH: usize = std::mem::size_of::<u64>();
 
@@ -223,20 +230,20 @@ pub fn encode_seek_for_prev_key(key: &[u8], seq: u64) -> InternalBytes {
     encode_internal_bytes(key, seq, VALUE_TYPE_FOR_SEEK_FOR_PREV)
 }
 
-// range keys deos not contain mvcc version and sequence number
+// region keys do not contain mvcc version and sequence number
 #[inline]
-pub fn encode_key_for_boundary_with_mvcc(range: &CacheRange) -> (InternalBytes, InternalBytes) {
+pub fn encode_key_for_boundary_with_mvcc(region: &CacheRegion) -> (InternalBytes, InternalBytes) {
     // Both encoded_start and encoded_end should be the smallest key in the
     // respective of user key (with mvcc version), so that the iterations cover all
-    // versions of the range start and covers nothing of range end.
+    // versions of the region start and cover nothing of the region end.
     // TODO: can we avoid one clone
-    let start_mvcc_key = Key::from_encoded(range.start.to_vec())
+    let start_mvcc_key = Key::from_encoded(region.start.to_vec())
         .append_ts(TimeStamp::max())
         .into_encoded();
     let encoded_start = encode_key(&start_mvcc_key, u64::MAX, VALUE_TYPE_FOR_SEEK);
 
-    let end_mvcc_key = Key::from_encoded(range.end.to_vec())
+    let end_mvcc_key = Key::from_encoded(region.end.to_vec())
         .append_ts(TimeStamp::max())
         .into_encoded();
     let encoded_end = encode_key(&end_mvcc_key, u64::MAX, VALUE_TYPE_FOR_SEEK);
@@ -245,14 +252,16 @@ pub fn encode_key_for_boundary_with_mvcc(range: &CacheRange) -> (InternalBytes,
 }
 
 #[inline]
-pub fn encode_key_for_boundary_without_mvcc(range: &CacheRange) -> (InternalBytes, InternalBytes) {
+pub fn encode_key_for_boundary_without_mvcc(
+    region: &CacheRegion,
+) -> (InternalBytes, InternalBytes) {
     // Both encoded_start and encoded_end should be the smallest key in the
     // respective of user key (without mvcc version), so that the iterations cover
-    // all versions of the range start and covers nothing of range end.
+    // all versions of the region start and cover nothing of the region end.
     // TODO: can we avoid one clone
-    let encoded_start = encode_key(&range.start, u64::MAX, VALUE_TYPE_FOR_SEEK);
-    let encoded_end = encode_key(&range.end, u64::MAX, VALUE_TYPE_FOR_SEEK);
+    let encoded_start = encode_key(&region.start, u64::MAX, VALUE_TYPE_FOR_SEEK);
+    let encoded_end = encode_key(&region.end, u64::MAX, VALUE_TYPE_FOR_SEEK);
 
     (encoded_start, encoded_end)
 }
@@ -268,13 +277,19 @@ pub fn encoding_for_filter(mvcc_prefix: &[u8], start_ts: TimeStamp) -> InternalB
 
 #[cfg(test)]
 pub fn construct_user_key(i: u64) -> Vec<u8> {
+    let k = format!("zk{:08}", i);
+    k.as_bytes().to_owned()
+}
+
+#[cfg(test)]
+pub fn construct_region_key(i: u64) -> Vec<u8> {
     let k = format!("k{:08}", i);
     k.as_bytes().to_owned()
 }
 
 #[cfg(test)]
 pub fn construct_key(i: u64, ts: u64) -> Vec<u8> {
-    let k = format!("k{:08}", i);
+    let k = format!("zk{:08}", i);
     Key::from_encoded(k.as_bytes().to_vec())
         .append_ts(TimeStamp::new(ts))
         .into_encoded()
diff --git a/components/in_memory_engine/src/lib.rs b/components/in_memory_engine/src/lib.rs
new file mode 100644
index 00000000000..6de1a74d036
--- /dev/null
+++ b/components/in_memory_engine/src/lib.rs
@@ -0,0 +1,92 @@
+// Copyright 2023 TiKV Project Authors. Licensed under Apache-2.0.
+
+#![feature(assert_matches)]
+#![feature(let_chains)]
+#![allow(internal_features)]
+#![feature(core_intrinsics)]
+#![feature(slice_pattern)]
+#![feature(trait_alias)]
+
+use std::sync::Arc;
+
+use futures::future::ready;
+use pd_client::PdClient;
+use tikv_util::config::VersionTrack;
+
+mod background;
+pub mod config;
+mod cross_check;
+mod engine;
+mod keys;
+mod memory_controller;
+mod metrics;
+mod perf_context;
+#[cfg(test)]
+mod prop_test;
+mod read;
+mod region_label;
+mod region_manager;
+mod region_stats;
+mod statistics;
+pub mod test_util;
+mod write_batch;
+
+pub use background::{BackgroundRunner, BackgroundTask, GcTask};
+pub use config::InMemoryEngineConfig;
+pub use engine::{RegionCacheMemoryEngine, SkiplistHandle};
+pub use keys::{
+    decode_key, encode_key_for_boundary_without_mvcc, encoding_for_filter, InternalBytes,
+    InternalKey, ValueType,
+};
+pub use metrics::flush_in_memory_engine_statistics;
+pub use read::RegionCacheSnapshot;
+pub use region_manager::{RegionCacheStatus, RegionState};
+pub use statistics::Statistics as InMemoryEngineStatistics;
+use txn_types::TimeStamp;
+pub use write_batch::RegionCacheWriteBatch;
+
+#[derive(Clone)]
+pub struct InMemoryEngineContext {
+    config: Arc<VersionTrack<InMemoryEngineConfig>>,
+    statistics: Arc<InMemoryEngineStatistics>,
+    pd_client: Arc<dyn PdClient>,
+}
+
+impl InMemoryEngineContext {
+    pub fn new(
+        config: Arc<VersionTrack<InMemoryEngineConfig>>,
+        pd_client: Arc<dyn PdClient>,
+    ) -> InMemoryEngineContext {
+        InMemoryEngineContext {
+            config,
+            statistics: Arc::default(),
+            pd_client,
+        }
+    }
+
+    pub fn new_for_tests(config: Arc<VersionTrack<InMemoryEngineConfig>>) -> InMemoryEngineContext {
+        struct MockPdClient;
+        impl PdClient for MockPdClient {
+            fn get_tso(&self) -> pd_client::PdFuture<TimeStamp> {
+                Box::pin(ready(Ok(TimeStamp::compose(TimeStamp::physical_now(), 0))))
+            }
+        }
+        InMemoryEngineContext {
+            config,
+            statistics: Arc::default(),
+            pd_client: Arc::new(MockPdClient),
+        }
+    }
+
+    pub fn pd_client(&self) -> Arc<dyn PdClient> {
+        self.pd_client.clone()
+    }
+
+    pub fn config(&self) -> &Arc<VersionTrack<InMemoryEngineConfig>> {
+        &self.config
+    }
+
+    pub fn statistics(&self) -> Arc<InMemoryEngineStatistics> {
+        self.statistics.clone()
+    }
+}
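Both boundary encoders above lean on the same trick: appending `TimeStamp::max()` to a region boundary yields the smallest encoded key among all MVCC versions of that user key, because timestamps are stored inverted in the encoded form. An iteration bounded this way therefore covers every version of the start key and none of the end key. A compact sketch of the idea (the helper name is ours, not the patch's):

```rust
use txn_types::{Key, TimeStamp};

// Produce an MVCC-encoded lower bound for a region boundary. TimeStamp::max()
// sorts before every real version of the same user key, so the bound covers
// all versions of the boundary key.
fn mvcc_boundary(region_boundary: &[u8]) -> Vec<u8> {
    Key::from_encoded(region_boundary.to_vec())
        .append_ts(TimeStamp::max())
        .into_encoded()
}
```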
diff --git a/components/range_cache_memory_engine/src/memory_controller.rs b/components/in_memory_engine/src/memory_controller.rs
similarity index 65%
rename from components/range_cache_memory_engine/src/memory_controller.rs
rename to components/in_memory_engine/src/memory_controller.rs
index db51af4a202..b3a95c6f109 100644
--- a/components/range_cache_memory_engine/src/memory_controller.rs
+++ b/components/in_memory_engine/src/memory_controller.rs
@@ -11,16 +11,16 @@ use std::{
 use tikv_util::config::VersionTrack;
 
 use crate::{
-    engine::SkiplistEngine, write_batch::NODE_OVERHEAD_SIZE_EXPECTATION, RangeCacheEngineConfig,
+    engine::SkiplistEngine, write_batch::NODE_OVERHEAD_SIZE_EXPECTATION, InMemoryEngineConfig,
 };
 
 #[derive(Debug, PartialEq)]
 pub(crate) enum MemoryUsage {
     NormalUsage(usize),
-    SoftLimitReached(usize),
-    // usize here means the current memory usage and it's the usize in it adding with the memory
-    // acquiring exceeds the hard limit
-    HardLimitReached(usize),
+    EvictThresholdReached(usize),
+    // The usize value is the current memory usage; adding the amount being
+    // acquired would exceed the capacity.
+    CapacityReached(usize),
 }
 
 /// MemoryController is used to control the memory usage of the region cache
@@ -32,7 +32,7 @@ pub struct MemoryController {
     // Allocated memory for keys and values (node overhead is not included)
     // The number of writes that are buffered but not yet written.
     allocated: AtomicUsize,
-    config: Arc<VersionTrack<RangeCacheEngineConfig>>,
+    config: Arc<VersionTrack<InMemoryEngineConfig>>,
     memory_checking: AtomicBool,
     skiplist_engine: SkiplistEngine,
 }
@@ -41,8 +41,8 @@ impl fmt::Debug for MemoryController {
     fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
         f.debug_struct("MemoryController")
             .field("allocated", &self.allocated)
-            .field("soft_limit", &self.config.value().soft_limit_threshold())
-            .field("hard_limit", &self.config.value().hard_limit_threshold())
+            .field("capacity", &self.config.value().capacity())
+            .field("evict_threshold", &self.config.value().evict_threshold())
             .field("memory_checking", &self.memory_checking)
             .field("skiplist_engine", &self.skiplist_engine)
             .finish()
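With the renamed thresholds in place, `acquire` (in the hunks that follow) reports one of three states, and the capacity case rolls its reservation back before returning. A sketch of how a caller might consume those states; the handler below is hypothetical, the patch's real callers live in the write-batch code:

```rust
// Hypothetical caller: decide whether a buffered write may proceed.
fn try_reserve(controller: &MemoryController, bytes: usize) -> bool {
    match controller.acquire(bytes) {
        // Under the evict threshold: the write proceeds normally.
        MemoryUsage::NormalUsage(_) => true,
        // Over the evict threshold: the reservation stands, but background
        // eviction should start freeing memory.
        MemoryUsage::EvictThresholdReached(_) => true,
        // Would exceed capacity: `acquire` has already rolled back the
        // reservation, so the write must be rejected.
        MemoryUsage::CapacityReached(_) => false,
    }
}
```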
@@ -50,7 +50,7 @@
 }
 impl MemoryController {
     pub fn new(
-        config: Arc<VersionTrack<RangeCacheEngineConfig>>,
+        config: Arc<VersionTrack<InMemoryEngineConfig>>,
         skiplist_engine: SkiplistEngine,
    ) -> Self {
         Self {
@@ -64,18 +64,18 @@ impl MemoryController {
     pub(crate) fn acquire(&self, n: usize) -> MemoryUsage {
         let node_count = self.skiplist_engine.node_count();
 
-        // We dont count the node overhead in the write batch to reduce complexity as
-        // there overhead should be negligible
+        // We don't count the node overhead in the write batch to reduce
+        // complexity as the overhead should be negligible.
         let mem_usage = self.allocated.fetch_add(n, Ordering::Relaxed)
             + n
             + node_count * NODE_OVERHEAD_SIZE_EXPECTATION;
-        if mem_usage >= self.config.value().hard_limit_threshold() {
+        if mem_usage >= self.config.value().capacity() {
             self.allocated.fetch_sub(n, Ordering::Relaxed);
-            return MemoryUsage::HardLimitReached(mem_usage - n);
+            return MemoryUsage::CapacityReached(mem_usage - n);
         }
 
-        if mem_usage >= self.config.value().soft_limit_threshold() {
-            return MemoryUsage::SoftLimitReached(mem_usage);
+        if mem_usage >= self.config.value().evict_threshold() {
+            return MemoryUsage::EvictThresholdReached(mem_usage);
         }
 
         MemoryUsage::NormalUsage(mem_usage)
@@ -86,18 +86,19 @@ impl MemoryController {
     }
 
     #[inline]
-    pub(crate) fn reached_soft_limit(&self) -> bool {
-        self.mem_usage() >= self.config.value().soft_limit_threshold()
+    pub(crate) fn reached_stop_load_threshold(&self) -> bool {
+        self.mem_usage() >= self.config.value().stop_load_threshold()
     }
 
     #[inline]
-    pub(crate) fn soft_limit_threshold(&self) -> usize {
-        self.config.value().soft_limit_threshold()
+    pub(crate) fn evict_threshold(&self) -> usize {
+        self.config.value().evict_threshold()
     }
 
     #[inline]
-    pub(crate) fn set_memory_checking(&self, v: bool) {
-        self.memory_checking.store(v, Ordering::Relaxed);
+    // Returns the previous status.
+    pub(crate) fn set_memory_checking(&self, v: bool) -> bool {
+        self.memory_checking.swap(v, Ordering::Relaxed)
     }
 
     #[inline]
@@ -123,20 +124,17 @@ mod tests {
     #[test]
     fn test_memory_controller() {
         let skiplist_engine = SkiplistEngine::new();
-        let config = Arc::new(VersionTrack::new(RangeCacheEngineConfig {
-            enabled: true,
-            gc_interval: Default::default(),
-            load_evict_interval: Default::default(),
-            soft_limit_threshold: Some(ReadableSize(300)),
-            hard_limit_threshold: Some(ReadableSize(500)),
-            expected_region_size: Default::default(),
-        }));
+        let mut config = InMemoryEngineConfig::config_for_test();
+        config.stop_load_threshold = Some(ReadableSize(300));
+        config.evict_threshold = Some(ReadableSize(300));
+        config.capacity = Some(ReadableSize(500));
+        let config = Arc::new(VersionTrack::new(config));
         let mc = MemoryController::new(config, skiplist_engine.clone());
         assert_eq!(mc.acquire(100), MemoryUsage::NormalUsage(100));
         assert_eq!(mc.acquire(150), MemoryUsage::NormalUsage(250));
-        assert_eq!(mc.acquire(50), MemoryUsage::SoftLimitReached(300));
-        assert_eq!(mc.acquire(50), MemoryUsage::SoftLimitReached(350));
-        assert_eq!(mc.acquire(200), MemoryUsage::HardLimitReached(350));
+        assert_eq!(mc.acquire(50), MemoryUsage::EvictThresholdReached(300));
+        assert_eq!(mc.acquire(50), MemoryUsage::EvictThresholdReached(350));
+        assert_eq!(mc.acquire(200), MemoryUsage::CapacityReached(350));
         mc.release(50);
         assert_eq!(mc.mem_usage(), 300);
@@ -149,8 +147,8 @@ mod tests {
             guard,
         );
         assert_eq!(mc.mem_usage(), 396);
-        assert_eq!(mc.acquire(100), MemoryUsage::SoftLimitReached(496));
+        assert_eq!(mc.acquire(100), MemoryUsage::EvictThresholdReached(496));
         skiplist_engine.data[0].remove(entry.key(), guard);
-        assert_eq!(mc.acquire(99), MemoryUsage::SoftLimitReached(499));
+        assert_eq!(mc.acquire(99), MemoryUsage::EvictThresholdReached(499));
     }
 }
diff --git a/components/in_memory_engine/src/metrics.rs b/components/in_memory_engine/src/metrics.rs
new file mode 100644
index 00000000000..9b037c1c914
--- /dev/null
+++ b/components/in_memory_engine/src/metrics.rs
@@ -0,0 +1,300 @@
+// Copyright 2024 TiKV Project Authors. Licensed under Apache-2.0.
+ +use std::sync::Arc; + +use engine_traits::EvictReason; +use lazy_static::lazy_static; +use prometheus::*; +use prometheus_static_metric::*; + +use crate::{ + statistics::{Tickers, ENGINE_TICKER_TYPES}, + InMemoryEngineStatistics, +}; + +make_auto_flush_static_metric! { + pub label_enum KeyCountType { + total, + filtered, + below_safe_point_total, + below_safe_point_unique, + } + + pub label_enum TickerEnum { + bytes_read, + iter_bytes_read, + number_db_seek, + number_db_seek_found, + number_db_next, + number_db_next_found, + number_db_prev, + number_db_prev_found, + } + + pub label_enum EvictReasonType { + merge, + auto_evict, + load_failed, + load_failed_without_start, + delete_range, + become_follower, + memory_limit_reached, + disabled, + apply_snapshot, + flashback, + manual, + destroy_peer, + } + + pub label_enum OperationType { + put, + delete, + } + + pub label_enum CF { + default, + lock, + write, + } + + pub struct GcFilteredCountVec: LocalIntCounter { + "type" => KeyCountType, + } + + pub struct InMemoryEngineTickerMetrics: LocalIntCounter { + "type" => TickerEnum, + } + + pub struct EvictionDurationVec: LocalHistogram { + "type" => EvictReasonType, + } + + pub struct OperationTypeForCF: LocalIntCounter { + "type" => OperationType, + "cf" => CF, + } +} + +lazy_static! { + pub static ref GC_FILTERED: IntCounterVec = register_int_counter_vec!( + "tikv_in_memory_engine_gc_filtered", + "Filtered version by GC", + &["type"] + ) + .unwrap(); + pub static ref IN_MEMORY_ENGINE_MEMORY_USAGE: IntGauge = register_int_gauge!( + "tikv_in_memory_engine_memory_usage_bytes", + "The memory usage of the region cache engine", + ) + .unwrap(); + pub static ref IN_MEMORY_ENGINE_LOAD_TIME_HISTOGRAM: Histogram = register_histogram!( + "tikv_in_memory_engine_load_duration_secs", + "Bucketed histogram of region load time duration.", + exponential_buckets(0.001, 2.0, 20).unwrap() + ) + .unwrap(); + pub static ref IN_MEMORY_ENGINE_GC_TIME_HISTOGRAM: Histogram = register_histogram!( + "tikv_in_memory_engine_gc_duration_secs", + "Bucketed histogram of region gc time duration.", + exponential_buckets(0.001, 2.0, 20).unwrap() + ) + .unwrap(); + pub static ref IN_MEMORY_ENGINE_EVICTION_DURATION_HISTOGRAM: HistogramVec = + register_histogram_vec!( + "tikv_in_memory_engine_eviction_duration_secs", + "Bucketed histogram of region eviction time duration.", + &["type"], + exponential_buckets(0.001, 2.0, 20).unwrap() + ) + .unwrap(); + pub static ref IN_MEMORY_ENGINE_WRITE_DURATION_HISTOGRAM: Histogram = register_histogram!( + "tikv_in_memory_engine_write_duration_seconds", + "Bucketed histogram of write duration in region cache engine.", + exponential_buckets(0.00001, 2.0, 20).unwrap() + ) + .unwrap(); + pub static ref IN_MEMORY_ENGINE_PREPARE_FOR_WRITE_DURATION_HISTOGRAM: Histogram = + register_histogram!( + "tikv_in_memory_engine_prepare_for_write_duration_seconds", + "Bucketed histogram of prepare for write duration in region cache engine.", + exponential_buckets(0.00001, 2.0, 20).unwrap() + ) + .unwrap(); + pub static ref IN_MEMORY_ENGINE_CACHE_COUNT: IntGaugeVec = register_int_gauge_vec!( + "tikv_in_memory_engine_cache_count", + "The count of each type on region cache.", + &["type"] + ) + .unwrap(); + pub static ref IN_MEMORY_ENGINE_FLOW: IntCounterVec = register_int_counter_vec!( + "tikv_in_memory_engine_flow", + "Bytes and keys of read/written of in-memory engine", + &["type"] + ) + .unwrap(); + pub static ref IN_MEMORY_ENGINE_LOCATE: IntCounterVec = register_int_counter_vec!( + 
"tikv_in_memory_engine_locate", + "Number of calls to seek/next/prev", + &["type"] + ) + .unwrap(); + pub static ref IN_MEMORY_ENGINE_SEEK_DURATION: Histogram = register_histogram!( + "tikv_in_memory_engine_seek_duration", + "Histogram of seek duration", + exponential_buckets(0.00001, 2.0, 26).unwrap() + ) + .unwrap(); + pub static ref IN_MEMORY_ENGINE_KV_OPERATIONS: IntCounterVec = register_int_counter_vec!( + "tikv_in_memory_engine_kv_operations", + "Number of kv operations", + &["type", "cf"] + ).unwrap(); + pub static ref IN_MEMORY_ENGINE_OLDEST_SAFE_POINT: IntGauge = register_int_gauge!( + "tikv_in_memory_engine_oldest_safe_point", + "The oldest safe point in the in-memory engine", + ) + .unwrap(); + pub static ref IN_MEMORY_ENGINE_NEWEST_SAFE_POINT: IntGauge = register_int_gauge!( + "tikv_in_memory_engine_newest_safe_point", + "The newest safe point in the in-memory engine", + ) + .unwrap(); + pub static ref SAFE_POINT_GAP: IntGauge = register_int_gauge!( + "tikv_safe_point_gap_with_in_memory_engine", + "The gap between tikv auto gc safe point and the oldest auto gc safe point in the in-memory engine", + ) + .unwrap(); +} + +lazy_static! { + pub static ref IN_MEMORY_ENGINE_GC_FILTERED_STATIC: GcFilteredCountVec = + auto_flush_from!(GC_FILTERED, GcFilteredCountVec); + pub static ref IN_MEMORY_ENGINE_FLOW_STATIC: InMemoryEngineTickerMetrics = + auto_flush_from!(IN_MEMORY_ENGINE_FLOW, InMemoryEngineTickerMetrics); + pub static ref IN_MEMORY_ENGINE_LOCATE_STATIC: InMemoryEngineTickerMetrics = + auto_flush_from!(IN_MEMORY_ENGINE_LOCATE, InMemoryEngineTickerMetrics); + pub static ref IN_MEMORY_ENGINE_EVICTION_DURATION_HISTOGRAM_STATIC: EvictionDurationVec = auto_flush_from!( + IN_MEMORY_ENGINE_EVICTION_DURATION_HISTOGRAM, + EvictionDurationVec + ); + pub static ref IN_MEMORY_ENGINE_OPERATION_STATIC: OperationTypeForCF = + auto_flush_from!(IN_MEMORY_ENGINE_KV_OPERATIONS, OperationTypeForCF); +} + +pub fn flush_in_memory_engine_statistics(statistics: &Arc) { + for t in ENGINE_TICKER_TYPES { + let v = statistics.get_and_reset_ticker_count(*t); + flush_engine_ticker_metrics(*t, v); + } +} + +fn flush_engine_ticker_metrics(t: Tickers, value: u64) { + match t { + Tickers::BytesRead => { + IN_MEMORY_ENGINE_FLOW_STATIC.bytes_read.inc_by(value); + } + Tickers::IterBytesRead => { + IN_MEMORY_ENGINE_FLOW_STATIC.iter_bytes_read.inc_by(value); + } + Tickers::NumberDbSeek => { + IN_MEMORY_ENGINE_LOCATE_STATIC.number_db_seek.inc_by(value); + } + Tickers::NumberDbSeekFound => { + IN_MEMORY_ENGINE_LOCATE_STATIC + .number_db_seek_found + .inc_by(value); + } + Tickers::NumberDbNext => { + IN_MEMORY_ENGINE_LOCATE_STATIC.number_db_next.inc_by(value); + } + Tickers::NumberDbNextFound => { + IN_MEMORY_ENGINE_LOCATE_STATIC + .number_db_next_found + .inc_by(value); + } + Tickers::NumberDbPrev => { + IN_MEMORY_ENGINE_LOCATE_STATIC.number_db_prev.inc_by(value); + } + Tickers::NumberDbPrevFound => { + IN_MEMORY_ENGINE_LOCATE_STATIC + .number_db_prev_found + .inc_by(value); + } + _ => { + unreachable!() + } + } +} + +pub(crate) fn observe_eviction_duration(secs: f64, evict_reason: EvictReason) { + match evict_reason { + EvictReason::AutoEvict => IN_MEMORY_ENGINE_EVICTION_DURATION_HISTOGRAM_STATIC + .auto_evict + .observe(secs), + EvictReason::BecomeFollower => IN_MEMORY_ENGINE_EVICTION_DURATION_HISTOGRAM_STATIC + .become_follower + .observe(secs), + EvictReason::DeleteRange => IN_MEMORY_ENGINE_EVICTION_DURATION_HISTOGRAM_STATIC + .delete_range + .observe(secs), + EvictReason::LoadFailed => 
IN_MEMORY_ENGINE_EVICTION_DURATION_HISTOGRAM_STATIC + .load_failed + .observe(secs), + EvictReason::LoadFailedWithoutStart => IN_MEMORY_ENGINE_EVICTION_DURATION_HISTOGRAM_STATIC + .load_failed_without_start + .observe(secs), + EvictReason::MemoryLimitReached => IN_MEMORY_ENGINE_EVICTION_DURATION_HISTOGRAM_STATIC + .memory_limit_reached + .observe(secs), + EvictReason::Merge => IN_MEMORY_ENGINE_EVICTION_DURATION_HISTOGRAM_STATIC + .merge + .observe(secs), + EvictReason::Disabled => IN_MEMORY_ENGINE_EVICTION_DURATION_HISTOGRAM_STATIC + .disabled + .observe(secs), + EvictReason::ApplySnapshot => IN_MEMORY_ENGINE_EVICTION_DURATION_HISTOGRAM_STATIC + .apply_snapshot + .observe(secs), + EvictReason::Flashback => IN_MEMORY_ENGINE_EVICTION_DURATION_HISTOGRAM_STATIC + .flashback + .observe(secs), + EvictReason::Manual => IN_MEMORY_ENGINE_EVICTION_DURATION_HISTOGRAM_STATIC + .manual + .observe(secs), + EvictReason::PeerDestroy => IN_MEMORY_ENGINE_EVICTION_DURATION_HISTOGRAM_STATIC + .destroy_peer + .observe(secs), + } +} + +pub(crate) fn count_operations_for_cfs(put_operations: &[u64], delete_operations: &[u64]) { + // according to `cf_to_id`, we have 0 for CF_DEFAULT, 1 for CF_LOCK, and 2 for + // CF_WRITE + assert_eq!(put_operations.len(), 3); + assert_eq!(delete_operations.len(), 3); + IN_MEMORY_ENGINE_OPERATION_STATIC + .put + .default + .inc_by(put_operations[0]); + IN_MEMORY_ENGINE_OPERATION_STATIC + .put + .lock + .inc_by(put_operations[1]); + IN_MEMORY_ENGINE_OPERATION_STATIC + .put + .write + .inc_by(put_operations[2]); + + IN_MEMORY_ENGINE_OPERATION_STATIC + .delete + .default + .inc_by(delete_operations[0]); + IN_MEMORY_ENGINE_OPERATION_STATIC + .delete + .lock + .inc_by(delete_operations[1]); + IN_MEMORY_ENGINE_OPERATION_STATIC + .delete + .write + .inc_by(delete_operations[2]); +} diff --git a/components/range_cache_memory_engine/src/perf_context.rs b/components/in_memory_engine/src/perf_context.rs similarity index 100% rename from components/range_cache_memory_engine/src/perf_context.rs rename to components/in_memory_engine/src/perf_context.rs diff --git a/components/range_cache_memory_engine/src/prop_test.rs b/components/in_memory_engine/src/prop_test.rs similarity index 70% rename from components/range_cache_memory_engine/src/prop_test.rs rename to components/in_memory_engine/src/prop_test.rs index 8045a43cc51..f25d21e6733 100644 --- a/components/range_cache_memory_engine/src/prop_test.rs +++ b/components/in_memory_engine/src/prop_test.rs @@ -5,17 +5,21 @@ use std::sync::Arc; use crossbeam::epoch; use engine_rocks::{util::new_engine, RocksEngine}; use engine_traits::{ - CacheRange, CfName, Iterable, Iterator, Peekable, SyncMutable, CF_DEFAULT, CF_LOCK, CF_WRITE, + CacheRegion, CfName, Iterable, Iterator, Peekable, SyncMutable, CF_DEFAULT, CF_LOCK, CF_WRITE, }; use proptest::prelude::*; use tikv_util::config::{ReadableSize, VersionTrack}; +use txn_types::{Key, TimeStamp}; use super::engine::SkiplistHandle; use crate::{ decode_key, engine::SkiplistEngine, keys::encode_key, memory_controller::MemoryController, - InternalBytes, RangeCacheEngineConfig, + InMemoryEngineConfig, InternalBytes, }; +// This fixed mvcc suffix is used for CF_WRITE and CF_DEFAULT in prop test. 
+const MVCC_SUFFIX: u64 = 10;
+
 #[derive(Clone)]
 enum Operation {
     Put(Vec<u8>, Vec<u8>),
@@ -137,9 +141,9 @@ fn test_rocksdb_skiplist_basic_operations(cf: CfName, operations: Vec<Operation>
     )
     .unwrap();
 
-    let mut cfg = RangeCacheEngineConfig::default();
-    cfg.soft_limit_threshold = Some(ReadableSize::gb(1));
-    cfg.hard_limit_threshold = Some(ReadableSize::gb(2));
+    let mut cfg = InMemoryEngineConfig::default();
+    cfg.evict_threshold = Some(ReadableSize::gb(1));
+    cfg.capacity = Some(ReadableSize::gb(2));
     let controller = Arc::new(MemoryController::new(
         Arc::new(VersionTrack::new(cfg)),
         skiplist.clone(),
@@ -158,15 +162,27 @@ fn test_rocksdb_skiplist_basic_operations(cf: CfName, operations: Vec<Operation>
         (handle, key, value, guard)
     };
 
+    // Delete range in SkiplistEngine considers the MVCC suffix for CF_DEFAULT
+    // and CF_WRITE, so we append the suffix for them.
     for op in operations {
         match op {
-            Operation::Put(k, v) => {
+            Operation::Put(mut k, v) => {
+                if cf != CF_LOCK {
+                    k = Key::from_raw(&k)
+                        .append_ts(TimeStamp::new(MVCC_SUFFIX))
+                        .into_encoded();
+                }
                 db_rocks.put_cf(cf, &k, &v).unwrap();
                 let (handle, key, value, guard) = skiplist_args(k, Some(v));
                 handle.insert(key, value.unwrap(), &guard)
             }
-            Operation::Get(k) => {
+            Operation::Get(mut k) => {
+                if cf != CF_LOCK {
+                    k = Key::from_raw(&k)
+                        .append_ts(TimeStamp::new(MVCC_SUFFIX))
+                        .into_encoded();
+                }
                 let res_rocks = db_rocks.get_value_cf(cf, &k).unwrap();
                 let (handle, key, _value, guard) = skiplist_args(k, None);
                 let res_skiplist = handle.get(&key, &guard);
@@ -175,28 +191,45 @@ fn test_rocksdb_skiplist_basic_operations(cf: CfName, operations: Vec<Operation>
                     res_skiplist.map(|e| e.value().as_slice())
                 );
             }
-            Operation::Delete(k) => {
+            Operation::Delete(mut k) => {
+                if cf != CF_LOCK {
+                    k = Key::from_raw(&k)
+                        .append_ts(TimeStamp::new(MVCC_SUFFIX))
+                        .into_encoded();
+                }
                 db_rocks.delete_cf(cf, &k).unwrap();
                 let (handle, key, _value, guard) = skiplist_args(k, None);
                 handle.remove(&key, &guard)
             }
-            Operation::Scan(k, limit) => {
+            Operation::Scan(mut k, limit) => {
+                if cf != CF_LOCK {
+                    k = Key::from_raw(&k)
+                        .append_ts(TimeStamp::new(MVCC_SUFFIX))
+                        .into_encoded();
+                }
                 let res_rocks = scan_rocksdb(&db_rocks, cf, &k, limit);
                 let (handle, key, _value, _guard) = skiplist_args(k, None);
                 let res_titan = scan_skiplist(handle, &key, limit);
                 assert_eq!(res_rocks, res_titan);
             }
-            Operation::DeleteRange(k1, k2) => {
-                if k1 <= k2 {
-                    db_rocks.delete_range_cf(cf, &k1, &k2).unwrap();
-                    let range = CacheRange::new(k1.clone(), k2.clone());
-                    skiplist.delete_range_cf(cf, &range);
-                } else {
-                    db_rocks.delete_range_cf(cf, &k2, &k1).unwrap();
-                    let range = CacheRange::new(k2.clone(), k1.clone());
-                    skiplist.delete_range_cf(cf, &range);
+            Operation::DeleteRange(mut k1, mut k2) => {
+                if k1 > k2 {
+                    (k1, k2) = (k2, k1);
                 }
+
+                let range = CacheRegion::new(1, 0, k1.clone(), k2.clone());
+                skiplist.delete_range_cf(cf, &range);
+
+                if cf != CF_LOCK {
+                    k1 = Key::from_raw(&k1)
+                        .append_ts(TimeStamp::new(MVCC_SUFFIX))
+                        .into_encoded();
+                    k2 = Key::from_raw(&k2)
+                        .append_ts(TimeStamp::new(MVCC_SUFFIX))
+                        .into_encoded();
+                }
+                db_rocks.delete_range_cf(cf, &k1, &k2).unwrap();
             }
         }
     }
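Every arm above normalizes keys the same way before touching RocksDB: CF_DEFAULT and CF_WRITE keys get the fixed `MVCC_SUFFIX` timestamp appended, while CF_LOCK keys stay raw, matching what `delete_range` expects. The shared step could be factored out as below (a sketch; the patch inlines it in each arm):

```rust
use engine_traits::{CfName, CF_LOCK};
use txn_types::{Key, TimeStamp};

// Append the fixed MVCC suffix for the MVCC-encoded column families.
fn normalize_key(cf: CfName, k: Vec<u8>) -> Vec<u8> {
    if cf == CF_LOCK {
        k
    } else {
        Key::from_raw(&k)
            .append_ts(TimeStamp::new(MVCC_SUFFIX))
            .into_encoded()
    }
}
```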
@@ -209,3 +242,15 @@ proptest! {
         test_rocksdb_skiplist_basic_operations(cf, operations);
     }
 }
+
+#[test]
+fn test_case1() {
+    let de = |data| hex::decode(data).unwrap();
+    let cf = CF_WRITE;
+    let operations = [
+        Operation::Put(de("E2"), de("38CC98E09D9CB1D1")),
+        Operation::DeleteRange(de(""), de("E2")),
+        Operation::Scan(de("2C0F698A"), 3),
+    ];
+    test_rocksdb_skiplist_basic_operations(cf, operations.to_vec());
+}
diff --git a/components/range_cache_memory_engine/src/read.rs b/components/in_memory_engine/src/read.rs
similarity index 77%
rename from components/range_cache_memory_engine/src/read.rs
rename to components/in_memory_engine/src/read.rs
index 1d307b79ffa..3619c4c4555 100644
--- a/components/range_cache_memory_engine/src/read.rs
+++ b/components/in_memory_engine/src/read.rs
@@ -4,11 +4,11 @@ use core::slice::SlicePattern;
 use std::{fmt::Debug, ops::Deref, result, sync::Arc};
 
 use bytes::Bytes;
-use crossbeam::epoch::{self};
+use crossbeam::epoch;
 use crossbeam_skiplist::{base::OwnedIter, SkipList};
 use engine_rocks::{raw::SliceTransform, util::FixedSuffixSliceTransform};
 use engine_traits::{
-    CacheRange, CfNamesExt, DbVector, Error, FailedReason, IterMetricsCollector, IterOptions,
+    CacheRegion, CfNamesExt, DbVector, Error, FailedReason, IterMetricsCollector, IterOptions,
     Iterable, Iterator, MetricsExt, Peekable, ReadOptions, Result, Snapshot, SnapshotMiscExt,
     CF_DEFAULT,
 };
@@ -27,7 +27,7 @@ use crate::{
     perf_context::PERF_CONTEXT,
     perf_counter_add,
     statistics::{LocalStatistics, Statistics, Tickers},
-    RangeCacheMemoryEngine,
+    RegionCacheMemoryEngine,
 };
 
 // The max snapshot number that can exist in the RocksDB. This is typically used
@@ -42,65 +42,69 @@ enum Direction {
 }
 
 #[derive(Clone, Debug)]
-pub struct RangeCacheSnapshotMeta {
-    pub(crate) range_id: u64,
-    pub(crate) range: CacheRange,
+pub struct RegionCacheSnapshotMeta {
+    pub(crate) region: CacheRegion,
     pub(crate) snapshot_ts: u64,
-    // Sequence number is shared between RangeCacheEngine and disk KvEnigne to
+    // Sequence number is shared between RegionCacheEngine and disk KvEngine to
     // provide atomic write
     pub(crate) sequence_number: u64,
 }
 
-impl RangeCacheSnapshotMeta {
-    fn new(range_id: u64, range: CacheRange, snapshot_ts: u64, sequence_number: u64) -> Self {
+impl RegionCacheSnapshotMeta {
+    pub(crate) fn new(region: CacheRegion, snapshot_ts: u64, sequence_number: u64) -> Self {
         Self {
-            range_id,
-            range,
+            region,
             snapshot_ts,
             sequence_number,
         }
     }
 }
 
-#[derive(Clone, Debug)]
-pub struct RangeCacheSnapshot {
-    snapshot_meta: RangeCacheSnapshotMeta,
+#[derive(Debug)]
+pub struct RegionCacheSnapshot {
+    snapshot_meta: RegionCacheSnapshotMeta,
     skiplist_engine: SkiplistEngine,
-    engine: RangeCacheMemoryEngine,
+    engine: RegionCacheMemoryEngine,
 }
 
-impl RangeCacheSnapshot {
+impl RegionCacheSnapshot {
     pub fn new(
-        engine: RangeCacheMemoryEngine,
-        range: CacheRange,
+        engine: RegionCacheMemoryEngine,
+        region: CacheRegion,
         read_ts: u64,
         seq_num: u64,
     ) -> result::Result<Self, FailedReason> {
-        let mut core = engine.core.write();
-        let range_id = core.range_manager.range_snapshot(&range, read_ts)?;
-        Ok(RangeCacheSnapshot {
-            snapshot_meta: RangeCacheSnapshotMeta::new(range_id, range, read_ts, seq_num),
-            skiplist_engine: core.engine.clone(),
+        engine
+            .core
+            .region_manager
+            .region_snapshot(region.id, region.epoch_version, read_ts)?;
+        Ok(RegionCacheSnapshot {
+            snapshot_meta: RegionCacheSnapshotMeta::new(region, read_ts, seq_num),
+            skiplist_engine: engine.core.engine.clone(),
             engine: engine.clone(),
         })
    }
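`RegionCacheSnapshot::new` registers the snapshot with the region manager, and the matching `Drop` implementation below unregisters it, scheduling `BackgroundTask::DeleteRegions` once an evicted region has no snapshots left. In use, the pairing looks like this sketch (the values passed are arbitrary):

```rust
use engine_traits::{CacheRegion, RegionCacheEngine};

fn read_through_snapshot(engine: &RegionCacheMemoryEngine, region: CacheRegion) {
    // Pins the region's data for reads at read_ts = 100, seq_num = 100.
    let snap = engine.snapshot(region, 100, 100).unwrap();

    // ... serve reads via the Peekable/Iterable impls on `snap` ...

    // Dropping the last snapshot of an evicted region lets the background
    // worker actually delete its data.
    drop(snap);
}
```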
+
+    pub(crate) fn snapshot_meta(&self) -> &RegionCacheSnapshotMeta {
+        &self.snapshot_meta
+    }
 }
 
-impl Drop for RangeCacheSnapshot {
+impl Drop for RegionCacheSnapshot {
     fn drop(&mut self) {
-        let mut core = self.engine.core.write();
-        let ranges_removable = core
-            .range_manager
-            .remove_range_snapshot(&self.snapshot_meta);
-        if !ranges_removable.is_empty() {
-            drop(core);
+        let regions_removable = self
+            .engine
+            .core
+            .region_manager
+            .remove_region_snapshot(&self.snapshot_meta);
+        if !regions_removable.is_empty() {
             if let Err(e) = self
                 .engine
                 .bg_worker_manager()
-                .schedule_task(BackgroundTask::DeleteRange(ranges_removable))
+                .schedule_task(BackgroundTask::DeleteRegions(regions_removable))
             {
                 error!(
-                    "schedule delete range failed";
+                    "ime schedule delete range failed";
                     "err" => ?e,
                 );
                 assert!(tikv_util::thread_group::is_shutdown(!cfg!(test)));
@@ -109,10 +113,10 @@ impl Drop for RangeCacheSnapshot {
     }
 }
 
-impl Snapshot for RangeCacheSnapshot {}
+impl Snapshot for RegionCacheSnapshot {}
 
-impl Iterable for RangeCacheSnapshot {
-    type Iterator = RangeCacheIterator;
+impl Iterable for RegionCacheSnapshot {
+    type Iterator = RegionCacheIterator;
 
     fn iterator_opt(&self, cf: &str, opts: IterOptions) -> Result<Self::Iterator> {
         let iter = self.skiplist_engine.data[cf_to_id(cf)].owned_iter();
@@ -129,19 +133,19 @@ impl Iterable for RangeCacheSnapshot {
         }
 
         let (lower_bound, upper_bound) = (lower_bound.unwrap(), upper_bound.unwrap());
-        if lower_bound < self.snapshot_meta.range.start
-            || upper_bound > self.snapshot_meta.range.end
+        if lower_bound < self.snapshot_meta.region.start
+            || upper_bound > self.snapshot_meta.region.end
         {
             return Err(Error::Other(box_err!(
-                "the bounderies required [{}, {}] exceeds the range of the snapshot [{}, {}]",
+                "the boundaries required [{}, {}] exceed the range of the snapshot [{}, {}]",
                 log_wrappers::Value(&lower_bound),
                 log_wrappers::Value(&upper_bound),
-                log_wrappers::Value(&self.snapshot_meta.range.start),
-                log_wrappers::Value(&self.snapshot_meta.range.end)
+                log_wrappers::Value(&self.snapshot_meta.region.start),
+                log_wrappers::Value(&self.snapshot_meta.region.end)
             )));
         }
 
-        Ok(RangeCacheIterator {
+        Ok(RegionCacheIterator {
             valid: false,
             prefix: None,
             lower_bound,
@@ -155,12 +159,13 @@ impl Iterable for RangeCacheSnapshot {
             prefix_extractor,
             local_stats: LocalStatistics::default(),
             seek_duration: IN_MEMORY_ENGINE_SEEK_DURATION.local(),
+            snapshot_read_ts: self.snapshot_meta.snapshot_ts,
         })
     }
 }
 
-impl Peekable for RangeCacheSnapshot {
-    type DbVector = RangeCacheDbVector;
+impl Peekable for RegionCacheSnapshot {
+    type DbVector = RegionCacheDbVector;
 
     fn get_value_opt(&self, opts: &ReadOptions, key: &[u8]) -> Result<Option<Self::DbVector>> {
         self.get_value_cf_opt(opts, CF_DEFAULT, key)
@@ -172,13 +177,12 @@ impl Peekable for RangeCacheSnapshot {
         cf: &str,
         key: &[u8],
     ) -> Result<Option<Self::DbVector>> {
-        fail::fail_point!("on_range_cache_get_value");
-        if !self.snapshot_meta.range.contains_key(key) {
+        if !self.snapshot_meta.region.contains_key(key) {
             return Err(Error::Other(box_err!(
                 "key {} not in range[{}, {}]",
                 log_wrappers::Value(key),
-                log_wrappers::Value(&self.snapshot_meta.range.start),
-                log_wrappers::Value(&self.snapshot_meta.range.end)
+                log_wrappers::Value(&self.snapshot_meta.region.start),
+                log_wrappers::Value(&self.snapshot_meta.region.end)
             )));
         }
         let mut iter = self.skiplist_engine.data[cf_to_id(cf)].owned_iter();
@@ -201,35 +205,35 @@ impl Peekable for RangeCacheSnapshot {
                 .statistics()
                 .record_ticker(Tickers::BytesRead, value.len() as u64);
             perf_counter_add!(get_read_bytes, value.len() as u64);
-            Ok(Some(RangeCacheDbVector(value)))
+            Ok(Some(RegionCacheDbVector(value)))
             }
             _ => Ok(None),
         }
     }
 }
 
-impl CfNamesExt for RangeCacheSnapshot {
+impl CfNamesExt for RegionCacheSnapshot {
     fn cf_names(&self) -> Vec<&str> {
         unimplemented!()
     }
 }
 
-impl SnapshotMiscExt for RangeCacheSnapshot {
+impl SnapshotMiscExt for RegionCacheSnapshot {
     fn sequence_number(&self) -> u64 {
         self.snapshot_meta.sequence_number
     }
 }
 
-pub struct RangeCacheIterator {
+pub struct RegionCacheIterator {
     valid: bool,
     iter: OwnedIter<Arc<SkipList<InternalBytes, InternalBytes>>, InternalBytes, InternalBytes>,
     // The lower bound is inclusive while the upper bound is exclusive if set
     // Note: bounds (region boundaries) have no mvcc versions
-    lower_bound: Vec<u8>,
-    upper_bound: Vec<u8>,
+    pub(crate) lower_bound: Vec<u8>,
+    pub(crate) upper_bound: Vec<u8>,
     // A snapshot sequence number passed from RocksEngine Snapshot to guarantee suitable
     // visibility.
-    sequence_number: u64,
+    pub(crate) sequence_number: u64,
 
     saved_user_key: Vec<u8>,
     // This is only used by backward iteration where the value we want may not be pointed by the
@@ -246,9 +250,11 @@ pub struct RangeCacheIterator {
     statistics: Arc<Statistics>,
     local_stats: LocalStatistics,
     seek_duration: LocalHistogram,
+
+    pub(crate) snapshot_read_ts: u64,
 }
 
-impl Drop for RangeCacheIterator {
+impl Drop for RegionCacheIterator {
     fn drop(&mut self) {
         self.statistics
             .record_ticker(Tickers::IterBytesRead, self.local_stats.bytes_read);
@@ -275,7 +281,7 @@ impl Drop for RangeCacheIterator {
     }
 }
 
-impl RangeCacheIterator {
+impl RegionCacheIterator {
     // If `skipping_saved_key` is true, the function will keep iterating until it
     // finds a user key that is larger than `saved_user_key`.
     // If `prefix` is not None, the iterator needs to stop when all keys for the
@@ -463,7 +469,7 @@ impl RangeCacheIterator {
     }
 }
 
-impl Iterator for RangeCacheIterator {
+impl Iterator for RegionCacheIterator {
     fn key(&self) -> &[u8] {
         assert!(self.valid);
         &self.saved_user_key
@@ -525,6 +531,7 @@ impl Iterator for RangeCacheIterator {
     }
 
     fn seek(&mut self, key: &[u8]) -> Result<bool> {
+        fail::fail_point!("ime_on_iterator_seek");
         let begin = Instant::now();
         self.direction = Direction::Forward;
         if let Some(ref mut extractor) = self.prefix_extractor {
@@ -614,9 +621,9 @@ impl Iterator for RangeCacheIterator {
     }
 }
 
-pub struct RangeCacheIterMetricsCollector;
+pub struct RegionCacheIterMetricsCollector;
 
-impl IterMetricsCollector for RangeCacheIterMetricsCollector {
+impl IterMetricsCollector for RegionCacheIterMetricsCollector {
     fn internal_delete_skipped_count(&self) -> u64 {
         PERF_CONTEXT.with(|perf_context| perf_context.borrow().internal_delete_skipped_count)
     }
@@ -626,17 +633,17 @@ impl IterMetricsCollector for RangeCacheIterMetricsCollector {
     }
 }
 
-impl MetricsExt for RangeCacheIterator {
-    type Collector = RangeCacheIterMetricsCollector;
+impl MetricsExt for RegionCacheIterator {
+    type Collector = RegionCacheIterMetricsCollector;
 
     fn metrics_collector(&self) -> Self::Collector {
-        RangeCacheIterMetricsCollector {}
+        RegionCacheIterMetricsCollector {}
     }
 }
 
 #[derive(Debug)]
-pub struct RangeCacheDbVector(Bytes);
+pub struct RegionCacheDbVector(Bytes);
 
-impl Deref for RangeCacheDbVector {
+impl Deref for RegionCacheDbVector {
     type Target = [u8];
 
     fn deref(&self) -> &[u8] {
@@ -644,9 +651,9 @@ impl Deref for RangeCacheDbVector {
     }
 }
 
-impl DbVector for RangeCacheDbVector {}
+impl DbVector for RegionCacheDbVector {}
 
-impl<'a> PartialEq<&'a [u8]> for RangeCacheDbVector {
+impl<'a> PartialEq<&'a [u8]> for RegionCacheDbVector {
     fn eq(&self, rhs: &&[u8]) -> bool {
         self.0.as_slice() == *rhs
     }
@@ -669,44 +676,46 @@ mod tests {
raw::DBStatisticsTickerType, util::new_engine_opt, RocksDbOptions, RocksStatistics, }; use engine_traits::{ - CacheRange, FailedReason, IterMetricsCollector, IterOptions, Iterable, Iterator, - MetricsExt, Mutable, Peekable, RangeCacheEngine, ReadOptions, WriteBatch, WriteBatchExt, - CF_DEFAULT, CF_LOCK, CF_WRITE, + CacheRegion, EvictReason, FailedReason, IterMetricsCollector, IterOptions, Iterable, + Iterator, MetricsExt, Mutable, Peekable, ReadOptions, RegionCacheEngine, + RegionCacheEngineExt, RegionEvent, WriteBatch, WriteBatchExt, CF_DEFAULT, CF_LOCK, + CF_WRITE, }; + use keys::DATA_PREFIX_KEY; use tempfile::Builder; use tikv_util::config::VersionTrack; - use super::{RangeCacheIterator, RangeCacheSnapshot}; + use super::{RegionCacheIterator, RegionCacheSnapshot}; use crate::{ engine::{cf_to_id, SkiplistEngine}, keys::{ - construct_key, construct_user_key, construct_value, decode_key, encode_key, - encode_seek_key, InternalBytes, ValueType, + construct_key, construct_region_key, construct_user_key, construct_value, decode_key, + encode_key, encode_seek_key, InternalBytes, ValueType, }, perf_context::PERF_CONTEXT, + region_manager::RegionState, statistics::Tickers, - RangeCacheEngineConfig, RangeCacheEngineContext, RangeCacheMemoryEngine, - RangeCacheWriteBatch, + test_util::new_region, + InMemoryEngineConfig, InMemoryEngineContext, RegionCacheMemoryEngine, + RegionCacheWriteBatch, }; #[test] fn test_snapshot() { - let engine = RangeCacheMemoryEngine::new(RangeCacheEngineContext::new_for_tests(Arc::new( - VersionTrack::new(RangeCacheEngineConfig::config_for_test()), + let engine = RegionCacheMemoryEngine::new(InMemoryEngineContext::new_for_tests(Arc::new( + VersionTrack::new(InMemoryEngineConfig::config_for_test()), ))); - let range = CacheRange::new(b"k00".to_vec(), b"k10".to_vec()); - engine.new_range(range.clone()); + let region = new_region(1, b"k00", b"k10"); + engine.new_region(region.clone()); let verify_snapshot_count = |snapshot_ts, count| { - let core = engine.core.read(); + let regions_map = engine.core.region_manager.regions_map.read(); if count > 0 { assert_eq!( - *core - .range_manager - .ranges() - .get(&range) + *regions_map.regions()[®ion.id] + .region_snapshot_list() + .lock() .unwrap() - .range_snapshot_list() .0 .get(&snapshot_ts) .unwrap(), @@ -714,11 +723,10 @@ mod tests { ); } else { assert!( - core.range_manager - .ranges() - .get(&range) + regions_map.regions()[®ion.id] + .region_snapshot_list() + .lock() .unwrap() - .range_snapshot_list() .0 .get(&snapshot_ts) .is_none() @@ -726,42 +734,41 @@ mod tests { } }; - let s1 = engine.snapshot(range.clone(), 5, u64::MAX).unwrap(); + let cache_region = CacheRegion::from_region(®ion); + let s1 = engine.snapshot(cache_region.clone(), 5, u64::MAX).unwrap(); - { - let mut core = engine.core.write(); - let t_range = CacheRange::new(b"k00".to_vec(), b"k02".to_vec()); - assert!(!core.range_manager.set_safe_point(&t_range, 5)); - assert!(core.range_manager.set_safe_point(&range, 5)); - } + assert!(engine.core.region_manager.set_safe_point(region.id, 5)); assert_eq!( - engine.snapshot(range.clone(), 5, u64::MAX).unwrap_err(), + engine + .snapshot(cache_region.clone(), 5, u64::MAX) + .unwrap_err(), FailedReason::TooOldRead ); - let s2 = engine.snapshot(range.clone(), 10, u64::MAX).unwrap(); + let s2 = engine.snapshot(cache_region.clone(), 10, u64::MAX).unwrap(); verify_snapshot_count(5, 1); verify_snapshot_count(10, 1); - let s3 = engine.snapshot(range.clone(), 10, u64::MAX).unwrap(); + let s3 = 
engine.snapshot(cache_region.clone(), 10, u64::MAX).unwrap(); verify_snapshot_count(10, 2); drop(s1); verify_snapshot_count(5, 0); drop(s2); verify_snapshot_count(10, 1); - let s4 = engine.snapshot(range.clone(), 10, u64::MAX).unwrap(); + let s4 = engine.snapshot(cache_region.clone(), 10, u64::MAX).unwrap(); verify_snapshot_count(10, 2); drop(s4); verify_snapshot_count(10, 1); drop(s3); { - let core = engine.core.write(); + let regions_map = engine.core.region_manager.regions_map.read(); assert!( - core.range_manager - .ranges() - .get(&range) + regions_map + .region_meta(region.id) + .unwrap() + .region_snapshot_list() + .lock() .unwrap() - .range_snapshot_list() .is_empty() ); } @@ -806,6 +813,7 @@ mod tests { fn construct_mvcc_key(key: &str, mvcc: u64) -> Vec { let mut k = vec![]; + k.extend_from_slice(DATA_PREFIX_KEY); k.extend_from_slice(key.as_bytes()); k.put_u64(!mvcc); k @@ -855,7 +863,7 @@ mod tests { } fn verify_key_values, J: iter::Iterator + Clone>( - iter: &mut RangeCacheIterator, + iter: &mut RegionCacheIterator, key_range: I, mvcc_range: J, foward: bool, @@ -881,20 +889,17 @@ mod tests { #[test] fn test_seek() { - let engine = RangeCacheMemoryEngine::new(RangeCacheEngineContext::new_for_tests(Arc::new( - VersionTrack::new(RangeCacheEngineConfig::config_for_test()), + let engine = RegionCacheMemoryEngine::new(InMemoryEngineContext::new_for_tests(Arc::new( + VersionTrack::new(InMemoryEngineConfig::config_for_test()), ))); - let range = CacheRange::new(b"".to_vec(), b"z".to_vec()); - engine.new_range(range.clone()); - - { - let mut core = engine.core.write(); - core.range_manager.set_safe_point(&range, 5); - let sl = core.engine.data[cf_to_id("write")].clone(); + let region = new_region(1, b"", b"z"); + let range = CacheRegion::from_region(®ion); + engine.new_region(region.clone()); - put_key_val(&sl, "b", "val", 10, 5); - put_key_val(&sl, "c", "vall", 10, 5); - } + engine.core.region_manager().set_safe_point(region.id, 5); + let sl = engine.core.engine.data[cf_to_id("write")].clone(); + put_key_val(&sl, "b", "val", 10, 5); + put_key_val(&sl, "c", "vall", 10, 5); let snapshot = engine.snapshot(range.clone(), u64::MAX, 100).unwrap(); let mut iter_opt = IterOptions::default(); @@ -919,24 +924,22 @@ mod tests { #[test] fn test_get_value() { - let engine = RangeCacheMemoryEngine::new(RangeCacheEngineContext::new_for_tests(Arc::new( - VersionTrack::new(RangeCacheEngineConfig::config_for_test()), + let engine = RegionCacheMemoryEngine::new(InMemoryEngineContext::new_for_tests(Arc::new( + VersionTrack::new(InMemoryEngineConfig::config_for_test()), ))); - let range = CacheRange::new(b"".to_vec(), b"z".to_vec()); - engine.new_range(range.clone()); + let region = new_region(1, b"", b"z"); + let cache_region = CacheRegion::from_region(®ion); + engine.new_region(region.clone()); - { - let mut core = engine.core.write(); - core.range_manager.set_safe_point(&range, 5); - let sl = core.engine.data[cf_to_id("write")].clone(); - fill_data_in_skiplist(sl.clone(), (1..10).step_by(1), 1..50, 1); - // k1 is deleted at seq_num 150 while k49 is deleted at seq num 101 - delete_data_in_skiplist(sl, (1..10).step_by(1), 1..50, 100); - } + engine.core.region_manager.set_safe_point(region.id, 5); + let sl = engine.core.engine.data[cf_to_id("write")].clone(); + fill_data_in_skiplist(sl.clone(), (1..10).step_by(1), 1..50, 1); + // k1 is deleted at seq_num 150 while k49 is deleted at seq num 101 + delete_data_in_skiplist(sl, (1..10).step_by(1), 1..50, 100); let opts = ReadOptions::default(); { - let snapshot 
= engine.snapshot(range.clone(), 10, 60).unwrap(); + let snapshot = engine.snapshot(cache_region.clone(), 10, 60).unwrap(); for i in 1..10 { for mvcc in 1..50 { let k = construct_key(i, mvcc); @@ -958,7 +961,7 @@ mod tests { // all deletions { - let snapshot = engine.snapshot(range.clone(), 10, u64::MAX).unwrap(); + let snapshot = engine.snapshot(cache_region.clone(), 10, u64::MAX).unwrap(); for i in 1..10 { for mvcc in 1..50 { let k = construct_key(i, mvcc); @@ -974,7 +977,7 @@ mod tests { // some deletions { - let snapshot = engine.snapshot(range.clone(), 10, 105).unwrap(); + let snapshot = engine.snapshot(cache_region.clone(), 10, 105).unwrap(); for mvcc in 1..50 { for i in 1..7 { let k = construct_key(i, mvcc); @@ -999,20 +1002,18 @@ mod tests { #[test] fn test_iterator_forawrd() { - let engine = RangeCacheMemoryEngine::new(RangeCacheEngineContext::new_for_tests(Arc::new( - VersionTrack::new(RangeCacheEngineConfig::config_for_test()), + let engine = RegionCacheMemoryEngine::new(InMemoryEngineContext::new_for_tests(Arc::new( + VersionTrack::new(InMemoryEngineConfig::config_for_test()), ))); - let range = CacheRange::new(b"".to_vec(), b"z".to_vec()); - engine.new_range(range.clone()); - let step: i32 = 2; + let region = new_region(1, b"", b"z"); + let range = CacheRegion::from_region(®ion); + engine.new_region(region.clone()); - { - let mut core = engine.core.write(); - core.range_manager.set_safe_point(&range, 5); - let sl = core.engine.data[cf_to_id("write")].clone(); - fill_data_in_skiplist(sl.clone(), (1..100).step_by(step as usize), 1..10, 1); - delete_data_in_skiplist(sl, (1..100).step_by(step as usize), 1..10, 200); - } + let step = 2; + engine.core.region_manager.set_safe_point(region.id, 5); + let sl = engine.core.engine.data[cf_to_id("write")].clone(); + fill_data_in_skiplist(sl.clone(), (1..100).step_by(step), 1..10, 1); + delete_data_in_skiplist(sl, (1..100).step_by(step), 1..10, 200); let mut iter_opt = IterOptions::default(); let snapshot = engine.snapshot(range.clone(), 10, u64::MAX).unwrap(); @@ -1035,20 +1036,14 @@ mod tests { let snapshot = engine.snapshot(range.clone(), 100, 150).unwrap(); let mut iter = snapshot.iterator_opt("write", iter_opt.clone()).unwrap(); iter.seek_to_first().unwrap(); - verify_key_values( - &mut iter, - (1..100).step_by(step as usize), - (1..10).rev(), - true, - true, - ); + verify_key_values(&mut iter, (1..100).step_by(step), (1..10).rev(), true, true); // seek key that is in the skiplist let seek_key = construct_key(11, u64::MAX); iter.seek(&seek_key).unwrap(); verify_key_values( &mut iter, - (11..100).step_by(step as usize), + (11..100).step_by(step), (1..10).rev(), true, true, @@ -1059,7 +1054,7 @@ mod tests { iter.seek(&seek_key).unwrap(); verify_key_values( &mut iter, - (13..100).step_by(step as usize), + (13..100).step_by(step), (1..10).rev(), true, true, @@ -1073,7 +1068,7 @@ mod tests { iter.seek_to_first().unwrap(); verify_key_values( &mut iter, - (63..100).step_by(step as usize), + (63..100).step_by(step), (1..10).rev(), true, true, @@ -1116,24 +1111,12 @@ mod tests { let mut iter = snapshot.iterator_opt("write", iter_opt.clone()).unwrap(); assert!(iter.seek_to_first().unwrap()); - verify_key_values( - &mut iter, - (21..40).step_by(step as usize), - (1..10).rev(), - true, - true, - ); + verify_key_values(&mut iter, (21..40).step_by(step), (1..10).rev(), true, true); // seek a key that is below the lower bound is the same with seek_to_first let seek_key = construct_key(19, u64::MAX); assert!(iter.seek(&seek_key).unwrap()); - 
verify_key_values( - &mut iter, - (21..40).step_by(step as usize), - (1..10).rev(), - true, - true, - ); + verify_key_values(&mut iter, (21..40).step_by(step), (1..10).rev(), true, true); // seek a key that is larger or equal to upper bound won't get any key let seek_key = construct_key(41, u64::MAX); @@ -1142,13 +1125,7 @@ mod tests { let seek_key = construct_key(32, u64::MAX); assert!(iter.seek(&seek_key).unwrap()); - verify_key_values( - &mut iter, - (33..40).step_by(step as usize), - (1..10).rev(), - true, - true, - ); + verify_key_values(&mut iter, (33..40).step_by(step), (1..10).rev(), true, true); } // with bounds, some deletions (seq_num 215) @@ -1186,19 +1163,19 @@ mod tests { #[test] fn test_iterator_backward() { - let engine = RangeCacheMemoryEngine::new(RangeCacheEngineContext::new_for_tests(Arc::new( - VersionTrack::new(RangeCacheEngineConfig::config_for_test()), + let engine = RegionCacheMemoryEngine::new(InMemoryEngineContext::new_for_tests(Arc::new( + VersionTrack::new(InMemoryEngineConfig::config_for_test()), ))); - let range = CacheRange::new(b"".to_vec(), b"z".to_vec()); - engine.new_range(range.clone()); - let step: i32 = 2; + let region = new_region(1, b"", b"z"); + let range = CacheRegion::from_region(®ion); + engine.new_region(region.clone()); + let step = 2; { - let mut core = engine.core.write(); - core.range_manager.set_safe_point(&range, 5); - let sl = core.engine.data[cf_to_id("write")].clone(); - fill_data_in_skiplist(sl.clone(), (1..100).step_by(step as usize), 1..10, 1); - delete_data_in_skiplist(sl, (1..100).step_by(step as usize), 1..10, 200); + engine.core.region_manager.set_safe_point(region.id, 5); + let sl = engine.core.engine.data[cf_to_id("write")].clone(); + fill_data_in_skiplist(sl.clone(), (1..100).step_by(step), 1..10, 1); + delete_data_in_skiplist(sl, (1..100).step_by(step), 1..10, 200); } let mut iter_opt = IterOptions::default(); @@ -1212,35 +1189,17 @@ mod tests { let snapshot = engine.snapshot(range.clone(), 10, 150).unwrap(); let mut iter = snapshot.iterator_opt("write", iter_opt.clone()).unwrap(); assert!(iter.seek_to_last().unwrap()); - verify_key_values( - &mut iter, - (1..100).step_by(step as usize).rev(), - 1..10, - false, - true, - ); + verify_key_values(&mut iter, (1..100).step_by(step).rev(), 1..10, false, true); // seek key that is in the skiplist let seek_key = construct_key(81, 0); assert!(iter.seek_for_prev(&seek_key).unwrap()); - verify_key_values( - &mut iter, - (1..82).step_by(step as usize).rev(), - 1..10, - false, - true, - ); + verify_key_values(&mut iter, (1..82).step_by(step).rev(), 1..10, false, true); // seek key that is in the skiplist let seek_key = construct_key(80, 0); assert!(iter.seek_for_prev(&seek_key).unwrap()); - verify_key_values( - &mut iter, - (1..80).step_by(step as usize).rev(), - 1..10, - false, - true, - ); + verify_key_values(&mut iter, (1..80).step_by(step).rev(), 1..10, false, true); } let lower_bound = construct_user_key(21); @@ -1252,24 +1211,12 @@ mod tests { let mut iter = snapshot.iterator_opt("write", iter_opt).unwrap(); assert!(iter.seek_to_last().unwrap()); - verify_key_values( - &mut iter, - (21..38).step_by(step as usize).rev(), - 1..10, - false, - true, - ); + verify_key_values(&mut iter, (21..38).step_by(step).rev(), 1..10, false, true); // seek a key that is above the upper bound is the same with seek_to_last let seek_key = construct_key(40, 0); assert!(iter.seek_for_prev(&seek_key).unwrap()); - verify_key_values( - &mut iter, - (21..38).step_by(step as usize).rev(), - 1..10, - false, - 
true, - ); + verify_key_values(&mut iter, (21..38).step_by(step).rev(), 1..10, false, true); // seek a key that is less than the lower bound won't get any key let seek_key = construct_key(20, u64::MAX); @@ -1278,28 +1225,22 @@ mod tests { let seek_key = construct_key(26, 0); assert!(iter.seek_for_prev(&seek_key).unwrap()); - verify_key_values( - &mut iter, - (21..26).step_by(step as usize).rev(), - 1..10, - false, - true, - ); + verify_key_values(&mut iter, (21..26).step_by(step).rev(), 1..10, false, true); } } #[test] fn test_seq_visibility() { - let engine = RangeCacheMemoryEngine::new(RangeCacheEngineContext::new_for_tests(Arc::new( - VersionTrack::new(RangeCacheEngineConfig::config_for_test()), + let engine = RegionCacheMemoryEngine::new(InMemoryEngineContext::new_for_tests(Arc::new( + VersionTrack::new(InMemoryEngineConfig::config_for_test()), ))); - let range = CacheRange::new(b"".to_vec(), b"z".to_vec()); - engine.new_range(range.clone()); + let region = new_region(1, b"", b"z"); + let range = CacheRegion::from_region(®ion); + engine.new_region(region.clone()); { - let mut core = engine.core.write(); - core.range_manager.set_safe_point(&range, 5); - let sl = core.engine.data[cf_to_id("write")].clone(); + engine.core.region_manager.set_safe_point(region.id, 5); + let sl = engine.core.engine.data[cf_to_id("write")].clone(); put_key_val(&sl, "aaa", "va1", 10, 1); put_key_val(&sl, "aaa", "va2", 10, 3); @@ -1413,16 +1354,16 @@ mod tests { #[test] fn test_seq_visibility_backward() { - let engine = RangeCacheMemoryEngine::new(RangeCacheEngineContext::new_for_tests(Arc::new( - VersionTrack::new(RangeCacheEngineConfig::config_for_test()), + let engine = RegionCacheMemoryEngine::new(InMemoryEngineContext::new_for_tests(Arc::new( + VersionTrack::new(InMemoryEngineConfig::config_for_test()), ))); - let range = CacheRange::new(b"".to_vec(), b"z".to_vec()); - engine.new_range(range.clone()); + let region = new_region(1, b"", b"z"); + let range = CacheRegion::from_region(®ion); + engine.new_region(region.clone()); { - let mut core = engine.core.write(); - core.range_manager.set_safe_point(&range, 5); - let sl = core.engine.data[cf_to_id("write")].clone(); + engine.core.region_manager.set_safe_point(region.id, 5); + let sl = engine.core.engine.data[cf_to_id("write")].clone(); put_key_val(&sl, "aaa", "va1", 10, 2); put_key_val(&sl, "aaa", "va2", 10, 4); @@ -1510,20 +1451,20 @@ mod tests { #[test] fn test_iter_user_skip() { let mut iter_opt = IterOptions::default(); - let range = CacheRange::new(b"".to_vec(), b"z".to_vec()); + let region = new_region(1, b"", b"z"); + let range = CacheRegion::from_region(®ion); iter_opt.set_upper_bound(&range.end, 0); iter_opt.set_lower_bound(&range.start, 0); // backward, all put { - let engine = RangeCacheMemoryEngine::new(RangeCacheEngineContext::new_for_tests( - Arc::new(VersionTrack::new(RangeCacheEngineConfig::config_for_test())), + let engine = RegionCacheMemoryEngine::new(InMemoryEngineContext::new_for_tests( + Arc::new(VersionTrack::new(InMemoryEngineConfig::config_for_test())), )); - engine.new_range(range.clone()); + engine.new_region(region.clone()); let sl = { - let mut core = engine.core.write(); - core.range_manager.set_safe_point(&range, 5); - core.engine.data[cf_to_id("write")].clone() + engine.core.region_manager.set_safe_point(region.id, 5); + engine.core.engine.data[cf_to_id("write")].clone() }; let mut s = 1; @@ -1554,14 +1495,13 @@ mod tests { // backward, all deletes { - let engine = 
RangeCacheMemoryEngine::new(RangeCacheEngineContext::new_for_tests( - Arc::new(VersionTrack::new(RangeCacheEngineConfig::config_for_test())), + let engine = RegionCacheMemoryEngine::new(InMemoryEngineContext::new_for_tests( + Arc::new(VersionTrack::new(InMemoryEngineConfig::config_for_test())), )); - engine.new_range(range.clone()); + engine.new_region(region.clone()); let sl = { - let mut core = engine.core.write(); - core.range_manager.set_safe_point(&range, 5); - core.engine.data[cf_to_id("write")].clone() + engine.core.region_manager.set_safe_point(region.id, 5); + engine.core.engine.data[cf_to_id("write")].clone() }; let mut s = 1; @@ -1585,14 +1525,13 @@ mod tests { // backward, all deletes except for last put, last put's seq { - let engine = RangeCacheMemoryEngine::new(RangeCacheEngineContext::new_for_tests( - Arc::new(VersionTrack::new(RangeCacheEngineConfig::config_for_test())), + let engine = RegionCacheMemoryEngine::new(InMemoryEngineContext::new_for_tests( + Arc::new(VersionTrack::new(InMemoryEngineConfig::config_for_test())), )); - engine.new_range(range.clone()); + engine.new_region(region.clone()); let sl = { - let mut core = engine.core.write(); - core.range_manager.set_safe_point(&range, 5); - core.engine.data[cf_to_id("write")].clone() + engine.core.region_manager.set_safe_point(region.id, 5); + engine.core.engine.data[cf_to_id("write")].clone() }; put_key_val(&sl, "a", "val", 10, 1); for i in 2..50 { @@ -1618,14 +1557,13 @@ mod tests { // all deletes except for last put, deletions' seq { - let engine = RangeCacheMemoryEngine::new(RangeCacheEngineContext::new_for_tests( - Arc::new(VersionTrack::new(RangeCacheEngineConfig::config_for_test())), + let engine = RegionCacheMemoryEngine::new(InMemoryEngineContext::new_for_tests( + Arc::new(VersionTrack::new(InMemoryEngineConfig::config_for_test())), )); - engine.new_range(range.clone()); + engine.new_region(region.clone()); let sl = { - let mut core = engine.core.write(); - core.range_manager.set_safe_point(&range, 5); - core.engine.data[cf_to_id("write")].clone() + engine.core.region_manager.set_safe_point(region.id, 5); + engine.core.engine.data[cf_to_id("write")].clone() }; let mut s = 1; for seq in 2..50 { @@ -1650,16 +1588,16 @@ mod tests { #[test] fn test_prefix_seek() { - let engine = RangeCacheMemoryEngine::new(RangeCacheEngineContext::new_for_tests(Arc::new( - VersionTrack::new(RangeCacheEngineConfig::config_for_test()), + let engine = RegionCacheMemoryEngine::new(InMemoryEngineContext::new_for_tests(Arc::new( + VersionTrack::new(InMemoryEngineConfig::config_for_test()), ))); - let range = CacheRange::new(b"k000".to_vec(), b"k100".to_vec()); - engine.new_range(range.clone()); + let region = new_region(1, b"k000", b"k100"); + let range = CacheRegion::from_region(®ion); + engine.new_region(region.clone()); { - let mut core = engine.core.write(); - core.range_manager.set_safe_point(&range, 5); - let sl = core.engine.data[cf_to_id("write")].clone(); + engine.core.region_manager.set_safe_point(region.id, 5); + let sl = engine.core.engine.data[cf_to_id("write")].clone(); let guard = &epoch::pin(); for i in 1..5 { @@ -1724,7 +1662,7 @@ mod tests { fill_data_in_skiplist(sl.clone(), (1..60).step_by(1), 1..2, 1); }); - let evict_range = CacheRange::new(construct_user_key(20), construct_user_key(40)); + let evict_range = CacheRegion::new(1, 0, construct_user_key(20), construct_user_key(40)); sl_engine.delete_range(&evict_range); sl_engine.data.iter().for_each(|sl| { let mut iter = sl.owned_iter(); @@ -1747,46 +1685,44 @@ mod 
tests { }); } - fn verify_evict_range_deleted(engine: &RangeCacheMemoryEngine, range: &CacheRange) { - let mut wait = 0; - while wait < 10 { - wait += 1; - if !engine - .core - .read() - .range_manager() - .ranges_being_deleted - .is_empty() - { - std::thread::sleep(Duration::from_millis(200)); - } else { - break; - } - } - let write_handle = engine.core.read().engine.cf_handle("write"); - let start_key = encode_seek_key(&range.start, u64::MAX); + fn verify_evict_region_deleted(engine: &RegionCacheMemoryEngine, region: &CacheRegion) { + test_util::eventually( + Duration::from_millis(100), + Duration::from_millis(2000), + || { + !engine + .core + .region_manager() + .regions_map + .read() + .regions() + .values() + .any(|m| m.get_state().is_evict()) + }, + ); + let write_handle = engine.core.engine.cf_handle("write"); + let start_key = encode_seek_key(®ion.start, u64::MAX); let mut iter = write_handle.iterator(); let guard = &epoch::pin(); iter.seek(&start_key, guard); - let end = encode_seek_key(&range.end, u64::MAX); + let end = encode_seek_key(®ion.end, u64::MAX); assert!(iter.key() > &end || !iter.valid()); } #[test] - fn test_evict_range_without_snapshot() { - let engine = RangeCacheMemoryEngine::new(RangeCacheEngineContext::new_for_tests(Arc::new( - VersionTrack::new(RangeCacheEngineConfig::config_for_test()), + fn test_evict_region_without_snapshot() { + let engine = RegionCacheMemoryEngine::new(InMemoryEngineContext::new_for_tests(Arc::new( + VersionTrack::new(InMemoryEngineConfig::config_for_test()), ))); - let range = CacheRange::new(construct_user_key(0), construct_user_key(30)); - let evict_range = CacheRange::new(construct_user_key(10), construct_user_key(20)); - engine.new_range(range.clone()); + let region = new_region(1, construct_region_key(0), construct_region_key(30)); + let range = CacheRegion::from_region(®ion); + engine.new_region(region.clone()); let guard = &epoch::pin(); { - let mut core = engine.core.write(); - core.range_manager.set_safe_point(&range, 5); - let sl = core.engine.data[cf_to_id("write")].clone(); + engine.core.region_manager.set_safe_point(region.id, 5); + let sl = engine.core.engine.data[cf_to_id("write")].clone(); for i in 0..30 { let user_key = construct_key(i, 10); let internal_key = encode_key(&user_key, 10, ValueType::Value); @@ -1796,18 +1732,30 @@ mod tests { } } - engine.evict_range(&evict_range); + let new_regions = vec![ + CacheRegion::new(1, 1, construct_user_key(0), construct_user_key(10)), + CacheRegion::new(2, 1, construct_user_key(10), construct_user_key(20)), + CacheRegion::new(3, 1, construct_user_key(20), construct_user_key(30)), + ]; + + engine.on_region_event(RegionEvent::Split { + source: CacheRegion::from_region(®ion), + new_regions: new_regions.clone(), + }); + + let evict_region = new_regions[1].clone(); + engine.evict_region(&evict_region, EvictReason::AutoEvict, None); assert_eq!( engine.snapshot(range.clone(), 10, 200).unwrap_err(), - FailedReason::NotCached + FailedReason::EpochNotMatch ); assert_eq!( - engine.snapshot(evict_range.clone(), 10, 200).unwrap_err(), + engine.snapshot(evict_region.clone(), 10, 200).unwrap_err(), FailedReason::NotCached ); - let r_left = CacheRange::new(construct_user_key(0), construct_user_key(10)); - let r_right = CacheRange::new(construct_user_key(20), construct_user_key(30)); + let r_left = new_regions[0].clone(); + let r_right = new_regions[2].clone(); let snap_left = engine.snapshot(r_left, 10, 200).unwrap(); let mut iter_opt = IterOptions::default(); @@ -1829,23 +1777,21 @@ mod tests { 
verify_key_values(&mut iter, (20..30).step_by(1), 10..11, true, true); // verify the key, values are delete - verify_evict_range_deleted(&engine, &evict_range); + verify_evict_region_deleted(&engine, &evict_region); } #[test] fn test_evict_range_with_snapshot() { - let engine = RangeCacheMemoryEngine::new(RangeCacheEngineContext::new_for_tests(Arc::new( - VersionTrack::new(RangeCacheEngineConfig::config_for_test()), + let engine = RegionCacheMemoryEngine::new(InMemoryEngineContext::new_for_tests(Arc::new( + VersionTrack::new(InMemoryEngineConfig::config_for_test()), ))); - let range = CacheRange::new(construct_user_key(0), construct_user_key(30)); - let evict_range = CacheRange::new(construct_user_key(10), construct_user_key(20)); - engine.new_range(range.clone()); + let region = new_region(1, construct_region_key(0), construct_region_key(30)); + engine.new_region(region.clone()); let guard = &epoch::pin(); { - let mut core = engine.core.write(); - core.range_manager.set_safe_point(&range, 5); - let sl = core.engine.data[cf_to_id("write")].clone(); + engine.core.region_manager.set_safe_point(region.id, 5); + let sl = engine.core.engine.data[cf_to_id("write")].clone(); for i in 0..30 { let user_key = construct_key(i, 10); let internal_key = encode_key(&user_key, 10, ValueType::Value); @@ -1859,63 +1805,82 @@ mod tests { } } - let s1 = engine.snapshot(range.clone(), 10, 10); - let s2 = engine.snapshot(range, 20, 20); - engine.evict_range(&evict_range); - let range_left = CacheRange::new(construct_user_key(0), construct_user_key(10)); - let s3 = engine.snapshot(range_left, 20, 20).unwrap(); - let range_right = CacheRange::new(construct_user_key(20), construct_user_key(30)); - let s4 = engine.snapshot(range_right, 20, 20).unwrap(); + let cache_region = CacheRegion::from_region(®ion); + let s1 = engine.snapshot(cache_region.clone(), 10, 10); + let s2 = engine.snapshot(cache_region.clone(), 20, 20); + + let new_regions = vec![ + CacheRegion::new(1, 1, construct_user_key(0), construct_user_key(10)), + CacheRegion::new(2, 1, construct_user_key(10), construct_user_key(20)), + CacheRegion::new(3, 1, construct_user_key(20), construct_user_key(30)), + ]; + engine.on_region_event(RegionEvent::Split { + source: cache_region.clone(), + new_regions: new_regions.clone(), + }); + + let evict_region = new_regions[1].clone(); + engine.evict_region(&evict_region, EvictReason::AutoEvict, None); + + let r_left = new_regions[0].clone(); + let s3 = engine.snapshot(r_left.clone(), 20, 20).unwrap(); + let r_right = new_regions[2].clone(); + let s4 = engine.snapshot(r_right, 20, 20).unwrap(); drop(s3); - let range_left_eviction = CacheRange::new(construct_user_key(0), construct_user_key(5)); - engine.evict_range(&range_left_eviction); + engine.evict_region(&r_left, EvictReason::AutoEvict, None); // todo(SpadeA): memory limiter { // evict_range is not eligible for delete - assert!( + assert_eq!( engine .core + .region_manager() + .regions_map .read() - .range_manager() - .ranges_being_deleted - .contains(&evict_range) + .region_meta(evict_region.id) + .unwrap() + .get_state(), + RegionState::PendingEvict ); } drop(s1); { // evict_range is still not eligible for delete - assert!( + assert_eq!( engine .core + .region_manager() + .regions_map .read() - .range_manager() - .ranges_being_deleted - .contains(&evict_range) + .region_meta(evict_region.id) + .unwrap() + .get_state(), + RegionState::PendingEvict ); } drop(s2); // Now, all snapshots before evicting `evict_range` are released - verify_evict_range_deleted(&engine, 
&evict_range); + verify_evict_region_deleted(&engine, &evict_region); drop(s4); - verify_evict_range_deleted(&engine, &range_left_eviction); + verify_evict_region_deleted(&engine, &r_left); } #[test] fn test_tombstone_count_when_iterating() { - let engine = RangeCacheMemoryEngine::new(RangeCacheEngineContext::new_for_tests(Arc::new( - VersionTrack::new(RangeCacheEngineConfig::config_for_test()), + let engine = RegionCacheMemoryEngine::new(InMemoryEngineContext::new_for_tests(Arc::new( + VersionTrack::new(InMemoryEngineConfig::config_for_test()), ))); - let range = CacheRange::new(b"".to_vec(), b"z".to_vec()); - engine.new_range(range.clone()); + let region = new_region(1, b"", b"z"); + let range = CacheRegion::from_region(®ion); + engine.new_region(region.clone()); { - let mut core = engine.core.write(); - core.range_manager.set_safe_point(&range, 5); - let sl = core.engine.data[cf_to_id("write")].clone(); + engine.core.region_manager.set_safe_point(region.id, 5); + let sl = engine.core.engine.data[cf_to_id("write")].clone(); delete_key(&sl, "a", 10, 5); delete_key(&sl, "b", 10, 5); @@ -1950,16 +1915,16 @@ mod tests { #[test] fn test_read_flow_metrics() { - let engine = RangeCacheMemoryEngine::new(RangeCacheEngineContext::new_for_tests(Arc::new( - VersionTrack::new(RangeCacheEngineConfig::config_for_test()), + let engine = RegionCacheMemoryEngine::new(InMemoryEngineContext::new_for_tests(Arc::new( + VersionTrack::new(InMemoryEngineConfig::config_for_test()), ))); - let range = CacheRange::new(b"".to_vec(), b"z".to_vec()); - engine.new_range(range.clone()); + let region = new_region(1, b"", b"z"); + let range = CacheRegion::from_region(®ion); + engine.new_region(region.clone()); { - let mut core = engine.core.write(); - core.range_manager.set_safe_point(&range, 5); - let sl = core.engine.data[cf_to_id("write")].clone(); + engine.core.region_manager.set_safe_point(region.id, 5); + let sl = engine.core.engine.data[cf_to_id("write")].clone(); put_key_val(&sl, "a", "val", 10, 5); put_key_val(&sl, "b", "vall", 10, 5); @@ -2033,7 +1998,7 @@ mod tests { iter.next().unwrap(); rocks_iter.next().unwrap(); drop(iter); - assert_eq!(PERF_CONTEXT.with(|c| c.borrow().iter_read_bytes), 54); + assert_eq!(PERF_CONTEXT.with(|c| c.borrow().iter_read_bytes), 58); assert_eq!(2, statistics.get_ticker_count(Tickers::NumberDbSeek)); assert_eq!(2, statistics.get_ticker_count(Tickers::NumberDbSeekFound)); assert_eq!(2, statistics.get_ticker_count(Tickers::NumberDbNext)); @@ -2050,12 +2015,12 @@ mod tests { rocks_iter.prev().unwrap(); drop(rocks_iter); drop(iter); - assert_eq!(statistics.get_ticker_count(Tickers::IterBytesRead), 108); + assert_eq!(statistics.get_ticker_count(Tickers::IterBytesRead), 116); assert_eq!( rocks_statistics.get_and_reset_ticker_count(DBStatisticsTickerType::IterBytesRead), statistics.get_and_reset_ticker_count(Tickers::IterBytesRead) ); - assert_eq!(PERF_CONTEXT.with(|c| c.borrow().iter_read_bytes), 108); + assert_eq!(PERF_CONTEXT.with(|c| c.borrow().iter_read_bytes), 116); assert_eq!(3, statistics.get_ticker_count(Tickers::NumberDbSeek)); assert_eq!(3, statistics.get_ticker_count(Tickers::NumberDbSeekFound)); assert_eq!(3, statistics.get_ticker_count(Tickers::NumberDbPrev)); @@ -2067,21 +2032,22 @@ mod tests { snap_sequence: u64, put_entries: F, ) -> ( - RangeCacheMemoryEngine, - RangeCacheSnapshot, - RangeCacheIterator, + RegionCacheMemoryEngine, + RegionCacheSnapshot, + RegionCacheIterator, ) where - F: FnOnce(&mut RangeCacheWriteBatch), + F: FnOnce(&mut RegionCacheWriteBatch), { - let 
engine = RangeCacheMemoryEngine::new(RangeCacheEngineContext::new_for_tests(Arc::new( - VersionTrack::new(RangeCacheEngineConfig::config_for_test()), + let engine = RegionCacheMemoryEngine::new(InMemoryEngineContext::new_for_tests(Arc::new( + VersionTrack::new(InMemoryEngineConfig::config_for_test()), ))); - let range = CacheRange::new(b"".to_vec(), b"z".to_vec()); - engine.new_range(range.clone()); + let region = new_region(1, b"", b"z"); + let range = CacheRegion::from_region(®ion); + engine.new_region(region.clone()); let mut wb = engine.write_batch(); - wb.prepare_for_range(range.clone()); + wb.prepare_for_region(®ion); put_entries(&mut wb); wb.set_sequence_number(wb_sequence).unwrap(); wb.write().unwrap(); @@ -2099,34 +2065,34 @@ mod tests { #[test] fn test_iterator() { let (.., mut iter) = set_up_for_iteator(100, 200, |wb| { - wb.put(b"a", b"1").unwrap(); - wb.put(b"b", b"2").unwrap(); - wb.put(b"c", b"3").unwrap(); - wb.put(b"d", b"4").unwrap(); + wb.put(b"za", b"1").unwrap(); + wb.put(b"zb", b"2").unwrap(); + wb.put(b"zc", b"3").unwrap(); + wb.put(b"zd", b"4").unwrap(); }); - iter.seek(b"c").unwrap(); + iter.seek(b"zc").unwrap(); assert!(iter.valid().unwrap()); iter.prev().unwrap(); assert!(iter.valid().unwrap()); - assert_eq!(iter.key(), b"b"); + assert_eq!(iter.key(), b"zb"); assert_eq!(iter.value(), b"2"); iter.next().unwrap(); assert!(iter.valid().unwrap()); - assert_eq!(iter.key(), b"c"); + assert_eq!(iter.key(), b"zc"); assert_eq!(iter.value(), b"3"); - iter.seek_for_prev(b"c").unwrap(); + iter.seek_for_prev(b"zc").unwrap(); assert!(iter.valid().unwrap()); iter.next().unwrap(); assert!(iter.valid().unwrap()); - assert_eq!(iter.key(), b"d"); + assert_eq!(iter.key(), b"zd"); assert_eq!(iter.value(), b"4"); iter.prev().unwrap(); assert!(iter.valid().unwrap()); - assert_eq!(iter.key(), b"c"); + assert_eq!(iter.key(), b"zc"); assert_eq!(iter.value(), b"3"); } @@ -2135,56 +2101,57 @@ mod tests { #[test] fn test_next_with_newer_seq() { let (engine, _, mut iter) = set_up_for_iteator(100, 110, |wb| { - wb.put(b"0", b"0").unwrap(); - wb.put(b"a", b"b").unwrap(); - wb.put(b"c", b"d").unwrap(); - wb.put(b"d", b"e").unwrap(); + wb.put(b"z0", b"0").unwrap(); + wb.put(b"za", b"b").unwrap(); + wb.put(b"zc", b"d").unwrap(); + wb.put(b"zd", b"e").unwrap(); }); let mut wb = engine.write_batch(); - wb.prepare_for_range(CacheRange::new(b"".to_vec(), b"z".to_vec())); - wb.put(b"b", b"f").unwrap(); + let region = new_region(1, b"", b"z"); + wb.prepare_for_region(®ion); + wb.put(b"zb", b"f").unwrap(); wb.set_sequence_number(200).unwrap(); - iter.seek(b"a").unwrap(); - assert_eq!(iter.key(), b"a"); + iter.seek(b"za").unwrap(); + assert_eq!(iter.key(), b"za"); assert_eq!(iter.value(), b"b"); iter.next().unwrap(); - assert_eq!(iter.key(), b"c"); + assert_eq!(iter.key(), b"zc"); assert_eq!(iter.value(), b"d"); - iter.seek_for_prev(b"b").unwrap(); - assert_eq!(iter.key(), b"a"); + iter.seek_for_prev(b"zb").unwrap(); + assert_eq!(iter.key(), b"za"); assert_eq!(iter.value(), b"b"); iter.next().unwrap(); - assert_eq!(iter.key(), b"c"); + assert_eq!(iter.key(), b"zc"); assert_eq!(iter.value(), b"d"); - iter.seek(b"d").unwrap(); - assert_eq!(iter.key(), b"d"); + iter.seek(b"zd").unwrap(); + assert_eq!(iter.key(), b"zd"); assert_eq!(iter.value(), b"e"); iter.prev().unwrap(); - assert_eq!(iter.key(), b"c"); + assert_eq!(iter.key(), b"zc"); assert_eq!(iter.value(), b"d"); iter.prev().unwrap(); - assert_eq!(iter.key(), b"a"); + assert_eq!(iter.key(), b"za"); assert_eq!(iter.value(), b"b"); iter.prev().unwrap(); - 
iter.seek_for_prev(b"d").unwrap(); - assert_eq!(iter.key(), b"d"); + iter.seek_for_prev(b"zd").unwrap(); + assert_eq!(iter.key(), b"zd"); assert_eq!(iter.value(), b"e"); iter.prev().unwrap(); - assert_eq!(iter.key(), b"c"); + assert_eq!(iter.key(), b"zc"); assert_eq!(iter.value(), b"d"); iter.prev().unwrap(); - assert_eq!(iter.key(), b"a"); + assert_eq!(iter.key(), b"za"); assert_eq!(iter.value(), b"b"); } @@ -2205,7 +2172,7 @@ mod tests { }); // For sequence number 102 - let range = CacheRange::new(b"".to_vec(), b"z".to_vec()); + let range = CacheRegion::new(1, 0, b"".to_vec(), b"z".to_vec()); let snap = engine.snapshot(range.clone(), 100, 102).unwrap(); let mut iter_opt = IterOptions::default(); iter_opt.set_upper_bound(&range.end, 0); diff --git a/components/range_cache_memory_engine/src/region_label.rs b/components/in_memory_engine/src/region_label.rs similarity index 87% rename from components/range_cache_memory_engine/src/region_label.rs rename to components/in_memory_engine/src/region_label.rs index 1a1e34e8fa7..c6e643beb53 100644 --- a/components/range_cache_memory_engine/src/region_label.rs +++ b/components/in_memory_engine/src/region_label.rs @@ -3,7 +3,7 @@ use std::{sync::Arc, time::Duration}; use dashmap::DashMap; -use engine_traits::CacheRange; +use engine_traits::CacheRegion; use futures::{ compat::Future01CompatExt, stream::{self, StreamExt}, @@ -50,21 +50,21 @@ pub struct KeyRangeRule { pub end_key: String, } -impl TryFrom<&KeyRangeRule> for CacheRange { +impl TryFrom<&KeyRangeRule> for CacheRegion { type Error = Box; fn try_from(key_range: &KeyRangeRule) -> Result { let start_key = data_key(&hex::decode(&key_range.start_key)?); let end_key = data_end_key(&hex::decode(&key_range.end_key)?); - Ok(CacheRange::new(start_key, end_key)) + Ok(CacheRegion::new(0, 0, start_key, end_key)) } } -pub type RegionLabelAddedCb = Arc; +pub type RegionLabelChangedCallback = Arc; #[derive(Default)] pub struct RegionLabelRulesManager { pub(crate) region_labels: DashMap, - pub(crate) region_label_added_cb: Option, + pub(crate) region_label_change_cb: Option, } impl RegionLabelRulesManager { @@ -72,18 +72,19 @@ impl RegionLabelRulesManager { let old_value = self .region_labels .insert(label_rule.id.clone(), label_rule.clone()); - if let Some(cb) = self.region_label_added_cb.as_ref() { + if let Some(cb) = self.region_label_change_cb.as_ref() { match old_value.as_ref() { // If a watch fires twice on an identical label rule, ignore the second invocation. 
Some(old_value) if old_value == label_rule => { - info!("Identical region label rule added twice; ignoring."; "rule_id" => &label_rule.id) + info!("ime identical region label rule added twice; ignoring."; + "rule_id" => &label_rule.id) } - _ => cb(label_rule), + _ => cb(label_rule, true), } } } - #[allow(dead_code)] + #[cfg(test)] pub fn region_labels(&self) -> Vec { self.region_labels .iter() @@ -91,11 +92,14 @@ impl RegionLabelRulesManager { .collect::>() } - pub fn remove_region_label(&self, label_rule_id: &String) { - let _ = self.region_labels.remove(label_rule_id); + pub fn remove_region_label(&self, label_rule: &LabelRule) { + let _ = self.region_labels.remove(&label_rule.id); + if let Some(cb) = self.region_label_change_cb.as_ref() { + cb(label_rule, false); + } } - #[allow(dead_code)] + #[cfg(test)] pub fn get_region_label(&self, label_rule_id: &str) -> Option { self.region_labels .get(label_rule_id) @@ -146,12 +150,6 @@ impl RegionLabelServiceBuilder { } } - #[allow(dead_code)] - pub fn path_suffix(mut self, suffix: String) -> Self { - self.path_suffix = Some(suffix); - self - } - pub fn rule_filter_fn(mut self, rule_filter_fn: F) -> Self where F: Fn(&LabelRule) -> bool + Send + Sync + 'static, @@ -184,7 +182,7 @@ impl RegionLabelService { ) } - fn on_label_rule(&mut self, label_rule: &LabelRule) { + fn on_label_rule_add(&mut self, label_rule: &LabelRule) { let should_add_label = self .rule_filter_fn .as_ref() @@ -193,6 +191,15 @@ impl RegionLabelService { self.manager.add_region_label(label_rule) } } + fn on_label_rule_delete(&mut self, label_rule: &LabelRule) { + let should_remove_label = self + .rule_filter_fn + .as_ref() + .map_or_else(|| true, |r_f_fn| r_f_fn(label_rule)); + if should_remove_label { + self.manager.remove_region_label(label_rule) + } + } pub async fn watch_region_labels(&mut self) { self.reload_all_region_labels().await; 'outer: loop { @@ -205,7 +212,8 @@ impl RegionLabelService { .with_prev_kv(), ), ); - info!("pd meta client creating watch stream"; "path" => region_label_path, "rev" => %self.revision); + info!("ime pd meta client creating watch stream"; + "path" => region_label_path, "rev" => %self.revision); while let Some(grpc_response) = stream.next().await { match grpc_response { Ok(resp) => { @@ -216,28 +224,28 @@ impl RegionLabelService { match serde_json::from_slice::( event.get_kv().get_value(), ) { - Ok(label_rule) => self.on_label_rule(&label_rule), - Err(e) => error!("parse put region label event failed"; "name" => ?event.get_kv().get_key(), "err" => ?e), + Ok(label_rule) => self.on_label_rule_add(&label_rule), + Err(e) => error!("ime parse put region label event failed"; "name" => ?event.get_kv().get_key(), "err" => ?e), } } EventEventType::Delete => { match serde_json::from_slice::( event.get_prev_kv().get_value() ) { - Ok(label_rule) => self.manager.remove_region_label(&label_rule.id), - Err(e) => error!("parse delete region label event failed"; "name" => ?event.get_kv().get_key(), "err" => ?e), + Ok(label_rule) => self.on_label_rule_delete(&label_rule), + Err(e) => error!("ime parse delete region label event failed"; "name" => ?event.get_kv().get_key(), "err" => ?e), } } }); } Err(PdError::DataCompacted(msg)) => { - error!("required revision has been compacted"; "err" => ?msg); + error!("ime required revision has been compacted"; "err" => ?msg); self.reload_all_region_labels().await; cancel.abort(); continue 'outer; } Err(err) => { - error!("failed to watch region labels"; "err" => ?err); + error!("ime failed to watch region labels"; "err" => 
?err);
                         let _ = GLOBAL_TIMER_HANDLE
                             .delay(std::time::Instant::now() + RETRY_INTERVAL)
                             .compat()
@@ -261,17 +269,17 @@ impl RegionLabelService {
                     let kvs = resp.take_kvs().into_iter().collect::<Vec<_>>();
                     for g in kvs.iter() {
                         match serde_json::from_slice::<LabelRule>(g.get_value()) {
-                            Ok(label_rule) => self.on_label_rule(&label_rule),
+                            Ok(label_rule) => self.on_label_rule_add(&label_rule),
                             Err(e) => {
-                                error!("parse label rule failed"; "name" => ?g.get_key(), "err" => ?e);
+                                error!("ime parse label rule failed"; "name" => ?g.get_key(), "err" => ?e);
                             }
                         }
                     }
                     return;
                 }
                 Err(err) => {
-                    error!("failed to get meta storage's region label rules"; "err" => ?err);
+                    error!("ime failed to get meta storage's region label rules"; "err" => ?err);
                     let _ = GLOBAL_TIMER_HANDLE
                         .delay(std::time::Instant::now() + RETRY_INTERVAL)
                         .compat()
@@ -422,7 +430,7 @@ pub mod tests {
             );
         };
 
-        let background_worker = Builder::new("background").thread_count(1).create();
+        let background_worker = Builder::new("ime-test").thread_count(1).create();
         let mut s_clone = s.clone();
         background_worker.spawn_async_task(async move {
             s_clone.watch_region_labels().await;
diff --git a/components/in_memory_engine/src/region_manager.rs b/components/in_memory_engine/src/region_manager.rs
new file mode 100644
index 00000000000..ad7278669d8
--- /dev/null
+++ b/components/in_memory_engine/src/region_manager.rs
@@ -0,0 +1,1543 @@
+// Copyright 2024 TiKV Project Authors. Licensed under Apache-2.0.
+
+use std::{
+    assert_matches::assert_matches,
+    collections::{
+        BTreeMap,
+        Bound::{self, Excluded, Unbounded},
+    },
+    fmt::Debug,
+    future::Future,
+    pin::Pin,
+    result,
+    sync::{
+        atomic::{AtomicBool, Ordering},
+        Arc, Mutex,
+    },
+};
+
+use collections::HashMap;
+use engine_traits::{CacheRegion, EvictReason, FailedReason};
+use futures::executor::block_on;
+use parking_lot::RwLock;
+use strum::EnumCount;
+use tikv_util::{info, time::Instant, warn};
+
+use crate::{metrics::observe_eviction_duration, read::RegionCacheSnapshotMeta};
+
+pub(crate) trait AsyncFnOnce = FnOnce() -> Pin<Box<dyn Future<Output = ()> + Send>>;
+
+#[derive(PartialEq, Eq, Debug, Clone, Copy, Default, Hash, EnumCount)]
+#[repr(usize)]
+pub enum RegionState {
+    // Waiting to be loaded.
+    // NOTE: in this state, the region's epoch may be older than the
+    // target region in raftstore.
+    #[default]
+    Pending = 0,
+    // Region is handling batch loading from a rocksdb snapshot.
+    Loading = 1,
+    // Region is cached, ready to handle foreground reads.
+    Active = 2,
+    // Region should be evicted, but batch loading may still be running.
+    LoadingCanceled = 3,
+    // Region should be evicted, but there may be an active snapshot or gc task.
+    PendingEvict = 4,
+    // An eviction task is running; the region will be removed after the evict task finishes.
+    Evicting = 5,
+}
+
+impl RegionState {
+    pub fn as_str(&self) -> &'static str {
+        use RegionState::*;
+        match *self {
+            Pending => "pending",
+            Loading => "loading",
+            Active => "cached",
+            LoadingCanceled => "loading_canceled",
+            PendingEvict => "pending_evict",
+            Evicting => "evicting",
+        }
+    }
+
+    pub fn from_usize(v: usize) -> Self {
+        use RegionState::*;
+        match v {
+            0 => Pending,
+            1 => Loading,
+            2 => Active,
+            3 => LoadingCanceled,
+            4 => PendingEvict,
+            5 => Evicting,
+            _ => panic!("unknown RegionState value {}", v),
+        }
+    }
+
+    pub fn is_evict(&self) -> bool {
+        use RegionState::*;
+        matches!(*self, LoadingCanceled | PendingEvict | Evicting)
+    }
+}
+
+// read_ts -> ref_count
+#[derive(Default, Debug)]
+pub(crate) struct SnapshotList(pub(crate) BTreeMap<u64, u64>);
+
+impl SnapshotList {
+    pub(crate) fn new_snapshot(&mut self, read_ts: u64) {
+        // a snapshot with this ts may have been granted before
+        *self.0.entry(read_ts).or_default() += 1;
+    }
+
+    pub(crate) fn remove_snapshot(&mut self, read_ts: u64) {
+        let count = self.0.get_mut(&read_ts).unwrap();
+        assert!(*count >= 1);
+        if *count == 1 {
+            self.0.remove(&read_ts).unwrap();
+        } else {
+            *count -= 1;
+        }
+    }
+
+    // returns the min snapshot_ts (read_ts) if there's any
+    pub fn min_snapshot_ts(&self) -> Option<u64> {
+        self.0.first_key_value().map(|(ts, _)| *ts)
+    }
+
+    pub(crate) fn is_empty(&self) -> bool {
+        self.0.is_empty()
+    }
+}
+
+#[derive(Debug)]
+pub struct CacheRegionMeta {
+    // the cached region meta.
+    region: CacheRegion,
+    // active region snapshots.
+    region_snapshot_list: Mutex<SnapshotList>,
+    // the gc safe point.
+    safe_point: u64,
+    state: RegionState,
+    // whether a gc task is running on this region.
+    in_gc: AtomicBool,
+    // whether the raft apply thread is writing new KVs in this region.
+    is_written: AtomicBool,
+    // region eviction trigger info, and the callback invoked when eviction finishes.
+    evict_info: Option<EvictInfo>,
+}
+
+struct EvictInfo {
+    start: Instant,
+    reason: EvictReason,
+    // called when eviction finishes
+    cb: Option<Box<dyn AsyncFnOnce + Send + Sync>>,
+}
+
+impl Debug for EvictInfo {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        f.debug_struct("EvictInfo")
+            .field("reason", &self.reason)
+            .finish()
+    }
+}
+
+impl CacheRegionMeta {
+    fn new(region: CacheRegion) -> Self {
+        Self {
+            region,
+            region_snapshot_list: Mutex::new(SnapshotList::default()),
+            safe_point: 0,
+            state: RegionState::Pending,
+            in_gc: AtomicBool::new(false),
+            is_written: AtomicBool::new(false),
+            evict_info: None,
+        }
+    }
+
+    #[inline]
+    pub fn get_region(&self) -> &CacheRegion {
+        &self.region
+    }
+
+    // check whether we can replace the current outdated pending region with the new
+    // one.
+    pub(crate) fn can_be_updated_to(&self, region: &CacheRegion) -> bool {
+        assert!(
+            self.region.id == region.id && self.region.epoch_version < region.epoch_version,
+            "current: {:?}, new: {:?}",
+            &self.region,
+            region
+        );
+        // if the new region's range is contained by the current region, we can directly
+        // update to the new one.
+        self.region.contains_range(region)
+    }
+
+    pub fn safe_point(&self) -> u64 {
+        self.safe_point
+    }
+
+    pub(crate) fn set_safe_point(&mut self, safe_point: u64) {
+        assert!(self.safe_point <= safe_point);
+        self.safe_point = safe_point;
+    }
+
+    pub fn get_state(&self) -> RegionState {
+        self.state
+    }
+
+    // each state can only be updated to some specific new states.
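+    // A reading aid, not part of the original change: the transition table
+    // below (in `validate_update_region_state`) encodes this state machine:
+    //
+    //   Pending ---------> Loading
+    //   Loading ---------> Active | LoadingCanceled | Evicting
+    //   Active ----------> PendingEvict
+    //   LoadingCanceled -> PendingEvict | Evicting
+    //   PendingEvict ----> Evicting
+    //   Evicting --------> (removed once the evict task finishes)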
+    fn validate_update_region_state(&self, new_state: RegionState) -> bool {
+        use RegionState::*;
+        let valid_new_states: &[RegionState] = match self.state {
+            Pending => &[Loading],
+            Loading => &[Active, LoadingCanceled, Evicting],
+            Active => &[PendingEvict],
+            LoadingCanceled => &[PendingEvict, Evicting],
+            PendingEvict => &[Evicting],
+            Evicting => &[],
+        };
+        valid_new_states.contains(&new_state)
+    }
+
+    pub(crate) fn set_state(&mut self, new_state: RegionState) {
+        assert!(self.validate_update_region_state(new_state));
+        info!(
+            "ime update region meta state";
+            "region_id" => self.region.id,
+            "epoch" => self.region.epoch_version,
+            "current_state" => ?self.state,
+            "new_state" => ?new_state);
+        self.state = new_state;
+    }
+
+    pub(crate) fn mark_evict(
+        &mut self,
+        state: RegionState,
+        reason: EvictReason,
+        cb: Option<Box<dyn AsyncFnOnce + Send + Sync>>,
+    ) {
+        use RegionState::*;
+        assert_matches!(self.state, Loading | Active | LoadingCanceled);
+        assert_matches!(state, PendingEvict | Evicting);
+        self.set_state(state);
+        self.evict_info = Some(EvictInfo {
+            start: Instant::now_coarse(),
+            reason,
+            cb,
+        });
+    }
+
+    pub(crate) fn set_in_gc(&self, in_gc: bool) {
+        assert!(self.in_gc.load(Ordering::Acquire) != in_gc);
+        self.in_gc.store(in_gc, Ordering::Release);
+    }
+
+    pub fn is_in_gc(&self) -> bool {
+        self.in_gc.load(Ordering::Acquire)
+    }
+
+    #[inline]
+    pub(crate) fn set_being_written(&self) {
+        debug_assert!(!self.is_written.load(Ordering::Relaxed));
+        self.is_written.store(true, Ordering::Relaxed);
+    }
+
+    #[inline]
+    pub fn is_written(&self) -> bool {
+        self.is_written.load(Ordering::Relaxed)
+    }
+
+    // Build a new RegionMeta from an existing meta; the new meta should inherit
+    // the safe_point, state, in_gc and evict_info.
+    // This method is currently only used for handling region split.
+    pub(crate) fn derive_from(region: CacheRegion, source_meta: &Self) -> Self {
+        assert!(source_meta.region.contains_range(&region));
+        Self {
+            region,
+            region_snapshot_list: Mutex::new(SnapshotList::default()),
+            safe_point: source_meta.safe_point,
+            state: source_meta.state,
+            in_gc: AtomicBool::new(source_meta.in_gc.load(Ordering::Relaxed)),
+            is_written: AtomicBool::new(source_meta.is_written.load(Ordering::Relaxed)),
+            evict_info: None,
+        }
+    }
+
+    pub(crate) fn region_snapshot_list(&self) -> &Mutex<SnapshotList> {
+        &self.region_snapshot_list
+    }
+}
+
+// TODO: it's currently impossible to implement a `Borrow`ed instance
+// for `KeyAndVersion` from a borrowed key without clone.
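+// (Editor's illustration of the TODO above, assuming std's `Borrow`-based
+// `BTreeMap` lookup API: without a borrowed form of `KeyAndVersion`, a range
+// lookup has to clone the key, e.g.
+// `map.range((Excluded(KeyAndVersion(key.to_vec(), u64::MAX)), Unbounded))`,
+// as done in `overlaps_with_historical_regions` below.)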
+#[derive(PartialEq, Eq, PartialOrd, Ord, Clone)]
+struct KeyAndVersion(Vec<u8>, u64);
+
+pub struct RegionMetaMap {
+    // regions that are cached now
+    // data_end_key --> region_id
+    regions_by_range: BTreeMap<Vec<u8>, u64>,
+    // region_id --> region_meta
+    regions: HashMap<u64, CacheRegionMeta>,
+    // active flag, cloned from RegionManager.
+    is_active: Arc<AtomicBool>,
+
+    manual_load_range: Vec<CacheRegion>,
+}
+
+impl RegionMetaMap {
+    fn new_region_meta(&mut self, meta: CacheRegionMeta) {
+        assert!(!self.overlaps_with(&meta.region));
+        let id = meta.region.id;
+        let data_end_key = meta.region.end.clone();
+        let old_meta = self.regions.insert(id, meta);
+        assert!(old_meta.is_none(), "old_meta: {:?}", old_meta.unwrap());
+        let old_id = self.regions_by_range.insert(data_end_key, id);
+        assert!(old_id.is_none(), "old_region_id: {}", old_id.unwrap());
+        if self.regions.len() == 1 {
+            assert!(!self.is_active.load(Ordering::Relaxed));
+            self.is_active.store(true, Ordering::Relaxed);
+        }
+    }
+
+    pub(crate) fn load_region(
+        &mut self,
+        cache_region: CacheRegion,
+    ) -> Result<(), LoadFailedReason> {
+        use RegionState::*;
+        if let Some(state) = self.check_overlap_with_region(&cache_region) {
+            let reason = match state {
+                Pending | Loading => LoadFailedReason::PendingRange,
+                Active => LoadFailedReason::Overlapped,
+                LoadingCanceled | PendingEvict | Evicting => LoadFailedReason::Evicting,
+            };
+            return Err(reason);
+        }
+        info!("ime load new region"; "region" => ?cache_region);
+        let meta = CacheRegionMeta::new(cache_region);
+        self.new_region_meta(meta);
+        Ok(())
+    }
+
+    pub(crate) fn remove_region(&mut self, id: u64) -> CacheRegionMeta {
+        let meta = self.regions.remove(&id).unwrap();
+        self.regions_by_range.remove(&meta.region.end);
+        if self.regions.is_empty() {
+            assert!(self.is_active.load(Ordering::Relaxed));
+            self.is_active.store(false, Ordering::Relaxed);
+        }
+        meta
+    }
+
+    #[cfg(test)]
+    pub(crate) fn region_meta_by_end_key(&self, key: &[u8]) -> Option<&CacheRegionMeta> {
+        self.regions_by_range
+            .get(key)
+            .and_then(|id| self.regions.get(id))
+    }
+
+    fn overlaps_with(&self, region: &CacheRegion) -> bool {
+        let entry = self
+            .regions_by_range
+            .range::<[u8], (Bound<&[u8]>, Bound<&[u8]>)>((Excluded(&region.start), Unbounded))
+            .next();
+        if let Some((_key, id)) = entry {
+            let meta = &self.regions[id];
+            if meta.region.start < region.end {
+                return true;
+            }
+        }
+        false
+    }
+
+    /// `check_overlap_with_region` checks whether any existing region overlaps
+    /// with the target region. If there are regions in `pending` state whose
+    /// epoch version is smaller than the target region's, the pending regions
+    /// will be removed first.
+    fn check_overlap_with_region(&mut self, region: &CacheRegion) -> Option<RegionState> {
+        let mut removed_regions = vec![];
+        let mut overlapped_region_state = None;
+        self.iter_overlapped_regions(region, |region_meta| {
+            // a pending region with an out-dated epoch should be removed.
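+            // (Reading aid: the closure passed to `iter_overlapped_regions`
+            // returns `true` to continue scanning and `false` to stop at the
+            // first overlapping region that is not a stale pending one.)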
+            if region_meta.state == RegionState::Pending
+                && region_meta.region.epoch_version < region.epoch_version
+            {
+                removed_regions.push(region_meta.region.id);
+                return true;
+            }
+            warn!("ime load region overlaps with existing region";
+                "region" => ?region,
+                "exist_meta" => ?region_meta);
+            overlapped_region_state = Some(region_meta.state);
+            false
+        });
+        if !removed_regions.is_empty() {
+            info!("ime load region meet pending region with stale epoch, removed";
+                "region" => ?region, "stale_regions" => ?removed_regions);
+        }
+        for id in removed_regions {
+            self.remove_region(id);
+        }
+        if overlapped_region_state.is_some() {
+            return overlapped_region_state;
+        }
+
+        // Check the region with the same id. It is possible that there is a
+        // cached region with an outdated epoch that is still in the
+        // (pending_)evicting state, and a new load is triggered after the
+        // region is merged and split multiple times. Thus, the new pending
+        // region's range may not overlap with the old cached region even
+        // though their region ids are the same. While in theory we should keep
+        // the new region since it doesn't overlap with any other region,
+        // because we use region_id as the unique identifier we do not load it,
+        // for implementation simplicity; this kind of scenario should be very
+        // rare.
+        if let Some(region) = self.regions.get(&region.id) {
+            return Some(region.state);
+        }
+
+        None
+    }
+
+    fn on_all_overlapped_regions(&self, region: &CacheRegion, mut f: impl FnMut(&CacheRegionMeta)) {
+        // fast path: region epoch matches
+        if let Some(region_meta) = self.region_meta(region.id)
+            && region_meta.region.epoch_version == region.epoch_version
+        {
+            f(region_meta);
+            return;
+        }
+        // epoch not matched, need to iterate all overlapped regions.
+        self.iter_overlapped_regions(region, |meta| {
+            f(meta);
+            true
+        });
+    }
+
+    pub fn cached_regions(&self) -> Vec<u64> {
+        self.regions
+            .iter()
+            .filter_map(|(id, meta)| {
+                if meta.state == RegionState::Active {
+                    Some(*id)
+                } else {
+                    None
+                }
+            })
+            .collect::<Vec<_>>()
+    }
+
+    pub fn iter_overlapped_regions(
+        &self,
+        region: &CacheRegion,
+        mut f: impl FnMut(&CacheRegionMeta) -> bool,
+    ) {
+        for (_key, id) in self
+            .regions_by_range
+            .range::<[u8], (Bound<&[u8]>, Bound<&[u8]>)>((Excluded(&region.start), Unbounded))
+        {
+            let region_meta = &self.regions[id];
+            if region_meta.region.start >= region.end {
+                break;
+            }
+            if !f(region_meta) {
+                break;
+            }
+        }
+    }
+
+    pub fn iter_overlapped_regions_mut(
+        &mut self,
+        region: &CacheRegion,
+        mut f: impl FnMut(&mut CacheRegionMeta),
+    ) {
+        for (_key, id) in self
+            .regions_by_range
+            .range::<[u8], (Bound<&[u8]>, Bound<&[u8]>)>((Excluded(&region.start), Unbounded))
+        {
+            let region_meta = self.regions.get_mut(id).unwrap();
+            if region_meta.region.start >= region.end {
+                break;
+            }
+            f(region_meta);
+        }
+    }
+
+    #[inline]
+    pub fn region_meta(&self, id: u64) -> Option<&CacheRegionMeta> {
+        self.regions.get(&id)
+    }
+
+    #[inline]
+    pub(crate) fn mut_region_meta(&mut self, id: u64) -> Option<&mut CacheRegionMeta> {
+        self.regions.get_mut(&id)
+    }
+
+    pub fn regions(&self) -> &HashMap<u64, CacheRegionMeta> {
+        &self.regions
+    }
+
+    pub fn overlap_with_manual_load_range(&self, range: &CacheRegion) -> bool {
+        self.manual_load_range.iter().any(|r| r.overlaps(range))
+    }
+
+    pub fn add_manual_load_range(&mut self, range: CacheRegion) {
+        let mut union = range;
+        self.manual_load_range.retain(|r| {
+            let Some(u) = r.union(&union) else {
+                return true;
+            };
+            union = u;
+            // The intersected range needs to be removed before updating
+            // the union range.
+            false
+        });
+        info!("ime add manual load range"; "range" => ?union);
+        self.manual_load_range.push(union);
+    }
+
+    pub fn remove_manual_load_range(&mut self, range: CacheRegion) {
+        let mut diffs = Vec::new();
+        self.manual_load_range.retain(|r| {
+            match r.difference(&range) {
+                (None, None) => {
+                    // Keep the range if it does not overlap with the target
+                    // range.
+                    if !r.overlaps(&range) {
+                        return true;
+                    }
+                }
+                others => diffs.push(others),
+            };
+            // The intersected range needs to be removed before updating
+            // the union range.
+            false
+        });
+        info!("ime remove manual load range"; "range" => ?range);
+        assert!(diffs.len() <= 2, "{:?}", diffs);
+        for (left, right) in diffs.into_iter() {
+            if let Some(left) = left {
+                info!("ime update manual load range"; "range" => ?left);
+                self.manual_load_range.push(left);
+            }
+            if let Some(right) = right {
+                info!("ime update manual load range"; "range" => ?right);
+                self.manual_load_range.push(right);
+            }
+        }
+        const GC_THRESHOLD: usize = 64;
+        if self.manual_load_range.capacity() - self.manual_load_range.len() > GC_THRESHOLD {
+            self.manual_load_range.shrink_to_fit();
+        }
+    }
+}
+
+#[cfg(test)]
+impl Drop for RegionMetaMap {
+    fn drop(&mut self) {
+        assert_eq!(self.regions.len(), self.regions_by_range.len());
+        // check that regions and regions_by_range match each other.
+        for (key, id) in &self.regions_by_range {
+            let meta = self.regions.get(id).unwrap();
+            assert_eq!(key, &meta.region.end);
+        }
+    }
+}
+
+// RegionManager manages the regions for RegionCacheMemoryEngine. Every new
+// region (whether created by new_region/load_region or by split) has a unique
+// range that does not overlap with any other region.
+//
+// Each region is first added in the `pending` state. Because `pending` regions
+// can be added by the background workers, it is possible that a pending region
+// is added with an outdated epoch. We handle this outdated epoch in the raft
+// apply thread: before handling a region, the apply worker checks the region
+// in RegionManager; if its epoch is outdated (only possible in the `pending`
+// state) and the old region's range contains the new region's range, we update
+// it to the new version, otherwise we just drop the outdated region.
+//
+// In RegionCacheEngine, we only keep the region's epoch version updated with
+// raftstore, but not the conf version, for simplicity, because a conf version
+// change doesn't affect the correctness of the data. In order to always keep
+// the region epoch version updated, we use an ApplyObserver to watch the
+// following events:
+// - PrepareMerge/CommitMerge. We currently evict the target region for
+//   simplicity.
+// - Leader Resign. Evict the region.
+// - SST Ingestion. Evict the region.
+// - Split/BatchSplit. For a split event, we just replace the source region
+//   with the split new regions. The new regions should inherit the state of
+//   the source region, including (state, safe_point, in_gc). If there are
+//   ongoing snapshots in the source region, the source region meta should be
+//   put in `historical_regions`.
+pub struct RegionManager {
+    // regions_map holds the metadata of all cached regions.
+    pub(crate) regions_map: RwLock<RegionMetaMap>,
+    // we use this flag to ensure there is only 1 running gc task.
+    is_gc_task_running: AtomicBool,
+    // Outdated regions that are split but still hold some ongoing snapshots.
+    // These ongoing snapshots should block regions falling in this range from
+    // gc or eviction.
+    // It's possible that multiple regions with the same end key are in
+    // `historical_regions`, so we add the epoch version into the key to
+    // ensure uniqueness.
+    // (data_end_key, epoch_version) --> region_meta
+    historical_regions: Mutex<BTreeMap<KeyAndVersion, CacheRegionMeta>>,
+    // whether there are any cached regions. We use this flag to minimize the
+    // overhead of `prepare_for_apply` when no region is cached.
+    is_active: Arc<AtomicBool>,
+}
+
+impl Default for RegionManager {
+    fn default() -> Self {
+        let is_active = Arc::new(AtomicBool::new(false));
+        let regions_map = RwLock::new(RegionMetaMap {
+            regions_by_range: BTreeMap::default(),
+            regions: HashMap::default(),
+            is_active: is_active.clone(),
+            manual_load_range: Vec::default(),
+        });
+        Self {
+            regions_map,
+            is_gc_task_running: AtomicBool::default(),
+            historical_regions: Mutex::new(BTreeMap::default()),
+            is_active,
+        }
+    }
+}
+
+impl RegionManager {
+    // Load a new region directly in the active state.
+    // This function is used for unit/integration tests only.
+    pub fn new_region(&self, region: CacheRegion) {
+        let mut range_meta = CacheRegionMeta::new(region);
+        range_meta.state = RegionState::Active;
+        self.regions_map.write().new_region_meta(range_meta);
+    }
+
+    pub fn is_active(&self) -> bool {
+        self.is_active.load(Ordering::Acquire)
+    }
+
+    pub fn set_safe_point(&self, region_id: u64, safe_ts: u64) -> bool {
+        let mut regions_map = self.regions_map.write();
+        if let Some(meta) = regions_map.mut_region_meta(region_id) {
+            if meta.safe_point > safe_ts {
+                return false;
+            }
+            meta.safe_point = safe_ts;
+            true
+        } else {
+            false
+        }
+    }
+
+    pub fn get_region_for_key(&self, key: &[u8]) -> Option<CacheRegion> {
+        let regions_map = self.regions_map.read();
+        if let Some((key, id)) = regions_map
+            .regions_by_range
+            .range::<[u8], (Bound<&[u8]>, Bound<&[u8]>)>((Excluded(key), Unbounded))
+            .next()
+        {
+            let meta = &regions_map.regions[id];
+            if &meta.region.start <= key {
+                return Some(meta.region.clone());
+            }
+        }
+        None
+    }
+
+    pub fn contains_region(&self, region_id: u64) -> bool {
+        self.regions_map.read().regions.contains_key(&region_id)
+    }
+
+    pub fn regions_map(&self) -> &RwLock<RegionMetaMap> {
+        &self.regions_map
+    }
+
+    // Acquire a snapshot of the `region` with `read_ts`. If the region is not
+    // accessible, the failure reason is returned.
+    pub(crate) fn region_snapshot(
+        &self,
+        region_id: u64,
+        region_epoch: u64,
+        read_ts: u64,
+    ) -> result::Result<(), FailedReason> {
+        let regions_map = self.regions_map.read();
+        let Some(meta) = regions_map.region_meta(region_id) else {
+            return Err(FailedReason::NotCached);
+        };
+
+        if meta.state != RegionState::Active {
+            return Err(FailedReason::NotCached);
+        }
+
+        if meta.region.epoch_version != region_epoch {
+            return Err(FailedReason::EpochNotMatch);
+        }
+
+        if read_ts <= meta.safe_point {
+            return Err(FailedReason::TooOldRead);
+        }
+
+        meta.region_snapshot_list
+            .lock()
+            .unwrap()
+            .new_snapshot(read_ts);
+        Ok(())
+    }
+
+    // If the snapshot is the last one in the snapshot list of one cache region
+    // in historical_regions, one or more evicted regions may be ready to be
+    // removed physically.
+    // So, we return a vector of regions to denote the regions that are ready
+    // to be removed.
+    pub(crate) fn remove_region_snapshot(
+        &self,
+        snapshot_meta: &RegionCacheSnapshotMeta,
+    ) -> Vec<CacheRegion> {
+        let regions_map = self.regions_map.read();
+        // fast path: in most cases, the region is not changed.
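+        // (Reading aid: the fast path below takes only the read lock and
+        // upgrades to the write lock just when the last snapshot of a
+        // PendingEvict region is dropped; the slow path consults
+        // `historical_regions` for snapshots taken before a split.)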
+        if let Some(region_meta) = regions_map.region_meta(snapshot_meta.region.id)
+            && region_meta.region.epoch_version == snapshot_meta.region.epoch_version
+        {
+            // epoch not changed
+            let mut snapshot_list = region_meta.region_snapshot_list.lock().unwrap();
+            snapshot_list.remove_snapshot(snapshot_meta.snapshot_ts);
+            if Self::region_ready_to_evict(
+                region_meta,
+                &snapshot_list,
+                &self.historical_regions.lock().unwrap(),
+            ) {
+                drop(snapshot_list);
+                drop(regions_map);
+                let mut regions_map = self.regions_map.write();
+                let region_meta = regions_map
+                    .mut_region_meta(snapshot_meta.region.id)
+                    .unwrap();
+                region_meta.set_state(RegionState::Evicting);
+                return vec![region_meta.region.clone()];
+            }
+            return vec![];
+        }
+
+        // slow path: the region is not found or its epoch version has changed,
+        // so the snapshot must fall in the historical regions.
+        let hist_key = KeyAndVersion(
+            snapshot_meta.region.end.clone(),
+            snapshot_meta.region.epoch_version,
+        );
+        let mut historical_regions = self.historical_regions.lock().unwrap();
+        let meta = historical_regions.get_mut(&hist_key).unwrap();
+
+        let mut snapshot_list = meta.region_snapshot_list.lock().unwrap();
+        snapshot_list.remove_snapshot(snapshot_meta.snapshot_ts);
+
+        let mut deletable_regions = vec![];
+        if snapshot_list.is_empty() {
+            drop(snapshot_list);
+            historical_regions.remove(&hist_key).unwrap();
+            regions_map.iter_overlapped_regions(&snapshot_meta.region, |meta| {
+                if matches!(
+                    meta.get_state(),
+                    RegionState::PendingEvict | RegionState::Evicting
+                ) {
+                    assert_eq!(meta.get_state(), RegionState::PendingEvict);
+                    let snap_list = meta.region_snapshot_list.lock().unwrap();
+                    if Self::region_ready_to_evict(meta, &snap_list, &historical_regions) {
+                        deletable_regions.push(meta.region.clone());
+                    }
+                }
+                true
+            });
+            if !deletable_regions.is_empty() {
+                drop(regions_map);
+                let mut regions_map = self.regions_map.write();
+                for r in &deletable_regions {
+                    let meta = regions_map.mut_region_meta(r.id).unwrap();
+                    meta.set_state(RegionState::Evicting);
+                }
+            }
+        }
+        deletable_regions
+    }
+
+    // Whether the target region is ready to be physically evicted.
+    // NOTE: a region that is in gc or is being actively written will also
+    // block evicting, but we check these two factors in the DeleteRange
+    // worker, so we don't check them here for simplicity.
+    #[inline]
+    fn region_ready_to_evict(
+        meta: &CacheRegionMeta,
+        snapshot_list: &SnapshotList,
+        historical_regions: &BTreeMap<KeyAndVersion, CacheRegionMeta>,
+    ) -> bool {
+        if meta.state != RegionState::PendingEvict {
+            return false;
+        }
+        snapshot_list.is_empty()
+            && !Self::overlaps_with_historical_regions(&meta.region, historical_regions)
+    }
+
+    fn overlaps_with_historical_regions(
+        region: &CacheRegion,
+        historical_regions: &BTreeMap<KeyAndVersion, CacheRegionMeta>,
+    ) -> bool {
+        for (_, meta) in historical_regions.range((
+            Excluded(KeyAndVersion(region.start.clone(), u64::MAX)),
+            Unbounded,
+        )) {
+            if meta.region.start < region.end {
+                return true;
+            }
+        }
+        false
+    }
+
+    pub(crate) fn get_history_regions_min_ts(&self, region: &CacheRegion) -> Option<u64> {
+        self.historical_regions
+            .lock()
+            .unwrap()
+            .range((
+                Excluded(KeyAndVersion(region.start.clone(), u64::MAX)),
+                Unbounded,
+            ))
+            .filter_map(|(_, meta)| {
+                if meta.region.start < region.end {
+                    meta.region_snapshot_list.lock().unwrap().min_snapshot_ts()
+                } else {
+                    None
+                }
+            })
+            .min()
+    }
+
+    pub(crate) fn on_gc_region_finished(&self, region: &CacheRegion) {
+        let regions_map = self.regions_map.read();
+        regions_map.on_all_overlapped_regions(region, |meta| {
+            assert!(region.contains_range(&meta.region));
+            meta.set_in_gc(false);
+        });
+    }
+
+    /// Return the regions that can be deleted now (no ongoing snapshot).
+    // If the region epoch has changed, which means the region range may have
+    // changed, evict the regions overlapping with the range of `evict_region`.
+    // `cb` is called when the eviction of the region with the same id as
+    // `evict_region` has finished.
+    // Note: `cb` should not do anything heavy.
+    pub(crate) fn evict_region(
+        &self,
+        evict_region: &CacheRegion,
+        evict_reason: EvictReason,
+        mut cb: Option<Box<dyn AsyncFnOnce + Send + Sync>>,
+    ) -> Vec<CacheRegion> {
+        info!(
+            "ime try to evict region";
+            "evict_region" => ?evict_region,
+            "reason" => ?evict_reason,
+        );
+
+        let mut regions_map = self.regions_map.write();
+        let mut evict_ids = vec![];
+        regions_map.on_all_overlapped_regions(evict_region, |meta| {
+            evict_ids.push(meta.region.id);
+        });
+        if evict_ids.is_empty() {
+            info!("ime evict a region that is not cached";
+                "reason" => ?evict_reason,
+                "region" => ?evict_region);
+            return vec![];
+        }
+
+        let mut deletable_regions = vec![];
+        for rid in evict_ids {
+            if let Some(region) = self.do_evict_region(
+                rid,
+                evict_region,
+                evict_reason,
+                &mut regions_map,
+                if rid == evict_region.id {
+                    cb.take()
+                } else {
+                    None
+                },
+            ) {
+                deletable_regions.push(region);
+            }
+        }
+        deletable_regions
+    }
+
+    // Return the region if it can be directly deleted.
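+    // (Reading aid: Pending regions are removed from the map immediately;
+    // Loading regions become LoadingCanceled; Active regions become
+    // PendingEvict and, once no snapshot or historical region pins them,
+    // Evicting, in which case the region is returned for physical deletion.)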
+    // Return the region if it can be directly deleted.
+    fn do_evict_region(
+        &self,
+        id: u64,
+        evict_region: &CacheRegion,
+        evict_reason: EvictReason,
+        regions_map: &mut RegionMetaMap,
+        cb: Option<Box<dyn AsyncFnOnce + Send + Sync>>,
+    ) -> Option<CacheRegion> {
+        let meta = regions_map.mut_region_meta(id).unwrap();
+        let prev_state = meta.state;
+        assert!(
+            meta.region.overlaps(evict_region),
+            "meta: {:?}, evict_region: {:?}",
+            meta,
+            evict_region
+        );
+        if prev_state == RegionState::Pending {
+            let meta = regions_map.remove_region(id);
+            info!(
+                "ime evict overlap pending region";
+                "reason" => ?evict_reason,
+                "target_region" => ?evict_region,
+                "overlap_region" => ?meta.region,
+                "state" => ?prev_state,
+            );
+            return None;
+        } else if prev_state.is_evict() {
+            info!("ime region already evicted";
+                "region" => ?meta.region, "state" => ?prev_state);
+            return None;
+        }
+
+        if prev_state == RegionState::Active {
+            meta.mark_evict(RegionState::PendingEvict, evict_reason, cb);
+        } else {
+            meta.set_state(RegionState::LoadingCanceled)
+        };
+
+        info!(
+            "ime evict overlap region";
+            "reason" => ?evict_reason,
+            "target_region" => ?evict_region,
+            "overlap_region" => ?meta.region,
+            "state" => ?prev_state,
+            "new_state" => ?meta.state,
+        );
+
+        if meta.state == RegionState::PendingEvict {
+            let snap_list = meta.region_snapshot_list.lock().unwrap();
+            let historical_regions = self.historical_regions.lock().unwrap();
+            if Self::region_ready_to_evict(meta, &snap_list, &historical_regions) {
+                drop(snap_list);
+                meta.set_state(RegionState::Evicting);
+                return Some(meta.region.clone());
+            }
+        }
+        None
+    }
+
+    pub fn on_delete_regions(&self, regions: &[CacheRegion]) {
+        fail::fail_point!("ime_on_delete_regions");
+        let mut cbs = vec![];
+        {
+            let mut regions_map = self.regions_map.write();
+            for r in regions {
+                let meta = regions_map.remove_region(r.id);
+                assert_eq!(meta.region.epoch_version, r.epoch_version);
+                info!("ime remove evicted region"; "meta" => ?meta);
+
+                let evict_info = meta.evict_info.unwrap();
+                observe_eviction_duration(
+                    evict_info.start.saturating_elapsed_secs(),
+                    evict_info.reason,
+                );
+                if let Some(cb) = evict_info.cb {
+                    cbs.push(cb);
+                }
+
+                info!(
+                    "ime range eviction done";
+                    "region" => ?r,
+                );
+            }
+        }
+
+        block_on(async {
+            for cb in cbs {
+                cb().await;
+            }
+        });
+    }
+
+    // Return whether the operation is successful.
+    pub fn try_set_regions_in_gc(&self, in_gc: bool) -> bool {
+        self.is_gc_task_running
+            .compare_exchange(!in_gc, in_gc, Ordering::AcqRel, Ordering::Relaxed)
+            .is_ok()
+    }
+
+    pub(crate) fn clear_regions_in_being_written(&self, regions: &[CacheRegion]) {
+        let regions_map = self.regions_map.read();
+        for r in regions {
+            regions_map.on_all_overlapped_regions(r, |meta| {
+                assert!(r.contains_range(&meta.region));
+                debug_assert!(meta.is_written());
+                meta.is_written.store(false, Ordering::Release);
+            });
+        }
+    }
+
+    pub fn load_region(&self, cache_region: CacheRegion) -> Result<(), LoadFailedReason> {
+        self.regions_map.write().load_region(cache_region)
+    }
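`try_set_regions_in_gc` above uses a single `compare_exchange` to make "start GC" and "finish GC" mutually exclusive with concurrent attempts: flipping the flag succeeds only when the previous value is exactly `!in_gc`. A tiny standalone illustration of the pattern (the names here are hypothetical):

```rust
use std::sync::atomic::{AtomicBool, Ordering};

struct GcGate(AtomicBool);

impl GcGate {
    // Succeeds only if the flag currently holds !in_gc.
    fn try_set(&self, in_gc: bool) -> bool {
        self.0
            .compare_exchange(!in_gc, in_gc, Ordering::AcqRel, Ordering::Relaxed)
            .is_ok()
    }
}

fn main() {
    let gate = GcGate(AtomicBool::new(false));
    assert!(gate.try_set(true)); // a GC round starts
    assert!(!gate.try_set(true)); // a second concurrent start is rejected
    assert!(gate.try_set(false)); // the running round finishes
}
```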
+    // Split the cached `source_region` into `new_regions`. If the source
+    // region is being evicted, the split is skipped.
+    pub(crate) fn split_region(
+        &self,
+        source_region: &CacheRegion,
+        mut new_regions: Vec<CacheRegion>,
+    ) {
+        let mut regions_map = self.regions_map.write();
+        if let Some(region_meta) = regions_map.region_meta(source_region.id) {
+            // if the region is evicting, skip handling the split for simplicity.
+            if region_meta.state.is_evict() {
+                info!("ime region is evicted, skip split";
+                    "meta" => ?region_meta, "new_regions" => ?new_regions);
+                return;
+            }
+        } else {
+            info!("ime split source region not cached"; "region_id" => source_region.id);
+            return;
+        }
+
+        let region_meta = regions_map.remove_region(source_region.id);
+        assert!(!region_meta.state.is_evict());
+        if region_meta.region.epoch_version != source_region.epoch_version {
+            // for pending regions, if the epoch version has changed, we keep only
+            // the new regions that still fall in the cached range.
+            assert_eq!(region_meta.state, RegionState::Pending);
+            assert!(region_meta.region.epoch_version < source_region.epoch_version);
+            new_regions.retain(|r| region_meta.region.overlaps(r));
+            info!("ime handle split region met pending region epoch stale";
+                "cached" => ?region_meta,
+                "split_source" => ?source_region,
+                "cache_new_regions" => ?new_regions);
+        }
+
+        info!("ime handle region split";
+            "region_id" => source_region.id,
+            "meta" => ?region_meta,
+            "new_regions" => ?new_regions);
+
+        for r in new_regions {
+            let meta = CacheRegionMeta::derive_from(r, &region_meta);
+            regions_map.new_region_meta(meta);
+        }
+
+        // if there are still active snapshots, we need to put the original
+        // region into `historical_regions` to track these snapshots.
+        let snapshot_empty = region_meta.region_snapshot_list.lock().unwrap().is_empty();
+        if !snapshot_empty {
+            self.historical_regions.lock().unwrap().insert(
+                KeyAndVersion(
+                    region_meta.region.end.clone(),
+                    region_meta.region.epoch_version,
+                ),
+                region_meta,
+            );
+        }
+    }
+}
+
+#[derive(Debug, PartialEq)]
+pub enum LoadFailedReason {
+    Overlapped,
+    PendingRange,
+    Evicting,
+}
+
+#[derive(PartialEq, Debug, Clone, Copy)]
+pub enum RegionCacheStatus {
+    NotInCache,
+    Cached,
+    Loading,
+}
+
+#[cfg(test)]
+mod tests {
+    use engine_traits::{CacheRegion, EvictReason, FailedReason};
+
+    use super::*;
+    use crate::region_manager::LoadFailedReason;
+
+    #[test]
+    fn test_range_manager() {
+        let region_mgr = RegionManager::default();
+        let r1 = CacheRegion::new(1, 0, b"k00", b"k10");
+
+        region_mgr.new_region(r1.clone());
+        region_mgr.set_safe_point(r1.id, 5);
+        assert_eq!(
+            region_mgr.region_snapshot(r1.id, 0, 5).unwrap_err(),
+            FailedReason::TooOldRead
+        );
+        region_mgr.region_snapshot(r1.id, 0, 8).unwrap();
+        let snapshot1 = RegionCacheSnapshotMeta::new(r1.clone(), 8, 1);
+        region_mgr.region_snapshot(r1.id, 0, 10).unwrap();
+        let snapshot2 = RegionCacheSnapshotMeta::new(r1.clone(), 10, 2);
+        assert_eq!(
+            region_mgr.region_snapshot(2, 0, 8).unwrap_err(),
+            FailedReason::NotCached
+        );
+
+        let r_evict = CacheRegion::new(2, 2, b"k03", b"k06");
+        let r_left = CacheRegion::new(1, 2, b"k00", b"k03");
+        let r_right = CacheRegion::new(3, 2, b"k06", b"k10");
+        region_mgr.split_region(&r1, vec![r_left.clone(), r_evict.clone(), r_right.clone()]);
+        region_mgr.evict_region(&r_evict, EvictReason::AutoEvict, None);
+
+        {
+            let regions_map = region_mgr.regions_map.read();
+            let history_regions = region_mgr.historical_regions.lock().unwrap();
+            let meta1 = history_regions
+                .get(&KeyAndVersion(r1.end.clone(), 0))
+                .unwrap();
+            assert_eq!(
+                regions_map.regions.get(&r_evict.id).unwrap().state,
+                RegionState::PendingEvict,
+            );
+            assert_eq!(
+                regions_map.regions_by_range.get(&r1.end).unwrap(),
+                &r_right.id
+            );
+
+            let meta2 = regions_map.regions.get(&r_left.id).unwrap();
+            let meta3 = regions_map.regions.get(&r_right.id).unwrap();
+            assert!(meta1.safe_point == meta2.safe_point && meta1.safe_point == meta3.safe_point);
+        }
+        // evict a range with an exact match
+        region_mgr.region_snapshot(r_left.id, 2, 10).unwrap();
+        let snapshot3 = RegionCacheSnapshotMeta::new(r_left.clone(), 10, 3);
+        region_mgr.evict_region(&r_left, EvictReason::AutoEvict, None);
+        assert_eq!(
+            region_mgr
+                .regions_map
+                .read()
+                .regions
+                .get(&r_left.id)
+                .unwrap()
+                .state,
+            RegionState::PendingEvict,
+        );
+        assert!(region_mgr.remove_region_snapshot(&snapshot1).is_empty());
+
+        let regions = region_mgr.remove_region_snapshot(&snapshot2);
+        assert_eq!(regions, vec![r_evict.clone()]);
+        assert_eq!(
+            region_mgr
+                .regions_map
+                .read()
+                .region_meta(r_evict.id)
+                .unwrap()
+                .get_state(),
+            RegionState::Evicting
+        );
+
+        let regions = region_mgr.remove_region_snapshot(&snapshot3);
+        assert_eq!(regions, vec![r_left.clone()]);
+        assert_eq!(
+            region_mgr
+                .regions_map
+                .read()
+                .region_meta(r_left.id)
+                .unwrap()
+                .get_state(),
+            RegionState::Evicting
+        );
+    }
+
+    #[test]
+    fn test_range_load() {
+        let region_mgr = RegionManager::default();
+        let r1 = CacheRegion::new(1, 0, b"k00", b"k10");
+        let mut r2 = CacheRegion::new(2, 2, b"k10", b"k20");
+        let r3 = CacheRegion::new(3, 0, b"k20", b"k30");
+        let r4 = CacheRegion::new(4, 0, b"k25", b"k35");
+
+        region_mgr.new_region(r1.clone());
+        region_mgr.load_region(r2.clone()).unwrap();
+        region_mgr.new_region(r3.clone());
+        region_mgr.evict_region(&r1, EvictReason::AutoEvict, None);
+
+        assert_eq!(
+            region_mgr.load_region(r1).unwrap_err(),
+            LoadFailedReason::Evicting
+        );
+
+        // load r2 with an outdated epoch.
+        r2.epoch_version = 1;
+        assert_eq!(
+            region_mgr.load_region(r2).unwrap_err(),
+            LoadFailedReason::PendingRange,
+        );
+
+        assert_eq!(
+            region_mgr.load_region(r4).unwrap_err(),
+            LoadFailedReason::Overlapped
+        );
+    }
+
+    #[test]
+    fn test_range_load_overlapped() {
+        let region_mgr = RegionManager::default();
+        let r1 = CacheRegion::new(1, 0, b"k00", b"k10");
+        let r3 = CacheRegion::new(3, 0, b"k40", b"k50");
+        region_mgr.new_region(r1.clone());
+        region_mgr.evict_region(&r1, EvictReason::AutoEvict, None);
+
+        region_mgr.load_region(r3).unwrap();
+
+        let r = CacheRegion::new(4, 0, b"k00", b"k05");
+        assert_eq!(
+            region_mgr.load_region(r).unwrap_err(),
+            LoadFailedReason::Evicting
+        );
+        let r = CacheRegion::new(4, 0, b"k05", b"k15");
+        assert_eq!(
+            region_mgr.load_region(r).unwrap_err(),
+            LoadFailedReason::Evicting
+        );
+
+        let r = CacheRegion::new(4, 0, b"k35", b"k45");
+        assert_eq!(
+            region_mgr.load_region(r).unwrap_err(),
+            LoadFailedReason::PendingRange
+        );
+        let r = CacheRegion::new(4, 0, b"k45", b"k55");
+        assert_eq!(
+            region_mgr.load_region(r).unwrap_err(),
+            LoadFailedReason::PendingRange
+        );
+
+        // test an overlapping id with a non-overlapping range
+        let r = CacheRegion::new(1, 2, b"k20", b"k30");
+        assert_eq!(
+            region_mgr.load_region(r).unwrap_err(),
+            LoadFailedReason::Evicting
+        );
+    }
+
+    #[test]
+    fn test_evict_regions() {
+        {
+            let region_mgr = RegionManager::default();
+            let r1 = CacheRegion::new(1, 0, b"k00", b"k10");
+            let r2 = CacheRegion::new(2, 0, b"k20", b"k30");
+            let r3 = CacheRegion::new(3, 0, b"k40", b"k50");
+            region_mgr.new_region(r1.clone());
+            region_mgr.new_region(r2.clone());
+            region_mgr.new_region(r3.clone());
+            assert!(region_mgr.contains_region(r1.id));
+            assert!(region_mgr.contains_region(r2.id));
+            assert!(region_mgr.contains_region(r3.id));
+
+            let r4 = CacheRegion::new(4, 2, b"k00", b"k05");
+            assert_eq!(
+                region_mgr.evict_region(&r4, EvictReason::AutoEvict, None),
+                vec![r1]
+            );
+        }
+
+        {
+            let region_mgr = RegionManager::default();
+            let r1 = CacheRegion::new(1, 0, b"k00",
b"k10"); + let r2 = CacheRegion::new(2, 0, b"k20", b"k30"); + let r3 = CacheRegion::new(3, 0, b"k40", b"k50"); + region_mgr.new_region(r1.clone()); + region_mgr.new_region(r2.clone()); + region_mgr.new_region(r3.clone()); + assert!(region_mgr.contains_region(r1.id)); + assert!(region_mgr.contains_region(r2.id)); + assert!(region_mgr.contains_region(r3.id)); + + let r4 = CacheRegion::new(4, 0, b"k", b"k51"); + assert_eq!( + region_mgr.evict_region(&r4, EvictReason::AutoEvict, None), + vec![r1, r2, r3] + ); + assert!( + region_mgr + .regions_map + .read() + .regions + .values() + .all(|m| m.get_state() == RegionState::Evicting) + ); + } + + { + let region_mgr = RegionManager::default(); + let r1 = CacheRegion::new(1, 0, b"k00", b"k10"); + let r2 = CacheRegion::new(2, 0, b"k20", b"k30"); + let r3 = CacheRegion::new(3, 0, b"k40", b"k50"); + region_mgr.new_region(r1.clone()); + region_mgr.new_region(r2.clone()); + region_mgr.new_region(r3.clone()); + + let r4 = CacheRegion::new(4, 0, b"k25", b"k55"); + assert_eq!( + region_mgr.evict_region(&r4, EvictReason::AutoEvict, None), + vec![r2, r3] + ); + assert_eq!( + region_mgr + .regions_map + .read() + .regions + .values() + .filter(|m| m.get_state() == RegionState::Active) + .count(), + 1 + ); + } + + { + let region_mgr = RegionManager::default(); + let r1 = CacheRegion::new(1, 0, b"k00", b"k10"); + let r2 = CacheRegion::new(2, 0, b"k30", b"k40"); + let r3 = CacheRegion::new(3, 0, b"k50", b"k60"); + region_mgr.new_region(r1.clone()); + region_mgr.new_region(r2.clone()); + region_mgr.new_region(r3.clone()); + + let r4 = CacheRegion::new(4, 0, b"k25", b"k75"); + assert_eq!( + region_mgr.evict_region(&r4, EvictReason::AutoEvict, None), + vec![r2, r3] + ); + assert_eq!( + region_mgr + .regions_map + .read() + .regions + .values() + .filter(|m| m.get_state() == RegionState::Active) + .count(), + 1 + ); + } + } + + #[test] + fn test_overlap_with_manual_load_range() { + let region_mgr = RegionManager::default(); + region_mgr + .regions_map() + .write() + .add_manual_load_range(CacheRegion::new(0, 0, b"k00".to_vec(), b"k10".to_vec())); + region_mgr + .regions_map() + .write() + .add_manual_load_range(CacheRegion::new(0, 0, b"k20".to_vec(), b"k30".to_vec())); + region_mgr + .regions_map() + .write() + .add_manual_load_range(CacheRegion::new(0, 0, b"k30".to_vec(), b"k50".to_vec())); + + struct Case { + name: &'static str, + range: (&'static [u8], &'static [u8]), + expected: bool, + } + let cases = [ + Case { + name: "left intersect 1", + range: (b"k00", b"k05"), + expected: true, + }, + Case { + name: "left intersect 2", + range: (b"k15", b"k25"), + expected: true, + }, + Case { + name: "cover all", + range: (b"k00", b"k60"), + expected: true, + }, + Case { + name: "right intersect", + range: (b"k05", b"k15"), + expected: true, + }, + Case { + name: "left and right intersect", + range: (b"k25", b"k45"), + expected: true, + }, + Case { + name: "not overlap 1", + range: (b"k15", b"k20"), + expected: false, + }, + Case { + name: "not overlap 2", + range: (b"k", b"k0"), + expected: false, + }, + Case { + name: "not overlap 3", + range: (b"k60", b"k70"), + expected: false, + }, + ]; + + for case in cases { + let range = CacheRegion::new(0, 0, case.range.0.to_vec(), case.range.1.to_vec()); + assert_eq!( + region_mgr + .regions_map() + .read() + .overlap_with_manual_load_range(&range), + case.expected, + "{}", + case.name + ); + } + } + + #[test] + fn test_manual_load_range_add_remove() { + struct Case { + name: &'static str, + build: Vec<(&'static [u8], &'static 
[u8])>, + remove: Vec<(&'static [u8], &'static [u8])>, + add: Vec<(&'static [u8], &'static [u8])>, + result: Vec<(&'static [u8], &'static [u8])>, + } + let cases = [ + Case { + name: "remove empty", + build: vec![], + remove: vec![(b"k00", b"k10")], + add: vec![], + result: vec![], + }, + Case { + name: "add empty", + build: vec![], + remove: vec![], + add: vec![(b"k00", b"k10")], + result: vec![(b"k00", b"k10")], + }, + // Test remove + Case { + name: "remove one range", + build: vec![(b"k00", b"k10"), (b"k20", b"k30"), (b"k40", b"k50")], + remove: vec![(b"k20", b"k30")], + add: vec![], + result: vec![(b"k00", b"k10"), (b"k40", b"k50")], + }, + Case { + name: "remove left intersected ranges", + build: vec![(b"k00", b"k10"), (b"k20", b"k30"), (b"k40", b"k50")], + remove: vec![(b"k", b"k05")], + add: vec![], + result: vec![(b"k05", b"k10"), (b"k20", b"k30"), (b"k40", b"k50")], + }, + Case { + name: "remove left and right intersected ranges", + build: vec![(b"k00", b"k10"), (b"k20", b"k30"), (b"k40", b"k50")], + remove: vec![(b"k05", b"k45")], + add: vec![], + result: vec![(b"k00", b"k05"), (b"k45", b"k50")], + }, + Case { + name: "remove right intersected ranges", + build: vec![(b"k00", b"k10"), (b"k20", b"k30"), (b"k40", b"k50")], + remove: vec![(b"k45", b"k60")], + add: vec![], + result: vec![(b"k00", b"k10"), (b"k20", b"k30"), (b"k40", b"k45")], + }, + // Test add + Case { + name: "add left intersected ranges 1", + build: vec![(b"k00", b"k10"), (b"k20", b"k30")], + add: vec![(b"k", b"k05")], + remove: vec![], + result: vec![(b"k", b"k10"), (b"k20", b"k30")], + }, + Case { + name: "add left intersected ranges 2", + build: vec![(b"k00", b"k10"), (b"k20", b"k30")], + add: vec![(b"k", b"k25")], + remove: vec![], + result: vec![(b"k", b"k30")], + }, + Case { + name: "add right intersected ranges 1", + build: vec![(b"k20", b"k30"), (b"k40", b"k50")], + add: vec![(b"k45", b"k55")], + remove: vec![], + result: vec![(b"k20", b"k30"), (b"k40", b"k55")], + }, + Case { + name: "add right intersected ranges 2", + build: vec![(b"k20", b"k30"), (b"k40", b"k50")], + add: vec![(b"k25", b"k55")], + remove: vec![], + result: vec![(b"k20", b"k55")], + }, + Case { + name: "add left and right intersected ranges 1", + build: vec![(b"k20", b"k30")], + add: vec![(b"k10", b"k50")], + remove: vec![], + result: vec![(b"k10", b"k50")], + }, + Case { + name: "add left and right intersected ranges 2", + build: vec![(b"k10", b"k20"), (b"k20", b"k30")], + add: vec![(b"k10", b"k50")], + remove: vec![], + result: vec![(b"k10", b"k50")], + }, + Case { + name: "add adjacent ranges", + build: vec![(b"k10", b"k20")], + add: vec![(b"k00", b"k10"), (b"k20", b"k30")], + remove: vec![], + result: vec![(b"k00", b"k10"), (b"k10", b"k20"), (b"k20", b"k30")], + }, + ]; + + for case in cases { + // Build + let region_mgr = RegionManager::default(); + for (start, end) in case.build { + let r = CacheRegion::new(0, 0, start.to_vec(), end.to_vec()); + region_mgr.regions_map().write().add_manual_load_range(r); + } + + // Remove + for (start, end) in case.remove { + let r = CacheRegion::new(0, 0, start.to_vec(), end.to_vec()); + region_mgr.regions_map().write().remove_manual_load_range(r); + } + + // Add + for (start, end) in case.add { + let r = CacheRegion::new(0, 0, start.to_vec(), end.to_vec()); + region_mgr.regions_map().write().add_manual_load_range(r); + } + + // Check + let map = region_mgr.regions_map.read(); + assert_eq!( + map.manual_load_range.len(), + case.result.len(), + "case: {}", + case.name + ); + for r in case.result { + 
            assert!(
                map.manual_load_range
                    .iter()
                    .any(|range| range.start.as_slice() == r.0 && range.end.as_slice() == r.1),
                "case: {}",
                case.name
            );
        }
    }
}
}
diff --git a/components/in_memory_engine/src/region_stats.rs b/components/in_memory_engine/src/region_stats.rs
new file mode 100644
index 00000000000..5f81dc82860
--- /dev/null
+++ b/components/in_memory_engine/src/region_stats.rs
@@ -0,0 +1,713 @@
+// Copyright 2024 TiKV Project Authors. Licensed under Apache-2.0.
+use std::{
+    collections::{BTreeMap, HashSet},
+    num::NonZeroUsize,
+    sync::{
+        atomic::{AtomicBool, Ordering},
+        Arc,
+    },
+    time::{Duration, Instant},
+};
+
+use collections::HashMap;
+use crossbeam::sync::ShardedLock;
+use engine_traits::{CacheRegion, EvictReason};
+use kvproto::metapb::Region;
+use parking_lot::Mutex;
+use raftstore::coprocessor::RegionInfoProvider;
+use slog_global::error;
+use tikv_util::{config::VersionTrack, info, smoother::Smoother, worker::Scheduler};
+use tokio::sync::mpsc;
+
+use crate::{
+    memory_controller::MemoryController, region_manager::AsyncFnOnce, BackgroundTask,
+    InMemoryEngineConfig,
+};
+
+/// Do not evict a region if it has been cached for less than this duration.
+pub const DEFAULT_EVICT_MIN_DURATION: Duration = Duration::from_secs(60 * 5);
+const MIN_REGION_COUNT_TO_EVICT: usize = 5;
+// TODO(SpadeA): this 10 and 20 may be adjusted by observing more workloads.
+const MVCC_AMPLIFICATION_FILTER_FACTOR: f64 = 10.0;
+const ITERATED_COUNT_FILTER_FACTOR: usize = 20;
+
+#[derive(Clone)]
+pub(crate) struct RegionStatsManager {
+    config: Arc<VersionTrack<InMemoryEngineConfig>>,
+    info_provider: Arc<dyn RegionInfoProvider>,
+    checking_top_regions: Arc<AtomicBool>,
+    region_loaded_at: Arc<ShardedLock<BTreeMap<u64, Instant>>>,
+    evict_min_duration: Duration,
+    // Moving average of the amplification reduction, i.e. the multiple by which
+    // mvcc amplification shrinks after a region is cached. When a new top region
+    // (necessarily not yet cached) comes in, this moving average is used to
+    // estimate its mvcc amplification after caching, so we can decide whether to
+    // evict some regions when memory usage is relatively high.
+    ma_mvcc_amplification_reduction: Arc<Mutex<Smoother<f64, 10, 0, 0>>>,
+    mvcc_amplification_record: Arc<Mutex<HashMap<u64, f64>>>,
+
+    last_load_evict_time: Arc<Mutex<Instant>>,
+}
+
+impl RegionStatsManager {
+    /// Creates a new RegionStatsManager that retrieves state from
+    /// `info_provider`.
+    ///
+    /// * `evict_min_duration` - do not evict regions that have been loaded for
+    ///   less than this duration.
+    pub fn new(
+        config: Arc<VersionTrack<InMemoryEngineConfig>>,
+        evict_min_duration: Duration,
+        info_provider: Arc<dyn RegionInfoProvider>,
+    ) -> Self {
+        RegionStatsManager {
+            config,
+            info_provider,
+            checking_top_regions: Arc::new(AtomicBool::new(false)),
+            region_loaded_at: Arc::new(ShardedLock::new(BTreeMap::new())),
+            ma_mvcc_amplification_reduction: Arc::new(Mutex::new(Smoother::default())),
+            mvcc_amplification_record: Arc::default(),
+            evict_min_duration,
+            last_load_evict_time: Arc::new(Mutex::new(Instant::now())),
+        }
+    }
+
+    pub(crate) fn expected_region_size(&self) -> usize {
+        self.config.value().expected_region_size.0 as usize
+    }
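`ready_for_auto_load_and_evict` (defined next) combines two guards: a minimum interval since the last completed run, and an atomic "already checking" flag claimed with `swap`. The standalone sketch below reproduces that gate with simplified types; `CheckGate` and its fields are illustrative, and a std `Mutex` stands in for the `parking_lot` one used in this file:

```rust
use std::{
    sync::{
        atomic::{AtomicBool, Ordering},
        Mutex,
    },
    time::{Duration, Instant},
};

struct CheckGate {
    last_run: Mutex<Instant>,
    checking: AtomicBool,
    min_interval: Duration,
}

impl CheckGate {
    fn ready(&self) -> bool {
        self.last_run.lock().unwrap().elapsed() > self.min_interval
            // swap returns the previous value, so false means we won the race
            && !self.checking.swap(true, Ordering::Relaxed)
    }
    fn complete(&self) {
        *self.last_run.lock().unwrap() = Instant::now();
        self.checking.store(false, Ordering::Relaxed);
    }
}

fn main() {
    let gate = CheckGate {
        last_run: Mutex::new(Instant::now() - Duration::from_secs(120)),
        checking: AtomicBool::new(false),
        min_interval: Duration::from_secs(90),
    };
    assert!(gate.ready()); // interval elapsed and flag acquired
    assert!(!gate.ready()); // a check is already in flight
    gate.complete();
    assert!(!gate.ready()); // interval has not elapsed again yet
}
```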
+    /// If false is returned, it is not ready to check.
+    pub fn ready_for_auto_load_and_evict(&self) -> bool {
+        // The auto load and evict process can block for some time (mainly
+        // waiting for eviction). To avoid starting a new check immediately
+        // after the previous one, we check the elapsed time since the last
+        // check.
+        // Region stats are updated once a minute by default; to avoid two
+        // checks using the same region stats as much as possible, we set a
+        // minimum check interval of 1.5 minutes (considering that more than
+        // one region is involved in stat collection).
+        self.last_load_evict_time.lock().elapsed()
+            > Duration::max(
+                self.config.value().load_evict_interval.0 / 2,
+                Duration::from_secs(90),
+            )
+            && !self.set_checking_top_regions(true)
+    }
+
+    pub fn complete_auto_load_and_evict(&self) {
+        *self.last_load_evict_time.lock() = Instant::now();
+        self.set_checking_top_regions(false);
+    }
+
+    /// Prevents two instances of this from running concurrently.
+    /// Returns the previous checking status.
+    fn set_checking_top_regions(&self, v: bool) -> bool {
+        self.checking_top_regions.swap(v, Ordering::Relaxed)
+    }
+
+    /// This method should be called when a region is successfully evicted to
+    /// remove any internal `RegionStatsManager` state that corresponds to it.
+    ///
+    /// Calls [raftstore::coprocessor::region_info_accessor::RegionInfoProvider::find_region_by_key] to
+    /// find the region corresponding to the range.
+    ///
+    /// TODO (afeinberg): This is inefficient, either make this method bulk, or
+    /// find another way to avoid calling `find_region_by_key` in a loop.
+    pub fn handle_region_evicted(&self, region: &Region) {
+        self.region_loaded_at.write().unwrap().remove(&region.id);
+    }
+
+    /// Collects regions to load and evict based on region stats, mvcc
+    /// amplification, and memory constraints. New top regions that are not
+    /// yet cached are returned for loading.
+    ///
+    /// If memory usage is below the stop-load threshold, regions with low read
+    /// flow or low mvcc amplification are considered for eviction.
+    ///
+    /// If memory usage reaches the stop-load threshold, whether to evict a
+    /// region is determined by comparing the new top regions' activity with
+    /// the currently cached regions.
+    ///
+    /// # Returns
+    /// (Regions to load, Regions to evict)
+    pub fn collect_regions_to_load_and_evict(
+        &self,
+        current_region_count: usize,
+        cached_region_ids: Vec<u64>,
+        memory_controller: &MemoryController,
+    ) -> (Vec<Region>, Vec<Region>) {
+        // Get the stats of the cached regions and sort them by next + prev in
+        // descending order.
+        let mut regions_stat = self
+            .info_provider
+            .get_regions_stat(cached_region_ids.clone())
+            .unwrap();
+        regions_stat.sort_by(|a, b| {
+            let next_prev_a = a.1.cop_detail.iterated_count();
+            let next_prev_b = b.1.cop_detail.iterated_count();
+            next_prev_b.cmp(&next_prev_a)
+        });
+
+        let ma_mvcc_amplification_reduction = {
+            let mut ma_mvcc_amplification_reduction = self.ma_mvcc_amplification_reduction.lock();
+            let mut record = self.mvcc_amplification_record.lock();
+            // Update the moving average of the mvcc amplification reduction
+            // (the multiple by which amplification shrinks after caching).
+            regions_stat.iter().for_each(|(r, a)| {
+                if let Some(&amplification) = record.get(&r.id) {
+                    let amp_after_cache = a.cop_detail.mvcc_amplification();
+                    if amp_after_cache != 0.0 && amp_after_cache != amplification {
+                        ma_mvcc_amplification_reduction
+                            .observe(amplification / a.cop_detail.mvcc_amplification());
+                    }
+                }
+            });
+            record.clear();
+            // this reduction should not be less than 1
+            ma_mvcc_amplification_reduction.get_avg()
+        };
+        info!(
+            "ime moving average mvcc amplification reduction update";
+            "ma_mvcc_amplification_reduction" => ma_mvcc_amplification_reduction,
+        );
+
+        // Use the evict threshold rather than the stop-load threshold, as there
+        // might still be some regions to be evicted.
+        let expected_new_count = (memory_controller
+            .evict_threshold()
+            .saturating_sub(memory_controller.mem_usage()))
+            / self.expected_region_size();
+        let expected_num_regions = usize::max(1, current_region_count + expected_new_count);
+        info!("ime collect_changed_ranges"; "num_regions" => expected_num_regions);
+        let curr_top_regions = self
+            .info_provider
+            .get_top_regions(NonZeroUsize::try_from(expected_num_regions).unwrap())
+            .unwrap() // TODO (afeinberg): Potentially custom error handling here.
+            .iter()
+            .map(|(r, region_stats)| (r.id, (r.clone(), region_stats.clone())))
+            .collect::<HashMap<_, _>>();
+        {
+            let mut region_loaded_map = self.region_loaded_at.write().unwrap();
+            for &region_id in curr_top_regions.keys() {
+                let _ = region_loaded_map.insert(region_id, Instant::now());
+            }
+        }
+
+        let cached_region_ids = cached_region_ids.into_iter().collect::<HashSet<u64>>();
+        let (mvcc_amplification_to_filter, regions_to_load) = {
+            let mut max_mvcc_amplification: f64 = 0.0;
+            let mut record = self.mvcc_amplification_record.lock();
+            let regions_to_load = curr_top_regions
+                .iter()
+                .filter_map(|(id, (r, region_stats))| {
+                    if !cached_region_ids.contains(id) {
+                        max_mvcc_amplification = max_mvcc_amplification
+                            .max(region_stats.cop_detail.mvcc_amplification());
+                        record.insert(*id, region_stats.cop_detail.mvcc_amplification());
+                        Some(r.clone())
+                    } else {
+                        None
+                    }
+                })
+                .collect();
+            // `max_mvcc_amplification / ma_mvcc_amplification_reduction` is the
+            // expected mvcc amplification after caching. Halve the reduction to
+            // filter the cached regions.
+            (
+                max_mvcc_amplification / f64::max(1.0, ma_mvcc_amplification_reduction / 2.0),
+                regions_to_load,
+            )
+        };
+
+        info!(
+            "ime mvcc amplification reduction filter";
+            "mvcc_amplification_to_filter" => mvcc_amplification_to_filter,
+        );
+
+        {
+            // TODO(SpadeA): remove it after it's stable
+            let debug: Vec<_> = regions_stat
+                .iter()
+                .map(|(r, s)| {
+                    format!(
+                        "region_id={}, cop={}, cop_detail={:?}, mvcc_amplification={}",
+                        r.id,
+                        s.query_stats.coprocessor,
+                        s.cop_detail,
+                        s.cop_detail.mvcc_amplification(),
+                    )
+                })
+                .collect();
+            info!(
+                "ime collect regions activities";
+                "regions" => ?debug,
+            );
+        }
+
+        let reach_stop_load = memory_controller.reached_stop_load_threshold();
+        let mut regions_loaded = self.region_loaded_at.write().unwrap();
+        // Evict at most 1/10 of the regions.
+        let max_count_to_evict = usize::max(1, regions_stat.len() / 10);
+        // Use the iterated count (next + prev) of the top MIN_REGION_COUNT_TO_EVICT
+        // regions to filter out regions with very few next and prev operations. If
+        // there are no more than MIN_REGION_COUNT_TO_EVICT regions, do not evict any.
+        let regions_to_evict = if regions_stat.len() > MIN_REGION_COUNT_TO_EVICT {
+            let avg_top_next_prev = regions_stat
+                .iter()
+                .map(|r| r.1.cop_detail.iterated_count())
+                .take(MIN_REGION_COUNT_TO_EVICT)
+                .sum::<usize>()
+                / MIN_REGION_COUNT_TO_EVICT;
+            let region_to_evict: Vec<_> = regions_stat
+                .into_iter()
+                .filter(|(_, r)| {
+                    if reach_stop_load {
+                        r.cop_detail.mvcc_amplification() < mvcc_amplification_to_filter
+                    } else {
+                        // In this case, memory usage is relatively low; we only
+                        // evict regions that clearly should not be cached.
+                        r.cop_detail.mvcc_amplification()
+                            <= self.config.value().mvcc_amplification_threshold as f64
+                                / MVCC_AMPLIFICATION_FILTER_FACTOR
+                            || r.cop_detail.iterated_count()
+                                < avg_top_next_prev / ITERATED_COUNT_FILTER_FACTOR
+                    }
+                })
+                .filter_map(|(r, s)| {
+                    // Do not evict regions that were loaded less than
+                    // `EVICT_MIN_DURATION` ago. If a region has no load time
+                    // recorded, it was probably inserted by pre-load; record the
+                    // time and do not evict it this round.
+                    let time_loaded = regions_loaded.entry(r.id).or_insert(Instant::now());
+                    if Instant::now() - *time_loaded < self.evict_min_duration {
+                        None
+                    } else {
+                        Some((r, s))
+                    }
+                })
+                .rev() // evict the regions with the least next + prev after the filter
+                .take(max_count_to_evict)
+                .collect();
+
+            // TODO(SpadeA): remove it after it's stable
+            let debug: Vec<_> = region_to_evict
+                .iter()
+                .map(|(r, s)| {
+                    format!(
+                        "region_id={}, cop={}, cop_detail={:?}, mvcc_amplification={}",
+                        r.id,
+                        s.query_stats.coprocessor,
+                        s.cop_detail,
+                        s.cop_detail.mvcc_amplification(),
+                    )
+                })
+                .collect();
+            info!(
+                "ime collect regions to evict";
+                "reached_stop_limit" => reach_stop_load,
+                "regions" => ?debug,
+            );
+            region_to_evict.into_iter().map(|(r, _)| r).collect()
+        } else {
+            vec![]
+        };
+        (regions_to_load, regions_to_evict)
+    }
+
+    // Evict regions with less read flow until the memory usage is below the
+    // evict threshold.
+    pub(crate) async fn evict_on_evict_threshold_reached<F>(
+        &self,
+        mut evict_region: F,
+        delete_range_scheduler: &Scheduler<BackgroundTask>,
+        cached_region_ids: Vec<u64>,
+        memory_controller: &MemoryController,
+    ) where
+        F: FnMut(
+            &CacheRegion,
+            EvictReason,
+            Option<Box<dyn AsyncFnOnce + Send + Sync>>,
+        ) -> Vec<CacheRegion>,
+    {
+        // Get the stats of the cached regions.
+        let regions_activity = self
+            .info_provider
+            .get_regions_stat(cached_region_ids.clone())
+            .unwrap();
+        if regions_activity.is_empty() {
+            return;
+        }
+
+        // Use the average mvcc amplification to filter out the regions with
+        // high mvcc amplification.
+        let avg_mvcc_amplification = regions_activity
+            .iter()
+            .map(|(_, ra)| ra.cop_detail.mvcc_amplification())
+            .sum::<f64>()
+            / regions_activity.len() as f64;
+
+        let evict_candidates: Vec<_> = {
+            let mut evict_candidates = regions_activity;
+            info!(
+                "ime evict candidate count before filter";
+                "count" => evict_candidates.len(),
+            );
+            let mut filter_by_mvcc_amplification = 0;
+            evict_candidates.retain(|(_, ra)| {
+                // Do not evict regions with high mvcc amplification
+                if ra.cop_detail.mvcc_amplification() > avg_mvcc_amplification {
+                    filter_by_mvcc_amplification += 1;
+                    return false;
+                }
+                true
+            });
+            evict_candidates.sort_by(|a, b| {
+                let next_prev_a = a.1.cop_detail.iterated_count();
+                let next_prev_b = b.1.cop_detail.iterated_count();
+                next_prev_a.cmp(&next_prev_b)
+            });
+
+            info!(
+                "ime evict candidate count after filter";
+                "count" => evict_candidates.len(),
+                "filter_by_mvcc_amplification" => filter_by_mvcc_amplification,
+            );
+
+            evict_candidates.into_iter().map(|(r, _)| r).collect()
+        };
+        // Evict two regions at a time to reduce the probability that an
+        // un-dropped ongoing snapshot blocks the process.
+        for regions in evict_candidates.chunks(2) {
+            let mut deletable_regions = vec![];
+
+            let (tx, mut rx) = mpsc::channel(3);
+            for r in regions {
+                info!(
+                    "ime evict on evict threshold reached";
+                    "region_to_evict" => ?r,
+                );
+
+                let tx_clone = tx.clone();
+                deletable_regions.extend(evict_region(
+                    &CacheRegion::from_region(r),
+                    EvictReason::MemoryLimitReached,
+                    // This callback is executed when the eviction finishes at
+                    // `on_delete_regions`; when the corresponding rx.recv()
+                    // returns, we know some memory has been freed.
+                    Some(Box::new(move || {
+                        Box::pin(async move {
+                            let _ = tx_clone.send(()).await;
+                        })
+                    })),
+                ));
+                self.handle_region_evicted(r);
+            }
+            if !deletable_regions.is_empty() {
+                if let Err(e) = delete_range_scheduler
+                    .schedule_force(BackgroundTask::DeleteRegions(deletable_regions))
+                {
+                    error!(
+                        "ime schedule delete regions failed";
+                        "err" => ?e,
+                    );
+                    assert!(tikv_util::thread_group::is_shutdown(!cfg!(test)));
+                }
+            }
+            for _ in 0..regions.len() {
+                // It would be better to use `timeout(Duration, rx.recv())`, but
+                // that requires a tokio runtime with the timer enabled, which
+                // yatp does not provide.
+                if rx.recv().await.is_none() {
+                    break;
+                }
+            }
+            if !memory_controller.reached_stop_load_threshold() {
+                return;
+            }
+        }
+    }
+}
+
+#[cfg(test)]
+pub mod tests {
+    use futures::executor::block_on;
+    use pd_client::{RegionStat, RegionWriteCfCopDetail};
+    use raftstore::coprocessor::{self, region_info_accessor::TopRegions, RegionInfoProvider};
+    use tikv_util::{
+        box_err,
+        config::{ReadableDuration, ReadableSize, VersionTrack},
+        worker::dummy_scheduler,
+    };
+
+    use super::*;
+    use crate::{engine::SkiplistEngine, test_util::new_region, InMemoryEngineConfig};
+
+    struct RegionInfoSimulator {
+        regions: Mutex<TopRegions>,
+        region_stats: Mutex<HashMap<u64, (Region, RegionStat)>>,
+    }
+
+    impl RegionInfoSimulator {
+        fn set_top_regions(&self, top_regions: &TopRegions) {
+            *self.regions.lock() = top_regions.clone()
+        }
+
+        fn set_region_stats(&self, region_stats: &[(Region, RegionStat)]) {
+            *self.region_stats.lock() = region_stats
+                .iter()
+                .map(|(r, s)| (r.id, (r.clone(), s.clone())))
+                .collect::<HashMap<_, _>>();
+        }
+    }
+
+    impl RegionInfoProvider for RegionInfoSimulator {
+        fn find_region_by_key(&self, key: &[u8]) -> coprocessor::Result<Region> {
+            self.regions
+                .lock()
+                .iter()
+                .find(|(region, _)| region.start_key == key)
+                .cloned()
+                .map_or_else(
+                    || Err(box_err!(format!("key {:?} not found", key))),
+                    |(region, _)| Ok(region),
+                )
+        }
+
+        fn get_top_regions(&self, count: NonZeroUsize) -> coprocessor::Result<TopRegions> {
+            Ok(self
+                .regions
+                .lock()
+                .iter()
+                .take(count.get())
+                .cloned()
+                .collect::<Vec<_>>())
+        }
+
+        fn get_regions_stat(
+            &self,
+            ids: Vec<u64>,
+        ) -> coprocessor::Result<Vec<(Region, RegionStat)>> {
+            let g = self.region_stats.lock().clone();
+            Ok(ids
+                .into_iter()
+                .filter_map(|id| g.get(&id).cloned())
+                .collect())
+        }
+    }
+
+    fn new_region_stat(next: usize, processed_keys: usize) -> RegionStat {
+        let mut stat = RegionStat::default();
+        stat.cop_detail = RegionWriteCfCopDetail::new(next, 0, processed_keys);
+        stat
+    }
+
+    #[test]
+    fn test_collect_changed_regions() {
+        let skiplist_engine = SkiplistEngine::new();
+        let mut config = InMemoryEngineConfig::config_for_test();
+        config.stop_load_threshold = Some(ReadableSize::kb(1));
+        config.mvcc_amplification_threshold = 10;
+        let config = Arc::new(VersionTrack::new(config));
+        let mc = MemoryController::new(config.clone(), skiplist_engine);
+        let region_1 = new_region(1, b"k01", b"k02");
+
+        let region_2 = new_region(2, b"k03", b"k04");
+        let sim = Arc::new(RegionInfoSimulator {
+            regions: Mutex::new(vec![(region_1.clone(), new_region_stat(1000, 10))]),
+            region_stats: Mutex::default(),
+        });
+        // 10 ms min duration eviction for testing purposes.
+        let rsm = RegionStatsManager::new(config, Duration::from_millis(10), sim.clone());
+        let (added, removed) = rsm.collect_regions_to_load_and_evict(0, vec![], &mc);
+        assert_eq!(&added, &[region_1.clone()]);
+        assert!(removed.is_empty());
+        let top_regions = vec![(region_2.clone(), new_region_stat(1000, 8))];
+        let region_stats = vec![(region_1.clone(), new_region_stat(20, 10))];
+        sim.set_top_regions(&top_regions);
+        sim.set_region_stats(&region_stats);
+        let (added, removed) = rsm.collect_regions_to_load_and_evict(0, vec![1], &mc);
+        assert_eq!(&added, &[region_2.clone()]);
+        assert!(removed.is_empty());
+        let region_3 = new_region(3, b"k05", b"k06");
+        let region_4 = new_region(4, b"k07", b"k08");
+        let region_5 = new_region(5, b"k09", b"k10");
+        let region_6 = new_region(6, b"k11", b"k12");
+        let region_7 = new_region(7, b"k13", b"k14");
+        let region_8 = new_region(8, b"k15", b"k16");
+        let top_regions = vec![
+            (region_6.clone(), new_region_stat(1000, 10)),
+            (region_3.clone(), new_region_stat(1000, 10)),
+            (region_4.clone(), new_region_stat(1000, 10)),
+            (region_5.clone(), new_region_stat(1000, 10)),
+            (region_7.clone(), new_region_stat(2000, 10)),
+        ];
+        let region_stats = vec![
+            (region_1.clone(), new_region_stat(20, 10)),
+            (region_2.clone(), new_region_stat(20, 8)),
+        ];
+        sim.set_top_regions(&top_regions);
+        sim.set_region_stats(&region_stats);
+        let (added, removed) = rsm.collect_regions_to_load_and_evict(0, vec![1, 2], &mc);
+        assert!(removed.is_empty());
+        assert_eq!(
+            &added,
+            &[
+                region_3.clone(),
+                region_4.clone(),
+                region_5.clone(),
+                region_6.clone(),
+                region_7.clone()
+            ]
+        );
+
+        let region_stats = vec![
+            (region_1.clone(), new_region_stat(2, 0)), // evicted due to read flow
+            (region_6.clone(), new_region_stat(200, 10)),
+            (region_2.clone(), new_region_stat(20, 8)),
+            (region_3.clone(), new_region_stat(15, 10)), // evicted due to mvcc amplification
+            (region_4.clone(), new_region_stat(30, 10)),
+            (region_5.clone(), new_region_stat(55, 10)),
+            (region_7.clone(), new_region_stat(80, 10)),
+        ];
+        sim.set_top_regions(&vec![]);
+        sim.set_region_stats(&region_stats);
+        let (_, removed) = rsm.collect_regions_to_load_and_evict(0, vec![1, 2, 3, 4, 5, 6, 7], &mc);
+        // `region_1` no longer needs to be cached, but since it was loaded less
+        // than 10 ms ago, it should not be included in the removed ranges.
+        assert!(removed.is_empty());
+        std::thread::sleep(Duration::from_millis(100));
+        // After 100 ms has passed, check again and verify `region_1` is evictable.
+        sim.set_top_regions(&vec![]);
+        sim.set_region_stats(&region_stats);
+        let (_, removed) = rsm.collect_regions_to_load_and_evict(0, vec![1, 2, 3, 4, 5, 6, 7], &mc);
+        assert_eq!(&removed, &[region_1.clone()]);
+
+        let top_regions = vec![(region_8.clone(), new_region_stat(4000, 10))];
+        sim.set_top_regions(&top_regions);
+        sim.set_region_stats(&region_stats);
+        mc.acquire(2000);
+        let (_, removed) = rsm.collect_regions_to_load_and_evict(0, vec![2, 3, 4, 5, 6, 7], &mc);
+        assert_eq!(&removed, &[region_3.clone()]);
+    }
+
+    #[test]
+    fn test_collect_candidates_for_eviction() {
+        let skiplist_engine = SkiplistEngine::new();
+        let mut config = InMemoryEngineConfig::config_for_test();
+        config.stop_load_threshold = Some(ReadableSize::kb(1));
+        config.mvcc_amplification_threshold = 10;
+        let config = Arc::new(VersionTrack::new(config));
+        let mc = MemoryController::new(config.clone(), skiplist_engine);
+        let (scheduler, _rx) = dummy_scheduler();
+
+        fn make_region_vec(rs: &[Region], stats: &[RegionStat]) -> TopRegions {
+            rs.iter()
+                .zip(stats.iter())
+                .map(|(r, stat)| (r.clone(), stat.clone()))
+                .collect::<Vec<_>>()
+        }
+
+        let region_1 = new_region(1, b"k01", b"k02");
+        let region_2 = new_region(2, b"k03", b"k04");
+        let region_3 = new_region(3, b"k05", b"k06");
+        let region_4 = new_region(4, b"k07", b"k08");
+        let region_5 = new_region(5, b"k09", b"k10");
+        let region_6 = new_region(6, b"k11", b"k12");
+        let regions = vec![region_1, region_2, region_3, region_4, region_5, region_6];
+
+        let region_stats = vec![
+            new_region_stat(100, 6),
+            new_region_stat(10000, 1000),
+            new_region_stat(100000, 100000),
+            new_region_stat(100, 50), // will be evicted
+            new_region_stat(1000, 120),
+            new_region_stat(20, 2), // will be evicted
+        ];
+
+        let all_regions = make_region_vec(&regions, &region_stats);
+        let region_stats: HashMap<u64, (Region, RegionStat)> = all_regions
+            .iter()
+            .map(|top_region| {
+                (
+                    top_region.0.id,
+                    (top_region.0.clone(), top_region.1.clone()),
+                )
+            })
+            .collect();
+
+        let sim = Arc::new(RegionInfoSimulator {
+            regions: Mutex::new(all_regions),
+            region_stats: Mutex::new(region_stats),
+        });
+        // 10 ms min duration eviction for testing purposes.
+ let rsm = RegionStatsManager::new(config.clone(), Duration::from_millis(10), sim.clone()); + rsm.collect_regions_to_load_and_evict(0, vec![], &mc); + std::thread::sleep(Duration::from_millis(100)); + + let evicted_regions = Arc::new(Mutex::new(vec![])); + let cbs = Arc::new(Mutex::new(vec![])); + + let evicted_regions2 = evicted_regions.clone(); + let cbs2 = cbs.clone(); + let evict_fn = move |evict_region: &CacheRegion, + _: EvictReason, + cb: Option>| + -> Vec { + evicted_regions2.lock().push(evict_region.id); + cbs2.lock().push(cb.unwrap()); + vec![] + }; + + let handle = std::thread::spawn(move || { + block_on(async { + rsm.evict_on_evict_threshold_reached( + evict_fn, + &scheduler, + vec![1, 2, 3, 4, 5, 6], + &mc, + ) + .await + }); + }); + + let t = Instant::now(); + while t.elapsed() < Duration::from_secs(2) { + std::thread::sleep(Duration::from_millis(100)); + let mut cb_lock = cbs.lock(); + if cb_lock.len() == 2 { + let evicted_regions_lock = evicted_regions.lock(); + assert_eq!(*evicted_regions_lock, vec![6, 4]); + block_on(async { + cb_lock.pop().unwrap()().await; + cb_lock.pop().unwrap()().await; + }); + break; + } + } + + let _ = handle.join(); + } + + #[test] + fn test_check_after_check() { + let sim = Arc::new(RegionInfoSimulator { + regions: Mutex::new(vec![]), + region_stats: Mutex::new(HashMap::default()), + }); + let mut config = InMemoryEngineConfig::config_for_test(); + config.load_evict_interval = ReadableDuration(Duration::from_secs(120)); + let config = Arc::new(VersionTrack::new(config)); + let mgr = RegionStatsManager::new(config, Duration::from_secs(120), sim); + + // Interval is not enough + assert!(!mgr.ready_for_auto_load_and_evict()); + *mgr.last_load_evict_time.lock() = Instant::now() - Duration::from_secs(120); + assert!(mgr.ready_for_auto_load_and_evict()); + // Checking + assert!(!mgr.ready_for_auto_load_and_evict()); + + mgr.complete_auto_load_and_evict(); + // Interval is not enough + assert!(!mgr.ready_for_auto_load_and_evict()); + *mgr.last_load_evict_time.lock() = Instant::now() - Duration::from_secs(120); + assert!(mgr.ready_for_auto_load_and_evict()); + } +} diff --git a/components/range_cache_memory_engine/src/statistics.rs b/components/in_memory_engine/src/statistics.rs similarity index 100% rename from components/range_cache_memory_engine/src/statistics.rs rename to components/in_memory_engine/src/statistics.rs diff --git a/components/range_cache_memory_engine/src/test_util.rs b/components/in_memory_engine/src/test_util.rs similarity index 65% rename from components/range_cache_memory_engine/src/test_util.rs rename to components/in_memory_engine/src/test_util.rs index f9f353903b2..c11ba667ae4 100644 --- a/components/range_cache_memory_engine/src/test_util.rs +++ b/components/in_memory_engine/src/test_util.rs @@ -3,13 +3,17 @@ use std::sync::Arc; use crossbeam::epoch; +use engine_rocks::RocksEngine; +use engine_traits::{SyncMutable, CF_WRITE}; +use keys::data_key; +use kvproto::metapb::{Peer, Region}; use txn_types::{Key, TimeStamp, Write, WriteType}; use crate::{ engine::SkiplistHandle, keys::{encode_key, InternalBytes, ValueType}, memory_controller::MemoryController, - write_batch::RangeCacheWriteBatchEntry, + write_batch::RegionCacheWriteBatchEntry, }; // Put data with write cf and related start cf @@ -78,7 +82,8 @@ fn put_data_impl( write_cf: &SkiplistHandle, mem_controller: Arc, ) { - let raw_write_k = Key::from_raw(key) + let data_key = data_key(key); + let raw_write_k = Key::from_raw(&data_key) .append_ts(TimeStamp::new(commit_ts)) 
         .into_encoded();
     let mut write_k = encode_key(&raw_write_k, seq_num, ValueType::Value);
@@ -95,7 +100,7 @@ fn put_data_impl(
     let mut val = InternalBytes::from_vec(write_v.as_ref().to_bytes());
     val.set_memory_controller(mem_controller.clone());
     let guard = &epoch::pin();
-    let _ = mem_controller.acquire(RangeCacheWriteBatchEntry::calc_put_entry_size(
+    let _ = mem_controller.acquire(RegionCacheWriteBatchEntry::calc_put_entry_size(
         &raw_write_k,
         val.as_bytes(),
     ));
@@ -110,17 +115,66 @@ fn put_data_impl(
     }
 
     if !short_value {
-        let raw_default_k = Key::from_raw(key)
+        let raw_default_k = Key::from_raw(&data_key)
             .append_ts(TimeStamp::new(start_ts))
             .into_encoded();
         let mut default_k = encode_key(&raw_default_k, seq_num + 1, ValueType::Value);
         default_k.set_memory_controller(mem_controller.clone());
         let mut val = InternalBytes::from_vec(value.to_vec());
         val.set_memory_controller(mem_controller.clone());
-        let _ = mem_controller.acquire(RangeCacheWriteBatchEntry::calc_put_entry_size(
+        let _ = mem_controller.acquire(RegionCacheWriteBatchEntry::calc_put_entry_size(
             &raw_default_k,
             val.as_bytes(),
         ));
         default_cf.insert(default_k, val, guard);
     }
 }
+
+pub fn put_data_in_rocks(
+    key: &[u8],
+    value: &[u8],
+    commit_ts: u64,
+    start_ts: u64,
+    short_value: bool,
+    rocks_engine: &RocksEngine,
+    write_type: WriteType,
+) {
+    let data_key = data_key(key);
+    let raw_write_k = Key::from_raw(&data_key)
+        .append_ts(TimeStamp::new(commit_ts))
+        .into_encoded();
+    let write_v = Write::new(
+        write_type,
+        TimeStamp::new(start_ts),
+        if short_value {
+            Some(value.to_vec())
+        } else {
+            None
+        },
+    );
+
+    rocks_engine
+        .put_cf(CF_WRITE, &raw_write_k, &write_v.as_ref().to_bytes())
+        .unwrap();
+
+    if write_type == WriteType::Delete {
+        return;
+    }
+
+    if !short_value {
+        let raw_default_k = Key::from_raw(key)
+            .append_ts(TimeStamp::new(start_ts))
+            .into_encoded();
+        rocks_engine.put(&raw_default_k, value).unwrap();
+    }
+}
+
+pub fn new_region<T1: Into<Vec<u8>>, T2: Into<Vec<u8>>>(id: u64, start: T1, end: T2) -> Region {
+    let mut region = Region::new();
+    region.id = id;
+    region.start_key = start.into();
+    region.end_key = end.into();
+    // push a dummy peer to avoid CacheRegion::from_region panicking.
+    region.mut_peers().push(Peer::default());
+    region
+}
diff --git a/components/in_memory_engine/src/write_batch.rs b/components/in_memory_engine/src/write_batch.rs
new file mode 100644
index 00000000000..fd92d4a5b8a
--- /dev/null
+++ b/components/in_memory_engine/src/write_batch.rs
@@ -0,0 +1,1075 @@
+// Copyright 2024 TiKV Project Authors. Licensed under Apache-2.0.
+
+use std::{
+    sync::{atomic::Ordering, Arc},
+    time::Duration,
+};
+
+use bytes::Bytes;
+use crossbeam::epoch;
+use engine_traits::{
+    CacheRegion, EvictReason, MiscExt, Mutable, RegionCacheEngine, Result, WriteBatch,
+    WriteBatchExt, WriteOptions, CF_DEFAULT,
+};
+use kvproto::metapb;
+use smallvec::SmallVec;
+use tikv_util::{box_err, config::ReadableSize, error, info, time::Instant, warn};
+
+use crate::{
+    background::BackgroundTask,
+    engine::{cf_to_id, id_to_cf, is_lock_cf, SkiplistEngine},
+    keys::{encode_key, InternalBytes, ValueType, ENC_KEY_SEQ_LENGTH},
+    memory_controller::{MemoryController, MemoryUsage},
+    metrics::{
+        count_operations_for_cfs, IN_MEMORY_ENGINE_PREPARE_FOR_WRITE_DURATION_HISTOGRAM,
+        IN_MEMORY_ENGINE_WRITE_DURATION_HISTOGRAM,
+    },
+    region_manager::RegionCacheStatus,
+    RegionCacheMemoryEngine,
+};
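Before the constants below, a quick illustration of the memory accounting they feed: every buffered key is charged its encoded length plus an 8-byte sequence suffix, and each key/value allocation additionally pays a fixed node-overhead estimate. The helper here is a simplified stand-in for `InternalBytes::memory_size_required`, whose exact formula may differ; only the two constants mirror the definitions below:

```rust
const ENC_KEY_SEQ_LENGTH: usize = 8; // sequence-number suffix appended to keys
const NODE_OVERHEAD_SIZE_EXPECTATION: usize = 96; // estimated skiplist node cost

// Simplified stand-in for the engine's per-allocation charge.
fn memory_size_required(payload_len: usize) -> usize {
    payload_len + NODE_OVERHEAD_SIZE_EXPECTATION
}

// Rough cost of buffering one put: key (plus seq suffix) and value.
fn put_entry_size(key: &[u8], value: &[u8]) -> usize {
    memory_size_required(key.len() + ENC_KEY_SEQ_LENGTH) + memory_size_required(value.len())
}

fn main() {
    // A 16-byte key and 100-byte value cost well over 116 bytes once the
    // suffix and node overheads are charged: (16 + 8 + 96) + (100 + 96).
    assert_eq!(put_entry_size(&[0u8; 16], &[0u8; 100]), 316);
}
```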
+// This is a bit of a hack. It's the overhead of a node in the skiplist with
+// height 3, which is sufficiently conservative for estimating the node
+// overhead size.
+pub(crate) const NODE_OVERHEAD_SIZE_EXPECTATION: usize = 96;
+// As every key/value holds an Arc<MemoryController>, this overhead should be
+// taken into consideration.
+pub(crate) const MEM_CONTROLLER_OVERHEAD: usize = 8;
+// A threshold: when the lock cf increment bytes exceed it, a
+// CleanLockTombstone task will be scheduled to clean up the lock tombstones.
+// It's somewhat like RocksDB flushing memtables when a memtable reaches a
+// certain size so that compactions may clean up some tombstones. By default,
+// the memtable size for lock cf is 32MB. As not all ranges will be cached in
+// memory, just use half of it here.
+const AMOUNT_TO_CLEAN_TOMBSTONE: u64 = ReadableSize::mb(16).0;
+// The value of a delete entry in the in-memory engine. It's just an empty
+// slice.
+const DELETE_ENTRY_VAL: &[u8] = b"";
+
+// `prepare_for_region` should be called before raft command apply for each
+// peer delegate. It sets `region_cache_status`, which is used to determine
+// whether the writes of this peer should be buffered.
+pub struct RegionCacheWriteBatch {
+    // `region_cache_status` indicates whether the range is cached, loading data, or not cached.
+    // If it is cached, we should buffer the write in `buffer`, which is consumed when the write
+    // batch is written to the kv engine. If it is loading data, we should buffer the write in
+    // `pending_range_in_loading_buffer`, which is cached in the memory engine and will be
+    // consumed after the snapshot has been loaded.
+    region_cache_status: RegionCacheStatus,
+    buffer: Vec<RegionCacheWriteBatchEntry>,
+    engine: RegionCacheMemoryEngine,
+    save_points: Vec<usize>,
+    sequence_number: Option<u64>,
+    memory_controller: Arc<MemoryController>,
+    current_region_evicted: bool,
+    current_region: Option<CacheRegion>,
+    // all the regions this write batch has written to.
+    written_regions: Vec<CacheRegion>,
+
+    // the total duration of the prepare work for writes in this write batch
+    prepare_for_write_duration: Duration,
+
+    // We assume that in one round of batch system processing (PollHandler::begin ->
+    // ... -> PollHandler::end), although the same region may call `prepare_for_region`
+    // multiple times, the calls for one region are always consecutive. That is, we will
+    // never see: prepare_for_region(region1), prepare_for_region(region2),
+    // prepare_for_region(region1). To catch violations of this assumption, we record the
+    // regions that have called prepare_for_region and ensure that a region that is not
+    // the `current_region` has not been recorded in this vec.
+    prepared_regions: SmallVec<[u64; 10]>,
+}
+
+impl std::fmt::Debug for RegionCacheWriteBatch {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        f.debug_struct("RegionCacheWriteBatch")
+            .field("buffer", &self.buffer)
+            .field("save_points", &self.save_points)
+            .field("sequence_number", &self.sequence_number)
+            .finish()
+    }
+}
+
+impl From<&RegionCacheMemoryEngine> for RegionCacheWriteBatch {
+    fn from(engine: &RegionCacheMemoryEngine) -> Self {
+        Self {
+            region_cache_status: RegionCacheStatus::NotInCache,
+            buffer: Vec::new(),
+            engine: engine.clone(),
+            save_points: Vec::new(),
+            sequence_number: None,
+            memory_controller: engine.memory_controller(),
+            current_region_evicted: false,
+            prepare_for_write_duration: Duration::default(),
+            current_region: None,
+            written_regions: vec![],
+            prepared_regions: SmallVec::new(),
+        }
+    }
+}
+
+impl RegionCacheWriteBatch {
+    pub fn with_capacity(engine: &RegionCacheMemoryEngine, cap: usize) -> Self {
+        Self {
+            region_cache_status: RegionCacheStatus::NotInCache,
+            buffer: Vec::with_capacity(cap),
+            // the cache buffer only needs a small capacity
+            engine: engine.clone(),
+            save_points: Vec::new(),
+            sequence_number: None,
+            memory_controller: engine.memory_controller(),
+            current_region_evicted: false,
+            prepare_for_write_duration: Duration::default(),
+            current_region: None,
+            written_regions: vec![],
+            prepared_regions: SmallVec::new(),
+        }
+    }
+
+    pub fn prepare_for_region(&mut self, region: &metapb::Region) {
+        // If the region is already prepared for write, we do not need to prepare it
+        // again. See the comments for the `prepared_regions` field for more details.
+        if let Some(current_region) = &self.current_region
+            && current_region.id == region.id
+        {
+            return;
+        }
+        let time = Instant::now();
+        // Verify that the region has not been prepared before.
+        if self.prepared_regions.contains(&region.id) {
+            panic!(
+                "region {} is prepared for write before, but it is not the current region",
+                region.id
+            );
+        }
+        self.prepared_regions.push(region.id);
+        // record the last region to clear its in-written flag.
+        self.record_last_written_region();
+
+        let cached_region = CacheRegion::from_region(region);
+        // TODO: remote range.
+        self.set_region_cache_status(
+            self.engine
+                .prepare_for_apply(&cached_region, region.is_in_flashback),
+        );
+        self.current_region = Some(cached_region);
+        self.current_region_evicted = false;
+        self.prepare_for_write_duration += time.saturating_elapsed();
+    }
+
+    /// Trigger a CleanLockTombstone task if the accumulated lock cf
+    /// modification exceeds the threshold (16MB).
+    ///
+    /// NB: Acquiring the RocksDB mutex is necessary to get the oldest snapshot,
+    /// so avoid calling this in any RocksDB callback (e.g., write batch
+    /// callback) to prevent potential deadlocks.
+    pub fn maybe_compact_lock_cf(&self) {
+        if self.engine.lock_modification_bytes.load(Ordering::Relaxed) > AMOUNT_TO_CLEAN_TOMBSTONE {
+            // Use `swap` to allow only one schedule when multiple writers reach
+            // the limit concurrently.
+            if self
+                .engine
+                .lock_modification_bytes
+                .swap(0, Ordering::Relaxed)
+                > AMOUNT_TO_CLEAN_TOMBSTONE
+            {
+                let rocks_engine = self.engine.rocks_engine.as_ref().unwrap();
+                let last_seqno = rocks_engine.get_latest_sequence_number();
+                let snapshot_seqno = self
+                    .engine
+                    .rocks_engine
+                    .as_ref()
+                    .unwrap()
+                    .get_oldest_snapshot_sequence_number()
+                    .unwrap_or(last_seqno);
+
+                if let Err(e) = self
+                    .engine
+                    .bg_worker_manager()
+                    .schedule_task(BackgroundTask::CleanLockTombstone(snapshot_seqno))
+                {
+                    error!(
+                        "ime schedule lock tombstone cleanup failed";
+                        "err" => ?e,
+                    );
+                    assert!(tikv_util::thread_group::is_shutdown(!cfg!(test)));
+                }
+            }
+        }
+    }
+
+    /// Sets the sequence number for this batch. This should only be called
+    /// prior to writing the batch.
+    pub fn set_sequence_number(&mut self, seq: u64) -> Result<()> {
+        if let Some(seqno) = self.sequence_number {
+            return Err(box_err!("Sequence number {} already set", seqno));
+        };
+        self.sequence_number = Some(seq);
+        Ok(())
+    }
+
+    // Note: `seq` is the sequence number of the first key of this write batch in
+    // RocksDB; it is incremented automatically for each key, so that all keys
+    // have unique sequence numbers.
+    fn write_impl(&mut self, mut seq: u64) -> Result<()> {
+        // record the last region before the flush.
+        self.record_last_written_region();
+
+        fail::fail_point!("ime_on_region_cache_write_batch_write_impl");
+        let guard = &epoch::pin();
+        let start = Instant::now();
+        let mut lock_modification: u64 = 0;
+        let engine = self.engine.core.engine();
+
+        // record the number of insertions and deletions for each cf
+        let mut put = [0, 0, 0];
+        let mut delete = [0, 0, 0];
+        // Some entries' ranges may have been marked as evicted above, but it
+        // does not matter; they will be deleted later.
+        std::mem::take(&mut self.buffer).into_iter().for_each(|e| {
+            if is_lock_cf(e.cf) {
+                lock_modification += e.data_size() as u64;
+            }
+            if e.is_insertion() {
+                put[e.cf] += 1;
+            } else {
+                delete[e.cf] += 1;
+            }
+
+            e.write_to_memory(seq, &engine, self.memory_controller.clone(), guard);
+            seq += 1;
+        });
+        let duration = start.saturating_elapsed_secs();
+        IN_MEMORY_ENGINE_WRITE_DURATION_HISTOGRAM.observe(duration);
+        count_operations_for_cfs(&put, &delete);
+
+        fail::fail_point!("ime_on_region_cache_write_batch_write_consumed");
+        fail::fail_point!("ime_before_clear_regions_in_being_written");
+
+        if !self.written_regions.is_empty() {
+            self.engine
+                .core
+                .region_manager()
+                .clear_regions_in_being_written(&self.written_regions);
+        }
+
+        self.engine
+            .lock_modification_bytes
+            .fetch_add(lock_modification, Ordering::Relaxed);
+
+        IN_MEMORY_ENGINE_PREPARE_FOR_WRITE_DURATION_HISTOGRAM
+            .observe(self.prepare_for_write_duration.as_secs_f64());
+
+        Ok(())
+    }
+
+    #[inline]
+    pub fn set_region_cache_status(&mut self, region_cache_status: RegionCacheStatus) {
+        self.region_cache_status = region_cache_status;
+    }
+
+    fn evict_current_region(&mut self, reason: EvictReason) {
+        if self.current_region_evicted {
+            return;
+        }
+        self.engine
+            .evict_region(self.current_region.as_ref().unwrap(), reason, None);
+        self.current_region_evicted = true;
+    }
+
+    fn process_cf_operation<F1, F2>(&mut self, entry_size: F1, entry: F2)
+    where
+        F1: FnOnce() -> usize,
+        F2: FnOnce() -> RegionCacheWriteBatchEntry,
+    {
+        if self.region_cache_status == RegionCacheStatus::NotInCache || self.current_region_evicted
+        {
+            return;
+        }
+
+        if !self.engine.enabled() {
+            let region = self.current_region.as_ref().unwrap();
+            info!("ime is disabled, evict the range"; "region" => ?region);
+            self.evict_current_region(EvictReason::Disabled);
+            return;
+        }
+        let memory_expect = entry_size();
+        if !self.memory_acquire(memory_expect) {
+            let region = self.current_region.as_ref().unwrap();
+            info!("ime memory acquire failed due to reaching capacity"; "region" => ?region);
+            self.evict_current_region(EvictReason::MemoryLimitReached);
+            return;
+        }
+
+        self.buffer.push(entry());
+    }
+
+    fn schedule_memory_check(&self) {
+        if self.memory_controller.memory_checking() {
+            return;
+        }
+        if !self.memory_controller.set_memory_checking(true) {
+            if let Err(e) = self
+                .engine
+                .bg_worker_manager()
+                .schedule_task(BackgroundTask::MemoryCheckAndEvict)
+            {
+                error!(
+                    "ime schedule memory check failed";
+                    "err" => ?e,
+                );
+                assert!(tikv_util::thread_group::is_shutdown(!cfg!(test)));
+            }
+        }
+    }
+
+    // Returning false means the memory usage has reached capacity and we have
+    // no quota left to write to the engine.
+    fn memory_acquire(&mut self, mem_required: usize) -> bool {
+        match self.memory_controller.acquire(mem_required) {
+            MemoryUsage::CapacityReached(n) => {
+                warn!(
+                    "ime the memory usage reaches capacity";
+                    "region" => ?self.current_region.as_ref().unwrap(),
+                    "memory_usage(MB)" => ReadableSize(n as u64).as_mb_f64(),
+                );
+                self.schedule_memory_check();
+                return false;
+            }
+            MemoryUsage::EvictThresholdReached(_) => {
+                self.schedule_memory_check();
+            }
+            _ => {}
+        }
+        true
+    }
+
+    #[inline]
+    fn record_last_written_region(&mut self) {
+        // NOTE: even if the region is evicted due to the memory limit, we still
+        // need to track it because its "in written" flag has been set.
+        if self.region_cache_status != RegionCacheStatus::NotInCache {
+            let last_region = self.current_region.take().unwrap();
+            self.written_regions.push(last_region);
+        }
+    }
+}
+
+#[derive(Clone, Debug)]
+enum WriteBatchEntryInternal {
+    PutValue(Bytes),
+    Deletion,
+}
+
+impl WriteBatchEntryInternal {
+    fn encode(&self, key: &[u8], seq: u64) -> (InternalBytes, InternalBytes) {
+        match self {
+            WriteBatchEntryInternal::PutValue(value) => (
+                encode_key(key, seq, ValueType::Value),
+                InternalBytes::from_bytes(value.clone()),
+            ),
+            WriteBatchEntryInternal::Deletion => (
+                encode_key(key, seq, ValueType::Deletion),
+                InternalBytes::from_bytes(Bytes::from_static(DELETE_ENTRY_VAL)),
+            ),
+        }
+    }
+
+    fn data_size(&self) -> usize {
+        match self {
+            WriteBatchEntryInternal::PutValue(value) => value.len(),
+            WriteBatchEntryInternal::Deletion => 0,
+        }
+    }
+}
+
+#[derive(Clone, Debug)]
+pub(crate) struct RegionCacheWriteBatchEntry {
+    cf: usize,
+    key: Bytes,
+    inner: WriteBatchEntryInternal,
+}
+
+impl RegionCacheWriteBatchEntry {
+    pub fn is_insertion(&self) -> bool {
+        matches!(self.inner, WriteBatchEntryInternal::PutValue(_))
+    }
+
+    pub fn put_value(cf: &str, key: &[u8], value: &[u8]) -> Self {
+        Self {
+            cf: cf_to_id(cf),
+            key: Bytes::copy_from_slice(key),
+            inner: WriteBatchEntryInternal::PutValue(Bytes::copy_from_slice(value)),
+        }
+    }
+
+    pub fn deletion(cf: &str, key: &[u8]) -> Self {
+        Self {
+            cf: cf_to_id(cf),
+            key: Bytes::copy_from_slice(key),
+            inner: WriteBatchEntryInternal::Deletion,
+        }
+    }
+
+    #[inline]
+    pub fn encode(&self, seq: u64) -> (InternalBytes, InternalBytes) {
+        self.inner.encode(&self.key, seq)
+    }
+
+    pub fn calc_put_entry_size(key: &[u8], value: &[u8]) -> usize {
+        RegionCacheWriteBatchEntry::memory_size_required_for_key_value(key, value)
+    }
+
+    pub fn calc_delete_entry_size(key: &[u8]) -> usize {
+        // A delete also carries a value, which is an empty byte slice.
+        RegionCacheWriteBatchEntry::memory_size_required_for_key_value(key, DELETE_ENTRY_VAL)
+    }
+
+    fn memory_size_required_for_key_value(key: &[u8], value: &[u8]) -> usize {
+        // The key is encoded with a sequence number when it is written to the
+        // in-memory engine, so we also have to account for the memory usage
+        // of the sequence-number suffix.
+        InternalBytes::memory_size_required(key.len() + ENC_KEY_SEQ_LENGTH)
+            + InternalBytes::memory_size_required(value.len())
+    }
+
+    pub fn data_size(&self) -> usize {
+        self.key.len() + ENC_KEY_SEQ_LENGTH + self.inner.data_size()
+    }
+
+    #[inline]
+    pub fn write_to_memory(
+        &self,
+        seq: u64,
+        skiplist_engine: &SkiplistEngine,
+        memory_controller: Arc<MemoryController>,
+        guard: &epoch::Guard,
+    ) {
+        let handle = skiplist_engine.cf_handle(id_to_cf(self.cf));
+
+        let (mut key, mut value) = self.encode(seq);
+        key.set_memory_controller(memory_controller.clone());
+        value.set_memory_controller(memory_controller);
+        handle.insert(key, value, guard);
+    }
+}
+
+impl WriteBatchExt for RegionCacheMemoryEngine {
+    type WriteBatch = RegionCacheWriteBatch;
+    // TODO: adjust it.
+    const WRITE_BATCH_MAX_KEYS: usize = 256;
+
+    fn write_batch(&self) -> Self::WriteBatch {
+        RegionCacheWriteBatch::from(self)
+    }
+
+    fn write_batch_with_cap(&self, cap: usize) -> Self::WriteBatch {
+        RegionCacheWriteBatch::with_capacity(self, cap)
+    }
+}
+
+impl WriteBatch for RegionCacheWriteBatch {
+    fn write_opt(&mut self, _: &WriteOptions) -> Result<u64> {
+        self.sequence_number
+            .map(|seq| self.write_impl(seq).map(|()| seq))
+            .transpose()
+            .map(|o| o.ok_or_else(|| box_err!("sequence_number must be set!")))?
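+        // (The chain above maps Option<u64> to Option<Result<u64>>,
+        // transposes it to Result<Option<u64>>, and turns a missing sequence
+        // number into an error, so write_impl only runs when
+        // set_sequence_number was called.)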
+    }
+
+    fn data_size(&self) -> usize {
+        self.buffer
+            .iter()
+            .map(RegionCacheWriteBatchEntry::data_size)
+            .sum()
+    }
+
+    fn count(&self) -> usize {
+        self.buffer.len()
+    }
+
+    fn is_empty(&self) -> bool {
+        self.buffer.is_empty()
+    }
+
+    fn should_write_to_engine(&self) -> bool {
+        unimplemented!()
+    }
+
+    fn clear(&mut self) {
+        // If `current_region` is `Some`, `write_impl` has not been called, so
+        // we need to clear the `in_written` flag here. This can happen when
+        // the apply fsm performs a `commit` (e.g. after handling Msg::Change)
+        // and then handles no other kvs, so the write batch is empty and
+        // `write_impl` is never called.
+        if self.current_region.is_some() {
+            self.record_last_written_region();
+            // The region's `in_written` flag was not cleaned up because
+            // `write_impl` was not called, so we should do it here.
+            if !self.written_regions.is_empty() {
+                self.engine
+                    .core
+                    .region_manager()
+                    .clear_regions_in_being_written(&self.written_regions);
+            }
+        }
+
+        self.region_cache_status = RegionCacheStatus::NotInCache;
+        self.buffer.clear();
+        self.save_points.clear();
+        self.sequence_number = None;
+        self.current_region_evicted = false;
+        self.current_region = None;
+        self.written_regions.clear();
+        self.prepare_for_write_duration = Duration::ZERO;
+        self.prepared_regions.clear();
+    }
+
+    fn set_save_point(&mut self) {
+        self.save_points.push(self.buffer.len())
+    }
+
+    fn pop_save_point(&mut self) -> Result<()> {
+        self.save_points
+            .pop()
+            .map(|_| ())
+            .ok_or_else(|| box_err!("no save points available"))
+    }
+
+    fn rollback_to_save_point(&mut self) -> Result<()> {
+        self.save_points
+            .pop()
+            .map(|sp| {
+                self.buffer.truncate(sp);
+            })
+            .ok_or_else(|| box_err!("no save point available!"))
+    }
+
+    fn merge(&mut self, mut other: Self) -> Result<()> {
+        self.buffer.append(&mut other.buffer);
+        Ok(())
+    }
+}
+
+impl Mutable for RegionCacheWriteBatch {
+    fn put(&mut self, key: &[u8], val: &[u8]) -> Result<()> {
+        self.put_cf(CF_DEFAULT, key, val)
+    }
+
+    fn put_cf(&mut self, cf: &str, key: &[u8], val: &[u8]) -> Result<()> {
+        self.process_cf_operation(
+            || RegionCacheWriteBatchEntry::calc_put_entry_size(key, val),
+            || RegionCacheWriteBatchEntry::put_value(cf, key, val),
+        );
+        Ok(())
+    }
+
+    fn delete(&mut self, key: &[u8]) -> Result<()> {
+        self.delete_cf(CF_DEFAULT, key)
+    }
+
+    fn delete_cf(&mut self, cf: &str, key: &[u8]) -> Result<()> {
+        self.process_cf_operation(
+            || RegionCacheWriteBatchEntry::calc_delete_entry_size(key),
+            || RegionCacheWriteBatchEntry::deletion(cf, key),
+        );
+        Ok(())
+    }
+
+    // Rather than deleting the keys in the range, we directly evict the
+    // regions that overlap with it.
+    fn delete_range(&mut self, _begin_key: &[u8], _end_key: &[u8]) -> Result<()> {
+        self.evict_current_region(EvictReason::DeleteRange);
+        Ok(())
+    }
+
+    fn delete_range_cf(&mut self, _: &str, _begin_key: &[u8], _end_key: &[u8]) -> Result<()> {
+        self.evict_current_region(EvictReason::DeleteRange);
+        Ok(())
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use std::{sync::Arc, time::Duration};
+
+    use crossbeam_skiplist::SkipList;
+    use engine_rocks::util::new_engine;
+    use engine_traits::{
+        CacheRegion, FailedReason, Peekable, RegionCacheEngine, WriteBatch, DATA_CFS,
+    };
+    use kvproto::metapb::{Region, RegionEpoch};
+    use online_config::{ConfigChange, ConfigManager, ConfigValue};
+    use tempfile::Builder;
+    use tikv_util::{config::VersionTrack, store::new_peer};
+
+    use super::*;
+    use crate::{
+        background::flush_epoch, config::InMemoryEngineConfigManager, region_manager::RegionState,
+        test_util::new_region,
+        InMemoryEngineConfig, InMemoryEngineContext,
+    };
+
+    // We should not use skiplist.get directly as we only care about keys
+    // without the sequence number suffix.
+    fn get_value(
+        sl: &Arc<SkipList<InternalBytes, InternalBytes>>,
+        key: &InternalBytes,
+        guard: &epoch::Guard,
+    ) -> Option<Vec<u8>> {
+        let mut iter = sl.owned_iter();
+        iter.seek(key, guard);
+        if iter.valid() && iter.key().same_user_key_with(key) {
+            return Some(iter.value().as_slice().to_vec());
+        }
+        None
+    }
+
+    #[test]
+    fn test_write_to_skiplist() {
+        let engine = RegionCacheMemoryEngine::new(InMemoryEngineContext::new_for_tests(Arc::new(
+            VersionTrack::new(InMemoryEngineConfig::config_for_test()),
+        )));
+        let r = new_region(1, b"", b"z");
+        engine.new_region(r.clone());
+        engine.core.region_manager().set_safe_point(r.id, 10);
+
+        let mut wb = RegionCacheWriteBatch::from(&engine);
+        wb.prepare_for_region(&r);
+        wb.put(b"aaa", b"bbb").unwrap();
+        wb.set_sequence_number(1).unwrap();
+        assert_eq!(wb.write().unwrap(), 1);
+        let sl = engine.core.engine().data[cf_to_id(CF_DEFAULT)].clone();
+        let guard = &crossbeam::epoch::pin();
+        let val = get_value(&sl, &encode_key(b"aaa", 2, ValueType::Value), guard).unwrap();
+        assert_eq!(&b"bbb"[..], val.as_slice());
+    }
+
+    #[test]
+    fn test_savepoints() {
+        let engine = RegionCacheMemoryEngine::new(InMemoryEngineContext::new_for_tests(Arc::new(
+            VersionTrack::new(InMemoryEngineConfig::config_for_test()),
+        )));
+        let r = new_region(1, b"", b"z");
+        engine.new_region(r.clone());
+        engine.core.region_manager().set_safe_point(r.id, 10);
+
+        let mut wb = RegionCacheWriteBatch::from(&engine);
+        wb.prepare_for_region(&r);
+        wb.put(b"aaa", b"bbb").unwrap();
+        wb.set_save_point();
+        wb.put(b"aaa", b"ccc").unwrap();
+        wb.put(b"ccc", b"ddd").unwrap();
+        wb.rollback_to_save_point().unwrap();
+        wb.set_sequence_number(1).unwrap();
+        assert_eq!(wb.write().unwrap(), 1);
+        let sl = engine.core.engine().data[cf_to_id(CF_DEFAULT)].clone();
+        let guard = &crossbeam::epoch::pin();
+        let val = get_value(&sl, &encode_key(b"aaa", 1, ValueType::Value), guard).unwrap();
+        assert_eq!(&b"bbb"[..], val.as_slice());
+        assert!(get_value(&sl, &encode_key(b"ccc", 1, ValueType::Value), guard).is_none())
+    }
+
+    #[test]
+    fn test_put_write_clear_delete_put_write() {
+        let engine = RegionCacheMemoryEngine::new(InMemoryEngineContext::new_for_tests(Arc::new(
+            VersionTrack::new(InMemoryEngineConfig::config_for_test()),
+        )));
+        let r = new_region(1, b"", b"z");
+        engine
+            .core
+            .region_manager()
+            .new_region(CacheRegion::from_region(&r));
+        engine.core.region_manager().set_safe_point(r.id, 10);
+
+        let mut wb = RegionCacheWriteBatch::from(&engine);
+        wb.prepare_for_region(&r);
+        wb.put(b"zaaa", b"bbb").unwrap();
+        wb.set_sequence_number(1).unwrap();
+        _ = wb.write();
+        wb.clear();
+        wb.prepare_for_region(&r);
+        wb.put(b"zbbb", b"ccc").unwrap();
+        wb.delete(b"zaaa").unwrap();
+        wb.set_sequence_number(2).unwrap();
+        _ = wb.write();
+        let snapshot = engine
+            .snapshot(CacheRegion::from_region(&r), u64::MAX, 3)
+            .unwrap();
+        assert_eq!(
+            snapshot.get_value(&b"zbbb"[..]).unwrap().unwrap(),
+            &b"ccc"[..]
+        );
+        assert!(snapshot.get_value(&b"zaaa"[..]).unwrap().is_none())
+    }
+
+    #[test]
+    fn test_prepare_for_apply() {
+        let path = Builder::new()
+            .prefix("test_prepare_for_apply")
+            .tempdir()
+            .unwrap();
+        let path_str = path.path().to_str().unwrap();
+        let rocks_engine = new_engine(path_str, DATA_CFS).unwrap();
+
+        let mut engine = RegionCacheMemoryEngine::new(InMemoryEngineContext::new_for_tests(
+            Arc::new(VersionTrack::new(InMemoryEngineConfig::config_for_test())),
+        ));
+        engine.set_disk_engine(rocks_engine.clone());
+
+        let r1 = new_region(1, b"k01".to_vec(), b"k05".to_vec());
+        {
+            // Loading a region with an epoch and range change will remove the
+            // pending region.
+            let cache_r1 = CacheRegion::from_region(&r1);
+            engine.load_region(cache_r1).unwrap();
+            let mut r1_new = new_region(1, b"k01".to_vec(), b"k06".to_vec());
+            r1_new.mut_region_epoch().version = 2;
+            let mut wb = RegionCacheWriteBatch::from(&engine);
+            wb.prepare_for_region(&r1_new);
+            assert!(
+                engine
+                    .core
+                    .region_manager()
+                    .regions_map
+                    .read()
+                    .regions()
+                    .is_empty()
+            );
+            wb.put(b"zk01", b"val1").unwrap();
+            wb.put(b"zk03", b"val1").unwrap();
+            wb.put(b"zk05", b"val1").unwrap();
+            wb.set_sequence_number(2).unwrap();
+            wb.write().unwrap();
+        }
+        // The pending region is removed, so no data should be written to the
+        // skiplist.
+        let skip_engine = engine.core.engine();
+        assert_eq!(skip_engine.node_count(), 0);
+
+        // The epoch changes but the new range is contained by the pending
+        // region, so the region will be updated.
+        let cache_r1 = CacheRegion::from_region(&r1);
+        engine.load_region(cache_r1).unwrap();
+        let mut r1_new = new_region(1, b"k01".to_vec(), b"k05".to_vec());
+        r1_new.mut_region_epoch().version = 2;
+        let cache_r1_new = CacheRegion::from_region(&r1_new);
+        let mut wb = RegionCacheWriteBatch::from(&engine);
+        wb.prepare_for_region(&r1_new);
+        {
+            let regions_map = engine.core.region_manager().regions_map.read();
+            let region_meta = regions_map.region_meta(1).unwrap();
+            assert_eq!(region_meta.get_region(), &cache_r1_new);
+            assert_eq!(region_meta.get_state(), RegionState::Loading);
+        }
+        wb.put(b"zk02", b"val1").unwrap();
+        wb.put(b"zk04", b"val1").unwrap();
+        wb.set_sequence_number(5).unwrap();
+        wb.write().unwrap();
+
+        test_util::eventually(
+            Duration::from_millis(50),
+            Duration::from_millis(1000),
+            || {
+                let regions_map = engine.core.region_manager().regions_map.read();
+                regions_map.region_meta(1).unwrap().get_state() == RegionState::Active
+            },
+        );
+        let snapshot = engine.snapshot(cache_r1_new.clone(), u64::MAX, 6).unwrap();
+        for i in 1..5 {
+            let res = snapshot.get_value(format!("zk0{}", i).as_bytes()).unwrap();
+            if i % 2 == 0 {
+                assert_eq!(res.unwrap(), b"val1".as_slice());
+            } else {
+                assert!(res.is_none());
+            }
+        }
+        assert_eq!(skip_engine.node_count(), 2);
+
+        let mut wb = RegionCacheWriteBatch::from(&engine);
+        wb.prepare_for_region(&r1_new);
+        wb.put(b"zk01", b"val2").unwrap();
+        wb.set_sequence_number(6).unwrap();
+        wb.write().unwrap();
+
+        assert_eq!(skip_engine.node_count(), 3);
+
+        // Evict the region; data should not be updated.
+        {
+            let mut regions_map = engine.core.region_manager().regions_map.write();
+            regions_map
+                .mut_region_meta(1)
+                .unwrap()
+                .set_state(RegionState::PendingEvict);
+        }
+        let mut wb = RegionCacheWriteBatch::from(&engine);
+        wb.prepare_for_region(&r1_new);
+        wb.put(b"zk02", b"val2").unwrap();
+        wb.set_sequence_number(7).unwrap();
+        wb.write().unwrap();
+        // The node count should not change.
+        assert_eq!(skip_engine.node_count(), 3);
+    }
+
+    fn wait_evict_done(engine: &RegionCacheMemoryEngine) {
+        test_util::eventually(
+            Duration::from_millis(100),
+            Duration::from_millis(2000),
+            || {
+                let regions_map = engine.core.region_manager.regions_map().read();
+                !regions_map
+                    .regions()
+                    .values()
+                    .any(|meta| meta.get_state().is_evict())
+            },
+        );
+    }
+
+    #[test]
+    fn test_write_batch_with_memory_controller() {
+        let mut config = InMemoryEngineConfig::default();
+        config.evict_threshold = Some(ReadableSize(500));
+        config.capacity = Some(ReadableSize(1000));
+        config.enable = true;
+        let engine = RegionCacheMemoryEngine::new(InMemoryEngineContext::new_for_tests(Arc::new(
+            VersionTrack::new(config),
+        )));
+        let regions = [
+            new_region(1, b"k00", b"k10"),
+            new_region(2, b"k10", b"k20"),
+            new_region(3, b"k20", b"k30"),
+            new_region(4, b"k30", b"k40"),
+            new_region(5, b"k40", b"k50"),
+        ];
+        for r in &regions {
+            engine.new_region(r.clone());
+            engine.core.region_manager().set_safe_point(r.id, 10);
+            let _ = engine
+                .snapshot(CacheRegion::from_region(r), 1000, 1000)
+                .unwrap();
+        }
+        let memory_controller = engine.memory_controller();
+
+        let val1: Vec<u8> = vec![0; 150];
+        let mut wb = RegionCacheWriteBatch::from(&engine);
+        wb.prepare_for_region(&regions[0]);
+        // memory required:
+        // 4(key) + 8(sequence number) + 150(value) + 16(2 Arc<MemoryController>) = 178
+        wb.put(b"zk01", &val1).unwrap();
+        wb.prepare_for_region(&regions[1]);
+        wb.put(b"zk11", &val1).unwrap();
+        wb.prepare_for_region(&regions[2]);
+        wb.put(b"zk21", &val1).unwrap();
+        // memory required:
+        // 4(key) + 8(sequence number) + 16(2 Arc<MemoryController>) = 28
+        // Now, 562
+        wb.delete(b"zk21").unwrap();
+        assert_eq!(562, memory_controller.mem_usage());
+        assert_eq!(wb.count(), 4);
+
+        let val2: Vec<u8> = vec![2; 500];
+        // The memory acquisition will fail.
+        wb.put(b"zk22", &val2).unwrap();
+        assert_eq!(562, memory_controller.mem_usage());
+        assert_eq!(wb.count(), 4);
+
+        wb.prepare_for_region(&regions[3]);
+        // The memory capacity is enough for the following insert.
+        // Now, 740
+        let val3: Vec<u8> = vec![3; 150];
+        wb.put(b"zk32", &val3).unwrap();
+        assert_eq!(740, memory_controller.mem_usage());
+        assert_eq!(wb.count(), 5);
+
+        // The memory acquisition will fail.
+        let val4: Vec<u8> = vec![3; 300];
+        wb.prepare_for_region(&regions[4]);
+        wb.put(b"zk41", &val4).unwrap();
+
+        // We should have allocated 740 as calculated above.
+        assert_eq!(740, memory_controller.mem_usage());
+        wb.write_impl(1000).unwrap();
+        // We don't count the node overhead (96 bytes per node) in the write
+        // batch, so after the entries are written into the engine, the memory
+        // usage can even exceed the capacity. This should be fine, as the
+        // overshoot should be at most at the MB level.
+        assert_eq!(1220, memory_controller.mem_usage());
+
+        let snap1 = engine
+            .snapshot(CacheRegion::from_region(&regions[0]), 1000, 1010)
+            .unwrap();
+        assert_eq!(snap1.get_value(b"zk01").unwrap().unwrap(), &val1);
+        let snap2 = engine
+            .snapshot(CacheRegion::from_region(&regions[1]), 1000, 1010)
+            .unwrap();
+        assert_eq!(snap2.get_value(b"zk11").unwrap().unwrap(), &val1);
+
+        assert_eq!(
+            engine
+                .snapshot(CacheRegion::from_region(&regions[2]), 1000, 1000)
+                .unwrap_err(),
+            FailedReason::NotCached
+        );
+
+        let snap4 = engine
+            .snapshot(CacheRegion::from_region(&regions[3]), 1000, 1010)
+            .unwrap();
+        assert_eq!(snap4.get_value(b"zk32").unwrap().unwrap(), &val3);
+
+        assert_eq!(
+            engine
+                .snapshot(CacheRegion::from_region(&regions[4]), 1000, 1010)
+                .unwrap_err(),
+            FailedReason::NotCached
+        );
+
+        // For region 3 (i.e. regions[2]), the writes to zk21 are buffered but
+        // the oversized put of zk22 is rejected, so the region is evicted and
+        // its keys are deleted. After flushing the epoch, we should get
+        // 1220 - 178 - 28 (kv) - 96 * 2 (node overhead) = 822 memory usage.
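+        // (The 178 and 28 above come from memory_size_required_for_key_value:
+        // key length + 8-byte sequence-number suffix + value length, plus 16
+        // bytes for the two Arc<MemoryController> handles; each skiplist node
+        // additionally costs 96 bytes once written.)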
+        flush_epoch();
+        wait_evict_done(&engine);
+        assert_eq!(822, memory_controller.mem_usage());
+
+        drop(snap1);
+        engine.evict_region(
+            &CacheRegion::from_region(&regions[0]),
+            EvictReason::AutoEvict,
+            None,
+        );
+
+        wait_evict_done(&engine);
+        flush_epoch();
+        assert_eq!(548, memory_controller.mem_usage());
+    }
+
+    #[test]
+    fn test_write_batch_with_config_change() {
+        let mut config = InMemoryEngineConfig::default();
+        config.evict_threshold = Some(ReadableSize(u64::MAX));
+        config.capacity = Some(ReadableSize(u64::MAX));
+        config.enable = true;
+        let config = Arc::new(VersionTrack::new(config));
+        let engine =
+            RegionCacheMemoryEngine::new(InMemoryEngineContext::new_for_tests(config.clone()));
+        let r1 = new_region(1, b"kk00".to_vec(), b"kk10".to_vec());
+        let r2 = new_region(2, b"kk10".to_vec(), b"kk20".to_vec());
+        for r in [&r1, &r2] {
+            engine.new_region(r.clone());
+            engine.core.region_manager().set_safe_point(r.id, 10);
+            let _ = engine
+                .snapshot(CacheRegion::from_region(r), 1000, 1000)
+                .unwrap();
+        }
+
+        let val1: Vec<u8> = (0..150).map(|_| 0).collect();
+        let mut wb = RegionCacheWriteBatch::from(&engine);
+        wb.prepare_for_region(&r2);
+        wb.put(b"zkk11", &val1).unwrap();
+        let snap1 = engine
+            .snapshot(CacheRegion::from_region(&r1), 1000, 1000)
+            .unwrap();
+
+        // Disable the ime.
+        let mut config_manager = InMemoryEngineConfigManager(config.clone());
+        let mut config_change = ConfigChange::new();
+        config_change.insert(String::from("enable"), ConfigValue::Bool(false));
+        config_manager.dispatch(config_change).unwrap();
+
+        wb.write_impl(1000).unwrap();
+        // An existing snapshot still works after the ime is disabled, but new
+        // snapshots cannot be created.
+        assert!(snap1.get_value(b"zkk00").unwrap().is_none());
+
+        let mut wb = RegionCacheWriteBatch::from(&engine);
+        // The put should trigger eviction, and it won't be written into the
+        // ime.
+        wb.prepare_for_region(&r1);
+        wb.put(b"zkk01", &val1).unwrap();
+        wb.write_impl(1000).unwrap();
+
+        // Creating a new snapshot fails as the region is already evicted.
+        let snap1 = engine.snapshot(CacheRegion::from_region(&r1), 1000, 1000);
+        assert_eq!(snap1.unwrap_err(), FailedReason::NotCached);
+        let snap2 = engine
+            .snapshot(CacheRegion::from_region(&r2), 1000, 1000)
+            .unwrap();
+        // If there are no new writes, the ime can still be used.
+        assert_eq!(snap2.get_value(b"zkk11").unwrap().unwrap(), &val1);
+
+        // Enable the ime again.
+        let mut config_manager = InMemoryEngineConfigManager(config.clone());
+        let mut config_change = ConfigChange::new();
+        config_change.insert(String::from("enable"), ConfigValue::Bool(true));
+        config_manager.dispatch(config_change).unwrap();
+
+        let snap1 = engine.snapshot(CacheRegion::from_region(&r1), 1000, 1000);
+        assert_eq!(snap1.unwrap_err(), FailedReason::NotCached);
+        let snap2 = engine
+            .snapshot(CacheRegion::from_region(&r2), 1000, 1000)
+            .unwrap();
+        assert_eq!(snap2.get_value(b"zkk11").unwrap().unwrap(), &val1);
+    }
+
+    #[test]
+    fn test_write_batch_update_outdated_pending_region() {
+        let path = Builder::new()
+            .prefix("test_write_batch_update_outdated_pending_region")
+            .tempdir()
+            .unwrap();
+        let path_str = path.path().to_str().unwrap();
+        let rocks_engine = new_engine(path_str, DATA_CFS).unwrap();
+
+        let mut engine = RegionCacheMemoryEngine::new(InMemoryEngineContext::new_for_tests(
+            Arc::new(VersionTrack::new(InMemoryEngineConfig::config_for_test())),
+        ));
+        engine.set_disk_engine(rocks_engine.clone());
+
+        let r1 = CacheRegion::new(1, 0, b"zk00", b"zk10");
+
+        engine.core().region_manager().load_region(r1).unwrap();
+
+        // Load a region with a newer epoch and a smaller range; this should
+        // trigger a replacement.
+        let mut r_new = Region::default();
+        r_new.set_id(1);
+        let mut epoch = RegionEpoch::new();
+        epoch.version = 1;
+        r_new.set_region_epoch(epoch);
+        r_new.set_peers(vec![new_peer(1, 1)].into());
+        r_new.set_start_key(b"k00".to_vec());
+        r_new.set_end_key(b"k05".to_vec());
+        let mut wb = RegionCacheWriteBatch::from(&engine);
+        wb.prepare_for_region(&r_new);
+        let r_new = CacheRegion::from_region(&r_new);
+
+        {
+            let regions_map = engine.core.region_manager.regions_map().read();
+            let cache_meta = regions_map.region_meta(1).unwrap();
+            assert_eq!(cache_meta.get_region(), &r_new);
+            let meta_by_range = regions_map.region_meta_by_end_key(&r_new.end).unwrap();
+            assert_eq!(meta_by_range.get_region(), &r_new);
+        }
+    }
+
+    #[test]
+    fn test_dirty_data_exist_when_prepare_for_region() {
+        let engine = RegionCacheMemoryEngine::new(InMemoryEngineContext::new_for_tests(Arc::new(
+            VersionTrack::new(InMemoryEngineConfig::config_for_test()),
+        )));
+        let r = new_region(1, b"", b"z");
+        let cache_region = CacheRegion::from_region(&r);
+        let mut wb = RegionCacheWriteBatch::from(&engine);
+        wb.prepare_for_region(&r);
+
+        engine
+            .core()
+            .region_manager()
+            .load_region(cache_region.clone())
+            .unwrap();
+        wb.prepare_for_region(&r);
+        wb.put(b"k1", b"val1").unwrap();
+        wb.put(b"k2", b"val2").unwrap();
+        wb.set_sequence_number(100).unwrap();
+
+        wb.write().unwrap();
+
+        assert!(engine.core().engine().data[0].is_empty());
+    }
+}
diff --git a/components/range_cache_memory_engine/tests/failpoints/mod.rs b/components/in_memory_engine/tests/failpoints/mod.rs
similarity index 100%
rename from components/range_cache_memory_engine/tests/failpoints/mod.rs
rename to components/in_memory_engine/tests/failpoints/mod.rs
diff --git a/components/in_memory_engine/tests/failpoints/test_memory_engine.rs b/components/in_memory_engine/tests/failpoints/test_memory_engine.rs
new file mode 100644
index 00000000000..84bce6bdd64
--- /dev/null
+++ b/components/in_memory_engine/tests/failpoints/test_memory_engine.rs
@@ -0,0 +1,809 @@
+// Copyright 2024 TiKV Project Authors. Licensed under Apache-2.0.
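+//
+// Failpoint-based integration tests for the in-memory engine (IME). They rely
+// on fail::cfg and fail::cfg_callback to pause background tasks and observe
+// intermediate states deterministically.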
+
+use std::{
+    sync::{mpsc::sync_channel, Arc},
+    time::Duration,
+};
+
+use crossbeam::epoch;
+use engine_rocks::util::new_engine;
+use engine_traits::{
+    CacheRegion, EvictReason, Mutable, RegionCacheEngine, RegionCacheEngineExt, RegionEvent,
+    WriteBatch, WriteBatchExt, CF_DEFAULT, CF_LOCK, CF_WRITE, DATA_CFS,
+};
+use in_memory_engine::{
+    decode_key, encode_key_for_boundary_without_mvcc, encoding_for_filter,
+    test_util::{new_region, put_data, put_data_in_rocks},
+    BackgroundTask, InMemoryEngineConfig, InMemoryEngineContext, InternalBytes, InternalKey,
+    RegionCacheMemoryEngine, RegionState, SkiplistHandle, ValueType,
+};
+use keys::{data_key, DATA_MAX_KEY, DATA_MIN_KEY};
+use kvproto::metapb::Region;
+use tempfile::Builder;
+use tikv_util::config::{ReadableDuration, ReadableSize, VersionTrack};
+use tokio::{
+    sync::{mpsc, Mutex},
+    time::timeout,
+};
+use txn_types::{Key, TimeStamp, WriteType};
+
+#[test]
+fn test_set_disk_engine() {
+    let (tx, rx) = sync_channel(0);
+    fail::cfg_callback("ime_set_rocks_engine", move || {
+        let _ = tx.send(true);
+    })
+    .unwrap();
+    let mut engine = RegionCacheMemoryEngine::new(InMemoryEngineContext::new_for_tests(Arc::new(
+        VersionTrack::new(InMemoryEngineConfig::config_for_test()),
+    )));
+    let path = Builder::new()
+        .prefix("test_set_disk_engine")
+        .tempdir()
+        .unwrap();
+    let path_str = path.path().to_str().unwrap();
+    let rocks_engine = new_engine(path_str, DATA_CFS).unwrap();
+    engine.set_disk_engine(rocks_engine.clone());
+    rx.recv_timeout(Duration::from_secs(5)).unwrap();
+}
+
+// We should not use skiplist.get directly as we only care about keys without
+// the sequence number suffix.
+fn key_exist(sl: &SkiplistHandle, key: &InternalBytes, guard: &epoch::Guard) -> bool {
+    let mut iter = sl.iterator();
+    iter.seek(key, guard);
+    if iter.valid() && iter.key().same_user_key_with(key) {
+        return true;
+    }
+    false
+}
+
+#[test]
+fn test_gc_worker() {
+    let mut config = InMemoryEngineConfig::config_for_test();
+    config.gc_run_interval = ReadableDuration(Duration::from_secs(1));
+    let engine = RegionCacheMemoryEngine::new(InMemoryEngineContext::new_for_tests(Arc::new(
+        VersionTrack::new(config),
+    )));
+    let memory_controller = engine.memory_controller();
+    engine
+        .core()
+        .region_manager()
+        .new_region(CacheRegion::new(1, 0, DATA_MIN_KEY, DATA_MAX_KEY));
+    let skip_engine = engine.core().engine();
+    let write = skip_engine.cf_handle(CF_WRITE);
+    let default = skip_engine.cf_handle(CF_DEFAULT);
+
+    fail::cfg("ime_gc_oldest_seqno", "return(1000)").unwrap();
+
+    let (tx, rx) = sync_channel(0);
+    fail::cfg_callback("in_memory_engine_gc_finish", move || {
+        tx.send(true).unwrap();
+    })
+    .unwrap();
+
+    let start_ts = TimeStamp::physical_now() - Duration::from_secs(10).as_millis() as u64;
+    let commit_ts1 = TimeStamp::physical_now() - Duration::from_secs(9).as_millis() as u64;
+    put_data(
+        b"k",
+        b"v1",
+        start_ts,
+        commit_ts1,
+        100,
+        false,
+        &default,
+        &write,
+        memory_controller.clone(),
+    );
+
+    let start_ts = TimeStamp::physical_now() - Duration::from_secs(8).as_millis() as u64;
+    let commit_ts2 = TimeStamp::physical_now() - Duration::from_secs(7).as_millis() as u64;
+    put_data(
+        b"k",
+        b"v2",
+        start_ts,
+        commit_ts2,
+        110,
+        false,
+        &default,
+        &write,
+        memory_controller.clone(),
+    );
+
+    let start_ts = TimeStamp::physical_now() - Duration::from_secs(6).as_millis() as u64;
+    let commit_ts3 = TimeStamp::physical_now() - Duration::from_secs(5).as_millis() as u64;
+    put_data(
+        b"k",
+        b"v3",
+        start_ts,
+        commit_ts3,
+        110,
+        false,
+        &default,
+        &write,
+        memory_controller.clone(),
+    );
+
+    let start_ts = TimeStamp::physical_now() - Duration::from_secs(4).as_millis() as u64;
+    let commit_ts4 = TimeStamp::physical_now() - Duration::from_secs(3).as_millis() as u64;
+    put_data(
+        b"k",
+        b"v4",
+        start_ts,
+        commit_ts4,
+        110,
+        false,
+        &default,
+        &write,
+        memory_controller.clone(),
+    );
+
+    let guard = &epoch::pin();
+    for &ts in &[commit_ts1, commit_ts2, commit_ts3] {
+        let key = Key::from_raw(b"zk");
+        let key = encoding_for_filter(key.as_encoded(), TimeStamp::new(ts));
+
+        assert!(key_exist(&write, &key, guard));
+    }
+
+    let _ = rx.recv_timeout(Duration::from_secs(5)).unwrap();
+
+    let key = Key::from_raw(b"zk");
+    // Now the outdated MVCC versions should be gone.
+    for &ts in &[commit_ts1, commit_ts2, commit_ts3] {
+        let key = encoding_for_filter(key.as_encoded(), TimeStamp::new(ts));
+        assert!(!key_exist(&write, &key, guard));
+    }
+
+    let key = encoding_for_filter(key.as_encoded(), TimeStamp::new(commit_ts4));
+    assert!(key_exist(&write, &key, guard));
+}
+
+#[test]
+fn test_clean_up_tombstone() {
+    let config = Arc::new(VersionTrack::new(InMemoryEngineConfig::config_for_test()));
+    let engine = RegionCacheMemoryEngine::new(InMemoryEngineContext::new_for_tests(config.clone()));
+    let region = new_region(1, b"".to_vec(), b"z".to_vec());
+    let (tx, rx) = sync_channel(0);
+    fail::cfg_callback("ime_clean_lock_tombstone_done", move || {
+        tx.send(true).unwrap();
+    })
+    .unwrap();
+
+    engine.new_region(region.clone());
+    let mut wb = engine.write_batch();
+    wb.prepare_for_region(&region);
+    wb.put_cf("lock", b"k", b"val").unwrap();
+    wb.put_cf("lock", b"k1", b"val").unwrap();
+    wb.put_cf("lock", b"k2", b"val").unwrap();
+    wb.delete_cf("lock", b"k").unwrap();
+    wb.delete_cf("lock", b"k1").unwrap();
+    wb.delete_cf("lock", b"k2").unwrap();
+    wb.put_cf("lock", b"k", b"val2").unwrap(); // seq 106
+    wb.set_sequence_number(100).unwrap();
+    wb.write().unwrap();
+
+    let mut wb = engine.write_batch();
+    wb.prepare_for_region(&region);
+    wb.put_cf("lock", b"k", b"val").unwrap(); // seq 120
+    wb.put_cf("lock", b"k1", b"val").unwrap(); // seq 121
+    wb.put_cf("lock", b"k2", b"val").unwrap(); // seq 122
+    wb.delete_cf("lock", b"k").unwrap(); // seq 123
+    wb.delete_cf("lock", b"k1").unwrap(); // seq 124
+    wb.delete_cf("lock", b"k2").unwrap(); // seq 125
+    wb.set_sequence_number(120).unwrap();
+    wb.write().unwrap();
+
+    let lock_handle = engine.core().engine().cf_handle("lock");
+    assert_eq!(lock_handle.len(), 13);
+
+    engine
+        .bg_worker_manager()
+        .schedule_task(BackgroundTask::CleanLockTombstone(107))
+        .unwrap();
+
+    rx.recv_timeout(Duration::from_secs(5)).unwrap();
+
+    let mut iter = engine.core().engine().cf_handle("lock").iterator();
+
+    let mut first = true;
+    let guard = &epoch::pin();
+    for (k, seq, ty) in [
+        (b"k".to_vec(), 123, ValueType::Deletion),
+        (b"k".to_vec(), 120, ValueType::Value),
+        (b"k".to_vec(), 106, ValueType::Value),
+        (b"k1".to_vec(), 124, ValueType::Deletion),
+        (b"k1".to_vec(), 121, ValueType::Value),
+        (b"k2".to_vec(), 125, ValueType::Deletion),
+        (b"k2".to_vec(), 122, ValueType::Value),
+    ] {
+        if first {
+            iter.seek_to_first(guard);
+            first = false;
+        } else {
+            iter.next(guard);
+        }
+
+        let key = iter.key();
+        let InternalKey {
+            user_key,
+            sequence,
+            v_type,
+        } = decode_key(key.as_bytes());
+        assert_eq!(sequence, seq);
+        assert_eq!(user_key, &k);
+        assert_eq!(v_type, ty);
+    }
+}
+
+#[test]
+fn test_evict_with_loading_range() {
+    let path = Builder::new().prefix("test").tempdir().unwrap();
+    let path_str = path.path().to_str().unwrap();
+    let rocks_engine = new_engine(path_str, DATA_CFS).unwrap();
+
+    let config = InMemoryEngineConfig::config_for_test();
+    let mut engine = RegionCacheMemoryEngine::new(InMemoryEngineContext::new_for_tests(Arc::new(
+        VersionTrack::new(config),
+    )));
+    engine.set_disk_engine(rocks_engine);
+
+    let r1 = new_region(1, b"k00".to_vec(), b"k10".to_vec());
+    let r2 = new_region(2, b"k20".to_vec(), b"k30".to_vec());
+    let r3 = new_region(3, b"k40".to_vec(), b"k50".to_vec());
+    let (snapshot_load_tx, snapshot_load_rx) = sync_channel(0);
+
+    // range1 and range2 will be evicted
+    let r = new_region(4, b"k05".to_vec(), b"k25".to_vec());
+    let engine_clone = engine.clone();
+    fail::cfg_callback("ime_on_snapshot_load_finished", move || {
+        let _ = snapshot_load_tx.send(true);
+        engine_clone.evict_region(&CacheRegion::from_region(&r), EvictReason::AutoEvict, None);
+    })
+    .unwrap();
+
+    let (loading_complete_tx, loading_complete_rx) = sync_channel(0);
+    fail::cfg_callback("ime_on_completes_batch_loading", move || {
+        let _ = loading_complete_tx.send(true);
+    })
+    .unwrap();
+
+    let cache_region1 = CacheRegion::from_region(&r1);
+    let cache_region2 = CacheRegion::from_region(&r2);
+    let cache_region3 = CacheRegion::from_region(&r3);
+    engine.load_region(cache_region1.clone()).unwrap();
+    engine.load_region(cache_region2.clone()).unwrap();
+    engine.load_region(cache_region3.clone()).unwrap();
+
+    let mut wb = engine.write_batch();
+    // prepare the regions to trigger loading
+    wb.prepare_for_region(&r1);
+    wb.prepare_for_region(&r2);
+    wb.prepare_for_region(&r3);
+    wb.set_sequence_number(10).unwrap();
+    wb.write().unwrap();
+
+    snapshot_load_rx
+        .recv_timeout(Duration::from_secs(5))
+        .unwrap();
+    snapshot_load_rx
+        .recv_timeout(Duration::from_secs(5))
+        .unwrap();
+
+    loading_complete_rx
+        .recv_timeout(Duration::from_secs(5))
+        .unwrap();
+
+    let read_ts = TimeStamp::compose(TimeStamp::physical_now(), 0).into_inner();
+    engine.snapshot(cache_region1, read_ts, 100).unwrap_err();
+    engine.snapshot(cache_region2, read_ts, 100).unwrap_err();
+    engine.snapshot(cache_region3, read_ts, 100).unwrap();
+}
+
+#[test]
+fn test_cached_write_batch_cleared_when_load_failed() {
+    let path = Builder::new().prefix("test").tempdir().unwrap();
+    let path_str = path.path().to_str().unwrap();
+    let rocks_engine = new_engine(path_str, DATA_CFS).unwrap();
+
+    let mut config = InMemoryEngineConfig::config_for_test();
+    config.stop_load_threshold = Some(ReadableSize(20));
+    config.evict_threshold = Some(ReadableSize(30));
+    config.capacity = Some(ReadableSize(40));
+    let config = Arc::new(VersionTrack::new(config));
+    let mut engine =
+        RegionCacheMemoryEngine::new(InMemoryEngineContext::new_for_tests(config.clone()));
+    engine.set_disk_engine(rocks_engine);
+
+    let (tx, rx) = sync_channel(0);
+    fail::cfg_callback("ime_on_snapshot_load_finished", move || {
+        let _ = tx.send(true);
+    })
+    .unwrap();
+
+    fail::cfg("ime_on_snapshot_load_finished2", "pause").unwrap();
+
+    // range1 will be canceled in on_snapshot_load_finished whereas range2 will
+    // be canceled at the beginning
+    let r1 = new_region(1, b"k00", b"k10");
+    let r2 = new_region(2, b"k20", b"k30");
+    let cache_region1 = CacheRegion::from_region(&r1);
+    let cache_region2 = CacheRegion::from_region(&r2);
+    engine.load_region(cache_region1.clone()).unwrap();
+    engine.load_region(cache_region2.clone()).unwrap();
+
+    let mut wb = engine.write_batch();
+    // range1 starts to load
+    wb.prepare_for_region(&r1);
+    rx.recv_timeout(Duration::from_secs(5)).unwrap();
+
+    wb.put(b"zk05", b"val").unwrap();
+    wb.put(b"zk06", b"val").unwrap();
+    wb.prepare_for_region(&r2);
+    wb.put(b"zk25", b"val").unwrap();
+    wb.set_sequence_number(100).unwrap();
+    wb.write().unwrap();
+
+    fail::remove("ime_on_snapshot_load_finished2");
+
+    test_util::eventually(
+        Duration::from_millis(100),
+        Duration::from_millis(2000),
+        || {
+            let regions_map = engine.core().region_manager().regions_map().read();
+            // all failed regions should be removed.
+            [1, 2]
+                .into_iter()
+                .all(|i| regions_map.region_meta(i).is_none())
+        },
+    );
+}
+
+#[test]
+fn test_concurrency_between_delete_range_and_write_to_memory() {
+    let path = Builder::new().prefix("test").tempdir().unwrap();
+    let path_str = path.path().to_str().unwrap();
+    let rocks_engine = new_engine(path_str, DATA_CFS).unwrap();
+    let mut wb = rocks_engine.write_batch();
+    wb.put_cf(CF_LOCK, b"zk40", b"val").unwrap();
+    wb.put_cf(CF_LOCK, b"zk41", b"val").unwrap();
+    wb.put_cf(CF_LOCK, b"zk42", b"val").unwrap();
+    wb.write().unwrap();
+
+    let config = InMemoryEngineConfig::config_for_test();
+    let mut engine = RegionCacheMemoryEngine::new(InMemoryEngineContext::new_for_tests(Arc::new(
+        VersionTrack::new(config),
+    )));
+    engine.set_disk_engine(rocks_engine);
+
+    let r1 = new_region(1, b"k00".to_vec(), b"k10".to_vec());
+    let r2 = new_region(2, b"k20".to_vec(), b"k30".to_vec());
+    let r3 = new_region(3, b"k40".to_vec(), b"k50".to_vec());
+    let (snapshot_load_cancel_tx, snapshot_load_cancel_rx) = sync_channel(0);
+    fail::cfg_callback("ime_snapshot_load_canceled", move || {
+        let _ = snapshot_load_cancel_tx.send(true);
+    })
+    .unwrap();
+    let (snapshot_load_tx, snapshot_load_rx) = sync_channel(0);
+    fail::cfg_callback("ime_on_snapshot_load_finished", move || {
+        let _ = snapshot_load_tx.send(true);
+    })
+    .unwrap();
+    fail::cfg("ime_before_clear_regions_in_being_written", "pause").unwrap();
+
+    let (write_batch_consume_tx, write_batch_consume_rx) = sync_channel(0);
+    fail::cfg_callback(
+        "ime_on_region_cache_write_batch_write_consumed",
+        move || {
+            let _ = write_batch_consume_tx.send(true);
+        },
+    )
+    .unwrap();
+
+    let (delete_range_tx, delete_range_rx) = sync_channel(0);
+    fail::cfg_callback("ime_delete_range_done", move || {
+        let _ = delete_range_tx.send(true);
+    })
+    .unwrap();
+
+    let cache_region1 = CacheRegion::from_region(&r1);
+    let cache_region2 = CacheRegion::from_region(&r2);
+    let cache_region3 = CacheRegion::from_region(&r3);
+
+    engine.new_region(r1.clone());
+    engine.new_region(r2.clone());
+    engine.load_region(cache_region3.clone()).unwrap();
+
+    let engine_clone = engine.clone();
+    let (range_prepared_tx, range_prepared_rx) = sync_channel(0);
+    let region1_clone = r1.clone();
+    let region2_clone = r2.clone();
+    let region3_clone = r3.clone();
+    let handle = std::thread::spawn(move || {
+        let mut wb = engine_clone.write_batch();
+        wb.prepare_for_region(&region1_clone);
+        wb.put_cf(CF_LOCK, b"zk02", b"val").unwrap();
+        wb.put_cf(CF_LOCK, b"zk03", b"val").unwrap();
+        wb.put_cf(CF_LOCK, b"zk04", b"val").unwrap();
+        wb.set_sequence_number(100).unwrap();
+
+        let mut wb2 = engine_clone.write_batch();
+        wb2.prepare_for_region(&region2_clone);
+        wb2.put_cf(CF_LOCK, b"zk22", b"val").unwrap();
+        wb2.put_cf(CF_LOCK, b"zk23", b"val").unwrap();
+        wb2.set_sequence_number(200).unwrap();
+
+        let mut wb3 = engine_clone.write_batch();
+        wb3.prepare_for_region(&region3_clone);
+        wb3.set_sequence_number(300).unwrap();
+
+        range_prepared_tx.send(true).unwrap();
+
+        wb.write().unwrap();
+        wb2.write().unwrap();
+        wb3.write().unwrap();
+    });
+
+    range_prepared_rx
+        .recv_timeout(Duration::from_secs(5))
+        .unwrap();
+    // Now, three regions are in write status; delete range will not be
+    // performed until they leave the write status.
+
+    engine.evict_region(&cache_region1, EvictReason::AutoEvict, None);
+    engine.evict_region(&cache_region2, EvictReason::AutoEvict, None);
+
+    let verify_data = |r: &Region, expected_num: u64| {
+        let handle = engine.core().engine().cf_handle(CF_LOCK);
+        let (start, end) = encode_key_for_boundary_without_mvcc(&CacheRegion::from_region(r));
+        let mut iter = handle.iterator();
+        let guard = &epoch::pin();
+        let mut count = 0;
+        iter.seek(&start, guard);
+        while iter.valid() && iter.key() < &end {
+            count += 1;
+            iter.next(guard);
+        }
+        assert_eq!(count, expected_num);
+    };
+
+    write_batch_consume_rx
+        .recv_timeout(Duration::from_secs(5))
+        .unwrap();
+    // Now, a DeleteRange task has been scheduled; the task is delayed, so the
+    // data has not been deleted yet.
+    verify_data(&r1, 3);
+    // remove the failpoint so that the region can leave the write status
+    fail::remove("ime_before_clear_regions_in_being_written");
+    delete_range_rx
+        .recv_timeout(Duration::from_secs(5))
+        .unwrap();
+    // Now, the data should be deleted.
+    verify_data(&r1, 0);
+
+    // Next, test range2.
+    fail::cfg("ime_before_clear_regions_in_being_written", "pause").unwrap();
+    write_batch_consume_rx
+        .recv_timeout(Duration::from_secs(5))
+        .unwrap();
+    verify_data(&r2, 2);
+    // remove the failpoint so that the region can leave the write status
+    fail::remove("ime_before_clear_regions_in_being_written");
+    delete_range_rx
+        .recv_timeout(Duration::from_secs(5))
+        .unwrap();
+    verify_data(&r2, 0);
+
+    // ensure the region enters on_snapshot_load_finished before eviction
+    snapshot_load_rx
+        .recv_timeout(Duration::from_secs(5))
+        .unwrap();
+    engine.evict_region(&CacheRegion::from_region(&r3), EvictReason::AutoEvict, None);
+
+    fail::cfg("ime_before_clear_regions_in_being_written", "pause").unwrap();
+    write_batch_consume_rx
+        .recv_timeout(Duration::from_secs(5))
+        .unwrap();
+    verify_data(&r3, 3);
+    snapshot_load_cancel_rx
+        .recv_timeout(Duration::from_secs(5))
+        .unwrap();
+    fail::remove("ime_before_clear_regions_in_being_written");
+    delete_range_rx
+        .recv_timeout(Duration::from_secs(5))
+        .unwrap();
+    verify_data(&r3, 0);
+
+    let _ = handle.join();
+}
+
+#[test]
+fn test_double_delete_range_schedule() {
+    let path = Builder::new().prefix("test").tempdir().unwrap();
+    let path_str = path.path().to_str().unwrap();
+    let rocks_engine = new_engine(path_str, DATA_CFS).unwrap();
+
+    let config = InMemoryEngineConfig::config_for_test();
+    let mut engine = RegionCacheMemoryEngine::new(InMemoryEngineContext::new_for_tests(Arc::new(
+        VersionTrack::new(config),
+    )));
+    engine.set_disk_engine(rocks_engine);
+
+    let r1 = new_region(1, b"k00", b"k10");
+    let r2 = new_region(2, b"k20", b"k30");
+    let r3 = new_region(3, b"k40", b"k50");
+    let (snapshot_load_tx, snapshot_load_rx) = sync_channel(0);
+    let engine_clone = engine.clone();
+    let r = new_region(4, b"k00", b"k60");
+    fail::cfg_callback("ime_on_snapshot_load_finished", move || {
+        let _ = snapshot_load_tx.send(true);
+        // Evict all ranges, so the loading ranges will also be evicted and a
+        // delete range task will be scheduled.
+ engine_clone.evict_region(&CacheRegion::from_region(&r), EvictReason::AutoEvict, None); + }) + .unwrap(); + + let (delete_range_tx, delete_range_rx) = sync_channel(0); + fail::cfg_callback("ime_on_delete_range", move || { + let _ = delete_range_tx.send(true); + }) + .unwrap(); + + engine.new_region(r1.clone()); + engine.new_region(r2.clone()); + let cache_region3 = CacheRegion::from_region(&r3); + engine.load_region(cache_region3.clone()).unwrap(); + + let snap1 = engine + .snapshot(CacheRegion::from_region(&r1), 100, 100) + .unwrap(); + let snap2 = engine + .snapshot(CacheRegion::from_region(&r2), 100, 100) + .unwrap(); + + let mut wb = engine.write_batch(); + // prepare range to trigger loading + wb.prepare_for_region(&r3); + wb.set_sequence_number(10).unwrap(); + wb.write().unwrap(); + + snapshot_load_rx + .recv_timeout(Duration::from_secs(5)) + .unwrap(); + delete_range_rx + .recv_timeout(Duration::from_secs(5)) + .unwrap(); + + drop(snap1); + drop(snap2); + + // two cached ranges + delete_range_rx + .recv_timeout(Duration::from_secs(5)) + .unwrap(); + delete_range_rx + .recv_timeout(Duration::from_secs(5)) + .unwrap(); + // sleep a while to ensure no further delete range will be scheduled + delete_range_rx + .recv_timeout(Duration::from_secs(2)) + .unwrap_err(); +} + +#[test] +fn test_load_with_gc() { + let path = Builder::new().prefix("test").tempdir().unwrap(); + let path_str = path.path().to_str().unwrap(); + let rocks_engine = new_engine(path_str, DATA_CFS).unwrap(); + + let mut config = InMemoryEngineConfig::config_for_test(); + config.gc_run_interval = ReadableDuration(Duration::from_secs(1)); + let mut engine = RegionCacheMemoryEngine::new(InMemoryEngineContext::new_for_tests(Arc::new( + VersionTrack::new(config), + ))); + engine.set_disk_engine(rocks_engine.clone()); + + // safe_point: 6 + // Rocks: [k1-5, k1-3, k2-4-d, k2-3, k3-7, k3-4, k3-1, k4-6-d, k4-5] + // After load + // IME: [k1-5, k3-7, k3-4] + put_data_in_rocks(b"k1", b"val", 5, 4, false, &rocks_engine, WriteType::Put); + put_data_in_rocks(b"k1", b"val", 3, 2, false, &rocks_engine, WriteType::Put); + put_data_in_rocks(b"k2", b"val", 4, 3, false, &rocks_engine, WriteType::Delete); + put_data_in_rocks(b"k2", b"val", 3, 2, false, &rocks_engine, WriteType::Put); + put_data_in_rocks(b"k3", b"val", 7, 5, false, &rocks_engine, WriteType::Put); + put_data_in_rocks(b"k3", b"val", 4, 3, false, &rocks_engine, WriteType::Put); + put_data_in_rocks(b"k3", b"val", 1, 0, false, &rocks_engine, WriteType::Put); + put_data_in_rocks(b"k4", b"val", 6, 0, false, &rocks_engine, WriteType::Delete); + put_data_in_rocks(b"k4", b"val", 5, 0, false, &rocks_engine, WriteType::Put); + + fail::cfg("ime_safe_point_in_loading", "return(6)").unwrap(); + let (load_tx, load_rx) = sync_channel(0); + fail::cfg_callback("ime_on_completes_batch_loading", move || { + let _ = load_tx.send(true); + }) + .unwrap(); + + let region = new_region(1, b"", b"z"); + let range = CacheRegion::from_region(®ion); + engine.load_region(range.clone()).unwrap(); + let mut wb = engine.write_batch(); + wb.prepare_for_region(®ion); + wb.set_sequence_number(100).unwrap(); + wb.write().unwrap(); + + load_rx.recv_timeout(Duration::from_secs(5)).unwrap(); + + let expects = vec![(b"k1", 5), (b"k3", 7), (b"k3", 4)]; + let write_handle = engine.core().engine().cf_handle(CF_WRITE); + + let mut iter = write_handle.iterator(); + let guard = &epoch::pin(); + iter.seek_to_first(guard); + for (key, commit_ts) in expects { + let expect_key = Key::from_raw(&data_key(key)).into_encoded(); + 
let InternalKey { user_key, .. } = decode_key(iter.key().as_bytes());
+        let (mem_key, ts) = Key::split_on_ts_for(user_key).unwrap();
+        assert_eq!(expect_key, mem_key);
+        assert_eq!(commit_ts, ts.into_inner());
+        iter.next(guard);
+    }
+
+    // ensure the safe point of the engine
+    engine.snapshot(range.clone(), 6, 100).unwrap_err();
+    engine.snapshot(range, 7, 100).unwrap();
+}
+
+// Test that the in-memory engine can handle a region split event after the
+// load region task is scheduled but before the task starts running. As the
+// source region is split into multiple regions, IME should handle all the
+// regions within the range and update their state to active when the batch
+// loading task is finished.
+#[test]
+fn test_region_split_before_batch_loading_start() {
+    let path = Builder::new().prefix("test").tempdir().unwrap();
+    let path_str = path.path().to_str().unwrap();
+    let rocks_engine = new_engine(path_str, DATA_CFS).unwrap();
+
+    let mut config = InMemoryEngineConfig::config_for_test();
+    config.gc_run_interval = ReadableDuration(Duration::from_secs(1));
+    let mut engine = RegionCacheMemoryEngine::new(InMemoryEngineContext::new_for_tests(Arc::new(
+        VersionTrack::new(config),
+    )));
+    engine.set_disk_engine(rocks_engine.clone());
+
+    let (tx, rx) = sync_channel(0);
+    fail::cfg_callback("ime_before_start_loading_region", move || {
+        let _ = tx.send(());
+    })
+    .unwrap();
+    fail::cfg("ime_on_start_loading_region", "pause").unwrap();
+
+    let region = new_region(1, b"k00", b"k30");
+    let cache_region = CacheRegion::from_region(&region);
+
+    // write some data into rocksdb to trigger batch loading.
+    for i in 0..30 {
+        let key = format!("k{:02}", i);
+        put_data_in_rocks(
+            key.as_bytes(),
+            b"val",
+            2,
+            1,
+            false,
+            &rocks_engine,
+            WriteType::Put,
+        );
+    }
+
+    engine.load_region(cache_region.clone()).unwrap();
+
+    // use a write batch to trigger scheduling of the pending region loading
+    // task.
+    let mut wb = engine.write_batch();
+    wb.prepare_for_region(&region);
+    wb.set_sequence_number(10).unwrap();
+    wb.put(b"zk00", b"val2").unwrap();
+    wb.put(b"zk10", b"val2").unwrap();
+    wb.put(b"zk20", b"val2").unwrap();
+    wb.write().unwrap();
+    assert_eq!(
+        engine
+            .core()
+            .region_manager()
+            .regions_map()
+            .read()
+            .region_meta(1)
+            .unwrap()
+            .get_state(),
+        RegionState::Loading
+    );
+
+    // wait for the task to start.
+    rx.recv().unwrap();
+
+    // split the source region into multiple new regions.
+    let new_regions = vec![
+        CacheRegion::new(1, 2, b"zk00", b"zk10"),
+        CacheRegion::new(2, 2, b"zk10", b"zk20"),
+        CacheRegion::new(3, 2, b"zk20", b"zk30"),
+    ];
+    let event = RegionEvent::Split {
+        source: cache_region.clone(),
+        new_regions: new_regions.clone(),
+    };
+    engine.on_region_event(event);
+    {
+        let regions_map = engine.core().region_manager().regions_map().read();
+        for i in 1..=3 {
+            assert_eq!(
+                regions_map.region_meta(i).unwrap().get_state(),
+                RegionState::Loading
+            );
+        }
+    }
+
+    // unblock batch loading.
+    fail::remove("ime_on_start_loading_region");
+
+    // all new regions should be active after the batch loading task finishes.
+    test_util::eventually(
+        Duration::from_millis(50),
+        Duration::from_millis(2000),
+        || {
+            let regions_map = engine.core().region_manager().regions_map().read();
+            (1..=3).all(|i| regions_map.region_meta(i).unwrap().get_state() == RegionState::Active)
+        },
+    );
+}
+
+#[test]
+fn test_cb_on_eviction() {
+    let mut config = InMemoryEngineConfig::config_for_test();
+    config.gc_run_interval = ReadableDuration(Duration::from_secs(1));
+    let engine = RegionCacheMemoryEngine::new(InMemoryEngineContext::new_for_tests(Arc::new(
+        VersionTrack::new(config),
+    )));
+
+    let region = new_region(1, b"", b"z");
+    let cache_region = CacheRegion::from_region(&region);
+    engine.new_region(region.clone());
+
+    let mut wb = engine.write_batch();
+    wb.prepare_for_region(&region);
+    wb.set_sequence_number(10).unwrap();
+    wb.put(b"a", b"val1").unwrap();
+    wb.put(b"b", b"val2").unwrap();
+    wb.put(b"c", b"val3").unwrap();
+    wb.write().unwrap();
+
+    fail::cfg("ime_on_delete_regions", "pause").unwrap();
+
+    let (tx, rx) = mpsc::channel(1);
+    engine.evict_region(
+        &cache_region,
+        EvictReason::BecomeFollower,
+        Some(Box::new(move || {
+            Box::pin(async move {
+                let _ = tx.send(()).await;
+            })
+        })),
+    );
+
+    let rt = tokio::runtime::Builder::new_current_thread()
+        .enable_all()
+        .build()
+        .unwrap();
+    let rx = Arc::new(Mutex::new(rx));
+    let rx_clone = rx.clone();
+    rt.block_on(async move {
+        timeout(Duration::from_secs(1), rx_clone.lock().await.recv())
+            .await
+            .unwrap_err()
+    });
+    fail::remove("ime_on_delete_regions");
+    rt.block_on(async move { rx.lock().await.recv().await.unwrap() });
+
+    {
+        let regions_map = engine.core().region_manager().regions_map().read();
+        assert!(regions_map.region_meta(1).is_none());
+    }
+}
diff --git a/components/online_config/Cargo.toml b/components/online_config/Cargo.toml
index d5b4bf7a76a..e91387129a3 100644
--- a/components/online_config/Cargo.toml
+++ b/components/online_config/Cargo.toml
@@ -11,5 +11,4 @@ online_config_derive = { path = "./online_config_derive" }
 serde = { version = "1.0", features = ["derive"] }
 
 [dev-dependencies]
-serde_derive = "1.0"
 toml = "0.5"
diff --git a/components/pd_client/Cargo.toml b/components/pd_client/Cargo.toml
index a5925a584b2..aae4549e215 100644
--- a/components/pd_client/Cargo.toml
+++ b/components/pd_client/Cargo.toml
@@ -17,7 +17,6 @@ futures = "0.3"
 grpcio = { workspace = true }
 kvproto = { workspace = true }
 lazy_static = "1.3"
-log = { version = "0.4", features = ["max_level_trace", "release_max_level_debug"] }
 log_wrappers = { workspace = true }
 prometheus = { version = "0.13", features = ["nightly"] }
 prometheus-static-metric = "0.5"
diff --git a/components/pd_client/src/lib.rs b/components/pd_client/src/lib.rs
index 76bb8c77216..39f80ba6dd6 100644
--- a/components/pd_client/src/lib.rs
+++ b/components/pd_client/src/lib.rs
@@ -52,6 +52,9 @@ pub struct RegionStat {
     pub read_bytes: u64,
     pub read_keys: u64,
     pub query_stats: QueryStats,
+    // For now, this info is not sent to PD (maybe it will be in the future).
+    // It is needed here so that it can be collected by the region collector.
+    pub cop_detail: RegionWriteCfCopDetail,
     pub approximate_size: u64,
     pub approximate_keys: u64,
     pub last_report_ts: UnixSecs,
@@ -564,3 +567,62 @@ fn check_update_service_safe_point_resp(
     }
     Ok(())
 }
+
+// Records the coprocessor details at the region level.
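+// For example, next = 99, prev = 0 and processed_keys = 9 give an MVCC
+// amplification of (99 + 0) / (9 + 1.0) = 9.9.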
+#[derive(Clone, Debug, Default)]
+pub struct RegionWriteCfCopDetail {
+    // How many times `next` is called when handling cop requests
+    pub next: usize,
+    // How many times `prev` is called when handling cop requests
+    pub prev: usize,
+    // How many keys are visible to the user
+    pub processed_keys: usize,
+}
+
+impl RegionWriteCfCopDetail {
+    pub fn new(next: usize, prev: usize, processed_keys: usize) -> Self {
+        Self {
+            next,
+            prev,
+            processed_keys,
+        }
+    }
+
+    pub fn add(&mut self, other: &RegionWriteCfCopDetail) {
+        self.next += other.next;
+        self.prev += other.prev;
+        self.processed_keys += other.processed_keys;
+    }
+
+    pub fn sub(&self, other: &RegionWriteCfCopDetail) -> Self {
+        Self::new(
+            self.next - other.next,
+            self.prev - other.prev,
+            self.processed_keys - other.processed_keys,
+        )
+    }
+
+    #[inline]
+    pub fn iterated_count(&self) -> usize {
+        self.next + self.prev
+    }
+
+    #[inline]
+    pub fn mvcc_amplification(&self) -> f64 {
+        // Sometimes, processed_keys is 0 even when (next + prev) is pretty
+        // high, hence the +1.0 in the denominator.
+        self.iterated_count() as f64 / (self.processed_keys as f64 + 1.0)
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use crate::RegionWriteCfCopDetail;
+
+    #[test]
+    fn test_processed_key_0() {
+        let mut cop_detail = RegionWriteCfCopDetail::default();
+        cop_detail.next = 11;
+
+        assert_eq!(cop_detail.mvcc_amplification(), 11.0);
+    }
+}
diff --git a/components/pd_client/src/util.rs b/components/pd_client/src/util.rs
index 058590dceff..1ac1d80321b 100644
--- a/components/pd_client/src/util.rs
+++ b/components/pd_client/src/util.rs
@@ -732,10 +732,11 @@ impl PdConnector {
 
     // There are 3 kinds of situations we will return the new client:
     // 1. the force is true which represents the client is newly created or the
-    //    original connection has some problem 2. the previous forwarded host is
-    //    not empty and it can connect the leader now which represents the network
-    //    partition problem to leader may be recovered 3. the member information of
-    //    PD has been changed
+    //    original connection has some problem.
+    // 2. the previous forwarded host is not empty and it can connect the leader
+    //    now which represents the network partition problem to leader may be
+    //    recovered.
+    // 3. the member information of PD has been changed.
     pub async fn reconnect_pd(
         &self,
         members_resp: GetMembersResponse,
@@ -952,6 +953,8 @@ pub fn check_resp_header(header: &ResponseHeader) -> Result<()> {
         ErrorType::Unknown => Err(box_err!(err.get_message())),
         ErrorType::InvalidValue => Err(box_err!(err.get_message())),
         ErrorType::GlobalConfigNotFound => panic!("unexpected error {:?}", err),
+        // It will not happen, because we don't call `batch_scan_regions` in TiKV.
+ ErrorType::RegionsNotContainAllKeyRange => Err(box_err!(err.get_message())), } } diff --git a/components/raft_log_engine/Cargo.toml b/components/raft_log_engine/Cargo.toml index 23ba11bb5f2..c1c48988f44 100644 --- a/components/raft_log_engine/Cargo.toml +++ b/components/raft_log_engine/Cargo.toml @@ -9,19 +9,13 @@ license = "Apache-2.0" failpoints = ["raft-engine/failpoints"] [dependencies] -codec = { workspace = true } encryption = { workspace = true } engine_traits = { workspace = true } +codec = { workspace = true } file_system = { workspace = true } kvproto = { workspace = true } -lazy_static = "1.4.0" -num_cpus = "1" -online_config = { workspace = true } -protobuf = "2" raft = { workspace = true } raft-engine = { workspace = true } -serde = "1.0" -serde_derive = "1.0" slog = { workspace = true } slog-global = { workspace = true } tikv_util = { workspace = true } diff --git a/components/raftstore-v2/Cargo.toml b/components/raftstore-v2/Cargo.toml index 7e3f66627b4..4affad396bd 100644 --- a/components/raftstore-v2/Cargo.toml +++ b/components/raftstore-v2/Cargo.toml @@ -37,7 +37,6 @@ engine_traits = { workspace = true } error_code = { workspace = true } fail = "0.5" file_system = { workspace = true } -fs2 = "0.4" futures = { version = "0.3", features = ["compat"] } health_controller = { workspace = true } keys = { workspace = true } @@ -55,7 +54,6 @@ resource_control = { workspace = true } resource_metering = { workspace = true } service = { workspace = true } slog = "2.3" -smallvec = "1.4" sst_importer = { workspace = true } thiserror = "1.0" tikv_util = { workspace = true } diff --git a/components/raftstore-v2/src/operation/command/admin/merge/prepare.rs b/components/raftstore-v2/src/operation/command/admin/merge/prepare.rs index 44580144dce..0aa44ea45db 100644 --- a/components/raftstore-v2/src/operation/command/admin/merge/prepare.rs +++ b/components/raftstore-v2/src/operation/command/admin/merge/prepare.rs @@ -324,7 +324,7 @@ impl Peer { } let Err(cmd) = SimpleWriteReqDecoder::new( |buf, index, term| parse_at(&self.logger, buf, index, term), - &self.logger, + Some(&self.logger), entry.get_data(), entry.get_index(), entry.get_term(), diff --git a/components/raftstore-v2/src/operation/command/mod.rs b/components/raftstore-v2/src/operation/command/mod.rs index 4103551041b..a8a3e5e0465 100644 --- a/components/raftstore-v2/src/operation/command/mod.rs +++ b/components/raftstore-v2/src/operation/command/mod.rs @@ -538,7 +538,7 @@ impl Apply { pub async fn apply_unsafe_write(&mut self, data: Box<[u8]>) { let decoder = match SimpleWriteReqDecoder::new( |buf, index, term| parse_at(&self.logger, buf, index, term), - &self.logger, + Some(&self.logger), &data, u64::MAX, u64::MAX, @@ -646,7 +646,7 @@ impl Apply { let req = match entry.get_entry_type() { EntryType::EntryNormal => match SimpleWriteReqDecoder::new( |buf, index, term| parse_at(&self.logger, buf, index, term), - &self.logger, + Some(&self.logger), entry.get_data(), log_index, entry.get_term(), diff --git a/components/raftstore-v2/src/operation/query/capture.rs b/components/raftstore-v2/src/operation/query/capture.rs index 868ed12ed32..4b6a8c24fc3 100644 --- a/components/raftstore-v2/src/operation/query/capture.rs +++ b/components/raftstore-v2/src/operation/query/capture.rs @@ -118,7 +118,7 @@ impl Apply { self.flush(); let (applied_index, _) = self.apply_progress(); let snap = RegionSnapshot::from_snapshot( - Arc::new(self.tablet().snapshot(None)), + Arc::new(self.tablet().snapshot()), Arc::new(self.region().clone()), ); 
snap.set_apply_index(applied_index); diff --git a/components/raftstore-v2/src/operation/query/local.rs b/components/raftstore-v2/src/operation/query/local.rs index dd540762a69..a474ff3e049 100644 --- a/components/raftstore-v2/src/operation/query/local.rs +++ b/components/raftstore-v2/src/operation/query/local.rs @@ -209,7 +209,7 @@ where ReadRequestPolicy::ReadLocal => { let region = Arc::clone(&delegate.region); let snap = RegionSnapshot::from_snapshot( - Arc::new(delegate.cached_tablet.cache().snapshot(None)), + Arc::new(delegate.cached_tablet.cache().snapshot()), region, ); @@ -240,7 +240,7 @@ where let region = Arc::clone(&delegate.region); let snap = RegionSnapshot::from_snapshot( - Arc::new(delegate.cached_tablet.cache().snapshot(None)), + Arc::new(delegate.cached_tablet.cache().snapshot()), region, ); @@ -264,7 +264,7 @@ where let region = Arc::clone(&delegate.region); let snap = RegionSnapshot::from_snapshot( - Arc::new(delegate.cached_tablet.cache().snapshot(None)), + Arc::new(delegate.cached_tablet.cache().snapshot()), region, ); diff --git a/components/raftstore-v2/src/worker/pd/region.rs b/components/raftstore-v2/src/worker/pd/region.rs index 7e74405dced..be207e1ab16 100644 --- a/components/raftstore-v2/src/worker/pd/region.rs +++ b/components/raftstore-v2/src/worker/pd/region.rs @@ -5,7 +5,9 @@ use std::{sync::Arc, time::Duration}; use collections::HashMap; use engine_traits::{KvEngine, RaftEngine}; use kvproto::{metapb, pdpb}; -use pd_client::{metrics::PD_HEARTBEAT_COUNTER_VEC, BucketStat, PdClient, RegionStat}; +use pd_client::{ + metrics::PD_HEARTBEAT_COUNTER_VEC, BucketStat, PdClient, RegionStat, RegionWriteCfCopDetail, +}; use raftstore::store::{ReadStats, WriteStats}; use resource_metering::RawRecords; use slog::{debug, error, info}; @@ -160,6 +162,7 @@ where read_bytes: read_bytes_delta, read_keys: read_keys_delta, query_stats: query_stats.0, + cop_detail: RegionWriteCfCopDetail::default(), approximate_size, approximate_keys, last_report_ts, diff --git a/components/raftstore-v2/src/worker/pd/store.rs b/components/raftstore-v2/src/worker/pd/store.rs index 926ad307cf0..226fef08d11 100644 --- a/components/raftstore-v2/src/worker/pd/store.rs +++ b/components/raftstore-v2/src/worker/pd/store.rs @@ -23,6 +23,7 @@ use slog::{error, info, warn}; use tikv_util::{ metrics::RecordPairVec, store::QueryStats, + sys::disk::get_disk_space_stats, time::{Duration, Instant as TiInstant, UnixSecs}, topn::TopN, }; @@ -442,7 +443,8 @@ where /// Returns (capacity, used, available). fn collect_engine_size(&self) -> Option<(u64, u64, u64)> { - let disk_stats = match fs2::statvfs(self.tablet_registry.tablet_root()) { + let (disk_cap, disk_avail) = match get_disk_space_stats(self.tablet_registry.tablet_root()) + { Err(e) => { error!( self.logger, @@ -452,9 +454,8 @@ where ); return None; } - Ok(stats) => stats, + Ok((total_size, available_size)) => (total_size, available_size), }; - let disk_cap = disk_stats.total_space(); let capacity = if self.cfg.value().capacity.0 == 0 { disk_cap } else { @@ -481,7 +482,7 @@ where let mut available = capacity.checked_sub(used_size).unwrap_or_default(); // We only care about rocksdb SST file size, so we should check disk available // here. 
- available = cmp::min(available, disk_stats.available_space()); + available = cmp::min(available, disk_avail); Some((capacity, used_size, available)) } } diff --git a/components/raftstore/Cargo.toml b/components/raftstore/Cargo.toml index 258c736d178..3f4cdd961d4 100644 --- a/components/raftstore/Cargo.toml +++ b/components/raftstore/Cargo.toml @@ -37,28 +37,21 @@ engine_traits = { workspace = true } error_code = { workspace = true } fail = "0.5" file_system = { workspace = true } -fs2 = "0.4" futures = "0.3" futures-util = { version = "0.3.1", default-features = false, features = [ "io", ] } getset = "0.1" -grpcio-health = { workspace = true } health_controller = { workspace = true } into_other = { workspace = true } itertools = "0.10" keys = { workspace = true } kvproto = { workspace = true } lazy_static = "1.3" -log = { version = "0.4", features = [ - "max_level_trace", - "release_max_level_debug", -] } log_wrappers = { workspace = true } memory_trace_macros = { workspace = true } online_config = { workspace = true } openssl = { workspace = true } -ordered-float = "2.6" parking_lot = "0.12" pd_client = { workspace = true } prometheus = { version = "0.13", features = ["nightly"] } @@ -78,7 +71,6 @@ slog-global = { workspace = true } smallvec = "1.4" sst_importer = { workspace = true } strum = { version = "0.20", features = ["derive"] } -strum_macros = "0.24" tempfile = "3.0" thiserror = "1.0" tidb_query_datatype = { workspace = true } @@ -96,7 +88,5 @@ encryption_export = { workspace = true } engine_panic = { workspace = true } engine_rocks = { workspace = true } engine_test = { workspace = true } -hybrid_engine = { workspace = true } panic_hook = { workspace = true } -range_cache_memory_engine = { workspace = true } test_sst_importer = { workspace = true } diff --git a/components/raftstore/src/coprocessor/config.rs b/components/raftstore/src/coprocessor/config.rs index 8abfe38bb51..7eb13210ee1 100644 --- a/components/raftstore/src/coprocessor/config.rs +++ b/components/raftstore/src/coprocessor/config.rs @@ -69,8 +69,10 @@ pub enum ConsistencyCheckMethod { Mvcc = 1, } -/// Default region split size. -pub const SPLIT_SIZE: ReadableSize = ReadableSize::mb(96); +/// Default region split size. In version < 8.3.0, the default split size is +/// 96MB. In version >= 8.3.0, the default split size is increased to 256MB to +/// allow for larger region size in TiKV. +pub const SPLIT_SIZE: ReadableSize = ReadableSize::mb(256); pub const RAFTSTORE_V2_SPLIT_SIZE: ReadableSize = ReadableSize::gb(10); /// Default batch split limit. 
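(Aside: a condensed, illustrative sketch of the `collect_engine_size` logic changed above; the function and parameter names are invented for the example, and the real code reads the tablet registry path and the store config.)

    use std::cmp;

    /// Illustrative only: returns (capacity, used, available) the way
    /// `collect_engine_size` computes them.
    fn engine_size(
        disk_total: u64,          // filesystem total, from get_disk_space_stats
        disk_avail: u64,          // filesystem available, from get_disk_space_stats
        configured_capacity: u64, // 0 means "use the whole disk"
        used_size: u64,           // TiKV's own data: tablets, snapshots, ...
    ) -> (u64, u64, u64) {
        let capacity = if configured_capacity == 0 {
            disk_total
        } else {
            // Never claim more capacity than the disk physically has.
            cmp::min(disk_total, configured_capacity)
        };
        let mut available = capacity.checked_sub(used_size).unwrap_or_default();
        // Only TiKV's files are counted in `used_size`, so clamp by the real
        // free space on the filesystem.
        available = cmp::min(available, disk_avail);
        (capacity, used_size, available)
    }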
diff --git a/components/raftstore/src/coprocessor/dispatcher.rs b/components/raftstore/src/coprocessor/dispatcher.rs index 7e25feff788..b350d971e2d 100644 --- a/components/raftstore/src/coprocessor/dispatcher.rs +++ b/components/raftstore/src/coprocessor/dispatcher.rs @@ -3,15 +3,16 @@ // #[PerformanceCriticalPath] called by Fsm on_ready_compute_hash use std::{borrow::Cow, marker::PhantomData, mem, ops::Deref}; -use engine_traits::{CfName, KvEngine}; +use engine_traits::{CfName, KvEngine, WriteBatch}; use kvproto::{ metapb::{Region, RegionEpoch}, pdpb::CheckPolicy, - raft_cmdpb::{ComputeHashRequest, RaftCmdRequest}, + raft_cmdpb::{CmdType, ComputeHashRequest, RaftCmdRequest}, raft_serverpb::RaftMessage, }; use protobuf::Message; use raft::eraftpb; +use read_write::WriteBatchObserver; use tikv_util::box_try; use super::{split_observer::SplitObserver, *}; @@ -289,12 +290,37 @@ impl_box_observer_g!( ConsistencyCheckObserver, WrappedConsistencyCheckObserver ); -impl_box_observer!(BoxMessageObserver, MessageObserver, WrappedMessageObserver); +impl_box_observer!( + BoxRaftMessageObserver, + RaftMessageObserver, + WrappedRaftMessageObserver +); +impl_box_observer!( + BoxExtraMessageObserver, + ExtraMessageObserver, + WrappedExtraMessageObserver +); impl_box_observer!( BoxRegionHeartbeatObserver, RegionHeartbeatObserver, WrappedRegionHeartbeatObserver ); +impl_box_observer!( + BoxWriteBatchObserver, + WriteBatchObserver, + WrappedBoxWriteBatchObserver +); +impl_box_observer!( + BoxSnapshotObserver, + SnapshotObserver, + WrappedBoxSnapshotObserver +); +impl_box_observer!( + BoxDestroyPeerObserver, + DestroyPeerObserver, + WrappedBoxDestroyPeerObserver +); + /// Registry contains all registered coprocessors. #[derive(Clone)] pub struct Registry @@ -312,8 +338,15 @@ where read_index_observers: Vec>, pd_task_observers: Vec>, update_safe_ts_observers: Vec>, - message_observers: Vec>, + raft_message_observers: Vec>, + extra_message_observers: Vec>, region_heartbeat_observers: Vec>, + destroy_peer_observers: Vec>, + // For now, `write_batch_observer` and `snapshot_observer` can only have one + // observer solely because of simplicity. However, it is possible to have + // multiple observers in the future if needed. 
+ write_batch_observer: Option, + snapshot_observer: Option, // TODO: add endpoint } @@ -331,8 +364,12 @@ impl Default for Registry { read_index_observers: Default::default(), pd_task_observers: Default::default(), update_safe_ts_observers: Default::default(), - message_observers: Default::default(), + raft_message_observers: Default::default(), + extra_message_observers: Default::default(), region_heartbeat_observers: Default::default(), + destroy_peer_observers: Default::default(), + write_batch_observer: None, + snapshot_observer: None, } } } @@ -402,8 +439,12 @@ impl Registry { push!(priority, qo, self.update_safe_ts_observers); } - pub fn register_message_observer(&mut self, priority: u32, qo: BoxMessageObserver) { - push!(priority, qo, self.message_observers); + pub fn register_raft_message_observer(&mut self, priority: u32, qo: BoxRaftMessageObserver) { + push!(priority, qo, self.raft_message_observers); + } + + pub fn register_extra_message_observer(&mut self, priority: u32, qo: BoxExtraMessageObserver) { + push!(priority, qo, self.extra_message_observers); } pub fn register_region_heartbeat_observer( @@ -413,6 +454,22 @@ impl Registry { ) { push!(priority, qo, self.region_heartbeat_observers); } + + pub fn register_destroy_peer_observer( + &mut self, + priority: u32, + destroy_peer_observer: BoxDestroyPeerObserver, + ) { + push!(priority, destroy_peer_observer, self.destroy_peer_observers); + } + + pub fn register_write_batch_observer(&mut self, write_batch_observer: BoxWriteBatchObserver) { + self.write_batch_observer = Some(write_batch_observer); + } + + pub fn register_snapshot_observer(&mut self, snapshot_observer: BoxSnapshotObserver) { + self.snapshot_observer = Some(snapshot_observer); + } } /// A macro that loops over all observers and returns early when error is found @@ -574,6 +631,21 @@ impl CoprocessorHost { } } + pub fn pre_delete_range(&self, start_key: &[u8], end_key: &[u8]) { + let region = Region::default(); + let mut ctx = ObserverContext::new(®ion); + for observer in &self.registry.query_observers { + let observer = observer.observer.inner(); + let mut request = Request::new(); + request.set_cmd_type(CmdType::DeleteRange); + request.mut_delete_range().set_start_key(start_key.to_vec()); + request.mut_delete_range().set_end_key(end_key.to_vec()); + if observer.pre_exec_query(&mut ctx, &[request], 0, 0) { + return; + } + } + } + // (index, term) is for the applying entry. pub fn pre_exec(&self, region: &Region, cmd: &RaftCmdRequest, index: u64, term: u64) -> bool { let mut ctx = ObserverContext::new(region); @@ -683,8 +755,22 @@ impl CoprocessorHost { ); } - pub fn pre_transfer_leader(&self, r: &Region, tr: &TransferLeaderRequest) -> Result<()> { - try_loop_ob!(r, &self.registry.admin_observers, pre_transfer_leader, tr) + pub fn pre_transfer_leader( + &self, + r: &Region, + tr: &TransferLeaderRequest, + ) -> Result> { + let mut ctx = ObserverContext::new(r); + let mut msgs = vec![]; + for o in &self.registry.admin_observers { + if let Some(msg) = (o.observer).inner().pre_transfer_leader(&mut ctx, tr)? { + msgs.push(msg); + } + if ctx.bypass { + break; + } + } + Ok(msgs) } pub fn post_apply_snapshot( @@ -828,9 +914,16 @@ impl CoprocessorHost { true } + pub fn on_extra_message(&self, r: &Region, msg: &ExtraMessage) { + for observer in &self.registry.extra_message_observers { + let observer = observer.observer.inner(); + observer.on_extra_message(r, msg); + } + } + /// Returns false if the message should not be stepped later. 
pub fn on_raft_message(&self, msg: &RaftMessage) -> bool { - for observer in &self.registry.message_observers { + for observer in &self.registry.raft_message_observers { let observer = observer.observer.inner(); if !observer.on_raft_message(msg) { return false; @@ -886,6 +979,38 @@ impl CoprocessorHost { } } + pub fn on_create_apply_write_batch(&self, wb: WB) -> WriteBatchWrapper { + let observable_wb = self + .registry + .write_batch_observer + .as_ref() + .map(|observer| observer.inner().create_observable_write_batch()); + WriteBatchWrapper::new(wb, observable_wb) + } + + pub fn on_destroy_peer(&self, region: &Region) { + if self.registry.destroy_peer_observers.is_empty() { + return; + } + + for observer in &self.registry.destroy_peer_observers { + let observer = observer.observer.inner(); + observer.on_destroy_peer(region); + } + } + + pub fn on_snapshot( + &self, + region: &Region, + read_ts: u64, + seqno: u64, + ) -> Option> { + self.registry + .snapshot_observer + .as_ref() + .map(move |observer| observer.inner().on_snapshot(region, read_ts, seqno)) + } + pub fn shutdown(&self) { for entry in &self.registry.admin_observers { entry.observer.inner().stop(); @@ -1200,7 +1325,7 @@ mod tests { } } - impl MessageObserver for TestCoprocessor { + impl RaftMessageObserver for TestCoprocessor { fn on_raft_message(&self, _: &RaftMessage) -> bool { self.called .fetch_add(ObserverIndex::OnRaftMessage as usize, Ordering::SeqCst); @@ -1245,7 +1370,7 @@ mod tests { host.registry .register_update_safe_ts_observer(1, BoxUpdateSafeTsObserver::new(ob.clone())); host.registry - .register_message_observer(1, BoxMessageObserver::new(ob.clone())); + .register_raft_message_observer(1, BoxRaftMessageObserver::new(ob.clone())); let mut index: usize = 0; let region = Region::default(); @@ -1276,7 +1401,7 @@ mod tests { index += ObserverIndex::PostApplyQuery as usize; assert_all!([&ob.called], &[index]); - host.on_role_change(®ion, RoleChange::new(StateRole::Leader)); + host.on_role_change(®ion, RoleChange::new_for_test(StateRole::Leader)); index += ObserverIndex::OnRoleChange as usize; assert_all!([&ob.called], &[index]); diff --git a/components/raftstore/src/coprocessor/mod.rs b/components/raftstore/src/coprocessor/mod.rs index c62ae409470..b40d5294610 100644 --- a/components/raftstore/src/coprocessor/mod.rs +++ b/components/raftstore/src/coprocessor/mod.rs @@ -17,7 +17,7 @@ use kvproto::{ AdminRequest, AdminResponse, RaftCmdRequest, RaftCmdResponse, Request, TransferLeaderRequest, }, - raft_serverpb::RaftApplyState, + raft_serverpb::{ExtraMessage, RaftApplyState}, }; use pd_client::RegionStat; use raft::{eraftpb, StateRole}; @@ -31,17 +31,22 @@ pub mod region_info_accessor; mod split_check; pub mod split_observer; use kvproto::raft_serverpb::RaftMessage; +mod read_write; pub use self::{ config::{Config, ConsistencyCheckMethod}, consistency_check::{ConsistencyCheckObserver, Raw as RawConsistencyCheckObserver}, dispatcher::{ BoxAdminObserver, BoxApplySnapshotObserver, BoxCmdObserver, BoxConsistencyCheckObserver, - BoxMessageObserver, BoxPdTaskObserver, BoxQueryObserver, BoxRegionChangeObserver, + BoxPdTaskObserver, BoxQueryObserver, BoxRaftMessageObserver, BoxRegionChangeObserver, BoxRoleObserver, BoxSplitCheckObserver, BoxUpdateSafeTsObserver, CoprocessorHost, Registry, StoreHandle, }, error::{Error, Result}, + read_write::{ + ObservableWriteBatch, ObservedSnapshot, SnapshotObserver, WriteBatchObserver, + WriteBatchWrapper, + }, region_info_accessor::{ Callback as RegionInfoCallback, RangeKey, RegionCollector, 
RegionInfo, RegionInfoAccessor, RegionInfoProvider, SeekRegionCallback, @@ -87,6 +92,7 @@ pub struct RegionState { pub peer_id: u64, pub pending_remove: bool, pub modified_region: Option, + pub new_regions: Vec, } /// Context for exec observers of mutation to be applied to ApplyContext. @@ -139,8 +145,8 @@ pub trait AdminObserver: Coprocessor { &self, _ctx: &mut ObserverContext<'_>, _tr: &TransferLeaderRequest, - ) -> Result<()> { - Ok(()) + ) -> Result> { + Ok(None) } } @@ -289,8 +295,7 @@ pub struct RoleChange { } impl RoleChange { - #[cfg(any(test, feature = "testexport"))] - pub fn new(state: StateRole) -> Self { + pub fn new_for_test(state: StateRole) -> Self { RoleChange { state, leader_id: raft::INVALID_ID, @@ -356,13 +361,18 @@ pub trait RegionHeartbeatObserver: Coprocessor { fn on_region_heartbeat(&self, _: &mut ObserverContext<'_>, _: &RegionStat) {} } -pub trait MessageObserver: Coprocessor { +pub trait RaftMessageObserver: Coprocessor { /// Returns false if the message should not be stepped later. fn on_raft_message(&self, _: &RaftMessage) -> bool { true } } +// +pub trait ExtraMessageObserver: Coprocessor { + fn on_extra_message(&self, _: &Region, _: &ExtraMessage) {} +} + #[derive(Clone, Debug, Default)] pub struct Cmd { pub index: u64, @@ -600,6 +610,11 @@ pub trait UpdateSafeTsObserver: Coprocessor { fn on_update_safe_ts(&self, _: u64, _: u64, _: u64) {} } +pub trait DestroyPeerObserver: Coprocessor { + /// Hook to call when destroying a peer. + fn on_destroy_peer(&self, _: &Region) {} +} + #[cfg(test)] mod tests { use super::*; diff --git a/components/raftstore/src/coprocessor/read_write/mod.rs b/components/raftstore/src/coprocessor/read_write/mod.rs new file mode 100644 index 00000000000..8c55177f01f --- /dev/null +++ b/components/raftstore/src/coprocessor/read_write/mod.rs @@ -0,0 +1,7 @@ +// Copyright 2024 TiKV Project Authors. Licensed under Apache-2.0. + +mod snapshot; +mod write_batch; + +pub use snapshot::*; +pub use write_batch::*; diff --git a/components/raftstore/src/coprocessor/read_write/snapshot.rs b/components/raftstore/src/coprocessor/read_write/snapshot.rs new file mode 100644 index 00000000000..b65a98680b4 --- /dev/null +++ b/components/raftstore/src/coprocessor/read_write/snapshot.rs @@ -0,0 +1,21 @@ +// Copyright 2024 TiKV Project Authors. Licensed under Apache-2.0. + +use std::any::Any; + +use kvproto::metapb::Region; + +/// ObservedSnapshot is a trait that represents data that are observed during +/// taking snapshot. +/// It inherits from Any to allow downcasting to concrete types. +pub trait ObservedSnapshot: Any + Send + Sync {} + +/// SnapshotObserver is a trait that observes the snapshot process. +pub trait SnapshotObserver: Send { + /// on_snapshot is called when raftstore is taking RegionSnapshot. + fn on_snapshot( + &self, + region: &Region, + read_ts: u64, + sequence_number: u64, + ) -> Box; +} diff --git a/components/raftstore/src/coprocessor/read_write/write_batch.rs b/components/raftstore/src/coprocessor/read_write/write_batch.rs new file mode 100644 index 00000000000..72c98523086 --- /dev/null +++ b/components/raftstore/src/coprocessor/read_write/write_batch.rs @@ -0,0 +1,177 @@ +// Copyright 2024 TiKV Project Authors. Licensed under Apache-2.0. 
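+//! Observation hooks around an `engine_traits::WriteBatch`:
+//! `WriteBatchWrapper` pairs the engine's write batch with an optional
+//! `ObservableWriteBatch` supplied by a `WriteBatchObserver`, and mirrors
+//! every put, delete, delete-range and save-point operation into it.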
+ +use std::sync::atomic::{AtomicBool, Ordering}; + +use engine_traits::{Mutable, Result, WriteBatch, WriteOptions, CF_DEFAULT}; +use kvproto::metapb; + +pub trait WriteBatchObserver: Send { + fn create_observable_write_batch(&self) -> Box<dyn ObservableWriteBatch>; +} + +/// It observes write operations of an `engine_traits::WriteBatch`, and provides +/// additional methods to specify which region the write operations belong to. +// TODO: Maybe we can unify it with `CmdObserver`? +pub trait ObservableWriteBatch: WriteBatch + Send { + /// It declares that the following consecutive write will be within this + /// region. + fn prepare_for_region(&mut self, region: &metapb::Region); + /// Commit the WriteBatch with the given options and sequence number. + fn write_opt_seq(&mut self, opts: &WriteOptions, seq_num: u64); + /// It is called after a write operation is finished. + fn post_write(&mut self); +} + +pub struct WriteBatchWrapper<WB: WriteBatch> { + write_batch: WB, + observable_write_batch: Option<Box<dyn ObservableWriteBatch>>, +} + +impl<WB: WriteBatch> WriteBatchWrapper<WB> { + pub fn new( + write_batch: WB, + observable_write_batch: Option<Box<dyn ObservableWriteBatch>>, + ) -> Self { + Self { + write_batch, + observable_write_batch, + } + } + + pub fn prepare_for_region(&mut self, region: &metapb::Region) { + if let Some(w) = self.observable_write_batch.as_mut() { + w.prepare_for_region(region) + } + } +} + +impl<WB: WriteBatch> WriteBatch for WriteBatchWrapper<WB> { + fn write(&mut self) -> Result<u64> { + self.write_opt(&WriteOptions::default()) + } + + fn write_opt(&mut self, opts: &WriteOptions) -> Result<u64> { + self.write_callback_opt(opts, |_| ()) + } + + fn write_callback_opt(&mut self, opts: &WriteOptions, mut cb: impl FnMut(u64)) -> Result<u64> { + let called = AtomicBool::new(false); + let res = self.write_batch.write_callback_opt(opts, |s| { + if !called.fetch_or(true, Ordering::SeqCst) { + if let Some(w) = self.observable_write_batch.as_mut() { + w.write_opt_seq(opts, s); + } + } + cb(s); + }); + if let Some(w) = self.observable_write_batch.as_mut() { + w.post_write(); + } + res + } + + fn data_size(&self) -> usize { + self.write_batch.data_size() + } + + fn count(&self) -> usize { + self.write_batch.count() + } + + fn is_empty(&self) -> bool { + self.write_batch.is_empty() + } + + fn should_write_to_engine(&self) -> bool { + self.write_batch.should_write_to_engine() + } + + fn clear(&mut self) { + if let Some(w) = self.observable_write_batch.as_mut() { + w.clear() + } + self.write_batch.clear(); + } + + fn set_save_point(&mut self) { + if let Some(w) = self.observable_write_batch.as_mut() { + w.set_save_point() + } + self.write_batch.set_save_point() + } + + fn pop_save_point(&mut self) -> Result<()> { + if let Some(w) = self.observable_write_batch.as_mut() { + w.pop_save_point()?; + } + self.write_batch.pop_save_point() + } + + fn rollback_to_save_point(&mut self) -> Result<()> { + if let Some(w) = self.observable_write_batch.as_mut() { + w.rollback_to_save_point()?; + } + self.write_batch.rollback_to_save_point() + } + + fn merge(&mut self, _: Self) -> Result<()> { + unimplemented!("WriteBatchWrapper does not support merge") + } +} + +impl<WB: WriteBatch> Mutable for WriteBatchWrapper<WB> { + fn put(&mut self, key: &[u8], value: &[u8]) -> Result<()> { + if let Some(w) = self.observable_write_batch.as_mut() { + w.put_cf(CF_DEFAULT, key, value)?; + } + self.write_batch.put(key, value) + } + + fn put_cf(&mut self, cf: &str, key: &[u8], value: &[u8]) -> Result<()> { + if let Some(w) = self.observable_write_batch.as_mut() { + w.put_cf(cf, key, value)?; + } + self.write_batch.put_cf(cf, key, value) + } + + fn delete(&mut self, key: &[u8]) ->
Result<()> { + if let Some(w) = self.observable_write_batch.as_mut() { + w.delete_cf(CF_DEFAULT, key)?; + } + self.write_batch.delete(key) + } + + fn delete_cf(&mut self, cf: &str, key: &[u8]) -> Result<()> { + if let Some(w) = self.observable_write_batch.as_mut() { + w.delete_cf(cf, key)?; + } + self.write_batch.delete_cf(cf, key) + } + + fn delete_range(&mut self, begin_key: &[u8], end_key: &[u8]) -> Result<()> { + if let Some(w) = self.observable_write_batch.as_mut() { + w.delete_range_cf(CF_DEFAULT, begin_key, end_key)?; + } + self.write_batch.delete_range(begin_key, end_key) + } + + fn delete_range_cf(&mut self, cf: &str, begin_key: &[u8], end_key: &[u8]) -> Result<()> { + if let Some(w) = self.observable_write_batch.as_mut() { + w.delete_range_cf(cf, begin_key, end_key)?; + } + self.write_batch.delete_range_cf(cf, begin_key, end_key) + } + + // Override the default methods `put_msg` and `put_msg_cf` to prevent + // potential loss of put observations if WB also overrides them. + fn put_msg(&mut self, key: &[u8], m: &M) -> Result<()> { + // It's okay to call `self.put` even though it does not strictly + // follow the `put_msg` semantics, as there are no implementors + // that override it. + self.put(key, &m.write_to_bytes()?) + } + fn put_msg_cf(&mut self, cf: &str, key: &[u8], m: &M) -> Result<()> { + // See put_msg. + self.put_cf(cf, key, &m.write_to_bytes()?) + } +} diff --git a/components/raftstore/src/coprocessor/region_info_accessor.rs b/components/raftstore/src/coprocessor/region_info_accessor.rs index 81032a87230..760cf318d95 100644 --- a/components/raftstore/src/coprocessor/region_info_accessor.rs +++ b/components/raftstore/src/coprocessor/region_info_accessor.rs @@ -1,7 +1,6 @@ // Copyright 2018 TiKV Project Authors. Licensed under Apache-2.0. use std::{ - cmp::Ordering, collections::{ BTreeMap, Bound::{Excluded, Unbounded}, @@ -29,6 +28,9 @@ use super::{ RegionHeartbeatObserver, Result, RoleChange, RoleObserver, }; +// TODO(SpadeA): this 100 may be adjusted by observing more workloads. +const ITERATED_COUNT_FILTER_FACTOR: usize = 100; + /// `RegionInfoAccessor` is used to collect all regions' information on this /// TiKV into a collection so that other parts of TiKV can get region /// information from it. It registers a observer to raftstore, which is named @@ -167,6 +169,10 @@ pub enum RegionInfoQuery { count: usize, callback: Callback, }, + GetRegionsStat { + region_ids: Vec, + callback: Callback>, + }, /// Gets all contents from the collection. Only used for testing. DebugDump(mpsc::Sender<(RegionsMap, RegionRangesMap)>), } @@ -192,6 +198,9 @@ impl Display for RegionInfoQuery { RegionInfoQuery::GetTopRegions { count, .. } => { write!(f, "GetTopRegions(count: {})", count) } + RegionInfoQuery::GetRegionsStat { region_ids, .. } => { + write!(f, "GetRegionsActivity(region_ids: {:?})", region_ids) + } RegionInfoQuery::DebugDump(_) => write!(f, "DebugDump"), } } @@ -297,15 +306,21 @@ pub struct RegionCollector { // together in a struct exposing add, delete, and get_top_regions methods. 
region_activity: RegionActivityMap, region_leaders: Arc>>, + // It is calculated as '(next + prev) / processed_keys' + mvcc_amplification_threshold: Box usize + Send>, } impl RegionCollector { - pub fn new(region_leaders: Arc>>) -> Self { + pub fn new( + region_leaders: Arc>>, + mvcc_amplification_threshold: Box usize + Send>, + ) -> Self { Self { region_leaders, regions: HashMap::default(), region_activity: HashMap::default(), region_ranges: BTreeMap::default(), + mvcc_amplification_threshold, } } @@ -569,58 +584,138 @@ impl RegionCollector { /// /// Otherwise, return the top `count` regions for which this node is the /// leader from `self.region_activity`. Top regions are determined by - /// comparing `read_keys` in each region's most recent + /// comparing `next + prev` in each region's most recent /// region stat. /// /// Note: this function is `O(N log(N))` with respect to size of /// region_activity. This is acceptable, as region_activity is populated /// by heartbeats for this node's region, so N cannot be greater than /// approximately `300_000``. - pub fn handle_get_top_regions(&mut self, count: usize, callback: Callback) { + pub fn handle_get_top_regions(&self, count: usize, callback: Callback) { let compare_fn = |a: &RegionActivity, b: &RegionActivity| { - let a = a.region_stat.read_keys; - let b = b.region_stat.read_keys; + let a = a.region_stat.cop_detail.iterated_count(); + let b = b.region_stat.cop_detail.iterated_count(); b.cmp(&a) }; - let top_regions = if count == 0 { - self.regions - .values() - .map(|ri| { - ( - ri.region.clone(), - self.region_activity.get(&ri.region.get_id()), - ) - }) - .sorted_by(|(_, a), (_, b)| match (a, b) { - (None, None) => Ordering::Equal, - (None, Some(_)) => Ordering::Greater, - (Some(_), None) => Ordering::Less, - (Some(a), Some(b)) => compare_fn(a, b), - }) - .map(|(r, ra)| { - ( - r, - ra.map(|ra| ra.region_stat.approximate_size) - .unwrap_or_default(), + + // Only used to log. 
+ let mut max_qps = 0; + let mut top_regions = self + .region_activity + .iter() + .filter_map(|(id, ac)| { + max_qps = u64::max(ac.region_stat.query_stats.coprocessor, max_qps); + self.regions + .get(id) + .filter(|ri| { + ri.role == StateRole::Leader + && ac.region_stat.cop_detail.iterated_count() != 0 + && !ri.region.is_in_flashback + }) + .map(|ri| (ri, ac)) + }) + .sorted_by(|(_, activity_0), (_, activity_1)| compare_fn(activity_0, activity_1)) + .take(count) + .map(|(ri, ac)| (ri.region.clone(), ac.region_stat.clone())) + .collect::>(); + + // TODO(SpadeA): remove it when auto load/evict is stable + { + let debug: Vec<_> = top_regions + .iter() + .map(|(r, s)| { + format!( + "region_id={}, read_keys={}, cop={}, cop_detail={:?}, mvcc_amplification={}", + r.get_id(), + s.read_keys, + s.query_stats.coprocessor, + s.cop_detail, + s.cop_detail.mvcc_amplification(), ) }) - .collect::>() + .collect_vec(); + + info!( + "ime get top k regions before filter"; + "count" => count, + "max_qps" => max_qps, + "regions" => ?debug, + ); + } + + // Get the average iterated count of the first top 10 regions and use the + // 1/ITERATED_COUNT_FILTER_FACTOR of it to filter regions with less read + // flows + let top_regions_iterated_count: Vec<_> = top_regions + .iter() + .map(|(_, r)| r.cop_detail.iterated_count()) + .take(10) + .collect(); + let iterated_count_to_filter: usize = if !top_regions_iterated_count.is_empty() { + top_regions_iterated_count.iter().sum::() + / top_regions_iterated_count.len() + / ITERATED_COUNT_FILTER_FACTOR } else { - let count = usize::max(count, self.region_activity.len()); - self.region_activity + 0 + }; + top_regions.retain(|(_, s)| { + s.cop_detail.iterated_count() >= iterated_count_to_filter + // plus processed_keys by 1 to make it not 0 + && s.cop_detail.mvcc_amplification() + >= (self.mvcc_amplification_threshold)() as f64 + }); + + // TODO(SpadeA): remove it when auto load/evict is stable + { + let debug: Vec<_> = top_regions .iter() - .filter_map(|(id, ac)| { - self.regions - .get(id) - .filter(|ri| ri.role == StateRole::Leader) - .map(|ri| (ri, ac)) + .map(|(r, s)| { + format!( + "region_id={}, read_keys={}, cop={}, cop_detail={:?}, mvcc_amplification={}", + r.get_id(), + s.read_keys, + s.query_stats.coprocessor, + s.cop_detail, + s.cop_detail.mvcc_amplification(), + ) }) - .sorted_by(|(_, activity_0), (_, activity_1)| compare_fn(activity_0, activity_1)) - .take(count) - .map(|(ri, ac)| (ri.region.clone(), ac.region_stat.approximate_size)) - .collect::>() - }; - callback(top_regions) + .collect_vec(); + + info!( + "ime get top k regions after filter"; + "count" => count, + "read_count" => debug.len(), + "max_qps" => max_qps, + "regions" => ?debug, + ); + } + + callback( + top_regions + .into_iter() + .map(|(r, stat)| (r, stat.clone())) + .collect_vec(), + ) + } + + fn handle_get_regions_stat( + &self, + region_ids: Vec, + callback: Callback>, + ) { + callback( + region_ids + .into_iter() + .filter_map(|id| { + self.region_activity.get(&id).map(|r| { + ( + self.regions.get(&id).unwrap().region.clone(), + r.region_stat.clone(), + ) + }) + }) + .collect_vec(), + ) } fn handle_raftstore_event(&mut self, event: RaftStoreEvent) { @@ -703,6 +798,12 @@ impl Runnable for RegionCollector { RegionInfoQuery::GetTopRegions { count, callback } => { self.handle_get_top_regions(count, callback); } + RegionInfoQuery::GetRegionsStat { + region_ids, + callback, + } => { + self.handle_get_regions_stat(region_ids, callback); + } RegionInfoQuery::DebugDump(tx) => { 
tx.send((self.regions.clone(), self.region_ranges.clone())) .unwrap(); @@ -768,12 +869,13 @@ impl RegionInfoAccessor { pub fn new( host: &mut CoprocessorHost, region_stats_manager_enabled_cb: RegionStatsManagerEnabledCb, + mvcc_amplification_threshold: Box usize + Send>, ) -> Self { let region_leaders = Arc::new(RwLock::new(HashSet::default())); let worker = WorkerBuilder::new("region-collector-worker").create(); let scheduler = worker.start_with_timer( "region-collector-worker", - RegionCollector::new(region_leaders.clone()), + RegionCollector::new(region_leaders.clone(), mvcc_amplification_threshold), ); register_region_event_listener(host, scheduler.clone(), region_stats_manager_enabled_cb); @@ -803,10 +905,15 @@ impl RegionInfoAccessor { .unwrap(); rx.recv().unwrap() } + + #[cfg(any(test, feature = "testexport"))] + pub fn scheduler(&self) -> &Scheduler { + &self.scheduler + } } /// Top regions result: region and its approximate size. -pub type TopRegions = Vec<(Region, u64)>; +pub type TopRegions = Vec<(Region, RegionStat)>; pub trait RegionInfoProvider: Send + Sync { /// Get a iterator of regions that contains `from` or have keys larger than @@ -817,7 +924,7 @@ pub trait RegionInfoProvider: Send + Sync { fn find_region_by_id( &self, - _reigon_id: u64, + _region_id: u64, _callback: Callback>, ) -> Result<()> { unimplemented!() @@ -830,7 +937,12 @@ pub trait RegionInfoProvider: Send + Sync { fn get_regions_in_range(&self, _start_key: &[u8], _end_key: &[u8]) -> Result> { unimplemented!() } - fn get_top_regions(&self, _count: Option) -> Result { + + fn get_top_regions(&self, _count: NonZeroUsize) -> Result { + unimplemented!() + } + + fn get_regions_stat(&self, _: Vec) -> Result> { unimplemented!() } } @@ -905,10 +1017,10 @@ impl RegionInfoProvider for RegionInfoAccessor { }) }) } - fn get_top_regions(&self, count: Option) -> Result { + fn get_top_regions(&self, count: NonZeroUsize) -> Result { let (tx, rx) = mpsc::channel(); let msg = RegionInfoQuery::GetTopRegions { - count: count.map_or_else(|| 0, usize::from), + count: usize::from(count), callback: Box::new(move |regions| { if let Err(e) = tx.send(regions) { warn!("failed to send get_top_regions result: {:?}", e); @@ -927,6 +1039,29 @@ impl RegionInfoProvider for RegionInfoAccessor { }) }) } + + fn get_regions_stat(&self, region_ids: Vec) -> Result> { + let (tx, rx) = mpsc::channel(); + let msg = RegionInfoQuery::GetRegionsStat { + region_ids, + callback: Box::new(move |regions_activity| { + if let Err(e) = tx.send(regions_activity) { + warn!("failed to send get_regions_activity result: {:?}", e); + } + }), + }; + self.scheduler + .schedule(msg) + .map_err(|e| box_err!("failed to send request to region collector: {:?}", e)) + .and_then(|_| { + rx.recv().map_err(|e| { + box_err!( + "failed to receive get_regions_activity result from region_collector: {:?}", + e + ) + }) + }) + } } // Use in tests only. 
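(Aside: the two queries compose naturally; a minimal illustrative sketch, where the helper name is invented and `Result` is this module's coprocessor `Result`.)

    use std::num::NonZeroUsize;

    // Fetch the hottest regions first, then ask for fresh stats on exactly
    // those region ids.
    fn log_hot_region_stats<P: RegionInfoProvider>(provider: &P) -> Result<()> {
        let top = provider.get_top_regions(NonZeroUsize::new(10).unwrap())?;
        let ids: Vec<u64> = top.iter().map(|(region, _)| region.get_id()).collect();
        for (region, stat) in provider.get_regions_stat(ids)? {
            info!(
                "hot region";
                "region_id" => region.get_id(),
                "iterated_count" => stat.cop_detail.iterated_count(),
                "mvcc_amplification" => stat.cop_detail.mvcc_amplification()
            );
        }
        Ok(())
    }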
@@ -1006,7 +1141,7 @@ impl RegionInfoProvider for MockRegionInfoProvider { .ok_or(box_err!("Not found region containing {:?}", key)) } - fn get_top_regions(&self, _count: Option) -> Result { + fn get_top_regions(&self, _count: NonZeroUsize) -> Result { let mut regions = Vec::new(); let (tx, rx) = mpsc::channel(); @@ -1014,7 +1149,8 @@ impl RegionInfoProvider for MockRegionInfoProvider { b"", Box::new(move |iter| { for region_info in iter { - tx.send((region_info.region.clone(), 0)).unwrap(); + tx.send((region_info.region.clone(), RegionStat::default())) + .unwrap(); } }), )?; @@ -1028,12 +1164,14 @@ impl RegionInfoProvider for MockRegionInfoProvider { #[cfg(test)] mod tests { + use kvproto::metapb::RegionEpoch; + use pd_client::RegionWriteCfCopDetail; use txn_types::Key; use super::*; fn new_region_collector() -> RegionCollector { - RegionCollector::new(Arc::new(RwLock::new(HashSet::default()))) + RegionCollector::new(Arc::new(RwLock::new(HashSet::default())), Box::new(|| 0)) } fn new_region(id: u64, start_key: &[u8], end_key: &[u8], version: u64) -> Region { @@ -1673,4 +1811,101 @@ mod tests { ) .unwrap(); } + + #[test] + fn test_get_top_regions() { + let mut collector = + RegionCollector::new(Arc::new(RwLock::new(HashSet::default())), Box::new(|| 10)); + + let test_set = vec![ + // mvcc amp 5000 + (1, b"".to_vec(), b"k10".to_vec(), 1_000_000, 0, 200 - 1), + // mvcc amp 5 + ( + 2, + b"k10".to_vec(), + b"k20".to_vec(), + 1_000_000, + 0, + 2_000_000 - 1, + ), + // mvcc amp 50, filtered by mvcc amp + (3, b"k20".to_vec(), b"k30".to_vec(), 0, 100_000, 2_000 - 1), + // mvcc amp 100 + ( + 4, + b"k30".to_vec(), + b"k40".to_vec(), + 100_000, + 100_000, + 2_000 - 1, + ), + // mvcc amp 1000, filtered by next + prev + (5, b"k40".to_vec(), b"k50".to_vec(), 1000, 0, 0), + ]; + + let mut region1 = None; + let mut region4 = None; + for (id, start, end, next, prev, processed_keys) in test_set { + let mut region = Region::default(); + region.set_id(id); + region.set_start_key(start); + region.set_end_key(end); + let mut epoch = RegionEpoch::new(); + epoch.set_version(10); + region.set_region_epoch(epoch); + if id == 1 { + region1 = Some(region.clone()); + } else if id == 4 { + region4 = Some(region.clone()); + } + + collector.handle_raftstore_event(RaftStoreEvent::CreateRegion { + region: region.clone(), + role: StateRole::Leader, + }); + let mut stat = RegionStat::default(); + stat.cop_detail = RegionWriteCfCopDetail::new(next, prev, processed_keys); + collector.handle_raftstore_event(RaftStoreEvent::UpdateRegionActivity { + region, + activity: RegionActivity { region_stat: stat }, + }); + } + + let (tx, rx) = mpsc::channel(); + let cb = Box::new(move |regions| { + tx.send(regions).unwrap(); + }); + + collector.handle_get_top_regions(4, cb.clone()); + let regions = rx + .recv() + .unwrap() + .into_iter() + .map(|(r, _)| r.id) + .collect::>(); + assert_eq!(regions, vec![1, 4, 3]); + + let mut region1 = region1.unwrap(); + region1.set_is_in_flashback(true); + collector.handle_raftstore_event(RaftStoreEvent::UpdateRegion { + region: region1, + role: StateRole::Leader, + }); + + collector.handle_raftstore_event(RaftStoreEvent::RoleChange { + region: region4.unwrap(), + role: StateRole::Follower, + initialized: true, + }); + + collector.handle_get_top_regions(4, cb); + let regions = rx + .recv() + .unwrap() + .into_iter() + .map(|(r, _)| r.id) + .collect::>(); + assert_eq!(vec![3], regions); + } } diff --git a/components/raftstore/src/router.rs b/components/raftstore/src/router.rs index 
452616caf7e..afa8b831ba2 100644 --- a/components/raftstore/src/router.rs +++ b/components/raftstore/src/router.rs @@ -7,7 +7,7 @@ use std::{ // #[PerformanceCriticalPath] use crossbeam::channel::TrySendError; -use engine_traits::{KvEngine, RaftEngine, Snapshot, SnapshotContext}; +use engine_traits::{KvEngine, RaftEngine, Snapshot}; use error_code::ErrorCodeExt; use kvproto::{metapb, raft_cmdpb::RaftCmdRequest, raft_serverpb::RaftMessage}; use raft::SnapshotStatus; @@ -92,7 +92,10 @@ where /// Report a `StoreResolved` event to all Raft groups. fn report_resolved(&self, store_id: u64, group_id: u64) { self.broadcast_normal(|| { - PeerMsg::SignificantMsg(SignificantMsg::StoreResolved { store_id, group_id }) + PeerMsg::SignificantMsg(Box::new(SignificantMsg::StoreResolved { + store_id, + group_id, + })) }) } } @@ -115,14 +118,24 @@ where .map_err(|e| handle_send_error(region_id, e)) } +pub struct ReadContext { + pub(crate) read_id: Option, + pub(crate) read_ts: Option, +} + +impl ReadContext { + pub fn new(read_id: Option, read_ts: Option) -> Self { + ReadContext { read_id, read_ts } + } +} + pub trait LocalReadRouter: Send + Clone where EK: KvEngine, { fn read( &mut self, - snap_ctx: Option, - read_id: Option, + ctx: ReadContext, req: RaftCmdRequest, cb: Callback, ) -> RaftStoreResult<()>; @@ -252,12 +265,11 @@ impl RaftStoreRouter for ServerRaftStoreRouter impl LocalReadRouter for ServerRaftStoreRouter { fn read( &mut self, - snap_ctx: Option, - read_id: Option, + ctx: ReadContext, req: RaftCmdRequest, cb: Callback, ) -> RaftStoreResult<()> { - self.local_reader.read(snap_ctx, read_id, req, cb); + self.local_reader.read(ctx, req, cb); Ok(()) } diff --git a/components/raftstore/src/store/async_io/write.rs b/components/raftstore/src/store/async_io/write.rs index 4b468d900fe..7f32b1aaf71 100644 --- a/components/raftstore/src/store/async_io/write.rs +++ b/components/raftstore/src/store/async_io/write.rs @@ -8,6 +8,7 @@ //! raft db and then invoking callback or sending msgs if any. use std::{ + collections::VecDeque, fmt, mem, sync::Arc, thread::{self, JoinHandle}, @@ -39,9 +40,10 @@ use tikv_util::{ debug, info, slow_log, sys::thread::StdThreadBuildWrapper, thd_name, - time::{duration_to_sec, Instant}, + time::{duration_to_sec, setup_for_spin_interval, spin_at_least, Duration, Instant}, warn, }; +use tracker::TrackerTokenArray; use super::write_router::{SharedSenders, WriteSenders}; use crate::{ @@ -61,6 +63,8 @@ const KV_WB_DEFAULT_SIZE: usize = 16 * 1024; const RAFT_WB_SHRINK_SIZE: usize = 10 * 1024 * 1024; const RAFT_WB_DEFAULT_SIZE: usize = 256 * 1024; const RAFT_WB_SPLIT_SIZE: usize = ReadableSize::gb(1).0 as usize; +/// The default size of the raft write batch recorder. +const RAFT_WB_DEFAULT_RECORDER_SIZE: usize = 30; /// Notify the event to the specified region. pub trait PersistedNotifier: Clone + Send + 'static { @@ -373,6 +377,105 @@ impl ExtraBatchWrite { } } +/// WriteTaskBatchRecorder is a sliding window, used to record the batch size +/// and calculate the wait duration. +/// If the batch size is smaller than the threshold, it will return a +/// recommended wait duration for the caller as a hint to wait for more writes. +/// The wait duration is calculated based on the trend of the change of the +/// batch size. The range of the trend is [0.5, 2.0]. If the batch size is +/// increasing, the trend will be larger than 1.0, and the wait duration will +/// be shorter. +/// By default, the wait duration is 20us, the relative range for wait duration +/// is [10us, 40us]. 
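+/// For example, with the default 20us wait duration and an 8KiB batch size
+/// hint, a window average of 4KiB gives a trend of 0.5 and a wait of
+/// 20us / 0.5 = 40us, while an average of 16KiB gives a trend of 2.0 and a
+/// wait of 20us / 2.0 = 10us.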
+struct WriteTaskBatchRecorder { + batch_size_hint: usize, + capacity: usize, + history: VecDeque, + sum: usize, + avg: usize, + trend: f64, + /// Wait duration in nanoseconds. + wait_duration: Duration, + /// The count of wait. + wait_count: u64, + /// The max count of wait. + wait_max_count: u64, +} + +impl WriteTaskBatchRecorder { + fn new(batch_size_hint: usize, wait_duration: Duration) -> Self { + // Initialize the spin duration in advance. + setup_for_spin_interval(); + Self { + batch_size_hint, + history: VecDeque::new(), + capacity: RAFT_WB_DEFAULT_RECORDER_SIZE, + sum: 0, + avg: 0, + trend: 1.0, + wait_duration, + wait_count: 0, + wait_max_count: 1, + } + } + + #[cfg(test)] + fn get_avg(&self) -> usize { + self.avg + } + + #[cfg(test)] + fn get_trend(&self) -> f64 { + self.trend + } + + #[inline] + fn update_config(&mut self, batch_size: usize, wait_duration: Duration) { + self.batch_size_hint = batch_size; + self.wait_duration = wait_duration; + } + + fn record(&mut self, size: usize) { + self.history.push_back(size); + self.sum += size; + + let mut len = self.history.len(); + if len > self.capacity { + self.sum = self + .sum + .saturating_sub(self.history.pop_front().unwrap_or(0)); + len = self.capacity; + } + self.avg = self.sum / len; + + if len >= self.capacity && self.batch_size_hint > 0 { + // The trend ranges from 0.5 to 2.0. + let trend = self.avg as f64 / self.batch_size_hint as f64; + self.trend = trend.clamp(0.5, 2.0); + } else { + self.trend = 1.0; + } + } + + #[inline] + fn reset_wait_count(&mut self) { + self.wait_count = 0; + } + + #[inline] + fn should_wait(&self, batch_size: usize) -> bool { + batch_size < self.batch_size_hint && self.wait_count < self.wait_max_count + } + + fn wait_for_a_while(&mut self) { + self.wait_count += 1; + // Use a simple linear function to calculate the wait duration. + spin_at_least(Duration::from_nanos( + (self.wait_duration.as_nanos() as f64 * (1.0 / self.trend)) as u64, + )); + } +} + /// WriteTaskBatch is used for combining several WriteTask into one. struct WriteTaskBatch where @@ -393,6 +496,7 @@ where // region_id -> (peer_id, ready_number) pub readies: HashMap, pub(crate) raft_wb_split_size: usize, + recorder: WriteTaskBatchRecorder, } impl WriteTaskBatch @@ -400,7 +504,11 @@ where EK: KvEngine, ER: RaftEngine, { - fn new(raft_wb: ER::LogBatch) -> Self { + fn new( + raft_wb: ER::LogBatch, + write_batch_size_hint: usize, + write_wait_duration: Duration, + ) -> Self { Self { raft_wbs: vec![raft_wb], raft_states: HashMap::default(), @@ -410,6 +518,7 @@ where persisted_cbs: vec![], readies: HashMap::default(), raft_wb_split_size: RAFT_WB_SPLIT_SIZE, + recorder: WriteTaskBatchRecorder::new(write_batch_size_hint, write_wait_duration), } } @@ -490,6 +599,8 @@ where self.persisted_cbs.push(v); } self.tasks.push(task); + // Record the size of the batch. 
+ self.recorder.record(self.get_raft_size()); } fn clear(&mut self) { @@ -500,6 +611,7 @@ where self.state_size = 0; self.tasks.clear(); self.readies.clear(); + self.recorder.reset_wait_count(); } #[inline] @@ -563,6 +675,20 @@ where } } } + + #[inline] + fn update_config(&mut self, batch_size: usize, wait_duration: Duration) { + self.recorder.update_config(batch_size, wait_duration); + } + + #[inline] + fn should_wait(&self) -> bool { + self.recorder.should_wait(self.get_raft_size()) + } + + fn wait_for_a_while(&mut self) { + self.recorder.wait_for_a_while(); + } } pub struct Worker @@ -604,7 +730,11 @@ where trans: T, cfg: &Arc>, ) -> Self { - let batch = WriteTaskBatch::new(raft_engine.log_batch(RAFT_WB_DEFAULT_SIZE)); + let batch = WriteTaskBatch::new( + raft_engine.log_batch(RAFT_WB_DEFAULT_SIZE), + cfg.value().raft_write_batch_size_hint.0 as usize, + cfg.value().raft_write_wait_duration.0, + ); let perf_context = ER::get_perf_context(cfg.value().perf_level, PerfContextKind::RaftstoreStore); let cfg_tracker = cfg.clone().tracker(tag.clone()); @@ -643,7 +773,17 @@ where Ok(msg) => { stopped |= self.handle_msg(msg); } - Err(TryRecvError::Empty) => break, + Err(TryRecvError::Empty) => { + // If the size of the batch is small enough, it will wait for + // a while to make the batch larger. This can reduce the IOPS + // amplification if there are many trivial writes. + if self.batch.should_wait() { + self.batch.wait_for_a_while(); + continue; + } else { + break; + } + } Err(TryRecvError::Disconnected) => { stopped = true; break; @@ -786,6 +926,8 @@ where self.perf_context.report_metrics(&trackers); write_raft_time = duration_to_sec(now.saturating_elapsed()); STORE_WRITE_RAFTDB_DURATION_HISTOGRAM.observe(write_raft_time); + debug!("raft log is persisted"; + "req_info" => TrackerTokenArray::new(trackers.as_slice())); } fail_point!("raft_after_save"); @@ -881,6 +1023,10 @@ where if let Some(incoming) = self.cfg_tracker.any_new() { self.raft_write_size_limit = incoming.raft_write_size_limit.0 as usize; self.metrics.waterfall_metrics = incoming.waterfall_metrics; + self.batch.update_config( + incoming.raft_write_batch_size_hint.0 as usize, + incoming.raft_write_wait_duration.0, + ); } } @@ -1064,7 +1210,11 @@ pub fn write_to_db_for_test( EK: KvEngine, ER: RaftEngine, { - let mut batch = WriteTaskBatch::new(engines.raft.log_batch(RAFT_WB_DEFAULT_SIZE)); + let mut batch = WriteTaskBatch::new( + engines.raft.log_batch(RAFT_WB_DEFAULT_SIZE), + 0, + Duration::default(), + ); batch.add_write_task(&engines.raft, task); let metrics = StoreWriteMetrics::new(false); batch.before_write_to_db(&metrics); diff --git a/components/raftstore/src/store/async_io/write_tests.rs b/components/raftstore/src/store/async_io/write_tests.rs index 97e865a6bfe..aa6bd8aca3c 100644 --- a/components/raftstore/src/store/async_io/write_tests.rs +++ b/components/raftstore/src/store/async_io/write_tests.rs @@ -277,6 +277,37 @@ impl TestWriters { } } +#[test] +fn test_write_task_batch_recorder() { + let mut recorder = WriteTaskBatchRecorder::new(1024, Duration::from_nanos(50)); // 1kb, 50 nanoseconds + assert_eq!(recorder.get_avg(), 0); + assert_eq!(recorder.get_trend(), 1.0); + assert!(!recorder.should_wait(4096)); + assert!(recorder.should_wait(512)); + // [512 ...] 
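+    // With the 1024-byte hint above, avg = 512 gives trend = 0.5, so each
+    // wait spins for at least 50ns / 0.5 = 100ns, as asserted below.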
+ for _ in 0..30 { + recorder.record(512); + } + assert_eq!(recorder.get_avg(), 512); + assert_eq!(recorder.get_trend(), 0.5); + assert!(recorder.should_wait(128)); + let start = Instant::now(); + recorder.wait_for_a_while(); + assert!(start.saturating_elapsed() >= Duration::from_nanos(100)); + // [4096 ...] + for _ in 0..30 { + recorder.record(4096); + } + assert_eq!(recorder.get_avg(), 4096); + assert_eq!(recorder.get_trend(), 2.0); + assert!(!recorder.should_wait(128)); + recorder.reset_wait_count(); + assert!(recorder.should_wait(128)); + let start = Instant::now(); + recorder.wait_for_a_while(); + assert!(start.saturating_elapsed() >= Duration::from_nanos(20)); +} + #[test] fn test_worker() { let region_1 = 1; @@ -330,7 +361,7 @@ fn test_worker() { t.worker.write_to_db(true); - let snapshot = engines.kv.snapshot(None); + let snapshot = engines.kv.snapshot(); assert_eq!(snapshot.get_value(b"kv_k1").unwrap().unwrap(), b"kv_v1"); assert_eq!(snapshot.get_value(b"kv_k2").unwrap().unwrap(), b"kv_v2"); assert_eq!(snapshot.get_value(b"kv_k3").unwrap().unwrap(), b"kv_v3"); @@ -536,7 +567,7 @@ fn test_basic_flow() { must_wait_same_notifies(vec![(region_1, (1, 15)), (region_2, (2, 20))], &t.notify_rx); - let snapshot = engines.kv.snapshot(None); + let snapshot = engines.kv.snapshot(); assert!(snapshot.get_value(b"kv_k1").unwrap().is_none()); assert_eq!(snapshot.get_value(b"kv_k2").unwrap().unwrap(), b"kv_v2"); assert_eq!(snapshot.get_value(b"kv_k3").unwrap().unwrap(), b"kv_v3"); diff --git a/components/raftstore/src/store/compaction_guard.rs b/components/raftstore/src/store/compaction_guard.rs index c03ad2788a6..992f0856556 100644 --- a/components/raftstore/src/store/compaction_guard.rs +++ b/components/raftstore/src/store/compaction_guard.rs @@ -507,6 +507,7 @@ mod tests { cf_opts.set_max_bytes_for_level_base(MAX_OUTPUT_FILE_SIZE); cf_opts.set_max_bytes_for_level_multiplier(5); cf_opts.set_target_file_size_base(MAX_OUTPUT_FILE_SIZE); + cf_opts.set_level_compaction_dynamic_level_bytes(false); cf_opts.set_sst_partitioner_factory(RocksSstPartitionerFactory( CompactionGuardGeneratorFactory::new( CF_DEFAULT, diff --git a/components/raftstore/src/store/config.rs b/components/raftstore/src/store/config.rs index 152c49de0bd..005896ef6de 100644 --- a/components/raftstore/src/store/config.rs +++ b/components/raftstore/src/store/config.rs @@ -303,7 +303,7 @@ pub struct Config { #[doc(hidden)] #[online_config(skip)] /// Disable this feature by set to 0, logic will be removed in other pr. - /// When TiKV memory usage reaches `memory_usage_high_water` it will try to + /// When TiKV memory usage is near `memory_usage_high_water` it will try to /// limit memory increasing. For raftstore layer entries will be evicted /// from entry cache, if they utilize memory more than /// `evict_cache_on_memory_ratio` * total. @@ -327,6 +327,19 @@ pub struct Config { /// triggered. pub raft_write_size_limit: ReadableSize, + /// When the size of raft db writebatch is smaller than this value, write + /// will wait for a while to make the writebatch larger, which will reduce + /// the write amplification. + #[doc(hidden)] + pub raft_write_batch_size_hint: ReadableSize, + + /// When the size of raft db writebatch is smaller than this value, write + /// will wait for a while. This is used to reduce the write amplification. + /// It should be smaller than 1ms. Invalid to use too long duration because + /// it will make the write request wait too long. 
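+    /// (These two knobs are expected to appear under `[raftstore]` in the
+    /// TiKV configuration file, following its usual kebab-case renaming:
+    /// `raft-write-batch-size-hint` and `raft-write-wait-duration`.)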
+ #[doc(hidden)] + pub raft_write_wait_duration: ReadableDuration, + pub waterfall_metrics: bool, pub io_reschedule_concurrent_max_count: usize, @@ -516,6 +529,8 @@ impl Default for Config { cmd_batch: true, cmd_batch_concurrent_ready_max_count: 1, raft_write_size_limit: ReadableSize::mb(1), + raft_write_batch_size_hint: ReadableSize::kb(8), + raft_write_wait_duration: ReadableDuration::micros(20), waterfall_metrics: true, io_reschedule_concurrent_max_count: 4, io_reschedule_hotpot_duration: ReadableDuration::secs(5), @@ -827,6 +842,13 @@ impl Config { return Err(box_err!("local-read-batch-size must be greater than 0")); } + if self.raft_write_wait_duration.as_micros() > 1000 { + return Err(box_err!( + "raft-write-wait-duration should be less than 1ms, current value is {}ms", + self.raft_write_wait_duration.as_millis() + )); + } + // Since the following configuration supports online update, in order to // prevent mistakenly inputting too large values, the max limit is made // according to the cpu quota * 10. Notice 10 is only an estimate, not an @@ -1186,6 +1208,12 @@ impl Config { CONFIG_RAFTSTORE_GAUGE .with_label_values(&["raft_write_size_limit"]) .set(self.raft_write_size_limit.0 as f64); + CONFIG_RAFTSTORE_GAUGE + .with_label_values(&["raft_write_batch_size_hint"]) + .set(self.raft_write_batch_size_hint.0 as f64); + CONFIG_RAFTSTORE_GAUGE + .with_label_values(&["raft_write_wait_duration"]) + .set(self.raft_write_wait_duration.as_micros() as f64); CONFIG_RAFTSTORE_GAUGE .with_label_values(&["waterfall_metrics"]) .set((self.waterfall_metrics as i32).into()); @@ -1595,5 +1623,11 @@ mod tests { cfg.raft_log_gc_count_limit(), split_size * 3 / 4 / ReadableSize::kb(1) ); + + cfg = Config::new(); + cfg.optimize_for(false); + cfg.raft_write_wait_duration = ReadableDuration::micros(1001); + cfg.validate(split_size, true, split_size / 20, false) + .unwrap_err(); } } diff --git a/components/raftstore/src/store/entry_storage.rs b/components/raftstore/src/store/entry_storage.rs index 8085de7273b..a09c3475e1a 100644 --- a/components/raftstore/src/store/entry_storage.rs +++ b/components/raftstore/src/store/entry_storage.rs @@ -335,26 +335,8 @@ impl EntryCache { fn trace_cached_entries(&mut self, entries: CachedEntries) { let dangle_size = { let mut guard = entries.entries.lock().unwrap(); - - let last_idx = guard.0.last().map(|e| e.index).unwrap(); - let cache_front = match self.cache.front().map(|e| e.index) { - Some(i) => i, - None => u64::MAX, - }; - - let dangle_range = if last_idx < cache_front { - // All entries are not in entry cache. - 0..guard.0.len() - } else if let Ok(i) = guard.0.binary_search_by(|e| e.index.cmp(&cache_front)) { - // Some entries are in entry cache. - 0..i - } else { - // All entries are in entry cache. 
- 0..0 - }; - let mut size = 0; - for e in &guard.0[dangle_range] { + for e in &guard.0 { size += bytes_capacity(&e.data) + bytes_capacity(&e.context); } guard.1 = size; @@ -416,7 +398,7 @@ pub enum RaftlogFetchState { Fetched(Box), } -#[derive(Debug, PartialEq)] +#[derive(PartialEq)] pub struct RaftlogFetchResult { pub ents: raft::Result>, // because entries may be empty, so store the original low index that the task issued @@ -431,6 +413,19 @@ pub struct RaftlogFetchResult { pub term: u64, } +impl std::fmt::Debug for RaftlogFetchResult { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + // avoid dumping entries content + f.debug_struct("RaftlogFetchResult") + .field("low", &self.low) + .field("max_size", &self.max_size) + .field("hit_size_limit", &self.hit_size_limit) + .field("tried_cnt", &self.tried_cnt) + .field("term", &self.term) + .finish() + } +} + #[derive(Default)] struct AsyncFetchStats { async_fetch: Cell, @@ -1259,6 +1254,8 @@ impl EntryStorage { /// Evict entries from the cache. pub fn evict_entry_cache(&mut self, half: bool) { + fail_point!("mock_evict_entry_cache", |_| {}); + if !self.is_entry_cache_empty() { let cache = &mut self.cache; let cache_len = cache.cache.len(); @@ -1395,7 +1392,7 @@ pub mod tests { // Test trace an entry which is still in cache. let cached_entries = CachedEntries::new(vec![new_padded_entry(102, 3, 5)]); cache.trace_cached_entries(cached_entries); - check_mem_size_change(0); + check_mem_size_change(5); // Test compare `cached_last` with `trunc_to_idx` in `EntryCache::append_impl`. cache.append(0, 0, &[new_padded_entry(103, 4, 7)]); @@ -1409,7 +1406,7 @@ pub mod tests { // Test compact the last traced dangle entry. cache.persisted = 102; cache.compact_to(103); - check_mem_size_change(-5); + check_mem_size_change(-10); // Test compact all entries. 
cache.persisted = 103; diff --git a/components/raftstore/src/store/fsm/apply.rs b/components/raftstore/src/store/fsm/apply.rs index dd3fde82110..c4002708655 100644 --- a/components/raftstore/src/store/fsm/apply.rs +++ b/components/raftstore/src/store/fsm/apply.rs @@ -29,9 +29,9 @@ use batch_system::{ use collections::{HashMap, HashMapEntry, HashSet}; use crossbeam::channel::{TryRecvError, TrySendError}; use engine_traits::{ - util::SequenceNumber, CacheRange, DeleteStrategy, KvEngine, Mutable, PerfContext, - PerfContextKind, RaftEngine, RaftEngineReadOnly, Range as EngineRange, Snapshot, SstMetaInfo, - WriteBatch, WriteOptions, ALL_CFS, CF_DEFAULT, CF_LOCK, CF_RAFT, CF_WRITE, + util::SequenceNumber, DeleteStrategy, KvEngine, Mutable, PerfContext, PerfContextKind, + RaftEngine, RaftEngineReadOnly, Range as EngineRange, Snapshot, SstMetaInfo, WriteBatch, + WriteOptions, ALL_CFS, CF_DEFAULT, CF_LOCK, CF_RAFT, CF_WRITE, }; use fail::fail_point; use health_controller::types::LatencyInspector; @@ -70,7 +70,7 @@ use tikv_util::{ Either, MustConsumeVec, }; use time::Timespec; -use tracker::GLOBAL_TRACKERS; +use tracker::{TrackerToken, TrackerTokenArray, GLOBAL_TRACKERS}; use uuid::Builder as UuidBuilder; use self::memtrace::*; @@ -79,7 +79,7 @@ use crate::{ bytes_capacity, coprocessor::{ ApplyCtxInfo, Cmd, CmdBatch, CmdObserveInfo, CoprocessorHost, ObserveHandle, ObserveLevel, - RegionState, + RegionState, WriteBatchWrapper, }, store::{ cmd_resp, @@ -406,7 +406,7 @@ where exec_log_index: u64, exec_log_term: u64, - kv_wb: EK::WriteBatch, + kv_wb: WriteBatchWrapper, kv_wb_last_bytes: u64, kv_wb_last_keys: u64, @@ -492,6 +492,7 @@ where priority: Priority, ) -> ApplyContext { let kv_wb = engine.write_batch_with_cap(DEFAULT_APPLY_WB_SIZE); + let kv_wb = host.on_create_apply_write_batch(kv_wb); ApplyContext { tag, @@ -542,8 +543,7 @@ where pub fn prepare_for(&mut self, delegate: &mut ApplyDelegate) { self.applied_batch .push_batch(&delegate.observe_info, delegate.region.get_id()); - let range = CacheRange::from_region(&delegate.region); - self.kv_wb.prepare_for_range(range); + self.kv_wb.prepare_for_region(&delegate.region); } /// Commits all changes have done for delegate. `persistent` indicates @@ -619,7 +619,9 @@ where let data_size = self.kv_wb().data_size(); if data_size > APPLY_WB_SHRINK_SIZE { // Control the memory usage for the WriteBatch. - self.kv_wb = self.engine.write_batch_with_cap(DEFAULT_APPLY_WB_SIZE); + let kv_wb = self.engine.write_batch_with_cap(DEFAULT_APPLY_WB_SIZE); + let kv_wb = self.host.on_create_apply_write_batch(kv_wb); + self.kv_wb = kv_wb; } else { // Clear data, reuse the WriteBatch, this can reduce memory allocations and // deallocations. @@ -627,6 +629,16 @@ where } self.kv_wb_last_bytes = 0; self.kv_wb_last_keys = 0; + } else { + fail_point!( + "after_write_to_db_skip_write_node_1", + self.store_id == 1, + |_| { unreachable!() } + ); + // We call `clear` here because some WriteBatch impl may have some internal + // state that need to be reset even if the write batch is empty. + // Please refer to `RegionCacheWriteBatch::clear` for more details. + self.kv_wb_mut().clear(); } if !self.delete_ssts.is_empty() { let tag = self.tag.clone(); @@ -644,6 +656,19 @@ where } = mem::replace(&mut self.applied_batch, ApplyCallbackBatch::new()); // Call it before invoking callback for preventing Commit is executed before // Prewrite is observed. 
+ debug!("raft log is applied to the kv db"; + "req_info" => TrackerTokenArray::new( + &cb_batch.iter().fold(vec![], |mut acc: Vec, cb| { + acc.extend( + cb.0.write_trackers().into_iter(). + filter_map(|time_tracker| time_tracker.as_tracker_token()) + .collect::>().as_slice() + ); + acc + }) + ), + "cmd_batch" => ?cmd_batch.len(), + ); self.host .on_flush_applied_cmd_batch(batch_max_level, cmd_batch, &self.engine); // Invoke callbacks @@ -708,12 +733,12 @@ where } #[inline] - pub fn kv_wb(&self) -> &EK::WriteBatch { + pub fn kv_wb(&self) -> &WriteBatchWrapper { &self.kv_wb } #[inline] - pub fn kv_wb_mut(&mut self) -> &mut EK::WriteBatch { + pub fn kv_wb_mut(&mut self) -> &mut WriteBatchWrapper { &mut self.kv_wb } @@ -930,7 +955,8 @@ where /// All of messages that need to continue to be handled after /// the source peer has applied its logs and pending entries /// are all handled. - pending_msgs: Vec>, + #[allow(clippy::vec_box)] + pending_msgs: Vec>>, /// Cache heap size for itself. heap_size: Option, @@ -1190,7 +1216,7 @@ where self.metrics.written_keys += apply_ctx.delta_keys(); } - fn write_apply_state(&self, wb: &mut EK::WriteBatch) { + fn write_apply_state(&self, wb: &mut WriteBatchWrapper) { wb.put_msg_cf( CF_RAFT, &keys::apply_state_key(self.region.get_id()), @@ -1248,7 +1274,7 @@ where // just return Yield return ApplyResult::Yield; } - let mut has_unflushed_data = + let has_unflushed_data = self.last_flush_applied_index != self.apply_state.get_applied_index(); if (has_unflushed_data && should_write_to_engine(!apply_ctx.kv_wb().is_empty(), &cmd) @@ -1265,12 +1291,8 @@ where { return ApplyResult::Yield; } - has_unflushed_data = false; } if self.priority != apply_ctx.priority { - if has_unflushed_data { - apply_ctx.commit(self); - } return ApplyResult::Yield; } @@ -1510,17 +1532,19 @@ where self.apply_state.set_applied_index(index); self.applied_term = term; - let (modified_region, mut pending_handle_ssts) = match exec_result { - ApplyResult::Res(ref e) => match e { - ExecResult::SplitRegion { ref derived, .. } => (Some(derived.clone()), None), - ExecResult::PrepareMerge { ref region, .. } => (Some(region.clone()), None), - ExecResult::CommitMerge { ref region, .. } => (Some(region.clone()), None), - ExecResult::RollbackMerge { ref region, .. } => (Some(region.clone()), None), - ExecResult::IngestSst { ref ssts } => (None, Some(ssts.clone())), - ExecResult::Flashback { ref region } => (Some(region.clone()), None), - _ => (None, None), + let (modified_region, new_regions, mut pending_handle_ssts) = match &exec_result { + ApplyResult::Res(e) => match e { + ExecResult::SplitRegion { regions, .. } => { + (Some(self.region.clone()), regions.clone(), None) + } + ExecResult::PrepareMerge { region, .. } => (Some(region.clone()), vec![], None), + ExecResult::CommitMerge { region, .. } => (Some(region.clone()), vec![], None), + ExecResult::RollbackMerge { region, .. 
} => (Some(region.clone()), vec![], None), + ExecResult::IngestSst { ssts } => (None, vec![], Some(ssts.clone())), + ExecResult::Flashback { region } => (Some(region.clone()), vec![], None), + _ => (None, vec![], None), }, - _ => (None, None), + _ => (None, vec![], None), }; let mut apply_ctx_info = ApplyCtxInfo { pending_handle_ssts: &mut pending_handle_ssts, @@ -1535,9 +1559,11 @@ where peer_id: self.id(), pending_remove: self.pending_remove, modified_region, + new_regions, }, &mut apply_ctx_info, ); + match pending_handle_ssts { None => (), Some(mut v) => { @@ -2880,11 +2906,11 @@ where fail_point!("before_handle_catch_up_logs_for_merge"); // Sends message to the source peer fsm and pause `exec_commit_merge` process let logs_up_to_date = Arc::new(AtomicU64::new(0)); - let msg = SignificantMsg::CatchUpLogs(CatchUpLogs { + let msg = Box::new(SignificantMsg::CatchUpLogs(CatchUpLogs { target_region_id: self.region_id(), merge: merge.to_owned(), logs_up_to_date: logs_up_to_date.clone(), - }); + })); ctx.notifier .notify_one(source_region_id, PeerMsg::SignificantMsg(msg)); return Ok(( @@ -3270,7 +3296,7 @@ where // open files in rocksdb. // TODO: figure out another way to do consistency check without snapshot // or short life snapshot. - snap: ctx.engine.snapshot(None), + snap: ctx.engine.snapshot(), }) }, )) @@ -3793,15 +3819,20 @@ where term: u64, compact_index: u64, }, + // Trigger loading pending region for in_memory_engine, + InMemoryEngineLoadRegion { + region_id: u64, + trigger_load_cb: Box, + }, } -impl ResourceMetered for Msg { +impl ResourceMetered for Box> { fn consume_resource(&self, resource_ctl: &Arc) -> Option { if !resource_ctl.is_customized() { return None; } - match self { - Msg::Apply { apply, .. } => { + match **self { + Msg::Apply { ref apply, .. } => { let mut dominant_group = "".to_owned(); let mut max_write_bytes = 0; for cached_entries in &apply.entries { @@ -3894,6 +3925,9 @@ where region_id, term, compact_index ) } + Msg::InMemoryEngineLoadRegion { region_id, .. 
} => { + write!(f, "[region {}] try load in memory cache", region_id) + } } } } @@ -3945,7 +3979,7 @@ where EK: KvEngine, { delegate: ApplyDelegate, - receiver: Receiver>, + receiver: Receiver>>, mailbox: Option>>, } @@ -3955,12 +3989,14 @@ where { fn from_peer( peer: &Peer, - ) -> (LooseBoundedSender>, Box>) { + ) -> (LooseBoundedSender>>, Box>) { let reg = Registration::new(peer); ApplyFsm::from_registration(reg) } - fn from_registration(reg: Registration) -> (LooseBoundedSender>, Box>) { + fn from_registration( + reg: Registration, + ) -> (LooseBoundedSender>>, Box>) { let (tx, rx) = loose_bounded(usize::MAX); let delegate = ApplyDelegate::from_registration(reg); ( @@ -4131,13 +4167,11 @@ where self.destroy(ctx); ctx.notifier.notify_one( self.delegate.region_id(), - PeerMsg::ApplyRes { - res: TaskRes::Destroy { - region_id: self.delegate.region_id(), - peer_id: self.delegate.id(), - merge_from_snapshot: d.merge_from_snapshot, - }, - }, + PeerMsg::ApplyRes(Box::new(TaskRes::Destroy { + region_id: self.delegate.region_id(), + peer_id: self.delegate.id(), + merge_from_snapshot: d.merge_from_snapshot, + })), ); } } @@ -4201,7 +4235,7 @@ where .store(region_id, Ordering::SeqCst); // To trigger the target apply fsm if let Some(mailbox) = ctx.router.mailbox(catch_up_logs.target_region_id) { - let _ = mailbox.force_send(Msg::Noop); + let _ = mailbox.force_send(Box::new(Msg::Noop)); } else { error!( "failed to get mailbox, are we shutting down?"; @@ -4246,7 +4280,7 @@ where } if let Err(e) = snap_task.generate_and_schedule_snapshot::( - apply_ctx.engine.snapshot(None), + apply_ctx.engine.snapshot(), self.delegate.applied_term, self.delegate.apply_state.clone(), &apply_ctx.region_scheduler, @@ -4318,7 +4352,7 @@ where ReadResponse { response: Default::default(), snapshot: Some(RegionSnapshot::from_snapshot( - Arc::new(apply_ctx.engine.snapshot(None)), + Arc::new(apply_ctx.engine.snapshot()), Arc::new(self.delegate.region.clone()), )), txn_extra_op: TxnExtraOp::Noop, @@ -4412,7 +4446,8 @@ where ctx.finish_for(&mut self.delegate, result); } - fn handle_tasks(&mut self, apply_ctx: &mut ApplyContext, msgs: &mut Vec>) { + #[allow(clippy::vec_box)] + fn handle_tasks(&mut self, apply_ctx: &mut ApplyContext, msgs: &mut Vec>>) { let mut drainer = msgs.drain(..); let mut batch_apply = None; loop { @@ -4427,7 +4462,7 @@ where }; if batch_apply.is_some() { - match &msg { + match *msg { Msg::Apply { .. } => (), _ => { self.handle_apply(apply_ctx, batch_apply.take().unwrap()); @@ -4440,7 +4475,7 @@ where } } - match msg { + match *msg { Msg::Apply { start, mut apply } => { let apply_wait = start.saturating_elapsed(); apply_ctx.apply_wait.observe(apply_wait.as_secs_f64()); @@ -4461,7 +4496,9 @@ where } else { self.handle_apply(apply_ctx, batch_apply.take().unwrap()); if let Some(ref mut state) = self.delegate.yield_state { - state.pending_msgs.push(Msg::Apply { start, apply }); + state + .pending_msgs + .push(Box::new(Msg::Apply { start, apply })); state.pending_msgs.extend(drainer); break; } @@ -4505,6 +4542,12 @@ where } => { self.unsafe_force_compact(apply_ctx, term, compact_index); } + Msg::InMemoryEngineLoadRegion { + trigger_load_cb, .. 
+ } => { + trigger_load_cb(&self.delegate.region); + fail_point!("on_apply_in_memory_engine_load_region"); + } } } } @@ -4514,7 +4557,7 @@ impl Fsm for ApplyFsm where EK: KvEngine, { - type Message = Msg; + type Message = Box>; #[inline] fn is_stopped(&self) -> bool { @@ -4621,7 +4664,8 @@ pub struct ApplyPoller where EK: KvEngine, { - msg_buf: Vec>, + #[allow(clippy::vec_box)] + msg_buf: Vec>>, apply_ctx: ApplyContext, messages_per_tick: usize, cfg_tracker: Tracker, @@ -4686,6 +4730,7 @@ where } handle_result = HandleResult::KeepProcessing; } + fail_point!("before_handle_normal"); fail_point!("before_handle_normal_3", normal.delegate.id() == 3, |_| { HandleResult::KeepProcessing }); @@ -4853,9 +4898,9 @@ where EK: KvEngine, { pub fn schedule_task(&self, region_id: u64, msg: Msg) { - let reg = match self.try_send(region_id, msg) { + let reg = match self.try_send(region_id, Box::new(msg)) { Either::Left(Ok(())) => return, - Either::Left(Err(TrySendError::Disconnected(msg))) | Either::Right(msg) => match msg { + Either::Left(Err(TrySendError::Disconnected(msg))) | Either::Right(msg) => match *msg { Msg::Registration(reg) => reg, Msg::Apply { mut apply, .. } => { info!( @@ -4935,6 +4980,11 @@ where "region_id" => region_id); return; } + Msg::InMemoryEngineLoadRegion { region_id, .. } => { + info!("skip check load in memory region cache because target region is not found"; + "region_id" => region_id); + return; + } }, Either::Left(Err(TrySendError::Full(_))) => unreachable!(), }; @@ -5075,6 +5125,7 @@ mod memtrace { Msg::Recover(..) => 0, Msg::CheckCompact { .. } => 0, Msg::UnsafeForceCompact { .. } => 0, + Msg::InMemoryEngineLoadRegion { .. } => 0, } } } @@ -5193,8 +5244,8 @@ mod tests { impl Notifier for TestNotifier { fn notify(&self, apply_res: Vec>) { for r in apply_res { - let res = TaskRes::Apply(r); - let _ = self.tx.send(PeerMsg::ApplyRes { res }); + let res = Box::new(TaskRes::Apply(r)); + let _ = self.tx.send(PeerMsg::ApplyRes(res)); } } fn notify_one(&self, _: u64, msg: PeerMsg) { @@ -5401,10 +5452,7 @@ mod tests { E: KvEngine, { match receiver.recv_timeout(Duration::from_secs(3)) { - Ok(PeerMsg::ApplyRes { - res: TaskRes::Apply(res), - .. - }) => res, + Ok(PeerMsg::ApplyRes(box TaskRes::Apply(res))) => res, e => panic!("unexpected res {:?}", e), } } @@ -5552,10 +5600,7 @@ mod tests { ], ); let apply_res = match rx.recv_timeout(Duration::from_secs(3)) { - Ok(PeerMsg::ApplyRes { - res: TaskRes::Apply(res), - .. - }) => res, + Ok(PeerMsg::ApplyRes(box TaskRes::Apply(res))) => res, e => panic!("unexpected apply result: {:?}", e), }; let apply_state_key = keys::apply_state_key(2); @@ -5586,12 +5631,9 @@ mod tests { router.schedule_task(2, Msg::destroy(2, false)); let (region_id, peer_id) = match rx.recv_timeout(Duration::from_secs(3)) { - Ok(PeerMsg::ApplyRes { - res: TaskRes::Destroy { - region_id, peer_id, .. - }, - .. - }) => (region_id, peer_id), + Ok(PeerMsg::ApplyRes(box TaskRes::Destroy { + region_id, peer_id, .. + })) => (region_id, peer_id), e => panic!("expected destroy result, but got {:?}", e), }; assert_eq!(peer_id, 1); @@ -7424,7 +7466,7 @@ mod tests { let cmd_batch = cmdbatch_rx.recv_timeout(Duration::from_secs(3)).unwrap(); assert_eq!(2, cmd_batch.len()); - // Stop observer regoin 1. + // Stop observer region 1. 
observe_handle.stop_observing(); let observe_handle = ObserveHandle::new(); @@ -7854,9 +7896,9 @@ mod tests { .unwrap(); assert_ne!(initial_state.get_applied_index(), 0); match apply_res_rx.recv_timeout(Duration::from_secs(3)) { - Ok(PeerMsg::ApplyRes { - res: TaskRes::Apply(apply_res), - }) => assert_eq!(apply_res.apply_state, initial_state), + Ok(PeerMsg::ApplyRes(box TaskRes::Apply(apply_res))) => { + assert_eq!(apply_res.apply_state, initial_state) + } e => panic!("unexpected result: {:?}", e), } index_id += 1; @@ -7888,9 +7930,9 @@ mod tests { .unwrap() .unwrap(); match apply_res_rx.recv_timeout(Duration::from_secs(3)) { - Ok(PeerMsg::ApplyRes { - res: TaskRes::Apply(apply_res), - }) => assert_eq!(apply_res.apply_state, apply_state), + Ok(PeerMsg::ApplyRes(box TaskRes::Apply(apply_res))) => { + assert_eq!(apply_res.apply_state, apply_state) + } e => panic!("unexpected result: {:?}", e), } assert!( diff --git a/components/raftstore/src/store/fsm/peer.rs b/components/raftstore/src/store/fsm/peer.rs index 9abc1c39945..5f5a9f6b773 100644 --- a/components/raftstore/src/store/fsm/peer.rs +++ b/components/raftstore/src/store/fsm/peer.rs @@ -210,7 +210,7 @@ where while let Ok(msg) = self.receiver.try_recv() { let callback = match msg { PeerMsg::RaftCommand(cmd) => cmd.callback, - PeerMsg::CasualMessage(CasualMessage::SplitRegion { callback, .. }) => callback, + PeerMsg::CasualMessage(box CasualMessage::SplitRegion { callback, .. }) => callback, PeerMsg::RaftMessage(im, _) => { raft_messages_size += im.heap_size; continue; @@ -684,7 +684,7 @@ where && !self.fsm.peer.disk_full_peers.majority()) || cmd.extra_opts.disk_full_opt == DiskFullOpt::NotAllowedOnFull) { - self.fsm.batch_req_builder.add(cmd, req_size); + self.fsm.batch_req_builder.add(*cmd, req_size); if self.fsm.batch_req_builder.should_finish(&self.ctx.cfg) { self.propose_pending_batch_raft_command(); } @@ -697,7 +697,7 @@ where } } PeerMsg::Tick(tick) => self.on_tick(tick), - PeerMsg::ApplyRes { res } => { + PeerMsg::ApplyRes(res) => { self.on_apply_res(res); } PeerMsg::SignificantMsg(msg) => self.on_significant_msg(msg), @@ -1129,8 +1129,8 @@ where } } - fn on_casual_msg(&mut self, msg: CasualMessage) { - match msg { + fn on_casual_msg(&mut self, msg: Box>) { + match *msg { CasualMessage::SplitRegion { region_epoch, split_keys, @@ -1289,6 +1289,16 @@ where let _ = self.fsm.peer.raft_group.campaign(); self.fsm.has_ready = true; } + CasualMessage::InMemoryEngineLoadRegion { + region_id, + trigger_load_cb, + } => self.ctx.apply_router.schedule_task( + region_id, + ApplyTask::InMemoryEngineLoadRegion { + region_id, + trigger_load_cb, + }, + ), } } @@ -1491,8 +1501,8 @@ where ); } - fn on_significant_msg(&mut self, msg: SignificantMsg) { - match msg { + fn on_significant_msg(&mut self, msg: Box>) { + match *msg { SignificantMsg::SnapshotStatus { to_peer_id, status, .. 
} => { @@ -1873,7 +1883,7 @@ where // follower state let _ = self.ctx.router.send( self.region_id(), - PeerMsg::CasualMessage(CasualMessage::Campaign), + PeerMsg::CasualMessage(Box::new(CasualMessage::Campaign)), ); } self.fsm.has_ready = true; @@ -2440,9 +2450,9 @@ where } } - fn on_apply_res(&mut self, res: ApplyTaskRes) { + fn on_apply_res(&mut self, res: Box>) { fail_point!("on_apply_res", |_| {}); - match res { + match *res { ApplyTaskRes::Apply(mut res) => { debug!( "async apply finish"; @@ -2613,8 +2623,8 @@ where } } - fn on_raft_message(&mut self, msg: InspectedRaftMessage) -> Result<()> { - let InspectedRaftMessage { heap_size, mut msg } = msg; + fn on_raft_message(&mut self, m: Box) -> Result<()> { + let InspectedRaftMessage { heap_size, mut msg } = *m; let peer_disk_usage = msg.disk_usage; let stepped = Cell::new(false); let memtrace_raft_entries = &mut self.fsm.peer.memtrace_raft_entries as *mut usize; @@ -2627,7 +2637,7 @@ where MEMTRACE_RAFT_MESSAGES.trace(TraceEvent::Sub(heap_size)); if stepped.get() { unsafe { - // It could be less than exact for entry overwritting. + // It could be less than exact for entry overwriting. *memtrace_raft_entries += heap_size; MEMTRACE_RAFT_ENTRIES.trace(TraceEvent::Add(heap_size)); } @@ -2645,13 +2655,45 @@ where "is_initialized_peer" => is_initialized_peer, ); + let msg_type = msg.get_message().get_msg_type(); + #[cfg(feature = "failpoints")] + let fp_enable = |target_msg_type: MessageType| -> bool { + self.fsm.region_id() == 1000 + && self.store_id() == 2 + && !is_initialized_peer + && msg_type == target_msg_type + }; + #[cfg(feature = "failpoints")] + fail_point!( + "on_snap_msg_1000_2", + fp_enable(MessageType::MsgSnapshot), + |_| Ok(()) + ); + #[cfg(feature = "failpoints")] + fail_point!( + "on_vote_msg_1000_2", + fp_enable(MessageType::MsgRequestVote), + |_| Ok(()) + ); + #[cfg(feature = "failpoints")] + fail_point!( + "on_append_msg_1000_2", + fp_enable(MessageType::MsgAppend), + |_| Ok(()) + ); + #[cfg(feature = "failpoints")] + fail_point!( + "on_heartbeat_msg_1000_2", + fp_enable(MessageType::MsgHeartbeat), + |_| Ok(()) + ); + if self.fsm.peer.pending_remove || self.fsm.stopped { return Ok(()); } self.handle_reported_disk_usage(&msg); - let msg_type = msg.get_message().get_msg_type(); if matches!(self.ctx.self_disk_usage, DiskUsage::AlreadyFull) && MessageType::MsgTimeoutNow == msg_type { @@ -2984,6 +3026,9 @@ where } fn on_extra_message(&mut self, mut msg: RaftMessage) { + self.ctx + .coprocessor_host + .on_extra_message(self.fsm.peer.region(), msg.get_extra_msg()); match msg.get_extra_msg().get_type() { ExtraMessageType::MsgRegionWakeUp | ExtraMessageType::MsgCheckStalePeer => { if msg.get_extra_msg().forcely_awaken { @@ -3081,6 +3126,12 @@ where } } } + ExtraMessageType::MsgPreLoadRegionRequest => { + // It has been handled in on_extra_message in coprocessor_host + } + ExtraMessageType::MsgPreLoadRegionResponse => { + // Ignore now + } } } @@ -3199,10 +3250,10 @@ where ); if self.handle_destroy_peer(job) { // It's not frequent, so use 0 as `heap_size` is ok. 
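The four message-type failpoints registered above are gated by the `fp_enable` closure on region id, store id, peer initialization state, and message type. As a rough test-side sketch (assuming the `fail` crate's standard `cfg`/`remove` API and the failpoint names registered above), a test could make the uninitialized peer drop incoming snapshot messages like this:

// Sketch only: drives the "on_snap_msg_1000_2" failpoint from a test.
fn pause_snapshot_msgs() {
    // The "return" action fires the failpoint closure, so
    // on_raft_message returns Ok(()) early and the message is dropped.
    fail::cfg("on_snap_msg_1000_2", "return").unwrap();
    // ... send a MsgSnapshot to the uninitialized peer (1000, 2) and
    // assert that it is not applied ...
    fail::remove("on_snap_msg_1000_2");
}
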
- let store_msg = StoreMsg::RaftMessage(InspectedRaftMessage { + let store_msg = StoreMsg::RaftMessage(Box::new(InspectedRaftMessage { heap_size: 0, msg: msg.clone(), - }); + })); if let Err(e) = self.ctx.router.send_control(store_msg) { info!( "failed to send back store message, are we shutting down?"; @@ -3436,7 +3487,24 @@ where } let mut meta = self.ctx.store_meta.lock().unwrap(); - if meta.regions[&self.region_id()] != *self.region() { + // Check if the region matches the metadata. A mismatch means another + // peer has replaced the current peer, which can happen during a split: a + // peer is first created via raft message, then replaced by another peer + // (of the same region) when the split is applied. + let region_mismatch = match meta.regions.get(&self.region_id()) { + Some(region) => *region != *self.region(), + None => { + // If the region doesn't exist, treat it as a mismatch. This can + // happen in rare situations (e.g. #17469). + warn!( + "region not found in meta"; + "region_id" => self.fsm.region_id(), + "peer_id" => self.fsm.peer_id(), + ); + true + } + }; + if region_mismatch { if !self.fsm.peer.is_initialized() { info!( "stale delegate detected, skip"; @@ -3450,7 +3518,7 @@ where panic!( "{} meta corrupted: {:?} != {:?}", self.fsm.peer.tag, - meta.regions[&self.region_id()], + meta.regions.get(&self.region_id()), self.region() ); } @@ -3537,7 +3605,7 @@ where // may has been merged/splitted already. let _ = self.ctx.router.force_send( exist_region.get_id(), - PeerMsg::CasualMessage(CasualMessage::RegionOverlapped), + PeerMsg::CasualMessage(Box::new(CasualMessage::RegionOverlapped)), ); } } @@ -3615,11 +3683,11 @@ where .router .force_send( source_region_id, - PeerMsg::SignificantMsg(SignificantMsg::MergeResult { + PeerMsg::SignificantMsg(Box::new(SignificantMsg::MergeResult { target_region_id: self.fsm.region_id(), target: self.fsm.peer.peer.clone(), result, - }), + })), ) .unwrap(); } @@ -3806,7 +3874,7 @@ where if self.fsm.peer.has_unpersisted_ready() { assert!(self.ctx.sync_write_worker.is_none()); // The destroy must be delayed if there are some unpersisted readies. - // Otherwise there is a race of writting kv db and raft db between here + // Otherwise there is a race of writing kv db and raft db between here // and write worker. 
return Some(DelayReason::UnPersistedReady); } @@ -3847,9 +3915,9 @@ where ) .flush() .when_done(move || { - if let Err(e) = - mb.force_send(PeerMsg::SignificantMsg(SignificantMsg::RaftLogGcFlushed)) - { + if let Err(e) = mb.force_send(PeerMsg::SignificantMsg(Box::new( + SignificantMsg::RaftLogGcFlushed, + ))) { if tikv_util::thread_group::is_shutdown(!cfg!(test)) { return; } @@ -3888,6 +3956,7 @@ where // [PerformanceCriticalPath] TODO: spin off the I/O code (self.fsm.peer.destroy) fn destroy_peer(&mut self, merged_by_target: bool) -> bool { + self.ctx.coprocessor_host.on_destroy_peer(self.region()); fail_point!("destroy_peer"); // Mark itself as pending_remove self.fsm.peer.pending_remove = true; @@ -4343,14 +4412,12 @@ where share_size = self .fsm .peer - .split_check_trigger - .approximate_size + .approximate_size() .map(|v| v / new_region_count); share_keys = self .fsm .peer - .split_check_trigger - .approximate_keys + .approximate_keys() .map(|v| v / new_region_count); } @@ -4366,8 +4433,8 @@ where let is_leader = self.fsm.peer.is_leader(); if is_leader { if share_source_region_size { - self.fsm.peer.split_check_trigger.approximate_size = share_size; - self.fsm.peer.split_check_trigger.approximate_keys = share_keys; + self.fsm.peer.set_approximate_size(share_size); + self.fsm.peer.set_approximate_keys(share_keys); } self.fsm.peer.heartbeat_pd(self.ctx); // Notify pd immediately to let it update the region meta. @@ -4502,8 +4569,8 @@ where new_peer.has_ready |= campaigned; if is_leader { - new_peer.peer.split_check_trigger.approximate_size = share_size; - new_peer.peer.split_check_trigger.approximate_keys = share_keys; + new_peer.peer.set_approximate_size(share_size); + new_peer.peer.set_approximate_keys(share_keys); *new_peer.peer.txn_ext.pessimistic_locks.write() = locks; // The new peer is likely to become leader, send a heartbeat immediately to // reduce client query miss. 
@@ -4534,7 +4601,7 @@ where .swap_remove_front(|m| m.get_to_peer() == &meta_peer) { let peer_msg = PeerMsg::RaftMessage( - InspectedRaftMessage { heap_size: 0, msg }, + Box::new(InspectedRaftMessage { heap_size: 0, msg }), Some(TiInstant::now()), ); if let Err(e) = self.ctx.router.force_send(new_region_id, peer_msg) { @@ -4765,14 +4832,14 @@ where .router .force_send( target_id, - PeerMsg::RaftCommand(RaftCommand::new_ext( + PeerMsg::RaftCommand(Box::new(RaftCommand::new_ext( request, Callback::None, RaftCmdExtraOpts { deadline: None, disk_full_opt: DiskFullOpt::AllowedOnAlmostFull, }, - )), + ))), ) .map_err(|_| Error::RegionNotFound(target_id)) } @@ -5029,11 +5096,11 @@ where } if let Err(e) = self.ctx.router.force_send( source.get_id(), - PeerMsg::SignificantMsg(SignificantMsg::MergeResult { + PeerMsg::SignificantMsg(Box::new(SignificantMsg::MergeResult { target_region_id: self.fsm.region_id(), target: self.fsm.peer.peer.clone(), result: MergeResultKind::FromTargetLog, - }), + })), ) { panic!( "{} failed to send merge result(FromTargetLog) to source region {}, err {}", @@ -5307,11 +5374,11 @@ where for r in &persist_res.destroy_regions { if let Err(e) = self.ctx.router.force_send( r.get_id(), - PeerMsg::SignificantMsg(SignificantMsg::MergeResult { + PeerMsg::SignificantMsg(Box::new(SignificantMsg::MergeResult { target_region_id: self.fsm.region_id(), target: self.fsm.peer.peer.clone(), result: MergeResultKind::FromTargetSnapshotStep2, - }), + })), ) { panic!( "{} failed to send merge result(FromTargetSnapshotStep2) to source region {}, err {}", diff --git a/components/raftstore/src/store/fsm/store.rs b/components/raftstore/src/store/fsm/store.rs index 08955f07187..8da5743fecb 100644 --- a/components/raftstore/src/store/fsm/store.rs +++ b/components/raftstore/src/store/fsm/store.rs @@ -384,9 +384,7 @@ where let region_id = r.region_id; if let Err(e) = self.router.force_send( region_id, - PeerMsg::ApplyRes { - res: ApplyTaskRes::Apply(r), - }, + PeerMsg::ApplyRes(Box::new(ApplyTaskRes::Apply(r))), ) { error!("failed to send apply result"; "region_id" => region_id, "err" => ?e); } @@ -423,7 +421,7 @@ where heap_size += bytes_capacity(&e.data) + bytes_capacity(&e.context); } let peer_msg = PeerMsg::RaftMessage( - InspectedRaftMessage { heap_size, msg }, + Box::new(InspectedRaftMessage { heap_size, msg }), Some(TiInstant::now()), ); let event = TraceEvent::Add(heap_size); @@ -468,10 +466,10 @@ where cmd: RaftCommand, ) -> std::result::Result<(), TrySendError>> { let region_id = cmd.request.get_header().get_region_id(); - match self.send(region_id, PeerMsg::RaftCommand(cmd)) { + match self.send(region_id, PeerMsg::RaftCommand(Box::new(cmd))) { Ok(()) => Ok(()), - Err(TrySendError::Full(PeerMsg::RaftCommand(cmd))) => Err(TrySendError::Full(cmd)), - Err(TrySendError::Disconnected(PeerMsg::RaftCommand(cmd))) => { + Err(TrySendError::Full(PeerMsg::RaftCommand(box cmd))) => Err(TrySendError::Full(cmd)), + Err(TrySendError::Disconnected(PeerMsg::RaftCommand(box cmd))) => { Err(TrySendError::Disconnected(cmd)) } _ => unreachable!(), @@ -480,7 +478,7 @@ where fn report_unreachable(&self, store_id: u64) { self.broadcast_normal(|| { - PeerMsg::SignificantMsg(SignificantMsg::StoreUnreachable { store_id }) + PeerMsg::SignificantMsg(Box::new(SignificantMsg::StoreUnreachable { store_id })) }); } @@ -491,7 +489,10 @@ where /// Broadcasts resolved result to all regions. 
pub fn report_resolved(&self, store_id: u64, group_id: u64) { self.broadcast_normal(|| { - PeerMsg::SignificantMsg(SignificantMsg::StoreResolved { store_id, group_id }) + PeerMsg::SignificantMsg(Box::new(SignificantMsg::StoreResolved { + store_id, + group_id, + })) }) } @@ -1074,12 +1075,7 @@ impl PollHandler, St fail_point!( "pause_on_peer_destroy_res", peer.peer_id() == 1 - && matches!( - msg, - PeerMsg::ApplyRes { - res: ApplyTaskRes::Destroy { .. }, - } - ), + && matches!(msg, PeerMsg::ApplyRes(box ApplyTaskRes::Destroy { .. })), |_| unreachable!() ); self.peer_msg_buf.push(msg); @@ -1679,7 +1675,9 @@ impl RaftBatchSystem { for region_id in regions { let _ = router_clone.send( region_id, - PeerMsg::CasualMessage(CasualMessage::ForceCompactRaftLogs), + PeerMsg::CasualMessage(Box::new( + CasualMessage::ForceCompactRaftLogs, + )), ); } } @@ -2143,7 +2141,7 @@ impl<'a, EK: KvEngine, ER: RaftEngine, T: Transport> StoreFsmDelegate<'a, EK, ER Ok(CheckMsgStatus::NewPeer) } - fn on_raft_message(&mut self, msg: InspectedRaftMessage) -> Result<()> { + fn on_raft_message(&mut self, msg: Box) -> Result<()> { let (heap_size, forwarded) = (msg.heap_size, Cell::new(false)); defer!(if !forwarded.get() { MEMTRACE_RAFT_MESSAGES.trace(TraceEvent::Sub(heap_size)); @@ -2234,8 +2232,10 @@ impl<'a, EK: KvEngine, ER: RaftEngine, T: Transport> StoreFsmDelegate<'a, EK, ER check_msg_status == CheckMsgStatus::NewPeerFirst, )? { // Peer created, send the message again. - let peer_msg = - PeerMsg::RaftMessage(InspectedRaftMessage { heap_size, msg }, None); + let peer_msg = PeerMsg::RaftMessage( + Box::new(InspectedRaftMessage { heap_size, msg }), + None, + ); if self.ctx.router.send(region_id, peer_msg).is_ok() { forwarded.set(true); } @@ -2258,7 +2258,8 @@ impl<'a, EK: KvEngine, ER: RaftEngine, T: Transport> StoreFsmDelegate<'a, EK, ER store_meta.pending_msgs.push(msg); } else { drop(store_meta); - let peer_msg = PeerMsg::RaftMessage(InspectedRaftMessage { heap_size, msg }, None); + let peer_msg = + PeerMsg::RaftMessage(Box::new(InspectedRaftMessage { heap_size, msg }), None); if let Err(e) = self.ctx.router.force_send(region_id, peer_msg) { warn!("handle first request failed"; "region_id" => region_id, "error" => ?e); } else { @@ -2423,7 +2424,7 @@ impl<'a, EK: KvEngine, ER: RaftEngine, T: Transport> StoreFsmDelegate<'a, EK, ER // region may has been merged/splitted already. let _ = self.ctx.router.force_send( exist_region.get_id(), - PeerMsg::CasualMessage(CasualMessage::RegionOverlapped), + PeerMsg::CasualMessage(Box::new(CasualMessage::RegionOverlapped)), ); } } @@ -2438,11 +2439,11 @@ impl<'a, EK: KvEngine, ER: RaftEngine, T: Transport> StoreFsmDelegate<'a, EK, ER .router .force_send( id, - PeerMsg::SignificantMsg(SignificantMsg::MergeResult { + PeerMsg::SignificantMsg(Box::new(SignificantMsg::MergeResult { target_region_id: region_id, target: target.clone(), result: MergeResultKind::Stale, - }), + })), ) .unwrap(); } @@ -2510,9 +2511,9 @@ impl<'a, EK: KvEngine, ER: RaftEngine, T: Transport> StoreFsmDelegate<'a, EK, ER for (region_id, declined_bytes) in region_declined_bytes.drain(..) 
{ let _ = self.ctx.router.send( region_id, - PeerMsg::CasualMessage(CasualMessage::CompactionDeclinedBytes { + PeerMsg::CasualMessage(Box::new(CasualMessage::CompactionDeclinedBytes { bytes: declined_bytes, - }), + })), ); } } @@ -3201,7 +3202,7 @@ impl<'a, EK: KvEngine, ER: RaftEngine, T: Transport> StoreFsmDelegate<'a, EK, ER let _ = self.ctx.router.send( target_region_id, - PeerMsg::RaftCommand(RaftCommand::new(request, Callback::None)), + PeerMsg::RaftCommand(Box::new(RaftCommand::new(request, Callback::None))), ); } @@ -3239,7 +3240,7 @@ impl<'a, EK: KvEngine, ER: RaftEngine, T: Transport> StoreFsmDelegate<'a, EK, ER for region_id in regions { let _ = self.ctx.router.send( region_id, - PeerMsg::CasualMessage(CasualMessage::ClearRegionSize), + PeerMsg::CasualMessage(Box::new(CasualMessage::ClearRegionSize)), ); } } diff --git a/components/raftstore/src/store/memory.rs b/components/raftstore/src/store/memory.rs index b1f40d93715..c1ad65392a8 100644 --- a/components/raftstore/src/store/memory.rs +++ b/components/raftstore/src/store/memory.rs @@ -9,7 +9,7 @@ use tikv_alloc::{ mem_trace, trace::{Id, MemoryTrace}, }; -use tikv_util::sys::memory_usage_reaches_high_water; +use tikv_util::sys::memory_usage_reaches_near_high_water; lazy_static! { pub static ref MEMTRACE_ROOT: Arc = mem_trace!( @@ -57,18 +57,28 @@ lazy_static! { MEMTRACE_ROOT.sub_trace(Id::Name("raft_entries")); } +pub fn get_memory_usage_entry_cache() -> u64 { + (|| { + fail_point!("mock_memory_usage_entry_cache", |t| { + t.unwrap().parse::().unwrap() + }); + MEMTRACE_ENTRY_CACHE.sum() as u64 + })() +} + pub fn needs_evict_entry_cache(evict_cache_on_memory_ratio: f64) -> bool { fail_point!("needs_evict_entry_cache", |_| true); + if evict_cache_on_memory_ratio < f64::EPSILON { return false; } let mut usage = 0; - if memory_usage_reaches_high_water(&mut usage) { - let ec_usage = MEMTRACE_ENTRY_CACHE.sum() as u64; - if ec_usage as f64 > usage as f64 * evict_cache_on_memory_ratio { - return true; - } + let is_near = memory_usage_reaches_near_high_water(&mut usage); + if !is_near { + return false; } - false + + let ec_usage = get_memory_usage_entry_cache(); + ec_usage as f64 > usage as f64 * evict_cache_on_memory_ratio } diff --git a/components/raftstore/src/store/mod.rs b/components/raftstore/src/store/mod.rs index 971c9038594..2c1f69d8eb4 100644 --- a/components/raftstore/src/store/mod.rs +++ b/components/raftstore/src/store/mod.rs @@ -71,7 +71,7 @@ pub use self::{ replication_mode::{GlobalReplicationState, StoreGroup}, snap::{ check_abort, copy_snapshot, - snap_io::{apply_sst_cf_file, build_sst_cf_file_list}, + snap_io::{apply_sst_cf_files_by_ingest, build_sst_cf_file_list}, ApplyOptions, CfFile, Error as SnapError, SnapEntry, SnapKey, SnapManager, SnapManagerBuilder, Snapshot, SnapshotStatistics, TabletSnapKey, TabletSnapManager, }, diff --git a/components/raftstore/src/store/msg.rs b/components/raftstore/src/store/msg.rs index 21add281570..ba3a30f02a8 100644 --- a/components/raftstore/src/store/msg.rs +++ b/components/raftstore/src/store/msg.rs @@ -13,7 +13,7 @@ use kvproto::{ brpb::CheckAdminResponse, kvrpcpb::{DiskFullOpt, ExtraOp as TxnExtraOp}, metapb, - metapb::RegionEpoch, + metapb::{Region, RegionEpoch}, pdpb::{self, CheckPolicy}, raft_cmdpb::{RaftCmdRequest, RaftCmdResponse}, raft_serverpb::RaftMessage, @@ -649,6 +649,11 @@ pub enum CasualMessage { // Trigger raft to campaign which is used after exiting force leader Campaign, + // Trigger loading pending region for in_memory_engine, + InMemoryEngineLoadRegion { + 
region_id: u64, + trigger_load_cb: Box, + }, } impl fmt::Debug for CasualMessage { @@ -720,6 +725,11 @@ impl fmt::Debug for CasualMessage { peer_id, tombstone ), CasualMessage::Campaign => write!(fmt, "Campaign"), + CasualMessage::InMemoryEngineLoadRegion { region_id, .. } => write!( + fmt, + "[region={}] try load in memory region cache", + region_id + ), } } } @@ -733,7 +743,6 @@ pub struct RaftCmdExtraOpts { /// Raft command is the command that is expected to be proposed by the /// leader of the target raft group. -#[derive(Debug)] pub struct RaftCommand { pub send_time: Instant, pub request: RaftCmdRequest, @@ -741,6 +750,21 @@ pub struct RaftCommand { pub extra_opts: RaftCmdExtraOpts, } +impl fmt::Debug for RaftCommand { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.debug_struct("RaftCommand") + .field("send_time", &self.send_time) + .field("request", &self.request.get_requests().len()) + .field( + "admin_request", + &self.request.get_admin_request().get_cmd_type(), + ) + .field("callback", &self.callback) + .field("extra_opts", &self.extra_opts) + .finish() + } +} + impl RaftCommand { #[inline] pub fn new(request: RaftCmdRequest, callback: Callback) -> RaftCommand { @@ -795,28 +819,25 @@ impl fmt::Debug for InspectedRaftMessage { } /// Message that can be sent to a peer. -#[allow(clippy::large_enum_variant)] #[derive(EnumCount, EnumVariantNames)] pub enum PeerMsg { /// Raft message is the message sent between raft nodes in the same /// raft group. Messages need to be redirected to raftstore if target /// peer doesn't exist. - RaftMessage(InspectedRaftMessage, Option), + RaftMessage(Box, Option), /// Raft command is the command that is expected to be proposed by the /// leader of the target raft group. If it's failed to be sent, callback /// usually needs to be called before dropping in case of resource leak. - RaftCommand(RaftCommand), + RaftCommand(Box>), /// Tick is periodical task. If target peer doesn't exist there is a /// potential that the raft node will not work anymore. Tick(PeerTick), /// Result of applying committed entries. The message can't be lost. - ApplyRes { - res: ApplyTaskRes, - }, + ApplyRes(Box>), /// Message that can't be lost but rarely created. If they are lost, real /// bad things happen like some peers will be considered dead in the /// group. - SignificantMsg(SignificantMsg), + SignificantMsg(Box>), /// Start the FSM. Start, /// A message only used to notify a peer. @@ -826,7 +847,7 @@ pub enum PeerMsg { ready_number: u64, }, /// Message that is not important and can be dropped occasionally. - CasualMessage(CasualMessage), + CasualMessage(Box>), /// Ask region to report a heartbeat to PD. HeartbeatPd, /// Asks region to change replication mode. @@ -849,7 +870,7 @@ impl fmt::Debug for PeerMsg { tick }, PeerMsg::SignificantMsg(msg) => write!(fmt, "{:?}", msg), - PeerMsg::ApplyRes { res } => write!(fmt, "ApplyRes {:?}", res), + PeerMsg::ApplyRes(res) => write!(fmt, "ApplyRes {:?}", res), PeerMsg::Start => write!(fmt, "Startup"), PeerMsg::Noop => write!(fmt, "Noop"), PeerMsg::Persisted { @@ -874,8 +895,8 @@ impl PeerMsg { PeerMsg::RaftMessage(..) => 0, PeerMsg::RaftCommand(_) => 1, PeerMsg::Tick(_) => 2, - PeerMsg::SignificantMsg(_) => 3, - PeerMsg::ApplyRes { .. } => 4, + PeerMsg::ApplyRes { .. } => 3, + PeerMsg::SignificantMsg(_) => 4, PeerMsg::Start => 5, PeerMsg::Noop => 6, PeerMsg::Persisted { .. 
} => 7, @@ -892,7 +913,7 @@ impl PeerMsg { pub fn is_send_failure_ignorable(&self) -> bool { matches!( self, - PeerMsg::SignificantMsg(SignificantMsg::CaptureChange { .. }) + PeerMsg::SignificantMsg(box SignificantMsg::CaptureChange { .. }) ) } } @@ -902,7 +923,7 @@ pub enum StoreMsg where EK: KvEngine, { - RaftMessage(InspectedRaftMessage), + RaftMessage(Box), // Clear region size and keys for all regions in the range, so we can force them to // re-calculate their size later. @@ -1003,3 +1024,18 @@ impl StoreMsg { } } } + +#[cfg(test)] +mod tests { + #[test] + fn test_msg_size() { + use std::mem; + + use engine_rocks::RocksEngine; + + use super::*; + + // make sure the msg is small enough + assert_eq!(mem::size_of::>(), 32); + } +} diff --git a/components/raftstore/src/store/peer.rs b/components/raftstore/src/store/peer.rs index ca702525d15..0f4f25950ff 100644 --- a/components/raftstore/src/store/peer.rs +++ b/components/raftstore/src/store/peer.rs @@ -19,8 +19,8 @@ use bytes::Bytes; use collections::{HashMap, HashSet}; use crossbeam::{atomic::AtomicCell, channel::TrySendError}; use engine_traits::{ - CacheRange, Engines, KvEngine, PerfContext, RaftEngine, Snapshot, SnapshotContext, WriteBatch, - WriteOptions, CF_DEFAULT, CF_LOCK, CF_WRITE, + Engines, KvEngine, PerfContext, RaftEngine, Snapshot, WriteBatch, WriteOptions, CF_DEFAULT, + CF_LOCK, CF_WRITE, }; use error_code::ErrorCodeExt; use fail::fail_point; @@ -66,7 +66,7 @@ use tikv_util::{ Either, }; use time::{Duration as TimeDuration, Timespec}; -use tracker::GLOBAL_TRACKERS; +use tracker::{TrackerTokenArray, GLOBAL_TRACKERS}; use txn_types::{TimeStamp, WriteBatchFlags}; use uuid::Uuid; @@ -90,7 +90,7 @@ use crate::{ RoleChange, }, errors::RAFTSTORE_IS_BUSY, - router::RaftStoreRouter, + router::{RaftStoreRouter, ReadContext}, store::{ async_io::{read::ReadTask, write::WriteMsg, write_router::WriteRouter}, fsm::{ @@ -1700,6 +1700,26 @@ where self.raft_group.mut_store() } + #[inline] + pub fn approximate_size(&self) -> Option { + self.split_check_trigger.approximate_size + } + + #[inline] + pub fn approximate_keys(&self) -> Option { + self.split_check_trigger.approximate_keys + } + + #[inline] + pub fn set_approximate_size(&mut self, approximate_size: Option) { + self.split_check_trigger.approximate_size = approximate_size; + } + + #[inline] + pub fn set_approximate_keys(&mut self, approximate_keys: Option) { + self.split_check_trigger.approximate_keys = approximate_keys; + } + /// Whether the snapshot is handling. /// See the comments of `check_snap_status` for more details. #[inline] @@ -3085,6 +3105,13 @@ where // In this case the apply can be guaranteed to be successful. Invoke the // on_committed callback if necessary. p.cb.invoke_committed(); + + debug!("raft log is committed"; + "req_info" => TrackerTokenArray::new(p.cb.write_trackers() + .into_iter() + .filter_map(|time_tracker| time_tracker.as_tracker_token()) + .collect::>().as_slice()) + ); } p }) @@ -3888,7 +3915,12 @@ where self.should_wake_up = true; } - fn pre_transfer_leader(&mut self, peer: &metapb::Peer) -> bool { + fn pre_transfer_leader( + &mut self, + peer: &metapb::Peer, + extra_msgs: Vec, + ctx: &mut PollContext, + ) -> bool { // Checks if safe to transfer leader. if self.raft_group.raft.has_pending_conf() { info!( @@ -3914,6 +3946,17 @@ where // forbids setting it for MsgTransferLeader messages. 
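The new `test_msg_size` test above pins `PeerMsg` at 32 bytes. A Rust enum is as large as its largest variant, so boxing the bulky payloads (`InspectedRaftMessage`, `RaftCommand`, `ApplyTaskRes`, `SignificantMsg`, `CasualMessage`) keeps every variant pointer-sized, which shrinks the FSM channels and per-message copies. A self-contained sketch of the effect:

// Sketch: boxing a large variant shrinks the whole enum.
use std::mem;

struct Big([u8; 256]);

enum Unboxed {
    Small(u64),
    Large(Big),
}

enum Boxed {
    Small(u64),
    Large(Box<Big>),
}

fn main() {
    // The unboxed enum must reserve room for `Big` in every value.
    assert!(mem::size_of::<Unboxed>() > 256);
    // The boxed enum is just a tag plus a pointer on 64-bit targets.
    assert!(mem::size_of::<Boxed>() <= 16);
}
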
msg.set_log_term(self.term()); self.raft_group.raft.msgs.push(msg); + + extra_msgs.into_iter().for_each(|extra_msg| { + let mut msg = RaftMessage::default(); + msg.set_region_id(self.region_id); + msg.set_from_peer(self.peer.clone()); + msg.set_to_peer(peer.clone()); + msg.set_region_epoch(self.region().get_region_epoch().clone()); + msg.set_extra_msg(extra_msg); + self.send_raft_messages(ctx, vec![msg]); + }); + true } @@ -4767,27 +4810,29 @@ where /// to do the remaining work. /// /// See also: tikv/rfcs#37. - fn propose_transfer_leader( + fn propose_transfer_leader( &mut self, ctx: &mut PollContext, req: RaftCmdRequest, cb: Callback, ) -> bool { let transfer_leader = get_transfer_leader_cmd(&req).unwrap(); - if let Err(err) = ctx + let extra_msgs = match ctx .coprocessor_host .pre_transfer_leader(self.region(), transfer_leader) { - warn!("Coprocessor rejected transfer leader."; "err" => ?err, + Err(err) => { + warn!("Coprocessor rejected transfer leader."; "err" => ?err, "region_id" => self.region_id, "peer_id" => self.peer.get_id(), "transferee" => transfer_leader.get_peer().get_id()); - let mut resp = RaftCmdResponse::new(); - *resp.mut_header().mut_error() = Error::from(err).into(); - cb.invoke_with_response(resp); - return false; - } - + let mut resp = RaftCmdResponse::new(); + *resp.mut_header().mut_error() = Error::from(err).into(); + cb.invoke_with_response(resp); + return false; + } + Ok(msgs) => msgs, + }; ctx.raft_metrics.propose.transfer_leader.inc(); let prs = self.raft_group.raft.prs(); @@ -4819,7 +4864,7 @@ where let transferred = if peer.id == self.peer.id { false } else { - self.pre_transfer_leader(peer) + self.pre_transfer_leader(peer, extra_msgs, ctx) }; // transfer leader command doesn't need to replicate log and apply, so we @@ -4956,9 +5001,9 @@ where Ok(propose_index) } - fn handle_read>( + fn handle_read( &self, - reader: &mut E, + ctx: &mut PollContext, req: RaftCmdRequest, check_epoch: bool, read_index: Option, @@ -5006,16 +5051,23 @@ where } } - let snap_ctx = if let Ok(read_ts) = decode_u64(&mut req.get_header().get_flag_data()) { - Some(SnapshotContext { - range: Some(CacheRange::from_region(®ion)), - read_ts, - }) + let read_ctx = if let Ok(read_ts) = decode_u64(&mut req.get_header().get_flag_data()) { + ReadContext::new(None, Some(read_ts)) } else { - None + ReadContext::new(None, None) }; - let mut resp = reader.execute(&req, &Arc::new(region), read_index, snap_ctx, None); + let mut reader = PollContextReader { + engines: &ctx.engines, + }; + let mut resp = reader.execute( + &read_ctx, + &req, + &Arc::new(region), + read_index, + None, + &ctx.coprocessor_host, + ); if let Some(snap) = resp.snapshot.as_mut() { snap.txn_ext = Some(self.txn_ext.clone()); snap.bucket_meta = self @@ -5202,7 +5254,7 @@ where // Check disk usages for the peer itself and other peers in the raft group. // The return value indicates whether the proposal is allowed or not. 
- fn check_normal_proposal_with_disk_full_opt( + fn check_normal_proposal_with_disk_full_opt( &mut self, ctx: &mut PollContext, disk_full_opt: DiskFullOpt, @@ -5238,7 +5290,7 @@ where "peer_id" => self.peer.get_id(), "target_peer_id" => p.get_id(), ); - self.pre_transfer_leader(&p); + self.pre_transfer_leader(&p, vec![], ctx); } } } else { @@ -5562,8 +5614,8 @@ where pending_peers: self.collect_pending_peers(ctx), written_bytes: self.peer_stat.written_bytes, written_keys: self.peer_stat.written_keys, - approximate_size: self.split_check_trigger.approximate_size, - approximate_keys: self.split_check_trigger.approximate_keys, + approximate_size: self.approximate_size(), + approximate_keys: self.approximate_keys(), replication_status: self.region_replication_status(ctx), wait_data_peers: self.wait_data_peers.clone(), }); @@ -6000,7 +6052,11 @@ where } } -impl ReadExecutor for PollContext +struct PollContextReader<'a, EK, ER> { + engines: &'a Engines, +} + +impl<'a, EK, ER> ReadExecutor for PollContextReader<'a, EK, ER> where EK: KvEngine, ER: RaftEngine, @@ -6011,12 +6067,8 @@ where &self.engines.kv } - fn get_snapshot( - &mut self, - snap_ctx: Option, - _: &Option>, - ) -> Arc { - Arc::new(self.engines.kv.snapshot(snap_ctx)) + fn get_snapshot(&mut self, _: &Option>) -> Arc { + Arc::new(self.engines.kv.snapshot()) } } diff --git a/components/raftstore/src/store/peer_storage.rs b/components/raftstore/src/store/peer_storage.rs index 428f1a12229..06072c4a05f 100644 --- a/components/raftstore/src/store/peer_storage.rs +++ b/components/raftstore/src/store/peer_storage.rs @@ -377,7 +377,7 @@ where #[inline] pub fn raw_snapshot(&self) -> EK::Snapshot { - self.engines.kv.snapshot(None) + self.engines.kv.snapshot() } #[inline] @@ -1661,7 +1661,7 @@ pub mod tests { .unwrap() .unwrap(); gen_task.generate_and_schedule_snapshot::( - engines.kv.clone().snapshot(None), + engines.kv.clone().snapshot(), entry.get_term(), apply_state, sched, diff --git a/components/raftstore/src/store/region_snapshot.rs b/components/raftstore/src/store/region_snapshot.rs index 3456d4c3add..b1491444db1 100644 --- a/components/raftstore/src/store/region_snapshot.rs +++ b/components/raftstore/src/store/region_snapshot.rs @@ -2,10 +2,11 @@ // #[PerformanceCriticalPath] use std::{ + fmt, num::NonZeroU64, sync::{ atomic::{AtomicU64, Ordering}, - Arc, + Arc, Mutex, }, }; @@ -23,6 +24,7 @@ use tikv_util::{ }; use crate::{ + coprocessor::ObservedSnapshot, store::{util, PeerStorage, TxnExt}, Error, Result, }; @@ -30,7 +32,6 @@ use crate::{ /// Snapshot of a region. /// /// Only data within a region can be accessed. -#[derive(Debug)] pub struct RegionSnapshot { snap: Arc, region: Arc, @@ -41,6 +42,22 @@ pub struct RegionSnapshot { // `None` means the snapshot does not provide peer related transaction extensions. 
pub txn_ext: Option<Arc<TxnExt>>, pub bucket_meta: Option<Arc<BucketMeta>>, + + observed_snap: Option<Arc<Mutex<Option<Box<dyn ObservedSnapshot>>>>>, +} + +impl<S: Snapshot> fmt::Debug for RegionSnapshot<S> { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.debug_struct("RegionSnapshot") + .field("region", &self.region) + .field("apply_index", &self.apply_index) + .field("from_v2", &self.from_v2) + .field("term", &self.term) + .field("txn_extra_op", &self.txn_extra_op) + .field("txn_ext", &self.txn_ext) + .field("bucket_meta", &self.bucket_meta) + .finish() + } } impl<S> RegionSnapshot<S> where @@ -59,7 +76,7 @@ where where EK: KvEngine, { - RegionSnapshot::from_snapshot(Arc::new(db.snapshot(None)), Arc::new(region)) + RegionSnapshot::from_snapshot(Arc::new(db.snapshot()), Arc::new(region)) } pub fn from_snapshot(snap: Arc<S>, region: Arc<Region>) -> RegionSnapshot<S> { @@ -74,6 +91,43 @@ where txn_extra_op: TxnExtraOp::Noop, txn_ext: None, bucket_meta: None, + observed_snap: None, + } + } + + pub fn set_observed_snapshot(&mut self, observed_snap: Box<dyn ObservedSnapshot>) { + self.observed_snap = Some(Arc::new(Mutex::new(Some(observed_snap)))); + } + + /// Replaces the underlying snapshot with one built by `snap_fn` from the + /// current snapshot and its observed snapshot. + /// + /// One use case is to allow callers to build a `RegionSnapshot` with an + /// optimized snapshot. See `RaftKv::async_in_memory_snapshot` for an example. + /// + /// # Panics + /// + /// Panics if the snapshot has been cloned before `replace_snapshot` is + /// called. If `snap_fn` panics, the panic is propagated to the caller. + pub fn replace_snapshot<F, Sp>(mut self, snap_fn: F) -> RegionSnapshot<Sp> + where + Sp: Snapshot, + F: FnOnce(S, Option<Box<dyn ObservedSnapshot>>) -> Sp, + { + let mut observed = None; + if let Some(observed_snap) = self.observed_snap.take() { + observed = observed_snap.lock().unwrap().take(); + } + let inner = Arc::into_inner(self.snap).unwrap(); + RegionSnapshot { + snap: Arc::new(snap_fn(inner, observed)), + region: self.region, + apply_index: self.apply_index, + from_v2: self.from_v2, + term: self.term, + txn_extra_op: self.txn_extra_op, + txn_ext: self.txn_ext, + bucket_meta: self.bucket_meta, + observed_snap: None, + } } @@ -175,11 +229,6 @@ where pub fn get_end_key(&self) -> &[u8] { self.region.get_end_key() } - - #[cfg(test)] - pub fn snap(&self) -> Arc<S> { - self.snap.clone() - } } impl<S> Clone for RegionSnapshot<S> where @@ -196,6 +245,7 @@ where txn_extra_op: self.txn_extra_op, txn_ext: self.txn_ext.clone(), bucket_meta: self.bucket_meta.clone(), + observed_snap: self.observed_snap.clone(), } } } diff --git a/components/raftstore/src/store/simple_write.rs b/components/raftstore/src/store/simple_write.rs index 9c3f9611675..6f2901402a6 100644 --- a/components/raftstore/src/store/simple_write.rs +++ b/components/raftstore/src/store/simple_write.rs @@ -231,7 +231,7 @@ pub struct SimpleWriteReqDecoder<'a> { impl<'a> SimpleWriteReqDecoder<'a> { pub fn new( fallback: impl FnOnce(&'a [u8], u64, u64) -> RaftCmdRequest, - logger: &Logger, + logger: Option<&Logger>, buf: &'a [u8], index: u64, term: u64, @@ -241,13 +241,22 @@ impl<'a> SimpleWriteReqDecoder<'a> { let mut is = CodedInputStream::from_bytes(&buf[1..]); let header = match is.read_message() { Ok(h) => h, - Err(e) => slog_panic!( - logger, - "data corrupted"; - "term" => term, - "index" => index, - "error" => ?e - ), + Err(e) => { + if let Some(logger) = logger { + slog_panic!( + logger, + "data corrupted"; + "term" => term, + "index" => index, + "error" => ?e + ) + } else { + panic!( + "data corrupted term: {}, index: {}, error {:?}", + term, index, e + ) + } + } }; let read = is.pos(); Ok(SimpleWriteReqDecoder { @@ -560,7 +569,7 @@ mod tests { let (bytes,
_) = req_encoder.encode(); let logger = slog_global::borrow_global().new(o!()); let mut decoder = - SimpleWriteReqDecoder::new(decoder_fallback, &logger, &bytes, 0, 0).unwrap(); + SimpleWriteReqDecoder::new(decoder_fallback, Some(&logger), &bytes, 0, 0).unwrap(); assert_eq!(*decoder.header(), *header); let write = decoder.next().unwrap(); let SimpleWrite::Put(put) = write else { @@ -579,7 +588,8 @@ mod tests { assert_matches!(decoder.next(), None); let (bytes, _) = req_encoder2.encode(); - decoder = SimpleWriteReqDecoder::new(decoder_fallback, &logger, &bytes, 0, 0).unwrap(); + decoder = + SimpleWriteReqDecoder::new(decoder_fallback, Some(&logger), &bytes, 0, 0).unwrap(); let write = decoder.next().unwrap(); let SimpleWrite::DeleteRange(dr) = write else { panic!("should be delete range") @@ -615,7 +625,7 @@ mod tests { SimpleWriteReqEncoder::>::new(header, bin, 0); let (bytes, _) = req_encoder.encode(); let mut decoder = - SimpleWriteReqDecoder::new(decoder_fallback, &logger, &bytes, 0, 0).unwrap(); + SimpleWriteReqDecoder::new(decoder_fallback, Some(&logger), &bytes, 0, 0).unwrap(); let write = decoder.next().unwrap(); let SimpleWrite::Ingest(ssts) = write else { panic!("should be ingest") @@ -661,7 +671,7 @@ mod tests { let bytes = raft_cmd.write_to_bytes().unwrap(); let logger = slog_global::borrow_global().new(o!()); let decoded = - SimpleWriteReqDecoder::new(decoder_fallback, &logger, &bytes, 0, 0).unwrap_err(); + SimpleWriteReqDecoder::new(decoder_fallback, Some(&logger), &bytes, 0, 0).unwrap_err(); // SimpleWriteReqDecoder should be able to decode naive RaftCmdRequest. assert_eq!(decoded, raft_cmd); @@ -703,7 +713,7 @@ mod tests { let (bytes, _) = req_encoder.encode(); let mut decoder = - SimpleWriteReqDecoder::new(decoder_fallback, &logger, &bytes, 0, 0).unwrap(); + SimpleWriteReqDecoder::new(decoder_fallback, Some(&logger), &bytes, 0, 0).unwrap(); assert_eq!(*decoder.header(), *header); let req = decoder.next().unwrap(); let SimpleWrite::Put(put) = req else { @@ -732,7 +742,7 @@ mod tests { let (bin, _) = req_encoder.encode(); assert_eq!( header.as_ref(), - SimpleWriteReqDecoder::new(decoder_fallback, &logger, &bin, 0, 0) + SimpleWriteReqDecoder::new(decoder_fallback, Some(&logger), &bin, 0, 0) .unwrap() .to_raft_cmd_request() .get_header(), @@ -747,7 +757,7 @@ mod tests { 512, ); let (bin, _) = req_encoder.encode(); - let req = SimpleWriteReqDecoder::new(decoder_fallback, &logger, &bin, 0, 0) + let req = SimpleWriteReqDecoder::new(decoder_fallback, Some(&logger), &bin, 0, 0) .unwrap() .to_raft_cmd_request(); assert_eq!(req.get_requests().len(), 1); @@ -764,7 +774,7 @@ mod tests { 512, ); let (bin, _) = req_encoder.encode(); - let req = SimpleWriteReqDecoder::new(decoder_fallback, &logger, &bin, 0, 0) + let req = SimpleWriteReqDecoder::new(decoder_fallback, Some(&logger), &bin, 0, 0) .unwrap() .to_raft_cmd_request(); assert_eq!(req.get_requests().len(), 1); @@ -780,7 +790,7 @@ mod tests { 512, ); let (bin, _) = req_encoder.encode(); - let req = SimpleWriteReqDecoder::new(decoder_fallback, &logger, &bin, 0, 0) + let req = SimpleWriteReqDecoder::new(decoder_fallback, Some(&logger), &bin, 0, 0) .unwrap() .to_raft_cmd_request(); assert_eq!(req.get_requests().len(), 1); @@ -807,7 +817,7 @@ mod tests { 512, ); let (bin, _) = req_encoder.encode(); - let req = SimpleWriteReqDecoder::new(decoder_fallback, &logger, &bin, 0, 0) + let req = SimpleWriteReqDecoder::new(decoder_fallback, Some(&logger), &bin, 0, 0) .unwrap() .to_raft_cmd_request(); assert_eq!(req.get_requests().len(), 5); diff 
--git a/components/raftstore/src/store/snap.rs b/components/raftstore/src/store/snap.rs index f19b730e55f..8516703de57 100644 --- a/components/raftstore/src/store/snap.rs +++ b/components/raftstore/src/store/snap.rs @@ -37,7 +37,9 @@ use protobuf::Message; use raft::eraftpb::Snapshot as RaftSnapshot; use thiserror::Error; use tikv_util::{ - box_err, box_try, debug, error, info, + box_err, box_try, + config::ReadableSize, + debug, error, info, time::{duration_to_sec, Instant, Limiter, UnixSecs}, warn, HandyRwLock, }; @@ -1110,6 +1112,9 @@ impl Snapshot { } pub fn apply(&mut self, options: ApplyOptions) -> Result<()> { + let apply_without_ingest = self + .mgr + .can_apply_cf_without_ingest(self.total_size(), self.total_count()); let post_check = |cf_file: &CfFile, offset: usize| { if !plain_file_used(cf_file.cf) { let file_paths = cf_file.file_paths(); @@ -1136,27 +1141,27 @@ impl Snapshot { let abort_checker = ApplyAbortChecker(options.abort); let coprocessor_host = options.coprocessor_host; let region = options.region; - let key_mgr = self.mgr.encryption_key_manager.as_ref(); + let key_mgr = self.mgr.encryption_key_manager.clone(); + let batch_size = options.write_batch_size; for cf_file in &mut self.cf_files { if cf_file.size.is_empty() { // Skip empty cf file. continue; } let cf = cf_file.cf; + let mut cb = |kv: &[(Vec, Vec)]| { + coprocessor_host.post_apply_plain_kvs_from_snapshot(®ion, cf, kv) + }; if plain_file_used(cf_file.cf) { let path = &cf_file.file_paths()[0]; - let batch_size = options.write_batch_size; - let cb = |kv: &[(Vec, Vec)]| { - coprocessor_host.post_apply_plain_kvs_from_snapshot(®ion, cf, kv) - }; snap_io::apply_plain_cf_file( path, - key_mgr, + key_mgr.as_ref(), &abort_checker, &options.db, cf, batch_size, - cb, + &mut cb, )?; } else { let path = cf_file.path.to_str().unwrap(); // path is not used at all @@ -1165,8 +1170,22 @@ impl Snapshot { .iter() .map(|s| s.as_str()) .collect::>(); - snap_io::apply_sst_cf_file(clone_files.as_slice(), &options.db, cf)?; - coprocessor_host.post_apply_sst_from_snapshot(®ion, cf, path); + if apply_without_ingest { + // Apply the snapshot without ingest, to accelerate the applying process. + snap_io::apply_sst_cf_files_without_ingest( + clone_files.as_slice(), + &options.db, + cf, + key_mgr.clone(), + &abort_checker, + batch_size, + &mut cb, + )?; + } else { + // Apply the snapshot by ingest. + snap_io::apply_sst_cf_files_by_ingest(clone_files.as_slice(), &options.db, cf)?; + coprocessor_host.post_apply_sst_from_snapshot(®ion, cf, path); + } } } Ok(()) @@ -1440,6 +1459,9 @@ struct SnapManagerCore { max_per_file_size: Arc, enable_multi_snapshot_files: Arc, stats: Arc>>, + // Minimal column family size & kv counts for applying by ingest. + min_ingest_cf_size: u64, + min_ingest_cf_kvs: u64, } /// `SnapManagerCore` trace all current processing snapshots. 
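
The two new `SnapManagerCore` fields feed `can_apply_cf_without_ingest` (next hunk): a column family below both thresholds is replayed through an ordinary write batch instead of SST ingestion, avoiding ingest-induced stalls for small snapshots. A worked sketch of the heuristic with illustrative numbers (not TiKV defaults):

// Sketch of the threshold check; a zero size limit disables the
// direct-write path entirely.
fn can_apply_without_ingest(min_size: u64, min_kvs: u64, cf_size: u64, cf_kvs: u64) -> bool {
    if min_size == 0 {
        return false;
    }
    cf_size <= min_size && cf_kvs <= min_kvs
}

fn main() {
    // Mirroring the builder's rule kvs = max(10000, MiB * 10000),
    // a 2 MiB size limit implies a 20000-key limit.
    let (min_size, min_kvs) = (2u64 << 20, 20_000u64);
    assert!(can_apply_without_ingest(min_size, min_kvs, 1 << 20, 9_000));
    assert!(!can_apply_without_ingest(min_size, min_kvs, 4 << 20, 9_000));
    assert!(!can_apply_without_ingest(0, 0, 1, 1)); // feature disabled
}
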
@@ -1773,6 +1795,11 @@ impl SnapManager { self.core.recv_concurrency_limiter.set_limit(limit); } + pub fn set_min_ingest_cf_limit(&mut self, bytes: ReadableSize) { + self.core.min_ingest_cf_size = bytes.0; + self.core.min_ingest_cf_kvs = std::cmp::max(10000, (bytes.as_mb_f64() * 10000.0) as u64); + } + pub fn collect_stat(&self, snap: SnapshotStat) { debug!( "collect snapshot stat"; @@ -1995,6 +2022,16 @@ impl SnapManagerCore { } u64::MAX } + + pub fn can_apply_cf_without_ingest(&self, cf_size: u64, cf_kvs: u64) -> bool { + if self.min_ingest_cf_size == 0 { + return false; + } + // If the size and the key count of the cf are relatively small, it's + // better to write it into the kvdb directly rather than ingest it, to + // mitigate the performance issue caused by ingesting snapshots. + cf_size <= self.min_ingest_cf_size && cf_kvs <= self.min_ingest_cf_kvs + } } /// `SnapRecvConcurrencyLimiter` enforces a limit on the number of simultaneous @@ -2100,6 +2137,8 @@ pub struct SnapManagerBuilder { enable_receive_tablet_snapshot: bool, key_manager: Option<Arc<DataKeyManager>>, concurrent_recv_snap_limit: usize, + min_ingest_snapshot_size: u64, + min_ingest_snapshot_kvs: u64, } impl SnapManagerBuilder { @@ -2132,6 +2171,13 @@ impl SnapManagerBuilder { self.enable_receive_tablet_snapshot = enabled; self } + pub fn min_ingest_snapshot_limit(mut self, bytes: ReadableSize) -> SnapManagerBuilder { + self.min_ingest_snapshot_size = bytes.0; + // Keeps the same assumption as the region size estimation: "the average + // size of a KV pair is 100 B". So it calculates the kv count as + // `bytes / MiB * 10000`. + self.min_ingest_snapshot_kvs = std::cmp::max(10000, (bytes.as_mb_f64() * 10000.0) as u64); + self + } #[must_use] pub fn encryption_key_manager(mut self, m: Option<Arc<DataKeyManager>>) -> SnapManagerBuilder { self.key_manager = m; @@ -2174,6 +2220,8 @@ impl SnapManagerBuilder { self.enable_multi_snapshot_files, )), stats: Default::default(), + min_ingest_cf_size: self.min_ingest_snapshot_size, + min_ingest_cf_kvs: self.min_ingest_snapshot_kvs, }, max_total_size: Arc::new(AtomicU64::new(max_total_size)), tablet_snap_manager, @@ -2482,9 +2530,6 @@ pub mod tests { use tikv_util::time::Limiter; use super::*; - // ApplyOptions, SnapEntry, SnapKey, SnapManager, SnapManagerBuilder, SnapManagerCore, - // Snapshot, SnapshotStatistics, META_FILE_SUFFIX, SNAPSHOT_CFS, SNAP_GEN_PREFIX, - // }; use crate::{ coprocessor::CoprocessorHost, store::{peer_storage::JOB_STATUS_RUNNING, INIT_EPOCH_CONF_VER, INIT_EPOCH_VER}, @@ -2664,6 +2709,8 @@ pub mod tests { max_per_file_size: Arc::new(AtomicU64::new(max_per_file_size)), enable_multi_snapshot_files: Arc::new(AtomicBool::new(true)), stats: Default::default(), + min_ingest_cf_size: 0, + min_ingest_cf_kvs: 0, } } @@ -2784,7 +2831,7 @@ pub mod tests { .tempdir() .unwrap(); let db = get_db(src_db_dir.path(), None, None).unwrap(); - let snapshot = db.snapshot(None); + let snapshot = db.snapshot(); let src_dir = Builder::new() .prefix("test-snap-file-db-src") .tempdir() .unwrap(); @@ -2892,7 +2939,7 @@ pub mod tests { .tempdir() .unwrap(); let db = get_db(db_dir.path(), None, None).unwrap(); - let snapshot = db.snapshot(None); + let snapshot = db.snapshot(); let dir = Builder::new() .prefix("test-snap-validation") .tempdir() .unwrap(); @@ -3025,7 +3072,7 @@ pub mod tests { .tempdir() .unwrap(); let db: KvTestEngine = open_test_db(db_dir.path(), None, None).unwrap(); - let snapshot = db.snapshot(None); + let snapshot = db.snapshot(); let dir = Builder::new() .prefix("test-snap-corruption") .tempdir() .unwrap(); @@ -3084,7 +3131,7 @@ pub mod tests { .tempdir() .unwrap(); let db: KvTestEngine =
open_test_db_with_100keys(db_dir.path(), None, None).unwrap(); - let snapshot = db.snapshot(None); + let snapshot = db.snapshot(); let dir = Builder::new() .prefix("test-snap-corruption-meta") @@ -3165,7 +3212,7 @@ pub mod tests { .tempdir() .unwrap(); let db: KvTestEngine = open_test_db(db_dir.path(), None, None).unwrap(); - let snapshot = db.snapshot(None); + let snapshot = db.snapshot(); let key1 = SnapKey::new(1, 1, 1); let mgr_core = create_manager_core(&path, u64::MAX); let mut s1 = Snapshot::new_for_building(&path, &key1, &mgr_core).unwrap(); @@ -3236,7 +3283,7 @@ pub mod tests { .tempdir() .unwrap(); let db: KvTestEngine = open_test_db(src_db_dir.path(), None, None).unwrap(); - let snapshot = db.snapshot(None); + let snapshot = db.snapshot(); let key = SnapKey::new(1, 1, 1); let region = gen_test_region(1, 1, 1); @@ -3318,7 +3365,7 @@ pub mod tests { .max_total_size(max_total_size) .build::<_>(snapfiles_path.path().to_str().unwrap()); snap_mgr.init().unwrap(); - let snapshot = engine.kv.snapshot(None); + let snapshot = engine.kv.snapshot(); // Add an oldest snapshot for receiving. let recv_key = SnapKey::new(100, 100, 100); @@ -3443,7 +3490,7 @@ pub mod tests { .tempdir() .unwrap(); let db: KvTestEngine = open_test_db(kv_dir.path(), None, None).unwrap(); - let snapshot = db.snapshot(None); + let snapshot = db.snapshot(); let key = SnapKey::new(1, 1, 1); let region = gen_test_region(1, 1, 1); diff --git a/components/raftstore/src/store/snap/io.rs b/components/raftstore/src/store/snap/io.rs index 641afb3ad36..2962dd903e6 100644 --- a/components/raftstore/src/store/snap/io.rs +++ b/components/raftstore/src/store/snap/io.rs @@ -2,7 +2,6 @@ use std::{ cell::RefCell, fs, - fs::{File, OpenOptions}, io::{self, BufReader, Read, Write}, sync::Arc, usize, @@ -10,10 +9,11 @@ use std::{ use encryption::{DataKeyManager, DecrypterReader, EncrypterWriter, Iv}; use engine_traits::{ - CfName, Error as EngineError, Iterable, KvEngine, Mutable, SstCompressionType, SstReader, - SstWriter, SstWriterBuilder, WriteBatch, + CfName, Error as EngineError, IterOptions, Iterable, Iterator, KvEngine, Mutable, RefIterable, + SstCompressionType, SstReader, SstWriter, SstWriterBuilder, WriteBatch, }; use fail::fail_point; +use file_system::{File, OpenOptions}; use kvproto::encryptionpb::EncryptionMethod; use tikv_util::{ box_try, @@ -238,6 +238,9 @@ where /// Apply the given snapshot file into a column family. `callback` will be /// invoked after each batch of key value pairs written to db. +/// +/// Attention, callers should manually flush and sync the column family after +/// applying all sst files to make sure the data durability. pub fn apply_plain_cf_file( path: &str, key_mgr: Option<&Arc>, @@ -245,7 +248,7 @@ pub fn apply_plain_cf_file( db: &E, cf: &str, batch_size: usize, - mut callback: F, + callback: &mut F, ) -> Result<(), Error> where E: KvEngine, @@ -295,13 +298,13 @@ where } } -pub fn apply_sst_cf_file(files: &[&str], db: &E, cf: &str) -> Result<(), Error> +pub fn apply_sst_cf_files_by_ingest(files: &[&str], db: &E, cf: &str) -> Result<(), Error> where E: KvEngine, { if files.len() > 1 { info!( - "apply_sst_cf_file starts on cf {}. All files {:?}", + "apply_sst_cf_files_by_ingest starts on cf {}. 
+ fn create_sst_file_writer<E>(engine: &E, cf: CfName, path: &str) -> Result<E::SstWriter, Error> where E: KvEngine, @@ -378,7 +468,7 @@ mod tests { .unwrap(); let db1: KvTestEngine = open_test_empty_db(dir1.path(), db_opt, None).unwrap(); - let snap = db.snapshot(None); + let snap = db.snapshot(); for cf in SNAPSHOT_CFS { let snap_cf_dir = Builder::new().prefix("test-snap-cf").tempdir().unwrap(); let mut cf_file = CfFile { @@ -406,11 +496,19 @@ mod tests { let detector = TestStaleDetector {}; let tmp_file_path = &cf_file.tmp_file_paths()[0]; - apply_plain_cf_file(tmp_file_path, None, &detector, &db1, cf, 16, |v| { - v.iter() - .cloned() - .for_each(|pair| applied_keys.entry(cf).or_default().push(pair)) - }) + apply_plain_cf_file( + tmp_file_path, + None, + &detector, + &db1, + cf, + 16, + &mut |v: &[(Vec<u8>, Vec<u8>)]| { + v.iter() + .cloned() + .for_each(|pair| applied_keys.entry(cf).or_default().push(pair)) + }, + ) .unwrap(); } @@ -462,7 +560,7 @@ mod tests { let stats = build_sst_cf_file_list::<KvTestEngine>( &mut cf_file, &db, - &db.snapshot(None), + &db.snapshot(), &keys::data_key(b"a"), &keys::data_key(b"z"), *max_file_size, @@ -498,7 +596,7 @@ mod tests { .iter() .map(|s| s.as_str()) .collect::<Vec<_>>(); - apply_sst_cf_file(&tmp_file_paths, &db1, CF_DEFAULT).unwrap(); + apply_sst_cf_files_by_ingest(&tmp_file_paths, &db1, CF_DEFAULT).unwrap(); assert_eq_db(&db, &db1); } } diff --git
a/components/raftstore/src/store/snapshot_backup.rs b/components/raftstore/src/store/snapshot_backup.rs index 9168e974fc2..7643df09edb 100644 --- a/components/raftstore/src/store/snapshot_backup.rs +++ b/components/raftstore/src/store/snapshot_backup.rs @@ -10,7 +10,10 @@ use std::{ use engine_traits::{KvEngine, RaftEngine}; use futures::channel::mpsc::UnboundedSender; -use kvproto::{brpb::CheckAdminResponse, metapb::RegionEpoch, raft_cmdpb::AdminCmdType}; +use kvproto::{ + brpb::CheckAdminResponse, metapb::RegionEpoch, raft_cmdpb::AdminCmdType, + raft_serverpb::ExtraMessage, +}; use tikv_util::{info, warn}; use tokio::sync::oneshot; @@ -75,7 +78,7 @@ impl<EK: KvEngine, ER: RaftEngine> SnapshotBrHandle for Arc<Mutex<RaftRouter<EK, ER>>> fn broadcast_wait_apply(&self, req: SnapshotBrWaitApplyRequest) -> crate::Result<()> { let msg_gen = || { metrics::SNAP_BR_WAIT_APPLY_EVENT.sent.inc(); - PeerMsg::SignificantMsg(SignificantMsg::SnapshotBrWaitApply(req.clone())) + PeerMsg::SignificantMsg(Box::new(SignificantMsg::SnapshotBrWaitApply(req.clone()))) }; self.lock().unwrap().broadcast_normal(msg_gen); Ok(()) } @@ -86,7 +89,7 @@ impl<EK: KvEngine, ER: RaftEngine> SnapshotBrHandle for Arc<Mutex<RaftRouter<EK, ER>>> fn broadcast_check_pending_admin( &self, tx: UnboundedSender<CheckAdminResponse>, ) -> crate::Result<()> { self.lock().unwrap().broadcast_normal(|| { - PeerMsg::SignificantMsg(SignificantMsg::CheckPendingAdmin(tx.clone())) + PeerMsg::SignificantMsg(Box::new(SignificantMsg::CheckPendingAdmin(tx.clone()))) }); Ok(()) } @@ -263,9 +266,9 @@ impl AdminObserver for Arc { fn pre_transfer_leader( &self, _ctx: &mut crate::coprocessor::ObserverContext<'_>, _tr: &kvproto::raft_cmdpb::TransferLeaderRequest, - ) -> crate::coprocessor::Result<()> { + ) -> crate::coprocessor::Result<Option<ExtraMessage>> { if self.allowed() { - return Ok(()); + return Ok(None); } metrics::SNAP_BR_SUSPEND_COMMAND_TYPE .with_label_values(&["TransferLeader"]) diff --git a/components/raftstore/src/store/transport.rs b/components/raftstore/src/store/transport.rs index 2ca19fbe5fe..35761aa5d18 100644 --- a/components/raftstore/src/store/transport.rs +++ b/components/raftstore/src/store/transport.rs @@ -78,7 +78,10 @@ where { #[inline] fn send(&self, region_id: u64, msg: CasualMessage<EK>) -> Result<()> { - match self.router.send(region_id, PeerMsg::CasualMessage(msg)) { + match self + .router + .send(region_id, PeerMsg::CasualMessage(Box::new(msg))) + { Ok(()) => Ok(()), Err(TrySendError::Full(_)) => Err(Error::Transport(DiscardReason::Full)), Err(TrySendError::Disconnected(_)) => Err(Error::RegionNotFound(region_id)), @@ -102,7 +105,7 @@ where fn significant_send(&self, region_id: u64, msg: SignificantMsg<EK::Snapshot>) -> Result<()> { if let Err(SendError(msg)) = self .router - .force_send(region_id, PeerMsg::SignificantMsg(msg)) + .force_send(region_id, PeerMsg::SignificantMsg(Box::new(msg))) { // TODO: panic here once we can detect system is shutting down reliably.
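// [Editor's note] A hedged aside on the Box::new(..) wrappers introduced
// above: a Rust enum is as large as its largest variant, so boxing the bulky
// CasualMessage/SignificantMsg payloads keeps every PeerMsg value small and
// cheap to move through the router's channels. A minimal, self-contained
// illustration (the types are stand-ins, not TiKV's):
enum UnboxedMsg {
    Tick(u8),
    Bulky([u8; 256]),
}

enum BoxedMsg {
    Tick(u8),
    Bulky(Box<[u8; 256]>),
}

fn enum_size_demo() {
    // The unboxed enum must reserve space for its largest variant.
    assert!(std::mem::size_of::<UnboxedMsg>() >= 257);
    // Boxing the payload shrinks every value to roughly tag + pointer.
    assert!(std::mem::size_of::<BoxedMsg>() < std::mem::size_of::<UnboxedMsg>());
}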
diff --git a/components/raftstore/src/store/txn_ext.rs b/components/raftstore/src/store/txn_ext.rs index 120cc87b349..e791b520c05 100644 --- a/components/raftstore/src/store/txn_ext.rs +++ b/components/raftstore/src/store/txn_ext.rs @@ -49,19 +49,13 @@ impl fmt::Debug for TxnExt { } lazy_static! { - pub static ref GLOBAL_MEM_SIZE: IntGauge = register_int_gauge!( + pub static ref INSTANCE_MEM_SIZE: IntGauge = register_int_gauge!( "tikv_pessimistic_lock_memory_size", "Total memory size of pessimistic locks in bytes." ) .unwrap(); } -const GLOBAL_MEM_SIZE_LIMIT: usize = 100 << 20; // 100 MiB - -// 512 KiB, so pessimistic locks in one region can be proposed in a single -// command. -const PEER_MEM_SIZE_LIMIT: usize = 512 << 10; - /// Pessimistic locks of a region peer. #[derive(PartialEq)] pub struct PeerPessimisticLocks { @@ -156,7 +150,12 @@ impl PeerPessimisticLocks { /// Inserts pessimistic locks into the map. /// /// Returns whether the operation succeeds. - pub fn insert<P: PessimisticLockPair>(&mut self, pairs: Vec<P>) -> Result<(), Vec<P>> { + pub fn insert<P: PessimisticLockPair>( + &mut self, + pairs: Vec<P>, + peer_mem_size_limit: usize, + instance_mem_size_limit: usize, + ) -> Result<(), Vec<P>> { let mut incr = 0; // Pre-check the memory limit of pessimistic locks. for pair in &pairs { @@ -168,8 +167,8 @@ impl PeerPessimisticLocks { incr += key.len() + lock.memory_size(); } } - if self.memory_size + incr > PEER_MEM_SIZE_LIMIT - || GLOBAL_MEM_SIZE.get() as usize + incr > GLOBAL_MEM_SIZE_LIMIT + if self.memory_size + incr > peer_mem_size_limit + || INSTANCE_MEM_SIZE.get() as usize + incr > instance_mem_size_limit { return Err(pairs); } @@ -179,7 +178,7 @@ impl PeerPessimisticLocks { self.map.insert(key, (lock, false)); } self.memory_size += incr; - GLOBAL_MEM_SIZE.add(incr as i64); + INSTANCE_MEM_SIZE.add(incr as i64); Ok(()) } @@ -187,13 +186,13 @@ impl PeerPessimisticLocks { if let Some((lock, _)) = self.map.remove(key) { let desc = key.len() + lock.memory_size(); self.memory_size -= desc; - GLOBAL_MEM_SIZE.sub(desc as i64); + INSTANCE_MEM_SIZE.sub(desc as i64); } } pub fn clear(&mut self) { self.map = BTreeMap::default(); - GLOBAL_MEM_SIZE.sub(self.memory_size as i64); + INSTANCE_MEM_SIZE.sub(self.memory_size as i64); self.memory_size = 0; } @@ -322,7 +321,7 @@ impl<'a> IntoIterator for &'a PeerPessimisticLocks { impl Drop for PeerPessimisticLocks { fn drop(&mut self) { - GLOBAL_MEM_SIZE.sub(self.memory_size as i64); + INSTANCE_MEM_SIZE.sub(self.memory_size as i64); } } @@ -394,12 +393,26 @@ mod tests { let k1 = Key::from_raw(b"k1"); let k2 = Key::from_raw(b"k22"); let k3 = Key::from_raw(b"k333"); + let peer_mem_size_limit = 512 << 10; + let instance_mem_size_limit = 100 << 20; // Test the memory size of peer pessimistic locks after inserting. - locks1.insert(vec![(k1.clone(), lock(b"k1"))]).unwrap(); + locks1 + .insert( + vec![(k1.clone(), lock(b"k1"))], + peer_mem_size_limit, + instance_mem_size_limit, + ) + .unwrap(); assert_eq!(locks1.get(&k1), Some(&(lock(b"k1"), false))); assert_eq!(locks1.memory_size, k1.len() + lock(b"k1").memory_size()); - locks1.insert(vec![(k2.clone(), lock(b"k1"))]).unwrap(); + locks1 + .insert( + vec![(k2.clone(), lock(b"k1"))], + peer_mem_size_limit, + instance_mem_size_limit, + ) + .unwrap(); assert_eq!(locks1.get(&k2), Some(&(lock(b"k1"), false))); assert_eq!( locks1.memory_size, @@ -407,22 +420,34 @@ mod tests { ); // Test the global memory size after inserting. - locks2.insert(vec![(k3.clone(), lock(b"k1"))]).unwrap(); + locks2 + .insert( + vec![(k3.clone(), lock(b"k1"))], + peer_mem_size_limit, + instance_mem_size_limit, + ) + .unwrap(); assert_eq!(locks2.get(&k3), Some(&(lock(b"k1"), false))); assert_eq!( - GLOBAL_MEM_SIZE.get() as usize, + INSTANCE_MEM_SIZE.get() as usize, locks1.memory_size + locks2.memory_size ); // Test the memory size after replacing, it should not change.
- locks1.insert(vec![(k2.clone(), lock(b"k2"))]).unwrap(); + locks1 + .insert( + vec![(k2.clone(), lock(b"k2"))], + peer_mem_size_limit, + instance_mem_size_limit, + ) + .unwrap(); assert_eq!(locks1.get(&k2), Some(&(lock(b"k2"), false))); assert_eq!( locks1.memory_size, k1.len() + k2.len() + 2 * lock(b"k1").memory_size() ); assert_eq!( - GLOBAL_MEM_SIZE.get() as usize, + INSTANCE_MEM_SIZE.get() as usize, locks1.memory_size + locks2.memory_size ); @@ -431,7 +456,7 @@ mod tests { assert!(locks1.get(&k1).is_none()); assert_eq!(locks1.memory_size, k2.len() + lock(b"k2").memory_size()); assert_eq!( - GLOBAL_MEM_SIZE.get() as usize, + INSTANCE_MEM_SIZE.get() as usize, locks1.memory_size + locks2.memory_size ); @@ -439,33 +464,47 @@ mod tests { locks2.clear(); assert!(locks2.is_empty()); assert_eq!(locks2.memory_size, 0); - assert_eq!(GLOBAL_MEM_SIZE.get() as usize, locks1.memory_size); + assert_eq!(INSTANCE_MEM_SIZE.get() as usize, locks1.memory_size); // Test the global memory size after dropping. drop(locks1); drop(locks2); - assert_eq!(GLOBAL_MEM_SIZE.get(), 0); + assert_eq!(INSTANCE_MEM_SIZE.get(), 0); } #[test] fn test_insert_checking_memory_limit() { let _guard = TEST_MUTEX.lock().unwrap(); - defer!(GLOBAL_MEM_SIZE.set(0)); + defer!(INSTANCE_MEM_SIZE.set(0)); + let peer_mem_size_limit = 512 << 10; + let instance_mem_size_limit = 100 << 20; let mut locks = PeerPessimisticLocks::default(); locks - .insert(vec![(Key::from_raw(b"k1"), lock(&[0; 512000]))]) + .insert( + vec![(Key::from_raw(b"k1"), lock(&[0; 512000]))], + peer_mem_size_limit, + instance_mem_size_limit, + ) .unwrap(); // Exceeding the region limit locks - .insert(vec![(Key::from_raw(b"k2"), lock(&[0; 32000]))]) + .insert( + vec![(Key::from_raw(b"k2"), lock(&[0; 32000]))], + peer_mem_size_limit, + instance_mem_size_limit, + ) .unwrap_err(); assert!(locks.get(&Key::from_raw(b"k2")).is_none()); // Not exceeding the region limit, but exceeding the global limit - GLOBAL_MEM_SIZE.set(101 << 20); - let res = locks.insert(vec![(Key::from_raw(b"k2"), lock(b"abc"))]); + INSTANCE_MEM_SIZE.set(101 << 20); + let res = locks.insert( + vec![(Key::from_raw(b"k2"), lock(b"abc"))], + peer_mem_size_limit, + instance_mem_size_limit, + ); res.unwrap_err(); assert!(locks.get(&Key::from_raw(b"k2")).is_none()); } @@ -479,7 +518,7 @@ mod tests { region } let _guard = TEST_MUTEX.lock().unwrap(); - defer!(GLOBAL_MEM_SIZE.set(0)); + defer!(INSTANCE_MEM_SIZE.set(0)); let mut original = PeerPessimisticLocks::from_locks(vec![ lock_with_key(b"a", true), diff --git a/components/raftstore/src/store/unsafe_recovery.rs b/components/raftstore/src/store/unsafe_recovery.rs index fd750f54278..876f181807b 100644 --- a/components/raftstore/src/store/unsafe_recovery.rs +++ b/components/raftstore/src/store/unsafe_recovery.rs @@ -80,7 +80,9 @@ impl UnsafeRecoveryHandle for Mutex UnsafeRecoveryHandle for Mutex Result<()> { if header.get_term() == 0 || term <= header.get_term() + 1 { Ok(()) } else { - // If header's term is 2 verions behind current term, + // If header's term is 2 versions behind current term, // leadership may have been changed away. 
Err(Error::StaleCommand) } @@ -765,10 +765,9 @@ pub fn get_entry_header(entry: &Entry) -> RaftRequestHeader { if entry.get_entry_type() != EntryType::EntryNormal { return RaftRequestHeader::default(); } - let logger = slog_global::get_global().new(slog::o!()); match SimpleWriteReqDecoder::new( |_, _, _| RaftCmdRequest::default(), - &logger, + None, entry.get_data(), entry.get_index(), entry.get_term(), @@ -818,10 +817,9 @@ pub enum RaftCmd<'a> { } pub fn parse_raft_cmd_request<'a>(data: &'a [u8], index: u64, term: u64, tag: &str) -> RaftCmd<'a> { - let logger = slog_global::get_global().new(slog::o!()); match SimpleWriteReqDecoder::new( |_, _, _| parse_data_at(data, index, tag), - &logger, + None, data, index, term, @@ -2304,7 +2302,7 @@ mod tests { header.set_term(7); check_term(&header, 7).unwrap(); check_term(&header, 8).unwrap(); - // If header's term is 2 verions behind current term, + // If header's term is 2 versions behind current term, // leadership may have been changed away. check_term(&header, 9).unwrap_err(); check_term(&header, 10).unwrap_err(); diff --git a/components/raftstore/src/store/worker/cleanup_snapshot.rs b/components/raftstore/src/store/worker/cleanup_snapshot.rs index c84d6ddb4d3..78167a726d2 100644 --- a/components/raftstore/src/store/worker/cleanup_snapshot.rs +++ b/components/raftstore/src/store/worker/cleanup_snapshot.rs @@ -70,12 +70,12 @@ where "region_id" => region_id, ); - let gc_snap = PeerMsg::CasualMessage(CasualMessage::GcSnap { snaps }); + let gc_snap = PeerMsg::CasualMessage(Box::new(CasualMessage::GcSnap { snaps })); match (*self.router).send(region_id, gc_snap) { Ok(()) => Ok(()), Err(TrySendError::Disconnected(_)) if self.router.is_shutdown() => Ok(()), Err(TrySendError::Disconnected(PeerMsg::CasualMessage( - CasualMessage::GcSnap { snaps }, + box CasualMessage::GcSnap { snaps }, ))) => { // The snapshot exists because MsgAppend has been rejected. So the // peer must have been exist. 
But now it's disconnected, so the peer diff --git a/components/raftstore/src/store/worker/compact.rs b/components/raftstore/src/store/worker/compact.rs index 4dfe180ceb8..3bf6b1358e4 100644 --- a/components/raftstore/src/store/worker/compact.rs +++ b/components/raftstore/src/store/worker/compact.rs @@ -445,7 +445,7 @@ pub fn need_compact(range_stats: &RangeStats, compact_threshold: &CompactThresho // We trigger region compaction when their are to many tombstones as well as // redundant keys, both of which can severly impact scan operation: let estimate_num_del = range_stats.num_entries - range_stats.num_versions; - let redundant_keys = range_stats.num_entries - range_stats.num_rows; + let redundant_keys = range_stats.redundant_keys(); (redundant_keys >= compact_threshold.redundant_rows_threshold && redundant_keys * 100 >= compact_threshold.redundant_rows_percent_threshold * range_stats.num_entries) @@ -870,4 +870,43 @@ mod tests { .unwrap(); assert_eq!(stats.num_entries - stats.num_versions, 0); } + + #[test] + fn test_need_compact() { + // many tombstone case + let range_stats = RangeStats { + num_entries: 1000, + num_versions: 200, + num_deletes: 0, + num_rows: 200, + }; + assert!(need_compact( + &range_stats, + &CompactThreshold::new(10, 30, 100, 100) + )); + + // many mvcc put case + let range_stats = RangeStats { + num_entries: 1000, + num_versions: 1000, + num_deletes: 0, + num_rows: 200, + }; + assert!(need_compact( + &range_stats, + &CompactThreshold::new(100, 100, 100, 30) + )); + + // many mvcc delete case + let range_stats = RangeStats { + num_entries: 1000, + num_versions: 1000, + num_deletes: 800, + num_rows: 1000, + }; + assert!(need_compact( + &range_stats, + &CompactThreshold::new(100, 100, 100, 30) + )); + } } diff --git a/components/raftstore/src/store/worker/consistency_check.rs b/components/raftstore/src/store/worker/consistency_check.rs index d034cd8604f..fef2bae332c 100644 --- a/components/raftstore/src/store/worker/consistency_check.rs +++ b/components/raftstore/src/store/worker/consistency_check.rs @@ -162,7 +162,7 @@ mod tests { index: 10, context: vec![ConsistencyCheckMethod::Raw as u8], region: region.clone(), - snap: db.snapshot(None), + snap: db.snapshot(), }); let mut checksum_bytes = vec![]; checksum_bytes.write_u32::(sum).unwrap(); diff --git a/components/raftstore/src/store/worker/pd.rs b/components/raftstore/src/store/worker/pd.rs index d6f644ee00a..4f4d6b85034 100644 --- a/components/raftstore/src/store/worker/pd.rs +++ b/components/raftstore/src/store/worker/pd.rs @@ -35,7 +35,7 @@ use kvproto::{ raft_serverpb::RaftMessage, replication_modepb::{RegionReplicationStatus, StoreDrAutoSyncStatus}, }; -use pd_client::{metrics::*, BucketStat, Error, PdClient, RegionStat}; +use pd_client::{metrics::*, BucketStat, Error, PdClient, RegionStat, RegionWriteCfCopDetail}; use prometheus::local::LocalHistogram; use raft::eraftpb::ConfChangeType; use resource_metering::{Collector, CollectorGuard, CollectorRegHandle, RawRecords}; @@ -44,7 +44,7 @@ use tikv_util::{ box_err, debug, error, info, metrics::ThreadInfoStatistics, store::QueryStats, - sys::{thread::StdThreadBuildWrapper, SysQuota}, + sys::{disk::get_disk_space_stats, thread::StdThreadBuildWrapper, SysQuota}, thd_name, time::{Instant as TiInstant, UnixSecs}, timer::GLOBAL_TIMER_HANDLE, @@ -291,10 +291,12 @@ pub struct PeerStat { pub read_bytes: u64, pub read_keys: u64, pub query_stats: QueryStats, + pub cop_detail: RegionWriteCfCopDetail, // last_region_report_attributes records the state of the last region 
heartbeat pub last_region_report_read_bytes: u64, pub last_region_report_read_keys: u64, pub last_region_report_query_stats: QueryStats, + pub last_region_report_cop_detail: RegionWriteCfCopDetail, pub last_region_report_written_bytes: u64, pub last_region_report_written_keys: u64, pub last_region_report_ts: UnixSecs, @@ -1547,7 +1549,7 @@ where cb: Callback::None, } }; - if let Err(e) = router.send(region_id, PeerMsg::CasualMessage(msg)) { + if let Err(e) = router.send(region_id, PeerMsg::CasualMessage(Box::new(msg))) { error!("send halfsplit request failed"; "region_id" => region_id, "err" => ?e); } } else if resp.has_merge() { @@ -1601,6 +1603,7 @@ where peer_stat .query_stats .add_query_stats(®ion_info.query_stats.0); + peer_stat.cop_detail.add(®ion_info.cop_detail); self.store_stat .engine_total_query_num .add_query_stats(®ion_info.query_stats.0); @@ -1737,7 +1740,7 @@ where match resp.await { Ok(Some((region, leader))) => { if leader.get_store_id() != 0 { - let msg = CasualMessage::QueryRegionLeaderResp { region, leader }; + let msg = Box::new(CasualMessage::QueryRegionLeaderResp { region, leader }); if let Err(e) = router.send(region_id, PeerMsg::CasualMessage(msg)) { error!("send region info message failed"; "region_id" => region_id, "err" => ?e); } @@ -1991,14 +1994,14 @@ where let start_key = split_info.start_key.unwrap(); let end_key = split_info.end_key.unwrap(); let region_id = region.get_id(); - let msg = CasualMessage::HalfSplitRegion { + let msg = Box::new(CasualMessage::HalfSplitRegion { region_epoch: region.get_region_epoch().clone(), start_key: Some(start_key.clone()), end_key: Some(end_key.clone()), policy: pdpb::CheckPolicy::Scan, source: "auto_split", cb: Callback::None, - }; + }); if let Err(e) = router.send(region_id, PeerMsg::CasualMessage(msg)) { error!("send auto half split request failed"; "region_id" => region_id, @@ -2031,6 +2034,7 @@ where written_keys_delta, last_report_ts, query_stats, + cop_detail, cpu_usage, ) = { let region_id = hb_task.region.get_id(); @@ -2049,12 +2053,16 @@ where let query_stats = peer_stat .query_stats .sub_query_stats(&peer_stat.last_region_report_query_stats); + let cop_detail = peer_stat + .cop_detail + .sub(&peer_stat.last_region_report_cop_detail); let mut last_report_ts = peer_stat.last_region_report_ts; peer_stat.last_region_report_written_bytes = hb_task.written_bytes; peer_stat.last_region_report_written_keys = hb_task.written_keys; peer_stat.last_region_report_read_bytes = peer_stat.read_bytes; peer_stat.last_region_report_read_keys = peer_stat.read_keys; peer_stat.last_region_report_query_stats = peer_stat.query_stats.clone(); + peer_stat.last_region_report_cop_detail = peer_stat.cop_detail.clone(); let unix_secs_now = UnixSecs::now(); peer_stat.last_region_report_ts = unix_secs_now; @@ -2085,6 +2093,7 @@ where written_keys_delta, last_report_ts, query_stats.0, + cop_detail, cpu_usage, ) }; @@ -2100,6 +2109,7 @@ where read_bytes: read_bytes_delta, read_keys: read_keys_delta, query_stats, + cop_detail, approximate_size, approximate_keys, last_report_ts, @@ -2433,7 +2443,7 @@ fn collect_engine_size( return Some((engine_size.capacity, engine_size.used, engine_size.avail)); } let store_info = store_info.unwrap(); - let disk_stats = match fs2::statvfs(store_info.kv_engine.path()) { + let (disk_cap, disk_avail) = match get_disk_space_stats(store_info.kv_engine.path()) { Err(e) => { error!( "get disk stat for rocksdb failed"; @@ -2442,9 +2452,8 @@ fn collect_engine_size( ); return None; } - Ok(stats) => stats, + Ok((total_size, 
available_size)) => (total_size, available_size), }; - let disk_cap = disk_stats.total_space(); let capacity = if store_info.capacity == 0 || disk_cap < store_info.capacity { disk_cap } else { @@ -2468,7 +2477,7 @@ fn collect_engine_size( let mut available = capacity.checked_sub(used_size).unwrap_or_default(); // We only care about rocksdb SST file size, so we should check disk available // here. - available = cmp::min(available, disk_stats.available_space()); + available = cmp::min(available, disk_avail); Some((capacity, used_size, available)) } diff --git a/components/raftstore/src/store/worker/read.rs b/components/raftstore/src/store/worker/read.rs index a1893bf8d93..269105e081b 100644 --- a/components/raftstore/src/store/worker/read.rs +++ b/components/raftstore/src/store/worker/read.rs @@ -12,7 +12,7 @@ use std::{ }; use crossbeam::{atomic::AtomicCell, channel::TrySendError}; -use engine_traits::{CacheRange, KvEngine, Peekable, RaftEngine, SnapshotContext}; +use engine_traits::{KvEngine, Peekable, RaftEngine, SnapshotMiscExt}; use fail::fail_point; use kvproto::{ errorpb, @@ -34,7 +34,9 @@ use txn_types::{TimeStamp, WriteBatchFlags}; use super::metrics::*; use crate::{ + coprocessor::CoprocessorHost, errors::RAFTSTORE_IS_BUSY, + router::ReadContext, store::{ cmd_resp, fsm::store::StoreMeta, @@ -57,7 +59,6 @@ pub trait ReadExecutor { /// Currently, only multi-rocksdb version may return `None`. fn get_snapshot( &mut self, - snap_ctx: Option, read_context: &Option>, ) -> Arc<::Snapshot>; @@ -65,7 +66,6 @@ pub trait ReadExecutor { &mut self, req: &Request, region: &metapb::Region, - snap_ctx: Option, read_context: &Option>, ) -> Result { let key = req.get_get().get_key(); @@ -73,7 +73,7 @@ pub trait ReadExecutor { util::check_key_in_region(key, region)?; let mut resp = Response::default(); - let snapshot = self.get_snapshot(snap_ctx, read_context); + let snapshot = self.get_snapshot(read_context); let res = if !req.get_get().get_cf().is_empty() { let cf = req.get_get().get_cf(); snapshot @@ -108,11 +108,12 @@ pub trait ReadExecutor { fn execute( &mut self, + ctx: &ReadContext, msg: &RaftCmdRequest, region: &Arc, read_index: Option, - snap_ctx: Option, local_read_ctx: Option>, + host: &CoprocessorHost, ) -> ReadResponse<::Snapshot> { let requests = msg.get_requests(); let mut response = ReadResponse { @@ -124,24 +125,33 @@ pub trait ReadExecutor { for req in requests { let cmd_type = req.get_cmd_type(); let mut resp = match cmd_type { - CmdType::Get => { - match self.get_value(req, region.as_ref(), snap_ctx.clone(), &local_read_ctx) { - Ok(resp) => resp, - Err(e) => { - error!(?e; - "failed to execute get command"; - "region_id" => region.get_id(), - ); - response.response = cmd_resp::new_error(e); - return response; - } + CmdType::Get => match self.get_value(req, region.as_ref(), &local_read_ctx) { + Ok(resp) => resp, + Err(e) => { + error!(?e; + "failed to execute get command"; + "region_id" => region.get_id(), + ); + response.response = cmd_resp::new_error(e); + return response; } - } + }, CmdType::Snap => { - let snapshot = RegionSnapshot::from_snapshot( - self.get_snapshot(snap_ctx.clone(), &local_read_ctx), + let mut snapshot = RegionSnapshot::from_snapshot( + self.get_snapshot(&local_read_ctx), region.clone(), ); + if let Some(read_ts) = ctx.read_ts { + // We only observe snapshot when it has snapshot context. + // + // Currently, the snapshot context is set when caller + // wants an in-memory engine snapshot which requires + // the snapshot and some metadata in the context. 
+ let seqno = snapshot.get_snapshot().sequence_number(); + if let Some(observed_snap) = host.on_snapshot(region, read_ts, seqno) { + snapshot.set_observed_snapshot(observed_snap); + } + } response.snapshot = Some(snapshot); Response::default() } @@ -231,16 +241,9 @@ where } } - // Update the snapshot in the `snap_cache` if the read_id is None or does - // not match. - // snap_ctx is used (if not None) to acquire the snapshot of the relevant region - // from region cache engine - fn maybe_update_snapshot( - &mut self, - engine: &E, - snap_ctx: Option, - delegate_last_valid_ts: Timespec, - ) -> bool { + /// Update the snapshot in the `snap_cache` if the read_id is None or does + /// not match. + fn maybe_update_snapshot(&mut self, engine: &E, delegate_last_valid_ts: Timespec) -> bool { // When the read_id is None, it means the `snap_cache` has been cleared // before and the `cached_read_id` of it is None because only a consecutive // requests will have the same cache and the cache will be cleared after the @@ -254,7 +257,7 @@ where } self.snap_cache.cached_read_id = self.read_id.clone(); - self.snap_cache.snapshot = Some(Arc::new(engine.snapshot(snap_ctx))); + self.snap_cache.snapshot = Some(Arc::new(engine.snapshot())); // Ensures the snapshot is acquired before getting the time atomic::fence(atomic::Ordering::Release); @@ -262,7 +265,7 @@ where } else { // read_id being None means the snapshot acquired will only be used in this // request - self.snapshot = Some(Arc::new(engine.snapshot(snap_ctx))); + self.snapshot = Some(Arc::new(engine.snapshot())); // Ensures the snapshot is acquired before getting the time atomic::fence(atomic::Ordering::Release); @@ -921,6 +924,7 @@ where snap_cache: SnapCache, // A channel to raftstore. router: C, + host: CoprocessorHost, } impl LocalReader @@ -928,12 +932,18 @@ where E: KvEngine, C: ProposalRouter + CasualRouter, { - pub fn new(kv_engine: E, store_meta: StoreMetaDelegate, router: C) -> Self { + pub fn new( + kv_engine: E, + store_meta: StoreMetaDelegate, + router: C, + host: CoprocessorHost, + ) -> Self { Self { local_reader: LocalReaderCore::new(store_meta), kv_engine, snap_cache: SnapCache::new(), router, + host, } } @@ -993,26 +1003,16 @@ where /// the read response is returned, otherwise None is returned. fn try_local_leader_read( &mut self, + ctx: &ReadContext, req: &RaftCmdRequest, delegate: &mut CachedReadDelegate, - snap_ctx: Option, - read_id: Option, snap_updated: &mut bool, last_valid_ts: Timespec, ) -> Option> { - let mut local_read_ctx = LocalReadContext::new(&mut self.snap_cache, read_id); - if snap_ctx.is_some() { - // When `snap_ctx` is some, it means we want to acquire the range cache engine - // snapshot which cannot be used across different regions. So we don't use - // cache. 
- local_read_ctx.read_id.take(); - } + let mut local_read_ctx = LocalReadContext::new(&mut self.snap_cache, ctx.read_id.clone()); - (*snap_updated) = local_read_ctx.maybe_update_snapshot( - delegate.get_tablet(), - snap_ctx.clone(), - last_valid_ts, - ); + (*snap_updated) = + local_read_ctx.maybe_update_snapshot(delegate.get_tablet(), last_valid_ts); let snapshot_ts = local_read_ctx.snapshot_ts().unwrap(); if !delegate.is_in_leader_lease(snapshot_ts) { @@ -1020,7 +1020,8 @@ where } let region = Arc::clone(&delegate.region); - let mut response = delegate.execute(req, ®ion, None, snap_ctx, Some(local_read_ctx)); + let mut response = + delegate.execute(ctx, req, ®ion, None, Some(local_read_ctx), &self.host); if let Some(snap) = response.snapshot.as_mut() { snap.bucket_meta = delegate.bucket_meta.clone(); } @@ -1034,6 +1035,7 @@ where /// `DataIsNotReady` error is returned. fn try_local_stale_read( &mut self, + ctx: &ReadContext, req: &RaftCmdRequest, delegate: &mut CachedReadDelegate, snap_updated: &mut bool, @@ -1045,11 +1047,12 @@ where // Stale read does not use cache, so we pass None for read_id let mut local_read_ctx = LocalReadContext::new(&mut self.snap_cache, None); (*snap_updated) = - local_read_ctx.maybe_update_snapshot(delegate.get_tablet(), None, last_valid_ts); + local_read_ctx.maybe_update_snapshot(delegate.get_tablet(), last_valid_ts); let region = Arc::clone(&delegate.region); // Getting the snapshot - let mut response = delegate.execute(req, ®ion, None, None, Some(local_read_ctx)); + let mut response = + delegate.execute(ctx, req, ®ion, None, Some(local_read_ctx), &self.host); if let Some(snap) = response.snapshot.as_mut() { snap.bucket_meta = delegate.bucket_meta.clone(); } @@ -1063,27 +1066,21 @@ where pub fn propose_raft_command( &mut self, - mut snap_ctx: Option, - read_id: Option, + ctx: &ReadContext, mut req: RaftCmdRequest, cb: Callback, ) { match self.pre_propose_raft_command(&req) { Ok(Some((mut delegate, policy))) => { - if let Some(ref mut ctx) = snap_ctx { - ctx.set_range(CacheRange::from_region(&delegate.region)) - } - let mut snap_updated = false; let last_valid_ts = delegate.last_valid_ts; let mut response = match policy { // Leader can read local if and only if it is in lease. RequestPolicy::ReadLocal => { if let Some(read_resp) = self.try_local_leader_read( + ctx, &req, &mut delegate, - snap_ctx, - read_id, &mut snap_updated, last_valid_ts, ) { @@ -1098,6 +1095,7 @@ where // Replica can serve stale read if and only if its `safe_ts` >= `read_ts` RequestPolicy::StaleRead => { match self.try_local_stale_read( + ctx, &req, &mut delegate, &mut snap_updated, @@ -1127,10 +1125,9 @@ where return; } if let Some(read_resp) = self.try_local_leader_read( + ctx, &req, &mut delegate, - None, - None, &mut snap_updated, last_valid_ts, ) { @@ -1206,14 +1203,8 @@ where /// which left a snapshot cached in LocalReader. ThreadReadId is composed by /// thread_id and a thread_local incremental sequence. 
#[inline] - pub fn read( - &mut self, - snap_ctx: Option, - read_id: Option, - req: RaftCmdRequest, - cb: Callback, - ) { - self.propose_raft_command(snap_ctx, read_id, req, cb); + pub fn read(&mut self, ctx: ReadContext, req: RaftCmdRequest, cb: Callback) { + self.propose_raft_command(&ctx, req, cb); maybe_tls_local_read_metrics_flush(); } @@ -1233,6 +1224,7 @@ where kv_engine: self.kv_engine.clone(), snap_cache: self.snap_cache.clone(), router: self.router.clone(), + host: self.host.clone(), } } } @@ -1247,11 +1239,7 @@ where &self.kv_engine } - fn get_snapshot( - &mut self, - _: Option, - read_context: &Option>, - ) -> Arc { + fn get_snapshot(&mut self, read_context: &Option>) -> Arc { read_context.as_ref().unwrap().snapshot().unwrap() } } @@ -1298,15 +1286,10 @@ mod tests { use crossbeam::channel::TrySendError; use engine_test::kv::{KvTestEngine, KvTestSnapshot}; - use engine_traits::{CacheRange, MiscExt, Peekable, SyncMutable, ALL_CFS}; - use hybrid_engine::{HybridEngine, HybridEngineSnapshot}; - use keys::DATA_PREFIX; + use engine_traits::{MiscExt, Peekable, SyncMutable, ALL_CFS}; use kvproto::{metapb::RegionEpoch, raft_cmdpb::*}; - use range_cache_memory_engine::{ - RangeCacheEngineConfig, RangeCacheEngineContext, RangeCacheMemoryEngine, - }; use tempfile::{Builder, TempDir}; - use tikv_util::{codec::number::NumberEncoder, config::VersionTrack, time::monotonic_raw_now}; + use tikv_util::{codec::number::NumberEncoder, time::monotonic_raw_now}; use time::Duration; use txn_types::WriteBatchFlags; @@ -1366,7 +1349,9 @@ mod tests { let path = Builder::new().prefix(path).tempdir().unwrap(); let db = engine_test::kv::new_engine(path.path().to_str().unwrap(), ALL_CFS).unwrap(); let (ch, rx, _) = MockRouter::new(); - let mut reader = LocalReader::new(db.clone(), StoreMetaDelegate::new(store_meta, db), ch); + let host = CoprocessorHost::default(); + let mut reader = + LocalReader::new(db.clone(), StoreMetaDelegate::new(store_meta, db), ch, host); reader.local_reader.store_id = Cell::new(Some(store_id)); (path, reader, rx) } @@ -1388,9 +1373,9 @@ mod tests { rx: &Receiver>, cmd: RaftCmdRequest, ) { + let read_ctx = &ReadContext::new(None, None); reader.propose_raft_command( - None, - None, + read_ctx, cmd.clone(), Callback::read(Box::new(|resp| { panic!("unexpected invoke, {:?}", resp); @@ -1418,7 +1403,8 @@ mod tests { task: RaftCommand, read_id: Option, ) { - reader.propose_raft_command(None, read_id, task.request, task.callback); + let ctx = ReadContext::new(read_id, None); + reader.propose_raft_command(&ctx, task.request, task.callback); assert_eq!(rx.try_recv().unwrap_err(), TryRecvError::Empty); } @@ -1427,6 +1413,7 @@ mod tests { let store_id = 2; let store_meta = Arc::new(Mutex::new(StoreMeta::new(0))); let (_tmp, mut reader, rx) = new_reader("test-local-reader", store_id, store_meta.clone()); + let read_ctx = &ReadContext::new(None, None); // region: 1, // peers: 2, 3, 4, @@ -1551,8 +1538,7 @@ mod tests { .mut_peer() .set_store_id(store_id + 1); reader.propose_raft_command( - None, - None, + read_ctx, cmd_store_id, Callback::read(Box::new(move |resp: ReadResponse| { let err = resp.response.get_header().get_error(); @@ -1576,8 +1562,7 @@ mod tests { .mut_peer() .set_id(leader2.get_id() + 1); reader.propose_raft_command( - None, - None, + read_ctx, cmd_peer_id, Callback::read(Box::new(move |resp: ReadResponse| { assert!( @@ -1602,8 +1587,7 @@ mod tests { let mut cmd_term = cmd.clone(); cmd_term.mut_header().set_term(term6 - 2); reader.propose_raft_command( - None, - None, + read_ctx, 
cmd_term, Callback::read(Box::new(move |resp: ReadResponse| { let err = resp.response.get_header().get_error(); @@ -1639,10 +1623,9 @@ mod tests { ); // Channel full. - reader.propose_raft_command(None, None, cmd.clone(), Callback::None); + reader.propose_raft_command(read_ctx, cmd.clone(), Callback::None); reader.propose_raft_command( - None, - None, + read_ctx, cmd.clone(), Callback::read(Box::new(move |resp: ReadResponse| { let err = resp.response.get_header().get_error(); @@ -1674,8 +1657,7 @@ mod tests { .update(Progress::applied_term(term6 + 3)); } reader.propose_raft_command( - None, - None, + read_ctx, cmd9.clone(), Callback::read(Box::new(|resp| { panic!("unexpected invoke, {:?}", resp); @@ -2051,7 +2033,7 @@ mod tests { let compare_ts = monotonic_raw_now(); // Case 1: snap_cache_context.read_id is None - assert!(read_context.maybe_update_snapshot(&db, None, Timespec::new(0, 0))); + assert!(read_context.maybe_update_snapshot(&db, Timespec::new(0, 0))); assert!(read_context.snapshot_ts().unwrap() > compare_ts); assert_eq!( read_context @@ -2066,7 +2048,7 @@ mod tests { // snap_cache_context is *not* created with read_id, so calling // `maybe_update_snapshot` again will update the snapshot let compare_ts = monotonic_raw_now(); - assert!(read_context.maybe_update_snapshot(&db, None, Timespec::new(0, 0))); + assert!(read_context.maybe_update_snapshot(&db, Timespec::new(0, 0))); assert!(read_context.snapshot_ts().unwrap() > compare_ts); let read_id = ThreadReadId::new(); @@ -2076,7 +2058,7 @@ mod tests { let compare_ts = monotonic_raw_now(); // Case 2: snap_cache_context.read_id is not None but not equals to the // snap_cache.cached_read_id - assert!(read_context.maybe_update_snapshot(&db, None, Timespec::new(0, 0))); + assert!(read_context.maybe_update_snapshot(&db, Timespec::new(0, 0))); assert!(read_context.snapshot_ts().unwrap() > compare_ts); let snap_ts = read_context.snapshot_ts().unwrap(); assert_eq!( @@ -2094,7 +2076,7 @@ mod tests { // `maybe_update_snapshot` again will *not* update the snapshot // Case 3: snap_cache_context.read_id is not None and equals to the // snap_cache.cached_read_id - assert!(!read_context.maybe_update_snapshot(&db2, None, Timespec::new(0, 0))); + assert!(!read_context.maybe_update_snapshot(&db2, Timespec::new(0, 0))); assert_eq!(read_context.snapshot_ts().unwrap(), snap_ts); assert_eq!( read_context @@ -2109,7 +2091,7 @@ mod tests { // Case 4: delegate.last_valid_ts is larger than create_time of read_id let mut last_valid_ts = read_id_clone.create_time; last_valid_ts = last_valid_ts.add(Duration::nanoseconds(1)); - assert!(read_context.maybe_update_snapshot(&db2, None, last_valid_ts)); + assert!(read_context.maybe_update_snapshot(&db2, last_valid_ts)); assert!(read_context.snapshot_ts().unwrap() > snap_ts); assert!( read_context @@ -2429,264 +2411,4 @@ mod tests { .has_data_is_not_ready() ); } - - type HybridTestEnigne = HybridEngine; - type HybridEngineTestSnapshot = HybridEngineSnapshot; - - struct HybridEngineMockRouter { - p_router: SyncSender>, - c_router: SyncSender<(u64, CasualMessage)>, - } - - impl HybridEngineMockRouter { - #[allow(clippy::type_complexity)] - fn new() -> ( - HybridEngineMockRouter, - Receiver>, - Receiver<(u64, CasualMessage)>, - ) { - let (p_ch, p_rx) = sync_channel(1); - let (c_ch, c_rx) = sync_channel(1); - ( - HybridEngineMockRouter { - p_router: p_ch, - c_router: c_ch, - }, - p_rx, - c_rx, - ) - } - } - - impl ProposalRouter for HybridEngineMockRouter { - fn send( - &self, - cmd: RaftCommand, - ) -> 
std::result::Result<(), TrySendError>> { - ProposalRouter::send(&self.p_router, cmd) - } - } - - impl CasualRouter for HybridEngineMockRouter { - fn send(&self, region_id: u64, msg: CasualMessage) -> Result<()> { - CasualRouter::send(&self.c_router, region_id, msg) - } - } - - #[allow(clippy::type_complexity)] - fn new_hybrid_engine_reader( - path: &str, - store_id: u64, - store_meta: Arc>, - engine_config: RangeCacheEngineConfig, - ) -> ( - TempDir, - LocalReader, - Receiver>, - RangeCacheMemoryEngine, - ) { - let path = Builder::new().prefix(path).tempdir().unwrap(); - let disk_engine = - engine_test::kv::new_engine(path.path().to_str().unwrap(), ALL_CFS).unwrap(); - let (ch, rx, _) = HybridEngineMockRouter::new(); - let config = Arc::new(VersionTrack::new(engine_config)); - let memory_engine = - RangeCacheMemoryEngine::new(RangeCacheEngineContext::new_for_tests(config)); - let engine = HybridEngine::new(disk_engine, memory_engine.clone()); - let mut reader = LocalReader::new( - engine.clone(), - StoreMetaDelegate::new(store_meta, engine), - ch, - ); - reader.local_reader.store_id = Cell::new(Some(store_id)); - (path, reader, rx, memory_engine) - } - - fn get_snapshot( - snap_ctx: Option, - reader: &mut LocalReader, - request: RaftCmdRequest, - rx: &Receiver>, - ) -> Arc { - let (sender, receiver) = channel(); - reader.propose_raft_command( - snap_ctx, - None, - request, - Callback::read(Box::new(move |snap| { - sender.send(snap).unwrap(); - })), - ); - // no direct is expected - assert_eq!(rx.try_recv().unwrap_err(), TryRecvError::Empty); - receiver.recv().unwrap().snapshot.unwrap().snap() - } - - #[test] - fn test_hybrid_engine_read() { - let store_id = 2; - let store_meta = Arc::new(Mutex::new(StoreMeta::new(0))); - let (_tmp, mut reader, rx, memory_engine) = new_hybrid_engine_reader( - "test-local-hybrid-engine-reader", - store_id, - store_meta.clone(), - RangeCacheEngineConfig::config_for_test(), - ); - - // set up region so we can acquire snapshot from local reader - let mut region1 = metapb::Region::default(); - region1.set_id(1); - let prs = new_peers(store_id, vec![2, 3, 4]); - region1.set_peers(prs.clone().into()); - let epoch13 = { - let mut ep = metapb::RegionEpoch::default(); - ep.set_conf_ver(1); - ep.set_version(3); - ep - }; - let leader2 = prs[0].clone(); - region1.set_region_epoch(epoch13.clone()); - let range = CacheRange::from_region(®ion1); - memory_engine.new_range(range.clone()); - { - let mut core = memory_engine.core().write(); - core.mut_range_manager().set_safe_point(&range, 1); - } - let kv = (&[DATA_PREFIX, b'a'], b"b"); - reader.kv_engine.put(kv.0, kv.1).unwrap(); - let term6 = 6; - let mut lease = Lease::new(Duration::seconds(1), Duration::milliseconds(250)); // 1s is long enough. 
- let read_progress = Arc::new(RegionReadProgress::new(®ion1, 1, 1, 1)); - - lease.renew(monotonic_raw_now()); - let remote = lease.maybe_new_remote_lease(term6).unwrap(); - { - let mut meta = store_meta.lock().unwrap(); - let read_delegate = ReadDelegate { - tag: String::new(), - region: Arc::new(region1.clone()), - peer_id: leader2.get_id(), - term: term6, - applied_term: term6, - leader_lease: Some(remote), - last_valid_ts: Timespec::new(0, 0), - txn_extra_op: Arc::new(AtomicCell::new(TxnExtraOp::default())), - txn_ext: Arc::new(TxnExt::default()), - read_progress, - pending_remove: false, - wait_data: false, - track_ver: TrackVer::new(), - bucket_meta: None, - }; - meta.readers.insert(1, read_delegate); - } - - let mut cmd = RaftCmdRequest::default(); - let mut header = RaftRequestHeader::default(); - header.set_region_id(1); - header.set_peer(leader2); - header.set_region_epoch(epoch13); - header.set_term(term6); - cmd.set_header(header); - let mut req = Request::default(); - req.set_cmd_type(CmdType::Snap); - cmd.set_requests(vec![req].into()); - - let s = get_snapshot(None, &mut reader, cmd.clone(), &rx); - assert!(!s.range_cache_snapshot_available()); - - { - let mut core = memory_engine.core().write(); - core.mut_range_manager().set_safe_point(&range, 10); - } - - let snap_ctx = SnapshotContext { - read_ts: 15, - range: None, - }; - - let s = get_snapshot(Some(snap_ctx.clone()), &mut reader, cmd.clone(), &rx); - assert!(s.range_cache_snapshot_available()); - assert_eq!(s.get_value(kv.0).unwrap().unwrap(), kv.1); - } - - #[test] - fn test_not_use_snap_cache_in_hybrid_engine() { - let store_id = 2; - let store_meta = Arc::new(Mutex::new(StoreMeta::new(0))); - let (_tmp, mut reader, rx, _) = new_hybrid_engine_reader( - "test-not-use-snap-cache", - store_id, - store_meta.clone(), - RangeCacheEngineConfig::config_for_test(), - ); - - let epoch13 = { - let mut ep = metapb::RegionEpoch::default(); - ep.set_conf_ver(1); - ep.set_version(3); - ep - }; - let term6 = 6; - - // Register region1 - let pr_ids1 = vec![2, 3, 4]; - let prs1 = new_peers(store_id, pr_ids1.clone()); - prepare_read_delegate( - store_id, - 1, - term6, - pr_ids1, - epoch13.clone(), - store_meta.clone(), - ); - let leader1 = prs1[0].clone(); - - let mut cmd = RaftCmdRequest::default(); - let mut header = RaftRequestHeader::default(); - header.set_region_id(1); - header.set_peer(leader1); - header.set_region_epoch(epoch13.clone()); - header.set_term(term6); - cmd.set_header(header); - let mut req = Request::default(); - req.set_cmd_type(CmdType::Snap); - cmd.set_requests(vec![req].into()); - let (snap_tx, snap_rx) = channel(); - let task = RaftCommand::::new( - cmd.clone(), - Callback::read(Box::new( - move |resp: ReadResponse| { - snap_tx.send(resp.snapshot.unwrap()).unwrap(); - }, - )), - ); - - let read_id = Some(ThreadReadId::new()); - // If snap_ctx is None and read_id is Some, it will cache the snapshot. 
- reader.propose_raft_command(None, read_id.clone(), task.request, task.callback); - assert_eq!(rx.try_recv().unwrap_err(), TryRecvError::Empty); - let _ = snap_rx.recv().unwrap(); - assert!(reader.snap_cache.snapshot.is_some()); - - // Release the snapshot and try with snap_ctx - let (snap_tx, snap_rx) = channel(); - let task = RaftCommand::::new( - cmd, - Callback::read(Box::new( - move |resp: ReadResponse| { - snap_tx.send(resp.snapshot.unwrap()).unwrap(); - }, - )), - ); - reader.release_snapshot_cache(); - let snap_ctx = SnapshotContext { - read_ts: 15, - range: None, - }; - reader.propose_raft_command(Some(snap_ctx), read_id, task.request, task.callback); - assert_eq!(rx.try_recv().unwrap_err(), TryRecvError::Empty); - let _ = snap_rx.recv().unwrap(); - assert!(reader.snap_cache.snapshot.is_none()); - } } diff --git a/components/raftstore/src/store/worker/region.rs b/components/raftstore/src/store/worker/region.rs index 05fdf5b746c..4af19c53035 100644 --- a/components/raftstore/src/store/worker/region.rs +++ b/components/raftstore/src/store/worker/region.rs @@ -10,7 +10,7 @@ use std::{ sync::{ atomic::{AtomicBool, AtomicUsize, Ordering}, mpsc::SyncSender, - Arc, Mutex, + Arc, }, time::Duration, u64, @@ -341,215 +341,13 @@ where } } -struct RegionCleaner -where - EK: KvEngine, -{ - use_delete_range: bool, - engine: EK, - // Ranges that have been logically destroyed at a specific sequence number. We can - // assume there will be no reader (engine snapshot) newer than that sequence number. Therefore, - // they can be physically deleted with `DeleteFiles` when we're sure there is no older - // reader as well. - // To protect this assumption, before a new snapshot is applied, the overlapping pending ranges - // must first be removed. - // The sole purpose of maintaining this list is to optimize deletion with `DeleteFiles` - // whenever we can. Errors while processing them can be ignored. - pending_delete_ranges: PendingDeleteRanges, - mgr: SnapManager, -} - -impl RegionCleaner -where - EK: KvEngine, -{ - /// Tries to clean up files in pending ranges overlapping with the given - /// bounds. These pending ranges will be removed. Returns an updated range - /// that also includes these ranges. Caller must ensure the remaining keys - /// in the returning range will be deleted properly. 
- fn clean_overlap_ranges_roughly( - &mut self, - mut start_key: Vec, - mut end_key: Vec, - ) -> (Vec, Vec) { - let overlap_ranges = self - .pending_delete_ranges - .drain_overlap_ranges(&start_key, &end_key); - if overlap_ranges.is_empty() { - return (start_key, end_key); - } - CLEAN_COUNTER_VEC.with_label_values(&["overlap"]).inc(); - let oldest_sequence = self - .engine - .get_oldest_snapshot_sequence_number() - .unwrap_or(u64::MAX); - let df_ranges: Vec<_> = overlap_ranges - .iter() - .filter_map(|(region_id, cur_start, cur_end, stale_sequence)| { - info!( - "delete data in range because of overlap"; "region_id" => region_id, - "start_key" => log_wrappers::Value::key(cur_start), - "end_key" => log_wrappers::Value::key(cur_end) - ); - if &start_key > cur_start { - start_key = cur_start.clone(); - } - if &end_key < cur_end { - end_key = cur_end.clone(); - } - if *stale_sequence < oldest_sequence { - Some(Range::new(cur_start, cur_end)) - } else { - SNAP_COUNTER_VEC - .with_label_values(&["overlap", "not_delete_files"]) - .inc(); - None - } - }) - .collect(); - self.engine - .delete_ranges_cfs( - &WriteOptions::default(), - DeleteStrategy::DeleteFiles, - &df_ranges, - ) - .map_err(|e| { - error!("failed to delete files in range"; "err" => %e); - }) - .unwrap(); - (start_key, end_key) - } - - /// Cleans up data in the given range and all pending ranges overlapping - /// with it. - fn clean_overlap_ranges(&mut self, start_key: Vec, end_key: Vec) -> Result<()> { - let (start_key, end_key) = self.clean_overlap_ranges_roughly(start_key, end_key); - self.delete_all_in_range(&[Range::new(&start_key, &end_key)]) - } - - /// Inserts a new pending range, and it will be cleaned up with some delay. - fn insert_pending_delete_range( - &mut self, - region_id: u64, - start_key: Vec, - end_key: Vec, - ) { - let (start_key, end_key) = self.clean_overlap_ranges_roughly(start_key, end_key); - info!("register deleting data in range"; - "region_id" => region_id, - "start_key" => log_wrappers::Value::key(&start_key), - "end_key" => log_wrappers::Value::key(&end_key), - ); - let seq = self.engine.get_latest_sequence_number(); - self.pending_delete_ranges - .insert(region_id, start_key, end_key, seq); - } - - /// Cleans up stale ranges. 
- fn clean_stale_ranges(&mut self) { - STALE_PEER_PENDING_DELETE_RANGE_GAUGE.set(self.pending_delete_ranges.len() as f64); - if self.ingest_maybe_stall() { - return; - } - let oldest_sequence = self - .engine - .get_oldest_snapshot_sequence_number() - .unwrap_or(u64::MAX); - let mut region_ranges: Vec<(u64, Vec, Vec)> = self - .pending_delete_ranges - .stale_ranges(oldest_sequence) - .map(|(region_id, s, e)| (region_id, s.to_vec(), e.to_vec())) - .collect(); - if region_ranges.is_empty() { - return; - } - CLEAN_COUNTER_VEC.with_label_values(&["destroy"]).inc_by(1); - region_ranges.sort_by(|a, b| a.1.cmp(&b.1)); - region_ranges.truncate(CLEANUP_MAX_REGION_COUNT); - let ranges: Vec<_> = region_ranges - .iter() - .map(|(region_id, start, end)| { - info!("delete data in range because of stale"; "region_id" => region_id, - "start_key" => log_wrappers::Value::key(start), - "end_key" => log_wrappers::Value::key(end)); - Range::new(start, end) - }) - .collect(); - - self.engine - .delete_ranges_cfs( - &WriteOptions::default(), - DeleteStrategy::DeleteFiles, - &ranges, - ) - .map_err(|e| { - error!("failed to delete files in range"; "err" => %e); - }) - .unwrap(); - if let Err(e) = self.delete_all_in_range(&ranges) { - error!("failed to cleanup stale range"; "err" => %e); - return; - } - self.engine - .delete_ranges_cfs( - &WriteOptions::default(), - DeleteStrategy::DeleteBlobs, - &ranges, - ) - .map_err(|e| { - error!("failed to delete blobs in range"; "err" => %e); - }) - .unwrap(); - - for (_, key, _) in region_ranges { - assert!( - self.pending_delete_ranges.remove(&key).is_some(), - "cleanup pending_delete_ranges {} should exist", - log_wrappers::Value::key(&key) - ); - } - } - - /// Checks the number of files at level 0 to avoid write stall after - /// ingesting sst. Returns true if the ingestion causes write stall. - fn ingest_maybe_stall(&self) -> bool { - for cf in SNAPSHOT_CFS { - // no need to check lock cf - if plain_file_used(cf) { - continue; - } - if self.engine.ingest_maybe_slowdown_writes(cf).expect("cf") { - return true; - } - } - false - } - - fn delete_all_in_range(&self, ranges: &[Range<'_>]) -> Result<()> { - let wopts = WriteOptions::default(); - for cf in self.engine.cf_names() { - // CF_LOCK usually contains fewer keys than other CFs, so we delete them by key. - let strategy = if cf == CF_LOCK { - DeleteStrategy::DeleteByKey - } else if self.use_delete_range { - DeleteStrategy::DeleteByRange - } else { - DeleteStrategy::DeleteByWriter { - sst_path: self.mgr.get_temp_path_for_ingest(), - } - }; - box_try!(self.engine.delete_ranges_cf(&wopts, cf, strategy, ranges)); - } - Ok(()) - } -} - pub struct Runner where EK: KvEngine, T: PdClient + 'static, { batch_size: usize, + use_delete_range: bool, ingest_copy_symlink: bool, clean_stale_tick: usize, clean_stale_check_interval: Duration, @@ -559,15 +357,22 @@ where // we may delay some apply tasks if level 0 files to write stall threshold, // pending_applies records all delayed apply task, and will check again later pending_applies: VecDeque>, + // Ranges that have been logically destroyed at a specific sequence number. We can + // assume there will be no reader (engine snapshot) newer than that sequence number. Therefore, + // they can be physically deleted with `DeleteFiles` when we're sure there is no older + // reader as well. + // To protect this assumption, before a new snapshot is applied, the overlapping pending ranges + // must first be removed. 
+ // The sole purpose of maintaining this list is to optimize deletion with `DeleteFiles` + // whenever we can. Errors while processing them can be ignored. + pending_delete_ranges: PendingDeleteRanges, engine: EK, mgr: SnapManager, coprocessor_host: CoprocessorHost, router: R, pd_client: Option>, - snap_gen_pool: FuturePool, - region_cleanup_pool: FuturePool, - region_cleaner: Arc>>, + pool: FuturePool, } impl Runner @@ -586,6 +391,7 @@ where ) -> Runner { Runner { batch_size: cfg.value().snap_apply_batch_size.0 as usize, + use_delete_range: cfg.value().use_delete_range, ingest_copy_symlink: cfg.value().snap_apply_copy_symlink, clean_stale_tick: 0, clean_stale_check_interval: Duration::from_millis( @@ -594,12 +400,13 @@ where clean_stale_ranges_tick: cfg.value().clean_stale_ranges_tick, tiflash_stores: HashMap::default(), pending_applies: VecDeque::new(), - engine: engine.clone(), - mgr: mgr.clone(), + pending_delete_ranges: PendingDeleteRanges::default(), + engine, + mgr, coprocessor_host, router, pd_client, - snap_gen_pool: YatpPoolBuilder::new(DefaultTicker::default()) + pool: YatpPoolBuilder::new(DefaultTicker::default()) .name_prefix("snap-generator") .thread_count( 1, @@ -607,21 +414,11 @@ where SNAP_GENERATOR_MAX_POOL_SIZE, ) .build_future_pool(), - region_cleanup_pool: YatpPoolBuilder::new(DefaultTicker::default()) - .name_prefix("region-cleanup") - .thread_count(1, 1, 1) - .build_future_pool(), - region_cleaner: Arc::new(Mutex::new(RegionCleaner { - use_delete_range: cfg.value().use_delete_range, - engine, - pending_delete_ranges: PendingDeleteRanges::default(), - mgr, - })), } } pub fn snap_generator_pool(&self) -> FuturePool { - self.snap_gen_pool.clone() + self.pool.clone() } fn region_state(&self, region_id: u64) -> Result { @@ -669,10 +466,7 @@ where let start_key = keys::enc_start_key(®ion); let end_key = keys::enc_end_key(®ion); check_abort(&abort)?; - { - let mut region_cleaner = self.region_cleaner.lock().unwrap(); - region_cleaner.clean_overlap_ranges(start_key, end_key)?; - } + self.clean_overlap_ranges(start_key, end_key)?; check_abort(&abort)?; fail_point!("apply_snap_cleanup_range"); @@ -703,12 +497,15 @@ where self.coprocessor_host .post_apply_snapshot(®ion, peer_id, &snap_key, Some(&s)); - // delete snapshot state. - let mut wb = self.engine.write_batch(); + // Delete snapshot state and assure the relative region state and snapshot state + // is updated and flushed into kvdb. region_state.set_state(PeerState::Normal); + let mut wb = self.engine.write_batch(); box_try!(wb.put_msg_cf(CF_RAFT, &keys::region_state_key(region_id), ®ion_state)); box_try!(wb.delete_cf(CF_RAFT, &keys::snapshot_raft_state_key(region_id))); - wb.write().unwrap_or_else(|e| { + let mut wopts = WriteOptions::default(); + wopts.set_sync(true); + wb.write_opt(&wopts).unwrap_or_else(|e| { panic!("{} failed to save apply_snap result: {:?}", region_id, e); }); info!( @@ -772,6 +569,153 @@ where ); } + /// Tries to clean up files in pending ranges overlapping with the given + /// bounds. These pending ranges will be removed. Returns an updated range + /// that also includes these ranges. Caller must ensure the remaining keys + /// in the returning range will be deleted properly. 
+ fn clean_overlap_ranges_roughly( + &mut self, + mut start_key: Vec, + mut end_key: Vec, + ) -> (Vec, Vec) { + let overlap_ranges = self + .pending_delete_ranges + .drain_overlap_ranges(&start_key, &end_key); + if overlap_ranges.is_empty() { + return (start_key, end_key); + } + CLEAN_COUNTER_VEC.with_label_values(&["overlap"]).inc(); + let oldest_sequence = self + .engine + .get_oldest_snapshot_sequence_number() + .unwrap_or(u64::MAX); + let df_ranges: Vec<_> = overlap_ranges + .iter() + .filter_map(|(region_id, cur_start, cur_end, stale_sequence)| { + info!( + "delete data in range because of overlap"; "region_id" => region_id, + "start_key" => log_wrappers::Value::key(cur_start), + "end_key" => log_wrappers::Value::key(cur_end) + ); + if &start_key > cur_start { + start_key = cur_start.clone(); + } + if &end_key < cur_end { + end_key = cur_end.clone(); + } + if *stale_sequence < oldest_sequence { + Some(Range::new(cur_start, cur_end)) + } else { + SNAP_COUNTER_VEC + .with_label_values(&["overlap", "not_delete_files"]) + .inc(); + None + } + }) + .collect(); + self.engine + .delete_ranges_cfs( + &WriteOptions::default(), + DeleteStrategy::DeleteFiles, + &df_ranges, + ) + .map_err(|e| { + error!("failed to delete files in range"; "err" => %e); + }) + .unwrap(); + (start_key, end_key) + } + + /// Cleans up data in the given range and all pending ranges overlapping + /// with it. + fn clean_overlap_ranges(&mut self, start_key: Vec, end_key: Vec) -> Result<()> { + let (start_key, end_key) = self.clean_overlap_ranges_roughly(start_key, end_key); + self.delete_all_in_range(&[Range::new(&start_key, &end_key)]) + } + + /// Inserts a new pending range, and it will be cleaned up with some delay. + fn insert_pending_delete_range( + &mut self, + region_id: u64, + start_key: Vec, + end_key: Vec, + ) { + let (start_key, end_key) = self.clean_overlap_ranges_roughly(start_key, end_key); + info!("register deleting data in range"; + "region_id" => region_id, + "start_key" => log_wrappers::Value::key(&start_key), + "end_key" => log_wrappers::Value::key(&end_key), + ); + let seq = self.engine.get_latest_sequence_number(); + self.pending_delete_ranges + .insert(region_id, start_key, end_key, seq); + } + + /// Cleans up stale ranges. 
+    /// Cleans up stale ranges.
+    fn clean_stale_ranges(&mut self) {
+        STALE_PEER_PENDING_DELETE_RANGE_GAUGE.set(self.pending_delete_ranges.len() as f64);
+        if self.ingest_maybe_stall() {
+            return;
+        }
+        let oldest_sequence = self
+            .engine
+            .get_oldest_snapshot_sequence_number()
+            .unwrap_or(u64::MAX);
+        let mut region_ranges: Vec<(u64, Vec<u8>, Vec<u8>)> = self
+            .pending_delete_ranges
+            .stale_ranges(oldest_sequence)
+            .map(|(region_id, s, e)| (region_id, s.to_vec(), e.to_vec()))
+            .collect();
+        if region_ranges.is_empty() {
+            return;
+        }
+        CLEAN_COUNTER_VEC.with_label_values(&["destroy"]).inc_by(1);
+        region_ranges.sort_by(|a, b| a.1.cmp(&b.1));
+        region_ranges.truncate(CLEANUP_MAX_REGION_COUNT);
+        let ranges: Vec<_> = region_ranges
+            .iter()
+            .map(|(region_id, start, end)| {
+                info!("delete data in range because of stale"; "region_id" => region_id,
+                    "start_key" => log_wrappers::Value::key(start),
+                    "end_key" => log_wrappers::Value::key(end));
+                Range::new(start, end)
+            })
+            .collect();
+
+        self.engine
+            .delete_ranges_cfs(
+                &WriteOptions::default(),
+                DeleteStrategy::DeleteFiles,
+                &ranges,
+            )
+            .map_err(|e| {
+                error!("failed to delete files in range"; "err" => %e);
+            })
+            .unwrap();
+        if let Err(e) = self.delete_all_in_range(&ranges) {
+            error!("failed to cleanup stale range"; "err" => %e);
+            return;
+        }
+        self.engine
+            .delete_ranges_cfs(
+                &WriteOptions::default(),
+                DeleteStrategy::DeleteBlobs,
+                &ranges,
+            )
+            .map_err(|e| {
+                error!("failed to delete blobs in range"; "err" => %e);
+            })
+            .unwrap();
+
+        for (_, key, _) in region_ranges {
+            assert!(
+                self.pending_delete_ranges.remove(&key).is_some(),
+                "cleanup pending_delete_ranges {} should exist",
+                log_wrappers::Value::key(&key)
+            );
+        }
+    }
+
     /// Checks the number of files at level 0 to avoid write stall after
     /// ingesting sst. Returns true if the ingestion causes write stall.
     fn ingest_maybe_stall(&self) -> bool {
@@ -787,6 +731,36 @@ where
         false
     }
 
+    fn delete_all_in_range(&self, ranges: &[Range<'_>]) -> Result<()> {
+        let wopts = WriteOptions::default();
+        for cf in self.engine.cf_names() {
+            // CF_LOCK usually contains fewer keys than other CFs, so we delete them by key.
+            let (strategy, observer) = if cf == CF_LOCK {
+                (
+                    DeleteStrategy::DeleteByKey,
+                    &CLEAR_OVERLAP_REGION_DURATION.by_key,
+                )
+            } else if self.use_delete_range {
+                (
+                    DeleteStrategy::DeleteByRange,
+                    &CLEAR_OVERLAP_REGION_DURATION.by_range,
+                )
+            } else {
+                (
+                    DeleteStrategy::DeleteByWriter {
+                        sst_path: self.mgr.get_temp_path_for_ingest(),
+                    },
+                    &CLEAR_OVERLAP_REGION_DURATION.by_ingest_files,
+                )
+            };
+            let start = Instant::now();
+            box_try!(self.engine.delete_ranges_cf(&wopts, cf, strategy, ranges));
+            observer.observe(start.saturating_elapsed_secs());
+        }
+
+        Ok(())
+    }
+
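`delete_all_in_range` above picks one of three `DeleteStrategy` values per column family. The decision, written out as a plain function (an illustrative mirror of the patched logic, assuming `engine_traits::DeleteStrategy` as used in the hunk above):

    use engine_traits::{DeleteStrategy, CF_LOCK};

    // Illustrative sketch: CF_LOCK is small, so per-key tombstones are the
    // cheapest choice there; otherwise honor `use_delete_range`, falling back
    // to writing an SST full of deletions and ingesting it.
    fn pick_strategy(cf: &str, use_delete_range: bool, sst_path: String) -> DeleteStrategy {
        if cf == CF_LOCK {
            DeleteStrategy::DeleteByKey
        } else if use_delete_range {
            // One RocksDB range tombstone; cheap to write, but can slow
            // down later reads over the range.
            DeleteStrategy::DeleteByRange
        } else {
            DeleteStrategy::DeleteByWriter { sst_path }
        }
    }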
     /// Calls observer `pre_apply_snapshot` for every task.
     /// Multiple tasks can be `pre_apply_snapshot` at the same time.
     fn pre_apply_snapshot(&self, task: &Task<EK::Snapshot>) -> Result<()> {
@@ -920,7 +894,7 @@ where
                     start: UnixSecs::now(),
                 };
                 let scheduled_time = Instant::now_coarse();
-                self.snap_gen_pool.spawn(async move {
+                self.pool.spawn(async move {
                     SNAP_GEN_WAIT_DURATION_HISTOGRAM
                         .observe(scheduled_time.saturating_elapsed_secs());
@@ -961,19 +935,11 @@ where
                 start_key,
                 end_key,
             } => {
-                let region_cleaner = self.region_cleaner.clone();
-                self.region_cleanup_pool
-                    .spawn(async move {
-                        fail_point!("on_region_worker_destroy", region_id == 1000, |_| {});
-                        let mut region_cleaner = region_cleaner.lock().unwrap();
-                        // try to delay the range deletion because
-                        // there might be a coprocessor request related to this range
-                        region_cleaner.insert_pending_delete_range(region_id, start_key, end_key);
-                        region_cleaner.clean_stale_ranges();
-                    })
-                    .unwrap_or_else(|e| {
-                        error!("failed to destroy region"; "region_id" => region_id, "err" => ?e);
-                    });
+                fail_point!("on_region_worker_destroy", true, |_| {});
+                // Try to delay the range deletion, because there might be a
+                // coprocessor request related to this range.
+                self.insert_pending_delete_range(region_id, start_key, end_key);
+                self.clean_stale_ranges();
             }
         }
     }
@@ -989,7 +955,7 @@ where
         self.handle_pending_applies(true);
         self.clean_stale_tick += 1;
         if self.clean_stale_tick >= self.clean_stale_ranges_tick {
-            self.region_cleaner.lock().unwrap().clean_stale_ranges();
+            self.clean_stale_ranges();
             self.clean_stale_tick = 0;
         }
     }
@@ -1165,7 +1131,7 @@ pub(crate) mod tests {
             ranges.push(key);
         }
         engine.kv.put(b"k1", b"v1").unwrap();
-        let snap = engine.kv.snapshot(None);
+        let snap = engine.kv.snapshot();
         engine.kv.put(b"k2", b"v2").unwrap();
 
         sched
@@ -1278,7 +1244,7 @@ pub(crate) mod tests {
         sched
             .schedule(Task::Gen {
                 region_id: id,
-                kv_snap: engine.kv.snapshot(None),
+                kv_snap: engine.kv.snapshot(),
                 last_applied_term: entry.get_term(),
                 last_applied_state: apply_state,
                 canceled: Arc::new(AtomicBool::new(false)),
diff --git a/components/raftstore/src/store/worker/split_controller.rs b/components/raftstore/src/store/worker/split_controller.rs
index 32362ef4b8c..0a6a19acc91 100644
--- a/components/raftstore/src/store/worker/split_controller.rs
+++ b/components/raftstore/src/store/worker/split_controller.rs
@@ -14,7 +14,7 @@ use kvproto::{
     metapb::{self, Peer},
     pdpb::QueryKind,
 };
-use pd_client::{BucketMeta, BucketStat};
+use pd_client::{BucketMeta, BucketStat, RegionWriteCfCopDetail};
 use rand::Rng;
 use resource_metering::RawRecords;
 use tikv_util::{
@@ -333,6 +333,7 @@ impl Recorder {
 pub struct RegionInfo {
     pub sample_num: usize,
     pub query_stats: QueryStats,
+    pub cop_detail: RegionWriteCfCopDetail,
     pub peer: Peer,
     pub key_ranges: Vec<KeyRange>,
     pub flow: FlowStatistics,
@@ -343,6 +344,7 @@ impl RegionInfo {
         RegionInfo {
             sample_num,
             query_stats: QueryStats::default(),
+            cop_detail: RegionWriteCfCopDetail::default(),
             key_ranges: Vec::with_capacity(sample_num),
             peer: Peer::default(),
             flow: FlowStatistics::default(),
@@ -444,6 +446,7 @@ impl ReadStats {
         end: Option<&[u8]>,
         write: &FlowStatistics,
         data: &FlowStatistics,
+        write_cf_cop_detail: &RegionWriteCfCopDetail,
     ) {
         let num = self.sample_num;
         let region_info = self
             .region_infos
             .entry(region_id)
             .or_insert_with(|| RegionInfo::new(num));
         region_info.flow.add(write);
         region_info.flow.add(data);
+        region_info.cop_detail.add(write_cf_cop_detail);
         // the bucket of the follower only have the version info and not needs to be
         // recorded the hot bucket.
if let Some(buckets) = buckets diff --git a/components/range_cache_memory_engine/src/background.rs b/components/range_cache_memory_engine/src/background.rs deleted file mode 100644 index 140f4645cbe..00000000000 --- a/components/range_cache_memory_engine/src/background.rs +++ /dev/null @@ -1,2631 +0,0 @@ -// Copyright 2024 TiKV Project Authors. Licensed under Apache-2.0. - -use std::{collections::BTreeSet, fmt::Display, sync::Arc, thread::JoinHandle, time::Duration}; - -use bytes::Bytes; -use crossbeam::{ - channel::{bounded, tick, Sender}, - epoch, select, -}; -use engine_rocks::{RocksEngine, RocksSnapshot}; -use engine_traits::{ - CacheRange, IterOptions, Iterable, Iterator, MiscExt, RangeHintService, SnapshotMiscExt, - CF_DEFAULT, CF_WRITE, DATA_CFS, -}; -use parking_lot::RwLock; -use pd_client::{PdClient, RpcClient}; -use raftstore::coprocessor::RegionInfoProvider; -use slog_global::{error, info, warn}; -use tikv_util::{ - config::ReadableSize, - future::block_on_timeout, - keybuilder::KeyBuilder, - time::Instant, - worker::{Builder, Runnable, RunnableWithTimer, ScheduleError, Scheduler, Worker}, -}; -use txn_types::{Key, TimeStamp, WriteRef, WriteType}; -use yatp::Remote; - -use crate::{ - engine::{RangeCacheMemoryEngineCore, SkiplistHandle}, - keys::{ - decode_key, encode_key, encode_key_for_boundary_with_mvcc, encoding_for_filter, - InternalBytes, InternalKey, ValueType, - }, - memory_controller::{MemoryController, MemoryUsage}, - metrics::{ - GC_FILTERED_STATIC, RANGE_CACHE_COUNT, RANGE_CACHE_MEMORY_USAGE, RANGE_GC_TIME_HISTOGRAM, - RANGE_LOAD_TIME_HISTOGRAM, - }, - range_manager::LoadFailedReason, - range_stats::{RangeStatsManager, DEFAULT_EVICT_MIN_DURATION}, - region_label::{ - LabelRule, RegionLabelAddedCb, RegionLabelRulesManager, RegionLabelServiceBuilder, - }, - write_batch::RangeCacheWriteBatchEntry, -}; - -/// Try to extract the key and `u64` timestamp from `encoded_key`. 
-/// -/// See also: [`txn_types::Key::split_on_ts_for`] -fn split_ts(key: &[u8]) -> Result<(&[u8], u64), String> { - match Key::split_on_ts_for(key) { - Ok((key, ts)) => Ok((key, ts.into_inner())), - Err(_) => Err(format!( - "invalid write cf key: {}", - log_wrappers::Value(key) - )), - } -} - -fn parse_write(value: &[u8]) -> Result, String> { - match WriteRef::parse(value) { - Ok(write) => Ok(write), - Err(_) => Err(format!( - "invalid write cf value: {}", - log_wrappers::Value(value) - )), - } -} - -#[derive(Debug)] -pub enum BackgroundTask { - Gc(GcTask), - LoadRange, - MemoryCheckAndEvict, - DeleteRange(Vec), - TopRegionsLoadEvict, - CleanLockTombstone(u64), - SetRocksEngine(RocksEngine), -} - -impl Display for BackgroundTask { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - match self { - BackgroundTask::Gc(ref t) => t.fmt(f), - BackgroundTask::LoadRange => f.debug_struct("LoadTask").finish(), - BackgroundTask::MemoryCheckAndEvict => f.debug_struct("MemoryCheckAndEvict").finish(), - BackgroundTask::DeleteRange(ref r) => { - f.debug_struct("DeleteRange").field("range", r).finish() - } - BackgroundTask::TopRegionsLoadEvict => f.debug_struct("CheckTopRegions").finish(), - BackgroundTask::CleanLockTombstone(ref r) => f - .debug_struct("CleanLockTombstone") - .field("seqno", r) - .finish(), - BackgroundTask::SetRocksEngine(_) => f.debug_struct("SetDiskEngine").finish(), - } - } -} - -#[derive(Debug)] -pub struct GcTask { - pub safe_point: u64, -} - -impl Display for GcTask { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - f.debug_struct("GcTask") - .field("safe_point", &self.safe_point) - .finish() - } -} - -// BgWorkManager managers the worker inits, stops, and task schedules. When -// created, it starts a worker which receives tasks such as gc task, range -// delete task, range snapshot load and so on, and starts a thread for -// periodically schedule gc tasks. -pub struct BgWorkManager { - worker: Worker, - scheduler: Scheduler, - delete_range_scheduler: Scheduler, - tick_stopper: Option<(JoinHandle<()>, Sender)>, - core: Arc>, -} - -impl Drop for BgWorkManager { - fn drop(&mut self) { - let (h, tx) = self.tick_stopper.take().unwrap(); - let _ = tx.send(true); - let _ = h.join(); - self.worker.stop(); - } -} - -pub struct PdRangeHintService(Arc); - -impl RangeHintService for PdRangeHintService {} - -impl From> for PdRangeHintService { - fn from(pd_client: Arc) -> Self { - PdRangeHintService(pd_client) - } -} - -const CACHE_LABEL_RULE_KEY: &str = "cache"; -const CACHE_LABEL_RULE_ALWAYS: &str = "always"; - -/// This implementation starts a background task using to pull down region label -/// rules from PD. -impl PdRangeHintService { - /// Spawn a background task on `remote` to continuosly watch for region - /// label rules that contain the label `cache`; if a new added for which - /// `cache` is set to `always`, request loading the label's keyranges using - /// `range_manager_load_cb`. - /// - /// TODO (afeinberg): Add support for evicting key ranges when the `cache` - /// label is removed or no longer set to always. 
- pub fn start(&self, remote: Remote, range_manager_load_cb: F) - where - F: Fn(&CacheRange) -> Result<(), LoadFailedReason> + Send + Sync + 'static, - { - let pd_client = self.0.clone(); - let region_label_added_cb: RegionLabelAddedCb = Arc::new(move |label_rule: &LabelRule| { - if !label_rule - .labels - .iter() - .any(|e| e.key == CACHE_LABEL_RULE_KEY && e.value == CACHE_LABEL_RULE_ALWAYS) - { - // not related to caching, skip. - return; - } - for key_range in &label_rule.data { - match CacheRange::try_from(key_range) { - Ok(cache_range) => { - info!("Requested to cache range"; "cache_range" => ?&cache_range); - if let Err(reason) = range_manager_load_cb(&cache_range) { - error!("Cache range load failed"; "range" => ?&cache_range, "reason" => ?reason); - } - } - Err(e) => { - error!("Unable to convert key_range rule to cache range"; "err" => ?e); - } - } - } - }); - let mut region_label_svc = RegionLabelServiceBuilder::new( - Arc::new(RegionLabelRulesManager { - region_label_added_cb: Some(region_label_added_cb), - ..RegionLabelRulesManager::default() - }), - pd_client, - ) - .rule_filter_fn(|label_rule| { - label_rule - .labels - .iter() - .any(|e| e.key == CACHE_LABEL_RULE_KEY) - }) - .build() - .unwrap(); - remote.spawn(async move { region_label_svc.watch_region_labels().await }) - } -} - -impl BgWorkManager { - pub fn new( - core: Arc>, - pd_client: Arc, - gc_interval: Duration, - load_evict_interval: Duration, - expected_region_size: usize, - memory_controller: Arc, - region_info_provider: Option>, - ) -> Self { - let worker = Worker::new("range-cache-background-worker"); - let (runner, delete_range_scheduler) = BackgroundRunner::new( - core.clone(), - memory_controller, - region_info_provider, - expected_region_size, - ); - let scheduler = worker.start_with_timer("range-cache-engine-background", runner); - - let (h, tx) = BgWorkManager::start_tick( - scheduler.clone(), - pd_client, - gc_interval, - load_evict_interval, - ); - - Self { - worker, - scheduler, - delete_range_scheduler, - tick_stopper: Some((h, tx)), - core, - } - } - - pub fn schedule_task(&self, task: BackgroundTask) -> Result<(), ScheduleError> { - match task { - task @ BackgroundTask::DeleteRange(_) => { - self.delete_range_scheduler.schedule_force(task) - } - task => self.scheduler.schedule_force(task), - } - } - - pub fn start_bg_hint_service(&self, range_hint_service: PdRangeHintService) { - let core = self.core.clone(); - range_hint_service.start(self.worker.remote(), move |cache_range: &CacheRange| { - let mut engine = core.write(); - engine.mut_range_manager().load_range(cache_range.clone())?; - // TODO (afeinberg): This does not actually load the range. The load happens - // the apply thread begins to apply raft entries. To force this (for read-only - // use-cases) we should propose a No-Op command. - Ok(()) - }); - } - - fn start_tick( - scheduler: Scheduler, - pd_client: Arc, - gc_interval: Duration, - load_evict_interval: Duration, - ) -> (JoinHandle<()>, Sender) { - let (tx, rx) = bounded(0); - // TODO: Instead of spawning a new thread, we should run this task - // in a shared background thread. - let h = std::thread::spawn(move || { - let gc_ticker = tick(gc_interval); - let load_evict_ticker = tick(load_evict_interval); // TODO (afeinberg): Use a real value. - // 5 seconds should be long enough for getting a TSO from PD. - let tso_timeout = std::cmp::min(gc_interval, Duration::from_secs(5)); - 'LOOP: loop { - select! 
{ - recv(gc_ticker) -> _ => { - let now = match block_on_timeout(pd_client.get_tso(), tso_timeout) { - Ok(Ok(ts)) => ts, - err => { - error!( - "schedule range cache engine gc failed "; - "timeout_duration" => ?tso_timeout, - "error" => ?err, - ); - continue 'LOOP; - } - }; - let safe_point = now.physical() - gc_interval.as_millis() as u64; - let safe_point = TimeStamp::compose(safe_point, 0).into_inner(); - if let Err(e) = scheduler.schedule(BackgroundTask::Gc(GcTask {safe_point})) { - error!( - "schedule range cache engine gc failed"; - "err" => ?e, - ); - } - }, - recv(load_evict_ticker) -> _ => { - if let Err(e) = scheduler.schedule(BackgroundTask::TopRegionsLoadEvict) { - error!( - "schedule load evict failed"; - "err" => ?e, - ); - } - }, - recv(rx) -> r => { - if let Err(e) = r { - error!( - "receive error in range cache engien gc ticker"; - "err" => ?e, - ); - } - return; - }, - } - } - }); - (h, tx) - } -} - -#[derive(Clone)] -struct BackgroundRunnerCore { - engine: Arc>, - memory_controller: Arc, - range_stats_manager: Option, -} - -impl BackgroundRunnerCore { - /// Returns the ranges that are eligible for garbage collection. - /// - /// Returns `None` if there are no ranges cached or the previous gc is not - /// finished. - fn ranges_for_gc(&self) -> Option> { - let ranges: BTreeSet = { - let core = self.engine.read(); - if core.range_manager().has_ranges_in_gc() { - return None; - } - core.range_manager().ranges().keys().cloned().collect() - }; - let ranges_clone = ranges.clone(); - if ranges_clone.is_empty() { - return None; - } - { - let mut core = self.engine.write(); - core.mut_range_manager().set_ranges_in_gc(ranges_clone); - } - Some(ranges) - } - - fn gc_range(&self, range: &CacheRange, safe_point: u64, oldest_seqno: u64) -> FilterMetrics { - let (skiplist_engine, safe_ts) = { - let mut core = self.engine.write(); - let Some(range_meta) = core.mut_range_manager().mut_range_meta(range) else { - return FilterMetrics::default(); - }; - let min_snapshot = range_meta - .range_snapshot_list() - .min_snapshot_ts() - .unwrap_or(u64::MAX); - let safe_point = u64::min(safe_point, min_snapshot); - - if safe_point <= range_meta.safe_point() { - info!( - "safe point not large enough"; - "prev" => range_meta.safe_point(), - "current" => safe_point, - ); - return FilterMetrics::default(); - } - - // todo: change it to debug! 
- info!( - "safe point update"; - "prev" => range_meta.safe_point(), - "current" => safe_point, - "range" => ?range, - ); - range_meta.set_safe_point(safe_point); - (core.engine(), safe_point) - }; - - let start = Instant::now(); - let write_cf_handle = skiplist_engine.cf_handle(CF_WRITE); - let default_cf_handle = skiplist_engine.cf_handle(CF_DEFAULT); - let mut filter = Filter::new( - safe_ts, - oldest_seqno, - default_cf_handle, - write_cf_handle.clone(), - ); - - let mut iter = write_cf_handle.iterator(); - let guard = &epoch::pin(); - let (start_key, end_key) = encode_key_for_boundary_with_mvcc(range); - iter.seek(&start_key, guard); - while iter.valid() && iter.key() < &end_key { - let k = iter.key(); - let v = iter.value(); - if let Err(e) = filter.filter(k.as_bytes(), v.as_bytes()) { - warn!( - "Something Wrong in memory engine GC"; - "error" => ?e, - ); - } - iter.next(guard); - } - - let duration = start.saturating_elapsed(); - RANGE_GC_TIME_HISTOGRAM.observe(duration.as_secs_f64()); - info!( - "range gc complete"; - "range" => ?range, - "gc_duration" => ?duration, - "total_version" => filter.metrics.total, - "filtered_version" => filter.metrics.filtered, - "below_safe_point_unique_keys" => filter.metrics.unique_key, - "below_safe_point_version" => filter.metrics.versions, - "below_safe_point_delete_version" => filter.metrics.delete_versions, - "current_safe_point" => safe_ts, - ); - - let mut metrics = std::mem::take(&mut filter.metrics); - if filter.cached_mvcc_delete_key.is_some() { - metrics.filtered += 1; - } - if filter.cached_skiplist_delete_key.is_some() { - metrics.filtered += 1; - } - metrics - } - - fn on_gc_finished(&mut self, ranges: BTreeSet) { - let mut core = self.engine.write(); - core.mut_range_manager().on_gc_finished(ranges); - } - - /// Returns the first range to load with RocksDB snapshot. The `bool` - /// returned indicates whether the task has been canceled due to memory - /// issue. - /// - /// Returns `None` if there are no ranges to load. - fn get_range_to_load(&self) -> Option<(CacheRange, Arc, bool)> { - let core = self.engine.read(); - core.range_manager() - .pending_ranges_loading_data - .front() - .cloned() - } - - // if `false` is returned, the load is canceled - fn on_snapshot_load_finished( - &mut self, - range: CacheRange, - delete_range_scheduler: &Scheduler, - ) -> bool { - fail::fail_point!("on_snapshot_load_finished"); - fail::fail_point!("on_snapshot_load_finished2"); - loop { - // Consume the cached write batch after the snapshot is acquired. - let mut core = self.engine.write(); - // We still need to check whether the snapshot is canceled during the load - let canceled = core - .range_manager() - .pending_ranges_loading_data - .front() - .unwrap() - .2; - if canceled { - let (r, ..) 
= core - .mut_range_manager() - .pending_ranges_loading_data - .pop_front() - .unwrap(); - assert_eq!(r, range); - core.mut_range_manager() - .ranges_being_deleted - .insert(r.clone()); - core.remove_cached_write_batch(&range); - drop(core); - fail::fail_point!("in_memory_engine_snapshot_load_canceled"); - - if let Err(e) = - delete_range_scheduler.schedule_force(BackgroundTask::DeleteRange(vec![r])) - { - error!( - "schedule delete range failed"; - "err" => ?e, - ); - assert!(tikv_util::thread_group::is_shutdown(!cfg!(test))); - } - - return false; - } - - if core.has_cached_write_batch(&range) { - let (cache_batch, skiplist_engine) = { - ( - core.take_cached_write_batch_entries(&range), - core.engine().clone(), - ) - }; - drop(core); - let guard = &epoch::pin(); - for (seq, entry) in cache_batch { - entry - .write_to_memory( - seq, - &skiplist_engine, - self.memory_controller.clone(), - guard, - ) - .unwrap(); - } - fail::fail_point!("on_cached_write_batch_consumed"); - } else { - core.remove_cached_write_batch(&range); - RangeCacheMemoryEngineCore::pending_range_completes_loading(&mut core, &range); - drop(core); - - fail::fail_point!("pending_range_completes_loading"); - break; - } - } - true - } - - fn on_snapshot_load_canceled( - &mut self, - range: CacheRange, - delete_range_scheduler: &Scheduler, - ) { - let mut core = self.engine.write(); - let (r, ..) = core - .mut_range_manager() - .pending_ranges_loading_data - .pop_front() - .unwrap(); - assert_eq!(r, range); - core.remove_cached_write_batch(&range); - core.mut_range_manager() - .ranges_being_deleted - .insert(r.clone()); - - if let Err(e) = delete_range_scheduler.schedule_force(BackgroundTask::DeleteRange(vec![r])) - { - error!( - "schedule delete range failed"; - "err" => ?e, - ); - assert!(tikv_util::thread_group::is_shutdown(!cfg!(test))); - } - } - - /// Eviction on soft limit reached: - /// - /// When soft limit is reached, collect the candidates for eviction, and - /// keep evicting until either all candidates are evicted, or the total - /// approximated size of evicted regions is equal to or greater than the - /// excess memory usage. - fn evict_on_soft_limit_reached(&self, delete_range_scheduler: &Scheduler) { - if self.range_stats_manager.is_none() { - warn!("range stats manager is not initialized, cannot evict on soft limit reached"); - return; - } - let range_stats_manager = self.range_stats_manager.as_ref().unwrap(); - let to_shrink_by = self - .memory_controller - .mem_usage() - .checked_sub(self.memory_controller.soft_limit_threshold()); - if to_shrink_by.is_none() { - return; - } - let mut remaining = to_shrink_by.unwrap(); - let mut ranges_to_evict = Vec::<(CacheRange, u64)>::with_capacity(256); - - // TODO (afeinberg, low): consider returning just an iterator and using scan - // below for cleaner code. - range_stats_manager.collect_candidates_for_eviction(&mut ranges_to_evict, |range| { - self.engine.read().range_manager().contains_range(range) - }); - - let mut ranges_to_delete = vec![]; - // TODO (afeinberg): approximate size may differ from size in in-memory cache, - // consider taking the actual size into account. 
- for (range, approx_size) in &ranges_to_evict { - if remaining == 0 { - break; - } - let evicted_range = { - let mut engine_wr = self.engine.write(); - let mut ranges = engine_wr.mut_range_manager().evict_range(range); - if !ranges.is_empty() { - info!( - "evict on soft limit reached"; - "range_to_evict" => ?&range, - "ranges_evicted" => ?ranges, - "approx_size" => approx_size, - "remaining" => remaining - ); - remaining = remaining - .checked_sub(*approx_size as usize) - .unwrap_or_default(); - ranges_to_delete.append(&mut ranges); - true - } else { - false - } - }; - if evicted_range { - range_stats_manager.handle_range_evicted(range); - } - } - - if !ranges_to_delete.is_empty() { - if let Err(e) = - delete_range_scheduler.schedule_force(BackgroundTask::DeleteRange(ranges_to_delete)) - { - error!( - "schedule deletet range failed"; - "err" => ?e, - ); - assert!(tikv_util::thread_group::is_shutdown(!cfg!(test))); - } - } - } - - /// Periodically load top regions. - /// - /// If the soft limit is exceeded, evict (some) regions no longer considered - /// top. - /// - /// See: [`RangeStatsManager::collect_changes_ranges`] for - /// algorithm details. - fn top_regions_load_evict(&self, delete_range_scheduler: &Scheduler) { - if self.range_stats_manager.is_none() { - return; - } - let range_stats_manager: &RangeStatsManager = self.range_stats_manager.as_ref().unwrap(); - if range_stats_manager.checking_top_regions() { - return; - } - range_stats_manager.set_checking_top_regions(true); - - let curr_memory_usage = self.memory_controller.mem_usage(); - let threshold = self.memory_controller.soft_limit_threshold(); - range_stats_manager.adjust_max_num_regions(curr_memory_usage, threshold); - - let mut ranges_to_add = Vec::::with_capacity(256); - let mut ranges_to_remove = Vec::::with_capacity(256); - range_stats_manager.collect_changed_ranges(&mut ranges_to_add, &mut ranges_to_remove); - let mut ranges_to_delete = vec![]; - info!("load_evict"; "ranges_to_add" => ?&ranges_to_add, "may_evict" => ?&ranges_to_remove); - for evict_range in ranges_to_remove { - if self.memory_controller.reached_soft_limit() { - let mut core = self.engine.write(); - let mut ranges = core.mut_range_manager().evict_range(&evict_range); - info!( - "load_evict: soft limit reached"; - "range_to_evict" => ?&evict_range, - "ranges_evicted" => ?ranges - ); - ranges_to_delete.append(&mut ranges); - } - } - if !ranges_to_delete.is_empty() { - if let Err(e) = - delete_range_scheduler.schedule_force(BackgroundTask::DeleteRange(ranges_to_delete)) - { - error!( - "schedule deletet range failed"; - "err" => ?e, - ); - assert!(tikv_util::thread_group::is_shutdown(!cfg!(test))); - } - } - for cache_range in ranges_to_add { - let mut core = self.engine.write(); - if let Err(e) = core.mut_range_manager().load_range(cache_range.clone()) { - error!("error loading range"; "cache_range" => ?&cache_range, "err" => ?e); - } - } - range_stats_manager.set_checking_top_regions(false); - info!("load_evict complete"); - } -} - -// Flush epoch and pin enough times to make the delayed operations be executed -#[cfg(test)] -pub(crate) fn flush_epoch() { - { - let guard = &epoch::pin(); - guard.flush(); - } - // Local epoch tries to advance the global epoch every 128 pins. When global - // epoch advances, the operations(here, means delete) in the older epoch can be - // executed. 
- for _ in 0..128 { - let _ = &epoch::pin(); - } -} - -pub struct BackgroundRunner { - core: BackgroundRunnerCore, - - // We have following four separate workers so that each type of task would not block each - // others - range_load_remote: Remote, - range_load_worker: Worker, - - delete_range_scheduler: Scheduler, - delete_range_worker: Worker, - - gc_range_remote: Remote, - gc_range_worker: Worker, - - // Region load and eviction worker. - // TODO: this can be consolidated, possibly with the GC worker. - load_evict_remote: Remote, - load_evict_worker: Worker, - - lock_cleanup_remote: Remote, - lock_cleanup_worker: Worker, - - // The last sequence number for the lock cf tombstone cleanup - last_seqno: u64, - // RocksEngine is used to get the oldest snapshot sequence number. - rocks_engine: Option, -} - -impl Drop for BackgroundRunner { - fn drop(&mut self) { - self.range_load_worker.stop(); - self.delete_range_worker.stop(); - self.gc_range_worker.stop(); - self.load_evict_worker.stop(); - self.lock_cleanup_worker.stop(); - } -} - -impl BackgroundRunner { - pub fn new( - engine: Arc>, - memory_controller: Arc, - region_info_provider: Option>, - expected_region_size: usize, - ) -> (Self, Scheduler) { - let range_load_worker = Builder::new("background-range-load-worker") - // Range load now is implemented sequentially, so we must use exactly one thread to handle it. - // todo(SpadeA): if the load speed is a bottleneck, we may consider to use multiple threads to load ranges. - .thread_count(1) - .create(); - let range_load_remote = range_load_worker.remote(); - - let delete_range_worker = Worker::new("background-delete-range-worker"); - let delete_range_runner = DeleteRangeRunner::new(engine.clone()); - let delete_range_scheduler = - delete_range_worker.start_with_timer("delete-range-runner", delete_range_runner); - - let lock_cleanup_worker = Worker::new("lock-cleanup-worker"); - let lock_cleanup_remote = lock_cleanup_worker.remote(); - - let gc_range_worker = Builder::new("background-range-load-worker") - // Gc must also use exactly one thread to handle it. 
- .thread_count(1) - .create(); - let gc_range_remote = gc_range_worker.remote(); - - let load_evict_worker = Worker::new("background-region-load-evict-worker"); - let load_evict_remote = load_evict_worker.remote(); - - let num_regions_to_cache = memory_controller.soft_limit_threshold() / expected_region_size; - let range_stats_manager = region_info_provider.map(|region_info_provider| { - RangeStatsManager::new( - num_regions_to_cache, - DEFAULT_EVICT_MIN_DURATION, - expected_region_size, - region_info_provider, - ) - }); - ( - Self { - core: BackgroundRunnerCore { - engine, - memory_controller, - range_stats_manager, - }, - range_load_worker, - range_load_remote, - delete_range_worker, - delete_range_scheduler: delete_range_scheduler.clone(), - gc_range_worker, - gc_range_remote, - load_evict_worker, - load_evict_remote, - lock_cleanup_remote, - lock_cleanup_worker, - last_seqno: 0, - rocks_engine: None, - }, - delete_range_scheduler, - ) - } -} - -impl Runnable for BackgroundRunner { - type Task = BackgroundTask; - - fn run(&mut self, task: Self::Task) { - match task { - BackgroundTask::SetRocksEngine(rocks_engine) => { - self.rocks_engine = Some(rocks_engine); - fail::fail_point!("in_memory_engine_set_rocks_engine"); - } - BackgroundTask::Gc(t) => { - let seqno = (|| { - fail::fail_point!("in_memory_engine_gc_oldest_seqno", |t| { - Some(t.unwrap().parse::().unwrap()) - }); - - let Some(ref rocks_engine) = self.rocks_engine else { - return None; - }; - let latest_seqno = rocks_engine.get_latest_sequence_number(); - Some( - rocks_engine - .get_oldest_snapshot_sequence_number() - .unwrap_or(latest_seqno), - ) - })(); - - let Some(seqno) = seqno else { - return; - }; - - info!( - "start a new round of gc for range cache engine"; - "safe_point" => t.safe_point, - "oldest_sequence" => seqno, - ); - let mut core = self.core.clone(); - if let Some(ranges) = core.ranges_for_gc() { - let f = async move { - let mut metrics = FilterMetrics::default(); - for range in &ranges { - let m = core.gc_range(range, t.safe_point, seqno); - metrics.merge(&m); - } - core.on_gc_finished(ranges); - metrics.flush(); - fail::fail_point!("in_memory_engine_gc_finish"); - }; - self.gc_range_remote.spawn(f); - } - } - BackgroundTask::LoadRange => { - let mut core = self.core.clone(); - let delete_range_scheduler = self.delete_range_scheduler.clone(); - let f = async move { - let skiplist_engine = { - let core = core.engine.read(); - core.engine().clone() - }; - while let Some((range, snap, mut canceled)) = core.get_range_to_load() { - info!("Loading range"; "range" => ?&range); - let iter_opt = IterOptions::new( - Some(KeyBuilder::from_vec(range.start.clone(), 0, 0)), - Some(KeyBuilder::from_vec(range.end.clone(), 0, 0)), - false, - ); - if core.memory_controller.reached_soft_limit() { - // We are running out of memory, so cancel the load. 
- canceled = true; - } - - if canceled { - info!( - "snapshot load canceled due to memory reaching soft limit"; - "range" => ?range, - ); - core.on_snapshot_load_canceled(range, &delete_range_scheduler); - continue; - } - - let snapshot_load = || -> bool { - for &cf in DATA_CFS { - let handle = skiplist_engine.cf_handle(cf); - let seq = snap.sequence_number(); - let guard = &epoch::pin(); - match snap.iterator_opt(cf, iter_opt.clone()) { - Ok(mut iter) => { - iter.seek_to_first().unwrap(); - while iter.valid().unwrap() { - // use the sequence number from RocksDB snapshot here as - // the kv is clearly visible - let mut encoded_key = - encode_key(iter.key(), seq, ValueType::Value); - let mut val = - InternalBytes::from_vec(iter.value().to_vec()); - - let mem_size = - RangeCacheWriteBatchEntry::calc_put_entry_size( - iter.key(), - val.as_bytes(), - ); - - // todo(SpadeA): we can batch acquire the memory size - // here. - if let MemoryUsage::HardLimitReached(n) = - core.memory_controller.acquire(mem_size) - { - warn!( - "stop loading snapshot due to memory reaching hard limit"; - "range" => ?range, - "memory_usage(MB)" => ReadableSize(n as u64).as_mb_f64(), - ); - return false; - } - - encoded_key.set_memory_controller( - core.memory_controller.clone(), - ); - val.set_memory_controller( - core.memory_controller.clone(), - ); - handle.insert(encoded_key, val, guard); - iter.next().unwrap(); - } - } - Err(e) => { - error!("creating rocksdb iterator failed"; "cf" => cf, "err" => %e); - return false; - } - } - } - true - }; - - let start = Instant::now(); - if !snapshot_load() { - info!( - "snapshot load failed"; - "range" => ?range, - ); - core.on_snapshot_load_canceled(range, &delete_range_scheduler); - continue; - } - - if core.on_snapshot_load_finished(range.clone(), &delete_range_scheduler) { - let duration = start.saturating_elapsed(); - RANGE_LOAD_TIME_HISTOGRAM.observe(duration.as_secs_f64()); - info!( - "Loading range finished"; - "range" => ?range, - "duration(sec)" => ?duration, - ); - } else { - info!("Loading range canceled";"range" => ?range); - } - } - }; - self.range_load_remote.spawn(f); - } - BackgroundTask::MemoryCheckAndEvict => { - let mem_usage = self.core.memory_controller.mem_usage(); - info!( - "start memory usage check and evict"; - "mem_usage(MB)" => ReadableSize(mem_usage as u64).as_mb() - ); - if mem_usage > self.core.memory_controller.soft_limit_threshold() { - let delete_range_scheduler = self.delete_range_scheduler.clone(); - let core = self.core.clone(); - let task = async move { - core.evict_on_soft_limit_reached(&delete_range_scheduler); - core.memory_controller.set_memory_checking(false); - }; - self.load_evict_remote.spawn(task); - } else { - self.core.memory_controller.set_memory_checking(false); - } - } - // DeleteRange task is executed by `DeleteRangeRunner` with a different scheduler so - // that the task will not be scheduled to here. 
- BackgroundTask::DeleteRange(_) => unreachable!(), - BackgroundTask::TopRegionsLoadEvict => { - let delete_range_scheduler = self.delete_range_scheduler.clone(); - let core = self.core.clone(); - let task = async move { core.top_regions_load_evict(&delete_range_scheduler) }; - self.load_evict_remote.spawn(task); - } - BackgroundTask::CleanLockTombstone(snapshot_seqno) => { - if snapshot_seqno < self.last_seqno { - return; - } - self.last_seqno = snapshot_seqno; - let core = self.core.clone(); - - let f = async move { - info!( - "begin to cleanup tombstones in lock cf"; - "seqno" => snapshot_seqno, - ); - - let mut last_user_key = vec![]; - let mut remove_rest = false; - let mut cached_to_remove: Option> = None; - - let mut removed = 0; - let mut total = 0; - let now = Instant::now(); - let lock_handle = core.engine.read().engine().cf_handle("lock"); - let guard = &epoch::pin(); - let mut iter = lock_handle.iterator(); - iter.seek_to_first(guard); - while iter.valid() { - total += 1; - let InternalKey { - user_key, - v_type, - sequence, - } = decode_key(iter.key().as_bytes()); - if user_key != last_user_key { - if let Some(remove) = cached_to_remove.take() { - removed += 1; - lock_handle.remove(&InternalBytes::from_vec(remove), guard); - } - last_user_key = user_key.to_vec(); - if sequence >= snapshot_seqno { - remove_rest = false; - } else { - remove_rest = true; - if v_type == ValueType::Deletion { - cached_to_remove = Some(iter.key().as_bytes().to_vec()); - } - } - } else if remove_rest { - assert!(sequence < snapshot_seqno); - removed += 1; - lock_handle.remove(iter.key(), guard); - } else if sequence < snapshot_seqno { - remove_rest = true; - if v_type == ValueType::Deletion { - assert!(cached_to_remove.is_none()); - cached_to_remove = Some(iter.key().as_bytes().to_vec()); - } - } - - iter.next(guard); - } - if let Some(remove) = cached_to_remove.take() { - removed += 1; - lock_handle.remove(&InternalBytes::from_vec(remove), guard); - } - - info!( - "cleanup tombstones in lock cf"; - "seqno" => snapshot_seqno, - "total" => total, - "removed" => removed, - "duration" => ?now.saturating_elapsed(), - "current_count" => lock_handle.len(), - ); - - fail::fail_point!("clean_lock_tombstone_done"); - }; - - self.lock_cleanup_remote.spawn(f); - } - } - } -} - -impl RunnableWithTimer for BackgroundRunner { - fn on_timeout(&mut self) { - let mem_usage = self.core.memory_controller.mem_usage(); - RANGE_CACHE_MEMORY_USAGE.set(mem_usage as i64); - - let core = self.core.engine.read(); - let pending = core.range_manager.pending_ranges.len(); - let cached = core.range_manager.ranges().len(); - let loading = core.range_manager.pending_ranges_loading_data.len(); - let evictions = core.range_manager.get_and_reset_range_evictions(); - drop(core); - RANGE_CACHE_COUNT - .with_label_values(&["pending_range"]) - .set(pending as i64); - RANGE_CACHE_COUNT - .with_label_values(&["cached_range"]) - .set(cached as i64); - RANGE_CACHE_COUNT - .with_label_values(&["loading_range"]) - .set(loading as i64); - RANGE_CACHE_COUNT - .with_label_values(&["range_evictions"]) - .set(evictions as i64); - } - - fn get_interval(&self) -> Duration { - Duration::from_secs(1) - } -} - -pub struct DeleteRangeRunner { - engine: Arc>, - // It is possible that when `DeleteRangeRunner` begins to delete a range, the range is being - // written by apply threads. In that case, we have to delay the delete range task to avoid race - // condition between them. 
Periodically, these delayed ranges will be checked to see if it is - // ready to be deleted. - delay_ranges: Vec, -} - -impl DeleteRangeRunner { - fn new(engine: Arc>) -> Self { - Self { - engine, - delay_ranges: vec![], - } - } - - fn delete_ranges(&mut self, ranges: &[CacheRange]) { - let skiplist_engine = self.engine.read().engine(); - for r in ranges { - skiplist_engine.delete_range(r); - } - self.engine - .write() - .mut_range_manager() - .on_delete_ranges(ranges); - - fail::fail_point!("in_memory_engine_delete_range_done"); - - #[cfg(test)] - flush_epoch(); - } -} - -impl Runnable for DeleteRangeRunner { - type Task = BackgroundTask; - fn run(&mut self, task: Self::Task) { - match task { - BackgroundTask::DeleteRange(ranges) => { - let (mut ranges_to_delay, ranges_to_delete) = { - let core = self.engine.read(); - let mut ranges_to_delay = vec![]; - let mut ranges_to_delete = vec![]; - for r in ranges { - // If the range is overlapped with ranges in `ranges_being_written`, the - // range has to be delayed to delete. See comment on `delay_ranges`. - if core - .range_manager - .is_overlapped_with_ranges_being_written(&r) - { - ranges_to_delay.push(r); - } else { - ranges_to_delete.push(r); - } - } - (ranges_to_delay, ranges_to_delete) - }; - self.delay_ranges.append(&mut ranges_to_delay); - if !ranges_to_delete.is_empty() { - self.delete_ranges(&ranges_to_delete); - } - } - _ => unreachable!(), - } - } -} - -impl RunnableWithTimer for DeleteRangeRunner { - fn on_timeout(&mut self) { - if self.delay_ranges.is_empty() { - return; - } - let ranges = std::mem::take(&mut self.delay_ranges); - self.run(BackgroundTask::DeleteRange(ranges)); - } - - fn get_interval(&self) -> Duration { - Duration::from_millis(500) - } -} - -#[derive(Default)] -struct FilterMetrics { - total: usize, - versions: usize, - delete_versions: usize, - filtered: usize, - unique_key: usize, - mvcc_rollback_and_locks: usize, -} - -impl FilterMetrics { - fn merge(&mut self, other: &FilterMetrics) { - self.total += other.total; - self.versions += other.versions; - self.delete_versions += other.delete_versions; - self.filtered += other.filtered; - self.unique_key += other.unique_key; - self.mvcc_rollback_and_locks += other.mvcc_rollback_and_locks; - } - - fn flush(&self) { - GC_FILTERED_STATIC.total.inc_by(self.total as u64); - GC_FILTERED_STATIC - .below_safe_point_total - .inc_by(self.versions as u64); - GC_FILTERED_STATIC.filtered.inc_by(self.filtered as u64); - GC_FILTERED_STATIC - .below_safe_point_unique - .inc_by(self.unique_key as u64); - } -} - -struct Filter { - safe_point: u64, - oldest_seqno: u64, - mvcc_key_prefix: Vec, - remove_older: bool, - - default_cf_handle: SkiplistHandle, - write_cf_handle: SkiplistHandle, - - // When deleting some keys, the latest one should be deleted at last to avoid the older - // version appears. 
- cached_mvcc_delete_key: Option>, - cached_skiplist_delete_key: Option>, - - metrics: FilterMetrics, - - last_user_key: Vec, -} - -impl Drop for Filter { - fn drop(&mut self) { - if let Some(cached_delete_key) = self.cached_mvcc_delete_key.take() { - let guard = &epoch::pin(); - self.write_cf_handle - .remove(&InternalBytes::from_vec(cached_delete_key), guard); - } - if let Some(cached_delete_key) = self.cached_skiplist_delete_key.take() { - let guard = &epoch::pin(); - self.write_cf_handle - .remove(&InternalBytes::from_vec(cached_delete_key), guard); - } - } -} - -impl Filter { - fn new( - safe_point: u64, - oldest_seqno: u64, - default_cf_handle: SkiplistHandle, - write_cf_handle: SkiplistHandle, - ) -> Self { - Self { - safe_point, - oldest_seqno, - default_cf_handle, - write_cf_handle, - mvcc_key_prefix: vec![], - cached_mvcc_delete_key: None, - cached_skiplist_delete_key: None, - remove_older: false, - metrics: FilterMetrics::default(), - last_user_key: vec![], - } - } - - fn filter(&mut self, key: &Bytes, value: &Bytes) -> Result<(), String> { - self.metrics.total += 1; - let InternalKey { - user_key, - v_type, - sequence, - } = decode_key(key); - - if sequence > self.oldest_seqno { - // skip those under read by some snapshots - return Ok(()); - } - - let (mvcc_key_prefix, commit_ts) = split_ts(user_key)?; - if commit_ts > self.safe_point { - return Ok(()); - } - - // Just like what rocksdb compaction filter does, we do not handle internal - // keys (representing different MVCC versions of the same user key) that have - // been marked as tombstones. However, these keys need to be deleted. Since they - // are below the safe point, we can safely delete them directly now. - // For each user key, we cache the first ValueType::Deletion and delete all the - // older internal keys of the same user keys. The cached ValueType::Delete is - // deleted at last to avoid these older keys visible. - if v_type == ValueType::Deletion { - if let Some(cache_skiplist_delete_key) = self.cached_skiplist_delete_key.take() { - self.metrics.filtered += 1; - // Reaching here in two cases: - // 1. There are two ValueType::Deletion in the same user key. - // 2. Two consecutive ValueType::Deletion of different user keys. - // In either cases, we can delete the previous one directly. - let guard = &epoch::pin(); - self.write_cf_handle - .remove(&InternalBytes::from_vec(cache_skiplist_delete_key), guard) - } - self.cached_skiplist_delete_key = Some(key.to_vec()); - return Ok(()); - } else if let Some(ref cache_skiplist_delete_key) = self.cached_skiplist_delete_key { - let InternalKey { - user_key: cache_skiplist_delete_user_key, - .. - } = decode_key(cache_skiplist_delete_key); - let guard = &epoch::pin(); - if cache_skiplist_delete_user_key == user_key { - self.metrics.filtered += 1; - self.write_cf_handle - .remove(&InternalBytes::from_bytes(key.clone()), guard); - return Ok(()); - } else { - self.metrics.filtered += 1; - self.write_cf_handle.remove( - &InternalBytes::from_vec(self.cached_skiplist_delete_key.take().unwrap()), - guard, - ) - } - } - - let guard = &epoch::pin(); - // Also, we only handle the same user_key once (user_key here refers to the key - // with MVCC version but without sequence number). 
- if user_key != self.last_user_key { - self.last_user_key = user_key.to_vec(); - } else { - self.metrics.filtered += 1; - self.write_cf_handle - .remove(&InternalBytes::from_bytes(key.clone()), guard); - return Ok(()); - } - - self.metrics.versions += 1; - if self.mvcc_key_prefix != mvcc_key_prefix { - self.metrics.unique_key += 1; - self.mvcc_key_prefix.clear(); - self.mvcc_key_prefix.extend_from_slice(mvcc_key_prefix); - self.remove_older = false; - if let Some(cached_delete_key) = self.cached_mvcc_delete_key.take() { - self.metrics.filtered += 1; - self.write_cf_handle - .remove(&InternalBytes::from_vec(cached_delete_key), guard); - } - } - - let mut filtered = self.remove_older; - let write = parse_write(value)?; - if !self.remove_older { - match write.write_type { - WriteType::Rollback | WriteType::Lock => { - self.metrics.mvcc_rollback_and_locks += 1; - filtered = true; - } - WriteType::Put => self.remove_older = true, - WriteType::Delete => { - self.metrics.delete_versions += 1; - self.remove_older = true; - - // The first mvcc type below safe point is the mvcc delete. We should delay to - // remove it until all the followings with the same user key have been deleted - // to avoid older version apper. - self.cached_mvcc_delete_key = Some(key.to_vec()); - } - } - } - - if !filtered { - return Ok(()); - } - self.metrics.filtered += 1; - self.write_cf_handle - .remove(&InternalBytes::from_bytes(key.clone()), guard); - self.handle_filtered_write(write, guard)?; - - Ok(()) - } - - fn handle_filtered_write( - &mut self, - write: WriteRef<'_>, - guard: &epoch::Guard, - ) -> std::result::Result<(), String> { - if write.short_value.is_none() && write.write_type == WriteType::Put { - // todo(SpadeA): We don't know the sequence number of the key in the skiplist so - // we cannot delete it directly. So we encoding a key with MAX sequence number - // so we can find the mvcc key with sequence number in the skiplist by using - // get_with_key and delete it with the result key. It involes more than one - // seek(both get and remove invovle seek). Maybe we can provide the API to - // delete the mvcc keys with all sequence numbers. 
- let default_key = encoding_for_filter(&self.mvcc_key_prefix, write.start_ts); - let mut iter = self.default_cf_handle.iterator(); - iter.seek(&default_key, guard); - while iter.valid() && iter.key().same_user_key_with(&default_key) { - self.default_cf_handle.remove(iter.key(), guard); - iter.next(guard); - } - } - Ok(()) - } -} - -#[cfg(test)] -pub mod tests { - use std::{ - sync::{ - mpsc::{channel, Sender}, - Arc, Mutex, - }, - time::Duration, - }; - - use crossbeam::epoch; - use engine_rocks::util::new_engine; - use engine_traits::{ - CacheRange, IterOptions, Iterable, Iterator, RangeCacheEngine, SyncMutable, CF_DEFAULT, - CF_LOCK, CF_WRITE, DATA_CFS, - }; - use futures::future::ready; - use keys::{data_key, DATA_MAX_KEY, DATA_MIN_KEY}; - use online_config::{ConfigChange, ConfigManager, ConfigValue}; - use pd_client::PdClient; - use tempfile::Builder; - use tikv_util::{ - config::{ReadableDuration, ReadableSize, VersionTrack}, - worker::dummy_scheduler, - }; - use txn_types::{Key, TimeStamp, Write, WriteType}; - - use super::*; - use crate::{ - background::BackgroundRunner, - config::RangeCacheConfigManager, - engine::{SkiplistEngine, SkiplistHandle}, - keys::{ - construct_key, construct_user_key, construct_value, encode_key, encode_seek_key, - encoding_for_filter, InternalBytes, ValueType, - }, - memory_controller::MemoryController, - region_label::{ - region_label_meta_client, - tests::{add_region_label_rule, new_region_label_rule, new_test_server_and_client}, - }, - test_util::{put_data, put_data_with_overwrite}, - write_batch::RangeCacheWriteBatchEntry, - RangeCacheEngineConfig, RangeCacheEngineContext, RangeCacheMemoryEngine, - }; - - fn delete_data( - key: &[u8], - ts: u64, - seq_num: u64, - write_cf: &SkiplistHandle, - mem_controller: Arc, - ) { - let raw_write_k = Key::from_raw(key) - .append_ts(TimeStamp::new(ts)) - .into_encoded(); - let mut write_k = encode_key(&raw_write_k, seq_num, ValueType::Value); - write_k.set_memory_controller(mem_controller.clone()); - let write_v = Write::new(WriteType::Delete, TimeStamp::new(ts), None); - let mut val = InternalBytes::from_vec(write_v.as_ref().to_bytes()); - val.set_memory_controller(mem_controller.clone()); - let guard = &epoch::pin(); - let _ = mem_controller.acquire(RangeCacheWriteBatchEntry::calc_put_entry_size( - &raw_write_k, - val.as_bytes(), - )); - write_cf.insert(write_k, val, guard); - } - - fn rollback_data( - key: &[u8], - ts: u64, - seq_num: u64, - write_cf: &SkiplistHandle, - mem_controller: Arc, - ) { - let raw_write_k = Key::from_raw(key) - .append_ts(TimeStamp::new(ts)) - .into_encoded(); - let mut write_k = encode_key(&raw_write_k, seq_num, ValueType::Value); - write_k.set_memory_controller(mem_controller.clone()); - let write_v = Write::new(WriteType::Rollback, TimeStamp::new(ts), None); - let mut val = InternalBytes::from_vec(write_v.as_ref().to_bytes()); - val.set_memory_controller(mem_controller.clone()); - let guard = &epoch::pin(); - let _ = mem_controller.acquire(RangeCacheWriteBatchEntry::calc_put_entry_size( - &raw_write_k, - val.as_bytes(), - )); - write_cf.insert(write_k, val, guard); - } - - fn element_count(sklist: &SkiplistHandle) -> u64 { - let guard = &epoch::pin(); - let mut count = 0; - let mut iter = sklist.iterator(); - iter.seek_to_first(guard); - while iter.valid() { - count += 1; - iter.next(guard); - } - count - } - - // We should not use skiplist.get directly as we only cares keys without - // sequence number suffix - fn key_exist(sl: &SkiplistHandle, key: &InternalBytes, guard: 
&epoch::Guard) -> bool { - let mut iter = sl.iterator(); - iter.seek(key, guard); - if iter.valid() && iter.key().same_user_key_with(key) { - return true; - } - false - } - - // We should not use skiplist.get directly as we only cares keys without - // sequence number suffix - fn get_value( - sl: &SkiplistHandle, - key: &InternalBytes, - guard: &epoch::Guard, - ) -> Option> { - let mut iter = sl.iterator(); - iter.seek(key, guard); - if iter.valid() && iter.key().same_user_key_with(key) { - return Some(iter.value().as_slice().to_vec()); - } - None - } - - fn dummy_controller(skip_engine: SkiplistEngine) -> Arc { - let mut config = RangeCacheEngineConfig::config_for_test(); - config.soft_limit_threshold = Some(ReadableSize(u64::MAX)); - config.hard_limit_threshold = Some(ReadableSize(u64::MAX)); - let config = Arc::new(VersionTrack::new(config)); - Arc::new(MemoryController::new(config, skip_engine)) - } - - fn encode_raw_key_for_filter(key: &[u8], ts: TimeStamp) -> InternalBytes { - let key = Key::from_raw(key); - encoding_for_filter(key.as_encoded(), ts) - } - - #[test] - fn test_filter() { - let skiplist_engine = SkiplistEngine::new(); - let write = skiplist_engine.cf_handle(CF_WRITE); - let default = skiplist_engine.cf_handle(CF_DEFAULT); - - let memory_controller = dummy_controller(skiplist_engine.clone()); - - put_data( - b"key1", - b"value1", - 10, - 15, - 10, - false, - &default, - &write, - memory_controller.clone(), - ); - put_data( - b"key2", - b"value21", - 10, - 15, - 12, - false, - &default, - &write, - memory_controller.clone(), - ); - put_data( - b"key2", - b"value22", - 20, - 25, - 14, - false, - &default, - &write, - memory_controller.clone(), - ); - // mock repeate apply - put_data( - b"key2", - b"value22", - 20, - 25, - 15, - false, - &default, - &write, - memory_controller.clone(), - ); - put_data( - b"key2", - b"value23", - 30, - 35, - 16, - false, - &default, - &write, - memory_controller.clone(), - ); - put_data( - b"key3", - b"value31", - 20, - 25, - 18, - false, - &default, - &write, - memory_controller.clone(), - ); - put_data( - b"key3", - b"value32", - 30, - 35, - 20, - false, - &default, - &write, - memory_controller.clone(), - ); - delete_data(b"key3", 40, 22, &write, memory_controller.clone()); - assert_eq!(7, element_count(&default)); - assert_eq!(8, element_count(&write)); - - let mut filter = Filter::new(50, 100, default.clone(), write.clone()); - let mut count = 0; - let mut iter = write.iterator(); - let guard = &epoch::pin(); - iter.seek_to_first(guard); - while iter.valid() { - let k = iter.key(); - let v = iter.value(); - filter.filter(k.as_bytes(), v.as_bytes()).unwrap(); - count += 1; - iter.next(guard); - } - assert_eq!(count, 8); - assert_eq!(5, filter.metrics.filtered); - drop(filter); - - assert_eq!(2, element_count(&write)); - assert_eq!(2, element_count(&default)); - - let key = encode_raw_key_for_filter(b"key1", TimeStamp::new(15)); - assert!(key_exist(&write, &key, guard)); - - let key = encode_raw_key_for_filter(b"key2", TimeStamp::new(35)); - assert!(key_exist(&write, &key, guard)); - - let key = encode_raw_key_for_filter(b"key3", TimeStamp::new(35)); - assert!(!key_exist(&write, &key, guard)); - - let key = encode_raw_key_for_filter(b"key1", TimeStamp::new(10)); - assert!(key_exist(&default, &key, guard)); - - let key = encode_raw_key_for_filter(b"key2", TimeStamp::new(30)); - assert!(key_exist(&default, &key, guard)); - - let key = encode_raw_key_for_filter(b"key3", TimeStamp::new(30)); - assert!(!key_exist(&default, &key, guard)); - } - 
- #[test] - fn test_filter_with_delete() { - let engine = RangeCacheMemoryEngine::new(RangeCacheEngineContext::new_for_tests(Arc::new( - VersionTrack::new(RangeCacheEngineConfig::config_for_test()), - ))); - let memory_controller = engine.memory_controller(); - let range = CacheRange::new(b"".to_vec(), b"z".to_vec()); - engine.new_range(range.clone()); - let (write, default) = { - let skiplist_engine = engine.core().write().engine(); - ( - skiplist_engine.cf_handle(CF_WRITE), - skiplist_engine.cf_handle(CF_DEFAULT), - ) - }; - - put_data( - b"key1", - b"value11", - 10, - 15, - 10, - false, - &default, - &write, - memory_controller.clone(), - ); - - // Delete the above key - let guard = &epoch::pin(); - let raw_write_k = Key::from_raw(b"key1") - .append_ts(TimeStamp::new(15)) - .into_encoded(); - let mut write_k = encode_key(&raw_write_k, 15, ValueType::Deletion); - write_k.set_memory_controller(memory_controller.clone()); - let mut val = InternalBytes::from_vec(b"".to_vec()); - val.set_memory_controller(memory_controller.clone()); - write.insert(write_k, val, guard); - - put_data( - b"key2", - b"value22", - 20, - 25, - 14, - false, - &default, - &write, - memory_controller.clone(), - ); - - // Delete the above key - let raw_write_k = Key::from_raw(b"key2") - .append_ts(TimeStamp::new(25)) - .into_encoded(); - let mut write_k = encode_key(&raw_write_k, 15, ValueType::Deletion); - write_k.set_memory_controller(memory_controller.clone()); - let mut val = InternalBytes::from_vec(b"".to_vec()); - val.set_memory_controller(memory_controller.clone()); - write.insert(write_k, val, guard); - - put_data( - b"key2", - b"value23", - 30, - 35, - 16, - false, - &default, - &write, - memory_controller.clone(), - ); - delete_data(b"key2", 40, 18, &write, memory_controller.clone()); - - let snap = engine.snapshot(range.clone(), u64::MAX, u64::MAX).unwrap(); - let mut iter_opts = IterOptions::default(); - iter_opts.set_lower_bound(&range.start, 0); - iter_opts.set_upper_bound(&range.end, 0); - - let (worker, _) = BackgroundRunner::new( - engine.core.clone(), - memory_controller.clone(), - None, - engine.expected_region_size(), - ); - worker.core.gc_range(&range, 40, 100); - - let mut iter = snap.iterator_opt("write", iter_opts).unwrap(); - iter.seek_to_first().unwrap(); - assert!(!iter.valid().unwrap()); - - let mut iter = write.iterator(); - iter.seek_to_first(guard); - assert!(!iter.valid()); - } - - #[test] - fn test_gc() { - let engine = RangeCacheMemoryEngine::new(RangeCacheEngineContext::new_for_tests(Arc::new( - VersionTrack::new(RangeCacheEngineConfig::config_for_test()), - ))); - let memory_controller = engine.memory_controller(); - let range = CacheRange::new(b"".to_vec(), b"z".to_vec()); - engine.new_range(range.clone()); - let (write, default) = { - let skiplist_engine = engine.core().write().engine(); - ( - skiplist_engine.cf_handle(CF_WRITE), - skiplist_engine.cf_handle(CF_DEFAULT), - ) - }; - - let encode_key = |key, ts| { - let key = Key::from_raw(key); - encoding_for_filter(key.as_encoded(), ts) - }; - - put_data( - b"key1", - b"value1", - 10, - 11, - 10, - false, - &default, - &write, - memory_controller.clone(), - ); - put_data( - b"key1", - b"value2", - 12, - 13, - 12, - false, - &default, - &write, - memory_controller.clone(), - ); - put_data( - b"key1", - b"value3", - 14, - 15, - 14, - false, - &default, - &write, - memory_controller.clone(), - ); - assert_eq!(3, element_count(&default)); - assert_eq!(3, element_count(&write)); - - let (worker, _) = BackgroundRunner::new( - 
engine.core.clone(), - memory_controller.clone(), - None, - engine.expected_region_size(), - ); - - // gc should not hanlde keys with larger seqno than oldest seqno - worker.core.gc_range(&range, 13, 10); - assert_eq!(3, element_count(&default)); - assert_eq!(3, element_count(&write)); - - // gc will not remove the latest mvcc put below safe point - worker.core.gc_range(&range, 14, 100); - assert_eq!(2, element_count(&default)); - assert_eq!(2, element_count(&write)); - - worker.core.gc_range(&range, 16, 100); - assert_eq!(1, element_count(&default)); - assert_eq!(1, element_count(&write)); - - // rollback will not make the first older version be filtered - rollback_data(b"key1", 17, 16, &write, memory_controller.clone()); - worker.core.gc_range(&range, 17, 100); - assert_eq!(1, element_count(&default)); - assert_eq!(1, element_count(&write)); - let key = encode_key(b"key1", TimeStamp::new(15)); - let guard = &epoch::pin(); - assert!(key_exist(&write, &key, guard)); - let key = encode_key(b"key1", TimeStamp::new(14)); - assert!(key_exist(&default, &key, guard)); - - // unlike in WriteCompactionFilter, the latest mvcc delete below safe point will - // be filtered - delete_data(b"key1", 19, 18, &write, memory_controller.clone()); - worker.core.gc_range(&range, 19, 100); - assert_eq!(0, element_count(&write)); - assert_eq!(0, element_count(&default)); - } - - // The GC of one range should not impact other ranges - #[test] - fn test_gc_one_range() { - let config = RangeCacheEngineConfig::config_for_test(); - let engine = RangeCacheMemoryEngine::new(RangeCacheEngineContext::new_for_tests(Arc::new( - VersionTrack::new(config), - ))); - let memory_controller = engine.memory_controller(); - let (write, default, range1, range2) = { - let mut core = engine.core().write(); - - let start1 = Key::from_raw(b"k00").into_encoded(); - let end1 = Key::from_raw(b"k10").into_encoded(); - let range1 = CacheRange::new(start1, end1); - core.mut_range_manager().new_range(range1.clone()); - - let start2 = Key::from_raw(b"k30").into_encoded(); - let end2 = Key::from_raw(b"k40").into_encoded(); - let range2 = CacheRange::new(start2, end2); - core.mut_range_manager().new_range(range2.clone()); - - let engine = core.engine(); - ( - engine.cf_handle(CF_WRITE), - engine.cf_handle(CF_DEFAULT), - range1, - range2, - ) - }; - - put_data( - b"k05", - b"val1", - 10, - 11, - 10, - false, - &default, - &write, - memory_controller.clone(), - ); - - put_data( - b"k05", - b"val2", - 12, - 13, - 14, - false, - &default, - &write, - memory_controller.clone(), - ); - - put_data( - b"k05", - b"val1", - 14, - 15, - 18, - false, - &default, - &write, - memory_controller.clone(), - ); - - put_data( - b"k35", - b"val1", - 10, - 11, - 12, - false, - &default, - &write, - memory_controller.clone(), - ); - - put_data( - b"k35", - b"val2", - 12, - 13, - 16, - false, - &default, - &write, - memory_controller.clone(), - ); - - put_data( - b"k35", - b"val1", - 14, - 15, - 20, - false, - &default, - &write, - memory_controller.clone(), - ); - - let encode_key = |key, commit_ts, seq_num| -> InternalBytes { - let raw_write_k = Key::from_raw(key) - .append_ts(TimeStamp::new(commit_ts)) - .into_encoded(); - encode_key(&raw_write_k, seq_num, ValueType::Value) - }; - - let verify = |key, mvcc, seq, handle: &SkiplistHandle| { - let guard = &epoch::pin(); - let key = encode_key(key, mvcc, seq); - let mut iter = handle.iterator(); - iter.seek(&key, guard); - assert_eq!(iter.key(), &key); - iter.next(guard); - assert!(!iter.valid() || 
-
-    // The GC of one range should not impact other ranges
-    #[test]
-    fn test_gc_one_range() {
-        let config = RangeCacheEngineConfig::config_for_test();
-        let engine = RangeCacheMemoryEngine::new(RangeCacheEngineContext::new_for_tests(Arc::new(
-            VersionTrack::new(config),
-        )));
-        let memory_controller = engine.memory_controller();
-        let (write, default, range1, range2) = {
-            let mut core = engine.core().write();
-
-            let start1 = Key::from_raw(b"k00").into_encoded();
-            let end1 = Key::from_raw(b"k10").into_encoded();
-            let range1 = CacheRange::new(start1, end1);
-            core.mut_range_manager().new_range(range1.clone());
-
-            let start2 = Key::from_raw(b"k30").into_encoded();
-            let end2 = Key::from_raw(b"k40").into_encoded();
-            let range2 = CacheRange::new(start2, end2);
-            core.mut_range_manager().new_range(range2.clone());
-
-            let engine = core.engine();
-            (
-                engine.cf_handle(CF_WRITE),
-                engine.cf_handle(CF_DEFAULT),
-                range1,
-                range2,
-            )
-        };
-
-        put_data(
-            b"k05",
-            b"val1",
-            10,
-            11,
-            10,
-            false,
-            &default,
-            &write,
-            memory_controller.clone(),
-        );
-
-        put_data(
-            b"k05",
-            b"val2",
-            12,
-            13,
-            14,
-            false,
-            &default,
-            &write,
-            memory_controller.clone(),
-        );
-
-        put_data(
-            b"k05",
-            b"val1",
-            14,
-            15,
-            18,
-            false,
-            &default,
-            &write,
-            memory_controller.clone(),
-        );
-
-        put_data(
-            b"k35",
-            b"val1",
-            10,
-            11,
-            12,
-            false,
-            &default,
-            &write,
-            memory_controller.clone(),
-        );
-
-        put_data(
-            b"k35",
-            b"val2",
-            12,
-            13,
-            16,
-            false,
-            &default,
-            &write,
-            memory_controller.clone(),
-        );
-
-        put_data(
-            b"k35",
-            b"val1",
-            14,
-            15,
-            20,
-            false,
-            &default,
-            &write,
-            memory_controller.clone(),
-        );
-
-        let encode_key = |key, commit_ts, seq_num| -> InternalBytes {
-            let raw_write_k = Key::from_raw(key)
-                .append_ts(TimeStamp::new(commit_ts))
-                .into_encoded();
-            encode_key(&raw_write_k, seq_num, ValueType::Value)
-        };
-
-        let verify = |key, mvcc, seq, handle: &SkiplistHandle| {
-            let guard = &epoch::pin();
-            let key = encode_key(key, mvcc, seq);
-            let mut iter = handle.iterator();
-            iter.seek(&key, guard);
-            assert_eq!(iter.key(), &key);
-            iter.next(guard);
-            assert!(!iter.valid() || !iter.key().same_user_key_with(&key));
-        };
-
-        assert_eq!(6, element_count(&default));
-        assert_eq!(6, element_count(&write));
-
-        let (worker, _) = BackgroundRunner::new(
-            engine.core.clone(),
-            memory_controller.clone(),
-            None,
-            engine.expected_region_size(),
-        );
-        let filter = worker.core.gc_range(&range1, 100, 100);
-        assert_eq!(2, filter.filtered);
-
-        verify(b"k05", 15, 18, &write);
-        verify(b"k05", 14, 19, &default);
-
-        assert_eq!(4, element_count(&default));
-        assert_eq!(4, element_count(&write));
-
-        let (worker, _) = BackgroundRunner::new(
-            engine.core.clone(),
-            memory_controller.clone(),
-            None,
-            engine.expected_region_size(),
-        );
-        let filter = worker.core.gc_range(&range2, 100, 100);
-        assert_eq!(2, filter.filtered);
-
-        verify(b"k35", 15, 20, &write);
-        verify(b"k35", 14, 21, &default);
-
-        assert_eq!(2, element_count(&default));
-        assert_eq!(2, element_count(&write));
-    }
-
-    #[test]
-    fn test_gc_for_overwrite_write() {
-        let engine = RangeCacheMemoryEngine::new(RangeCacheEngineContext::new_for_tests(Arc::new(
-            VersionTrack::new(RangeCacheEngineConfig::config_for_test()),
-        )));
-        let memory_controller = engine.memory_controller();
-        let range = CacheRange::new(b"".to_vec(), b"z".to_vec());
-        engine.new_range(range.clone());
-        let (write, default) = {
-            let skiplist_engine = engine.core().write().engine();
-            (
-                skiplist_engine.cf_handle(CF_WRITE),
-                skiplist_engine.cf_handle(CF_DEFAULT),
-            )
-        };
-
-        put_data_with_overwrite(
-            b"key1",
-            b"value1",
-            10,
-            11,
-            100,
-            101,
-            false,
-            &default,
-            &write,
-            memory_controller.clone(),
-        );
-
-        assert_eq!(1, element_count(&default));
-        assert_eq!(2, element_count(&write));
-
-        let (worker, _) = BackgroundRunner::new(
-            engine.core.clone(),
-            memory_controller.clone(),
-            None,
-            engine.expected_region_size(),
-        );
-
-        let filter = worker.core.gc_range(&range, 20, 200);
-        assert_eq!(1, filter.filtered);
-        assert_eq!(1, element_count(&default));
-        assert_eq!(1, element_count(&write));
-    }
-
-    #[test]
-    fn test_snapshot_block_gc() {
-        let engine = RangeCacheMemoryEngine::new(RangeCacheEngineContext::new_for_tests(Arc::new(
-            VersionTrack::new(RangeCacheEngineConfig::config_for_test()),
-        )));
-        let memory_controller = engine.memory_controller();
-        let range = CacheRange::new(b"".to_vec(), b"z".to_vec());
-        engine.new_range(range.clone());
-        let (write, default) = {
-            let skiplist_engine = engine.core().write().engine();
-            (
-                skiplist_engine.cf_handle(CF_WRITE),
-                skiplist_engine.cf_handle(CF_DEFAULT),
-            )
-        };
-
-        put_data(
-            b"key1",
-            b"value1",
-            10,
-            11,
-            10,
-            false,
-            &default,
-            &write,
-            memory_controller.clone(),
-        );
-        put_data(
-            b"key2",
-            b"value21",
-            10,
-            11,
-            12,
-            false,
-            &default,
-            &write,
-            memory_controller.clone(),
-        );
-        put_data(
-            b"key2",
-            b"value22",
-            15,
-            16,
-            14,
-            false,
-            &default,
-            &write,
-            memory_controller.clone(),
-        );
-        put_data(
-            b"key2",
-            b"value23",
-            20,
-            21,
-            16,
-            false,
-            &default,
-            &write,
-            memory_controller.clone(),
-        );
-        put_data(
-            b"key3",
-            b"value31",
-            5,
-            6,
-            18,
-            false,
-            &default,
-            &write,
-            memory_controller.clone(),
-        );
-        put_data(
-            b"key3",
-            b"value32",
-            10,
-            11,
-            20,
-            false,
-            &default,
-            &write,
-            memory_controller.clone(),
-        );
-        assert_eq!(6, element_count(&default));
-        assert_eq!(6, element_count(&write));
-
-        let (worker, _) = BackgroundRunner::new(
-            engine.core.clone(),
-            memory_controller,
-            None,
-            engine.expected_region_size(),
-        );
-        let s1 = engine.snapshot(range.clone(), 10, u64::MAX);
-        let s2 = engine.snapshot(range.clone(), 11, u64::MAX);
-        let s3 = engine.snapshot(range.clone(), 20, u64::MAX);
-
-        // nothing will be removed due to the oldest ongoing snapshot (s1)
-        let filter = worker.core.gc_range(&range, 30, 100);
-        assert_eq!(0, filter.filtered);
-        assert_eq!(6, element_count(&default));
-        assert_eq!(6, element_count(&write));
-
-        drop(s1);
-        let filter = worker.core.gc_range(&range, 30, 100);
-        assert_eq!(1, filter.filtered);
-        assert_eq!(5, element_count(&default));
-        assert_eq!(5, element_count(&write));
-
-        drop(s2);
-        let filter = worker.core.gc_range(&range, 30, 100);
-        assert_eq!(1, filter.filtered);
-        assert_eq!(4, element_count(&default));
-        assert_eq!(4, element_count(&write));
-
-        drop(s3);
-        let filter = worker.core.gc_range(&range, 30, 100);
-        assert_eq!(1, filter.filtered);
-        assert_eq!(3, element_count(&default));
-        assert_eq!(3, element_count(&write));
-    }
-
-    #[test]
-    fn test_background_worker_load() {
-        let mut engine = RangeCacheMemoryEngine::new(RangeCacheEngineContext::new_for_tests(
-            Arc::new(VersionTrack::new(RangeCacheEngineConfig::config_for_test())),
-        ));
-        let path = Builder::new().prefix("test_load").tempdir().unwrap();
-        let path_str = path.path().to_str().unwrap();
-        let rocks_engine = new_engine(path_str, DATA_CFS).unwrap();
-        engine.set_disk_engine(rocks_engine.clone());
-
-        for i in 10..20 {
-            let key = construct_key(i, 1);
-            let key = data_key(&key);
-            let value = construct_value(i, i);
-            rocks_engine
-                .put_cf(CF_DEFAULT, &key, value.as_bytes())
-                .unwrap();
-            rocks_engine
-                .put_cf(CF_WRITE, &key, value.as_bytes())
-                .unwrap();
-        }
-
-        let k = format!("zk{:08}", 15).into_bytes();
-        let r1 = CacheRange::new(DATA_MIN_KEY.to_vec(), k.clone());
-        let r2 = CacheRange::new(k, DATA_MAX_KEY.to_vec());
-        {
-            let mut core = engine.core.write();
-            core.mut_range_manager().pending_ranges.push(r1.clone());
-            core.mut_range_manager().pending_ranges.push(r2.clone());
-        }
-        engine.prepare_for_apply(1, &r1);
-        engine.prepare_for_apply(1, &r2);
-
-        // concurrent write to rocksdb; this key will not be loaded into the memory
-        // engine
-        let key = construct_key(20, 1);
-        let key20 = data_key(&key);
-        let value = construct_value(20, 20);
-        rocks_engine
-            .put_cf(CF_DEFAULT, &key20, value.as_bytes())
-            .unwrap();
-        rocks_engine
-            .put_cf(CF_WRITE, &key20, value.as_bytes())
-            .unwrap();
-
-        let (write, default) = {
-            let core = engine.core().write();
-            let skiplist_engine = core.engine();
-            (
-                skiplist_engine.cf_handle(CF_WRITE),
-                skiplist_engine.cf_handle(CF_DEFAULT),
-            )
-        };
-
-        // wait for the background load
-        std::thread::sleep(Duration::from_secs(1));
-
-        let _ = engine.snapshot(r1, u64::MAX, u64::MAX).unwrap();
-        let _ = engine.snapshot(r2, u64::MAX, u64::MAX).unwrap();
-
-        let guard = &epoch::pin();
-        for i in 10..20 {
-            let key = construct_key(i, 1);
-            let key = data_key(&key);
-            let value = construct_value(i, i);
-            let key = encode_seek_key(&key, u64::MAX);
-            assert_eq!(
-                get_value(&write, &key, guard).unwrap().as_slice(),
-                value.as_bytes()
-            );
-            assert_eq!(
-                get_value(&default, &key, guard).unwrap().as_slice(),
-                value.as_bytes()
-            );
-        }
-
-        let key20 = encode_seek_key(&key20, u64::MAX);
-        assert!(!key_exist(&write, &key20, guard));
-        assert!(!key_exist(&default, &key20, guard));
-    }
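The fixed `sleep(1s)` above is the simplest way to wait for the background loader, at the cost of flakiness on slow machines. When a test can observe the condition it is waiting for, a bounded polling helper like this hypothetical one is the usual alternative (the later capacity tests do exactly this with a counted loop over the pending-range queues):

```rust
use std::time::{Duration, Instant};

// Poll `done` until it returns true or `timeout` elapses.
fn wait_until(timeout: Duration, mut done: impl FnMut() -> bool) -> bool {
    let deadline = Instant::now() + timeout;
    while Instant::now() < deadline {
        if done() {
            return true;
        }
        std::thread::sleep(Duration::from_millis(50));
    }
    false
}

fn main() {
    // Trivial demonstration; in the test, `done` would check that
    // pending_ranges and pending_ranges_loading_data are both empty.
    assert!(wait_until(Duration::from_secs(1), || true));
}
```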
-
-    #[test]
-    fn test_ranges_for_gc() {
-        let engine = RangeCacheMemoryEngine::new(RangeCacheEngineContext::new_for_tests(Arc::new(
-            VersionTrack::new(RangeCacheEngineConfig::config_for_test()),
-        )));
-        let memory_controller = engine.memory_controller();
-        let r1 = CacheRange::new(b"a".to_vec(), b"b".to_vec());
-        let r2 = CacheRange::new(b"b".to_vec(), b"c".to_vec());
-        engine.new_range(r1);
-        engine.new_range(r2);
-
-        let (mut runner, _) = BackgroundRunner::new(
-            engine.core.clone(),
-            memory_controller,
-            None,
-            engine.expected_region_size(),
-        );
-        let ranges = runner.core.ranges_for_gc().unwrap();
-        assert_eq!(2, ranges.len());
-
-        // until the previous gc is finished, no ranges will be returned
-        assert!(runner.core.ranges_for_gc().is_none());
-        runner.core.on_gc_finished(ranges);
-
-        let ranges = runner.core.ranges_for_gc().unwrap();
-        assert_eq!(2, ranges.len());
-    }
-
-    // Test creating and loading a cache hint using a region label rule:
-    // 1. Insert some data into the rocks engine, which is set as the disk engine
-    //    for the memory engine.
-    // 2. Use the test pd client server to create a label rule for a portion of
-    //    the data.
-    // 3. Wait until the data is loaded.
-    // 4. Verify that only the labeled key range has been loaded.
-    #[test]
-    fn test_load_from_pd_hint_service() {
-        let mut engine = RangeCacheMemoryEngine::new(RangeCacheEngineContext::new_for_tests(
-            Arc::new(VersionTrack::new(RangeCacheEngineConfig::config_for_test())),
-        ));
-        let path = Builder::new()
-            .prefix("test_load_from_pd_hint_service")
-            .tempdir()
-            .unwrap();
-        let path_str = path.path().to_str().unwrap();
-        let rocks_engine = new_engine(path_str, DATA_CFS).unwrap();
-        engine.set_disk_engine(rocks_engine.clone());
-
-        for i in 10..20 {
-            let key = construct_key(i, 1);
-            let key = data_key(&key);
-            let value = construct_value(i, i);
-            rocks_engine
-                .put_cf(CF_DEFAULT, &key, value.as_bytes())
-                .unwrap();
-            rocks_engine
-                .put_cf(CF_WRITE, &key, value.as_bytes())
-                .unwrap();
-        }
-
-        let (mut pd_server, pd_client) = new_test_server_and_client(ReadableDuration::millis(100));
-        let cluster_id = pd_client.get_cluster_id().unwrap();
-        let pd_client = Arc::new(pd_client);
-        engine.start_hint_service(PdRangeHintService::from(pd_client.clone()));
-        let meta_client = region_label_meta_client(pd_client.clone());
-        let label_rule = new_region_label_rule(
-            "cache/0",
-            &hex::encode(format!("k{:08}", 10).into_bytes()),
-            &hex::encode(format!("k{:08}", 15).into_bytes()),
-        );
-        add_region_label_rule(meta_client, cluster_id, &label_rule);
-
-        // Wait for the watch to fire.
-        std::thread::sleep(Duration::from_millis(200));
-        let r1 = CacheRange::try_from(&label_rule.data[0]).unwrap();
-        engine.prepare_for_apply(1, &r1);
-
-        // Wait for the range to be loaded.
-        std::thread::sleep(Duration::from_secs(1));
-        let _ = engine.snapshot(r1, u64::MAX, u64::MAX).unwrap();
-
-        let (write, default) = {
-            let core = engine.core().write();
-            let skiplist_engine = core.engine();
-            (
-                skiplist_engine.cf_handle(CF_WRITE),
-                skiplist_engine.cf_handle(CF_DEFAULT),
-            )
-        };
-
-        let guard = &epoch::pin();
-        for i in 10..15 {
-            let key = construct_key(i, 1);
-            let key = data_key(&key);
-            let value = construct_value(i, i);
-            let key = encode_seek_key(&key, u64::MAX);
-            assert_eq!(
-                get_value(&write, &key, guard).unwrap().as_slice(),
-                value.as_bytes()
-            );
-            assert_eq!(
-                get_value(&default, &key, guard).unwrap().as_slice(),
-                value.as_bytes()
-            );
-        }
-        for i in 15..=20 {
-            let key = construct_key(i, 1);
-            let key = data_key(&key);
-            let key = encode_seek_key(&key, u64::MAX);
-            assert!(!key_exist(&write, &key, guard));
-            assert!(!key_exist(&default, &key, guard));
-        }
-
-        pd_server.stop();
-    }
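The two capacity tests that follow lean on a back-of-the-envelope accounting of skiplist entries. A worked restatement of the arithmetic quoted in their comments (the constants come from those comments, not derived independently here):

```rust
fn main() {
    let key = 17; // encoded key bytes
    let val = 3; // b"val"
    let seqno = 8;
    let mem_controller = 16; // bookkeeping attached to key and value
    let node_overhead = 96; // skiplist node
    let per_put = key + val + seqno + mem_controller + node_overhead;
    assert_eq!(per_put, 140);
    // Two keys across three CFs per range -> six entries per loaded range.
    assert_eq!(per_put * 6, 840);
    // Two loaded ranges would exceed the 1500-byte hard limit.
    assert!(840 * 2 > 1500);
}
```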
-
-    #[test]
-    fn test_snapshot_load_reaching_limit() {
-        let mut config = RangeCacheEngineConfig::config_for_test();
-        config.soft_limit_threshold = Some(ReadableSize(1000));
-        config.hard_limit_threshold = Some(ReadableSize(1500));
-        let config = Arc::new(VersionTrack::new(config));
-        let mut engine =
-            RangeCacheMemoryEngine::new(RangeCacheEngineContext::new_for_tests(config));
-        let path = Builder::new()
-            .prefix("test_snapshot_load_reaching_limit")
-            .tempdir()
-            .unwrap();
-        let path_str = path.path().to_str().unwrap();
-        let rocks_engine = new_engine(path_str, DATA_CFS).unwrap();
-        engine.set_disk_engine(rocks_engine.clone());
-        let mem_controller = engine.memory_controller();
-
-        let range1 = CacheRange::new(construct_user_key(1), construct_user_key(3));
-        // Memory for one put is 17(key) + 3(val) + 8(Seqno) + 16(Memory controller in
-        // key and val) + 96(Node overhead) = 140
-        let key = construct_key(1, 10);
-        rocks_engine.put_cf(CF_DEFAULT, &key, b"val").unwrap();
-        rocks_engine.put_cf(CF_LOCK, &key, b"val").unwrap();
-        rocks_engine.put_cf(CF_WRITE, &key, b"val").unwrap();
-
-        let key = construct_key(2, 10);
-        rocks_engine.put_cf(CF_DEFAULT, &key, b"val").unwrap();
-        rocks_engine.put_cf(CF_LOCK, &key, b"val").unwrap();
-        rocks_engine.put_cf(CF_WRITE, &key, b"val").unwrap();
-        // After loading range1, the memory usage should be 140*6=840
-
-        let range2 = CacheRange::new(construct_user_key(3), construct_user_key(5));
-        let key = construct_key(3, 10);
-        rocks_engine.put_cf(CF_DEFAULT, &key, b"val").unwrap();
-        rocks_engine.put_cf(CF_LOCK, &key, b"val").unwrap();
-        rocks_engine.put_cf(CF_WRITE, &key, b"val").unwrap();
-
-        let key = construct_key(4, 10);
-        rocks_engine.put_cf(CF_DEFAULT, &key, b"val").unwrap();
-        rocks_engine.put_cf(CF_LOCK, &key, b"val").unwrap();
-        rocks_engine.put_cf(CF_WRITE, &key, b"val").unwrap();
-        // 840*2 > hard limit 1500, so the load will fail and the loaded keys
-        // should be removed
-
-        let range3 = CacheRange::new(construct_user_key(5), construct_user_key(6));
-        let key = construct_key(5, 10);
-        rocks_engine.put_cf(CF_DEFAULT, &key, b"val").unwrap();
-        rocks_engine.put_cf(CF_LOCK, &key, b"val").unwrap();
-        rocks_engine.put_cf(CF_WRITE, &key, b"val").unwrap();
-        let key = construct_key(6, 10);
-        rocks_engine.put_cf(CF_DEFAULT, &key, b"val").unwrap();
-        rocks_engine.put_cf(CF_LOCK, &key, b"val").unwrap();
-        rocks_engine.put_cf(CF_WRITE, &key, b"val").unwrap();
-
-        for r in [&range1, &range2, &range3] {
-            engine.load_range(r.clone()).unwrap();
-            engine.prepare_for_apply(1, r);
-        }
-
-        // ensure all range loads are finished
-        {
-            let mut count = 0;
-            while count < 20 {
-                {
-                    let core = engine.core.read();
-                    let range_manager = core.range_manager();
-                    if range_manager.pending_ranges.is_empty()
-                        && range_manager.pending_ranges_loading_data.is_empty()
-                    {
-                        break;
-                    }
-                }
-                std::thread::sleep(Duration::from_millis(100));
-                count += 1;
-            }
-        }
-
-        let verify = |range: CacheRange, exist, expect_count| {
-            if exist {
-                let snap = engine.snapshot(range.clone(), 10, u64::MAX).unwrap();
-                let mut count = 0;
-                for cf in DATA_CFS {
-                    let mut iter = IterOptions::default();
-                    iter.set_lower_bound(&range.start, 0);
-                    iter.set_upper_bound(&range.end, 0);
-                    let mut iter = snap.iterator_opt(cf, iter).unwrap();
-                    let _ = iter.seek_to_first();
-                    while iter.valid().unwrap() {
-                        let _ = iter.next();
-                        count += 1;
-                    }
-                }
-                assert_eq!(count, expect_count);
-            } else {
-                engine.snapshot(range, 10, 10).unwrap_err();
-            }
-        };
-        verify(range1, true, 6);
-        verify(range2, false, 0);
-        verify(range3, false, 3);
-        assert_eq!(mem_controller.mem_usage(), 1540);
-    }
-
-    #[test]
-    fn test_soft_hard_limit_change() {
-        let mut config = RangeCacheEngineConfig::config_for_test();
-        config.soft_limit_threshold = Some(ReadableSize(1000));
-        config.hard_limit_threshold = Some(ReadableSize(1500));
-        let config = Arc::new(VersionTrack::new(config));
-        let mut engine =
-            RangeCacheMemoryEngine::new(RangeCacheEngineContext::new_for_tests(config.clone()));
-        let path = Builder::new()
-            .prefix("test_snapshot_load_reaching_limit")
-            .tempdir()
-            .unwrap();
-        let path_str = path.path().to_str().unwrap();
-        let rocks_engine = new_engine(path_str, DATA_CFS).unwrap();
-        engine.set_disk_engine(rocks_engine.clone());
-        let mem_controller = engine.memory_controller();
-
-        let range1 = CacheRange::new(construct_user_key(1), construct_user_key(3));
-        // Memory for one put is 17(key) + 3(val) + 8(Seqno) + 16(Memory controller in
-        // key and val) + 96(Node overhead) = 140
-        let key = construct_key(1, 10);
-        rocks_engine.put_cf(CF_DEFAULT, &key, b"val").unwrap();
-        rocks_engine.put_cf(CF_LOCK, &key, b"val").unwrap();
-        rocks_engine.put_cf(CF_WRITE, &key, b"val").unwrap();
-
-        let key = construct_key(2, 10);
-        rocks_engine.put_cf(CF_DEFAULT, &key, b"val").unwrap();
-        rocks_engine.put_cf(CF_LOCK, &key, b"val").unwrap();
-        rocks_engine.put_cf(CF_WRITE, &key, b"val").unwrap();
-        // After loading range1, the memory usage should be 140*6=840
-        engine.load_range(range1.clone()).unwrap();
-        engine.prepare_for_apply(1, &range1);
-
-        let range2 = CacheRange::new(construct_user_key(3), construct_user_key(5));
-        let key = construct_key(3, 10);
-        rocks_engine.put_cf(CF_DEFAULT, &key, b"val").unwrap();
-        rocks_engine.put_cf(CF_LOCK, &key, b"val").unwrap();
-        rocks_engine.put_cf(CF_WRITE, &key, b"val").unwrap();
-
-        let key = construct_key(4, 10);
-        rocks_engine.put_cf(CF_DEFAULT, &key, b"val").unwrap();
-        rocks_engine.put_cf(CF_LOCK, &key, b"val").unwrap();
-        rocks_engine.put_cf(CF_WRITE, &key, b"val").unwrap();
-        // 840*2 > hard limit 1500, so the load would fail and the loaded keys
-        // would be removed. However, we now raise the memory quota to 2000, so
-        // range2 can be cached.
-        let mut config_manager = RangeCacheConfigManager(config.clone());
-        let mut config_change = ConfigChange::new();
-        config_change.insert(
-            String::from("hard_limit_threshold"),
-            ConfigValue::Size(2000),
-        );
-        config_manager.dispatch(config_change).unwrap();
-        assert_eq!(config.value().hard_limit_threshold(), 2000);
-
-        engine.load_range(range2.clone()).unwrap();
-        engine.prepare_for_apply(1, &range2);
-
-        // ensure all range loads are finished
-        {
-            let mut count = 0;
-            while count < 20 {
-                {
-                    let core = engine.core.read();
-                    let range_manager = core.range_manager();
-                    if range_manager.pending_ranges.is_empty()
-                        && range_manager.pending_ranges_loading_data.is_empty()
-                    {
-                        break;
-                    }
-                }
-                std::thread::sleep(Duration::from_millis(100));
-                count += 1;
-            }
-        }
-
-        let verify = |range: CacheRange, exist, expect_count| {
-            if exist {
-                let snap = engine.snapshot(range.clone(), 10, u64::MAX).unwrap();
-                let mut count = 0;
-                for cf in DATA_CFS {
-                    let mut iter = IterOptions::default();
-                    iter.set_lower_bound(&range.start, 0);
-                    iter.set_upper_bound(&range.end, 0);
-                    let mut iter = snap.iterator_opt(cf, iter).unwrap();
-                    let _ = iter.seek_to_first();
-                    while iter.valid().unwrap() {
-                        let _ = iter.next();
-                        count += 1;
-                    }
-                }
-                assert_eq!(count, expect_count);
-            } else {
-                engine.snapshot(range, 10, 10).unwrap_err();
-            }
-        };
-        verify(range1, true, 6);
-        verify(range2, true, 6);
-        assert_eq!(mem_controller.mem_usage(), 1680);
-    }
-
-    #[test]
-    fn test_gc_use_pd_tso() {
-        struct MockPdClient {
-            tx: Mutex<Sender<()>>,
-        }
-        impl PdClient for MockPdClient {
-            fn get_tso(&self) -> pd_client::PdFuture<TimeStamp> {
-                self.tx.lock().unwrap().send(()).unwrap();
-                Box::pin(ready(Ok(TimeStamp::compose(TimeStamp::physical_now(), 0))))
-            }
-        }
-
-        let start_time = TimeStamp::compose(TimeStamp::physical_now(), 0);
-        let (tx, pd_client_rx) = channel();
-        let pd_client = Arc::new(MockPdClient { tx: Mutex::new(tx) });
-        let gc_interval = Duration::from_millis(100);
-        let load_evict_interval = Duration::from_millis(200);
-        let (scheduler, mut rx) = dummy_scheduler();
-        let (handle, stop) =
-            BgWorkManager::start_tick(scheduler, pd_client, gc_interval, load_evict_interval);
-
-        let Some(BackgroundTask::Gc(GcTask { safe_point })) =
-            rx.recv_timeout(10 * gc_interval).unwrap()
-        else {
-            panic!("must be a GcTask");
-        };
-        let safe_point = TimeStamp::from(safe_point);
-        // Make sure it is a reasonable timestamp.
-        assert!(safe_point > start_time, "{safe_point}, {start_time}");
-        let now = TimeStamp::compose(TimeStamp::physical_now(), 0);
-        assert!(safe_point < now, "{safe_point}, {now}");
-        // Must get ts from PD.
-        pd_client_rx.try_recv().unwrap();
-
-        stop.send(true).unwrap();
-        handle.join().unwrap();
-    }
-}
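The config manager deleted below follows a common TiKV shape: share the configuration as `Arc<VersionTrack<T>>` and apply a `ConfigChange` through an update closure, so readers observe changes via `config.value()`. A standalone model of that flow (with `VersionTrack` simplified to an `RwLock`; illustrative, not the crate's actual code):

```rust
use std::sync::{Arc, RwLock};

#[derive(Default, Debug)]
struct Config {
    hard_limit_threshold: u64,
}

// Stand-in for RangeCacheConfigManager: it shares ownership of the config
// and mutates it in place when a change is dispatched.
struct ConfigManager(Arc<RwLock<Config>>);

impl ConfigManager {
    fn dispatch(&self, new_hard_limit: u64) {
        self.0.write().unwrap().hard_limit_threshold = new_hard_limit;
    }
}

fn main() {
    let shared = Arc::new(RwLock::new(Config {
        hard_limit_threshold: 1500,
    }));
    let manager = ConfigManager(shared.clone());
    manager.dispatch(2000); // mirrors ConfigValue::Size(2000) in the test
    assert_eq!(shared.read().unwrap().hard_limit_threshold, 2000);
}
```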
diff --git a/components/range_cache_memory_engine/src/config.rs b/components/range_cache_memory_engine/src/config.rs
deleted file mode 100644
index f7c0e52ab56..00000000000
--- a/components/range_cache_memory_engine/src/config.rs
+++ /dev/null
@@ -1,41 +0,0 @@
-use std::sync::Arc;
-
-use online_config::{ConfigChange, ConfigManager, OnlineConfig};
-use tikv_util::{config::VersionTrack, info};
-
-use crate::RangeCacheEngineConfig;
-
-#[derive(Clone)]
-pub struct RangeCacheConfigManager(pub Arc<VersionTrack<RangeCacheEngineConfig>>);
-
-impl RangeCacheConfigManager {
-    pub fn new(config: Arc<VersionTrack<RangeCacheEngineConfig>>) -> Self {
-        Self(config)
-    }
-}
-
-impl ConfigManager for RangeCacheConfigManager {
-    fn dispatch(
-        &mut self,
-        change: ConfigChange,
-    ) -> std::result::Result<(), Box<dyn std::error::Error>> {
-        {
-            let change = change.clone();
-            self.0
-                .update(move |cfg: &mut RangeCacheEngineConfig| cfg.update(change))?;
-        }
-        info!(
-            "range cache config changed";
-            "change" => ?change,
-        );
-        Ok(())
-    }
-}
-
-impl std::ops::Deref for RangeCacheConfigManager {
-    type Target = Arc<VersionTrack<RangeCacheEngineConfig>>;
-
-    fn deref(&self) -> &Self::Target {
-        &self.0
-    }
-}
diff --git a/components/range_cache_memory_engine/src/engine.rs b/components/range_cache_memory_engine/src/engine.rs
deleted file mode 100644
index 58db7117977..00000000000
--- a/components/range_cache_memory_engine/src/engine.rs
+++ /dev/null
@@ -1,765 +0,0 @@
-// Copyright 2023 TiKV Project Authors. Licensed under Apache-2.0.
-
-use std::{
-    collections::BTreeMap,
-    fmt::{self, Debug},
-    ops::Bound,
-    result,
-    sync::{
-        atomic::{AtomicU64, Ordering},
-        Arc,
-    },
-};
-
-use crossbeam::epoch::{self, default_collector, Guard};
-use crossbeam_skiplist::{
-    base::{Entry, OwnedIter},
-    SkipList,
-};
-use engine_rocks::RocksEngine;
-use engine_traits::{
-    CacheRange, FailedReason, IterOptions, Iterable, KvEngine, RangeCacheEngine, Result,
-    CF_DEFAULT, CF_LOCK, CF_WRITE, DATA_CFS,
-};
-use parking_lot::{lock_api::RwLockUpgradableReadGuard, RwLock, RwLockWriteGuard};
-use raftstore::coprocessor::RegionInfoProvider;
-use slog_global::error;
-use tikv_util::{config::VersionTrack, info};
-
-use crate::{
-    background::{BackgroundTask, BgWorkManager, PdRangeHintService},
-    keys::{
-        encode_key_for_boundary_with_mvcc, encode_key_for_boundary_without_mvcc, InternalBytes,
-    },
-    memory_controller::MemoryController,
-    range_manager::{LoadFailedReason, RangeCacheStatus, RangeManager},
-    read::{RangeCacheIterator, RangeCacheSnapshot},
-    statistics::Statistics,
-    write_batch::{group_write_batch_entries, RangeCacheWriteBatchEntry},
-    RangeCacheEngineConfig, RangeCacheEngineContext,
-};
-
-pub(crate) const CF_DEFAULT_USIZE: usize = 0;
-pub(crate) const CF_LOCK_USIZE: usize = 1;
-pub(crate) const CF_WRITE_USIZE: usize = 2;
-
-pub(crate) fn cf_to_id(cf: &str) -> usize {
-    match cf {
-        CF_DEFAULT => CF_DEFAULT_USIZE,
-        CF_LOCK => CF_LOCK_USIZE,
-        CF_WRITE => CF_WRITE_USIZE,
-        _ => panic!("unrecognized cf {}", cf),
-    }
-}
-
-pub(crate) fn id_to_cf(id: usize) -> &'static str {
-    match id {
-        CF_DEFAULT_USIZE => CF_DEFAULT,
-        CF_LOCK_USIZE => CF_LOCK,
-        CF_WRITE_USIZE => CF_WRITE,
-        _ => panic!("unrecognized id {}", id),
-    }
-}
-
-#[inline]
-pub(crate) fn is_lock_cf(cf: usize) -> bool {
-    cf == CF_LOCK_USIZE
-}
-
-// A wrapper around the skiplist that provides some checks and cleanup
-#[derive(Clone)]
-pub struct SkiplistHandle(Arc<SkipList<InternalBytes, InternalBytes>>);
-
-impl SkiplistHandle {
-    pub fn get<'a: 'g, 'g>(
-        &'a self,
-        key: &InternalBytes,
-        guard: &'g Guard,
-    ) -> Option<Entry<'a, InternalBytes, InternalBytes>> {
-        self.0.get(key, guard)
-    }
-
-    pub fn get_with_user_key<'a: 'g, 'g>(
-        &'a self,
-        key: &InternalBytes,
-        guard: &'g Guard,
-    ) -> Option<Entry<'a, InternalBytes, InternalBytes>> {
-        let n = self.0.lower_bound(Bound::Included(key), guard)?;
-        if n.key().same_user_key_with(key) {
-            Some(n)
-        } else {
-            None
-        }
-    }
-
-    pub fn insert(&self, key: InternalBytes, value: InternalBytes, guard: &Guard) {
-        assert!(key.memory_controller_set() && value.memory_controller_set());
-        self.0.insert(key, value, guard).release(guard);
-    }
-
-    pub fn remove(&self, key: &InternalBytes, guard: &Guard) {
-        if let Some(entry) = self.0.remove(key, guard) {
-            entry.release(guard);
-        }
-    }
-
-    pub fn iterator(
-        &self,
-    ) -> OwnedIter<Arc<SkipList<InternalBytes, InternalBytes>>, InternalBytes, InternalBytes> {
-        self.0.owned_iter()
-    }
-
-    pub fn len(&self) -> usize {
-        self.0.len()
-    }
-
-    pub fn is_empty(&self) -> bool {
-        self.len() == 0
-    }
-}
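`SkiplistHandle` wraps crossbeam's low-level, epoch-managed `SkipList`, which is why every call threads a `Guard` through. The publicly documented sibling, `crossbeam_skiplist::SkipMap`, offers the same ordered-map behaviour without explicit guards; this standalone sketch shows the seek-to-lower-bound access pattern that `get_with_user_key` builds on (assuming only the public `crossbeam-skiplist` crate):

```rust
use std::ops::Bound;

use crossbeam_skiplist::SkipMap;

fn main() {
    let map: SkipMap<Vec<u8>, Vec<u8>> = SkipMap::new();
    map.insert(b"key1".to_vec(), b"v1".to_vec());
    map.insert(b"key2".to_vec(), b"v2".to_vec());

    // Equivalent of lower_bound(Bound::Included(..)): the first entry whose
    // key is >= "key2"; the real code then checks the user-key prefix.
    let entry = map.lower_bound(Bound::Included(&b"key2".to_vec())).unwrap();
    assert_eq!(entry.key(), b"key2");
}
```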
-
-/// A single global set of skiplists shared by all cached ranges
-#[derive(Clone)]
-pub struct SkiplistEngine {
-    pub(crate) data: [Arc<SkipList<InternalBytes, InternalBytes>>; 3],
-}
-
-impl Default for SkiplistEngine {
-    fn default() -> Self {
-        Self::new()
-    }
-}
-
-impl SkiplistEngine {
-    pub fn new() -> Self {
-        let collector = default_collector().clone();
-        SkiplistEngine {
-            data: [
-                Arc::new(SkipList::new(collector.clone())),
-                Arc::new(SkipList::new(collector.clone())),
-                Arc::new(SkipList::new(collector)),
-            ],
-        }
-    }
-
-    pub fn cf_handle(&self, cf: &str) -> SkiplistHandle {
-        SkiplistHandle(self.data[cf_to_id(cf)].clone())
-    }
-
-    pub fn node_count(&self) -> usize {
-        let mut count = 0;
-        self.data.iter().for_each(|s| count += s.len());
-        count
-    }
-
-    pub(crate) fn delete_range_cf(&self, cf: &str, range: &CacheRange) {
-        let (start, end) = if cf == CF_LOCK {
-            encode_key_for_boundary_without_mvcc(range)
-        } else {
-            encode_key_for_boundary_with_mvcc(range)
-        };
-
-        let handle = self.cf_handle(cf);
-        let mut iter = handle.iterator();
-        let guard = &epoch::pin();
-        iter.seek(&start, guard);
-        while iter.valid() && iter.key() < &end {
-            handle.remove(iter.key(), guard);
-            iter.next(guard);
-        }
-        // the guard will buffer 8 drop methods; flush here to clear the buffer.
-        guard.flush();
-    }
-
-    pub(crate) fn delete_range(&self, range: &CacheRange) {
-        DATA_CFS.iter().for_each(|&cf| {
-            self.delete_range_cf(cf, range);
-        });
-    }
-}
-
-impl Debug for SkiplistEngine {
-    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
-        write!(f, "Range Memory Engine")
-    }
-}
-
-pub struct RangeCacheMemoryEngineCore {
-    pub(crate) engine: SkiplistEngine,
-    pub(crate) range_manager: RangeManager,
-    pub(crate) cached_write_batch: BTreeMap<CacheRange, Vec<(u64, RangeCacheWriteBatchEntry)>>,
-}
-
-impl Default for RangeCacheMemoryEngineCore {
-    fn default() -> Self {
-        Self::new()
-    }
-}
-
-impl RangeCacheMemoryEngineCore {
-    pub fn new() -> RangeCacheMemoryEngineCore {
-        RangeCacheMemoryEngineCore {
-            engine: SkiplistEngine::new(),
-            range_manager: RangeManager::default(),
-            cached_write_batch: BTreeMap::default(),
-        }
-    }
-
-    pub fn engine(&self) -> SkiplistEngine {
-        self.engine.clone()
-    }
-
-    pub fn range_manager(&self) -> &RangeManager {
-        &self.range_manager
-    }
-
-    pub fn mut_range_manager(&mut self) -> &mut RangeManager {
-        &mut self.range_manager
-    }
-
-    // `cache_range` must not already exist in `cached_write_batch`
-    pub fn init_cached_write_batch(&mut self, cache_range: &CacheRange) {
-        assert!(
-            self.cached_write_batch
-                .insert(cache_range.clone(), vec![])
-                .is_none()
-        );
-    }
-
-    pub fn has_cached_write_batch(&self, cache_range: &CacheRange) -> bool {
-        self.cached_write_batch
-            .get(cache_range)
-            .map_or(false, |entries| !entries.is_empty())
-    }
-
-    pub(crate) fn take_cached_write_batch_entries(
-        &mut self,
-        cache_range: &CacheRange,
-    ) -> Vec<(u64, RangeCacheWriteBatchEntry)> {
-        std::mem::take(self.cached_write_batch.get_mut(cache_range).unwrap())
-    }
-
-    pub fn remove_cached_write_batch(&mut self, cache_range: &CacheRange) {
-        self.cached_write_batch
-            .remove(cache_range)
-            .unwrap_or_else(|| {
-                panic!(
-                    "range cannot be found in cached_write_batch: {:?}",
-                    cache_range
-                );
-            });
-    }
-
-    // ensure that the transfer from `pending_ranges_loading_data` to
-    // `ranges` is atomic and happens with an empty cached_write_batch
-    pub(crate) fn pending_range_completes_loading(
-        core: &mut RwLockWriteGuard<'_, Self>,
-        range: &CacheRange,
-    ) {
-        assert!(!core.has_cached_write_batch(range));
-        let range_manager = core.mut_range_manager();
-        let (r, _, canceled) = range_manager
-            .pending_ranges_loading_data
-            .pop_front()
-            .unwrap();
-        assert_eq!(&r, range);
-        assert!(!canceled);
-        range_manager.new_range(r);
-    }
-}
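The four `cached_write_batch` methods above form a small lifecycle: initialize an empty buffer when a range starts loading, append entries while the load runs, drain them on completion, and remove the buffer when the range becomes readable. A standalone model with simplified stand-in types (illustrative only):

```rust
use std::collections::BTreeMap;

type Range = (Vec<u8>, Vec<u8>);
type WbEntry = (u64, Vec<u8>); // (sequence number, payload)

fn main() {
    let mut cached: BTreeMap<Range, Vec<WbEntry>> = BTreeMap::new();
    let r: Range = (b"a".to_vec(), b"z".to_vec());

    // init_cached_write_batch: the range must not be present yet.
    assert!(cached.insert(r.clone(), vec![]).is_none());

    // Writes that arrive while the snapshot is loading are buffered.
    cached.get_mut(&r).unwrap().push((100, b"put key1".to_vec()));

    // take_cached_write_batch_entries: drain on load completion...
    let drained = std::mem::take(cached.get_mut(&r).unwrap());
    assert_eq!(drained.len(), 1);

    // ...and remove_cached_write_batch once the range becomes readable.
    cached.remove(&r).unwrap();
}
```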
-
-/// The RangeCacheMemoryEngine serves as a range cache, storing hot ranges in
-/// the leaders' store. Incoming writes that are written to the disk engine
-/// (currently RocksDB) are also written to the RangeCacheMemoryEngine, leading
-/// to a mirrored data set in the cached ranges with the disk engine.
-///
-/// A load/evict unit manages the memory, deciding which ranges should be
-/// evicted when the memory used by the RangeCacheMemoryEngine reaches a
-/// certain limit, and determining which ranges should be loaded when there is
-/// spare memory capacity.
-///
-/// The safe point lifetime differs between RangeCacheMemoryEngine and the disk
-/// engine, often being much shorter in RangeCacheMemoryEngine. This means that
-/// RangeCacheMemoryEngine may filter out some keys that still exist in the
-/// disk engine, thereby improving read performance as fewer duplicated keys
-/// will be read. If there's a need to read keys that may have been filtered by
-/// RangeCacheMemoryEngine (as indicated by read_ts and safe_point of the
-/// cached region), we resort to using the disk engine's snapshot instead.
-#[derive(Clone)]
-pub struct RangeCacheMemoryEngine {
-    bg_work_manager: Arc<BgWorkManager>,
-    pub(crate) core: Arc<RwLock<RangeCacheMemoryEngineCore>>,
-    pub(crate) rocks_engine: Option<RocksEngine>,
-    memory_controller: Arc<MemoryController>,
-    statistics: Arc<Statistics>,
-    config: Arc<VersionTrack<RangeCacheEngineConfig>>,
-
-    // The increment amount of tombstones in the lock cf.
-    // When reaching the threshold, a CleanLockTombstone task will be scheduled to
-    // clean lock cf tombstones.
-    pub(crate) lock_modification_bytes: Arc<AtomicU64>,
-
-    // `write_batch_id_allocator` is used to allocate an id for each write batch
-    write_batch_id_allocator: Arc<AtomicU64>,
-}
-
-impl RangeCacheMemoryEngine {
-    pub fn new(range_cache_engine_context: RangeCacheEngineContext) -> Self {
-        RangeCacheMemoryEngine::with_region_info_provider(range_cache_engine_context, None)
-    }
-
-    pub fn with_region_info_provider(
-        range_cache_engine_context: RangeCacheEngineContext,
-        region_info_provider: Option<Arc<dyn RegionInfoProvider>>,
-    ) -> Self {
-        info!("init range cache memory engine";);
-        let core = Arc::new(RwLock::new(RangeCacheMemoryEngineCore::new()));
-        let skiplist_engine = { core.read().engine().clone() };
-
-        let RangeCacheEngineContext {
-            config,
-            statistics,
-            pd_client,
-        } = range_cache_engine_context;
-        assert!(config.value().enabled);
-        let memory_controller = Arc::new(MemoryController::new(config.clone(), skiplist_engine));
-
-        let bg_work_manager = Arc::new(BgWorkManager::new(
-            core.clone(),
-            pd_client,
-            config.value().gc_interval.0,
-            config.value().load_evict_interval.0,
-            config.value().expected_region_size(),
-            memory_controller.clone(),
-            region_info_provider,
-        ));
-
-        Self {
-            core,
-            rocks_engine: None,
-            bg_work_manager,
-            memory_controller,
-            statistics,
-            config,
-            lock_modification_bytes: Arc::default(),
-            write_batch_id_allocator: Arc::default(),
-        }
-    }
-
-    pub fn expected_region_size(&self) -> usize {
-        self.config.value().expected_region_size()
-    }
-
-    pub fn new_range(&self, range: CacheRange) {
-        let mut core = self.core.write();
-        core.range_manager.new_range(range);
-    }
-
-    /// Load the range into the in-memory engine.
-    // This method only pushes the range into `pending_ranges`; some time later,
-    // in `prepare_for_apply`, the range will be scheduled to load its snapshot
-    // data into the engine.
-    pub fn load_range(&self, range: CacheRange) -> result::Result<(), LoadFailedReason> {
-        let mut core = self.core.write();
-        core.mut_range_manager().load_range(range)
-    }
-
-    /// Evict a range from the in-memory engine. After this call, the range will
-    /// not be readable, but the data of the range may not be deleted
-    /// immediately due to some ongoing snapshots.
-    pub fn evict_range(&self, range: &CacheRange) {
-        let mut core = self.core.write();
-        let ranges_to_delete = core.range_manager.evict_range(range);
-        if !ranges_to_delete.is_empty() {
-            drop(core);
-            // The range can be deleted directly.
-            if let Err(e) = self
-                .bg_worker_manager()
-                .schedule_task(BackgroundTask::DeleteRange(ranges_to_delete))
-            {
-                error!(
-                    "schedule delete range failed";
-                    "err" => ?e,
-                );
-                assert!(tikv_util::thread_group::is_shutdown(!cfg!(test)));
-            }
-        }
-    }
-
-    // It handles the pending range and checks whether to buffer writes for this
-    // range.
-    pub(crate) fn prepare_for_apply(
-        &self,
-        write_batch_id: u64,
-        range: &CacheRange,
-    ) -> RangeCacheStatus {
-        let mut core = self.core.write();
-        let range_manager = core.mut_range_manager();
-        if range_manager.pending_ranges_in_loading_contains(range) {
-            range_manager.record_in_ranges_being_written(write_batch_id, range);
-            return RangeCacheStatus::Loading;
-        }
-        if range_manager.contains_range(range) {
-            range_manager.record_in_ranges_being_written(write_batch_id, range);
-            return RangeCacheStatus::Cached;
-        }
-
-        let mut overlapped = false;
-        // check whether the range is in pending_ranges; if so, we can schedule a
-        // load task for it
-        if let Some((idx, (left_range, right_range))) = range_manager
-            .pending_ranges
-            .iter()
-            .enumerate()
-            .find_map(|(idx, pending_range)| {
-                if pending_range.contains_range(range) {
-                    // The `range` may be a proper subset of `pending_range`; we should
-                    // split it in this case and push the rest back to `pending_ranges`
-                    // so that each range only schedules a load task of its own.
-                    Some((idx, pending_range.split_off(range)))
-                } else if range.overlaps(pending_range) {
-                    // The pending range does not contain the applying range but
-                    // overlaps with it, which means the pending range is outdated;
-                    // we remove it directly.
-                    info!(
-                        "out of date pending ranges";
-                        "applying_range" => ?range,
-                        "pending_range" => ?pending_range,
-                    );
-                    overlapped = true;
-                    Some((idx, (None, None)))
-                } else {
-                    None
-                }
-            })
-        {
-            if overlapped {
-                core.mut_range_manager().pending_ranges.swap_remove(idx);
-                return RangeCacheStatus::NotInCache;
-            }
-
-            let range_manager = core.mut_range_manager();
-            if let Some(left_range) = left_range {
-                range_manager.pending_ranges.push(left_range);
-            }
-
-            if let Some(right_range) = right_range {
-                range_manager.pending_ranges.push(right_range);
-            }
-
-            let range_manager = core.mut_range_manager();
-            range_manager.pending_ranges.swap_remove(idx);
-            let rocks_snap = Arc::new(self.rocks_engine.as_ref().unwrap().snapshot(None));
-            // Here, we use the range in `pending_ranges` rather than the parameter
-            // range, as the region may have been split.
-            range_manager
-                .pending_ranges_loading_data
-                .push_back((range.clone(), rocks_snap, false));
-
-            range_manager.record_in_ranges_being_written(write_batch_id, range);
-
-            info!(
-                "Range to load";
-                "Tag" => &range.tag,
-                "Cached" => range_manager.ranges().len(),
-                "Pending" => range_manager.pending_ranges_loading_data.len(),
-            );
-
-            // init the cached write batch to cache the writes before loading completes
-            core.init_cached_write_batch(range);
-
-            if let Err(e) = self
-                .bg_worker_manager()
-                .schedule_task(BackgroundTask::LoadRange)
-            {
-                error!(
-                    "schedule range load failed";
-                    "err" => ?e,
-                    "tag" => &range.tag,
-                );
-                assert!(tikv_util::thread_group::is_shutdown(!cfg!(test)));
-            }
-            // We have scheduled the range to load data, so the writes of the range
-            // should be buffered
-            return RangeCacheStatus::Loading;
-        }
-
-        RangeCacheStatus::NotInCache
-    }
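A condensed restatement of `prepare_for_apply`'s decision ladder, with the range-manager lookups reduced to booleans (the enum mirrors `RangeCacheStatus`; illustrative only, not the real control flow in all its detail):

```rust
#[derive(Debug, PartialEq)]
enum Status {
    Cached,
    Loading,
    NotInCache,
}

fn classify(loading_contains: bool, cached_contains: bool, pending_contains: bool) -> Status {
    if loading_contains {
        // Already loading: keep buffering writes for it.
        Status::Loading
    } else if cached_contains {
        Status::Cached
    } else if pending_contains {
        // A pending range is promoted: split off the applying part, take a
        // rocksdb snapshot, push it to pending_ranges_loading_data, and
        // start buffering writes.
        Status::Loading
    } else {
        Status::NotInCache
    }
}

fn main() {
    assert_eq!(classify(false, true, false), Status::Cached);
    assert_eq!(classify(false, false, true), Status::Loading);
    assert_eq!(classify(false, false, false), Status::NotInCache);
}
```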
-
-    // The writes passed to `handle_pending_range_in_loading_buffer` belong to
-    // pending ranges that were still loading data at the time of
-    // `prepare_for_apply`. But some of them may have finished the load and
-    // become normal ranges, so their writes should be written to the engine
-    // directly rather than cached. This method decides which writes should be
-    // cached and which writes should be written directly.
-    pub(crate) fn handle_pending_range_in_loading_buffer(
-        &self,
-        seq: &mut u64,
-        pending_range_in_loading_buffer: Vec<RangeCacheWriteBatchEntry>,
-    ) -> (Vec<RangeCacheWriteBatchEntry>, SkiplistEngine) {
-        if !pending_range_in_loading_buffer.is_empty() {
-            let core = self.core.upgradable_read();
-            let (group_entries_to_cache, entries_to_write) =
-                group_write_batch_entries(pending_range_in_loading_buffer, core.range_manager());
-            let engine = core.engine().clone();
-            if !group_entries_to_cache.is_empty() {
-                let mut core = RwLockUpgradableReadGuard::upgrade(core);
-                for (range, write_batches) in group_entries_to_cache {
-                    core.cached_write_batch.entry(range).or_default().extend(
-                        write_batches.into_iter().map(|e| {
-                            // We should assign a sequence number to each cached
-                            // entry, increasing it for each of them.
-                            *seq += 1;
-                            (*seq - 1, e)
-                        }),
-                    );
-                }
-            }
-            (entries_to_write, engine)
-        } else {
-            let core = self.core.read();
-            (vec![], core.engine().clone())
-        }
-    }
-
-    pub fn bg_worker_manager(&self) -> &BgWorkManager {
-        &self.bg_work_manager
-    }
-
-    pub fn memory_controller(&self) -> Arc<MemoryController> {
-        self.memory_controller.clone()
-    }
-
-    pub fn statistics(&self) -> Arc<Statistics> {
-        self.statistics.clone()
-    }
-
-    pub fn alloc_write_batch_id(&self) -> u64 {
-        self.write_batch_id_allocator
-            .fetch_add(1, Ordering::Relaxed)
-    }
-}
-
-impl RangeCacheMemoryEngine {
-    pub fn core(&self) -> &Arc<RwLock<RangeCacheMemoryEngineCore>> {
-        &self.core
-    }
-}
-
-impl Debug for RangeCacheMemoryEngine {
-    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
-        write!(f, "Range Cache Memory Engine")
-    }
-}
-
-impl RangeCacheEngine for RangeCacheMemoryEngine {
-    type Snapshot = RangeCacheSnapshot;
-
-    fn snapshot(
-        &self,
-        range: CacheRange,
-        read_ts: u64,
-        seq_num: u64,
-    ) -> result::Result<Self::Snapshot, FailedReason> {
-        RangeCacheSnapshot::new(self.clone(), range, read_ts, seq_num)
-    }
-
-    type DiskEngine = RocksEngine;
-    fn set_disk_engine(&mut self, disk_engine: Self::DiskEngine) {
-        self.rocks_engine = Some(disk_engine.clone());
-        if let Err(e) = self
-            .bg_worker_manager()
-            .schedule_task(BackgroundTask::SetRocksEngine(disk_engine))
-        {
-            error!(
-                "schedule set rocks_engine failed";
-                "err" => ?e,
-            );
-            assert!(tikv_util::thread_group::is_shutdown(!cfg!(test)));
-        }
-    }
-
-    type RangeHintService = PdRangeHintService;
-    fn start_hint_service(&self, range_hint_service: Self::RangeHintService) {
-        self.bg_worker_manager()
-            .start_bg_hint_service(range_hint_service)
-    }
-
-    fn get_range_for_key(&self, key: &[u8]) -> Option<CacheRange> {
-        let core = self.core.read();
-        core.range_manager().get_range_for_key(key)
-    }
-
-    fn enabled(&self) -> bool {
-        self.config.value().enabled
-    }
-
-    fn evict_range(&self, range: &CacheRange) {
-        self.evict_range(range)
-    }
-}
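The snapshot-acquisition contract behind `snapshot()` above (and `range_snapshot` in range_manager.rs below) is: succeed only if the range is resident and `read_ts` is newer than the range's safe point; otherwise the caller falls back to the disk engine. A standalone model of that rule, using the same two failure reasons (illustrative, not the crate's real types):

```rust
#[derive(Debug, PartialEq)]
enum FailedReason {
    NotCached,
    TooOldRead,
}

fn cache_snapshot(resident: bool, safe_point: u64, read_ts: u64) -> Result<(), FailedReason> {
    if !resident {
        return Err(FailedReason::NotCached);
    }
    if read_ts <= safe_point {
        return Err(FailedReason::TooOldRead);
    }
    Ok(()) // pin the snapshot; GC cannot advance past it until it is dropped
}

fn main() {
    assert_eq!(cache_snapshot(false, 0, 10), Err(FailedReason::NotCached));
    assert_eq!(cache_snapshot(true, 10, 10), Err(FailedReason::TooOldRead));
    assert!(cache_snapshot(true, 10, 11).is_ok());
}
```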
-
-impl Iterable for RangeCacheMemoryEngine {
-    type Iterator = RangeCacheIterator;
-
-    fn iterator_opt(&self, _: &str, _: IterOptions) -> Result<Self::Iterator> {
-        // This engine does not support creating iterators directly; create a
-        // snapshot first and iterate on that instead.
-        panic!("iterator_opt is not supported by RangeCacheMemoryEngine directly")
-    }
-}
-
-#[cfg(test)]
-pub mod tests {
-    use std::sync::Arc;
-
-    use crossbeam::epoch;
-    use engine_traits::{CacheRange, CF_DEFAULT, CF_LOCK, CF_WRITE};
-    use tikv_util::config::{ReadableSize, VersionTrack};
-
-    use super::SkiplistEngine;
-    use crate::{
-        keys::{construct_key, construct_user_key, encode_key},
-        memory_controller::MemoryController,
-        InternalBytes, RangeCacheEngineConfig, RangeCacheEngineContext, RangeCacheMemoryEngine,
-        ValueType,
-    };
-
-    #[test]
-    fn test_overlap_with_pending() {
-        let engine = RangeCacheMemoryEngine::new(RangeCacheEngineContext::new_for_tests(Arc::new(
-            VersionTrack::new(RangeCacheEngineConfig::config_for_test()),
-        )));
-        let range1 = CacheRange::new(b"k1".to_vec(), b"k3".to_vec());
-        engine.load_range(range1).unwrap();
-
-        let range2 = CacheRange::new(b"k1".to_vec(), b"k5".to_vec());
-        engine.prepare_for_apply(1, &range2);
-        assert!(
-            engine.core.read().range_manager().pending_ranges.is_empty()
-                && engine
-                    .core
-                    .read()
-                    .range_manager()
-                    .pending_ranges_loading_data
-                    .is_empty()
-        );
-
-        let range1 = CacheRange::new(b"k1".to_vec(), b"k3".to_vec());
-        engine.load_range(range1).unwrap();
-
-        let range2 = CacheRange::new(b"k2".to_vec(), b"k5".to_vec());
-        engine.prepare_for_apply(1, &range2);
-        assert!(
-            engine.core.read().range_manager().pending_ranges.is_empty()
-                && engine
-                    .core
-                    .read()
-                    .range_manager()
-                    .pending_ranges_loading_data
-                    .is_empty()
-        );
-    }
-
-    #[test]
-    fn test_delete_range() {
-        let delete_range_cf = |cf| {
-            let skiplist = SkiplistEngine::default();
-            let handle = skiplist.cf_handle(cf);
-
-            let config = Arc::new(VersionTrack::new(RangeCacheEngineConfig {
-                enabled: true,
-                gc_interval: Default::default(),
-                load_evict_interval: Default::default(),
-                soft_limit_threshold: Some(ReadableSize(300)),
-                hard_limit_threshold: Some(ReadableSize(500)),
-                expected_region_size: Some(ReadableSize::mb(20)),
-            }));
-            let mem_controller = Arc::new(MemoryController::new(config.clone(), skiplist.clone()));
-
-            let guard = &epoch::pin();
-
-            let insert_kv = |k, mvcc, v: &[u8], seq| {
-                let user_key = construct_key(k, mvcc);
-                let mut key = encode_key(&user_key, seq, ValueType::Value);
-                let mut val = InternalBytes::from_vec(v.to_vec());
-                key.set_memory_controller(mem_controller.clone());
-                val.set_memory_controller(mem_controller.clone());
-                handle.insert(key, val, guard);
-            };
-
-            insert_kv(0, 1, b"val", 100);
-            insert_kv(1, 2, b"val", 101);
-            insert_kv(1, 3, b"val", 102);
-            insert_kv(2, 2, b"val", 103);
-            insert_kv(9, 2, b"val", 104);
-            insert_kv(10, 2, b"val", 105);
-
-            let start = construct_user_key(1);
-            let end = construct_user_key(10);
-            let range = CacheRange::new(start, end);
-            skiplist.delete_range(&range);
-
-            let mut iter = handle.iterator();
-            iter.seek_to_first(guard);
-            let expect = construct_key(0, 1);
-            let expect = encode_key(&expect, 100, ValueType::Value);
-            assert_eq!(iter.key(), &expect);
-            iter.next(guard);
-
-            let expect = construct_key(10, 2);
-            let expect = encode_key(&expect, 105, ValueType::Value);
-            assert_eq!(iter.key(), &expect);
-            iter.next(guard);
-            assert!(!iter.valid());
-        };
-        delete_range_cf(CF_DEFAULT);
-        delete_range_cf(CF_WRITE);
-    }
-
-    #[test]
-    fn test_delete_range_for_lock_cf() {
-        let skiplist = SkiplistEngine::default();
-        let lock_handle = skiplist.cf_handle(CF_LOCK);
-
-        let config = Arc::new(VersionTrack::new(RangeCacheEngineConfig {
-            enabled: true,
-            gc_interval: Default::default(),
-            load_evict_interval: Default::default(),
-            soft_limit_threshold: Some(ReadableSize(300)),
-            hard_limit_threshold: Some(ReadableSize(500)),
-            expected_region_size: Some(ReadableSize::mb(20)),
-        }));
-        let mem_controller = Arc::new(MemoryController::new(config.clone(), skiplist.clone()));
-
-        let guard = &epoch::pin();
-
-        let insert_kv = |k, v: &[u8], seq| {
-            let mut key = encode_key(k, seq, ValueType::Value);
-            let mut val = InternalBytes::from_vec(v.to_vec());
-            key.set_memory_controller(mem_controller.clone());
-            val.set_memory_controller(mem_controller.clone());
-            lock_handle.insert(key, val, guard);
-        };
-
-        insert_kv(b"k", b"val", 100);
-        insert_kv(b"k1", b"val1", 101);
-        insert_kv(b"k2", b"val2", 102);
-        insert_kv(b"k3", b"val3", 103);
-        insert_kv(b"k4", b"val4", 104);
-
-        let range = CacheRange::new(b"k1".to_vec(), b"k4".to_vec());
-        skiplist.delete_range(&range);
-
-        let mut iter = lock_handle.iterator();
-        iter.seek_to_first(guard);
-        let expect = encode_key(b"k", 100, ValueType::Value);
-        assert_eq!(iter.key(), &expect);
-
-        iter.next(guard);
-        let expect = encode_key(b"k4", 104, ValueType::Value);
-        assert_eq!(iter.key(), &expect);
-
-        iter.next(guard);
-        assert!(!iter.valid());
-    }
-}
diff --git a/components/range_cache_memory_engine/src/lib.rs b/components/range_cache_memory_engine/src/lib.rs
deleted file mode 100644
index 7bcabb8e033..00000000000
--- a/components/range_cache_memory_engine/src/lib.rs
+++ /dev/null
@@ -1,173 +0,0 @@
-// Copyright 2023 TiKV Project Authors. Licensed under Apache-2.0.
-
-#![feature(let_chains)]
-#![allow(internal_features)]
-#![feature(core_intrinsics)]
-#![feature(slice_pattern)]
-
-use std::{sync::Arc, time::Duration};
-
-use futures::future::ready;
-use online_config::OnlineConfig;
-use pd_client::PdClient;
-use serde::{Deserialize, Serialize};
-use thiserror::Error;
-use tikv_util::config::{ReadableDuration, ReadableSize, VersionTrack};
-
-mod background;
-pub mod config;
-mod engine;
-mod keys;
-mod memory_controller;
-mod metrics;
-mod perf_context;
-#[cfg(test)]
-mod prop_test;
-mod range_manager;
-mod range_stats;
-mod read;
-mod region_label;
-mod statistics;
-pub mod test_util;
-mod write_batch;
-
-pub use background::{BackgroundRunner, BackgroundTask, GcTask};
-pub use engine::{RangeCacheMemoryEngine, SkiplistHandle};
-pub use keys::{
-    decode_key, encode_key_for_boundary_without_mvcc, encoding_for_filter, InternalBytes,
-    InternalKey, ValueType,
-};
-pub use metrics::flush_range_cache_engine_statistics;
-pub use range_manager::RangeCacheStatus;
-pub use statistics::Statistics as RangeCacheMemoryEngineStatistics;
-use txn_types::TimeStamp;
-pub use write_batch::RangeCacheWriteBatch;
-
-#[derive(Debug, Error)]
-pub enum Error {
-    #[error("Invalid Argument: {0}")]
-    InvalidArgument(String),
-}
-
-#[derive(Clone, Serialize, Deserialize, Debug, PartialEq, OnlineConfig)]
-#[serde(default, rename_all = "kebab-case")]
-pub struct RangeCacheEngineConfig {
-    pub enabled: bool,
-    pub gc_interval: ReadableDuration,
-    pub load_evict_interval: ReadableDuration,
-    pub soft_limit_threshold: Option<ReadableSize>,
-    pub hard_limit_threshold: Option<ReadableSize>,
-    pub expected_region_size: Option<ReadableSize>,
-}
-
-impl Default for RangeCacheEngineConfig {
-    fn default() -> Self {
-        Self {
-            enabled: false,
-            gc_interval: ReadableDuration(Duration::from_secs(180)),
-            load_evict_interval: ReadableDuration(Duration::from_secs(300)), /* Each load/evict
-                                                                              * operation should
-                                                                              * run within five
-                                                                              * minutes. */
-            soft_limit_threshold: None,
-            hard_limit_threshold: None,
-            expected_region_size: None,
-        }
-    }
-}
-
-impl RangeCacheEngineConfig {
-    pub fn validate(&mut self) -> Result<(), Box<dyn std::error::Error>> {
-        if !self.enabled {
-            return Ok(());
-        }
-
-        Ok(self.sanitize()?)
-    }
-
-    pub fn sanitize(&mut self) -> Result<(), Error> {
-        if self.soft_limit_threshold.is_none() || self.hard_limit_threshold.is_none() {
-            return Err(Error::InvalidArgument(
-                "soft-limit-threshold or hard-limit-threshold not set".to_string(),
-            ));
-        }
-
-        if self.soft_limit_threshold.as_ref().unwrap()
-            >= self.hard_limit_threshold.as_ref().unwrap()
-        {
-            return Err(Error::InvalidArgument(format!(
-                "soft-limit-threshold {:?} is larger than or equal to hard-limit-threshold {:?}",
-                self.soft_limit_threshold.as_ref().unwrap(),
-                self.hard_limit_threshold.as_ref().unwrap()
-            )));
-        }
-
-        Ok(())
-    }
-
-    pub fn soft_limit_threshold(&self) -> usize {
-        self.soft_limit_threshold.map_or(0, |r| r.0 as usize)
-    }
-
-    pub fn hard_limit_threshold(&self) -> usize {
-        self.hard_limit_threshold.map_or(0, |r| r.0 as usize)
-    }
-
-    pub fn expected_region_size(&self) -> usize {
-        self.expected_region_size.map_or(
-            raftstore::coprocessor::config::SPLIT_SIZE.0 as usize,
-            |r: ReadableSize| r.0 as usize,
-        )
-    }
-
-    pub fn config_for_test() -> RangeCacheEngineConfig {
-        RangeCacheEngineConfig {
-            enabled: true,
-            gc_interval: ReadableDuration(Duration::from_secs(180)),
-            load_evict_interval: ReadableDuration(Duration::from_secs(300)), /* Should run within
-                                                                              * five minutes */
-            soft_limit_threshold: Some(ReadableSize::gb(1)),
-            hard_limit_threshold: Some(ReadableSize::gb(2)),
-            expected_region_size: Some(ReadableSize::mb(20)),
-        }
-    }
-}
-
-pub struct RangeCacheEngineContext {
-    config: Arc<VersionTrack<RangeCacheEngineConfig>>,
-    statistics: Arc<RangeCacheMemoryEngineStatistics>,
-    pd_client: Arc<dyn PdClient>,
-}
-
-impl RangeCacheEngineContext {
-    pub fn new(
-        config: Arc<VersionTrack<RangeCacheEngineConfig>>,
-        pd_client: Arc<dyn PdClient>,
-    ) -> RangeCacheEngineContext {
-        RangeCacheEngineContext {
-            config,
-            statistics: Arc::default(),
-            pd_client,
-        }
-    }
-
-    pub fn new_for_tests(
-        config: Arc<VersionTrack<RangeCacheEngineConfig>>,
-    ) -> RangeCacheEngineContext {
-        struct MockPdClient;
-        impl PdClient for MockPdClient {
-            fn get_tso(&self) -> pd_client::PdFuture<TimeStamp> {
-                Box::pin(ready(Ok(TimeStamp::compose(TimeStamp::physical_now(), 0))))
-            }
-        }
-        RangeCacheEngineContext {
-            config,
-            statistics: Arc::default(),
-            pd_client: Arc::new(MockPdClient),
-        }
-    }
-
-    pub fn statistics(&self) -> Arc<RangeCacheMemoryEngineStatistics> {
-        self.statistics.clone()
-    }
-}
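A worked restatement of the `sanitize()` rule above: both thresholds must be set, and soft must be strictly below hard. This standalone sketch uses plain `u64` byte counts instead of `ReadableSize` (illustrative only):

```rust
fn sanitize(soft: Option<u64>, hard: Option<u64>) -> Result<(), String> {
    let (soft, hard) = match (soft, hard) {
        (Some(s), Some(h)) => (s, h),
        _ => return Err("soft-limit-threshold or hard-limit-threshold not set".into()),
    };
    if soft >= hard {
        return Err(format!(
            "soft-limit-threshold {} is larger than or equal to hard-limit-threshold {}",
            soft, hard
        ));
    }
    Ok(())
}

fn main() {
    // The 1 GiB / 2 GiB pair used by config_for_test() passes.
    assert!(sanitize(Some(1 << 30), Some(2 << 30)).is_ok());
    assert!(sanitize(Some(1500), Some(1000)).is_err());
    assert!(sanitize(None, Some(1000)).is_err());
}
```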
diff --git a/components/range_cache_memory_engine/src/metrics.rs b/components/range_cache_memory_engine/src/metrics.rs
deleted file mode 100644
index b4cf285e0c8..00000000000
--- a/components/range_cache_memory_engine/src/metrics.rs
+++ /dev/null
@@ -1,156 +0,0 @@
-// Copyright 2024 TiKV Project Authors. Licensed under Apache-2.0.
-
-use std::sync::Arc;
-
-use lazy_static::lazy_static;
-use prometheus::*;
-use prometheus_static_metric::*;
-
-use crate::{
-    statistics::{Tickers, ENGINE_TICKER_TYPES},
-    RangeCacheMemoryEngineStatistics,
-};
-
-make_auto_flush_static_metric! {
-    pub label_enum KeyCountType {
-        total,
-        filtered,
-        below_safe_point_total,
-        below_safe_point_unique,
-    }
-
-    pub label_enum TickerEnum {
-        bytes_read,
-        iter_bytes_read,
-        number_db_seek,
-        number_db_seek_found,
-        number_db_next,
-        number_db_next_found,
-        number_db_prev,
-        number_db_prev_found,
-    }
-
-    pub struct GcFilteredCountVec: LocalIntCounter {
-        "type" => KeyCountType,
-    }
-
-    pub struct InMemoryEngineTickerMetrics: LocalIntCounter {
-        "type" => TickerEnum,
-    }
-}
-
-lazy_static! {
-    pub static ref GC_FILTERED: IntCounterVec = register_int_counter_vec!(
-        "tikv_range_cache_memory_engine_gc_filtered",
-        "Versions filtered by GC",
-        &["type"]
-    )
-    .unwrap();
-    pub static ref RANGE_CACHE_MEMORY_USAGE: IntGauge = register_int_gauge!(
-        "tikv_range_cache_memory_usage_bytes",
-        "The memory usage of the range cache engine",
-    )
-    .unwrap();
-    pub static ref RANGE_LOAD_TIME_HISTOGRAM: Histogram = register_histogram!(
-        "tikv_range_load_duration_secs",
-        "Bucketed histogram of range load time duration.",
-        exponential_buckets(0.001, 2.0, 20).unwrap()
-    )
-    .unwrap();
-    pub static ref RANGE_GC_TIME_HISTOGRAM: Histogram = register_histogram!(
-        "tikv_range_gc_duration_secs",
-        "Bucketed histogram of range gc time duration.",
-        exponential_buckets(0.001, 2.0, 20).unwrap()
-    )
-    .unwrap();
-    pub static ref WRITE_DURATION_HISTOGRAM: Histogram = register_histogram!(
-        "tikv_range_cache_engine_write_duration_seconds",
-        "Bucketed histogram of write duration in range cache engine.",
-        exponential_buckets(0.00001, 2.0, 20).unwrap()
-    )
-    .unwrap();
-    pub static ref RANGE_PREPARE_FOR_WRITE_DURATION_HISTOGRAM: Histogram = register_histogram!(
-        "tikv_range_cache_engine_prepare_for_write_duration_seconds",
-        "Bucketed histogram of prepare for write duration in range cache engine.",
-        exponential_buckets(0.00001, 2.0, 20).unwrap()
-    )
-    .unwrap();
-    pub static ref RANGE_CACHE_COUNT: IntGaugeVec = register_int_gauge_vec!(
-        "tikv_range_cache_count",
-        "The count of each type on range cache.",
-        &["type"]
-    )
-    .unwrap();
-    pub static ref IN_MEMORY_ENGINE_FLOW: IntCounterVec = register_int_counter_vec!(
-        "tikv_range_cache_memory_engine_flow",
-        "Bytes and keys read/written by the range cache memory engine",
-        &["type"]
-    )
-    .unwrap();
-    pub static ref IN_MEMORY_ENGINE_LOCATE: IntCounterVec = register_int_counter_vec!(
-        "tikv_range_cache_memory_engine_locate",
-        "Number of calls to seek/next/prev",
-        &["type"]
-    )
-    .unwrap();
-    pub static ref IN_MEMORY_ENGINE_SEEK_DURATION: Histogram = register_histogram!(
-        "tikv_range_cache_memory_engine_seek_duration",
-        "Histogram of seek duration",
-        exponential_buckets(0.00001, 2.0, 26).unwrap()
-    )
-    .unwrap();
-}
-
-lazy_static! {
-    pub static ref GC_FILTERED_STATIC: GcFilteredCountVec =
        auto_flush_from!(GC_FILTERED, GcFilteredCountVec);
-    pub static ref IN_MEMORY_ENGINE_FLOW_STATIC: InMemoryEngineTickerMetrics =
-        auto_flush_from!(IN_MEMORY_ENGINE_FLOW, InMemoryEngineTickerMetrics);
-    pub static ref IN_MEMORY_ENGINE_LOCATE_STATIC: InMemoryEngineTickerMetrics =
-        auto_flush_from!(IN_MEMORY_ENGINE_LOCATE, InMemoryEngineTickerMetrics);
-}
-
-pub fn flush_range_cache_engine_statistics(statistics: &Arc<RangeCacheMemoryEngineStatistics>) {
-    for t in ENGINE_TICKER_TYPES {
-        let v = statistics.get_and_reset_ticker_count(*t);
-        flush_engine_ticker_metrics(*t, v);
-    }
-}
-
-fn flush_engine_ticker_metrics(t: Tickers, value: u64) {
-    match t {
-        Tickers::BytesRead => {
-            IN_MEMORY_ENGINE_FLOW_STATIC.bytes_read.inc_by(value);
-        }
-        Tickers::IterBytesRead => {
-            IN_MEMORY_ENGINE_FLOW_STATIC.iter_bytes_read.inc_by(value);
-        }
-        Tickers::NumberDbSeek => {
-            IN_MEMORY_ENGINE_LOCATE_STATIC.number_db_seek.inc_by(value);
-        }
-        Tickers::NumberDbSeekFound => {
-            IN_MEMORY_ENGINE_LOCATE_STATIC
-                .number_db_seek_found
-                .inc_by(value);
-        }
-        Tickers::NumberDbNext => {
-            IN_MEMORY_ENGINE_LOCATE_STATIC.number_db_next.inc_by(value);
-        }
-        Tickers::NumberDbNextFound => {
-            IN_MEMORY_ENGINE_LOCATE_STATIC
-                .number_db_next_found
-                .inc_by(value);
-        }
-        Tickers::NumberDbPrev => {
-            IN_MEMORY_ENGINE_LOCATE_STATIC.number_db_prev.inc_by(value);
-        }
-        Tickers::NumberDbPrevFound => {
-            IN_MEMORY_ENGINE_LOCATE_STATIC
-                .number_db_prev_found
-                .inc_by(value);
-        }
-        _ => {
-            unreachable!()
-        }
-    }
-}
diff --git a/components/range_cache_memory_engine/src/range_manager.rs b/components/range_cache_memory_engine/src/range_manager.rs
deleted file mode 100644
index c649f72463a..00000000000
--- a/components/range_cache_memory_engine/src/range_manager.rs
+++ /dev/null
@@ -1,704 +0,0 @@
-// Copyright 2024 TiKV Project Authors. Licensed under Apache-2.0.
-
-use std::{
-    collections::{BTreeMap, BTreeSet, VecDeque},
-    result,
-    sync::{
-        atomic::{AtomicU64, Ordering},
-        Arc,
-    },
-};
-
-use collections::HashMap;
-use engine_rocks::RocksSnapshot;
-use engine_traits::{CacheRange, FailedReason};
-use tikv_util::info;
-
-use crate::read::RangeCacheSnapshotMeta;
-
-// read_ts -> ref_count
-#[derive(Default, Debug)]
-pub(crate) struct SnapshotList(pub(crate) BTreeMap<u64, u64>);
-
-impl SnapshotList {
-    pub(crate) fn new_snapshot(&mut self, read_ts: u64) {
-        // a snapshot with this ts may have been granted before
-        let count = self.0.get(&read_ts).unwrap_or(&0) + 1;
-        self.0.insert(read_ts, count);
-    }
-
-    pub(crate) fn remove_snapshot(&mut self, read_ts: u64) {
-        let count = self.0.get_mut(&read_ts).unwrap();
-        assert!(*count >= 1);
-        if *count == 1 {
-            self.0.remove(&read_ts).unwrap();
-        } else {
-            *count -= 1;
-        }
-    }
-
-    // returns the min snapshot_ts (read_ts) if there's any
-    pub fn min_snapshot_ts(&self) -> Option<u64> {
-        self.0.first_key_value().map(|(ts, _)| *ts)
-    }
-
-    pub(crate) fn is_empty(&self) -> bool {
-        self.0.is_empty()
-    }
-}
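`SnapshotList` is a reference-counted multiset of pinned read timestamps; `min_snapshot_ts()` is the value GC must not advance past. A standalone restatement of its semantics over a plain `BTreeMap` (illustrative only):

```rust
use std::collections::BTreeMap;

fn main() {
    let mut snapshots: BTreeMap<u64, u64> = BTreeMap::new();

    // new_snapshot(read_ts): a snapshot at the same ts may already exist.
    *snapshots.entry(10).or_insert(0) += 1;
    *snapshots.entry(10).or_insert(0) += 1;
    *snapshots.entry(20).or_insert(0) += 1;

    // min_snapshot_ts(): the oldest pinned read_ts.
    assert_eq!(snapshots.first_key_value().map(|(ts, _)| *ts), Some(10));

    // remove_snapshot(10) twice: the count drops to zero and the entry goes.
    for _ in 0..2 {
        let c = snapshots.get_mut(&10).unwrap();
        if *c == 1 {
            snapshots.remove(&10);
        } else {
            *c -= 1;
        }
    }
    assert_eq!(snapshots.first_key_value().map(|(ts, _)| *ts), Some(20));
}
```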
-
-#[derive(Debug, Default)]
-pub struct RangeMeta {
-    // start_key and end_key cannot uniquely identify a range, as ranges can split
-    // and merge, so we need a range id.
-    id: u64,
-    range_snapshot_list: SnapshotList,
-    safe_point: u64,
-}
-
-impl RangeMeta {
-    fn new(id: u64) -> Self {
-        Self {
-            id,
-            range_snapshot_list: SnapshotList::default(),
-            safe_point: 0,
-        }
-    }
-
-    pub(crate) fn safe_point(&self) -> u64 {
-        self.safe_point
-    }
-
-    pub(crate) fn set_safe_point(&mut self, safe_point: u64) {
-        assert!(self.safe_point <= safe_point);
-        self.safe_point = safe_point;
-    }
-
-    fn derive_from(id: u64, r: &RangeMeta) -> Self {
-        Self {
-            id,
-            range_snapshot_list: SnapshotList::default(),
-            safe_point: r.safe_point,
-        }
-    }
-
-    pub(crate) fn range_snapshot_list(&self) -> &SnapshotList {
-        &self.range_snapshot_list
-    }
-}
-
-#[derive(Default)]
-struct IdAllocator(u64);
-
-impl IdAllocator {
-    fn allocate_id(&mut self) -> u64 {
-        self.0 += 1;
-        self.0
-    }
-}
-
-// RangeManager manages the ranges for RangeCacheMemoryEngine. Every new range
-// (whether created by new_range or split due to eviction) has a unique id, so
-// that range + id can exactly identify which range it is.
-// When an eviction occurs, say we now have k1-k10 in self.ranges and the
-// eviction range is k3-k5, k1-k10 will be split into three ranges: k1-k3,
-// k3-k5, and k5-k10.
-// k1-k3 and k5-k10 will be new ranges inserted into self.ranges with meta
-// derived from the meta of k1-k10 (only safe_point will be derived). k1-k10
-// will be removed from self.ranges and inserted into self.historical_ranges.
-// Then, k3-k5 will be in self.ranges_being_deleted. Now, we cannot remove the
-// data of k3-k5, as there may be some snapshots of k1-k10. After these
-// snapshots are dropped, k3-k5 can actually be removed.
-#[derive(Default)]
-pub struct RangeManager {
-    // Each new range will increment it by one.
-    id_allocator: IdAllocator,
-    // Ranges from before an eviction. They are recorded due to some undropped
-    // snapshots, which block the evicted ranges from deleting the relevant data.
-    historical_ranges: BTreeMap<CacheRange, RangeMeta>,
-    // `ranges_being_deleted` contains ranges that are evicted but have not finished
-    // the delete (or have not even started to delete due to an ongoing snapshot)
-    pub(crate) ranges_being_deleted: BTreeSet<CacheRange>,
-    // ranges that are cached now
-    ranges: BTreeMap<CacheRange, RangeMeta>,
-
-    // `pending_ranges` contains ranges that will be loaded into the memory engine. To guarantee
-    // the completeness of the data, we also need to write the data that is applied after the
-    // snapshot is acquired. And to ensure the data is written in order, we should cache the data
-    // that is applied after the snapshot is acquired and only consume it when the snapshot load
-    // finishes.
-    // So, at some point in the apply thread, the pending ranges, coupled with a rocksdb
-    // snapshot, will be popped and pushed into `pending_ranges_loading_data` (data here means the
-    // data in the snapshot and in further applied writes). Then the data in the snapshot of the
-    // given ranges will be loaded into the memory engine in the background worker. When the
-    // snapshot load is finished, we begin to consume the write batch that is cached after the
-    // snapshot is acquired.
-    //
-    // Note: as we will release the lock during the consuming of the cached write batch, there
-    // could be further write batches being cached. We must ensure the cached write batch is
-    // empty at the time the range becomes an accessible range.
-    //
-    // Note: the region with a range equal to the range in `pending_ranges` may have been
-    // split. This is fine; we just let the first child region that calls prepare_for_apply
-    // schedule it. We should cache writes for all child regions, and the load task
-    // completes as long as the snapshot has been loaded and the cached write batches for this
-    // super range have all been consumed.
-    pub(crate) pending_ranges: Vec<CacheRange>,
-    // The bool indicates whether the loading is canceled due to a memory capacity issue
-    pub(crate) pending_ranges_loading_data: VecDeque<(CacheRange, Arc<RocksSnapshot>, bool)>,
-
-    ranges_in_gc: BTreeSet<CacheRange>,
-    // Record the ranges that are being written.
-    //
-    // It is used to avoid the concurrency issue between delete range and write to memory: after
-    // the range is evicted or fails to load, the range is recorded in `ranges_being_deleted`,
-    // which means no further writes to it are allowed, and a DeleteRange task for the range will
-    // be scheduled to clean up the dirty data. However, it is possible that the apply thread is
-    // writing data for this range. Therefore, we have to delay the DeleteRange task until the
-    // range leaves `ranges_being_written`.
-    //
-    // The key in this map is the id of the write batch, and the value is a collection of
-    // the ranges of this batch. So, when the write batch is consumed by the in-memory engine,
-    // all its ranges are cleared from `ranges_being_written`.
-    ranges_being_written: HashMap<u64, Vec<CacheRange>>,
-    range_evictions: AtomicU64,
-}
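A worked example of the split described in the comment above: evicting k3-k5 from a cached k1-k10 leaves the left and right remainders as new ranges. This standalone sketch uses half-open byte-string bounds (and k9 instead of k10, so lexicographic comparison behaves); it is not the crate's `CacheRange` logic:

```rust
type Range = (Vec<u8>, Vec<u8>); // half-open [start, end)

// Returns the parts of `cached` that survive evicting `evict`,
// assuming `evict` lies inside `cached`.
fn split_out(cached: &Range, evict: &Range) -> Vec<Range> {
    let mut rest = Vec::new();
    if cached.0 < evict.0 {
        rest.push((cached.0.clone(), evict.0.clone())); // left remainder
    }
    if evict.1 < cached.1 {
        rest.push((evict.1.clone(), cached.1.clone())); // right remainder
    }
    rest
}

fn main() {
    let cached: Range = (b"k1".to_vec(), b"k9".to_vec());
    let evict: Range = (b"k3".to_vec(), b"k5".to_vec());
    assert_eq!(
        split_out(&cached, &evict),
        vec![
            (b"k1".to_vec(), b"k3".to_vec()),
            (b"k5".to_vec(), b"k9".to_vec()),
        ]
    );
}
```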
-    // schedule it. We should cache writes for all child regions, and the load task completes
-    // as long as the snapshot has been loaded and the cached write batches for this super
-    // range have all been consumed.
-    pub(crate) pending_ranges: Vec<CacheRange>,
-    // The bool indicates whether the loading is canceled due to a memory capacity issue
-    pub(crate) pending_ranges_loading_data: VecDeque<(CacheRange, Arc<RocksSnapshot>, bool)>,
-
-    ranges_in_gc: BTreeSet<CacheRange>,
-    // Record the ranges that are being written.
-    //
-    // It is used to avoid a concurrency issue between delete range and write to memory: after
-    // the range is evicted or fails to load, the range is recorded in `ranges_being_deleted`,
-    // which means no further write to it is allowed, and a DeleteRange task of the range will
-    // be scheduled to clean up the dirty data. However, it is possible that the apply thread
-    // is writing data for this range. Therefore, we have to delay the DeleteRange task until
-    // the range leaves `ranges_being_written`.
-    //
-    // The key in this map is the id of the write batch, and the value is a collection of the
-    // ranges in this batch. So, when the write batch is consumed by the in-memory engine,
-    // all ranges of it are cleared from `ranges_being_written`.
-    ranges_being_written: HashMap<u64, Vec<CacheRange>>,
-    range_evictions: AtomicU64,
-}
-
-impl RangeManager {
-    pub(crate) fn ranges(&self) -> &BTreeMap<CacheRange, RangeMeta> {
-        &self.ranges
-    }
-
-    pub fn new_range(&mut self, range: CacheRange) {
-        assert!(!self.overlap_with_range(&range));
-        let range_meta = RangeMeta::new(self.id_allocator.allocate_id());
-        self.ranges.insert(range, range_meta);
-    }
-
-    pub fn mut_range_meta(&mut self, range: &CacheRange) -> Option<&mut RangeMeta> {
-        self.ranges.get_mut(range)
-    }
-
-    pub fn set_safe_point(&mut self, range: &CacheRange, safe_ts: u64) -> bool {
-        if let Some(meta) = self.ranges.get_mut(range) {
-            if meta.safe_point > safe_ts {
-                return false;
-            }
-            meta.safe_point = safe_ts;
-            true
-        } else {
-            false
-        }
-    }
-
-    pub fn contains(&self, key: &[u8]) -> bool {
-        self.ranges.keys().any(|r| r.contains_key(key))
-    }
-
-    pub fn get_range_for_key(&self, key: &[u8]) -> Option<CacheRange> {
-        self.ranges.keys().find_map(|r| {
-            if r.contains_key(key) {
-                Some(r.clone())
-            } else {
-                None
-            }
-        })
-    }
-
-    pub fn contains_range(&self, range: &CacheRange) -> bool {
-        self.ranges.keys().any(|r| r.contains_range(range))
-    }
-
-    pub fn pending_ranges_in_loading_contains(&self, range: &CacheRange) -> bool {
-        self.pending_ranges_loading_data
-            .iter()
-            .any(|(r, ..)| r.contains_range(range))
-    }
-
-    fn overlap_with_range(&self, range: &CacheRange) -> bool {
-        self.ranges.keys().any(|r| r.overlaps(range))
-    }
-
-    fn overlap_with_evicting_range(&self, range: &CacheRange) -> bool {
-        self.ranges_being_deleted.iter().any(|r| r.overlaps(range))
-    }
-
-    fn overlap_with_range_in_gc(&self, range: &CacheRange) -> bool {
-        self.ranges_in_gc.iter().any(|r| r.overlaps(range))
-    }
-
-    fn overlap_with_pending_range(&self, range: &CacheRange) -> bool {
-        self.pending_ranges.iter().any(|r| r.overlaps(range))
-            || self
-                .pending_ranges_loading_data
-                .iter()
-                .any(|(r, ..)| r.overlaps(range))
-    }
-
-    // Acquire a snapshot of the `range` with `read_ts`. If the range is not
-    // accessible, an error with the failure reason is returned. Otherwise,
-    // the range id is returned.
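-    // A hedged usage sketch (editorial addition): reads at or below the safe
-    // point are rejected, reads above it are granted and reference-counted.
-    //
-    //     let mut mgr = RangeManager::default();
-    //     let r = CacheRange::new(b"k00".to_vec(), b"k10".to_vec());
-    //     mgr.new_range(r.clone());
-    //     mgr.set_safe_point(&r, 5);
-    //     assert!(mgr.range_snapshot(&r, 5).is_err()); // FailedReason::TooOldRead
-    //     assert!(mgr.range_snapshot(&r, 6).is_ok());  // returns the range id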
- pub(crate) fn range_snapshot( - &mut self, - range: &CacheRange, - read_ts: u64, - ) -> result::Result { - let Some(range_key) = self - .ranges - .keys() - .find(|&r| r.contains_range(range)) - .cloned() - else { - return Err(FailedReason::NotCached); - }; - let meta = self.ranges.get_mut(&range_key).unwrap(); - - if read_ts <= meta.safe_point { - return Err(FailedReason::TooOldRead); - } - - meta.range_snapshot_list.new_snapshot(read_ts); - Ok(meta.id) - } - - // If the snapshot is the last one in the snapshot list of one cache range in - // historical_ranges, it means one or some evicted_ranges may be ready to be - // removed physically. - // So, we return a vector of ranges to denote the ranges that are ready to be - // removed. - pub(crate) fn remove_range_snapshot( - &mut self, - snapshot_meta: &RangeCacheSnapshotMeta, - ) -> Vec { - if let Some(range_key) = self - .historical_ranges - .iter() - .find(|&(range, meta)| { - range.contains_range(&snapshot_meta.range) && meta.id == snapshot_meta.range_id - }) - .map(|(r, _)| r.clone()) - { - let meta = self.historical_ranges.get_mut(&range_key).unwrap(); - meta.range_snapshot_list - .remove_snapshot(snapshot_meta.snapshot_ts); - if meta.range_snapshot_list.is_empty() { - self.historical_ranges.remove(&range_key); - } - - return self - .ranges_being_deleted - .iter() - .filter(|evicted_range| { - !self - .historical_ranges - .keys() - .any(|r| r.overlaps(evicted_range)) - }) - .cloned() - .collect::>(); - } - - // It must belong to the `self.ranges` if not found in `self.historical_ranges` - let range_key = self - .ranges - .iter() - .find(|&(range, meta)| { - range.contains_range(&snapshot_meta.range) && meta.id == snapshot_meta.range_id - }) - .map(|(r, _)| r.clone()) - .unwrap(); - let meta = self.ranges.get_mut(&range_key).unwrap(); - meta.range_snapshot_list - .remove_snapshot(snapshot_meta.snapshot_ts); - vec![] - } - - /// Return ranges that can be deleted now (no ongoing snapshot). - // There are two cases based on the relationship between `evict_range` and - // cached ranges: - // 1. `evict_range` is contained(including equals) by a cached range (at most - // one due to non-overlapping in cached ranges) - // 2. `evict_range` is overlapped with (including contains but not be contained) - // one or more cached ranges - // - // For 1, if the `evict_range` is a proper subset of the cached_range, we will - // split the cached_range so that only the `evict_range` part will be evicted - // and deleted. - // - // For 2, this is caused by some special operations such as merge and delete - // range. So, conservatively, we evict all ranges overlap with it. 
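-    // A worked example (editorial sketch) of case 1: with k1-k9 cached and no
-    // ongoing snapshot, evicting the proper subset k3-k5 splits the cached
-    // range and only the middle part is returned for deletion:
-    //
-    //     let mut mgr = RangeManager::default();
-    //     mgr.new_range(CacheRange::new(b"k1".to_vec(), b"k9".to_vec()));
-    //     let evicted = mgr.evict_range(&CacheRange::new(b"k3".to_vec(), b"k5".to_vec()));
-    //     assert_eq!(evicted.len(), 1); // k1-k3 and k5-k9 stay cached with fresh ids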
- pub(crate) fn evict_range(&mut self, evict_range: &CacheRange) -> Vec { - info!( - "try to evict range"; - "evict_range" => ?evict_range, - ); - - // cancel loading ranges overlapped with `evict_range` - self.pending_ranges_loading_data - .iter_mut() - .for_each(|(r, _, canceled)| { - if evict_range.overlaps(r) { - info!( - "evict range that overlaps with loading range"; - "evicted_range" => ?evict_range, - "overlapped_range" => ?r, - ); - *canceled = true; - } - }); - - let mut overlapped_ranges = vec![]; - for r in self.ranges.keys() { - if r.contains_range(evict_range) { - if self.evict_within_range(evict_range, &r.clone()) { - return vec![evict_range.clone()]; - } else { - return vec![]; - } - } else if r.overlaps(evict_range) { - overlapped_ranges.push(r.clone()); - } - } - - if overlapped_ranges.is_empty() { - info!( - "evict a range that is not cached"; - "range" => ?evict_range, - ); - return vec![]; - } - - overlapped_ranges - .into_iter() - .filter(|r| self.evict_within_range(r, r)) - .collect() - } - - // Return true means there is no ongoing snapshot, the evicted_range can be - // deleted now. - fn evict_within_range(&mut self, evict_range: &CacheRange, cached_range: &CacheRange) -> bool { - assert!(cached_range.contains_range(evict_range)); - info!( - "evict range in cache range engine"; - "evict_range" => ?evict_range, - "cached_range" => ?cached_range, - ); - self.range_evictions.fetch_add(1, Ordering::Relaxed); - let meta = self.ranges.remove(cached_range).unwrap(); - let (left_range, right_range) = cached_range.split_off(evict_range); - assert!((left_range.is_some() || right_range.is_some()) || evict_range == cached_range); - - if let Some(left_range) = left_range { - let left_meta = RangeMeta::derive_from(self.id_allocator.allocate_id(), &meta); - self.ranges.insert(left_range, left_meta); - } - - if let Some(right_range) = right_range { - let right_meta = RangeMeta::derive_from(self.id_allocator.allocate_id(), &meta); - self.ranges.insert(right_range, right_meta); - } - - self.ranges_being_deleted.insert(evict_range.clone()); - - if !meta.range_snapshot_list.is_empty() { - self.historical_ranges.insert(cached_range.clone(), meta); - return false; - } - - // we also need to check with previous historical_ranges - !self - .historical_ranges - .keys() - .any(|r| r.overlaps(evict_range)) - } - - pub fn has_ranges_in_gc(&self) -> bool { - !self.ranges_in_gc.is_empty() - } - - pub fn on_delete_ranges(&mut self, ranges: &[CacheRange]) { - for r in ranges { - self.ranges_being_deleted.remove(r); - } - } - - pub fn set_ranges_in_gc(&mut self, ranges_in_gc: BTreeSet) { - self.ranges_in_gc = ranges_in_gc; - } - - pub(crate) fn is_overlapped_with_ranges_being_written(&self, range: &CacheRange) -> bool { - self.ranges_being_written.iter().any(|(_, ranges)| { - ranges - .iter() - .any(|range_being_written| range_being_written.overlaps(range)) - }) - } - - pub(crate) fn record_in_ranges_being_written( - &mut self, - write_batch_id: u64, - range: &CacheRange, - ) { - self.ranges_being_written - .entry(write_batch_id) - .or_default() - .push(range.clone()) - } - - pub(crate) fn clear_ranges_in_being_written( - &mut self, - write_batch_id: u64, - has_entry_applied: bool, - ) { - let ranges = self.ranges_being_written.remove(&write_batch_id); - if has_entry_applied { - assert!(!ranges.unwrap().is_empty()); - } - } - - pub fn on_gc_finished(&mut self, range: BTreeSet) { - assert_eq!(range, std::mem::take(&mut self.ranges_in_gc)); - } - - pub fn load_range(&mut self, cache_range: 
CacheRange) -> Result<(), LoadFailedReason> { - if self.overlap_with_range(&cache_range) { - return Err(LoadFailedReason::Overlapped); - }; - if self.overlap_with_pending_range(&cache_range) { - return Err(LoadFailedReason::PendingRange); - } - if self.overlap_with_range_in_gc(&cache_range) { - return Err(LoadFailedReason::InGc); - } - if self.overlap_with_evicting_range(&cache_range) { - return Err(LoadFailedReason::Evicting); - } - self.pending_ranges.push(cache_range); - Ok(()) - } - - pub fn get_and_reset_range_evictions(&self) -> u64 { - self.range_evictions.swap(0, Ordering::Relaxed) - } -} - -#[derive(Debug, PartialEq)] -pub enum LoadFailedReason { - Overlapped, - PendingRange, - InGc, - Evicting, -} - -pub enum RangeCacheStatus { - NotInCache, - Cached, - Loading, -} - -#[cfg(test)] -mod tests { - use std::collections::BTreeSet; - - use engine_traits::{CacheRange, FailedReason}; - - use super::RangeManager; - use crate::range_manager::LoadFailedReason; - - #[test] - fn test_range_manager() { - let mut range_mgr = RangeManager::default(); - let r1 = CacheRange::new(b"k00".to_vec(), b"k10".to_vec()); - - range_mgr.new_range(r1.clone()); - range_mgr.set_safe_point(&r1, 5); - assert_eq!( - range_mgr.range_snapshot(&r1, 5).unwrap_err(), - FailedReason::TooOldRead - ); - range_mgr.range_snapshot(&r1, 8).unwrap(); - range_mgr.range_snapshot(&r1, 10).unwrap(); - let tmp_r = CacheRange::new(b"k08".to_vec(), b"k15".to_vec()); - assert_eq!( - range_mgr.range_snapshot(&tmp_r, 8).unwrap_err(), - FailedReason::NotCached - ); - let tmp_r = CacheRange::new(b"k10".to_vec(), b"k11".to_vec()); - assert_eq!( - range_mgr.range_snapshot(&tmp_r, 8).unwrap_err(), - FailedReason::NotCached - ); - - let r_evict = CacheRange::new(b"k03".to_vec(), b"k06".to_vec()); - let r_left = CacheRange::new(b"k00".to_vec(), b"k03".to_vec()); - let r_right = CacheRange::new(b"k06".to_vec(), b"k10".to_vec()); - range_mgr.evict_range(&r_evict); - let meta1 = range_mgr.historical_ranges.get(&r1).unwrap(); - assert!(range_mgr.ranges_being_deleted.contains(&r_evict)); - assert!(range_mgr.ranges.get(&r1).is_none()); - let meta2 = range_mgr.ranges.get(&r_left).unwrap(); - let meta3 = range_mgr.ranges.get(&r_right).unwrap(); - assert!(meta1.safe_point == meta2.safe_point && meta1.safe_point == meta3.safe_point); - - // evict a range with accurate match - let _ = range_mgr.range_snapshot(&r_left, 10); - range_mgr.evict_range(&r_left); - assert!(range_mgr.historical_ranges.get(&r_left).is_some()); - assert!(range_mgr.ranges_being_deleted.contains(&r_left)); - assert!(range_mgr.ranges.get(&r_left).is_none()); - - assert!(range_mgr.evict_range(&r_right).is_empty()); - assert!(range_mgr.historical_ranges.get(&r_right).is_none()); - } - - #[test] - fn test_range_load() { - let mut range_mgr = RangeManager::default(); - let r1 = CacheRange::new(b"k00".to_vec(), b"k10".to_vec()); - let r2 = CacheRange::new(b"k10".to_vec(), b"k20".to_vec()); - let r3 = CacheRange::new(b"k20".to_vec(), b"k30".to_vec()); - let r4 = CacheRange::new(b"k25".to_vec(), b"k35".to_vec()); - range_mgr.new_range(r1.clone()); - range_mgr.new_range(r3.clone()); - range_mgr.evict_range(&r1); - - let mut gced = BTreeSet::default(); - gced.insert(r2.clone()); - range_mgr.set_ranges_in_gc(gced); - - assert_eq!( - range_mgr.load_range(r1).unwrap_err(), - LoadFailedReason::Evicting - ); - - assert_eq!( - range_mgr.load_range(r2).unwrap_err(), - LoadFailedReason::InGc - ); - - assert_eq!( - range_mgr.load_range(r4).unwrap_err(), - LoadFailedReason::Overlapped - ); - } - - 
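-    // Editorial sketch (not in the original test): a range that passes all four
-    // overlap checks is simply queued in `pending_ranges`, e.g. continuing the
-    // test above with a disjoint range:
-    //
-    //     let r5 = CacheRange::new(b"k35".to_vec(), b"k40".to_vec());
-    //     assert!(range_mgr.load_range(r5).is_ok());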
#[test] - fn test_range_load_overlapped() { - let mut range_mgr = RangeManager::default(); - let r1 = CacheRange::new(b"k00".to_vec(), b"k10".to_vec()); - let r2 = CacheRange::new(b"k20".to_vec(), b"k30".to_vec()); - let r3 = CacheRange::new(b"k40".to_vec(), b"k50".to_vec()); - range_mgr.new_range(r1.clone()); - range_mgr.evict_range(&r1); - - let mut gced = BTreeSet::default(); - gced.insert(r2); - range_mgr.set_ranges_in_gc(gced); - - range_mgr.load_range(r3).unwrap(); - - let r = CacheRange::new(b"".to_vec(), b"k05".to_vec()); - assert_eq!( - range_mgr.load_range(r).unwrap_err(), - LoadFailedReason::Evicting - ); - let r = CacheRange::new(b"k05".to_vec(), b"k15".to_vec()); - assert_eq!( - range_mgr.load_range(r).unwrap_err(), - LoadFailedReason::Evicting - ); - - let r = CacheRange::new(b"k15".to_vec(), b"k25".to_vec()); - assert_eq!(range_mgr.load_range(r).unwrap_err(), LoadFailedReason::InGc); - let r = CacheRange::new(b"k25".to_vec(), b"k35".to_vec()); - assert_eq!(range_mgr.load_range(r).unwrap_err(), LoadFailedReason::InGc); - - let r = CacheRange::new(b"k35".to_vec(), b"k45".to_vec()); - assert_eq!( - range_mgr.load_range(r).unwrap_err(), - LoadFailedReason::PendingRange - ); - let r = CacheRange::new(b"k45".to_vec(), b"k55".to_vec()); - assert_eq!( - range_mgr.load_range(r).unwrap_err(), - LoadFailedReason::PendingRange - ); - } - - #[test] - fn test_evict_ranges() { - { - let mut range_mgr = RangeManager::default(); - let r1 = CacheRange::new(b"k00".to_vec(), b"k10".to_vec()); - let r2 = CacheRange::new(b"k20".to_vec(), b"k30".to_vec()); - let r3 = CacheRange::new(b"k40".to_vec(), b"k50".to_vec()); - range_mgr.new_range(r1.clone()); - range_mgr.new_range(r2.clone()); - range_mgr.new_range(r3.clone()); - range_mgr.contains_range(&r1); - range_mgr.contains_range(&r2); - range_mgr.contains_range(&r3); - - let r4 = CacheRange::new(b"k00".to_vec(), b"k05".to_vec()); - let r5 = CacheRange::new(b"k05".to_vec(), b"k10".to_vec()); - assert_eq!(range_mgr.evict_range(&r4), vec![r4]); - assert_eq!( - range_mgr.ranges().keys().collect::>(), - vec![&r5, &r2, &r3] - ); - - let r6 = CacheRange::new(b"k24".to_vec(), b"k27".to_vec()); - let r7 = CacheRange::new(b"k20".to_vec(), b"k24".to_vec()); - let r8 = CacheRange::new(b"k27".to_vec(), b"k30".to_vec()); - assert_eq!(range_mgr.evict_range(&r6), vec![r6]); - assert_eq!( - range_mgr.ranges().keys().collect::>(), - vec![&r5, &r7, &r8, &r3] - ); - } - - { - let mut range_mgr = RangeManager::default(); - let r1 = CacheRange::new(b"k00".to_vec(), b"k10".to_vec()); - let r2 = CacheRange::new(b"k20".to_vec(), b"k30".to_vec()); - let r3 = CacheRange::new(b"k40".to_vec(), b"k50".to_vec()); - range_mgr.new_range(r1.clone()); - range_mgr.new_range(r2.clone()); - range_mgr.new_range(r3.clone()); - range_mgr.contains_range(&r1); - range_mgr.contains_range(&r2); - range_mgr.contains_range(&r3); - - let r4 = CacheRange::new(b"k".to_vec(), b"k51".to_vec()); - assert_eq!(range_mgr.evict_range(&r4), vec![r1, r2, r3]); - assert!(range_mgr.ranges().is_empty()); - } - - { - let mut range_mgr = RangeManager::default(); - let r1 = CacheRange::new(b"k00".to_vec(), b"k10".to_vec()); - let r2 = CacheRange::new(b"k20".to_vec(), b"k30".to_vec()); - let r3 = CacheRange::new(b"k40".to_vec(), b"k50".to_vec()); - range_mgr.new_range(r1.clone()); - range_mgr.new_range(r2.clone()); - range_mgr.new_range(r3.clone()); - - let r4 = CacheRange::new(b"k25".to_vec(), b"k55".to_vec()); - assert_eq!(range_mgr.evict_range(&r4), vec![r2, r3]); - assert_eq!(range_mgr.ranges().len(), 1); - 
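-            // (Editorial note: r4 overlaps r2 and r3 but is contained by
-            // neither, so both are evicted whole and only r1 stays cached.)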
} - - { - let mut range_mgr = RangeManager::default(); - let r1 = CacheRange::new(b"k00".to_vec(), b"k10".to_vec()); - let r2 = CacheRange::new(b"k30".to_vec(), b"k40".to_vec()); - let r3 = CacheRange::new(b"k50".to_vec(), b"k60".to_vec()); - range_mgr.new_range(r1.clone()); - range_mgr.new_range(r2.clone()); - range_mgr.new_range(r3.clone()); - - let r4 = CacheRange::new(b"k25".to_vec(), b"k75".to_vec()); - assert_eq!(range_mgr.evict_range(&r4), vec![r2, r3]); - assert_eq!(range_mgr.ranges().len(), 1); - } - } -} diff --git a/components/range_cache_memory_engine/src/range_stats.rs b/components/range_cache_memory_engine/src/range_stats.rs deleted file mode 100644 index 555319a4a20..00000000000 --- a/components/range_cache_memory_engine/src/range_stats.rs +++ /dev/null @@ -1,441 +0,0 @@ -// Copyright 2024 TiKV Project Authors. Licensed under Apache-2.0. -use std::{ - cmp, - collections::BTreeMap, - num::NonZeroUsize, - sync::{ - atomic::{AtomicBool, AtomicUsize, Ordering}, - Arc, - }, - time::{Duration, Instant}, -}; - -use crossbeam::sync::ShardedLock; -use engine_traits::CacheRange; -use kvproto::metapb::Region; -use parking_lot::Mutex; -use raftstore::coprocessor::RegionInfoProvider; -use tikv_util::info; - -#[derive(Clone)] -pub(crate) struct RangeStatsManager { - num_regions: Arc, - info_provider: Arc, - prev_top_regions: Arc>>, - checking_top_regions: Arc, - region_loaded_at: Arc>>, - evict_min_duration: Duration, - expected_region_size: usize, -} - -/// Do not evict a region if has been cached for less than this duration. -pub const DEFAULT_EVICT_MIN_DURATION: Duration = Duration::from_secs(60 * 3); - -impl RangeStatsManager { - /// Creates a new RangeStatsManager that retrieves state from - /// `info_provider`. - /// - /// * `num_regions` Initial number of top regions to track and cache. This - /// may change, see `adjust_max_num_regions` below. - /// * `evict_min_duration` - do not evict regions that have been loaded for - /// less than this duration. - pub fn new( - num_regions: usize, - evict_min_duration: Duration, - expected_region_size: usize, - info_provider: Arc, - ) -> Self { - RangeStatsManager { - num_regions: Arc::new(AtomicUsize::new(num_regions)), - info_provider, - prev_top_regions: Arc::new(Mutex::new(BTreeMap::new())), - checking_top_regions: Arc::new(AtomicBool::new(false)), - region_loaded_at: Arc::new(ShardedLock::new(BTreeMap::new())), - evict_min_duration, - expected_region_size, - } - } - - /// Prevents two instances of this from running concurrently. - pub fn set_checking_top_regions(&self, v: bool) { - self.checking_top_regions.store(v, Ordering::Relaxed); - } - - /// Returns true if another thread is checking top regions. - pub fn checking_top_regions(&self) -> bool { - self.checking_top_regions.load(Ordering::Relaxed) - } - - fn set_max_num_regions(&self, v: usize) { - self.num_regions.store(v, Ordering::Relaxed); - } - - /// Returns the maximum number of regions that can be cached. - /// - /// See also `adjust_max_num_regions` below. - pub fn max_num_regions(&self) -> usize { - self.num_regions.load(Ordering::Relaxed) - } - - /// Collect candidates for eviction sorted by activity in creasing order: - /// - /// 1. Get all the regions sorted (decreasing) by region activity using - /// [raftstore::coprocessor::RegionCollector::handle_get_top_regions]. - /// 2. Remove all regions where `is_cached_pred` returns false when passed - /// the region's range or those that have been loaded for less than - /// `self.evict_min_duration`. - /// 3. 
Reverse the list so that it is now sorted in the order of increasing
-    ///    activity.
-    /// 4. Store the results in `ranges_out` using [Vec::extend].
-    pub fn collect_candidates_for_eviction<F>(
-        &self,
-        ranges_out: &mut Vec<(CacheRange, u64)>,
-        is_cached_pred: F,
-    ) where
-        F: Fn(&CacheRange) -> bool,
-    {
-        // Gets all of the regions, sorted by activity.
-        let all_regions = self.info_provider.get_top_regions(None).unwrap();
-        let regions_loaded = self.region_loaded_at.read().unwrap();
-        ranges_out.extend(
-            all_regions
-                .iter()
-                .filter_map(|(region, approx_size)| {
-                    let r = CacheRange::from_region(region);
-                    is_cached_pred(&r)
-                        .then(|| {
-                            match regions_loaded.get(&region.get_id()) {
-                                // Do not evict ranges that were loaded less than
-                                // `EVICT_MIN_DURATION` ago.
-                                Some(&time_loaded)
-                                    if Instant::now() - time_loaded < self.evict_min_duration =>
-                                {
-                                    None
-                                }
-                                Some(_) | None =>
-                                // None indicates the range was loaded from a hint, not by this
-                                // manager.
-                                {
-                                    Some((r, *approx_size))
-                                }
-                            }
-                        })
-                        .flatten()
-                })
-                .rev(),
-        );
-    }
-
-    /// This method should be called when `evicted_range` is successfully evicted
-    /// to remove any internal `RangeStatsManager` state that corresponds to the
-    /// range.
-    ///
-    /// Calls [raftstore::coprocessor::region_info_accessor::RegionInfoProvider::find_region_by_key] to
-    /// find the region corresponding to the range.
-    ///
-    /// TODO (afeinberg): This is inefficient, either make this method bulk, or
-    /// find another way to avoid calling `find_region_by_key` in a loop.
-    pub fn handle_range_evicted(&self, evicted_range: &CacheRange) {
-        // TODO (afeinberg): This is inefficient.
-        let _ = self
-            .info_provider
-            .find_region_by_key(&evicted_range.start)
-            .map(|region| {
-                let id = region.get_id();
-                let _ = self.prev_top_regions.lock().remove(&id);
-                let _ = {
-                    let mut regions_loaded = self.region_loaded_at.write().unwrap();
-                    regions_loaded.remove(&id)
-                };
-            });
-    }
-
-    /// Attempt to adjust the maximum number of cached regions based on memory
-    /// usage:
-    ///
-    /// If `curr_memory_usage` is LESS THAN `threshold` by more than
-    /// 3 * self.expected_region_size bytes, *increase* the maximum by
-    /// `(threshold - curr_memory_usage) / (3 * self.expected_region_size)`.
-    ///
-    /// If `curr_memory_usage` is GREATER THAN `threshold`, *decrease* the
-    /// maximum by `(curr_memory_usage - threshold) /
-    /// self.expected_region_size`.
-    pub fn adjust_max_num_regions(&self, curr_memory_usage: usize, threshold: usize) {
-        match curr_memory_usage.cmp(&threshold) {
-            cmp::Ordering::Less => {
-                let room_to_grow = threshold - curr_memory_usage;
-                if room_to_grow > self.expected_region_size * 3 {
-                    let curr_num_regions = self.max_num_regions();
-                    let next_num_regions =
-                        curr_num_regions + room_to_grow / (self.expected_region_size * 3);
-                    info!("increasing number of top regions to cache";
-                        "from" => curr_num_regions,
-                        "to" => next_num_regions,
-                    );
-                    self.set_max_num_regions(next_num_regions);
-                }
-            }
-            cmp::Ordering::Greater => {
-                let to_shrink_by = curr_memory_usage - threshold;
-                let curr_num_regions = self.max_num_regions();
-                let next_num_regions = curr_num_regions
-                    .checked_sub(1.max(to_shrink_by / self.expected_region_size))
-                    .unwrap_or(1)
-                    .max(1);
-                info!("decreasing number of top regions to cache";
-                    "from" => curr_num_regions,
-                    "to" => next_num_regions,
-                );
-                self.set_max_num_regions(next_num_regions);
-            }
-            _ => (),
-        };
-    }
-
-    /// Collects changes to top regions since the previous time this method was
-    /// called. This method should be called by background tasks responsible
-    /// for algorithmic loading and eviction.
-    ///
-    /// 1. Calls [raftstore::coprocessor::RegionCollector::handle_get_top_regions] to
-    ///    request the top `self.max_num_regions()` regions.
-    ///
-    /// 2. If this is the first time this method has been called on this
-    ///    instance, stores the results of the previous step in `ranges_added_out`
-    ///    and returns.
-    ///
-    /// 3. If this method has been called before, compares the results of step 1
-    ///    with the previous results:
-    ///    - Newly added ranges (regions missing from previous results) are
-    ///      stored in `ranges_added_out`. This can happen when
-    ///      `max_num_regions()` increases, or when `max_num_regions()` is
-    ///      unchanged but the activity order changed.
-    ///    - Removed regions (regions included in previous results but not the
-    ///      current ones) are stored in `ranges_removed_out`.
-    pub fn collect_changed_ranges(
-        &self,
-        ranges_added_out: &mut Vec<CacheRange>,
-        ranges_removed_out: &mut Vec<CacheRange>,
-    ) {
-        info!("collect_changed_ranges"; "num_regions" => self.max_num_regions());
-        let curr_top_regions = self
-            .info_provider
-            .get_top_regions(Some(NonZeroUsize::try_from(self.max_num_regions()).unwrap()))
-            .unwrap() // TODO (afeinberg): Potentially custom error handling here.
-            .iter()
-            .map(|(r, _)| (r.id, r.clone()))
-            .collect::<BTreeMap<_, _>>();
-        {
-            let mut region_loaded_map = self.region_loaded_at.write().unwrap();
-            for &region_id in curr_top_regions.keys() {
-                let _ = region_loaded_map.insert(region_id, Instant::now());
-            }
-        }
-        let prev_top_regions = {
-            let mut mut_prev_top_regions = self.prev_top_regions.lock();
-            let ret = mut_prev_top_regions.clone();
-            *mut_prev_top_regions = curr_top_regions.clone();
-            ret
-        };
-        if prev_top_regions.is_empty() {
-            ranges_added_out.extend(curr_top_regions.values().map(CacheRange::from_region));
-            return;
-        }
-        let added_ranges = curr_top_regions
-            .iter()
-            .filter(|(id, _)| !prev_top_regions.contains_key(id))
-            .map(|(_, region)| CacheRange::from_region(region));
-        let regions_loaded = self.region_loaded_at.read().unwrap();
-        let removed_ranges = prev_top_regions.iter().filter_map(|(&id, region)| {
-            if !curr_top_regions.contains_key(&id) {
-                match regions_loaded.get(&id) {
-                    // Do not evict ranges that were loaded less than `EVICT_MIN_DURATION` ago.
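-                    // (Editorial note: such a region is re-inserted into
-                    // `prev_top_regions` below, so it will be reconsidered for
-                    // removal on a later call once the grace period elapses.)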
- Some(&time_loaded) - if Instant::now() - time_loaded < self.evict_min_duration => - { - let mut mut_prev_top_regions = self.prev_top_regions.lock(); - let _ = mut_prev_top_regions.insert(id, region.clone()); - None - } - _ => Some(CacheRange::from_region(region)), - } - } else { - None - } - }); - ranges_added_out.extend(added_ranges); - ranges_removed_out.extend(removed_ranges); - } -} - -#[cfg(test)] -pub mod tests { - use kvproto::metapb::Peer; - use raftstore::coprocessor::{self, region_info_accessor::TopRegions, RegionInfoProvider}; - use tikv_util::box_err; - - use super::*; - use crate::RangeCacheEngineConfig; - - struct RegionInfoSimulator { - regions: Mutex, - } - - impl RegionInfoSimulator { - fn set_top_regions(&self, top_regions: &TopRegions) { - *self.regions.lock() = top_regions.clone() - } - } - impl RegionInfoProvider for RegionInfoSimulator { - fn find_region_by_key(&self, key: &[u8]) -> coprocessor::Result { - self.regions - .lock() - .iter() - .find(|(region, _)| region.start_key == key) - .cloned() - .map_or_else( - || Err(box_err!(format!("key {:?} not found", key))), - |(region, _)| Ok(region), - ) - } - - fn get_top_regions(&self, count: Option) -> coprocessor::Result { - Ok(count.map_or_else( - || self.regions.lock().clone(), - |count| { - self.regions - .lock() - .iter() - .take(count.get()) - .cloned() - .collect::>() - }, - )) - } - } - - fn new_region(id: u64, start_key: &[u8], end_key: &[u8], version: u64) -> Region { - let mut region = Region::default(); - region.set_id(id); - region.set_start_key(start_key.to_vec()); - region.set_end_key(end_key.to_vec()); - region.mut_region_epoch().set_version(version); - region.mut_peers().push(Peer::default()); - region - } - - #[test] - fn test_collect_changed_regions() { - let region_1 = new_region(1, b"k1", b"k2", 0); - - let region_2 = new_region(2, b"k3", b"k4", 0); - let sim = Arc::new(RegionInfoSimulator { - regions: Mutex::new(vec![(region_1.clone(), 42)]), - }); - // 10 ms min duration eviction for testing purposes. - let rsm = RangeStatsManager::new( - 5, - Duration::from_millis(10), - RangeCacheEngineConfig::config_for_test().expected_region_size(), - sim.clone(), - ); - let mut added = Vec::::new(); - let mut removed = Vec::::new(); - rsm.collect_changed_ranges(&mut added, &mut removed); - assert_eq!(&added, &[CacheRange::from_region(®ion_1)]); - assert!(removed.is_empty()); - let top_regions = vec![(region_1.clone(), 42), (region_2.clone(), 7)]; - sim.set_top_regions(&top_regions); - added.clear(); - removed.clear(); - rsm.collect_changed_ranges(&mut added, &mut removed); - assert_eq!(&added, &[CacheRange::from_region(®ion_2)]); - assert!(removed.is_empty()); - let region_3 = new_region(3, b"k5", b"k6", 0); - let region_4 = new_region(4, b"k7", b"k8", 0); - let region_5 = new_region(5, b"k9", b"k10", 0); - let region_6 = new_region(6, b"k11", b"k12", 0); - let top_regions = vec![ - (region_6.clone(), 42), - (region_2.clone(), 7), - (region_3.clone(), 8), - (region_4.clone(), 9), - (region_5.clone(), 2), - ]; - sim.set_top_regions(&top_regions); - added.clear(); - removed.clear(); - rsm.collect_changed_ranges(&mut added, &mut removed); - assert_eq!( - &added, - &[ - CacheRange::from_region(®ion_3), - CacheRange::from_region(®ion_4), - CacheRange::from_region(®ion_5), - CacheRange::from_region(®ion_6) - ] - ); - // `region_1` is no longer in the top regions list, but since it was loaded less - // than 10 ms ago, it should not be included in the removed ranges. 
- assert!(removed.is_empty()); - std::thread::sleep(Duration::from_millis(100)); - // After 100 ms passed, check again, and verify `region_1` is evictable. - rsm.collect_changed_ranges(&mut added, &mut removed); - assert_eq!(&removed, &[CacheRange::from_region(®ion_1)]); - } - - #[test] - fn test_collect_candidates_for_eviction() { - fn make_region_vec(rs: &[&Region]) -> TopRegions { - rs.iter().map(|&r| (r.clone(), 42)).collect::>() - } - - let region_1 = new_region(1, b"k1", b"k2", 0); - let region_2 = new_region(2, b"k3", b"k4", 0); - let region_3 = new_region(3, b"k5", b"k6", 0); - let region_4 = new_region(4, b"k7", b"k8", 0); - let region_5 = new_region(5, b"k9", b"k10", 0); - let region_6 = new_region(6, b"k11", b"k12", 0); - - let all_regions = make_region_vec(&[ - ®ion_1, ®ion_2, ®ion_3, ®ion_4, ®ion_5, ®ion_6, - ]); - - let sim = Arc::new(RegionInfoSimulator { - regions: Mutex::new(all_regions.clone()), - }); - // 10 ms min duration eviction for testing purposes. - let rsm = RangeStatsManager::new( - 5, - Duration::from_millis(10), - RangeCacheEngineConfig::config_for_test().expected_region_size(), - sim.clone(), - ); - let r_i_p: Arc = sim.clone(); - let check_is_cached = move |range: &CacheRange| -> bool { - r_i_p - .find_region_by_key(&range.start[1..]) - .unwrap() - .get_id() - <= 5 - }; - let mut _added = Vec::::new(); - let mut _removed = Vec::::new(); - rsm.collect_changed_ranges(&mut _added, &mut _removed); - let mut candidates_for_eviction = Vec::<(CacheRange, u64)>::new(); - rsm.collect_candidates_for_eviction(&mut candidates_for_eviction, &check_is_cached); - assert!(candidates_for_eviction.is_empty()); - std::thread::sleep(Duration::from_millis(100)); - rsm.collect_candidates_for_eviction(&mut candidates_for_eviction, &check_is_cached); - let expected_candidates_for_eviction = all_regions - .iter() - .rev() - .filter_map(|(r, s)| { - if r.get_id() <= 5 { - Some((CacheRange::from_region(r), *s)) - } else { - None - } - }) - .collect::>(); - assert_eq!(expected_candidates_for_eviction, candidates_for_eviction); - } -} diff --git a/components/range_cache_memory_engine/src/write_batch.rs b/components/range_cache_memory_engine/src/write_batch.rs deleted file mode 100644 index 60da9dc7e38..00000000000 --- a/components/range_cache_memory_engine/src/write_batch.rs +++ /dev/null @@ -1,1026 +0,0 @@ -use std::{ - collections::BTreeSet, - sync::{atomic::Ordering, Arc}, - time::Duration, -}; - -use bytes::Bytes; -use crossbeam::epoch; -use engine_traits::{ - CacheRange, MiscExt, Mutable, RangeCacheEngine, Result, WriteBatch, WriteBatchExt, - WriteOptions, CF_DEFAULT, -}; -use tikv_util::{box_err, config::ReadableSize, error, info, time::Instant, warn}; - -use crate::{ - background::BackgroundTask, - engine::{cf_to_id, id_to_cf, is_lock_cf, SkiplistEngine}, - keys::{encode_key, InternalBytes, ValueType, ENC_KEY_SEQ_LENGTH}, - memory_controller::{MemoryController, MemoryUsage}, - metrics::{RANGE_PREPARE_FOR_WRITE_DURATION_HISTOGRAM, WRITE_DURATION_HISTOGRAM}, - range_manager::{RangeCacheStatus, RangeManager}, - RangeCacheMemoryEngine, -}; - -// This is a bit of a hack. It's the overhead of a node in the skiplist with -// height 3, which is sufficiently conservative for estimating the node overhead -// size. -pub(crate) const NODE_OVERHEAD_SIZE_EXPECTATION: usize = 96; -// As every key/value holds a Arc, this overhead should be -// taken into consideration. 
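-// A hedged arithmetic sketch (editorial addition) of the per-entry estimate,
-// assuming an 8-byte sequence suffix (ENC_KEY_SEQ_LENGTH) and the 8-byte
-// controller overhead defined just below:
-//
-//     fn estimated_entry_size(key: &[u8], value: &[u8]) -> usize {
-//         // key + sequence suffix, the value, and one controller pointer
-//         // for each of the two InternalBytes (key and value)
-//         (key.len() + 8) + value.len() + 2 * MEM_CONTROLLER_OVERHEAD
-//     }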
-pub(crate) const MEM_CONTROLLER_OVERHEAD: usize = 8;
-// A threshold: when the lock cf's accumulated modification bytes exceed it, a
-// CleanLockTombstone task will be scheduled to clean up the lock tombstones.
-// It's somewhat like how RocksDB flushes memtables when a memtable reaches a
-// certain size so that compactions may clean up some tombstones. By
-// default, the memtable size for the lock cf is 32MB. As not all ranges will be
-// cached in memory, just use half of it here.
-const AMOUNT_TO_CLEAN_TOMBSTONE: u64 = ReadableSize::mb(16).0;
-// The value of a delete entry in the in-memory engine. It's just an empty
-// slice.
-const DELETE_ENTRY_VAL: &[u8] = b"";
-
-// `prepare_for_range` should be called before raft command apply for each peer
-// delegate. It sets `range_cache_status` which is used to determine whether the
-// writes of this peer should be buffered.
-pub struct RangeCacheWriteBatch {
-    // `id` is strictly incrementing and is used as the key in `ranges_being_written`, which
-    // records the ranges that are being written, so that when the write batch is consumed,
-    // we can quickly remove the ranges involved.
-    id: u64,
-    // `range_cache_status` indicates whether the range is cached, loading data, or not cached.
-    // If it is cached, we should buffer the write in `buffer`, which is consumed when the
-    // write is written to the kv engine. If it is loading data, we should buffer the write in
-    // `pending_range_in_loading_buffer`, which is cached in the memory engine and will be
-    // consumed after the snapshot has been loaded.
-    range_cache_status: RangeCacheStatus,
-    buffer: Vec<RangeCacheWriteBatchEntry>,
-    pending_range_in_loading_buffer: Vec<RangeCacheWriteBatchEntry>,
-    engine: RangeCacheMemoryEngine,
-    save_points: Vec<usize>,
-    sequence_number: Option<u64>,
-    memory_controller: Arc<MemoryController>,
-    memory_usage_reach_hard_limit: bool,
-
-    current_range: Option<CacheRange>,
-    // the ranges that reach the hard limit and need to be evicted
-    ranges_to_evict: BTreeSet<CacheRange>,
-
-    // record the total duration of the prepare work for writes in the write batch
-    prepare_for_write_duration: Duration,
-}
-
-impl std::fmt::Debug for RangeCacheWriteBatch {
-    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        f.debug_struct("RangeCacheWriteBatch")
-            .field("buffer", &self.buffer)
-            .field("save_points", &self.save_points)
-            .field("sequence_number", &self.sequence_number)
-            .finish()
-    }
-}
-
-impl From<&RangeCacheMemoryEngine> for RangeCacheWriteBatch {
-    fn from(engine: &RangeCacheMemoryEngine) -> Self {
-        Self {
-            id: engine.alloc_write_batch_id(),
-            range_cache_status: RangeCacheStatus::NotInCache,
-            buffer: Vec::new(),
-            pending_range_in_loading_buffer: Vec::new(),
-            engine: engine.clone(),
-            save_points: Vec::new(),
-            sequence_number: None,
-            memory_controller: engine.memory_controller(),
-            memory_usage_reach_hard_limit: false,
-            current_range: None,
-            ranges_to_evict: BTreeSet::default(),
-            prepare_for_write_duration: Duration::default(),
-        }
-    }
-}
-
-impl RangeCacheWriteBatch {
-    pub fn with_capacity(engine: &RangeCacheMemoryEngine, cap: usize) -> Self {
-        Self {
-            id: engine.alloc_write_batch_id(),
-            range_cache_status: RangeCacheStatus::NotInCache,
-            buffer: Vec::with_capacity(cap),
-            // cache_buffer should need small capacity
-            pending_range_in_loading_buffer: Vec::new(),
-            engine: engine.clone(),
-            save_points: Vec::new(),
-            sequence_number: None,
-            memory_controller: engine.memory_controller(),
-            memory_usage_reach_hard_limit: false,
-            current_range: None,
-            ranges_to_evict: BTreeSet::default(),
-            prepare_for_write_duration: Duration::default(),
- } - } - - /// Trigger a CleanLockTombstone task if the accumulated lock cf - /// modification exceeds the threshold (16MB). - /// - /// NB: Acquiring the RocksDB mutex is necessary to get the oldest snapshot, - /// so avoid calling this in any RocksDB callback (e.g., write batch - /// callback) to prevent potential deadlocks. - pub fn maybe_compact_lock_cf(&self) { - if self.engine.lock_modification_bytes.load(Ordering::Relaxed) > AMOUNT_TO_CLEAN_TOMBSTONE { - // Use `swap` to only allow one schedule when multiple writers reaches the limit - // concurrently. - if self - .engine - .lock_modification_bytes - .swap(0, Ordering::Relaxed) - > AMOUNT_TO_CLEAN_TOMBSTONE - { - let rocks_engine = self.engine.rocks_engine.as_ref().unwrap(); - let last_seqno = rocks_engine.get_latest_sequence_number(); - let snapshot_seqno = self - .engine - .rocks_engine - .as_ref() - .unwrap() - .get_oldest_snapshot_sequence_number() - .unwrap_or(last_seqno); - - if let Err(e) = self - .engine - .bg_worker_manager() - .schedule_task(BackgroundTask::CleanLockTombstone(snapshot_seqno)) - { - error!( - "schedule lock tombstone cleanup failed"; - "err" => ?e, - ); - assert!(tikv_util::thread_group::is_shutdown(!cfg!(test))); - } - } - } - } - - /// Sets the sequence number for this batch. This should only be called - /// prior to writing the batch. - pub fn set_sequence_number(&mut self, seq: u64) -> Result<()> { - if let Some(seqno) = self.sequence_number { - return Err(box_err!("Sequence number {} already set", seqno)); - }; - self.sequence_number = Some(seq); - Ok(()) - } - - // Note: `seq` is the sequence number of the first key in this write batch in - // the RocksDB, which will be incremented automatically for each key, so - // that all keys have unique sequence numbers. - fn write_impl(&mut self, mut seq: u64) -> Result<()> { - fail::fail_point!("on_write_impl"); - let ranges_to_delete = self.handle_ranges_to_evict(); - let (entries_to_write, engine) = self.engine.handle_pending_range_in_loading_buffer( - &mut seq, - std::mem::take(&mut self.pending_range_in_loading_buffer), - ); - let guard = &epoch::pin(); - let start = Instant::now(); - let mut lock_modification: u64 = 0; - let mut have_entry_applied = false; - // Some entries whose ranges may be marked as evicted above, but it does not - // matter, they will be deleted later. 
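-        // (Editorial note: `seq` is handed out as seq, seq + 1, ... below, so
-        // every key in the batch gets a unique sequence number, mirroring how
-        // RocksDB numbers keys inside a single write batch.)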
- let res = entries_to_write - .into_iter() - .chain(std::mem::take(&mut self.buffer)) - .try_for_each(|e| { - have_entry_applied = true; - if is_lock_cf(e.cf) { - lock_modification += e.data_size() as u64; - } - seq += 1; - e.write_to_memory(seq - 1, &engine, self.memory_controller.clone(), guard) - }); - let duration = start.saturating_elapsed_secs(); - WRITE_DURATION_HISTOGRAM.observe(duration); - - fail::fail_point!("in_memory_engine_write_batch_consumed"); - fail::fail_point!("before_clear_ranges_in_being_written"); - - self.engine - .core - .write() - .mut_range_manager() - .clear_ranges_in_being_written(self.id, have_entry_applied); - - self.engine - .lock_modification_bytes - .fetch_add(lock_modification, Ordering::Relaxed); - - if !ranges_to_delete.is_empty() { - if let Err(e) = self - .engine - .bg_worker_manager() - .schedule_task(BackgroundTask::DeleteRange(ranges_to_delete)) - { - error!( - "schedule delete range failed"; - "err" => ?e, - ); - assert!(tikv_util::thread_group::is_shutdown(!cfg!(test))); - } - } - - let dur = std::mem::take(&mut self.prepare_for_write_duration); - RANGE_PREPARE_FOR_WRITE_DURATION_HISTOGRAM.observe(dur.as_secs_f64()); - - res - } - - // return ranges that can be deleted from engine now - fn handle_ranges_to_evict(&mut self) -> Vec { - if self.ranges_to_evict.is_empty() { - return vec![]; - } - let mut core = self.engine.core.write(); - let mut ranges = vec![]; - let range_manager = core.mut_range_manager(); - for r in std::mem::take(&mut self.ranges_to_evict) { - let mut ranges_to_delete = range_manager.evict_range(&r); - if !ranges_to_delete.is_empty() { - ranges.append(&mut ranges_to_delete); - continue; - } - } - ranges - } - - #[inline] - pub fn set_range_cache_status(&mut self, range_cache_status: RangeCacheStatus) { - self.range_cache_status = range_cache_status; - } - - fn process_cf_operation(&mut self, entry_size: F1, entry: F2) - where - F1: FnOnce() -> usize, - F2: FnOnce() -> RangeCacheWriteBatchEntry, - { - if !matches!( - self.range_cache_status, - RangeCacheStatus::Cached | RangeCacheStatus::Loading - ) || self.memory_usage_reach_hard_limit - { - return; - } - - if !self.engine.enabled() { - let range = self.current_range.clone().unwrap(); - info!( - "range cache is disabled, evict the range"; - "range_start" => log_wrappers::Value(&range.start), - "range_end" => log_wrappers::Value(&range.end), - ); - self.ranges_to_evict.insert(range); - return; - } - let memory_expect = entry_size(); - if !self.memory_acquire(memory_expect) { - let range = self.current_range.clone().unwrap(); - info!( - "memory acquire failed due to reaching hard limit"; - "range_start" => log_wrappers::Value(&range.start), - "range_end" => log_wrappers::Value(&range.end), - ); - self.ranges_to_evict.insert(range); - return; - } - - match self.range_cache_status { - RangeCacheStatus::Cached => { - self.buffer.push(entry()); - } - RangeCacheStatus::Loading => { - self.pending_range_in_loading_buffer.push(entry()); - } - RangeCacheStatus::NotInCache => {} - } - } - - fn schedule_memory_check(&self) { - if self.memory_controller.memory_checking() { - return; - } - self.memory_controller.set_memory_checking(true); - if let Err(e) = self - .engine - .bg_worker_manager() - .schedule_task(BackgroundTask::MemoryCheckAndEvict) - { - error!( - "schedule memory check failed"; - "err" => ?e, - ); - assert!(tikv_util::thread_group::is_shutdown(!cfg!(test))); - } - } - - // return false means the memory usage reaches to hard limit and we have no - // quota to write to the 
engine
-    fn memory_acquire(&mut self, mem_required: usize) -> bool {
-        match self.memory_controller.acquire(mem_required) {
-            MemoryUsage::HardLimitReached(n) => {
-                self.memory_usage_reach_hard_limit = true;
-                warn!(
-                    "the memory usage of the in-memory engine reaches the hard limit";
-                    "range" => ?self.current_range.as_ref().unwrap(),
-                    "memory_usage(MB)" => ReadableSize(n as u64).as_mb_f64(),
-                );
-                self.schedule_memory_check();
-                return false;
-            }
-            MemoryUsage::SoftLimitReached(_) => {
-                self.schedule_memory_check();
-            }
-            _ => {}
-        }
-        true
-    }
-}
-
-#[derive(Clone, Debug)]
-enum WriteBatchEntryInternal {
-    PutValue(Bytes),
-    Deletion,
-}
-
-impl WriteBatchEntryInternal {
-    fn encode(&self, key: &[u8], seq: u64) -> (InternalBytes, InternalBytes) {
-        match self {
-            WriteBatchEntryInternal::PutValue(value) => (
-                encode_key(key, seq, ValueType::Value),
-                InternalBytes::from_bytes(value.clone()),
-            ),
-            WriteBatchEntryInternal::Deletion => (
-                encode_key(key, seq, ValueType::Deletion),
-                InternalBytes::from_bytes(Bytes::from_static(DELETE_ENTRY_VAL)),
-            ),
-        }
-    }
-    fn data_size(&self) -> usize {
-        match self {
-            WriteBatchEntryInternal::PutValue(value) => value.len(),
-            WriteBatchEntryInternal::Deletion => 0,
-        }
-    }
-}
-
-#[derive(Clone, Debug)]
-pub(crate) struct RangeCacheWriteBatchEntry {
-    cf: usize,
-    key: Bytes,
-    inner: WriteBatchEntryInternal,
-}
-
-impl RangeCacheWriteBatchEntry {
-    pub fn put_value(cf: &str, key: &[u8], value: &[u8]) -> Self {
-        Self {
-            cf: cf_to_id(cf),
-            key: Bytes::copy_from_slice(key),
-            inner: WriteBatchEntryInternal::PutValue(Bytes::copy_from_slice(value)),
-        }
-    }
-
-    pub fn deletion(cf: &str, key: &[u8]) -> Self {
-        Self {
-            cf: cf_to_id(cf),
-            key: Bytes::copy_from_slice(key),
-            inner: WriteBatchEntryInternal::Deletion,
-        }
-    }
-
-    #[inline]
-    pub fn encode(&self, seq: u64) -> (InternalBytes, InternalBytes) {
-        self.inner.encode(&self.key, seq)
-    }
-
-    pub fn calc_put_entry_size(key: &[u8], value: &[u8]) -> usize {
-        RangeCacheWriteBatchEntry::memory_size_required_for_key_value(key, value)
-    }
-
-    pub fn calc_delete_entry_size(key: &[u8]) -> usize {
-        // a delete also carries a value, which is an empty byte slice
-        RangeCacheWriteBatchEntry::memory_size_required_for_key_value(key, DELETE_ENTRY_VAL)
-    }
-
-    fn memory_size_required_for_key_value(key: &[u8], value: &[u8]) -> usize {
-        // The key will be encoded with a sequence number when it is written to the in-memory
-        // engine, so we have to account for the sequence number suffix's memory usage.
-        InternalBytes::memory_size_required(key.len() + ENC_KEY_SEQ_LENGTH)
-            + InternalBytes::memory_size_required(value.len())
-    }
-
-    pub fn data_size(&self) -> usize {
-        self.key.len() + ENC_KEY_SEQ_LENGTH + self.inner.data_size()
-    }
-
-    #[inline]
-    pub fn write_to_memory(
-        &self,
-        seq: u64,
-        skiplist_engine: &SkiplistEngine,
-        memory_controller: Arc<MemoryController>,
-        guard: &epoch::Guard,
-    ) -> Result<()> {
-        let handle = skiplist_engine.cf_handle(id_to_cf(self.cf));
-
-        let (mut key, mut value) = self.encode(seq);
-        key.set_memory_controller(memory_controller.clone());
-        value.set_memory_controller(memory_controller);
-        handle.insert(key, value, guard);
-
-        Ok(())
-    }
-}
-
-// group_write_batch_entries classifies the entries into two categories according
-// to the information in the range manager:
-// 1. entries that can be written to the memory engine directly
-// 2. entries that need to be cached
-// For 2, we group the entries according to the range. The method relies on the
-// property that entries in the same range are neighbors.
-// That said, the method still behaves correctly even if they are randomly
-// positioned.
-//
-// Note: some entries may not find a range in either
-// `pending_ranges_loading_data` or `ranges`; it means the range has been
-// evicted.
-pub fn group_write_batch_entries(
-    mut entries: Vec<RangeCacheWriteBatchEntry>,
-    range_manager: &RangeManager,
-) -> (
-    Vec<(CacheRange, Vec<RangeCacheWriteBatchEntry>)>,
-    Vec<RangeCacheWriteBatchEntry>,
-) {
-    let mut group_entries_to_cache: Vec<(CacheRange, Vec<RangeCacheWriteBatchEntry>)> = vec![];
-    let mut entries_to_write: Vec<RangeCacheWriteBatchEntry> = vec![];
-    let mut drain = entries.drain(..).peekable();
-    while let Some(mut e) = drain.next() {
-        if let Some((range_loading, ..)) = range_manager
-            .pending_ranges_loading_data
-            .iter()
-            .find(|r| r.0.contains_key(&e.key))
-        {
-            // The range of this write batch entry is still in loading status
-            let mut current_group = vec![];
-            loop {
-                current_group.push(e);
-                if let Some(next_e) = drain.peek()
-                    && range_loading.contains_key(&next_e.key)
-                {
-                    e = drain.next().unwrap();
-                } else {
-                    break;
-                }
-            }
-            group_entries_to_cache.push((range_loading.clone(), current_group));
-        } else if let Some(range) = range_manager
-            .ranges()
-            .keys()
-            .find(|r| r.contains_key(&e.key))
-        {
-            // The range has finished loading and become a normal cache range
-            loop {
-                entries_to_write.push(e);
-                if let Some(next_e) = drain.peek()
-                    && range.contains_key(&next_e.key)
-                {
-                    e = drain.next().unwrap();
-                } else {
-                    break;
-                }
-            }
-        } else {
-            // The range of the entry is not found, which means the range has
-            // been evicted
-        }
-    }
-    (group_entries_to_cache, entries_to_write)
-}
-
-impl WriteBatchExt for RangeCacheMemoryEngine {
-    type WriteBatch = RangeCacheWriteBatch;
-    // todo: adjust it
-    const WRITE_BATCH_MAX_KEYS: usize = 256;
-
-    fn write_batch(&self) -> Self::WriteBatch {
-        RangeCacheWriteBatch::from(self)
-    }
-
-    fn write_batch_with_cap(&self, cap: usize) -> Self::WriteBatch {
-        RangeCacheWriteBatch::with_capacity(self, cap)
-    }
-}
-
-impl WriteBatch for RangeCacheWriteBatch {
-    fn write_opt(&mut self, _: &WriteOptions) -> Result<u64> {
-        self.sequence_number
-            .map(|seq| self.write_impl(seq).map(|()| seq))
-            .transpose()
-            .map(|o| o.ok_or_else(|| box_err!("sequence_number must be set!")))?
- } - - fn data_size(&self) -> usize { - self.buffer - .iter() - .map(RangeCacheWriteBatchEntry::data_size) - .sum() - } - - fn count(&self) -> usize { - self.buffer.len() - } - - fn is_empty(&self) -> bool { - self.buffer.is_empty() - } - - fn should_write_to_engine(&self) -> bool { - unimplemented!() - } - - fn clear(&mut self) { - self.buffer.clear(); - self.save_points.clear(); - _ = self.sequence_number.take(); - } - - fn set_save_point(&mut self) { - self.save_points.push(self.buffer.len()) - } - - fn pop_save_point(&mut self) -> Result<()> { - self.save_points - .pop() - .map(|_| ()) - .ok_or_else(|| box_err!("no save points available")) - } - - fn rollback_to_save_point(&mut self) -> Result<()> { - self.save_points - .pop() - .map(|sp| { - self.buffer.truncate(sp); - }) - .ok_or_else(|| box_err!("no save point available!")) - } - - fn merge(&mut self, mut other: Self) -> Result<()> { - self.buffer.append(&mut other.buffer); - Ok(()) - } - - fn prepare_for_range(&mut self, range: CacheRange) { - let time = Instant::now(); - self.set_range_cache_status(self.engine.prepare_for_apply(self.id, &range)); - self.memory_usage_reach_hard_limit = false; - self.current_range = Some(range); - self.prepare_for_write_duration += time.saturating_elapsed(); - } -} - -impl Mutable for RangeCacheWriteBatch { - fn put(&mut self, key: &[u8], val: &[u8]) -> Result<()> { - self.put_cf(CF_DEFAULT, key, val) - } - - fn put_cf(&mut self, cf: &str, key: &[u8], val: &[u8]) -> Result<()> { - self.process_cf_operation( - || RangeCacheWriteBatchEntry::calc_put_entry_size(key, val), - || RangeCacheWriteBatchEntry::put_value(cf, key, val), - ); - Ok(()) - } - - fn delete(&mut self, key: &[u8]) -> Result<()> { - self.delete_cf(CF_DEFAULT, key) - } - - fn delete_cf(&mut self, cf: &str, key: &[u8]) -> Result<()> { - self.process_cf_operation( - || RangeCacheWriteBatchEntry::calc_delete_entry_size(key), - || RangeCacheWriteBatchEntry::deletion(cf, key), - ); - Ok(()) - } - - // rather than delete the keys in the range, we evict ranges that overlap with - // them directly - fn delete_range(&mut self, begin_key: &[u8], end_key: &[u8]) -> Result<()> { - let range = CacheRange::new(begin_key.to_vec(), end_key.to_vec()); - self.engine.evict_range(&range); - Ok(()) - } - - fn delete_range_cf(&mut self, _: &str, begin_key: &[u8], end_key: &[u8]) -> Result<()> { - let range = CacheRange::new(begin_key.to_vec(), end_key.to_vec()); - self.engine.evict_range(&range); - Ok(()) - } -} - -#[cfg(test)] -mod tests { - use std::{sync::Arc, time::Duration}; - - use crossbeam_skiplist::SkipList; - use engine_rocks::util::new_engine; - use engine_traits::{ - CacheRange, FailedReason, KvEngine, Peekable, RangeCacheEngine, WriteBatch, CF_WRITE, - DATA_CFS, - }; - use online_config::{ConfigChange, ConfigManager, ConfigValue}; - use tempfile::Builder; - use tikv_util::config::VersionTrack; - - use super::*; - use crate::{ - background::flush_epoch, config::RangeCacheConfigManager, RangeCacheEngineConfig, - RangeCacheEngineContext, - }; - - // We should not use skiplist.get directly as we only cares keys without - // sequence number suffix - fn get_value( - sl: &Arc>, - key: &InternalBytes, - guard: &epoch::Guard, - ) -> Option> { - let mut iter = sl.owned_iter(); - iter.seek(key, guard); - if iter.valid() && iter.key().same_user_key_with(key) { - return Some(iter.value().as_slice().to_vec()); - } - None - } - - #[test] - fn test_write_to_skiplist() { - let engine = RangeCacheMemoryEngine::new(RangeCacheEngineContext::new_for_tests(Arc::new( 
- VersionTrack::new(RangeCacheEngineConfig::config_for_test()), - ))); - let r = CacheRange::new(b"".to_vec(), b"z".to_vec()); - engine.new_range(r.clone()); - { - let mut core = engine.core.write(); - core.mut_range_manager().set_safe_point(&r, 10); - } - let mut wb = RangeCacheWriteBatch::from(&engine); - wb.range_cache_status = RangeCacheStatus::Cached; - wb.prepare_for_range(r.clone()); - wb.put(b"aaa", b"bbb").unwrap(); - wb.set_sequence_number(1).unwrap(); - assert_eq!(wb.write().unwrap(), 1); - let sl = engine.core.read().engine().data[cf_to_id(CF_DEFAULT)].clone(); - let guard = &crossbeam::epoch::pin(); - let val = get_value(&sl, &encode_key(b"aaa", 2, ValueType::Value), guard).unwrap(); - assert_eq!(&b"bbb"[..], val.as_slice()); - } - - #[test] - fn test_savepoints() { - let engine = RangeCacheMemoryEngine::new(RangeCacheEngineContext::new_for_tests(Arc::new( - VersionTrack::new(RangeCacheEngineConfig::config_for_test()), - ))); - let r = CacheRange::new(b"".to_vec(), b"z".to_vec()); - engine.new_range(r.clone()); - { - let mut core = engine.core.write(); - core.mut_range_manager().set_safe_point(&r, 10); - } - let mut wb = RangeCacheWriteBatch::from(&engine); - wb.range_cache_status = RangeCacheStatus::Cached; - wb.prepare_for_range(r.clone()); - wb.put(b"aaa", b"bbb").unwrap(); - wb.set_save_point(); - wb.put(b"aaa", b"ccc").unwrap(); - wb.put(b"ccc", b"ddd").unwrap(); - wb.rollback_to_save_point().unwrap(); - wb.set_sequence_number(1).unwrap(); - assert_eq!(wb.write().unwrap(), 1); - let sl = engine.core.read().engine().data[cf_to_id(CF_DEFAULT)].clone(); - let guard = &crossbeam::epoch::pin(); - let val = get_value(&sl, &encode_key(b"aaa", 1, ValueType::Value), guard).unwrap(); - assert_eq!(&b"bbb"[..], val.as_slice()); - assert!(get_value(&sl, &encode_key(b"ccc", 1, ValueType::Value), guard).is_none()) - } - - #[test] - fn test_put_write_clear_delete_put_write() { - let engine = RangeCacheMemoryEngine::new(RangeCacheEngineContext::new_for_tests(Arc::new( - VersionTrack::new(RangeCacheEngineConfig::config_for_test()), - ))); - let r = CacheRange::new(b"".to_vec(), b"z".to_vec()); - engine.new_range(r.clone()); - { - let mut core = engine.core.write(); - core.mut_range_manager().set_safe_point(&r, 10); - } - let mut wb = RangeCacheWriteBatch::from(&engine); - wb.range_cache_status = RangeCacheStatus::Cached; - wb.prepare_for_range(r.clone()); - wb.put(b"aaa", b"bbb").unwrap(); - wb.set_sequence_number(1).unwrap(); - _ = wb.write(); - wb.clear(); - wb.prepare_for_range(r.clone()); - wb.put(b"bbb", b"ccc").unwrap(); - wb.delete(b"aaa").unwrap(); - wb.set_sequence_number(2).unwrap(); - _ = wb.write(); - let snapshot = engine.snapshot(r, u64::MAX, 3).unwrap(); - assert_eq!( - snapshot.get_value(&b"bbb"[..]).unwrap().unwrap(), - &b"ccc"[..] 
- ); - assert!(snapshot.get_value(&b"aaa"[..]).unwrap().is_none()) - } - - #[test] - fn test_prepare_for_apply() { - let path = Builder::new() - .prefix("test_prepare_for_apply") - .tempdir() - .unwrap(); - let path_str = path.path().to_str().unwrap(); - let rocks_engine = new_engine(path_str, DATA_CFS).unwrap(); - - let engine = RangeCacheMemoryEngine::new(RangeCacheEngineContext::new_for_tests(Arc::new( - VersionTrack::new(RangeCacheEngineConfig::config_for_test()), - ))); - let r1 = CacheRange::new(b"k01".to_vec(), b"k05".to_vec()); - let r2 = CacheRange::new(b"k05".to_vec(), b"k10".to_vec()); - let r3 = CacheRange::new(b"k10".to_vec(), b"k15".to_vec()); - { - engine.new_range(r1.clone()); - let mut core = engine.core.write(); - core.mut_range_manager().set_safe_point(&r1, 10); - - let snap = Arc::new(rocks_engine.snapshot(None)); - core.mut_range_manager() - .pending_ranges_loading_data - .push_back((r2.clone(), snap, false)); - } - let mut wb = RangeCacheWriteBatch::from(&engine); - wb.prepare_for_range(r1.clone()); - wb.put(b"k01", b"val1").unwrap(); - wb.prepare_for_range(r2.clone()); - wb.put(b"k05", b"val5").unwrap(); - wb.prepare_for_range(r3); - wb.put(b"k10", b"val10").unwrap(); - wb.set_sequence_number(2).unwrap(); - let _ = wb.write(); - let snapshot = engine.snapshot(r1.clone(), u64::MAX, 5).unwrap(); - assert_eq!( - snapshot.get_value(&b"k01"[..]).unwrap().unwrap(), - &b"val1"[..] - ); - { - let core = engine.core.read(); - assert_eq!(core.cached_write_batch.get(&r2).unwrap().len(), 1); - } - - let mut wb = RangeCacheWriteBatch::from(&engine); - wb.prepare_for_range(r1.clone()); - wb.delete(b"k01").unwrap(); - wb.set_sequence_number(5).unwrap(); - let _ = wb.write(); - let snapshot = engine.snapshot(r1, u64::MAX, 6).unwrap(); - assert!(snapshot.get_value(&b"k01"[..]).unwrap().is_none(),); - } - - #[test] - fn test_group_entries() { - let path = Builder::new().prefix("test_group").tempdir().unwrap(); - let path_str = path.path().to_str().unwrap(); - let rocks_engine = new_engine(path_str, DATA_CFS).unwrap(); - let snap = rocks_engine.snapshot(None); - - let mut range_manager = RangeManager::default(); - let r1 = CacheRange::new(b"k00".to_vec(), b"k10".to_vec()); - let r2 = CacheRange::new(b"k10".to_vec(), b"k20".to_vec()); - let r3 = CacheRange::new(b"k20".to_vec(), b"k30".to_vec()); - range_manager.new_range(r1.clone()); - let snap = Arc::new(snap); - range_manager - .pending_ranges_loading_data - .push_back((r2.clone(), snap.clone(), false)); - range_manager - .pending_ranges_loading_data - .push_back((r3.clone(), snap, false)); - - let entries = vec![ - RangeCacheWriteBatchEntry::put_value(CF_DEFAULT, b"k22", b"val"), - RangeCacheWriteBatchEntry::put_value(CF_DEFAULT, b"k21", b"val"), - RangeCacheWriteBatchEntry::deletion(CF_DEFAULT, b"k25"), - RangeCacheWriteBatchEntry::put_value(CF_DEFAULT, b"k28", b"val"), - RangeCacheWriteBatchEntry::put_value(CF_WRITE, b"k03", b"val"), - RangeCacheWriteBatchEntry::put_value(CF_WRITE, b"k05", b"val"), - RangeCacheWriteBatchEntry::put_value(CF_WRITE, b"k09", b"val"), - RangeCacheWriteBatchEntry::put_value(CF_WRITE, b"k10", b"val"), - RangeCacheWriteBatchEntry::put_value(CF_WRITE, b"k19", b"val"), - // Mock the range is evicted - RangeCacheWriteBatchEntry::put_value(CF_WRITE, b"k32", b"val"), - RangeCacheWriteBatchEntry::put_value(CF_WRITE, b"k45", b"val"), - ]; - - let (group_entries_to_cache, entries_to_write) = - group_write_batch_entries(entries, &range_manager); - assert_eq!(group_entries_to_cache.len(), 2); - 
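-        // (Editorial note: k03/k05/k09 fall in the cached r1 and are written
-        // directly; k10/k19 group under the loading r2 and k21/k22/k25/k28
-        // under the loading r3; k32/k45 match no range and are dropped.)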
assert_eq!(entries_to_write.len(), 3); - entries_to_write - .iter() - .for_each(|e| assert!(r1.contains_key(&e.key))); - group_entries_to_cache.iter().for_each(|(range, entries)| { - if *range == r2 { - assert_eq!(entries.len(), 2); - } else if *range == r3 { - assert_eq!(entries.len(), 4); - } else { - unreachable!(); - } - entries - .iter() - .for_each(|e| assert!(range.contains_key(&e.key))) - }); - } - - fn wait_evict_done(engine: &RangeCacheMemoryEngine) { - let mut wait = 0; - while wait < 10 { - wait += 1; - if !engine - .core - .read() - .range_manager() - .ranges_being_deleted - .is_empty() - { - std::thread::sleep(Duration::from_millis(200)); - } else { - break; - } - } - } - - #[test] - fn test_write_batch_with_memory_controller() { - let mut config = RangeCacheEngineConfig::default(); - config.soft_limit_threshold = Some(ReadableSize(500)); - config.hard_limit_threshold = Some(ReadableSize(1000)); - config.enabled = true; - let engine = RangeCacheMemoryEngine::new(RangeCacheEngineContext::new_for_tests(Arc::new( - VersionTrack::new(config), - ))); - let range1 = CacheRange::new(b"kk00".to_vec(), b"kk10".to_vec()); - let range2 = CacheRange::new(b"kk10".to_vec(), b"kk20".to_vec()); - let range3 = CacheRange::new(b"kk20".to_vec(), b"kk30".to_vec()); - let range4 = CacheRange::new(b"kk30".to_vec(), b"kk40".to_vec()); - let range5 = CacheRange::new(b"kk40".to_vec(), b"kk50".to_vec()); - for r in [&range1, &range2, &range3, &range4, &range5] { - engine.new_range(r.clone()); - { - let mut core = engine.core.write(); - core.mut_range_manager().set_safe_point(r, 10); - } - let _ = engine.snapshot(r.clone(), 1000, 1000).unwrap(); - } - - let val1: Vec = (0..150).map(|_| 0).collect(); - let mut wb = RangeCacheWriteBatch::from(&engine); - wb.prepare_for_range(range1.clone()); - // memory required: - // 4(key) + 8(sequencen number) + 150(value) + 16(2 Arc) = 28 - // Now, 562 - wb.delete(b"kk21").unwrap(); - let val2: Vec = (0..500).map(|_| 2).collect(); - // The memory will fail to acquire - wb.put(b"kk22", &val2).unwrap(); - - // The memory capacity is enough for the following two inserts - let val3: Vec = (0..150).map(|_| 3).collect(); - wb.prepare_for_range(range4.clone()); - // Now, 740 - wb.put(b"kk32", &val3).unwrap(); - - // The memory will fail to acquire - let val4: Vec = (0..300).map(|_| 3).collect(); - wb.prepare_for_range(range5.clone()); - wb.put(b"kk41", &val4).unwrap(); - - let memory_controller = engine.memory_controller(); - // We should have allocated 740 as calculated above - assert_eq!(740, memory_controller.mem_usage()); - wb.write_impl(1000).unwrap(); - // We dont count the node overhead (96 bytes for each node) in write batch, so - // after they are written into the engine, the mem usage can even exceed - // the hard limit. But this should be fine as this amount should be at - // most MB level. 
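The comment above carries a bit of arithmetic that the assertion right below depends on. As a back-of-the-envelope check, using only the constants quoted in the removed test's own comments (the 96-byte node overhead and the 740/1220/822 usage figures; the five-node count is implied by 1220 - 740 = 480 = 5 * 96), here is an illustrative sketch — not the engine's real accounting code:

```rust
// Sanity-check of the usage figures asserted in the removed test.
// All constants come from the test's comments; this is illustrative only.
fn main() {
    const NODE_OVERHEAD: usize = 96; // per-skiplist-node overhead

    // Memory charged through the write batch before `write_impl`.
    let acquired_by_write_batch: usize = 740;
    // Node overhead is not pre-charged by the write batch, so writing the
    // batch adds it on top, which is how usage can exceed the hard limit.
    let after_write = acquired_by_write_batch + 5 * NODE_OVERHEAD;
    assert_eq!(after_write, 1220);

    // Evicting range3 frees its kv data (178 + 28 bytes) plus two nodes.
    let after_evict = after_write - 178 - 28 - 2 * NODE_OVERHEAD;
    assert_eq!(after_evict, 822);
}
```
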
- assert_eq!(1220, memory_controller.mem_usage()); - - let snap1 = engine.snapshot(range1.clone(), 1000, 1010).unwrap(); - assert_eq!(snap1.get_value(b"kk01").unwrap().unwrap(), &val1); - let snap2 = engine.snapshot(range2.clone(), 1000, 1010).unwrap(); - assert_eq!(snap2.get_value(b"kk11").unwrap().unwrap(), &val1); - - assert_eq!( - engine.snapshot(range3.clone(), 1000, 1000).unwrap_err(), - FailedReason::NotCached - ); - - let snap4 = engine.snapshot(range4.clone(), 1000, 1010).unwrap(); - assert_eq!(snap4.get_value(b"kk32").unwrap().unwrap(), &val3); - - assert_eq!( - engine.snapshot(range5.clone(), 1000, 1010).unwrap_err(), - FailedReason::NotCached - ); - - // For range3, one write is buffered but others is rejected, so the range3 is - // evicted and the keys of it are deleted. After flush the epoch, we should - // get 1220-178-28(kv)-96*2(node overhead) = 822 memory usage. - flush_epoch(); - wait_evict_done(&engine); - assert_eq!(822, memory_controller.mem_usage()); - - drop(snap1); - engine.evict_range(&range1); - flush_epoch(); - wait_evict_done(&engine); - assert_eq!(548, memory_controller.mem_usage()); - } - - #[test] - fn test_write_batch_with_config_change() { - let mut config = RangeCacheEngineConfig::default(); - config.soft_limit_threshold = Some(ReadableSize(u64::MAX)); - config.hard_limit_threshold = Some(ReadableSize(u64::MAX)); - config.enabled = true; - let config = Arc::new(VersionTrack::new(config)); - let engine = - RangeCacheMemoryEngine::new(RangeCacheEngineContext::new_for_tests(config.clone())); - let r1 = CacheRange::new(b"kk00".to_vec(), b"kk10".to_vec()); - let r2 = CacheRange::new(b"kk10".to_vec(), b"kk20".to_vec()); - for r in [&r1, &r2] { - engine.new_range(r.clone()); - { - let mut core = engine.core.write(); - core.mut_range_manager().set_safe_point(r, 10); - } - let _ = engine.snapshot(r.clone(), 1000, 1000).unwrap(); - } - - let val1: Vec = (0..150).map(|_| 0).collect(); - let mut wb = RangeCacheWriteBatch::from(&engine); - wb.prepare_for_range(r2.clone()); - wb.put(b"kk11", &val1).unwrap(); - let snap1 = engine.snapshot(r1.clone(), 1000, 1000).unwrap(); - - // disable the range cache - let mut config_manager = RangeCacheConfigManager(config.clone()); - let mut config_change = ConfigChange::new(); - config_change.insert(String::from("enabled"), ConfigValue::Bool(false)); - config_manager.dispatch(config_change).unwrap(); - - wb.write_impl(1000).unwrap(); - // existing snapshot can still work after the range cache is disabled, but new - // snapshot will fail to create - assert!(snap1.get_value(b"kk00").unwrap().is_none()); - - let mut wb = RangeCacheWriteBatch::from(&engine); - wb.prepare_for_range(r1.clone()); - // put should trigger the evict and it won't write into range cache - wb.put(b"kk01", &val1).unwrap(); - wb.write_impl(1000).unwrap(); - - // new snapshot will fail to create as it's evicted already - let snap1 = engine.snapshot(r1.clone(), 1000, 1000); - assert_eq!(snap1.unwrap_err(), FailedReason::NotCached); - let snap2 = engine.snapshot(r2.clone(), 1000, 1000).unwrap(); - // if no new write, the range cache can still be used. 
- assert_eq!(snap2.get_value(b"kk11").unwrap().unwrap(), &val1); - - // enable the range cache again - let mut config_manager = RangeCacheConfigManager(config.clone()); - let mut config_change = ConfigChange::new(); - config_change.insert(String::from("enabled"), ConfigValue::Bool(true)); - config_manager.dispatch(config_change).unwrap(); - - let snap1 = engine.snapshot(r1.clone(), 1000, 1000); - assert_eq!(snap1.unwrap_err(), FailedReason::NotCached); - let snap2 = engine.snapshot(r2.clone(), 1000, 1000).unwrap(); - assert_eq!(snap2.get_value(b"kk11").unwrap().unwrap(), &val1); - } -} diff --git a/components/range_cache_memory_engine/tests/failpoints/test_memory_engine.rs b/components/range_cache_memory_engine/tests/failpoints/test_memory_engine.rs deleted file mode 100644 index f43e600cdca..00000000000 --- a/components/range_cache_memory_engine/tests/failpoints/test_memory_engine.rs +++ /dev/null @@ -1,499 +0,0 @@ -// Copyright 2024 TiKV Project Authors. Licensed under Apache-2.0. - -use std::{ - sync::{mpsc::sync_channel, Arc}, - time::Duration, -}; - -use crossbeam::epoch; -use engine_rocks::util::new_engine; -use engine_traits::{ - CacheRange, Mutable, RangeCacheEngine, WriteBatch, WriteBatchExt, CF_DEFAULT, CF_LOCK, - CF_WRITE, DATA_CFS, -}; -use range_cache_memory_engine::{ - decode_key, encode_key_for_boundary_without_mvcc, encoding_for_filter, test_util::put_data, - BackgroundTask, InternalBytes, InternalKey, RangeCacheEngineConfig, RangeCacheEngineContext, - RangeCacheMemoryEngine, SkiplistHandle, ValueType, -}; -use tempfile::Builder; -use tikv_util::config::{ReadableDuration, ReadableSize, VersionTrack}; -use txn_types::{Key, TimeStamp}; - -#[test] -fn test_set_disk_engine() { - let (tx, rx) = sync_channel(0); - fail::cfg_callback("in_memory_engine_set_rocks_engine", move || { - let _ = tx.send(true); - }) - .unwrap(); - let mut engine = RangeCacheMemoryEngine::new(RangeCacheEngineContext::new_for_tests(Arc::new( - VersionTrack::new(RangeCacheEngineConfig::config_for_test()), - ))); - let path = Builder::new() - .prefix("test_set_disk_engine") - .tempdir() - .unwrap(); - let path_str = path.path().to_str().unwrap(); - let rocks_engine = new_engine(path_str, DATA_CFS).unwrap(); - engine.set_disk_engine(rocks_engine.clone()); - rx.recv_timeout(Duration::from_secs(5)).unwrap(); -} - -// We should not use skiplist.get directly as we only cares keys without -// sequence number suffix -fn key_exist(sl: &SkiplistHandle, key: &InternalBytes, guard: &epoch::Guard) -> bool { - let mut iter = sl.iterator(); - iter.seek(key, guard); - if iter.valid() && iter.key().same_user_key_with(key) { - return true; - } - false -} - -#[test] -fn test_gc_worker() { - let mut config = RangeCacheEngineConfig::config_for_test(); - config.gc_interval = ReadableDuration(Duration::from_secs(1)); - let engine = RangeCacheMemoryEngine::new(RangeCacheEngineContext::new_for_tests(Arc::new( - VersionTrack::new(config), - ))); - let memory_controller = engine.memory_controller(); - let (write, default) = { - let mut core = engine.core().write(); - core.mut_range_manager() - .new_range(CacheRange::new(b"".to_vec(), b"z".to_vec())); - let engine = core.engine(); - (engine.cf_handle(CF_WRITE), engine.cf_handle(CF_DEFAULT)) - }; - - fail::cfg("in_memory_engine_gc_oldest_seqno", "return(1000)").unwrap(); - - let (tx, rx) = sync_channel(0); - fail::cfg_callback("in_memory_engine_gc_finish", move || { - tx.send(true).unwrap(); - }) - .unwrap(); - - let start_ts = TimeStamp::physical_now() - 
Duration::from_secs(10).as_millis() as u64; - let commit_ts1 = TimeStamp::physical_now() - Duration::from_secs(9).as_millis() as u64; - put_data( - b"k", - b"v1", - start_ts, - commit_ts1, - 100, - false, - &default, - &write, - memory_controller.clone(), - ); - - let start_ts = TimeStamp::physical_now() - Duration::from_secs(8).as_millis() as u64; - let commit_ts2 = TimeStamp::physical_now() - Duration::from_secs(7).as_millis() as u64; - put_data( - b"k", - b"v2", - start_ts, - commit_ts2, - 110, - false, - &default, - &write, - memory_controller.clone(), - ); - - let start_ts = TimeStamp::physical_now() - Duration::from_secs(6).as_millis() as u64; - let commit_ts3 = TimeStamp::physical_now() - Duration::from_secs(5).as_millis() as u64; - put_data( - b"k", - b"v3", - start_ts, - commit_ts3, - 110, - false, - &default, - &write, - memory_controller.clone(), - ); - - let start_ts = TimeStamp::physical_now() - Duration::from_secs(4).as_millis() as u64; - let commit_ts4 = TimeStamp::physical_now() - Duration::from_secs(3).as_millis() as u64; - put_data( - b"k", - b"v4", - start_ts, - commit_ts4, - 110, - false, - &default, - &write, - memory_controller.clone(), - ); - - let guard = &epoch::pin(); - for &ts in &[commit_ts1, commit_ts2, commit_ts3] { - let key = Key::from_raw(b"k"); - let key = encoding_for_filter(key.as_encoded(), TimeStamp::new(ts)); - - assert!(key_exist(&write, &key, guard)); - } - - let _ = rx.recv_timeout(Duration::from_secs(5)).unwrap(); - - let key = Key::from_raw(b"k"); - // now, the outdated mvcc versions should be gone - for &ts in &[commit_ts1, commit_ts2, commit_ts3] { - let key = encoding_for_filter(key.as_encoded(), TimeStamp::new(ts)); - assert!(!key_exist(&write, &key, guard)); - } - - let key = encoding_for_filter(key.as_encoded(), TimeStamp::new(commit_ts4)); - assert!(key_exist(&write, &key, guard)); -} - -#[test] -fn test_clean_up_tombstone() { - let config = Arc::new(VersionTrack::new(RangeCacheEngineConfig::config_for_test())); - let engine = - RangeCacheMemoryEngine::new(RangeCacheEngineContext::new_for_tests(config.clone())); - let range = CacheRange::new(b"".to_vec(), b"z".to_vec()); - - let (tx, rx) = sync_channel(0); - fail::cfg_callback("clean_lock_tombstone_done", move || { - tx.send(true).unwrap(); - }) - .unwrap(); - - engine.new_range(range.clone()); - let mut wb = engine.write_batch(); - wb.prepare_for_range(range.clone()); - wb.put_cf("lock", b"k", b"val").unwrap(); - wb.put_cf("lock", b"k1", b"val").unwrap(); - wb.put_cf("lock", b"k2", b"val").unwrap(); - wb.delete_cf("lock", b"k").unwrap(); - wb.delete_cf("lock", b"k1").unwrap(); - wb.delete_cf("lock", b"k2").unwrap(); - wb.put_cf("lock", b"k", b"val2").unwrap(); // seq 107 - wb.set_sequence_number(100).unwrap(); - wb.write().unwrap(); - - let mut wb = engine.write_batch(); - wb.prepare_for_range(range.clone()); - wb.put_cf("lock", b"k", b"val").unwrap(); // seq 120 - wb.put_cf("lock", b"k1", b"val").unwrap(); // seq 121 - wb.put_cf("lock", b"k2", b"val").unwrap(); // seq 122 - wb.delete_cf("lock", b"k").unwrap(); // seq 123 - wb.delete_cf("lock", b"k1").unwrap(); // seq 124 - wb.delete_cf("lock", b"k2").unwrap(); // seq 125 - wb.set_sequence_number(120).unwrap(); - wb.write().unwrap(); - - let lock_handle = engine.core().read().engine().cf_handle("lock"); - assert_eq!(lock_handle.len(), 13); - - engine - .bg_worker_manager() - .schedule_task(BackgroundTask::CleanLockTombstone(107)) - .unwrap(); - - rx.recv_timeout(Duration::from_secs(5)).unwrap(); - - let mut iter = 
engine.core().read().engine().cf_handle("lock").iterator(); - - let mut first = true; - let guard = &epoch::pin(); - for (k, seq, ty) in [ - (b"k".to_vec(), 123, ValueType::Deletion), - (b"k".to_vec(), 120, ValueType::Value), - (b"k".to_vec(), 106, ValueType::Value), - (b"k1".to_vec(), 124, ValueType::Deletion), - (b"k1".to_vec(), 121, ValueType::Value), - (b"k2".to_vec(), 125, ValueType::Deletion), - (b"k2".to_vec(), 122, ValueType::Value), - ] { - if first { - iter.seek_to_first(guard); - first = false; - } else { - iter.next(guard); - } - - let key = iter.key(); - let InternalKey { - user_key, - sequence, - v_type, - } = decode_key(key.as_bytes()); - assert_eq!(sequence, seq); - assert_eq!(user_key, &k); - assert_eq!(v_type, ty); - } -} - -#[test] -fn test_evict_with_loading_range() { - let path = Builder::new().prefix("test").tempdir().unwrap(); - let path_str = path.path().to_str().unwrap(); - let rocks_engine = new_engine(path_str, DATA_CFS).unwrap(); - - let config = RangeCacheEngineConfig::config_for_test(); - let mut engine = RangeCacheMemoryEngine::new(RangeCacheEngineContext::new_for_tests(Arc::new( - VersionTrack::new(config), - ))); - engine.set_disk_engine(rocks_engine); - - let range1 = CacheRange::new(b"k00".to_vec(), b"k10".to_vec()); - let range2 = CacheRange::new(b"k20".to_vec(), b"k30".to_vec()); - let range3 = CacheRange::new(b"k40".to_vec(), b"k50".to_vec()); - let (snapshot_load_tx, snapshot_load_rx) = sync_channel(0); - - // range1 and range2 will be evicted - let r = CacheRange::new(b"k05".to_vec(), b"k25".to_vec()); - let engine_clone = engine.clone(); - fail::cfg_callback("on_snapshot_load_finished", move || { - let _ = snapshot_load_tx.send(true); - engine_clone.evict_range(&r); - }) - .unwrap(); - - let (loading_complete_tx, loading_complete_rx) = sync_channel(0); - fail::cfg_callback("pending_range_completes_loading", move || { - let _ = loading_complete_tx.send(true); - }) - .unwrap(); - - engine.load_range(range1.clone()).unwrap(); - engine.load_range(range2.clone()).unwrap(); - engine.load_range(range3.clone()).unwrap(); - - let mut wb = engine.write_batch(); - // prepare range to trigger loading - wb.prepare_for_range(range1.clone()); - wb.prepare_for_range(range2.clone()); - wb.prepare_for_range(range3.clone()); - wb.set_sequence_number(10).unwrap(); - wb.write().unwrap(); - - snapshot_load_rx - .recv_timeout(Duration::from_secs(5)) - .unwrap(); - snapshot_load_rx - .recv_timeout(Duration::from_secs(5)) - .unwrap(); - - loading_complete_rx - .recv_timeout(Duration::from_secs(5)) - .unwrap(); - - engine.snapshot(range1, 100, 100).unwrap_err(); - engine.snapshot(range2, 100, 100).unwrap_err(); - engine.snapshot(range3, 100, 100).unwrap(); -} - -#[test] -fn test_cached_write_batch_cleared_when_load_failed() { - let path = Builder::new().prefix("test").tempdir().unwrap(); - let path_str = path.path().to_str().unwrap(); - let rocks_engine = new_engine(path_str, DATA_CFS).unwrap(); - - let mut config = RangeCacheEngineConfig::config_for_test(); - config.soft_limit_threshold = Some(ReadableSize(20)); - config.hard_limit_threshold = Some(ReadableSize(40)); - let config = Arc::new(VersionTrack::new(config)); - let mut engine = - RangeCacheMemoryEngine::new(RangeCacheEngineContext::new_for_tests(config.clone())); - engine.set_disk_engine(rocks_engine); - - let (tx, rx) = sync_channel(0); - fail::cfg_callback("on_snapshot_load_finished", move || { - let _ = tx.send(true); - }) - .unwrap(); - - fail::cfg("on_snapshot_load_finished2", "pause").unwrap(); - - // range1 
will be canceled in on_snapshot_load_finished whereas range2 will be - // canceled at begin - let range1 = CacheRange::new(b"k00".to_vec(), b"k10".to_vec()); - let range2 = CacheRange::new(b"k20".to_vec(), b"k30".to_vec()); - engine.load_range(range1.clone()).unwrap(); - engine.load_range(range2.clone()).unwrap(); - - let mut wb = engine.write_batch(); - // range1 starts to load - wb.prepare_for_range(range1.clone()); - rx.recv_timeout(Duration::from_secs(5)).unwrap(); - - wb.put(b"k05", b"val").unwrap(); - wb.put(b"k06", b"val").unwrap(); - wb.prepare_for_range(range2.clone()); - wb.put(b"k25", b"val").unwrap(); - wb.set_sequence_number(100).unwrap(); - wb.write().unwrap(); - - fail::remove("on_snapshot_load_finished2"); - - let mut tried = 0; - while tried < 20 { - if !engine.core().read().has_cached_write_batch(&range1) - && !engine.core().read().has_cached_write_batch(&range2) - { - return; - } - std::thread::sleep(Duration::from_millis(100)); - tried += 1; - } - panic!("write batches are not cleared"); -} - -#[test] -fn test_concurrency_between_delete_range_and_write_to_memory() { - let path = Builder::new().prefix("test").tempdir().unwrap(); - let path_str = path.path().to_str().unwrap(); - let rocks_engine = new_engine(path_str, DATA_CFS).unwrap(); - let mut wb = rocks_engine.write_batch(); - wb.put_cf(CF_LOCK, b"k40", b"val").unwrap(); - wb.put_cf(CF_LOCK, b"k41", b"val").unwrap(); - wb.put_cf(CF_LOCK, b"k42", b"val").unwrap(); - wb.write().unwrap(); - - let config = RangeCacheEngineConfig::config_for_test(); - let mut engine = RangeCacheMemoryEngine::new(RangeCacheEngineContext::new_for_tests(Arc::new( - VersionTrack::new(config), - ))); - engine.set_disk_engine(rocks_engine); - - let range1 = CacheRange::new(b"k00".to_vec(), b"k10".to_vec()); - let range2 = CacheRange::new(b"k20".to_vec(), b"k30".to_vec()); - let range3 = CacheRange::new(b"k40".to_vec(), b"k50".to_vec()); - let (snapshot_load_cancel_tx, snapshot_load_cancel_rx) = sync_channel(0); - fail::cfg_callback("in_memory_engine_snapshot_load_canceled", move || { - let _ = snapshot_load_cancel_tx.send(true); - }) - .unwrap(); - let (snapshot_load_tx, snapshot_load_rx) = sync_channel(0); - fail::cfg_callback("on_snapshot_load_finished", move || { - let _ = snapshot_load_tx.send(true); - }) - .unwrap(); - fail::cfg("before_clear_ranges_in_being_written", "pause").unwrap(); - - let (write_batch_consume_tx, write_batch_consume_rx) = sync_channel(0); - fail::cfg_callback("in_memory_engine_write_batch_consumed", move || { - let _ = write_batch_consume_tx.send(true); - }) - .unwrap(); - - let (delete_range_tx, delete_range_rx) = sync_channel(0); - fail::cfg_callback("in_memory_engine_delete_range_done", move || { - let _ = delete_range_tx.send(true); - }) - .unwrap(); - - engine.new_range(range1.clone()); - engine.new_range(range2.clone()); - engine.load_range(range3.clone()).unwrap(); - - let engine_clone = engine.clone(); - let (range_prepared_tx, range_prepared_rx) = sync_channel(0); - let range1_clone = range1.clone(); - let range2_clone = range2.clone(); - let range3_clone = range3.clone(); - let handle = std::thread::spawn(move || { - let mut wb = engine_clone.write_batch(); - wb.prepare_for_range(range1_clone); - wb.put_cf(CF_LOCK, b"k02", b"val").unwrap(); - wb.put_cf(CF_LOCK, b"k03", b"val").unwrap(); - wb.put_cf(CF_LOCK, b"k04", b"val").unwrap(); - wb.set_sequence_number(100).unwrap(); - - let mut wb2 = engine_clone.write_batch(); - wb2.prepare_for_range(range2_clone); - wb.put_cf(CF_LOCK, b"k22", b"val").unwrap(); - 
wb.put_cf(CF_LOCK, b"k23", b"val").unwrap(); - wb2.set_sequence_number(200).unwrap(); - - let mut wb3 = engine_clone.write_batch(); - wb3.prepare_for_range(range3_clone); - wb3.set_sequence_number(300).unwrap(); - - range_prepared_tx.send(true).unwrap(); - - wb.write().unwrap(); - wb2.write().unwrap(); - wb3.write().unwrap(); - }); - - range_prepared_rx - .recv_timeout(Duration::from_secs(5)) - .unwrap(); - // Now, three ranges are in write status, delete range will not be performed - // until they leave the write status - - engine.evict_range(&range1); - engine.evict_range(&range2); - - let verify_data = |range, expected_num: u64| { - let handle = engine.core().write().engine().cf_handle(CF_LOCK); - let (start, end) = encode_key_for_boundary_without_mvcc(range); - let mut iter = handle.iterator(); - let guard = &epoch::pin(); - let mut count = 0; - iter.seek(&start, guard); - while iter.valid() && iter.key() < &end { - count += 1; - iter.next(guard); - } - assert_eq!(count, expected_num); - }; - - write_batch_consume_rx - .recv_timeout(Duration::from_secs(5)) - .unwrap(); - // Now, a DeleteRange task has been done: actually, the task will be delayed, so - // the data has not be deleted - verify_data(&range1, 3); - // remove failpoint so that the range can leave write status - fail::remove("before_clear_ranges_in_being_written"); - delete_range_rx - .recv_timeout(Duration::from_secs(5)) - .unwrap(); - // Now, data should be deleted - verify_data(&range1, 0); - - // Next to test range2 - fail::cfg("before_clear_ranges_in_being_written", "pause").unwrap(); - write_batch_consume_rx - .recv_timeout(Duration::from_secs(5)) - .unwrap(); - verify_data(&range2, 2); - // remove failpoint so that the range can leave write status - fail::remove("before_clear_ranges_in_being_written"); - delete_range_rx - .recv_timeout(Duration::from_secs(5)) - .unwrap(); - verify_data(&range2, 0); - - // ensure the range enters on_snapshot_load_finished before eviction - snapshot_load_rx - .recv_timeout(Duration::from_secs(5)) - .unwrap(); - engine.evict_range(&range3); - - fail::cfg("before_clear_ranges_in_being_written", "pause").unwrap(); - write_batch_consume_rx - .recv_timeout(Duration::from_secs(5)) - .unwrap(); - verify_data(&range3, 3); - snapshot_load_cancel_rx - .recv_timeout(Duration::from_secs(5)) - .unwrap(); - fail::remove("before_clear_ranges_in_being_written"); - delete_range_rx - .recv_timeout(Duration::from_secs(5)) - .unwrap(); - verify_data(&range3, 0); - - let _ = handle.join(); -} diff --git a/components/resolved_ts/Cargo.toml b/components/resolved_ts/Cargo.toml index 68abc12507d..94963f5ba3b 100644 --- a/components/resolved_ts/Cargo.toml +++ b/components/resolved_ts/Cargo.toml @@ -26,12 +26,10 @@ test-engines-panic = ["tikv/test-engines-panic"] [dependencies] collections = { workspace = true } concurrency_manager = { workspace = true } -crossbeam = { workspace = true } engine_traits = { workspace = true } fail = "0.5" futures = "0.3" grpcio = { workspace = true } -hex = "0.4" kvproto = { workspace = true } lazy_static = "1.3" log_wrappers = { workspace = true } @@ -52,7 +50,6 @@ txn_types = { workspace = true } [dev-dependencies] engine_rocks = { workspace = true } -panic_hook = { workspace = true } tempfile = "3.0" test_raftstore = { workspace = true } test_sst_importer = { workspace = true } diff --git a/components/resolved_ts/src/cmd.rs b/components/resolved_ts/src/cmd.rs index 2195c086974..7a848381269 100644 --- a/components/resolved_ts/src/cmd.rs +++ b/components/resolved_ts/src/cmd.rs @@ 
-20,6 +20,7 @@ pub enum ChangeRow {
         start_ts: TimeStamp,
         lock_type: LockType,
         value: Option<Value>,
+        generation: u64,
     },
     Commit {
         key: Key,
@@ -117,6 +118,7 @@ impl ChangeLog {
                         short_value,
                         ts,
                         lock_type,
+                        generation,
                         ..
                     } = lock;
                     let value = default.map_or(short_value, |v| Some(v.into_put().1));
@@ -125,6 +127,7 @@ impl ChangeLog {
                         start_ts: ts,
                         lock_type,
                         value,
+                        generation,
                     }
                 }),
                 (None, Some(KeyOp::Delete), _) => Some(ChangeRow::Commit {
@@ -395,6 +398,7 @@ mod tests {
                 start_ts: 1.into(),
                 value: Some(b"v1".to_vec()),
                 lock_type: LockType::Put,
+                generation: 0,
             },
             ChangeRow::Commit {
                 key: k1.clone(),
@@ -407,6 +411,7 @@
                 start_ts: 3.into(),
                 value: Some(b"v2".to_vec()),
                 lock_type: LockType::Put,
+                generation: 0,
             },
             ChangeRow::Commit {
                 key: k1.clone(),
@@ -419,6 +424,7 @@
                 start_ts: 4.into(),
                 value: Some(vec![b'v'; 512]),
                 lock_type: LockType::Put,
+                generation: 0,
             },
             ChangeRow::Commit {
                 key: k1.clone(),
@@ -431,6 +437,7 @@
                 start_ts: 5.into(),
                 value: Some(b"v3".to_vec()),
                 lock_type: LockType::Put,
+                generation: 0,
             },
             ChangeRow::Commit {
                 key: k1.clone(),
@@ -443,6 +450,7 @@
                 start_ts: 6.into(),
                 value: Some(b"v4".to_vec()),
                 lock_type: LockType::Put,
+                generation: 0,
             },
             ChangeRow::Commit {
                 key: k1.clone(),
@@ -455,6 +463,7 @@
                 start_ts: 7.into(),
                 value: Some(b"v5".to_vec()),
                 lock_type: LockType::Put,
+                generation: 0,
             },
             // Rollback of the txn@start_ts=7 will be missing as overlapped rollback is not
             // handled.
diff --git a/components/resolved_ts/src/endpoint.rs b/components/resolved_ts/src/endpoint.rs
index dfbafc0531f..a02cdc8877e 100644
--- a/components/resolved_ts/src/endpoint.rs
+++ b/components/resolved_ts/src/endpoint.rs
@@ -27,7 +27,7 @@ use raftstore::{
     },
 };
 use security::SecurityManager;
-use tikv::config::ResolvedTsConfig;
+use tikv::{config::ResolvedTsConfig, storage::txn::txn_status_cache::TxnStatusCache};
 use tikv_util::{
     memory::{HeapSize, MemoryQuota},
     warn,
@@ -45,7 +45,7 @@ use crate::{
     Error, Result, TsSource, TxnLocks, ON_DROP_WARN_HEAP_SIZE,
 };
 
-/// grace period for identifying identifying slow resolved-ts and safe-ts.
+/// grace period for identifying slow resolved-ts and safe-ts.
 const SLOW_LOG_GRACE_PERIOD_MS: u64 = 1000;
 const MEMORY_QUOTA_EXCEEDED_BACKOFF: Duration = Duration::from_secs(30);
@@ -157,6 +157,7 @@ enum PendingLock {
     Track {
         key: Key,
         start_ts: TimeStamp,
+        generation: u64,
     },
     Untrack {
         key: Key,
@@ -193,9 +194,15 @@ impl ObserveRegion {
         rrp: Arc<RegionReadProgress>,
         memory_quota: Arc<MemoryQuota>,
         cancelled: Sender<()>,
+        txn_status_cache: Arc<TxnStatusCache>,
     ) -> Self {
         ObserveRegion {
-            resolver: Resolver::with_read_progress(meta.id, Some(rrp), memory_quota.clone()),
+            resolver: Resolver::with_read_progress(
+                meta.id,
+                Some(rrp),
+                memory_quota.clone(),
+                txn_status_cache,
+            ),
             meta,
             handle: ObserveHandle::new(),
             resolver_status: ResolverStatus::Pending {
@@ -235,9 +242,15 @@ impl ObserveRegion {
             ChangeLog::Rows { rows, index } => {
                 for row in rows {
                     let lock = match row {
-                        ChangeRow::Prewrite { key, start_ts, .. } => PendingLock::Track {
+                        ChangeRow::Prewrite {
+                            key,
+                            start_ts,
+                            generation,
+                            ..
+                        } => PendingLock::Track {
                             key: key.clone(),
                             start_ts: *start_ts,
+                            generation: *generation,
                         },
                         ChangeRow::Commit {
                             key,
@@ -296,17 +309,23 @@ impl ObserveRegion {
             ChangeLog::Rows { rows, index } => {
                 for row in rows {
                     match row {
-                        ChangeRow::Prewrite { key, start_ts, .. } => {
+                        ChangeRow::Prewrite {
+                            key,
+                            start_ts,
+                            generation,
+                            ..
+                        } => {
                             self.resolver.track_lock(
                                 *start_ts,
                                 key.to_raw().unwrap(),
                                 Some(*index),
+                                *generation,
                             )?;
                         }
                         ChangeRow::Commit { key, .. } => self
                             .resolver
                             .untrack_lock(&key.to_raw().unwrap(), Some(*index)),
-                        // One pc command do not contains any lock, so just skip it
+                        // One PC command does not contain any lock, so just skip it
                         ChangeRow::OnePc { .. } => {
                             self.resolver.update_tracked_index(*index);
                         }
@@ -330,8 +349,12 @@ impl ObserveRegion {
                     panic!("region {:?} resolver has ready", self.meta.id)
                 }
                 for (key, lock) in locks {
-                    self.resolver
-                        .track_lock(lock.ts, key.to_raw().unwrap(), Some(apply_index))?;
+                    self.resolver.track_lock(
+                        lock.ts,
+                        key.to_raw().unwrap(),
+                        Some(apply_index),
+                        lock.generation,
+                    )?;
                 }
             }
             ScanEntries::None => {
@@ -343,11 +366,16 @@ impl ObserveRegion {
                 resolver_status.drain_pending_locks(self.meta.id);
                 for lock in pending_locks {
                     match lock {
-                        PendingLock::Track { key, start_ts } => {
+                        PendingLock::Track {
+                            key,
+                            start_ts,
+                            generation,
+                        } => {
                             self.resolver.track_lock(
                                 start_ts,
                                 key.to_raw().unwrap(),
                                 Some(pending_tracked_index),
+                                generation,
                             )?;
                         }
                         PendingLock::Untrack { key, .. } => self
@@ -380,6 +408,7 @@ pub struct Endpoint {
     scan_concurrency_semaphore: Arc<Semaphore>,
     scheduler: Scheduler<Task>,
     advance_worker: AdvanceTsWorker,
+    txn_status_cache: Arc<TxnStatusCache>,
     _phantom: PhantomData<(T, E)>,
 }
@@ -397,6 +426,7 @@ where
         let store_id = self.get_or_init_store_id();
         let mut stats = Stats::default();
+        let now = self.approximate_now_tso();
         self.region_read_progress.with(|registry| {
             for (region_id, read_progress) in registry {
                 let (leader_info, leader_store_id) = read_progress.dump_leader_info();
@@ -419,6 +449,9 @@ where
                     }
                 } else {
                     // follower safe-ts
+                    RTS_MIN_FOLLOWER_SAFE_TS_GAP_HISTOGRAM
+                        .observe(now.saturating_sub(TimeStamp::from(safe_ts).physical()) as f64);
+
                     if safe_ts > 0 && safe_ts < stats.min_follower_safe_ts.safe_ts {
                         stats.min_follower_safe_ts.set(*region_id, &core);
                     }
@@ -655,6 +688,7 @@ where
         concurrency_manager: ConcurrencyManager,
         env: Arc<Environment>,
         security_mgr: Arc<SecurityManager>,
+        txn_status_cache: Arc<TxnStatusCache>,
     ) -> Self {
         let (region_read_progress, store_id) = {
             let meta = store_meta.lock().unwrap();
@@ -685,6 +719,7 @@ where
             scanner_pool,
             scan_concurrency_semaphore,
             regions: HashMap::default(),
+            txn_status_cache,
             _phantom: PhantomData,
         };
         ep.handle_advance_resolved_ts(leader_resolver);
@@ -705,6 +740,7 @@ where
             read_progress,
             self.memory_quota.clone(),
             cancelled_tx,
+            self.txn_status_cache.clone(),
         );
         let observe_handle = observe_region.handle.clone();
         observe_region
@@ -1225,7 +1261,7 @@ impl LeaderStats {
             last_resolve_attempt: resolver.as_mut().and_then(|r| r.take_last_attempt()),
             min_lock: resolver
                 .as_ref()
-                .and_then(|r| r.oldest_transaction().map(|(t, tk)| (*t, tk.clone()))),
+                .and_then(|r| r.oldest_transaction().map(|(t, tk)| (t, tk.clone()))),
             applied_index: region_read_progress.applied_index(),
             lock_num: resolver.as_ref().map(|r| r.num_locks()),
             txn_num: resolver.as_ref().map(|r| r.num_transactions()),
diff --git a/components/resolved_ts/src/metrics.rs b/components/resolved_ts/src/metrics.rs
index fb751491d10..ed258580366 100644
--- a/components/resolved_ts/src/metrics.rs
+++ b/components/resolved_ts/src/metrics.rs
@@ -115,6 +115,12 @@ lazy_static! {
         &["type"]
     )
     .unwrap();
+    pub static ref RTS_MIN_FOLLOWER_SAFE_TS_GAP_HISTOGRAM: Histogram = register_histogram!(
+        "tikv_resolved_ts_min_follower_safe_ts_gap_millis_histogram",
+        "Bucketed histogram of the gap between now() and the minimal (non-zero) safe ts for followers",
+        exponential_buckets(50.0, 2.0, 20).unwrap(),
+    )
+    .unwrap();
     pub static ref RTS_CHECK_LEADER_DURATION_HISTOGRAM_VEC: HistogramVec = register_histogram_vec!(
         "tikv_resolved_ts_check_leader_duration_seconds",
         "Bucketed histogram of resolved-ts check leader duration",
diff --git a/components/resolved_ts/src/resolver.rs b/components/resolved_ts/src/resolver.rs
index 2aec9c336cd..7c3f1d74976 100644
--- a/components/resolved_ts/src/resolver.rs
+++ b/components/resolved_ts/src/resolver.rs
@@ -2,8 +2,9 @@
 use std::{cmp, collections::BTreeMap, sync::Arc, time::Duration};
 
-use collections::{HashMap, HashMapEntry};
+use collections::{HashMap, HashMapEntry, HashSet};
 use raftstore::store::RegionReadProgress;
+use tikv::storage::txn::txn_status_cache::{TxnState, TxnStatusCache};
 use tikv_util::{
     memory::{MemoryQuota, MemoryQuotaExceeded},
     time::Instant,
@@ -80,6 +81,13 @@ pub struct Resolver {
     locks_by_key: HashMap<Arc<[u8]>, TimeStamp>,
     // start_ts -> locked keys.
     lock_ts_heap: BTreeMap<TimeStamp, TxnLocks>,
+    // the start_ts of large transactions, which use a different tracking strategy from
+    // normal transactions.
+    large_txn_ts: HashSet<TimeStamp>,
+    // each large transaction tracked by this resolver has a representative key tracked, so that
+    // when the large transaction is rolled back, we can rely on this key to guarantee that
+    // eventually there will be no orphaned transactions.
+    large_txn_key_representative: HashMap<Vec<u8>, TimeStamp>,
     // The last shrink time.
     last_aggressive_shrink_time: Instant,
     // The timestamps that guarantee no more commit will happen before.
@@ -96,6 +104,7 @@ pub struct Resolver {
     memory_quota: Arc<MemoryQuota>,
     // The last attempt of resolve(), used for diagnosis.
     last_attempt: Option<LastAttempt>,
+    txn_status_cache: Arc<TxnStatusCache>,
 }
 
 #[derive(Clone)]
@@ -169,20 +178,37 @@ impl Drop for Resolver {
 }
 
 impl Resolver {
-    pub fn new(region_id: u64, memory_quota: Arc<MemoryQuota>) -> Resolver {
-        Resolver::with_read_progress(region_id, None, memory_quota)
+    pub fn new(
+        region_id: u64,
+        memory_quota: Arc<MemoryQuota>,
+        txn_status_cache: Arc<TxnStatusCache>,
+    ) -> Resolver {
+        Resolver::with_read_progress(region_id, None, memory_quota, txn_status_cache)
+    }
+
+    #[cfg(test)]
+    fn new_for_test(region_id: u64, memory_quota: Arc<MemoryQuota>) -> Resolver {
+        Resolver::with_read_progress(
+            region_id,
+            None,
+            memory_quota,
+            Arc::new(TxnStatusCache::new_for_test()),
+        )
     }
 
     pub fn with_read_progress(
         region_id: u64,
         read_progress: Option<Arc<RegionReadProgress>>,
         memory_quota: Arc<MemoryQuota>,
+        txn_status_cache: Arc<TxnStatusCache>,
     ) -> Resolver {
         Resolver {
             region_id,
             resolved_ts: TimeStamp::zero(),
             locks_by_key: HashMap::default(),
             lock_ts_heap: BTreeMap::new(),
+            large_txn_ts: HashSet::<TimeStamp>::default(),
+            large_txn_key_representative: HashMap::<Vec<u8>, TimeStamp>::default(),
             last_aggressive_shrink_time: Instant::now_coarse(),
             read_progress,
             tracked_index: 0,
@@ -190,6 +216,7 @@ impl Resolver {
             stopped: false,
             memory_quota,
             last_attempt: None,
+            txn_status_cache,
         }
     }
@@ -245,6 +272,12 @@ impl Resolver {
         self.locks_by_key.len() * (key_bytes / key_count + std::mem::size_of::<TimeStamp>())
             + self.lock_ts_heap.len()
                 * (std::mem::size_of::<TimeStamp>() + std::mem::size_of::<TxnLocks>())
+            + self
+                .large_txn_key_representative
+                .keys()
+                .map(|k| k.len() + std::mem::size_of::<TimeStamp>())
+                .sum::<usize>()
+            + self.large_txn_ts.len() * std::mem::size_of::<TimeStamp>()
     }
 
     fn lock_heap_size(&self, key: &[u8]) -> usize {
@@ -278,11 +311,12 @@ impl Resolver {
         start_ts: TimeStamp,
         key: Vec<u8>,
         index: Option<u64>,
+        generation: u64, /* generation is used to identify whether the lock is a pipelined
+                          * transaction's lock */
     ) -> Result<(), MemoryQuotaExceeded> {
         if let Some(index) = index {
             self.update_tracked_index(index);
         }
-        let bytes = self.lock_heap_size(&key);
         debug!(
             "track lock {}@{}",
             &log_wrappers::Value::key(&key),
@@ -290,8 +324,22 @@ impl Resolver {
             "region_id" => self.region_id,
             "memory_in_use" => self.memory_quota.in_use(),
             "memory_capacity" => self.memory_quota.capacity(),
-            "key_heap_size" => bytes,
+            "generation" => generation,
         );
+        if generation == 0 {
+            self.track_normal_lock(start_ts, key)?;
+        } else {
+            self.track_large_txn_lock(start_ts, key)?;
+        }
+        Ok(())
+    }
+
+    fn track_normal_lock(
+        &mut self,
+        start_ts: TimeStamp,
+        key: Vec<u8>,
+    ) -> Result<(), MemoryQuotaExceeded> {
+        let bytes = self.lock_heap_size(&key);
         self.memory_quota.alloc(bytes)?;
         let key: Arc<[u8]> = key.into_boxed_slice().into();
         match self.locks_by_key.entry(key) {
@@ -318,36 +366,43 @@ impl Resolver {
         if let Some(index) = index {
             self.update_tracked_index(index);
         }
-        let start_ts = if let Some(start_ts) = self.locks_by_key.remove(key) {
+        if let Some(start_ts) = self.locks_by_key.remove(key) {
             let bytes = self.lock_heap_size(key);
             self.memory_quota.free(bytes);
-            start_ts
+            debug!(
+                "untrack lock {}@{}",
+                &log_wrappers::Value::key(key),
+                start_ts;
+                "region_id" => self.region_id,
+                "memory_in_use" => self.memory_quota.in_use(),
+            );
+            if let Some(txn_locks) = self.lock_ts_heap.get_mut(&start_ts) {
+                if txn_locks.lock_count > 0 {
+                    txn_locks.lock_count -= 1;
+                }
+                if txn_locks.lock_count == 0 {
+                    self.lock_ts_heap.remove(&start_ts);
+                }
+            };
+            // Use a large ratio to amortize the cost of rehash.
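The `track_lock` dispatch above is the heart of the change: `generation == 0` keeps the old per-key, quota-charged bookkeeping, while a non-zero generation (a pipelined, "large" transaction) records only the transaction's start_ts plus one representative key. A standalone sketch of that routing, with simplified types standing in for the real `Resolver`:

```rust
use std::collections::{BTreeMap, HashMap, HashSet};

/// Toy model of the two tracking strategies. Field names mirror the patch;
/// everything else (types, quota handling) is deliberately simplified.
#[derive(Default)]
struct ToyResolver {
    // Normal locks: every key is tracked and charged individually.
    locks_by_key: HashMap<Vec<u8>, u64>,
    lock_ts_heap: BTreeMap<u64, usize>, // start_ts -> lock count
    // Large (pipelined) txns: constant-size state per transaction.
    large_txn_ts: HashSet<u64>,
    large_txn_key_representative: HashMap<Vec<u8>, u64>,
}

impl ToyResolver {
    fn track_lock(&mut self, start_ts: u64, key: Vec<u8>, generation: u64) {
        if generation == 0 {
            *self.lock_ts_heap.entry(start_ts).or_default() += 1;
            self.locks_by_key.insert(key, start_ts);
        } else if self.large_txn_ts.insert(start_ts) {
            // Only the first lock of a large txn leaves a representative key.
            self.large_txn_key_representative.insert(key, start_ts);
        }
    }
}

fn main() {
    let mut r = ToyResolver::default();
    r.track_lock(10, b"a".to_vec(), 0); // normal lock
    r.track_lock(20, b"b".to_vec(), 1); // large txn, first lock
    r.track_lock(20, b"c".to_vec(), 2); // same txn: no extra state kept
    assert_eq!(r.locks_by_key.len(), 1);
    assert_eq!(r.large_txn_ts.len(), 1);
    assert_eq!(r.large_txn_key_representative.len(), 1);
}
```
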
+ let shrink_ratio = 8; + self.shrink_ratio(shrink_ratio); + } else if let Some(start_ts) = self.large_txn_key_representative.remove(key) { + let is_new = self.large_txn_ts.remove(&start_ts); + debug_assert!(is_new, "large txn lock should be untracked only once"); + debug!( + "untrack lock {}@{}", + &log_wrappers::Value::key(key), + start_ts; + "region_id" => self.region_id, + "memory_in_use" => self.memory_quota.in_use(), + ); } else { - debug!("untrack a lock that was not tracked before"; + debug!("untrack a lock whose key is not tracked, should be from a pipelined transaction"; "key" => &log_wrappers::Value::key(key), "region_id" => self.region_id, ); - return; - }; - debug!( - "untrack lock {}@{}", - &log_wrappers::Value::key(key), - start_ts; - "region_id" => self.region_id, - "memory_in_use" => self.memory_quota.in_use(), - ); - - if let Some(txn_locks) = self.lock_ts_heap.get_mut(&start_ts) { - if txn_locks.lock_count > 0 { - txn_locks.lock_count -= 1; - } - if txn_locks.lock_count == 0 { - self.lock_ts_heap.remove(&start_ts); - } - }; - // Use a large ratio to amortize the cost of rehash. - let shrink_ratio = 8; - self.shrink_ratio(shrink_ratio); + } } /// Try to advance resolved ts. @@ -377,15 +432,13 @@ impl Resolver { // Find the min start ts. let min_lock = self.oldest_transaction(); let has_lock = min_lock.is_some(); - let min_start_ts = min_lock.as_ref().map(|(ts, _)| **ts).unwrap_or(min_ts); + let min_txn_ts = min_lock.as_ref().map(|(ts, _)| *ts).unwrap_or(min_ts); // No more commit happens before the ts. - let new_resolved_ts = cmp::min(min_start_ts, min_ts); + let new_resolved_ts = cmp::min(min_txn_ts, min_ts); // reason is the min source of the new resolved ts. let reason = match (min_lock, min_ts) { - (Some((lock_ts, txn_locks)), min_ts) if *lock_ts < min_ts => { - TsSource::Lock(txn_locks.clone()) - } + (Some((lock_ts, txn_locks)), min_ts) if lock_ts < min_ts => TsSource::Lock(txn_locks), (Some(_), _) => source, (None, _) => source, }; @@ -429,45 +482,140 @@ impl Resolver { self.resolved_ts } - pub(crate) fn log_locks(&self, min_start_ts: u64) { - // log lock with the minimum start_ts >= min_start_ts - if let Some((start_ts, txn_locks)) = self - .lock_ts_heap - .range(TimeStamp::new(min_start_ts)..) - .next() - { + /// Logs the txns with min start_ts or min_commit_ts. Search from + /// `lower_bound`. Normal txns are logged with start_ts. + /// Large txns are logged with min_commit_ts. 
+    pub(crate) fn log_locks(&self, lower_bound: u64) {
+        self.log_min_lock(lower_bound.into());
+        self.log_min_large_txn(lower_bound.into());
+    }
+
+    fn log_min_lock(&self, lower_bound: TimeStamp) {
+        if let Some((start_ts, txn_locks)) = self.lock_ts_heap.range(lower_bound..).next() {
             info!(
-                "locks with the minimum start_ts in resolver";
-                "region_id" => self.region_id,
+                "non-large txn locks with the minimum start_ts in resolver";
+                "search_lower_bound" => lower_bound,
                 "start_ts" => start_ts,
                 "txn_locks" => ?txn_locks,
+                "region_id" => self.region_id,
+            );
+        }
+    }
+
+    fn log_min_large_txn(&self, lower_bound: TimeStamp) {
+        let min_min_commit_ts_txn = self
+            .large_txn_ts
+            .iter()
+            .filter_map(|&start_ts| {
+                self.lookup_min_commit_ts(start_ts)
+                    .map(|min_commit_ts| (start_ts, min_commit_ts))
+            })
+            .filter(|(_, min_commit_ts)| *min_commit_ts >= lower_bound)
+            .min_by_key(|(_, min_commit_ts)| *min_commit_ts);
+
+        if let Some((start_ts, min_commit_ts)) = min_min_commit_ts_txn {
+            info!(
+                "large txn locks with the minimum min_commit_ts in resolver";
+                "search_lower_bound" => lower_bound,
+                "start_ts" => start_ts,
+                "min_commit_ts" => min_commit_ts,
+                "region_id" => self.region_id,
             );
         }
     }
+    // Map a transaction's start_ts to a min_commit_ts.
+    // When a large txn is committed or rolled back, return None.
+    // When not found in the cache, fall back to its start_ts, as start_ts is
+    // also a valid min_commit_ts.
+    fn lookup_min_commit_ts(&self, start_ts: TimeStamp) -> Option<TimeStamp> {
+        match self.txn_status_cache.get(start_ts) {
+            None => {
+                info!("Large txn not found in cache"; "start_ts" => start_ts);
+                Some(start_ts)
+            }
+            // TODO: optimization: whenever a large txn is committed or rolled back, we can stop
+            // tracking this txn
+            Some(TxnState::Ongoing { min_commit_ts }) => Some(min_commit_ts),
+            Some(TxnState::Committed { .. }) | Some(TxnState::RolledBack) => None,
+        }
+    }
+
     pub(crate) fn num_locks(&self) -> u64 {
-        self.locks_by_key.len() as u64
+        // This is inaccurate, but it's just for monitoring.
+        // TODO: count the number of locks of large transactions, namely also track
+        // TxnLocks
+        (self.locks_by_key.len() + self.large_txn_ts.len()) as u64
     }
 
     pub(crate) fn num_transactions(&self) -> u64 {
-        self.lock_ts_heap.len() as u64
+        (self.lock_ts_heap.len() + self.large_txn_ts.len()) as u64
     }
 
     pub(crate) fn read_progress(&self) -> Option<&Arc<RegionReadProgress>> {
         self.read_progress.as_ref()
     }
 
-    pub(crate) fn oldest_transaction(&self) -> Option<(&TimeStamp, &TxnLocks)> {
-        self.lock_ts_heap.iter().next()
+    // Return the transaction with the smallest min_commit_ts. When min_commit_ts
+    // is unknown, use start_ts instead.
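`lookup_min_commit_ts` above gives each large transaction at most one candidate timestamp, and the `oldest_transaction` that follows folds those candidates in with the normal lock heap. A sketch of the three cache states and what each contributes (the enum and function are simplified stand-ins, not the real `TxnStatusCache` API):

```rust
/// Simplified stand-in for the txn status cache entry.
enum TxnState {
    Ongoing { min_commit_ts: u64 },
    Committed,
    RolledBack,
}

/// Mirrors `lookup_min_commit_ts`: a cache miss falls back to start_ts
/// (start_ts is always a valid lower bound for min_commit_ts), an ongoing
/// txn contributes its min_commit_ts, and a finished txn contributes
/// nothing, so it no longer holds back the resolved ts.
fn candidate_ts(start_ts: u64, cached: Option<&TxnState>) -> Option<u64> {
    match cached {
        None => Some(start_ts),
        Some(TxnState::Ongoing { min_commit_ts }) => Some(*min_commit_ts),
        Some(TxnState::Committed) | Some(TxnState::RolledBack) => None,
    }
}

fn main() {
    assert_eq!(candidate_ts(5, None), Some(5));
    let ongoing = TxnState::Ongoing { min_commit_ts: 9 };
    assert_eq!(candidate_ts(5, Some(&ongoing)), Some(9));
    assert_eq!(candidate_ts(5, Some(&TxnState::Committed)), None);
}
```
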
+ pub(crate) fn oldest_transaction(&self) -> Option<(TimeStamp, TxnLocks)> { + let oldest_normal_txn = self + .lock_ts_heap + .iter() + .next() + .map(|(ts, txn_locks)| (ts, txn_locks.clone())); + + let oldest_large_txn = self + .large_txn_ts + .iter() + .filter_map(|start_ts| self.lookup_min_commit_ts(*start_ts)) + .min() + .map(|ts| { + ( + ts, + TxnLocks { + lock_count: 1, + // TODO: maybe fill this + sample_lock: None, + }, + ) + }); + + match (oldest_normal_txn, oldest_large_txn) { + (Some((&ts1, txn_locks1)), Some((ts2, txn_locks2))) => { + if ts1 < ts2 { + Some((ts1, txn_locks1)) + } else { + Some((ts2, txn_locks2)) + } + } + (Some((&ts, txn_locks)), None) => Some((ts, txn_locks)), + (None, Some((ts, txn_locks))) => Some((ts, txn_locks)), + (None, None) => None, + } } pub(crate) fn take_last_attempt(&mut self) -> Option { self.last_attempt.take() } + + fn track_large_txn_lock( + &mut self, + start_ts: TimeStamp, + key: Vec, + ) -> Result<(), MemoryQuotaExceeded> { + let is_new = self.large_txn_ts.insert(start_ts); + if is_new { + self.large_txn_key_representative.insert(key, start_ts); + } + Ok(()) + } } #[cfg(test)] mod tests { + use std::time::SystemTime; + use txn_types::Key; use super::*; @@ -535,12 +683,12 @@ mod tests { for (i, case) in cases.into_iter().enumerate() { let memory_quota = Arc::new(MemoryQuota::new(std::usize::MAX)); - let mut resolver = Resolver::new(1, memory_quota); + let mut resolver = Resolver::new_for_test(1, memory_quota); for e in case.clone() { match e { Event::Lock(start_ts, key) => { resolver - .track_lock(start_ts.into(), key.into_raw().unwrap(), None) + .track_lock(start_ts.into(), key.into_raw().unwrap(), None, 0) .unwrap(); } Event::Unlock(key) => resolver.untrack_lock(&key.into_raw().unwrap(), None), @@ -560,11 +708,11 @@ mod tests { #[test] fn test_memory_quota() { let memory_quota = Arc::new(MemoryQuota::new(1024)); - let mut resolver = Resolver::new(1, memory_quota.clone()); + let mut resolver = Resolver::new_for_test(1, memory_quota.clone()); let mut key = vec![0; 77]; let lock_size = resolver.lock_heap_size(&key); let mut ts = TimeStamp::default(); - while resolver.track_lock(ts, key.clone(), None).is_ok() { + while resolver.track_lock(ts, key.clone(), None, 0).is_ok() { ts.incr(); key[0..8].copy_from_slice(&ts.into_inner().to_be_bytes()); } @@ -585,13 +733,13 @@ mod tests { #[test] fn test_untrack_lock_shrink_ratio() { let memory_quota = Arc::new(MemoryQuota::new(std::usize::MAX)); - let mut resolver = Resolver::new(1, memory_quota); + let mut resolver = Resolver::new_for_test(1, memory_quota); let mut key = vec![0; 16]; let mut ts = TimeStamp::default(); for _ in 0..1000 { ts.incr(); key[0..8].copy_from_slice(&ts.into_inner().to_be_bytes()); - let _ = resolver.track_lock(ts, key.clone(), None); + let _ = resolver.track_lock(ts, key.clone(), None, 0); } assert!( resolver.locks_by_key.capacity() >= 1000, @@ -640,7 +788,7 @@ mod tests { #[test] fn test_idempotent_track_and_untrack_lock() { let memory_quota = Arc::new(MemoryQuota::new(std::usize::MAX)); - let mut resolver = Resolver::new(1, memory_quota); + let mut resolver = Resolver::new_for_test(1, memory_quota); let mut key = vec![0; 16]; // track_lock @@ -650,7 +798,7 @@ mod tests { for k in 0..100u64 { key[0..8].copy_from_slice(&k.to_be_bytes()); key[8..16].copy_from_slice(&ts.into_inner().to_be_bytes()); - let _ = resolver.track_lock(ts, key.clone(), None); + let _ = resolver.track_lock(ts, key.clone(), None, 0); } let in_use1 = resolver.memory_quota.in_use(); let key_count1 = 
resolver.locks_by_key.len(); @@ -664,7 +812,7 @@ mod tests { for k in 0..100u64 { key[0..8].copy_from_slice(&k.to_be_bytes()); key[8..16].copy_from_slice(&ts.into_inner().to_be_bytes()); - let _ = resolver.track_lock(ts, key.clone(), None); + let _ = resolver.track_lock(ts, key.clone(), None, 0); } let in_use2 = resolver.memory_quota.in_use(); let key_count2 = resolver.locks_by_key.len(); @@ -710,4 +858,44 @@ mod tests { assert_eq!(resolver.locks_by_key.len(), 0); assert_eq!(resolver.lock_ts_heap.len(), 0); } + + #[test] + fn test_large_txn_tracking() { + let memory_quota = Arc::new(MemoryQuota::new(std::usize::MAX)); + let txn_status_cache = Arc::new(TxnStatusCache::new(100)); + let mut resolver = Resolver::new(1, memory_quota, txn_status_cache.clone()); + let key: Vec = vec![1, 2, 3, 4]; + let key2: Vec = vec![5, 6, 7, 8]; + + // track 2 large txns + resolver.track_lock(1.into(), key.clone(), None, 1).unwrap(); + resolver + .track_lock(2.into(), key2.clone(), None, 1) + .unwrap(); + assert_eq!(resolver.num_locks(), 2); + assert_eq!(resolver.num_transactions(), 2); + assert_eq!(resolver.locks_by_key.len(), 0); + assert_eq!(resolver.large_txn_ts.len(), 2); + assert_eq!(resolver.large_txn_key_representative.len(), 2); + assert_eq!(resolver.resolved_ts(), TimeStamp::zero()); + + assert_eq!(resolver.resolve(20.into(), None, TsSource::PdTso), 1.into()); + + txn_status_cache.upsert( + 1.into(), + TxnState::Ongoing { + min_commit_ts: 10.into(), + }, + SystemTime::now(), + ); + txn_status_cache.upsert( + 2.into(), + TxnState::Ongoing { + min_commit_ts: 5.into(), + }, + SystemTime::now(), + ); + + assert_eq!(resolver.resolve(20.into(), None, TsSource::PdTso), 5.into()); + } } diff --git a/components/resolved_ts/tests/mod.rs b/components/resolved_ts/tests/mod.rs index 5547fef461a..90592de582b 100644 --- a/components/resolved_ts/tests/mod.rs +++ b/components/resolved_ts/tests/mod.rs @@ -3,7 +3,6 @@ use std::{sync::*, time::Duration}; use collections::HashMap; -use engine_rocks::RocksEngine; use grpcio::{ChannelBuilder, Environment}; use kvproto::{import_sstpb_grpc::ImportSstClient, kvrpcpb::*, tikvpb::TikvClient}; use online_config::ConfigValue; @@ -19,7 +18,7 @@ pub fn init() { } pub struct TestSuite { - pub cluster: Cluster>, + pub cluster: Cluster, tikv_cli: HashMap, import_cli: HashMap, diff --git a/components/resource_control/Cargo.toml b/components/resource_control/Cargo.toml index 666c0d732e0..1c8ca387483 100644 --- a/components/resource_control/Cargo.toml +++ b/components/resource_control/Cargo.toml @@ -9,10 +9,8 @@ license = "Apache-2.0" failpoints = ["fail/failpoints"] [dependencies] -byteorder = "1.2" collections = { workspace = true } crossbeam = { workspace = true } -crossbeam-skiplist = { workspace = true } dashmap = "5.1" fail = "0.5" file_system = { workspace = true } @@ -37,4 +35,3 @@ yatp = { git = "https://github.com/tikv/yatp.git", branch = "master" } [dev-dependencies] rand = "0.8" test_pd = { workspace = true } -test_pd_client = { workspace = true } diff --git a/components/resource_control/src/config.rs b/components/resource_control/src/config.rs new file mode 100644 index 00000000000..e4429fe6a27 --- /dev/null +++ b/components/resource_control/src/config.rs @@ -0,0 +1,105 @@ +// Copyright 2024 TiKV Project Authors. Licensed under Apache-2.0. 
+use std::{fmt, sync::Arc};
+
+use online_config::{ConfigManager, ConfigValue, OnlineConfig};
+use serde::{Deserialize, Serialize};
+use tikv_util::config::VersionTrack;
+
+#[derive(Clone, Serialize, Deserialize, PartialEq, Debug, OnlineConfig)]
+#[serde(default)]
+#[serde(rename_all = "kebab-case")]
+pub struct Config {
+    #[online_config(skip)]
+    pub enabled: bool,
+    pub priority_ctl_strategy: PriorityCtlStrategy,
+}
+
+impl Default for Config {
+    fn default() -> Self {
+        Self {
+            enabled: true,
+            priority_ctl_strategy: PriorityCtlStrategy::Moderate,
+        }
+    }
+}
+
+/// PriorityCtlStrategy controls how resource quota is granted to low-priority
+/// tasks.
+#[derive(Clone, Copy, Default, Debug, PartialEq, Eq, Serialize, Deserialize)]
+#[serde(rename_all = "kebab-case")]
+pub enum PriorityCtlStrategy {
+    /// Prioritize the throughput and latency of high-priority tasks, resulting
+    /// in low-priority tasks running much slower.
+    Aggressive,
+    #[default]
+    /// Try to balance between the latency of high-priority tasks and the
+    /// throughput of low-priority tasks.
+    Moderate,
+    /// Prioritize overall throughput; the latency of high-priority tasks may
+    /// be significantly impacted when the overall load is high.
+    Conservative,
+}
+
+impl PriorityCtlStrategy {
+    pub fn to_resource_util_percentage(self) -> f64 {
+        match self {
+            Self::Aggressive => 0.5,
+            Self::Moderate => 0.7,
+            Self::Conservative => 0.9,
+        }
+    }
+}
+
+impl fmt::Display for PriorityCtlStrategy {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        let str_value = match *self {
+            Self::Aggressive => "aggressive",
+            Self::Moderate => "moderate",
+            Self::Conservative => "conservative",
+        };
+        f.write_str(str_value)
+    }
+}
+
+impl From<PriorityCtlStrategy> for ConfigValue {
+    fn from(v: PriorityCtlStrategy) -> Self {
+        ConfigValue::String(format!("{}", v))
+    }
+}
+
+impl TryFrom<ConfigValue> for PriorityCtlStrategy {
+    type Error = String;
+    fn try_from(v: ConfigValue) -> Result<Self, Self::Error> {
+        if let ConfigValue::String(s) = v {
+            match s.as_str() {
+                "aggressive" => Ok(Self::Aggressive),
+                "moderate" => Ok(Self::Moderate),
+                "conservative" => Ok(Self::Conservative),
+                s => Err(format!("invalid config value: {}", s)),
+            }
+        } else {
+            panic!("expect ConfigValue::String, got: {:?}", v);
+        }
+    }
+}
+
+pub struct ResourceContrlCfgMgr {
+    config: Arc<VersionTrack<Config>>,
+}
+
+impl ResourceContrlCfgMgr {
+    pub fn new(config: Arc<VersionTrack<Config>>) -> Self {
+        Self { config }
+    }
+}
+
+impl ConfigManager for ResourceContrlCfgMgr {
+    fn dispatch(&mut self, change: online_config::ConfigChange) -> online_config::Result<()> {
+        let cfg_str = format!("{:?}", change);
+        let res = self.config.update(|c| c.update(change));
+        if res.is_ok() {
+            tikv_util::info!("update resource control config"; "change" => cfg_str);
+        }
+        res
+    }
+}
diff --git a/components/resource_control/src/future.rs b/components/resource_control/src/future.rs
index 345f7d88f36..31af006b21d 100644
--- a/components/resource_control/src/future.rs
+++ b/components/resource_control/src/future.rs
@@ -324,7 +324,11 @@ mod tests {
         let dur = start.saturating_elapsed();
         assert_eq!(delta.total_consumed, 150);
         assert!(delta.total_wait_dur_us >= 140_000 && delta.total_wait_dur_us <= 160_000);
-        assert!(dur >= Duration::from_millis(150) && dur <= Duration::from_millis(160));
+        assert!(
+            dur >= Duration::from_millis(140) && dur <= Duration::from_millis(160),
+            "dur: {:?}",
+            dur
+        );
 
         // fetch io bytes failed, consumed value is 0.
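The percentages returned by `to_resource_util_percentage` in the new config module cap how much of the process CPU quota the priority limiter will target overall, leaving the remainder as latency headroom for high-priority work. A quick illustration with a hypothetical 8-core quota (the quota number is made up; only the percentages come from the code above):

```rust
// Illustrative only: what each strategy's percentage means for a
// hypothetical 8-core process quota.
fn main() {
    let total_quota_cores = 8.0_f64;
    for (name, pct) in [("aggressive", 0.5), ("moderate", 0.7), ("conservative", 0.9)] {
        println!(
            "{name}: steer total usage toward {:.1} of {total_quota_cores} cores",
            total_quota_cores * pct
        );
    }
}
```
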
         #[cfg(feature = "failpoints")]
diff --git a/components/resource_control/src/lib.rs b/components/resource_control/src/lib.rs
index b9a79e1f9ae..55b0ee10f17 100644
--- a/components/resource_control/src/lib.rs
+++ b/components/resource_control/src/lib.rs
@@ -1,11 +1,10 @@
 // Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0.
 
 #![feature(test)]
+#![feature(let_chains)]
 
 use std::sync::Arc;
 
-use online_config::OnlineConfig;
 use pd_client::RpcClient;
-use serde::{Deserialize, Serialize};
 
 mod resource_group;
 pub use resource_group::{
@@ -25,6 +24,8 @@ pub use service::ResourceManagerService;
 pub mod channel;
 pub use channel::ResourceMetered;
 
+pub mod config;
+
 mod resource_limiter;
 pub use resource_limiter::ResourceLimiter;
 use tikv_util::worker::Worker;
@@ -35,20 +36,6 @@ use worker::{
 mod metrics;
 pub mod worker;
 
-#[derive(Clone, Serialize, Deserialize, PartialEq, Debug, OnlineConfig)]
-#[serde(default)]
-#[serde(rename_all = "kebab-case")]
-pub struct Config {
-    #[online_config(skip)]
-    pub enabled: bool,
-}
-
-impl Default for Config {
-    fn default() -> Self {
-        Self { enabled: true }
-    }
-}
-
 pub fn start_periodic_tasks(
     mgr: &Arc<ResourceGroupManager>,
     pd_client: Arc<RpcClient>,
diff --git a/components/resource_control/src/metrics.rs b/components/resource_control/src/metrics.rs
index 45723063492..594c6af486a 100644
--- a/components/resource_control/src/metrics.rs
+++ b/components/resource_control/src/metrics.rs
@@ -28,6 +28,26 @@ lazy_static! {
         &["priority"]
     )
     .unwrap();
+    pub static ref PRIORITY_CPU_TIME_VEC: IntCounterVec = register_int_counter_vec!(
+        "tikv_resource_control_priority_task_exec_duration",
+        "Total execution duration of tasks per-priority",
+        &["priority"]
+    )
+    .unwrap();
+    pub static ref PRIORITY_WAIT_DURATION_VEC: HistogramVec = register_histogram_vec!(
+        "tikv_resource_control_priority_wait_duration",
+        "Histogram of wait duration caused by priority quota limiter",
+        &["priority"],
+        exponential_buckets(1e-5, 2.0, 18).unwrap() // 10us ~ 2.5s
+    )
+    .unwrap();
+
+    pub static ref BACKGROUND_TASK_RESOURCE_UTILIZATION_VEC: IntGaugeVec = register_int_gauge_vec!(
+        "tikv_resource_control_bg_resource_utilization",
+        "The total resource utilization percentage of background tasks",
+        &["type"]
+    )
+    .unwrap();
 }
 
 pub fn deregister_metrics(name: &str) {
diff --git a/components/resource_control/src/resource_group.rs b/components/resource_control/src/resource_group.rs
index 85730e60481..1332ed1950d 100644
--- a/components/resource_control/src/resource_group.rs
+++ b/components/resource_control/src/resource_group.rs
@@ -12,9 +12,7 @@ use std::{
 };
 
 use collections::HashMap;
-#[cfg(test)]
-use dashmap::mapref::one::Ref;
-use dashmap::DashMap;
+use dashmap::{mapref::one::Ref, DashMap};
 use fail::fail_point;
 use kvproto::{
     kvrpcpb::{CommandPri, ResourceControlContext},
@@ -22,13 +20,14 @@ use kvproto::{
 };
 use parking_lot::{MappedRwLockReadGuard, RwLock, RwLockReadGuard};
 use tikv_util::{
+    config::VersionTrack,
     info,
     resource_control::{TaskMetadata, TaskPriority, DEFAULT_RESOURCE_GROUP_NAME},
     time::Instant,
 };
 use yatp::queue::priority::TaskPriorityProvider;
 
-use crate::{metrics::deregister_metrics, resource_limiter::ResourceLimiter};
+use crate::{config::Config, metrics::deregister_metrics, resource_limiter::ResourceLimiter};
 
 // a read task costs at least 50us.
const DEFAULT_PRIORITY_PER_READ_TASK: u64 = 50;
@@ -68,10 +67,18 @@ pub struct ResourceGroupManager {
     version_generator: AtomicU64,
     // the shared resource limiter of each priority
     priority_limiters: [Arc<ResourceLimiter>; TaskPriority::PRIORITY_COUNT],
+    // latest config.
+    config: Arc<VersionTrack<Config>>,
 }
 
 impl Default for ResourceGroupManager {
     fn default() -> Self {
+        Self::new(Config::default())
+    }
+}
+
+impl ResourceGroupManager {
+    pub fn new(config: Config) -> Self {
         let priority_limiters = TaskPriority::priorities().map(|p| {
             Arc::new(ResourceLimiter::new(
                 p.as_str().to_owned(),
@@ -87,6 +94,7 @@ impl Default for ResourceGroupManager {
             registry: Default::default(),
             version_generator: AtomicU64::new(0),
             priority_limiters,
+            config: Arc::new(VersionTrack::new(config)),
         };
 
         // init the default resource group by default.
@@ -103,9 +111,7 @@ impl Default for ResourceGroupManager {
         manager
     }
-}
 
-impl ResourceGroupManager {
     #[inline]
     pub fn get_group_count(&self) -> u64 {
         self.group_count.load(Ordering::Relaxed)
@@ -217,11 +223,14 @@ impl ResourceGroupManager {
         }
     }
 
-    #[cfg(test)]
     pub(crate) fn get_resource_group(&self, name: &str) -> Option<Ref<'_, String, ResourceGroup>> {
         self.resource_groups.get(&name.to_ascii_lowercase())
     }
 
+    pub fn get_config(&self) -> &Arc<VersionTrack<Config>> {
+        &self.config
+    }
+
     pub fn get_all_resource_groups(&self) -> Vec<ResourceGroup> {
         self.resource_groups
             .iter()
diff --git a/components/resource_control/src/resource_limiter.rs b/components/resource_control/src/resource_limiter.rs
index ab2144f18cc..85102ea5a73 100644
--- a/components/resource_control/src/resource_limiter.rs
+++ b/components/resource_control/src/resource_limiter.rs
@@ -8,10 +8,11 @@ use std::{
 
 use file_system::IoBytes;
 use futures::compat::Future01CompatExt;
+use prometheus::Histogram;
 use strum::EnumCount;
-use tikv_util::{time::Limiter, timer::GLOBAL_TIMER_HANDLE};
+use tikv_util::{resource_control::TaskPriority, time::Limiter, timer::GLOBAL_TIMER_HANDLE};
 
-use crate::metrics::BACKGROUND_TASKS_WAIT_DURATION;
+use crate::metrics::PRIORITY_WAIT_DURATION_VEC;
 
 #[derive(Clone, Copy, Eq, PartialEq, EnumCount)]
 #[repr(usize)]
@@ -36,11 +37,13 @@ impl fmt::Debug for ResourceType {
 }
 
 pub struct ResourceLimiter {
-    name: String,
+    _name: String,
     version: u64,
     limiters: [QuotaLimiter; ResourceType::COUNT],
     // whether the resource limiter is a background limiter or priority limiter.
     is_background: bool,
+    // the wait duration histogram for the priority limiter.
+    wait_histogram: Option<Histogram>,
 }
 
 impl std::fmt::Debug for ResourceLimiter {
@@ -59,11 +62,23 @@ impl ResourceLimiter {
     ) -> Self {
         let cpu_limiter = QuotaLimiter::new(cpu_limit);
         let io_limiter = QuotaLimiter::new(io_limit);
+        // high priority tasks do not trigger wait, so there is no need to
+        // generate an empty metric.
+ let wait_histogram = if !is_background && name != TaskPriority::High.as_str() { + Some( + PRIORITY_WAIT_DURATION_VEC + .get_metric_with_label_values(&[&name]) + .unwrap(), + ) + } else { + None + }; Self { - name, + _name: name, version, limiters: [cpu_limiter, io_limiter], is_background, + wait_histogram, } } @@ -76,12 +91,11 @@ impl ResourceLimiter { self.limiters[ResourceType::Cpu as usize].consume(cpu_time.as_micros() as u64, wait); let io_dur = self.limiters[ResourceType::Io as usize].consume_io(io_bytes, wait); let wait_dur = cpu_dur.max(io_dur); - if wait_dur > Duration::ZERO { - BACKGROUND_TASKS_WAIT_DURATION - .with_label_values(&[&self.name]) - .inc_by(wait_dur.as_micros() as u64); + if !wait_dur.is_zero() + && let Some(h) = &self.wait_histogram + { + h.observe(wait_dur.as_secs_f64()); } - wait_dur } @@ -127,7 +141,14 @@ pub(crate) struct QuotaLimiter { impl QuotaLimiter { fn new(limit: f64) -> Self { Self { - limiter: Limiter::new(limit), + // we use a 1s refill and a 1ms min_wait duration to avoid triggering + // waits too frequently or waiting too long. + // NOTE: the parameter `refill` mainly impacts the capacity + // of the token bucket, not the refill interval. + limiter: Limiter::builder(limit) + .refill(Duration::from_millis(1000)) + .min_wait(Duration::from_millis(1)) + .build(), total_wait_dur_us: AtomicU64::new(0), read_bytes: AtomicU64::new(0), write_bytes: AtomicU64::new(0), diff --git a/components/resource_control/src/worker.rs b/components/resource_control/src/worker.rs index 4957ee1aa3f..4d85b147fbc 100644 --- a/components/resource_control/src/worker.rs +++ b/components/resource_control/src/worker.rs @@ -13,7 +13,7 @@ use prometheus::Histogram; use strum::EnumCount; use tikv_util::{ debug, - resource_control::TaskPriority, + resource_control::{TaskPriority, DEFAULT_RESOURCE_GROUP_NAME}, sys::{cpu_time::ProcessStat, SysQuota}, time::Instant, warn, @@ -137,6 +137,20 @@ impl GroupQuotaAdjustWorker { } self.last_adjust_time = now; + let mut background_util_limit = self + .resource_ctl + .get_resource_group(DEFAULT_RESOURCE_GROUP_NAME) + .map_or(0, |r| { + r.group.get_background_settings().get_utilization_limit() + }); + if background_util_limit == 0 { + background_util_limit = 100; + } + + BACKGROUND_TASK_RESOURCE_UTILIZATION_VEC + .with_label_values(&["limit"]) + .set(background_util_limit as i64); + let mut background_groups: Vec<_> = self .resource_ctl .resource_groups @@ -156,8 +170,18 @@ impl GroupQuotaAdjustWorker { return; } - self.do_adjust(ResourceType::Cpu, dur_secs, &mut background_groups); - self.do_adjust(ResourceType::Io, dur_secs, &mut background_groups); + self.do_adjust( + ResourceType::Cpu, + dur_secs, + background_util_limit, + &mut background_groups, + ); + self.do_adjust( + ResourceType::Io, + dur_secs, + background_util_limit, + &mut background_groups, + ); // clean up deleted group stats if self.prev_stats_by_group[0].len() != background_groups.len() { @@ -173,6 +197,7 @@ impl GroupQuotaAdjustWorker { &mut self, resource_type: ResourceType, dur_secs: f64, + utilization_limit: u64, bg_group_stats: &mut [GroupStats], ) { let resource_stats = match self.resource_quota_getter.get_current_stats(resource_type) { @@ -211,6 +236,12 @@ impl GroupQuotaAdjustWorker { BACKGROUND_RESOURCE_CONSUMPTION .with_label_values(&[&g.name, resource_type.as_str()]) .inc_by(stats_delta.total_consumed); + if resource_type == ResourceType::Cpu { + BACKGROUND_TASKS_WAIT_DURATION + .with_label_values(&[&g.name]) + .inc_by(stats_delta.total_wait_dur_us); + } + let stats_per_sec =
stats_delta / dur_secs; background_consumed_total += stats_per_sec.total_consumed as f64; g.stats_per_sec = stats_per_sec; @@ -219,6 +250,12 @@ impl GroupQuotaAdjustWorker { } } + let background_util = + (background_consumed_total / resource_stats.total_quota * 100.0) as u64; + BACKGROUND_TASK_RESOURCE_UTILIZATION_VEC + .with_label_values(&[resource_type.as_str()]) + .set(background_util as i64); + // fast path if process cpu is low let is_low_load = resource_stats.current_used <= (resource_stats.total_quota * 0.1); if is_low_load && !has_wait && self.is_last_time_low_load[resource_type as usize] { @@ -226,6 +263,7 @@ impl GroupQuotaAdjustWorker { } self.is_last_time_low_load[resource_type as usize] = is_low_load; + let util_limit_percent = (utilization_limit as f64 / 100.0).min(1.0); // the available resource for background tasks is defined as: // (total_resource_quota - foreground_task_used). foreground_task_used // resource is calculated by: (resource_current_total_used - @@ -235,6 +273,7 @@ impl GroupQuotaAdjustWorker { - resource_stats.current_used + background_consumed_total) * 0.8) + .min(resource_stats.total_quota * util_limit_percent) .max(resource_stats.total_quota * 0.1); let mut total_expected_cost = 0.0; for g in bg_group_stats.iter_mut() { @@ -386,7 +425,7 @@ impl PriorityLimiterAdjustWorker { } }; - if process_cpu_stats.current_used < process_cpu_stats.total_quota * 0.5 { + if process_cpu_stats.current_used < process_cpu_stats.total_quota * 0.3 { if self.is_last_low_cpu { return; } @@ -405,10 +444,10 @@ impl PriorityLimiterAdjustWorker { } self.is_last_low_cpu = false; - let total_reqs: u64 = stats.iter().map(|s| s.req_count).sum(); - let max_reqs = stats.iter().map(|s| s.req_count).max().unwrap(); + let total_cpus: f64 = stats.iter().map(|s| s.cpu_secs).sum(); + let max_cpus = stats.iter().map(|s| s.cpu_secs).fold(0.0, f64::max); // there is only 1 active priority, do not restrict. - if total_reqs * 99 / 100 <= max_reqs { + if total_cpus * 0.99 <= max_cpus { self.trackers .iter() .skip(1) @@ -422,7 +461,15 @@ impl PriorityLimiterAdjustWorker { let cpu_duration: [_; TaskPriority::PRIORITY_COUNT] = array::from_fn(|i| stats[i].cpu_secs); let real_cpu_total: f64 = cpu_duration.iter().sum(); - let expect_pool_cpu_total = real_cpu_total * (process_cpu_stats.total_quota * 0.95) + + let available_quota_percentage = self + .resource_ctl + .get_config() + .value() + .priority_ctl_strategy + .to_resource_util_percentage(); + let expect_pool_cpu_total = real_cpu_total + * (process_cpu_stats.total_quota * available_quota_percentage) / process_cpu_stats.current_used; let mut limits = [0.0; 2]; let level_expected: [_; TaskPriority::PRIORITY_COUNT] = @@ -431,7 +478,7 @@ impl PriorityLimiterAdjustWorker { let mut expect_cpu_time_total = expect_pool_cpu_total - level_expected[0]; // still reserve a minimal cpu quota - let minimal_quota = process_cpu_stats.total_quota / MICROS_PER_SEC * 0.05; + let minimal_quota = process_cpu_stats.total_quota / MICROS_PER_SEC * 0.1; for i in 1..self.trackers.len() { if expect_cpu_time_total < minimal_quota { expect_cpu_time_total = minimal_quota; @@ -461,8 +508,6 @@ struct LimiterStats { cpu_secs: f64, // QuotaLimiter waited secs in total. wait_secs: f64, - // the total number of tasks that are scheduled. 
- req_count: u64, } struct HistogramTracker { @@ -521,19 +566,24 @@ impl PriorityLimiterStatsTracker { fn get_and_update_last_stats(&mut self, dur_secs: f64) -> LimiterStats { let cur_stats = self.limiter.get_limit_statistics(ResourceType::Cpu); - let stats_delta = (cur_stats - self.last_stats) / dur_secs; + let stats_delta = cur_stats - self.last_stats; self.last_stats = cur_stats; + PRIORITY_CPU_TIME_VEC + .with_label_values(&[self.priority]) + .inc_by(stats_delta.total_consumed); + let stats_per_sec = stats_delta / dur_secs; + let wait_stats: [_; 2] = array::from_fn(|i| self.task_wait_dur_trakcers[i].get_and_upate_statistics()); let schedule_wait_dur_secs = wait_stats.iter().map(|s| s.0).sum::() / dur_secs; - let expected_wait_dur_secs = stats_delta.request_count as f64 * MINIMAL_SCHEDULE_WAIT_SECS; + let expected_wait_dur_secs = + stats_per_sec.request_count as f64 * MINIMAL_SCHEDULE_WAIT_SECS; let normed_schedule_wait_dur_secs = (schedule_wait_dur_secs - expected_wait_dur_secs).max(0.0); LimiterStats { - cpu_secs: stats_delta.total_consumed as f64 / MICROS_PER_SEC, - wait_secs: stats_delta.total_wait_dur_us as f64 / MICROS_PER_SEC + cpu_secs: stats_per_sec.total_consumed as f64 / MICROS_PER_SEC, + wait_secs: stats_per_sec.total_wait_dur_us as f64 / MICROS_PER_SEC + normed_schedule_wait_dur_secs, - req_count: stats_delta.request_count, } } } @@ -640,6 +690,7 @@ mod tests { worker.last_adjust_time = now - dur; }; + #[track_caller] fn check(val: f64, expected: f64) { assert!( expected * 0.99 < val && val < expected * 1.01, @@ -649,6 +700,7 @@ mod tests { ); } + #[track_caller] fn check_limiter(limiter: &Arc, cpu: f64, io: IoBytes) { check( limiter.get_limiter(ResourceType::Cpu).get_rate_limit(), @@ -815,18 +867,18 @@ mod tests { worker.adjust_quota(); check_limiter( &limiter, - 2.4, + 1.2, IoBytes { - read: 1400, - write: 1400, + read: 1800, + write: 1800, }, ); check_limiter( &bg_limiter, - 1.6, + 2.8, IoBytes { - read: 1800, - write: 1800, + read: 1400, + write: 1400, }, ); @@ -893,18 +945,18 @@ mod tests { worker.adjust_quota(); check_limiter( &limiter, - 2.4, + 2.2, IoBytes { - read: 1400, - write: 1400, + read: 2133, + write: 2133, }, ); check_limiter( &new_bg_limiter, - 1.6, + 1.8, IoBytes { - read: 1800, - write: 1800, + read: 1066, + write: 1066, }, ); } @@ -929,6 +981,7 @@ mod tests { .set_rate_limit(f64::INFINITY); }; + #[track_caller] fn check(val: f64, expected: f64) { assert!( (val.is_infinite() && expected.is_infinite()) @@ -982,7 +1035,7 @@ mod tests { priority_limiters[1].consume(Duration::from_millis(400), IoBytes::default(), true); } worker.adjust(); - check_limiter(f64::INFINITY, 5.2, 1.2); + check_limiter(f64::INFINITY, 3.2, 0.8); reset_quota(&mut worker, 6.4); for _i in 0..100 { @@ -990,7 +1043,7 @@ mod tests { priority_limiters[1].consume(Duration::from_millis(200), IoBytes::default(), true); } worker.adjust(); - check_limiter(f64::INFINITY, 2.6, 0.6); + check_limiter(f64::INFINITY, 1.6, 0.8); reset_quota(&mut worker, 6.4); for _i in 0..100 { @@ -1006,7 +1059,7 @@ mod tests { priority_limiters[2].consume(Duration::from_millis(320), IoBytes::default(), true); } worker.adjust(); - check_limiter(f64::INFINITY, 5.2, 2.8); + check_limiter(f64::INFINITY, 3.2, 0.8); reset_quota(&mut worker, 6.0); for _i in 0..100 { @@ -1014,12 +1067,12 @@ mod tests { priority_limiters[2].consume(Duration::from_millis(360), IoBytes::default(), true); } worker.adjust(); - check_limiter(f64::INFINITY, 5.2, 5.2); + check_limiter(f64::INFINITY, 3.2, 3.2); // duration too small, unchanged. 
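As background for the Limiter::builder(..).refill(..).min_wait(..) call in resource_limiter.rs above: a toy token bucket (not tikv_util's actual implementation, and the min_wait semantics here are one plausible reading) showing how the refill window sizes the burst capacity while min_wait suppresses microscopic sleeps:

use std::time::{Duration, Instant};

// Toy token bucket for illustration only. `capacity = rate * refill_window`
// bounds the burst; a computed wait shorter than `min_wait` is skipped and
// the debt simply carries over to the next call.
struct ToyBucket {
    rate: f64,     // tokens per second
    capacity: f64, // rate * 1s refill window
    tokens: f64,
    min_wait: Duration,
    last: Instant,
}

impl ToyBucket {
    fn new(rate: f64) -> Self {
        ToyBucket {
            rate,
            capacity: rate, // 1s window => capacity == rate
            tokens: rate,
            min_wait: Duration::from_millis(1),
            last: Instant::now(),
        }
    }

    // Returns how long the caller should sleep to stay within `rate`.
    fn consume(&mut self, n: f64) -> Duration {
        let now = Instant::now();
        let elapsed = now.duration_since(self.last).as_secs_f64();
        self.tokens = (self.tokens + self.rate * elapsed).min(self.capacity);
        self.last = now;
        self.tokens -= n;
        if self.tokens >= 0.0 {
            return Duration::ZERO;
        }
        let wait = Duration::from_secs_f64(-self.tokens / self.rate);
        if wait < self.min_wait { Duration::ZERO } else { wait }
    }
}

fn main() {
    let mut b = ToyBucket::new(1000.0); // 1000 tokens/sec
    let wait = b.consume(1500.0);       // overdraw => positive wait
    assert!(wait > Duration::ZERO);
}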
worker.resource_quota_getter.cpu_used = 6.0; worker.last_adjust_time = Instant::now_coarse() - Duration::from_millis(500); worker.adjust(); - check_limiter(f64::INFINITY, 5.2, 5.2); + check_limiter(f64::INFINITY, 3.2, 3.2); } } diff --git a/components/resource_metering/Cargo.toml b/components/resource_metering/Cargo.toml index 34922001fdb..1befad00916 100644 --- a/components/resource_metering/Cargo.toml +++ b/components/resource_metering/Cargo.toml @@ -11,8 +11,6 @@ futures = "0.3" grpcio = { workspace = true } kvproto = { workspace = true } lazy_static = "1.3" -libc = "0.2" -log = { version = "0.4", features = ["max_level_trace", "release_max_level_debug"] } online_config = { workspace = true } pdqselect = "0.1" pin-project = "1.0" @@ -23,9 +21,6 @@ slog = { workspace = true } slog-global = { workspace = true } tikv_util = { workspace = true } -[target.'cfg(target_os = "linux")'.dependencies] -procinfo = { git = "https://github.com/tikv/procinfo-rs", rev = "7693954bd1dd86eb1709572fd7b62fd5f7ff2ea1" } - [dev-dependencies] rand = "0.8" diff --git a/components/security/Cargo.toml b/components/security/Cargo.toml index 5889438f402..2865fef20c9 100644 --- a/components/security/Cargo.toml +++ b/components/security/Cargo.toml @@ -9,12 +9,9 @@ license = "Apache-2.0" collections = { workspace = true } encryption = { workspace = true } grpcio = { workspace = true } -kvproto = { workspace = true } log_wrappers = { workspace = true } serde = "1.0" serde_derive = "1.0" -serde_json = "1.0" -tikv_util = { workspace = true } [dev-dependencies] tempfile = "3.0" diff --git a/components/server/Cargo.toml b/components/server/Cargo.toml index 61e53af9805..5e844aa16fa 100644 --- a/components/server/Cargo.toml +++ b/components/server/Cargo.toml @@ -15,18 +15,10 @@ sse = ["tikv/sse"] memory-engine = [] mem-profiling = ["tikv/mem-profiling"] failpoints = ["tikv/failpoints"] -test-engine-kv-rocksdb = [ - "tikv/test-engine-kv-rocksdb" -] -test-engine-raft-raft-engine = [ - "tikv/test-engine-raft-raft-engine" -] -test-engines-rocksdb = [ - "tikv/test-engines-rocksdb", -] -test-engines-panic = [ - "tikv/test-engines-panic", -] +test-engine-kv-rocksdb = ["tikv/test-engine-kv-rocksdb"] +test-engine-raft-raft-engine = ["tikv/test-engine-raft-raft-engine"] +test-engines-rocksdb = ["tikv/test-engines-rocksdb"] +test-engines-panic = ["tikv/test-engines-panic"] nortcheck = ["engine_rocks/nortcheck"] backup-stream-debug = ["backup-stream/backup-stream-debug"] @@ -49,17 +41,14 @@ engine_traits = { workspace = true } error_code = { workspace = true } fail = "0.5" file_system = { workspace = true } -fs2 = "0.4" futures = "0.3" grpcio = { workspace = true } -grpcio-health = { workspace = true } health_controller = { workspace = true } -hex = "0.4" hybrid_engine = { workspace = true } +in_memory_engine = { workspace = true } keys = { workspace = true } kvproto = { workspace = true } libc = "0.2" -log = { version = "0.4", features = ["max_level_trace", "release_max_level_debug"] } log_wrappers = { workspace = true } pd_client = { workspace = true } prometheus = { version = "0.13", features = ["nightly"] } @@ -68,7 +57,6 @@ raft = { workspace = true } raft_log_engine = { workspace = true } raftstore = { workspace = true, features = ["engine_rocks"] } raftstore-v2 = { workspace = true } -range_cache_memory_engine = { workspace = true } resolved_ts = { workspace = true } resource_control = { workspace = true } resource_metering = { workspace = true } @@ -83,8 +71,6 @@ tikv = { workspace = true } tikv_alloc = { workspace = true } 
tikv_util = { workspace = true } tokio = { version = "1.5", features = ["rt-multi-thread"] } -toml = "0.5" -txn_types = { workspace = true } yatp = { workspace = true } [target.'cfg(unix)'.dependencies] diff --git a/components/server/src/common.rs b/components/server/src/common.rs index b3449bfa42f..89769de17d4 100644 --- a/components/server/src/common.rs +++ b/components/server/src/common.rs @@ -23,23 +23,25 @@ use engine_rocks::{ }; use engine_traits::{ data_cf_offset, CachedTablet, CfOptions, CfOptionsExt, FlowControlFactorsExt, KvEngine, - RaftEngine, RangeCacheEngine, StatisticsReporter, TabletRegistry, CF_DEFAULT, DATA_CFS, + RaftEngine, RegionCacheEngine, StatisticsReporter, TabletRegistry, CF_DEFAULT, DATA_CFS, }; use error_code::ErrorCodeExt; use file_system::{get_io_rate_limiter, set_io_rate_limiter, BytesFetcher, File, IoBudgetAdjustor}; use grpcio::Environment; use hybrid_engine::HybridEngine; +use in_memory_engine::{ + flush_in_memory_engine_statistics, InMemoryEngineContext, InMemoryEngineStatistics, + RegionCacheMemoryEngine, +}; use pd_client::{PdClient, RpcClient}; use raft_log_engine::RaftLogEngine; -use raftstore::coprocessor::RegionInfoProvider; -use range_cache_memory_engine::{ - flush_range_cache_engine_statistics, RangeCacheEngineContext, RangeCacheMemoryEngine, - RangeCacheMemoryEngineStatistics, -}; +use raftstore::{coprocessor::RegionInfoProvider, store::CasualRouter}; use security::SecurityManager; use tikv::{ config::{ConfigController, DbConfigManger, DbType, TikvConfig}, - server::{status_server::StatusServer, DEFAULT_CLUSTER_ID}, + server::{ + gc_worker::compaction_filter::GC_CONTEXT, status_server::StatusServer, DEFAULT_CLUSTER_ID, + }, }; use tikv_util::{ config::{ensure_dir_exist, RaftDataStateMachine}, @@ -220,8 +222,9 @@ impl TikvServerCore { } } - let disk_stats = fs2::statvfs(&self.config.storage.data_dir).unwrap(); - let mut capacity = disk_stats.total_space(); + let (disk_cap, disk_avail) = + disk::get_disk_space_stats(&self.config.storage.data_dir).unwrap(); + let mut capacity = disk_cap; if self.config.raft_store.capacity.0 > 0 { capacity = cmp::min(capacity, self.config.raft_store.capacity.0); } @@ -229,11 +232,7 @@ impl TikvServerCore { let kv_reserved_size = calculate_reserved_space(capacity, self.config.storage.reserve_space.0); disk::set_disk_reserved_space(kv_reserved_size); - reserve_physical_space( - &self.config.storage.data_dir, - disk_stats.available_space(), - kv_reserved_size, - ); + reserve_physical_space(&self.config.storage.data_dir, disk_avail, kv_reserved_size); let raft_data_dir = if self.config.raft_engine.enable { self.config.raft_engine.config().dir @@ -244,18 +243,13 @@ impl TikvServerCore { let separated_raft_mount_path = path_in_diff_mount_point(&self.config.storage.data_dir, &raft_data_dir); if separated_raft_mount_path { - let raft_disk_stats = fs2::statvfs(&raft_data_dir).unwrap(); + let (raft_disk_cap, raft_disk_avail) = + disk::get_disk_space_stats(&raft_data_dir).unwrap(); // reserve space for raft engine if raft engine is deployed separately - let raft_reserved_size = calculate_reserved_space( - raft_disk_stats.total_space(), - self.config.storage.reserve_raft_space.0, - ); + let raft_reserved_size = + calculate_reserved_space(raft_disk_cap, self.config.storage.reserve_raft_space.0); disk::set_raft_disk_reserved_space(raft_reserved_size); - reserve_physical_space( - &raft_data_dir, - raft_disk_stats.available_space(), - raft_reserved_size, - ); + reserve_physical_space(&raft_data_dir, raft_disk_avail, 
raft_reserved_size); } } @@ -573,19 +567,19 @@ impl EnginesResourceInfo { } let mut normalized_pending_bytes = 0; - for (i, (pending, limit)) in compaction_pending_bytes + for (i, (pending, evict_threshold)) in compaction_pending_bytes .iter() .zip(soft_pending_compaction_bytes_limit) .enumerate() { - if limit > 0 { + if evict_threshold > 0 { normalized_pending_bytes = cmp::max( normalized_pending_bytes, - (*pending * EnginesResourceInfo::SCALE_FACTOR / limit) as u32, + (*pending * EnginesResourceInfo::SCALE_FACTOR / evict_threshold) as u32, ); let base = self.base_max_compactions[i]; if base > 0 { - let level = *pending as f32 / limit as f32; + let level = *pending as f32 / evict_threshold as f32; // 50% -> 1, 70% -> 2, 85% -> 3, 95% -> 6, 98% -> 1024. let delta1 = if level > 0.98 { 1024 @@ -619,7 +613,7 @@ impl EnginesResourceInfo { "cf" => cf, "n" => base + delta, "pending_bytes" => *pending, - "soft_limit" => limit, + "evict_threshold" => evict_threshold, "level0_ratio" => level0_ratio[i], ); } @@ -703,48 +697,38 @@ impl Stop for LazyWorker { } } -pub trait KvEngineBuilder: KvEngine { - fn build( - range_cache_engine_context: RangeCacheEngineContext, - disk_engine: RocksEngine, - pd_client: Option>, - region_info_provider: Option>, - ) -> Self; -} - -impl KvEngineBuilder for RocksEngine { - fn build( - _: RangeCacheEngineContext, - disk_engine: RocksEngine, - _pd_client: Option>, - _region_info_provider: Option>, - ) -> Self { - disk_engine +pub fn build_hybrid_engine( + region_cache_engine_context: InMemoryEngineContext, + disk_engine: RocksEngine, + pd_client: Option>, + region_info_provider: Option>, + casual_router: Box>, +) -> HybridEngine { + // todo(SpadeA): add config for it + let mut memory_engine = RegionCacheMemoryEngine::with_region_info_provider( + region_cache_engine_context.clone(), + region_info_provider, + Some(casual_router), + ); + memory_engine.set_disk_engine(disk_engine.clone()); + if let Some(pd_client) = pd_client.as_ref() { + memory_engine.start_hint_service( + ::RangeHintService::from( + pd_client.clone(), + ), + ) } -} -impl KvEngineBuilder for HybridEngine { - fn build( - range_cache_engine_context: RangeCacheEngineContext, - disk_engine: RocksEngine, - pd_client: Option>, - region_info_provider: Option>, - ) -> Self { - // todo(SpadeA): add config for it - let mut memory_engine = RangeCacheMemoryEngine::with_region_info_provider( - range_cache_engine_context, - region_info_provider, - ); - memory_engine.set_disk_engine(disk_engine.clone()); - if let Some(pd_client) = pd_client.as_ref() { - memory_engine.start_hint_service( - ::RangeHintService::from( - pd_client.clone(), - ), - ) - } - HybridEngine::new(disk_engine, memory_engine) - } + memory_engine.start_cross_check( + disk_engine.clone(), + region_cache_engine_context.pd_client(), + Box::new(|| { + let ctx = GC_CONTEXT.lock().unwrap(); + ctx.as_ref().map(|ctx| ctx.safe_point()) + }), + ); + + HybridEngine::new(disk_engine, memory_engine) } pub trait ConfiguredRaftEngine: RaftEngine { @@ -865,7 +849,7 @@ const DEFAULT_ENGINE_METRICS_RESET_INTERVAL: Duration = Duration::from_millis(60 pub struct EngineMetricsManager { tablet_registry: TabletRegistry, kv_statistics: Option>, - range_cache_engine_statistics: Option>, + in_memory_engine_statistics: Option>, kv_is_titan: bool, raft_engine: ER, raft_statistics: Option>, @@ -876,7 +860,7 @@ impl EngineMetricsManager { pub fn new( tablet_registry: TabletRegistry, kv_statistics: Option>, - range_cache_engine_statistics: Option>, + in_memory_engine_statistics: 
Option>, kv_is_titan: bool, raft_engine: ER, raft_statistics: Option>, ) -> Self { EngineMetricsManager { tablet_registry, kv_statistics, - range_cache_engine_statistics, + in_memory_engine_statistics, kv_is_titan, raft_engine, raft_statistics, @@ -910,8 +894,8 @@ impl EngineMetricsManager { if let Some(s) = self.raft_statistics.as_ref() { flush_engine_statistics(s, "raft", false); } - if let Some(s) = self.range_cache_engine_statistics.as_ref() { - flush_range_cache_engine_statistics(s); + if let Some(s) = self.in_memory_engine_statistics.as_ref() { + flush_in_memory_engine_statistics(s); } if now.saturating_duration_since(self.last_reset) >= DEFAULT_ENGINE_METRICS_RESET_INTERVAL { if let Some(s) = self.kv_statistics.as_ref() { @@ -924,3 +908,340 @@ impl EngineMetricsManager { } } } + +fn calculate_disk_usage(a: disk::DiskUsage, b: disk::DiskUsage) -> disk::DiskUsage { + match (a, b) { + (disk::DiskUsage::AlreadyFull, _) => disk::DiskUsage::AlreadyFull, + (_, disk::DiskUsage::AlreadyFull) => disk::DiskUsage::AlreadyFull, + (disk::DiskUsage::AlmostFull, _) => disk::DiskUsage::AlmostFull, + (_, disk::DiskUsage::AlmostFull) => disk::DiskUsage::AlmostFull, + (disk::DiskUsage::Normal, disk::DiskUsage::Normal) => disk::DiskUsage::Normal, + } +} + +/// A checker to inspect the disk usage of kv engine and raft engine. +/// There is no background task: the caller should call `inspect` periodically +/// to refresh the disk usage status. +#[derive(Clone)] +pub struct DiskUsageChecker { + /// The path of kv engine. + kvdb_path: String, + /// The path of raft engine. + raft_path: String, + /// The path of auxiliary directory of raft engine if specified. + raft_auxiliary_path: Option, + /// Whether the main directory of raft engine is separated from kv engine. + separated_raft_mount_path: bool, + /// Whether the auxiliary directory of raft engine is separated from kv + /// engine. + separated_raft_auxiliary_mount_path: bool, + /// Whether the auxiliary directory of raft engine is separated from both + /// the main directory of raft engine and the kv engine. + separated_raft_auxiliary_and_kvdb_mount_path: bool, + /// The threshold of disk usage of kv engine to trigger the almost full + /// status. + kvdb_almost_full_thd: u64, + /// The threshold of disk usage of raft engine to trigger the almost full + /// status. + raft_almost_full_thd: u64, + /// The specified disk capacity for the whole disk. + config_disk_capacity: u64, +} + +impl DiskUsageChecker { + pub fn new( + kvdb_path: String, + raft_path: String, + raft_auxiliary_path: Option, + separated_raft_mount_path: bool, + separated_raft_auxiliary_mount_path: bool, + separated_raft_auxiliary_and_kvdb_mount_path: bool, + kvdb_almost_full_thd: u64, + raft_almost_full_thd: u64, + config_disk_capacity: u64, + ) -> Self { + DiskUsageChecker { + kvdb_path, + raft_path, + raft_auxiliary_path, + separated_raft_mount_path, + separated_raft_auxiliary_mount_path, + separated_raft_auxiliary_and_kvdb_mount_path, + kvdb_almost_full_thd, + raft_almost_full_thd, + config_disk_capacity, + } + } + + /// Inspect the disk usage of kv engine and raft engine. + /// The `kvdb_used_size` is the used size of kv engine, and the + /// `raft_used_size` is the used size of raft engine. + /// + /// Returns the disk usage status of the whole disk, the kv engine, and the + /// raft engine, along with the whole disk capacity and available size.
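Ahead of the `inspect` implementation that follows, a usage sketch (illustrative values; assumes this module's imports): a periodic task feeds current engine sizes into `inspect` and fans the returned tuple out.

// Illustrative only: how a periodic task might consume `inspect`.
fn report_disk_usage(checker: &DiskUsageChecker, kv_used: u64, raft_used: u64) {
    let (disk_status, kv_status, raft_status, capacity, available) =
        checker.inspect(kv_used, raft_used);
    // The overall status is the worst of the kv and raft statuses (see
    // `calculate_disk_usage` above); capacity/available feed store heartbeats.
    println!(
        "disk={:?} kv={:?} raft={:?} cap={} avail={}",
        disk_status, kv_status, raft_status, capacity, available
    );
}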
+ pub fn inspect( + &self, + kvdb_used_size: u64, + raft_used_size: u64, + ) -> ( + disk::DiskUsage, // whole disk status + disk::DiskUsage, // kvdb disk status + disk::DiskUsage, // raft disk status + u64, // whole capacity + u64, // whole available + ) { + // By default, the already-full threshold is half of the configured + // almost-full threshold. + let kvdb_already_full_thd = self.kvdb_almost_full_thd / 2; + let raft_already_full_thd = self.raft_almost_full_thd / 2; + // Check the disk space of raft engine. + let raft_disk_status = { + if !self.separated_raft_mount_path || self.raft_almost_full_thd == 0 { + disk::DiskUsage::Normal + } else { + let (raft_disk_cap, raft_disk_avail) = match disk::get_disk_space_stats( + &self.raft_path, + ) { + Err(e) => { + error!( + "get disk stat for raft engine failed"; + "raft_engine_path" => &self.raft_path, + "err" => ?e + ); + return ( + disk::DiskUsage::Normal, + disk::DiskUsage::Normal, + disk::DiskUsage::Normal, + 0, + 0, + ); + } + Ok((cap, avail)) => { + if !self.separated_raft_auxiliary_mount_path { + // If the auxiliary directory of raft engine is not separated from + // kv engine, return u64::MAX to indicate that the disk space of + // the raft engine should not be checked. + (std::u64::MAX, std::u64::MAX) + } else if self.separated_raft_auxiliary_and_kvdb_mount_path { + // If the auxiliary directory of raft engine is separated from both + // the kv engine and the main directory of raft engine, the disk + // space of the auxiliary directory should be checked. + assert!(self.raft_auxiliary_path.is_some()); + let (auxiliary_disk_cap, auxiliary_disk_avail) = + match disk::get_disk_space_stats( + self.raft_auxiliary_path.as_ref().unwrap(), + ) { + Err(e) => { + error!( + "get auxiliary disk stat for raft engine failed"; + "raft_engine_path" => self.raft_auxiliary_path.as_ref().unwrap(), + "err" => ?e + ); + (0_u64, 0_u64) + } + Ok((total, avail)) => (total, avail), + }; + (cap + auxiliary_disk_cap, avail + auxiliary_disk_avail) + } else { + (cap, avail) + } + } + }; + let raft_disk_available = cmp::min( + raft_disk_cap + .checked_sub(raft_used_size) + .unwrap_or_default(), + raft_disk_avail, + ); + if raft_disk_available <= raft_already_full_thd { + disk::DiskUsage::AlreadyFull + } else if raft_disk_available <= self.raft_almost_full_thd { + disk::DiskUsage::AlmostFull + } else { + disk::DiskUsage::Normal + } + } + }; + // Check the disk space of kv engine.
+ let (disk_cap, disk_avail) = match disk::get_disk_space_stats(&self.kvdb_path) { + Err(e) => { + error!( + "get disk stat for kv store failed"; + "kv_path" => &self.kvdb_path, + "err" => ?e + ); + return ( + disk::DiskUsage::Normal, + disk::DiskUsage::Normal, + disk::DiskUsage::Normal, + 0, + 0, + ); + } + Ok((total, avail)) => (total, avail), + }; + let capacity = if self.config_disk_capacity == 0 || disk_cap < self.config_disk_capacity { + disk_cap + } else { + self.config_disk_capacity + }; + let available = cmp::min( + capacity.checked_sub(kvdb_used_size).unwrap_or_default(), + disk_avail, + ); + let cur_kv_disk_status = if available <= kvdb_already_full_thd { + disk::DiskUsage::AlreadyFull + } else if available <= self.kvdb_almost_full_thd { + disk::DiskUsage::AlmostFull + } else { + disk::DiskUsage::Normal + }; + let cur_disk_status = calculate_disk_usage(raft_disk_status, cur_kv_disk_status); + ( + cur_disk_status, + cur_kv_disk_status, + raft_disk_status, + capacity, + available, + ) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_disk_usage_checker() { + let kvdb_path = "/tmp/tikv-kvdb".to_owned(); + let raft_path = "/tmp/tikv-raft".to_owned(); + let raft_spill_path = "/tmp/tikv-raft/spill".to_owned(); + + // Case 1: mock the kvdb and raft engine are not separated. + fail::cfg("mock_disk_space_stats", "return(10000,5000)").unwrap(); + let disk_usage_checker = DiskUsageChecker::new( + kvdb_path.clone(), + raft_path.clone(), + Some(raft_spill_path.clone()), + false, + true, + false, + 100, + 100, + 1000, + ); + let (disk_status, kvdb_status, raft_status, ..) = disk_usage_checker.inspect(4000, 1000); + assert_eq!(disk_status, disk::DiskUsage::AlreadyFull); + assert_eq!(kvdb_status, disk::DiskUsage::AlreadyFull); + assert_eq!(raft_status, disk::DiskUsage::Normal); + + let disk_usage_checker = DiskUsageChecker::new( + kvdb_path.clone(), + raft_path.clone(), + Some(raft_spill_path.clone()), + false, + true, + false, + 100, + 100, + 4100, + ); + let (disk_status, kvdb_status, raft_status, ..) = disk_usage_checker.inspect(4000, 1000); + assert_eq!(raft_status, disk::DiskUsage::Normal); + assert_eq!(kvdb_status, disk::DiskUsage::AlmostFull); + assert_eq!(disk_status, disk::DiskUsage::AlmostFull); + let (disk_status, kvdb_status, raft_status, ..) = disk_usage_checker.inspect(3999, 1000); + assert_eq!(raft_status, disk::DiskUsage::Normal); + assert_eq!(kvdb_status, disk::DiskUsage::Normal); + assert_eq!(disk_status, disk::DiskUsage::Normal); + fail::remove("mock_disk_space_stats"); + + // Case 2: mock the kvdb and raft engine are separated. + fail::cfg( + "mock_disk_space_stats", + "1*return(500,200)->1*return(5000,2000)->1*return(500,200)->1*return(5000,2000)->1*return(500,200)->1*return(5000,2000)", + ) + .unwrap(); + let disk_usage_checker = DiskUsageChecker::new( + kvdb_path.clone(), + raft_path.clone(), + Some(raft_spill_path.clone()), + true, + true, + false, + 100, + 100, + 6000, + ); + let (disk_status, kvdb_status, raft_status, ..) = disk_usage_checker.inspect(4000, 450); + assert_eq!(raft_status, disk::DiskUsage::AlreadyFull); + assert_eq!(kvdb_status, disk::DiskUsage::Normal); + assert_eq!(disk_status, disk::DiskUsage::AlreadyFull); + let (disk_status, kvdb_status, raft_status, ..) = disk_usage_checker.inspect(4000, 400); + assert_eq!(raft_status, disk::DiskUsage::AlmostFull); + assert_eq!(kvdb_status, disk::DiskUsage::Normal); + assert_eq!(disk_status, disk::DiskUsage::AlmostFull); + let (disk_status, kvdb_status, raft_status, ..) 
= disk_usage_checker.inspect(4000, 399); + assert_eq!(raft_status, disk::DiskUsage::Normal); + assert_eq!(kvdb_status, disk::DiskUsage::Normal); + assert_eq!(disk_status, disk::DiskUsage::Normal); + fail::remove("mock_disk_space_stats"); + + fail::cfg( + "mock_disk_space_stats", + "1*return(500,200)->1*return(5000,2000)->1*return(500,200)->1*return(5000,2000)->1*return(500,200)->1*return(5000,2000)", + ) + .unwrap(); + let disk_usage_checker = DiskUsageChecker::new( + kvdb_path.clone(), + raft_path.clone(), + Some(raft_spill_path.clone()), + true, + false, + false, + 100, + 100, + 6000, + ); + let (disk_status, kvdb_status, raft_status, ..) = disk_usage_checker.inspect(4000, 450); + assert_eq!(raft_status, disk::DiskUsage::Normal); + assert_eq!(kvdb_status, disk::DiskUsage::Normal); + assert_eq!(disk_status, disk::DiskUsage::Normal); + let (disk_status, kvdb_status, raft_status, ..) = disk_usage_checker.inspect(4000, 500); + assert_eq!(raft_status, disk::DiskUsage::Normal); + assert_eq!(kvdb_status, disk::DiskUsage::Normal); + assert_eq!(disk_status, disk::DiskUsage::Normal); + let (disk_status, kvdb_status, raft_status, ..) = disk_usage_checker.inspect(4900, 500); + assert_eq!(raft_status, disk::DiskUsage::Normal); + assert_eq!(kvdb_status, disk::DiskUsage::AlmostFull); + assert_eq!(disk_status, disk::DiskUsage::AlmostFull); + fail::remove("mock_disk_space_stats"); + + // Case 3: mock the kvdb and raft engine are separated and the auxiliary + // directory of raft engine is separated from the main directory of + // raft. + fail::cfg( + "mock_disk_space_stats", + "1*return(500,200)->1*return(100,20)->1*return(5000,2000)", + ) + .unwrap(); + let disk_usage_checker = DiskUsageChecker::new( + kvdb_path.clone(), + raft_path.clone(), + Some(raft_spill_path.clone()), + true, + true, + true, + 100, + 100, + 6000, + ); + let (disk_status, kvdb_status, raft_status, ..) = disk_usage_checker.inspect(4000, 450); + assert_eq!(raft_status, disk::DiskUsage::Normal); + assert_eq!(kvdb_status, disk::DiskUsage::Normal); + assert_eq!(disk_status, disk::DiskUsage::Normal); + fail::remove("mock_disk_space_stats"); + } +} diff --git a/components/server/src/lib.rs b/components/server/src/lib.rs index 144cc1885d5..845dda6176d 100644 --- a/components/server/src/lib.rs +++ b/components/server/src/lib.rs @@ -15,3 +15,4 @@ pub mod raft_engine_switch; pub mod server; pub mod server2; pub mod signal_handler; +mod utils; diff --git a/components/server/src/server.rs b/components/server/src/server.rs index 8712d38338f..a083ce5769a 100644 --- a/components/server/src/server.rs +++ b/components/server/src/server.rs @@ -12,7 +12,6 @@ //! explicitly stopped. We keep these components in the `TikvServer` struct. 
use std::{ - cmp, collections::HashMap, convert::TryFrom, path::{Path, PathBuf}, @@ -35,14 +34,21 @@ use engine_rocks::{ }; use engine_rocks_helper::sst_recovery::{RecoveryRunner, DEFAULT_CHECK_INTERVAL}; use engine_traits::{ - Engines, KvEngine, RaftEngine, SingletonFactory, TabletContext, TabletRegistry, CF_DEFAULT, - CF_WRITE, + Engines, KvEngine, MiscExt, RaftEngine, SingletonFactory, TabletContext, TabletRegistry, + CF_DEFAULT, CF_WRITE, }; use file_system::{get_io_rate_limiter, BytesFetcher, MetricsManager as IoMetricsManager}; use futures::executor::block_on; use grpcio::{EnvBuilder, Environment}; use health_controller::HealthController; -use hybrid_engine::{observer::Observer as HybridEngineObserver, HybridEngine}; +use hybrid_engine::observer::{ + HybridSnapshotObserver, LoadEvictionObserver as HybridEngineLoadEvictionObserver, + RegionCacheWriteBatchObserver, +}; +use in_memory_engine::{ + config::InMemoryEngineConfigManager, InMemoryEngineContext, InMemoryEngineStatistics, + RegionCacheMemoryEngine, +}; use kvproto::{ brpb::create_backup, cdcpb::create_change_data, deadlock::create_deadlock, debugpb::create_debug, diagnosticspb::create_diagnostics, import_sstpb::create_import_sst, @@ -73,12 +79,8 @@ use raftstore::{ }, RaftRouterCompactedEventSender, }; -use range_cache_memory_engine::{ - config::RangeCacheConfigManager, RangeCacheEngineContext, RangeCacheMemoryEngine, - RangeCacheMemoryEngineStatistics, -}; use resolved_ts::{LeadershipResolver, Task}; -use resource_control::ResourceGroupManager; +use resource_control::{config::ResourceContrlCfgMgr, ResourceGroupManager}; use security::SecurityManager; use service::{service_event::ServiceEvent, service_manager::GrpcServiceManager}; use snap_recovery::RecoveryService; @@ -112,7 +114,10 @@ use tikv::{ config_manager::StorageConfigManger, kv::LocalTablets, mvcc::MvccConsistencyCheckObserver, - txn::flow_controller::{EngineFlowController, FlowController}, + txn::{ + flow_controller::{EngineFlowController, FlowController}, + txn_status_cache::TxnStatusCache, + }, Engine, Storage, }, }; @@ -134,26 +139,26 @@ use tokio::runtime::Builder; use crate::{ common::{ - ConfiguredRaftEngine, EngineMetricsManager, EnginesResourceInfo, KvEngineBuilder, - TikvServerCore, + build_hybrid_engine, ConfiguredRaftEngine, DiskUsageChecker, EngineMetricsManager, + EnginesResourceInfo, TikvServerCore, }, memory::*, setup::*, signal_handler, tikv_util::sys::thread::ThreadBuildWrapper, + utils, }; #[inline] -fn run_impl( +fn run_impl( config: TikvConfig, service_event_tx: TikvMpsc::Sender, service_event_rx: TikvMpsc::Receiver, ) where - EK: KvEngine + KvEngineBuilder, CER: ConfiguredRaftEngine, F: KvFormat, { - let mut tikv = TikvServer::::init(config, service_event_tx.clone()); + let mut tikv = TikvServer::::init(config, service_event_tx.clone()); // Must be called after `TikvServer::init`. 
let memory_limit = tikv.core.config.memory_usage_limit.unwrap().0; let high_water = (tikv.core.config.memory_usage_high_water * memory_limit as f64) as u64; @@ -165,7 +170,7 @@ fn run_impl( tikv.core.init_encryption(); let fetcher = tikv.core.init_io_utility(); let listener = tikv.core.init_flow_receiver(); - let (engines, engines_info) = tikv.init_raw_engines(listener); + let (engines, engines_info, in_memory_engine) = tikv.init_raw_engines(listener); tikv.init_engines(engines.clone()); let server_config = tikv.init_servers(); tikv.register_services(); @@ -173,7 +178,7 @@ fn run_impl( tikv.init_cgroup_monitor(); tikv.init_storage_stats_task(engines); tikv.run_server(server_config); - tikv.run_status_server(); + tikv.run_status_server(in_memory_engine); tikv.core.init_quota_tuning_task(tikv.quota_limiter.clone()); // Build a background worker for handling signals. @@ -226,33 +231,9 @@ pub fn run_tikv( dispatch_api_version!(config.storage.api_version(), { if !config.raft_engine.enable { - if cfg!(feature = "memory-engine") && config.range_cache_engine.enabled { - run_impl::, RocksEngine, API>( - config, - service_event_tx, - service_event_rx, - ) - } else { - run_impl::( - config, - service_event_tx, - service_event_rx, - ) - } + run_impl::(config, service_event_tx, service_event_rx) } else { - if cfg!(feature = "memory-engine") && config.range_cache_engine.enabled { - run_impl::, RaftLogEngine, API>( - config, - service_event_tx, - service_event_rx, - ) - } else { - run_impl::( - config, - service_event_tx, - service_event_rx, - ) - } + run_impl::(config, service_event_tx, service_event_rx) } }) } @@ -263,9 +244,8 @@ const DEFAULT_STORAGE_STATS_INTERVAL: Duration = Duration::from_secs(1); const DEFAULT_CGROUP_MONITOR_INTERVAL: Duration = Duration::from_secs(10); /// A complete TiKV server. -struct TikvServer +struct TikvServer where - EK: KvEngine, ER: RaftEngine, F: KvFormat, { @@ -273,17 +253,17 @@ where cfg_controller: Option, security_mgr: Arc, pd_client: Arc, - router: RaftRouter, - system: Option>, + router: RaftRouter, + system: Option>, resolver: Option, snap_mgr: Option, // Will be filled in `init_servers`. 
- engines: Option>, + engines: Option>, kv_statistics: Option>, - range_cache_engine_statistics: Option>, + in_memory_engine_statistics: Option>, raft_statistics: Option>, - servers: Option>, - region_info_accessor: RegionInfoAccessor, - coprocessor_host: Option>, + servers: Option>, + region_info_accessor: Option, + coprocessor_host: Option>, concurrency_manager: ConcurrencyManager, env: Arc, check_leader_worker: Worker, @@ -298,34 +278,38 @@ where snap_br_rejector: Option>, } -struct TikvEngines { - engines: Engines, +struct TikvEngines { + engines: Engines, store_meta: Arc>, - engine: RaftKv>, + engine: RaftKv>, } -struct Servers { +struct Servers { lock_mgr: LockManager, - server: LocalServer, - raft_server: MultiRaftServer, - importer: Arc>, + server: LocalServer, + raft_server: MultiRaftServer, + importer: Arc>, cdc_scheduler: tikv_util::worker::Scheduler, cdc_memory_quota: Arc, rsmeter_pubsub_service: resource_metering::PubSubService, backup_stream_scheduler: Option>, - debugger: DebuggerImpl>, LockManager, F>, + debugger: DebuggerImpl< + ER, + RaftKv>, + LockManager, + F, + >, } type LocalServer = Server>; type LocalRaftKv = RaftKv>; -impl TikvServer +impl TikvServer where - EK: KvEngine, ER: RaftEngine, F: KvFormat, { - fn init(mut config: TikvConfig, tx: TikvMpsc::Sender) -> TikvServer { + fn init(mut config: TikvConfig, tx: TikvMpsc::Sender) -> TikvServer { tikv_util::thread_group::set_properties(Some(GroupProperties::default())); // It is okay use pd config and security config before `init_config`, // because these configs must be provided by command line, and only @@ -393,7 +377,7 @@ where .create(); let resource_manager = if config.resource_control.enabled { - let mgr = Arc::new(ResourceGroupManager::default()); + let mgr = Arc::new(ResourceGroupManager::new(config.resource_control.clone())); let io_bandwidth = config.storage.io_rate_limit.max_bytes_per_sec.0; resource_control::start_periodic_tasks( &mgr, @@ -409,30 +393,11 @@ where // Initialize raftstore channels. let (router, system) = fsm::create_raft_batch_system(&config.raft_store, &resource_manager); - let mut coprocessor_host = Some(CoprocessorHost::new( + let coprocessor_host = Some(CoprocessorHost::new( router.clone(), config.coprocessor.clone(), )); - // Region stats manager collects region heartbeat for use by in-memory engine. 
- let region_stats_manager_enabled_cb: Arc bool + Send + Sync> = - if cfg!(feature = "memory-engine") { - let cfg_controller_clone = cfg_controller.clone(); - Arc::new(move || { - cfg_controller_clone - .get_current() - .range_cache_engine - .enabled - }) - } else { - Arc::new(|| false) - }; - - let region_info_accessor = RegionInfoAccessor::new( - coprocessor_host.as_mut().unwrap(), - region_stats_manager_enabled_cb, - ); - // Initialize concurrency manager let latest_ts = block_on(pd_client.get_tso()).expect("failed to get timestamp from PD"); let concurrency_manager = ConcurrencyManager::new(latest_ts); @@ -489,10 +454,10 @@ where snap_mgr: None, engines: None, kv_statistics: None, - range_cache_engine_statistics: None, + in_memory_engine_statistics: None, raft_statistics: None, servers: None, - region_info_accessor, + region_info_accessor: None, coprocessor_host, concurrency_manager, env, @@ -509,7 +474,7 @@ where } } - fn init_engines(&mut self, engines: Engines) { + fn init_engines(&mut self, engines: Engines) { let store_meta = Arc::new(Mutex::new(StoreMeta::new(PENDING_MSG_CAP))); let engine = RaftKv::new( ServerRaftStoreRouter::new( @@ -518,10 +483,11 @@ where engines.kv.clone(), StoreMetaDelegate::new(store_meta.clone(), engines.kv.clone()), self.router.clone(), + self.coprocessor_host.as_ref().unwrap().clone(), ), ), engines.kv.clone(), - self.region_info_accessor.region_leaders(), + self.region_info_accessor.as_ref().unwrap().region_leaders(), ); self.engines = Some(TikvEngines { @@ -531,14 +497,16 @@ where }); } - fn init_gc_worker(&mut self) -> GcWorker>> { + fn init_gc_worker( + &mut self, + ) -> GcWorker>> { let engines = self.engines.as_ref().unwrap(); let gc_worker = GcWorker::new( engines.engine.clone(), self.core.flow_info_sender.take().unwrap(), self.core.config.gc.clone(), self.pd_client.feature_gate().clone(), - Arc::new(self.region_info_accessor.clone()), + Arc::new(self.region_info_accessor.clone().unwrap()), ); let cfg_controller = self.cfg_controller.as_mut().unwrap(); @@ -597,7 +565,7 @@ where if let Some(sst_worker) = &mut self.sst_worker { let sst_runner = RecoveryRunner::new( - engines.engines.kv.get_disk_engine().clone(), + engines.engines.kv.clone(), engines.store_meta.clone(), self.core .config @@ -683,6 +651,13 @@ where Box::new(cfg_manager), ); + if let Some(resource_ctl) = &self.resource_manager { + cfg_controller.register( + tikv::config::Module::ResourceControl, + Box::new(ResourceContrlCfgMgr::new(resource_ctl.get_config().clone())), + ); + } + let storage_read_pool_handle = if self.core.config.readpool.storage.use_unified_pool() { unified_read_pool.as_ref().unwrap().handle() } else { @@ -693,6 +668,9 @@ where )); storage_read_pools.handle() }; + let txn_status_cache = Arc::new(TxnStatusCache::new( + self.core.config.storage.txn_status_cache_capacity, + )); let storage = Storage::<_, _, F>::from_engine( engines.engine.clone(), @@ -711,6 +689,7 @@ where .as_ref() .map(|m| m.derive_controller("scheduler-worker-pool".to_owned(), true)), self.resource_manager.clone(), + txn_status_cache.clone(), ) .unwrap_or_else(|e| fatal!("failed to create raft storage: {}", e)); cfg_controller.register( @@ -733,12 +712,6 @@ where ReplicaReadLockChecker::new(self.concurrency_manager.clone()) .register(self.coprocessor_host.as_mut().unwrap()); - // Hybrid engine observer. 
- if self.core.config.range_cache_engine.enabled { - let observer = HybridEngineObserver::new(Arc::new(engines.engines.kv.clone())); - observer.register_to(self.coprocessor_host.as_mut().unwrap()); - } - // Create snapshot manager, server. let snap_path = self .core @@ -765,6 +738,7 @@ where .enable_receive_tablet_snapshot( self.core.config.raft_store.enable_v2_compatible_learner, ) + .min_ingest_snapshot_limit(self.core.config.server.snap_min_ingest_size) .build(snap_path); // Create coprocessor endpoint. @@ -939,6 +913,12 @@ where Duration::from_secs(60), ); + // build stream backup encryption manager + let backup_encryption_manager = + utils::build_backup_encryption_manager(self.core.encryption_key_manager.clone()) + .expect("failed to build backup encryption manager in server"); + + // build stream backup endpoint let backup_stream_endpoint = backup_stream::Endpoint::new( raft_server.id(), PdStore::new(Checked::new(Sourced::new( @@ -949,12 +929,13 @@ where self.core.config.resolved_ts.clone(), backup_stream_scheduler.clone(), backup_stream_ob, - self.region_info_accessor.clone(), + self.region_info_accessor.clone().unwrap(), CdcRaftRouter(self.router.clone()), self.pd_client.clone(), self.concurrency_manager.clone(), BackupStreamResolver::V1(leadership_resolver), - self.core.encryption_key_manager.clone(), + backup_encryption_manager, + txn_status_cache.clone(), ); backup_stream_worker.start(backup_stream_endpoint); self.core.to_stop.push(backup_stream_worker); @@ -1063,11 +1044,14 @@ where assert!(raft_server.id() > 0); // MultiRaftServer id should never be 0. let auto_gc_config = AutoGcConfig::new( self.pd_client.clone(), - self.region_info_accessor.clone(), + self.region_info_accessor.clone().unwrap(), raft_server.id(), ); gc_worker - .start(raft_server.id()) + .start( + raft_server.id(), + self.coprocessor_host.as_ref().cloned().unwrap(), + ) .unwrap_or_else(|e| fatal!("failed to start gc worker: {}", e)); if let Err(e) = gc_worker.start_auto_gc(auto_gc_config, safe_point) { fatal!("failed to start auto_gc on storage, error: {}", e); @@ -1077,7 +1061,7 @@ where if self.core.config.storage.enable_ttl { ttl_checker.start_with_timer(TtlChecker::new( self.engines.as_ref().unwrap().engine.kv_engine().unwrap(), - self.region_info_accessor.clone(), + self.region_info_accessor.clone().unwrap(), self.core.config.storage.ttl_check_poll_interval.into(), )); self.core.to_stop.push(ttl_checker); @@ -1119,6 +1103,7 @@ where self.concurrency_manager.clone(), server.env(), self.security_mgr.clone(), + storage.get_scheduler().get_txn_status_cache(), ); self.resolved_ts_scheduler = Some(rts_worker.scheduler()); rts_worker.start_with_timer(rts_endpoint); @@ -1135,10 +1120,7 @@ where // Create Debugger. 
let mut debugger = DebuggerImpl::new( - Engines::new( - engines.engines.kv.get_disk_engine().clone(), - engines.engines.raft.clone(), - ), + Engines::new(engines.engines.kv.clone(), engines.engines.raft.clone()), self.cfg_controller.as_ref().unwrap().clone(), Some(storage), ); @@ -1173,7 +1155,7 @@ where servers.importer.clone(), None, self.resource_manager.clone(), - Arc::new(self.region_info_accessor.clone()), + Arc::new(self.region_info_accessor.clone().unwrap()), ); let import_cfg_mgr = import_service.get_config_manager(); @@ -1262,7 +1244,7 @@ where let backup_endpoint = backup::Endpoint::new( servers.raft_server.id(), engines.engine.clone(), - self.region_info_accessor.clone(), + self.region_info_accessor.clone().unwrap(), LocalTablets::Singleton(engines.engines.kv.clone()), self.core.config.backup.clone(), self.concurrency_manager.clone(), @@ -1312,7 +1294,7 @@ where } if let Some(sched) = servers.backup_stream_scheduler.take() { - let pitr_service = backup_stream::Service::new(sched); + let pitr_service = backup_stream::BackupStreamGrpcService::new(sched); if servers .server .register_service(create_log_backup(pitr_service)) @@ -1345,7 +1327,7 @@ where let mut engine_metrics = EngineMetricsManager::::new( self.tablet_registry.clone().unwrap(), self.kv_statistics.clone(), - self.range_cache_engine_statistics.clone(), + self.in_memory_engine_statistics.clone(), self.core.config.rocksdb.titan.enabled.map_or(false, |v| v), self.engines.as_ref().unwrap().engines.raft.clone(), self.raft_statistics.clone(), @@ -1384,7 +1366,7 @@ where ); } - fn init_storage_stats_task(&self, engines: Engines) { + fn init_storage_stats_task(&self, engines: Engines) { let config_disk_capacity: u64 = self.core.config.raft_store.capacity.0; let data_dir = self.core.config.storage.data_dir.clone(); let store_path = self.core.store_path.clone(); @@ -1398,77 +1380,53 @@ where let raft_path = engines.raft.get_engine_path().to_string(); let separated_raft_mount_path = path_in_diff_mount_point(raft_path.as_str(), engines.kv.path()); - let raft_almost_full_threshold = reserve_raft_space; - let raft_already_full_threshold = reserve_raft_space / 2; - - let almost_full_threshold = reserve_space; - let already_full_threshold = reserve_space / 2; - fn calculate_disk_usage(a: disk::DiskUsage, b: disk::DiskUsage) -> disk::DiskUsage { - match (a, b) { - (disk::DiskUsage::AlreadyFull, _) => disk::DiskUsage::AlreadyFull, - (_, disk::DiskUsage::AlreadyFull) => disk::DiskUsage::AlreadyFull, - (disk::DiskUsage::AlmostFull, _) => disk::DiskUsage::AlmostFull, - (_, disk::DiskUsage::AlmostFull) => disk::DiskUsage::AlmostFull, - (disk::DiskUsage::Normal, disk::DiskUsage::Normal) => disk::DiskUsage::Normal, - } - } + // If the auxiliary directory of the raft engine is specified, it needs to + // be checked as well; otherwise it can be skipped. As the configuration is + // static, it's safe to determine this only once.
+ let raft_auxiliary_path = if self.core.config.raft_engine.enable { + self.core.config.raft_engine.config().spill_dir.clone() + } else { + None + }; + let (separated_raft_auxiliary_mount_path, separated_raft_auxiliary_with_kvdb) = + raft_auxiliary_path + .as_ref() + .map(|path| { + let separated_with_kvdb = + path_in_diff_mount_point(path.as_str(), engines.kv.path()); + let separated_with_raft = + path_in_diff_mount_point(path.as_str(), raft_path.as_str()); + ( + separated_with_kvdb && separated_with_raft, + separated_with_kvdb, + ) + }) + .unwrap_or((false, false)); + let disk_usage_checker = DiskUsageChecker::new( + store_path.as_path().to_str().unwrap().to_string(), + raft_path, + raft_auxiliary_path, + separated_raft_mount_path, + separated_raft_auxiliary_mount_path, + separated_raft_auxiliary_with_kvdb, + reserve_space, + reserve_raft_space, + config_disk_capacity, + ); self.core.background_worker .spawn_interval_task(DEFAULT_STORAGE_STATS_INTERVAL, move || { - let disk_stats = match fs2::statvfs(&store_path) { - Err(e) => { - error!( - "get disk stat for kv store failed"; - "kv_path" => store_path.to_str(), - "err" => ?e - ); - return; - } - Ok(stats) => stats, - }; - let disk_cap = disk_stats.total_space(); let snap_size = snap_mgr.get_total_snap_size().unwrap(); - let kv_size = engines .kv .get_engine_used_size() .expect("get kv engine size"); - let raft_size = engines .raft .get_engine_size() .expect("get raft engine size"); - - let mut raft_disk_status = disk::DiskUsage::Normal; - if separated_raft_mount_path && reserve_raft_space != 0 { - let raft_disk_stats = match fs2::statvfs(&raft_path) { - Err(e) => { - error!( - "get disk stat for raft engine failed"; - "raft_engine_path" => raft_path.clone(), - "err" => ?e - ); - return; - } - Ok(stats) => stats, - }; - let raft_disk_cap = raft_disk_stats.total_space(); - let mut raft_disk_available = - raft_disk_cap.checked_sub(raft_size).unwrap_or_default(); - raft_disk_available = cmp::min(raft_disk_available, raft_disk_stats.available_space()); - raft_disk_status = if raft_disk_available <= raft_already_full_threshold - { - disk::DiskUsage::AlreadyFull - } else if raft_disk_available <= raft_almost_full_threshold - { - disk::DiskUsage::AlmostFull - } else { - disk::DiskUsage::Normal - }; - } let placeholer_file_path = PathBuf::from_str(&data_dir) .unwrap() .join(Path::new(file_system::SPACE_PLACEHOLDER_FILE)); - let placeholder_size: u64 = file_system::get_file_size(placeholer_file_path).unwrap_or(0); @@ -1477,24 +1435,9 @@ where } else { snap_size + kv_size + placeholder_size }; - let capacity = if config_disk_capacity == 0 || disk_cap < config_disk_capacity { - disk_cap - } else { - config_disk_capacity - }; - - let mut available = capacity.checked_sub(used_size).unwrap_or_default(); - available = cmp::min(available, disk_stats.available_space()); - + // Check the disk usage and update the disk usage status. + let (cur_disk_status, cur_kv_disk_status, raft_disk_status, capacity, available) = disk_usage_checker.inspect(used_size, raft_size); let prev_disk_status = disk::get_disk_status(0); //0 no need care about failpoint.
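The status merging that `inspect` performs via `calculate_disk_usage` is just a maximum over severity; a standalone sketch making that explicit with a derived ordering:

// Standalone illustration: with severity ordered Normal < AlmostFull <
// AlreadyFull, merging two statuses is simply `max`.
#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)]
enum Usage {
    Normal,
    AlmostFull,
    AlreadyFull,
}

fn merge(a: Usage, b: Usage) -> Usage {
    a.max(b)
}

fn main() {
    assert_eq!(merge(Usage::Normal, Usage::AlmostFull), Usage::AlmostFull);
    assert_eq!(merge(Usage::AlreadyFull, Usage::Normal), Usage::AlreadyFull);
    assert_eq!(merge(Usage::Normal, Usage::Normal), Usage::Normal);
}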
- let cur_kv_disk_status = if available <= already_full_threshold { - disk::DiskUsage::AlreadyFull - } else if available <= almost_full_threshold { - disk::DiskUsage::AlmostFull - } else { - disk::DiskUsage::Normal - }; - let cur_disk_status = calculate_disk_usage(raft_disk_status, cur_kv_disk_status); if prev_disk_status != cur_disk_status { warn!( "disk usage {:?}->{:?} (raft engine usage: {:?}, kv engine usage: {:?}), seperated raft mount={}, kv available={}, snap={}, kv={}, raft={}, capacity={}", @@ -1565,7 +1508,7 @@ where .unwrap_or_else(|e| fatal!("failed to start server: {}", e)); } - fn run_status_server(&mut self) { + fn run_status_server(&mut self, in_memory_engine: Option) { // Create a status server. let status_enabled = !self.core.config.server.status_addr.is_empty(); if status_enabled { @@ -1576,6 +1519,7 @@ where self.engines.as_ref().unwrap().engine.raft_extension(), self.resource_manager.clone(), self.grpc_service_mgr.clone(), + in_memory_engine, ) { Ok(status_server) => Box::new(status_server), Err(e) => { @@ -1611,7 +1555,7 @@ where .unwrap_or_else(|e| fatal!("failed to stop server: {}", e)); servers.raft_server.stop(); - self.region_info_accessor.stop(); + self.region_info_accessor.as_ref().unwrap().stop(); servers.lock_mgr.stop(); @@ -1643,16 +1587,20 @@ where } } -impl TikvServer +impl TikvServer where - EK: KvEngine + KvEngineBuilder, + RocksEngine: KvEngine, CER: ConfiguredRaftEngine, F: KvFormat, { fn init_raw_engines( &mut self, flow_listener: engine_rocks::FlowListener, - ) -> (Engines, Arc) { + ) -> ( + Engines, + Arc, + Option, + ) { let block_cache = self.core.config.storage.block_cache.build_shared_cache(); let env = self .core @@ -1672,6 +1620,29 @@ where ); self.raft_statistics = raft_statistics; + // Region stats manager collects region heartbeat for use by in-memory engine. + let region_stats_manager_enabled_cb: Arc bool + Send + Sync> = + if cfg!(feature = "memory-engine") { + let cfg_controller_clone = self.cfg_controller.clone().unwrap(); + Arc::new(move || cfg_controller_clone.get_current().in_memory_engine.enable) + } else { + Arc::new(|| false) + }; + + let in_memory_engine_config = self.core.config.in_memory_engine.clone(); + let in_memory_engine_config = Arc::new(VersionTrack::new(in_memory_engine_config)); + let in_memory_engine_config_clone = in_memory_engine_config.clone(); + let region_info_accessor = RegionInfoAccessor::new( + self.coprocessor_host.as_mut().unwrap(), + region_stats_manager_enabled_cb, + Box::new(move || { + in_memory_engine_config_clone + .value() + .mvcc_amplification_threshold + }), + ); + self.region_info_accessor = Some(region_info_accessor); + // Create kv engine. 
let builder = KvEngineFactoryBuilder::new( env, @@ -1682,47 +1653,60 @@ where .compaction_event_sender(Arc::new(RaftRouterCompactedEventSender { router: Mutex::new(self.router.clone()), })) - .region_info_accessor(self.region_info_accessor.clone()) + .region_info_accessor(self.region_info_accessor.clone().unwrap()) .sst_recovery_sender(self.init_sst_recovery_sender()) .flow_listener(flow_listener); let factory = Box::new(builder.build()); - let disk_engine = factory + let kv_engine = factory .create_shared_db(&self.core.store_path) .unwrap_or_else(|s| fatal!("failed to create kv engine: {}", s)); - let mut range_cache_engine_config = self.core.config.range_cache_engine.clone(); - let _ = range_cache_engine_config - .expected_region_size - .get_or_insert(self.core.config.coprocessor.region_split_size()); - let range_cache_engine_config = Arc::new(VersionTrack::new(range_cache_engine_config)); - let range_cache_engine_context = - RangeCacheEngineContext::new(range_cache_engine_config.clone(), self.pd_client.clone()); - let range_cache_engine_statistics = range_cache_engine_context.statistics(); - let kv_engine: EK = KvEngineBuilder::build( - range_cache_engine_context, - disk_engine.clone(), - Some(self.pd_client.clone()), - Some(Arc::new(self.region_info_accessor.clone())), - ); - let range_cache_config_manager = RangeCacheConfigManager(range_cache_engine_config); + let in_memory_engine_context = + InMemoryEngineContext::new(in_memory_engine_config.clone(), self.pd_client.clone()); + let in_memory_engine_statistics = in_memory_engine_context.statistics(); + let ime_engine = if self.core.config.in_memory_engine.enable { + let in_memory_engine = build_hybrid_engine( + in_memory_engine_context, + kv_engine.clone(), + Some(self.pd_client.clone()), + Some(Arc::new(self.region_info_accessor.clone().unwrap())), + Box::new(self.router.clone()), + ); + + // Hybrid engine observer. 
+ let eviction_observer = HybridEngineLoadEvictionObserver::new(Arc::new( + in_memory_engine.region_cache_engine().clone(), + )); + eviction_observer.register_to(self.coprocessor_host.as_mut().unwrap()); + let write_batch_observer = + RegionCacheWriteBatchObserver::new(in_memory_engine.region_cache_engine().clone()); + write_batch_observer.register_to(self.coprocessor_host.as_mut().unwrap()); + let snapshot_observer = + HybridSnapshotObserver::new(in_memory_engine.region_cache_engine().clone()); + snapshot_observer.register_to(self.coprocessor_host.as_mut().unwrap()); + Some(in_memory_engine.region_cache_engine().clone()) + } else { + None + }; + let in_memory_engine_config_manager = InMemoryEngineConfigManager(in_memory_engine_config); self.kv_statistics = Some(factory.rocks_statistics()); - self.range_cache_engine_statistics = Some(range_cache_engine_statistics); - let engines = Engines::new(kv_engine, raft_engine); + self.in_memory_engine_statistics = Some(in_memory_engine_statistics); + let engines = Engines::new(kv_engine.clone(), raft_engine); let cfg_controller = self.cfg_controller.as_mut().unwrap(); cfg_controller.register( tikv::config::Module::Rocksdb, Box::new(DbConfigManger::new( cfg_controller.get_current().rocksdb, - disk_engine.clone(), + kv_engine.clone(), DbType::Kv, )), ); cfg_controller.register( - tikv::config::Module::RangeCacheEngine, - Box::new(range_cache_config_manager), + tikv::config::Module::InMemoryEngine, + Box::new(in_memory_engine_config_manager), ); let reg = TabletRegistry::new( - Box::new(SingletonFactory::new(disk_engine)), + Box::new(SingletonFactory::new(kv_engine)), &self.core.store_path, ) .unwrap(); @@ -1739,7 +1723,7 @@ where 180, // max_samples_to_preserve )); - (engines, engines_info) + (engines, engines_info, ime_engine) } } diff --git a/components/server/src/server2.rs b/components/server/src/server2.rs index 7613ae18def..74a9ffaffa9 100644 --- a/components/server/src/server2.rs +++ b/components/server/src/server2.rs @@ -12,7 +12,6 @@ //! explicitly stopped. We keep these components in the `TikvServer` struct. 
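Stepping back to the server.rs hunks above: the in-memory engine is driven entirely through closures over a version-tracked config, i.e. the region-stats enable callback (an `Arc<dyn Fn() -> bool + Send + Sync>` in the real code) and the boxed getter for `mvcc_amplification_threshold`. A minimal, self-contained sketch of that pattern, using deliberately simplified stand-ins for the real `tikv_util::config::VersionTrack` and config types:

```rust
use std::sync::{Arc, RwLock};

// Hypothetical stand-ins; the real types carry versioning and change tracking.
struct InMemoryEngineConfig {
    enable: bool,
    mvcc_amplification_threshold: usize,
}

struct VersionTrack<T>(RwLock<T>);

impl<T> VersionTrack<T> {
    fn new(v: T) -> Self {
        Self(RwLock::new(v))
    }
    // Read access to the current value, like the `.value()` calls in the hunk.
    fn value(&self) -> std::sync::RwLockReadGuard<'_, T> {
        self.0.read().unwrap()
    }
}

fn main() {
    let cfg = Arc::new(VersionTrack::new(InMemoryEngineConfig {
        enable: true,
        mvcc_amplification_threshold: 10,
    }));

    // Consumers capture only closures, so they stay decoupled from the config
    // system; a later config change is observed on the next call.
    let cfg_for_enable = cfg.clone();
    let enabled_cb: Arc<dyn Fn() -> bool + Send + Sync> =
        Arc::new(move || cfg_for_enable.value().enable);

    let cfg_for_threshold = cfg.clone();
    let threshold_cb: Box<dyn Fn() -> usize + Send + Sync> =
        Box::new(move || cfg_for_threshold.value().mvcc_amplification_threshold);

    assert!(enabled_cb());
    assert_eq!(threshold_cb(), 10);
}
```

This is why the v2 code path can hand `RegionInfoAccessor::new` an `unreachable!()` threshold closure further down: the closure is only ever invoked when the in-memory engine is actually enabled.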
use std::{ - cmp, collections::HashMap, marker::PhantomData, path::{Path, PathBuf}, @@ -41,6 +40,7 @@ use file_system::{get_io_rate_limiter, BytesFetcher, MetricsManager as IoMetrics use futures::executor::block_on; use grpcio::{EnvBuilder, Environment}; use health_controller::HealthController; +use in_memory_engine::InMemoryEngineStatistics; use kvproto::{ brpb::create_backup, cdcpb_grpc::create_change_data, deadlock::create_deadlock, debugpb_grpc::create_debug, diagnosticspb::create_diagnostics, @@ -67,9 +67,8 @@ use raftstore_v2::{ router::{DiskSnapBackupHandle, PeerMsg, RaftRouter}, StateStorage, }; -use range_cache_memory_engine::RangeCacheMemoryEngineStatistics; use resolved_ts::Task; -use resource_control::ResourceGroupManager; +use resource_control::{config::ResourceContrlCfgMgr, ResourceGroupManager}; use security::SecurityManager; use service::{service_event::ServiceEvent, service_manager::GrpcServiceManager}; use tikv::{ @@ -102,7 +101,10 @@ use tikv::{ config_manager::StorageConfigManger, kv::LocalTablets, mvcc::MvccConsistencyCheckObserver, - txn::flow_controller::{FlowController, TabletFlowController}, + txn::{ + flow_controller::{FlowController, TabletFlowController}, + txn_status_cache::TxnStatusCache, + }, Engine, Storage, }, }; @@ -123,11 +125,15 @@ use tikv_util::{ use tokio::runtime::Builder; use crate::{ - common::{ConfiguredRaftEngine, EngineMetricsManager, EnginesResourceInfo, TikvServerCore}, + common::{ + ConfiguredRaftEngine, DiskUsageChecker, EngineMetricsManager, EnginesResourceInfo, + TikvServerCore, + }, memory::*, setup::*, signal_handler, tikv_util::sys::thread::ThreadBuildWrapper, + utils, }; #[inline] @@ -233,7 +239,7 @@ struct TikvServer { snap_mgr: Option, // Will be filled in `init_servers`. engines: Option>, kv_statistics: Option>, - range_cache_engine_statistics: Option>, + in_memory_engine_statistics: Option>, raft_statistics: Option>, servers: Option>, region_info_accessor: Option, @@ -333,7 +339,7 @@ where )); let resource_manager = if config.resource_control.enabled { - let mgr = Arc::new(ResourceGroupManager::default()); + let mgr = Arc::new(ResourceGroupManager::new(config.resource_control.clone())); let io_bandwidth = config.storage.io_rate_limit.max_bytes_per_sec.0; resource_control::start_periodic_tasks( &mgr, @@ -382,7 +388,7 @@ where snap_mgr: None, engines: None, kv_statistics: None, - range_cache_engine_statistics: None, + in_memory_engine_statistics: None, raft_statistics: None, servers: None, region_info_accessor: None, @@ -531,6 +537,12 @@ where tikv::config::Module::ResourceMetering, Box::new(cfg_manager), ); + if let Some(resource_ctl) = &self.resource_manager { + cfg_controller.register( + tikv::config::Module::ResourceControl, + Box::new(ResourceContrlCfgMgr::new(resource_ctl.get_config().clone())), + ); + } let storage_read_pool_handle = if self.core.config.readpool.storage.use_unified_pool() { unified_read_pool.as_ref().unwrap().handle() @@ -542,6 +554,9 @@ where )); storage_read_pools.handle() }; + let txn_status_cache = Arc::new(TxnStatusCache::new( + self.core.config.storage.txn_status_cache_capacity, + )); let storage = Storage::<_, _, F>::from_engine( engines.engine.clone(), @@ -560,6 +575,7 @@ where .as_ref() .map(|m| m.derive_controller("scheduler-worker-pool".to_owned(), true)), self.resource_manager.clone(), + txn_status_cache.clone(), ) .unwrap_or_else(|e| fatal!("failed to create raft storage: {}", e)); cfg_controller.register( @@ -696,6 +712,7 @@ where self.concurrency_manager.clone(), self.env.clone(), 
self.security_mgr.clone(), + storage.get_scheduler().get_txn_status_cache(), ); self.resolved_ts_scheduler = Some(rts_worker.scheduler()); rts_worker.start_with_timer(rts_endpoint); @@ -720,6 +737,11 @@ )), ); + // build stream backup encryption manager + let backup_encryption_manager = + utils::build_backup_encryption_manager(self.core.encryption_key_manager.clone()) + .expect("failed to build backup encryption manager in server"); + let backup_stream_endpoint = backup_stream::Endpoint::new( self.node.as_ref().unwrap().id(), PdStore::new(Checked::new(Sourced::new( @@ -735,7 +757,8 @@ self.pd_client.clone(), self.concurrency_manager.clone(), BackupStreamResolver::V2(self.router.clone().unwrap(), PhantomData), - self.core.encryption_key_manager.clone(), + backup_encryption_manager.clone(), + txn_status_cache.clone(), ); backup_stream_worker.start(backup_stream_endpoint); self.core.to_stop.push(backup_stream_worker); @@ -907,7 +930,7 @@ store_id, ); gc_worker - .start(store_id) + .start(store_id, self.coprocessor_host.as_ref().cloned().unwrap()) .unwrap_or_else(|e| fatal!("failed to start gc worker: {}", e)); if let Err(e) = gc_worker.start_auto_gc(auto_gc_config, safe_point) { fatal!("failed to start auto_gc on storage, error: {}", e); @@ -984,7 +1007,7 @@ } if let Some(sched) = self.backup_stream_scheduler.take() { - let pitr_service = backup_stream::Service::new(sched); + let pitr_service = backup_stream::BackupStreamGrpcService::new(sched); if servers .server .register_service(create_log_backup(pitr_service)) @@ -1103,7 +1126,7 @@ let mut engine_metrics = EngineMetricsManager::::new( self.tablet_registry.clone().unwrap(), self.kv_statistics.clone(), - self.range_cache_engine_statistics.clone(), + self.in_memory_engine_statistics.clone(), self.core.config.rocksdb.titan.enabled.map_or(false, |v| v), self.engines.as_ref().unwrap().raft_engine.clone(), self.raft_statistics.clone(), @@ -1158,36 +1181,42 @@ let raft_path = raft_engine.get_engine_path().to_string(); let separated_raft_mount_path = path_in_diff_mount_point(raft_path.as_str(), tablet_registry.tablet_root()); - let raft_almost_full_threshold = reserve_raft_space; - let raft_already_full_threshold = reserve_raft_space / 2; - - let almost_full_threshold = reserve_space; - let already_full_threshold = reserve_space / 2; - fn calculate_disk_usage(a: disk::DiskUsage, b: disk::DiskUsage) -> disk::DiskUsage { - match (a, b) { - (disk::DiskUsage::AlreadyFull, _) => disk::DiskUsage::AlreadyFull, - (_, disk::DiskUsage::AlreadyFull) => disk::DiskUsage::AlreadyFull, - (disk::DiskUsage::AlmostFull, _) => disk::DiskUsage::AlmostFull, - (_, disk::DiskUsage::AlmostFull) => disk::DiskUsage::AlmostFull, - (disk::DiskUsage::Normal, disk::DiskUsage::Normal) => disk::DiskUsage::Normal, - } - } + // If an auxiliary directory is configured for the raft engine, its mount + // point needs to be checked as well; otherwise the check can be skipped. As + // the configuration is static, it is safe to compute this only once.
+ let raft_auxiliary_path = if self.core.config.raft_engine.enable { + self.core.config.raft_engine.config().spill_dir.clone() + } else { + None + }; + let (separated_raft_auxiliary_mount_path, separated_raft_auxiliary_with_kvdb) = + raft_auxiliary_path + .as_ref() + .map(|path| { + let separated_with_kvdb = + path_in_diff_mount_point(path.as_str(), tablet_registry.tablet_root()); + let separated_with_raft = + path_in_diff_mount_point(path.as_str(), raft_path.as_str()); + ( + separated_with_kvdb && separated_with_raft, + separated_with_kvdb, + ) + }) + .unwrap_or((false, false)); + let disk_usage_checker = DiskUsageChecker::new( + store_path.as_path().to_str().unwrap().to_string(), + raft_path, + raft_auxiliary_path, + separated_raft_mount_path, + separated_raft_auxiliary_mount_path, + separated_raft_auxiliary_with_kvdb, + reserve_space, + reserve_raft_space, + config_disk_capacity, + ); self.core.background_worker .spawn_interval_task(DEFAULT_STORAGE_STATS_INTERVAL, move || { - let disk_stats = match fs2::statvfs(&store_path) { - Err(e) => { - error!( - "get disk stat for kv store failed"; - "kv_path" => store_path.to_str(), - "err" => ?e - ); - return; - } - Ok(stats) => stats, - }; - let disk_cap = disk_stats.total_space(); let snap_size = snap_mgr.total_snap_size().unwrap(); - let mut kv_size = 0; tablet_registry.for_each_opened_tablet(|_, cached| { if let Some(tablet) = cached.latest() { @@ -1195,42 +1224,12 @@ } true }); - let raft_size = raft_engine .get_engine_size() .expect("get raft engine size"); - - let mut raft_disk_status = disk::DiskUsage::Normal; - if separated_raft_mount_path && reserve_raft_space != 0 { - let raft_disk_stats = match fs2::statvfs(&raft_path) { - Err(e) => { - error!( - "get disk stat for raft engine failed"; - "raft_engine_path" => raft_path.clone(), - "err" => ?e - ); - return; - } - Ok(stats) => stats, - }; - let raft_disk_cap = raft_disk_stats.total_space(); - let mut raft_disk_available = - raft_disk_cap.checked_sub(raft_size).unwrap_or_default(); - raft_disk_available = cmp::min(raft_disk_available, raft_disk_stats.available_space()); - raft_disk_status = if raft_disk_available <= raft_already_full_threshold - { - disk::DiskUsage::AlreadyFull - } else if raft_disk_available <= raft_almost_full_threshold - { - disk::DiskUsage::AlmostFull - } else { - disk::DiskUsage::Normal - }; - } let placeholer_file_path = PathBuf::from_str(&data_dir) .unwrap() .join(Path::new(file_system::SPACE_PLACEHOLDER_FILE)); - let placeholder_size: u64 = file_system::get_file_size(placeholer_file_path).unwrap_or(0); @@ -1239,24 +1238,9 @@ } else { snap_size + kv_size + placeholder_size }; - let capacity = if config_disk_capacity == 0 || disk_cap < config_disk_capacity { - disk_cap - } else { - config_disk_capacity - }; - - let mut available = capacity.checked_sub(used_size).unwrap_or_default(); - available = cmp::min(available, disk_stats.available_space()); - + // Check the disk usage and update the disk usage status. + let (cur_disk_status, cur_kv_disk_status, raft_disk_status, capacity, available) = disk_usage_checker.inspect(used_size, raft_size); let prev_disk_status = disk::get_disk_status(0); //0 no need care about failpoint.
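The thresholds and status-merging that `DiskUsageChecker::inspect` now encapsulates are visible in the removed lines around this point: a disk counts as AlreadyFull at or below half its reserved space, AlmostFull at or below the full reserve, and the final status is the more severe of the raft-disk and kv-disk results. Condensed into a runnable sketch (the `reserve` parameter plays the role of `reserve_space` / `reserve_raft_space`):

```rust
#[derive(Clone, Copy, Debug, PartialEq, PartialOrd)]
enum DiskUsage {
    Normal,
    AlmostFull,
    AlreadyFull,
}

// Mirrors the removed inline classification: already-full threshold is half
// of the reserved space, almost-full threshold is the full reserved space.
fn classify(available: u64, reserve: u64) -> DiskUsage {
    if available <= reserve / 2 {
        DiskUsage::AlreadyFull
    } else if available <= reserve {
        DiskUsage::AlmostFull
    } else {
        DiskUsage::Normal
    }
}

// Equivalent to the removed calculate_disk_usage match: the worst status wins.
fn combine(a: DiskUsage, b: DiskUsage) -> DiskUsage {
    if a >= b { a } else { b }
}

fn main() {
    let raft = classify(900, 1000); // AlmostFull
    let kv = classify(10_000, 1000); // Normal
    assert_eq!(combine(raft, kv), DiskUsage::AlmostFull);
}
```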
- let cur_kv_disk_status = if available <= already_full_threshold { - disk::DiskUsage::AlreadyFull - } else if available <= almost_full_threshold { - disk::DiskUsage::AlmostFull - } else { - disk::DiskUsage::Normal - }; - let cur_disk_status = calculate_disk_usage(raft_disk_status, cur_kv_disk_status); if prev_disk_status != cur_disk_status { warn!( "disk usage {:?}->{:?} (raft engine usage: {:?}, kv engine usage: {:?}), seperated raft mount={}, kv available={}, snap={}, kv={}, raft={}, capacity={}", @@ -1342,6 +1326,7 @@ self.engines.as_ref().unwrap().engine.raft_extension(), self.resource_manager.clone(), self.grpc_service_mgr.clone(), + None, ) { Ok(status_server) => Box::new(status_server), Err(e) => { @@ -1555,6 +1540,10 @@ impl TikvServer { let region_info_accessor = RegionInfoAccessor::new( &mut coprocessor_host, Arc::new(|| false), // Not applicable to v2. + Box::new(|| { + // v2 does not support ime + unreachable!() + }), ); let cdc_worker = Box::new(LazyWorker::new("cdc")); diff --git a/components/server/src/utils.rs b/components/server/src/utils.rs new file mode 100644 index 00000000000..dd88ad7c0d1 --- /dev/null +++ b/components/server/src/utils.rs @@ -0,0 +1,19 @@ +// Copyright 2024 TiKV Project Authors. Licensed under Apache-2.0. + +use std::{io, sync::Arc}; + +use encryption::{BackupEncryptionManager, DataKeyManager, MultiMasterKeyBackend}; +use kvproto::encryptionpb::EncryptionMethod; + +pub fn build_backup_encryption_manager( + opt_encryption_key_manager: Option<Arc<DataKeyManager>>, +) -> Result<BackupEncryptionManager, io::Error> { + let multi_master_key_backend = MultiMasterKeyBackend::new(); + + Ok(BackupEncryptionManager::new( + None, + EncryptionMethod::Plaintext, + multi_master_key_backend, + opt_encryption_key_manager, + )) +} diff --git a/components/snap_recovery/Cargo.toml b/components/snap_recovery/Cargo.toml index 72049f5a318..557fec0a2cf 100644 --- a/components/snap_recovery/Cargo.toml +++ b/components/snap_recovery/Cargo.toml @@ -14,32 +14,25 @@ test-engines-panic = ["tikv/test-engines-panic"] [dependencies] chrono = { workspace = true } -encryption = { workspace = true } encryption_export = { workspace = true } engine_rocks = { workspace = true } engine_traits = { workspace = true } futures = { version = "0.3", features = ["executor"] } grpcio = { workspace = true } -itertools = "0.10" keys = { workspace = true } kvproto = { workspace = true } lazy_static = "1.4" -log = { version = "0.4", features = ["max_level_trace", "release_max_level_debug"] } pd_client = { workspace = true } prometheus = { version = "0.13", default_features = false, features = ["nightly"] } prometheus-static-metric = "0.5" -protobuf = { version = "2.8", features = ["bytes"] } raft_log_engine = { workspace = true } raftstore = { workspace = true } slog = { workspace = true } slog-global = { workspace = true } -structopt = "0.3" tempfile = "3.0" thiserror = "1.0" tikv = { workspace = true } -tikv_alloc = { workspace = true } tikv_util = { workspace = true } tokio = { version = "1.17", features = ["rt"] } -toml = "0.5" txn_types = { workspace = true } diff --git a/components/snap_recovery/src/services.rs b/components/snap_recovery/src/services.rs index ff83db76bf2..931870a8ae4 100644 --- a/components/snap_recovery/src/services.rs +++ b/components/snap_recovery/src/services.rs @@ -228,9 +228,9 @@ where pub fn wait_apply_last(router: RaftRouter, sender: Sender) { let wait_apply = SnapshotBrWaitApplySyncer::new(0, sender); router.broadcast_normal(|| { - PeerMsg::SignificantMsg(SignificantMsg::SnapshotBrWaitApply( +
PeerMsg::SignificantMsg(Box::new(SignificantMsg::SnapshotBrWaitApply( SnapshotBrWaitApplyRequest::relaxed(wait_apply.clone()), - )) + ))) }); } } diff --git a/components/sst_importer/Cargo.toml b/components/sst_importer/Cargo.toml index 41f29fb6c70..f3b260a7e87 100644 --- a/components/sst_importer/Cargo.toml +++ b/components/sst_importer/Cargo.toml @@ -27,6 +27,7 @@ collections = { workspace = true } crc32fast = "1.2" dashmap = "5" encryption = { workspace = true } +encryption_export = { workspace = true } engine_rocks = { workspace = true } engine_traits = { workspace = true } error_code = { workspace = true } @@ -56,7 +57,9 @@ txn_types = { workspace = true } uuid = { version = "0.8.1", features = ["serde", "v4"] } [dev-dependencies] +async-compression = { version = "0.4.12", features = ["tokio", "zstd"] } engine_test = { workspace = true } tempfile = "3.0" test_sst_importer = { workspace = true } test_util = { workspace = true } +tokio-util = { version = "0.7", features = ["compat"] } diff --git a/components/sst_importer/src/service.rs b/components/sst_importer/src/service.rs deleted file mode 100644 index a7c15e88539..00000000000 --- a/components/sst_importer/src/service.rs +++ /dev/null @@ -1,31 +0,0 @@ -// Copyright 2018 TiKV Project Authors. Licensed under Apache-2.0. - -use grpcio::{RpcStatus, RpcStatusCode}; -use std::fmt::Debug; - -pub fn make_rpc_error(err: E) -> RpcStatus { - // FIXME: Just spewing debug error formatting here seems pretty unfriendly - RpcStatus::with_message(RpcStatusCode::UNKNOWN, format!("{:?}", err)) -} - -#[macro_export] -macro_rules! send_rpc_response { - ($res:ident, $sink:ident, $label:ident, $timer:ident) => {{ - let res = match $res { - Ok(resp) => { - IMPORT_RPC_DURATION - .with_label_values(&[$label, "ok"]) - .observe($timer.saturating_elapsed_secs()); - $sink.success(resp) - } - Err(e) => { - IMPORT_RPC_DURATION - .with_label_values(&[$label, "error"]) - .observe($timer.saturating_elapsed_secs()); - error_inc($label, &e); - $sink.fail(make_rpc_error(e)) - } - }; - let _ = res.map_err(|e| warn!("send rpc response"; "err" => %e)).await; - }}; -} diff --git a/components/sst_importer/src/sst_importer.rs b/components/sst_importer/src/sst_importer.rs index 1f9075599ea..e3047d59ae1 100644 --- a/components/sst_importer/src/sst_importer.rs +++ b/components/sst_importer/src/sst_importer.rs @@ -13,18 +13,21 @@ use std::{ use collections::HashSet; use dashmap::{mapref::entry::Entry, DashMap}; -use encryption::{DataKeyManager, FileEncryptionInfo}; +use encryption::{DataKeyManager, FileEncryptionInfo, MultiMasterKeyBackend}; +use encryption_export::create_async_backend; use engine_traits::{ name_to_cf, util::check_key_in_range, CfName, IterOptions, Iterator, KvEngine, RefIterable, SstCompressionType, SstExt, SstMetaInfo, SstReader, SstWriter, SstWriterBuilder, CF_DEFAULT, CF_WRITE, }; use external_storage::{ - compression_reader_dispatcher, encrypt_wrap_reader, ExternalStorage, RestoreConfig, + compression_reader_dispatcher, encrypt_wrap_reader, wrap_with_checksum_reader_if_needed, + ExternalStorage, RestoreConfig, }; use file_system::{IoType, OpenOptions}; use kvproto::{ brpb::{CipherInfo, StorageBackend}, + encryptionpb::{EncryptionMethod, FileEncryptionInfo_oneof_mode, MasterKey}, import_sstpb::{Range, *}, kvrpcpb::ApiVersion, metapb::Region, @@ -101,13 +104,12 @@ pub enum CacheKvFile { Fs(Arc), } -/// returns a error indices that we are going to panic in a invalid state. 
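The snap_recovery hunk above now wraps the `SignificantMsg` payload in a `Box` before handing it to `PeerMsg::SignificantMsg`. A plausible motivation (not stated in the diff itself) is enum size: a Rust enum occupies the size of its largest variant, so boxing one bulky variant shrinks every value of the message type. A small, self-contained demonstration:

```rust
use std::mem::size_of;

#[allow(dead_code)]
struct BigPayload {
    buf: [u8; 256],
}

#[allow(dead_code)]
enum InlineMsg {
    Significant(BigPayload),
    Tick,
}

#[allow(dead_code)]
enum BoxedMsg {
    Significant(Box<BigPayload>),
    Tick,
}

fn main() {
    // The boxed form is pointer-sized (plus discriminant, often folded into a
    // niche), while the inline form must hold the full payload.
    assert!(size_of::<BoxedMsg>() < size_of::<InlineMsg>());
    println!(
        "inline: {} bytes, boxed: {} bytes",
        size_of::<InlineMsg>(),
        size_of::<BoxedMsg>()
    );
    // Constructing the boxed form mirrors the new call shape above.
    let _msg = BoxedMsg::Significant(Box::new(BigPayload { buf: [0; 256] }));
}
```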
-/// (Rust panic information cannot be send to BR, hence client cannot know -/// what happens, so we pack it into a `Result`.) -fn bug(message: impl std::fmt::Display) -> Error { - Error::Io(std::io::Error::new( - std::io::ErrorKind::Other, - format!("BUG in TiKV: {}", message), +/// returns an error on an invalid internal state. +/// pass the error back to the client side for further debugging. +fn error(message: impl std::fmt::Display) -> Error { + Error::Io(io::Error::new( + ErrorKind::Other, + format!("internal error in TiKV: {}", message), )) } @@ -150,12 +152,13 @@ pub struct SstImporter { _download_rt: Runtime, file_locks: Arc>, memory_quota: Arc, + multi_master_keys_backend: MultiMasterKeyBackend, } impl SstImporter { pub fn new>( cfg: &Config, - root: P, + import_dir: P, key_manager: Option>, api_version: ApiVersion, raft_kv_v2: bool, @@ -190,7 +193,7 @@ impl SstImporter { "size" => ?memory_limit, ); - let dir = ImportDir::new(root)?; + let dir = ImportDir::new(import_dir)?; Ok(SstImporter { dir, @@ -202,6 +205,7 @@ impl SstImporter { cached_storage, _download_rt: download_rt, memory_quota: Arc::new(MemoryQuota::new(memory_limit as _)), + multi_master_keys_backend: MultiMasterKeyBackend::new(), }) } @@ -448,11 +452,10 @@ impl SstImporter { &self, file_length: u64, src_file_name: &str, - dst_file: std::path::PathBuf, + dst_file: PathBuf, backend: &StorageBackend, - support_kms: bool, speed_limiter: &Limiter, - restore_config: external_storage::RestoreConfig, + restore_config: RestoreConfig, ) -> Result<()> { self._download_rt .block_on(self.async_download_file_from_external_storage( @@ -460,7 +463,6 @@ impl SstImporter { src_file_name, dst_file, backend, - support_kms, speed_limiter, "", restore_config, @@ -490,17 +492,16 @@ impl SstImporter { &self, file_length: u64, src_file_name: &str, - dst_file: std::path::PathBuf, + dst_file: PathBuf, backend: &StorageBackend, - support_kms: bool, speed_limiter: &Limiter, cache_key: &str, - restore_config: external_storage::RestoreConfig, + restore_config: RestoreConfig, ) -> Result<()> { let start_read = Instant::now(); if let Some(p) = dst_file.parent() { file_system::create_dir_all(p).or_else(|e| { - if e.kind() == io::ErrorKind::AlreadyExists { + if e.kind() == ErrorKind::AlreadyExists { Ok(()) } else { Err(e) @@ -509,7 +510,7 @@ impl SstImporter { } let ext_storage = self.external_storage_or_cache(backend, cache_key)?; - let ext_storage = self.wrap_kms(ext_storage, support_kms); + let ext_storage = self.auto_encrypt_local_file_if_needed(ext_storage); let result = ext_storage .restore( @@ -537,7 +538,7 @@ impl SstImporter { .with_label_values(&["read"]) .observe(start_read.saturating_elapsed().as_secs_f64()); - debug!("downloaded file succeed"; + debug!("successfully download the file"; "name" => src_file_name, "url" => %util::url_for(&ext_storage), ); @@ -594,16 +595,14 @@ impl SstImporter { CACHED_FILE_IN_MEM.set(self.memory_quota.capacity() as _); - if self.import_support_download() { + if self.download_to_disk_only() { let shrink_file_count = shrink_files.len(); if shrink_file_count > 0 || retain_file_count > 0 { info!("shrink space by tick"; "shrink_files_count" => shrink_file_count, "retain_files_count" => retain_file_count); } for f in shrink_files { - if let Err(e) = file_system::remove_file(&f) { - info!("failed to remove file"; "filename" => ?f, "error" => ?e); - } + self.remove_file_no_throw(&f); } shrink_file_count } else { @@ -614,9 +613,7 @@ impl SstImporter { } } - // If memory_quota is 0, which represent download kv-file 
when import. - // Or read kv-file into buffer directly. - pub fn import_support_download(&self) -> bool { + pub fn download_to_disk_only(&self) -> bool { self.memory_quota.capacity() == 0 } @@ -633,11 +630,13 @@ impl SstImporter { } } - async fn exec_download( + async fn download_kv_file_to_mem_buf( &self, meta: &KvMeta, - ext_storage: Arc, + ext_storage: Arc, speed_limiter: &Limiter, + opt_file_encryption_info: Option, + opt_encrypted_file_checksum: Option>, ) -> Result { let start = Instant::now(); let permit = self @@ -661,15 +660,16 @@ impl SstImporter { Some((meta.get_range_offset(), range_length)) } }; - let restore_config = external_storage::RestoreConfig { + let restore_config = RestoreConfig { range, compression_type: Some(meta.get_compression_type()), - expected_sha256, - file_crypter: None, + expected_plaintext_file_checksum: expected_sha256, + file_crypter: opt_file_encryption_info, + opt_encrypted_file_checksum, }; let buff = self - .read_kv_files_from_external_storage( + .download_kv_files_from_external_storage_to_mem( file_length, meta.get_name(), ext_storage, @@ -689,11 +689,13 @@ impl SstImporter { }) } - pub async fn do_read_kv_file( + pub async fn download_kv_file_to_mem_cache( &self, meta: &KvMeta, - ext_storage: Arc, + ext_storage: Arc, speed_limiter: &Limiter, + opt_file_encryption_info: Option, + opt_encrypted_file_checksum: Option>, ) -> Result { let start = Instant::now(); let dst_name = format!("{}_{}", meta.get_name(), meta.get_range_offset()); @@ -711,7 +713,7 @@ impl SstImporter { Arc::clone(buff) } _ => { - return Err(bug(concat!( + return Err(error(concat!( "using both read-to-memory and download-to-file is unacceptable for now.", "(If you think it is possible in the future you are reading this, ", "please change this line to `return item.get.0.clone()`)", @@ -732,53 +734,71 @@ impl SstImporter { } cache - .get_or_try_init(|| self.exec_download(meta, ext_storage, speed_limiter)) + .get_or_try_init(|| { + self.download_kv_file_to_mem_buf( + meta, + ext_storage, + speed_limiter, + opt_file_encryption_info, + opt_encrypted_file_checksum, + ) + }) .await?; Ok(CacheKvFile::Mem(cache)) } - pub fn wrap_kms( + pub fn auto_encrypt_local_file_if_needed( &self, ext_storage: Arc, - support_kms: bool, - ) -> Arc { - // kv-files needn't are decrypted with KMS when download currently because these - // files are not encrypted when log-backup. It is different from - // sst-files because sst-files is encrypted when saved with rocksdb env - // with KMS. to do: support KMS when log-backup and restore point. 
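The download path reworked in the hunks just below layers its readers in a deliberate order: a checksum reader sits directly on the raw storage stream so it hashes the encrypted bytes exactly as stored, the decrypter runs above it, and the decompressor comes last, matching a compress-then-encrypt write path. A std-only sketch of that layering, using a XOR "cipher" and a byte-sum "checksum" as stand-ins for the real `encrypt_wrap_reader` and `wrap_with_checksum_reader_if_needed` wrappers (the compression layer is omitted here):

```rust
use std::io::{Cursor, Read, Result};

// Stand-in checksum layer: accumulates a running sum of the bytes it passes
// through, the way the real wrapper feeds a SHA-256 hasher.
struct ChecksumReader<R: Read> {
    inner: R,
    sum: u64,
}

impl<R: Read> Read for ChecksumReader<R> {
    fn read(&mut self, buf: &mut [u8]) -> Result<usize> {
        let n = self.inner.read(buf)?;
        for b in &buf[..n] {
            self.sum = self.sum.wrapping_add(*b as u64);
        }
        Ok(n)
    }
}

// Stand-in decryption layer.
struct XorDecrypter<R: Read> {
    inner: R,
    key: u8,
}

impl<R: Read> Read for XorDecrypter<R> {
    fn read(&mut self, buf: &mut [u8]) -> Result<usize> {
        let n = self.inner.read(buf)?;
        for b in &mut buf[..n] {
            *b ^= self.key;
        }
        Ok(n)
    }
}

fn main() -> Result<()> {
    let plaintext = b"log backup event";
    let stored: Vec<u8> = plaintext.iter().map(|b| b ^ 0x5a).collect();

    // The checksum layer sees ciphertext; the decrypter sits above it.
    let checksum = ChecksumReader { inner: Cursor::new(&stored), sum: 0 };
    let mut decrypter = XorDecrypter { inner: checksum, key: 0x5a };

    let mut out = Vec::new();
    decrypter.read_to_end(&mut out)?;
    assert_eq!(out, plaintext);

    let expected: u64 = stored.iter().map(|b| *b as u64).sum();
    assert_eq!(decrypter.inner.sum, expected);
    Ok(())
}
```

The write side, visible in the test helper later in this diff, applies the same layers in reverse, which is why the checksum recorded in `FileEncryptionInfo` is computed over the ciphertext.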
- match (support_kms, self.key_manager.clone()) { - (true, Some(key_manager)) => Arc::new(external_storage::EncryptedExternalStorage { - key_manager, - storage: ext_storage, - }), - _ => ext_storage, + ) -> Arc { + if let Some(key_manager) = self.key_manager.clone() { + Arc::new( + external_storage::AutoEncryptLocalRestoredFileExternalStorage { + key_manager, + storage: ext_storage, + }, + ) + } else { + ext_storage } } - async fn read_kv_files_from_external_storage( + async fn download_kv_files_from_external_storage_to_mem( &self, file_length: u64, file_name: &str, - ext_storage: Arc, + ext_storage: Arc, speed_limiter: &Limiter, restore_config: RestoreConfig, ) -> Result> { let RestoreConfig { range, compression_type, - expected_sha256, + expected_plaintext_file_checksum: expected_sha256, file_crypter, + opt_encrypted_file_checksum, } = restore_config; - let mut reader = { + let (mut reader, opt_hasher) = { let inner = if let Some((off, len)) = range { ext_storage.read_part(file_name, off, len) } else { ext_storage.read(file_name) }; - let inner = compression_reader_dispatcher(compression_type, inner)?; - encrypt_wrap_reader(file_crypter, inner)? + // wrap with checksum reader if needed + // + let (checksum_reader, opt_hasher) = + wrap_with_checksum_reader_if_needed(opt_encrypted_file_checksum.is_some(), inner)?; + + // wrap with decrypter if needed + // + let encrypted_reader = encrypt_wrap_reader(file_crypter, checksum_reader)?; + + ( + compression_reader_dispatcher(compression_type, encrypted_reader)?, + opt_hasher, + ) }; let r = external_storage::read_external_storage_info_buff( @@ -787,6 +807,8 @@ file_length, expected_sha256, external_storage::MIN_READ_SPEED, + opt_encrypted_file_checksum, + opt_hasher, ) .await; let url = ext_storage.url()?.to_string(); @@ -800,54 +822,89 @@ Ok(buff) } - pub async fn read_from_kv_file( + pub async fn download_kv_file( &self, meta: &KvMeta, - ext_storage: Arc, + ext_storage: Arc, backend: &StorageBackend, speed_limiter: &Limiter, + opt_cipher_info: Option, + master_keys_proto: Vec, ) -> Result> { - let c = if self.import_support_download() { - self.do_download_kv_file(meta, backend, speed_limiter) - .await? + // update the master key backends if needed. + // + self.multi_master_keys_backend + .update_from_proto_if_needed(master_keys_proto, create_async_backend) + .await?; + + // extract backup file encryption info if configured + // + let opt_file_encryption_info = self + .extract_file_encryption_info(meta, opt_cipher_info) + .await?; + let opt_checksum = extract_checksum_info(meta); + + let c = if self.download_to_disk_only() { + self.download_kv_file_to_disk( + meta, + backend, + speed_limiter, + opt_file_encryption_info, + opt_checksum, + ) + .await? } else { - self.do_read_kv_file(meta, ext_storage, speed_limiter) - .await? + self.download_kv_file_to_mem_cache( + meta, + ext_storage, + speed_limiter, + opt_file_encryption_info, + opt_checksum, + ) + .await? }; match c { - // If cache memroy, it has been rewrite, return buffer directly. + // If cached in memory, it has already been rewritten and the content is + // plaintext, so return the buffer directly. CacheKvFile::Mem(buff) => Ok(Arc::clone( &buff .get() - .ok_or_else(|| bug("invalid cache state"))? + .ok_or_else(|| error("invalid cache state"))? .content, )), - // If cache file name, it need to read and rewrite.
+ // If cached in a file, it needs to be read and rewritten; the file is + // encrypted at rest when a data key manager is configured CacheKvFile::Fs(path) => { - let file = File::open(path.as_ref())?; - let mut reader = BufReader::new(file); let mut buffer = Vec::new(); - reader.read_to_end(&mut buffer)?; - + if let Some(key_manager) = self.key_manager.clone() { + let mut decrypter_reader = key_manager.open_file_for_read(path.as_ref())?; + decrypter_reader.read_to_end(&mut buffer)?; + } else { + let file = File::open(path.as_ref())?; + let mut reader = BufReader::new(file); + reader.read_to_end(&mut buffer)?; + } Ok(Arc::from(buffer.into_boxed_slice())) } } } - pub async fn do_download_kv_file( + pub async fn download_kv_file_to_disk( &self, meta: &KvMeta, backend: &StorageBackend, speed_limiter: &Limiter, + opt_file_encryption_info: Option, + opt_encrypted_file_checksum: Option>, ) -> Result { let offset = meta.get_range_offset(); let src_name = meta.get_name(); let dst_name = format!("{}_{}", src_name, offset); let path = self.dir.get_import_path(&dst_name)?; let start = Instant::now(); - let sha256 = meta.get_sha256().to_vec(); - let expected_sha256 = if !sha256.is_empty() { - Some(sha256) + let plaintext_file_checksum = meta.get_sha256().to_vec(); + let expected_plaintext_checksum = if !plaintext_file_checksum.is_empty() { + Some(plaintext_file_checksum) } else { None }; @@ -868,19 +925,20 @@ } else { Some((offset, range_length)) }; - let restore_config = external_storage::RestoreConfig { + + let restore_config = RestoreConfig { range, compression_type: Some(meta.compression_type), - expected_sha256, - file_crypter: None, + expected_plaintext_file_checksum: expected_plaintext_checksum, + file_crypter: opt_file_encryption_info, + opt_encrypted_file_checksum, }; + self.async_download_file_from_external_storage( meta.get_length(), src_name, path.temp.clone(), backend, - false, - // don't support encrypt for now. speed_limiter, "", restore_config, @@ -896,7 +954,7 @@ if let Some(p) = path.save.parent() { // we have v1 prefix in file name. file_system::create_dir_all(p).or_else(|e| { - if e.kind() == io::ErrorKind::AlreadyExists { + if e.kind() == ErrorKind::AlreadyExists { Ok(()) } else { Err(e) @@ -904,7 +962,12 @@ })?; } - file_system::rename(path.temp, path.save)?; + if let Some(manager) = self.key_manager.clone() { + manager.rename_file(&path.temp, &path.save)?; + } else { + file_system::rename(path.temp.clone(), path.save.clone())?; + } + IMPORTER_APPLY_DURATION .with_label_values(&["download"]) .observe(start.saturating_elapsed().as_secs_f64()); @@ -1099,7 +1162,7 @@ iv: meta.cipher_iv.to_owned(), }); - let restore_config = external_storage::RestoreConfig { + let restore_config = RestoreConfig { file_crypter, ..Default::default() }; @@ -1109,7 +1172,6 @@ name, path.temp.clone(), backend, - true, speed_limiter, ext.cache_key.unwrap_or(""), restore_config, @@ -1334,7 +1396,7 @@ } } - let _ = file_system::remove_file(&path.temp); + self.remove_file_no_throw(&path.temp); IMPORTER_DOWNLOAD_DURATION .with_label_values(&["rewrite"]) @@ -1355,7 +1417,7 @@ // nothing is written: prevents finishing the SST at all.
// also delete the empty sst file that is created when creating sst_writer drop(sst_writer); - let _ = file_system::remove_file(&path.save); + self.remove_file_no_throw(&path.save); Ok(None) } } @@ -1416,6 +1478,107 @@ impl SstImporter { self.api_version, )) } + + async fn extract_file_encryption_info( + &self, + kv_meta: &KvMeta, + opt_cipher_info: Option, + ) -> Result> { + if let Some(encryption_info) = kv_meta.file_encryption_info.as_ref() { + if let Some(encryption_info_mode) = &encryption_info.mode { + match encryption_info_mode { + FileEncryptionInfo_oneof_mode::PlainTextDataKey(_) => { + if let Some(cipher_info) = opt_cipher_info { + if cipher_info.cipher_type == EncryptionMethod::Unknown + || cipher_info.cipher_type == EncryptionMethod::Plaintext + { + return Err(error( + "plaintext data key needed from client but plaintext or unknown provided", + )); + } + Ok(Some(FileEncryptionInfo { + method: cipher_info.cipher_type, + key: cipher_info.cipher_key, + iv: encryption_info.file_iv.clone(), + })) + } else { + Err(error( + "plaintext data key needed from client but not provided", + )) + } + } + FileEncryptionInfo_oneof_mode::MasterKeyBased(parsed_master_key_info) => { + // sanity check + if self.multi_master_keys_backend.is_initialized().await { + // decrypt encrypted data key + if parsed_master_key_info.data_key_encrypted_content.is_empty() { + return Err(error( + "internal error: couldn't find any encrypted data key information for log backup file", + )); + } + // get the first key for the current impl + // the field is a list for future extension + // when multiple master key backends are provided for high availability. + let plaintext_data_key = self + .multi_master_keys_backend + .decrypt( + parsed_master_key_info + .data_key_encrypted_content + .first() + .unwrap(), + ) + .await + .map_err(|e| { + error(format!("failed to decrypt encrypted data key: {:?}", e)) + })?; + Ok(Some(FileEncryptionInfo { + method: encryption_info.encryption_method, + key: plaintext_data_key, + iv: encryption_info.file_iv.clone(), + })) + } else { + Err(error( + "internal error: need to decrypt data key but multi master key backends is not initialized", + )) + } + } + } + } else { + // encryption info set but empty, should never happen + Err(error( + "internal error: encryption information is set in the kv file but empty, should never happen", + )) + } + } else { + // doesn't have encryption info, plaintext log backup files. 
+ Ok(None) + } + } + + fn remove_file_no_throw(&self, path_buf: &PathBuf) { + // remove from file system + if let Err(e) = file_system::remove_file(path_buf) { + warn!("failed to remove file"; "filename" => ?path_buf, "error" => ?e); + } + // remove tracking from key manager if needed + if let Some(key_manager) = self.key_manager.as_ref() { + if let Err(e) = key_manager.delete_file(&path_buf.to_string_lossy(), None) { + warn!("failed to remove file from key manager"; "filename" => ?path_buf, "error" => ?e); + } + } + } +} + +fn extract_checksum_info(kv_meta: &KvMeta) -> Option> { + if let Some(encryption_info) = kv_meta.file_encryption_info.as_ref() { + if encryption_info.checksum.is_empty() { + None + } else { + Some(encryption_info.checksum.clone()) + } + } else { + None + } } fn key_to_bound(key: &[u8]) -> Bound<&[u8]> { @@ -1453,25 +1616,34 @@ fn is_after_end_bound>(value: &[u8], bound: &Bound) -> bool { #[cfg(test)] mod tests { use std::{ - io::{self, BufWriter, Write}, + io::{self, Cursor}, ops::Sub, usize, }; + use async_compression::tokio::write::ZstdEncoder; + use encryption::{EncrypterWriter, Iv}; use engine_rocks::get_env; use engine_traits::{ collect, Error as TraitError, ExternalSstFileInfo, Iterable, Iterator, RefIterable, - SstReader, SstWriter, CF_DEFAULT, DATA_CFS, + SstCompressionType::Zstd, SstReader, SstWriter, CF_DEFAULT, DATA_CFS, }; use external_storage::read_external_storage_info_buff; - use file_system::File; - use kvproto::encryptionpb::EncryptionMethod; + use file_system::Sha256Reader; + use kvproto::{ + brpb::CompressionType, + encryptionpb, + encryptionpb::{EncryptionMethod, MasterKeyBased, MasterKeyFile, PlainTextDataKey}, + }; use online_config::{ConfigManager, OnlineConfig}; use openssl::hash::{Hasher, MessageDigest}; - use tempfile::Builder; + use rand::Rng; + use tempfile::{Builder, TempDir}; use test_sst_importer::*; use test_util::new_test_key_manager; use tikv_util::{codec::stream_event::EventEncoder, stream::block_on_external_io}; + use tokio::io::{AsyncWrite, AsyncWriteExt}; + use tokio_util::compat::{FuturesAsyncWriteCompatExt, TokioAsyncWriteCompatExt}; use txn_types::{Value, WriteType}; use uuid::Uuid; @@ -1632,9 +1804,9 @@ mod tests { content_a == content_b } - fn new_key_manager_for_test() -> (tempfile::TempDir, Arc) { + fn new_key_manager_for_test() -> (TempDir, Arc) { // test with tde - let tmp_dir = tempfile::TempDir::new().unwrap(); + let tmp_dir = TempDir::new().unwrap(); let key_manager = new_test_key_manager(&tmp_dir, None, None, None); (tmp_dir, Arc::new(key_manager.unwrap().unwrap())) } @@ -1651,7 +1823,7 @@ mod tests { fn create_external_sst_file_with_write_fn( write_fn: F, - ) -> Result<(tempfile::TempDir, StorageBackend, SstMeta)> + ) -> Result<(TempDir, StorageBackend, SstMeta)> where F: FnOnce(&mut RocksSstWriter) -> Result<()>, { @@ -1675,7 +1847,7 @@ mod tests { Ok((ext_sst_dir, backend, meta)) } - fn create_sample_external_sst_file() -> Result<(tempfile::TempDir, StorageBackend, SstMeta)> { + fn create_sample_external_sst_file() -> Result<(TempDir, StorageBackend, SstMeta)> { create_external_sst_file_with_write_fn(|writer| { writer.put(b"zt123_r01", b"abc")?; writer.put(b"zt123_r04", b"xyz")?; @@ -1686,14 +1858,102 @@ mod tests { }) } - fn create_sample_external_kv_file() - -> Result<(tempfile::TempDir, StorageBackend, KvMeta, Vec)> { + fn create_sample_external_kv_file() -> Result<(TempDir, StorageBackend, KvMeta, Vec)> { + create_sample_external_kv_file_with_optional_encryption( + None, + Vec::new(), + 
EncryptionMethod::Plaintext, + false, + ) + } + fn create_sample_external_kv_file_with_optional_encryption( + opt_cipher_info: Option, + master_key_configs: Vec, + master_key_based_data_encryption_method: EncryptionMethod, + compression: bool, + ) -> Result<(TempDir, StorageBackend, KvMeta, Vec)> { let ext_dir = tempfile::tempdir()?; let file_name = "v1/t000001/abc.log"; let file_path = ext_dir.path().join(file_name); std::fs::create_dir_all(file_path.parent().unwrap())?; - let file = File::create(file_path).unwrap(); - let mut buff = BufWriter::new(file); + let file = block_on_external_io(tokio::fs::File::create(file_path.clone())).unwrap(); + + // write to a buffer first, later flush to disk + // + let mut file_buffer = Vec::new(); + let cursor = Cursor::new(&mut file_buffer); + let buf_writer = tokio::io::BufWriter::new(cursor); + let mut kv_meta = KvMeta::default(); + + // writer should compress the data first then encrypt, + // wrapping zstdEncoder around Encrypter + // + let writer = if let Some(cipher_info) = opt_cipher_info { + let iv = Iv::new_ctr().unwrap(); + // update meta + // + let mut encryption_info = encryptionpb::FileEncryptionInfo::new(); + encryption_info.set_file_iv(iv.as_slice().to_vec()); + encryption_info.set_encryption_method(cipher_info.cipher_type); + encryption_info.set_plain_text_data_key(PlainTextDataKey::new()); + kv_meta.set_file_encryption_info(encryption_info); + + Box::new( + EncrypterWriter::new( + buf_writer.compat_write(), + cipher_info.cipher_type, + &cipher_info.cipher_key, + iv, + ) + .unwrap() + .compat_write(), + ) as Box + } else if !master_key_configs.is_empty() { + let multi_master_key_backend = MultiMasterKeyBackend::new(); + block_on_external_io( + multi_master_key_backend + .update_from_proto_if_needed(master_key_configs, create_async_backend), + ) + .unwrap(); + + let iv = Iv::new_ctr().unwrap(); + let plaintext_data_key = multi_master_key_backend + .generate_data_key(master_key_based_data_encryption_method) + .unwrap(); + + let encryption_info = + block_on_external_io(multi_master_key_backend.encrypt(&plaintext_data_key)) + .unwrap(); + + let mut encryption_info_proto = encryptionpb::FileEncryptionInfo::new(); + let mut master_key_proto = MasterKeyBased::new(); + encryption_info_proto.set_file_iv(iv.as_slice().to_vec()); + encryption_info_proto.set_encryption_method(master_key_based_data_encryption_method); + master_key_proto.set_data_key_encrypted_content(protobuf::RepeatedField::from_vec( + vec![encryption_info], + )); + encryption_info_proto.set_master_key_based(master_key_proto); + kv_meta.set_file_encryption_info(encryption_info_proto); + + Box::new( + EncrypterWriter::new( + buf_writer.compat_write(), + master_key_based_data_encryption_method, + &plaintext_data_key, + iv, + ) + .unwrap() + .compat_write(), + ) as Box + } else { + Box::new(buf_writer) as Box + }; + + let mut writer = if compression { + Box::new(ZstdEncoder::new(writer)) as Box + } else { + writer + }; let kvs = vec![ (b"t1_r01".to_vec(), b"tidb".to_vec()), @@ -1704,23 +1964,45 @@ mod tests { let mut sha256 = Hasher::new(MessageDigest::sha256()).unwrap(); let mut len = 0; + let mut buf = vec![]; for kv in kvs { let encoded = EventEncoder::encode_event(&kv.0, &kv.1); for slice in encoded { - len += buff.write(slice.as_ref()).unwrap(); + len += block_on_external_io(writer.write(slice.as_ref())).unwrap(); sha256.update(slice.as_ref()).unwrap(); + buf.extend_from_slice(slice.as_ref()); } } + block_on_external_io(writer.flush()).unwrap(); + drop(writer); + + // calc 
checksum of the file buffer + // + if kv_meta.has_file_encryption_info() { + let mut tmp_buf = Vec::new(); + let (mut checksum_reader, hasher) = + Sha256Reader::new(Cursor::new(&mut file_buffer)).unwrap(); + checksum_reader.read_to_end(&mut tmp_buf).unwrap(); + let checksum = hasher.lock().unwrap().finish().unwrap().to_vec(); + kv_meta.mut_file_encryption_info().set_checksum(checksum); + } + + // actually write to disk + // + let mut buf_writer = tokio::io::BufWriter::new(file); + block_on_external_io(buf_writer.write_all(&file_buffer)).unwrap(); + block_on_external_io(buf_writer.flush()).unwrap(); - let mut kv_meta = KvMeta::default(); kv_meta.set_name(file_name.to_string()); kv_meta.set_cf(String::from("default")); kv_meta.set_is_delete(false); kv_meta.set_length(len as _); kv_meta.set_sha256(sha256.finish().unwrap().to_vec()); - + if compression { + kv_meta.set_compression_type(CompressionType::Zstd); + } let backend = external_storage::make_local_backend(ext_dir.path()); - Ok((ext_dir, backend, kv_meta, buff.buffer().to_vec())) + Ok((ext_dir, backend, kv_meta, buf)) } fn create_sample_external_rawkv_sst_file( @@ -1746,7 +2028,7 @@ mod tests { fn get_encoded_key(key: &[u8], ts: u64) -> Vec { keys::data_key( - txn_types::Key::from_raw(key) + Key::from_raw(key) .append_ts(TimeStamp::new(ts)) .as_encoded(), ) @@ -1875,6 +2157,8 @@ mod tests { input_len, Some(hash256), 8192, + None, + None, )) .unwrap(); assert_eq!(&*output, data); @@ -1893,9 +2177,11 @@ mod tests { 0, None, usize::MAX, + None, + None, )) .unwrap_err(); - assert_eq!(err.kind(), io::ErrorKind::TimedOut); + assert_eq!(err.kind(), ErrorKind::TimedOut); } #[test] @@ -1916,11 +2202,13 @@ mod tests { len, Some(sha_256.clone()), 0, + None, + None, )) .unwrap(); assert_eq!(&output, data); - // test without expected_sha245. + // test without expected_sha256. reader = data; let output = block_on_external_io(read_external_storage_info_buff( &mut reader, @@ -1928,11 +2216,13 @@ mod tests { len, None, 0, + None, + None, )) .unwrap(); assert_eq!(&output, data); - // test with wrong expectd_len. + // test with wrong expected len. reader = data; let err = block_on_external_io(read_external_storage_info_buff( &mut reader, @@ -1940,6 +2230,8 @@ mod tests { len + 1, Some(sha_256.clone()), 0, + None, + None, )) .unwrap_err(); assert!(err.to_string().contains("length not match")); @@ -1952,6 +2244,8 @@ mod tests { len, Some(sha_256[..sha_256.len() - 1].to_vec()), 0, + None, + None, )) .unwrap_err(); assert!(err.to_string().contains("sha256 not match")); @@ -1968,14 +2262,16 @@ mod tests { 0, None, usize::MAX, + None, + None, )) .unwrap_err(); - assert_eq!(err.kind(), io::ErrorKind::TimedOut); + assert_eq!(err.kind(), ErrorKind::TimedOut); } #[test] fn test_update_config_memory_use_ratio() { - // create SstImpoter with default. + // create SstImporter with default. let cfg = Config { memory_use_ratio: 0.3, ..Default::default() @@ -2022,7 +2318,7 @@ mod tests { } #[test] - fn test_do_read_kv_file() { + fn test_download_kv_file_to_mem_cache() { // create a sample kv file. 
let (_temp_dir, backend, kv_meta, buff) = create_sample_external_kv_file().unwrap(); @@ -2038,17 +2334,17 @@ mod tests { ) .unwrap(); let ext_storage = { - importer.wrap_kms( + importer.auto_encrypt_local_file_if_needed( importer.external_storage_or_cache(&backend, "").unwrap(), - false, ) }; - // test do_read_kv_file() - let output = block_on_external_io(importer.do_read_kv_file( + let output = block_on_external_io(importer.download_kv_file_to_mem_cache( &kv_meta, ext_storage, &Limiter::new(f64::INFINITY), + None, + None, )) .unwrap(); @@ -2058,7 +2354,7 @@ mod tests { output ); - // Do not shrint nothing. + // Do not shrink nothing. let shrink_size = importer.shrink_by_tick(); assert_eq!(shrink_size, 0); assert_eq!(importer.file_locks.len(), 1); @@ -2079,7 +2375,7 @@ mod tests { } #[test] - fn test_read_kv_files_from_external_storage() { + fn test_download_kv_files_from_external_storage_to_mem() { // create a sample kv file. let (_temp_dir, backend, kv_meta, buff) = create_sample_external_kv_file().unwrap(); @@ -2095,20 +2391,19 @@ mod tests { ) .unwrap(); let ext_storage = { - let inner = importer.wrap_kms( + let inner = importer.auto_encrypt_local_file_if_needed( importer.external_storage_or_cache(&backend, "").unwrap(), - false, ); Arc::new(inner) }; // test read all of the file. - let restore_config = external_storage::RestoreConfig { - expected_sha256: Some(kv_meta.get_sha256().to_vec()), + let restore_config = RestoreConfig { + expected_plaintext_file_checksum: Some(kv_meta.get_sha256().to_vec()), ..Default::default() }; - let output = block_on_external_io(importer.read_kv_files_from_external_storage( + let output = block_on_external_io(importer.download_kv_files_from_external_storage_to_mem( kv_meta.get_length(), kv_meta.get_name(), ext_storage.clone(), @@ -2126,12 +2421,12 @@ mod tests { // test read range of the file. let (offset, len) = (5, 16); - let restore_config = external_storage::RestoreConfig { + let restore_config = RestoreConfig { range: Some((offset, len)), ..Default::default() }; - let output = block_on_external_io(importer.read_kv_files_from_external_storage( + let output = block_on_external_io(importer.download_kv_files_from_external_storage_to_mem( len, kv_meta.get_name(), ext_storage, @@ -2157,15 +2452,14 @@ mod tests { let importer = SstImporter::::new( &cfg, import_dir, - Some(key_manager), + Some(key_manager.clone()), ApiVersion::V1, false, ) .unwrap(); let ext_storage = { - importer.wrap_kms( + importer.auto_encrypt_local_file_if_needed( importer.external_storage_or_cache(&backend, "").unwrap(), - false, ) }; let path = importer @@ -2176,16 +2470,18 @@ mod tests { .unwrap(); // test do_download_kv_file(). - assert!(importer.import_support_download()); - let output = block_on_external_io(importer.read_from_kv_file( + assert!(importer.download_to_disk_only()); + let output = block_on_external_io(importer.download_kv_file( &kv_meta, ext_storage, &backend, &Limiter::new(f64::INFINITY), + None, + Vec::new(), )) .unwrap(); assert_eq!(*output, buff); - check_file_exists(&path.save, None); + check_file_exists(&path.save, Some(&*key_manager)); // test shrink nothing. let shrint_files_cnt = importer.shrink_by_tick(); @@ -2197,7 +2493,7 @@ mod tests { } let shrint_files_cnt = importer.shrink_by_tick(); assert_eq!(shrint_files_cnt, 1); - check_file_not_exists(&path.save, None); + check_file_not_exists(&path.save, Some(&*key_manager)); } #[test] @@ -2220,14 +2516,13 @@ mod tests { // perform download file into .temp dir. 
let file_name = "sample.sst"; let path = importer.dir.get_import_path(file_name).unwrap(); - let restore_config = external_storage::RestoreConfig::default(); + let restore_config = RestoreConfig::default(); importer .download_file_from_external_storage( meta.get_length(), file_name, path.temp.clone(), &backend, - true, &Limiter::new(f64::INFINITY), restore_config, ) @@ -2248,15 +2543,15 @@ mod tests { let importer = SstImporter::::new( &Config::default(), import_dir, - Some(key_manager), + Some(key_manager.clone()), ApiVersion::V1, false, ) .unwrap(); let path = importer.dir.get_import_path(kv_meta.get_name()).unwrap(); - let restore_config = external_storage::RestoreConfig { - expected_sha256: Some(kv_meta.get_sha256().to_vec()), + let restore_config = RestoreConfig { + expected_plaintext_file_checksum: Some(kv_meta.get_sha256().to_vec()), ..Default::default() }; importer @@ -2265,13 +2560,13 @@ mod tests { kv_meta.get_name(), path.temp.clone(), &backend, - false, &Limiter::new(f64::INFINITY), restore_config, ) .unwrap(); - assert!(check_file_is_same( + check_file_exists(&path.temp, Some(&key_manager)); + assert!(!check_file_is_same( &_temp_dir.path().join(kv_meta.get_name()), &path.temp, )); @@ -2347,7 +2642,7 @@ mod tests { .unwrap(); let db_path = temp_dir.path().join("db"); - let env = get_env(Some(key_manager), None /* io_rate_limiter */).unwrap(); + let env = get_env(Some(key_manager.clone()), None /* io_rate_limiter */).unwrap(); let db = new_test_engine_with_env(db_path.to_str().unwrap(), DATA_CFS, env.clone()); let range = importer @@ -2372,6 +2667,12 @@ mod tests { assert!(sst_file_metadata.is_file()); assert_eq!(sst_file_metadata.len(), meta.get_length()); + // verified the tmp files are correctly cleaned up + check_file_not_exists( + importer.dir.join_for_read(&meta).unwrap().temp.as_path(), + Some(&*key_manager), + ); + // verifies the SST content is correct. 
let sst_reader = new_sst_reader(sst_file_path.to_str().unwrap(), Some(env)); sst_reader.verify_checksum().unwrap(); @@ -3026,7 +3327,7 @@ mod tests { let mut importer = SstImporter::::new(&cfg, &importer_dir, None, ApiVersion::V1, false) .unwrap(); - importer.set_compression_type(CF_DEFAULT, Some(SstCompressionType::Zstd)); + importer.set_compression_type(CF_DEFAULT, Some(Zstd)); let db_path = importer_dir.path().join("db"); let db = new_test_engine(db_path.to_str().unwrap(), DATA_CFS); @@ -3081,7 +3382,7 @@ mod tests { false, ) .unwrap(); - assert_eq!(importer.import_support_download(), false); + assert_eq!(importer.download_to_disk_only(), false); let import_dir = tempfile::tempdir().unwrap(); let importer = SstImporter::::new( @@ -3095,7 +3396,7 @@ mod tests { false, ) .unwrap(); - assert_eq!(importer.import_support_download(), true); + assert_eq!(importer.download_to_disk_only(), true); } #[test] @@ -3164,4 +3465,119 @@ mod tests { let _buff = v.0.clone(); assert_eq!(v.0.ref_count(), 2); } + + #[test] + fn test_download_kv_with_no_encryption() { + // test both on disk and in mem case + // + test_download_kv_with_optional_encryption(None, Vec::new(), true, true); + test_download_kv_with_optional_encryption(None, Vec::new(), false, true); + test_download_kv_with_optional_encryption(None, Vec::new(), true, false); + test_download_kv_with_optional_encryption(None, Vec::new(), false, false); + } + + #[test] + fn test_download_kv_with_plaintext_data_key() { + let data_key: [u8; 32] = rand::thread_rng().gen(); + let mut cipher = CipherInfo::new(); + cipher.set_cipher_key(data_key.to_vec()); + cipher.set_cipher_type(EncryptionMethod::Aes256Ctr); + + // test both on disk and in mem case + // + test_download_kv_with_optional_encryption(Some(cipher.clone()), Vec::new(), true, true); + test_download_kv_with_optional_encryption(Some(cipher.clone()), Vec::new(), false, true); + test_download_kv_with_optional_encryption(Some(cipher.clone()), Vec::new(), true, false); + test_download_kv_with_optional_encryption(Some(cipher), Vec::new(), false, false); + } + + #[test] + fn test_download_kv_with_master_key_based() { + // set up file backed master key + // + let hex_bytes = encryption::test_utils::generate_random_master_key(); + let (path, _dir) = encryption::test_utils::create_master_key_file_test_only(&hex_bytes); + + let mut master_key_file_proto = MasterKeyFile::new(); + master_key_file_proto.set_path(path.to_string_lossy().into_owned()); + + let mut master_key_proto = MasterKey::new(); + master_key_proto.set_file(master_key_file_proto); + + let master_key_proto_vec = vec![master_key_proto]; + + // test both on disk and in mem case + // + test_download_kv_with_optional_encryption(None, master_key_proto_vec.clone(), true, true); + test_download_kv_with_optional_encryption(None, master_key_proto_vec.clone(), false, true); + test_download_kv_with_optional_encryption(None, master_key_proto_vec.clone(), true, false); + test_download_kv_with_optional_encryption(None, master_key_proto_vec.clone(), false, false); + } + + fn test_download_kv_with_optional_encryption( + opt_cipher_info: Option, + master_key_configs: Vec, + in_mem: bool, + with_local_file_encryption: bool, + ) { + // set up external kv file + // + let (_dir, storage_backend, kv_meta, file_content) = + create_sample_external_kv_file_with_optional_encryption( + opt_cipher_info.clone(), + master_key_configs.clone(), + EncryptionMethod::Aes256Ctr, + true, + ) + .unwrap(); + + // set up importer + // + let import_dir = tempfile::tempdir().unwrap(); 
+ let opt_key_manager = if with_local_file_encryption { + let (_, key_manager) = new_key_manager_for_test(); + Some(key_manager) + } else { + None + }; + let cfg = Config { + memory_use_ratio: if in_mem { 0.5 } else { 0.0 }, + ..Default::default() + }; + let importer = SstImporter::::new( + &cfg, + import_dir, + opt_key_manager.clone(), + ApiVersion::V1, + false, + ) + .unwrap(); + let ext_storage = { + importer.auto_encrypt_local_file_if_needed( + importer + .external_storage_or_cache(&storage_backend, "") + .unwrap(), + ) + }; + let path = importer + .dir + .get_import_path( + format!("{}_{}", kv_meta.get_name(), kv_meta.get_range_offset()).as_str(), + ) + .unwrap(); + + let output = block_on_external_io(importer.download_kv_file( + &kv_meta, + ext_storage, + &storage_backend, + &Limiter::new(f64::INFINITY), + opt_cipher_info, + master_key_configs, + )) + .unwrap(); + assert_eq!(*output, file_content); + if !in_mem { + check_file_exists(&path.save, opt_key_manager.as_deref()); + } + } } diff --git a/components/test_backup/Cargo.toml b/components/test_backup/Cargo.toml index a9d19120453..7cda986b48b 100644 --- a/components/test_backup/Cargo.toml +++ b/components/test_backup/Cargo.toml @@ -9,12 +9,10 @@ license = "Apache-2.0" api_version = { workspace = true } backup = { workspace = true } collections = { workspace = true } -concurrency_manager = { workspace = true } crc64fast = "0.1" engine_rocks = { workspace = true } engine_traits = { workspace = true } external_storage ={ workspace = true } -file_system = { workspace = true } futures = "0.3" futures-executor = "0.3" futures-util = { version = "0.3", default-features = false, features = ["io"] } @@ -23,7 +21,6 @@ kvproto = { workspace = true } protobuf = "2" raftstore = { workspace = true } rand = "0.8" -tempfile = "3.0" test_raftstore = { workspace = true } tidb_query_common = { workspace = true } tikv = { workspace = true } diff --git a/components/test_backup/src/disk_snap.rs b/components/test_backup/src/disk_snap.rs index af7d7f2ebb3..0f06efa53a8 100644 --- a/components/test_backup/src/disk_snap.rs +++ b/components/test_backup/src/disk_snap.rs @@ -7,7 +7,6 @@ use std::{ }; use backup::disk_snap::Env as BEnv; -use engine_rocks::RocksEngine as KTE; use futures_executor::block_on; use futures_util::{ sink::SinkExt, @@ -40,7 +39,7 @@ pub struct Node { } pub struct Suite { - pub cluster: Cluster>, + pub cluster: Cluster, pub nodes: HashMap, grpc_env: Arc, } @@ -50,7 +49,7 @@ impl Suite { let rej = Arc::new(PrepareDiskSnapObserver::default()); let rej2 = rej.clone(); let mut w = self.cluster.sim.wl(); - w.coprocessor_hooks + w.coprocessor_hosts .entry(id) .or_default() .push(Box::new(move |host| { diff --git a/components/test_backup/src/lib.rs b/components/test_backup/src/lib.rs index 5ea853799b5..cf68db05833 100644 --- a/components/test_backup/src/lib.rs +++ b/components/test_backup/src/lib.rs @@ -13,7 +13,6 @@ use backup::Task; use collections::HashMap; // NOTE: Perhaps we'd better use test engine here. But it seems for now we cannot initialize a // mock cluster with `PanicEngine` and in our CI environment clippy will complain that. 
-use engine_rocks::RocksEngine as KTE; use engine_traits::{CfName, IterOptions, CF_DEFAULT, CF_WRITE, DATA_KEY_PREFIX_LEN}; use external_storage::make_local_backend; use futures::{channel::mpsc as future_mpsc, executor::block_on}; @@ -44,7 +43,7 @@ use txn_types::TimeStamp; pub mod disk_snap; pub struct TestSuite { - pub cluster: Cluster>, + pub cluster: Cluster, pub endpoints: HashMap>, pub tikv_cli: TikvClient, pub context: Context, diff --git a/components/test_coprocessor/Cargo.toml b/components/test_coprocessor/Cargo.toml index f3af09512eb..9ebd6f46da9 100644 --- a/components/test_coprocessor/Cargo.toml +++ b/components/test_coprocessor/Cargo.toml @@ -21,12 +21,12 @@ test-engines-panic = [ ] [dependencies] -api_version = { workspace = true } +codec = { workspace = true } collections = { workspace = true } concurrency_manager = { workspace = true } -engine_rocks = { workspace = true } futures = "0.3" kvproto = { workspace = true } +pd_client = { workspace = true } protobuf = "2" resource_metering = { workspace = true } test_storage = { workspace = true } diff --git a/components/test_coprocessor/src/dag.rs b/components/test_coprocessor/src/dag.rs index 76e91cc6ef5..07f7155b2f9 100644 --- a/components/test_coprocessor/src/dag.rs +++ b/components/test_coprocessor/src/dag.rs @@ -12,6 +12,7 @@ use tipb::{ Aggregation, ByItem, Chunk, ColumnInfo, DagRequest, ExecType, Executor, Expr, ExprType, IndexScan, Limit, Selection, TableScan, TopN, }; +use txn_types::TimeStamp; use super::*; @@ -25,6 +26,7 @@ pub struct DagSelect { pub key_ranges: Vec, pub output_offsets: Option>, pub paging_size: Option, + pub start_ts: Option, } impl DagSelect { @@ -48,6 +50,7 @@ impl DagSelect { key_ranges: vec![table.get_record_range_all()], output_offsets: None, paging_size: None, + start_ts: None, } } @@ -75,6 +78,7 @@ impl DagSelect { key_ranges: vec![range], output_offsets: None, paging_size: None, + start_ts: None, } } @@ -213,6 +217,11 @@ impl DagSelect { self } + pub fn start_ts(mut self, start_ts: TimeStamp) -> DagSelect { + self.start_ts = Some(start_ts.into_inner()); + self + } + pub fn build(self) -> Request { self.build_with(Context::default(), &[0]) } @@ -267,7 +276,7 @@ impl DagSelect { dag.set_output_offsets(output_offsets); let mut req = Request::default(); - req.set_start_ts(next_id() as u64); + req.set_start_ts(self.start_ts.unwrap_or_else(|| next_id() as u64)); req.set_tp(REQ_TYPE_DAG); req.set_data(dag.write_to_bytes().unwrap()); req.set_ranges(self.key_ranges.into()); diff --git a/components/test_coprocessor/src/fixture.rs b/components/test_coprocessor/src/fixture.rs index 57446b8d4f9..08299db2803 100644 --- a/components/test_coprocessor/src/fixture.rs +++ b/components/test_coprocessor/src/fixture.rs @@ -4,6 +4,7 @@ use std::sync::Arc; use concurrency_manager::ConcurrencyManager; use kvproto::kvrpcpb::Context; +use pd_client::PdClient; use resource_metering::ResourceTagFactory; use tidb_query_datatype::codec::{row::v2::CODEC_VERSION, Datum}; use tikv::{ @@ -45,6 +46,10 @@ impl ProductTable { .build(); ProductTable(table) } + + pub fn table_id(&self) -> i64 { + self.0.id + } } impl Default for ProductTable { @@ -71,7 +76,7 @@ pub fn init_data_with_engine_and_commit( init_data_with_details(ctx, engine, tbl, vals, commit, &Config::default()) } -pub fn init_data_with_engine_and_commit_v2_checksum( +fn init_data_with_engine_and_commit_v2_checksum( ctx: Context, engine: E, tbl: &ProductTable, @@ -100,10 +105,24 @@ pub fn init_data_with_details( commit: bool, cfg: &Config, ) -> (Store, Endpoint, Arc) { 
-    init_data_with_details_impl(ctx, engine, tbl, vals, commit, cfg, 0, false, None)
+    init_data_with_details_impl(ctx, engine, tbl, vals, commit, cfg, 0, false, None, None)
+}
+
+pub fn init_data_with_details_pd_client<E: Engine>(
+    ctx: Context,
+    engine: E,
+    tbl: &ProductTable,
+    vals: &[(i64, Option<&str>, i64)],
+    commit: bool,
+    cfg: &Config,
+    pd_client: Option<Arc<dyn PdClient>>,
+) -> (Store<E>, Endpoint<E>, Arc<QuotaLimiter>) {
+    init_data_with_details_impl(
+        ctx, engine, tbl, vals, commit, cfg, 0, false, None, pd_client,
+    )
 }
 
-pub fn init_data_with_details_v2_checksum<E: Engine>(
+fn init_data_with_details_v2_checksum<E: Engine>(
     ctx: Context,
     engine: E,
     tbl: &ProductTable,
@@ -123,6 +142,7 @@ pub fn init_data_with_details_v2_checksum<E: Engine>(
         CODEC_VERSION,
         with_checksum,
         extra_checksum,
+        None,
     )
 }
 
@@ -136,11 +156,12 @@ fn init_data_with_details_impl<E: Engine>(
     codec_ver: u8,
     with_checksum: bool,
     extra_checksum: Option<u32>,
+    pd_client: Option<Arc<dyn PdClient>>,
 ) -> (Store<E>, Endpoint<E>, Arc<QuotaLimiter>) {
     let storage = TestStorageBuilderApiV1::from_engine_and_lock_mgr(engine, MockLockManager::new())
         .build()
         .unwrap();
-    let mut store = Store::from_storage(storage);
+    let mut store = Store::from_storage_pd_client(storage, pd_client);
     store.begin();
 
     for &(id, name, count) in vals {
diff --git a/components/test_coprocessor/src/store.rs b/components/test_coprocessor/src/store.rs
index e5589969911..72d4c3ce157 100644
--- a/components/test_coprocessor/src/store.rs
+++ b/components/test_coprocessor/src/store.rs
@@ -1,9 +1,10 @@
 // Copyright 2018 TiKV Project Authors. Licensed under Apache-2.0.
 
-use std::collections::BTreeMap;
+use std::{collections::BTreeMap, sync::Arc, time::Duration};
 
 use collections::HashMap;
 use kvproto::kvrpcpb::{Context, IsolationLevel};
+use pd_client::PdClient;
 use test_storage::SyncTestStorageApiV1;
 use tidb_query_datatype::{
     codec::{
@@ -23,6 +24,7 @@ use tikv::{
         SnapshotStore, StorageApiV1, TestStorageBuilderApiV1,
     },
 };
+use tikv_util::future::block_on_timeout;
 use txn_types::{Key, Mutation, TimeStamp};
 
 use super::*;
@@ -166,6 +168,7 @@ pub struct Store<E: Engine> {
     current_ts: TimeStamp,
     last_committed_ts: TimeStamp,
     handles: Vec<Vec<u8>>,
+    pd_client: Option<Arc<dyn PdClient>>,
 }
 
 impl Store<RocksEngine> {
@@ -185,11 +188,29 @@ impl Default for Store<RocksEngine> {
 
 impl<E: Engine> Store<E> {
     pub fn from_storage(storage: StorageApiV1<E>) -> Self {
+        Self::from_storage_pd_client(storage, None)
+    }
+
+    pub fn from_storage_pd_client(
+        storage: StorageApiV1<E>,
+        pd_client: Option<Arc<dyn PdClient>>,
+    ) -> Self {
         Self {
             store: SyncTestStorageApiV1::from_storage(0, storage, GcConfig::default()).unwrap(),
             current_ts: 1.into(),
             last_committed_ts: TimeStamp::zero(),
             handles: vec![],
+            pd_client,
+        }
+    }
+
+    fn get_ts(&self) -> TimeStamp {
+        if let Some(client) = self.pd_client.as_ref() {
+            block_on_timeout(client.get_tso(), Duration::from_secs(5))
+                .unwrap()
+                .unwrap()
+        } else {
+            (next_id() as u64).into()
         }
     }
 
@@ -198,7 +219,7 @@ impl<E: Engine> Store<E> {
     }
 
     pub fn begin(&mut self) {
-        self.current_ts = (next_id() as u64).into();
+        self.current_ts = self.get_ts();
         self.handles.clear();
     }
 
@@ -233,7 +254,7 @@ impl<E: Engine> Store<E> {
     }
 
     pub fn commit_with_ctx(&mut self, ctx: Context) {
-        let commit_ts = (next_id() as u64).into();
+        let commit_ts = self.get_ts();
         let handles: Vec<_> = self.handles.drain(..).map(|x| Key::from_raw(&x)).collect();
         if !handles.is_empty() {
             self.store
diff --git a/components/test_coprocessor/src/table.rs b/components/test_coprocessor/src/table.rs
index af070f62759..3009dc78018 100644
--- a/components/test_coprocessor/src/table.rs
+++ b/components/test_coprocessor/src/table.rs
@@ -2,6 +2,7 @@
 
 use std::collections::BTreeMap;
 
+use codec::{buffer::BufferWriter, number::NumberEncoder as _};
 use kvproto::coprocessor::KeyRange;
 use tidb_query_datatype::codec::table;
 use tikv_util::codec::number::NumberEncoder;
@@ -116,6 +117,13 @@ impl Table {
         range.set_end(table::encode_index_seek_key(self.id, idx, &buf));
         range
     }
+
+    pub fn get_table_prefix(&self) -> Vec<u8> {
+        let mut buf = vec![];
+        buf.write_bytes(table::TABLE_PREFIX).unwrap();
+        buf.write_i64(self.id).unwrap();
+        buf
+    }
 }
 
 impl<T: std::borrow::Borrow<str>> std::ops::Index<T> for Table {
diff --git a/components/test_pd_client/Cargo.toml b/components/test_pd_client/Cargo.toml
index 90bf7a24759..3aca771afe7 100644
--- a/components/test_pd_client/Cargo.toml
+++ b/components/test_pd_client/Cargo.toml
@@ -18,6 +18,5 @@ raft = { workspace = true }
 slog = { workspace = true }
 slog-global = { workspace = true }
 tikv_util = { workspace = true }
-tokio = { version = "1.5", features = ["rt-multi-thread"] }
 tokio-timer = { workspace = true }
 txn_types = { workspace = true }
diff --git a/components/test_pd_client/src/pd.rs b/components/test_pd_client/src/pd.rs
index d275ac538e5..ec7b43f6887 100644
--- a/components/test_pd_client/src/pd.rs
+++ b/components/test_pd_client/src/pd.rs
@@ -581,6 +581,20 @@ impl PdCluster {
             .and_then(|k| self.regions.get(k).cloned()))
     }
 
+    fn scan_regions(&self, start: &[u8], end: &[u8], limit: i32) -> Vec<metapb::Region> {
+        let mut regions = vec![];
+        for (_, region) in self.regions.range((Excluded(start.to_vec()), Unbounded)) {
+            if !end.is_empty() && region.start_key.as_slice() >= end {
+                break;
+            }
+            regions.push(region.clone());
+            if regions.len() as i32 >= limit {
+                break;
+            }
+        }
+        regions
+    }
+
     fn get_region_approximate_size(&self, region_id: u64) -> Option<u64> {
         self.region_approximate_size.get(&region_id).cloned()
     }
@@ -1656,6 +1670,28 @@ impl PdClient for TestPdClient {
         }
     }
 
+    fn scan_regions(
+        &self,
+        start_key: &[u8],
+        end_key: &[u8],
+        limit: i32,
+    ) -> Result<Vec<pdpb::Region>> {
+        self.check_bootstrap()?;
+
+        let result: Vec<_> = self
+            .cluster
+            .rl()
+            .scan_regions(start_key, end_key, limit)
+            .into_iter()
+            .map(|r| {
+                let mut res = pdpb::Region::new();
+                res.set_region(r);
+                res
+            })
+            .collect();
+        Ok(result)
+    }
+
     fn get_cluster_config(&self) -> Result<metapb::Cluster> {
         self.check_bootstrap()?;
         Ok(self.cluster.rl().meta.clone())
diff --git a/components/test_raftstore-v2/Cargo.toml b/components/test_raftstore-v2/Cargo.toml
index d4a0bd3c9a2..2c36588ddd3 100644
--- a/components/test_raftstore-v2/Cargo.toml
+++ b/components/test_raftstore-v2/Cargo.toml
@@ -22,17 +22,13 @@ test-engines-panic = [
 ]
 
 [dependencies]
 api_version = { workspace = true }
-backtrace = "0.3"
 causal_ts = { workspace = true, features = ["testexport"] }
 collections = { workspace = true }
 concurrency_manager = { workspace = true }
-crossbeam = { workspace = true }
 encryption_export = { workspace = true }
 engine_rocks = { workspace = true }
-engine_rocks_helper = { workspace = true }
 engine_test = { workspace = true }
 engine_traits = { workspace = true }
-fail = "0.5"
 file_system = { workspace = true }
 futures = "0.3"
 grpcio = { workspace = true }
@@ -40,7 +36,6 @@ grpcio-health = { workspace = true }
 health_controller = { workspace = true }
 keys = { workspace = true }
 kvproto = { workspace = true }
-lazy_static = "1.3"
 log_wrappers = { workspace = true }
 pd_client = { workspace = true }
 protobuf = { version = "2.8", features = ["bytes"] }
@@ -64,5 +59,4 @@ test_util = { workspace = true }
 tikv = { workspace = true }
 tikv_util = { workspace = true }
 tokio = { version = "1.5", features = ["rt-multi-thread"] }
-tokio-timer = { workspace = true }
 txn_types = { workspace = true }
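Editor's note: the `scan_regions` hook added to `TestPdClient` above mirrors PD's paged region scan — regions come back in key order until the exclusive `end_key` (empty means unbounded) or `limit` is reached. A minimal sketch of how a test might call it; the bootstrap/setup around it is assumed, not part of this diff:

```rust
// Sketch only: exercising TestPdClient::scan_regions as declared in this
// diff. Assumes a bootstrapped test cluster; error handling is elided.
use pd_client::PdClient;
use test_pd_client::TestPdClient;

fn dump_first_regions(pd: &TestPdClient) {
    // Scan at most 16 regions overlapping ["", "t") in key order.
    let regions = pd.scan_regions(b"", b"t", 16).expect("scan_regions");
    for r in &regions {
        // Each pdpb::Region wraps the metapb::Region kept by PdCluster.
        println!("got region {}", r.get_region().get_id());
    }
}
```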
diff --git a/components/test_raftstore-v2/src/server.rs b/components/test_raftstore-v2/src/server.rs
index af256608562..47830c77730 100644
--- a/components/test_raftstore-v2/src/server.rs
+++ b/components/test_raftstore-v2/src/server.rs
@@ -68,7 +68,10 @@ use tikv::{
     storage::{
         self,
         kv::{FakeExtension, LocalTablets, RaftExtension, SnapContext},
-        txn::flow_controller::{EngineFlowController, FlowController},
+        txn::{
+            flow_controller::{EngineFlowController, FlowController},
+            txn_status_cache::TxnStatusCache,
+        },
         Engine, Storage,
     },
 };
@@ -147,6 +150,12 @@ impl Engine for TestRaftKv2 {
         self.raftkv.async_snapshot(ctx)
     }
 
+    type IMSnap = Self::Snap;
+    type IMSnapshotRes = Self::SnapshotRes;
+    fn async_in_memory_snapshot(&mut self, ctx: SnapContext<'_>) -> Self::IMSnapshotRes {
+        self.async_snapshot(ctx)
+    }
+
     type WriteRes = <RaftKv2<EK, RaftTestEngine> as Engine>::WriteRes;
     fn async_write(
         &self,
@@ -422,6 +431,10 @@ impl ServerCluster {
         let region_info_accessor = RegionInfoAccessor::new(
             &mut coprocessor_host,
             Arc::new(|| false), // Not applicable to v2
+            Box::new(|| {
+                // v2 does not support ime
+                unreachable!()
+            }),
         );
 
         let sim_router = SimulateTransport::new(raft_router.clone());
@@ -458,8 +471,9 @@ impl ServerCluster {
             Default::default(),
             Arc::new(region_info_accessor.clone()),
         );
-        gc_worker.start(node_id).unwrap();
+        gc_worker.start(node_id, coprocessor_host.clone()).unwrap();
 
+        let txn_status_cache = Arc::new(TxnStatusCache::new_for_test());
         let rts_worker = if cfg.resolved_ts.enable {
             // Resolved ts worker
             let mut rts_worker = LazyWorker::new("resolved-ts");
@@ -477,6 +491,7 @@ impl ServerCluster {
                 concurrency_manager.clone(),
                 self.env.clone(),
                 self.security_mgr.clone(),
+                txn_status_cache.clone(),
             );
             // Start the worker
             rts_worker.start(rts_endpoint);
@@ -538,6 +553,7 @@ impl ServerCluster {
                     .as_ref()
                     .map(|m| m.derive_controller("scheduler-worker-pool".to_owned(), true)),
                 resource_manager.clone(),
+                txn_status_cache,
             )?;
             self.storages.insert(node_id, raft_kv_v2.clone());
diff --git a/components/test_raftstore-v2/src/util.rs b/components/test_raftstore-v2/src/util.rs
index 0efad0505e8..5d87a3f5b02 100644
--- a/components/test_raftstore-v2/src/util.rs
+++ b/components/test_raftstore-v2/src/util.rs
@@ -198,17 +198,31 @@ pub fn wait_for_synced(
         .get(&node_id)
         .unwrap()
         .clone();
-    let leader = cluster.leader_of_region(region_id).unwrap();
-    let epoch = cluster.get_region_epoch(region_id);
-    let mut ctx = Context::default();
-    ctx.set_region_id(region_id);
-    ctx.set_peer(leader);
-    ctx.set_region_epoch(epoch);
-    let snap_ctx = SnapContext {
-        pb_ctx: &ctx,
-        ..Default::default()
+
+    let mut count = 0;
+    let snapshot = loop {
+        count += 1;
+        let leader = cluster.leader_of_region(region_id).unwrap();
+        let epoch = cluster.get_region_epoch(region_id);
+        let mut ctx = Context::default();
+        ctx.set_region_id(region_id);
+        ctx.set_peer(leader);
+        ctx.set_region_epoch(epoch);
+        let snap_ctx = SnapContext {
+            pb_ctx: &ctx,
+            ..Default::default()
+        };
+        match storage.snapshot(snap_ctx) {
+            Ok(s) => break s,
+            Err(e) => {
+                if count <= 5 {
+                    continue;
+                }
+                panic!("all retry failed: {:?}", e);
+            }
+        }
     };
-    let snapshot = storage.snapshot(snap_ctx).unwrap();
+
     let txn_ext = snapshot.txn_ext.clone().unwrap();
     for retry in 0..10 {
         if txn_ext.is_max_ts_synced() {
diff --git a/components/test_raftstore/Cargo.toml b/components/test_raftstore/Cargo.toml
index 152c550e53c..45c003193cd 100644
--- a/components/test_raftstore/Cargo.toml
+++ b/components/test_raftstore/Cargo.toml
@@ -29,16 +29,15 @@ concurrency_manager = { workspace = true }
 crossbeam = { workspace = true }
 encryption_export = { workspace = true }
 engine_rocks = { workspace = true }
-engine_rocks_helper = { workspace = true }
 engine_test = { workspace = true }
 engine_traits = { workspace = true }
 fail = "0.5"
 file_system = { workspace = true }
 futures = "0.3"
 grpcio = { workspace = true }
-grpcio-health = { workspace = true }
 health_controller = { workspace = true }
 hybrid_engine = { workspace = true }
+in_memory_engine = { workspace = true }
 keys = { workspace = true }
 kvproto = { workspace = true }
 lazy_static = "1.3"
@@ -48,7 +47,6 @@ protobuf = { version = "2.8", features = ["bytes"] }
 raft = { workspace = true }
 raftstore = { workspace = true, features = ["testexport"] }
 rand = "0.8"
-range_cache_memory_engine = { workspace = true }
 resolved_ts = { workspace = true }
 resource_control = { workspace = true }
 resource_metering = { workspace = true }
@@ -64,5 +62,4 @@ test_util = { workspace = true }
 tikv = { workspace = true }
 tikv_util = { workspace = true }
 tokio = { version = "1.5", features = ["rt-multi-thread"] }
-tokio-timer = { workspace = true }
 txn_types = { workspace = true }
diff --git a/components/test_raftstore/src/cluster.rs b/components/test_raftstore/src/cluster.rs
index 04b2cf2cee1..0a165ec0528 100644
--- a/components/test_raftstore/src/cluster.rs
+++ b/components/test_raftstore/src/cluster.rs
@@ -4,27 +4,22 @@ use std::{
     collections::hash_map::Entry as MapEntry,
     error::Error as StdError,
     result,
-    sync::{
-        mpsc::{self, sync_channel},
-        Arc, Mutex, RwLock,
-    },
+    sync::{mpsc, Arc, Mutex, RwLock},
     thread,
     time::Duration,
 };
 
-use ::server::common::KvEngineBuilder;
 use collections::{HashMap, HashSet};
 use crossbeam::channel::TrySendError;
 use encryption_export::DataKeyManager;
-use engine_rocks::{RocksCompactedEvent, RocksEngine, RocksStatistics};
+use engine_rocks::{RocksEngine, RocksSnapshot, RocksStatistics};
 use engine_test::raft::RaftTestEngine;
 use engine_traits::{
-    CacheRange, Engines, Iterable, KvEngine, ManualCompactionOptions, Mutable, Peekable,
-    RaftEngineReadOnly, SnapshotContext, SyncMutable, WriteBatch, CF_DEFAULT, CF_RAFT,
+    CompactExt, Engines, Iterable, ManualCompactionOptions, MiscExt, Mutable, Peekable,
+    RaftEngineReadOnly, SyncMutable, WriteBatch, WriteBatchExt, CF_DEFAULT, CF_RAFT,
 };
 use file_system::IoRateLimiter;
 use futures::{self, channel::oneshot, executor::block_on, future::BoxFuture, StreamExt};
-use keys::{DATA_MAX_KEY, DATA_MIN_KEY};
 use kvproto::{
     errorpb::Error as PbError,
     kvrpcpb::{ApiVersion, Context, DiskFullOpt},
@@ -44,15 +39,13 @@ use raftstore::{
         fsm::{
             create_raft_batch_system,
             store::{StoreMeta, PENDING_MSG_CAP},
-            RaftBatchSystem, RaftRouter,
+            ApplyRouter, RaftBatchSystem, RaftRouter,
         },
         transport::CasualRouter,
-        util::encode_start_ts_into_flag_data,
         *,
     },
     Error, Result,
 };
-use range_cache_memory_engine::RangeCacheMemoryEngine;
 use resource_control::ResourceGroupManager;
 use tempfile::TempDir;
 use test_pd_client::TestPdClient;
@@ -65,20 +58,14 @@ use tikv_util::{
 };
 use txn_types::WriteBatchFlags;
 
-use self::range_cache_engine::RangCacheEngineExt;
 use super::*;
 use crate::Config;
-
-pub trait KvEngineWithRocks = KvEngine<CompactedEvent = RocksCompactedEvent>
-    + KvEngineBuilder
-    + RangCacheEngineExt;
-
 // We simulate 3 or 5 nodes, each has a store.
 // Sometimes, we use fixed id to test, which means the id
 // isn't allocated by pd, and node id, store id are same.
 // E,g, for node 1, the node id and store id are both 1.
-pub trait Simulator<EK: KvEngineWithRocks> {
+pub trait Simulator {
// If node id > 0, the node must be created in db already, // and the node id must be the same as given argument. @@ -88,11 +75,11 @@ pub trait Simulator { &mut self, node_id: u64, cfg: Config, - engines: Engines, + engines: Engines, store_meta: Arc>, key_manager: Option>, - router: RaftRouter, - system: RaftBatchSystem, + router: RaftRouter, + system: RaftBatchSystem, resource_manager: &Option>, ) -> ServerResult; fn stop_node(&mut self, node_id: u64); @@ -101,7 +88,7 @@ pub trait Simulator { &self, node_id: u64, request: RaftCmdRequest, - cb: Callback, + cb: Callback, ) -> Result<()> { self.async_command_on_node_with_opts(node_id, request, cb, Default::default()) } @@ -109,13 +96,14 @@ pub trait Simulator { &self, node_id: u64, request: RaftCmdRequest, - cb: Callback, + cb: Callback, opts: RaftCmdExtraOpts, ) -> Result<()>; fn send_raft_msg(&mut self, msg: RaftMessage) -> Result<()>; fn get_snap_dir(&self, node_id: u64) -> String; fn get_snap_mgr(&self, node_id: u64) -> &SnapManager; - fn get_router(&self, node_id: u64) -> Option>; + fn get_router(&self, node_id: u64) -> Option>; + fn get_apply_router(&self, node_id: u64) -> Option>; fn add_send_filter(&mut self, node_id: u64, filter: Box); fn clear_send_filters(&mut self, node_id: u64); fn add_recv_filter(&mut self, node_id: u64, filter: Box); @@ -128,25 +116,23 @@ pub trait Simulator { fn read( &mut self, - snap_ctx: Option, batch_id: Option, request: RaftCmdRequest, timeout: Duration, ) -> Result { let node_id = request.get_header().get_peer().get_store_id(); - let (cb, mut rx) = make_cb::(&request); - self.async_read(snap_ctx, node_id, batch_id, request, cb); + let (cb, mut rx) = make_cb(&request); + self.async_read(node_id, batch_id, request, cb); rx.recv_timeout(timeout) .map_err(|_| Error::Timeout(format!("request timeout for {:?}", timeout))) } fn async_read( &mut self, - snap_ctx: Option, node_id: u64, batch_id: Option, request: RaftCmdRequest, - cb: Callback, + cb: Callback, ); fn call_command_on_node( @@ -155,7 +141,7 @@ pub trait Simulator { request: RaftCmdRequest, timeout: Duration, ) -> Result { - let (cb, mut rx) = make_cb::(&request); + let (cb, mut rx) = make_cb(&request); match self.async_command_on_node(node_id, request, cb) { Ok(()) => {} @@ -170,17 +156,17 @@ pub trait Simulator { } } -pub struct Cluster> { +pub struct Cluster { pub cfg: Config, leaders: HashMap, pub count: usize, pub paths: Vec, - pub dbs: Vec>, + pub dbs: Vec>, pub store_metas: HashMap>>, key_managers: Vec>>, pub io_rate_limiter: Option>, - pub engines: HashMap>, + pub engines: HashMap>, key_managers_map: HashMap>>, pub labels: HashMap>, group_props: HashMap, @@ -191,18 +177,9 @@ pub struct Cluster> { pub sim: Arc>, pub pd_client: Arc, resource_manager: Option>, - - // When this is set, the `HybridEngineImpl` will be used as the underlying KvEngine. In - // addition, it atomaticaly load the whole range when start. When we want to do something - // specific, for example, only load ranges of some regions, we may not set this. - range_cache_engine_enabled_with_whole_range: bool, } -impl Cluster -where - EK: KvEngineWithRocks, - T: Simulator, -{ +impl Cluster { // Create the default Store cluster. 
     pub fn new(
         id: u64,
         count: usize,
         sim: Arc<RwLock<T>>,
         pd_client: Arc<TestPdClient>,
         api_version: ApiVersion,
-    ) -> Cluster<EK, T> {
+    ) -> Cluster<T> {
         // TODO: In the future, maybe it's better to test both case where
         // `use_delete_range` is true and false
         Cluster {
@@ -233,7 +210,6 @@ where
             resource_manager: Some(Arc::new(ResourceGroupManager::default())),
             kv_statistics: vec![],
             raft_statistics: vec![],
-            range_cache_engine_enabled_with_whole_range: false,
         }
     }
 
@@ -273,14 +249,9 @@ where
         assert!(self.sst_workers_map.insert(node_id, offset).is_none());
     }
 
-    fn create_engine(&mut self, router: Option<RaftRouter<EK, RaftTestEngine>>) {
+    fn create_engine(&mut self, router: Option<RaftRouter<RocksEngine, RaftTestEngine>>) {
         let (engines, key_manager, dir, sst_worker, kv_statistics, raft_statistics) =
-            create_test_engine(
-                router,
-                self.io_rate_limiter.clone(),
-                self.pd_client.clone(),
-                &self.cfg,
-            );
+            create_test_engine(router, self.io_rate_limiter.clone(), &self.cfg);
         self.dbs.push(engines);
         self.key_managers.push(key_manager);
         self.paths.push(dir);
@@ -368,11 +339,6 @@ where
         self.create_engines();
         self.bootstrap_region().unwrap();
         self.start().unwrap();
-        if self.range_cache_engine_enabled_with_whole_range {
-            self.engines
-                .iter()
-                .for_each(|(_, engines)| engines.kv.cache_all());
-        }
     }
 
     // Bootstrap the store with fixed ID (like 1, 2, .. 5) and
@@ -443,7 +409,7 @@ where
         tikv_util::thread_group::set_properties(previous_prop);
     }
 
-    pub fn get_engine(&self, node_id: u64) -> EK {
+    pub fn get_engine(&self, node_id: u64) -> RocksEngine {
         self.engines[&node_id].kv.clone()
     }
 
@@ -451,7 +417,7 @@ where
         self.engines[&node_id].raft.clone()
     }
 
-    pub fn get_all_engines(&self, node_id: u64) -> Engines<EK, RaftTestEngine> {
+    pub fn get_all_engines(&self, node_id: u64) -> Engines<RocksEngine, RaftTestEngine> {
         self.engines[&node_id].clone()
     }
 
@@ -480,16 +446,11 @@ where
 
     pub fn read(
         &self,
-        snap_ctx: Option<SnapshotContext>,
         batch_id: Option<ThreadReadId>,
         request: RaftCmdRequest,
         timeout: Duration,
     ) -> Result<RaftCmdResponse> {
-        match self
-            .sim
-            .wl()
-            .read(snap_ctx, batch_id, request.clone(), timeout)
-        {
+        match self.sim.wl().read(batch_id, request.clone(), timeout) {
             Err(e) => {
                 warn!("failed to read {:?}: {:?}", request, e);
                 Err(e)
@@ -502,15 +463,6 @@ where
         &self,
         request: RaftCmdRequest,
         timeout: Duration,
-    ) -> Result<RaftCmdResponse> {
-        self.call_command_with_snap_ctx(request, timeout, None)
-    }
-
-    pub fn call_command_with_snap_ctx(
-        &self,
-        request: RaftCmdRequest,
-        timeout: Duration,
-        snap_ctx: Option<SnapshotContext>,
     ) -> Result<RaftCmdResponse> {
         let mut is_read = false;
         for req in request.get_requests() {
@@ -522,7 +474,7 @@ where
             }
         }
         let ret = if is_read {
-            self.sim.wl().read(snap_ctx, None, request.clone(), timeout)
+            self.sim.wl().read(None, request.clone(), timeout)
         } else {
             self.sim.rl().call_command(request.clone(), timeout)
         };
@@ -535,11 +487,10 @@ where
         }
     }
 
-    pub fn call_command_on_leader_with_snap_ctx(
+    pub fn call_command_on_leader(
         &mut self,
         mut request: RaftCmdRequest,
         timeout: Duration,
-        snap_ctx: Option<SnapshotContext>,
     ) -> Result<RaftCmdResponse> {
         let timer = Instant::now();
         let region_id = request.get_header().get_region_id();
@@ -549,11 +500,10 @@ where
             Some(l) => l,
         };
         request.mut_header().set_peer(leader);
-        let resp =
-            match self.call_command_with_snap_ctx(request.clone(), timeout, snap_ctx.clone()) {
-                e @ Err(_) => return e,
-                Ok(resp) => resp,
-            };
+        let resp = match self.call_command(request.clone(), timeout) {
+            e @ Err(_) => return e,
+            Ok(resp) => resp,
+        };
         if self.refresh_leader_if_needed(&resp, region_id)
             && timer.saturating_elapsed() < timeout
         {
@@ -567,14 +517,6 @@ where
         }
     }
 
-    pub fn call_command_on_leader(
-        &mut self,
-        request: RaftCmdRequest,
-        timeout: Duration,
-    ) -> Result<RaftCmdResponse> {
-        self.call_command_on_leader_with_snap_ctx(request, timeout, None)
-    }
-
     fn valid_leader_id(&self, region_id: u64, leader_id: u64) -> bool {
         let store_ids = match self.voter_store_ids_of_region(region_id) {
             None => return false,
@@ -832,7 +774,7 @@ where
         self.leaders.remove(&region_id);
     }
 
-    pub fn assert_quorum<F: FnMut(&EK) -> bool>(&self, mut condition: F) {
+    pub fn assert_quorum<F: FnMut(&RocksEngine) -> bool>(&self, mut condition: F) {
         if self.engines.is_empty() {
             return;
         }
@@ -920,17 +862,6 @@ where
         reqs: Vec<Request>,
         read_quorum: bool,
         timeout: Duration,
-    ) -> RaftCmdResponse {
-        self.request_with_snap_ctx(key, reqs, read_quorum, timeout, None)
-    }
-
-    pub fn request_with_snap_ctx(
-        &mut self,
-        key: &[u8],
-        reqs: Vec<Request>,
-        read_quorum: bool,
-        timeout: Duration,
-        snap_ctx: Option<SnapshotContext>,
     ) -> RaftCmdResponse {
         let timer = Instant::now();
         let mut tried_times = 0;
@@ -939,16 +870,13 @@ where
             tried_times += 1;
             let mut region = self.get_region(key);
             let region_id = region.get_id();
-            let mut req = new_request(
+            let req = new_request(
                 region_id,
                 region.take_region_epoch(),
                 reqs.clone(),
                 read_quorum,
             );
-            if let Some(ref ctx) = snap_ctx {
-                encode_start_ts_into_flag_data(req.mut_header(), ctx.read_ts);
-            }
-            let result = self.call_command_on_leader_with_snap_ctx(req, timeout, snap_ctx.clone());
+            let result = self.call_command_on_leader(req, timeout);
 
             let resp = match result {
                 e @ Err(Error::Timeout(_))
@@ -1014,48 +942,15 @@ where
     }
 
     pub fn get(&mut self, key: &[u8]) -> Option<Vec<u8>> {
-        if !self.range_cache_engine_enabled_with_whole_range {
-            self.get_impl(CF_DEFAULT, key, false)
-        } else {
-            let ctx = SnapshotContext {
-                read_ts: u64::MAX,
-                range: Some(CacheRange::new(
-                    DATA_MIN_KEY.to_vec(),
-                    DATA_MAX_KEY.to_vec(),
-                )),
-            };
-            self.get_cf_with_snap_ctx(CF_DEFAULT, key, true, ctx)
-        }
+        self.get_impl(CF_DEFAULT, key, false)
     }
 
     pub fn get_cf(&mut self, cf: &str, key: &[u8]) -> Option<Vec<u8>> {
-        if !self.range_cache_engine_enabled_with_whole_range {
-            self.get_impl(cf, key, false)
-        } else {
-            let ctx = SnapshotContext {
-                read_ts: u64::MAX,
-                range: Some(CacheRange::new(
-                    DATA_MIN_KEY.to_vec(),
-                    DATA_MAX_KEY.to_vec(),
-                )),
-            };
-            self.get_cf_with_snap_ctx(cf, key, true, ctx)
-        }
+        self.get_impl(cf, key, false)
     }
 
     pub fn must_get(&mut self, key: &[u8]) -> Option<Vec<u8>> {
-        if !self.range_cache_engine_enabled_with_whole_range {
-            self.get_impl(CF_DEFAULT, key, true)
-        } else {
-            let ctx = SnapshotContext {
-                read_ts: u64::MAX,
-                range: Some(CacheRange::new(
-                    DATA_MIN_KEY.to_vec(),
-                    DATA_MAX_KEY.to_vec(),
-                )),
-            };
-            self.get_cf_with_snap_ctx(CF_DEFAULT, key, true, ctx)
-        }
+        self.get_impl(CF_DEFAULT, key, true)
     }
 
     fn get_impl(&mut self, cf: &str, key: &[u8], read_quorum: bool) -> Option<Vec<u8>> {
@@ -1077,61 +972,6 @@ where
         }
     }
 
-    pub fn get_with_snap_ctx(
-        &mut self,
-        key: &[u8],
-        read_quorum: bool,
-        snap_ctx: SnapshotContext,
-    ) -> Option<Vec<u8>> {
-        self.get_cf_with_snap_ctx(CF_DEFAULT, key, read_quorum, snap_ctx)
-    }
-
-    // called by range cache engine only
-    pub fn get_cf_with_snap_ctx(
-        &mut self,
-        cf: &str,
-        key: &[u8],
-        read_quorum: bool,
-        snap_ctx: SnapshotContext,
-    ) -> Option<Vec<u8>> {
-        let rx = if self.range_cache_engine_enabled_with_whole_range {
-            fail::remove("on_range_cache_get_value");
-            let (tx, rx) = sync_channel(1);
-            fail::cfg_callback("on_range_cache_get_value", move || {
-                tx.send(true).unwrap();
-            })
-            .unwrap();
-            Some(rx)
-        } else {
-            None
-        };
-
-        let mut resp = self.request_with_snap_ctx(
-            key,
-            vec![new_get_cf_cmd(cf, key)],
-            read_quorum,
-            Duration::from_secs(5),
-            Some(snap_ctx),
-        );
-        if resp.get_header().has_error() {
-            panic!("response {:?} has error", resp);
-        }
-        assert_eq!(resp.get_responses().len(), 1);
-        assert_eq!(resp.get_responses()[0].get_cmd_type(), CmdType::Get);
-        let res = if resp.get_responses()[0].has_get() {
-            if let Some(rx) = rx {
-                rx.recv_timeout(Duration::from_secs(5)).unwrap();
-            }
-            Some(resp.mut_responses()[0].mut_get().take_value())
-        } else {
-            None
-        };
-        if self.range_cache_engine_enabled_with_whole_range {
-            fail::remove("on_range_cache_get_value");
-        }
-        res
-    }
-
     pub fn async_request(
         &mut self,
         req: RaftCmdRequest,
@@ -1147,7 +987,7 @@ where
         let region_id = req.get_header().get_region_id();
         let leader = self.leader_of_region(region_id).unwrap();
         req.mut_header().set_peer(leader.clone());
-        let (cb, mut rx) = make_cb::<EK>(&req);
+        let (cb, mut rx) = make_cb(&req);
         self.sim
             .rl()
             .async_command_on_node_with_opts(leader.get_store_id(), req, cb, opts)?;
@@ -1519,7 +1359,7 @@ where
         }
     }
 
-    pub fn restore_kv_meta(&self, region_id: u64, store_id: u64, snap: &EK::Snapshot) {
+    pub fn restore_kv_meta(&self, region_id: u64, store_id: u64, snap: &RocksSnapshot) {
         let (meta_start, meta_end) = (
             keys::region_meta_prefix(region_id),
             keys::region_meta_prefix(region_id + 1),
@@ -1647,7 +1487,7 @@ where
         &mut self,
         region: &metapb::Region,
         split_key: &[u8],
-        cb: Callback<EK::Snapshot>,
+        cb: Callback<RocksSnapshot>,
     ) {
         let leader = self.leader_of_region(region.get_id()).unwrap();
         let router = self.sim.rl().get_router(leader.get_store_id()).unwrap();
@@ -1887,7 +1727,7 @@ where
         )
     }
 
-    pub fn merge_region(&mut self, source: u64, target: u64, cb: Callback<EK::Snapshot>) {
+    pub fn merge_region(&mut self, source: u64, target: u64, cb: Callback<RocksSnapshot>) {
         let mut req = self.new_prepare_merge(source, target);
         let leader = self.leader_of_region(source).unwrap();
         req.mut_header().set_peer(leader.clone());
@@ -2058,10 +1898,14 @@ where
         ctx
     }
 
-    pub fn get_router(&self, node_id: u64) -> Option<RaftRouter<EK, RaftTestEngine>> {
+    pub fn get_router(&self, node_id: u64) -> Option<RaftRouter<RocksEngine, RaftTestEngine>> {
         self.sim.rl().get_router(node_id)
     }
 
+    pub fn get_apply_router(&self, node_id: u64) -> Option<ApplyRouter<RocksEngine>> {
+        self.sim.rl().get_apply_router(node_id)
+    }
+
     pub fn refresh_region_bucket_keys(
         &mut self,
         region: &metapb::Region,
@@ -2158,13 +2002,9 @@ where
 
         Ok(())
     }
-
-    pub fn range_cache_engine_enabled_with_whole_range(&mut self, v: bool) {
-        self.range_cache_engine_enabled_with_whole_range = v;
-    }
 }
 
-impl<EK: KvEngineWithRocks, T: Simulator<EK>> Drop for Cluster<EK, T> {
+impl<T: Simulator> Drop for Cluster<T> {
     fn drop(&mut self) {
         test_util::clear_failpoints();
         self.shutdown();
@@ -2174,7 +2014,7 @@ impl<EK: KvEngineWithRocks, T: Simulator<EK>> Drop for Cluster<EK, T> {
 
 pub trait RawEngine: Peekable + SyncMutable {
-    fn range_cache_engine(&self) -> bool {
+    fn region_cache_engine(&self) -> bool {
         false
     }
@@ -2202,38 +2042,3 @@ impl RawEngine for RocksEngine {
         self.get_msg_cf(CF_RAFT, &keys::raft_state_key(region_id))
     }
 }
-
-impl RawEngine for HybridEngineImpl {
-    fn range_cache_engine(&self) -> bool {
-        true
-    }
-
-    fn region_local_state(
-        &self,
-        region_id: u64,
-    ) -> engine_traits::Result<Option<RegionLocalState>> {
-        self.disk_engine()
-            .get_msg_cf(CF_RAFT, &keys::region_state_key(region_id))
-    }
-
-    fn raft_apply_state(&self, region_id: u64) -> engine_traits::Result<Option<RaftApplyState>> {
-        self.disk_engine()
-            .get_msg_cf(CF_RAFT, &keys::apply_state_key(region_id))
-    }
-
-    fn raft_local_state(&self, region_id: u64) -> engine_traits::Result<Option<RaftLocalState>> {
-        self.disk_engine()
-            .get_msg_cf(CF_RAFT, &keys::raft_state_key(region_id))
-    }
-}
-
-impl<T: Simulator<HybridEngineImpl>> Cluster<HybridEngineImpl, T> {
-    pub fn get_range_cache_engine(&self, node_id: u64) -> RangeCacheMemoryEngine {
-        self.engines
-            .get(&node_id)
-            .unwrap()
-            .kv
-            .range_cache_engine()
-            .clone()
-    }
-}
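Editor's note: with the `EK` generic and the `SnapshotContext` plumbing removed above, the cluster read path collapses to a single `get_impl` route. A minimal sketch of the simplified API as a test would use it (`new_node_cluster` is defined in node.rs below; the ids and keys are illustrative):

```rust
// Sketch only: the post-diff read path with no cache-range special case.
// new_node_cluster, must_put and must_get are existing test_raftstore helpers.
fn smoke() {
    let mut cluster = test_raftstore::new_node_cluster(0, 3);
    cluster.run();
    cluster.must_put(b"k1", b"v1");
    // get()/must_get() now always go through get_impl.
    assert_eq!(cluster.must_get(b"k1"), Some(b"v1".to_vec()));
}
```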
diff --git a/components/test_raftstore/src/lib.rs b/components/test_raftstore/src/lib.rs
index 4922730595f..be38155af6c 100644
--- a/components/test_raftstore/src/lib.rs
+++ b/components/test_raftstore/src/lib.rs
@@ -11,7 +11,6 @@ extern crate tikv_util;
 mod cluster;
 mod config;
 mod node;
-pub mod range_cache_engine;
 mod router;
 mod server;
 mod transport_simulate;
diff --git a/components/test_raftstore/src/node.rs b/components/test_raftstore/src/node.rs
index 3e7dc1c8dfe..26319d43e27 100644
--- a/components/test_raftstore/src/node.rs
+++ b/components/test_raftstore/src/node.rs
@@ -8,11 +8,10 @@ use std::{
 use collections::{HashMap, HashSet};
 use concurrency_manager::ConcurrencyManager;
 use encryption_export::DataKeyManager;
-use engine_rocks::RocksEngine;
+use engine_rocks::{RocksEngine, RocksSnapshot};
 use engine_test::raft::RaftTestEngine;
-use engine_traits::{Engines, KvEngine, SnapshotContext};
+use engine_traits::{Engines, MiscExt, Peekable};
 use health_controller::HealthController;
-use hybrid_engine::observer::Observer as HybridEngineObserver;
 use kvproto::{
     kvrpcpb::ApiVersion,
     metapb,
@@ -24,15 +23,14 @@ use raft::{eraftpb::MessageType, SnapshotStatus};
 use raftstore::{
     coprocessor::{config::SplitCheckConfigManager, CoprocessorHost},
     errors::Error as RaftError,
-    router::{LocalReadRouter, RaftStoreRouter, ServerRaftStoreRouter},
+    router::{LocalReadRouter, RaftStoreRouter, ReadContext, ServerRaftStoreRouter},
     store::{
         config::RaftstoreConfigManager,
-        fsm::{store::StoreMeta, RaftBatchSystem, RaftRouter},
+        fsm::{store::StoreMeta, ApplyRouter, RaftBatchSystem, RaftRouter},
         SnapManagerBuilder, *,
     },
     Result,
 };
-use range_cache_memory_engine::RangeCacheEngineConfig;
 use resource_control::ResourceGroupManager;
 use resource_metering::CollectorRegHandle;
 use service::service_manager::GrpcServiceManager;
@@ -52,18 +50,18 @@ use tikv_util::{
 use super::*;
 use crate::Config;
 
-pub struct ChannelTransportCore<EK: KvEngine> {
+pub struct ChannelTransportCore {
     snap_paths: HashMap<u64, (SnapManager, TempDir)>,
-    routers: HashMap<u64, SimulateTransport<ServerRaftStoreRouter<EK, RaftTestEngine>, EK>>,
+    routers: HashMap<u64, SimulateTransport<ServerRaftStoreRouter<RocksEngine, RaftTestEngine>>>,
 }
 
 #[derive(Clone)]
-pub struct ChannelTransport<EK: KvEngine> {
-    core: Arc<Mutex<ChannelTransportCore<EK>>>,
+pub struct ChannelTransport {
+    core: Arc<Mutex<ChannelTransportCore>>,
 }
 
-impl<EK: KvEngine> ChannelTransport<EK> {
-    pub fn new() -> ChannelTransport<EK> {
+impl ChannelTransport {
+    pub fn new() -> ChannelTransport {
         ChannelTransport {
             core: Arc::new(Mutex::new(ChannelTransportCore {
                 snap_paths: HashMap::default(),
@@ -73,13 +71,13 @@ impl ChannelTransport {
     }
 }
 
-impl<EK: KvEngine> Default for ChannelTransport<EK> {
+impl Default for ChannelTransport {
     fn default() -> Self {
         Self::new()
     }
 }
 
-impl<EK: KvEngine> Transport for ChannelTransport<EK> {
+impl Transport for ChannelTransport {
     fn send(&mut self, msg: RaftMessage) -> Result<()> {
         let from_store = msg.get_from_peer().get_store_id();
         let to_store = msg.get_to_peer().get_store_id();
@@ -152,23 +150,23 @@ impl Transport for ChannelTransport {
     fn flush(&mut self) {}
 }
 
-type SimulateChannelTransport<EK> = SimulateTransport<ChannelTransport<EK>, EK>;
+type SimulateChannelTransport = SimulateTransport<ChannelTransport>;
 
-pub struct NodeCluster<EK: KvEngine> {
-    trans: ChannelTransport<EK>,
+pub struct NodeCluster {
+    trans: ChannelTransport,
     pd_client: Arc<TestPdClient>,
-    nodes: HashMap<u64, MultiRaftServer<TestPdClient, EK, RaftTestEngine>>,
+    nodes: HashMap<u64, MultiRaftServer<TestPdClient, RocksEngine, RaftTestEngine>>,
     snap_mgrs: HashMap<u64, SnapManager>,
     cfg_controller: HashMap<u64, ConfigController>,
-    simulate_trans: HashMap<u64, SimulateChannelTransport<EK>>,
+    simulate_trans: HashMap<u64, SimulateChannelTransport>,
     concurrency_managers: HashMap<u64, ConcurrencyManager>,
-    importers: HashMap<u64, Arc<SstImporter<EK>>>,
+    importers: HashMap<u64, Arc<SstImporter<RocksEngine>>>,
     #[allow(clippy::type_complexity)]
-    post_create_coprocessor_host: Option<Box<dyn Fn(u64, &mut CoprocessorHost<EK>)>>,
+    post_create_coprocessor_host: Option<Box<dyn Fn(u64, &mut CoprocessorHost<RocksEngine>)>>,
 }
 
-impl<EK: KvEngine> NodeCluster<EK> {
-    pub fn new(pd_client: Arc<TestPdClient>) -> NodeCluster<EK> {
+impl NodeCluster {
+    pub fn new(pd_client: Arc<TestPdClient>) -> NodeCluster {
         NodeCluster {
             trans: ChannelTransport::new(),
             pd_client,
@@ -183,12 +181,12 @@ impl NodeCluster {
         }
     }
 }
 
-impl<EK: KvEngine> NodeCluster<EK> {
+impl NodeCluster {
     #[allow(dead_code)]
     pub fn get_node_router(
         &self,
         node_id: u64,
-    ) -> SimulateTransport<ServerRaftStoreRouter<EK, RaftTestEngine>, EK> {
+    ) -> SimulateTransport<ServerRaftStoreRouter<RocksEngine, RaftTestEngine>> {
         self.trans
             .core
             .lock()
@@ -203,14 +201,17 @@ impl NodeCluster {
     // first argument of `op` is the node_id.
     // Set this before invoking `run_node`.
     #[allow(clippy::type_complexity)]
-    pub fn post_create_coprocessor_host(&mut self, op: Box<dyn Fn(u64, &mut CoprocessorHost<EK>)>) {
+    pub fn post_create_coprocessor_host(
+        &mut self,
+        op: Box<dyn Fn(u64, &mut CoprocessorHost<RocksEngine>)>,
+    ) {
         self.post_create_coprocessor_host = Some(op)
     }
 
     pub fn get_node(
         &mut self,
         node_id: u64,
-    ) -> Option<&mut MultiRaftServer<TestPdClient, EK, RaftTestEngine>> {
+    ) -> Option<&mut MultiRaftServer<TestPdClient, RocksEngine, RaftTestEngine>> {
         self.nodes.get_mut(&node_id)
     }
 
@@ -222,21 +223,21 @@ impl NodeCluster {
         self.cfg_controller.get(&node_id)
     }
 
-    pub fn get_importer(&self, node_id: u64) -> Option<Arc<SstImporter<EK>>> {
+    pub fn get_importer(&self, node_id: u64) -> Option<Arc<SstImporter<RocksEngine>>> {
         self.importers.get(&node_id).cloned()
     }
 }
 
-impl<EK: KvEngineWithRocks> Simulator<EK> for NodeCluster<EK> {
+impl Simulator for NodeCluster {
     fn run_node(
         &mut self,
         node_id: u64,
         cfg: Config,
-        engines: Engines<EK, RaftTestEngine>,
+        engines: Engines<RocksEngine, RaftTestEngine>,
         store_meta: Arc<Mutex<StoreMeta>>,
         key_manager: Option<Arc<DataKeyManager>>,
-        router: RaftRouter<EK, RaftTestEngine>,
-        system: RaftBatchSystem<EK, RaftTestEngine>,
+        router: RaftRouter<RocksEngine, RaftTestEngine>,
+        system: RaftBatchSystem<RocksEngine, RaftTestEngine>,
         _resource_manager: &Option<Arc<ResourceGroupManager>>,
     ) -> ServerResult<u64> {
         assert!(node_id == 0 || !self.nodes.contains_key(&node_id));
@@ -285,6 +286,7 @@ impl Simulator for NodeCluster {
             .max_per_file_size(cfg.raft_store.max_snapshot_file_raw_size.0)
             .enable_multi_snapshot_files(true)
             .enable_receive_tablet_snapshot(cfg.raft_store.enable_v2_compatible_learner)
+            .min_ingest_snapshot_limit(cfg.server.snap_min_ingest_size)
             .build(tmp.path().to_str().unwrap());
         (snap_mgr, Some(tmp))
     } else {
@@ -302,12 +304,6 @@ impl Simulator for NodeCluster {
             f(node_id, &mut coprocessor_host);
         }
 
-        // Hybrid engine observer.
-        if cfg.tikv.range_cache_engine.enabled {
-            let observer = HybridEngineObserver::new(Arc::new(engines.kv.clone()));
-            observer.register_to(&mut coprocessor_host);
-        }
-
         let cm = ConcurrencyManager::new(1.into());
         self.concurrency_managers.insert(node_id, cm.clone());
         ReplicaReadLockChecker::new(cm.clone()).register(&mut coprocessor_host);
@@ -324,6 +320,7 @@ impl Simulator for NodeCluster {
             engines.kv.clone(),
             StoreMetaDelegate::new(store_meta.clone(), engines.kv.clone()),
             router.clone(),
+            coprocessor_host.clone(),
         );
 
         let cfg_controller = ConfigController::new(cfg.tikv.clone());
@@ -435,7 +432,7 @@ impl Simulator for NodeCluster {
         &self,
         node_id: u64,
         request: RaftCmdRequest,
-        cb: Callback<EK::Snapshot>,
+        cb: Callback<RocksSnapshot>,
         opts: RaftCmdExtraOpts,
     ) -> Result<()> {
         if !self
@@ -463,11 +460,10 @@ impl Simulator for NodeCluster {
 
     fn async_read(
         &mut self,
-        snap_ctx: Option<SnapshotContext>,
         node_id: u64,
         batch_id: Option<ThreadReadId>,
         request: RaftCmdRequest,
-        cb: Callback<EK::Snapshot>,
+        cb: Callback<RocksSnapshot>,
     ) {
         if !self
             .trans
@@ -485,7 +481,8 @@ impl Simulator for NodeCluster {
         }
         let mut guard = self.trans.core.lock().unwrap();
         let router = guard.routers.get_mut(&node_id).unwrap();
-        router.read(snap_ctx, batch_id, request, cb).unwrap();
+        let read_ctx = ReadContext::new(batch_id, None);
+        router.read(read_ctx, request, cb).unwrap();
     }
 
     fn send_raft_msg(&mut self, msg: raft_serverpb::RaftMessage) -> Result<()> {
@@ -516,50 +513,26 @@ impl Simulator for NodeCluster {
         trans.routers.get_mut(&node_id).unwrap().clear_filters();
     }
 
-    fn get_router(&self, node_id: u64) -> Option<RaftRouter<EK, RaftTestEngine>> {
+    fn get_router(&self, node_id: u64) -> Option<RaftRouter<RocksEngine, RaftTestEngine>> {
         self.nodes.get(&node_id).map(|node| node.get_router())
     }
+
+    fn get_apply_router(&self, node_id: u64) -> Option<ApplyRouter<RocksEngine>> {
+        self.nodes.get(&node_id).map(|node| node.get_apply_router())
+    }
 }
 
 // Compare to server cluster, node cluster does not have server layer and
 // storage layer.
-pub fn new_node_cluster(id: u64, count: usize) -> Cluster<RocksEngine, NodeCluster<RocksEngine>> {
+pub fn new_node_cluster(id: u64, count: usize) -> Cluster<NodeCluster> {
     let pd_client = Arc::new(TestPdClient::new(id, false));
     let sim = Arc::new(RwLock::new(NodeCluster::new(Arc::clone(&pd_client))));
     Cluster::new(id, count, sim, pd_client, ApiVersion::V1)
 }
 
-// the hybrid engine with disk engine "RocksEngine" and region cache engine
-// "RangeCacheMemoryEngine" is used in the node cluster.
-pub fn new_node_cluster_with_hybrid_engine(
-    id: u64,
-    count: usize,
-) -> Cluster<HybridEngineImpl, NodeCluster<HybridEngineImpl>> {
-    let pd_client = Arc::new(TestPdClient::new(id, false));
-    let sim = Arc::new(RwLock::new(NodeCluster::new(Arc::clone(&pd_client))));
-    let mut cluster = Cluster::new(id, count, sim, pd_client, ApiVersion::V1);
-    cluster.range_cache_engine_enabled_with_whole_range(true);
-    cluster.cfg.tikv.range_cache_engine = RangeCacheEngineConfig::config_for_test();
-    cluster
-}
-
-pub fn new_node_cluster_with_hybrid_engine_with_no_range_cache(
-    id: u64,
-    count: usize,
-) -> Cluster<HybridEngineImpl, NodeCluster<HybridEngineImpl>> {
-    let pd_client = Arc::new(TestPdClient::new(id, false));
-    let sim = Arc::new(RwLock::new(NodeCluster::new(Arc::clone(&pd_client))));
-    let mut cluster = Cluster::new(id, count, sim, pd_client, ApiVersion::V1);
-    cluster.cfg.tikv.range_cache_engine = RangeCacheEngineConfig::config_for_test();
-    cluster
-}
-
 // This cluster does not support batch split, we expect it to transfer the
 // `BatchSplit` request to `split` request
-pub fn new_incompatible_node_cluster(
-    id: u64,
-    count: usize,
-) -> Cluster<RocksEngine, NodeCluster<RocksEngine>> {
+pub fn new_incompatible_node_cluster(id: u64, count: usize) -> Cluster<NodeCluster> {
     let pd_client = Arc::new(TestPdClient::new(id, true));
     let sim = Arc::new(RwLock::new(NodeCluster::new(Arc::clone(&pd_client))));
     Cluster::new(id, count, sim, pd_client, ApiVersion::V1)
diff --git a/components/test_raftstore/src/range_cache_engine.rs b/components/test_raftstore/src/range_cache_engine.rs
deleted file mode 100644
index 57af823211c..00000000000
--- a/components/test_raftstore/src/range_cache_engine.rs
+++ /dev/null
@@ -1,24 +0,0 @@
-// Copyright 2024 TiKV Project Authors. Licensed under Apache-2.0.
-
-use engine_rocks::RocksEngine;
-use engine_traits::CacheRange;
-use keys::{DATA_MAX_KEY, DATA_MIN_KEY};
-
-use crate::HybridEngineImpl;
-
-pub trait RangCacheEngineExt {
-    fn cache_all(&self);
-}
-
-impl RangCacheEngineExt for HybridEngineImpl {
-    fn cache_all(&self) {
-        self.range_cache_engine().new_range(CacheRange::new(
-            DATA_MIN_KEY.to_vec(),
-            DATA_MAX_KEY.to_vec(),
-        ));
-    }
-}
-
-impl RangCacheEngineExt for RocksEngine {
-    fn cache_all(&self) {}
-}
diff --git a/components/test_raftstore/src/router.rs b/components/test_raftstore/src/router.rs
index 3b6b1e962c3..d6a135c9f9a 100644
--- a/components/test_raftstore/src/router.rs
+++ b/components/test_raftstore/src/router.rs
@@ -60,7 +60,7 @@ impl CasualRouter<RocksEngine> for MockRaftStoreRouter {
     fn send(&self, region_id: u64, msg: CasualMessage<RocksEngine>) -> RaftStoreResult<()> {
         let mut senders = self.senders.lock().unwrap();
         if let Some(tx) = senders.get_mut(&region_id) {
-            tx.try_send(PeerMsg::CasualMessage(msg))
+            tx.try_send(PeerMsg::CasualMessage(Box::new(msg)))
                 .map_err(|e| handle_send_error(region_id, e))
         } else {
             Err(RaftStoreError::RegionNotFound(region_id))
@@ -76,7 +76,8 @@ impl SignificantRouter<RocksEngine> for MockRaftStoreRouter {
     ) -> RaftStoreResult<()> {
         let mut senders = self.senders.lock().unwrap();
         if let Some(tx) = senders.get_mut(&region_id) {
-            tx.force_send(PeerMsg::SignificantMsg(msg)).unwrap();
+            tx.force_send(PeerMsg::SignificantMsg(Box::new(msg)))
+                .unwrap();
             Ok(())
         } else {
             error!("failed to send significant msg"; "msg" => ?msg);
diff --git a/components/test_raftstore/src/server.rs b/components/test_raftstore/src/server.rs
index 4497159f4cd..d73157c51ac 100644
--- a/components/test_raftstore/src/server.rs
+++ b/components/test_raftstore/src/server.rs
@@ -2,23 +2,28 @@
 
 use std::{
     path::Path,
-    sync::{atomic::AtomicU64, Arc, Mutex, RwLock},
+    sync::{atomic::AtomicU64, mpsc::Receiver, Arc, Mutex, RwLock},
     thread,
     time::Duration,
     usize,
 };
 
+use ::server::common::build_hybrid_engine;
 use api_version::{dispatch_api_version, KvFormat};
 use causal_ts::CausalTsProviderImpl;
 use collections::{HashMap, HashSet};
 use concurrency_manager::ConcurrencyManager;
 use encryption_export::DataKeyManager;
-use engine_rocks::RocksEngine;
+use engine_rocks::{FlowInfo, RocksEngine, RocksSnapshot};
 use engine_test::raft::RaftTestEngine;
-use engine_traits::{Engines, KvEngine, SnapshotContext};
+use engine_traits::{Engines, MiscExt};
 use futures::executor::block_on;
 use grpcio::{ChannelBuilder, EnvBuilder, Environment, Error as GrpcError, Service};
 use health_controller::HealthController;
+use hybrid_engine::observer::{
+    HybridSnapshotObserver, LoadEvictionObserver, RegionCacheWriteBatchObserver,
+};
+use in_memory_engine::{InMemoryEngineConfig, InMemoryEngineContext, RegionCacheMemoryEngine};
 use kvproto::{
     deadlock::create_deadlock,
     debugpb::{create_debug, DebugClient},
@@ -33,7 +38,7 @@ use pd_client::PdClient;
 use raftstore::{
     coprocessor::{CoprocessorHost, RegionInfoAccessor},
     errors::Error as RaftError,
-    router::{CdcRaftRouter, LocalReadRouter, RaftStoreRouter, ServerRaftStoreRouter},
+    router::{CdcRaftRouter, LocalReadRouter, RaftStoreRouter, ReadContext, ServerRaftStoreRouter},
     store::{
         fsm::{store::StoreMeta, ApplyRouter, RaftBatchSystem, RaftRouter},
         msg::RaftCmdExtraOpts,
@@ -42,7 +47,6 @@ use raftstore::{
     },
     Result,
 };
-use range_cache_memory_engine::RangeCacheEngineConfig;
 use resource_control::ResourceGroupManager;
 use resource_metering::{CollectorRegHandle, ResourceTagFactory};
 use security::SecurityManager;
@@ -51,7 +55,8 @@ use tempfile::TempDir;
 use test_pd_client::TestPdClient;
 use tikv::{
     config::ConfigController,
-    coprocessor, coprocessor_v2,
+    coprocessor::{self, Endpoint},
+    coprocessor_v2,
     import::{ImportSstService, SstImporter},
     read_pool::ReadPool,
     server::{
@@ -69,7 +74,10 @@ use tikv::{
     storage::{
         self,
         kv::{FakeExtension, LocalTablets, SnapContext},
-        txn::flow_controller::{EngineFlowController, FlowController},
+        txn::{
+            flow_controller::{EngineFlowController, FlowController},
+            txn_status_cache::TxnStatusCache,
+        },
         Engine, Storage,
     },
 };
@@ -87,12 +95,12 @@ use txn_types::TxnExtraScheduler;
 use super::*;
 use crate::Config;
 
-type SimulateStoreTransport<EK> = SimulateTransport<ServerRaftStoreRouter<EK, RaftTestEngine>, EK>;
+type SimulateStoreTransport = SimulateTransport<ServerRaftStoreRouter<RocksEngine, RaftTestEngine>>;
 
-pub type SimulateEngine<EK> = RaftKv<EK, SimulateStoreTransport<EK>>;
-type SimulateRaftExtension<EK> = <SimulateEngine<EK> as Engine>::RaftExtension;
-type SimulateServerTransport<EK> =
-    SimulateTransport<ServerTransport<SimulateRaftExtension<EK>, PdStoreAddrResolver>, EK>;
+pub type SimulateEngine = RaftKv<RocksEngine, SimulateStoreTransport>;
+type SimulateRaftExtension = <SimulateEngine as Engine>::RaftExtension;
+type SimulateServerTransport =
+    SimulateTransport<ServerTransport<SimulateRaftExtension, PdStoreAddrResolver>>;
 
 #[derive(Default, Clone)]
 pub struct AddressMap {
@@ -128,29 +136,31 @@ impl StoreAddrResolver for AddressMap {
     }
 }
 
-struct ServerMeta<EK: KvEngine> {
-    node: MultiRaftServer<TestPdClient, EK, RaftTestEngine>,
-    server: Server<PdStoreAddrResolver, SimulateEngine<EK>>,
-    sim_router: SimulateStoreTransport<EK>,
-    sim_trans: SimulateServerTransport<EK>,
-    raw_router: RaftRouter<EK, RaftTestEngine>,
-    raw_apply_router: ApplyRouter<EK>,
-    gc_worker: GcWorker<RaftKv<EK, SimulateStoreTransport<EK>>>,
+struct ServerMeta {
+    node: MultiRaftServer<TestPdClient, RocksEngine, RaftTestEngine>,
+    server: Server<PdStoreAddrResolver, SimulateEngine>,
+    sim_router: SimulateStoreTransport,
+    sim_trans: SimulateServerTransport,
+    raw_router: RaftRouter<RocksEngine, RaftTestEngine>,
+    raw_apply_router: ApplyRouter<RocksEngine>,
+    gc_worker: GcWorker<RaftKv<RocksEngine, SimulateStoreTransport>>,
+    _reciever: Receiver<FlowInfo>,
     rts_worker: Option<LazyWorker<resolved_ts::Task>>,
     rsmeter_cleanup: Box<dyn FnOnce()>,
 }
 
 type PendingServices = Vec<Box<dyn Fn() -> Service>>;
-type CopHooks<EK> = Vec<Box<dyn Fn(&mut CoprocessorHost<EK>)>>;
+type CopHooks = Vec<Box<dyn Fn(&mut CoprocessorHost<RocksEngine>)>>;
 
-pub struct ServerCluster<EK: KvEngine> {
-    metas: HashMap<u64, ServerMeta<EK>>,
+pub struct ServerCluster {
+    metas: HashMap<u64, ServerMeta>,
     addrs: AddressMap,
-    pub storages: HashMap<u64, SimulateEngine<EK>>,
+    pub storages: HashMap<u64, SimulateEngine>,
+    pub copr_endpoints: HashMap<u64, Endpoint<SimulateEngine>>,
     pub region_info_accessors: HashMap<u64, RegionInfoAccessor>,
-    pub importers: HashMap<u64, Arc<SstImporter<EK>>>,
+    pub importers: HashMap<u64, Arc<SstImporter<RocksEngine>>>,
     pub pending_services: HashMap<u64, PendingServices>,
-    pub coprocessor_hooks: HashMap<u64, CopHooks<EK>>,
+    pub coprocessor_hosts: HashMap<u64, CopHooks>,
     pub health_controllers: HashMap<u64, HealthController>,
     pub security_mgr: Arc<SecurityManager>,
     pub txn_extra_schedulers: HashMap<u64, Arc<dyn TxnExtraScheduler>>,
@@ -163,10 +173,11 @@ pub struct ServerCluster {
     env: Arc<Environment>,
     pub causal_ts_providers: HashMap<u64, Arc<CausalTsProviderImpl>>,
     pub encryption: Option<Arc<DataKeyManager>>,
+    pub in_memory_engines: HashMap<u64, Option<HybridEngineImpl>>,
 }
 
-impl<EK: KvEngineWithRocks> ServerCluster<EK> {
-    pub fn new(pd_client: Arc<TestPdClient>) -> ServerCluster<EK> {
+impl ServerCluster {
+    pub fn new(pd_client: Arc<TestPdClient>) -> ServerCluster {
         let env = Arc::new(
             EnvBuilder::new()
                 .cq_count(2)
@@ -193,12 +204,14 @@ impl ServerCluster {
             pd_client,
             security_mgr,
             storages: HashMap::default(),
+            copr_endpoints: HashMap::default(),
+            in_memory_engines: HashMap::default(),
             region_info_accessors: HashMap::default(),
             importers: HashMap::default(),
             snap_paths: HashMap::default(),
             snap_mgrs: HashMap::default(),
             pending_services: HashMap::default(),
-            coprocessor_hooks: HashMap::default(),
+            coprocessor_hosts: HashMap::default(),
             health_controllers: HashMap::default(),
             raft_clients: HashMap::default(),
             conn_builder,
@@ -214,16 +227,15 @@ impl ServerCluster {
         self.addrs.get(node_id).unwrap()
     }
 
-    pub fn get_apply_router(&self, node_id: u64) -> ApplyRouter<EK> {
-        self.metas.get(&node_id).unwrap().raw_apply_router.clone()
-    }
-
-    pub fn get_server_router(&self, node_id: u64) -> SimulateStoreTransport<EK> {
+    pub fn get_server_router(&self, node_id: u64) -> SimulateStoreTransport {
         self.metas.get(&node_id).unwrap().sim_router.clone()
     }
 
     /// To trigger GC manually.
-    pub fn get_gc_worker(&self, node_id: u64) -> &GcWorker<RaftKv<EK, SimulateStoreTransport<EK>>> {
+    pub fn get_gc_worker(
+        &self,
+        node_id: u64,
+    ) -> &GcWorker<RaftKv<RocksEngine, SimulateStoreTransport>> {
         &self.metas.get(&node_id).unwrap().gc_worker
     }
 
@@ -264,11 +276,11 @@ impl ServerCluster {
         &mut self,
         node_id: u64,
         mut cfg: Config,
-        engines: Engines<EK, RaftTestEngine>,
+        engines: Engines<RocksEngine, RaftTestEngine>,
         store_meta: Arc<Mutex<StoreMeta>>,
         key_manager: Option<Arc<DataKeyManager>>,
-        router: RaftRouter<EK, RaftTestEngine>,
-        system: RaftBatchSystem<EK, RaftTestEngine>,
+        router: RaftRouter<RocksEngine, RaftTestEngine>,
+        system: RaftBatchSystem<RocksEngine, RaftTestEngine>,
         resource_manager: &Option<Arc<ResourceGroupManager>>,
     ) -> ServerResult<u64> {
         self.encryption = key_manager.clone();
@@ -293,69 +305,107 @@ impl ServerCluster {
             }
         }
 
-        let local_reader = LocalReader::new(
-            engines.kv.clone(),
-            StoreMetaDelegate::new(store_meta.clone(), engines.kv.clone()),
-            router.clone(),
-        );
-
         // Create coprocessor.
         let enable_region_stats_mgr_cb: Arc<dyn Fn() -> bool + Send + Sync> =
-            if cfg.range_cache_engine.enabled {
+            if cfg.in_memory_engine.enable {
                 Arc::new(|| true)
             } else {
                 Arc::new(|| false)
             };
         let mut coprocessor_host = CoprocessorHost::new(router.clone(), cfg.coprocessor.clone());
 
-        let region_info_accessor =
-            RegionInfoAccessor::new(&mut coprocessor_host, enable_region_stats_mgr_cb);
+        if let Some(hooks) = self.coprocessor_hosts.get(&node_id) {
+            for hook in hooks {
+                hook(&mut coprocessor_host);
+            }
+        }
+
+        // In-memory engine
+        let mut in_memory_engine_config = cfg.in_memory_engine.clone();
+        in_memory_engine_config.expected_region_size = cfg.coprocessor.region_split_size();
+        let in_memory_engine_config = Arc::new(VersionTrack::new(in_memory_engine_config));
+        let in_memory_engine_config_clone = in_memory_engine_config.clone();
+
+        let region_info_accessor = RegionInfoAccessor::new(
+            &mut coprocessor_host,
+            enable_region_stats_mgr_cb,
+            Box::new(move || {
+                in_memory_engine_config_clone
+                    .value()
+                    .mvcc_amplification_threshold
+            }),
+        );
+
+        let in_memory_engine_context =
+            InMemoryEngineContext::new(in_memory_engine_config.clone(), self.pd_client.clone());
+        let in_memory_engine = if cfg.in_memory_engine.enable {
+            let in_memory_engine = build_hybrid_engine(
+                in_memory_engine_context,
+                engines.kv.clone(),
+                None,
+                Some(Arc::new(region_info_accessor.clone())),
+                Box::new(router.clone()),
+            );
+            // Eviction observer
+            let observer =
+                LoadEvictionObserver::new(Arc::new(in_memory_engine.region_cache_engine().clone()));
+            observer.register_to(&mut coprocessor_host);
+            // Write batch observer
+            let write_batch_observer =
+                RegionCacheWriteBatchObserver::new(in_memory_engine.region_cache_engine().clone());
+            write_batch_observer.register_to(&mut coprocessor_host);
+            // Snapshot observer
+            let snapshot_observer =
+                HybridSnapshotObserver::new(in_memory_engine.region_cache_engine().clone());
+            snapshot_observer.register_to(&mut coprocessor_host);
+            Some(in_memory_engine)
+        } else {
+            None
+        };
+        self.in_memory_engines
+            .insert(node_id, in_memory_engine.clone());
 
+        let local_reader = LocalReader::new(
+            engines.kv.clone(),
+            StoreMetaDelegate::new(store_meta.clone(), engines.kv.clone()),
+            router.clone(),
+            coprocessor_host.clone(),
+        );
         let raft_router = ServerRaftStoreRouter::new(router.clone(), local_reader);
         let sim_router = SimulateTransport::new(raft_router.clone());
 
-        let raft_engine = RaftKv::new(
+        let mut raft_kv = RaftKv::new(
             sim_router.clone(),
             engines.kv.clone(),
             region_info_accessor.region_leaders(),
         );
 
-        if let Some(hooks) = self.coprocessor_hooks.get(&node_id) {
-            for hook in hooks {
-                hook(&mut coprocessor_host);
-            }
-        }
-
         // Create storage.
         let pd_worker = LazyWorker::new("test-pd-worker");
         let pd_sender = pd_worker.scheduler();
         let storage_read_pool = ReadPool::from(storage::build_read_pool(
             &tikv::config::StorageReadPoolConfig::default_for_test(),
             pd_sender.clone(),
-            raft_engine.clone(),
+            raft_kv.clone(),
         ));
 
-        let mut engine = RaftKv::new(
-            sim_router.clone(),
-            engines.kv.clone(),
-            region_info_accessor.region_leaders(),
-        );
         if let Some(scheduler) = self.txn_extra_schedulers.remove(&node_id) {
-            engine.set_txn_extra_scheduler(scheduler);
+            raft_kv.set_txn_extra_scheduler(scheduler);
         }
 
         let latest_ts =
             block_on(self.pd_client.get_tso()).expect("failed to get timestamp from PD");
         let concurrency_manager = ConcurrencyManager::new(latest_ts);
 
-        let (tx, _rx) = std::sync::mpsc::channel();
+        let (tx, rx) = std::sync::mpsc::channel();
         let mut gc_worker = GcWorker::new(
-            engine.clone(),
+            raft_kv.clone(),
             tx,
             cfg.gc.clone(),
             Default::default(),
             Arc::new(region_info_accessor.clone()),
         );
-        gc_worker.start(node_id).unwrap();
+        gc_worker.start(node_id, coprocessor_host.clone()).unwrap();
 
+        let txn_status_cache = Arc::new(TxnStatusCache::new_for_test());
         let rts_worker = if cfg.resolved_ts.enable {
             // Resolved ts worker
            let mut rts_worker = LazyWorker::new("resolved-ts");
@@ -373,6 +423,7 @@ impl ServerCluster {
                 concurrency_manager.clone(),
                 self.env.clone(),
                 self.security_mgr.clone(),
+                txn_status_cache.clone(),
             );
             // Start the worker
             rts_worker.start(rts_endpoint);
@@ -415,9 +466,9 @@ impl ServerCluster {
             cfg.quota.max_delay_duration,
             cfg.quota.enable_auto_tune,
         ));
-        let extension = engine.raft_extension();
+        let extension = raft_kv.raft_extension();
         let store = Storage::<_, _, F>::from_engine(
-            engine.clone(),
+            raft_kv.clone(),
             &cfg.storage,
             storage_read_pool.handle(),
             lock_mgr.clone(),
@@ -433,8 +484,9 @@ impl ServerCluster {
                 .as_ref()
                 .map(|m| m.derive_controller("scheduler-worker-pool".to_owned(), true)),
             resource_manager.clone(),
+            txn_status_cache,
         )?;
-        self.storages.insert(node_id, raft_engine);
+        self.storages.insert(node_id, raft_kv.clone());
 
         ReplicaReadLockChecker::new(concurrency_manager.clone()).register(&mut coprocessor_host);
 
@@ -455,7 +507,7 @@ impl ServerCluster {
         let import_service = ImportSstService::new(
             cfg.import.clone(),
             cfg.raft_store.raft_entry_max_size,
-            engine,
+            raft_kv,
             LocalTablets::Singleton(engines.kv.clone()),
             Arc::clone(&importer),
             None,
@@ -477,6 +529,7 @@ impl ServerCluster {
             .max_per_file_size(cfg.raft_store.max_snapshot_file_raw_size.0)
             .enable_multi_snapshot_files(true)
             .enable_receive_tablet_snapshot(cfg.raft_store.enable_v2_compatible_learner)
+            .min_ingest_snapshot_limit(cfg.server.snap_min_ingest_size)
             .build(tmp_str);
         self.snap_mgrs.insert(node_id, snap_mgr.clone());
         let server_cfg = Arc::new(VersionTrack::new(cfg.server.clone()));
@@ -594,6 +647,7 @@ impl ServerCluster {
         let simulate_trans = SimulateTransport::new(trans);
         let max_grpc_thread_count = cfg.server.grpc_concurrency;
         let server_cfg = Arc::new(VersionTrack::new(cfg.server.clone()));
+        self.copr_endpoints.insert(node_id, copr);
 
         // Register the role change observer of the lock manager.
         lock_mgr.register_detector_role_change_observer(&mut coprocessor_host);
@@ -664,6 +718,7 @@ impl ServerCluster {
                 sim_router,
                 sim_trans: simulate_trans,
                 gc_worker,
+                _reciever: rx,
                 rts_worker,
                 rsmeter_cleanup,
             },
@@ -682,18 +737,28 @@ impl ServerCluster {
         let w = meta.rts_worker.as_ref()?;
         Some(w.scheduler())
     }
+
+    pub fn get_region_cache_engine(&self, node_id: u64) -> RegionCacheMemoryEngine {
+        self.in_memory_engines
+            .get(&node_id)
+            .unwrap()
+            .as_ref()
+            .unwrap()
+            .region_cache_engine()
+            .clone()
+    }
 }
 
-impl<EK: KvEngineWithRocks> Simulator<EK> for ServerCluster<EK> {
+impl Simulator for ServerCluster {
     fn run_node(
         &mut self,
         node_id: u64,
         cfg: Config,
-        engines: Engines<EK, RaftTestEngine>,
+        engines: Engines<RocksEngine, RaftTestEngine>,
         store_meta: Arc<Mutex<StoreMeta>>,
         key_manager: Option<Arc<DataKeyManager>>,
-        router: RaftRouter<EK, RaftTestEngine>,
-        system: RaftBatchSystem<EK, RaftTestEngine>,
+        router: RaftRouter<RocksEngine, RaftTestEngine>,
+        system: RaftBatchSystem<RocksEngine, RaftTestEngine>,
         resource_manager: &Option<Arc<ResourceGroupManager>>,
     ) -> ServerResult<u64> {
         dispatch_api_version!(
@@ -744,7 +809,7 @@ impl Simulator for ServerCluster {
         &self,
         node_id: u64,
         request: RaftCmdRequest,
-        cb: Callback<EK::Snapshot>,
+        cb: Callback<RocksSnapshot>,
         opts: RaftCmdExtraOpts,
     ) -> Result<()> {
         let router = match self.metas.get(&node_id) {
@@ -756,11 +821,10 @@ impl Simulator for ServerCluster {
 
     fn async_read(
         &mut self,
-        snap_ctx: Option<SnapshotContext>,
         node_id: u64,
         batch_id: Option<ThreadReadId>,
         request: RaftCmdRequest,
-        cb: Callback<EK::Snapshot>,
+        cb: Callback<RocksSnapshot>,
     ) {
         match self.metas.get_mut(&node_id) {
             None => {
@@ -770,9 +834,8 @@ impl Simulator for ServerCluster {
                 cb.invoke_with_response(resp);
             }
             Some(meta) => {
-                meta.sim_router
-                    .read(snap_ctx, batch_id, request, cb)
-                    .unwrap();
+                let read_ctx = ReadContext::new(batch_id, None);
+                meta.sim_router.read(read_ctx, request, cb).unwrap();
             }
         };
     }
@@ -819,13 +882,17 @@ impl Simulator for ServerCluster {
             .clear_filters();
     }
 
-    fn get_router(&self, node_id: u64) -> Option<RaftRouter<EK, RaftTestEngine>> {
+    fn get_router(&self, node_id: u64) -> Option<RaftRouter<RocksEngine, RaftTestEngine>> {
         self.metas.get(&node_id).map(|m| m.raw_router.clone())
     }
+
+    fn get_apply_router(&self, node_id: u64) -> Option<ApplyRouter<RocksEngine>> {
+        self.metas.get(&node_id).map(|m| m.raw_apply_router.clone())
+    }
 }
 
-impl<EK: KvEngineWithRocks> Cluster<EK, ServerCluster<EK>> {
-    pub fn must_get_snapshot_of_region(&mut self, region_id: u64) -> RegionSnapshot<EK::Snapshot> {
+impl Cluster<ServerCluster> {
+    pub fn must_get_snapshot_of_region(&mut self, region_id: u64) -> RegionSnapshot<RocksSnapshot> {
         self.must_get_snapshot_of_region_with_ctx(region_id, Default::default())
     }
 
@@ -833,8 +900,8 @@ impl Cluster<ServerCluster> {
         &mut self,
         region_id: u64,
         snap_ctx: SnapContext<'_>,
-    ) -> RegionSnapshot<EK::Snapshot> {
-        let mut try_snapshot = || -> Option<RegionSnapshot<EK::Snapshot>> {
+    ) -> RegionSnapshot<RocksSnapshot> {
+        let mut try_snapshot = || -> Option<RegionSnapshot<RocksSnapshot>> {
             let leader = self.leader_of_region(region_id)?;
             let store_id = leader.store_id;
             let epoch = self.get_region_epoch(region_id);
@@ -859,7 +926,7 @@ impl Cluster<ServerCluster> {
         panic!("failed to get snapshot of region {}", region_id);
     }
 
-    pub fn raft_extension(&self, node_id: u64) -> SimulateRaftExtension<EK> {
+    pub fn raft_extension(&self, node_id: u64) -> SimulateRaftExtension {
         self.sim.rl().storages[&node_id].raft_extension()
     }
 
@@ -867,36 +934,31 @@ impl Cluster<ServerCluster> {
         self.sim.rl().get_addr(node_id)
     }
 
-    pub fn register_hook(&self, node_id: u64, register: Box<dyn Fn(&mut CoprocessorHost<EK>)>) {
+    pub fn register_hook(
+        &self,
+        node_id: u64,
+        register: Box<dyn Fn(&mut CoprocessorHost<RocksEngine>)>,
+    ) {
         self.sim
             .wl()
-            .coprocessor_hooks
+            .coprocessor_hosts
             .entry(node_id)
             .or_default()
             .push(register);
     }
 }
 
-pub fn new_server_cluster(
-    id: u64,
-    count: usize,
-) -> Cluster<RocksEngine, ServerCluster<RocksEngine>> {
+pub fn new_server_cluster(id: u64, count: usize) -> Cluster<ServerCluster> {
     let pd_client = Arc::new(TestPdClient::new(id, false));
     let sim = Arc::new(RwLock::new(ServerCluster::new(Arc::clone(&pd_client))));
     Cluster::new(id, count, sim, pd_client, ApiVersion::V1)
 }
 
-// the hybrid engine with disk engine "RocksEngine" and region cache engine
-// "RangeCacheMemoryEngine" is used in the server cluster.
-pub fn new_server_cluster_with_hybrid_engine(
-    id: u64,
-    count: usize,
-) -> Cluster<HybridEngineImpl, ServerCluster<HybridEngineImpl>> {
+pub fn new_server_cluster_with_hybrid_engine(id: u64, count: usize) -> Cluster<ServerCluster> {
     let pd_client = Arc::new(TestPdClient::new(id, false));
     let sim = Arc::new(RwLock::new(ServerCluster::new(Arc::clone(&pd_client))));
     let mut cluster = Cluster::new(id, count, sim, pd_client, ApiVersion::V1);
-    cluster.range_cache_engine_enabled_with_whole_range(true);
-    cluster.cfg.tikv.range_cache_engine = RangeCacheEngineConfig::config_for_test();
+    cluster.cfg.tikv.in_memory_engine = InMemoryEngineConfig::config_for_test();
     cluster
 }
 
@@ -904,49 +966,32 @@ pub fn new_server_cluster_with_api_ver(
     id: u64,
     count: usize,
     api_ver: ApiVersion,
-) -> Cluster<RocksEngine, ServerCluster<RocksEngine>> {
+) -> Cluster<ServerCluster> {
     let pd_client = Arc::new(TestPdClient::new(id, false));
     let sim = Arc::new(RwLock::new(ServerCluster::new(Arc::clone(&pd_client))));
     Cluster::new(id, count, sim, pd_client, api_ver)
 }
 
-pub fn new_incompatible_server_cluster(
-    id: u64,
-    count: usize,
-) -> Cluster<RocksEngine, ServerCluster<RocksEngine>> {
+pub fn new_incompatible_server_cluster(id: u64, count: usize) -> Cluster<ServerCluster> {
     let pd_client = Arc::new(TestPdClient::new(id, true));
     let sim = Arc::new(RwLock::new(ServerCluster::new(Arc::clone(&pd_client))));
     Cluster::new(id, count, sim, pd_client, ApiVersion::V1)
 }
 
-pub fn must_new_cluster_mul(
-    count: usize,
-) -> (
-    Cluster<RocksEngine, ServerCluster<RocksEngine>>,
-    metapb::Peer,
-    Context,
-) {
+pub fn must_new_cluster_mul(count: usize) -> (Cluster<ServerCluster>, metapb::Peer, Context) {
     must_new_and_configure_cluster_mul(count, |_| ())
 }
 
 pub fn must_new_and_configure_cluster(
-    configure: impl FnMut(&mut Cluster<RocksEngine, ServerCluster<RocksEngine>>),
-) -> (
-    Cluster<RocksEngine, ServerCluster<RocksEngine>>,
-    metapb::Peer,
-    Context,
-) {
+    configure: impl FnMut(&mut Cluster<ServerCluster>),
+) -> (Cluster<ServerCluster>, metapb::Peer, Context) {
     must_new_and_configure_cluster_mul(1, configure)
 }
 
 fn must_new_and_configure_cluster_mul(
     count: usize,
-    mut configure: impl FnMut(&mut Cluster<RocksEngine, ServerCluster<RocksEngine>>),
-) -> (
-    Cluster<RocksEngine, ServerCluster<RocksEngine>>,
-    metapb::Peer,
-    Context,
-) {
+    mut configure: impl FnMut(&mut Cluster<ServerCluster>),
+) -> (Cluster<ServerCluster>, metapb::Peer, Context) {
     let mut cluster = new_server_cluster(0, count);
     configure(&mut cluster);
     cluster.run();
@@ -961,32 +1006,20 @@ fn must_new_and_configure_cluster_mul(
     (cluster, leader, ctx)
 }
 
-pub fn must_new_cluster_and_kv_client() -> (
-    Cluster<RocksEngine, ServerCluster<RocksEngine>>,
-    TikvClient,
-    Context,
-) {
+pub fn must_new_cluster_and_kv_client() -> (Cluster<ServerCluster>, TikvClient, Context) {
     must_new_cluster_and_kv_client_mul(1)
 }
 
 pub fn must_new_cluster_and_kv_client_mul(
     count: usize,
-) -> (
-    Cluster<RocksEngine, ServerCluster<RocksEngine>>,
-    TikvClient,
-    Context,
-) {
+) -> (Cluster<ServerCluster>, TikvClient, Context) {
     must_new_cluster_with_cfg_and_kv_client_mul(count, |_| {})
 }
 
 pub fn must_new_cluster_with_cfg_and_kv_client_mul(
     count: usize,
-    configure: impl FnMut(&mut Cluster<RocksEngine, ServerCluster<RocksEngine>>),
-) -> (
-    Cluster<RocksEngine, ServerCluster<RocksEngine>>,
-    TikvClient,
-    Context,
-) {
+    configure: impl FnMut(&mut Cluster<ServerCluster>),
+) -> (Cluster<ServerCluster>, TikvClient, Context) {
     let (cluster, leader, ctx) = must_new_and_configure_cluster_mul(count, configure);
     let env = Arc::new(Environment::new(1));
     let channel =
@@ -996,11 +1029,7 @@ pub fn must_new_cluster_with_cfg_and_kv_client_mul(
     (cluster, client, ctx)
 }
 
-pub fn must_new_cluster_and_debug_client() -> (
-    Cluster<RocksEngine, ServerCluster<RocksEngine>>,
-    DebugClient,
-    u64,
-) {
+pub fn must_new_cluster_and_debug_client() -> (Cluster<ServerCluster>, DebugClient, u64) {
     let (cluster, leader, _) = must_new_cluster_mul(1);
     let env = Arc::new(Environment::new(1));
 
@@ -1011,12 +1040,8 @@ pub fn must_new_cluster_and_debug_client() -> (
( (cluster, client, leader.get_store_id()) } -pub fn must_new_cluster_kv_client_and_debug_client() -> ( - Cluster>, - TikvClient, - DebugClient, - Context, -) { +pub fn must_new_cluster_kv_client_and_debug_client() +-> (Cluster, TikvClient, DebugClient, Context) { let (cluster, leader, ctx) = must_new_cluster_mul(1); let env = Arc::new(Environment::new(1)); @@ -1030,12 +1055,8 @@ pub fn must_new_cluster_kv_client_and_debug_client() -> ( } pub fn must_new_and_configure_cluster_and_kv_client( - configure: impl FnMut(&mut Cluster>), -) -> ( - Cluster>, - TikvClient, - Context, -) { + configure: impl FnMut(&mut Cluster), +) -> (Cluster, TikvClient, Context) { let (cluster, leader, ctx) = must_new_and_configure_cluster(configure); let env = Arc::new(Environment::new(1)); @@ -1046,12 +1067,7 @@ pub fn must_new_and_configure_cluster_and_kv_client( (cluster, client, ctx) } -pub fn setup_cluster() -> ( - Cluster>, - TikvClient, - String, - Context, -) { +pub fn setup_cluster() -> (Cluster, TikvClient, String, Context) { let mut cluster = new_server_cluster(0, 3); cluster.run(); diff --git a/components/test_raftstore/src/transport_simulate.rs b/components/test_raftstore/src/transport_simulate.rs index 5ce376b6873..050f735aac3 100644 --- a/components/test_raftstore/src/transport_simulate.rs +++ b/components/test_raftstore/src/transport_simulate.rs @@ -11,21 +11,21 @@ use std::{ use collections::{HashMap, HashSet}; use crossbeam::channel::TrySendError; -use engine_traits::{KvEngine, SnapshotContext}; +use engine_rocks::{RocksEngine, RocksSnapshot}; use kvproto::{ raft_cmdpb::RaftCmdRequest, raft_serverpb::{ExtraMessageType, RaftMessage}, }; use raft::eraftpb::MessageType; use raftstore::{ - router::{LocalReadRouter, RaftStoreRouter}, + router::{LocalReadRouter, RaftStoreRouter, ReadContext}, store::{ Callback, CasualMessage, CasualRouter, PeerMsg, ProposalRouter, RaftCommand, SignificantMsg, SignificantRouter, StoreMsg, StoreRouter, Transport, }, DiscardReason, Error, Result as RaftStoreResult, Result, }; -use tikv_util::{time::ThreadReadId, Either, HandyRwLock}; +use tikv_util::{Either, HandyRwLock}; pub fn check_messages(msgs: &[RaftMessage]) -> Result<()> { if msgs.is_empty() { @@ -143,19 +143,16 @@ impl Filter for DelayFilter { } #[derive(Clone)] -pub struct SimulateTransport { +pub struct SimulateTransport { filters: Arc>>>, ch: C, - - _p: PhantomData, } -impl SimulateTransport { - pub fn new(ch: C) -> SimulateTransport { +impl SimulateTransport { + pub fn new(ch: C) -> SimulateTransport { SimulateTransport { filters: Arc::new(RwLock::new(vec![])), ch, - _p: PhantomData, } } @@ -201,7 +198,7 @@ where res } -impl Transport for SimulateTransport { +impl Transport for SimulateTransport { fn send(&mut self, m: RaftMessage) -> Result<()> { let ch = &mut self.ch; filter_send(&self.filters, m, |m| ch.send(m)) @@ -220,52 +217,49 @@ impl Transport for SimulateTransport { } } -impl> StoreRouter for SimulateTransport { - fn send(&self, msg: StoreMsg) -> Result<()> { +impl> StoreRouter for SimulateTransport { + fn send(&self, msg: StoreMsg) -> Result<()> { StoreRouter::send(&self.ch, msg) } } -impl> ProposalRouter<::Snapshot> - for SimulateTransport -{ +impl> ProposalRouter for SimulateTransport { fn send( &self, - cmd: RaftCommand<::Snapshot>, - ) -> std::result::Result<(), TrySendError::Snapshot>>> { - ProposalRouter::<::Snapshot>::send(&self.ch, cmd) + cmd: RaftCommand, + ) -> std::result::Result<(), TrySendError>> { + ProposalRouter::::send(&self.ch, cmd) } } -impl> CasualRouter for SimulateTransport { - 
fn send(&self, region_id: u64, msg: CasualMessage) -> Result<()> { - CasualRouter::::send(&self.ch, region_id, msg) +impl> CasualRouter for SimulateTransport { + fn send(&self, region_id: u64, msg: CasualMessage) -> Result<()> { + CasualRouter::::send(&self.ch, region_id, msg) } } -impl> SignificantRouter for SimulateTransport { - fn significant_send(&self, region_id: u64, msg: SignificantMsg) -> Result<()> { +impl> SignificantRouter for SimulateTransport { + fn significant_send(&self, region_id: u64, msg: SignificantMsg) -> Result<()> { self.ch.significant_send(region_id, msg) } } -impl> RaftStoreRouter for SimulateTransport { +impl> RaftStoreRouter for SimulateTransport { fn send_raft_msg(&self, msg: RaftMessage) -> Result<()> { filter_send(&self.filters, msg, |m| self.ch.send_raft_msg(m)) } - fn broadcast_normal(&self, _: impl FnMut() -> PeerMsg) {} + fn broadcast_normal(&self, _: impl FnMut() -> PeerMsg) {} } -impl> LocalReadRouter for SimulateTransport { +impl> LocalReadRouter for SimulateTransport { fn read( &mut self, - snap_ctx: Option, - read_id: Option, + ctx: ReadContext, req: RaftCmdRequest, - cb: Callback, + cb: Callback, ) -> RaftStoreResult<()> { - self.ch.read(snap_ctx, read_id, req, cb) + self.ch.read(ctx, req, cb) } fn release_snapshot_cache(&mut self) { diff --git a/components/test_raftstore/src/util.rs b/components/test_raftstore/src/util.rs index 82e774c7915..f6f513b767b 100644 --- a/components/test_raftstore/src/util.rs +++ b/components/test_raftstore/src/util.rs @@ -4,10 +4,7 @@ use std::{ fmt::Write, path::Path, str::FromStr, - sync::{ - mpsc::{self}, - Arc, Mutex, - }, + sync::{mpsc, Arc, Mutex}, thread, time::Duration, }; @@ -16,9 +13,7 @@ use collections::HashMap; use encryption_export::{ data_key_manager_from_config, DataKeyManager, FileConfig, MasterKeyConfig, }; -use engine_rocks::{ - config::BlobRunMode, RocksCompactedEvent, RocksEngine, RocksSnapshot, RocksStatistics, -}; +use engine_rocks::{config::BlobRunMode, RocksEngine, RocksSnapshot, RocksStatistics}; use engine_test::raft::RaftTestEngine; use engine_traits::{ CfName, CfNamesExt, Engines, Iterable, KvEngine, Peekable, RaftEngineDebug, RaftEngineReadOnly, @@ -29,6 +24,7 @@ use file_system::IoRateLimiter; use futures::{executor::block_on, future::BoxFuture, StreamExt}; use grpcio::{ChannelBuilder, Environment}; use hybrid_engine::HybridEngine; +use in_memory_engine::RegionCacheMemoryEngine; use kvproto::{ encryptionpb::EncryptionMethod, kvrpcpb::{PrewriteRequestPessimisticAction::*, *}, @@ -50,8 +46,7 @@ use raftstore::{ RaftRouterCompactedEventSender, Result, }; use rand::{seq::SliceRandom, RngCore}; -use range_cache_memory_engine::{RangeCacheEngineContext, RangeCacheMemoryEngine}; -use server::common::{ConfiguredRaftEngine, KvEngineBuilder}; +use server::common::ConfiguredRaftEngine; use tempfile::TempDir; use test_pd_client::TestPdClient; use test_util::eventually; @@ -74,9 +69,9 @@ use tikv_util::{ }; use txn_types::Key; -use crate::{Cluster, Config, KvEngineWithRocks, RawEngine, ServerCluster, Simulator}; +use crate::{Cluster, Config, RawEngine, ServerCluster, Simulator}; -pub type HybridEngineImpl = HybridEngine; +pub type HybridEngineImpl = HybridEngine; pub fn must_get( engine: &impl RawEngine, @@ -324,6 +319,12 @@ pub fn new_region_leader_cmd() -> StatusRequest { cmd } +pub fn new_compute_hash_request(region_id: u64, epoch: &RegionEpoch) -> RaftCmdRequest { + let mut admin = AdminRequest::default(); + admin.set_cmd_type(AdminCmdType::ComputeHash); + new_admin_request(region_id, epoch, admin) +} 
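Editor's note on the new_compute_hash_request helper added above: it packages a ComputeHash admin command in the same shape as the other request constructors in this file. A minimal usage sketch follows; the cluster value and the timeout are assumptions from typical test_raftstore tests, not part of this change, while get_region and call_command_on_leader are the existing harness helpers:

    use std::time::Duration;

    // Sketch: ask the leader to compute the consistency-check hash for the
    // region covering the empty key, then check the response for errors.
    let region = cluster.get_region(b"");
    let req = new_compute_hash_request(region.get_id(), region.get_region_epoch());
    let resp = cluster
        .call_command_on_leader(req, Duration::from_secs(3))
        .unwrap();
    assert!(!resp.get_header().has_error(), "{:?}", resp);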
+ pub fn new_admin_request( region_id: u64, epoch: &RegionEpoch, @@ -431,17 +432,17 @@ pub fn check_raft_cmd_request(cmd: &RaftCmdRequest) -> bool { pub fn make_cb_rocks( cmd: &RaftCmdRequest, ) -> (Callback, future::Receiver) { - make_cb::(cmd) + make_cb(cmd) } -pub fn make_cb( +pub fn make_cb( cmd: &RaftCmdRequest, -) -> (Callback, future::Receiver) { +) -> (Callback, future::Receiver) { let is_read = check_raft_cmd_request(cmd); let (tx, rx) = future::bounded(1, future::WakePolicy::Immediately); let mut detector = CallbackLeakDetector::default(); let cb = if is_read { - Callback::read(Box::new(move |resp: ReadResponse| { + Callback::read(Box::new(move |resp: ReadResponse| { detector.called = true; // we don't care error actually. let _ = tx.send(resp.response); @@ -460,8 +461,8 @@ pub fn make_cb_ext( cmd: &RaftCmdRequest, proposed: Option, committed: Option, -) -> (Callback, future::Receiver) { - let (cb, receiver) = make_cb::(cmd); +) -> (Callback, future::Receiver) { + let (cb, receiver) = make_cb(cmd); if let Callback::Write { cb, .. } = cb { (Callback::write_ext(cb, proposed, committed), receiver) } else { @@ -470,8 +471,8 @@ pub fn make_cb_ext( } // Issue a read request on the specified peer. -pub fn read_on_peer>( - cluster: &mut Cluster, +pub fn read_on_peer( + cluster: &mut Cluster, peer: metapb::Peer, region: metapb::Region, key: &[u8], @@ -485,11 +486,11 @@ pub fn read_on_peer>( read_quorum, ); request.mut_header().set_peer(peer); - cluster.read(None, None, request, timeout) + cluster.read(None, request, timeout) } -pub fn async_read_on_peer>( - cluster: &mut Cluster, +pub fn async_read_on_peer( + cluster: &mut Cluster, peer: metapb::Peer, region: metapb::Region, key: &[u8], @@ -507,20 +508,17 @@ pub fn async_read_on_peer>( request.mut_header().set_replica_read(replica_read); let (tx, mut rx) = future::bounded(1, future::WakePolicy::Immediately); let cb = Callback::read(Box::new(move |resp| drop(tx.send(resp.response)))); - cluster - .sim - .wl() - .async_read(None, node_id, None, request, cb); + cluster.sim.wl().async_read(node_id, None, request, cb); Box::pin(async move { let fut = rx.next(); fut.await.unwrap() }) } -pub fn batch_read_on_peer>( - cluster: &mut Cluster, +pub fn batch_read_on_peer( + cluster: &mut Cluster, requests: &[(metapb::Peer, metapb::Region)], -) -> Vec> { +) -> Vec> { let batch_id = Some(ThreadReadId::new()); let (tx, rx) = mpsc::sync_channel(3); let mut results = vec![]; @@ -541,7 +539,7 @@ pub fn batch_read_on_peer>( cluster .sim .wl() - .async_read(None, node_id, batch_id.clone(), request, cb); + .async_read(node_id, batch_id.clone(), request, cb); len += 1; } while results.len() < len { @@ -551,8 +549,8 @@ pub fn batch_read_on_peer>( results.into_iter().map(|resp| resp.1).collect() } -pub fn read_index_on_peer>( - cluster: &mut Cluster, +pub fn read_index_on_peer( + cluster: &mut Cluster, peer: metapb::Peer, region: metapb::Region, read_quorum: bool, @@ -565,11 +563,11 @@ pub fn read_index_on_peer>( read_quorum, ); request.mut_header().set_peer(peer); - cluster.read(None, None, request, timeout) + cluster.read(None, request, timeout) } -pub fn async_read_index_on_peer>( - cluster: &mut Cluster, +pub fn async_read_index_on_peer( + cluster: &mut Cluster, peer: metapb::Peer, region: metapb::Region, key: &[u8], @@ -590,22 +588,19 @@ pub fn async_read_index_on_peer>( request.mut_header().set_peer(peer); let (tx, mut rx) = future::bounded(1, future::WakePolicy::Immediately); let cb = Callback::read(Box::new(move |resp| drop(tx.send(resp.response)))); - 
cluster - .sim - .wl() - .async_read(None, node_id, None, request, cb); + cluster.sim.wl().async_read(node_id, None, request, cb); Box::pin(async move { let fut = rx.next(); fut.await.unwrap() }) } -pub fn async_command_on_node>( - cluster: &mut Cluster, +pub fn async_command_on_node( + cluster: &mut Cluster, node_id: u64, request: RaftCmdRequest, ) -> BoxFuture<'static, RaftCmdResponse> { - let (cb, mut rx) = make_cb::(&request); + let (cb, mut rx) = make_cb(&request); cluster .sim .rl() @@ -627,8 +622,8 @@ pub fn must_get_value(resp: &RaftCmdResponse) -> Vec { resp.get_responses()[0].get_get().get_value().to_vec() } -pub fn must_read_on_peer>( - cluster: &mut Cluster, +pub fn must_read_on_peer( + cluster: &mut Cluster, peer: metapb::Peer, region: metapb::Region, key: &[u8], @@ -646,8 +641,8 @@ pub fn must_read_on_peer>( } } -pub fn must_error_read_on_peer>( - cluster: &mut Cluster, +pub fn must_error_read_on_peer( + cluster: &mut Cluster, peer: metapb::Peer, region: metapb::Region, key: &[u8], @@ -673,23 +668,19 @@ pub fn must_contains_error(resp: &RaftCmdResponse, msg: &str) { assert!(err_msg.contains(msg), "{:?}", resp); } -pub fn create_test_engine( +pub fn create_test_engine( // TODO: pass it in for all cases. - router: Option>, + router: Option>, limiter: Option>, - pd_client: Arc, cfg: &Config, ) -> ( - Engines, + Engines, Option>, TempDir, LazyWorker, Arc, Option>, -) -where - EK: KvEngine + KvEngineBuilder, -{ +) { let dir = test_util::temp_dir("test_cluster", cfg.prefer_mem); let mut cfg = cfg.clone(); cfg.storage.data_dir = dir.path().to_str().unwrap().to_string(); @@ -717,15 +708,8 @@ where })); } let factory = builder.build(); - let disk_engine = factory.create_shared_db(dir.path()).unwrap(); - let config = Arc::new(VersionTrack::new(cfg.tikv.range_cache_engine.clone())); - let kv_engine: EK = KvEngineBuilder::build( - RangeCacheEngineContext::new(config, pd_client), - disk_engine, - None, - None, - ); - let engines = Engines::new(kv_engine, raft_engine); + let engine = factory.create_shared_db(dir.path()).unwrap(); + let engines = Engines::new(engine, raft_engine); ( engines, key_manager, @@ -802,8 +786,8 @@ pub fn configure_for_lease_read( election_timeout } -pub fn configure_for_enable_titan>( - cluster: &mut Cluster, +pub fn configure_for_enable_titan( + cluster: &mut Cluster, min_blob_size: ReadableSize, ) { cluster.cfg.rocksdb.titan.enabled = Some(true); @@ -814,15 +798,11 @@ pub fn configure_for_enable_titan>( cluster.cfg.rocksdb.defaultcf.titan.min_gc_batch_size = ReadableSize::kb(0); } -pub fn configure_for_disable_titan>( - cluster: &mut Cluster, -) { +pub fn configure_for_disable_titan(cluster: &mut Cluster) { cluster.cfg.rocksdb.titan.enabled = Some(false); } -pub fn configure_for_encryption>( - cluster: &mut Cluster, -) { +pub fn configure_for_encryption(cluster: &mut Cluster) { let manifest_dir = Path::new(env!("CARGO_MANIFEST_DIR")); let master_key_file = manifest_dir.join("src/master-key.data"); @@ -836,8 +816,8 @@ pub fn configure_for_encryption>( } } -pub fn configure_for_causal_ts>( - cluster: &mut Cluster, +pub fn configure_for_causal_ts( + cluster: &mut Cluster, renew_interval: &str, renew_batch_min_size: u32, ) { @@ -847,24 +827,24 @@ pub fn configure_for_causal_ts>( } /// Keep putting random kvs until specified size limit is reached. 
-pub fn put_till_size>( - cluster: &mut Cluster, +pub fn put_till_size( + cluster: &mut Cluster, limit: u64, range: &mut dyn Iterator, ) -> Vec { put_cf_till_size(cluster, CF_DEFAULT, limit, range) } -pub fn put_till_count>( - cluster: &mut Cluster, +pub fn put_till_count( + cluster: &mut Cluster, limit: u64, range: &mut dyn Iterator, ) -> Vec { put_cf_till_count(cluster, CF_WRITE, limit, range) } -pub fn put_cf_till_size>( - cluster: &mut Cluster, +pub fn put_cf_till_size( + cluster: &mut Cluster, cf: &'static str, limit: u64, range: &mut dyn Iterator, @@ -895,8 +875,8 @@ pub fn put_cf_till_size>( key.into_bytes() } -pub fn put_cf_till_count>( - cluster: &mut Cluster, +pub fn put_cf_till_count( + cluster: &mut Cluster, cf: &'static str, limit: u64, range: &mut dyn Iterator, @@ -1641,11 +1621,7 @@ pub struct PeerClient { } impl PeerClient { - pub fn new( - cluster: &Cluster>, - region_id: u64, - peer: metapb::Peer, - ) -> PeerClient { + pub fn new(cluster: &Cluster, region_id: u64, peer: metapb::Peer) -> PeerClient { let cli = { let env = Arc::new(Environment::new(1)); let channel = @@ -1752,11 +1728,7 @@ pub fn peer_on_store(region: &metapb::Region, store_id: u64) -> metapb::Peer { .clone() } -pub fn wait_for_synced( - cluster: &mut Cluster>, - node_id: u64, - region_id: u64, -) { +pub fn wait_for_synced(cluster: &mut Cluster, node_id: u64, region_id: u64) { let mut storage = cluster .sim .read() @@ -1786,10 +1758,7 @@ pub fn wait_for_synced( assert!(snapshot.ext().is_max_ts_synced()); } -pub fn test_delete_range>( - cluster: &mut Cluster, - cf: CfName, -) { +pub fn test_delete_range(cluster: &mut Cluster, cf: CfName) { let data_set: Vec<_> = (1..500) .map(|i| { ( @@ -1822,8 +1791,8 @@ pub fn test_delete_range>( } } -pub fn put_with_timeout>( - cluster: &mut Cluster, +pub fn put_with_timeout( + cluster: &mut Cluster, node_id: u64, key: &[u8], value: &[u8], @@ -1840,11 +1809,7 @@ pub fn put_with_timeout>( cluster.call_command_on_node(node_id, req, timeout) } -pub fn wait_down_peers>( - cluster: &Cluster, - count: u64, - peer: Option, -) { +pub fn wait_down_peers(cluster: &Cluster, count: u64, peer: Option) { let mut peers = cluster.get_down_peers(); for _ in 1..1000 { if peers.len() == count as usize && peer.as_ref().map_or(true, |p| peers.contains_key(p)) { diff --git a/components/test_storage/Cargo.toml b/components/test_storage/Cargo.toml index c80473ee67c..198b04b67b9 100644 --- a/components/test_storage/Cargo.toml +++ b/components/test_storage/Cargo.toml @@ -28,7 +28,6 @@ engine_rocks = { workspace = true } engine_traits = { workspace = true } futures = "0.3" kvproto = { workspace = true } -pd_client = { workspace = true } raftstore = { workspace = true } test_raftstore = { workspace = true } tikv = { workspace = true } diff --git a/components/test_storage/src/assert_storage.rs b/components/test_storage/src/assert_storage.rs index d4cdbdb2698..3a641a322a2 100644 --- a/components/test_storage/src/assert_storage.rs +++ b/components/test_storage/src/assert_storage.rs @@ -1,7 +1,6 @@ // Copyright 2017 TiKV Project Authors. Licensed under Apache-2.0. 
use api_version::{ApiV1, KvFormat}; -use engine_rocks::RocksEngine as RocksDb; use kvproto::{ kvrpcpb::{Context, KeyRange, LockInfo}, metapb, @@ -45,11 +44,11 @@ impl AssertionStorage { } } -impl AssertionStorage, F> { +impl AssertionStorage { pub fn new_raft_storage_with_store_count( count: usize, key: &str, - ) -> (Cluster>, Self) { + ) -> (Cluster, Self) { let (cluster, store, ctx) = new_raft_storage_with_store_count::(count, key); let storage = Self { store, ctx }; (cluster, storage) @@ -57,7 +56,7 @@ impl AssertionStorage, F> { pub fn update_with_key_byte( &mut self, - cluster: &mut Cluster>, + cluster: &mut Cluster, key: &[u8], ) -> metapb::Region { // ensure the leader of range which contains current key has been elected @@ -80,7 +79,7 @@ impl AssertionStorage, F> { pub fn delete_ok_for_cluster( &mut self, - cluster: &mut Cluster>, + cluster: &mut Cluster, key: &[u8], start_ts: impl Into, commit_ts: impl Into, @@ -99,7 +98,7 @@ impl AssertionStorage, F> { fn get_from_cluster( &mut self, - cluster: &mut Cluster>, + cluster: &mut Cluster, key: &[u8], ts: impl Into, ) -> Option { @@ -117,7 +116,7 @@ impl AssertionStorage, F> { pub fn get_none_from_cluster( &mut self, - cluster: &mut Cluster>, + cluster: &mut Cluster, key: &[u8], ts: impl Into, ) { @@ -126,7 +125,7 @@ impl AssertionStorage, F> { pub fn put_ok_for_cluster( &mut self, - cluster: &mut Cluster>, + cluster: &mut Cluster, key: &[u8], value: &[u8], start_ts: impl Into, @@ -139,7 +138,7 @@ impl AssertionStorage, F> { pub fn batch_put_ok_for_cluster<'a>( &mut self, - cluster: &mut Cluster>, + cluster: &mut Cluster, keys: &[impl AsRef<[u8]>], vals: impl Iterator, start_ts: impl Into, @@ -163,7 +162,7 @@ impl AssertionStorage, F> { fn two_pc_ok_for_cluster( &mut self, - cluster: &mut Cluster>, + cluster: &mut Cluster, prewrite_mutations: Vec, key: &[u8], commit_keys: Vec, @@ -207,7 +206,7 @@ impl AssertionStorage, F> { pub fn gc_ok_for_cluster( &mut self, - cluster: &mut Cluster>, + cluster: &mut Cluster, region_key: &[u8], mut region: metapb::Region, safe_point: impl Into, @@ -226,7 +225,7 @@ impl AssertionStorage, F> { pub fn test_txn_store_gc3_for_cluster( &mut self, - cluster: &mut Cluster>, + cluster: &mut Cluster, key_prefix: u8, ) { let key_len = 10_000; diff --git a/components/test_storage/src/sync_storage.rs b/components/test_storage/src/sync_storage.rs index ea023019b75..78916e2ea3b 100644 --- a/components/test_storage/src/sync_storage.rs +++ b/components/test_storage/src/sync_storage.rs @@ -13,7 +13,9 @@ use kvproto::{ kvrpcpb::{ChecksumAlgorithm, Context, GetRequest, KeyRange, LockInfo, RawGetRequest}, metapb, }; -use raftstore::coprocessor::{region_info_accessor::MockRegionInfoProvider, RegionInfoProvider}; +use raftstore::coprocessor::{ + region_info_accessor::MockRegionInfoProvider, CoprocessorHost, RegionInfoProvider, +}; use tikv::{ server::gc_worker::{AutoGcConfig, GcConfig, GcSafePointProvider, GcWorker}, storage::{ @@ -126,7 +128,8 @@ impl SyncTestStorage { Default::default(), Arc::new(MockRegionInfoProvider::new(Vec::new())), ); - gc_worker.start(store_id)?; + let coprocessor = CoprocessorHost::default(); + gc_worker.start(store_id, coprocessor)?; Ok(Self { gc_worker, store: storage, diff --git a/components/test_storage/src/util.rs b/components/test_storage/src/util.rs index 54f82375afe..e91125ba001 100644 --- a/components/test_storage/src/util.rs +++ b/components/test_storage/src/util.rs @@ -1,7 +1,6 @@ // Copyright 2017 TiKV Project Authors. Licensed under Apache-2.0. 
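Editor's note on the sync_storage.rs hunk above: GcWorker::start gained a CoprocessorHost parameter (presumably so the worker can register observers on it; the motivation is outside this diff), and tests that need no observers can pass an empty default host. A minimal sketch of the new call shape, assuming gc_worker and store_id from the surrounding test setup:

    use raftstore::coprocessor::CoprocessorHost;

    // An empty host is enough for tests; production code passes the real one.
    let coprocessor = CoprocessorHost::default();
    gc_worker.start(store_id, coprocessor)?;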
use api_version::KvFormat; -use engine_rocks::RocksEngine; use kvproto::kvrpcpb::Context; use test_raftstore::{new_server_cluster, Cluster, ServerCluster, SimulateEngine}; use tikv_util::HandyRwLock; @@ -56,11 +55,7 @@ macro_rules! follower_raft_engine { pub fn new_raft_engine( count: usize, key: &str, -) -> ( - Cluster>, - SimulateEngine, - Context, -) { +) -> (Cluster, SimulateEngine, Context) { let mut cluster = new_server_cluster(0, count); let (engine, ctx) = prepare_raft_engine!(cluster, key); (cluster, engine, ctx) @@ -70,8 +65,8 @@ pub fn new_raft_storage_with_store_count( count: usize, key: &str, ) -> ( - Cluster>, - SyncTestStorage, F>, + Cluster, + SyncTestStorage, Context, ) { let (cluster, engine, ctx) = new_raft_engine(count, key); diff --git a/components/test_util/data/.gitignore b/components/test_util/data/.gitignore new file mode 100644 index 00000000000..0773e460402 --- /dev/null +++ b/components/test_util/data/.gitignore @@ -0,0 +1,2 @@ +ca.key +server.csr diff --git a/components/test_util/data/ca.pem b/components/test_util/data/ca.pem index e130a8eece9..05015192501 100644 --- a/components/test_util/data/ca.pem +++ b/components/test_util/data/ca.pem @@ -1,22 +1,19 @@ -----BEGIN CERTIFICATE----- -MIIDojCCAoqgAwIBAgIUdZFW8VQoZZzek8cA+5GGu6ZInjowDQYJKoZIhvcNAQEL -BQAwVzELMAkGA1UEBhMCQ04xEDAOBgNVBAgTB0JlaWppbmcxEDAOBgNVBAcTB0Jl -aWppbmcxEDAOBgNVBAoTB1BpbmdDQVAxEjAQBgNVBAMTCU15IG93biBDQTAeFw0x -OTA5MDIwNjEyMDBaFw0yNDA4MzEwNjEyMDBaMFcxCzAJBgNVBAYTAkNOMRAwDgYD -VQQIEwdCZWlqaW5nMRAwDgYDVQQHEwdCZWlqaW5nMRAwDgYDVQQKEwdQaW5nQ0FQ -MRIwEAYDVQQDEwlNeSBvd24gQ0EwggEiMA0GCSqGSIb3DQEBAQUAA4IBDwAwggEK -AoIBAQDcDtQ7UX+xlVY0vpklp1uUmPoFsN0U6fqRzHU+LvYS5AM5RPJMVLiKBiSi -zGsB+XPmXZ8H7rZZ+osZsEmDIF3HdyiSNpPNzRJKxsz4KVRzfoKZXL9D41TpuE27 -+7tN6qGytYrnAy8cHMA0S1TnQ0biOFTcXZrwh5lvlIcx7ceUamGuEl94tblxSSJl -2SkpHkKIDv0kcgoGmmh4y8SzAtmnwcCjkCSoITvvwKklp5830pFKOnpN9uZJzkXa -tuUSpSji/JG79nQfH91LtL7xMprORVtg9YAa3aJm0Uf33WFvaCTSrt//7CVK8nqK -xayS3u7dNH3GV9b81OGtlR76leFlAgMBAAGjZjBkMA4GA1UdDwEB/wQEAwIBBjAS -BgNVHRMBAf8ECDAGAQH/AgECMB0GA1UdDgQWBBS3hxTaN9B7eF8xr0DKLZ3b5vFn -rDAfBgNVHSMEGDAWgBS3hxTaN9B7eF8xr0DKLZ3b5vFnrDANBgkqhkiG9w0BAQsF -AAOCAQEAi9WiEvTQQjmb7ekXHf1tKwdLNu5akQXIwTKeZSWRSeMgqVQcoyTZMPBX -ythl6K3175RUIMtCwO4uZTOpRU1mTl0pIjoEcJGHYX91zyA5BjWahXZttvt7/hyX -UwJN9clBXLfZTCp1ysLCtarLcip4WxWNsxEwXFUisE2gbu3F9ELHAbRSVUe/CwC6 -8BkY+G+fovazjGoTV4NadJVFRzTR/zsWkBNllBOBTrop8FH23ePVh3hXafzJlcip -bDbRxNqSzNtLr88mwswklgiIHXF6PY2TkyscsXVkHPAswZnrv4lLov7M3VjL8ITA -uYm4Me5Tmj+6pb+Foky15+ehmicQbA== +MIIDITCCAgmgAwIBAgIUVe4Q3uw8yW0seqG9yQMfXrSXLHswDQYJKoZIhvcNAQEL +BQAwFzEVMBMGA1UEAwwMdGlrdl90ZXN0X2NhMCAXDTI0MDMyNzAwMDAwMFoYDzIx +MjQwMzI3MDAwMDAwWjAXMRUwEwYDVQQDDAx0aWt2X3Rlc3RfY2EwggEiMA0GCSqG +SIb3DQEBAQUAA4IBDwAwggEKAoIBAQC+cR9jZ0LtX4ztcupCEyrR8CNmw1TkIsOQ +rPhP43FkdggQN2vqkM9ZtKxlcODuNul748saEFoK1AGv4MgxgKcbt6sTucdz4oC4 +O1rM31eicU630PblPNU8Bstvlta2jCZAbERBhiAm1C3zQncodyVr9Oa2Ff9SRXcW +7icpv4CTsesPi19nF+EhBAuCifeI3Vj1Uvd5wvsK/m0D0gpp3Vp7CNYwHLv9gfPu +Jui0Q8NM5ENBcIfUBK8zOvr8a5glqV36KUA4m7yDXyYHIR2SrD/y3XDc6cbRgxKS +qbQMbc67H2XJHWjHgp7gv9rdU9HGxfv49j+TnxwYNPb3aflBgk1JAgMBAAGjYzBh +MB0GA1UdDgQWBBS+Qw4MGLTrjFTO2xlGwlj+yy1o7zAfBgNVHSMEGDAWgBS+Qw4M +GLTrjFTO2xlGwlj+yy1o7zASBgNVHRMBAf8ECDAGAQH/AgEAMAsGA1UdDwQEAwIB +BjANBgkqhkiG9w0BAQsFAAOCAQEAezNpYWbkxvIv/MfWxpbF+TBA1ssWT1xyuwUu +P9EdXj72XKnpkmZXxysyCSZR1ZH9XwuqHgQkegxQMKMeiv5UQLouTFEa5LUJxlQw +A3O1Ky1r6dv6p/JkOAbMxh+VoWAFCW5Ioo81rwZLGu3DS3+gsauDBpevIqLlL29H +FAQ+JO33pzuAP+PBHnO0Zi3ddvgDNULpHQhC8BUR9fI/NsxKuS4QwPGK1fnd/Qvg 
+w5aP0PZ8CNheIvVy4qkeUsh2kS7vghMpwa/KkCGurmwg2C6sgCTDKJVgkTOEutx4 +5LfVfuwAmWWnrqWwLtAyJI5SAUhXLtSu2DeyC5ppP5DFRBP3qw== -----END CERTIFICATE----- diff --git a/components/test_util/data/generate_certs.sh b/components/test_util/data/generate_certs.sh new file mode 100755 index 00000000000..08156aa5e68 --- /dev/null +++ b/components/test_util/data/generate_certs.sh @@ -0,0 +1,40 @@ +#!/bin/bash +# +# Copyright 2024 TiKV Project Authors. Licensed under Apache-2.0. + +set -euo pipefail + +SCRIPT_PATH="$(realpath "$0")" +CERT_DIR="$(dirname "$SCRIPT_PATH")" +CA_KEY="$CERT_DIR/ca.key" +CA_CERT="$CERT_DIR/ca.pem" +SERVER_KEY="$CERT_DIR/key.pem" +SERVER_CSR="$CERT_DIR/server.csr" +SERVER_CERT="$CERT_DIR/server.pem" +VALID_DAYS=3650 +RSA_KEY_SIZE=2048 + +# CA certs. +openssl genrsa -out "$CA_KEY" "$RSA_KEY_SIZE" +openssl req -new -x509 -days "$VALID_DAYS" -key "$CA_KEY" -out "$CA_CERT" \ + -subj "/CN=tikv_test_ca" \ + -addext "basicConstraints = critical,CA:TRUE,pathlen:0" \ + -addext "keyUsage = cRLSign, keyCertSign" +echo "CA certificate:" +openssl x509 -text -in "$CA_CERT" -noout + +# Server certs. +openssl genrsa -out "$SERVER_KEY" "$RSA_KEY_SIZE" +openssl req -new -key "$SERVER_KEY" -out "$SERVER_CSR" \ + -extensions v3_ca \ + -subj "/CN=tikv-server" \ + -addext "basicConstraints = critical, CA:FALSE" \ + -addext "keyUsage = critical, digitalSignature, keyEncipherment" \ + -addext "extendedKeyUsage = serverAuth, clientAuth" \ + -addext "subjectAltName = IP.1:172.16.5.40, IP.2:127.0.0.1" +openssl x509 -req -days "$VALID_DAYS" \ + -CA "$CA_CERT" -CAkey "$CA_KEY" -CAcreateserial \ + -copy_extensions copyall \ + -in "$SERVER_CSR" -out "$SERVER_CERT" +echo "Server certificate:" +openssl x509 -text -in "$SERVER_CERT" -noout diff --git a/components/test_util/data/key.pem b/components/test_util/data/key.pem index c7f9fa8c340..61ab0c3f029 100644 --- a/components/test_util/data/key.pem +++ b/components/test_util/data/key.pem @@ -1,27 +1,28 @@ ------BEGIN RSA PRIVATE KEY----- -MIIEogIBAAKCAQEAsRpq/E/VC82YxsC5LlKFvI9HJuchMtKskn53anW4rNE3sfN0 -WDS6qCyxNumUVBqO98J18xxbz/XkV7aP6TcXZrNgEqw07PZWTDoyZVi+n9HXyWwl -BeiE2WWrCESqsar+cXV5UE3oE7Y4CT56tMN+awKqnf1zLyRl9DlqSg1/GabheVzz -fGhdqddqdpAZcaOHH8UMEWdnZ4qTFaaGNRlrRy3W0VjzgIocQorpvvtZkQM5iCxx -z9wuF9/6gGdopTA0J2SvZKa+oI/867NLpN5Hx+cn/ThHhCTh1N34Ulloa0aiou72 -mGgyMIdQxYAsRnG62EHn+9aPtegIjQd13Be9/wIDAQABAoIBAHJ8v3iIKxNMP10M -rSlS032HqdluRLnUExdIhe3eWBnvze9NkIKM47Vf3te+u9J6sL1dil40kO2o6YoC -TJnYsVoEzzCC/lvJCxSP8pAthF1QjAx7yps9KtRWsu/PZAEipwW1iUzub/5+J09i -gnRkhE6tFJq5g0KQZxAwJZPlkaqEcZIOObfh9zD9hutvCPmXBtB600EbQU4XzyjP -KaU08LtNZVm4mhKMuhXuFt8LBkjjfuw6zNcjsvgMkyflFTLc/SgWWIpq1ALHQCsq -OiFfTPyuLy+8tGTbawvRIqiHHRd23XttPcfkdfWbNVTSBfodTOhXGFaVYbJ6EVA4 -OzVzftECgYEAz/D99wpWbjU8vye5cjKjZCY/+QnY0t76YsUmfD9+fQNBDSQnKCKj -6nO6oYFQ9RI/vPMfrNX0sqo5hKfufNBCr/MILDXR6vtcEuaqd84DgaPVPRjHef6v -paYUi0Enb3gF3LXYggTN1mz9leEW8BablTN/DLP5AAvMfM/XSkVzlIsCgYEA2gjc -mcUDL1smAvriFVmpD4IrPzaZ9kINOfFNqkp/+y7S0BZGeS5ESSodrs0CIojttp3o -9GL7QLhZ9DehJWfh2qfA5mvzKGzUeM2oapR2Ts/m3voS4ErPTm+cTBOjRe3gGSSN -4sAJ5LA071RfNjEZBSktow//WX/oWrhIyovnxt0CgYBxyge/4xlO77URSdSySEGf -MUs6pYfQRRKxb/9SaJB4KoqzfUAsN2CJkNDlRlWd9mGIrWZ89wwTpREapabdCD4l -+JFVWBJKS0ikUzOfoc3LaHLtHx0xhgxqUkrVtU62MfDLSXt0Etrs5vGRzf32Xfi/ -mdGBiw7MVqiM+FNwojbQZwKBgDly5E1P78rmhVl7qV5exYDkl2iMhnywYrPFtOUN -xDL2320csWz0l+F/S1rngYx/78KSUPMzsWgYKvuCPN+SQ5xNXzJXdzZLlqBN7/ZF -L/cMKJTP53FZxM2x8sjI09h1GPsG+quoVfL/yrLU1FF/FkyZ0QCKEooOfbaJoARe -YK+xAoGAfT0P200WsLKRl73XYJZNYQl5+h5s7Sk9J8QuPwFWqm/mGwYKTLI042jg 
-lsAym4krAR0c1CHTW3aHRimYpYbi7/kztZU1zUQgcGL+79afer3ZuFF7mGzR+I/r -yOQ2dEfmVASfl/fMh1qyExpcCaMuejaODWyILlxOwvnywHWMSCU= ------END RSA PRIVATE KEY----- +-----BEGIN PRIVATE KEY----- +MIIEvgIBADANBgkqhkiG9w0BAQEFAASCBKgwggSkAgEAAoIBAQCzMRjAbG2MVTMs +x7Rr+eeIi4jNyhvaQ8LcTB08BdgY8618blS037dB/85GxKTfZMdJZkCygbSsyiVJ +owqyucsT6IKYnZ/kDxfXXYEBIQcOYLnAEU1NNnQLPYSTth7xjnSKvg78gH+wY2LP +4z6QD64XboKa/fmVuByO/QlnZntkr4kiH6O5rJyt1Hm+OzVRYs5RumGU0Mpn3Bjd +xmGqB7+Wldbu2ECfOFFDaC/uizWVr1OD5lsdVau20ZEwZN37udRXsBc0qTp+CaA0 +sTFNCfqq9/Z3SVOGmWldvVOfF33+n9N+n1yyeZ8TlZ3JB2daKoDHDxLY0KKZtu6D +M9RlcMdJAgMBAAECggEAExOZ//pLG1KCtTS5h3k+ZfH86XOnzW/DUfKkdhxlKhrf +EE0qpWrftp4GFtCegM3YzX6eSqK2WmLW32UFIYvYC9GH/bliKURWFG526mjauHQh +sknlQeAXMI8nCxaraqnwzEh5jfYcIeoiveECaxHQRdxU+S0qof7Mw4g6tRBa+Jft +TUW1aa/m6p8s19xTKebtpCj+p1zwUdU8t4fw8eq5qFn4t1jlWaaCLgJv4OAE52IB +81LEJjG0vevVIvifsm+2t58IOqYJhEo0qMw2X0AMQDEOJZPeW1puFg4cdvh1bar6 +RDxVhX0IkDKz7w62VoGb0ZptZkO3vrG8OXN2TI+L0QKBgQC30ASVENK2l7Jo+n5m +Qxf7s4ZoJ9jjWjO8N+Mkd5/Cu+xePlv5EmcBZCDONSmAdtaZXDs9DVO8E4yFXecJ +fidQnvRhhRxrG/LEEwHNzR8lMlm5tc4wx7g2y844Qjan71O8gawUd6eZyRmVDnmk +st6DLUwyWTkwaa7VkDaGFFqjVQKBgQD5kIS0fiq1hhOzrPt6U2FCHybrNnWCyFN5 +ISYJpl1vn7YqFV2wgXwn+A9bcDi7DMK8hx1b49Xx2lwo9n7Cb5Pd0ykhdjo12hUQ +WBqiFEjInsQ84RvivyTzlrvBduVMRtWA8lxp4gFjXFf/avHzoRkM21IfU46Q9QNn +Y8rKTFJ8JQKBgGRgv6/b0QYPj0lOyen0Z8t3PUXxxLpbTvdRGcSXYvJIB4tryHQa +/Y8/675QP8q8mvKC8IKlnX2Ou2U1Y27GqpeXRmNe+qbvS0KSEqEdjA2XEnKc+u2e +k1WxNHt6hThuNK8zrRI8SZVswYCpt/oeB+9gtESmftmWTPipWW0c/mZFAoGBAIbK +pLJr9ptmmFuSUZ1nTX+OHdIq7ts9YcBpxAawQYUZqSUftyOvPCTGWV0Uxr9YjBGR +lKzd6N8hBmflgt93rlDATVXSamxNptTWEUR7WjhpcCpFl28nuEiMoEpE8mH5XDWy +MXHK7N8CsFC3LYld+I62Iqvi0HzAqR79ijkrcd21AoGBAIc+y9eKyMxFfHkz/pZH +cupJiF21Y3+cw7h2TRNvqSLvHTp3enCDo7WG0R0K74BHIvwTDcZxsxThm6KwB+y9 +WIuKQC064e5ASjdF1dfwFFlNpwphL2kebWuzIkpEVtCcGJPUuJ704R7tD3y8q4BN +aSrpjjRGIVr6mLcxXGgHJa5R +-----END PRIVATE KEY----- diff --git a/components/test_util/data/server.pem b/components/test_util/data/server.pem index 09200bd82f6..42deadba2db 100644 --- a/components/test_util/data/server.pem +++ b/components/test_util/data/server.pem @@ -1,22 +1,20 @@ -----BEGIN CERTIFICATE----- -MIIDlTCCAn2gAwIBAgIUGKdjy/Uqp64ZiwqMwpTMGP5tKT0wDQYJKoZIhvcNAQEL -BQAwVzELMAkGA1UEBhMCQ04xEDAOBgNVBAgTB0JlaWppbmcxEDAOBgNVBAcTB0Jl -aWppbmcxEDAOBgNVBAoTB1BpbmdDQVAxEjAQBgNVBAMTCU15IG93biBDQTAgFw0x -OTA5MDIwNjEzMDBaGA8yMTE5MDgwOTA2MTMwMFowFjEUMBIGA1UEAxMLdGlrdi1z -ZXJ2ZXIwggEiMA0GCSqGSIb3DQEBAQUAA4IBDwAwggEKAoIBAQCxGmr8T9ULzZjG -wLkuUoW8j0cm5yEy0qySfndqdbis0Tex83RYNLqoLLE26ZRUGo73wnXzHFvP9eRX -to/pNxdms2ASrDTs9lZMOjJlWL6f0dfJbCUF6ITZZasIRKqxqv5xdXlQTegTtjgJ -Pnq0w35rAqqd/XMvJGX0OWpKDX8ZpuF5XPN8aF2p12p2kBlxo4cfxQwRZ2dnipMV -poY1GWtHLdbRWPOAihxCium++1mRAzmILHHP3C4X3/qAZ2ilMDQnZK9kpr6gj/zr -s0uk3kfH5yf9OEeEJOHU3fhSWWhrRqKi7vaYaDIwh1DFgCxGcbrYQef71o+16AiN -B3XcF73/AgMBAAGjgZcwgZQwDgYDVR0PAQH/BAQDAgWgMB0GA1UdJQQWMBQGCCsG -AQUFBwMBBggrBgEFBQcDAjAMBgNVHRMBAf8EAjAAMB0GA1UdDgQWBBTw7yUYqbAv -BJw3zZctLUfUi0vyqzAfBgNVHSMEGDAWgBS3hxTaN9B7eF8xr0DKLZ3b5vFnrDAV -BgNVHREEDjAMhwSsEAUohwR/AAABMA0GCSqGSIb3DQEBCwUAA4IBAQCBljfge2fC -5X+tt1v7AkWoH5xpymEVvuIWWJmT/6FNTn1rdnIaxWCQzJbBCXjZS/75lKnwfrTB -ZK7iMv1GQaBevT/qm+7GcApsr5nFrI/MvzrvY+XRqvU8gsRhUjHYI+JPLGWxhzZD -pQdJTAGvsDLHu1VVdHR2KsE4M8ceGq58f7zPSq/suf+8SYEOFP8zfuXX1HfUrFVe -69ZQw8PZh4EYL0PYtE5BYfe9iJyFNNtZiejiribMQz/NtNkKM3M+Hm40ULGuwHXq -bKDjDq1PvmpVb/kKO/xADTIAbqproXETZ4W2keI3hwm6NxysvEbYV9+puQBXQqwT -KOt9Lo4ofSAF +MIIDVTCCAj2gAwIBAgIUUCvVn7LZjm7FD+xeAd5g1oKFSrIwDQYJKoZIhvcNAQEL +BQAwFzEVMBMGA1UEAwwMdGlrdl90ZXN0X2NhMCAXDTI0MDMyNzAwMDAwMFoYDzIx 
+MjQwMzI3MDAwMDAwWjAWMRQwEgYDVQQDDAt0aWt2LXNlcnZlcjCCASIwDQYJKoZI +hvcNAQEBBQADggEPADCCAQoCggEBALMxGMBsbYxVMyzHtGv554iLiM3KG9pDwtxM +HTwF2BjzrXxuVLTft0H/zkbEpN9kx0lmQLKBtKzKJUmjCrK5yxPogpidn+QPF9dd +gQEhBw5gucARTU02dAs9hJO2HvGOdIq+DvyAf7BjYs/jPpAPrhdugpr9+ZW4HI79 +CWdme2SviSIfo7msnK3Ueb47NVFizlG6YZTQymfcGN3GYaoHv5aV1u7YQJ84UUNo +L+6LNZWvU4PmWx1Vq7bRkTBk3fu51FewFzSpOn4JoDSxMU0J+qr39ndJU4aZaV29 +U58Xff6f036fXLJ5nxOVnckHZ1oqgMcPEtjQopm27oMz1GVwx0kCAwEAAaOBlzCB +lDAMBgNVHRMBAf8EAjAAMA4GA1UdDwEB/wQEAwIFoDAdBgNVHSUEFjAUBggrBgEF +BQcDAQYIKwYBBQUHAwIwFQYDVR0RBA4wDIcErBAFKIcEfwAAATAdBgNVHQ4EFgQU +Z4ACSrFMAFHP3iQAlZihuxlTk64wHwYDVR0jBBgwFoAUvkMODBi064xUztsZRsJY +/sstaO8wDQYJKoZIhvcNAQELBQADggEBALxhZMiXDQvUJCtSGPaCJhvs51O7Sb+8 +xrByuQrtXhcNVsOcq+0OjT/roUzD0x5mf75cTcJm6XZuzg2BPgN7wQU5GPuhOcJv +XFx4uoRDNBzN5FlxZu+ln4Qqw/M/4zsRwD3qkp/J50RpbCOmf1x/b1M6+s1uQcT5 +6sMErUXnOzvY5ey4vCJFiveYu5Z7GIHPB8xlhJtiu3T8FN1o3Us75evFk7hHfJjf +zU1Efd6W9RU/bEPAPvqnLBkSHdx7Urw0hNHlW2IDjaX1zIV5Ibeiw61olyQAiXjy +N3VJrMbuSTRv5BZxp4sKwnan4dAtfXvSSle36pYhN5UTjD72NNlLe8A= -----END CERTIFICATE----- diff --git a/components/tidb_query_aggr/src/impl_first.rs b/components/tidb_query_aggr/src/impl_first.rs index 3eb9a8e04aa..fbd23822f1f 100644 --- a/components/tidb_query_aggr/src/impl_first.rs +++ b/components/tidb_query_aggr/src/impl_first.rs @@ -62,6 +62,7 @@ impl super::AggrDefinitionParser for AggrFnDefinitionParserFirst { Bytes => BytesRef<'static>, Enum => EnumRef<'static>, Set => SetRef<'static>, + VectorFloat32 => VectorFloat32Ref<'static>, ], match eval_type { EvalType::TT => Ok(Box::new(AggrFnFirst::::new())), diff --git a/components/tidb_query_aggr/src/impl_max_min.rs b/components/tidb_query_aggr/src/impl_max_min.rs index c18710b3645..aada14e7bfa 100644 --- a/components/tidb_query_aggr/src/impl_max_min.rs +++ b/components/tidb_query_aggr/src/impl_max_min.rs @@ -92,6 +92,7 @@ impl super::AggrDefinitionParser for AggrFnDefinitionParserExtremum Decimal => &'static Decimal, DateTime => &'static DateTime, Json => JsonRef<'static>, + VectorFloat32 => VectorFloat32Ref<'static>, ], match eval_type { EvalType::T => Ok(Box::new(AggFnExtremum::::new())), @@ -183,7 +184,12 @@ where return Ok(()); } - if C::sort_compare(self.extremum.as_ref().unwrap(), value.as_ref().unwrap())? == E::ORD { + if C::sort_compare( + self.extremum.as_ref().unwrap(), + value.as_ref().unwrap(), + false, + )? == E::ORD + { self.extremum = value.map(|x| x.into_owned_value()); } Ok(()) diff --git a/components/tidb_query_aggr/src/lib.rs b/components/tidb_query_aggr/src/lib.rs index c6ddfb96d2f..8b53c8a078d 100644 --- a/components/tidb_query_aggr/src/lib.rs +++ b/components/tidb_query_aggr/src/lib.rs @@ -76,6 +76,7 @@ pub trait AggrFunctionState: + AggrFunctionStateUpdatePartial> + AggrFunctionStateUpdatePartial> + AggrFunctionStateUpdatePartial> + + AggrFunctionStateUpdatePartial> { // TODO: A better implementation is to specialize different push result targets. // However current aggregation executor cannot utilize it. diff --git a/components/tidb_query_codegen/src/rpn_function.rs b/components/tidb_query_codegen/src/rpn_function.rs index ea3017d5d02..8b414b64a63 100644 --- a/components/tidb_query_codegen/src/rpn_function.rs +++ b/components/tidb_query_codegen/src/rpn_function.rs @@ -497,7 +497,7 @@ impl RpnFnRefEvaluableType { match self { RpnFnRefEvaluableType::Ref(x) => quote! 
{ &#lifetime #x }, RpnFnRefEvaluableType::Type(x) => { - if is_json(x) || is_bytes(x) || is_enum(x) || is_set(x) { + if is_json(x) || is_bytes(x) || is_enum(x) || is_set(x) || is_vector_float32(x) { quote! { #x <#lifetime> } @@ -846,6 +846,14 @@ fn is_json(ty: &TypePath) -> bool { } } +/// Checks if parameter type is VectorFloat32 +fn is_vector_float32(ty: &TypePath) -> bool { + match ty.path.get_ident() { + Some(x) => *x == "VectorFloat32Ref" || *x == "VectorFloat32", + None => false, + } +} + /// Checks if parameter type is Bytes fn is_bytes(ty: &TypePath) -> bool { match ty.path.get_ident() { @@ -880,6 +888,8 @@ fn get_vargs_buf(ty: &TypePath) -> TokenStream { quote! { VARG_PARAM_BUF_JSON_REF } } else if *x == "BytesRef" { quote! { VARG_PARAM_BUF_BYTES_REF } + } else if *x == "VectorFloat32Ref" { + quote! { VARG_PARAM_BUF_VECTOR_FLOAT32_REF } } else { quote! { VARG_PARAM_BUF } } @@ -896,6 +906,8 @@ fn get_vectoried_type(ty: &TypePath) -> TokenStream { Some(x) => { if *x == "JsonRef" { quote! { JsonRef } + } else if *x == "VectorFloat32Ref" { + quote! { VectorFloat32Ref } } else if *x == "BytesRef" { quote! { BytesRef } } else if *x == "EnumRef" { @@ -1048,6 +1060,10 @@ impl VargsRpnFn { quote! { let arg: Option = unsafe { std::mem::transmute::, Option>>(arg) }; } + } else if is_vector_float32(arg_type) { + quote! { + let arg: Option = unsafe { std::mem::transmute::, Option>>(arg) }; + } } else if is_bytes(arg_type) { quote! { let arg: Option = unsafe { std::mem::transmute::, Option>>(arg) }; diff --git a/components/tidb_query_common/Cargo.toml b/components/tidb_query_common/Cargo.toml index ff7c0ca58a2..80bd380b9d7 100644 --- a/components/tidb_query_common/Cargo.toml +++ b/components/tidb_query_common/Cargo.toml @@ -9,7 +9,6 @@ license = "Apache-2.0" [dependencies] anyhow = "1.0" api_version = { workspace = true } -async-trait = "0.1" derive_more = "0.99.3" error_code = { workspace = true } futures = "0.3" diff --git a/components/tidb_query_datatype/Cargo.toml b/components/tidb_query_datatype/Cargo.toml index e789e8c856d..ccfb82750c0 100644 --- a/components/tidb_query_datatype/Cargo.toml +++ b/components/tidb_query_datatype/Cargo.toml @@ -13,6 +13,7 @@ bitfield = "0.13.2" bitflags = "1.0.1" boolinator = "2.4.0" bstr = "0.2.8" +bytemuck = "1.14.3" chrono = { workspace = true } chrono-tz = "0.5.1" codec = { workspace = true } @@ -42,3 +43,11 @@ tidb_query_common = { workspace = true } tikv_alloc = { workspace = true } tikv_util = { workspace = true } tipb = { workspace = true } + +[dev-dependencies] +criterion = "0.3" + +[[bench]] +name = "bench_vector_distance" +path = "benches/bench_vector_distance.rs" +harness = false diff --git a/components/tidb_query_datatype/benches/bench_vector_distance.rs b/components/tidb_query_datatype/benches/bench_vector_distance.rs new file mode 100644 index 00000000000..62cf2b2ad98 --- /dev/null +++ b/components/tidb_query_datatype/benches/bench_vector_distance.rs @@ -0,0 +1,191 @@ +// Copyright 2024 TiKV Project Authors. Licensed under Apache-2.0. 
+
+use criterion::{black_box, criterion_group, criterion_main, Criterion};
+use tidb_query_datatype::codec::mysql::VectorFloat32Ref;
+
+fn bench_l1_distance_3d(c: &mut Criterion) {
+    let va: Vec<f32> = vec![1.1, 2.2, 3.3];
+    let vb: Vec<f32> = vec![1.1, 2.2, 3.3];
+    let vec_va = VectorFloat32Ref::from_f32(va.as_slice()).unwrap();
+    let vec_vb = VectorFloat32Ref::from_f32(vb.as_slice()).unwrap();
+
+    c.bench_function("l1_distance_3d", |b| {
+        b.iter(|| {
+            black_box(black_box(vec_va).l1_distance(black_box(vec_vb)).unwrap());
+        });
+    });
+}
+
+fn bench_l1_distance_784d(c: &mut Criterion) {
+    let va: Vec<f32> = vec![1.0; 784];
+    let vb: Vec<f32> = vec![1.0; 784];
+    let vec_va = VectorFloat32Ref::from_f32(va.as_slice()).unwrap();
+    let vec_vb = VectorFloat32Ref::from_f32(vb.as_slice()).unwrap();
+
+    c.bench_function("l1_distance_784d", |b| {
+        b.iter(|| {
+            black_box(black_box(vec_va).l1_distance(black_box(vec_vb)).unwrap());
+        });
+    });
+}
+
+fn bench_l2_squared_distance_3d(c: &mut Criterion) {
+    let va: Vec<f32> = vec![1.1, 2.2, 3.3];
+    let vb: Vec<f32> = vec![1.1, 2.2, 3.3];
+    let vec_va = VectorFloat32Ref::from_f32(va.as_slice()).unwrap();
+    let vec_vb = VectorFloat32Ref::from_f32(vb.as_slice()).unwrap();
+
+    c.bench_function("l2_squared_distance_3d", |b| {
+        b.iter(|| {
+            black_box(
+                black_box(vec_va)
+                    .l2_squared_distance(black_box(vec_vb))
+                    .unwrap(),
+            );
+        });
+    });
+}
+
+fn bench_l2_squared_distance_784d(c: &mut Criterion) {
+    let va: Vec<f32> = vec![1.0; 784];
+    let vb: Vec<f32> = vec![1.0; 784];
+    let vec_va = VectorFloat32Ref::from_f32(va.as_slice()).unwrap();
+    let vec_vb = VectorFloat32Ref::from_f32(vb.as_slice()).unwrap();
+
+    c.bench_function("l2_squared_distance_784d", |b| {
+        b.iter(|| {
+            black_box(
+                black_box(vec_va)
+                    .l2_squared_distance(black_box(vec_vb))
+                    .unwrap(),
+            );
+        });
+    });
+}
+
+fn bench_l2_distance_3d(c: &mut Criterion) {
+    let va: Vec<f32> = vec![1.1, 2.2, 3.3];
+    let vb: Vec<f32> = vec![1.1, 2.2, 3.3];
+    let vec_va = VectorFloat32Ref::from_f32(va.as_slice()).unwrap();
+    let vec_vb = VectorFloat32Ref::from_f32(vb.as_slice()).unwrap();
+
+    c.bench_function("l2_distance_3d", |b| {
+        b.iter(|| {
+            black_box(black_box(vec_va).l2_distance(black_box(vec_vb)).unwrap());
+        });
+    });
+}
+
+fn bench_l2_distance_784d(c: &mut Criterion) {
+    let va: Vec<f32> = vec![1.0; 784];
+    let vb: Vec<f32> = vec![1.0; 784];
+    let vec_va = VectorFloat32Ref::from_f32(va.as_slice()).unwrap();
+    let vec_vb = VectorFloat32Ref::from_f32(vb.as_slice()).unwrap();
+
+    c.bench_function("l2_distance_784d", |b| {
+        b.iter(|| {
+            black_box(black_box(vec_va).l2_distance(black_box(vec_vb)).unwrap());
+        });
+    });
+}
+
+fn bench_inner_product_3d(c: &mut Criterion) {
+    let va: Vec<f32> = vec![1.1, 2.2, 3.3];
+    let vb: Vec<f32> = vec![1.1, 2.2, 3.3];
+    let vec_va = VectorFloat32Ref::from_f32(va.as_slice()).unwrap();
+    let vec_vb = VectorFloat32Ref::from_f32(vb.as_slice()).unwrap();
+
+    c.bench_function("inner_product_3d", |b| {
+        b.iter(|| {
+            black_box(black_box(vec_va).inner_product(black_box(vec_vb)).unwrap());
+        });
+    });
+}
+
+fn bench_inner_product_784d(c: &mut Criterion) {
+    let va: Vec<f32> = vec![1.0; 784];
+    let vb: Vec<f32> = vec![1.0; 784];
+    let vec_va = VectorFloat32Ref::from_f32(va.as_slice()).unwrap();
+    let vec_vb = VectorFloat32Ref::from_f32(vb.as_slice()).unwrap();
+
+    c.bench_function("inner_product_784d", |b| {
+        b.iter(|| {
+            black_box(black_box(vec_va).inner_product(black_box(vec_vb)).unwrap());
+        });
+    });
+}
+
+fn bench_cosine_distance_3d(c: &mut Criterion) {
+    let va: Vec<f32> = vec![1.1, 2.2, 3.3];
+    let vb: Vec<f32> = vec![1.1, 2.2, 3.3];
+    let vec_va = VectorFloat32Ref::from_f32(va.as_slice()).unwrap();
+    let vec_vb = VectorFloat32Ref::from_f32(vb.as_slice()).unwrap();
+
+    c.bench_function("cosine_distance_3d", |b| {
+        b.iter(|| {
+            black_box(
+                black_box(vec_va)
+                    .cosine_distance(black_box(vec_vb))
+                    .unwrap(),
+            );
+        });
+    });
+}
+
+fn bench_cosine_distance_784d(c: &mut Criterion) {
+    let va: Vec<f32> = vec![1.0; 784];
+    let vb: Vec<f32> = vec![1.0; 784];
+    let vec_va = VectorFloat32Ref::from_f32(va.as_slice()).unwrap();
+    let vec_vb = VectorFloat32Ref::from_f32(vb.as_slice()).unwrap();
+
+    c.bench_function("cosine_distance_784d", |b| {
+        b.iter(|| {
+            black_box(
+                black_box(vec_va)
+                    .cosine_distance(black_box(vec_vb))
+                    .unwrap(),
+            );
+        });
+    });
+}
+
+fn bench_l2_norm_3d(c: &mut Criterion) {
+    let va: Vec<f32> = vec![1.1, 2.2, 3.3];
+
+    let vec_va = VectorFloat32Ref::from_f32(va.as_slice()).unwrap();
+
+    c.bench_function("l2_norm_3d", |b| {
+        b.iter(|| {
+            black_box(black_box(vec_va).l2_norm());
+        });
+    });
+}
+
+fn bench_l2_norm_784d(c: &mut Criterion) {
+    let va: Vec<f32> = vec![1.0; 784];
+
+    let vec_va = VectorFloat32Ref::from_f32(va.as_slice()).unwrap();
+
+    c.bench_function("l2_norm_784d", |b| {
+        b.iter(|| {
+            black_box(black_box(vec_va).l2_norm());
+        });
+    });
+}
+
+criterion_group!(
+    benches,
+    bench_l1_distance_3d,
+    bench_l1_distance_784d,
+    bench_l2_squared_distance_3d,
+    bench_l2_squared_distance_784d,
+    bench_l2_distance_3d,
+    bench_l2_distance_784d,
+    bench_inner_product_3d,
+    bench_inner_product_784d,
+    bench_cosine_distance_3d,
+    bench_cosine_distance_784d,
+    bench_l2_norm_3d,
+    bench_l2_norm_784d,
+);
+criterion_main!(benches);
diff --git a/components/tidb_query_datatype/src/codec/chunk/column.rs b/components/tidb_query_datatype/src/codec/chunk/column.rs
index d308248e4eb..1068f0d7ba5 100644
--- a/components/tidb_query_datatype/src/codec/chunk/column.rs
+++ b/components/tidb_query_datatype/src/codec/chunk/column.rs
@@ -10,7 +10,7 @@ use tipb::FieldType;
 use super::{Error, Result};
 use crate::{
     codec::{
-        data_type::{ChunkRef, VectorValue},
+        data_type::{ChunkRef, VectorFloat32Ref, VectorValue},
         datum,
         datum_codec::DatumPayloadDecoder,
         mysql::{
@@ -24,6 +24,10 @@
             enums::{Enum, EnumDatumPayloadChunkEncoder, EnumDecoder, EnumEncoder, EnumRef},
             json::{Json, JsonDatumPayloadChunkEncoder, JsonDecoder, JsonEncoder, JsonRef},
             time::{Time, TimeDatumPayloadChunkEncoder, TimeDecoder, TimeEncoder},
+            vector::{
+                VectorFloat32, VectorFloat32DatumPayloadChunkEncoder, VectorFloat32Decoder,
+                VectorFloat32Encoder,
+            },
         },
         Datum,
     },
@@ -130,6 +134,11 @@ impl Column {
                     col.append_json_datum(&raw_datums[row_index])?
                 }
             }
+            EvalType::VectorFloat32 => {
+                for &row_index in logical_rows {
+                    col.append_vector_float32_datum(&raw_datums[row_index])?
+                }
+            }
             EvalType::Enum => {
                 for &row_index in logical_rows {
                     col.append_enum_datum(&raw_datums[row_index], field_type)?
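Editor's note on the benchmark file above: it measures the new VectorFloat32 distance kernels at 3 and 784 dimensions (784 is likely chosen to match a flattened 28x28 embedding). A minimal sketch of the API the benches exercise, with illustrative values; from_f32 and the distance methods come from the bench code, everything else is assumption:

    use tidb_query_datatype::codec::mysql::VectorFloat32Ref;

    let a: Vec<f32> = vec![1.0, 2.0, 3.0];
    let b: Vec<f32> = vec![4.0, 5.0, 6.0];
    let va = VectorFloat32Ref::from_f32(a.as_slice()).unwrap();
    let vb = VectorFloat32Ref::from_f32(b.as_slice()).unwrap();
    // Euclidean distance: sqrt((1-4)^2 + (2-5)^2 + (3-6)^2) = sqrt(27)
    let d = va.l2_distance(vb).unwrap();

The [[bench]] target declared in Cargo.toml above runs this suite via criterion (cargo bench -p tidb_query_datatype).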
@@ -274,6 +283,16 @@ impl Column { } } } + VectorValue::VectorFloat32(vec) => { + for &row_index in logical_rows { + match vec.get_option_ref(row_index) { + None => { + col.append_null(); + } + Some(val) => col.append_vector_float32(val)?, + } + } + } VectorValue::Enum(vec) => { for &row_index in logical_rows { match vec.get_option_ref(row_index) { @@ -315,6 +334,7 @@ impl Column { FieldTypeTp::Duration => Datum::Dur(self.get_duration(idx, field_type.decimal())?), FieldTypeTp::NewDecimal => Datum::Dec(self.get_decimal(idx)?), FieldTypeTp::Json => Datum::Json(self.get_json(idx)?), + FieldTypeTp::TiDbVectorFloat32 => Datum::VectorFloat32(self.get_vector_float32(idx)?), FieldTypeTp::Enum => Datum::Enum(self.get_enum(idx)?), FieldTypeTp::Bit => Datum::Bytes(self.get_bytes(idx).to_vec()), FieldTypeTp::Set => { @@ -901,6 +921,39 @@ impl Column { data.read_json() } + pub fn append_vector_float32(&mut self, v: VectorFloat32Ref<'_>) -> Result<()> { + self.data.write_vector_float32(v)?; + self.finished_append_var(); + Ok(()) + } + + pub fn append_vector_float32_datum(&mut self, src_datum: &[u8]) -> Result<()> { + let raw_datum = &src_datum[1..]; + let flag = src_datum[0]; + match flag { + datum::NIL_FLAG => self.append_null(), + datum::VECTOR_FLOAT32_FLAG => { + self.data + .write_vector_float32_to_chunk_by_datum_payload(raw_datum)?; + self.finished_append_var(); + } + _ => { + return Err(Error::InvalidDataType(format!( + "Unsupported datum flag {} for VectorFloat32 vector", + flag + ))); + } + } + Ok(()) + } + + pub fn get_vector_float32(&self, idx: usize) -> Result { + let start = self.var_offsets[idx]; + let end = self.var_offsets[idx + 1]; + let mut data = &self.data[start..end]; + data.read_vector_float32() + } + // Append an Enum datum to the column #[inline] pub fn append_enum(&mut self, e: EnumRef<'_>) -> Result<()> { diff --git a/components/tidb_query_datatype/src/codec/collation/charset.rs b/components/tidb_query_datatype/src/codec/collation/charset.rs index 9ea76f16b92..2781273ae7c 100644 --- a/components/tidb_query_datatype/src/codec/collation/charset.rs +++ b/components/tidb_query_datatype/src/codec/collation/charset.rs @@ -60,3 +60,6 @@ impl Charset for CharsetUtf8mb4 { // gbk character data actually stored with utf8mb4 character encoding. pub type CharsetGbk = CharsetUtf8mb4; + +// gb18030 character data actually stored with utf8mb4 character encoding. 
+pub type CharsetGb18030 = CharsetUtf8mb4;
diff --git a/components/tidb_query_datatype/src/codec/collation/collator/binary.rs b/components/tidb_query_datatype/src/codec/collation/collator/binary.rs
index 6f183a215c7..653d0c415cd 100644
--- a/components/tidb_query_datatype/src/codec/collation/collator/binary.rs
+++ b/components/tidb_query_datatype/src/codec/collation/collator/binary.rs
@@ -24,7 +24,7 @@ impl Collator for CollatorBinary {
     }
 
     #[inline]
-    fn sort_compare(a: &[u8], b: &[u8]) -> Result<Ordering> {
+    fn sort_compare(a: &[u8], b: &[u8], _force_no_pad: bool) -> Result<Ordering> {
         Ok(a.cmp(b))
     }
 
diff --git a/components/tidb_query_datatype/src/codec/collation/collator/gb18030_bin.data b/components/tidb_query_datatype/src/codec/collation/collator/gb18030_bin.data
new file mode 100644
index 00000000000..172e21ee596
Binary files /dev/null and b/components/tidb_query_datatype/src/codec/collation/collator/gb18030_bin.data differ
diff --git a/components/tidb_query_datatype/src/codec/collation/collator/gb18030_chinese_ci.data b/components/tidb_query_datatype/src/codec/collation/collator/gb18030_chinese_ci.data
new file mode 100644
index 00000000000..90a8971c213
Binary files /dev/null and b/components/tidb_query_datatype/src/codec/collation/collator/gb18030_chinese_ci.data differ
diff --git a/components/tidb_query_datatype/src/codec/collation/collator/gb18030_collation.rs b/components/tidb_query_datatype/src/codec/collation/collator/gb18030_collation.rs
new file mode 100644
index 00000000000..ff21559ded6
--- /dev/null
+++ b/components/tidb_query_datatype/src/codec/collation/collator/gb18030_collation.rs
@@ -0,0 +1,181 @@
+// Copyright 2024 TiKV Project Authors. Licensed under Apache-2.0.
+
+use super::*;
+
+/// Collator for `gb18030_bin`
+#[derive(Debug)]
+pub struct CollatorGb18030Bin;
+
+impl Collator for CollatorGb18030Bin {
+    type Charset = CharsetGb18030;
+    type Weight = u32;
+    const IS_CASE_INSENSITIVE: bool = false;
+
+    #[inline]
+    fn char_weight(ch: char) -> u32 {
+        // If the incoming character is not a valid Unicode code point, convert
+        // it to '?'. This should not happen.
+        let r = ch as usize;
+        if r > 0x10FFFF {
+            return '?' as u32;
+        }
+
+        (&GB18030_BIN_TABLE[r * 4..r * 4 + 4])
+            .read_u32_le()
+            .unwrap()
+    }
+
+    #[inline]
+    fn write_sort_key<W: BufferWriter>(writer: &mut W, bstr: &[u8]) -> Result<usize> {
+        let s = str::from_utf8(bstr)?.trim_end_matches(PADDING_SPACE);
+        let mut n = 0;
+        for ch in s.chars() {
+            let weight = Self::char_weight(ch);
+            if weight > 0xFFFF {
+                writer.write_u32_be(weight)?;
+                n += 4;
+            } else if weight > 0xFF {
+                writer.write_u16_be(weight as u16)?;
+                n += 2;
+            } else {
+                writer.write_u8(weight as u8)?;
+                n += 1;
+            }
+        }
+        Ok(n * std::mem::size_of::<u8>())
+    }
+
+    #[inline]
+    fn sort_compare(a: &[u8], b: &[u8], force_no_pad: bool) -> Result<Ordering> {
+        let mut sa = str::from_utf8(a)?;
+        let mut sb = str::from_utf8(b)?;
+        if !force_no_pad {
+            sa = sa.trim_end_matches(PADDING_SPACE);
+            sb = sb.trim_end_matches(PADDING_SPACE);
+        }
+        Ok(sa
+            .chars()
+            .map(Self::char_weight)
+            .cmp(sb.chars().map(Self::char_weight)))
+    }
+
+    #[inline]
+    fn sort_hash<H: Hasher>(state: &mut H, bstr: &[u8]) -> Result<()> {
+        let s = str::from_utf8(bstr)?.trim_end_matches(PADDING_SPACE);
+        for ch in s.chars().map(Self::char_weight) {
+            ch.hash(state);
+        }
+        Ok(())
+    }
+}
+
+/// Collator for `gb18030_chinese_ci`
+#[derive(Debug)]
+pub struct CollatorGb18030ChineseCi;
+
+impl Collator for CollatorGb18030ChineseCi {
+    type Charset = CharsetGb18030;
+    type Weight = u32;
+    const IS_CASE_INSENSITIVE: bool = true;
+
+    #[inline]
+    fn char_weight(ch: char) -> u32 {
+        // If the incoming character is not a valid Unicode code point, convert
+        // it to '?'. This should not happen.
+        let r = ch as usize;
+        if r > 0x10FFFF {
+            return '?' as u32;
+        }
+
+        (&GB18030_CHINESE_CI_TABLE[r * 4..r * 4 + 4])
+            .read_u32_le()
+            .unwrap()
+    }
+
+    #[inline]
+    fn write_sort_key<W: BufferWriter>(writer: &mut W, bstr: &[u8]) -> Result<usize> {
+        let s = str::from_utf8(bstr)?.trim_end_matches(PADDING_SPACE);
+        let mut n = 0;
+        for ch in s.chars() {
+            let weight = Self::char_weight(ch);
+            if weight > 0xFFFF {
+                writer.write_u32_be(weight)?;
+                n += 4;
+            } else if weight > 0xFF {
+                writer.write_u16_be(weight as u16)?;
+                n += 2;
+            } else {
+                writer.write_u8(weight as u8)?;
+                n += 1;
+            }
+        }
+        Ok(n * std::mem::size_of::<u8>())
+    }
+
+    #[inline]
+    fn sort_compare(a: &[u8], b: &[u8], force_no_pad: bool) -> Result<Ordering> {
+        let mut sa = str::from_utf8(a)?;
+        let mut sb = str::from_utf8(b)?;
+        if !force_no_pad {
+            sa = sa.trim_end_matches(PADDING_SPACE);
+            sb = sb.trim_end_matches(PADDING_SPACE);
+        }
+        Ok(sa
+            .chars()
+            .map(Self::char_weight)
+            .cmp(sb.chars().map(Self::char_weight)))
+    }
+
+    #[inline]
+    fn sort_hash<H: Hasher>(state: &mut H, bstr: &[u8]) -> Result<()> {
+        let s = str::from_utf8(bstr)?.trim_end_matches(PADDING_SPACE);
+        for ch in s.chars().map(Self::char_weight) {
+            ch.hash(state);
+        }
+        Ok(())
+    }
+}
+
+const TABLE_SIZE_FOR_GB18030: usize = 4 * (0x10FFFF + 1);
+
+// GB18030_BIN_TABLE are the encoding tables from Unicode to GB18030 code.
+const GB18030_BIN_TABLE: &[u8; TABLE_SIZE_FOR_GB18030] = include_bytes!("gb18030_bin.data");
+
+// GB18030_CHINESE_CI_TABLE are the sort key tables for GB18030 codepoint.
+const GB18030_CHINESE_CI_TABLE: &[u8; TABLE_SIZE_FOR_GB18030] = + include_bytes!("gb18030_chinese_ci.data"); + +#[cfg(test)] +mod tests { + use crate::codec::collation::{ + collator::{CollatorGb18030Bin, CollatorGb18030ChineseCi}, + Collator, + }; + + #[test] + fn test_weight() { + let cases: Vec<(char, u32, u32)> = vec![ + ('中', 0xFFA09BC1, 0xD6D0), + ('€', 0xA2E3, 0xA2E3), + ('', 0xFF001D21, 0x8135F437), + ('ḿ', 0xFF001D20, 0xA8BC), + ('ǹ', 0xFF000154, 0xA8BF), + ('䦃', 0xFFA09E8A, 0xFE89), + ]; + + for (case, exp_chinese_ci, exp_bin) in cases { + let chinese_ci = CollatorGb18030ChineseCi::char_weight(case); + let bin = CollatorGb18030Bin::char_weight(case); + assert_eq!( + exp_bin, bin, + "{} expected:{:02X?}, but got:{:02X?}", + case, exp_bin, bin + ); + assert_eq!( + exp_chinese_ci, chinese_ci, + "{} expected:{:02X?}, but got:{:02X?}", + case, exp_chinese_ci, chinese_ci + ); + } + } +} diff --git a/components/tidb_query_datatype/src/codec/collation/collator/gbk_collation.rs b/components/tidb_query_datatype/src/codec/collation/collator/gbk_collation.rs index 31685ca08d5..fe71e8e1713 100644 --- a/components/tidb_query_datatype/src/codec/collation/collator/gbk_collation.rs +++ b/components/tidb_query_datatype/src/codec/collation/collator/gbk_collation.rs @@ -2,9 +2,9 @@ use super::*; -pub trait GbkCollator: 'static + std::marker::Send + std::marker::Sync + std::fmt::Debug { +trait GbkCollator: 'static + Send + Sync + std::fmt::Debug { const IS_CASE_INSENSITIVE: bool; - const WEIGHT_TABLE: &'static [u8; (0xffff + 1) * 2]; + const WEIGHT_TABLE: &'static [u8; TABLE_SIZE_FOR_GBK]; } impl Collator for T { @@ -43,9 +43,13 @@ impl Collator for T { } #[inline] - fn sort_compare(a: &[u8], b: &[u8]) -> Result { - let sa = str::from_utf8(a)?.trim_end_matches(PADDING_SPACE); - let sb = str::from_utf8(b)?.trim_end_matches(PADDING_SPACE); + fn sort_compare(a: &[u8], b: &[u8], force_no_pad: bool) -> Result { + let mut sa = str::from_utf8(a)?; + let mut sb = str::from_utf8(b)?; + if !force_no_pad { + sa = sa.trim_end_matches(PADDING_SPACE); + sb = sb.trim_end_matches(PADDING_SPACE); + } Ok(sa .chars() .map(Self::char_weight) @@ -68,7 +72,7 @@ pub struct CollatorGbkBin; impl GbkCollator for CollatorGbkBin { const IS_CASE_INSENSITIVE: bool = false; - const WEIGHT_TABLE: &'static [u8; (0xffff + 1) * 2] = GBK_BIN_TABLE; + const WEIGHT_TABLE: &'static [u8; TABLE_SIZE_FOR_GBK] = GBK_BIN_TABLE; } /// Collator for `gbk_chinese_ci` collation with padding behavior (trims right @@ -78,15 +82,17 @@ pub struct CollatorGbkChineseCi; impl GbkCollator for CollatorGbkChineseCi { const IS_CASE_INSENSITIVE: bool = true; - const WEIGHT_TABLE: &'static [u8; (0xffff + 1) * 2] = GBK_CHINESE_CI_TABLE; + const WEIGHT_TABLE: &'static [u8; TABLE_SIZE_FOR_GBK] = GBK_CHINESE_CI_TABLE; } +const TABLE_SIZE_FOR_GBK: usize = (0xffff + 1) * 2; + // GBK_BIN_TABLE are the encoding tables from Unicode to GBK code, it is totally // the same with golang's GBK encoding. If there is no mapping code in GBK, use // 0x3F(?) instead. It should not happened. -const GBK_BIN_TABLE: &[u8; (0xffff + 1) * 2] = include_bytes!("gbk_bin.data"); +const GBK_BIN_TABLE: &[u8; TABLE_SIZE_FOR_GBK] = include_bytes!("gbk_bin.data"); // GBK_CHINESE_CI_TABLE are the sort key tables for GBK codepoint. // If there is no mapping code in GBK, use 0x3F(?) instead. It should not // happened. 
-const GBK_CHINESE_CI_TABLE: &[u8; (0xffff + 1) * 2] = include_bytes!("gbk_chinese_ci.data"); +const GBK_CHINESE_CI_TABLE: &[u8; TABLE_SIZE_FOR_GBK] = include_bytes!("gbk_chinese_ci.data"); diff --git a/components/tidb_query_datatype/src/codec/collation/collator/latin1_bin.rs b/components/tidb_query_datatype/src/codec/collation/collator/latin1_bin.rs index c70deb08cd1..a63a5aed7e2 100644 --- a/components/tidb_query_datatype/src/codec/collation/collator/latin1_bin.rs +++ b/components/tidb_query_datatype/src/codec/collation/collator/latin1_bin.rs @@ -28,10 +28,14 @@ impl Collator for CollatorLatin1Bin { } #[inline] - fn sort_compare(a: &[u8], b: &[u8]) -> Result { - Ok(B(a) - .trim_end_with(|c| c == PADDING_SPACE) - .cmp(B(b).trim_end_with(|c| c == PADDING_SPACE))) + fn sort_compare(mut a: &[u8], mut b: &[u8], force_no_pad: bool) -> Result { + if !force_no_pad { + a = a.trim_end_with(|c| c == PADDING_SPACE); + } + if !force_no_pad { + b = b.trim_end_with(|c| c == PADDING_SPACE); + } + Ok(a.cmp(b)) } #[inline] diff --git a/components/tidb_query_datatype/src/codec/collation/collator/mod.rs b/components/tidb_query_datatype/src/codec/collation/collator/mod.rs index 913d1dced9f..4385f83e74b 100644 --- a/components/tidb_query_datatype/src/codec/collation/collator/mod.rs +++ b/components/tidb_query_datatype/src/codec/collation/collator/mod.rs @@ -1,6 +1,7 @@ // Copyright 2020 TiKV Project Authors. Licensed under Apache-2.0. mod binary; +mod gb18030_collation; mod gbk_collation; mod latin1_bin; mod utf8mb4_binary; @@ -15,6 +16,7 @@ use std::{ pub use binary::*; use codec::prelude::*; +pub use gb18030_collation::*; pub use gbk_collation::*; pub use latin1_bin::*; pub use utf8mb4_binary::*; @@ -45,6 +47,8 @@ mod tests { (Collation::GbkChineseCi, 6), (Collation::Utf8Mb40900AiCi, 7), (Collation::Utf8Mb40900Bin, 8), + (Collation::Gb18030Bin, 9), + (Collation::Gb18030ChineseCi, 10), ]; let cases = vec![ // (sa, sb, [Utf8Mb4Bin, Utf8Mb4BinNoPadding, Utf8Mb4GeneralCi, Utf8Mb4UnicodeCi, @@ -62,6 +66,8 @@ mod tests { Ordering::Equal, Ordering::Equal, Ordering::Equal, + Ordering::Equal, + Ordering::Equal, ], ), ( @@ -77,6 +83,8 @@ mod tests { Ordering::Equal, Ordering::Less, Ordering::Less, + Ordering::Equal, + Ordering::Equal, ], ), ( @@ -92,6 +100,8 @@ mod tests { Ordering::Equal, Ordering::Less, Ordering::Greater, + Ordering::Greater, + Ordering::Equal, ], ), ( @@ -107,6 +117,8 @@ mod tests { Ordering::Greater, Ordering::Greater, Ordering::Greater, + Ordering::Greater, + Ordering::Greater, ], ), ( @@ -122,6 +134,8 @@ mod tests { Ordering::Less, Ordering::Less, Ordering::Less, + Ordering::Less, + Ordering::Less, ], ), ( @@ -137,6 +151,8 @@ mod tests { Ordering::Less, Ordering::Equal, Ordering::Less, + Ordering::Less, + Ordering::Less, ], ), ( @@ -152,6 +168,8 @@ mod tests { Ordering::Less, Ordering::Greater, Ordering::Less, + Ordering::Less, + Ordering::Less, ], ), ( @@ -167,6 +185,8 @@ mod tests { Ordering::Less, Ordering::Equal, Ordering::Greater, + Ordering::Greater, + Ordering::Greater, ], ), ( @@ -182,6 +202,8 @@ mod tests { Ordering::Greater, Ordering::Less, Ordering::Less, + Ordering::Greater, + Ordering::Greater, ], ), ( @@ -197,6 +219,8 @@ mod tests { Ordering::Less, Ordering::Less, Ordering::Less, + Ordering::Less, + Ordering::Less, ], ), ]; @@ -212,7 +236,7 @@ mod tests { hasher.finish() }; - let cmp = TT::sort_compare(sa, sb).unwrap(); + let cmp = TT::sort_compare(sa, sb, false).unwrap(); let ha = eval_hash(sa); let hb = eval_hash(sb); (cmp, ha, hb) @@ -255,6 +279,8 @@ mod tests { 
@@ -255,6 +279,8 @@ mod tests {
             (Collation::GbkChineseCi, 6),
             (Collation::Utf8Mb40900AiCi, 7),
             (Collation::Utf8Mb40900Bin, 8),
+            (Collation::Gb18030Bin, 9),
+            (Collation::Gb18030ChineseCi, 10),
         ];
         let cases = vec![
             // (str, [Utf8Mb4Bin, Utf8Mb4BinNoPadding, Utf8Mb4GeneralCi, Utf8Mb4UnicodeCi, Latin1,
@@ -271,6 +297,8 @@
                 vec![0x41],
                 vec![0x1C, 0x47],
                 vec![0x61],
+                vec![0x61],
+                vec![0x41],
             ],
         ),
         (
@@ -285,6 +313,8 @@
                 vec![0x41],
                 vec![0x1C, 0x47, 0x2, 0x9],
                 vec![0x41, 0x20],
+                vec![0x41],
+                vec![0x41],
             ],
         ),
         (
@@ -299,6 +329,8 @@
                 vec![0x41],
                 vec![0x1C, 0x47],
                 vec![0x41],
+                vec![0x41],
+                vec![0x41],
             ],
         ),
         (
@@ -313,6 +345,8 @@
                 vec![0x3F],
                 vec![0x15, 0xFE],
                 vec![0xF0, 0x9F, 0x98, 0x83],
+                vec![0x94, 0x39, 0xFC, 0x39],
+                vec![0xFF, 0x03, 0xD8, 0x4B],
             ],
         ),
         (
@@ -364,6 +398,16 @@
                     0x9D, 0x8C, 0x86, 0x20, 0x62, 0x61, 0x7A, 0x20, 0xE2, 0x98, 0x83, 0x20,
                     0x71, 0x75, 0x78,
                 ],
+                vec![
+                    0x46, 0x6F, 0x6F, 0x20, 0x81, 0x30, 0x84, 0x38, 0x20, 0x62, 0x61, 0x72,
+                    0x20, 0x94, 0x32, 0xEF, 0x32, 0x20, 0x62, 0x61, 0x7A, 0x20, 0x81, 0x37,
+                    0xA3, 0x30, 0x20, 0x71, 0x75, 0x78,
+                ],
+                vec![
+                    0x46, 0x4F, 0x4F, 0x20, 0xFF, 0x00, 0x00, 0x26, 0x20, 0x42, 0x41, 0x52,
+                    0x20, 0xFF, 0x03, 0xB5, 0x4E, 0x20, 0x42, 0x41, 0x5A, 0x20, 0xFF, 0x00,
+                    0x23, 0xC8, 0x20, 0x51, 0x55, 0x58,
+                ],
             ],
         ),
         (
@@ -384,6 +428,8 @@
                     0x9C, 0x23, 0xB1,
                 ],
                 vec![0xEF, 0xB7, 0xBB],
+                vec![0x84, 0x30, 0xFE, 0x35],
+                vec![0xFF, 0x00, 0x98, 0x8F],
             ],
         ),
         (
@@ -398,6 +444,8 @@
                 vec![0xD3, 0x21, 0xC1, 0xAD],
                 vec![0xFB, 0x40, 0xCE, 0x2D, 0xFB, 0x40, 0xE5, 0x87],
                 vec![0xE4, 0xB8, 0xAD, 0xE6, 0x96, 0x87],
+                vec![0xD6, 0xD0, 0xCE, 0xC4],
+                vec![0xFF, 0xA0, 0x9B, 0xC1, 0xFF, 0xA0, 0x78, 0xBD],
             ],
         ),
     ];
@@ -448,7 +496,7 @@
             hasher.finish()
         };
 
-        let cmp = CollatorLatin1Bin::sort_compare(sa.as_slice(), sb.as_slice()).unwrap();
+        let cmp = CollatorLatin1Bin::sort_compare(sa.as_slice(), sb.as_slice(), false).unwrap();
 
         let ha = eval_hash(sa.as_slice());
         let hb = eval_hash(sb.as_slice());
diff --git a/components/tidb_query_datatype/src/codec/collation/collator/utf8mb4_binary.rs b/components/tidb_query_datatype/src/codec/collation/collator/utf8mb4_binary.rs
index 959664b1854..e5333411cdb 100644
--- a/components/tidb_query_datatype/src/codec/collation/collator/utf8mb4_binary.rs
+++ b/components/tidb_query_datatype/src/codec/collation/collator/utf8mb4_binary.rs
@@ -26,9 +26,13 @@ impl Collator for CollatorUtf8Mb4Bin {
     }
 
     #[inline]
-    fn sort_compare(a: &[u8], b: &[u8]) -> Result<Ordering> {
-        let sa = str::from_utf8(a)?.trim_end_matches(PADDING_SPACE);
-        let sb = str::from_utf8(b)?.trim_end_matches(PADDING_SPACE);
+    fn sort_compare(a: &[u8], b: &[u8], force_no_pad: bool) -> Result<Ordering> {
+        let mut sa = str::from_utf8(a)?;
+        let mut sb = str::from_utf8(b)?;
+        if !force_no_pad {
+            sa = sa.trim_end_matches(PADDING_SPACE);
+            sb = sb.trim_end_matches(PADDING_SPACE);
+        }
         Ok(sa.as_bytes().cmp(sb.as_bytes()))
     }
 
@@ -63,7 +67,7 @@ impl Collator for CollatorUtf8Mb4BinNoPadding {
     }
 
     #[inline]
-    fn sort_compare(a: &[u8], b: &[u8]) -> Result<Ordering> {
+    fn sort_compare(a: &[u8], b: &[u8], _force_no_pad: bool) -> Result<Ordering> {
         str::from_utf8(a)?;
         str::from_utf8(b)?;
         Ok(a.cmp(b))
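The expected sort keys in the tests above follow directly from per-character weights: each weight is emitted big-endian with leading zero bytes dropped, so '中' contributes [0xD6, 0xD0] under gb18030_bin (weight 0xD6D0) and four bytes under gb18030_chinese_ci (weight 0xFFA09BC1). A sketch of that packing (hypothetical helper; the crate's own sort-key writer may differ in detail):

// Sketch only: pack per-character u32 weights into a sort key the way the
// expected byte vectors above are laid out.
fn push_weight(key: &mut Vec<u8>, weight: u32) {
    let be = weight.to_be_bytes();
    // Drop leading zero bytes; a zero weight would still contribute one byte.
    let first = be.iter().position(|&b| b != 0).unwrap_or(be.len() - 1);
    key.extend_from_slice(&be[first..]);
}

fn main() {
    let mut key = Vec::new();
    push_weight(&mut key, 0xD6D0); // '中' under gb18030_bin
    push_weight(&mut key, 0xCEC4); // '文' under gb18030_bin
    assert_eq!(key, vec![0xD6, 0xD0, 0xCE, 0xC4]); // matches the test above
}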
diff --git a/components/tidb_query_datatype/src/codec/collation/collator/utf8mb4_general_ci.rs b/components/tidb_query_datatype/src/codec/collation/collator/utf8mb4_general_ci.rs
index 2cc9a738372..7b0f27d70d0 100644
--- a/components/tidb_query_datatype/src/codec/collation/collator/utf8mb4_general_ci.rs
+++ b/components/tidb_query_datatype/src/codec/collation/collator/utf8mb4_general_ci.rs
@@ -38,9 +38,13 @@ impl Collator for CollatorUtf8Mb4GeneralCi {
     }
 
     #[inline]
-    fn sort_compare(a: &[u8], b: &[u8]) -> Result<Ordering> {
-        let sa = str::from_utf8(a)?.trim_end_matches(PADDING_SPACE);
-        let sb = str::from_utf8(b)?.trim_end_matches(PADDING_SPACE);
+    fn sort_compare(a: &[u8], b: &[u8], force_no_pad: bool) -> Result<Ordering> {
+        let mut sa = str::from_utf8(a)?;
+        let mut sb = str::from_utf8(b)?;
+        if !force_no_pad {
+            sa = sa.trim_end_matches(PADDING_SPACE);
+            sb = sb.trim_end_matches(PADDING_SPACE);
+        }
         Ok(sa
             .chars()
             .map(Self::char_weight)
diff --git a/components/tidb_query_datatype/src/codec/collation/collator/utf8mb4_uca/mod.rs b/components/tidb_query_datatype/src/codec/collation/collator/utf8mb4_uca/mod.rs
index b90d28d0e11..cb653f5170d 100644
--- a/components/tidb_query_datatype/src/codec/collation/collator/utf8mb4_uca/mod.rs
+++ b/components/tidb_query_datatype/src/codec/collation/collator/utf8mb4_uca/mod.rs
@@ -53,9 +53,16 @@ impl Collator for CollatorUca {
     }
 
     #[inline]
-    fn sort_compare(a: &[u8], b: &[u8]) -> Result<Ordering> {
-        let mut ca = T::preprocess(str::from_utf8(a)?).chars();
-        let mut cb = T::preprocess(str::from_utf8(b)?).chars();
+    fn sort_compare(a: &[u8], b: &[u8], force_no_pad: bool) -> Result<Ordering> {
+        let mut sa = str::from_utf8(a)?;
+        let mut sb = str::from_utf8(b)?;
+        if !force_no_pad {
+            sa = T::preprocess(sa);
+            sb = T::preprocess(sb);
+        }
+
+        let mut ca = sa.chars();
+        let mut cb = sb.chars();
 
         let mut an = 0;
         let mut bn = 0;
diff --git a/components/tidb_query_datatype/src/codec/collation/encoding/gb18030.rs b/components/tidb_query_datatype/src/codec/collation/encoding/gb18030.rs
new file mode 100644
index 00000000000..b66512af0f7
--- /dev/null
+++ b/components/tidb_query_datatype/src/codec/collation/encoding/gb18030.rs
@@ -0,0 +1,434 @@
+// Copyright 2024 TiKV Project Authors. Licensed under Apache-2.0.
+
+use collections::HashMap;
+use encoding_rs::GB18030;
+use lazy_static::*;
+
+use self::gb18030_data::GB18030_TO_UNICODE;
+use super::*;
+use crate::codec::data_type::{BytesGuard, BytesWriter};
+
+lazy_static! {
+    static ref DECODE_MAP: HashMap<u32, char> = GB18030_TO_UNICODE.iter().copied().collect();
+    static ref ENCODE_MAP: HashMap<char, Vec<u8>> = GB18030_TO_UNICODE
+        .iter()
+        .map(|(gb18030, ch)| {
+            let mut gb18030_bytes = gb18030.to_be_bytes().to_vec();
+            let mut pos = 0;
+            while pos < gb18030_bytes.len() && gb18030_bytes[pos] == 0 {
+                pos += 1;
+            }
+            gb18030_bytes = gb18030_bytes[pos..].to_vec();
+            (*ch, gb18030_bytes)
+        })
+        .collect();
+}
+
+#[derive(Debug)]
+pub struct EncodingGb18030 {}
+
+impl Encoding for EncodingGb18030 {
+    #[inline]
+    fn decode(data: BytesRef<'_>) -> Result<Bytes> {
+        let mut res = Vec::<u8>::new();
+        let l = data.len();
+        if l == 0 {
+            return Ok(res);
+        }
+        let mut base = 0;
+        while base < l {
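+            // A GB18030 code unit is 1, 2, or 4 bytes wide:
+            //   0x00..=0x7F                                        -> 1 byte (ASCII);
+            //   0x81..=0xFE, then 0x40..=0xFE except 0x7F          -> 2 bytes;
+            //   0x81..=0xFE, 0x30..=0x39, 0x81..=0xFE, 0x30..=0x39 -> 4 bytes.
+            // Anything else is rejected below as an invalid sequence.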
+            // 1. decide the length of the next character
+            let offset;
+            match data[base] {
+                ..=0x7f => offset = 1,
+                0x81..=0xfe => {
+                    if base + 1 >= l {
+                        return Err(Error::cannot_convert_string(
+                            format_invalid_char(data).as_str(),
+                            "gb18030",
+                        ));
+                    }
+                    if 0x40 <= data[base + 1] && data[base + 1] <= 0xfe && data[base + 1] != 0x7f {
+                        offset = 2;
+                    } else if base + 3 < l
+                        && data[base + 1] >= 0x30
+                        && data[base + 1] <= 0x39
+                        && data[base + 2] >= 0x81
+                        && data[base + 2] <= 0xfe
+                        && data[base + 3] >= 0x30
+                        && data[base + 3] <= 0x39
+                    {
+                        offset = 4;
+                    } else {
+                        return Err(Error::cannot_convert_string(
+                            format_invalid_char(data).as_str(),
+                            "gb18030",
+                        ));
+                    }
+                }
+                _ => {
+                    return Err(Error::cannot_convert_string(
+                        format_invalid_char(data).as_str(),
+                        "gb18030",
+                    ));
+                }
+            }
+
+            // 2. decode the next character
+            let v: u32 = match offset {
+                1 => u32::from(data[base]),
+                2 => u32::from(data[base]) << 8 | u32::from(data[base + 1]),
+                4 => {
+                    u32::from(data[base]) << 24
+                        | u32::from(data[base + 1]) << 16
+                        | u32::from(data[base + 2]) << 8
+                        | u32::from(data[base + 3])
+                }
+                _ => {
+                    return Err(Error::cannot_convert_string(
+                        format_invalid_char(data).as_str(),
+                        "gb18030",
+                    ));
+                }
+            };
+            if let Some(ch) = DECODE_MAP.get(&v) {
+                let mut buffer = [0; 4];
+                res.extend_from_slice(ch.encode_utf8(&mut buffer).as_bytes());
+            } else {
+                match GB18030
+                    .decode_without_bom_handling_and_without_replacement(&data[base..base + offset])
+                {
+                    Some(v) => {
+                        res.extend(v.as_bytes());
+                    }
+                    None => {
+                        return Err(Error::cannot_convert_string(
+                            format_invalid_char(data).as_str(),
+                            "gb18030",
+                        ));
+                    }
+                }
+            }
+            base += offset;
+        }
+
+        Ok(res)
+    }
+
+    #[inline]
+    fn encode(data: BytesRef<'_>) -> Result<Bytes> {
+        let mut res = Vec::<u8>::new();
+        let utf8_str = str::from_utf8(data)?;
+        // encode each character one by one
+        for ch in utf8_str.chars() {
+            if let Some(bytes) = ENCODE_MAP.get(&ch) {
+                res.extend_from_slice(bytes);
+            } else {
+                res.extend(GB18030.encode(&ch.to_string()).0.iter());
+            }
+        }
+
+        Ok(res)
+    }
+
+    #[inline]
+    fn lower(s: &str, writer: BytesWriter) -> BytesGuard {
+        let res = s.chars().flat_map(|ch| {
+            let c = ch as u32;
+            match c {
+                0xB5 => char::from_u32(c + 775),
+                0x3D0 => char::from_u32(c - 30),
+                0x3D1 => char::from_u32(c - 25),
+                0x3D5 => char::from_u32(c - 15),
+                0x3D6 => char::from_u32(c - 22),
+                0x3F0 => char::from_u32(c - 54),
+                0x3F1 => char::from_u32(c - 48),
+                0x3F5 => char::from_u32(c - 64),
+                0x1E9B => char::from_u32(c - 58),
+                0x1FBE => char::from_u32(c - 7173),
+                0x1C5 | 0x1C8 | 0x1CB | 0x1F2 | 0x3C2 => char::from_u32(c + 1),
+                0x25C
+                | 0x261
+                | 0x265..=0x266
+                | 0x26A
+                | 0x26C
+                | 0x282
+                | 0x287
+                | 0x29D..=0x29E
+                | 0x37F
+                | 0x3F3
+                | 0x526..=0x52F
+                | 0x10C7
+                | 0x10CD
+                | 0x10D0..=0x10FA
+                | 0x10FD..=0x10FF
+                | 0x13A0..=0x13F5
+                | 0x13F8..=0x13FD
+                | 0x1C80..=0x1C88
+                | 0x1C90..=0x1CBA
+                | 0x1CBD..=0x1CBF
+                | 0x1D79
+                | 0x1D7D
+                | 0x1D8E
+                | 0x2CF2..=0x2CF3
+                | 0x2D27
+                | 0x2D2D
+                | 0xA660..=0xA661
+                | 0xA698..=0xA69B
+                | 0xA78D
+                | 0xA790..=0xA794
+                | 0xA796..=0xA7AE
+                | 0xA7B0..=0xA7BF
+                | 0xA7C2..=0xA7CA
+                | 0xA7F5..=0xA7F6
+                | 0xAB53
+                | 0xAB70..=0xABBF
+                | 0x104B0..=0x104D3
+                | 0x104D8..=0x104FB
+                | 0x10C80..=0x10CB2
+                | 0x10CC0..=0x10CF2
+                | 0x118A0..=0x118DF
+                | 0x16E40..=0x16E7F
+                | 0x1E900..=0x1E943 => char::from_u32(c),
+                _ => unicode_to_lower(ch),
+            }
+        });
+        writer.write_from_char_iter(res)
+    }
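+    // The identity arms in `lower` above and `upper` below cover ranges such
+    // as Cherokee (0x13A0..=0x13F5) and Georgian Mtavruli (0x1C90..=0x1CBA)
+    // whose case mappings were only added in later Unicode versions; they
+    // deliberately map to themselves instead of going through
+    // unicode_to_lower/unicode_to_upper, presumably to keep gb18030 casing
+    // aligned with TiDB.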
+
+    #[inline]
+    fn upper(s: &str, writer: BytesWriter) -> BytesGuard {
+        let res = s.chars().flat_map(|ch| {
+            let c = ch as u32;
+            match c {
+                0xB5
+                | 0x1C5
+                | 0x1C8
+                | 0x1CB
+                | 0x1F2
+                | 0x25C
+                | 0x261
+                | 0x265..=0x266
+                | 0x26A
+                | 0x26C
+                | 0x282
+                | 0x287
+                | 0x29D..=0x29E
+                | 0x37F
+                | 0x3C2
+                | 0x3D0
+                | 0x3D1
+                | 0x3D5
+                | 0x3D6
+                | 0x3F0
+                | 0x3F1
+                | 0x3F3
+                | 0x3F5
+                | 0x526..=0x52F
+                | 0x10C7
+                | 0x10CD
+                | 0x10D0..=0x10FA
+                | 0x10FD..=0x10FF
+                | 0x13A0..=0x13F5
+                | 0x13F8..=0x13FD
+                | 0x1C80..=0x1C88
+                | 0x1C90..=0x1CBA
+                | 0x1CBD..=0x1CBF
+                | 0x1D79
+                | 0x1D7D
+                | 0x1D8E
+                | 0x1E9B
+                | 0x1FBE
+                | 0x2CF2..=0x2CF3
+                | 0x2D27
+                | 0x2D2D
+                | 0xA660..=0xA661
+                | 0xA698..=0xA69B
+                | 0xA78D
+                | 0xA790..=0xA794
+                | 0xA796..=0xA7AE
+                | 0xA7B0..=0xA7BF
+                | 0xA7C2..=0xA7CA
+                | 0xA7F5..=0xA7F6
+                | 0xAB53
+                | 0xAB70..=0xABBF
+                | 0x104B0..=0x104D3
+                | 0x104D8..=0x104FB
+                | 0x10C80..=0x10CB2
+                | 0x10CC0..=0x10CF2
+                | 0x118A0..=0x118DF
+                | 0x16E40..=0x16E7F
+                | 0x1E900..=0x1E943 => char::from_u32(c),
+                _ => unicode_to_upper(ch),
+            }
+        });
+        writer.write_from_char_iter(res)
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use bstr::ByteSlice;
+
+    use crate::codec::collation::{encoding::EncodingGb18030, Encoding};
+
+    #[test]
+    fn test_encode() {
+        let cases = vec![
+            ("中文", vec![0xD6, 0xD0, 0xCE, 0xC4]),
+            ("€", vec![0xA2, 0xE3]),
+            ("ḿ", vec![0xA8, 0xBC]),
+            ("", vec![0x81, 0x35, 0xF4, 0x37]),
+            ("€ḿ", vec![0xA2, 0xE3, 0xA8, 0xBC]),
+            ("😃", vec![0x94, 0x39, 0xFC, 0x39]),
+            (
+                "Foo © bar 𝌆 baz ☃ qux",
+                vec![
+                    0x46, 0x6F, 0x6F, 0x20, 0x81, 0x30, 0x84, 0x38, 0x20, 0x62, 0x61, 0x72, 0x20,
+                    0x94, 0x32, 0xEF, 0x32, 0x20, 0x62, 0x61, 0x7A, 0x20, 0x81, 0x37, 0xA3, 0x30,
+                    0x20, 0x71, 0x75, 0x78,
+                ],
+            ),
+            ("ﷻ", vec![0x84, 0x30, 0xFE, 0x35]),
+            // GB18030-2005
+            (
+                "〾⿰⿱⿲⿳⿴⿵⿶⿷⿸⿹⿺⿻",
+                vec![
+                    0xA9, 0x89, 0xA9, 0x8A, 0xA9, 0x8B, 0xA9, 0x8C, 0xA9, 0x8D, 0xA9, 0x8E, 0xA9,
+                    0x8F, 0xA9, 0x90, 0xA9, 0x91, 0xA9, 0x92, 0xA9, 0x93, 0xA9, 0x94, 0xA9, 0x95,
+                ],
+            ),
+            ("ǹ", vec![0xA8, 0xBF]),
+            (
+                "⺁㧟㩳㧐",
+                vec![0xFE, 0x50, 0xFE, 0x63, 0xFE, 0x64, 0xFE, 0x65],
+            ),
+            ("䦃", vec![0xFE, 0x89]),
+            ("︐", vec![0xA6, 0xD9]),
+            ("𠂇𠂉", vec![0x95, 0x32, 0x90, 0x31, 0x95, 0x32, 0x90, 0x33]),
+            ("\u{e816}\u{e855}", vec![0xFE, 0x51, 0xFE, 0x91]),
+            // GB18030-2022
+            ("\u{f9f1}", vec![0xFD, 0xA0]),
+            (
+                "\u{fa0c}\u{fa0d}\u{fa0e}",
+                vec![0xFE, 0x40, 0xFE, 0x41, 0xFE, 0x42],
+            ),
+            (
+                "\u{2e81}\u{e816}\u{e817}\u{e818}\u{2e84}",
+                vec![0xFE, 0x50, 0xFE, 0x51, 0xFE, 0x52, 0xFE, 0x53, 0xFE, 0x54],
+            ),
+            (
+                "\u{e831}\u{9fb8}\u{2eaa}\u{4056}",
+                vec![0xFE, 0x6C, 0xFE, 0x6D, 0xFE, 0x6E, 0xFE, 0x6F],
+            ),
+            (
+                "\u{f92c}\u{f979}\u{f995}\u{f9e7}\u{f9f1}\u{fa0c}\u{fa0d}\u{fa18}\u{fa20}",
+                vec![
+                    0xFD, 0x9C, 0xFD, 0x9D, 0xFD, 0x9E, 0xFD, 0x9F, 0xFD, 0xA0, 0xFE, 0x40, 0xFE,
+                    0x41, 0xFE, 0x47, 0xFE, 0x49,
+                ],
+            ),
+            ("\u{e5e5}\u{e000}", vec![0xA3, 0xA0, 0xAA, 0xA1]),
+        ];
+        for (case, expected) in cases {
+            let res = EncodingGb18030::encode(case.to_string().as_bytes());
+            match res {
+                Ok(bytes) => {
+                    assert_eq!(
+                        expected, bytes,
+                        "{} expected:{:02X?}, but got:{:02X?}",
+                        case, expected, bytes
+                    );
+                }
+                _ => panic!("Should succeed to encode"),
+            }
+        }
+    }
+
+    #[test]
+    fn test_decode() {
+        let cases: Vec<(Vec<u8>, &str)> = vec![
+            (vec![0xD6, 0xD0, 0xCE, 0xC4], "中文"),
+            (vec![0xA2, 0xE3], "€"),
+            (vec![0xA8, 0xBC], "ḿ"),
+            (vec![0x81, 0x35, 0xF4, 0x37], ""),
+            (vec![0xA2, 0xE3, 0xA8, 0xBC], "€ḿ"),
+            (vec![0x94, 0x39, 0xFC, 0x39], "😃"),
+            (
+                vec![
+                    0x46, 0x6F, 0x6F, 0x20, 0x81, 0x30, 0x84, 0x38, 0x20, 0x62, 0x61, 0x72, 0x20,
+                    0x94, 0x32, 0xEF, 0x32, 0x20, 0x62, 0x61, 0x7A, 0x20, 0x81, 0x37, 0xA3, 0x30,
+                    0x20, 0x71, 0x75, 0x78,
+                ],
+                "Foo © bar 𝌆 baz ☃ qux",
+            ),
+            (vec![0x84, 0x30, 0xFE, 0x35], "ﷻ"),
+            // GB18030-2005
+            (
+                vec![
+                    0xA9, 0x89, 0xA9, 0x8A, 0xA9, 0x8B, 0xA9, 0x8C, 0xA9, 0x8D, 0xA9, 0x8E, 0xA9,
+                    0x8F, 0xA9, 0x90, 0xA9, 0x91, 0xA9, 0x92, 0xA9, 0x93, 0xA9, 0x94, 0xA9, 0x95,
+                ],
+                "〾⿰⿱⿲⿳⿴⿵⿶⿷⿸⿹⿺⿻",
+            ),
+            (vec![0xA8, 0xBF], "ǹ"),
+            (
+                vec![
+                    0xFE, 0x50, 0xFE, 0x54, 0xFE, 0x55, 0xFE, 0x56, 0xFE, 0x57, 0xFE, 0x58, 0xFE,
+                    0x5A, 0xFE, 0x5B, 0xFE, 0x5C, 0xFE, 0x5D, 0xFE, 0x5E, 0xFE, 0x5F, 0xFE, 0x60,
+                    0xFE, 0x62, 0xFE, 0x63, 0xFE, 0x64, 0xFE, 0x65, 0xFE, 0x68, 0xFE, 0x69, 0xFE,
+                    0x6A, 0xFE, 0x6B, 0xFE, 0x6E, 0xFE, 0x6F,
+                ],
+                "⺁⺄㑳㑇⺈⺋㖞㘚㘎⺌⺗㥮㤘㧏㧟㩳㧐㭎㱮㳠⺧⺪䁖",
+            ),
+            (vec![0xFE, 0x76], "\u{e83b}"),
+            (vec![0xFE, 0x89], "䦃"),
+            (vec![0xA6, 0xD9], "︐"),
+            (vec![0x95, 0x32, 0x90, 0x31, 0x95, 0x32, 0x90, 0x33], "𠂇𠂉"),
+            (vec![0xFE, 0x51, 0xFE, 0x91], "\u{e816}\u{e855}"),
+            // GB18030-2022
+            (vec![0xFD, 0xA0], "\u{f9f1}"),
+            (
+                vec![0xFE, 0x40, 0xFE, 0x41, 0xFE, 0x42],
+                "\u{fa0c}\u{fa0d}\u{fa0e}",
+            ),
+            (
+                vec![0xFE, 0x50, 0xFE, 0x51, 0xFE, 0x52, 0xFE, 0x53, 0xFE, 0x54],
+                "\u{2e81}\u{e816}\u{e817}\u{e818}\u{2e84}",
+            ),
+            (
+                vec![0xFE, 0x6C, 0xFE, 0x6D, 0xFE, 0x6E, 0xFE, 0x6F],
+                "\u{e831}\u{9fb8}\u{2eaa}\u{4056}",
+            ),
+            (
+                vec![
+                    0xFD, 0x9C, 0xFD, 0x9D, 0xFD, 0x9E, 0xFD, 0x9F, 0xFD, 0xA0, 0xFE, 0x40, 0xFE,
+                    0x41, 0xFE, 0x47, 0xFE, 0x49,
+                ],
+                "\u{f92c}\u{f979}\u{f995}\u{f9e7}\u{f9f1}\u{fa0c}\u{fa0d}\u{fa18}\u{fa20}",
+            ),
+            (vec![0xA3, 0xA0, 0xAA, 0xA1], "\u{e5e5}\u{e000}"),
+        ];
+        for (case, expected) in cases {
+            let res = EncodingGb18030::decode(case.as_bytes());
+            match res {
+                Ok(bytes) => {
+                    let s = bytes.to_str().unwrap();
+                    assert_eq!(
+                        expected, s,
+                        "{:02X?} expected:{}, but got:{}",
+                        case, expected, s
+                    )
+                }
+                Err(e) => {
+                    panic!("Should succeed to decode;\n{}", e);
+                }
+            }
+        }
+    }
+}
diff --git a/components/tidb_query_datatype/src/codec/collation/encoding/gb18030_data.rs b/components/tidb_query_datatype/src/codec/collation/encoding/gb18030_data.rs
new file mode 100644
index 00000000000..e720fbd3fd0
--- /dev/null
+++ b/components/tidb_query_datatype/src/codec/collation/encoding/gb18030_data.rs
@@ -0,0 +1,2107 @@
+// Copyright 2024 TiKV Project Authors. Licensed under Apache-2.0.
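+// The pairs below take precedence over encoding_rs's generic GB18030 tables
+// in EncodingGb18030::{encode, decode}: they pin the Private Use Area
+// assignments and the GB18030-2022 code points (exercised by the tests in
+// gb18030.rs) that the generic tables map differently.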
+ +pub const GB18030_TO_UNICODE: &[(u32, char)] = &[ + (0xA140, '\u{e4c6}'), + (0xA141, '\u{e4c7}'), + (0xA142, '\u{e4c8}'), + (0xA143, '\u{e4c9}'), + (0xA144, '\u{e4ca}'), + (0xA145, '\u{e4cb}'), + (0xA146, '\u{e4cc}'), + (0xA147, '\u{e4cd}'), + (0xA148, '\u{e4ce}'), + (0xA149, '\u{e4cf}'), + (0xA14A, '\u{e4d0}'), + (0xA14B, '\u{e4d1}'), + (0xA14C, '\u{e4d2}'), + (0xA14D, '\u{e4d3}'), + (0xA14E, '\u{e4d4}'), + (0xA14F, '\u{e4d5}'), + (0xA150, '\u{e4d6}'), + (0xA151, '\u{e4d7}'), + (0xA152, '\u{e4d8}'), + (0xA153, '\u{e4d9}'), + (0xA154, '\u{e4da}'), + (0xA155, '\u{e4db}'), + (0xA156, '\u{e4dc}'), + (0xA157, '\u{e4dd}'), + (0xA158, '\u{e4de}'), + (0xA159, '\u{e4df}'), + (0xA15A, '\u{e4e0}'), + (0xA15B, '\u{e4e1}'), + (0xA15C, '\u{e4e2}'), + (0xA15D, '\u{e4e3}'), + (0xA15E, '\u{e4e4}'), + (0xA15F, '\u{e4e5}'), + (0xA160, '\u{e4e6}'), + (0xA161, '\u{e4e7}'), + (0xA162, '\u{e4e8}'), + (0xA163, '\u{e4e9}'), + (0xA164, '\u{e4ea}'), + (0xA165, '\u{e4eb}'), + (0xA166, '\u{e4ec}'), + (0xA167, '\u{e4ed}'), + (0xA168, '\u{e4ee}'), + (0xA169, '\u{e4ef}'), + (0xA16A, '\u{e4f0}'), + (0xA16B, '\u{e4f1}'), + (0xA16C, '\u{e4f2}'), + (0xA16D, '\u{e4f3}'), + (0xA16E, '\u{e4f4}'), + (0xA16F, '\u{e4f5}'), + (0xA170, '\u{e4f6}'), + (0xA171, '\u{e4f7}'), + (0xA172, '\u{e4f8}'), + (0xA173, '\u{e4f9}'), + (0xA174, '\u{e4fa}'), + (0xA175, '\u{e4fb}'), + (0xA176, '\u{e4fc}'), + (0xA177, '\u{e4fd}'), + (0xA178, '\u{e4fe}'), + (0xA179, '\u{e4ff}'), + (0xA17A, '\u{e500}'), + (0xA17B, '\u{e501}'), + (0xA17C, '\u{e502}'), + (0xA17D, '\u{e503}'), + (0xA17E, '\u{e504}'), + (0xA180, '\u{e505}'), + (0xA181, '\u{e506}'), + (0xA182, '\u{e507}'), + (0xA183, '\u{e508}'), + (0xA184, '\u{e509}'), + (0xA185, '\u{e50a}'), + (0xA186, '\u{e50b}'), + (0xA187, '\u{e50c}'), + (0xA188, '\u{e50d}'), + (0xA189, '\u{e50e}'), + (0xA18A, '\u{e50f}'), + (0xA18B, '\u{e510}'), + (0xA18C, '\u{e511}'), + (0xA18D, '\u{e512}'), + (0xA18E, '\u{e513}'), + (0xA18F, '\u{e514}'), + (0xA190, '\u{e515}'), + (0xA191, '\u{e516}'), + (0xA192, '\u{e517}'), + (0xA193, '\u{e518}'), + (0xA194, '\u{e519}'), + (0xA195, '\u{e51a}'), + (0xA196, '\u{e51b}'), + (0xA197, '\u{e51c}'), + (0xA198, '\u{e51d}'), + (0xA199, '\u{e51e}'), + (0xA19A, '\u{e51f}'), + (0xA19B, '\u{e520}'), + (0xA19C, '\u{e521}'), + (0xA19D, '\u{e522}'), + (0xA19E, '\u{e523}'), + (0xA19F, '\u{e524}'), + (0xA1A0, '\u{e525}'), + (0xA240, '\u{e526}'), + (0xA241, '\u{e527}'), + (0xA242, '\u{e528}'), + (0xA243, '\u{e529}'), + (0xA244, '\u{e52a}'), + (0xA245, '\u{e52b}'), + (0xA246, '\u{e52c}'), + (0xA247, '\u{e52d}'), + (0xA248, '\u{e52e}'), + (0xA249, '\u{e52f}'), + (0xA24A, '\u{e530}'), + (0xA24B, '\u{e531}'), + (0xA24C, '\u{e532}'), + (0xA24D, '\u{e533}'), + (0xA24E, '\u{e534}'), + (0xA24F, '\u{e535}'), + (0xA250, '\u{e536}'), + (0xA251, '\u{e537}'), + (0xA252, '\u{e538}'), + (0xA253, '\u{e539}'), + (0xA254, '\u{e53a}'), + (0xA255, '\u{e53b}'), + (0xA256, '\u{e53c}'), + (0xA257, '\u{e53d}'), + (0xA258, '\u{e53e}'), + (0xA259, '\u{e53f}'), + (0xA25A, '\u{e540}'), + (0xA25B, '\u{e541}'), + (0xA25C, '\u{e542}'), + (0xA25D, '\u{e543}'), + (0xA25E, '\u{e544}'), + (0xA25F, '\u{e545}'), + (0xA260, '\u{e546}'), + (0xA261, '\u{e547}'), + (0xA262, '\u{e548}'), + (0xA263, '\u{e549}'), + (0xA264, '\u{e54a}'), + (0xA265, '\u{e54b}'), + (0xA266, '\u{e54c}'), + (0xA267, '\u{e54d}'), + (0xA268, '\u{e54e}'), + (0xA269, '\u{e54f}'), + (0xA26A, '\u{e550}'), + (0xA26B, '\u{e551}'), + (0xA26C, '\u{e552}'), + (0xA26D, '\u{e553}'), + (0xA26E, '\u{e554}'), + (0xA26F, '\u{e555}'), + (0xA270, '\u{e556}'), + (0xA271, 
'\u{e557}'), + (0xA272, '\u{e558}'), + (0xA273, '\u{e559}'), + (0xA274, '\u{e55a}'), + (0xA275, '\u{e55b}'), + (0xA276, '\u{e55c}'), + (0xA277, '\u{e55d}'), + (0xA278, '\u{e55e}'), + (0xA279, '\u{e55f}'), + (0xA27A, '\u{e560}'), + (0xA27B, '\u{e561}'), + (0xA27C, '\u{e562}'), + (0xA27D, '\u{e563}'), + (0xA27E, '\u{e564}'), + (0xA280, '\u{e565}'), + (0xA281, '\u{e566}'), + (0xA282, '\u{e567}'), + (0xA283, '\u{e568}'), + (0xA284, '\u{e569}'), + (0xA285, '\u{e56a}'), + (0xA286, '\u{e56b}'), + (0xA287, '\u{e56c}'), + (0xA288, '\u{e56d}'), + (0xA289, '\u{e56e}'), + (0xA28A, '\u{e56f}'), + (0xA28B, '\u{e570}'), + (0xA28C, '\u{e571}'), + (0xA28D, '\u{e572}'), + (0xA28E, '\u{e573}'), + (0xA28F, '\u{e574}'), + (0xA290, '\u{e575}'), + (0xA291, '\u{e576}'), + (0xA292, '\u{e577}'), + (0xA293, '\u{e578}'), + (0xA294, '\u{e579}'), + (0xA295, '\u{e57a}'), + (0xA296, '\u{e57b}'), + (0xA297, '\u{e57c}'), + (0xA298, '\u{e57d}'), + (0xA299, '\u{e57e}'), + (0xA29A, '\u{e57f}'), + (0xA29B, '\u{e580}'), + (0xA29C, '\u{e581}'), + (0xA29D, '\u{e582}'), + (0xA29E, '\u{e583}'), + (0xA29F, '\u{e584}'), + (0xA2A0, '\u{e585}'), + (0xA2AB, '\u{e766}'), + (0xA2AC, '\u{e767}'), + (0xA2AD, '\u{e768}'), + (0xA2AE, '\u{e769}'), + (0xA2AF, '\u{e76a}'), + (0xA2B0, '\u{e76b}'), + (0xA2E3, '\u{20ac}'), + (0xA2E4, '\u{e76d}'), + (0xA2EF, '\u{e76e}'), + (0xA2F0, '\u{e76f}'), + (0xA2FD, '\u{e770}'), + (0xA2FE, '\u{e771}'), + (0xA340, '\u{e586}'), + (0xA341, '\u{e587}'), + (0xA342, '\u{e588}'), + (0xA343, '\u{e589}'), + (0xA344, '\u{e58a}'), + (0xA345, '\u{e58b}'), + (0xA346, '\u{e58c}'), + (0xA347, '\u{e58d}'), + (0xA348, '\u{e58e}'), + (0xA349, '\u{e58f}'), + (0xA34A, '\u{e590}'), + (0xA34B, '\u{e591}'), + (0xA34C, '\u{e592}'), + (0xA34D, '\u{e593}'), + (0xA34E, '\u{e594}'), + (0xA34F, '\u{e595}'), + (0xA350, '\u{e596}'), + (0xA351, '\u{e597}'), + (0xA352, '\u{e598}'), + (0xA353, '\u{e599}'), + (0xA354, '\u{e59a}'), + (0xA355, '\u{e59b}'), + (0xA356, '\u{e59c}'), + (0xA357, '\u{e59d}'), + (0xA358, '\u{e59e}'), + (0xA359, '\u{e59f}'), + (0xA35A, '\u{e5a0}'), + (0xA35B, '\u{e5a1}'), + (0xA35C, '\u{e5a2}'), + (0xA35D, '\u{e5a3}'), + (0xA35E, '\u{e5a4}'), + (0xA35F, '\u{e5a5}'), + (0xA360, '\u{e5a6}'), + (0xA361, '\u{e5a7}'), + (0xA362, '\u{e5a8}'), + (0xA363, '\u{e5a9}'), + (0xA364, '\u{e5aa}'), + (0xA365, '\u{e5ab}'), + (0xA366, '\u{e5ac}'), + (0xA367, '\u{e5ad}'), + (0xA368, '\u{e5ae}'), + (0xA369, '\u{e5af}'), + (0xA36A, '\u{e5b0}'), + (0xA36B, '\u{e5b1}'), + (0xA36C, '\u{e5b2}'), + (0xA36D, '\u{e5b3}'), + (0xA36E, '\u{e5b4}'), + (0xA36F, '\u{e5b5}'), + (0xA370, '\u{e5b6}'), + (0xA371, '\u{e5b7}'), + (0xA372, '\u{e5b8}'), + (0xA373, '\u{e5b9}'), + (0xA374, '\u{e5ba}'), + (0xA375, '\u{e5bb}'), + (0xA376, '\u{e5bc}'), + (0xA377, '\u{e5bd}'), + (0xA378, '\u{e5be}'), + (0xA379, '\u{e5bf}'), + (0xA37A, '\u{e5c0}'), + (0xA37B, '\u{e5c1}'), + (0xA37C, '\u{e5c2}'), + (0xA37D, '\u{e5c3}'), + (0xA37E, '\u{e5c4}'), + (0xA380, '\u{e5c5}'), + (0xA381, '\u{e5c6}'), + (0xA382, '\u{e5c7}'), + (0xA383, '\u{e5c8}'), + (0xA384, '\u{e5c9}'), + (0xA385, '\u{e5ca}'), + (0xA386, '\u{e5cb}'), + (0xA387, '\u{e5cc}'), + (0xA388, '\u{e5cd}'), + (0xA389, '\u{e5ce}'), + (0xA38A, '\u{e5cf}'), + (0xA38B, '\u{e5d0}'), + (0xA38C, '\u{e5d1}'), + (0xA38D, '\u{e5d2}'), + (0xA38E, '\u{e5d3}'), + (0xA38F, '\u{e5d4}'), + (0xA390, '\u{e5d5}'), + (0xA391, '\u{e5d6}'), + (0xA392, '\u{e5d7}'), + (0xA393, '\u{e5d8}'), + (0xA394, '\u{e5d9}'), + (0xA395, '\u{e5da}'), + (0xA396, '\u{e5db}'), + (0xA397, '\u{e5dc}'), + (0xA398, '\u{e5dd}'), + (0xA399, '\u{e5de}'), + (0xA39A, 
'\u{e5df}'), + (0xA39B, '\u{e5e0}'), + (0xA39C, '\u{e5e1}'), + (0xA39D, '\u{e5e2}'), + (0xA39E, '\u{e5e3}'), + (0xA39F, '\u{e5e4}'), + (0xA3A0, '\u{e5e5}'), + (0xA440, '\u{e5e6}'), + (0xA441, '\u{e5e7}'), + (0xA442, '\u{e5e8}'), + (0xA443, '\u{e5e9}'), + (0xA444, '\u{e5ea}'), + (0xA445, '\u{e5eb}'), + (0xA446, '\u{e5ec}'), + (0xA447, '\u{e5ed}'), + (0xA448, '\u{e5ee}'), + (0xA449, '\u{e5ef}'), + (0xA44A, '\u{e5f0}'), + (0xA44B, '\u{e5f1}'), + (0xA44C, '\u{e5f2}'), + (0xA44D, '\u{e5f3}'), + (0xA44E, '\u{e5f4}'), + (0xA44F, '\u{e5f5}'), + (0xA450, '\u{e5f6}'), + (0xA451, '\u{e5f7}'), + (0xA452, '\u{e5f8}'), + (0xA453, '\u{e5f9}'), + (0xA454, '\u{e5fa}'), + (0xA455, '\u{e5fb}'), + (0xA456, '\u{e5fc}'), + (0xA457, '\u{e5fd}'), + (0xA458, '\u{e5fe}'), + (0xA459, '\u{e5ff}'), + (0xA45A, '\u{e600}'), + (0xA45B, '\u{e601}'), + (0xA45C, '\u{e602}'), + (0xA45D, '\u{e603}'), + (0xA45E, '\u{e604}'), + (0xA45F, '\u{e605}'), + (0xA460, '\u{e606}'), + (0xA461, '\u{e607}'), + (0xA462, '\u{e608}'), + (0xA463, '\u{e609}'), + (0xA464, '\u{e60a}'), + (0xA465, '\u{e60b}'), + (0xA466, '\u{e60c}'), + (0xA467, '\u{e60d}'), + (0xA468, '\u{e60e}'), + (0xA469, '\u{e60f}'), + (0xA46A, '\u{e610}'), + (0xA46B, '\u{e611}'), + (0xA46C, '\u{e612}'), + (0xA46D, '\u{e613}'), + (0xA46E, '\u{e614}'), + (0xA46F, '\u{e615}'), + (0xA470, '\u{e616}'), + (0xA471, '\u{e617}'), + (0xA472, '\u{e618}'), + (0xA473, '\u{e619}'), + (0xA474, '\u{e61a}'), + (0xA475, '\u{e61b}'), + (0xA476, '\u{e61c}'), + (0xA477, '\u{e61d}'), + (0xA478, '\u{e61e}'), + (0xA479, '\u{e61f}'), + (0xA47A, '\u{e620}'), + (0xA47B, '\u{e621}'), + (0xA47C, '\u{e622}'), + (0xA47D, '\u{e623}'), + (0xA47E, '\u{e624}'), + (0xA480, '\u{e625}'), + (0xA481, '\u{e626}'), + (0xA482, '\u{e627}'), + (0xA483, '\u{e628}'), + (0xA484, '\u{e629}'), + (0xA485, '\u{e62a}'), + (0xA486, '\u{e62b}'), + (0xA487, '\u{e62c}'), + (0xA488, '\u{e62d}'), + (0xA489, '\u{e62e}'), + (0xA48A, '\u{e62f}'), + (0xA48B, '\u{e630}'), + (0xA48C, '\u{e631}'), + (0xA48D, '\u{e632}'), + (0xA48E, '\u{e633}'), + (0xA48F, '\u{e634}'), + (0xA490, '\u{e635}'), + (0xA491, '\u{e636}'), + (0xA492, '\u{e637}'), + (0xA493, '\u{e638}'), + (0xA494, '\u{e639}'), + (0xA495, '\u{e63a}'), + (0xA496, '\u{e63b}'), + (0xA497, '\u{e63c}'), + (0xA498, '\u{e63d}'), + (0xA499, '\u{e63e}'), + (0xA49A, '\u{e63f}'), + (0xA49B, '\u{e640}'), + (0xA49C, '\u{e641}'), + (0xA49D, '\u{e642}'), + (0xA49E, '\u{e643}'), + (0xA49F, '\u{e644}'), + (0xA4A0, '\u{e645}'), + (0xA4F4, '\u{e772}'), + (0xA4F5, '\u{e773}'), + (0xA4F6, '\u{e774}'), + (0xA4F7, '\u{e775}'), + (0xA4F8, '\u{e776}'), + (0xA4F9, '\u{e777}'), + (0xA4FA, '\u{e778}'), + (0xA4FB, '\u{e779}'), + (0xA4FC, '\u{e77a}'), + (0xA4FD, '\u{e77b}'), + (0xA4FE, '\u{e77c}'), + (0xA540, '\u{e646}'), + (0xA541, '\u{e647}'), + (0xA542, '\u{e648}'), + (0xA543, '\u{e649}'), + (0xA544, '\u{e64a}'), + (0xA545, '\u{e64b}'), + (0xA546, '\u{e64c}'), + (0xA547, '\u{e64d}'), + (0xA548, '\u{e64e}'), + (0xA549, '\u{e64f}'), + (0xA54A, '\u{e650}'), + (0xA54B, '\u{e651}'), + (0xA54C, '\u{e652}'), + (0xA54D, '\u{e653}'), + (0xA54E, '\u{e654}'), + (0xA54F, '\u{e655}'), + (0xA550, '\u{e656}'), + (0xA551, '\u{e657}'), + (0xA552, '\u{e658}'), + (0xA553, '\u{e659}'), + (0xA554, '\u{e65a}'), + (0xA555, '\u{e65b}'), + (0xA556, '\u{e65c}'), + (0xA557, '\u{e65d}'), + (0xA558, '\u{e65e}'), + (0xA559, '\u{e65f}'), + (0xA55A, '\u{e660}'), + (0xA55B, '\u{e661}'), + (0xA55C, '\u{e662}'), + (0xA55D, '\u{e663}'), + (0xA55E, '\u{e664}'), + (0xA55F, '\u{e665}'), + (0xA560, '\u{e666}'), + (0xA561, '\u{e667}'), + (0xA562, 
'\u{e668}'), + (0xA563, '\u{e669}'), + (0xA564, '\u{e66a}'), + (0xA565, '\u{e66b}'), + (0xA566, '\u{e66c}'), + (0xA567, '\u{e66d}'), + (0xA568, '\u{e66e}'), + (0xA569, '\u{e66f}'), + (0xA56A, '\u{e670}'), + (0xA56B, '\u{e671}'), + (0xA56C, '\u{e672}'), + (0xA56D, '\u{e673}'), + (0xA56E, '\u{e674}'), + (0xA56F, '\u{e675}'), + (0xA570, '\u{e676}'), + (0xA571, '\u{e677}'), + (0xA572, '\u{e678}'), + (0xA573, '\u{e679}'), + (0xA574, '\u{e67a}'), + (0xA575, '\u{e67b}'), + (0xA576, '\u{e67c}'), + (0xA577, '\u{e67d}'), + (0xA578, '\u{e67e}'), + (0xA579, '\u{e67f}'), + (0xA57A, '\u{e680}'), + (0xA57B, '\u{e681}'), + (0xA57C, '\u{e682}'), + (0xA57D, '\u{e683}'), + (0xA57E, '\u{e684}'), + (0xA580, '\u{e685}'), + (0xA581, '\u{e686}'), + (0xA582, '\u{e687}'), + (0xA583, '\u{e688}'), + (0xA584, '\u{e689}'), + (0xA585, '\u{e68a}'), + (0xA586, '\u{e68b}'), + (0xA587, '\u{e68c}'), + (0xA588, '\u{e68d}'), + (0xA589, '\u{e68e}'), + (0xA58A, '\u{e68f}'), + (0xA58B, '\u{e690}'), + (0xA58C, '\u{e691}'), + (0xA58D, '\u{e692}'), + (0xA58E, '\u{e693}'), + (0xA58F, '\u{e694}'), + (0xA590, '\u{e695}'), + (0xA591, '\u{e696}'), + (0xA592, '\u{e697}'), + (0xA593, '\u{e698}'), + (0xA594, '\u{e699}'), + (0xA595, '\u{e69a}'), + (0xA596, '\u{e69b}'), + (0xA597, '\u{e69c}'), + (0xA598, '\u{e69d}'), + (0xA599, '\u{e69e}'), + (0xA59A, '\u{e69f}'), + (0xA59B, '\u{e6a0}'), + (0xA59C, '\u{e6a1}'), + (0xA59D, '\u{e6a2}'), + (0xA59E, '\u{e6a3}'), + (0xA59F, '\u{e6a4}'), + (0xA5A0, '\u{e6a5}'), + (0xA5F7, '\u{e77d}'), + (0xA5F8, '\u{e77e}'), + (0xA5F9, '\u{e77f}'), + (0xA5FA, '\u{e780}'), + (0xA5FB, '\u{e781}'), + (0xA5FC, '\u{e782}'), + (0xA5FD, '\u{e783}'), + (0xA5FE, '\u{e784}'), + (0xA640, '\u{e6a6}'), + (0xA641, '\u{e6a7}'), + (0xA642, '\u{e6a8}'), + (0xA643, '\u{e6a9}'), + (0xA644, '\u{e6aa}'), + (0xA645, '\u{e6ab}'), + (0xA646, '\u{e6ac}'), + (0xA647, '\u{e6ad}'), + (0xA648, '\u{e6ae}'), + (0xA649, '\u{e6af}'), + (0xA64A, '\u{e6b0}'), + (0xA64B, '\u{e6b1}'), + (0xA64C, '\u{e6b2}'), + (0xA64D, '\u{e6b3}'), + (0xA64E, '\u{e6b4}'), + (0xA64F, '\u{e6b5}'), + (0xA650, '\u{e6b6}'), + (0xA651, '\u{e6b7}'), + (0xA652, '\u{e6b8}'), + (0xA653, '\u{e6b9}'), + (0xA654, '\u{e6ba}'), + (0xA655, '\u{e6bb}'), + (0xA656, '\u{e6bc}'), + (0xA657, '\u{e6bd}'), + (0xA658, '\u{e6be}'), + (0xA659, '\u{e6bf}'), + (0xA65A, '\u{e6c0}'), + (0xA65B, '\u{e6c1}'), + (0xA65C, '\u{e6c2}'), + (0xA65D, '\u{e6c3}'), + (0xA65E, '\u{e6c4}'), + (0xA65F, '\u{e6c5}'), + (0xA660, '\u{e6c6}'), + (0xA661, '\u{e6c7}'), + (0xA662, '\u{e6c8}'), + (0xA663, '\u{e6c9}'), + (0xA664, '\u{e6ca}'), + (0xA665, '\u{e6cb}'), + (0xA666, '\u{e6cc}'), + (0xA667, '\u{e6cd}'), + (0xA668, '\u{e6ce}'), + (0xA669, '\u{e6cf}'), + (0xA66A, '\u{e6d0}'), + (0xA66B, '\u{e6d1}'), + (0xA66C, '\u{e6d2}'), + (0xA66D, '\u{e6d3}'), + (0xA66E, '\u{e6d4}'), + (0xA66F, '\u{e6d5}'), + (0xA670, '\u{e6d6}'), + (0xA671, '\u{e6d7}'), + (0xA672, '\u{e6d8}'), + (0xA673, '\u{e6d9}'), + (0xA674, '\u{e6da}'), + (0xA675, '\u{e6db}'), + (0xA676, '\u{e6dc}'), + (0xA677, '\u{e6dd}'), + (0xA678, '\u{e6de}'), + (0xA679, '\u{e6df}'), + (0xA67A, '\u{e6e0}'), + (0xA67B, '\u{e6e1}'), + (0xA67C, '\u{e6e2}'), + (0xA67D, '\u{e6e3}'), + (0xA67E, '\u{e6e4}'), + (0xA680, '\u{e6e5}'), + (0xA681, '\u{e6e6}'), + (0xA682, '\u{e6e7}'), + (0xA683, '\u{e6e8}'), + (0xA684, '\u{e6e9}'), + (0xA685, '\u{e6ea}'), + (0xA686, '\u{e6eb}'), + (0xA687, '\u{e6ec}'), + (0xA688, '\u{e6ed}'), + (0xA689, '\u{e6ee}'), + (0xA68A, '\u{e6ef}'), + (0xA68B, '\u{e6f0}'), + (0xA68C, '\u{e6f1}'), + (0xA68D, '\u{e6f2}'), + (0xA68E, '\u{e6f3}'), + (0xA68F, 
'\u{e6f4}'), + (0xA690, '\u{e6f5}'), + (0xA691, '\u{e6f6}'), + (0xA692, '\u{e6f7}'), + (0xA693, '\u{e6f8}'), + (0xA694, '\u{e6f9}'), + (0xA695, '\u{e6fa}'), + (0xA696, '\u{e6fb}'), + (0xA697, '\u{e6fc}'), + (0xA698, '\u{e6fd}'), + (0xA699, '\u{e6fe}'), + (0xA69A, '\u{e6ff}'), + (0xA69B, '\u{e700}'), + (0xA69C, '\u{e701}'), + (0xA69D, '\u{e702}'), + (0xA69E, '\u{e703}'), + (0xA69F, '\u{e704}'), + (0xA6A0, '\u{e705}'), + (0xA6B9, '\u{e785}'), + (0xA6BA, '\u{e786}'), + (0xA6BB, '\u{e787}'), + (0xA6BC, '\u{e788}'), + (0xA6BD, '\u{e789}'), + (0xA6BE, '\u{e78a}'), + (0xA6BF, '\u{e78b}'), + (0xA6C0, '\u{e78c}'), + (0xA6D9, '\u{fe10}'), + (0xA6DA, '\u{fe12}'), + (0xA6DB, '\u{fe11}'), + (0xA6DC, '\u{fe13}'), + (0xA6DD, '\u{fe14}'), + (0xA6DE, '\u{fe15}'), + (0xA6DF, '\u{fe16}'), + (0xA6EC, '\u{fe17}'), + (0xA6ED, '\u{fe18}'), + (0xA6F3, '\u{fe19}'), + (0xA6F6, '\u{e797}'), + (0xA6F7, '\u{e798}'), + (0xA6F8, '\u{e799}'), + (0xA6F9, '\u{e79a}'), + (0xA6FA, '\u{e79b}'), + (0xA6FB, '\u{e79c}'), + (0xA6FC, '\u{e79d}'), + (0xA6FD, '\u{e79e}'), + (0xA6FE, '\u{e79f}'), + (0xA740, '\u{e706}'), + (0xA741, '\u{e707}'), + (0xA742, '\u{e708}'), + (0xA743, '\u{e709}'), + (0xA744, '\u{e70a}'), + (0xA745, '\u{e70b}'), + (0xA746, '\u{e70c}'), + (0xA747, '\u{e70d}'), + (0xA748, '\u{e70e}'), + (0xA749, '\u{e70f}'), + (0xA74A, '\u{e710}'), + (0xA74B, '\u{e711}'), + (0xA74C, '\u{e712}'), + (0xA74D, '\u{e713}'), + (0xA74E, '\u{e714}'), + (0xA74F, '\u{e715}'), + (0xA750, '\u{e716}'), + (0xA751, '\u{e717}'), + (0xA752, '\u{e718}'), + (0xA753, '\u{e719}'), + (0xA754, '\u{e71a}'), + (0xA755, '\u{e71b}'), + (0xA756, '\u{e71c}'), + (0xA757, '\u{e71d}'), + (0xA758, '\u{e71e}'), + (0xA759, '\u{e71f}'), + (0xA75A, '\u{e720}'), + (0xA75B, '\u{e721}'), + (0xA75C, '\u{e722}'), + (0xA75D, '\u{e723}'), + (0xA75E, '\u{e724}'), + (0xA75F, '\u{e725}'), + (0xA760, '\u{e726}'), + (0xA761, '\u{e727}'), + (0xA762, '\u{e728}'), + (0xA763, '\u{e729}'), + (0xA764, '\u{e72a}'), + (0xA765, '\u{e72b}'), + (0xA766, '\u{e72c}'), + (0xA767, '\u{e72d}'), + (0xA768, '\u{e72e}'), + (0xA769, '\u{e72f}'), + (0xA76A, '\u{e730}'), + (0xA76B, '\u{e731}'), + (0xA76C, '\u{e732}'), + (0xA76D, '\u{e733}'), + (0xA76E, '\u{e734}'), + (0xA76F, '\u{e735}'), + (0xA770, '\u{e736}'), + (0xA771, '\u{e737}'), + (0xA772, '\u{e738}'), + (0xA773, '\u{e739}'), + (0xA774, '\u{e73a}'), + (0xA775, '\u{e73b}'), + (0xA776, '\u{e73c}'), + (0xA777, '\u{e73d}'), + (0xA778, '\u{e73e}'), + (0xA779, '\u{e73f}'), + (0xA77A, '\u{e740}'), + (0xA77B, '\u{e741}'), + (0xA77C, '\u{e742}'), + (0xA77D, '\u{e743}'), + (0xA77E, '\u{e744}'), + (0xA780, '\u{e745}'), + (0xA781, '\u{e746}'), + (0xA782, '\u{e747}'), + (0xA783, '\u{e748}'), + (0xA784, '\u{e749}'), + (0xA785, '\u{e74a}'), + (0xA786, '\u{e74b}'), + (0xA787, '\u{e74c}'), + (0xA788, '\u{e74d}'), + (0xA789, '\u{e74e}'), + (0xA78A, '\u{e74f}'), + (0xA78B, '\u{e750}'), + (0xA78C, '\u{e751}'), + (0xA78D, '\u{e752}'), + (0xA78E, '\u{e753}'), + (0xA78F, '\u{e754}'), + (0xA790, '\u{e755}'), + (0xA791, '\u{e756}'), + (0xA792, '\u{e757}'), + (0xA793, '\u{e758}'), + (0xA794, '\u{e759}'), + (0xA795, '\u{e75a}'), + (0xA796, '\u{e75b}'), + (0xA797, '\u{e75c}'), + (0xA798, '\u{e75d}'), + (0xA799, '\u{e75e}'), + (0xA79A, '\u{e75f}'), + (0xA79B, '\u{e760}'), + (0xA79C, '\u{e761}'), + (0xA79D, '\u{e762}'), + (0xA79E, '\u{e763}'), + (0xA79F, '\u{e764}'), + (0xA7A0, '\u{e765}'), + (0xA7C2, '\u{e7a0}'), + (0xA7C3, '\u{e7a1}'), + (0xA7C4, '\u{e7a2}'), + (0xA7C5, '\u{e7a3}'), + (0xA7C6, '\u{e7a4}'), + (0xA7C7, '\u{e7a5}'), + (0xA7C8, '\u{e7a6}'), + (0xA7C9, 
'\u{e7a7}'), + (0xA7CA, '\u{e7a8}'), + (0xA7CB, '\u{e7a9}'), + (0xA7CC, '\u{e7aa}'), + (0xA7CD, '\u{e7ab}'), + (0xA7CE, '\u{e7ac}'), + (0xA7CF, '\u{e7ad}'), + (0xA7D0, '\u{e7ae}'), + (0xA7F2, '\u{e7af}'), + (0xA7F3, '\u{e7b0}'), + (0xA7F4, '\u{e7b1}'), + (0xA7F5, '\u{e7b2}'), + (0xA7F6, '\u{e7b3}'), + (0xA7F7, '\u{e7b4}'), + (0xA7F8, '\u{e7b5}'), + (0xA7F9, '\u{e7b6}'), + (0xA7FA, '\u{e7b7}'), + (0xA7FB, '\u{e7b8}'), + (0xA7FC, '\u{e7b9}'), + (0xA7FD, '\u{e7ba}'), + (0xA7FE, '\u{e7bb}'), + (0xA896, '\u{e7bc}'), + (0xA897, '\u{e7bd}'), + (0xA898, '\u{e7be}'), + (0xA899, '\u{e7bf}'), + (0xA89A, '\u{e7c0}'), + (0xA89B, '\u{e7c1}'), + (0xA89C, '\u{e7c2}'), + (0xA89D, '\u{e7c3}'), + (0xA89E, '\u{e7c4}'), + (0xA89F, '\u{e7c5}'), + (0xA8A0, '\u{e7c6}'), + (0xA8BC, '\u{1e3f}'), + (0xA8C1, '\u{e7c9}'), + (0xA8C2, '\u{e7ca}'), + (0xA8C3, '\u{e7cb}'), + (0xA8C4, '\u{e7cc}'), + (0xA8EA, '\u{e7cd}'), + (0xA8EB, '\u{e7ce}'), + (0xA8EC, '\u{e7cf}'), + (0xA8ED, '\u{e7d0}'), + (0xA8EE, '\u{e7d1}'), + (0xA8EF, '\u{e7d2}'), + (0xA8F0, '\u{e7d3}'), + (0xA8F1, '\u{e7d4}'), + (0xA8F2, '\u{e7d5}'), + (0xA8F3, '\u{e7d6}'), + (0xA8F4, '\u{e7d7}'), + (0xA8F5, '\u{e7d8}'), + (0xA8F6, '\u{e7d9}'), + (0xA8F7, '\u{e7da}'), + (0xA8F8, '\u{e7db}'), + (0xA8F9, '\u{e7dc}'), + (0xA8FA, '\u{e7dd}'), + (0xA8FB, '\u{e7de}'), + (0xA8FC, '\u{e7df}'), + (0xA8FD, '\u{e7e0}'), + (0xA8FE, '\u{e7e1}'), + (0xA958, '\u{e7e2}'), + (0xA95B, '\u{e7e3}'), + (0xA95D, '\u{e7e4}'), + (0xA95E, '\u{e7e5}'), + (0xA95F, '\u{e7e6}'), + (0xA997, '\u{e7f4}'), + (0xA998, '\u{e7f5}'), + (0xA999, '\u{e7f6}'), + (0xA99A, '\u{e7f7}'), + (0xA99B, '\u{e7f8}'), + (0xA99C, '\u{e7f9}'), + (0xA99D, '\u{e7fa}'), + (0xA99E, '\u{e7fb}'), + (0xA99F, '\u{e7fc}'), + (0xA9A0, '\u{e7fd}'), + (0xA9A1, '\u{e7fe}'), + (0xA9A2, '\u{e7ff}'), + (0xA9A3, '\u{e800}'), + (0xA9F0, '\u{e801}'), + (0xA9F1, '\u{e802}'), + (0xA9F2, '\u{e803}'), + (0xA9F3, '\u{e804}'), + (0xA9F4, '\u{e805}'), + (0xA9F5, '\u{e806}'), + (0xA9F6, '\u{e807}'), + (0xA9F7, '\u{e808}'), + (0xA9F8, '\u{e809}'), + (0xA9F9, '\u{e80a}'), + (0xA9FA, '\u{e80b}'), + (0xA9FB, '\u{e80c}'), + (0xA9FC, '\u{e80d}'), + (0xA9FD, '\u{e80e}'), + (0xA9FE, '\u{e80f}'), + (0xAAA1, '\u{e000}'), + (0xAAA2, '\u{e001}'), + (0xAAA3, '\u{e002}'), + (0xAAA4, '\u{e003}'), + (0xAAA5, '\u{e004}'), + (0xAAA6, '\u{e005}'), + (0xAAA7, '\u{e006}'), + (0xAAA8, '\u{e007}'), + (0xAAA9, '\u{e008}'), + (0xAAAA, '\u{e009}'), + (0xAAAB, '\u{e00a}'), + (0xAAAC, '\u{e00b}'), + (0xAAAD, '\u{e00c}'), + (0xAAAE, '\u{e00d}'), + (0xAAAF, '\u{e00e}'), + (0xAAB0, '\u{e00f}'), + (0xAAB1, '\u{e010}'), + (0xAAB2, '\u{e011}'), + (0xAAB3, '\u{e012}'), + (0xAAB4, '\u{e013}'), + (0xAAB5, '\u{e014}'), + (0xAAB6, '\u{e015}'), + (0xAAB7, '\u{e016}'), + (0xAAB8, '\u{e017}'), + (0xAAB9, '\u{e018}'), + (0xAABA, '\u{e019}'), + (0xAABB, '\u{e01a}'), + (0xAABC, '\u{e01b}'), + (0xAABD, '\u{e01c}'), + (0xAABE, '\u{e01d}'), + (0xAABF, '\u{e01e}'), + (0xAAC0, '\u{e01f}'), + (0xAAC1, '\u{e020}'), + (0xAAC2, '\u{e021}'), + (0xAAC3, '\u{e022}'), + (0xAAC4, '\u{e023}'), + (0xAAC5, '\u{e024}'), + (0xAAC6, '\u{e025}'), + (0xAAC7, '\u{e026}'), + (0xAAC8, '\u{e027}'), + (0xAAC9, '\u{e028}'), + (0xAACA, '\u{e029}'), + (0xAACB, '\u{e02a}'), + (0xAACC, '\u{e02b}'), + (0xAACD, '\u{e02c}'), + (0xAACE, '\u{e02d}'), + (0xAACF, '\u{e02e}'), + (0xAAD0, '\u{e02f}'), + (0xAAD1, '\u{e030}'), + (0xAAD2, '\u{e031}'), + (0xAAD3, '\u{e032}'), + (0xAAD4, '\u{e033}'), + (0xAAD5, '\u{e034}'), + (0xAAD6, '\u{e035}'), + (0xAAD7, '\u{e036}'), + (0xAAD8, '\u{e037}'), + (0xAAD9, '\u{e038}'), + (0xAADA, 
'\u{e039}'), + (0xAADB, '\u{e03a}'), + (0xAADC, '\u{e03b}'), + (0xAADD, '\u{e03c}'), + (0xAADE, '\u{e03d}'), + (0xAADF, '\u{e03e}'), + (0xAAE0, '\u{e03f}'), + (0xAAE1, '\u{e040}'), + (0xAAE2, '\u{e041}'), + (0xAAE3, '\u{e042}'), + (0xAAE4, '\u{e043}'), + (0xAAE5, '\u{e044}'), + (0xAAE6, '\u{e045}'), + (0xAAE7, '\u{e046}'), + (0xAAE8, '\u{e047}'), + (0xAAE9, '\u{e048}'), + (0xAAEA, '\u{e049}'), + (0xAAEB, '\u{e04a}'), + (0xAAEC, '\u{e04b}'), + (0xAAED, '\u{e04c}'), + (0xAAEE, '\u{e04d}'), + (0xAAEF, '\u{e04e}'), + (0xAAF0, '\u{e04f}'), + (0xAAF1, '\u{e050}'), + (0xAAF2, '\u{e051}'), + (0xAAF3, '\u{e052}'), + (0xAAF4, '\u{e053}'), + (0xAAF5, '\u{e054}'), + (0xAAF6, '\u{e055}'), + (0xAAF7, '\u{e056}'), + (0xAAF8, '\u{e057}'), + (0xAAF9, '\u{e058}'), + (0xAAFA, '\u{e059}'), + (0xAAFB, '\u{e05a}'), + (0xAAFC, '\u{e05b}'), + (0xAAFD, '\u{e05c}'), + (0xAAFE, '\u{e05d}'), + (0xABA1, '\u{e05e}'), + (0xABA2, '\u{e05f}'), + (0xABA3, '\u{e060}'), + (0xABA4, '\u{e061}'), + (0xABA5, '\u{e062}'), + (0xABA6, '\u{e063}'), + (0xABA7, '\u{e064}'), + (0xABA8, '\u{e065}'), + (0xABA9, '\u{e066}'), + (0xABAA, '\u{e067}'), + (0xABAB, '\u{e068}'), + (0xABAC, '\u{e069}'), + (0xABAD, '\u{e06a}'), + (0xABAE, '\u{e06b}'), + (0xABAF, '\u{e06c}'), + (0xABB0, '\u{e06d}'), + (0xABB1, '\u{e06e}'), + (0xABB2, '\u{e06f}'), + (0xABB3, '\u{e070}'), + (0xABB4, '\u{e071}'), + (0xABB5, '\u{e072}'), + (0xABB6, '\u{e073}'), + (0xABB7, '\u{e074}'), + (0xABB8, '\u{e075}'), + (0xABB9, '\u{e076}'), + (0xABBA, '\u{e077}'), + (0xABBB, '\u{e078}'), + (0xABBC, '\u{e079}'), + (0xABBD, '\u{e07a}'), + (0xABBE, '\u{e07b}'), + (0xABBF, '\u{e07c}'), + (0xABC0, '\u{e07d}'), + (0xABC1, '\u{e07e}'), + (0xABC2, '\u{e07f}'), + (0xABC3, '\u{e080}'), + (0xABC4, '\u{e081}'), + (0xABC5, '\u{e082}'), + (0xABC6, '\u{e083}'), + (0xABC7, '\u{e084}'), + (0xABC8, '\u{e085}'), + (0xABC9, '\u{e086}'), + (0xABCA, '\u{e087}'), + (0xABCB, '\u{e088}'), + (0xABCC, '\u{e089}'), + (0xABCD, '\u{e08a}'), + (0xABCE, '\u{e08b}'), + (0xABCF, '\u{e08c}'), + (0xABD0, '\u{e08d}'), + (0xABD1, '\u{e08e}'), + (0xABD2, '\u{e08f}'), + (0xABD3, '\u{e090}'), + (0xABD4, '\u{e091}'), + (0xABD5, '\u{e092}'), + (0xABD6, '\u{e093}'), + (0xABD7, '\u{e094}'), + (0xABD8, '\u{e095}'), + (0xABD9, '\u{e096}'), + (0xABDA, '\u{e097}'), + (0xABDB, '\u{e098}'), + (0xABDC, '\u{e099}'), + (0xABDD, '\u{e09a}'), + (0xABDE, '\u{e09b}'), + (0xABDF, '\u{e09c}'), + (0xABE0, '\u{e09d}'), + (0xABE1, '\u{e09e}'), + (0xABE2, '\u{e09f}'), + (0xABE3, '\u{e0a0}'), + (0xABE4, '\u{e0a1}'), + (0xABE5, '\u{e0a2}'), + (0xABE6, '\u{e0a3}'), + (0xABE7, '\u{e0a4}'), + (0xABE8, '\u{e0a5}'), + (0xABE9, '\u{e0a6}'), + (0xABEA, '\u{e0a7}'), + (0xABEB, '\u{e0a8}'), + (0xABEC, '\u{e0a9}'), + (0xABED, '\u{e0aa}'), + (0xABEE, '\u{e0ab}'), + (0xABEF, '\u{e0ac}'), + (0xABF0, '\u{e0ad}'), + (0xABF1, '\u{e0ae}'), + (0xABF2, '\u{e0af}'), + (0xABF3, '\u{e0b0}'), + (0xABF4, '\u{e0b1}'), + (0xABF5, '\u{e0b2}'), + (0xABF6, '\u{e0b3}'), + (0xABF7, '\u{e0b4}'), + (0xABF8, '\u{e0b5}'), + (0xABF9, '\u{e0b6}'), + (0xABFA, '\u{e0b7}'), + (0xABFB, '\u{e0b8}'), + (0xABFC, '\u{e0b9}'), + (0xABFD, '\u{e0ba}'), + (0xABFE, '\u{e0bb}'), + (0xACA1, '\u{e0bc}'), + (0xACA2, '\u{e0bd}'), + (0xACA3, '\u{e0be}'), + (0xACA4, '\u{e0bf}'), + (0xACA5, '\u{e0c0}'), + (0xACA6, '\u{e0c1}'), + (0xACA7, '\u{e0c2}'), + (0xACA8, '\u{e0c3}'), + (0xACA9, '\u{e0c4}'), + (0xACAA, '\u{e0c5}'), + (0xACAB, '\u{e0c6}'), + (0xACAC, '\u{e0c7}'), + (0xACAD, '\u{e0c8}'), + (0xACAE, '\u{e0c9}'), + (0xACAF, '\u{e0ca}'), + (0xACB0, '\u{e0cb}'), + (0xACB1, '\u{e0cc}'), + (0xACB2, 
'\u{e0cd}'), + (0xACB3, '\u{e0ce}'), + (0xACB4, '\u{e0cf}'), + (0xACB5, '\u{e0d0}'), + (0xACB6, '\u{e0d1}'), + (0xACB7, '\u{e0d2}'), + (0xACB8, '\u{e0d3}'), + (0xACB9, '\u{e0d4}'), + (0xACBA, '\u{e0d5}'), + (0xACBB, '\u{e0d6}'), + (0xACBC, '\u{e0d7}'), + (0xACBD, '\u{e0d8}'), + (0xACBE, '\u{e0d9}'), + (0xACBF, '\u{e0da}'), + (0xACC0, '\u{e0db}'), + (0xACC1, '\u{e0dc}'), + (0xACC2, '\u{e0dd}'), + (0xACC3, '\u{e0de}'), + (0xACC4, '\u{e0df}'), + (0xACC5, '\u{e0e0}'), + (0xACC6, '\u{e0e1}'), + (0xACC7, '\u{e0e2}'), + (0xACC8, '\u{e0e3}'), + (0xACC9, '\u{e0e4}'), + (0xACCA, '\u{e0e5}'), + (0xACCB, '\u{e0e6}'), + (0xACCC, '\u{e0e7}'), + (0xACCD, '\u{e0e8}'), + (0xACCE, '\u{e0e9}'), + (0xACCF, '\u{e0ea}'), + (0xACD0, '\u{e0eb}'), + (0xACD1, '\u{e0ec}'), + (0xACD2, '\u{e0ed}'), + (0xACD3, '\u{e0ee}'), + (0xACD4, '\u{e0ef}'), + (0xACD5, '\u{e0f0}'), + (0xACD6, '\u{e0f1}'), + (0xACD7, '\u{e0f2}'), + (0xACD8, '\u{e0f3}'), + (0xACD9, '\u{e0f4}'), + (0xACDA, '\u{e0f5}'), + (0xACDB, '\u{e0f6}'), + (0xACDC, '\u{e0f7}'), + (0xACDD, '\u{e0f8}'), + (0xACDE, '\u{e0f9}'), + (0xACDF, '\u{e0fa}'), + (0xACE0, '\u{e0fb}'), + (0xACE1, '\u{e0fc}'), + (0xACE2, '\u{e0fd}'), + (0xACE3, '\u{e0fe}'), + (0xACE4, '\u{e0ff}'), + (0xACE5, '\u{e100}'), + (0xACE6, '\u{e101}'), + (0xACE7, '\u{e102}'), + (0xACE8, '\u{e103}'), + (0xACE9, '\u{e104}'), + (0xACEA, '\u{e105}'), + (0xACEB, '\u{e106}'), + (0xACEC, '\u{e107}'), + (0xACED, '\u{e108}'), + (0xACEE, '\u{e109}'), + (0xACEF, '\u{e10a}'), + (0xACF0, '\u{e10b}'), + (0xACF1, '\u{e10c}'), + (0xACF2, '\u{e10d}'), + (0xACF3, '\u{e10e}'), + (0xACF4, '\u{e10f}'), + (0xACF5, '\u{e110}'), + (0xACF6, '\u{e111}'), + (0xACF7, '\u{e112}'), + (0xACF8, '\u{e113}'), + (0xACF9, '\u{e114}'), + (0xACFA, '\u{e115}'), + (0xACFB, '\u{e116}'), + (0xACFC, '\u{e117}'), + (0xACFD, '\u{e118}'), + (0xACFE, '\u{e119}'), + (0xADA1, '\u{e11a}'), + (0xADA2, '\u{e11b}'), + (0xADA3, '\u{e11c}'), + (0xADA4, '\u{e11d}'), + (0xADA5, '\u{e11e}'), + (0xADA6, '\u{e11f}'), + (0xADA7, '\u{e120}'), + (0xADA8, '\u{e121}'), + (0xADA9, '\u{e122}'), + (0xADAA, '\u{e123}'), + (0xADAB, '\u{e124}'), + (0xADAC, '\u{e125}'), + (0xADAD, '\u{e126}'), + (0xADAE, '\u{e127}'), + (0xADAF, '\u{e128}'), + (0xADB0, '\u{e129}'), + (0xADB1, '\u{e12a}'), + (0xADB2, '\u{e12b}'), + (0xADB3, '\u{e12c}'), + (0xADB4, '\u{e12d}'), + (0xADB5, '\u{e12e}'), + (0xADB6, '\u{e12f}'), + (0xADB7, '\u{e130}'), + (0xADB8, '\u{e131}'), + (0xADB9, '\u{e132}'), + (0xADBA, '\u{e133}'), + (0xADBB, '\u{e134}'), + (0xADBC, '\u{e135}'), + (0xADBD, '\u{e136}'), + (0xADBE, '\u{e137}'), + (0xADBF, '\u{e138}'), + (0xADC0, '\u{e139}'), + (0xADC1, '\u{e13a}'), + (0xADC2, '\u{e13b}'), + (0xADC3, '\u{e13c}'), + (0xADC4, '\u{e13d}'), + (0xADC5, '\u{e13e}'), + (0xADC6, '\u{e13f}'), + (0xADC7, '\u{e140}'), + (0xADC8, '\u{e141}'), + (0xADC9, '\u{e142}'), + (0xADCA, '\u{e143}'), + (0xADCB, '\u{e144}'), + (0xADCC, '\u{e145}'), + (0xADCD, '\u{e146}'), + (0xADCE, '\u{e147}'), + (0xADCF, '\u{e148}'), + (0xADD0, '\u{e149}'), + (0xADD1, '\u{e14a}'), + (0xADD2, '\u{e14b}'), + (0xADD3, '\u{e14c}'), + (0xADD4, '\u{e14d}'), + (0xADD5, '\u{e14e}'), + (0xADD6, '\u{e14f}'), + (0xADD7, '\u{e150}'), + (0xADD8, '\u{e151}'), + (0xADD9, '\u{e152}'), + (0xADDA, '\u{e153}'), + (0xADDB, '\u{e154}'), + (0xADDC, '\u{e155}'), + (0xADDD, '\u{e156}'), + (0xADDE, '\u{e157}'), + (0xADDF, '\u{e158}'), + (0xADE0, '\u{e159}'), + (0xADE1, '\u{e15a}'), + (0xADE2, '\u{e15b}'), + (0xADE3, '\u{e15c}'), + (0xADE4, '\u{e15d}'), + (0xADE5, '\u{e15e}'), + (0xADE6, '\u{e15f}'), + (0xADE7, '\u{e160}'), + (0xADE8, 
'\u{e161}'), + (0xADE9, '\u{e162}'), + (0xADEA, '\u{e163}'), + (0xADEB, '\u{e164}'), + (0xADEC, '\u{e165}'), + (0xADED, '\u{e166}'), + (0xADEE, '\u{e167}'), + (0xADEF, '\u{e168}'), + (0xADF0, '\u{e169}'), + (0xADF1, '\u{e16a}'), + (0xADF2, '\u{e16b}'), + (0xADF3, '\u{e16c}'), + (0xADF4, '\u{e16d}'), + (0xADF5, '\u{e16e}'), + (0xADF6, '\u{e16f}'), + (0xADF7, '\u{e170}'), + (0xADF8, '\u{e171}'), + (0xADF9, '\u{e172}'), + (0xADFA, '\u{e173}'), + (0xADFB, '\u{e174}'), + (0xADFC, '\u{e175}'), + (0xADFD, '\u{e176}'), + (0xADFE, '\u{e177}'), + (0xAEA1, '\u{e178}'), + (0xAEA2, '\u{e179}'), + (0xAEA3, '\u{e17a}'), + (0xAEA4, '\u{e17b}'), + (0xAEA5, '\u{e17c}'), + (0xAEA6, '\u{e17d}'), + (0xAEA7, '\u{e17e}'), + (0xAEA8, '\u{e17f}'), + (0xAEA9, '\u{e180}'), + (0xAEAA, '\u{e181}'), + (0xAEAB, '\u{e182}'), + (0xAEAC, '\u{e183}'), + (0xAEAD, '\u{e184}'), + (0xAEAE, '\u{e185}'), + (0xAEAF, '\u{e186}'), + (0xAEB0, '\u{e187}'), + (0xAEB1, '\u{e188}'), + (0xAEB2, '\u{e189}'), + (0xAEB3, '\u{e18a}'), + (0xAEB4, '\u{e18b}'), + (0xAEB5, '\u{e18c}'), + (0xAEB6, '\u{e18d}'), + (0xAEB7, '\u{e18e}'), + (0xAEB8, '\u{e18f}'), + (0xAEB9, '\u{e190}'), + (0xAEBA, '\u{e191}'), + (0xAEBB, '\u{e192}'), + (0xAEBC, '\u{e193}'), + (0xAEBD, '\u{e194}'), + (0xAEBE, '\u{e195}'), + (0xAEBF, '\u{e196}'), + (0xAEC0, '\u{e197}'), + (0xAEC1, '\u{e198}'), + (0xAEC2, '\u{e199}'), + (0xAEC3, '\u{e19a}'), + (0xAEC4, '\u{e19b}'), + (0xAEC5, '\u{e19c}'), + (0xAEC6, '\u{e19d}'), + (0xAEC7, '\u{e19e}'), + (0xAEC8, '\u{e19f}'), + (0xAEC9, '\u{e1a0}'), + (0xAECA, '\u{e1a1}'), + (0xAECB, '\u{e1a2}'), + (0xAECC, '\u{e1a3}'), + (0xAECD, '\u{e1a4}'), + (0xAECE, '\u{e1a5}'), + (0xAECF, '\u{e1a6}'), + (0xAED0, '\u{e1a7}'), + (0xAED1, '\u{e1a8}'), + (0xAED2, '\u{e1a9}'), + (0xAED3, '\u{e1aa}'), + (0xAED4, '\u{e1ab}'), + (0xAED5, '\u{e1ac}'), + (0xAED6, '\u{e1ad}'), + (0xAED7, '\u{e1ae}'), + (0xAED8, '\u{e1af}'), + (0xAED9, '\u{e1b0}'), + (0xAEDA, '\u{e1b1}'), + (0xAEDB, '\u{e1b2}'), + (0xAEDC, '\u{e1b3}'), + (0xAEDD, '\u{e1b4}'), + (0xAEDE, '\u{e1b5}'), + (0xAEDF, '\u{e1b6}'), + (0xAEE0, '\u{e1b7}'), + (0xAEE1, '\u{e1b8}'), + (0xAEE2, '\u{e1b9}'), + (0xAEE3, '\u{e1ba}'), + (0xAEE4, '\u{e1bb}'), + (0xAEE5, '\u{e1bc}'), + (0xAEE6, '\u{e1bd}'), + (0xAEE7, '\u{e1be}'), + (0xAEE8, '\u{e1bf}'), + (0xAEE9, '\u{e1c0}'), + (0xAEEA, '\u{e1c1}'), + (0xAEEB, '\u{e1c2}'), + (0xAEEC, '\u{e1c3}'), + (0xAEED, '\u{e1c4}'), + (0xAEEE, '\u{e1c5}'), + (0xAEEF, '\u{e1c6}'), + (0xAEF0, '\u{e1c7}'), + (0xAEF1, '\u{e1c8}'), + (0xAEF2, '\u{e1c9}'), + (0xAEF3, '\u{e1ca}'), + (0xAEF4, '\u{e1cb}'), + (0xAEF5, '\u{e1cc}'), + (0xAEF6, '\u{e1cd}'), + (0xAEF7, '\u{e1ce}'), + (0xAEF8, '\u{e1cf}'), + (0xAEF9, '\u{e1d0}'), + (0xAEFA, '\u{e1d1}'), + (0xAEFB, '\u{e1d2}'), + (0xAEFC, '\u{e1d3}'), + (0xAEFD, '\u{e1d4}'), + (0xAEFE, '\u{e1d5}'), + (0xAFA1, '\u{e1d6}'), + (0xAFA2, '\u{e1d7}'), + (0xAFA3, '\u{e1d8}'), + (0xAFA4, '\u{e1d9}'), + (0xAFA5, '\u{e1da}'), + (0xAFA6, '\u{e1db}'), + (0xAFA7, '\u{e1dc}'), + (0xAFA8, '\u{e1dd}'), + (0xAFA9, '\u{e1de}'), + (0xAFAA, '\u{e1df}'), + (0xAFAB, '\u{e1e0}'), + (0xAFAC, '\u{e1e1}'), + (0xAFAD, '\u{e1e2}'), + (0xAFAE, '\u{e1e3}'), + (0xAFAF, '\u{e1e4}'), + (0xAFB0, '\u{e1e5}'), + (0xAFB1, '\u{e1e6}'), + (0xAFB2, '\u{e1e7}'), + (0xAFB3, '\u{e1e8}'), + (0xAFB4, '\u{e1e9}'), + (0xAFB5, '\u{e1ea}'), + (0xAFB6, '\u{e1eb}'), + (0xAFB7, '\u{e1ec}'), + (0xAFB8, '\u{e1ed}'), + (0xAFB9, '\u{e1ee}'), + (0xAFBA, '\u{e1ef}'), + (0xAFBB, '\u{e1f0}'), + (0xAFBC, '\u{e1f1}'), + (0xAFBD, '\u{e1f2}'), + (0xAFBE, '\u{e1f3}'), + (0xAFBF, '\u{e1f4}'), + (0xAFC0, 
'\u{e1f5}'), + (0xAFC1, '\u{e1f6}'), + (0xAFC2, '\u{e1f7}'), + (0xAFC3, '\u{e1f8}'), + (0xAFC4, '\u{e1f9}'), + (0xAFC5, '\u{e1fa}'), + (0xAFC6, '\u{e1fb}'), + (0xAFC7, '\u{e1fc}'), + (0xAFC8, '\u{e1fd}'), + (0xAFC9, '\u{e1fe}'), + (0xAFCA, '\u{e1ff}'), + (0xAFCB, '\u{e200}'), + (0xAFCC, '\u{e201}'), + (0xAFCD, '\u{e202}'), + (0xAFCE, '\u{e203}'), + (0xAFCF, '\u{e204}'), + (0xAFD0, '\u{e205}'), + (0xAFD1, '\u{e206}'), + (0xAFD2, '\u{e207}'), + (0xAFD3, '\u{e208}'), + (0xAFD4, '\u{e209}'), + (0xAFD5, '\u{e20a}'), + (0xAFD6, '\u{e20b}'), + (0xAFD7, '\u{e20c}'), + (0xAFD8, '\u{e20d}'), + (0xAFD9, '\u{e20e}'), + (0xAFDA, '\u{e20f}'), + (0xAFDB, '\u{e210}'), + (0xAFDC, '\u{e211}'), + (0xAFDD, '\u{e212}'), + (0xAFDE, '\u{e213}'), + (0xAFDF, '\u{e214}'), + (0xAFE0, '\u{e215}'), + (0xAFE1, '\u{e216}'), + (0xAFE2, '\u{e217}'), + (0xAFE3, '\u{e218}'), + (0xAFE4, '\u{e219}'), + (0xAFE5, '\u{e21a}'), + (0xAFE6, '\u{e21b}'), + (0xAFE7, '\u{e21c}'), + (0xAFE8, '\u{e21d}'), + (0xAFE9, '\u{e21e}'), + (0xAFEA, '\u{e21f}'), + (0xAFEB, '\u{e220}'), + (0xAFEC, '\u{e221}'), + (0xAFED, '\u{e222}'), + (0xAFEE, '\u{e223}'), + (0xAFEF, '\u{e224}'), + (0xAFF0, '\u{e225}'), + (0xAFF1, '\u{e226}'), + (0xAFF2, '\u{e227}'), + (0xAFF3, '\u{e228}'), + (0xAFF4, '\u{e229}'), + (0xAFF5, '\u{e22a}'), + (0xAFF6, '\u{e22b}'), + (0xAFF7, '\u{e22c}'), + (0xAFF8, '\u{e22d}'), + (0xAFF9, '\u{e22e}'), + (0xAFFA, '\u{e22f}'), + (0xAFFB, '\u{e230}'), + (0xAFFC, '\u{e231}'), + (0xAFFD, '\u{e232}'), + (0xAFFE, '\u{e233}'), + (0xD7FA, '\u{e810}'), + (0xD7FB, '\u{e811}'), + (0xD7FC, '\u{e812}'), + (0xD7FD, '\u{e813}'), + (0xD7FE, '\u{e814}'), + (0xF8A1, '\u{e234}'), + (0xF8A2, '\u{e235}'), + (0xF8A3, '\u{e236}'), + (0xF8A4, '\u{e237}'), + (0xF8A5, '\u{e238}'), + (0xF8A6, '\u{e239}'), + (0xF8A7, '\u{e23a}'), + (0xF8A8, '\u{e23b}'), + (0xF8A9, '\u{e23c}'), + (0xF8AA, '\u{e23d}'), + (0xF8AB, '\u{e23e}'), + (0xF8AC, '\u{e23f}'), + (0xF8AD, '\u{e240}'), + (0xF8AE, '\u{e241}'), + (0xF8AF, '\u{e242}'), + (0xF8B0, '\u{e243}'), + (0xF8B1, '\u{e244}'), + (0xF8B2, '\u{e245}'), + (0xF8B3, '\u{e246}'), + (0xF8B4, '\u{e247}'), + (0xF8B5, '\u{e248}'), + (0xF8B6, '\u{e249}'), + (0xF8B7, '\u{e24a}'), + (0xF8B8, '\u{e24b}'), + (0xF8B9, '\u{e24c}'), + (0xF8BA, '\u{e24d}'), + (0xF8BB, '\u{e24e}'), + (0xF8BC, '\u{e24f}'), + (0xF8BD, '\u{e250}'), + (0xF8BE, '\u{e251}'), + (0xF8BF, '\u{e252}'), + (0xF8C0, '\u{e253}'), + (0xF8C1, '\u{e254}'), + (0xF8C2, '\u{e255}'), + (0xF8C3, '\u{e256}'), + (0xF8C4, '\u{e257}'), + (0xF8C5, '\u{e258}'), + (0xF8C6, '\u{e259}'), + (0xF8C7, '\u{e25a}'), + (0xF8C8, '\u{e25b}'), + (0xF8C9, '\u{e25c}'), + (0xF8CA, '\u{e25d}'), + (0xF8CB, '\u{e25e}'), + (0xF8CC, '\u{e25f}'), + (0xF8CD, '\u{e260}'), + (0xF8CE, '\u{e261}'), + (0xF8CF, '\u{e262}'), + (0xF8D0, '\u{e263}'), + (0xF8D1, '\u{e264}'), + (0xF8D2, '\u{e265}'), + (0xF8D3, '\u{e266}'), + (0xF8D4, '\u{e267}'), + (0xF8D5, '\u{e268}'), + (0xF8D6, '\u{e269}'), + (0xF8D7, '\u{e26a}'), + (0xF8D8, '\u{e26b}'), + (0xF8D9, '\u{e26c}'), + (0xF8DA, '\u{e26d}'), + (0xF8DB, '\u{e26e}'), + (0xF8DC, '\u{e26f}'), + (0xF8DD, '\u{e270}'), + (0xF8DE, '\u{e271}'), + (0xF8DF, '\u{e272}'), + (0xF8E0, '\u{e273}'), + (0xF8E1, '\u{e274}'), + (0xF8E2, '\u{e275}'), + (0xF8E3, '\u{e276}'), + (0xF8E4, '\u{e277}'), + (0xF8E5, '\u{e278}'), + (0xF8E6, '\u{e279}'), + (0xF8E7, '\u{e27a}'), + (0xF8E8, '\u{e27b}'), + (0xF8E9, '\u{e27c}'), + (0xF8EA, '\u{e27d}'), + (0xF8EB, '\u{e27e}'), + (0xF8EC, '\u{e27f}'), + (0xF8ED, '\u{e280}'), + (0xF8EE, '\u{e281}'), + (0xF8EF, '\u{e282}'), + (0xF8F0, '\u{e283}'), + (0xF8F1, 
'\u{e284}'), + (0xF8F2, '\u{e285}'), + (0xF8F3, '\u{e286}'), + (0xF8F4, '\u{e287}'), + (0xF8F5, '\u{e288}'), + (0xF8F6, '\u{e289}'), + (0xF8F7, '\u{e28a}'), + (0xF8F8, '\u{e28b}'), + (0xF8F9, '\u{e28c}'), + (0xF8FA, '\u{e28d}'), + (0xF8FB, '\u{e28e}'), + (0xF8FC, '\u{e28f}'), + (0xF8FD, '\u{e290}'), + (0xF8FE, '\u{e291}'), + (0xF9A1, '\u{e292}'), + (0xF9A2, '\u{e293}'), + (0xF9A3, '\u{e294}'), + (0xF9A4, '\u{e295}'), + (0xF9A5, '\u{e296}'), + (0xF9A6, '\u{e297}'), + (0xF9A7, '\u{e298}'), + (0xF9A8, '\u{e299}'), + (0xF9A9, '\u{e29a}'), + (0xF9AA, '\u{e29b}'), + (0xF9AB, '\u{e29c}'), + (0xF9AC, '\u{e29d}'), + (0xF9AD, '\u{e29e}'), + (0xF9AE, '\u{e29f}'), + (0xF9AF, '\u{e2a0}'), + (0xF9B0, '\u{e2a1}'), + (0xF9B1, '\u{e2a2}'), + (0xF9B2, '\u{e2a3}'), + (0xF9B3, '\u{e2a4}'), + (0xF9B4, '\u{e2a5}'), + (0xF9B5, '\u{e2a6}'), + (0xF9B6, '\u{e2a7}'), + (0xF9B7, '\u{e2a8}'), + (0xF9B8, '\u{e2a9}'), + (0xF9B9, '\u{e2aa}'), + (0xF9BA, '\u{e2ab}'), + (0xF9BB, '\u{e2ac}'), + (0xF9BC, '\u{e2ad}'), + (0xF9BD, '\u{e2ae}'), + (0xF9BE, '\u{e2af}'), + (0xF9BF, '\u{e2b0}'), + (0xF9C0, '\u{e2b1}'), + (0xF9C1, '\u{e2b2}'), + (0xF9C2, '\u{e2b3}'), + (0xF9C3, '\u{e2b4}'), + (0xF9C4, '\u{e2b5}'), + (0xF9C5, '\u{e2b6}'), + (0xF9C6, '\u{e2b7}'), + (0xF9C7, '\u{e2b8}'), + (0xF9C8, '\u{e2b9}'), + (0xF9C9, '\u{e2ba}'), + (0xF9CA, '\u{e2bb}'), + (0xF9CB, '\u{e2bc}'), + (0xF9CC, '\u{e2bd}'), + (0xF9CD, '\u{e2be}'), + (0xF9CE, '\u{e2bf}'), + (0xF9CF, '\u{e2c0}'), + (0xF9D0, '\u{e2c1}'), + (0xF9D1, '\u{e2c2}'), + (0xF9D2, '\u{e2c3}'), + (0xF9D3, '\u{e2c4}'), + (0xF9D4, '\u{e2c5}'), + (0xF9D5, '\u{e2c6}'), + (0xF9D6, '\u{e2c7}'), + (0xF9D7, '\u{e2c8}'), + (0xF9D8, '\u{e2c9}'), + (0xF9D9, '\u{e2ca}'), + (0xF9DA, '\u{e2cb}'), + (0xF9DB, '\u{e2cc}'), + (0xF9DC, '\u{e2cd}'), + (0xF9DD, '\u{e2ce}'), + (0xF9DE, '\u{e2cf}'), + (0xF9DF, '\u{e2d0}'), + (0xF9E0, '\u{e2d1}'), + (0xF9E1, '\u{e2d2}'), + (0xF9E2, '\u{e2d3}'), + (0xF9E3, '\u{e2d4}'), + (0xF9E4, '\u{e2d5}'), + (0xF9E5, '\u{e2d6}'), + (0xF9E6, '\u{e2d7}'), + (0xF9E7, '\u{e2d8}'), + (0xF9E8, '\u{e2d9}'), + (0xF9E9, '\u{e2da}'), + (0xF9EA, '\u{e2db}'), + (0xF9EB, '\u{e2dc}'), + (0xF9EC, '\u{e2dd}'), + (0xF9ED, '\u{e2de}'), + (0xF9EE, '\u{e2df}'), + (0xF9EF, '\u{e2e0}'), + (0xF9F0, '\u{e2e1}'), + (0xF9F1, '\u{e2e2}'), + (0xF9F2, '\u{e2e3}'), + (0xF9F3, '\u{e2e4}'), + (0xF9F4, '\u{e2e5}'), + (0xF9F5, '\u{e2e6}'), + (0xF9F6, '\u{e2e7}'), + (0xF9F7, '\u{e2e8}'), + (0xF9F8, '\u{e2e9}'), + (0xF9F9, '\u{e2ea}'), + (0xF9FA, '\u{e2eb}'), + (0xF9FB, '\u{e2ec}'), + (0xF9FC, '\u{e2ed}'), + (0xF9FD, '\u{e2ee}'), + (0xF9FE, '\u{e2ef}'), + (0xFAA1, '\u{e2f0}'), + (0xFAA2, '\u{e2f1}'), + (0xFAA3, '\u{e2f2}'), + (0xFAA4, '\u{e2f3}'), + (0xFAA5, '\u{e2f4}'), + (0xFAA6, '\u{e2f5}'), + (0xFAA7, '\u{e2f6}'), + (0xFAA8, '\u{e2f7}'), + (0xFAA9, '\u{e2f8}'), + (0xFAAA, '\u{e2f9}'), + (0xFAAB, '\u{e2fa}'), + (0xFAAC, '\u{e2fb}'), + (0xFAAD, '\u{e2fc}'), + (0xFAAE, '\u{e2fd}'), + (0xFAAF, '\u{e2fe}'), + (0xFAB0, '\u{e2ff}'), + (0xFAB1, '\u{e300}'), + (0xFAB2, '\u{e301}'), + (0xFAB3, '\u{e302}'), + (0xFAB4, '\u{e303}'), + (0xFAB5, '\u{e304}'), + (0xFAB6, '\u{e305}'), + (0xFAB7, '\u{e306}'), + (0xFAB8, '\u{e307}'), + (0xFAB9, '\u{e308}'), + (0xFABA, '\u{e309}'), + (0xFABB, '\u{e30a}'), + (0xFABC, '\u{e30b}'), + (0xFABD, '\u{e30c}'), + (0xFABE, '\u{e30d}'), + (0xFABF, '\u{e30e}'), + (0xFAC0, '\u{e30f}'), + (0xFAC1, '\u{e310}'), + (0xFAC2, '\u{e311}'), + (0xFAC3, '\u{e312}'), + (0xFAC4, '\u{e313}'), + (0xFAC5, '\u{e314}'), + (0xFAC6, '\u{e315}'), + (0xFAC7, '\u{e316}'), + (0xFAC8, '\u{e317}'), + (0xFAC9, 
'\u{e318}'), + (0xFACA, '\u{e319}'), + (0xFACB, '\u{e31a}'), + (0xFACC, '\u{e31b}'), + (0xFACD, '\u{e31c}'), + (0xFACE, '\u{e31d}'), + (0xFACF, '\u{e31e}'), + (0xFAD0, '\u{e31f}'), + (0xFAD1, '\u{e320}'), + (0xFAD2, '\u{e321}'), + (0xFAD3, '\u{e322}'), + (0xFAD4, '\u{e323}'), + (0xFAD5, '\u{e324}'), + (0xFAD6, '\u{e325}'), + (0xFAD7, '\u{e326}'), + (0xFAD8, '\u{e327}'), + (0xFAD9, '\u{e328}'), + (0xFADA, '\u{e329}'), + (0xFADB, '\u{e32a}'), + (0xFADC, '\u{e32b}'), + (0xFADD, '\u{e32c}'), + (0xFADE, '\u{e32d}'), + (0xFADF, '\u{e32e}'), + (0xFAE0, '\u{e32f}'), + (0xFAE1, '\u{e330}'), + (0xFAE2, '\u{e331}'), + (0xFAE3, '\u{e332}'), + (0xFAE4, '\u{e333}'), + (0xFAE5, '\u{e334}'), + (0xFAE6, '\u{e335}'), + (0xFAE7, '\u{e336}'), + (0xFAE8, '\u{e337}'), + (0xFAE9, '\u{e338}'), + (0xFAEA, '\u{e339}'), + (0xFAEB, '\u{e33a}'), + (0xFAEC, '\u{e33b}'), + (0xFAED, '\u{e33c}'), + (0xFAEE, '\u{e33d}'), + (0xFAEF, '\u{e33e}'), + (0xFAF0, '\u{e33f}'), + (0xFAF1, '\u{e340}'), + (0xFAF2, '\u{e341}'), + (0xFAF3, '\u{e342}'), + (0xFAF4, '\u{e343}'), + (0xFAF5, '\u{e344}'), + (0xFAF6, '\u{e345}'), + (0xFAF7, '\u{e346}'), + (0xFAF8, '\u{e347}'), + (0xFAF9, '\u{e348}'), + (0xFAFA, '\u{e349}'), + (0xFAFB, '\u{e34a}'), + (0xFAFC, '\u{e34b}'), + (0xFAFD, '\u{e34c}'), + (0xFAFE, '\u{e34d}'), + (0xFBA1, '\u{e34e}'), + (0xFBA2, '\u{e34f}'), + (0xFBA3, '\u{e350}'), + (0xFBA4, '\u{e351}'), + (0xFBA5, '\u{e352}'), + (0xFBA6, '\u{e353}'), + (0xFBA7, '\u{e354}'), + (0xFBA8, '\u{e355}'), + (0xFBA9, '\u{e356}'), + (0xFBAA, '\u{e357}'), + (0xFBAB, '\u{e358}'), + (0xFBAC, '\u{e359}'), + (0xFBAD, '\u{e35a}'), + (0xFBAE, '\u{e35b}'), + (0xFBAF, '\u{e35c}'), + (0xFBB0, '\u{e35d}'), + (0xFBB1, '\u{e35e}'), + (0xFBB2, '\u{e35f}'), + (0xFBB3, '\u{e360}'), + (0xFBB4, '\u{e361}'), + (0xFBB5, '\u{e362}'), + (0xFBB6, '\u{e363}'), + (0xFBB7, '\u{e364}'), + (0xFBB8, '\u{e365}'), + (0xFBB9, '\u{e366}'), + (0xFBBA, '\u{e367}'), + (0xFBBB, '\u{e368}'), + (0xFBBC, '\u{e369}'), + (0xFBBD, '\u{e36a}'), + (0xFBBE, '\u{e36b}'), + (0xFBBF, '\u{e36c}'), + (0xFBC0, '\u{e36d}'), + (0xFBC1, '\u{e36e}'), + (0xFBC2, '\u{e36f}'), + (0xFBC3, '\u{e370}'), + (0xFBC4, '\u{e371}'), + (0xFBC5, '\u{e372}'), + (0xFBC6, '\u{e373}'), + (0xFBC7, '\u{e374}'), + (0xFBC8, '\u{e375}'), + (0xFBC9, '\u{e376}'), + (0xFBCA, '\u{e377}'), + (0xFBCB, '\u{e378}'), + (0xFBCC, '\u{e379}'), + (0xFBCD, '\u{e37a}'), + (0xFBCE, '\u{e37b}'), + (0xFBCF, '\u{e37c}'), + (0xFBD0, '\u{e37d}'), + (0xFBD1, '\u{e37e}'), + (0xFBD2, '\u{e37f}'), + (0xFBD3, '\u{e380}'), + (0xFBD4, '\u{e381}'), + (0xFBD5, '\u{e382}'), + (0xFBD6, '\u{e383}'), + (0xFBD7, '\u{e384}'), + (0xFBD8, '\u{e385}'), + (0xFBD9, '\u{e386}'), + (0xFBDA, '\u{e387}'), + (0xFBDB, '\u{e388}'), + (0xFBDC, '\u{e389}'), + (0xFBDD, '\u{e38a}'), + (0xFBDE, '\u{e38b}'), + (0xFBDF, '\u{e38c}'), + (0xFBE0, '\u{e38d}'), + (0xFBE1, '\u{e38e}'), + (0xFBE2, '\u{e38f}'), + (0xFBE3, '\u{e390}'), + (0xFBE4, '\u{e391}'), + (0xFBE5, '\u{e392}'), + (0xFBE6, '\u{e393}'), + (0xFBE7, '\u{e394}'), + (0xFBE8, '\u{e395}'), + (0xFBE9, '\u{e396}'), + (0xFBEA, '\u{e397}'), + (0xFBEB, '\u{e398}'), + (0xFBEC, '\u{e399}'), + (0xFBED, '\u{e39a}'), + (0xFBEE, '\u{e39b}'), + (0xFBEF, '\u{e39c}'), + (0xFBF0, '\u{e39d}'), + (0xFBF1, '\u{e39e}'), + (0xFBF2, '\u{e39f}'), + (0xFBF3, '\u{e3a0}'), + (0xFBF4, '\u{e3a1}'), + (0xFBF5, '\u{e3a2}'), + (0xFBF6, '\u{e3a3}'), + (0xFBF7, '\u{e3a4}'), + (0xFBF8, '\u{e3a5}'), + (0xFBF9, '\u{e3a6}'), + (0xFBFA, '\u{e3a7}'), + (0xFBFB, '\u{e3a8}'), + (0xFBFC, '\u{e3a9}'), + (0xFBFD, '\u{e3aa}'), + (0xFBFE, '\u{e3ab}'), + (0xFCA1, 
'\u{e3ac}'), + (0xFCA2, '\u{e3ad}'), + (0xFCA3, '\u{e3ae}'), + (0xFCA4, '\u{e3af}'), + (0xFCA5, '\u{e3b0}'), + (0xFCA6, '\u{e3b1}'), + (0xFCA7, '\u{e3b2}'), + (0xFCA8, '\u{e3b3}'), + (0xFCA9, '\u{e3b4}'), + (0xFCAA, '\u{e3b5}'), + (0xFCAB, '\u{e3b6}'), + (0xFCAC, '\u{e3b7}'), + (0xFCAD, '\u{e3b8}'), + (0xFCAE, '\u{e3b9}'), + (0xFCAF, '\u{e3ba}'), + (0xFCB0, '\u{e3bb}'), + (0xFCB1, '\u{e3bc}'), + (0xFCB2, '\u{e3bd}'), + (0xFCB3, '\u{e3be}'), + (0xFCB4, '\u{e3bf}'), + (0xFCB5, '\u{e3c0}'), + (0xFCB6, '\u{e3c1}'), + (0xFCB7, '\u{e3c2}'), + (0xFCB8, '\u{e3c3}'), + (0xFCB9, '\u{e3c4}'), + (0xFCBA, '\u{e3c5}'), + (0xFCBB, '\u{e3c6}'), + (0xFCBC, '\u{e3c7}'), + (0xFCBD, '\u{e3c8}'), + (0xFCBE, '\u{e3c9}'), + (0xFCBF, '\u{e3ca}'), + (0xFCC0, '\u{e3cb}'), + (0xFCC1, '\u{e3cc}'), + (0xFCC2, '\u{e3cd}'), + (0xFCC3, '\u{e3ce}'), + (0xFCC4, '\u{e3cf}'), + (0xFCC5, '\u{e3d0}'), + (0xFCC6, '\u{e3d1}'), + (0xFCC7, '\u{e3d2}'), + (0xFCC8, '\u{e3d3}'), + (0xFCC9, '\u{e3d4}'), + (0xFCCA, '\u{e3d5}'), + (0xFCCB, '\u{e3d6}'), + (0xFCCC, '\u{e3d7}'), + (0xFCCD, '\u{e3d8}'), + (0xFCCE, '\u{e3d9}'), + (0xFCCF, '\u{e3da}'), + (0xFCD0, '\u{e3db}'), + (0xFCD1, '\u{e3dc}'), + (0xFCD2, '\u{e3dd}'), + (0xFCD3, '\u{e3de}'), + (0xFCD4, '\u{e3df}'), + (0xFCD5, '\u{e3e0}'), + (0xFCD6, '\u{e3e1}'), + (0xFCD7, '\u{e3e2}'), + (0xFCD8, '\u{e3e3}'), + (0xFCD9, '\u{e3e4}'), + (0xFCDA, '\u{e3e5}'), + (0xFCDB, '\u{e3e6}'), + (0xFCDC, '\u{e3e7}'), + (0xFCDD, '\u{e3e8}'), + (0xFCDE, '\u{e3e9}'), + (0xFCDF, '\u{e3ea}'), + (0xFCE0, '\u{e3eb}'), + (0xFCE1, '\u{e3ec}'), + (0xFCE2, '\u{e3ed}'), + (0xFCE3, '\u{e3ee}'), + (0xFCE4, '\u{e3ef}'), + (0xFCE5, '\u{e3f0}'), + (0xFCE6, '\u{e3f1}'), + (0xFCE7, '\u{e3f2}'), + (0xFCE8, '\u{e3f3}'), + (0xFCE9, '\u{e3f4}'), + (0xFCEA, '\u{e3f5}'), + (0xFCEB, '\u{e3f6}'), + (0xFCEC, '\u{e3f7}'), + (0xFCED, '\u{e3f8}'), + (0xFCEE, '\u{e3f9}'), + (0xFCEF, '\u{e3fa}'), + (0xFCF0, '\u{e3fb}'), + (0xFCF1, '\u{e3fc}'), + (0xFCF2, '\u{e3fd}'), + (0xFCF3, '\u{e3fe}'), + (0xFCF4, '\u{e3ff}'), + (0xFCF5, '\u{e400}'), + (0xFCF6, '\u{e401}'), + (0xFCF7, '\u{e402}'), + (0xFCF8, '\u{e403}'), + (0xFCF9, '\u{e404}'), + (0xFCFA, '\u{e405}'), + (0xFCFB, '\u{e406}'), + (0xFCFC, '\u{e407}'), + (0xFCFD, '\u{e408}'), + (0xFCFE, '\u{e409}'), + (0xFD9C, '\u{f92c}'), + (0xFD9D, '\u{f979}'), + (0xFD9E, '\u{f995}'), + (0xFD9F, '\u{f9e7}'), + (0xFDA0, '\u{f9f1}'), + (0xFDA1, '\u{e40a}'), + (0xFDA2, '\u{e40b}'), + (0xFDA3, '\u{e40c}'), + (0xFDA4, '\u{e40d}'), + (0xFDA5, '\u{e40e}'), + (0xFDA6, '\u{e40f}'), + (0xFDA7, '\u{e410}'), + (0xFDA8, '\u{e411}'), + (0xFDA9, '\u{e412}'), + (0xFDAA, '\u{e413}'), + (0xFDAB, '\u{e414}'), + (0xFDAC, '\u{e415}'), + (0xFDAD, '\u{e416}'), + (0xFDAE, '\u{e417}'), + (0xFDAF, '\u{e418}'), + (0xFDB0, '\u{e419}'), + (0xFDB1, '\u{e41a}'), + (0xFDB2, '\u{e41b}'), + (0xFDB3, '\u{e41c}'), + (0xFDB4, '\u{e41d}'), + (0xFDB5, '\u{e41e}'), + (0xFDB6, '\u{e41f}'), + (0xFDB7, '\u{e420}'), + (0xFDB8, '\u{e421}'), + (0xFDB9, '\u{e422}'), + (0xFDBA, '\u{e423}'), + (0xFDBB, '\u{e424}'), + (0xFDBC, '\u{e425}'), + (0xFDBD, '\u{e426}'), + (0xFDBE, '\u{e427}'), + (0xFDBF, '\u{e428}'), + (0xFDC0, '\u{e429}'), + (0xFDC1, '\u{e42a}'), + (0xFDC2, '\u{e42b}'), + (0xFDC3, '\u{e42c}'), + (0xFDC4, '\u{e42d}'), + (0xFDC5, '\u{e42e}'), + (0xFDC6, '\u{e42f}'), + (0xFDC7, '\u{e430}'), + (0xFDC8, '\u{e431}'), + (0xFDC9, '\u{e432}'), + (0xFDCA, '\u{e433}'), + (0xFDCB, '\u{e434}'), + (0xFDCC, '\u{e435}'), + (0xFDCD, '\u{e436}'), + (0xFDCE, '\u{e437}'), + (0xFDCF, '\u{e438}'), + (0xFDD0, '\u{e439}'), + (0xFDD1, '\u{e43a}'), + (0xFDD2, 
'\u{e43b}'), + (0xFDD3, '\u{e43c}'), + (0xFDD4, '\u{e43d}'), + (0xFDD5, '\u{e43e}'), + (0xFDD6, '\u{e43f}'), + (0xFDD7, '\u{e440}'), + (0xFDD8, '\u{e441}'), + (0xFDD9, '\u{e442}'), + (0xFDDA, '\u{e443}'), + (0xFDDB, '\u{e444}'), + (0xFDDC, '\u{e445}'), + (0xFDDD, '\u{e446}'), + (0xFDDE, '\u{e447}'), + (0xFDDF, '\u{e448}'), + (0xFDE0, '\u{e449}'), + (0xFDE1, '\u{e44a}'), + (0xFDE2, '\u{e44b}'), + (0xFDE3, '\u{e44c}'), + (0xFDE4, '\u{e44d}'), + (0xFDE5, '\u{e44e}'), + (0xFDE6, '\u{e44f}'), + (0xFDE7, '\u{e450}'), + (0xFDE8, '\u{e451}'), + (0xFDE9, '\u{e452}'), + (0xFDEA, '\u{e453}'), + (0xFDEB, '\u{e454}'), + (0xFDEC, '\u{e455}'), + (0xFDED, '\u{e456}'), + (0xFDEE, '\u{e457}'), + (0xFDEF, '\u{e458}'), + (0xFDF0, '\u{e459}'), + (0xFDF1, '\u{e45a}'), + (0xFDF2, '\u{e45b}'), + (0xFDF3, '\u{e45c}'), + (0xFDF4, '\u{e45d}'), + (0xFDF5, '\u{e45e}'), + (0xFDF6, '\u{e45f}'), + (0xFDF7, '\u{e460}'), + (0xFDF8, '\u{e461}'), + (0xFDF9, '\u{e462}'), + (0xFDFA, '\u{e463}'), + (0xFDFB, '\u{e464}'), + (0xFDFC, '\u{e465}'), + (0xFDFD, '\u{e466}'), + (0xFDFE, '\u{e467}'), + (0xFE40, '\u{fa0c}'), + (0xFE41, '\u{fa0d}'), + (0xFE47, '\u{fa18}'), + (0xFE49, '\u{fa20}'), + (0xFE51, '\u{e816}'), + (0xFE52, '\u{e817}'), + (0xFE53, '\u{e818}'), + (0xFE59, '\u{9fb4}'), + (0xFE61, '\u{9fb5}'), + (0xFE66, '\u{9fb6}'), + (0xFE67, '\u{9fb7}'), + (0xFE6C, '\u{e831}'), + (0xFE6D, '\u{9fb8}'), + (0xFE76, '\u{e83b}'), + (0xFE7E, '\u{9fb9}'), + (0xFE90, '\u{9fba}'), + (0xFE91, '\u{e855}'), + (0xFEA0, '\u{9fbb}'), + (0xFEA1, '\u{e468}'), + (0xFEA2, '\u{e469}'), + (0xFEA3, '\u{e46a}'), + (0xFEA4, '\u{e46b}'), + (0xFEA5, '\u{e46c}'), + (0xFEA6, '\u{e46d}'), + (0xFEA7, '\u{e46e}'), + (0xFEA8, '\u{e46f}'), + (0xFEA9, '\u{e470}'), + (0xFEAA, '\u{e471}'), + (0xFEAB, '\u{e472}'), + (0xFEAC, '\u{e473}'), + (0xFEAD, '\u{e474}'), + (0xFEAE, '\u{e475}'), + (0xFEAF, '\u{e476}'), + (0xFEB0, '\u{e477}'), + (0xFEB1, '\u{e478}'), + (0xFEB2, '\u{e479}'), + (0xFEB3, '\u{e47a}'), + (0xFEB4, '\u{e47b}'), + (0xFEB5, '\u{e47c}'), + (0xFEB6, '\u{e47d}'), + (0xFEB7, '\u{e47e}'), + (0xFEB8, '\u{e47f}'), + (0xFEB9, '\u{e480}'), + (0xFEBA, '\u{e481}'), + (0xFEBB, '\u{e482}'), + (0xFEBC, '\u{e483}'), + (0xFEBD, '\u{e484}'), + (0xFEBE, '\u{e485}'), + (0xFEBF, '\u{e486}'), + (0xFEC0, '\u{e487}'), + (0xFEC1, '\u{e488}'), + (0xFEC2, '\u{e489}'), + (0xFEC3, '\u{e48a}'), + (0xFEC4, '\u{e48b}'), + (0xFEC5, '\u{e48c}'), + (0xFEC6, '\u{e48d}'), + (0xFEC7, '\u{e48e}'), + (0xFEC8, '\u{e48f}'), + (0xFEC9, '\u{e490}'), + (0xFECA, '\u{e491}'), + (0xFECB, '\u{e492}'), + (0xFECC, '\u{e493}'), + (0xFECD, '\u{e494}'), + (0xFECE, '\u{e495}'), + (0xFECF, '\u{e496}'), + (0xFED0, '\u{e497}'), + (0xFED1, '\u{e498}'), + (0xFED2, '\u{e499}'), + (0xFED3, '\u{e49a}'), + (0xFED4, '\u{e49b}'), + (0xFED5, '\u{e49c}'), + (0xFED6, '\u{e49d}'), + (0xFED7, '\u{e49e}'), + (0xFED8, '\u{e49f}'), + (0xFED9, '\u{e4a0}'), + (0xFEDA, '\u{e4a1}'), + (0xFEDB, '\u{e4a2}'), + (0xFEDC, '\u{e4a3}'), + (0xFEDD, '\u{e4a4}'), + (0xFEDE, '\u{e4a5}'), + (0xFEDF, '\u{e4a6}'), + (0xFEE0, '\u{e4a7}'), + (0xFEE1, '\u{e4a8}'), + (0xFEE2, '\u{e4a9}'), + (0xFEE3, '\u{e4aa}'), + (0xFEE4, '\u{e4ab}'), + (0xFEE5, '\u{e4ac}'), + (0xFEE6, '\u{e4ad}'), + (0xFEE7, '\u{e4ae}'), + (0xFEE8, '\u{e4af}'), + (0xFEE9, '\u{e4b0}'), + (0xFEEA, '\u{e4b1}'), + (0xFEEB, '\u{e4b2}'), + (0xFEEC, '\u{e4b3}'), + (0xFEED, '\u{e4b4}'), + (0xFEEE, '\u{e4b5}'), + (0xFEEF, '\u{e4b6}'), + (0xFEF0, '\u{e4b7}'), + (0xFEF1, '\u{e4b8}'), + (0xFEF2, '\u{e4b9}'), + (0xFEF3, '\u{e4ba}'), + (0xFEF4, '\u{e4bb}'), + (0xFEF5, '\u{e4bc}'), + (0xFEF6, 
'\u{e4bd}'), + (0xFEF7, '\u{e4be}'), + (0xFEF8, '\u{e4bf}'), + (0xFEF9, '\u{e4c0}'), + (0xFEFA, '\u{e4c1}'), + (0xFEFB, '\u{e4c2}'), + (0xFEFC, '\u{e4c3}'), + (0xFEFD, '\u{e4c4}'), + (0xFEFE, '\u{e4c5}'), + (0x8135F437, '\u{e7c7}'), + (0x82359037, '\u{e81e}'), + (0x82359038, '\u{e826}'), + (0x82359039, '\u{e82b}'), + (0x82359130, '\u{e82c}'), + (0x82359131, '\u{e832}'), + (0x82359132, '\u{e843}'), + (0x82359133, '\u{e854}'), + (0x82359134, '\u{e864}'), + (0x84318236, '\u{e78d}'), + (0x84318237, '\u{e78f}'), + (0x84318238, '\u{e78e}'), + (0x84318239, '\u{e790}'), + (0x84318330, '\u{e791}'), + (0x84318331, '\u{e792}'), + (0x84318332, '\u{e793}'), + (0x84318333, '\u{e794}'), + (0x84318334, '\u{e795}'), + (0x84318335, '\u{e796}'), + (0x95329031, '\u{20087}'), + (0x95329033, '\u{20089}'), + (0x95329730, '\u{200cc}'), + (0x9536B937, '\u{215d7}'), + (0x9630BA35, '\u{2298f}'), + (0x9635B630, '\u{241fe}'), +]; diff --git a/components/tidb_query_datatype/src/codec/collation/encoding/mod.rs b/components/tidb_query_datatype/src/codec/collation/encoding/mod.rs index 268b11aad41..ba7d5d75ec2 100644 --- a/components/tidb_query_datatype/src/codec/collation/encoding/mod.rs +++ b/components/tidb_query_datatype/src/codec/collation/encoding/mod.rs @@ -1,6 +1,8 @@ // Copyright 2021 TiKV Project Authors. Licensed under Apache-2.0. mod ascii; +mod gb18030; +mod gb18030_data; mod gbk; mod unicode_letter; mod utf8; @@ -8,6 +10,7 @@ mod utf8; use std::str; pub use ascii::*; +pub use gb18030::*; pub use gbk::*; pub use unicode_letter::*; pub use utf8::*; diff --git a/components/tidb_query_datatype/src/codec/collation/mod.rs b/components/tidb_query_datatype/src/codec/collation/mod.rs index 93cf0c8ca55..98284ac5463 100644 --- a/components/tidb_query_datatype/src/codec/collation/mod.rs +++ b/components/tidb_query_datatype/src/codec/collation/mod.rs @@ -37,6 +37,8 @@ macro_rules! match_template_collator { Latin1Bin => CollatorLatin1Bin, GbkBin => CollatorGbkBin, GbkChineseCi => CollatorGbkChineseCi, + Gb18030Bin => CollatorGb18030Bin, + Gb18030ChineseCi => CollatorGb18030ChineseCi, ], $($tail)* } @@ -81,6 +83,7 @@ macro_rules! match_template_charset { Utf8Mb4 => EncodingUtf8Mb4, Latin1 => EncodingLatin1, Gbk => EncodingGbk, + Gb18030 => EncodingGb18030, Binary => EncodingBinary, Ascii => EncodingAscii, ], @@ -121,7 +124,7 @@ pub trait Collator: 'static + std::marker::Send + std::marker::Sync + std::fmt:: } /// Compares `a` and `b` based on their SortKey. - fn sort_compare(a: &[u8], b: &[u8]) -> Result<Ordering>; + fn sort_compare(a: &[u8], b: &[u8], force_no_pad: bool) -> Result<Ordering>; /// Hashes `bstr` based on its SortKey directly.
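An editorial note on the `sort_compare` change above: the extra `force_no_pad` parameter appears to let callers opt out of MySQL-style PAD SPACE semantics, under which trailing spaces are ignored when comparing. A minimal sketch of that idea, assuming (this is not the TiKV implementation) that the flag simply switches between pad-trimmed and raw bytewise comparison; the call sites in the next hunk all pass `false`, i.e. they keep the padded behavior by default:

```rust
use std::cmp::Ordering;

// Sketch only, not the TiKV code. PAD SPACE collations compare operands as
// if right-padded with spaces, so "abc" == "abc  "; a `force_no_pad` flag
// would skip the trimming and compare raw bytes instead.
fn trim_end_spaces(s: &[u8]) -> &[u8] {
    let end = s.iter().rposition(|&c| c != b' ').map_or(0, |p| p + 1);
    &s[..end]
}

fn sort_compare_sketch(a: &[u8], b: &[u8], force_no_pad: bool) -> Ordering {
    if force_no_pad {
        a.cmp(b) // raw bytewise comparison: "abc" != "abc  "
    } else {
        trim_end_spaces(a).cmp(trim_end_spaces(b)) // "abc" == "abc  "
    }
}
```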
/// @@ -238,7 +241,7 @@ where { #[inline] fn eq(&self, other: &Self) -> bool { - C::sort_compare(self.inner.as_ref(), other.inner.as_ref()).unwrap() + C::sort_compare(self.inner.as_ref(), other.inner.as_ref(), false).unwrap() == std::cmp::Ordering::Equal } } @@ -251,7 +254,7 @@ where { #[inline] fn partial_cmp(&self, other: &Self) -> Option<Ordering> { - C::sort_compare(self.inner.as_ref(), other.inner.as_ref()).ok() + C::sort_compare(self.inner.as_ref(), other.inner.as_ref(), false).ok() } } @@ -261,7 +264,7 @@ where { #[inline] fn cmp(&self, other: &Self) -> Ordering { - C::sort_compare(self.inner.as_ref(), other.inner.as_ref()).unwrap() + C::sort_compare(self.inner.as_ref(), other.inner.as_ref(), false).unwrap() } } diff --git a/components/tidb_query_datatype/src/codec/data_type/chunked_vec_vector_float32.rs b/components/tidb_query_datatype/src/codec/data_type/chunked_vec_vector_float32.rs new file mode 100644 index 00000000000..d065a8ec0ff --- /dev/null +++ b/components/tidb_query_datatype/src/codec/data_type/chunked_vec_vector_float32.rs @@ -0,0 +1,135 @@ +// Copyright 2024 TiKV Project Authors. Licensed under Apache-2.0. + +use super::{ + bit_vec::BitVec, ChunkRef, ChunkedVec, UnsafeRefInto, VectorFloat32, VectorFloat32Ref, +}; +use crate::{ + codec::mysql::{VectorFloat32Decoder, VectorFloat32Encoder}, + impl_chunked_vec_common, +}; + +/// A vector storing `Option<VectorFloat32>` with a compact layout. +/// +/// Inside `ChunkedVecVectorFloat32`, `bitmap` indicates if an element at given +/// index is null, and `data` stores actual data. VectorFloat32 data are stored +/// adjacent to each other in `data`. If element at a given index is null, then +/// it takes no space in `data`. Otherwise, a variable size VectorFloat32 data +/// is stored in `data`, and `var_offset` indicates the starting position of +/// each element. +#[derive(Debug, PartialEq, Clone)] +pub struct ChunkedVecVectorFloat32 { + data: Vec<u8>, + bitmap: BitVec, + length: usize, + var_offset: Vec<usize>, +} + +impl ChunkedVecVectorFloat32 { + #[inline] + pub fn get(&self, idx: usize) -> Option<VectorFloat32Ref<'_>> { + assert!(idx < self.len()); + if self.bitmap.get(idx) { + let mut sliced_data = &self.data[self.var_offset[idx]..self.var_offset[idx + 1]]; + let v: VectorFloat32Ref<'_> = sliced_data.read_vector_float32_ref().unwrap(); + unsafe { + let v_with_static_lifetime = v.unsafe_into(); + Some(v_with_static_lifetime) + } + } else { + None + } + } +} + +impl ChunkedVec<VectorFloat32> for ChunkedVecVectorFloat32 { + impl_chunked_vec_common!
{ VectorFloat32 } + + fn with_capacity(capacity: usize) -> Self { + Self { + data: Vec::with_capacity(capacity), + bitmap: BitVec::with_capacity(capacity), + var_offset: vec![0], + length: 0, + } + } + + #[inline] + fn push_data(&mut self, value: VectorFloat32) { + self.bitmap.push(true); + self.data.write_vector_float32(value.as_ref()).unwrap(); + self.var_offset.push(self.data.len()); + self.length += 1; + } + + #[inline] + fn push_null(&mut self) { + self.bitmap.push(false); + self.var_offset.push(self.data.len()); + self.length += 1; + } + + fn len(&self) -> usize { + self.length + } + + fn truncate(&mut self, len: usize) { + if len < self.len() { + self.data.truncate(self.var_offset[len]); + self.bitmap.truncate(len); + self.var_offset.truncate(len + 1); + self.length = len; + } + } + + fn capacity(&self) -> usize { + self.data.capacity().max(self.length) + } + + fn append(&mut self, other: &mut Self) { + self.data.append(&mut other.data); + self.bitmap.append(&mut other.bitmap); + let var_offset_last = *self.var_offset.last().unwrap(); + for i in 1..other.var_offset.len() { + self.var_offset.push(other.var_offset[i] + var_offset_last); + } + self.length += other.length; + other.var_offset = vec![0]; + other.length = 0; + } + + fn to_vec(&self) -> Vec<Option<VectorFloat32>> { + let mut x = Vec::with_capacity(self.len()); + for i in 0..self.len() { + x.push(self.get(i).map(|x| x.to_owned())); + } + x + } +} + +impl<'a> ChunkRef<'a, VectorFloat32Ref<'a>> for &'a ChunkedVecVectorFloat32 { + #[inline] + fn get_option_ref(self, idx: usize) -> Option<VectorFloat32Ref<'a>> { + self.get(idx) + } + + fn get_bit_vec(self) -> &'a BitVec { + &self.bitmap + } + + #[inline] + fn phantom_data(self) -> Option<VectorFloat32Ref<'a>> { + None + } +} + +impl From<Vec<Option<VectorFloat32>>> for ChunkedVecVectorFloat32 { + fn from(v: Vec<Option<VectorFloat32>>) -> ChunkedVecVectorFloat32 { + ChunkedVecVectorFloat32::from_vec(v) + } +} + +impl<'a> UnsafeRefInto<&'static ChunkedVecVectorFloat32> for &'a ChunkedVecVectorFloat32 { + unsafe fn unsafe_into(self) -> &'static ChunkedVecVectorFloat32 { + std::mem::transmute(self) + } +} diff --git a/components/tidb_query_datatype/src/codec/data_type/mod.rs b/components/tidb_query_datatype/src/codec/data_type/mod.rs index b464b1119c8..5a11a5f295e 100644 --- a/components/tidb_query_datatype/src/codec/data_type/mod.rs +++ b/components/tidb_query_datatype/src/codec/data_type/mod.rs @@ -7,6 +7,7 @@ mod chunked_vec_enum; mod chunked_vec_json; mod chunked_vec_set; mod chunked_vec_sized; +mod chunked_vec_vector_float32; mod logical_rows; mod scalar; mod vector; @@ -17,10 +18,10 @@ pub use logical_rows::{LogicalRows, BATCH_MAX_SIZE, IDENTICAL_LOGICAL_ROWS}; macro_rules! match_template_evaltype { ($t:tt, $($tail:tt)*) => {{ #[allow(unused_imports)] - use $crate::codec::data_type::{Int, Real, Decimal, Bytes, DateTime, Duration, Json, Set, Enum}; + use $crate::codec::data_type::{Int, Real, Decimal, Bytes, DateTime, Duration, Json, Set, Enum, VectorFloat32}; match_template::match_template! { - $t = [Int, Real, Decimal, Bytes, DateTime, Duration, Json, Set, Enum], + $t = [Int, Real, Decimal, Bytes, DateTime, Duration, Json, Set, Enum, VectorFloat32], $($tail)* }} } @@ -38,6 +39,7 @@ pub use chunked_vec_enum::ChunkedVecEnum; pub use chunked_vec_json::ChunkedVecJson; pub use chunked_vec_set::ChunkedVecSet; pub use chunked_vec_sized::ChunkedVecSized; +pub use chunked_vec_vector_float32::ChunkedVecVectorFloat32; // Dynamic eval types.
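The doc comment on `ChunkedVecVectorFloat32` above describes the layout in prose; here is a stripped-down, self-contained sketch of the same idea. The `VarChunk` type is hypothetical, with a plain `Vec<bool>` standing in for the real `BitVec` and raw byte slices standing in for encoded VectorFloat32 payloads:

```rust
// Illustration only (not the TiKV type): payloads are concatenated in
// `data`, and element i occupies data[var_offset[i]..var_offset[i + 1]],
// so nulls cost no payload bytes and lookups are O(1).
struct VarChunk {
    data: Vec<u8>,
    non_null: Vec<bool>,    // stand-in for the real BitVec
    var_offset: Vec<usize>, // always starts with [0]
}

impl VarChunk {
    fn new() -> Self {
        Self { data: Vec::new(), non_null: Vec::new(), var_offset: vec![0] }
    }

    fn push(&mut self, value: Option<&[u8]>) {
        if let Some(v) = value {
            self.non_null.push(true);
            self.data.extend_from_slice(v);
        } else {
            self.non_null.push(false); // a null takes no space in `data`
        }
        self.var_offset.push(self.data.len());
    }

    fn get(&self, idx: usize) -> Option<&[u8]> {
        if self.non_null[idx] {
            Some(&self.data[self.var_offset[idx]..self.var_offset[idx + 1]])
        } else {
            None
        }
    }
}
```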
pub use self::{ @@ -47,6 +49,7 @@ pub use self::{ use super::Result; pub use crate::codec::mysql::{ json::JsonRef, Decimal, Duration, Enum, EnumRef, Json, JsonType, Set, SetRef, Time as DateTime, + VectorFloat32, VectorFloat32Ref, }; use crate::{codec::convert::ConvertTo, expr::EvalContext, EvalType}; @@ -104,12 +107,24 @@ where } } +impl AsMySqlBool for VectorFloat32 { + fn as_mysql_bool(&self, _context: &mut EvalContext) -> Result { + Ok(!self.as_ref().is_empty()) + } +} + impl<'a> AsMySqlBool for JsonRef<'a> { fn as_mysql_bool(&self, _context: &mut EvalContext) -> Result { Ok(!self.is_zero()) } } +impl<'a> AsMySqlBool for VectorFloat32Ref<'a> { + fn as_mysql_bool(&self, _context: &mut EvalContext) -> Result { + Ok(!self.is_empty()) + } +} + impl<'a> AsMySqlBool for EnumRef<'a> { fn as_mysql_bool(&self, _context: &mut EvalContext) -> Result { Ok(!self.is_empty()) @@ -140,6 +155,15 @@ impl<'a> AsMySqlBool for Option> { } } +impl<'a> AsMySqlBool for Option> { + fn as_mysql_bool(&self, context: &mut EvalContext) -> Result { + match self { + None => Ok(false), + Some(ref v) => v.as_mysql_bool(context), + } + } +} + impl<'a> AsMySqlBool for Option> { fn as_mysql_bool(&self, context: &mut EvalContext) -> Result { match self { @@ -355,6 +379,7 @@ impl_evaluable_ret! { Duration, ChunkedVecSized } impl_evaluable_ret! { Json, ChunkedVecJson } impl_evaluable_ret! { Enum, ChunkedVecEnum } impl_evaluable_ret! { Set, ChunkedVecSet } +impl_evaluable_ret! { VectorFloat32, ChunkedVecVectorFloat32 } pub trait EvaluableRef<'a>: Clone + std::fmt::Debug + Send + Sync { const EVAL_TYPE: EvalType; @@ -501,6 +526,12 @@ impl<'a> UnsafeRefInto> for SetRef<'a> { } } +impl<'a> UnsafeRefInto> for VectorFloat32Ref<'a> { + unsafe fn unsafe_into(self) -> VectorFloat32Ref<'static> { + std::mem::transmute(self) + } +} + impl<'a> EvaluableRef<'a> for JsonRef<'a> { const EVAL_TYPE: EvalType = EvalType::Json; type EvaluableType = Json; @@ -553,6 +584,58 @@ impl<'a> EvaluableRef<'a> for JsonRef<'a> { } } +impl<'a> EvaluableRef<'a> for VectorFloat32Ref<'a> { + const EVAL_TYPE: EvalType = EvalType::VectorFloat32; + type EvaluableType = VectorFloat32; + type ChunkedType = &'a ChunkedVecVectorFloat32; + + #[inline] + fn borrow_scalar_value(v: &'a ScalarValue) -> Option { + match v { + ScalarValue::VectorFloat32(x) => x.as_ref().map(|x| x.as_ref()), + other => panic!( + "Cannot cast {} scalar value into {}", + other.eval_type(), + stringify!(VectorFloat32), + ), + } + } + + #[inline] + fn borrow_scalar_value_ref(v: ScalarValueRef<'a>) -> Option { + match v { + ScalarValueRef::VectorFloat32(x) => x, + other => panic!( + "Cannot cast {} scalar value into {}", + other.eval_type(), + stringify!(VectorFloat32), + ), + } + } + + #[inline] + fn borrow_vector_value(v: &VectorValue) -> &ChunkedVecVectorFloat32 { + match v { + VectorValue::VectorFloat32(x) => x, + other => panic!( + "Cannot cast {} scalar value into {}", + other.eval_type(), + stringify!(VectorFloat32), + ), + } + } + + #[inline] + fn into_owned_value(self) -> Self::EvaluableType { + self.to_owned() + } + + #[inline] + fn from_owned_value(value: &'a VectorFloat32) -> Self { + value.as_ref() + } +} + impl<'a> EvaluableRef<'a> for EnumRef<'a> { const EVAL_TYPE: EvalType = EvalType::Enum; type EvaluableType = Enum; diff --git a/components/tidb_query_datatype/src/codec/data_type/scalar.rs b/components/tidb_query_datatype/src/codec/data_type/scalar.rs index c74423107e4..397ffbe0619 100644 --- a/components/tidb_query_datatype/src/codec/data_type/scalar.rs +++ 
b/components/tidb_query_datatype/src/codec/data_type/scalar.rs @@ -37,6 +37,7 @@ pub enum ScalarValue { Json(Option), Enum(Option), Set(Option), + VectorFloat32(Option), } impl ScalarValue { @@ -61,6 +62,9 @@ impl ScalarValue { ScalarValue::Json(x) => ScalarValueRef::Json(x.as_ref().map(|x| x.as_ref())), ScalarValue::Enum(x) => ScalarValueRef::Enum(x.as_ref().map(|x| x.as_ref())), ScalarValue::Set(x) => ScalarValueRef::Set(x.as_ref().map(|x| x.as_ref())), + ScalarValue::VectorFloat32(x) => { + ScalarValueRef::VectorFloat32(x.as_ref().map(|x| x.as_ref())) + } } } @@ -133,6 +137,7 @@ impl_from! { Bytes } impl_from! { DateTime } impl_from! { Duration } impl_from! { Json } +impl_from! { VectorFloat32 } impl From> for ScalarValue { #[inline] @@ -155,6 +160,13 @@ impl<'a> From>> for ScalarValue { } } +impl<'a> From>> for ScalarValue { + #[inline] + fn from(s: Option>) -> ScalarValue { + ScalarValue::VectorFloat32(s.map(|x| x.to_owned())) + } +} + impl From for ScalarValue { #[inline] fn from(s: f64) -> ScalarValue { @@ -193,6 +205,7 @@ pub enum ScalarValueRef<'a> { Json(Option>), Enum(Option>), Set(Option>), + VectorFloat32(Option>), } impl<'a> ScalarValueRef<'a> { @@ -209,6 +222,7 @@ impl<'a> ScalarValueRef<'a> { ScalarValueRef::Json(x) => ScalarValue::Json(x.map(|x| x.to_owned())), ScalarValueRef::Enum(x) => ScalarValue::Enum(x.map(|x| x.to_owned())), ScalarValueRef::Set(x) => ScalarValue::Set(x.map(|x| x.to_owned())), + ScalarValueRef::VectorFloat32(x) => ScalarValue::VectorFloat32(x.map(|x| x.to_owned())), } } @@ -310,6 +324,17 @@ impl<'a> ScalarValueRef<'a> { } Ok(()) } + ScalarValueRef::VectorFloat32(val) => { + match val { + None => { + output.write_evaluable_datum_null()?; + } + Some(val) => { + output.write_evaluable_datum_vector_float32(*val)?; + } + } + Ok(()) + } // TODO: we should implement enum/set encode ScalarValueRef::Enum(_) => unimplemented!(), ScalarValueRef::Set(_) => unimplemented!(), @@ -352,7 +377,7 @@ impl<'a> ScalarValueRef<'a> { field_type: &FieldType, ) -> crate::codec::Result { Ok(match_template! { - TT = [Real, Decimal, DateTime, Duration, Json, Enum], + TT = [Real, Decimal, DateTime, Duration, Json, Enum, VectorFloat32], match (self, other) { (ScalarValueRef::TT(v1), ScalarValueRef::TT(v2)) => v1.cmp(v2), (ScalarValueRef::Int(v1), ScalarValueRef::Int(v2)) => compare_int(&v1.cloned(), &v2.cloned(), field_type), @@ -362,7 +387,7 @@ impl<'a> ScalarValueRef<'a> { (ScalarValueRef::Bytes(Some(v1)), ScalarValueRef::Bytes(Some(v2))) => { match_template_collator! { TT, match field_type.collation()? { - Collation::TT => TT::sort_compare(v1, v2)? + Collation::TT => TT::sort_compare(v1, v2, false)? 
} } } @@ -442,6 +467,11 @@ impl ScalarValue { pub fn as_json(&self) -> Option> { EvaluableRef::borrow_scalar_value(self) } + + #[inline] + pub fn as_vector_float32(&self) -> Option> { + EvaluableRef::borrow_scalar_value(self) + } } impl<'a> ScalarValueRef<'a> { @@ -449,6 +479,11 @@ impl<'a> ScalarValueRef<'a> { pub fn as_json(&'a self) -> Option> { EvaluableRef::borrow_scalar_value_ref(*self) } + + #[inline] + pub fn as_vector_float32(&'a self) -> Option> { + EvaluableRef::borrow_scalar_value_ref(*self) + } } impl ScalarValue { diff --git a/components/tidb_query_datatype/src/codec/data_type/vector.rs b/components/tidb_query_datatype/src/codec/data_type/vector.rs index 49a4e3a1cff..9086ac3f7b9 100644 --- a/components/tidb_query_datatype/src/codec/data_type/vector.rs +++ b/components/tidb_query_datatype/src/codec/data_type/vector.rs @@ -15,13 +15,13 @@ pub enum VectorValue { Int(ChunkedVecSized), Real(ChunkedVecSized), Decimal(ChunkedVecSized), - // TODO: We need to improve its performance, i.e. store strings in adjacent memory places Bytes(ChunkedVecBytes), DateTime(ChunkedVecSized), Duration(ChunkedVecSized), Json(ChunkedVecJson), Enum(ChunkedVecEnum), Set(ChunkedVecSet), + VectorFloat32(ChunkedVecVectorFloat32), } impl VectorValue { @@ -63,6 +63,7 @@ impl VectorValue { } } } + expand_convertion!( scalar, Int: ChunkedVecSized, @@ -72,6 +73,7 @@ impl VectorValue { Duration: ChunkedVecSized, Set: ChunkedVecSet, Json: ChunkedVecJson, + VectorFloat32: ChunkedVecVectorFloat32, Enum: ChunkedVecEnum, Bytes: ChunkedVecBytes ) @@ -261,6 +263,21 @@ impl VectorValue { } size } + VectorValue::VectorFloat32(vec) => { + let mut size = 0; + for idx in logical_rows { + let el = vec.get_option_ref(*idx); + match el { + Some(v) => { + size += 1 /* FLAG */ + v.encoded_len(); + } + None => { + size += 1; + } + } + } + size + } VectorValue::Enum(_) => logical_rows.len() * 9, // TODO: implement here after we implement set encoding VectorValue::Set(_) => unimplemented!(), @@ -305,6 +322,21 @@ impl VectorValue { } size } + VectorValue::VectorFloat32(vec) => { + let mut size = logical_rows.len() + 10; + for idx in logical_rows { + let el = vec.get_option_ref(*idx); + match el { + Some(v) => { + size += 8 /* Offset */ + v.encoded_len(); + } + None => { + size += 8 /* Offset */; + } + } + } + size + } VectorValue::Enum(vec) => { let mut size = logical_rows.len() * 9 + 10; for idx in logical_rows { @@ -415,6 +447,17 @@ impl VectorValue { } Ok(()) } + VectorValue::VectorFloat32(ref vec) => { + match &vec.get_option_ref(row_index) { + None => { + output.write_evaluable_datum_null()?; + } + Some(ref val) => { + output.write_evaluable_datum_vector_float32(*val)?; + } + } + Ok(()) + } VectorValue::Enum(ref vec) => { match &vec.get_option_ref(row_index) { None => { @@ -496,6 +539,7 @@ impl_as_slice! { Bytes, to_bytes_vec } impl_as_slice! { DateTime, to_date_time_vec } impl_as_slice! { Duration, to_duration_vec } impl_as_slice! { Json, to_json_vec } +impl_as_slice! { VectorFloat32, to_vector_float32_vec } impl_as_slice! { Enum, to_enum_vec } impl_as_slice! { Set, to_set_vec } @@ -548,6 +592,7 @@ impl_ext! { Bytes, push_bytes } impl_ext! { DateTime, push_date_time } impl_ext! { Duration, push_duration } impl_ext! { Json, push_json } +impl_ext! { VectorFloat32, push_vector_float32 } impl_ext! { Enum, push_enum } impl_ext! { Set, push_set } @@ -569,6 +614,7 @@ impl_from! { Bytes, ChunkedVecBytes } impl_from! { DateTime, ChunkedVecSized } impl_from! { Duration, ChunkedVecSized } impl_from! { Json, ChunkedVecJson } +impl_from! 
{ VectorFloat32, ChunkedVecVectorFloat32 } impl_from! { Enum, ChunkedVecEnum } impl_from! { Set, ChunkedVecSet } diff --git a/components/tidb_query_datatype/src/codec/datum.rs b/components/tidb_query_datatype/src/codec/datum.rs index f91d204b3b0..9da537a2c09 100644 --- a/components/tidb_query_datatype/src/codec/datum.rs +++ b/components/tidb_query_datatype/src/codec/datum.rs @@ -12,12 +12,14 @@ use codec::{ number::{self, NumberCodec}, prelude::*, }; +use mysql::VectorFloat32; use tikv_util::{codec::BytesSlice, escape}; use super::{ mysql::{ self, parse_json_path_expr, Decimal, DecimalDecoder, DecimalEncoder, Duration, Enum, Json, - JsonDecoder, JsonEncoder, PathExpression, Set, Time, DEFAULT_FSP, MAX_FSP, + JsonDecoder, JsonEncoder, PathExpression, Set, Time, VectorFloat32Decoder, + VectorFloat32Encoder, DEFAULT_FSP, MAX_FSP, }, Result, }; @@ -41,6 +43,7 @@ pub const DURATION_FLAG: u8 = 7; pub const VAR_INT_FLAG: u8 = 8; pub const VAR_UINT_FLAG: u8 = 9; pub const JSON_FLAG: u8 = 10; +pub const VECTOR_FLOAT32_FLAG: u8 = 20; pub const MAX_FLAG: u8 = 250; pub const DATUM_DATA_NULL: &[u8; 1] = &[NIL_FLAG]; @@ -57,6 +60,7 @@ pub enum Datum { Dec(Decimal), Time(Time), Json(Json), + VectorFloat32(VectorFloat32), Enum(Enum), Set(Set), Min, @@ -141,6 +145,7 @@ impl Display for Datum { Datum::Dec(ref d) => write!(f, "Dec({})", d), Datum::Time(t) => write!(f, "Time({})", t), Datum::Json(ref j) => write!(f, "Json({})", j), + Datum::VectorFloat32(ref v) => write!(f, "VectorFloat32({})", v), Datum::Enum(ref e) => write!(f, "Enum({})", e), Datum::Set(ref s) => write!(f, "Set({})", s), Datum::Min => write!(f, "MIN"), @@ -207,6 +212,7 @@ impl Datum { Datum::Dec(ref d) => self.cmp_dec(ctx, d), Datum::Time(t) => self.cmp_time(ctx, t), Datum::Json(ref j) => self.cmp_json(ctx, j), + Datum::VectorFloat32(_) => Err(box_err!("not implemented")), Datum::Enum(ref e) => self.cmp_enum(ctx, e), Datum::Set(ref s) => self.cmp_set(ctx, s), } @@ -252,6 +258,7 @@ impl Datum { Datum::Dur(ref d) => cmp_f64(d.to_secs_f64(), f), Datum::Time(t) => cmp_f64(t.convert(ctx)?, f), Datum::Json(_) => Ok(Ordering::Less), + Datum::VectorFloat32(_) => Err(box_err!("not implemented")), Datum::Enum(_) => Ok(Ordering::Less), Datum::Set(_) => Ok(Ordering::Less), } @@ -475,6 +482,7 @@ impl Datum { | Datum::Bytes(_) | Datum::Dec(_) | Datum::Json(_) + | Datum::VectorFloat32(_) | Datum::Enum(_) | Datum::Set(_) | Datum::Max @@ -896,7 +904,7 @@ impl From for Datum { /// `DatumDecoder` decodes the datum. pub trait DatumDecoder: - DecimalDecoder + JsonDecoder + CompactByteDecoder + MemComparableByteDecoder + DecimalDecoder + JsonDecoder + VectorFloat32Decoder + CompactByteDecoder + MemComparableByteDecoder { /// `read_datum` decodes on a datum from a byte slice generated by TiDB. fn read_datum(&mut self) -> Result { @@ -919,6 +927,7 @@ pub trait DatumDecoder: VAR_INT_FLAG => self.read_var_i64().map(Datum::I64)?, VAR_UINT_FLAG => self.read_var_u64().map(Datum::U64)?, JSON_FLAG => self.read_json().map(Datum::Json)?, + VECTOR_FLOAT32_FLAG => self.read_vector_float32().map(Datum::VectorFloat32)?, f => return Err(invalid_type!("unsupported data type `{}`", f)), }; Ok(datum) @@ -939,7 +948,7 @@ pub fn decode(data: &mut BytesSlice<'_>) -> Result> { /// `DatumEncoder` encodes the datum. pub trait DatumEncoder: - DecimalEncoder + JsonEncoder + CompactByteEncoder + MemComparableByteEncoder + DecimalEncoder + JsonEncoder + VectorFloat32Encoder + CompactByteEncoder + MemComparableByteEncoder { /// Encode values to buf slice. 
fn write_datum( @@ -1011,6 +1020,10 @@ pub trait DatumEncoder: self.write_u8(JSON_FLAG)?; self.write_json(j.as_ref())?; } + Datum::VectorFloat32(ref v) => { + self.write_u8(VECTOR_FLOAT32_FLAG)?; + self.write_vector_float32(v.as_ref())?; + } // TODO: implement datum write here. Datum::Enum(_) => unimplemented!(), Datum::Set(_) => unimplemented!(), @@ -1057,6 +1070,7 @@ pub fn approximate_size(values: &[Datum], comparable: bool) -> usize { Datum::Dec(ref d) => d.approximate_encoded_size(), Datum::Json(ref d) => d.as_ref().binary_len(), Datum::Null | Datum::Min | Datum::Max => 0, + Datum::VectorFloat32(ref v) => v.as_ref().encoded_len(), // TODO: implement here after we implement datum write Datum::Enum(_) => unimplemented!(), Datum::Set(_) => unimplemented!(), @@ -1126,6 +1140,12 @@ pub fn split_datum(buf: &[u8], desc: bool) -> Result<(&[u8], &[u8])> { v.read_json()?; l - v.len() } + VECTOR_FLOAT32_FLAG => { + let mut v = &buf[1..]; + let l = v.len(); + v.read_vector_float32_ref()?; + l - v.len() + } f => return Err(invalid_type!("unsupported data type `{}`", f)), }; if buf.len() < pos + 1 { @@ -1246,6 +1266,10 @@ mod tests { .unwrap(), ), ], + vec![ + Datum::VectorFloat32(VectorFloat32::from_f32(vec![1.0, 2.0, 3.0]).unwrap()), + Datum::VectorFloat32(VectorFloat32::from_f32(vec![]).unwrap()), + ], ]; for vs in table { let mut buf = encode_key(&mut ctx, &vs).unwrap(); diff --git a/components/tidb_query_datatype/src/codec/datum_codec.rs b/components/tidb_query_datatype/src/codec/datum_codec.rs index 9d3f5058d0b..930a9986a86 100644 --- a/components/tidb_query_datatype/src/codec/datum_codec.rs +++ b/components/tidb_query_datatype/src/codec/datum_codec.rs @@ -13,7 +13,7 @@ use crate::{ datum, mysql::{ DecimalDecoder, DecimalEncoder, DurationDecoder, EnumDecoder, EnumEncoder, JsonDecoder, - JsonEncoder, TimeDecoder, + JsonEncoder, TimeDecoder, VectorFloat32Decoder, VectorFloat32Encoder, }, Error, Result, }, @@ -33,6 +33,7 @@ pub trait DatumPayloadDecoder: + DecimalDecoder + JsonDecoder + EnumDecoder + + VectorFloat32Decoder { #[inline] fn read_datum_payload_i64(&mut self) -> Result { @@ -130,6 +131,13 @@ pub trait DatumPayloadDecoder: }) } + #[inline] + fn read_datum_payload_vector_float32(&mut self) -> Result { + self.read_vector_float32().map_err(|_| { + Error::InvalidDataType("Failed to decode datum payload as vectorFloat32".to_owned()) + }) + } + #[inline] fn read_datum_payload_enum_compact_bytes(&mut self, field_type: &FieldType) -> Result { self.read_enum_compact_bytes(field_type).map_err(|_| { @@ -158,7 +166,12 @@ impl DatumPayloadDecoder for T {} /// /// The types this encoder accepts are not fully 1:1 mapping to evaluable types. 
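For context on the datum.rs hunks above: every datum is serialized as a one-byte type flag followed by a type-specific payload, and `VECTOR_FLOAT32_FLAG` (20) joins that scheme alongside `NIL_FLAG`, `JSON_FLAG`, and the rest. A minimal sketch of the dispatch follows; the element layout used here (u32 length prefix plus little-endian f32s) is an assumption for illustration only, since the actual payload format is whatever `VectorFloat32Encoder` defines:

```rust
// Hypothetical mini-codec illustrating the flag-byte scheme; the flag
// values mirror the constants in datum.rs, the payload layout is assumed.
const NIL_FLAG: u8 = 0;
const VECTOR_FLOAT32_FLAG: u8 = 20;

fn write_vector_datum(buf: &mut Vec<u8>, v: Option<&[f32]>) {
    match v {
        None => buf.push(NIL_FLAG),
        Some(values) => {
            buf.push(VECTOR_FLOAT32_FLAG);
            buf.extend_from_slice(&(values.len() as u32).to_le_bytes());
            for x in values {
                buf.extend_from_slice(&x.to_le_bytes());
            }
        }
    }
}
```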
pub trait DatumPayloadEncoder: - NumberEncoder + CompactByteEncoder + JsonEncoder + DecimalEncoder + EnumEncoder + NumberEncoder + + CompactByteEncoder + + JsonEncoder + + DecimalEncoder + + EnumEncoder + + VectorFloat32Encoder { #[inline] fn write_datum_payload_i64(&mut self, v: i64) -> Result<()> { @@ -211,6 +224,13 @@ pub trait DatumPayloadEncoder: }) } + #[inline] + fn write_datum_payload_vector_float32(&mut self, v: VectorFloat32Ref<'_>) -> Result<()> { + self.write_vector_float32(v).map_err(|_| { + Error::InvalidDataType("Failed to encode datum payload from vectorFloat32".to_owned()) + }) + } + #[inline] fn write_datum_payload_enum_uint(&mut self, v: EnumRef<'_>) -> Result<()> { self.write_enum_uint(v).map_err(|_| { @@ -290,6 +310,12 @@ pub trait DatumFlagAndPayloadEncoder: BufferWriter + DatumPayloadEncoder { Ok(()) } + fn write_datum_vector_float32(&mut self, val: VectorFloat32Ref<'_>) -> Result<()> { + self.write_u8(datum::VECTOR_FLOAT32_FLAG)?; + self.write_datum_payload_vector_float32(val)?; + Ok(()) + } + fn write_datum_enum_uint(&mut self, val: EnumRef<'_>) -> Result<()> { self.write_u8(datum::UINT_FLAG)?; self.write_datum_payload_enum_uint(val)?; @@ -348,6 +374,11 @@ pub trait EvaluableDatumEncoder: DatumFlagAndPayloadEncoder { self.write_datum_json(val) } + #[inline] + fn write_evaluable_datum_vector_float32(&mut self, val: VectorFloat32Ref<'_>) -> Result<()> { + self.write_datum_vector_float32(val) + } + #[inline] fn write_evaluable_datum_enum_uint(&mut self, val: EnumRef<'_>) -> Result<()> { self.write_datum_enum_uint(val) @@ -529,6 +560,24 @@ pub fn decode_json_datum(mut raw_datum: &[u8]) -> Result> { } } +pub fn decode_vector_float32_datum(mut raw_datum: &[u8]) -> Result> { + if raw_datum.is_empty() { + return Err(Error::InvalidDataType( + "Failed to decode datum flag".to_owned(), + )); + } + let flag = raw_datum[0]; + raw_datum = &raw_datum[1..]; + match flag { + datum::NIL_FLAG => Ok(None), + datum::VECTOR_FLOAT32_FLAG => Ok(Some(raw_datum.read_datum_payload_vector_float32()?)), + _ => Err(Error::InvalidDataType(format!( + "Unsupported datum flag {} for VectorFloat32 vector", + flag + ))), + } +} + pub fn decode_enum_datum(mut raw_datum: &[u8], field_type: &FieldType) -> Result> { if raw_datum.is_empty() { return Err(Error::InvalidDataType( @@ -599,6 +648,16 @@ impl<'a> RawDatumDecoder for &'a [u8] { } } +impl<'a> RawDatumDecoder for &'a [u8] { + fn decode( + self, + _field_type: &FieldType, + _ctx: &mut EvalContext, + ) -> Result> { + decode_vector_float32_datum(self) + } +} + impl<'a> RawDatumDecoder for &'a [u8] { fn decode(self, field_type: &FieldType, _ctx: &mut EvalContext) -> Result> { decode_enum_datum(self, field_type) diff --git a/components/tidb_query_datatype/src/codec/error.rs b/components/tidb_query_datatype/src/codec/error.rs index 785424b31ca..2d48ee08b65 100644 --- a/components/tidb_query_datatype/src/codec/error.rs +++ b/components/tidb_query_datatype/src/codec/error.rs @@ -21,6 +21,7 @@ pub const ERR_TRUNCATE_WRONG_VALUE: i32 = 1292; pub const ERR_UNKNOWN_TIMEZONE: i32 = 1298; pub const ERR_DIVISION_BY_ZERO: i32 = 1365; pub const ERR_DATA_TOO_LONG: i32 = 1406; +pub const ERR_DATETIME_FUNCTION_OVERFLOW: i32 = 1441; pub const ERR_INCORRECT_PARAMETERS: i32 = 1583; pub const ERR_DATA_OUT_OF_RANGE: i32 = 1690; pub const ERR_CANNOT_CONVERT_STRING: i32 = 3854; @@ -100,6 +101,11 @@ impl Error { Error::Eval(msg, ERR_CANNOT_CONVERT_STRING) } + pub fn datetime_function_overflow() -> Error { + let msg = "Datetime function field overflow"; + 
Error::Eval(msg.into(), ERR_DATETIME_FUNCTION_OVERFLOW) + } + pub fn code(&self) -> i32 { match *self { Error::Eval(_, code) => code, diff --git a/components/tidb_query_datatype/src/codec/mysql/charset.rs b/components/tidb_query_datatype/src/codec/mysql/charset.rs index 0ac2655c619..cbdd5a01315 100644 --- a/components/tidb_query_datatype/src/codec/mysql/charset.rs +++ b/components/tidb_query_datatype/src/codec/mysql/charset.rs @@ -13,6 +13,8 @@ pub const CHARSET_ASCII: &str = "ascii"; pub const CHARSET_LATIN1: &str = "latin1"; /// `CHARSET_GBK` is Chinese character set. pub const CHARSET_GBK: &str = "gbk"; +/// `CHARSET_GB18030` is another Chinese character set containing GBK. +pub const CHARSET_GB18030: &str = "gb18030"; /// All utf8 charsets. pub const UTF8_CHARSETS: &[&str] = &[CHARSET_UTF8, CHARSET_UTF8MB4, CHARSET_ASCII]; diff --git a/components/tidb_query_datatype/src/codec/mysql/decimal.rs b/components/tidb_query_datatype/src/codec/mysql/decimal.rs index 0930973233c..8d3e291fa4e 100644 --- a/components/tidb_query_datatype/src/codec/mysql/decimal.rs +++ b/components/tidb_query_datatype/src/codec/mysql/decimal.rs @@ -1047,6 +1047,11 @@ impl Decimal { } } + /// `frac_cnt` returns fraction count. + pub fn frac_cnt(&self) -> u8 { + self.frac_cnt + } + /// `digit_bounds` returns bounds of decimal digits in the number. fn digit_bounds(&self) -> (u8, u8) { let mut buf_beg = 0; @@ -1741,11 +1746,6 @@ impl Decimal { pub fn result_frac_cnt(&self) -> u8 { self.result_frac_cnt } - - #[cfg(test)] - pub fn frac_cnt(&self) -> u8 { - self.frac_cnt - } } macro_rules! enable_conv_for_int { diff --git a/components/tidb_query_datatype/src/codec/mysql/duration.rs b/components/tidb_query_datatype/src/codec/mysql/duration.rs index 7279f788146..6b08c756d24 100644 --- a/components/tidb_query_datatype/src/codec/mysql/duration.rs +++ b/components/tidb_query_datatype/src/codec/mysql/duration.rs @@ -20,24 +20,29 @@ use crate::{ FieldTypeAccessor, }; -pub const NANOS_PER_SEC: i64 = 1_000_000_000; -pub const NANOS_PER_MILLI: i64 = 1_000_000; pub const NANOS_PER_MICRO: i64 = 1_000; +pub const NANOS_PER_MILLI: i64 = 1_000_000; +pub const NANOS_PER_SEC: i64 = 1_000_000_000; +pub const NANOS_PER_MINUTE: i64 = 60 * NANOS_PER_SEC; +pub const NANOS_PER_HOUR: i64 = 60 * NANOS_PER_MINUTE; +pub const NANOS_PER_DAY: i64 = 24 * NANOS_PER_HOUR; + pub const MICROS_PER_SEC: i64 = 1_000_000; pub const NANO_WIDTH: usize = 9; pub const MICRO_WIDTH: usize = 6; -const SECS_PER_HOUR: i64 = 3600; -const SECS_PER_MINUTE: i64 = 60; +pub const SECS_PER_MINUTE: i64 = 60; +pub const SECS_PER_HOUR: i64 = 3600; +pub const SECS_PER_DAY: i64 = SECS_PER_HOUR * 24; pub const MAX_HOUR_PART: u32 = 838; pub const MAX_MINUTE_PART: u32 = 59; pub const MAX_SECOND_PART: u32 = 59; pub const MAX_NANOS_PART: u32 = 999_999_999; -pub const MAX_NANOS: i64 = ((MAX_HOUR_PART as i64 * SECS_PER_HOUR) +pub const MAX_SECS: i64 = MAX_HOUR_PART as i64 * SECS_PER_HOUR + MAX_MINUTE_PART as i64 * SECS_PER_MINUTE - + MAX_SECOND_PART as i64) - * NANOS_PER_SEC; + + MAX_SECOND_PART as i64; +pub const MAX_NANOS: i64 = MAX_SECS * NANOS_PER_SEC; const MAX_DURATION_INT_VALUE: u32 = MAX_HOUR_PART * 10000 + MAX_MINUTE_PART * 100 + MAX_SECOND_PART; #[inline] diff --git a/components/tidb_query_datatype/src/codec/mysql/mod.rs b/components/tidb_query_datatype/src/codec/mysql/mod.rs index 5f15fd2c107..2bcfeb8ee23 100644 --- a/components/tidb_query_datatype/src/codec/mysql/mod.rs +++ b/components/tidb_query_datatype/src/codec/mysql/mod.rs @@ -15,7 +15,7 @@ pub const DEFAULT_FSP: i8 = 
0; /// inrements. pub const DEFAULT_DIV_FRAC_INCR: u8 = 4; -fn check_fsp(fsp: i8) -> Result { +pub fn check_fsp(fsp: i8) -> Result { if fsp == UNSPECIFIED_FSP { return Ok(DEFAULT_FSP as u8); } @@ -33,6 +33,7 @@ pub mod enums; pub mod json; pub mod set; pub mod time; +pub mod vector; pub use self::{ decimal::{dec_encoded_len, Decimal, DecimalDecoder, DecimalEncoder, Res, RoundMode}, @@ -44,4 +45,5 @@ pub use self::{ }, set::{Set, SetRef}, time::{Time, TimeDecoder, TimeEncoder, TimeType, Tz}, + vector::{VectorFloat32, VectorFloat32Decoder, VectorFloat32Encoder, VectorFloat32Ref}, }; diff --git a/components/tidb_query_datatype/src/codec/mysql/time/interval.rs b/components/tidb_query_datatype/src/codec/mysql/time/interval.rs new file mode 100644 index 00000000000..c22f8399f8c --- /dev/null +++ b/components/tidb_query_datatype/src/codec/mysql/time/interval.rs @@ -0,0 +1,2037 @@ +// Copyright 2024 TiKV Project Authors. Licensed under Apache-2.0. + +use std::{collections::HashMap, str::FromStr}; + +use lazy_static::lazy_static; +use regex::Regex; + +use crate::{ + codec::{ + data_type::{BytesRef, Decimal, Real}, + mysql::{duration::*, RoundMode, DEFAULT_FSP, MAX_FSP, MIN_FSP}, + Error, Result, + }, + expr::EvalContext, +}; + +/// See https://dev.mysql.com/doc/refman/8.0/en/expressions.html#temporal-intervals +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] +pub enum IntervalUnit { + Microsecond, + Second, + Minute, + Hour, + Day, + Week, + Month, + Quarter, + Year, + SecondMicrosecond, + MinuteMicrosecond, + MinuteSecond, + HourMicrosecond, + HourSecond, + HourMinute, + DayMicrosecond, + DaySecond, + DayMinute, + DayHour, + YearMonth, +} + +impl IntervalUnit { + pub fn from_str(unit: &str) -> Result { + use IntervalUnit::*; + match unit.to_uppercase().as_str() { + "MICROSECOND" => Ok(Microsecond), + "SECOND" => Ok(Second), + "MINUTE" => Ok(Minute), + "HOUR" => Ok(Hour), + "DAY" => Ok(Day), + "WEEK" => Ok(Week), + "MONTH" => Ok(Month), + "QUARTER" => Ok(Quarter), + "YEAR" => Ok(Year), + "SECOND_MICROSECOND" => Ok(SecondMicrosecond), + "MINUTE_MICROSECOND" => Ok(MinuteMicrosecond), + "MINUTE_SECOND" => Ok(MinuteSecond), + "HOUR_MICROSECOND" => Ok(HourMicrosecond), + "HOUR_SECOND" => Ok(HourSecond), + "HOUR_MINUTE" => Ok(HourMinute), + "DAY_MICROSECOND" => Ok(DayMicrosecond), + "DAY_SECOND" => Ok(DaySecond), + "DAY_MINUTE" => Ok(DayMinute), + "DAY_HOUR" => Ok(DayHour), + "YEAR_MONTH" => Ok(YearMonth), + _ => Err(box_err!("unknown unit str {}", unit)), + } + } + + pub fn is_clock_unit(&self) -> bool { + use IntervalUnit::*; + matches!( + self, + Microsecond + | Second + | Minute + | Hour + | SecondMicrosecond + | MinuteMicrosecond + | HourMicrosecond + | DayMicrosecond + | MinuteSecond + | HourSecond + | DaySecond + | HourMinute + | DayMinute + | DayHour + ) + } +} + +#[derive(Clone, Copy, PartialEq)] +#[repr(usize)] +enum TimeIndex { + Year = 0, + Month = 1, + Day = 2, + Hour = 3, + Minute = 4, + Second = 5, + Microsecond = 6, + Max = 7, +} + +lazy_static! { + static ref ONE_TO_SIX_DIGIT_REGEX: Regex = Regex::new(r"^[0-9]{0,6}").unwrap(); + static ref NUMERIC_REGEX: Regex = Regex::new(r"[0-9]+").unwrap(); + static ref INTERVAL_REGEX: Regex = Regex::new(r"^[+-]?[\d]+").unwrap(); + /// Index of 'YEARS-MONTHS DAYS HOURS:MINUTES:SECONDS.MICROSECONDS' interval string Format. 
+ /// IntervalUnit -> (Time Index, Max Count) + static ref INTERVAL_STR_INDEX_MAP: HashMap = { + [ + // 'SECONDS.MICROSECONDS' + (IntervalUnit::SecondMicrosecond, (TimeIndex::Microsecond, 2)), + // 'MINUTES:SECONDS.MICROSECONDS' + (IntervalUnit::MinuteMicrosecond, (TimeIndex::Microsecond, 3)), + // 'MINUTES:SECONDS' + (IntervalUnit::MinuteSecond, (TimeIndex::Second, 2)), + // 'HOURS:MINUTES:SECONDS.MICROSECONDS' + (IntervalUnit::HourMicrosecond, (TimeIndex::Microsecond, 4)), + // 'HOURS:MINUTES:SECONDS' + (IntervalUnit::HourSecond, (TimeIndex::Second, 3)), + // 'HOURS:MINUTES' + (IntervalUnit::HourMinute, (TimeIndex::Minute, 2)), + // 'DAYS HOURS:MINUTES:SECONDS.MICROSECONDS' + (IntervalUnit::DayMicrosecond, (TimeIndex::Microsecond, 5)), + // 'DAYS HOURS:MINUTES:SECONDS' + (IntervalUnit::DaySecond, (TimeIndex::Second, 4)), + // 'DAYS HOURS:MINUTES' + (IntervalUnit::DayMinute, (TimeIndex::Minute, 3)), + // 'DAYS HOURS' + (IntervalUnit::DayHour, (TimeIndex::Hour, 2)), + // 'YEARS-MONTHS' + (IntervalUnit::YearMonth, (TimeIndex::Month, 2)), + ].iter().cloned().collect() + }; +} + +#[derive(Debug, PartialEq)] +pub struct Interval { + month: i64, + sec: i64, + nano: i64, + fsp: i8, +} + +impl Interval { + pub fn parse_from_str( + ctx: &mut EvalContext, + unit: &IntervalUnit, + input: &str, + ) -> Result> { + Self::parse_from_str_internal(ctx, unit, input, false) + } + + #[inline] + fn parse_from_str_internal( + ctx: &mut EvalContext, + unit: &IntervalUnit, + input: &str, + for_duration: bool, + ) -> Result> { + if let Some(&(index, max_cnt)) = INTERVAL_STR_INDEX_MAP.get(unit) { + Self::parse_time_value(ctx, input, index, max_cnt, for_duration) + } else { + Self::parse_single_time_value(ctx, unit, input, for_duration) + } + } + + fn parse_single_time_value( + ctx: &mut EvalContext, + unit: &IntervalUnit, + input: &str, + for_duration: bool, + ) -> Result> { + use IntervalUnit::*; + // Find decimal point position + let decimal_point_pos = input.find('.').unwrap_or(input.len()); + + // Handle negative sign + let mut sign: i64 = 1; + let integer_part = if input.starts_with('-') { + sign = -1; + &input[1..decimal_point_pos] + } else { + &input[..decimal_point_pos] + }; + + // Parse integer part before decimal point + let iv = match i64::from_str(integer_part) { + Ok(val) => val * sign, + Err(_) => { + if for_duration { + return Err(Error::incorrect_datetime_value(input)); + } + ctx.handle_invalid_time_error(Error::incorrect_datetime_value(input))?; + 0 + } + }; + // Rounded integer value + let mut riv = iv; + + // Handle decimal part + let mut decimal_len = 0; + let mut dv = 0i64; + if decimal_point_pos < input.len() - 1 { + let dv_pre = &input[decimal_point_pos + 1..]; + let mut dv_pre = ONE_TO_SIX_DIGIT_REGEX + .find(dv_pre) + .map_or("", |m| m.as_str()) + .to_string(); + decimal_len = dv_pre.len(); + if decimal_len < MAX_FSP as usize { + dv_pre.push_str(&"0".repeat(MAX_FSP as usize - decimal_len)); + } + decimal_len = std::cmp::min(decimal_len, MAX_FSP as usize); + + dv = match i64::from_str(&dv_pre[..MAX_FSP as usize]) { + Ok(val) => val, + Err(_) => { + if for_duration { + return Err(Error::incorrect_datetime_value(input)); + } + ctx.handle_invalid_time_error(Error::incorrect_datetime_value(input))?; + 0 + } + }; + + // Round up, and we should keep 6 digits for microsecond, so dv should in + // [000000, 999999]. 
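A self-contained distillation of the fractional-part rule described in the comment above (illustrative, not the TiKV code): at most six fractional digits are kept, right-padded with zeros to microsecond precision, and a padded value of 500_000 or more is what triggers the round-up of the integer part for non-SECOND units:

```rust
// Assumes ASCII digit input (as produced by the digit regex above).
fn frac_to_micros(frac_digits: &str) -> (i64, usize) {
    let kept = &frac_digits[..frac_digits.len().min(6)];
    let mut padded = kept.to_string();
    padded.push_str(&"0".repeat(6 - kept.len()));
    (padded.parse().unwrap(), kept.len())
}

fn main() {
    assert_eq!(frac_to_micros("99"), (990_000, 2)); // "2.99" SECOND, fsp 2
    assert_eq!(frac_to_micros("5"), (500_000, 1));  // "2.5" MINUTE rounds up
    assert_eq!(frac_to_micros("1234567"), (123_456, 6)); // truncated to 6
}
```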
+ if dv >= 500_000 { + riv += sign; + } + if *unit != Second { + if for_duration { + return Err(Error::incorrect_datetime_value(input)); + } + ctx.handle_invalid_time_error(Error::incorrect_datetime_value(input))?; + } + dv *= sign; + } + + match unit { + Microsecond => { + let nano = if for_duration { + if riv.abs() > MAX_SECS * NANOS_PER_MICRO { + return Err(Error::datetime_function_overflow()); + } + riv * NANOS_PER_MICRO + } else { + match riv.checked_mul(NANOS_PER_MICRO) { + Some(n) => n, + None => { + return ctx + .handle_invalid_time_error(Error::datetime_function_overflow()) + .map(|_| Ok(None))?; + } + } + }; + Ok(Some(Self { + month: 0, + sec: 0, + nano, + fsp: MAX_FSP, + })) + } + Second => { + if for_duration && iv.abs() > MAX_SECS { + return Err(Error::datetime_function_overflow()); + } + Ok(Some(Self { + month: 0, + sec: iv, + nano: dv * NANOS_PER_MICRO, + fsp: decimal_len as i8, + })) + } + Minute => { + let sec = if for_duration { + if riv.abs() > (MAX_HOUR_PART * 60 + MAX_MINUTE_PART) as i64 { + return Err(Error::datetime_function_overflow()); + } + riv * SECS_PER_MINUTE + } else { + match riv.checked_mul(SECS_PER_MINUTE) { + Some(n) => n, + None => { + return ctx + .handle_invalid_time_error(Error::datetime_function_overflow()) + .map(|_| Ok(None))?; + } + } + }; + Ok(Some(Self { + month: 0, + sec, + nano: 0, + fsp: 0, + })) + } + Hour => { + let sec = if for_duration { + if riv.abs() > MAX_HOUR_PART as i64 { + return Err(Error::datetime_function_overflow()); + } + riv * SECS_PER_HOUR + } else { + match riv.checked_mul(SECS_PER_HOUR) { + Some(n) => n, + None => { + return ctx + .handle_invalid_time_error(Error::datetime_function_overflow()) + .map(|_| Ok(None))?; + } + } + }; + Ok(Some(Self { + month: 0, + sec, + nano: 0, + fsp: 0, + })) + } + Day => { + let sec = if for_duration { + if riv.abs() > MAX_HOUR_PART as i64 / 24 { + return Err(Error::datetime_function_overflow()); + } + riv * SECS_PER_DAY + } else { + match riv.checked_mul(SECS_PER_DAY) { + Some(n) => n, + None => { + return ctx + .handle_invalid_time_error(Error::datetime_function_overflow()) + .map(|_| Ok(None))?; + } + } + }; + Ok(Some(Self { + month: 0, + sec, + nano: 0, + fsp: 0, + })) + } + Week => { + let sec = if for_duration { + if riv.abs() * 7 > MAX_HOUR_PART as i64 / 24 { + return Err(Error::datetime_function_overflow()); + } + riv * SECS_PER_DAY * 7 + } else { + match riv.checked_mul(SECS_PER_DAY * 7) { + Some(n) => n, + None => { + return ctx + .handle_invalid_time_error(Error::datetime_function_overflow()) + .map(|_| Ok(None))?; + } + } + }; + Ok(Some(Self { + month: 0, + sec, + nano: 0, + fsp: 0, + })) + } + Month => { + if for_duration && riv.abs() > 1 { + return Err(Error::datetime_function_overflow()); + } + Ok(Some(Self { + month: riv, + sec: 0, + nano: 0, + fsp: 0, + })) + } + Quarter => { + if for_duration { + return Err(Error::datetime_function_overflow()); + } + let month = match riv.checked_mul(3) { + Some(m) => m, + None => { + return ctx + .handle_invalid_time_error(Error::datetime_function_overflow()) + .map(|_| Ok(None))?; + } + }; + Ok(Some(Self { + month, + sec: 0, + nano: 0, + fsp: 0, + })) + } + Year => { + if for_duration { + return Err(Error::datetime_function_overflow()); + } + let month = match riv.checked_mul(12) { + Some(m) => m, + None => { + return ctx + .handle_invalid_time_error(Error::datetime_function_overflow()) + .map(|_| Ok(None))?; + } + }; + Ok(Some(Self { + month, + sec: 0, + nano: 0, + fsp: 0, + })) + } + _ => Err(box_err!("invalid single time unit {:?}", 
unit)), + } + } + + fn parse_time_value( + ctx: &mut EvalContext, + input: &str, + index: TimeIndex, + max_cnt: usize, + for_duration: bool, + ) -> Result> { + let mut neg = false; + let original_input = input; + + // Trim spaces and check if negative + let mut input = input.trim(); + if input.starts_with('-') { + neg = true; + input = &input[1..]; + } + + // Initialize fields as "0" + let mut fields = ["0"; TimeIndex::Max as usize]; + + let matches: Vec<&str> = NUMERIC_REGEX.find_iter(input).map(|m| m.as_str()).collect(); + + if matches.len() > max_cnt || matches.len() > index as usize + 1 { + if for_duration { + return Err(Error::incorrect_datetime_value(original_input)); + } + ctx.handle_invalid_time_error(Error::incorrect_datetime_value(original_input))?; + return Ok(Some(Self { + month: 0, + sec: 0, + nano: 0, + fsp: DEFAULT_FSP, + })); + } + + // Populate fields in reverse order + for (i, &matched) in matches.iter().rev().enumerate() { + fields[index as usize - i] = &matched; + } + + // Helper to parse integer fields and handle errors + let mut parse_field = |field: &str| -> Result { + match i64::from_str(field) { + Ok(val) => Ok(val), + Err(_) => { + if for_duration { + return Err(Error::incorrect_datetime_value(original_input)); + } + ctx.handle_invalid_time_error(Error::incorrect_datetime_value(original_input))?; + Ok(0) + } + } + }; + + // Parse the fields (year, month, day, hour, minute, second, microsecond) + let years = parse_field(fields[TimeIndex::Year as usize])?; + let months = parse_field(fields[TimeIndex::Month as usize])?; + let days = parse_field(fields[TimeIndex::Day as usize])?; + let hours = parse_field(fields[TimeIndex::Hour as usize])?; + let minutes = parse_field(fields[TimeIndex::Minute as usize])?; + let seconds = parse_field(fields[TimeIndex::Second as usize])?; + + let mut frac_part = fields[TimeIndex::Microsecond as usize].to_string(); + let frac_part_len = frac_part.len(); + if frac_part_len < MAX_FSP as usize { + frac_part.push_str(&"0".repeat(MAX_FSP as usize - frac_part_len)); + } + let microseconds = parse_field(&frac_part)?; + + let mut check_result = |res: Option| -> Result> { + match res { + Some(v) => Ok(Some(v)), + None => { + if for_duration { + return Err(Error::datetime_function_overflow()); + } + ctx.handle_invalid_time_error(Error::datetime_function_overflow())?; + Ok(None) + } + } + }; + let day_secs = match check_result(days.checked_mul(SECS_PER_DAY))? { + Some(v) => v, + None => return Ok(None), + }; + let hour_secs = match check_result(hours.checked_mul(SECS_PER_HOUR))? { + Some(v) => v, + None => return Ok(None), + }; + let minute_secs = match check_result(minutes.checked_mul(SECS_PER_MINUTE))? { + Some(v) => v, + None => return Ok(None), + }; + let total_secs1 = match check_result(day_secs.checked_add(hour_secs))? { + Some(v) => v, + None => return Ok(None), + }; + let total_secs2 = match check_result(minute_secs.checked_add(seconds))? { + Some(v) => v, + None => return Ok(None), + }; + let mut sec = match check_result(total_secs1.checked_add(total_secs2))? { + Some(v) => v, + None => return Ok(None), + }; + let mut nano = match check_result(microseconds.checked_mul(NANOS_PER_MICRO))? { + Some(v) => v, + None => return Ok(None), + }; + + let month1 = match check_result(years.checked_mul(12))? { + Some(v) => v, + None => return Ok(None), + }; + let mut month = match check_result(month1.checked_add(months))? 
{ + Some(v) => v, + None => return Ok(None), + }; + if neg { + month = -month; + sec = -sec; + nano = -nano; + } + + // Return Interval with month, nano, and fsp values + Ok(Some(Self { + month, + sec, + nano, + fsp: if index == TimeIndex::Microsecond { + MAX_FSP + } else { + MIN_FSP + }, + })) + } + + pub fn extract_duration( + ctx: &mut EvalContext, + unit: &IntervalUnit, + input: &str, + ) -> Result { + let val = Self::parse_from_str_internal(ctx, unit, input, true)? + .ok_or_else(|| Error::datetime_function_overflow())?; + use IntervalUnit::*; + match unit { + Microsecond | Second | Minute | Hour | Day | Week | Month | Quarter | Year => { + Ok(Duration::from_nanos( + val.month * 30 * NANOS_PER_DAY + val.sec * NANOS_PER_SEC + val.nano, + val.fsp, + )?) + } + _ => { + if val.month != 0 || val.sec.abs() > MAX_SECS || val.nano.abs() > MAX_NANOS { + return Err(Error::datetime_function_overflow()); + } + Ok(Duration::from_nanos( + val.sec * NANOS_PER_SEC + val.nano, + val.fsp, + )?) + } + } + } + + pub fn negate(&self) -> Self { + Self { + month: -self.month, + sec: -self.sec, + nano: -self.nano, + fsp: self.fsp, + } + } + + pub fn month(&self) -> i64 { + self.month + } + + pub fn sec(&self) -> i64 { + self.sec + } + + pub fn nano(&self) -> i64 { + self.nano + } + + pub fn fsp(&self) -> i8 { + self.fsp + } +} + +/// Convert to a string which has a uniform interval format and then can be +/// parsed into Interval struct. +pub trait ConvertToIntervalStr { + fn to_interval_string( + &self, + ctx: &mut EvalContext, + unit: IntervalUnit, + is_unsigned: bool, + decimal: isize, + ) -> Result; +} + +impl<'a> ConvertToIntervalStr for BytesRef<'a> { + #[inline] + fn to_interval_string( + &self, + ctx: &mut EvalContext, + unit: IntervalUnit, + _is_unsigned: bool, + _decimal: isize, + ) -> Result { + let mut interval = "0".to_string(); + let input = std::str::from_utf8(self).map_err(Error::Encoding)?; + use IntervalUnit::*; + match unit { + Microsecond | Minute | Hour | Day | Week | Month | Quarter | Year => { + let trimmed = input.trim(); + if let Some(m) = INTERVAL_REGEX.find(trimmed) { + interval = m.as_str().to_string(); + } + + if interval != trimmed { + ctx.handle_truncate(true)?; + } + } + Second => { + // The unit SECOND is specially handled, for example: + // date + INTERVAL "1e2" SECOND = date + INTERVAL 100 second + // date + INTERVAL "1.6" SECOND = date + INTERVAL 1.6 second + // But: + // date + INTERVAL "1e2" MINUTE = date + INTERVAL 1 MINUTE + // date + INTERVAL "1.6" MINUTE = date + INTERVAL 1 MINUTE + let dec = match Decimal::from_bytes(self) { + Ok(d) => d.into_result(ctx)?, + Err(_) => { + ctx.handle_truncate(true)?; + Decimal::zero() + } + }; + interval = dec.to_string(); + } + _ => { + interval = input.to_string(); + } + } + Ok(interval) + } +} + +impl ConvertToIntervalStr for i64 { + #[inline] + fn to_interval_string( + &self, + _ctx: &mut EvalContext, + _unit: IntervalUnit, + is_unsigned: bool, + _decimal: isize, + ) -> Result { + if is_unsigned { + Ok((*self as u64).to_string()) + } else { + Ok(self.to_string()) + } + } +} + +impl ConvertToIntervalStr for Real { + #[inline] + fn to_interval_string( + &self, + _ctx: &mut EvalContext, + _unit: IntervalUnit, + _is_unsigned: bool, + decimal: isize, + ) -> Result { + if decimal < 0 { + // Default + Ok(self.to_string()) + } else { + Ok(format!("{:.*}", decimal as usize, self.into_inner())) + } + } +} + +impl ConvertToIntervalStr for Decimal { + #[inline] + fn to_interval_string( + &self, + ctx: &mut EvalContext, + unit: IntervalUnit, + 
_is_unsigned: bool, + _decimal: isize, + ) -> Result { + let mut interval = self.to_string(); + use IntervalUnit::*; + match unit { + HourMinute | MinuteSecond | YearMonth | DayHour | DayMinute | DaySecond + | DayMicrosecond | HourMicrosecond | HourSecond | MinuteMicrosecond + | SecondMicrosecond => { + let mut neg = false; + if !interval.is_empty() && interval.starts_with('-') { + neg = true; + interval = interval[1..].to_string(); + } + match unit { + HourMinute | MinuteSecond => interval = interval.replace('.', ":"), + YearMonth => interval = interval.replace('.', "-"), + DayHour => interval = interval.replace('.', " "), + DayMinute => interval = "0 ".to_string() + &interval.replace('.', ":"), + DaySecond => interval = "0 00:".to_string() + &interval.replace('.', ":"), + DayMicrosecond => interval = "0 00:00:".to_string() + &interval, + HourMicrosecond => interval = "00:00:".to_string() + &interval, + HourSecond => interval = "00:".to_string() + &interval.replace('.', ":"), + MinuteMicrosecond => interval = "00:".to_string() + &interval, + SecondMicrosecond => (), + _ => unreachable!(), + } + if neg { + interval = "-".to_string() + &interval; + } + } + Second => (), + _ => { + let rounded = self.round(0, RoundMode::HalfEven).into_result(ctx)?; + let int_val = rounded.as_i64().into_result(ctx)?; + interval = int_val.to_string(); + } + } + Ok(interval) + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::expr::{EvalConfig, Flag}; + + #[test] + fn test_is_clock_unit() -> Result<()> { + let cases = vec![ + ("MICROSECOND", true), + ("secOnd", true), + ("MINUTE", true), + ("HOUR", true), + ("daY", false), + ("WeeK", false), + ("MONTH", false), + ("QUARTER", false), + ("year", false), + ("SECOND_MIcROSECOnD", true), + ("MINUTE_MICROSECOND", true), + ("MINUTE_second", true), + ("HOUR_MICROSECOND", true), + ("HOUR_SECOND", true), + ("HOUR_MINUTE", true), + ("DAY_MICROSECOND", true), + ("DAY_SECOND", true), + ("DAY_MINUTE", true), + ("DAY_HOUR", true), + ("year_MONTH", false), + ]; + for (str, result) in cases { + let unit = IntervalUnit::from_str(str)?; + assert_eq!(unit.is_clock_unit(), result); + } + Ok(()) + } + + #[test] + fn test_bytes_ref_to_interval_string() { + use IntervalUnit::*; + let cases = vec![ + (b"365" as &[u8], Microsecond, "365"), + (b"10", Minute, "10"), + (b"-123", Minute, "-123"), + (b"24", Hour, "24"), + (b" 365", Day, "365"), + (b"abc", Day, "0"), + (b" -221", Week, "-221"), + (b"a6", Month, "0"), + (b"-24a", Quarter, "-24"), + (b"1024", Year, "1024"), + (b"1e2", Second, "100"), + (b"-2e4", Second, "-20000"), + (b"1.6", Second, "1.6"), + (b"-1.6554", Second, "-1.6554"), + (b"sdfasersasd", Second, "0"), + ]; + + let mut config = EvalConfig::new(); + config.set_flag(Flag::TRUNCATE_AS_WARNING); + let mut ctx = EvalContext::new(std::sync::Arc::new(config)); + for (input, unit, expected) in cases { + let result = input.to_interval_string(&mut ctx, unit, false, 0).unwrap(); + assert_eq!(result, expected); + } + + let mut ctx = EvalContext::default(); + let err_cases = vec![(b"abc" as &[u8], Day), (b"a6", Month), (b"-24a", Quarter)]; + for (input, unit) in err_cases { + input + .to_interval_string(&mut ctx, unit, false, 0) + .unwrap_err(); + } + } + + #[test] + fn test_i64_to_interval_string() { + let cases = vec![ + (42i64, false, "42"), + (-100i64, false, "-100"), + (0i64, false, "0"), + (9999999999i64, false, "9999999999"), + (-9999999999i64, false, "-9999999999"), + (9999999999i64, true, "9999999999"), + (-9999999999i64, true, "18446744063709551617"), + ]; + + 
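A note on the last two i64 cases above: with `is_unsigned` set, the value's two's-complement bit pattern is reinterpreted as u64 rather than negated, which is where `18446744063709551617` comes from:

```rust
// `as u64` reinterprets the bit pattern, i.e. 2^64 - 9_999_999_999.
fn main() {
    let v: i64 = -9_999_999_999;
    assert_eq!((v as u64).to_string(), "18446744063709551617");
    assert_eq!(u64::MAX - 9_999_999_999 + 1, 18_446_744_063_709_551_617);
}
```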
+        let mut ctx = EvalContext::default();
+        for (input, is_unsigned, expected) in cases {
+            let result = input
+                .to_interval_string(&mut ctx, IntervalUnit::Second, is_unsigned, 0)
+                .unwrap();
+            assert_eq!(result, expected);
+        }
+    }
+
+    #[test]
+    fn test_real_to_interval_string() {
+        let mut ctx = EvalContext::default();
+
+        let cases = vec![
+            (1.2345, 4, "1.2345"),
+            (1.2345, 5, "1.23450"),
+            (1.2345, 2, "1.23"),
+            (-1.6789, 3, "-1.679"),
+            (-1.6789, 6, "-1.678900"),
+            (100.779, 0, "101"),
+            (-100.779, 0, "-101"),
+            (-123.123, -1, "-123.123"),
+            (-123.1239123, -1, "-123.1239123"),
+        ];
+
+        for (input, decimal, expected) in cases {
+            let real = Real::new(input).unwrap();
+            let result = real
+                .to_interval_string(&mut ctx, IntervalUnit::Second, false, decimal)
+                .unwrap();
+            assert_eq!(result, expected);
+        }
+    }
+
+    #[test]
+    fn test_decimal_to_interval() {
+        use IntervalUnit::*;
+        let cases = vec![
+            // Basic unit cases
+            ("12.34", Year, "12"),
+            ("-12.34", Month, "-12"),
+            ("12.5", Day, "13"),
+            ("12.45", Hour, "12"),
+            ("-12.6", Minute, "-13"),
+            ("12.34", Second, "12.34"),
+            ("-12.34", Second, "-12.34"),
+            // Compound unit cases
+            ("12.34", HourMinute, "12:34"),
+            ("-12.34", MinuteSecond, "-12:34"),
+            ("12.34", YearMonth, "12-34"),
+            ("-12.34", YearMonth, "-12-34"),
+            ("12.34", DayHour, "12 34"),
+            ("-12.34", DayHour, "-12 34"),
+            ("12.34", DayMinute, "0 12:34"),
+            ("-12.3400", DayMinute, "-0 12:3400"),
+            ("12.34", DaySecond, "0 00:12:34"),
+            ("-12.34", DaySecond, "-0 00:12:34"),
+            ("12.34", DayMicrosecond, "0 00:00:12.34"),
+            ("-12.34", DayMicrosecond, "-0 00:00:12.34"),
+            ("12.34", HourMicrosecond, "00:00:12.34"),
+            ("-12.34", HourMicrosecond, "-00:00:12.34"),
+            ("12.34", HourSecond, "00:12:34"),
+            ("-12.34", HourSecond, "-00:12:34"),
+            ("12.34", MinuteMicrosecond, "00:12.34"),
+            ("-12.34", MinuteMicrosecond, "-00:12.34"),
+            ("12.34", SecondMicrosecond, "12.34"),
+            ("-12.34", SecondMicrosecond, "-12.34"),
+            // Rounding case
+            ("12.99", Year, "13"),
+            ("12.49", Year, "12"),
+            ("-12.99", Year, "-13"),
+        ];
+
+        let mut ctx = EvalContext::default();
+        for (input, unit, expected) in cases {
+            let decimal = Decimal::from_str(input).unwrap();
+            let result = decimal
+                .to_interval_string(&mut ctx, unit, false, 0)
+                .unwrap();
+            assert_eq!(
+                result, expected,
+                "Failed for input: {}, unit: {:?}",
+                input, unit
+            );
+        }
+    }
+
+    #[test]
+    fn test_interval_parse_from_str() {
+        use IntervalUnit::*;
+        let cases = vec![
+            (
+                "123456",
+                Microsecond,
+                Interval {
+                    month: 0,
+                    sec: 0,
+                    nano: 123456 * NANOS_PER_MICRO,
+                    fsp: 6,
+                },
+            ),
+            (
+                "-123456",
+                Microsecond,
+                Interval {
+                    month: 0,
+                    sec: 0,
+                    nano: -123456 * NANOS_PER_MICRO,
+                    fsp: 6,
+                },
+            ),
+            (
+                "2.123456",
+                Second,
+                Interval {
+                    month: 0,
+                    sec: 2,
+                    nano: 123456 * NANOS_PER_MICRO,
+                    fsp: 6,
+                },
+            ),
+            (
+                "-2.123456",
+                Second,
+                Interval {
+                    month: 0,
+                    sec: -2,
+                    nano: -123456 * NANOS_PER_MICRO,
+                    fsp: 6,
+                },
+            ),
+            (
+                "2.12345",
+                Second,
+                Interval {
+                    month: 0,
+                    sec: 2,
+                    nano: 123450 * NANOS_PER_MICRO,
+                    fsp: 5,
+                },
+            ),
+            (
+                "-2.12345",
+                Second,
+                Interval {
+                    month: 0,
+                    sec: -2,
+                    nano: -123450 * NANOS_PER_MICRO,
+                    fsp: 5,
+                },
+            ),
+            (
+                "2.1234567",
+                Second,
+                Interval {
+                    month: 0,
+                    sec: 2,
+                    nano: 123456 * NANOS_PER_MICRO,
+                    fsp: 6,
+                },
+            ),
+            (
+                "-2.1234567",
+                Second,
+                Interval {
+                    month: 0,
+                    sec: -2,
+                    nano: -123456 * NANOS_PER_MICRO,
+                    fsp: 6,
+                },
+            ),
+            (
+                "2.99",
+                Second,
+                Interval {
+                    month: 0,
+                    sec: 2,
+                    nano: 990000 * NANOS_PER_MICRO,
+                    fsp: 2,
+                },
+            ),
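+            // fsp tracks the number of fractional digits supplied (capped at
+            // MAX_FSP == 6); digits beyond the sixth are truncated, not rounded.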
+            (
+                "-2.50000",
+                Second,
+                Interval {
+                    month: 0,
+                    sec: -2,
+                    nano: -500000 * NANOS_PER_MICRO,
+                    fsp: 5,
+                },
+            ),
+            (
+                "2.500000",
+                Minute,
+                Interval {
+                    month: 0,
+                    sec: 3 * SECS_PER_MINUTE,
+                    nano: 0,
+                    fsp: 0,
+                },
+            ),
+            (
+                "-2.50000",
+                Minute,
+                Interval {
+                    month: 0,
+                    sec: -3 * SECS_PER_MINUTE,
+                    nano: 0,
+                    fsp: 0,
+                },
+            ),
+            (
+                "99.9",
+                Minute,
+                Interval {
+                    month: 0,
+                    sec: 100 * SECS_PER_MINUTE,
+                    nano: 0,
+                    fsp: 0,
+                },
+            ),
+            (
+                "-99.4",
+                Minute,
+                Interval {
+                    month: 0,
+                    sec: -99 * SECS_PER_MINUTE,
+                    nano: 0,
+                    fsp: 0,
+                },
+            ),
+            (
+                "99.9",
+                Hour,
+                Interval {
+                    month: 0,
+                    sec: 100 * SECS_PER_HOUR,
+                    nano: 0,
+                    fsp: 0,
+                },
+            ),
+            (
+                "-99.4",
+                Hour,
+                Interval {
+                    month: 0,
+                    sec: -99 * SECS_PER_HOUR,
+                    nano: 0,
+                    fsp: 0,
+                },
+            ),
+            (
+                "99.9",
+                Day,
+                Interval {
+                    month: 0,
+                    sec: 100 * SECS_PER_DAY,
+                    nano: 0,
+                    fsp: 0,
+                },
+            ),
+            (
+                "-99.4",
+                Day,
+                Interval {
+                    month: 0,
+                    sec: -99 * SECS_PER_DAY,
+                    nano: 0,
+                    fsp: 0,
+                },
+            ),
+            (
+                "99.9",
+                Week,
+                Interval {
+                    month: 0,
+                    sec: 100 * SECS_PER_DAY * 7,
+                    nano: 0,
+                    fsp: 0,
+                },
+            ),
+            (
+                "-99.4",
+                Week,
+                Interval {
+                    month: 0,
+                    sec: -99 * SECS_PER_DAY * 7,
+                    nano: 0,
+                    fsp: 0,
+                },
+            ),
+            (
+                "99.9",
+                Month,
+                Interval {
+                    month: 100,
+                    sec: 0,
+                    nano: 0,
+                    fsp: 0,
+                },
+            ),
+            (
+                "-99.4",
+                Month,
+                Interval {
+                    month: -99,
+                    sec: 0,
+                    nano: 0,
+                    fsp: 0,
+                },
+            ),
+            (
+                "99.9",
+                Quarter,
+                Interval {
+                    month: 100 * 3,
+                    sec: 0,
+                    nano: 0,
+                    fsp: 0,
+                },
+            ),
+            (
+                "-99.4",
+                Quarter,
+                Interval {
+                    month: -99 * 3,
+                    sec: 0,
+                    nano: 0,
+                    fsp: 0,
+                },
+            ),
+            (
+                "99.9",
+                Year,
+                Interval {
+                    month: 100 * 12,
+                    sec: 0,
+                    nano: 0,
+                    fsp: 0,
+                },
+            ),
+            (
+                "-99.4",
+                Year,
+                Interval {
+                    month: -99 * 12,
+                    sec: 0,
+                    nano: 0,
+                    fsp: 0,
+                },
+            ),
+            // Compound unit cases
+            (
+                "123",
+                SecondMicrosecond,
+                Interval {
+                    month: 0,
+                    sec: 0,
+                    nano: 123000 * NANOS_PER_MICRO,
+                    fsp: 6,
+                },
+            ),
+            (
+                "-123",
+                SecondMicrosecond,
+                Interval {
+                    month: 0,
+                    sec: 0,
+                    nano: -123000 * NANOS_PER_MICRO,
+                    fsp: 6,
+                },
+            ),
+            (
+                "123.123",
+                SecondMicrosecond,
+                Interval {
+                    month: 0,
+                    sec: 123,
+                    nano: 123000 * NANOS_PER_MICRO,
+                    fsp: 6,
+                },
+            ),
+            (
+                "-123.123",
+                SecondMicrosecond,
+                Interval {
+                    month: 0,
+                    sec: -123,
+                    nano: -123000 * NANOS_PER_MICRO,
+                    fsp: 6,
+                },
+            ),
+            (
+                "123",
+                MinuteMicrosecond,
+                Interval {
+                    month: 0,
+                    sec: 0,
+                    nano: 123000 * NANOS_PER_MICRO,
+                    fsp: 6,
+                },
+            ),
+            (
+                "-123",
+                MinuteMicrosecond,
+                Interval {
+                    month: 0,
+                    sec: 0,
+                    nano: -123000 * NANOS_PER_MICRO,
+                    fsp: 6,
+                },
+            ),
+            (
+                "123.123",
+                MinuteMicrosecond,
+                Interval {
+                    month: 0,
+                    sec: 123,
+                    nano: 123000 * NANOS_PER_MICRO,
+                    fsp: 6,
+                },
+            ),
+            (
+                "-123.123",
+                MinuteMicrosecond,
+                Interval {
+                    month: 0,
+                    sec: -123,
+                    nano: -123000 * NANOS_PER_MICRO,
+                    fsp: 6,
+                },
+            ),
+            (
+                "2:123.123",
+                MinuteMicrosecond,
+                Interval {
+                    month: 0,
+                    sec: 2 * SECS_PER_MINUTE + 123,
+                    nano: 123000 * NANOS_PER_MICRO,
+                    fsp: 6,
+                },
+            ),
+            (
+                "-62:123.123",
+                MinuteMicrosecond,
+                Interval {
+                    month: 0,
+                    sec: -62 * SECS_PER_MINUTE - 123,
+                    nano: -123000 * NANOS_PER_MICRO,
+                    fsp: 6,
+                },
+            ),
+            (
+                "123",
+                MinuteSecond,
+                Interval {
+                    month: 0,
+                    sec: 123,
+                    nano: 0,
+                    fsp: 0,
+                },
+            ),
+            (
+                "-123",
+                MinuteSecond,
+                Interval {
+                    month: 0,
+                    sec: -123,
+                    nano: 0,
+                    fsp: 0,
+                },
+            ),
+            (
+                "2:123",
+                MinuteSecond,
+                Interval {
+                    month: 0,
+                    sec: 2 * SECS_PER_MINUTE + 123,
+                    nano: 0,
+                    fsp: 0,
+                },
+            ),
+            (
+                "-2:123",
+                MinuteSecond,
+                Interval {
+                    month: 0,
+                    sec: -2 * SECS_PER_MINUTE - 123,
+                    nano: 0,
+                    fsp: 0,
+                },
+            ),
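+            // HOUR_MICROSECOND fills fields from the right: "S.micros", then
+            // "M:S.micros", then "H:M:S.micros"; out-of-range fields (e.g. 62
+            // minutes) are simply summed into total seconds.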
+            (
+                "123",
+                HourMicrosecond,
+                Interval {
+                    month: 0,
+                    sec: 0,
+                    nano: 123000 * NANOS_PER_MICRO,
+                    fsp: 6,
+                },
+            ),
+            (
+                "-123",
+                HourMicrosecond,
+                Interval {
+                    month: 0,
+                    sec: 0,
+                    nano: -123000 * NANOS_PER_MICRO,
+                    fsp: 6,
+                },
+            ),
+            (
+                "123.123",
+                HourMicrosecond,
+                Interval {
+                    month: 0,
+                    sec: 123,
+                    nano: 123000 * NANOS_PER_MICRO,
+                    fsp: 6,
+                },
+            ),
+            (
+                "-123.123",
+                HourMicrosecond,
+                Interval {
+                    month: 0,
+                    sec: -123,
+                    nano: -123000 * NANOS_PER_MICRO,
+                    fsp: 6,
+                },
+            ),
+            (
+                "2:123.123",
+                HourMicrosecond,
+                Interval {
+                    month: 0,
+                    sec: 2 * SECS_PER_MINUTE + 123,
+                    nano: 123000 * NANOS_PER_MICRO,
+                    fsp: 6,
+                },
+            ),
+            (
+                "-62:123.123",
+                HourMicrosecond,
+                Interval {
+                    month: 0,
+                    sec: -62 * SECS_PER_MINUTE - 123,
+                    nano: -123000 * NANOS_PER_MICRO,
+                    fsp: 6,
+                },
+            ),
+            (
+                "12:2:123.123",
+                HourMicrosecond,
+                Interval {
+                    month: 0,
+                    sec: 12 * SECS_PER_HOUR + 2 * SECS_PER_MINUTE + 123,
+                    nano: 123000 * NANOS_PER_MICRO,
+                    fsp: 6,
+                },
+            ),
+            (
+                "-2:62:123.123",
+                HourMicrosecond,
+                Interval {
+                    month: 0,
+                    sec: -2 * SECS_PER_HOUR - 62 * SECS_PER_MINUTE - 123,
+                    nano: -123000 * NANOS_PER_MICRO,
+                    fsp: 6,
+                },
+            ),
+            (
+                "123",
+                HourSecond,
+                Interval {
+                    month: 0,
+                    sec: 123,
+                    nano: 0,
+                    fsp: 0,
+                },
+            ),
+            (
+                "-123",
+                HourSecond,
+                Interval {
+                    month: 0,
+                    sec: -123,
+                    nano: 0,
+                    fsp: 0,
+                },
+            ),
+            (
+                "2:123",
+                HourSecond,
+                Interval {
+                    month: 0,
+                    sec: 2 * SECS_PER_MINUTE + 123,
+                    nano: 0,
+                    fsp: 0,
+                },
+            ),
+            (
+                "-2:123",
+                HourSecond,
+                Interval {
+                    month: 0,
+                    sec: -2 * SECS_PER_MINUTE - 123,
+                    nano: 0,
+                    fsp: 0,
+                },
+            ),
+            (
+                "9:62:123",
+                HourSecond,
+                Interval {
+                    month: 0,
+                    sec: 9 * SECS_PER_HOUR + 62 * SECS_PER_MINUTE + 123,
+                    nano: 0,
+                    fsp: 0,
+                },
+            ),
+            (
+                "-55:62:123",
+                HourSecond,
+                Interval {
+                    month: 0,
+                    sec: -55 * SECS_PER_HOUR - 62 * SECS_PER_MINUTE - 123,
+                    nano: 0,
+                    fsp: 0,
+                },
+            ),
+            (
+                "123",
+                HourMinute,
+                Interval {
+                    month: 0,
+                    sec: 123 * SECS_PER_MINUTE,
+                    nano: 0,
+                    fsp: 0,
+                },
+            ),
+            (
+                "-123",
+                HourMinute,
+                Interval {
+                    month: 0,
+                    sec: -123 * SECS_PER_MINUTE,
+                    nano: 0,
+                    fsp: 0,
+                },
+            ),
+            (
+                "2:123",
+                HourMinute,
+                Interval {
+                    month: 0,
+                    sec: 2 * SECS_PER_HOUR + 123 * SECS_PER_MINUTE,
+                    nano: 0,
+                    fsp: 0,
+                },
+            ),
+            (
+                "-88:123",
+                HourMinute,
+                Interval {
+                    month: 0,
+                    sec: -88 * SECS_PER_HOUR - 123 * SECS_PER_MINUTE,
+                    nano: 0,
+                    fsp: 0,
+                },
+            ),
+            (
+                "123",
+                DayMicrosecond,
+                Interval {
+                    month: 0,
+                    sec: 0,
+                    nano: 123000 * NANOS_PER_MICRO,
+                    fsp: 6,
+                },
+            ),
+            (
+                "-123",
+                DayMicrosecond,
+                Interval {
+                    month: 0,
+                    sec: 0,
+                    nano: -123000 * NANOS_PER_MICRO,
+                    fsp: 6,
+                },
+            ),
+            (
+                "123.123",
+                DayMicrosecond,
+                Interval {
+                    month: 0,
+                    sec: 123,
+                    nano: 123000 * NANOS_PER_MICRO,
+                    fsp: 6,
+                },
+            ),
+            (
+                "-123.123",
+                DayMicrosecond,
+                Interval {
+                    month: 0,
+                    sec: -123,
+                    nano: -123000 * NANOS_PER_MICRO,
+                    fsp: 6,
+                },
+            ),
+            (
+                "2:123.123",
+                DayMicrosecond,
+                Interval {
+                    month: 0,
+                    sec: 2 * SECS_PER_MINUTE + 123,
+                    nano: 123000 * NANOS_PER_MICRO,
+                    fsp: 6,
+                },
+            ),
+            (
+                "-62:123.123",
+                DayMicrosecond,
+                Interval {
+                    month: 0,
+                    sec: -62 * SECS_PER_MINUTE - 123,
+                    nano: -123000 * NANOS_PER_MICRO,
+                    fsp: 6,
+                },
+            ),
+            (
+                "12:2:123.123",
+                DayMicrosecond,
+                Interval {
+                    month: 0,
+                    sec: 12 * SECS_PER_HOUR + 2 * SECS_PER_MINUTE + 123,
+                    nano: 123000 * NANOS_PER_MICRO,
+                    fsp: 6,
+                },
+            ),
+            (
+                "-2:62:123.123",
+                DayMicrosecond,
+                Interval {
+                    month: 0,
+                    sec: -2 * SECS_PER_HOUR - 62 * SECS_PER_MINUTE - 123,
+                    nano: -123000 * NANOS_PER_MICRO,
+                    fsp: 6,
+                },
+            ),
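+            // A fully specified DAY_MICROSECOND value, "D H:M:S.micros"; the
+            // day field joins the sum as whole days of seconds.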
+            (
+                "9 12:2:123.123",
+                DayMicrosecond,
+                Interval {
+                    month: 0,
+                    sec: 9 * SECS_PER_DAY + 12 * SECS_PER_HOUR + 2 * SECS_PER_MINUTE + 123,
+                    nano: 123000 * NANOS_PER_MICRO,
+                    fsp: 6,
+                },
+            ),
+            (
+                "-77 2:62:123.123456789",
+                DayMicrosecond,
+                Interval {
+                    month: 0,
+                    sec: -77 * SECS_PER_DAY - 2 * SECS_PER_HOUR - 62 * SECS_PER_MINUTE - 123,
+                    nano: -123456789 * NANOS_PER_MICRO,
+                    fsp: 6,
+                },
+            ),
+            (
+                "123",
+                DaySecond,
+                Interval {
+                    month: 0,
+                    sec: 123,
+                    nano: 0,
+                    fsp: 0,
+                },
+            ),
+            (
+                "-123",
+                DaySecond,
+                Interval {
+                    month: 0,
+                    sec: -123,
+                    nano: 0,
+                    fsp: 0,
+                },
+            ),
+            (
+                "2:123",
+                DaySecond,
+                Interval {
+                    month: 0,
+                    sec: 2 * SECS_PER_MINUTE + 123,
+                    nano: 0,
+                    fsp: 0,
+                },
+            ),
+            (
+                "-2:123",
+                DaySecond,
+                Interval {
+                    month: 0,
+                    sec: -2 * SECS_PER_MINUTE - 123,
+                    nano: 0,
+                    fsp: 0,
+                },
+            ),
+            (
+                "9:62:123",
+                DaySecond,
+                Interval {
+                    month: 0,
+                    sec: 9 * SECS_PER_HOUR + 62 * SECS_PER_MINUTE + 123,
+                    nano: 0,
+                    fsp: 0,
+                },
+            ),
+            (
+                "-55:62:123",
+                DaySecond,
+                Interval {
+                    month: 0,
+                    sec: -55 * SECS_PER_HOUR - 62 * SECS_PER_MINUTE - 123,
+                    nano: 0,
+                    fsp: 0,
+                },
+            ),
+            (
+                "1 9:62:123",
+                DaySecond,
+                Interval {
+                    month: 0,
+                    sec: SECS_PER_DAY + 9 * SECS_PER_HOUR + 62 * SECS_PER_MINUTE + 123,
+                    nano: 0,
+                    fsp: 0,
+                },
+            ),
+            (
+                "-3 55:62:123",
+                DaySecond,
+                Interval {
+                    month: 0,
+                    sec: -3 * SECS_PER_DAY - 55 * SECS_PER_HOUR - 62 * SECS_PER_MINUTE - 123,
+                    nano: 0,
+                    fsp: 0,
+                },
+            ),
+            (
+                "123",
+                DayMinute,
+                Interval {
+                    month: 0,
+                    sec: 123 * SECS_PER_MINUTE,
+                    nano: 0,
+                    fsp: 0,
+                },
+            ),
+            (
+                "-123",
+                DayMinute,
+                Interval {
+                    month: 0,
+                    sec: -123 * SECS_PER_MINUTE,
+                    nano: 0,
+                    fsp: 0,
+                },
+            ),
+            (
+                "2:123",
+                DayMinute,
+                Interval {
+                    month: 0,
+                    sec: 2 * SECS_PER_HOUR + 123 * SECS_PER_MINUTE,
+                    nano: 0,
+                    fsp: 0,
+                },
+            ),
+            (
+                "-88:123",
+                DayMinute,
+                Interval {
+                    month: 0,
+                    sec: -88 * SECS_PER_HOUR - 123 * SECS_PER_MINUTE,
+                    nano: 0,
+                    fsp: 0,
+                },
+            ),
+            (
+                "08 2:123",
+                DayMinute,
+                Interval {
+                    month: 0,
+                    sec: 8 * SECS_PER_DAY + 2 * SECS_PER_HOUR + 123 * SECS_PER_MINUTE,
+                    nano: 0,
+                    fsp: 0,
+                },
+            ),
+            (
+                "-70 88:123",
+                DayMinute,
+                Interval {
+                    month: 0,
+                    sec: -70 * SECS_PER_DAY - 88 * SECS_PER_HOUR - 123 * SECS_PER_MINUTE,
+                    nano: 0,
+                    fsp: 0,
+                },
+            ),
+            (
+                "123",
+                DayHour,
+                Interval {
+                    month: 0,
+                    sec: 123 * SECS_PER_HOUR,
+                    nano: 0,
+                    fsp: 0,
+                },
+            ),
+            (
+                "-123",
+                DayHour,
+                Interval {
+                    month: 0,
+                    sec: -123 * SECS_PER_HOUR,
+                    nano: 0,
+                    fsp: 0,
+                },
+            ),
+            (
+                "66 123",
+                DayHour,
+                Interval {
+                    month: 0,
+                    sec: 66 * SECS_PER_DAY + 123 * SECS_PER_HOUR,
+                    nano: 0,
+                    fsp: 0,
+                },
+            ),
+            (
+                "-77 123",
+                DayHour,
+                Interval {
+                    month: 0,
+                    sec: -77 * SECS_PER_DAY - 123 * SECS_PER_HOUR,
+                    nano: 0,
+                    fsp: 0,
+                },
+            ),
+            (
+                "123",
+                YearMonth,
+                Interval {
+                    month: 123,
+                    sec: 0,
+                    nano: 0,
+                    fsp: 0,
+                },
+            ),
+            (
+                "-123",
+                YearMonth,
+                Interval {
+                    month: -123,
+                    sec: 0,
+                    nano: 0,
+                    fsp: 0,
+                },
+            ),
+            (
+                "99 123",
+                YearMonth,
+                Interval {
+                    month: 99 * 12 + 123,
+                    sec: 0,
+                    nano: 0,
+                    fsp: 0,
+                },
+            ),
+            (
+                "-7 123",
+                YearMonth,
+                Interval {
+                    month: -7 * 12 - 123,
+                    sec: 0,
+                    nano: 0,
+                    fsp: 0,
+                },
+            ),
+        ];
+        let mut ctx = EvalContext::default();
+        for (input, unit, expected) in cases {
+            let result = Interval::parse_from_str(&mut ctx, &unit, input)
+                .unwrap()
+                .unwrap();
+            assert_eq!(
+                result, expected,
+                "Failed for input: {}, unit: {:?}",
+                input, unit
+            );
+        }
+
+        let err_cases = vec![
+            ("12:12.123", SecondMicrosecond),
+            ("20:12:12.123", MinuteMicrosecond),
+            ("12:12:12", MinuteSecond),
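+            // "1 12:12:12.11" supplies a day field that HOUR_MICROSECOND cannot
+            // hold, so it too collapses to the all-zero Interval asserted below.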
+            ("1 12:12:12.11", HourMicrosecond),
+            ("12:12:12.123", HourSecond),
+            ("12:12:12", HourMinute),
+            ("3 2 12:12:12.123", DayMicrosecond),
+            ("3 12:12:12.123", DaySecond),
+            ("3 12:12:12", DayMinute),
+            ("3 12:12", DayHour),
+            ("99 123:123", YearMonth),
+        ];
+        for (input, unit) in err_cases {
+            let result = Interval::parse_from_str(&mut ctx, &unit, input)
+                .unwrap()
+                .unwrap();
+            assert_eq!(
+                result,
+                Interval {
+                    month: 0,
+                    sec: 0,
+                    nano: 0,
+                    fsp: DEFAULT_FSP,
+                },
+                "Failed for input: {}, unit: {:?}",
+                input,
+                unit
+            );
+        }
+
+        let none_cases = vec![
+            // 2^54 * 1000 > 2^63
+            ("8791026472627208192", Microsecond),
+            ("-8791026472627208192", Microsecond),
+            // 2^60 * 60 > 2^63
+            ("1152921504606846976", Minute),
+            ("-1152921504606846976", Minute),
+            // 2^55 * 3600 > 2^63
+            ("36028797018963968", Hour),
+            ("-36028797018963968", Hour),
+            // 2^47 * 86400 > 2^63
+            ("140737488355328", Day),
+            ("-140737488355328", Day),
+            // 2^44 * 86400 * 7 > 2^63
+            ("17592186044416", Week),
+            ("-17592186044416", Week),
+            // 2^62 * 3 > 2^63
+            ("4611686018427387904", Quarter),
+            ("-4611686018427387904", Quarter),
+            // 2^60 * 12 > 2^63
+            ("1152921504606846976", Year),
+            ("-1152921504606846976", Year),
+            ("140737488355328 12:12:12.123", DayMicrosecond),
+            ("-2 36028797018963968:12:12.123", DayMicrosecond),
+            ("-2 12:1152921504606846976:12.123", DayMicrosecond),
+            ("-2 12:12:9223372036854731888.123", DayMicrosecond),
+        ];
+        for (input, unit) in none_cases {
+            let result = Interval::parse_from_str(&mut ctx, &unit, input).unwrap();
+            assert!(
+                result.is_none(),
+                "Failed for input: {}, unit: {:?}",
+                input,
+                unit
+            );
+        }
+    }
+
+    #[test]
+    fn test_interval_extract_duration() {
+        use IntervalUnit::*;
+        let cases = vec![
+            (
+                "123456",
+                Microsecond,
+                Duration::from_nanos(123456 * NANOS_PER_MICRO, 6),
+            ),
+            (
+                "-123456",
+                Microsecond,
+                Duration::from_nanos(-123456 * NANOS_PER_MICRO, 6),
+            ),
+            (
+                "2.123456",
+                Second,
+                Duration::from_nanos(2 * NANOS_PER_SEC + 123456 * NANOS_PER_MICRO, 6),
+            ),
+            (
+                "-2.123456",
+                Second,
+                Duration::from_nanos(-2 * NANOS_PER_SEC - 123456 * NANOS_PER_MICRO, 6),
+            ),
+            (
+                "2.12345",
+                Second,
+                Duration::from_nanos(2 * NANOS_PER_SEC + 123450 * NANOS_PER_MICRO, 5),
+            ),
+            (
+                "-2.12345",
+                Second,
+                Duration::from_nanos(-2 * NANOS_PER_SEC - 123450 * NANOS_PER_MICRO, 5),
+            ),
+            (
+                "2.1234567",
+                Second,
+                Duration::from_nanos(2 * NANOS_PER_SEC + 123456 * NANOS_PER_MICRO, 6),
+            ),
+            (
+                "-2.1234567",
+                Second,
+                Duration::from_nanos(-2 * NANOS_PER_SEC - 123456 * NANOS_PER_MICRO, 6),
+            ),
+            (
+                "2.99",
+                Second,
+                Duration::from_nanos(2 * NANOS_PER_SEC + 990000 * NANOS_PER_MICRO, 2),
+            ),
+            (
+                "-2.50000",
+                Second,
+                Duration::from_nanos(-2 * NANOS_PER_SEC - 500000 * NANOS_PER_MICRO, 5),
+            ),
+            ("99", Minute, Duration::from_nanos(99 * NANOS_PER_MINUTE, 0)),
+            (
+                "-99",
+                Minute,
+                Duration::from_nanos(-99 * NANOS_PER_MINUTE, 0),
+            ),
+            ("30", Day, Duration::from_nanos(30 * NANOS_PER_DAY, 0)),
+            ("-30", Day, Duration::from_nanos(-30 * NANOS_PER_DAY, 0)),
+            ("2", Week, Duration::from_nanos(2 * NANOS_PER_DAY * 7, 0)),
+            ("-2", Week, Duration::from_nanos(-2 * NANOS_PER_DAY * 7, 0)),
+            ("1", Month, Duration::from_nanos(30 * NANOS_PER_DAY, 0)),
+            ("-1", Month, Duration::from_nanos(-30 * NANOS_PER_DAY, 0)),
+            (
+                "29 12:23:36.1234",
+                DayMicrosecond,
+                Duration::from_nanos(
+                    29 * NANOS_PER_DAY
+                        + 12 * NANOS_PER_HOUR
+                        + 23 * NANOS_PER_MINUTE
+                        + 36 * NANOS_PER_SEC
+                        + 123400 * NANOS_PER_MICRO,
+                    6,
+                ),
+            ),
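+            // The same compound value with a leading '-' negates every field,
+            // fractional microseconds included.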
+            (
+                "-29 12:23:36.1234",
+                DayMicrosecond,
+                Duration::from_nanos(
+                    -29 * NANOS_PER_DAY
+                        - 12 * NANOS_PER_HOUR
+                        - 23 * NANOS_PER_MINUTE
+                        - 36 * NANOS_PER_SEC
+                        - 123400 * NANOS_PER_MICRO,
+                    6,
+                ),
+            ),
+        ];
+        let mut ctx = EvalContext::default();
+        for (input, unit, expected) in cases {
+            let result = Interval::extract_duration(&mut ctx, &unit, input).unwrap();
+            assert_eq!(
+                result,
+                expected.unwrap(),
+                "Failed for input: {}, unit: {:?}",
+                input,
+                unit
+            );
+        }
+        let err_cases = vec![
+            ("2.500000", Minute),
+            ("-2.50000", Minute),
+            ("99.9", Hour),
+            ("-99.4", Hour),
+            ("35", Day),
+            ("-35", Day),
+            ("2", Month),
+            ("-2", Month),
+            ("99", Quarter),
+            ("-99", Quarter),
+            ("99", Year),
+            ("-99", Year),
+            ("-34 23:59:59.1234", DayMicrosecond),
+        ];
+        for (input, unit) in err_cases {
+            let result = Interval::extract_duration(&mut ctx, &unit, input);
+            result.unwrap_err();
+        }
+    }
+}
diff --git a/components/tidb_query_datatype/src/codec/mysql/time/mod.rs b/components/tidb_query_datatype/src/codec/mysql/time/mod.rs
index 4befe30c3c1..97f756bc1fe 100644
--- a/components/tidb_query_datatype/src/codec/mysql/time/mod.rs
+++ b/components/tidb_query_datatype/src/codec/mysql/time/mod.rs
@@ -1,6 +1,7 @@
 // Copyright 2018 TiKV Project Authors. Licensed under Apache-2.0.
 
 pub mod extension;
+pub mod interval;
 mod tz;
 pub mod weekmode;
 
@@ -9,6 +10,7 @@ use std::{
     convert::{TryFrom, TryInto},
     fmt::Write,
     hash::{Hash, Hasher},
+    intrinsics::unlikely,
 };
 
 use bitfield::bitfield;
@@ -22,7 +24,7 @@ use crate::{
     codec::{
         convert::ConvertTo,
         data_type::Real,
-        mysql::{check_fsp, Decimal, Duration},
+        mysql::{check_fsp, duration::SECS_PER_DAY, Decimal, Duration, Res, DEFAULT_FSP, MAX_FSP},
         Error, Result, TEN_POW,
     },
     expr::{EvalContext, Flag, SqlMode},
@@ -565,7 +567,7 @@ mod parser {
     pub fn parse(
         ctx: &mut EvalContext,
         input: &str,
-        time_type: TimeType,
+        time_type_opt: Option<TimeType>,
         fsp: u8,
         round: bool,
     ) -> Option