From 1e68e63c57003d1ea9060d8cbca4bcd61da624f5 Mon Sep 17 00:00:00 2001 From: Kyle Simpson Date: Tue, 3 Dec 2024 13:48:17 +0000 Subject: [PATCH 01/22] Chore: Bump OPTE->0.34.311 (#7195) This picks up fixes for oxidecomputer/opte#618 and oxidecomputer/opte#624. --- Cargo.lock | 238 +++++++++++++++++++----------------- Cargo.toml | 4 +- tools/opte_version | 2 +- tools/opte_version_override | 2 +- 4 files changed, 133 insertions(+), 113 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 71ead293b1..a53447a075 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -287,7 +287,7 @@ dependencies = [ "bb8", "diesel", "futures", - "thiserror", + "thiserror 1.0.69", "tokio", ] @@ -534,7 +534,7 @@ dependencies = [ "serde_with", "sha3", "static_assertions", - "thiserror", + "thiserror 1.0.69", ] [[package]] @@ -687,7 +687,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "85b6598a2f5d564fb7855dc6b06fd1c38cff5a72bd8b863a4d021938497b440a" dependencies = [ "serde", - "thiserror", + "thiserror 1.0.69", ] [[package]] @@ -895,7 +895,7 @@ dependencies = [ "slog", "slog-async", "slog-term", - "thiserror", + "thiserror 1.0.69", "tokio", "uuid", "vsss-rs", @@ -1056,7 +1056,7 @@ dependencies = [ "semver 1.0.23", "serde", "serde_json", - "thiserror", + "thiserror 1.0.69", ] [[package]] @@ -1449,7 +1449,7 @@ dependencies = [ "schemars", "serde", "serde_json", - "thiserror", + "thiserror 1.0.69", "tokio", ] @@ -1668,7 +1668,7 @@ source = "git+https://github.com/oxidecomputer/propolis?rev=fae5334bcad5e8647943 dependencies = [ "serde", "serde_derive", - "thiserror", + "thiserror 1.0.69", "toml 0.7.8", ] @@ -1869,7 +1869,7 @@ dependencies = [ "slog-dtrace", "slog-term", "tempfile", - "thiserror", + "thiserror 1.0.69", "tokio", "tokio-rustls 0.24.1", "toml 0.8.19", @@ -1904,7 +1904,7 @@ dependencies = [ "libc", "num-derive", "num-traits", - "thiserror", + "thiserror 1.0.69", ] [[package]] @@ -2150,7 +2150,7 @@ version = "0.3.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ff4a5fefe330e8d7f31b16a318f9ce81000d8e35e69b93eae154d16d2278f70f" dependencies = [ - "thiserror", + "thiserror 1.0.69", ] [[package]] @@ -2272,7 +2272,7 @@ dependencies = [ "hex", "ipnet", "rand", - "thiserror", + "thiserror 1.0.69", "trust-dns-proto", "url", ] @@ -2425,7 +2425,7 @@ dependencies = [ "libdlpi-sys", "num_enum", "pretty-hex 0.2.1", - "thiserror", + "thiserror 1.0.69", "tokio", ] @@ -2464,7 +2464,7 @@ dependencies = [ "slog-term", "subprocess", "tempfile", - "thiserror", + "thiserror 1.0.69", "tokio", "toml 0.8.19", "uuid", @@ -2514,7 +2514,7 @@ dependencies = [ "pretty-hex 0.4.1", "serde", "serde_json", - "thiserror", + "thiserror 1.0.69", "zerocopy 0.7.35", ] @@ -2639,7 +2639,7 @@ dependencies = [ "slog-bunyan", "slog-json", "slog-term", - "thiserror", + "thiserror 1.0.69", "tokio", "tokio-rustls 0.25.0", "toml 0.8.19", @@ -2700,7 +2700,7 @@ checksum = "71734e3eb68cd4df338d04dffdcc024f89eb0b238150cc95b826fbfad756452b" dependencies = [ "pest", "pest_derive", - "thiserror", + "thiserror 1.0.69", ] [[package]] @@ -3429,7 +3429,7 @@ dependencies = [ "slog-error-chain", "socket2", "string_cache", - "thiserror", + "thiserror 1.0.69", "tlvc 0.3.1 (git+https://github.com/oxidecomputer/tlvc.git?branch=main)", "tokio", "usdt", @@ -3828,7 +3828,7 @@ dependencies = [ "once_cell", "radix_trie", "rand", - "thiserror", + "thiserror 1.0.69", "tokio", "tracing", ] @@ -3850,7 +3850,7 @@ dependencies = [ "ipnet", "once_cell", "rand", - "thiserror", + "thiserror 1.0.69", "tinyvec", "tokio", 
"tracing", @@ -3873,7 +3873,7 @@ dependencies = [ "rand", "resolv-conf", "smallvec 1.13.2", - "thiserror", + "thiserror 1.0.69", "tokio", "tracing", ] @@ -3891,7 +3891,7 @@ dependencies = [ "futures-util", "hickory-proto", "serde", - "thiserror", + "thiserror 1.0.69", "time", "tokio", "tokio-util", @@ -4036,7 +4036,7 @@ dependencies = [ "similar", "stringmetrics", "tabwriter", - "thiserror", + "thiserror 1.0.69", "tokio", "url", ] @@ -4097,7 +4097,7 @@ dependencies = [ "object 0.30.4", "path-slash", "rsa", - "thiserror", + "thiserror 1.0.69", "tlvc 0.3.1 (git+https://github.com/oxidecomputer/tlvc)", "tlvc-text", "toml 0.7.8", @@ -4446,7 +4446,7 @@ dependencies = [ [[package]] name = "illumos-sys-hdrs" version = "0.1.0" -source = "git+https://github.com/oxidecomputer/opte?rev=98247c27846133a80fdb8f730f0c57e72d766561#98247c27846133a80fdb8f730f0c57e72d766561" +source = "git+https://github.com/oxidecomputer/opte?rev=b56afeeb14e0042cbd7bda85b166ed86ee17820e#b56afeeb14e0042cbd7bda85b166ed86ee17820e" [[package]] name = "illumos-utils" @@ -4481,7 +4481,7 @@ dependencies = [ "slog", "slog-error-chain", "smf", - "thiserror", + "thiserror 1.0.69", "tokio", "toml 0.8.19", "uuid", @@ -4650,7 +4650,7 @@ dependencies = [ "slog-term", "smf", "test-strategy", - "thiserror", + "thiserror 1.0.69", "tokio", "tokio-stream", "tufaceous-lib", @@ -4708,7 +4708,7 @@ dependencies = [ "serde_json", "serde_with", "test-strategy", - "thiserror", + "thiserror 1.0.69", "tokio", "update-engine", ] @@ -4764,7 +4764,7 @@ dependencies = [ "sled", "slog", "tempfile", - "thiserror", + "thiserror 1.0.69", "tokio", ] @@ -4800,7 +4800,7 @@ dependencies = [ "proptest", "serde", "test-strategy", - "thiserror", + "thiserror 1.0.69", "uuid", ] @@ -4934,7 +4934,7 @@ dependencies = [ "secrecy", "sha3", "slog", - "thiserror", + "thiserror 1.0.69", "tokio", "zeroize", ] @@ -4942,7 +4942,7 @@ dependencies = [ [[package]] name = "kstat-macro" version = "0.1.0" -source = "git+https://github.com/oxidecomputer/opte?rev=98247c27846133a80fdb8f730f0c57e72d766561#98247c27846133a80fdb8f730f0c57e72d766561" +source = "git+https://github.com/oxidecomputer/opte?rev=b56afeeb14e0042cbd7bda85b166ed86ee17820e#b56afeeb14e0042cbd7bda85b166ed86ee17820e" dependencies = [ "quote", "syn 2.0.87", @@ -4955,7 +4955,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "27964e4632377753acb0898ce6f28770d50cbca1339200ae63d700cff97b5c2b" dependencies = [ "libc", - "thiserror", + "thiserror 1.0.69", ] [[package]] @@ -5028,7 +5028,7 @@ source = "git+https://github.com/oxidecomputer/libefi-illumos?branch=master#54c3 dependencies = [ "libc", "libefi-sys", - "thiserror", + "thiserror 1.0.69", "uuid", ] @@ -5065,7 +5065,7 @@ dependencies = [ "slog-term", "smf", "tabwriter", - "thiserror", + "thiserror 1.0.69", "tokio", "tokio-tungstenite 0.21.0", "toml 0.7.8", @@ -5093,7 +5093,7 @@ source = "git+https://github.com/oxidecomputer/libipcc?rev=fdffa212373a8f92473ea dependencies = [ "cfg-if", "libc", - "thiserror", + "thiserror 1.0.69", ] [[package]] @@ -5129,7 +5129,7 @@ dependencies = [ "rand", "rusty-doors", "socket2", - "thiserror", + "thiserror 1.0.69", "tracing", "winnow 0.6.20", ] @@ -5151,7 +5151,7 @@ dependencies = [ "rand", "rusty-doors", "socket2", - "thiserror", + "thiserror 1.0.69", "tracing", "winnow 0.6.20", ] @@ -5163,7 +5163,7 @@ source = "git+https://github.com/oxidecomputer/libnvme?rev=dd5bb221d327a1bc92879 dependencies = [ "libnvme-sys", "nvme", - "thiserror", + "thiserror 1.0.69", ] [[package]] @@ -5328,7 +5328,7 @@ dependencies = 
[ "serde", "serde-hex", "sha2", - "thiserror", + "thiserror 1.0.69", "x509-cert", "zerocopy 0.6.6", ] @@ -5685,7 +5685,7 @@ dependencies = [ "serde_urlencoded", "slog", "strum", - "thiserror", + "thiserror 1.0.69", "tokio", "uuid", ] @@ -5791,7 +5791,7 @@ dependencies = [ "slog-error-chain", "steno", "strum", - "thiserror", + "thiserror 1.0.69", "tokio", "uuid", ] @@ -5870,7 +5870,7 @@ dependencies = [ "subprocess", "swrite", "term", - "thiserror", + "thiserror 1.0.69", "tokio", "url", "usdt", @@ -5949,7 +5949,7 @@ dependencies = [ "sled-agent-client", "slog", "strum", - "thiserror", + "thiserror 1.0.69", "tokio", "typed-rng", "uuid", @@ -5987,7 +5987,7 @@ dependencies = [ "pq-sys", "slog", "slog-error-chain", - "thiserror", + "thiserror 1.0.69", "tokio", "uuid", ] @@ -6089,7 +6089,7 @@ dependencies = [ "static_assertions", "strum", "test-strategy", - "thiserror", + "thiserror 1.0.69", "typed-rng", "uuid", ] @@ -6127,7 +6127,7 @@ dependencies = [ "slog", "swrite", "sync-ptr", - "thiserror", + "thiserror 1.0.69", "typed-rng", "uuid", ] @@ -6174,7 +6174,7 @@ dependencies = [ "serde_json", "sled-hardware-types", "strum", - "thiserror", + "thiserror 1.0.69", "uuid", ] @@ -6294,7 +6294,7 @@ dependencies = [ "steno", "strum", "test-strategy", - "thiserror", + "thiserror 1.0.69", "update-engine", "uuid", ] @@ -6593,7 +6593,7 @@ dependencies = [ "openssl", "openssl-sys", "rcgen", - "thiserror", + "thiserror 1.0.69", ] [[package]] @@ -6629,7 +6629,7 @@ dependencies = [ "slog-error-chain", "slog-term", "subprocess", - "thiserror", + "thiserror 1.0.69", "tokio", "tokio-postgres", "toml 0.8.19", @@ -6671,7 +6671,7 @@ dependencies = [ "slog-error-chain", "subprocess", "test-strategy", - "thiserror", + "thiserror 1.0.69", "tokio", "tokio-postgres", "toml 0.8.19", @@ -6719,7 +6719,7 @@ dependencies = [ "slog-error-chain", "strum", "test-strategy", - "thiserror", + "thiserror 1.0.69", "tokio", "toml 0.8.19", "uuid", @@ -6738,7 +6738,7 @@ dependencies = [ "serde", "sled-hardware-types", "slog", - "thiserror", + "thiserror 1.0.69", "tokio", "uuid", ] @@ -6819,7 +6819,7 @@ dependencies = [ "slog-error-chain", "sp-sim", "subprocess", - "thiserror", + "thiserror 1.0.69", "tokio", "tokio-stream", "tokio-tungstenite 0.23.1", @@ -7011,7 +7011,7 @@ dependencies = [ "subprocess", "tempfile", "term", - "thiserror", + "thiserror 1.0.69", "tokio", "tokio-postgres", "tokio-util", @@ -7119,7 +7119,7 @@ dependencies = [ "smf", "strum", "tar", - "thiserror", + "thiserror 1.0.69", "tokio", "toml 0.8.19", "walkdir", @@ -7139,7 +7139,7 @@ dependencies = [ "schemars", "serde", "serde_with", - "thiserror", + "thiserror 1.0.69", ] [[package]] @@ -7279,7 +7279,7 @@ dependencies = [ "subprocess", "tar", "tempfile", - "thiserror", + "thiserror 1.0.69", "tofino", "tokio", "tokio-stream", @@ -7326,7 +7326,7 @@ dependencies = [ "subprocess", "tar", "tempfile", - "thiserror", + "thiserror 1.0.69", "tokio", "tokio-postgres", "usdt", @@ -7506,7 +7506,7 @@ dependencies = [ "serde_json", "slog", "tar", - "thiserror", + "thiserror 1.0.69", "tokio", "toml 0.7.8", "topological-sort", @@ -7644,7 +7644,7 @@ dependencies = [ [[package]] name = "opte" version = "0.1.0" -source = "git+https://github.com/oxidecomputer/opte?rev=98247c27846133a80fdb8f730f0c57e72d766561#98247c27846133a80fdb8f730f0c57e72d766561" +source = "git+https://github.com/oxidecomputer/opte?rev=b56afeeb14e0042cbd7bda85b166ed86ee17820e#b56afeeb14e0042cbd7bda85b166ed86ee17820e" dependencies = [ "bitflags 2.6.0", "cfg-if", @@ -7663,7 +7663,7 @@ dependencies = [ [[package]] name = 
"opte-api" version = "0.1.0" -source = "git+https://github.com/oxidecomputer/opte?rev=98247c27846133a80fdb8f730f0c57e72d766561#98247c27846133a80fdb8f730f0c57e72d766561" +source = "git+https://github.com/oxidecomputer/opte?rev=b56afeeb14e0042cbd7bda85b166ed86ee17820e#b56afeeb14e0042cbd7bda85b166ed86ee17820e" dependencies = [ "illumos-sys-hdrs", "ingot", @@ -7676,7 +7676,7 @@ dependencies = [ [[package]] name = "opte-ioctl" version = "0.1.0" -source = "git+https://github.com/oxidecomputer/opte?rev=98247c27846133a80fdb8f730f0c57e72d766561#98247c27846133a80fdb8f730f0c57e72d766561" +source = "git+https://github.com/oxidecomputer/opte?rev=b56afeeb14e0042cbd7bda85b166ed86ee17820e#b56afeeb14e0042cbd7bda85b166ed86ee17820e" dependencies = [ "libc", "libnet 0.1.0 (git+https://github.com/oxidecomputer/netadm-sys)", @@ -7684,7 +7684,7 @@ dependencies = [ "oxide-vpc", "postcard", "serde", - "thiserror", + "thiserror 2.0.3", ] [[package]] @@ -7698,7 +7698,7 @@ dependencies = [ "maplit", "oso-derive", "polar-core", - "thiserror", + "thiserror 1.0.69", "tracing", ] @@ -7736,7 +7736,7 @@ dependencies = [ "reqwest", "serde", "serde_json", - "thiserror", + "thiserror 1.0.69", "tokio", "uuid", ] @@ -7744,7 +7744,7 @@ dependencies = [ [[package]] name = "oxide-vpc" version = "0.1.0" -source = "git+https://github.com/oxidecomputer/opte?rev=98247c27846133a80fdb8f730f0c57e72d766561#98247c27846133a80fdb8f730f0c57e72d766561" +source = "git+https://github.com/oxidecomputer/opte?rev=b56afeeb14e0042cbd7bda85b166ed86ee17820e#b56afeeb14e0042cbd7bda85b166ed86ee17820e" dependencies = [ "cfg-if", "illumos-sys-hdrs", @@ -7843,7 +7843,7 @@ dependencies = [ "slog-term", "strum", "subprocess", - "thiserror", + "thiserror 1.0.69", "tokio", "toml 0.8.19", "uuid", @@ -7906,7 +7906,7 @@ dependencies = [ "strum", "tabled", "tempfile", - "thiserror", + "thiserror 1.0.69", "tokio", "tokio-util", "usdt", @@ -7933,7 +7933,7 @@ dependencies = [ "slog", "slog-async", "slog-term", - "thiserror", + "thiserror 1.0.69", "tokio", "uuid", ] @@ -7969,7 +7969,7 @@ dependencies = [ "slog", "slog-dtrace", "slog-term", - "thiserror", + "thiserror 1.0.69", "tokio", "uuid", ] @@ -8044,7 +8044,7 @@ dependencies = [ "serde", "serde_json", "strum", - "thiserror", + "thiserror 1.0.69", "trybuild", "uuid", ] @@ -8397,7 +8397,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "879952a81a83930934cbf1786752d6dedc3b1f29e8f8fb2ad1d0a36f377cf442" dependencies = [ "memchr", - "thiserror", + "thiserror 1.0.69", "ucd-trie", ] @@ -8933,7 +8933,7 @@ dependencies = [ "serde", "serde_json", "syn 2.0.87", - "thiserror", + "thiserror 1.0.69", "typify", "unicode-ident", ] @@ -8971,7 +8971,7 @@ dependencies = [ "serde", "serde_json", "slog", - "thiserror", + "thiserror 1.0.69", "tokio", "tokio-tungstenite 0.21.0", "uuid", @@ -8992,7 +8992,7 @@ dependencies = [ "serde", "serde_json", "slog", - "thiserror", + "thiserror 1.0.69", "tokio", "tokio-tungstenite 0.21.0", "uuid", @@ -9022,7 +9022,7 @@ dependencies = [ "slog-bunyan", "slog-dtrace", "slog-term", - "thiserror", + "thiserror 1.0.69", "tokio", "tokio-tungstenite 0.21.0", "uuid", @@ -9036,7 +9036,7 @@ dependencies = [ "cpuid_profile_config", "serde", "serde_derive", - "thiserror", + "thiserror 1.0.69", "toml 0.7.8", ] @@ -9049,7 +9049,7 @@ dependencies = [ "propolis_types", "schemars", "serde", - "thiserror", + "thiserror 1.0.69", "uuid", ] @@ -9115,7 +9115,7 @@ dependencies = [ "schemars", "serde", "serde_json", - "thiserror", + "thiserror 1.0.69", "tokio", "tokio-stream", 
"tokio-tungstenite 0.24.0", @@ -9152,7 +9152,7 @@ dependencies = [ "rustc-hash 2.0.0", "rustls 0.23.14", "socket2", - "thiserror", + "thiserror 1.0.69", "tokio", "tracing", ] @@ -9169,7 +9169,7 @@ dependencies = [ "rustc-hash 2.0.0", "rustls 0.23.14", "slab", - "thiserror", + "thiserror 1.0.69", "tinyvec", "tracing", ] @@ -9295,7 +9295,7 @@ dependencies = [ "hyper", "omicron-workspace-hack", "proptest", - "thiserror", + "thiserror 1.0.69", "tokio", "tokio-util", ] @@ -9423,7 +9423,7 @@ checksum = "ba009ff324d1fc1b900bd1fdb31564febe58a8ccc8a6fdbb93b543d33b13ca43" dependencies = [ "getrandom", "libredox", - "thiserror", + "thiserror 1.0.69", ] [[package]] @@ -9441,7 +9441,7 @@ dependencies = [ "strip-ansi-escapes", "strum", "strum_macros 0.26.4", - "thiserror", + "thiserror 1.0.69", "unicode-segmentation", "unicode-width 0.1.14", ] @@ -9802,7 +9802,7 @@ dependencies = [ "ssh-encoding", "ssh-key", "subtle", - "thiserror", + "thiserror 1.0.69", "tokio", ] @@ -9860,7 +9860,7 @@ dependencies = [ "spki", "ssh-encoding", "ssh-key", - "thiserror", + "thiserror 1.0.69", "tokio", "tokio-stream", "typenum", @@ -9922,7 +9922,7 @@ checksum = "f1adc9dfed5cc999077978cc7163b9282c5751c8d39827c4ea8c8c220ca5a440" dependencies = [ "serde", "tempfile", - "thiserror", + "thiserror 1.0.69", "toml 0.8.19", "toolchain_find", ] @@ -10146,7 +10146,7 @@ dependencies = [ "quick-xml", "rand", "serde", - "thiserror", + "thiserror 1.0.69", "url", "uuid", ] @@ -10729,7 +10729,7 @@ dependencies = [ "sled-hardware-types", "slog", "strum", - "thiserror", + "thiserror 1.0.69", "toml 0.8.19", "uuid", ] @@ -10759,7 +10759,7 @@ dependencies = [ "sled-hardware-types", "slog", "slog-error-chain", - "thiserror", + "thiserror 1.0.69", "tofino", "tokio", "uuid", @@ -10803,7 +10803,7 @@ dependencies = [ "serde_json", "sled-hardware", "slog", - "thiserror", + "thiserror 1.0.69", "tokio", "uuid", ] @@ -10960,7 +10960,7 @@ version = "0.2.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4a491bfc47dffa70a3c267bc379e9de9f4b0a7195e474a94498189b177f8d18c" dependencies = [ - "thiserror", + "thiserror 1.0.69", ] [[package]] @@ -11040,7 +11040,7 @@ dependencies = [ "serde", "slog", "slog-dtrace", - "thiserror", + "thiserror 1.0.69", "tokio", "toml 0.8.19", ] @@ -11092,7 +11092,7 @@ dependencies = [ "slog", "slog-async", "slog-term", - "thiserror", + "thiserror 1.0.69", "tokio", "tokio-rustls 0.26.0", "toml 0.8.19", @@ -11211,7 +11211,7 @@ dependencies = [ "serde", "serde_json", "slog", - "thiserror", + "thiserror 1.0.69", "tokio", "uuid", ] @@ -11623,7 +11623,16 @@ version = "1.0.69" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b6aaf5339b578ea85b50e080feb250a3e8ae8cfcdff9a461c9ec2904bc923f52" dependencies = [ - "thiserror-impl", + "thiserror-impl 1.0.69", +] + +[[package]] +name = "thiserror" +version = "2.0.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c006c85c7651b3cf2ada4584faa36773bd07bac24acfb39f3c431b36d7e667aa" +dependencies = [ + "thiserror-impl 2.0.3", ] [[package]] @@ -11637,6 +11646,17 @@ dependencies = [ "syn 2.0.87", ] +[[package]] +name = "thiserror-impl" +version = "2.0.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f077553d607adc1caf65430528a576c757a71ed73944b66ebb58ef2bbd243568" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.87", +] + [[package]] name = "thiserror-impl-no-std" version = "2.0.2" @@ -12152,7 +12172,7 @@ dependencies = [ "lazy_static", "rand", "smallvec 1.13.2", - 
"thiserror", + "thiserror 1.0.69", "tinyvec", "tracing", "url", @@ -12267,7 +12287,7 @@ dependencies = [ "log", "rand", "sha1", - "thiserror", + "thiserror 1.0.69", "url", "utf-8", ] @@ -12286,7 +12306,7 @@ dependencies = [ "log", "rand", "sha1", - "thiserror", + "thiserror 1.0.69", "utf-8", ] @@ -12304,7 +12324,7 @@ dependencies = [ "log", "rand", "sha1", - "thiserror", + "thiserror 1.0.69", "utf-8", ] @@ -12369,7 +12389,7 @@ dependencies = [ "serde", "serde_json", "syn 2.0.87", - "thiserror", + "thiserror 1.0.69", "unicode-ident", ] @@ -12535,7 +12555,7 @@ dependencies = [ "rand", "sha2", "slog", - "thiserror", + "thiserror 1.0.69", "tokio", "tokio-util", "tough", @@ -12638,7 +12658,7 @@ dependencies = [ "serde", "serde_json", "syn 2.0.87", - "thiserror", + "thiserror 1.0.69", "thread-id", "version_check", ] @@ -13024,7 +13044,7 @@ dependencies = [ "sha2", "sled-hardware-types", "slog", - "thiserror", + "thiserror 1.0.69", "tokio", "toml 0.8.19", "update-engine", @@ -13114,7 +13134,7 @@ dependencies = [ "slog-dtrace", "subprocess", "tar", - "thiserror", + "thiserror 1.0.69", "tokio", "tokio-stream", "tokio-util", @@ -13718,7 +13738,7 @@ dependencies = [ "flate2", "indexmap 2.6.0", "memchr", - "thiserror", + "thiserror 1.0.69", "zopfli", ] @@ -13729,7 +13749,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3596bbc963cd9dbaa69b02e349af4d061c56c41d211ba64150a2cedb2f722707" dependencies = [ "itertools 0.10.5", - "thiserror", + "thiserror 1.0.69", "zone_cfg_derive 0.1.2", ] @@ -13740,7 +13760,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a62a428a79ea2224ce8ab05d6d8a21bdd7b4b68a8dbc1230511677a56e72ef22" dependencies = [ "itertools 0.10.5", - "thiserror", + "thiserror 1.0.69", "tokio", "zone_cfg_derive 0.3.0", ] diff --git a/Cargo.toml b/Cargo.toml index c0f4be8bae..572ba4a0d2 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -492,7 +492,7 @@ omicron-test-utils = { path = "test-utils" } omicron-workspace-hack = "0.1.0" omicron-zone-package = "0.11.1" oxide-client = { path = "clients/oxide-client" } -oxide-vpc = { git = "https://github.com/oxidecomputer/opte", rev = "98247c27846133a80fdb8f730f0c57e72d766561", features = [ "api", "std" ] } +oxide-vpc = { git = "https://github.com/oxidecomputer/opte", rev = "b56afeeb14e0042cbd7bda85b166ed86ee17820e", features = [ "api", "std" ] } oxlog = { path = "dev-tools/oxlog" } oxnet = { git = "https://github.com/oxidecomputer/oxnet" } once_cell = "1.20.2" @@ -502,7 +502,7 @@ openapiv3 = "2.0.0" # must match samael's crate! 
openssl = "0.10" openssl-sys = "0.9" -opte-ioctl = { git = "https://github.com/oxidecomputer/opte", rev = "98247c27846133a80fdb8f730f0c57e72d766561" } +opte-ioctl = { git = "https://github.com/oxidecomputer/opte", rev = "b56afeeb14e0042cbd7bda85b166ed86ee17820e" } oso = "0.27" owo-colors = "4.1.0" oximeter = { path = "oximeter/oximeter" } diff --git a/tools/opte_version b/tools/opte_version index adc2ee64e8..a80da921ae 100644 --- a/tools/opte_version +++ b/tools/opte_version @@ -1 +1 @@ -0.34.301 +0.34.311 diff --git a/tools/opte_version_override b/tools/opte_version_override index 0a98b51ca1..6ababee9f1 100644 --- a/tools/opte_version_override +++ b/tools/opte_version_override @@ -2,4 +2,4 @@ # only set this if you want to override the version of opte/xde installed by the # install_opte.sh script -OPTE_COMMIT="98247c27846133a80fdb8f730f0c57e72d766561" +OPTE_COMMIT="b56afeeb14e0042cbd7bda85b166ed86ee17820e" From 9285a7c7ad8d034bbe7b4afd39a6e4d4602f004a Mon Sep 17 00:00:00 2001 From: Benjamin Naecker Date: Tue, 3 Dec 2024 11:05:20 -0800 Subject: [PATCH 02/22] Retry OxQL queries in integration tests (#7156) Adds a small `wait_for_condition()` loop to OxQL queries in integration tests. This tries to catch the error where the query fails specifically because the timeseries it names is not yet available, and return `None` in these cases. We expect it to appear soon, so this isn't immediately fatal. Fixes #7154 and #7084. --- nexus/tests/integration_tests/metrics.rs | 79 +++++++++++++++++++----- 1 file changed, 64 insertions(+), 15 deletions(-) diff --git a/nexus/tests/integration_tests/metrics.rs b/nexus/tests/integration_tests/metrics.rs index a468fa23d5..33cf7e2073 100644 --- a/nexus/tests/integration_tests/metrics.rs +++ b/nexus/tests/integration_tests/metrics.rs @@ -297,10 +297,44 @@ async fn test_timeseries_schema_list( .expect("Failed to find HTTP request latency histogram schema"); } -pub async fn timeseries_query( +/// Run an OxQL query until it succeeds or panics. +pub async fn timeseries_query_until_success( cptestctx: &ControlPlaneTestContext, query: impl ToString, ) -> Vec { + const POLL_INTERVAL: Duration = Duration::from_secs(1); + const POLL_MAX: Duration = Duration::from_secs(30); + let query_ = query.to_string(); + wait_for_condition( + || async { + match timeseries_query(cptestctx, &query_).await { + Some(r) => Ok(r), + None => Err(CondCheckError::<()>::NotYet), + } + }, + &POLL_INTERVAL, + &POLL_MAX, + ) + .await + .unwrap_or_else(|_| { + panic!( + "Timeseries named in query are not available \ + after {:?}, query: '{}'", + POLL_MAX, + query.to_string(), + ) + }) +} + +/// Run an OxQL query. +/// +/// This returns `None` if the query resulted in client error and the body +/// indicates that a timeseries named in the query could not be found. In all +/// other cases, it either succeeds or panics. +pub async fn timeseries_query( + cptestctx: &ControlPlaneTestContext, + query: impl ToString, +) -> Option> { // first, make sure the latest timeseries have been collected. cptestctx .oximeter @@ -327,14 +361,29 @@ pub async fn timeseries_query( .unwrap_or_else(|e| { panic!("timeseries query failed: {e:?}\nquery: {query}") }); - rsp.parsed_body::() - .unwrap_or_else(|e| { - panic!( - "could not parse timeseries query response: {e:?}\n\ - query: {query}\nresponse: {rsp:#?}" - ); - }) - .tables + + // Check for a timeseries-not-found error specifically. 
+ if rsp.status.is_client_error() { + let text = std::str::from_utf8(&rsp.body) + .expect("Timeseries query response body should be UTF-8"); + if text.contains("Schema for timeseries") && text.contains("not found") + { + return None; + } + } + + // Try to parse the query as usual, which will fail on other kinds of + // errors. + Some( + rsp.parsed_body::() + .unwrap_or_else(|e| { + panic!( + "could not parse timeseries query response: {e:?}\n\ + query: {query}\nresponse: {rsp:#?}" + ); + }) + .tables, + ) } #[nexus_test] @@ -441,7 +490,7 @@ async fn test_instance_watcher_metrics( // activate the instance watcher background task. activate_instance_watcher().await; - let metrics = timeseries_query(&cptestctx, OXQL_QUERY).await; + let metrics = timeseries_query_until_success(&cptestctx, OXQL_QUERY).await; let checks = metrics .iter() .find(|t| t.name() == "virtual_machine:check") @@ -457,7 +506,7 @@ async fn test_instance_watcher_metrics( // activate the instance watcher background task. activate_instance_watcher().await; - let metrics = timeseries_query(&cptestctx, OXQL_QUERY).await; + let metrics = timeseries_query_until_success(&cptestctx, OXQL_QUERY).await; let checks = metrics .iter() .find(|t| t.name() == "virtual_machine:check") @@ -474,7 +523,7 @@ async fn test_instance_watcher_metrics( // activate the instance watcher background task. activate_instance_watcher().await; - let metrics = timeseries_query(&cptestctx, OXQL_QUERY).await; + let metrics = timeseries_query_until_success(&cptestctx, OXQL_QUERY).await; let checks = metrics .iter() .find(|t| t.name() == "virtual_machine:check") @@ -499,7 +548,7 @@ async fn test_instance_watcher_metrics( // activate the instance watcher background task. activate_instance_watcher().await; - let metrics = timeseries_query(&cptestctx, OXQL_QUERY).await; + let metrics = timeseries_query_until_success(&cptestctx, OXQL_QUERY).await; let checks = metrics .iter() .find(|t| t.name() == "virtual_machine:check") @@ -528,7 +577,7 @@ async fn test_instance_watcher_metrics( // activate the instance watcher background task. activate_instance_watcher().await; - let metrics = timeseries_query(&cptestctx, OXQL_QUERY).await; + let metrics = timeseries_query_until_success(&cptestctx, OXQL_QUERY).await; let checks = metrics .iter() .find(|t| t.name() == "virtual_machine:check") @@ -714,7 +763,7 @@ async fn test_mgs_metrics( .try_force_collect() .await .expect("Could not force oximeter collection"); - let table = timeseries_query(&cptestctx, &query) + let table = timeseries_query_until_success(&cptestctx, &query) .await .into_iter() .find(|t| t.name() == name) From 289146b142b32ccc1872dc278089752e89022a70 Mon Sep 17 00:00:00 2001 From: David Crespo Date: Tue, 3 Dec 2024 17:54:42 -0600 Subject: [PATCH 03/22] [nexus] Project-scoped OxQL endpoint (#6873) #7047 moves the existing `/v1/timeseries/query` endpoint (which requires the fleet viewer role) to `/v1/system/timeseries/query`. This PR is on top of #7047 and adds `/v1/timeseries/query?project=my-project`, using a sneaky trick to let us ensure that the user has read permissions on a project before we let them see metrics for that project. See https://github.com/oxidecomputer/omicron/issues/5298#issuecomment-2284591633 for a discussion of the OxQL authz problem more broadly. 
- [x] Add `/v1/timeseries/query` with required project query param
- [x] Integration tests showing the authz works in expected scenarios
- ~~Add a list schemas endpoint that only lists schemas with a `project_id` field~~
- ~~Move schema filtering logic inside oximeter client~~
- ~~Fully test schema list endpoint~~

## The trick

1. Require the user to say (in a query param) what project they're interested in
2. Look up that project and make sure they can read it
3. Jam `| filter silo_id == "<silo_id>" && project_id == "<project_id>"` on the end of whatever query they passed in

It sounds silly, but I talked it over with @bnaecker and we couldn't find any holes. If the user tries to fetch metrics from another project inside their query, the query will end up with the filter `project_id == "def" && project_id == "xyz"` and the result set will always be empty. If they try to query a metric that doesn't have a `project_id` on it, it will error out. It works!

## API design

Initially I had the endpoint as `/v1/timeseries/query/project/{project}`. This is really horrible. It should be `/projects/`, but that doesn't feel any better. I also considered `/v1/projects/{project}/timeseries/query`, which has some precedent:

https://github.com/oxidecomputer/omicron/blob/45f5f1cc2c7eec2a1de0a5143e85e0794134f175/nexus/external-api/output/nexus_tags.txt#L59-L64

But it also feels awful. I like the query param approach. Right now, there are only fleet-scoped metrics and project-scoped metrics, so requiring the project ID makes sense, but the query params are nicely flexible in that we could make project optional or add other optional fields and just do different auth checks based on what's passed in. Neither path nor operation ID mention projects.
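To make the trick concrete, here is a minimal, standalone sketch of the query rewrite in step 3. The helper name, `main` wrapper, and UUIDs are made up for illustration; the real implementation is `timeseries_query_project` in `nexus/src/app/metrics.rs` (see the diff below), which takes the silo and project IDs from the authz lookup.

```rust
/// Sketch of the rewrite: suffix the caller's OxQL query with a filter
/// pinning it to the silo and project the user is allowed to read.
fn scope_to_project(query: &str, silo_id: &str, project_id: &str) -> String {
    format!(
        "{query} | filter silo_id == \"{silo_id}\" && project_id == \"{project_id}\""
    )
}

fn main() {
    // Hypothetical IDs, for illustration only.
    let scoped = scope_to_project(
        "get virtual_machine:check",
        "10b1e2f0-0000-4000-8000-000000000001",
        "10b1e2f0-0000-4000-8000-000000000002",
    );
    // Prints the original query with the scoping filter appended.
    println!("{scoped}");
}
```

If the caller's query already filters on a different `project_id`, the appended filter conflicts with it and the result set is empty; if the queried timeseries has no `project_id` field at all, the query errors out, which is what the integration tests below verify.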
Examples of fleet- and project-scoped metrics ## No list endpoint yet I backed out the schema list endpoint. We can't list project-scoped schemas because `authz_scope` is not in the database! See https://github.com/oxidecomputer/omicron/issues/5942. Currently the schema list endpoint hard-codes `authz_scope: Fleet`. https://github.com/oxidecomputer/omicron/blob/69de8b6288fde36fbcd6cabb4d632d62851230ad/oximeter/db/src/model/from_block.rs#L142 I am using the tag `hidden` on the query endpoint so that it goes in the OpenAPI definition (so I can use it in the console) but it will not show up on the docs site. --- nexus/external-api/output/nexus_tags.txt | 1 + nexus/external-api/src/lib.rs | 20 +++ nexus/src/app/metrics.rs | 70 +++++--- nexus/src/external_api/http_entrypoints.rs | 27 +++ nexus/tests/integration_tests/endpoints.rs | 23 ++- nexus/tests/integration_tests/metrics.rs | 187 +++++++++++++++++++-- openapi/nexus.json | 49 ++++++ 7 files changed, 337 insertions(+), 40 deletions(-) diff --git a/nexus/external-api/output/nexus_tags.txt b/nexus/external-api/output/nexus_tags.txt index 8102ebce08..a979a9804b 100644 --- a/nexus/external-api/output/nexus_tags.txt +++ b/nexus/external-api/output/nexus_tags.txt @@ -30,6 +30,7 @@ probe_create POST /experimental/v1/probes probe_delete DELETE /experimental/v1/probes/{probe} probe_list GET /experimental/v1/probes probe_view GET /experimental/v1/probes/{probe} +timeseries_query POST /v1/timeseries/query API operations found with tag "images" OPERATION ID METHOD URL PATH diff --git a/nexus/external-api/src/lib.rs b/nexus/external-api/src/lib.rs index 1c5c7c1d2d..e2b53a7e6f 100644 --- a/nexus/external-api/src/lib.rs +++ b/nexus/external-api/src/lib.rs @@ -2567,6 +2567,26 @@ pub trait NexusExternalApi { body: TypedBody, ) -> Result, HttpError>; + // TODO: list endpoint for project-scoped schemas is blocked on + // https://github.com/oxidecomputer/omicron/issues/5942: the authz scope for + // each schema is not stored in Clickhouse yet. + + /// Run project-scoped timeseries query + /// + /// Queries are written in OxQL. Project must be specified by name or ID in + /// URL query parameter. The OxQL query will only return timeseries data + /// from the specified project. + #[endpoint { + method = POST, + path = "/v1/timeseries/query", + tags = ["hidden"], + }] + async fn timeseries_query( + rqctx: RequestContext, + query_params: Query, + body: TypedBody, + ) -> Result, HttpError>; + // Updates /// Upload TUF repository diff --git a/nexus/src/app/metrics.rs b/nexus/src/app/metrics.rs index 40f7882281..5b77e681b1 100644 --- a/nexus/src/app/metrics.rs +++ b/nexus/src/app/metrics.rs @@ -140,28 +140,52 @@ impl super::Nexus { self.timeseries_client .oxql_query(query) .await - .map(|result| { - // TODO-observability: The query method returns information - // about the duration of the OxQL query and the database - // resource usage for each contained SQL query. We should - // publish this as a timeseries itself, so that we can track - // improvements to query processing. - // - // For now, simply return the tables alone. 
- result.tables - }) - .map_err(|e| match e { - oximeter_db::Error::DatabaseUnavailable(_) - | oximeter_db::Error::Connection(_) => { - Error::ServiceUnavailable { - internal_message: e.to_string(), - } - } - oximeter_db::Error::Oxql(_) - | oximeter_db::Error::TimeseriesNotFound(_) => { - Error::invalid_request(e.to_string()) - } - _ => Error::InternalError { internal_message: e.to_string() }, - }) + // TODO-observability: The query method returns information + // about the duration of the OxQL query and the database + // resource usage for each contained SQL query. We should + // publish this as a timeseries itself, so that we can track + // improvements to query processing. + // + // For now, simply return the tables alone. + .map(|result| result.tables) + .map_err(map_timeseries_err) + } + + /// Run an OxQL query against the timeseries database, scoped to a specific project. + pub(crate) async fn timeseries_query_project( + &self, + _opctx: &OpContext, + project_lookup: &lookup::Project<'_>, + query: impl AsRef, + ) -> Result, Error> { + // Ensure the user has read access to the project + let (authz_silo, authz_project) = + project_lookup.lookup_for(authz::Action::Read).await?; + + // Ensure the query only refers to the project + let filtered_query = format!( + "{} | filter silo_id == \"{}\" && project_id == \"{}\"", + query.as_ref(), + authz_silo.id(), + authz_project.id() + ); + + self.timeseries_client + .oxql_query(filtered_query) + .await + .map(|result| result.tables) + .map_err(map_timeseries_err) + } +} + +fn map_timeseries_err(e: oximeter_db::Error) -> Error { + match e { + oximeter_db::Error::DatabaseUnavailable(_) + | oximeter_db::Error::Connection(_) => Error::unavail(&e.to_string()), + oximeter_db::Error::Oxql(_) + | oximeter_db::Error::TimeseriesNotFound(_) => { + Error::invalid_request(e.to_string()) + } + _ => Error::internal_error(&e.to_string()), } } diff --git a/nexus/src/external_api/http_entrypoints.rs b/nexus/src/external_api/http_entrypoints.rs index a285542442..740895b7e4 100644 --- a/nexus/src/external_api/http_entrypoints.rs +++ b/nexus/src/external_api/http_entrypoints.rs @@ -5544,6 +5544,33 @@ impl NexusExternalApi for NexusExternalApiImpl { .await } + async fn timeseries_query( + rqctx: RequestContext, + query_params: Query, + body: TypedBody, + ) -> Result, HttpError> { + let apictx = rqctx.context(); + let handler = async { + let nexus = &apictx.context.nexus; + let opctx = + crate::context::op_context_for_external_api(&rqctx).await?; + let project_selector = query_params.into_inner(); + let query = body.into_inner().query; + let project_lookup = + nexus.project_lookup(&opctx, project_selector)?; + nexus + .timeseries_query_project(&opctx, &project_lookup, &query) + .await + .map(|tables| HttpResponseOk(views::OxqlQueryResult { tables })) + .map_err(HttpError::from) + }; + apictx + .context + .external_latencies + .instrument_dropshot_handler(&rqctx, handler) + .await + } + // Updates async fn system_update_put_repository( diff --git a/nexus/tests/integration_tests/endpoints.rs b/nexus/tests/integration_tests/endpoints.rs index 2e7b68eaca..466cae17a8 100644 --- a/nexus/tests/integration_tests/endpoints.rs +++ b/nexus/tests/integration_tests/endpoints.rs @@ -948,10 +948,14 @@ pub static DEMO_SILO_METRICS_URL: Lazy = Lazy::new(|| { ) }); -pub static TIMESERIES_LIST_URL: Lazy = +pub static TIMESERIES_QUERY_URL: Lazy = Lazy::new(|| { + format!("/v1/timeseries/query?project={}", *DEMO_PROJECT_NAME) +}); + +pub static SYSTEM_TIMESERIES_LIST_URL: Lazy = 
Lazy::new(|| String::from("/v1/system/timeseries/schemas")); -pub static TIMESERIES_QUERY_URL: Lazy = +pub static SYSTEM_TIMESERIES_QUERY_URL: Lazy = Lazy::new(|| String::from("/v1/system/timeseries/query")); pub static DEMO_TIMESERIES_QUERY: Lazy = @@ -2208,7 +2212,18 @@ pub static VERIFY_ENDPOINTS: Lazy> = Lazy::new(|| { }, VerifyEndpoint { - url: &TIMESERIES_LIST_URL, + url: &TIMESERIES_QUERY_URL, + visibility: Visibility::Protected, + unprivileged_access: UnprivilegedAccess::None, + allowed_methods: vec![ + AllowedMethod::Post( + serde_json::to_value(&*DEMO_TIMESERIES_QUERY).unwrap() + ), + ], + }, + + VerifyEndpoint { + url: &SYSTEM_TIMESERIES_LIST_URL, visibility: Visibility::Public, unprivileged_access: UnprivilegedAccess::None, allowed_methods: vec![ @@ -2217,7 +2232,7 @@ pub static VERIFY_ENDPOINTS: Lazy> = Lazy::new(|| { }, VerifyEndpoint { - url: &TIMESERIES_QUERY_URL, + url: &SYSTEM_TIMESERIES_QUERY_URL, visibility: Visibility::Public, unprivileged_access: UnprivilegedAccess::None, allowed_methods: vec![ diff --git a/nexus/tests/integration_tests/metrics.rs b/nexus/tests/integration_tests/metrics.rs index 33cf7e2073..7e5441c16a 100644 --- a/nexus/tests/integration_tests/metrics.rs +++ b/nexus/tests/integration_tests/metrics.rs @@ -9,16 +9,20 @@ use crate::integration_tests::instances::{ }; use chrono::Utc; use dropshot::test_util::ClientTestContext; -use dropshot::ResultsPage; +use dropshot::{HttpErrorResponseBody, ResultsPage}; use http::{Method, StatusCode}; +use nexus_auth::authn::USER_TEST_UNPRIVILEGED; +use nexus_db_queries::db::identity::Asset; +use nexus_test_utils::background::activate_background_task; use nexus_test_utils::http_testing::{AuthnMode, NexusRequest, RequestBuilder}; use nexus_test_utils::resource_helpers::{ create_default_ip_pool, create_disk, create_instance, create_project, - objects_list_page_authz, DiskTest, + grant_iam, object_create_error, objects_list_page_authz, DiskTest, }; use nexus_test_utils::wait_for_producer; use nexus_test_utils::ControlPlaneTestContext; use nexus_test_utils_macros::nexus_test; +use nexus_types::external_api::shared::ProjectRole; use nexus_types::external_api::views::OxqlQueryResult; use nexus_types::silo::DEFAULT_SILO_ID; use omicron_test_utils::dev::poll::{wait_for_condition, CondCheckError}; @@ -266,7 +270,7 @@ async fn test_metrics( /// Test that we can correctly list some timeseries schema. #[nexus_test] -async fn test_timeseries_schema_list( +async fn test_system_timeseries_schema_list( cptestctx: &ControlPlaneTestContext, ) { // Nexus registers itself as a metric producer on startup, with its own UUID @@ -298,16 +302,44 @@ async fn test_timeseries_schema_list( } /// Run an OxQL query until it succeeds or panics. -pub async fn timeseries_query_until_success( +pub async fn system_timeseries_query( cptestctx: &ControlPlaneTestContext, query: impl ToString, +) -> Vec { + timeseries_query_until_success( + cptestctx, + "/v1/system/timeseries/query", + query, + ) + .await +} + +/// Run a project-scoped OxQL query until it succeeds or panics. +pub async fn project_timeseries_query( + cptestctx: &ControlPlaneTestContext, + project: &str, + query: impl ToString, +) -> Vec { + timeseries_query_until_success( + cptestctx, + &format!("/v1/timeseries/query?project={}", project), + query, + ) + .await +} + +/// Run an OxQL query until it succeeds or panics. 
+async fn timeseries_query_until_success( + cptestctx: &ControlPlaneTestContext, + endpoint: &str, + query: impl ToString, ) -> Vec { const POLL_INTERVAL: Duration = Duration::from_secs(1); const POLL_MAX: Duration = Duration::from_secs(30); let query_ = query.to_string(); wait_for_condition( || async { - match timeseries_query(cptestctx, &query_).await { + match execute_timeseries_query(cptestctx, endpoint, &query_).await { Some(r) => Ok(r), None => Err(CondCheckError::<()>::NotYet), } @@ -331,8 +363,9 @@ pub async fn timeseries_query_until_success( /// This returns `None` if the query resulted in client error and the body /// indicates that a timeseries named in the query could not be found. In all /// other cases, it either succeeds or panics. -pub async fn timeseries_query( +pub async fn execute_timeseries_query( cptestctx: &ControlPlaneTestContext, + endpoint: &str, query: impl ToString, ) -> Option> { // first, make sure the latest timeseries have been collected. @@ -351,7 +384,7 @@ pub async fn timeseries_query( nexus_test_utils::http_testing::RequestBuilder::new( &cptestctx.external_client, http::Method::POST, - "/v1/system/timeseries/query", + endpoint, ) .body(Some(&body)), ) @@ -490,7 +523,7 @@ async fn test_instance_watcher_metrics( // activate the instance watcher background task. activate_instance_watcher().await; - let metrics = timeseries_query_until_success(&cptestctx, OXQL_QUERY).await; + let metrics = system_timeseries_query(&cptestctx, OXQL_QUERY).await; let checks = metrics .iter() .find(|t| t.name() == "virtual_machine:check") @@ -506,7 +539,7 @@ async fn test_instance_watcher_metrics( // activate the instance watcher background task. activate_instance_watcher().await; - let metrics = timeseries_query_until_success(&cptestctx, OXQL_QUERY).await; + let metrics = system_timeseries_query(&cptestctx, OXQL_QUERY).await; let checks = metrics .iter() .find(|t| t.name() == "virtual_machine:check") @@ -523,7 +556,7 @@ async fn test_instance_watcher_metrics( // activate the instance watcher background task. activate_instance_watcher().await; - let metrics = timeseries_query_until_success(&cptestctx, OXQL_QUERY).await; + let metrics = system_timeseries_query(&cptestctx, OXQL_QUERY).await; let checks = metrics .iter() .find(|t| t.name() == "virtual_machine:check") @@ -548,7 +581,7 @@ async fn test_instance_watcher_metrics( // activate the instance watcher background task. activate_instance_watcher().await; - let metrics = timeseries_query_until_success(&cptestctx, OXQL_QUERY).await; + let metrics = system_timeseries_query(&cptestctx, OXQL_QUERY).await; let checks = metrics .iter() .find(|t| t.name() == "virtual_machine:check") @@ -577,7 +610,7 @@ async fn test_instance_watcher_metrics( // activate the instance watcher background task. 
activate_instance_watcher().await; - let metrics = timeseries_query_until_success(&cptestctx, OXQL_QUERY).await; + let metrics = system_timeseries_query(&cptestctx, OXQL_QUERY).await; let checks = metrics .iter() .find(|t| t.name() == "virtual_machine:check") @@ -597,6 +630,134 @@ async fn test_instance_watcher_metrics( assert_gte!(ts2_running, 2); } +#[nexus_test] +async fn test_project_timeseries_query( + cptestctx: &ControlPlaneTestContext, +) { + let client = &cptestctx.external_client; + + create_default_ip_pool(&client).await; // needed for instance create to work + + // Create two projects + let p1 = create_project(&client, "project1").await; + let _p2 = create_project(&client, "project2").await; + + // Create resources in each project + let i1 = create_instance(&client, "project1", "instance1").await; + let _i2 = create_instance(&client, "project2", "instance2").await; + + let internal_client = &cptestctx.internal_client; + + // get the instance metrics to show up + let _ = + activate_background_task(&internal_client, "instance_watcher").await; + + // Query with no project specified + let q1 = "get virtual_machine:check"; + + let result = project_timeseries_query(&cptestctx, "project1", q1).await; + assert_eq!(result.len(), 1); + assert!(result[0].timeseries().len() > 0); + + // also works with project ID + let result = + project_timeseries_query(&cptestctx, &p1.identity.id.to_string(), q1) + .await; + assert_eq!(result.len(), 1); + assert!(result[0].timeseries().len() > 0); + + let result = project_timeseries_query(&cptestctx, "project2", q1).await; + assert_eq!(result.len(), 1); + assert!(result[0].timeseries().len() > 0); + + // with project specified + let q2 = &format!("{} | filter project_id == \"{}\"", q1, p1.identity.id); + + let result = project_timeseries_query(&cptestctx, "project1", q2).await; + assert_eq!(result.len(), 1); + assert!(result[0].timeseries().len() > 0); + + let result = project_timeseries_query(&cptestctx, "project2", q2).await; + assert_eq!(result.len(), 1); + assert_eq!(result[0].timeseries().len(), 0); + + // with instance specified + let q3 = &format!("{} | filter instance_id == \"{}\"", q1, i1.identity.id); + + // project containing instance gives me something + let result = project_timeseries_query(&cptestctx, "project1", q3).await; + assert_eq!(result.len(), 1); + assert_eq!(result[0].timeseries().len(), 1); + + // should be empty or error + let result = project_timeseries_query(&cptestctx, "project2", q3).await; + assert_eq!(result.len(), 1); + assert_eq!(result[0].timeseries().len(), 0); + + // expect error when querying a metric that has no project_id on it + let q4 = "get integration_target:integration_metric"; + let url = "/v1/timeseries/query?project=project1"; + let body = nexus_types::external_api::params::TimeseriesQuery { + query: q4.to_string(), + }; + let result = + object_create_error(client, url, &body, StatusCode::BAD_REQUEST).await; + assert_eq!(result.error_code.unwrap(), "InvalidRequest"); + // Notable that the error confirms that the metric exists and says what the + // fields are. This is helpful generally, but here it would be better if + // we could say something more like "you can't query this timeseries from + // this endpoint" + assert_eq!(result.message, "The filter expression contains identifiers that are not valid for its input timeseries. 
Invalid identifiers: [\"project_id\", \"silo_id\"], timeseries fields: {\"datum\", \"metric_name\", \"target_name\", \"timestamp\"}"); + + // nonexistent project + let url = "/v1/timeseries/query?project=nonexistent"; + let body = nexus_types::external_api::params::TimeseriesQuery { + query: q4.to_string(), + }; + let result = + object_create_error(client, url, &body, StatusCode::NOT_FOUND).await; + assert_eq!(result.message, "not found: project with name \"nonexistent\""); + + // unprivileged user gets 404 on project that exists, but which they can't read + let url = "/v1/timeseries/query?project=project1"; + let body = nexus_types::external_api::params::TimeseriesQuery { + query: q1.to_string(), + }; + + let request = RequestBuilder::new(client, Method::POST, url) + .body(Some(&body)) + .expect_status(Some(StatusCode::NOT_FOUND)); + let result = NexusRequest::new(request) + .authn_as(AuthnMode::UnprivilegedUser) + .execute() + .await + .unwrap() + .parsed_body::() + .unwrap(); + assert_eq!(result.message, "not found: project with name \"project1\""); + + // now grant the user access to that project only + grant_iam( + client, + "/v1/projects/project1", + ProjectRole::Viewer, + USER_TEST_UNPRIVILEGED.id(), + AuthnMode::PrivilegedUser, + ) + .await; + + // now they can access the timeseries. how cool is that + let request = RequestBuilder::new(client, Method::POST, url) + .body(Some(&body)) + .expect_status(Some(StatusCode::OK)); + let result = NexusRequest::new(request) + .authn_as(AuthnMode::UnprivilegedUser) + .execute_and_parse_unwrap::() + .await; + assert_eq!(result.tables.len(), 1); + assert_eq!(result.tables[0].timeseries().len(), 1); +} + #[nexus_test] async fn test_mgs_metrics( cptestctx: &ControlPlaneTestContext, @@ -763,7 +924,7 @@ async fn test_mgs_metrics( .try_force_collect() .await .expect("Could not force oximeter collection"); - let table = timeseries_query_until_success(&cptestctx, &query) + let table = system_timeseries_query(&cptestctx, &query) .await .into_iter() .find(|t| t.name() == name) diff --git a/openapi/nexus.json b/openapi/nexus.json index 79186e379a..c0b6a96fcf 100644 --- a/openapi/nexus.json +++ b/openapi/nexus.json @@ -8890,6 +8890,55 @@ } } }, + "/v1/timeseries/query": { + "post": { + "tags": [ + "hidden" + ], + "summary": "Run project-scoped timeseries query", + "description": "Queries are written in OxQL. Project must be specified by name or ID in URL query parameter. 
The OxQL query will only return timeseries data from the specified project.", + "operationId": "timeseries_query", + "parameters": [ + { + "in": "query", + "name": "project", + "description": "Name or ID of the project", + "required": true, + "schema": { + "$ref": "#/components/schemas/NameOrId" + } + } + ], + "requestBody": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/TimeseriesQuery" + } + } + }, + "required": true + }, + "responses": { + "200": { + "description": "successful operation", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/OxqlQueryResult" + } + } + } + }, + "4XX": { + "$ref": "#/components/responses/Error" + }, + "5XX": { + "$ref": "#/components/responses/Error" + } + } + } + }, "/v1/users": { "get": { "tags": [ From c6d4c55ffece9faf167e2da5aee044341ffdc24e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Karen=20C=C3=A1rcamo?= Date: Wed, 4 Dec 2024 14:10:04 +1300 Subject: [PATCH 04/22] [clickhouse] Fix integration test setup script (#7199) Tested this manually by running the tests first to make sure happy path doesn't change: ```console $ cargo nextest run -p omicron-clickhouse-admin info: experimental features enabled: setup-scripts Finished `test` profile [unoptimized + debuginfo] target(s) in 5.86s ------------ Nextest run ID 700e33bd-e5dc-43fe-9fe7-4c79e4cf3884 with nextest profile: default Starting 5 tests across 5 binaries SETUP [ 1/1] clickhouse-cluster: cargo run -p clickhouse-cluster-dev [ 00:00:00] [ ] 0/5: Compiling clickhouse-cluster-dev v0.1.0 (/Users/karcar/src/omicron/dev-tools/clickhouse-cluster-dev) Finished `dev` profile [unoptimized + debuginfo] target(s) in 0.91s Running `target/debug/clickhouse-cluster-dev` Dec 03 22:41:25.258 INFO Setting up a ClickHouse cluster Deploying keeper: /var/folders/mk/l_57z4w97xlc7wymyvc7mwzr0000gn/T/clickward_test/keeper-1 Deploying keeper: /var/folders/mk/l_57z4w97xlc7wymyvc7mwzr0000gn/T/clickward_test/keeper-3 Deploying keeper: /var/folders/mk/l_57z4w97xlc7wymyvc7mwzr0000gn/T/clickward_test/keeper-2 Deploying clickhouse server: /var/folders/mk/l_57z4w97xlc7wymyvc7mwzr0000gn/T/clickward_test/clickhouse-1 Deploying clickhouse server: /var/folders/mk/l_57z4w97xlc7wymyvc7mwzr0000gn/T/clickward_test/clickhouse-2 Dec 03 22:41:26.348 INFO ClickHouse server ready Dec 03 22:41:26.348 INFO ClickHouse server ready Dec 03 22:41:28.594 INFO Keepers ready: [KeeperId(1), KeeperId(2), KeeperId(3)] SETUP PASS [ 1/1] clickhouse-cluster: cargo run -p clickhouse-cluster-dev PASS [ 1.324s] omicron-clickhouse-admin::integration_test test_keeper_cluster_membership PASS [ 0.446s] omicron-clickhouse-admin::integration_test test_keeper_conf_parsing PASS [ 0.441s] omicron-clickhouse-admin::integration_test test_lgif_parsing PASS [ 0.452s] omicron-clickhouse-admin::integration_test test_raft_config_parsing PASS [ 0.039s] omicron-clickhouse-admin::integration_test test_teardown ------------ Summary [ 7.298s] 5 tests run: 5 passed, 0 skipped ``` Then interrupted a test run to make sure to leave the directory and files behind ```console $ cargo nextest run -p omicron-clickhouse-admin info: experimental features enabled: setup-scripts Compiling omicron-clickhouse-admin v0.1.0 (/Users/karcar/src/omicron/clickhouse-admin) Finished `test` profile [unoptimized + debuginfo] target(s) in 0.96s ------------ Nextest run ID 787a3f0e-b480-4433-9570-9c61bd9ea08e with nextest profile: default Starting 5 tests across 5 binaries SETUP [ 1/1] clickhouse-cluster: cargo run -p clickhouse-cluster-dev 
[ 00:00:00] [ ] 0/5: Finished `dev` profile [unoptimized + debuginfo] target(s) in 0.32s Running `target/debug/clickhouse-cluster-dev` Dec 03 22:42:55.011 INFO Setting up a ClickHouse cluster Deploying keeper: /var/folders/mk/l_57z4w97xlc7wymyvc7mwzr0000gn/T/clickward_test/keeper-1 Deploying keeper: /var/folders/mk/l_57z4w97xlc7wymyvc7mwzr0000gn/T/clickward_test/keeper-3 Deploying keeper: /var/folders/mk/l_57z4w97xlc7wymyvc7mwzr0000gn/T/clickward_test/keeper-2 Deploying clickhouse server: /var/folders/mk/l_57z4w97xlc7wymyvc7mwzr0000gn/T/clickward_test/clickhouse-1 Deploying clickhouse server: /var/folders/mk/l_57z4w97xlc7wymyvc7mwzr0000gn/T/clickward_test/clickhouse-2 Dec 03 22:42:55.993 INFO ClickHouse server ready Dec 03 22:42:56.130 INFO ClickHouse server ready Dec 03 22:42:57.925 INFO Keepers ready: [KeeperId(1), KeeperId(2), KeeperId(3)] SETUP PASS [ 1/1] clickhouse-cluster: cargo run -p clickhouse-cluster-dev Running [ 00:00:04] [ ] 0/5: 1 running, 0 passed, 0 skipped Canceling due to interrupt: 1 test still running SIGINT [ 0.906s] omicron-clickhouse-admin::integration_test test_keeper_cluster_membership ------------ Summary [ 4.460s] 1/5 tests run: 0 passed, 1 failed, 0 skipped warning: 4/5 tests were not run due to interrupt error: test run failed ``` Ran the tests again and expected to see in the log that the directory is being removed ```console $ cargo nextest run -p omicron-clickhouse-admin info: experimental features enabled: setup-scripts Finished `test` profile [unoptimized + debuginfo] target(s) in 0.34s ------------ Nextest run ID 0302644b-d97b-4bd2-bae3-edc4976a9298 with nextest profile: default Starting 5 tests across 5 binaries SETUP [ 1/1] clickhouse-cluster: cargo run -p clickhouse-cluster-dev [ 00:00:00] [ ] 0/5: Finished `dev` profile [unoptimized + debuginfo] target(s) in 0.32s Running `target/debug/clickhouse-cluster-dev` Dec 03 22:44:10.996 INFO Removing previous temporary test directory Dec 03 22:44:11.019 INFO Setting up a ClickHouse cluster Deploying keeper: /var/folders/mk/l_57z4w97xlc7wymyvc7mwzr0000gn/T/clickward_test/keeper-1 Deploying keeper: /var/folders/mk/l_57z4w97xlc7wymyvc7mwzr0000gn/T/clickward_test/keeper-3 Deploying keeper: /var/folders/mk/l_57z4w97xlc7wymyvc7mwzr0000gn/T/clickward_test/keeper-2 Deploying clickhouse server: /var/folders/mk/l_57z4w97xlc7wymyvc7mwzr0000gn/T/clickward_test/clickhouse-1 Deploying clickhouse server: /var/folders/mk/l_57z4w97xlc7wymyvc7mwzr0000gn/T/clickward_test/clickhouse-2 Dec 03 22:44:12.016 INFO ClickHouse server ready Dec 03 22:44:12.018 INFO ClickHouse server ready Dec 03 22:44:13.806 INFO Keepers ready: [KeeperId(1), KeeperId(2), KeeperId(3)] SETUP PASS [ 1/1] clickhouse-cluster: cargo run -p clickhouse-cluster-dev PASS [ 1.324s] omicron-clickhouse-admin::integration_test test_keeper_cluster_membership PASS [ 0.444s] omicron-clickhouse-admin::integration_test test_keeper_conf_parsing PASS [ 0.445s] omicron-clickhouse-admin::integration_test test_lgif_parsing PASS [ 0.441s] omicron-clickhouse-admin::integration_test test_raft_config_parsing PASS [ 0.035s] omicron-clickhouse-admin::integration_test test_teardown ------------ Summary [ 6.116s] 5 tests run: 5 passed, 0 skipped ``` Closes: https://github.com/oxidecomputer/omicron/issues/7198 --- dev-tools/clickhouse-cluster-dev/src/main.rs | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/dev-tools/clickhouse-cluster-dev/src/main.rs b/dev-tools/clickhouse-cluster-dev/src/main.rs index 2f85c53ab6..1b78144e4c 100644 --- 
a/dev-tools/clickhouse-cluster-dev/src/main.rs +++ b/dev-tools/clickhouse-cluster-dev/src/main.rs @@ -22,6 +22,16 @@ use std::time::Duration; async fn main() -> Result<()> { let request_timeout = Duration::from_secs(15); let (logctx, path) = default_clickhouse_log_ctx_and_path(); + + if path.exists() { + let deployment = + default_clickhouse_cluster_test_deployment(path.clone()); + slog::info!(logctx.log, "Stopping test clickhouse nodes"); + deployment.teardown()?; + slog::info!(logctx.log, "Removing previous temporary test directory"); + std::fs::remove_dir_all(&path)?; + } + std::fs::create_dir(&path)?; slog::info!(logctx.log, "Setting up a ClickHouse cluster"); From 464f06cb15c2a454ce3d1bf64b0fd5e2892f0b25 Mon Sep 17 00:00:00 2001 From: David Crespo Date: Wed, 4 Dec 2024 01:02:52 -0600 Subject: [PATCH 05/22] Bump web console (require cert, serial console button) (#7200) https://github.com/oxidecomputer/console/compare/059c5514...fd47bee7 * [fd47bee7](https://github.com/oxidecomputer/console/commit/fd47bee7) tweak bump omicron script to prompt for message * [f6a3c06c](https://github.com/oxidecomputer/console/commit/f6a3c06c) bump omicron for project-scoped oxql endpoint * [4d3a923c](https://github.com/oxidecomputer/console/commit/4d3a923c) oxidecomputer/console#2596 * [576e1842](https://github.com/oxidecomputer/console/commit/576e1842) oxidecomputer/console#2595 * [72b6d43d](https://github.com/oxidecomputer/console/commit/72b6d43d) oxidecomputer/console#2579 * [15e55044](https://github.com/oxidecomputer/console/commit/15e55044) oxidecomputer/console#2594 * [fa27caf9](https://github.com/oxidecomputer/console/commit/fa27caf9) oxidecomputer/console#2588 * [aad054c5](https://github.com/oxidecomputer/console/commit/aad054c5) oxidecomputer/console#2592 * [d97310be](https://github.com/oxidecomputer/console/commit/d97310be) oxidecomputer/console#2593 * [8ba8a898](https://github.com/oxidecomputer/console/commit/8ba8a898) oxidecomputer/console#2590 * [d7fd3c42](https://github.com/oxidecomputer/console/commit/d7fd3c42) oxidecomputer/console#2586 * [23423c2b](https://github.com/oxidecomputer/console/commit/23423c2b) oxidecomputer/console#2583 --- tools/console_version | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/console_version b/tools/console_version index 85363dcf35..cf50396baf 100644 --- a/tools/console_version +++ b/tools/console_version @@ -1,2 +1,2 @@ -COMMIT="059c55142ba29e4f691247def92493b7ef3f8df2" -SHA2="dfb98bf8eb4f97ff65be8128a456dbbf98dfbda48988a4b683bf0894d57381ba" +COMMIT="fd47bee7c1f2baf189661c4e44318f8a9caa3d4c" +SHA2="74fdcd4c4f102c35cee25e893873596cf26fd7bdaf7a6962371e56e786a981f2" From 564e9b8c3fc2cadfef801c10af79da74bb94607c Mon Sep 17 00:00:00 2001 From: Alan Hanson Date: Wed, 4 Dec 2024 10:21:48 -0800 Subject: [PATCH 06/22] Crucible and Propolis update (#7201) Crucible: Skip jobs when reinitializing to `Faulted` (#1583) Clear `repair_check_deadline` if repair is successfully started (#1581) Update rust-version reqs to reflect reality (#1580) Propolis: Update nvme-trace.d to match current probe definitions (#821) Fix clippy lints for Rust 1.83 PHD: use stty to widen the effective terminal for Linux guests (#818) Co-authored-by: Alan Hanson --- Cargo.lock | 28 ++++++++++++++-------------- Cargo.toml | 16 ++++++++-------- package-manifest.toml | 16 ++++++++-------- 3 files changed, 30 insertions(+), 30 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index a53447a075..98a5975316 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -693,7 +693,7 @@ 
dependencies = [ [[package]] name = "bhyve_api" version = "0.0.0" -source = "git+https://github.com/oxidecomputer/propolis?rev=6936f1a949d155da38d3148abd42caef337dea04#6936f1a949d155da38d3148abd42caef337dea04" +source = "git+https://github.com/oxidecomputer/propolis?rev=19a421dceac7756aef26a8771f258af9cc21fc37#19a421dceac7756aef26a8771f258af9cc21fc37" dependencies = [ "bhyve_api_sys", "libc", @@ -703,7 +703,7 @@ dependencies = [ [[package]] name = "bhyve_api_sys" version = "0.0.0" -source = "git+https://github.com/oxidecomputer/propolis?rev=6936f1a949d155da38d3148abd42caef337dea04#6936f1a949d155da38d3148abd42caef337dea04" +source = "git+https://github.com/oxidecomputer/propolis?rev=19a421dceac7756aef26a8771f258af9cc21fc37#19a421dceac7756aef26a8771f258af9cc21fc37" dependencies = [ "libc", "strum", @@ -1822,7 +1822,7 @@ dependencies = [ [[package]] name = "crucible-agent-client" version = "0.0.1" -source = "git+https://github.com/oxidecomputer/crucible?rev=2cfc7e0c8572b3bfafbfc838c4e6d658f442d239#2cfc7e0c8572b3bfafbfc838c4e6d658f442d239" +source = "git+https://github.com/oxidecomputer/crucible?rev=5a41b826171c7d2a8412fa833377ab1df25ee8ec#5a41b826171c7d2a8412fa833377ab1df25ee8ec" dependencies = [ "anyhow", "chrono", @@ -1838,7 +1838,7 @@ dependencies = [ [[package]] name = "crucible-client-types" version = "0.1.0" -source = "git+https://github.com/oxidecomputer/crucible?rev=2cfc7e0c8572b3bfafbfc838c4e6d658f442d239#2cfc7e0c8572b3bfafbfc838c4e6d658f442d239" +source = "git+https://github.com/oxidecomputer/crucible?rev=5a41b826171c7d2a8412fa833377ab1df25ee8ec#5a41b826171c7d2a8412fa833377ab1df25ee8ec" dependencies = [ "base64 0.22.1", "crucible-workspace-hack", @@ -1851,7 +1851,7 @@ dependencies = [ [[package]] name = "crucible-common" version = "0.0.1" -source = "git+https://github.com/oxidecomputer/crucible?rev=2cfc7e0c8572b3bfafbfc838c4e6d658f442d239#2cfc7e0c8572b3bfafbfc838c4e6d658f442d239" +source = "git+https://github.com/oxidecomputer/crucible?rev=5a41b826171c7d2a8412fa833377ab1df25ee8ec#5a41b826171c7d2a8412fa833377ab1df25ee8ec" dependencies = [ "anyhow", "atty", @@ -1881,7 +1881,7 @@ dependencies = [ [[package]] name = "crucible-pantry-client" version = "0.0.1" -source = "git+https://github.com/oxidecomputer/crucible?rev=2cfc7e0c8572b3bfafbfc838c4e6d658f442d239#2cfc7e0c8572b3bfafbfc838c4e6d658f442d239" +source = "git+https://github.com/oxidecomputer/crucible?rev=5a41b826171c7d2a8412fa833377ab1df25ee8ec#5a41b826171c7d2a8412fa833377ab1df25ee8ec" dependencies = [ "anyhow", "chrono", @@ -1898,7 +1898,7 @@ dependencies = [ [[package]] name = "crucible-smf" version = "0.0.0" -source = "git+https://github.com/oxidecomputer/crucible?rev=2cfc7e0c8572b3bfafbfc838c4e6d658f442d239#2cfc7e0c8572b3bfafbfc838c4e6d658f442d239" +source = "git+https://github.com/oxidecomputer/crucible?rev=5a41b826171c7d2a8412fa833377ab1df25ee8ec#5a41b826171c7d2a8412fa833377ab1df25ee8ec" dependencies = [ "crucible-workspace-hack", "libc", @@ -6980,7 +6980,7 @@ dependencies = [ "pq-sys", "pretty_assertions", "progenitor-client", - "propolis-client 0.1.0 (git+https://github.com/oxidecomputer/propolis?rev=6936f1a949d155da38d3148abd42caef337dea04)", + "propolis-client 0.1.0 (git+https://github.com/oxidecomputer/propolis?rev=19a421dceac7756aef26a8771f258af9cc21fc37)", "qorb", "rand", "rcgen", @@ -7245,7 +7245,7 @@ dependencies = [ "oximeter-producer", "oxnet", "pretty_assertions", - "propolis-client 0.1.0 (git+https://github.com/oxidecomputer/propolis?rev=6936f1a949d155da38d3148abd42caef337dea04)", + "propolis-client 0.1.0 
(git+https://github.com/oxidecomputer/propolis?rev=19a421dceac7756aef26a8771f258af9cc21fc37)", "propolis-mock-server", "propolis_api_types", "rand", @@ -8959,7 +8959,7 @@ dependencies = [ [[package]] name = "propolis-client" version = "0.1.0" -source = "git+https://github.com/oxidecomputer/propolis?rev=6936f1a949d155da38d3148abd42caef337dea04#6936f1a949d155da38d3148abd42caef337dea04" +source = "git+https://github.com/oxidecomputer/propolis?rev=19a421dceac7756aef26a8771f258af9cc21fc37#19a421dceac7756aef26a8771f258af9cc21fc37" dependencies = [ "async-trait", "base64 0.21.7", @@ -9001,7 +9001,7 @@ dependencies = [ [[package]] name = "propolis-mock-server" version = "0.0.0" -source = "git+https://github.com/oxidecomputer/propolis?rev=6936f1a949d155da38d3148abd42caef337dea04#6936f1a949d155da38d3148abd42caef337dea04" +source = "git+https://github.com/oxidecomputer/propolis?rev=19a421dceac7756aef26a8771f258af9cc21fc37#19a421dceac7756aef26a8771f258af9cc21fc37" dependencies = [ "anyhow", "atty", @@ -9043,7 +9043,7 @@ dependencies = [ [[package]] name = "propolis_api_types" version = "0.0.0" -source = "git+https://github.com/oxidecomputer/propolis?rev=6936f1a949d155da38d3148abd42caef337dea04#6936f1a949d155da38d3148abd42caef337dea04" +source = "git+https://github.com/oxidecomputer/propolis?rev=19a421dceac7756aef26a8771f258af9cc21fc37#19a421dceac7756aef26a8771f258af9cc21fc37" dependencies = [ "crucible-client-types", "propolis_types", @@ -9056,7 +9056,7 @@ dependencies = [ [[package]] name = "propolis_types" version = "0.0.0" -source = "git+https://github.com/oxidecomputer/propolis?rev=6936f1a949d155da38d3148abd42caef337dea04#6936f1a949d155da38d3148abd42caef337dea04" +source = "git+https://github.com/oxidecomputer/propolis?rev=19a421dceac7756aef26a8771f258af9cc21fc37#19a421dceac7756aef26a8771f258af9cc21fc37" dependencies = [ "schemars", "serde", @@ -10719,7 +10719,7 @@ dependencies = [ "omicron-uuid-kinds", "omicron-workspace-hack", "oxnet", - "propolis-client 0.1.0 (git+https://github.com/oxidecomputer/propolis?rev=6936f1a949d155da38d3148abd42caef337dea04)", + "propolis-client 0.1.0 (git+https://github.com/oxidecomputer/propolis?rev=19a421dceac7756aef26a8771f258af9cc21fc37)", "rcgen", "schemars", "serde", diff --git a/Cargo.toml b/Cargo.toml index 572ba4a0d2..23799e3c3f 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -346,10 +346,10 @@ cookie = "0.18" criterion = { version = "0.5.1", features = [ "async_tokio" ] } crossbeam = "0.8" crossterm = { version = "0.28.1", features = ["event-stream"] } -crucible-agent-client = { git = "https://github.com/oxidecomputer/crucible", rev = "2cfc7e0c8572b3bfafbfc838c4e6d658f442d239" } -crucible-pantry-client = { git = "https://github.com/oxidecomputer/crucible", rev = "2cfc7e0c8572b3bfafbfc838c4e6d658f442d239" } -crucible-smf = { git = "https://github.com/oxidecomputer/crucible", rev = "2cfc7e0c8572b3bfafbfc838c4e6d658f442d239" } -crucible-common = { git = "https://github.com/oxidecomputer/crucible", rev = "2cfc7e0c8572b3bfafbfc838c4e6d658f442d239" } +crucible-agent-client = { git = "https://github.com/oxidecomputer/crucible", rev = "5a41b826171c7d2a8412fa833377ab1df25ee8ec" } +crucible-pantry-client = { git = "https://github.com/oxidecomputer/crucible", rev = "5a41b826171c7d2a8412fa833377ab1df25ee8ec" } +crucible-smf = { git = "https://github.com/oxidecomputer/crucible", rev = "5a41b826171c7d2a8412fa833377ab1df25ee8ec" } +crucible-common = { git = "https://github.com/oxidecomputer/crucible", rev = "5a41b826171c7d2a8412fa833377ab1df25ee8ec" } csv = "1.3.0" 
curve25519-dalek = "4" datatest-stable = "0.2.9" @@ -539,10 +539,10 @@ prettyplease = { version = "0.2.25", features = ["verbatim"] } proc-macro2 = "1.0" progenitor = "0.8.0" progenitor-client = "0.8.0" -bhyve_api = { git = "https://github.com/oxidecomputer/propolis", rev = "6936f1a949d155da38d3148abd42caef337dea04" } -propolis_api_types = { git = "https://github.com/oxidecomputer/propolis", rev = "6936f1a949d155da38d3148abd42caef337dea04" } -propolis-client = { git = "https://github.com/oxidecomputer/propolis", rev = "6936f1a949d155da38d3148abd42caef337dea04" } -propolis-mock-server = { git = "https://github.com/oxidecomputer/propolis", rev = "6936f1a949d155da38d3148abd42caef337dea04" } +bhyve_api = { git = "https://github.com/oxidecomputer/propolis", rev = "19a421dceac7756aef26a8771f258af9cc21fc37" } +propolis_api_types = { git = "https://github.com/oxidecomputer/propolis", rev = "19a421dceac7756aef26a8771f258af9cc21fc37" } +propolis-client = { git = "https://github.com/oxidecomputer/propolis", rev = "19a421dceac7756aef26a8771f258af9cc21fc37" } +propolis-mock-server = { git = "https://github.com/oxidecomputer/propolis", rev = "19a421dceac7756aef26a8771f258af9cc21fc37" } proptest = "1.5.0" qorb = "0.2.1" quote = "1.0" diff --git a/package-manifest.toml b/package-manifest.toml index 789d1eb0c0..5d55c9368a 100644 --- a/package-manifest.toml +++ b/package-manifest.toml @@ -578,10 +578,10 @@ only_for_targets.image = "standard" # 3. Use source.type = "manual" instead of "prebuilt" source.type = "prebuilt" source.repo = "crucible" -source.commit = "2cfc7e0c8572b3bfafbfc838c4e6d658f442d239" +source.commit = "5a41b826171c7d2a8412fa833377ab1df25ee8ec" # The SHA256 digest is automatically posted to: # https://buildomat.eng.oxide.computer/public/file/oxidecomputer/crucible/image//crucible.sha256.txt -source.sha256 = "0276c1513b33c61c866eb31756879e9d079534f43af90b01c0a2dd152c6ce18d" +source.sha256 = "bcccfb03a68e46bb958410faf6f619e25f5ec9ccc65c503aeb87bb7ad456e517" output.type = "zone" output.intermediate_only = true @@ -590,10 +590,10 @@ service_name = "crucible_pantry_prebuilt" only_for_targets.image = "standard" source.type = "prebuilt" source.repo = "crucible" -source.commit = "2cfc7e0c8572b3bfafbfc838c4e6d658f442d239" +source.commit = "5a41b826171c7d2a8412fa833377ab1df25ee8ec" # The SHA256 digest is automatically posted to: # https://buildomat.eng.oxide.computer/public/file/oxidecomputer/crucible/image//crucible-pantry.sha256.txt -source.sha256 = "7ad4f84df681f5ccd90bd74473a17a0e1310f562bfd0c08047aad6adbd131903" +source.sha256 = "96326422f79413fe31bb1c7df6173b2991b463cabc5b1fb4182db703500c8882" output.type = "zone" output.intermediate_only = true @@ -607,10 +607,10 @@ service_name = "crucible_dtrace" only_for_targets.image = "standard" source.type = "prebuilt" source.repo = "crucible" -source.commit = "2cfc7e0c8572b3bfafbfc838c4e6d658f442d239" +source.commit = "5a41b826171c7d2a8412fa833377ab1df25ee8ec" # The SHA256 digest is automatically posted to: # https://buildomat.eng.oxide.computer/public/file/oxidecomputer/crucible/image//crucible-dtrace.sha256.txt -source.sha256 = "dac88622ecf6e3529b9d83390607c921723eca26de68b0801efd66c36acfa629" +source.sha256 = "d35ed81a1e58ec66b621938f4b57513c1a3eb0b66e21834e000e0ace9624b462" output.type = "tarball" # Refer to @@ -621,10 +621,10 @@ service_name = "propolis-server" only_for_targets.image = "standard" source.type = "prebuilt" source.repo = "propolis" -source.commit = "6936f1a949d155da38d3148abd42caef337dea04" +source.commit = 
"19a421dceac7756aef26a8771f258af9cc21fc37" # The SHA256 digest is automatically posted to: # https://buildomat.eng.oxide.computer/public/file/oxidecomputer/propolis/image//propolis-server.sha256.txt -source.sha256 = "a3a45292bd45938a785b84afee39f690a5f05d1920b78b8fc0512a131857d7ee" +source.sha256 = "fbb52fed6312db047a7f56d43162e5d4c5072886a23b5e6a0096f6db78c5d2ba" output.type = "zone" [package.mg-ddm-gz] From 3b77deb17937384387eaaa8a7ae661c35c7aec55 Mon Sep 17 00:00:00 2001 From: Benjamin Naecker Date: Wed, 4 Dec 2024 11:02:54 -0800 Subject: [PATCH 07/22] Add API for fetching details about an oximeter producer (#7139) - Add `producer_details` API to `oximeter` collector, which returns information about registration time, update time, and collection summaries. - Update producer details during collections themselves - Add `omdb oximeter producer-details` subcommand for printing - Closes #7125 --- dev-tools/omdb/src/bin/omdb/oximeter.rs | 187 +++++++++- dev-tools/omdb/tests/usage_errors.out | 5 +- openapi/oximeter.json | 180 ++++++++++ oximeter/api/src/lib.rs | 128 +++++++ oximeter/collector/src/agent.rs | 380 +++++++++++++++++++-- oximeter/collector/src/http_entrypoints.rs | 13 + oximeter/collector/src/lib.rs | 9 +- oximeter/collector/src/self_stats.rs | 76 ++++- 8 files changed, 945 insertions(+), 33 deletions(-) diff --git a/dev-tools/omdb/src/bin/omdb/oximeter.rs b/dev-tools/omdb/src/bin/omdb/oximeter.rs index cc1efd126f..7dae63e947 100644 --- a/dev-tools/omdb/src/bin/omdb/oximeter.rs +++ b/dev-tools/omdb/src/bin/omdb/oximeter.rs @@ -7,11 +7,15 @@ use crate::helpers::CONNECTION_OPTIONS_HEADING; use crate::Omdb; use anyhow::Context; +use chrono::SecondsFormat; use clap::Args; use clap::Subcommand; use futures::TryStreamExt; use internal_dns_types::names::ServiceName; +use oximeter_client::types::FailedCollection; +use oximeter_client::types::ProducerDetails; use oximeter_client::types::ProducerEndpoint; +use oximeter_client::types::SuccessfulCollection; use oximeter_client::Client; use slog::Logger; use std::net::SocketAddr; @@ -41,6 +45,11 @@ pub struct OximeterArgs { enum OximeterCommands { /// List the producers the collector is assigned to poll. ListProducers, + /// Fetch details about a single assigned producer. + ProducerDetails { + /// The ID of the producer to fetch. + producer_id: Uuid, + }, } impl OximeterArgs { @@ -81,9 +90,26 @@ impl OximeterArgs { OximeterCommands::ListProducers => { self.list_producers(client).await } + OximeterCommands::ProducerDetails { producer_id } => { + self.producer_details(client, producer_id).await + } } } + async fn producer_details( + &self, + client: Client, + producer_id: Uuid, + ) -> anyhow::Result<()> { + let details = client + .producer_details(&producer_id) + .await + .context("failed to fetch producer details")? 
+ .into_inner(); + print_producer_details(details); + Ok(()) + } + async fn list_producers(&self, client: Client) -> anyhow::Result<()> { let info = client .collector_info() @@ -120,11 +146,168 @@ struct Producer { impl From for Producer { fn from(p: ProducerEndpoint) -> Self { - let interval = Duration::new(p.interval.secs, p.interval.nanos); Self { id: p.id, address: p.address.parse().unwrap(), - interval: humantime::format_duration(interval).to_string(), + interval: duration_to_humantime(&p.interval), + } + } +} + +fn duration_to_humantime(d: &oximeter_client::types::Duration) -> String { + let interval = Duration::new(d.secs, d.nanos); + humantime::format_duration(interval).to_string() +} + +const WIDTH: usize = 12; + +fn print_producer_details(details: ProducerDetails) { + println!(); + println!("{:>WIDTH$}: {}", "ID", details.id); + println!("{:>WIDTH$}: {}", "Address", details.address); + println!( + "{:>WIDTH$}: {}", + "Registered", + details.registered.to_rfc3339_opts(SecondsFormat::Millis, true) + ); + println!( + "{:>WIDTH$}: {}", + "Updated", + details.updated.to_rfc3339_opts(SecondsFormat::Millis, true) + ); + println!( + "{:>WIDTH$}: {}", + "Interval", + duration_to_humantime(&details.interval) + ); + println!("{:>WIDTH$}: {}", "Successes", details.n_collections); + println!("{:>WIDTH$}: {}", "Failures", details.n_failures); + println!(); + print_last_success(details.last_success.as_ref()); + println!(); + print_last_failure(details.last_failure.as_ref()); +} + +fn print_last_success(maybe_success: Option<&SuccessfulCollection>) { + print!("{:>WIDTH$}: ", "Last success"); + match maybe_success { + None => println!("None"), + Some(success) => { + println!(); + println!( + "{:>WIDTH$}: {}", + "Started at", + success.started_at.to_rfc3339_opts(SecondsFormat::Millis, true) + ); + println!( + "{:>WIDTH$}: {:?}", + "Queued for", + Duration::new( + success.time_queued.secs, + success.time_queued.nanos + ) + ); + println!( + "{:>WIDTH$}: {:?}", + "Duration", + Duration::new( + success.time_collecting.secs, + success.time_collecting.nanos + ) + ); + println!("{:>WIDTH$}: {}", "Samples", success.n_samples); } } } + +fn print_last_failure(maybe_failure: Option<&FailedCollection>) { + print!("{:>WIDTH$}: ", "Last failure"); + match maybe_failure { + None => println!("None"), + Some(failure) => { + println!(); + println!( + "{:>WIDTH$}: {}", + "Started at", + failure.started_at.to_rfc3339_opts(SecondsFormat::Millis, true) + ); + println!( + "{:>WIDTH$}: {:?}", + "Queued for", + Duration::new( + failure.time_queued.secs, + failure.time_queued.nanos + ) + ); + println!( + "{:>WIDTH$}: {:?}", + "Duration", + Duration::new( + failure.time_collecting.secs, + failure.time_collecting.nanos + ) + ); + println!("{:>WIDTH$}: {}", "Reason", failure.reason); + } + } +} + +#[cfg(test)] +mod tests { + use super::print_producer_details; + use chrono::Utc; + use oximeter_client::types::FailedCollection; + use oximeter_client::types::ProducerDetails; + use oximeter_client::types::SuccessfulCollection; + use std::time::Duration; + use uuid::Uuid; + + #[test] + fn test_print_producer_details_success_only() { + let now = Utc::now(); + let details = ProducerDetails { + id: Uuid::new_v4(), + address: "[::1]:12345".parse().unwrap(), + interval: Duration::from_secs(10).into(), + last_success: Some(SuccessfulCollection { + n_samples: 100, + started_at: now, + time_collecting: Duration::from_millis(100).into(), + time_queued: Duration::from_millis(10).into(), + }), + last_failure: None, + n_collections: 1, + 
n_failures: 0, + registered: now, + updated: now, + }; + print_producer_details(details); + } + + #[test] + fn test_print_producer_details_with_failure() { + let now = Utc::now(); + let details = ProducerDetails { + id: Uuid::new_v4(), + interval: Duration::from_secs(10).into(), + address: "[::1]:12345".parse().unwrap(), + last_success: Some(SuccessfulCollection { + n_samples: 100, + started_at: now, + time_collecting: Duration::from_millis(100).into(), + time_queued: Duration::from_millis(10).into(), + }), + last_failure: Some(FailedCollection { + started_at: now, + time_collecting: Duration::from_millis(100).into(), + time_queued: Duration::from_millis(10).into(), + reason: String::from("unreachable"), + }), + n_collections: 1, + n_failures: 1, + registered: now, + updated: now, + }; + print_producer_details(details); + } +} diff --git a/dev-tools/omdb/tests/usage_errors.out b/dev-tools/omdb/tests/usage_errors.out index 5e66467403..85fc761289 100644 --- a/dev-tools/omdb/tests/usage_errors.out +++ b/dev-tools/omdb/tests/usage_errors.out @@ -761,8 +761,9 @@ Query oximeter collector state Usage: omdb oximeter [OPTIONS] Commands: - list-producers List the producers the collector is assigned to poll - help Print this message or the help of the given subcommand(s) + list-producers List the producers the collector is assigned to poll + producer-details Fetch details about a single assigned producer + help Print this message or the help of the given subcommand(s) Options: --log-level log level filter [env: LOG_LEVEL=] [default: warn] diff --git a/openapi/oximeter.json b/openapi/oximeter.json index dea3418b8d..b51c56b667 100644 --- a/openapi/oximeter.json +++ b/openapi/oximeter.json @@ -84,6 +84,39 @@ } }, "/producers/{producer_id}": { + "get": { + "summary": "Get details about a producer by ID.", + "operationId": "producer_details", + "parameters": [ + { + "in": "path", + "name": "producer_id", + "required": true, + "schema": { + "type": "string", + "format": "uuid" + } + } + ], + "responses": { + "200": { + "description": "successful operation", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/ProducerDetails" + } + } + } + }, + "4XX": { + "$ref": "#/components/responses/Error" + }, + "5XX": { + "$ref": "#/components/responses/Error" + } + } + }, "delete": { "summary": "Delete a producer by ID.", "operationId": "producer_delete", @@ -171,6 +204,114 @@ "request_id" ] }, + "FailedCollection": { + "description": "Details about a previous failed collection.", + "type": "object", + "properties": { + "reason": { + "description": "The reason the collection failed.", + "type": "string" + }, + "started_at": { + "description": "The time at which we started a collection.\n\nNote that this is the time we queued a request to collect for processing by a background task. 
The `time_queued` can be added to this time to figure out when processing began, and `time_collecting` can be added to that to figure out how long the actual collection process took.", + "type": "string", + "format": "date-time" + }, + "time_collecting": { + "description": "The time it took for the actual collection.", + "allOf": [ + { + "$ref": "#/components/schemas/Duration" + } + ] + }, + "time_queued": { + "description": "The time this request spent queued before being processed.", + "allOf": [ + { + "$ref": "#/components/schemas/Duration" + } + ] + } + }, + "required": [ + "reason", + "started_at", + "time_collecting", + "time_queued" + ] + }, + "ProducerDetails": { + "type": "object", + "properties": { + "address": { + "description": "The current collection address.", + "type": "string" + }, + "id": { + "description": "The producer's ID.", + "type": "string", + "format": "uuid" + }, + "interval": { + "description": "The current collection interval.", + "allOf": [ + { + "$ref": "#/components/schemas/Duration" + } + ] + }, + "last_failure": { + "nullable": true, + "description": "Details about the last failed collection.\n\nThis is None if we've never failed to collect from the producer.", + "allOf": [ + { + "$ref": "#/components/schemas/FailedCollection" + } + ] + }, + "last_success": { + "nullable": true, + "description": "Details about the last successful collection.\n\nThis is None if we've never successfully collected from the producer.", + "allOf": [ + { + "$ref": "#/components/schemas/SuccessfulCollection" + } + ] + }, + "n_collections": { + "description": "The total number of successful collections we've made.", + "type": "integer", + "format": "uint64", + "minimum": 0 + }, + "n_failures": { + "description": "The total number of failed collections.", + "type": "integer", + "format": "uint64", + "minimum": 0 + }, + "registered": { + "description": "The time the producer was first registered with us.", + "type": "string", + "format": "date-time" + }, + "updated": { + "description": "The last time the producer's information was updated.", + "type": "string", + "format": "date-time" + } + }, + "required": [ + "address", + "id", + "interval", + "n_collections", + "n_failures", + "registered", + "updated" + ] + }, "ProducerEndpoint": { "description": "Information announced by a metric server, used so that clients can contact it and collect available metric data from it.", "type": "object", @@ -261,6 +402,45 @@ ] } ] + }, + "SuccessfulCollection": { + "description": "Details about a previous successful collection.", + "type": "object", + "properties": { + "n_samples": { + "description": "The number of samples collected.", + "type": "integer", + "format": "uint64", + "minimum": 0 + }, + "started_at": { + "description": "The time at which we started a collection.\n\nNote that this is the time we queued a request to collect for processing by a background task. 
The `time_queued` can be added to this time to figure out when processing began, and `time_collecting` can be added to that to figure out how long the actual collection process took.", + "type": "string", + "format": "date-time" + }, + "time_collecting": { + "description": "The time it took for the actual collection.", + "allOf": [ + { + "$ref": "#/components/schemas/Duration" + } + ] + }, + "time_queued": { + "description": "The time this request spent queued before being processed.", + "allOf": [ + { + "$ref": "#/components/schemas/Duration" + } + ] + } + }, + "required": [ + "n_samples", + "started_at", + "time_collecting", + "time_queued" + ] } }, "responses": { diff --git a/oximeter/api/src/lib.rs b/oximeter/api/src/lib.rs index 2231a0cc5d..f47a5ba07e 100644 --- a/oximeter/api/src/lib.rs +++ b/oximeter/api/src/lib.rs @@ -10,6 +10,7 @@ use dropshot::{ use omicron_common::api::internal::nexus::ProducerEndpoint; use schemars::JsonSchema; use serde::{Deserialize, Serialize}; +use std::{net::SocketAddr, time::Duration}; use uuid::Uuid; #[dropshot::api_description] @@ -26,6 +27,16 @@ pub trait OximeterApi { query: Query>, ) -> Result>, HttpError>; + /// Get details about a producer by ID. + #[endpoint { + method = GET, + path = "/producers/{producer_id}", + }] + async fn producer_details( + request_context: RequestContext, + path: dropshot::Path, + ) -> Result, HttpError>; + /// Delete a producer by ID. #[endpoint { method = DELETE, @@ -64,3 +75,120 @@ pub struct CollectorInfo { /// Last time we refreshed our producer list with Nexus. pub last_refresh: Option>, } + +/// Details about a previous successful collection. +#[derive(Clone, Copy, Debug, Deserialize, JsonSchema, Serialize)] +pub struct SuccessfulCollection { + /// The time at which we started a collection. + /// + /// Note that this is the time we queued a request to collect for processing + /// by a background task. The `time_queued` can be added to this time to + /// figure out when processing began, and `time_collecting` can be added to + /// that to figure out how long the actual collection process took. + pub started_at: DateTime, + + /// The time this request spent queued before being processed. + pub time_queued: Duration, + + /// The time it took for the actual collection. + pub time_collecting: Duration, + + /// The number of samples collected. + pub n_samples: u64, +} + +/// Details about a previous failed collection. +#[derive(Clone, Debug, Deserialize, JsonSchema, Serialize)] +pub struct FailedCollection { + /// The time at which we started a collection. + /// + /// Note that this is the time we queued a request to collect for processing + /// by a background task. The `time_queued` can be added to this time to + /// figure out when processing began, and `time_collecting` can be added to + /// that to figure out how long the actual collection process took. + pub started_at: DateTime, + + /// The time this request spent queued before being processed. + pub time_queued: Duration, + + /// The time it took for the actual collection. + pub time_collecting: Duration, + + /// The reason the collection failed. + pub reason: String, +} + +#[derive(Clone, Debug, Deserialize, JsonSchema, Serialize)] +pub struct ProducerDetails { + /// The producer's ID. + pub id: Uuid, + + /// The current collection interval. + pub interval: Duration, + + /// The current collection address. + pub address: SocketAddr, + + /// The time the producer was first registered with us. 
+ pub registered: DateTime, + + /// The last time the producer's information was updated. + pub updated: DateTime, + + /// Details about the last successful collection. + /// + /// This is None if we've never successfully collected from the producer. + pub last_success: Option, + + /// Details about the last failed collection. + /// + /// This is None if we've never failed to collect from the producer. + pub last_failure: Option, + + /// The total number of successful collections we've made. + pub n_collections: u64, + + /// The total number of failed collections. + pub n_failures: u64, +} + +impl ProducerDetails { + pub fn new(info: &ProducerEndpoint) -> Self { + let now = Utc::now(); + Self { + id: info.id, + interval: info.interval, + address: info.address, + registered: now, + updated: now, + last_success: None, + last_failure: None, + n_collections: 0, + n_failures: 0, + } + } + + /// Update with new producer information. + /// + /// # Panics + /// + /// This panics if the new information refers to a different ID. + pub fn update(&mut self, new: &ProducerEndpoint) { + assert_eq!(self.id, new.id); + self.updated = Utc::now(); + self.address = new.address; + self.interval = new.interval; + } + + /// Update when we successfully complete a collection. + pub fn on_success(&mut self, success: SuccessfulCollection) { + self.last_success = Some(success); + self.n_collections += 1; + } + + /// Update when we fail to complete a collection. + pub fn on_failure(&mut self, failure: FailedCollection) { + self.last_failure = Some(failure); + self.n_failures += 1; + } +} diff --git a/oximeter/collector/src/agent.rs b/oximeter/collector/src/agent.rs index 6fa8c01c56..4c4f0f4177 100644 --- a/oximeter/collector/src/agent.rs +++ b/oximeter/collector/src/agent.rs @@ -20,6 +20,9 @@ use omicron_common::backoff; use omicron_common::backoff::BackoffError; use oximeter::types::ProducerResults; use oximeter::types::ProducerResultsItem; +use oximeter_api::FailedCollection; +use oximeter_api::ProducerDetails; +use oximeter_api::SuccessfulCollection; use oximeter_db::Client; use oximeter_db::DbWrite; use qorb::claim::Handle; @@ -40,6 +43,7 @@ use std::ops::Bound; use std::sync::Arc; use std::sync::Mutex as StdMutex; use std::time::Duration; +use std::time::Instant; use tokio::sync::mpsc; use tokio::sync::mpsc::error::TrySendError; use tokio::sync::oneshot; @@ -97,6 +101,18 @@ enum CollectionMessage { Statistics { reply_tx: oneshot::Sender, }, + // Request details from the collection task about its producer. + Details { + reply_tx: oneshot::Sender, + }, +} + +/// Return type for `perform_collection`. +struct SingleCollectionResult { + /// The result of the collection. + result: Result, + /// The duration the collection took. + duration: Duration, } /// Run a single collection from the producer. @@ -104,14 +120,15 @@ async fn perform_collection( log: Logger, client: reqwest::Client, producer: ProducerEndpoint, -) -> Result { +) -> SingleCollectionResult { + let start = Instant::now(); debug!(log, "collecting from producer"); let res = client .get(format!("http://{}/{}", producer.address, producer.id)) .send() .await; trace!(log, "sent collection request to producer"); - match res { + let result = match res { Ok(res) => { if res.status().is_success() { match res.json::().await { @@ -149,7 +166,8 @@ async fn perform_collection( ); Err(self_stats::FailureReason::Unreachable) } - } + }; + SingleCollectionResult { result, duration: start.elapsed() } } // The type of one collection task run to completion. 
@@ -158,15 +176,54 @@ async fn perform_collection( // can bump the self-stat counter accordingly. type CollectionResult = Result; -// The type of one response message sent from the collection task. -type CollectionResponse = (Option, CollectionResult); +/// Information about when we start a collection. +struct CollectionStartTimes { + /// UTC timestamp at which the request was started. + started_at: DateTime, + /// Instant right before we queued the response for processing. + queued_at: Instant, +} + +impl CollectionStartTimes { + fn new() -> Self { + Self { started_at: Utc::now(), queued_at: Instant::now() } + } +} + +/// Details about a forced collection. +struct ForcedCollectionRequest { + /// The collection token we signal when the collection is completed. + token: CollectionToken, + /// Start time for this collection. + start: CollectionStartTimes, +} + +impl ForcedCollectionRequest { + fn new(token: CollectionToken) -> Self { + Self { token, start: CollectionStartTimes::new() } + } +} + +/// Details about a completed collection. +struct CollectionResponse { + /// Token for a forced collection request. + token: Option, + /// The actual result of the collection. + result: CollectionResult, + /// Time when the collection started. + started_at: DateTime, + /// Time the request spent queued. + time_queued: Duration, + /// Time we spent processing the request. + time_collecting: Duration, +} /// Task that actually performs collections from the producer. async fn inner_collection_loop( log: Logger, mut producer_info_rx: watch::Receiver, - mut forced_collection_rx: mpsc::Receiver, - mut timer_collection_rx: mpsc::Receiver<()>, + mut forced_collection_rx: mpsc::Receiver, + mut timer_collection_rx: mpsc::Receiver, result_tx: mpsc::Sender, ) { let client = reqwest::Client::builder() @@ -178,29 +235,30 @@ async fn inner_collection_loop( loop { // Wait for notification that we have a collection to perform, from // either the forced- or timer-collection queue. - trace!(log, "top of inner collection loop, waiting for next request",); - let maybe_token = tokio::select! { + trace!(log, "top of inner collection loop, waiting for next request"); + let (maybe_token, start_time) = tokio::select! { maybe_request = forced_collection_rx.recv() => { - let Some(request) = maybe_request else { + let Some(ForcedCollectionRequest { token, start }) = maybe_request else { debug!( log, "forced collection request queue closed, exiting" ); return; }; - Some(request) + (Some(token), start) } maybe_request = timer_collection_rx.recv() => { - if maybe_request.is_none() { + let Some(start) = maybe_request else { debug!( log, "timer collection request queue closed, exiting" ); return; }; - None + (None, start) } }; + let time_queued = start_time.queued_at.elapsed(); // Make a future to represent the actual collection. let mut collection_fut = Box::pin(perform_collection( @@ -212,7 +270,7 @@ async fn inner_collection_loop( // Wait for that collection to complete or fail, or for an update to the // producer's information. In the latter case, recreate the future for // the collection itself with the new producer information. - let collection_result = 'collection: loop { + let SingleCollectionResult { result, duration } = 'collection: loop { tokio::select! { biased; @@ -258,8 +316,16 @@ async fn inner_collection_loop( }; // Now that the collection has completed, send on the results, along - // with any collection token we may have gotten with the request. 
- match result_tx.send((maybe_token, collection_result)).await { + // with the timing information and any collection token we may have + // gotten with the request. + let response = CollectionResponse { + token: maybe_token, + result, + started_at: start_time.started_at, + time_queued, + time_collecting: duration, + }; + match result_tx.send(response).await { Ok(_) => trace!(log, "forwarded results to main collection loop"), Err(_) => { error!( @@ -298,6 +364,10 @@ async fn collection_loop( let mut self_collection_timer = interval(self_stats::COLLECTION_INTERVAL); self_collection_timer.tick().await; + // Keep track of more details about each collection, so we can expose this + // as debugging information in `oximeter`'s public API. + let mut details = ProducerDetails::new(&producer); + // Spawn a task to run the actual collections. // // This is so that we can possibly interrupt and restart collections that @@ -342,23 +412,24 @@ async fn collection_loop( log, "collection task received explicit request to collect" ); - match forced_collection_tx.try_send(token) { + let request = ForcedCollectionRequest::new(token); + match forced_collection_tx.try_send(request) { Ok(_) => trace!( log, "forwarded explicit request to collection task" ), Err(e) => { match e { - TrySendError::Closed(tok) => { + TrySendError::Closed(ForcedCollectionRequest { token, .. }) => { debug!( log, "collection task forced collection \ queue is closed. Attempting to \ notify caller and exiting.", ); - let _ = tok.send(Err(ForcedCollectionError::Closed)); + let _ = token.send(Err(ForcedCollectionError::Closed)); return; } - TrySendError::Full(tok) => { + TrySendError::Full(ForcedCollectionRequest { token, start }) => { error!( log, "collection task forced collection \ @@ -368,7 +439,7 @@ async fn collection_loop( calling `force_collection()` many \ times" ); - if tok + if token .send(Err(ForcedCollectionError::QueueFull)) .is_err() { @@ -379,6 +450,13 @@ async fn collection_loop( closed" ); } + let failure = FailedCollection { + started_at: start.started_at, + time_queued: Duration::ZERO, + time_collecting: Duration::ZERO, + reason: String::from("forced collection queue full"), + }; + details.on_failure(failure); } } } @@ -421,6 +499,8 @@ async fn collection_loop( "interval" => ?new_info.interval, "address" => new_info.address, ); + details.update(&new_info); + stats.update(&new_info); collection_timer = interval(new_info.interval); collection_timer.tick().await; // completes immediately } @@ -442,10 +522,24 @@ async fn collection_loop( ); reply_tx.send(stats.clone()).expect("failed to send statistics"); } + Some(CollectionMessage::Details { reply_tx }) => { + match reply_tx.send(details.clone()) { + Ok(_) => trace!( + log, + "sent producer details reply to oximeter agent", + ), + Err(e) => error!( + log, + "failed to send producer details reply to \ + oximeter agent"; + "error" => ?e, + ), + } + } } } maybe_result = result_rx.recv() => { - let Some((maybe_token, result)) = maybe_result else { + let Some(response) = maybe_result else { error!( log, "channel for receiving results from collection task \ @@ -453,10 +547,31 @@ async fn collection_loop( ); return; }; + let CollectionResponse { + token, + result, + started_at, + time_queued, + time_collecting + } = response; match result { Ok(results) => { stats.collections.datum.increment(); - if outbox.send((maybe_token, results)).await.is_err() { + let n_samples: u64 = results + .iter() + .map(|each| match each { + ProducerResultsItem::Ok(samples) => samples.len() as u64, + _ 
=> 0, + }) + .sum(); + let success = SuccessfulCollection { + started_at, + time_queued, + time_collecting, + n_samples + }; + details.on_success(success); + if outbox.send((token, results)).await.is_err() { error!( log, "failed to send results to outbox, channel is \ @@ -465,7 +580,16 @@ async fn collection_loop( return; } } - Err(reason) => stats.failures_for_reason(reason).datum.increment(), + Err(reason) => { + let failure = FailedCollection { + started_at, + time_queued, + time_collecting, + reason: reason.to_string(), + }; + details.on_failure(failure); + stats.failures_for_reason(reason).datum.increment(); + } } } _ = self_collection_timer.tick() => { @@ -476,7 +600,7 @@ async fn collection_loop( outbox.send((None, stats.sample())).await.unwrap(); } _ = collection_timer.tick() => { - match timer_collection_tx.try_send(()) { + match timer_collection_tx.try_send(CollectionStartTimes::new()) { Ok(_) => { debug!( log, @@ -492,7 +616,14 @@ async fn collection_loop( ); return; } - Err(TrySendError::Full(_)) => { + Err(TrySendError::Full(start)) => { + let failure = FailedCollection { + started_at: start.started_at, + time_queued: Duration::ZERO, + time_collecting: Duration::ZERO, + reason: String::from("collections in progress"), + }; + details.on_failure(failure); error!( log, "timer-based collection request queue is \ @@ -851,6 +982,37 @@ impl OximeterAgent { }) } + /// Fetch details about a producer, if it exists. + pub async fn producer_details( + &self, + id: Uuid, + ) -> Result { + let tasks = self.collection_tasks.lock().await; + let Some((_info, task)) = tasks.get(&id) else { + return Err(Error::NoSuchProducer { id }); + }; + let (reply_tx, rx) = oneshot::channel(); + task.inbox.try_send(CollectionMessage::Details { reply_tx }).map_err( + |_| { + Error::CollectionError( + id, + String::from( + "Failed to send detail request to collection task", + ), + ) + }, + )?; + drop(tasks); + rx.await.map_err(|_| { + Error::CollectionError( + id, + String::from( + "Failed to receive detail response from collection task", + ), + ) + }) + } + /// Register a new producer with this oximeter instance. pub async fn register_producer( &self, @@ -1089,6 +1251,7 @@ async fn refresh_producer_list_task( let mut interval = tokio::time::interval(agent.refresh_interval); interval.set_missed_tick_behavior(tokio::time::MissedTickBehavior::Skip); + info!(agent.log, "starting refresh list task"); loop { interval.tick().await; info!(agent.log, "refreshing list of producers from Nexus"); @@ -1208,6 +1371,7 @@ mod tests { use super::OximeterAgent; use super::ProducerEndpoint; use crate::self_stats::FailureReason; + use chrono::Utc; use dropshot::HttpError; use dropshot::HttpResponseOk; use dropshot::Path; @@ -1562,4 +1726,168 @@ mod tests { ); logctx.cleanup_successful(); } + + #[tokio::test] + async fn verify_producer_details() { + let logctx = test_setup_log("verify_producer_details"); + let log = &logctx.log; + + // Spawn an oximeter collector ... + let collector = OximeterAgent::new_standalone( + Uuid::new_v4(), + SocketAddrV6::new(Ipv6Addr::LOCALHOST, 0, 0, 0), + crate::default_refresh_interval(), + None, + log, + ) + .await + .unwrap(); + + // Spawn the mock server that always reports empty statistics. 
+ let collection_count = Arc::new(AtomicUsize::new(0)); + let server = ServerBuilder::new( + producer_api_mod::api_description::().unwrap(), + collection_count.clone(), + log.new(slog::o!("component" => "dropshot")), + ) + .config(Default::default()) + .start() + .expect("failed to spawn empty dropshot server"); + + // Register the dummy producer. + let endpoint = ProducerEndpoint { + id: Uuid::new_v4(), + kind: ProducerKind::Service, + address: server.local_addr(), + interval: COLLECTION_INTERVAL, + }; + let id = endpoint.id; + let before = Utc::now(); + collector + .register_producer(endpoint) + .await + .expect("failed to register dummy producer"); + + // We don't manipulate time manually here, since this is pretty short + // and we want to assert things about the actual timing in the test + // below. + while collection_count.load(Ordering::SeqCst) < 1 { + tokio::time::sleep(TICK_INTERVAL).await; + } + + // Get details about the producer. + let count = collection_count.load(Ordering::SeqCst) as u64; + let details = collector + .producer_details(id) + .await + .expect("Should be able to get producer details"); + assert_eq!(details.id, id); + assert!(details.registered > before); + assert!(details.updated > before); + assert_eq!(details.registered, details.updated); + assert!( + details.n_collections == count + || details.n_collections == count - 1 + ); + assert_eq!(details.n_failures, 0); + let success = + details.last_success.expect("Should have a successful collection"); + assert!(success.time_queued > Duration::ZERO); + assert!(success.time_collecting > Duration::ZERO); + assert!(success.n_samples == 0); + assert!(details.last_failure.is_none()); + logctx.cleanup_successful(); + } + + #[tokio::test] + async fn test_updated_producer_is_still_collected_from() { + let logctx = + test_setup_log("test_updated_producer_is_still_collected_from"); + let log = &logctx.log; + + // Spawn an oximeter collector ... + let collector = OximeterAgent::new_standalone( + Uuid::new_v4(), + SocketAddrV6::new(Ipv6Addr::LOCALHOST, 0, 0, 0), + crate::default_refresh_interval(), + None, + log, + ) + .await + .unwrap(); + + // Spawn the mock server that always reports empty statistics. + let collection_count = Arc::new(AtomicUsize::new(0)); + let server = ServerBuilder::new( + producer_api_mod::api_description::().unwrap(), + collection_count.clone(), + log.new(slog::o!("component" => "dropshot")), + ) + .config(Default::default()) + .start() + .expect("failed to spawn empty dropshot server"); + + // Register the dummy producer. + let id = Uuid::new_v4(); + let endpoint = ProducerEndpoint { + id, + kind: ProducerKind::Service, + address: server.local_addr(), + interval: COLLECTION_INTERVAL, + }; + collector + .register_producer(endpoint) + .await + .expect("failed to register dummy producer"); + + let details = collector.producer_details(id).await.unwrap(); + println!("{details:#?}"); + + // Ensure we get some collections from it. + tokio::time::pause(); + while collection_count.load(Ordering::SeqCst) < 1 { + tokio::time::advance(TICK_INTERVAL).await; + } + + // Now, drop and recreate the server, and register with the same ID at a + // different address. + let collection_count = Arc::new(AtomicUsize::new(0)); + let server = ServerBuilder::new( + producer_api_mod::api_description::().unwrap(), + collection_count.clone(), + log.new(slog::o!("component" => "dropshot")), + ) + .config(Default::default()) + .start() + .expect("failed to spawn empty dropshot server"); + + // Register the dummy producer. 
+ let endpoint = + ProducerEndpoint { address: server.local_addr(), ..endpoint }; + collector + .register_producer(endpoint) + .await + .expect("failed to register dummy producer a second time"); + + // We should just have one producer. + assert_eq!( + collector.collection_tasks.lock().await.len(), + 1, + "Should only have one producer, it was updated and has the \ + same UUID", + ); + + // We should eventually collect from it again. + let now = Instant::now(); + while now.elapsed() < TEST_WAIT_PERIOD { + tokio::time::advance(TICK_INTERVAL).await; + } + let details = collector.producer_details(id).await.unwrap(); + println!("{details:#?}"); + assert_eq!(details.id, id); + assert_eq!(details.address, server.local_addr()); + assert!(details.n_collections > 0); + assert!(collection_count.load(Ordering::SeqCst) > 0); + logctx.cleanup_successful(); + } } diff --git a/oximeter/collector/src/http_entrypoints.rs b/oximeter/collector/src/http_entrypoints.rs index 1962262453..61777daf2b 100644 --- a/oximeter/collector/src/http_entrypoints.rs +++ b/oximeter/collector/src/http_entrypoints.rs @@ -52,6 +52,19 @@ impl OximeterApi for OximeterApiImpl { .map(HttpResponseOk) } + async fn producer_details( + request_context: RequestContext, + path: dropshot::Path, + ) -> Result, HttpError> { + let agent = request_context.context(); + let producer_id = path.into_inner().producer_id; + agent + .producer_details(producer_id) + .await + .map_err(HttpError::from) + .map(HttpResponseOk) + } + async fn producer_delete( request_context: RequestContext, path: dropshot::Path, diff --git a/oximeter/collector/src/lib.rs b/oximeter/collector/src/lib.rs index cc0ef92c13..3f13eb1382 100644 --- a/oximeter/collector/src/lib.rs +++ b/oximeter/collector/src/lib.rs @@ -65,11 +65,18 @@ pub enum Error { #[error("Error running standalone")] Standalone(#[from] anyhow::Error), + + #[error("No registered producer with id '{id}'")] + NoSuchProducer { id: Uuid }, } impl From for HttpError { fn from(e: Error) -> Self { - HttpError::for_internal_error(e.to_string()) + if let Error::NoSuchProducer { .. } = e { + HttpError::for_not_found(None, e.to_string()) + } else { + HttpError::for_internal_error(e.to_string()) + } } } diff --git a/oximeter/collector/src/self_stats.rs b/oximeter/collector/src/self_stats.rs index 2ab7b201e5..ff8776c031 100644 --- a/oximeter/collector/src/self_stats.rs +++ b/oximeter/collector/src/self_stats.rs @@ -99,6 +99,32 @@ impl CollectionTaskStats { } } + /// Update this information with a new producer endpoint. + /// + /// # Panics + /// + /// This panics if `new_info` refers to a different ID. + pub fn update(&mut self, new_info: &ProducerEndpoint) { + assert_eq!(self.collections.producer_id, new_info.id); + + // Only reset the counters if the new information is actually different. 
+ let new_ip = new_info.address.ip(); + let new_port = new_info.address.port(); + if self.collections.producer_ip == new_ip + && self.collections.producer_port == new_port + { + return; + } + self.collections.producer_ip = new_ip; + self.collections.producer_port = new_port; + self.collections.datum = Cumulative::new(0); + for each in self.failed_collections.values_mut() { + each.producer_ip = new_ip; + each.producer_port = new_port; + each.datum = Cumulative::new(0); + } + } + pub fn failures_for_reason( &mut self, reason: FailureReason, @@ -135,18 +161,64 @@ impl CollectionTaskStats { #[cfg(test)] mod tests { + use super::CollectionTaskStats; use super::FailureReason; + use super::OximeterCollector; use super::StatusCode; + use omicron_common::api::internal::nexus::ProducerEndpoint; + use omicron_common::api::internal::nexus::ProducerKind; + use std::time::Duration; + use uuid::Uuid; #[test] fn test_failure_reason_serialization() { let data = &[ - (FailureReason::Deserialization, "deserialization"), - (FailureReason::Unreachable, "unreachable"), + (FailureReason::Deserialization, FailureReason::DESERIALIZATION), + (FailureReason::Unreachable, FailureReason::UNREACHABLE), + ( + FailureReason::CollectionsInProgress, + FailureReason::COLLECTIONS_IN_PROGRESS, + ), (FailureReason::Other(StatusCode::INTERNAL_SERVER_ERROR), "500"), ]; for (variant, as_str) in data.iter() { assert_eq!(variant.to_string(), *as_str); } } + + #[test] + fn only_reset_counters_if_info_is_different() { + let info = ProducerEndpoint { + id: Uuid::new_v4(), + kind: ProducerKind::Service, + address: "[::1]:12345".parse().unwrap(), + interval: Duration::from_secs(1), + }; + let collector = OximeterCollector { + collector_id: Uuid::new_v4(), + collector_ip: "::1".parse().unwrap(), + collector_port: 12345, + }; + let mut stats = CollectionTaskStats::new(collector, &info); + stats.collections.datum.increment(); + + stats.update(&info); + assert_eq!( + stats.collections.datum.value(), + 1, + "Should not have reset the counter when updating \ + with the same producer endpoint information" + ); + let info = ProducerEndpoint { + address: "[::1]:11111".parse().unwrap(), + ..info + }; + stats.update(&info); + assert_eq!( + stats.collections.datum.value(), + 0, + "Should have reset the counter when updating \ + with different producer endpoint information" + ); + } } From 37c7f18b77985e798895d5e8b1f6143e24447538 Mon Sep 17 00:00:00 2001 From: James MacMahon Date: Wed, 4 Dec 2024 17:13:21 -0500 Subject: [PATCH 08/22] Don't use LookupPath here! (#7197) It won't be able to get deleted Snapshot objects in order to kick off region snapshot replacements. 
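For quick reference, here is a condensed before/after sketch of the lookup change (excerpted from the diff below; the surrounding `DataStore` method is elided, so treat it as illustrative rather than standalone code):

```rust
// Before: LookupPath::fetch() cannot return deleted snapshots, so a
// replacement request referencing a deleted snapshot would fail here.
let (.., db_snapshot) = LookupPath::new(opctx, &self)
    .snapshot_id(request.old_snapshot_id)
    .fetch()
    .await?;

// After: fetch the snapshot record directly; snapshot_get also returns
// deleted snapshots, and a truly missing record becomes an explicit error.
let db_snapshot = match self.snapshot_get(opctx, request.old_snapshot_id).await? {
    Some(db_snapshot) => db_snapshot,
    None => {
        return Err(Error::internal_error(
            "cannot perform region snapshot replacement without snapshot volume",
        ));
    }
};
```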
--- .../datastore/region_snapshot_replacement.rs | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/nexus/db-queries/src/db/datastore/region_snapshot_replacement.rs b/nexus/db-queries/src/db/datastore/region_snapshot_replacement.rs index b970bb8962..4faaf228f9 100644 --- a/nexus/db-queries/src/db/datastore/region_snapshot_replacement.rs +++ b/nexus/db-queries/src/db/datastore/region_snapshot_replacement.rs @@ -11,7 +11,6 @@ use crate::db; use crate::db::datastore::SQL_BATCH_SIZE; use crate::db::error::public_error_from_diesel; use crate::db::error::ErrorHandler; -use crate::db::lookup::LookupPath; use crate::db::model::RegionSnapshot; use crate::db::model::RegionSnapshotReplacement; use crate::db::model::RegionSnapshotReplacementState; @@ -64,10 +63,19 @@ impl DataStore { opctx: &OpContext, request: RegionSnapshotReplacement, ) -> Result<(), Error> { - let (.., db_snapshot) = LookupPath::new(opctx, &self) - .snapshot_id(request.old_snapshot_id) - .fetch() - .await?; + // Note: if `LookupPath` is used here, it will not be able to retrieve + // deleted snapshots + let db_snapshot = match self + .snapshot_get(opctx, request.old_snapshot_id) + .await? + { + Some(db_snapshot) => db_snapshot, + None => { + return Err(Error::internal_error( + "cannot perform region snapshot replacement without snapshot volume" + )); + } + }; self.insert_region_snapshot_replacement_request_with_volume_id( opctx, From 1922c8ecb8a667c9b1efa876f75d72a7437b3c77 Mon Sep 17 00:00:00 2001 From: Alan Hanson Date: Thu, 5 Dec 2024 09:31:37 -0800 Subject: [PATCH 09/22] Update Propolis (#7206) Propolis: Switch viona back to packet copying for now #823 This is a workaround for https://github.com/oxidecomputer/omicron/issues/7189. It turns off new work that we think is causing the slow performance until we can get a better idea of what exactly the problem is and if/how we might want to fix it.
Co-authored-by: Alan Hanson --- Cargo.lock | 18 +++++++++--------- Cargo.toml | 8 ++++---- package-manifest.toml | 4 ++-- 3 files changed, 15 insertions(+), 15 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 98a5975316..f3c69e4d14 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -693,7 +693,7 @@ dependencies = [ [[package]] name = "bhyve_api" version = "0.0.0" -source = "git+https://github.com/oxidecomputer/propolis?rev=19a421dceac7756aef26a8771f258af9cc21fc37#19a421dceac7756aef26a8771f258af9cc21fc37" +source = "git+https://github.com/oxidecomputer/propolis?rev=220a6f367c18f2452dbc4fa9086f3fe73b961739#220a6f367c18f2452dbc4fa9086f3fe73b961739" dependencies = [ "bhyve_api_sys", "libc", @@ -703,7 +703,7 @@ dependencies = [ [[package]] name = "bhyve_api_sys" version = "0.0.0" -source = "git+https://github.com/oxidecomputer/propolis?rev=19a421dceac7756aef26a8771f258af9cc21fc37#19a421dceac7756aef26a8771f258af9cc21fc37" +source = "git+https://github.com/oxidecomputer/propolis?rev=220a6f367c18f2452dbc4fa9086f3fe73b961739#220a6f367c18f2452dbc4fa9086f3fe73b961739" dependencies = [ "libc", "strum", @@ -6980,7 +6980,7 @@ dependencies = [ "pq-sys", "pretty_assertions", "progenitor-client", - "propolis-client 0.1.0 (git+https://github.com/oxidecomputer/propolis?rev=19a421dceac7756aef26a8771f258af9cc21fc37)", + "propolis-client 0.1.0 (git+https://github.com/oxidecomputer/propolis?rev=220a6f367c18f2452dbc4fa9086f3fe73b961739)", "qorb", "rand", "rcgen", @@ -7245,7 +7245,7 @@ dependencies = [ "oximeter-producer", "oxnet", "pretty_assertions", - "propolis-client 0.1.0 (git+https://github.com/oxidecomputer/propolis?rev=19a421dceac7756aef26a8771f258af9cc21fc37)", + "propolis-client 0.1.0 (git+https://github.com/oxidecomputer/propolis?rev=220a6f367c18f2452dbc4fa9086f3fe73b961739)", "propolis-mock-server", "propolis_api_types", "rand", @@ -8959,7 +8959,7 @@ dependencies = [ [[package]] name = "propolis-client" version = "0.1.0" -source = "git+https://github.com/oxidecomputer/propolis?rev=19a421dceac7756aef26a8771f258af9cc21fc37#19a421dceac7756aef26a8771f258af9cc21fc37" +source = "git+https://github.com/oxidecomputer/propolis?rev=220a6f367c18f2452dbc4fa9086f3fe73b961739#220a6f367c18f2452dbc4fa9086f3fe73b961739" dependencies = [ "async-trait", "base64 0.21.7", @@ -9001,7 +9001,7 @@ dependencies = [ [[package]] name = "propolis-mock-server" version = "0.0.0" -source = "git+https://github.com/oxidecomputer/propolis?rev=19a421dceac7756aef26a8771f258af9cc21fc37#19a421dceac7756aef26a8771f258af9cc21fc37" +source = "git+https://github.com/oxidecomputer/propolis?rev=220a6f367c18f2452dbc4fa9086f3fe73b961739#220a6f367c18f2452dbc4fa9086f3fe73b961739" dependencies = [ "anyhow", "atty", @@ -9043,7 +9043,7 @@ dependencies = [ [[package]] name = "propolis_api_types" version = "0.0.0" -source = "git+https://github.com/oxidecomputer/propolis?rev=19a421dceac7756aef26a8771f258af9cc21fc37#19a421dceac7756aef26a8771f258af9cc21fc37" +source = "git+https://github.com/oxidecomputer/propolis?rev=220a6f367c18f2452dbc4fa9086f3fe73b961739#220a6f367c18f2452dbc4fa9086f3fe73b961739" dependencies = [ "crucible-client-types", "propolis_types", @@ -9056,7 +9056,7 @@ dependencies = [ [[package]] name = "propolis_types" version = "0.0.0" -source = "git+https://github.com/oxidecomputer/propolis?rev=19a421dceac7756aef26a8771f258af9cc21fc37#19a421dceac7756aef26a8771f258af9cc21fc37" +source = "git+https://github.com/oxidecomputer/propolis?rev=220a6f367c18f2452dbc4fa9086f3fe73b961739#220a6f367c18f2452dbc4fa9086f3fe73b961739" dependencies = [ 
"schemars", "serde", @@ -10719,7 +10719,7 @@ dependencies = [ "omicron-uuid-kinds", "omicron-workspace-hack", "oxnet", - "propolis-client 0.1.0 (git+https://github.com/oxidecomputer/propolis?rev=19a421dceac7756aef26a8771f258af9cc21fc37)", + "propolis-client 0.1.0 (git+https://github.com/oxidecomputer/propolis?rev=220a6f367c18f2452dbc4fa9086f3fe73b961739)", "rcgen", "schemars", "serde", diff --git a/Cargo.toml b/Cargo.toml index 23799e3c3f..b65617f082 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -539,10 +539,10 @@ prettyplease = { version = "0.2.25", features = ["verbatim"] } proc-macro2 = "1.0" progenitor = "0.8.0" progenitor-client = "0.8.0" -bhyve_api = { git = "https://github.com/oxidecomputer/propolis", rev = "19a421dceac7756aef26a8771f258af9cc21fc37" } -propolis_api_types = { git = "https://github.com/oxidecomputer/propolis", rev = "19a421dceac7756aef26a8771f258af9cc21fc37" } -propolis-client = { git = "https://github.com/oxidecomputer/propolis", rev = "19a421dceac7756aef26a8771f258af9cc21fc37" } -propolis-mock-server = { git = "https://github.com/oxidecomputer/propolis", rev = "19a421dceac7756aef26a8771f258af9cc21fc37" } +bhyve_api = { git = "https://github.com/oxidecomputer/propolis", rev = "220a6f367c18f2452dbc4fa9086f3fe73b961739" } +propolis_api_types = { git = "https://github.com/oxidecomputer/propolis", rev = "220a6f367c18f2452dbc4fa9086f3fe73b961739" } +propolis-client = { git = "https://github.com/oxidecomputer/propolis", rev = "220a6f367c18f2452dbc4fa9086f3fe73b961739" } +propolis-mock-server = { git = "https://github.com/oxidecomputer/propolis", rev = "220a6f367c18f2452dbc4fa9086f3fe73b961739" } proptest = "1.5.0" qorb = "0.2.1" quote = "1.0" diff --git a/package-manifest.toml b/package-manifest.toml index 5d55c9368a..809c1ce6ca 100644 --- a/package-manifest.toml +++ b/package-manifest.toml @@ -621,10 +621,10 @@ service_name = "propolis-server" only_for_targets.image = "standard" source.type = "prebuilt" source.repo = "propolis" -source.commit = "19a421dceac7756aef26a8771f258af9cc21fc37" +source.commit = "220a6f367c18f2452dbc4fa9086f3fe73b961739" # The SHA256 digest is automatically posted to: # https://buildomat.eng.oxide.computer/public/file/oxidecomputer/propolis/image//propolis-server.sha256.txt -source.sha256 = "fbb52fed6312db047a7f56d43162e5d4c5072886a23b5e6a0096f6db78c5d2ba" +source.sha256 = "964bf262677496118f8cea95c257d0a57c76ddca70733217b0666657b53bd6e6" output.type = "zone" [package.mg-ddm-gz] From e73a30ea190d0770cf395a12850d3d6c96b176bf Mon Sep 17 00:00:00 2001 From: John Gallagher Date: Thu, 5 Dec 2024 18:12:49 -0500 Subject: [PATCH 10/22] [reconfigurator] Introduce a combined `SledEditor` inside `BlueprintBuilder` (#7204) This is a step on the road to #7078. The current `BlueprintBuilder` internals would not at all be amenable to squishing the disparate maps in `Blueprint` together, so this PR tries to rework those internals. `BlueprintBuilder` now does its own map squishing (inside `new_based_on()`), combining state+zones+disks+datasets into one helper (`SledEditor`). All modifications are preformed via those editors, then `BlueprintBuilder::build()` breaks the combined editors' results back out into four maps for `Blueprint`. There should be only one functional change in this PR (marked with a `TODO-john`): previously, expunging a zone checked that that zone had not been modified in the current planning cycle. I'm not sure that's really necessary; it felt like some defensiveness born out of the complexity of the builder itself, maybe? 
I think we could put that check back (inside `ZonesEditor`, presumably) if folks think it would be good to keep. My intention was to change the tests as little as possible to ensure I didn't break anything as I was moving functionality around, so `new_based_on()` and `build()` have some arguably-pretty-gross concessions to behave exactly the way the builder did before these changes. I'd like to _remove_ those concessions, but those will be nontrivial behavioral changes that I don't want to try to roll in with all of this cleanup. I think I landed this pretty well; there are only a few expectorate changes (due to the slightly reworked `ZonesEditor` producing a different ordering for a few tests), and none in `omdb` (which is where I've often seen incidental planner changes bubble out). Apologies for the size of the PR. I'd lightly recommend looking at the new `blueprint_editor/` module and its subcomponents first; they should be pretty quickly reviewable, they're _supposed_ to be simple and obvious, and they're largely ported over from the prior storage / disks / datasets / zones editors. I did rework how we're handling #6645 backwards compat to try to reduce how much we need to pass `SledResources` around, so that complicates things in `DatasetsEditor` some, but not too bad IMO. (I also added a test for this, since I changed it and I don't think we had one before.) The `BlueprintBuilder` changes should then be more or less the natural way one would use a collection of `SledEditor`s without changing its API or behavior (yet!). --- Cargo.lock | 1 + common/src/disk.rs | 4 + .../planning/src/blueprint_builder/builder.rs | 975 +++++++++--------- .../builder/datasets_editor.rs | 348 ------- .../blueprint_builder/builder/disks_editor.rs | 195 ---- .../builder/storage_editor.rs | 206 ---- .../planning/src/blueprint_builder/mod.rs | 1 - .../planning/src/blueprint_builder/zones.rs | 527 ---------- .../planning/src/blueprint_editor.rs | 14 + .../src/blueprint_editor/sled_editor.rs | 329 ++++++ .../blueprint_editor/sled_editor/datasets.rs | 398 +++++++ .../src/blueprint_editor/sled_editor/disks.rs | 145 +++ .../src/blueprint_editor/sled_editor/zones.rs | 181 ++++ nexus/reconfigurator/planning/src/example.rs | 4 +- nexus/reconfigurator/planning/src/lib.rs | 1 + nexus/reconfigurator/planning/src/planner.rs | 10 +- nexus/types/Cargo.toml | 1 + nexus/types/src/deployment.rs | 31 + 18 files changed, 1620 insertions(+), 1751 deletions(-) delete mode 100644 nexus/reconfigurator/planning/src/blueprint_builder/builder/datasets_editor.rs delete mode 100644 nexus/reconfigurator/planning/src/blueprint_builder/builder/disks_editor.rs delete mode 100644 nexus/reconfigurator/planning/src/blueprint_builder/builder/storage_editor.rs delete mode 100644 nexus/reconfigurator/planning/src/blueprint_builder/zones.rs create mode 100644 nexus/reconfigurator/planning/src/blueprint_editor.rs create mode 100644 nexus/reconfigurator/planning/src/blueprint_editor/sled_editor.rs create mode 100644 nexus/reconfigurator/planning/src/blueprint_editor/sled_editor/datasets.rs create mode 100644 nexus/reconfigurator/planning/src/blueprint_editor/sled_editor/disks.rs create mode 100644 nexus/reconfigurator/planning/src/blueprint_editor/sled_editor/zones.rs diff --git a/Cargo.lock b/Cargo.lock index f3c69e4d14..f5e44318bb 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -6271,6 +6271,7 @@ dependencies = [ "gateway-client", "http", "humantime", + "illumos-utils", "internal-dns-types", "ipnetwork", "newtype-uuid", diff --git a/common/src/disk.rs
b/common/src/disk.rs index 3500d4dabb..99c2b2db7b 100644 --- a/common/src/disk.rs +++ b/common/src/disk.rs @@ -103,6 +103,10 @@ impl DatasetName { Self { pool_name, kind } } + pub fn into_parts(self) -> (ZpoolName, DatasetKind) { + (self.pool_name, self.kind) + } + pub fn pool(&self) -> &ZpoolName { &self.pool_name } diff --git a/nexus/reconfigurator/planning/src/blueprint_builder/builder.rs b/nexus/reconfigurator/planning/src/blueprint_builder/builder.rs index eb50ab19fd..394133132b 100644 --- a/nexus/reconfigurator/planning/src/blueprint_builder/builder.rs +++ b/nexus/reconfigurator/planning/src/blueprint_builder/builder.rs @@ -4,21 +4,28 @@ //! Low-level facility for generating Blueprints +use crate::blueprint_editor::DatasetIdsBackfillFromDb; +use crate::blueprint_editor::EditedSled; +use crate::blueprint_editor::SledEditError; +use crate::blueprint_editor::SledEditor; use crate::ip_allocator::IpAllocator; use crate::planner::rng::PlannerRng; use crate::planner::zone_needs_expungement; use crate::planner::ZoneExpungeReason; use anyhow::anyhow; +use anyhow::bail; +use anyhow::Context as _; use clickhouse_admin_types::OXIMETER_CLUSTER; -use datasets_editor::BlueprintDatasetsEditError; use ipnet::IpAdd; use nexus_inventory::now_db_precision; use nexus_sled_agent_shared::inventory::OmicronZoneDataset; use nexus_sled_agent_shared::inventory::ZoneKind; use nexus_types::deployment::blueprint_zone_type; use nexus_types::deployment::Blueprint; +use nexus_types::deployment::BlueprintDatasetsConfig; use nexus_types::deployment::BlueprintPhysicalDiskConfig; use nexus_types::deployment::BlueprintPhysicalDiskDisposition; +use nexus_types::deployment::BlueprintPhysicalDisksConfig; use nexus_types::deployment::BlueprintZoneConfig; use nexus_types::deployment::BlueprintZoneDisposition; use nexus_types::deployment::BlueprintZoneFilter; @@ -33,6 +40,7 @@ use nexus_types::deployment::OmicronZoneExternalSnatIp; use nexus_types::deployment::PlanningInput; use nexus_types::deployment::SledDetails; use nexus_types::deployment::SledFilter; +use nexus_types::deployment::SledLookupErrorKind; use nexus_types::deployment::SledResources; use nexus_types::deployment::ZpoolFilter; use nexus_types::deployment::ZpoolName; @@ -62,16 +70,16 @@ use slog::error; use slog::info; use slog::o; use slog::Logger; +use std::collections::btree_map::Entry; use std::collections::BTreeMap; use std::collections::BTreeSet; use std::collections::HashSet; use std::fmt; -use std::mem; +use std::iter; use std::net::IpAddr; use std::net::Ipv6Addr; use std::net::SocketAddr; use std::net::SocketAddrV6; -use storage_editor::BlueprintStorageEditor; use thiserror::Error; use super::clickhouse::ClickhouseAllocator; @@ -80,13 +88,6 @@ use super::external_networking::BuilderExternalNetworking; use super::external_networking::ExternalNetworkingChoice; use super::external_networking::ExternalSnatNetworkingChoice; use super::internal_dns::DnsSubnetAllocator; -use super::zones::is_already_expunged; -use super::zones::BuilderZoneState; -use super::zones::BuilderZonesConfig; - -mod datasets_editor; -mod disks_editor; -mod storage_editor; /// Errors encountered while assembling blueprints #[derive(Debug, Error)] @@ -125,8 +126,12 @@ pub enum Error { TooManyDnsServers, #[error("planner produced too many {kind:?} zones")] TooManyZones { kind: ZoneKind }, - #[error(transparent)] - BlueprintDatasetsEditError(#[from] BlueprintDatasetsEditError), + #[error("error editing sled {sled_id}")] + SledEditError { + sled_id: SledUuid, + #[source] + err: SledEditError, 
+ }, } /// Describes the result of an idempotent "ensure" operation @@ -197,12 +202,12 @@ impl EditCounts { *self != Self::zeroes() } - pub fn accum(self, other: Self) -> Self { + pub fn difference_since(self, other: Self) -> Self { Self { - added: self.added + other.added, - updated: self.updated + other.updated, - expunged: self.expunged + other.expunged, - removed: self.removed + other.removed, + added: self.added - other.added, + updated: self.updated - other.updated, + expunged: self.expunged - other.expunged, + removed: self.removed - other.removed, } } } @@ -223,11 +228,18 @@ pub struct SledEditCounts { } impl SledEditCounts { - fn accum(self, other: Self) -> Self { + fn has_nonzero_counts(&self) -> bool { + let Self { disks, datasets, zones } = self; + disks.has_nonzero_counts() + || datasets.has_nonzero_counts() + || zones.has_nonzero_counts() + } + + fn difference_since(self, other: Self) -> Self { Self { - disks: self.disks.accum(other.disks), - datasets: self.datasets.accum(other.datasets), - zones: self.zones.accum(other.zones), + disks: self.disks.difference_since(other.disks), + datasets: self.datasets.difference_since(other.datasets), + zones: self.zones.difference_since(other.zones), } } } @@ -349,9 +361,7 @@ pub struct BlueprintBuilder<'a> { // These fields will become part of the final blueprint. See the // corresponding fields in `Blueprint`. - pub(super) zones: BlueprintZonesBuilder<'a>, - storage: BlueprintStorageEditor, - sled_state: BTreeMap, + sled_editors: BTreeMap, cockroachdb_setting_preserve_downgrade: CockroachDbPreserveDowngrade, creator: String, @@ -400,6 +410,28 @@ impl<'a> BlueprintBuilder<'a> { (sled_id, config) }) .collect::>(); + let blueprint_disks = blueprint_zones + .keys() + .copied() + .map(|sled_id| { + let config = BlueprintPhysicalDisksConfig { + generation: Generation::new(), + disks: Vec::new(), + }; + (sled_id, config) + }) + .collect(); + let blueprint_datasets = blueprint_zones + .keys() + .copied() + .map(|sled_id| { + let config = BlueprintDatasetsConfig { + generation: Generation::new(), + datasets: BTreeMap::new(), + }; + (sled_id, config) + }) + .collect(); let num_sleds = blueprint_zones.len(); let sled_state = blueprint_zones .keys() @@ -410,8 +442,8 @@ impl<'a> BlueprintBuilder<'a> { Blueprint { id: rng.next_blueprint(), blueprint_zones, - blueprint_disks: BTreeMap::new(), - blueprint_datasets: BTreeMap::new(), + blueprint_disks, + blueprint_datasets, sled_state, parent_blueprint_id: None, internal_dns_version: Generation::new(), @@ -440,30 +472,105 @@ impl<'a> BlueprintBuilder<'a> { "parent_id" => parent_blueprint.id.to_string(), )); - // Prefer the sled state from our parent blueprint for sleds - // that were in it; there may be new sleds in `input`, in which - // case we'll use their current state as our starting point. - let mut sled_state = parent_blueprint.sled_state.clone(); - let mut commissioned_sled_ids = BTreeSet::new(); - for (sled_id, details) in input.all_sleds(SledFilter::Commissioned) { - commissioned_sled_ids.insert(sled_id); - sled_state.entry(sled_id).or_insert(details.state); + // Helper to build a `PreexistingDatasetIds` for a given sled. This will + // go away with https://github.com/oxidecomputer/omicron/issues/6645. 
+ let build_preexisting_dataset_ids = + |sled_id| -> anyhow::Result { + match input.sled_lookup(SledFilter::All, sled_id) { + Ok(details) => { + DatasetIdsBackfillFromDb::build(&details.resources) + .with_context(|| { + format!( + "failed building map of preexisting \ + dataset IDs for sled {sled_id}" + ) + }) + } + Err(err) => match err.kind() { + SledLookupErrorKind::Missing => { + Ok(DatasetIdsBackfillFromDb::empty()) + } + SledLookupErrorKind::Filtered { .. } => unreachable!( + "SledFilter::All should not filter anything out" + ), + }, + } + }; + + // Squish the disparate maps in our parent blueprint into one map of + // `SledEditor`s. + let mut sled_editors = BTreeMap::new(); + for (sled_id, zones) in &parent_blueprint.blueprint_zones { + // Prefer the sled state from our parent blueprint for sleds + // that were in it. + let state = match parent_blueprint.sled_state.get(sled_id).copied() + { + Some(state) => state, + None => { + // If we have zones but no state for a sled, we assume + // it was removed by an earlier version of the planner + // (which pruned decommissioned sleds from + // `sled_state`). Check that all of its zones are + // expunged, which is a prerequisite for + // decommissioning. If any zones aren't, then we don't + // know what to do: the state is missing but we can't + // assume "decommissioned", so fail. + if zones.are_all_zones_expunged() { + SledState::Decommissioned + } else { + bail!( + "sled {sled_id} is missing in parent blueprint \ + sled_state map, but has non-expunged zones" + ); + } + } + }; + + // If we don't have disks/datasets entries, we'll start with an + // empty config and rely on `sled_ensure_{disks,datasets}` calls to + // populate it. It's also possible our parent blueprint removed + // entries because our sled has been expunged, in which case we + // won't do any further editing and what we fill in here is + // irrelevant. + let disks = parent_blueprint + .blueprint_disks + .get(sled_id) + .cloned() + .unwrap_or_else(|| BlueprintPhysicalDisksConfig { + generation: Generation::new(), + disks: Vec::new(), + }); + let datasets = parent_blueprint + .blueprint_datasets + .get(sled_id) + .cloned() + .unwrap_or_else(|| BlueprintDatasetsConfig { + generation: Generation::new(), + datasets: BTreeMap::new(), + }); + let editor = SledEditor::new( + state, + zones.clone(), + disks, + datasets.clone(), + build_preexisting_dataset_ids(*sled_id)?, + ) + .with_context(|| { + format!("failed to construct SledEditor for sled {sled_id}") + })?; + sled_editors.insert(*sled_id, editor); } - // Make a garbage collection pass through `sled_state`. We want to keep - // any sleds which either: - // - // 1. do not have a desired state of `Decommissioned` - // 2. do have a desired state of `Decommissioned` and are still included - // in our input's list of commissioned sleds - // - // Sleds that don't fall into either of these cases have reached the - // actual `Decommissioned` state, which means we no longer need to carry - // forward that desired state. - sled_state.retain(|sled_id, state| { - *state != SledState::Decommissioned - || commissioned_sled_ids.contains(sled_id) - }); + // Add new, empty `SledEditor`s for any commissioned sleds in our input + // that weren't in the parent blueprint. (These are newly-added sleds.) 
+ for sled_id in input.all_sled_ids(SledFilter::Commissioned) { + if let Entry::Vacant(slot) = sled_editors.entry(sled_id) { + slot.insert(SledEditor::new_empty( + SledState::Active, + build_preexisting_dataset_ids(sled_id)?, + )); + } + } Ok(BlueprintBuilder { log, @@ -473,12 +580,7 @@ impl<'a> BlueprintBuilder<'a> { sled_ip_allocators: BTreeMap::new(), external_networking: OnceCell::new(), internal_dns_subnets: OnceCell::new(), - zones: BlueprintZonesBuilder::new(parent_blueprint), - storage: BlueprintStorageEditor::new( - parent_blueprint.blueprint_disks.clone(), - parent_blueprint.blueprint_datasets.clone(), - ), - sled_state, + sled_editors, cockroachdb_setting_preserve_downgrade: parent_blueprint .cockroachdb_setting_preserve_downgrade, creator: creator.to_owned(), @@ -514,12 +616,12 @@ impl<'a> BlueprintBuilder<'a> { )?; BuilderExternalNetworking::new( - self.zones - .current_zones(BlueprintZoneFilter::ShouldBeRunning) - .flat_map(|(_sled_id, zone_config)| zone_config), - self.zones - .current_zones(BlueprintZoneFilter::Expunged) - .flat_map(|(_sled_id, zone_config)| zone_config), + self.sled_editors.values().flat_map(|editor| { + editor.zones(BlueprintZoneFilter::ShouldBeRunning) + }), + self.sled_editors.values().flat_map(|editor| { + editor.zones(BlueprintZoneFilter::Expunged) + }), self.input.service_ip_pool_ranges(), ) }) @@ -534,9 +636,9 @@ impl<'a> BlueprintBuilder<'a> { ) -> Result<&mut DnsSubnetAllocator, Error> { self.internal_dns_subnets.get_or_try_init(|| { DnsSubnetAllocator::new( - self.zones - .current_zones(BlueprintZoneFilter::ShouldBeRunning) - .flat_map(|(_sled_id, zone_config)| zone_config), + self.sled_editors.values().flat_map(|editor| { + editor.zones(BlueprintZoneFilter::ShouldBeRunning) + }), self.input, ) })?; @@ -546,8 +648,8 @@ impl<'a> BlueprintBuilder<'a> { /// Iterates over the list of sled IDs for which we have zones. /// /// This may include decommissioned sleds. - pub fn sled_ids_with_zones(&self) -> impl Iterator { - self.zones.sled_ids_with_zones() + pub fn sled_ids_with_zones(&self) -> impl Iterator + '_ { + self.sled_editors.keys().copied() } pub fn current_sled_zones( @@ -555,20 +657,82 @@ impl<'a> BlueprintBuilder<'a> { sled_id: SledUuid, filter: BlueprintZoneFilter, ) -> impl Iterator { - self.zones.current_sled_zones(sled_id, filter).map(|(config, _)| config) + let Some(editor) = self.sled_editors.get(&sled_id) else { + return Box::new(iter::empty()) + as Box>; + }; + Box::new(editor.zones(filter)) } /// Assemble a final [`Blueprint`] based on the contents of the builder pub fn build(mut self) -> Blueprint { + let blueprint_id = self.rng.next_blueprint(); + // Collect the Omicron zones config for all sleds, including sleds that // are no longer in service and need expungement work. 
- let blueprint_zones = self - .zones - .into_zones_map(self.input.all_sled_ids(SledFilter::Commissioned)); - let (blueprint_disks, blueprint_datasets) = - self.storage.into_blueprint_maps( - self.input.all_sled_ids(SledFilter::InService), - ); + let mut sled_state = BTreeMap::new(); + let mut blueprint_zones = BTreeMap::new(); + let mut blueprint_disks = BTreeMap::new(); + let mut blueprint_datasets = BTreeMap::new(); + for (sled_id, editor) in self.sled_editors { + let EditedSled { zones, disks, datasets, state, edit_counts } = + editor.finalize(); + sled_state.insert(sled_id, state); + blueprint_disks.insert(sled_id, disks); + blueprint_datasets.insert(sled_id, datasets); + blueprint_zones.insert(sled_id, zones); + if edit_counts.has_nonzero_counts() { + debug!( + self.log, "sled modified in new blueprint"; + "sled_id" => %sled_id, + "blueprint_id" => %blueprint_id, + "disk_edits" => ?edit_counts.disks, + "dataset_edits" => ?edit_counts.datasets, + "zone_edits" => ?edit_counts.zones, + ); + } else { + debug!( + self.log, "sled unchanged in new blueprint"; + "sled_id" => %sled_id, + "blueprint_id" => %blueprint_id, + ); + } + } + // Preserving backwards compatibility, for now: prune sled_state of any + // fully decommissioned sleds, which we determine by the state being + // `Decommissioned` _and_ the sled is no longer in our PlanningInput's + // list of commissioned sleds. + let commissioned_sled_ids = self + .input + .all_sled_ids(SledFilter::Commissioned) + .collect::>(); + sled_state.retain(|sled_id, state| { + *state != SledState::Decommissioned + || commissioned_sled_ids.contains(sled_id) + }); + // Preserving backwards compatibility, for now: disks should only + // have entries for in-service sleds, and expunged disks should be + // removed entirely. + let in_service_sled_ids = self + .input + .all_sled_ids(SledFilter::InService) + .collect::>(); + blueprint_disks.retain(|sled_id, disks_config| { + if !in_service_sled_ids.contains(sled_id) { + return false; + } + + disks_config.disks.retain(|config| match config.disposition { + BlueprintPhysicalDiskDisposition::InService => true, + BlueprintPhysicalDiskDisposition::Expunged => false, + }); + + true + }); + // Preserving backwards compatibility, for now: datasets should only + // have entries for in-service sleds. + blueprint_datasets + .retain(|sled_id, _| in_service_sled_ids.contains(sled_id)); // If we have the clickhouse cluster setup enabled via policy and we // don't yet have a `ClickhouseClusterConfiguration`, then we must create @@ -623,11 +787,11 @@ impl<'a> BlueprintBuilder<'a> { } }); Blueprint { - id: self.rng.next_blueprint(), + id: blueprint_id, blueprint_zones, blueprint_disks, blueprint_datasets, - sled_state: self.sled_state, + sled_state, parent_blueprint_id: Some(self.parent_blueprint.id), internal_dns_version: self.input.internal_dns_version(), external_dns_version: self.input.external_dns_version(), @@ -655,8 +819,14 @@ impl<'a> BlueprintBuilder<'a> { &mut self, sled_id: SledUuid, desired_state: SledState, - ) { - self.sled_state.insert(sled_id, desired_state); + ) -> Result<(), Error> { + let editor = self.sled_editors.get_mut(&sled_id).ok_or_else(|| { + Error::Planner(anyhow!( + "tried to set sled state for unknown sled {sled_id}" + )) + })?; + editor.set_state(desired_state); + Ok(()) } /// Within tests, set an RNG for deterministic results. 
@@ -698,12 +868,16 @@ impl<'a> BlueprintBuilder<'a> { "sled_id" => sled_id.to_string(), )); + let editor = self.sled_editors.get_mut(&sled_id).ok_or_else(|| { + Error::Planner(anyhow!( + "tried to expunge zones for unknown sled {sled_id}" + )) + })?; + // Do any zones need to be marked expunged? let mut zones_to_expunge = BTreeMap::new(); - let sled_zones = - self.zones.current_sled_zones(sled_id, BlueprintZoneFilter::All); - for (zone_config, state) in sled_zones { + for zone_config in editor.zones(BlueprintZoneFilter::All) { let zone_id = zone_config.id; let log = log.new(o!( "zone_id" => zone_id.to_string() @@ -715,12 +889,13 @@ impl<'a> BlueprintBuilder<'a> { continue; }; - let is_expunged = - is_already_expunged(zone_config, state).map_err(|error| { - Error::Planner(anyhow!(error).context(format!( - "for sled {sled_id}, error computing zones to expunge" - ))) - })?; + // TODO-john we lost the check for "are we expunging a zone we + // modified in this planner iteration" - do we need that? + let is_expunged = match zone_config.disposition { + BlueprintZoneDisposition::InService + | BlueprintZoneDisposition::Quiesced => false, + BlueprintZoneDisposition::Expunged => true, + }; if !is_expunged { match reason { @@ -778,34 +953,13 @@ impl<'a> BlueprintBuilder<'a> { return Ok(zones_to_expunge); } - let sled_resources = self.sled_resources(sled_id)?; - let mut sled_storage = self.storage.sled_storage_editor( - sled_id, - sled_resources, - &mut self.rng, - )?; - // Now expunge all the zones that need it. - let removed_zones = { - let change = self.zones.change_sled_zones(sled_id); - change - .expunge_zones(zones_to_expunge.keys().cloned().collect()) - .map_err(|error| { - Error::Planner(anyhow!(error).context(format!( - "for sled {sled_id}, error expunging zones" - ))) - })? - }; - - // Also expunge the datasets of all removed zones. - for zone in removed_zones { - sled_storage.expunge_zone_datasets(zone); + for zone_id in zones_to_expunge.keys() { + editor + .expunge_zone(&zone_id) + .map_err(|err| Error::SledEditError { sled_id, err })?; } - // We're done with `sled_storage`; drop it so the borrow checker is okay - // with calling other methods on `self` below. - mem::drop(sled_storage); - // Finally, add comments describing what happened. // // Group the zones by their reason for expungement. @@ -869,12 +1023,17 @@ impl<'a> BlueprintBuilder<'a> { resources: &SledResources, ) -> Result { // These are the disks known to our (last?) 
blueprint - let mut sled_storage = self.storage.sled_storage_editor( - sled_id, - resources, - &mut self.rng, - )?; - let blueprint_disk_ids = sled_storage.disk_ids().collect::>(); + let editor = self.sled_editors.get_mut(&sled_id).ok_or_else(|| { + Error::Planner(anyhow!( + "tried to ensure disks for unknown sled {sled_id}" + )) + })?; + let initial_counts = editor.edit_counts(); + + let blueprint_disk_ids = editor + .disks(DiskFilter::InService) + .map(|config| config.id) + .collect::>(); // These are the in-service disks as we observed them in the database, // during the planning phase @@ -887,42 +1046,28 @@ impl<'a> BlueprintBuilder<'a> { // blueprint for (disk_id, (zpool, disk)) in database_disks { database_disk_ids.insert(disk_id); - sled_storage.ensure_disk(BlueprintPhysicalDiskConfig { - disposition: BlueprintPhysicalDiskDisposition::InService, - identity: disk.disk_identity.clone(), - id: disk_id, - pool_id: *zpool, - }); + editor.ensure_disk( + BlueprintPhysicalDiskConfig { + disposition: BlueprintPhysicalDiskDisposition::InService, + identity: disk.disk_identity.clone(), + id: disk_id, + pool_id: *zpool, + }, + &mut self.rng, + ); } // Remove any disks that appear in the blueprint, but not the database - let mut zones_to_expunge = BTreeSet::new(); for disk_id in blueprint_disk_ids { if !database_disk_ids.contains(&disk_id) { - if let Some(expunged_zpool) = sled_storage.remove_disk(&disk_id) - { - zones_to_expunge.extend( - self.zones - .zones_using_zpool( - sled_id, - BlueprintZoneFilter::ShouldBeRunning, - &expunged_zpool, - ) - .map(|zone| zone.id), - ); - } + editor + .expunge_disk(&disk_id) + .map_err(|err| Error::SledEditError { sled_id, err })?; } } - let mut edit_counts: SledEditCounts = sled_storage.finalize().into(); + let final_counts = editor.edit_counts(); - // Expunging a zpool necessarily requires also expunging any zones that - // depended on it. - for zone_id in zones_to_expunge { - edit_counts = - edit_counts.accum(self.sled_expunge_zone(sled_id, zone_id)?); - } - - Ok(edit_counts) + Ok(final_counts.difference_since(initial_counts)) } /// Ensure that a sled in the blueprint has all the datasets it needs for @@ -942,31 +1087,32 @@ impl<'a> BlueprintBuilder<'a> { pub fn sled_ensure_zone_datasets( &mut self, sled_id: SledUuid, - resources: &SledResources, ) -> Result { - let mut sled_storage = self.storage.sled_storage_editor( - sled_id, - resources, - &mut self.rng, - )?; + let editor = self.sled_editors.get_mut(&sled_id).ok_or_else(|| { + Error::Planner(anyhow!( + "tried to ensure zone datasets for unknown sled {sled_id}" + )) + })?; - // Ensure that datasets needed for zones exist. 
- for (zone, _zone_state) in self - .zones - .current_sled_zones(sled_id, BlueprintZoneFilter::ShouldBeRunning) - { - sled_storage.ensure_zone_datasets(zone); - } + let initial_counts = editor.edit_counts(); + editor + .ensure_datasets_for_running_zones(&mut self.rng) + .map_err(|err| Error::SledEditError { sled_id, err })?; + let final_counts = editor.edit_counts(); - let StorageEditCounts { disks: disk_edits, datasets: dataset_edits } = - sled_storage.finalize(); + let SledEditCounts { disks, datasets, zones } = + final_counts.difference_since(initial_counts); debug_assert_eq!( - disk_edits, + disks, EditCounts::zeroes(), - "we only edited datasets, not disks" + "we only edited datasets" ); - - Ok(dataset_edits.into()) + debug_assert_eq!( + zones, + EditCounts::zeroes(), + "we only edited datasets" + ); + Ok(datasets.into()) } fn next_internal_dns_gz_address_index(&self, sled_id: SledUuid) -> u32 { @@ -1077,10 +1223,16 @@ impl<'a> BlueprintBuilder<'a> { sled_id: SledUuid, ) -> Result { // If there's already an NTP zone on this sled, do nothing. - let has_ntp = self - .zones - .current_sled_zones(sled_id, BlueprintZoneFilter::ShouldBeRunning) - .any(|(z, _)| z.zone_type.is_ntp()); + let has_ntp = { + let editor = self.sled_editors.get(&sled_id).ok_or_else(|| { + Error::Planner(anyhow!( + "tried to ensure NTP zone for unknown sled {sled_id}" + )) + })?; + editor + .zones(BlueprintZoneFilter::ShouldBeRunning) + .any(|z| z.zone_type.is_ntp()) + }; if has_ntp { return Ok(Ensure::NotNeeded); } @@ -1114,10 +1266,13 @@ impl<'a> BlueprintBuilder<'a> { let pool_name = ZpoolName::new_external(zpool_id); // If this sled already has a Crucible zone on this pool, do nothing. - let has_crucible_on_this_pool = self - .zones - .current_sled_zones(sled_id, BlueprintZoneFilter::ShouldBeRunning) - .any(|(z, _)| { + let has_crucible_on_this_pool = { + let editor = self.sled_editors.get(&sled_id).ok_or_else(|| { + Error::Planner(anyhow!( + "tried to ensure crucible zone for unknown sled {sled_id}" + )) + })?; + editor.zones(BlueprintZoneFilter::ShouldBeRunning).any(|z| { matches!( &z.zone_type, BlueprintZoneType::Crucible(blueprint_zone_type::Crucible { @@ -1126,7 +1281,8 @@ impl<'a> BlueprintBuilder<'a> { }) if dataset.pool_name == pool_name ) - }); + }) + }; if has_crucible_on_this_pool { return Ok(Ensure::NotNeeded); } @@ -1172,9 +1328,12 @@ impl<'a> BlueprintBuilder<'a> { sled_id: SledUuid, kind: ZoneKind, ) -> usize { - self.zones - .current_sled_zones(sled_id, BlueprintZoneFilter::ShouldBeRunning) - .filter(|(z, _)| z.zone_type.kind() == kind) + let Some(editor) = self.sled_editors.get(&sled_id) else { + return 0; + }; + editor + .zones(BlueprintZoneFilter::ShouldBeRunning) + .filter(|z| z.zone_type.kind() == kind) .count() } @@ -1461,20 +1620,18 @@ impl<'a> BlueprintBuilder<'a> { dns_servers: Vec, domain: Option, ) -> Result<(), Error> { - // Check the sled id and return an appropriate error if it's invalid. - let _ = self.sled_resources(sled_id)?; - - let sled_zones = self.zones.change_sled_zones(sled_id); + let editor = self.sled_editors.get_mut(&sled_id).ok_or_else(|| { + Error::Planner(anyhow!( + "tried to promote NTP zone on unknown sled {sled_id}" + )) + })?; // Find the internal NTP zone and expunge it. 
- let mut internal_ntp_zone_id_iter = sled_zones - .iter_zones(BlueprintZoneFilter::ShouldBeRunning) - .filter_map(|config| { - if matches!( - config.zone().zone_type, - BlueprintZoneType::InternalNtp(_) - ) { - Some(config.zone().id) + let mut internal_ntp_zone_id_iter = editor + .zones(BlueprintZoneFilter::ShouldBeRunning) + .filter_map(|zone| { + if matches!(zone.zone_type, BlueprintZoneType::InternalNtp(_)) { + Some(zone.id) } else { None } @@ -1496,7 +1653,7 @@ impl<'a> BlueprintBuilder<'a> { std::mem::drop(internal_ntp_zone_id_iter); // Expunge the internal NTP zone. - sled_zones.expunge_zone(internal_ntp_zone_id).map_err(|error| { + editor.expunge_zone(&internal_ntp_zone_id).map_err(|error| { Error::Planner(anyhow!(error).context(format!( "error expunging internal NTP zone from sled {sled_id}" ))) @@ -1559,31 +1716,18 @@ impl<'a> BlueprintBuilder<'a> { sled_id: SledUuid, zone_id: OmicronZoneUuid, ) -> Result { - let sled_resources = self.sled_resources(sled_id)?; - - let sled_zones = self.zones.change_sled_zones(sled_id); - let (builder_config, did_expunge) = - sled_zones.expunge_zone(zone_id).map_err(|error| { - Error::Planner( - anyhow!(error) - .context("failed to expunge zone from sled {sled_id}"), - ) - })?; - let zone_config = builder_config.zone(); - - let mut storage = self.storage.sled_storage_editor( - sled_id, - sled_resources, - &mut self.rng, - )?; - storage.expunge_zone_datasets(zone_config); - - let mut edit_counts: SledEditCounts = storage.finalize().into(); - if did_expunge { - edit_counts.zones.expunged += 1; - } + let editor = self.sled_editors.get_mut(&sled_id).ok_or_else(|| { + Error::Planner(anyhow!( + "tried to expunge zone on unknown sled {sled_id}" + )) + })?; + let initial_counts = editor.edit_counts(); + editor + .expunge_zone(&zone_id) + .map_err(|err| Error::SledEditError { sled_id, err })?; + let final_counts = editor.edit_counts(); - Ok(edit_counts) + Ok(final_counts.difference_since(initial_counts)) } fn sled_add_zone( @@ -1591,30 +1735,25 @@ impl<'a> BlueprintBuilder<'a> { sled_id: SledUuid, zone: BlueprintZoneConfig, ) -> Result<(), Error> { - // Check the sled id and return an appropriate error if it's invalid. - let sled_resources = self.sled_resources(sled_id)?; - let mut sled_storage = self.storage.sled_storage_editor( - sled_id, - sled_resources, - &mut self.rng, - )?; - sled_storage.ensure_zone_datasets(&zone); - - let sled_zones = self.zones.change_sled_zones(sled_id); - sled_zones.add_zone(zone).map_err(|error| { - Error::Planner( - anyhow!(error) - .context(format!("error adding zone to sled {sled_id}")), - ) + let editor = self.sled_editors.get_mut(&sled_id).ok_or_else(|| { + Error::Planner(anyhow!( + "tried to add zone on unknown sled {sled_id}" + )) })?; - - Ok(()) + editor + .add_zone(zone, &mut self.rng) + .map_err(|err| Error::SledEditError { sled_id, err }) } /// Returns a newly-allocated underlay address suitable for use by Omicron /// zones fn sled_alloc_ip(&mut self, sled_id: SledUuid) -> Result { let sled_subnet = self.sled_resources(sled_id)?.subnet; + let editor = self.sled_editors.get(&sled_id).ok_or_else(|| { + Error::Planner(anyhow!( + "tried to allocate underlay IP for unknown sled {sled_id}" + )) + })?; let allocator = self.sled_ip_allocators.entry(sled_id).or_insert_with(|| { let sled_subnet_addr = sled_subnet.net().prefix(); @@ -1640,10 +1779,7 @@ impl<'a> BlueprintBuilder<'a> { // Record each of the sled's zones' underlay IPs as // allocated. 
- for (z, _) in self - .zones - .current_sled_zones(sled_id, BlueprintZoneFilter::All) - { + for z in editor.zones(BlueprintZoneFilter::All) { allocator.reserve(z.underlay_ip()); } @@ -1653,15 +1789,6 @@ impl<'a> BlueprintBuilder<'a> { allocator.alloc().ok_or(Error::OutOfAddresses { sled_id }) } - #[cfg(test)] - pub(crate) fn sled_select_zpool_for_tests( - &self, - sled_id: SledUuid, - zone_kind: ZoneKind, - ) -> Result { - self.sled_select_zpool(sled_id, zone_kind) - } - /// Selects a zpool for this zone type. /// /// This zpool may be used for either durable storage or transient @@ -1674,14 +1801,17 @@ impl<'a> BlueprintBuilder<'a> { sled_id: SledUuid, zone_kind: ZoneKind, ) -> Result { + let editor = self.sled_editors.get(&sled_id).ok_or_else(|| { + Error::Planner(anyhow!( + "tried to select zpool for unknown sled {sled_id}" + )) + })?; + // We'll check both the disks available to this sled per our current // blueprint and the list of all in-service zpools on this sled per our // planning input, and only pick zpools that are available in both. - let current_sled_disks = self - .storage - .current_sled_disks(&sled_id) - .ok_or(Error::NoAvailableZpool { sled_id, kind: zone_kind })? - .values() + let current_sled_disks = editor + .disks(DiskFilter::InService) .map(|disk_config| disk_config.pool_id) .collect::>(); @@ -1758,157 +1888,6 @@ impl<'a> BlueprintBuilder<'a> { } } -/// Helper for working with sets of zones on each sled -/// -/// Tracking the set of zones is slightly non-trivial because we need to bump -/// the per-sled generation number iff the zones are changed. So we need to -/// keep track of whether we've changed the zones relative to the parent -/// blueprint. We do this by keeping a copy of any [`BlueprintZonesConfig`] -/// that we've changed and a _reference_ to the parent blueprint's zones. This -/// struct makes it easy for callers iterate over the right set of zones. -pub(super) struct BlueprintZonesBuilder<'a> { - changed_zones: BTreeMap, - parent_zones: &'a BTreeMap, -} - -impl<'a> BlueprintZonesBuilder<'a> { - pub fn new(parent_blueprint: &'a Blueprint) -> BlueprintZonesBuilder { - BlueprintZonesBuilder { - changed_zones: BTreeMap::new(), - parent_zones: &parent_blueprint.blueprint_zones, - } - } - - /// Returns a mutable reference to a sled's Omicron zones *because* we're - /// going to change them. - /// - /// This updates internal data structures, and it is recommended that it be - /// called only when the caller actually wishes to make changes to zones. - /// But making no changes after calling this does not result in a changed - /// blueprint. (In particular, the generation number is only updated if - /// the state of any zones was updated.) - pub fn change_sled_zones( - &mut self, - sled_id: SledUuid, - ) -> &mut BuilderZonesConfig { - self.changed_zones.entry(sled_id).or_insert_with(|| { - if let Some(old_sled_zones) = self.parent_zones.get(&sled_id) { - BuilderZonesConfig::from_parent(old_sled_zones) - } else { - BuilderZonesConfig::new() - } - }) - } - - /// Iterates over the list of sled IDs for which we have zones. - /// - /// This may include decommissioned sleds. - pub fn sled_ids_with_zones(&self) -> impl Iterator { - let mut sled_ids = - self.changed_zones.keys().copied().collect::>(); - for &sled_id in self.parent_zones.keys() { - sled_ids.insert(sled_id); - } - sled_ids.into_iter() - } - - /// Iterates over the list of `current_sled_zones` for all sled IDs for - /// which we have zones. - /// - /// This may include decommissioned sleds. 
- pub fn current_zones( - &self, - filter: BlueprintZoneFilter, - ) -> impl Iterator)> { - let sled_ids = self.sled_ids_with_zones(); - sled_ids.map(move |sled_id| { - let zones = self - .current_sled_zones(sled_id, filter) - .map(|(zone_config, _)| zone_config) - .collect(); - (sled_id, zones) - }) - } - - /// Iterates over the list of Omicron zones currently configured for this - /// sled in the blueprint that's being built, along with each zone's state - /// in the builder. - pub fn current_sled_zones( - &self, - sled_id: SledUuid, - filter: BlueprintZoneFilter, - ) -> Box + '_> - { - if let Some(sled_zones) = self.changed_zones.get(&sled_id) { - Box::new( - sled_zones.iter_zones(filter).map(|z| (z.zone(), z.state())), - ) - } else if let Some(parent_zones) = self.parent_zones.get(&sled_id) { - Box::new(parent_zones.zones.iter().filter_map(move |z| { - if z.disposition.matches(filter) { - Some((z, BuilderZoneState::Unchanged)) - } else { - None - } - })) - } else { - Box::new(std::iter::empty()) - } - } - - /// Builds a set of all zones whose filesystem or durable dataset reside on - /// the given `zpool`. - pub fn zones_using_zpool<'b>( - &'b self, - sled_id: SledUuid, - filter: BlueprintZoneFilter, - zpool: &'b ZpoolName, - ) -> impl Iterator + 'b { - self.current_sled_zones(sled_id, filter).filter_map( - move |(config, _state)| { - if Some(zpool) == config.filesystem_pool.as_ref() - || Some(zpool) == config.zone_type.durable_zpool() - { - Some(config) - } else { - None - } - }, - ) - } - - /// Produces an owned map of zones for the sleds recorded in this blueprint - /// plus any newly-added sleds - pub fn into_zones_map( - self, - added_sled_ids: impl Iterator, - ) -> BTreeMap { - // Start with self.changed_zones, which contains entries for any - // sled whose zones config is changing in this blueprint. - let mut zones = self - .changed_zones - .into_iter() - .map(|(sled_id, zones)| (sled_id, zones.build())) - .collect::>(); - - // Carry forward any zones from our parent blueprint. This may include - // zones for decommissioned sleds. - for (sled_id, parent_zones) in self.parent_zones { - zones.entry(*sled_id).or_insert_with(|| parent_zones.clone()); - } - - // Finally, insert any newly-added sleds. - for sled_id in added_sled_ids { - zones.entry(sled_id).or_insert_with(|| BlueprintZonesConfig { - generation: Generation::new(), - zones: vec![], - }); - } - - zones - } -} - #[cfg(test)] pub mod test { use super::*; @@ -2035,9 +2014,13 @@ pub mod test { } } - // All commissioned disks should have debug and zone root datasets. + // All disks should have debug and zone root datasets. 
for (sled_id, disk_config) in &blueprint.blueprint_disks { for disk in &disk_config.disks { + eprintln!( + "checking datasets for sled {sled_id} disk {}", + disk.id + ); let zpool = ZpoolName::new_external(disk.pool_id); let datasets = datasets_for_sled(&blueprint, *sled_id); @@ -2074,10 +2057,8 @@ pub mod test { } let datasets = datasets_for_sled(&blueprint, sled_id); - let zpool = zone_config.filesystem_pool.as_ref().unwrap(); - let kind = DatasetKind::TransientZone { - name: storage_editor::zone_name(&zone_config), - }; + let (zpool, kind) = + zone_config.filesystem_dataset().unwrap().into_parts(); let dataset = find_dataset(&datasets, &zpool, kind); assert_eq!( dataset.disposition, @@ -2256,9 +2237,7 @@ pub mod test { for pool_id in new_sled_resources.zpools.keys() { builder.sled_ensure_zone_crucible(new_sled_id, *pool_id).unwrap(); } - builder - .sled_ensure_zone_datasets(new_sled_id, new_sled_resources) - .unwrap(); + builder.sled_ensure_zone_datasets(new_sled_id).unwrap(); let blueprint3 = builder.build(); verify_blueprint(&blueprint3); @@ -2381,7 +2360,7 @@ pub mod test { // Generate a new blueprint. This sled should still be included: even // though the desired state is decommissioned, the current state is // still active, so we should carry it forward. - let blueprint2 = BlueprintBuilder::new_based_on( + let mut blueprint2 = BlueprintBuilder::new_based_on( &logctx.log, &blueprint1, &input, @@ -2399,11 +2378,21 @@ pub mod test { ); // Change the input to mark the sled decommissioned. (Normally realizing - // blueprint2 would make this change.) + // blueprint2 would make this change.) We must also mark all its zones + // expunged to avoid tripping over an invalid state check in + // `new_based_on()`. let mut builder = input.into_builder(); builder.sleds_mut().get_mut(&decommision_sled_id).unwrap().state = SledState::Decommissioned; let input = builder.build(); + for z in &mut blueprint2 + .blueprint_zones + .get_mut(&decommision_sled_id) + .unwrap() + .zones + { + z.disposition = BlueprintZoneDisposition::Expunged; + } // Generate a new blueprint. This desired sled state should no longer be // present: it has reached the terminal decommissioned state, so there's @@ -2468,9 +2457,11 @@ pub mod test { // not have any disks in them. for sled_id in input.all_sled_ids(SledFilter::InService) { let disks = builder - .storage - .current_sled_disks(&sled_id) - .expect("found disks config for sled"); + .sled_editors + .get(&sled_id) + .unwrap() + .disks(DiskFilter::All) + .collect::>(); assert!( disks.is_empty(), "expected empty disks for sled {sled_id}, got {disks:?}" @@ -2505,19 +2496,14 @@ pub mod test { ); } - let new_disks = builder - .storage - .into_blueprint_maps(input.all_sled_ids(SledFilter::InService)) - .0; // We should have disks and a generation bump for every sled. let parent_disk_gens = parent .blueprint_disks .iter() .map(|(&sled_id, config)| (sled_id, config.generation)); for (sled_id, parent_gen) in parent_disk_gens { - let new_sled_disks = new_disks - .get(&sled_id) - .expect("found child entry for sled present in parent"); + let EditedSled { disks: new_sled_disks, .. } = + builder.sled_editors.remove(&sled_id).unwrap().finalize(); assert_eq!(new_sled_disks.generation, parent_gen.next()); assert_eq!( new_sled_disks.disks.len(), @@ -2577,11 +2563,8 @@ pub mod test { // Before we make any modifications, there should be no work to do. // // If we haven't changed inputs, the output should be the same! 
- for (sled_id, resources) in - input.all_sled_resources(SledFilter::Commissioned) - { - let r = - builder.sled_ensure_zone_datasets(sled_id, resources).unwrap(); + for sled_id in input.all_sled_ids(SledFilter::Commissioned) { + let r = builder.sled_ensure_zone_datasets(sled_id).unwrap(); assert_eq!(r, EnsureMultiple::NotNeeded); } @@ -2591,48 +2574,32 @@ pub mod test { .all_sled_ids(SledFilter::Commissioned) .next() .expect("at least one sled present"); - let sled_details = - input.sled_lookup(SledFilter::Commissioned, sled_id).unwrap(); - let crucible_zone_id = builder - .zones - .current_sled_zones(sled_id, BlueprintZoneFilter::ShouldBeRunning) - .find_map(|(zone_config, _)| { + let editor = + builder.sled_editors.get_mut(&sled_id).expect("found sled"); + let crucible_zone_id = editor + .zones(BlueprintZoneFilter::ShouldBeRunning) + .find_map(|zone_config| { if zone_config.zone_type.is_crucible() { return Some(zone_config.id); } None }) .expect("at least one crucible must be present"); - let change = builder.zones.change_sled_zones(sled_id); println!("Expunging crucible zone: {crucible_zone_id}"); - let expunged_zones = - change.expunge_zones(BTreeSet::from([crucible_zone_id])).unwrap(); - assert_eq!(expunged_zones.len(), 1); + + let initial_counts = editor.edit_counts(); + editor.expunge_zone(&crucible_zone_id).expect("expunged crucible"); + let changed_counts = + editor.edit_counts().difference_since(initial_counts); // In the case of Crucible, we have a durable dataset and a transient // zone filesystem, so we expect two datasets to be expunged. - let r = builder - .storage - .sled_storage_editor( - sled_id, - &sled_details.resources, - &mut builder.rng, - ) - .unwrap() - .expunge_zone_datasets(&expunged_zones[0]); assert_eq!( - r, - EnsureMultiple::Changed { - added: 0, - updated: 0, - expunged: 2, - removed: 0 - } + changed_counts.datasets, + EditCounts { added: 0, updated: 0, expunged: 2, removed: 0 } ); // Once the datasets are expunged, no further changes will be proposed. - let r = builder - .sled_ensure_zone_datasets(sled_id, &sled_details.resources) - .unwrap(); + let r = builder.sled_ensure_zone_datasets(sled_id).unwrap(); assert_eq!(r, EnsureMultiple::NotNeeded); let blueprint = builder.build(); @@ -2649,9 +2616,7 @@ pub mod test { // While the datasets still exist in the input (effectively, the db) we // cannot remove them. - let r = builder - .sled_ensure_zone_datasets(sled_id, &sled_details.resources) - .unwrap(); + let r = builder.sled_ensure_zone_datasets(sled_id).unwrap(); assert_eq!(r, EnsureMultiple::NotNeeded); let blueprint = builder.build(); @@ -2703,11 +2668,7 @@ pub mod test { // Now, we should see the datasets "removed" from the blueprint, since // we no longer need to keep around records of their expungement. 
- let sled_details = - input.sled_lookup(SledFilter::Commissioned, sled_id).unwrap(); - let r = builder - .sled_ensure_zone_datasets(sled_id, &sled_details.resources) - .unwrap(); + let r = builder.sled_ensure_zone_datasets(sled_id).unwrap(); // TODO(https://github.com/oxidecomputer/omicron/issues/6646): // Because of the workaround for #6646, we don't actually remove @@ -2950,9 +2911,7 @@ pub mod test { .sled_add_zone_cockroachdb(target_sled_id) .expect("added CRDB zone"); } - builder - .sled_ensure_zone_datasets(target_sled_id, sled_resources) - .unwrap(); + builder.sled_ensure_zone_datasets(target_sled_id).unwrap(); let blueprint = builder.build(); verify_blueprint(&blueprint); @@ -3003,4 +2962,92 @@ pub mod test { logctx.cleanup_successful(); } + + // This test can go away with + // https://github.com/oxidecomputer/omicron/issues/6645; for now, it + // confirms we maintain the compatibility layer it needs. + #[test] + fn test_backcompat_reuse_existing_database_dataset_ids() { + static TEST_NAME: &str = + "backcompat_reuse_existing_database_dataset_ids"; + let logctx = test_setup_log(TEST_NAME); + + // Start with the standard example blueprint. + let (collection, input, mut parent) = example(&logctx.log, TEST_NAME); + + // `parent` was not created prior to the addition of disks and datasets, + // so it should have datasets for all the disks and zones, and the + // dataset IDs should match the input. + let mut input_dataset_ids = BTreeMap::new(); + let mut input_ndatasets = 0; + for (_, resources) in input.all_sled_resources(SledFilter::All) { + for (zpool_id, dataset_configs) in + resources.all_datasets(ZpoolFilter::All) + { + for dataset in dataset_configs { + let id = dataset.id; + let kind = dataset.name.dataset(); + let by_kind: &mut BTreeMap<_, _> = + input_dataset_ids.entry(*zpool_id).or_default(); + let prev = by_kind.insert(kind.clone(), id); + input_ndatasets += 1; + assert!(prev.is_none()); + } + } + } + // We should have 3 datasets per disk (debug + zone root + crucible), + // plus some number of datasets for discretionary zones. We'll just + // check that we have more than 3 per disk. + assert!( + input_ndatasets + > 3 * usize::from(SledBuilder::DEFAULT_NPOOLS) + * ExampleSystemBuilder::DEFAULT_N_SLEDS, + "too few datasets: {input_ndatasets}" + ); + + // Now _remove_ the blueprint datasets entirely, to emulate a + // pre-dataset-addition blueprint. + parent.blueprint_datasets = BTreeMap::new(); + + // Build a new blueprint. + let mut builder = BlueprintBuilder::new_based_on( + &logctx.log, + &parent, + &input, + &collection, + TEST_NAME, + ) + .expect("failed to create builder"); + + // Ensure disks and datasets. This should repopulate the datasets. + for (sled_id, resources) in input.all_sled_resources(SledFilter::All) { + builder + .sled_ensure_disks(sled_id, resources) + .expect("ensured disks"); + builder + .sled_ensure_zone_datasets(sled_id) + .expect("ensured zone datasets"); + } + let output = builder.build(); + + // Repeat the logic above on our new blueprint; it should have the same + // number of datasets, and they should all have identical IDs. 
+ let mut output_dataset_ids = BTreeMap::new(); + let mut output_ndatasets = 0; + for datasets in output.blueprint_datasets.values() { + for (id, dataset) in &datasets.datasets { + let zpool_id = dataset.pool.id(); + let kind = dataset.kind.clone(); + let by_kind: &mut BTreeMap<_, _> = + output_dataset_ids.entry(zpool_id).or_default(); + let prev = by_kind.insert(kind, *id); + output_ndatasets += 1; + assert!(prev.is_none()); + } + } + assert_eq!(input_ndatasets, output_ndatasets); + assert_eq!(input_dataset_ids, output_dataset_ids); + + logctx.cleanup_successful(); + } } diff --git a/nexus/reconfigurator/planning/src/blueprint_builder/builder/datasets_editor.rs b/nexus/reconfigurator/planning/src/blueprint_builder/builder/datasets_editor.rs deleted file mode 100644 index 160b841a88..0000000000 --- a/nexus/reconfigurator/planning/src/blueprint_builder/builder/datasets_editor.rs +++ /dev/null @@ -1,348 +0,0 @@ -// This Source Code Form is subject to the terms of the Mozilla Public -// License, v. 2.0. If a copy of the MPL was not distributed with this -// file, You can obtain one at https://mozilla.org/MPL/2.0/. - -//! Helper for editing the datasets of a Blueprint - -use super::EditCounts; -use crate::planner::PlannerRng; -use illumos_utils::zpool::ZpoolName; -use nexus_types::deployment::BlueprintDatasetConfig; -use nexus_types::deployment::BlueprintDatasetDisposition; -use nexus_types::deployment::BlueprintDatasetsConfig; -use nexus_types::deployment::SledResources; -use nexus_types::deployment::ZpoolFilter; -use omicron_common::api::external::ByteCount; -use omicron_common::api::external::Generation; -use omicron_common::disk::CompressionAlgorithm; -use omicron_common::disk::DatasetKind; -use omicron_common::disk::DatasetName; -use omicron_common::disk::GzipLevel; -use omicron_uuid_kinds::DatasetUuid; -use omicron_uuid_kinds::SledUuid; -use omicron_uuid_kinds::ZpoolUuid; -use std::collections::btree_map::Entry; -use std::collections::BTreeMap; -use std::collections::BTreeSet; -use std::net::SocketAddrV6; - -#[derive(Debug, thiserror::Error)] -pub enum BlueprintDatasetsEditError { - #[error( - "{data_source} inconsistency: multiple datasets with kind {kind:?} \ - on zpool {zpool_id}: {id1}, {id2}" - )] - MultipleDatasetsOfKind { - data_source: &'static str, - zpool_id: ZpoolUuid, - kind: DatasetKind, - id1: DatasetUuid, - id2: DatasetUuid, - }, -} - -/// Helper for working with sets of datasets on each sled -/// -/// Tracking the set of datasets is slightly non-trivial because we need to -/// bump the per-sled generation number iff the datasets are changed. So -/// we need to keep track of whether we've changed the datasets relative -/// to the parent blueprint. -#[derive(Debug)] -pub(super) struct BlueprintDatasetsEditor { - current: BTreeMap, - changed: BTreeSet, -} - -impl BlueprintDatasetsEditor { - pub fn new(current: BTreeMap) -> Self { - Self { current, changed: BTreeSet::new() } - } - - /// Get a helper to edit the datasets of a specific sled. - /// - /// If any changes are made via the returned editor, the sled will be - /// recorded as needing a generation bump in its dataset config when the - /// editor is dropped. - pub fn sled_datasets_editor<'a>( - &'a mut self, - sled_id: SledUuid, - sled_resources: &SledResources, - rng: &'a mut PlannerRng, - ) -> Result, BlueprintDatasetsEditError> { - let config = self - .current - .entry(sled_id) - .or_insert_with(empty_blueprint_datasets_config); - - // Gather all dataset IDs known to the database. 
- // - // See the comment below where this is used; this is a - // backwards-compatibility layer for - // https://github.com/oxidecomputer/omicron/issues/6645. - let database_dataset_ids = build_dataset_kind_id_map( - "database", - sled_resources.all_datasets(ZpoolFilter::InService).flat_map( - |(&zpool_id, configs)| { - configs.iter().map(move |config| { - (zpool_id, config.name.dataset().clone(), config.id) - }) - }, - ), - )?; - - SledDatasetsEditor::new( - rng, - database_dataset_ids, - sled_id, - config, - &mut self.changed, - ) - } - - pub fn build( - mut self, - sled_ids: impl Iterator, - ) -> BTreeMap { - sled_ids - .map(|sled_id| { - let config = match self.current.remove(&sled_id) { - Some(mut config) => { - // Bump generation number for any sled whose - // DatasetsConfig changed - if self.changed.contains(&sled_id) { - config.generation = config.generation.next(); - } - config - } - None => empty_blueprint_datasets_config(), - }; - (sled_id, config) - }) - .collect() - } -} - -#[derive(Debug)] -pub(super) struct SledDatasetsEditor<'a> { - rng: &'a mut PlannerRng, - blueprint_dataset_ids: - BTreeMap>, - database_dataset_ids: - BTreeMap>, - config: &'a mut BlueprintDatasetsConfig, - counts: EditCounts, - sled_id: SledUuid, - parent_changed_set: &'a mut BTreeSet, -} - -impl Drop for SledDatasetsEditor<'_> { - fn drop(&mut self) { - if self.counts.has_nonzero_counts() { - self.parent_changed_set.insert(self.sled_id); - } - } -} - -impl<'a> SledDatasetsEditor<'a> { - fn new( - rng: &'a mut PlannerRng, - database_dataset_ids: BTreeMap< - ZpoolUuid, - BTreeMap, - >, - sled_id: SledUuid, - config: &'a mut BlueprintDatasetsConfig, - parent_changed_set: &'a mut BTreeSet, - ) -> Result { - let blueprint_dataset_ids = build_dataset_kind_id_map( - "parent blueprint", - config.datasets.values().map(|dataset| { - (dataset.pool.id(), dataset.kind.clone(), dataset.id) - }), - )?; - Ok(Self { - rng, - blueprint_dataset_ids, - database_dataset_ids, - config, - counts: EditCounts::zeroes(), - sled_id, - parent_changed_set, - }) - } - - pub fn expunge_datasets_if(&mut self, mut expunge_if: F) -> usize - where - F: FnMut(&BlueprintDatasetConfig) -> bool, - { - let mut num_expunged = 0; - - for dataset in self.config.datasets.values_mut() { - match dataset.disposition { - // Already expunged; ignore - BlueprintDatasetDisposition::Expunged => continue, - // Potentially expungeable - BlueprintDatasetDisposition::InService => (), - } - if expunge_if(&*dataset) { - dataset.disposition = BlueprintDatasetDisposition::Expunged; - num_expunged += 1; - self.counts.expunged += 1; - } - } - - num_expunged - } - - pub fn ensure_debug_dataset(&mut self, zpool: ZpoolName) { - const DEBUG_QUOTA_SIZE_GB: u32 = 100; - - let address = None; - let quota = Some(ByteCount::from_gibibytes_u32(DEBUG_QUOTA_SIZE_GB)); - let reservation = None; - - self.ensure_dataset( - DatasetName::new(zpool, DatasetKind::Debug), - address, - quota, - reservation, - CompressionAlgorithm::GzipN { level: GzipLevel::new::<9>() }, - ) - } - - pub fn ensure_zone_root_dataset(&mut self, zpool: ZpoolName) { - let address = None; - let quota = None; - let reservation = None; - - self.ensure_dataset( - DatasetName::new(zpool, DatasetKind::TransientZoneRoot), - address, - quota, - reservation, - CompressionAlgorithm::Off, - ) - } - - /// Ensures a dataset exists on this sled. - /// - /// - If the dataset exists in the blueprint already, use it. 
- /// - Otherwise, if the dataset exists in the database, re-use the UUID, but - /// add it to the blueprint. - /// - Otherwise, create a new dataset in the blueprint, which will propagate - /// to the database during execution. - pub fn ensure_dataset( - &mut self, - dataset: DatasetName, - address: Option, - quota: Option, - reservation: Option, - compression: CompressionAlgorithm, - ) { - let zpool_id = dataset.pool().id(); - let kind = dataset.dataset(); - - let make_config = |id: DatasetUuid| BlueprintDatasetConfig { - disposition: BlueprintDatasetDisposition::InService, - id, - pool: dataset.pool().clone(), - kind: kind.clone(), - address, - quota, - reservation, - compression, - }; - - // Is this dataset already in the blueprint? If so, update it if it's - // changed. - if let Some(existing_id) = self - .blueprint_dataset_ids - .get(&zpool_id) - .and_then(|kind_to_id| kind_to_id.get(kind)) - { - // We built `self.blueprint_dataset_ids` based on the contents of - // `self.config.datasets`, so we know we can unwrap this `get_mut`. - let old_config = self.config.datasets.get_mut(existing_id).expect( - "internal inconsistency: \ - entry in blueprint_dataset_ids but not current", - ); - let new_config = make_config(*existing_id); - - if new_config != *old_config { - *old_config = new_config; - self.counts.updated += 1; - } - - return; - } - - // Is there a dataset ID matching this one in the database? If so, use - // that. - // - // TODO(https://github.com/oxidecomputer/omicron/issues/6645): We - // could avoid reading from the datastore if we were confident all - // provisioned datasets existed in the parent blueprint. - let id = self - .database_dataset_ids - .get(&zpool_id) - .and_then(|kind_to_id| kind_to_id.get(kind)) - .copied() - .unwrap_or_else(|| self.rng.next_dataset()); - - self.config.datasets.insert(id, make_config(id)); - self.counts.added += 1; - - // We updated our config, so also record this ID in our "present in - // the blueprint" map. We know the entry doesn't exist or we would have - // found it when we checked above. - self.blueprint_dataset_ids - .entry(zpool_id) - .or_default() - .insert(kind.clone(), id); - } - - /// Consume this editor, returning a summary of changes made. 
- pub fn finalize(self) -> EditCounts { - self.counts - } -} - -fn build_dataset_kind_id_map( - data_source: &'static str, - iter: impl Iterator, -) -> Result< - BTreeMap>, - BlueprintDatasetsEditError, -> { - let mut kind_id_map: BTreeMap< - ZpoolUuid, - BTreeMap, - > = BTreeMap::new(); - for (zpool_id, kind, dataset_id) in iter { - let dataset_ids_by_kind = kind_id_map.entry(zpool_id).or_default(); - match dataset_ids_by_kind.entry(kind) { - Entry::Vacant(slot) => { - slot.insert(dataset_id); - } - Entry::Occupied(prev) => { - return Err( - BlueprintDatasetsEditError::MultipleDatasetsOfKind { - data_source, - zpool_id, - kind: prev.key().clone(), - id1: *prev.get(), - id2: dataset_id, - }, - ); - } - } - } - Ok(kind_id_map) -} - -fn empty_blueprint_datasets_config() -> BlueprintDatasetsConfig { - BlueprintDatasetsConfig { - generation: Generation::new(), - datasets: BTreeMap::new(), - } -} diff --git a/nexus/reconfigurator/planning/src/blueprint_builder/builder/disks_editor.rs b/nexus/reconfigurator/planning/src/blueprint_builder/builder/disks_editor.rs deleted file mode 100644 index 7c5c4c318f..0000000000 --- a/nexus/reconfigurator/planning/src/blueprint_builder/builder/disks_editor.rs +++ /dev/null @@ -1,195 +0,0 @@ -// This Source Code Form is subject to the terms of the Mozilla Public -// License, v. 2.0. If a copy of the MPL was not distributed with this -// file, You can obtain one at https://mozilla.org/MPL/2.0/. - -//! Helper for editing the disks of a Blueprint - -use super::EditCounts; -use nexus_types::deployment::BlueprintPhysicalDiskConfig; -use nexus_types::deployment::BlueprintPhysicalDisksConfig; -use omicron_common::api::external::Generation; -use omicron_uuid_kinds::PhysicalDiskUuid; -use omicron_uuid_kinds::SledUuid; -use std::collections::btree_map::Entry; -use std::collections::BTreeMap; -use std::collections::BTreeSet; - -/// Helper for working with sets of disks on each sled -/// -/// Tracking the set of disks is slightly non-trivial because we need to -/// bump the per-sled generation number iff the disks are changed. So -/// we need to keep track of whether we've changed the disks relative -/// to the parent blueprint. -#[derive(Debug)] -pub(super) struct BlueprintDisksEditor { - current: BTreeMap, - changed: BTreeSet, -} - -impl BlueprintDisksEditor { - pub fn new( - current: BTreeMap, - ) -> Self { - let current = current - .into_iter() - .map(|(sled_id, config)| (sled_id, config.into())) - .collect(); - Self { current, changed: BTreeSet::new() } - } - - /// Get a helper to edit the disks of a specific sled. - /// - /// If any changes are made via the returned editor, the sled will be - /// recorded as needing a generation bump in its disk config when the editor - /// is dropped. - pub fn sled_disks_editor( - &mut self, - sled_id: SledUuid, - ) -> SledDisksEditor<'_> { - let config = - self.current.entry(sled_id).or_insert_with(DisksConfig::empty); - SledDisksEditor::new(sled_id, config, &mut self.changed) - } - - pub fn current_sled_disks( - &self, - sled_id: &SledUuid, - ) -> Option<&BTreeMap> { - let config = self.current.get(sled_id)?; - Some(&config.disks) - } - - /// Compile all edits into a new map suitable for a blueprint's - /// `blueprint_disks`, bumping the generation number for any sleds whose - /// disk config changed. - /// - /// Only sleds listed in `sled_ids` will be present in the returned map. - /// This primarily allows the caller to drop sleds that are no longer in - /// service. 
(Any new sleds will be given an empty set of disks, but - /// presumably any new sleds will have _some_ disks that will have already - /// been populated via a relevant `sled_disks_editor()` call.) - pub fn build( - mut self, - sled_ids: impl Iterator, - ) -> BTreeMap { - sled_ids - .map(|sled_id| { - let config = match self.current.remove(&sled_id) { - Some(mut config) => { - // Bump generation number for any sled whose DisksConfig - // changed - if self.changed.contains(&sled_id) { - config.generation = config.generation.next() - } - config.into() - } - None => DisksConfig::empty().into(), - }; - (sled_id, config) - }) - .collect() - } -} - -#[derive(Debug)] -pub(super) struct SledDisksEditor<'a> { - config: &'a mut DisksConfig, - counts: EditCounts, - sled_id: SledUuid, - parent_changed_set: &'a mut BTreeSet, -} - -impl Drop for SledDisksEditor<'_> { - fn drop(&mut self) { - if self.counts.has_nonzero_counts() { - self.parent_changed_set.insert(self.sled_id); - } - } -} - -impl<'a> SledDisksEditor<'a> { - fn new( - sled_id: SledUuid, - config: &'a mut DisksConfig, - parent_changed_set: &'a mut BTreeSet, - ) -> Self { - Self { - config, - counts: EditCounts::zeroes(), - sled_id, - parent_changed_set, - } - } - - pub fn disk_ids(&self) -> impl Iterator + '_ { - self.config.disks.keys().copied() - } - - pub fn ensure_disk(&mut self, disk: BlueprintPhysicalDiskConfig) { - let disk_id = disk.id; - match self.config.disks.entry(disk_id) { - Entry::Vacant(slot) => { - slot.insert(disk); - self.counts.added += 1; - } - Entry::Occupied(mut slot) => { - if *slot.get() != disk { - slot.insert(disk); - self.counts.updated += 1; - } - } - } - } - - pub fn remove_disk( - &mut self, - disk_id: &PhysicalDiskUuid, - ) -> Option { - let old = self.config.disks.remove(disk_id); - if old.is_some() { - self.counts.removed += 1; - } - old - } - - pub fn finalize(self) -> EditCounts { - self.counts - } -} - -// We want add and remove to be cheap and easy to check whether they performed -// the requested operation, so we'll internally convert from the vec of disks to -// a map of disks keyed by disk ID. -#[derive(Debug)] -struct DisksConfig { - generation: Generation, - disks: BTreeMap, -} - -impl DisksConfig { - fn empty() -> Self { - Self { generation: Generation::new(), disks: BTreeMap::new() } - } -} - -impl From for BlueprintPhysicalDisksConfig { - fn from(config: DisksConfig) -> Self { - BlueprintPhysicalDisksConfig { - generation: config.generation, - disks: config.disks.into_values().collect(), - } - } -} - -impl From for DisksConfig { - fn from(config: BlueprintPhysicalDisksConfig) -> Self { - Self { - generation: config.generation, - disks: config - .disks - .into_iter() - .map(|disk| (disk.id, disk)) - .collect(), - } - } -} diff --git a/nexus/reconfigurator/planning/src/blueprint_builder/builder/storage_editor.rs b/nexus/reconfigurator/planning/src/blueprint_builder/builder/storage_editor.rs deleted file mode 100644 index 2119656da3..0000000000 --- a/nexus/reconfigurator/planning/src/blueprint_builder/builder/storage_editor.rs +++ /dev/null @@ -1,206 +0,0 @@ -// This Source Code Form is subject to the terms of the Mozilla Public -// License, v. 2.0. If a copy of the MPL was not distributed with this -// file, You can obtain one at https://mozilla.org/MPL/2.0/. - -//! 
Helper for editing the storage (disks and datasets) of a Blueprint - -use crate::planner::PlannerRng; - -use super::datasets_editor::BlueprintDatasetsEditError; -use super::datasets_editor::BlueprintDatasetsEditor; -use super::datasets_editor::SledDatasetsEditor; -use super::disks_editor::BlueprintDisksEditor; -use super::disks_editor::SledDisksEditor; -use super::EnsureMultiple; -use super::StorageEditCounts; -use illumos_utils::zpool::ZpoolName; -use nexus_types::deployment::blueprint_zone_type; -use nexus_types::deployment::BlueprintDatasetsConfig; -use nexus_types::deployment::BlueprintPhysicalDiskConfig; -use nexus_types::deployment::BlueprintPhysicalDisksConfig; -use nexus_types::deployment::BlueprintZoneConfig; -use nexus_types::deployment::BlueprintZoneType; -use nexus_types::deployment::SledResources; -use omicron_common::disk::CompressionAlgorithm; -use omicron_common::disk::DatasetKind; -use omicron_common::disk::DatasetName; -use omicron_uuid_kinds::PhysicalDiskUuid; -use omicron_uuid_kinds::SledUuid; -use std::collections::BTreeMap; - -#[derive(Debug)] -pub(super) struct BlueprintStorageEditor { - disks: BlueprintDisksEditor, - datasets: BlueprintDatasetsEditor, -} - -impl BlueprintStorageEditor { - pub fn new( - disks: BTreeMap, - datasets: BTreeMap, - ) -> Self { - Self { - disks: BlueprintDisksEditor::new(disks), - datasets: BlueprintDatasetsEditor::new(datasets), - } - } - - pub fn sled_storage_editor<'a>( - &'a mut self, - sled_id: SledUuid, - sled_resources: &SledResources, - rng: &'a mut PlannerRng, - ) -> Result, BlueprintDatasetsEditError> { - let disks = self.disks.sled_disks_editor(sled_id); - let datasets = - self.datasets.sled_datasets_editor(sled_id, sled_resources, rng)?; - Ok(SledStorageEditor { disks, datasets }) - } - - pub fn current_sled_disks( - &self, - sled_id: &SledUuid, - ) -> Option<&BTreeMap> { - self.disks.current_sled_disks(sled_id) - } - - pub fn into_blueprint_maps( - self, - sled_ids: impl Iterator + Clone, - ) -> ( - BTreeMap, - BTreeMap, - ) { - (self.disks.build(sled_ids.clone()), self.datasets.build(sled_ids)) - } -} - -#[derive(Debug)] -pub(super) struct SledStorageEditor<'a> { - disks: SledDisksEditor<'a>, - datasets: SledDatasetsEditor<'a>, -} - -impl SledStorageEditor<'_> { - pub fn disk_ids(&self) -> impl Iterator + '_ { - self.disks.disk_ids() - } - - pub fn ensure_disk(&mut self, disk: BlueprintPhysicalDiskConfig) { - let zpool = ZpoolName::new_external(disk.pool_id); - - self.disks.ensure_disk(disk); - self.datasets.ensure_debug_dataset(zpool.clone()); - self.datasets.ensure_zone_root_dataset(zpool); - } - - pub fn remove_disk( - &mut self, - disk_id: &PhysicalDiskUuid, - ) -> Option { - let Some(disk) = self.disks.remove_disk(disk_id) else { - return None; - }; - self.datasets - .expunge_datasets_if(|dataset| dataset.pool.id() == disk.pool_id); - Some(ZpoolName::new_external(disk.pool_id)) - } - - pub fn ensure_zone_datasets(&mut self, zone: &BlueprintZoneConfig) { - // TODO check that zpools are on valid disks? 
- - // Dataset for transient zone filesystem - if let Some(fs_zpool) = &zone.filesystem_pool { - let name = zone_name(&zone); - let address = None; - let quota = None; - let reservation = None; - self.datasets.ensure_dataset( - DatasetName::new( - fs_zpool.clone(), - DatasetKind::TransientZone { name }, - ), - address, - quota, - reservation, - CompressionAlgorithm::Off, - ); - } - - // Dataset for durable dataset co-located with zone - if let Some(dataset) = zone.zone_type.durable_dataset() { - let zpool = &dataset.dataset.pool_name; - - if let Some(fs_zpool) = &zone.filesystem_pool { - debug_assert_eq!( - zpool, fs_zpool, - "zone has durable dataset and transient root \ - on different zpools" - ); - } - - let address = match zone.zone_type { - BlueprintZoneType::Crucible( - blueprint_zone_type::Crucible { address, .. }, - ) => Some(address), - _ => None, - }; - let quota = None; - let reservation = None; - self.datasets.ensure_dataset( - DatasetName::new(zpool.clone(), dataset.kind), - address, - quota, - reservation, - CompressionAlgorithm::Off, - ); - } - } - - pub fn expunge_zone_datasets( - &mut self, - zone: &BlueprintZoneConfig, - ) -> EnsureMultiple { - let mut expunged = 0; - - if zone.filesystem_pool.is_some() { - let name = zone_name(&zone); - let kind = DatasetKind::TransientZone { name }; - expunged += self.datasets.expunge_datasets_if(|dataset_config| { - dataset_config.kind == kind - }); - } - - if let Some(dataset) = zone.zone_type.durable_dataset() { - expunged += self.datasets.expunge_datasets_if(|dataset_config| { - dataset_config.pool == dataset.dataset.pool_name - && dataset_config.kind == dataset.kind - }); - } - - if expunged == 0 { - EnsureMultiple::NotNeeded - } else { - EnsureMultiple::Changed { - added: 0, - updated: 0, - expunged, - removed: 0, - } - } - } - - pub fn finalize(self) -> StorageEditCounts { - StorageEditCounts { - disks: self.disks.finalize(), - datasets: self.datasets.finalize(), - } - } -} - -pub(super) fn zone_name(zone: &BlueprintZoneConfig) -> String { - illumos_utils::zone::zone_name( - zone.zone_type.kind().zone_prefix(), - Some(zone.id), - ) -} diff --git a/nexus/reconfigurator/planning/src/blueprint_builder/mod.rs b/nexus/reconfigurator/planning/src/blueprint_builder/mod.rs index 725835f4ae..bab6476456 100644 --- a/nexus/reconfigurator/planning/src/blueprint_builder/mod.rs +++ b/nexus/reconfigurator/planning/src/blueprint_builder/mod.rs @@ -8,7 +8,6 @@ mod builder; mod clickhouse; mod external_networking; mod internal_dns; -mod zones; pub use builder::*; pub use clickhouse::{ClickhouseAllocator, ClickhouseZonesThatShouldBeRunning}; diff --git a/nexus/reconfigurator/planning/src/blueprint_builder/zones.rs b/nexus/reconfigurator/planning/src/blueprint_builder/zones.rs deleted file mode 100644 index 672331ab81..0000000000 --- a/nexus/reconfigurator/planning/src/blueprint_builder/zones.rs +++ /dev/null @@ -1,527 +0,0 @@ -// This Source Code Form is subject to the terms of the Mozilla Public -// License, v. 2.0. If a copy of the MPL was not distributed with this -// file, You can obtain one at https://mozilla.org/MPL/2.0/. 
- -use std::collections::BTreeSet; - -use nexus_types::deployment::{ - BlueprintZoneConfig, BlueprintZoneDisposition, BlueprintZoneFilter, - BlueprintZonesConfig, -}; -use omicron_common::api::external::Generation; -use omicron_uuid_kinds::OmicronZoneUuid; -use thiserror::Error; - -#[derive(Debug)] -#[must_use] -pub(super) struct BuilderZonesConfig { - // The current generation -- this is bumped at blueprint build time and is - // otherwise not exposed to callers. - generation: Generation, - - // The list of zones, along with their state. - zones: Vec, -} - -impl BuilderZonesConfig { - pub(super) fn new() -> Self { - Self { - // Note that the first generation is reserved to mean the one - // containing no zones. See - // OmicronZonesConfig::INITIAL_GENERATION. - // - // Since we're currently assuming that creating a new - // `BuilderZonesConfig` means that we're going to add new zones - // shortly, we start with Generation::new() here. It'll get - // bumped up to the next one in `Self::build`. - generation: Generation::new(), - zones: vec![], - } - } - - pub(super) fn from_parent(parent: &BlueprintZonesConfig) -> Self { - Self { - // We'll bump this up at build time. - generation: parent.generation, - - zones: parent - .zones - .iter() - .map(|zone| BuilderZoneConfig { - zone: zone.clone(), - state: BuilderZoneState::Unchanged, - }) - .collect(), - } - } - - pub(super) fn add_zone( - &mut self, - zone: BlueprintZoneConfig, - ) -> Result<(), BuilderZonesConfigError> { - if self.zones.iter().any(|z| z.zone.id == zone.id) { - // We shouldn't be trying to add zones that already exist -- - // something went wrong in the planner logic. - return Err(BuilderZonesConfigError::AddExistingZone { - zone_id: zone.id, - }); - }; - - self.zones - .push(BuilderZoneConfig { zone, state: BuilderZoneState::Added }); - Ok(()) - } - - // On success, returns the now-expunged zone and whether or not it was set - // to expunged (as opposed to already being marked expunged). - pub(super) fn expunge_zone( - &mut self, - zone_id: OmicronZoneUuid, - ) -> Result<(&BuilderZoneConfig, bool), BuilderZonesConfigError> { - let zone = self - .zones - .iter_mut() - .find(|zone| zone.zone.id == zone_id) - .ok_or_else(|| { - let mut unmatched = BTreeSet::new(); - unmatched.insert(zone_id); - BuilderZonesConfigError::ExpungeUnmatchedZones { unmatched } - })?; - - // Check that the zone is expungeable. Typically, zones passed - // in here should have had this check done to them already, but - // in case they're not, or in case something else about those - // zones changed in between, check again. - let needs_expunged = !is_already_expunged(&zone.zone, zone.state)?; - - if needs_expunged { - zone.zone.disposition = BlueprintZoneDisposition::Expunged; - zone.state = BuilderZoneState::Modified; - } - - Ok((&*zone, needs_expunged)) - } - - pub(super) fn expunge_zones( - &mut self, - mut zones: BTreeSet, - ) -> Result, BuilderZonesConfigError> { - let mut removed = Vec::new(); - - for zone in &mut self.zones { - if zones.remove(&zone.zone.id) { - // Check that the zone is expungeable. Typically, zones passed - // in here should have had this check done to them already, but - // in case they're not, or in case something else about those - // zones changed in between, check again. 
- is_already_expunged(&zone.zone, zone.state)?; - zone.zone.disposition = BlueprintZoneDisposition::Expunged; - zone.state = BuilderZoneState::Modified; - removed.push(&zone.zone); - } - } - - // All zones passed in should have been found -- are there any left - // over? - if !zones.is_empty() { - return Err(BuilderZonesConfigError::ExpungeUnmatchedZones { - unmatched: zones, - }); - } - - Ok(removed) - } - - pub(super) fn iter_zones( - &self, - filter: BlueprintZoneFilter, - ) -> impl Iterator { - self.zones.iter().filter(move |z| z.zone().disposition.matches(filter)) - } - - pub(super) fn build(self) -> BlueprintZonesConfig { - // Only bump the generation if any zones have been changed. - let generation = if self - .zones - .iter() - .any(|z| z.state != BuilderZoneState::Unchanged) - { - self.generation.next() - } else { - self.generation - }; - - let mut ret = BlueprintZonesConfig { - generation, - zones: self.zones.into_iter().map(|z| z.zone).collect(), - }; - ret.sort(); - ret - } -} - -pub(super) fn is_already_expunged( - zone: &BlueprintZoneConfig, - state: BuilderZoneState, -) -> Result { - match zone.disposition { - BlueprintZoneDisposition::InService - | BlueprintZoneDisposition::Quiesced => { - if state != BuilderZoneState::Unchanged { - // We shouldn't be trying to expunge zones that have also been - // changed in this blueprint -- something went wrong in the planner - // logic. - return Err(BuilderZonesConfigError::ExpungeModifiedZone { - zone_id: zone.id, - state, - }); - } - Ok(false) - } - BlueprintZoneDisposition::Expunged => { - // Treat expungement as idempotent. - Ok(true) - } - } -} - -#[derive(Debug)] -pub(super) struct BuilderZoneConfig { - zone: BlueprintZoneConfig, - state: BuilderZoneState, -} - -impl BuilderZoneConfig { - pub(super) fn zone(&self) -> &BlueprintZoneConfig { - &self.zone - } - - pub(super) fn state(&self) -> BuilderZoneState { - self.state - } -} - -#[derive(Copy, Clone, Debug, PartialEq, Eq)] -pub(super) enum BuilderZoneState { - Unchanged, - Modified, - Added, -} - -#[derive(Clone, Debug, PartialEq, Eq, Error)] -pub(super) enum BuilderZonesConfigError { - #[error("attempted to add zone that already exists: {zone_id}")] - AddExistingZone { zone_id: OmicronZoneUuid }, - #[error( - "attempted to expunge zone {zone_id} that was in state {state:?} \ - (can only expunge unchanged zones)" - )] - ExpungeModifiedZone { zone_id: OmicronZoneUuid, state: BuilderZoneState }, - #[error( - "while expunging zones, not all zones provided were found: {unmatched:?}" - )] - ExpungeUnmatchedZones { unmatched: BTreeSet }, -} - -#[cfg(test)] -mod tests { - use std::{ - collections::BTreeMap, - net::{Ipv6Addr, SocketAddrV6}, - }; - - use maplit::btreeset; - use nexus_sled_agent_shared::inventory::ZoneKind; - use nexus_types::deployment::SledDisk; - use nexus_types::external_api::views::PhysicalDiskPolicy; - use nexus_types::external_api::views::PhysicalDiskState; - use nexus_types::{ - deployment::{ - blueprint_zone_type, BlueprintZoneType, SledDetails, SledFilter, - SledResources, - }, - external_api::views::{SledPolicy, SledState}, - }; - use omicron_common::address::Ipv6Subnet; - use omicron_common::disk::DiskIdentity; - use omicron_test_utils::dev::test_setup_log; - use omicron_uuid_kinds::PhysicalDiskUuid; - use omicron_uuid_kinds::ZpoolUuid; - - use crate::{ - blueprint_builder::{test::verify_blueprint, BlueprintBuilder, Ensure}, - example::{ExampleSystemBuilder, SimRngState}, - planner::rng::PlannerRng, - }; - - use super::*; - - /// A test focusing on 
`BlueprintZonesBuilder` and its internal logic. - #[test] - fn test_builder_zones() { - static TEST_NAME: &str = "blueprint_test_builder_zones"; - let logctx = test_setup_log(TEST_NAME); - - let mut rng = SimRngState::from_seed(TEST_NAME); - let (example, blueprint_initial) = ExampleSystemBuilder::new_with_rng( - &logctx.log, - rng.next_system_rng(), - ) - .build(); - - // Add a completely bare sled to the input. - let (new_sled_id, input2) = { - let mut sled_id_rng = rng.next_sled_id_rng(); - let new_sled_id = sled_id_rng.next(); - - let mut input = example.input.clone().into_builder(); - - input - .add_sled( - new_sled_id, - SledDetails { - policy: SledPolicy::provisionable(), - state: SledState::Active, - resources: SledResources { - subnet: Ipv6Subnet::new( - "fd00:1::".parse().unwrap(), - ), - zpools: BTreeMap::from([( - ZpoolUuid::new_v4(), - ( - SledDisk { - disk_identity: DiskIdentity { - vendor: String::from("fake-vendor"), - serial: String::from("fake-serial"), - model: String::from("fake-model"), - }, - disk_id: PhysicalDiskUuid::new_v4(), - policy: PhysicalDiskPolicy::InService, - state: PhysicalDiskState::Active, - }, - // Datasets: Leave empty - vec![], - ), - )]), - }, - }, - ) - .expect("adding new sled"); - - (new_sled_id, input.build()) - }; - - let existing_sled_id = example - .input - .all_sled_ids(SledFilter::Commissioned) - .next() - .expect("at least one sled present"); - - let mut builder = BlueprintBuilder::new_based_on( - &logctx.log, - &blueprint_initial, - &input2, - &example.collection, - "the_test", - ) - .expect("creating blueprint builder"); - builder.set_rng(PlannerRng::from_seed((TEST_NAME, "bp2"))); - let new_sled_resources = &input2 - .sled_lookup(SledFilter::Commissioned, new_sled_id) - .unwrap() - .resources; - - // Test adding a new sled with an NTP zone. - builder.sled_ensure_disks(new_sled_id, new_sled_resources).unwrap(); - assert_eq!( - builder.sled_ensure_zone_ntp(new_sled_id).unwrap(), - Ensure::Added - ); - - // Iterate over the zones for the sled and ensure that the NTP zone is - // present. - { - let mut zones = builder.zones.current_sled_zones( - new_sled_id, - BlueprintZoneFilter::ShouldBeRunning, - ); - let (_, state) = zones.next().expect("exactly one zone for sled"); - assert!(zones.next().is_none(), "exactly one zone for sled"); - assert_eq!( - state, - BuilderZoneState::Added, - "NTP zone should have been added" - ); - } - - // Now, test adding a new zone (Oximeter, picked arbitrarily) to an - // existing sled. - let filesystem_pool = builder - .sled_select_zpool_for_tests(existing_sled_id, ZoneKind::Oximeter) - .expect("chose zpool for new zone"); - let change = builder.zones.change_sled_zones(existing_sled_id); - let new_zone_id = OmicronZoneUuid::new_v4(); - change - .add_zone(BlueprintZoneConfig { - disposition: BlueprintZoneDisposition::InService, - id: new_zone_id, - filesystem_pool: Some(filesystem_pool), - zone_type: BlueprintZoneType::Oximeter( - blueprint_zone_type::Oximeter { - address: SocketAddrV6::new( - Ipv6Addr::UNSPECIFIED, - 0, - 0, - 0, - ), - }, - ), - }) - .expect("adding new zone"); - - // Attempt to expunge one of the other zones on the sled. - let existing_zone_id = change - .iter_zones(BlueprintZoneFilter::ShouldBeRunning) - .find(|z| z.zone.id != new_zone_id) - .expect("at least one existing zone") - .zone - .id; - change - .expunge_zones(btreeset! 
{ existing_zone_id }) - .expect("expunging existing zone"); - // Do it again to ensure that expunging an already-expunged zone is - // idempotent, even within the same blueprint. - change - .expunge_zones(btreeset! { existing_zone_id }) - .expect("expunging already-expunged zone"); - // But expunging a zone that doesn't exist should fail. - let non_existent_zone_id = OmicronZoneUuid::new_v4(); - let non_existent_set = btreeset! { non_existent_zone_id }; - let error = change - .expunge_zones(non_existent_set.clone()) - .expect_err("expunging non-existent zone"); - assert_eq!( - error, - BuilderZonesConfigError::ExpungeUnmatchedZones { - unmatched: non_existent_set - } - ); - - { - // Iterate over the zones and ensure that the Oximeter zone is - // present, and marked added. - let mut zones = builder.zones.current_sled_zones( - existing_sled_id, - BlueprintZoneFilter::ShouldBeRunning, - ); - zones - .find_map(|(z, state)| { - if z.id == new_zone_id { - assert_eq!( - state, - BuilderZoneState::Added, - "new zone ID {new_zone_id} should be marked added" - ); - Some(()) - } else { - None - } - }) - .expect("new zone ID should be present"); - } - - // Attempt to expunge the newly added Oximeter zone. This should fail - // because we only support expunging zones that are unchanged from the - // parent blueprint. - let error = builder - .zones - .change_sled_zones(existing_sled_id) - .expunge_zones(btreeset! { new_zone_id }) - .expect_err("expunging a new zone should fail"); - assert_eq!( - error, - BuilderZonesConfigError::ExpungeModifiedZone { - zone_id: new_zone_id, - state: BuilderZoneState::Added - } - ); - - // Ensure all datasets are created for the zones we've provisioned - for (sled_id, resources) in - input2.all_sled_resources(SledFilter::Commissioned) - { - builder.sled_ensure_zone_datasets(sled_id, resources).unwrap(); - } - - // Now build the blueprint and ensure that all the changes we described - // above are present. - let blueprint = builder.build(); - verify_blueprint(&blueprint); - let diff = blueprint.diff_since_blueprint(&blueprint_initial); - println!("expecting new NTP and Oximeter zones:\n{}", diff.display()); - - // No sleds were removed. - assert_eq!(diff.sleds_removed.len(), 0); - - // One sled was added. - assert_eq!(diff.sleds_added.len(), 1); - let sled_id = diff.sleds_added.first().unwrap(); - assert_eq!(*sled_id, new_sled_id); - let new_sled_zones = diff.zones.added.get(sled_id).unwrap(); - // The generation number should be newer than the initial default. - assert_eq!( - new_sled_zones.generation_after.unwrap(), - Generation::new().next() - ); - assert_eq!(new_sled_zones.zones.len(), 1); - - // TODO: AJS - See comment above - we don't actually use the control sled anymore - // so the comparison was changed. - // One sled was modified: existing_sled_id - assert_eq!(diff.sleds_modified.len(), 1, "1 sled modified"); - for sled_id in &diff.sleds_modified { - assert_eq!(*sled_id, existing_sled_id); - let added = diff.zones.added.get(sled_id).unwrap(); - assert_eq!( - added.generation_after.unwrap(), - added.generation_before.unwrap().next() - ); - assert_eq!(added.zones.len(), 1); - let added_zone = &added.zones[0]; - assert_eq!(added_zone.id(), new_zone_id); - - assert!(!diff.zones.removed.contains_key(sled_id)); - let modified = diff.zones.modified.get(sled_id).unwrap(); - assert_eq!(modified.zones.len(), 1); - let modified_zone = &modified.zones[0]; - assert_eq!(modified_zone.zone.id(), existing_zone_id); - } - - // Test a no-op change. 
- { - let mut builder = BlueprintBuilder::new_based_on( - &logctx.log, - &blueprint, - &input2, - &example.collection, - "the_test", - ) - .expect("creating blueprint builder"); - builder.set_rng(PlannerRng::from_seed((TEST_NAME, "bp2"))); - - // This call by itself shouldn't bump the generation number. - builder.zones.change_sled_zones(existing_sled_id); - - let blueprint_noop = builder.build(); - verify_blueprint(&blueprint_noop); - let diff = blueprint_noop.diff_since_blueprint(&blueprint); - println!("expecting a noop:\n{}", diff.display()); - - assert!(diff.sleds_modified.is_empty(), "no sleds modified"); - assert!(diff.sleds_added.is_empty(), "no sleds added"); - assert!(diff.sleds_removed.is_empty(), "no sleds removed"); - } - - logctx.cleanup_successful(); - } -} diff --git a/nexus/reconfigurator/planning/src/blueprint_editor.rs b/nexus/reconfigurator/planning/src/blueprint_editor.rs new file mode 100644 index 0000000000..652b541de1 --- /dev/null +++ b/nexus/reconfigurator/planning/src/blueprint_editor.rs @@ -0,0 +1,14 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! High-level facilities for editing Blueprints +//! +//! See crate-level documentation for details. + +mod sled_editor; + +pub(crate) use sled_editor::DatasetIdsBackfillFromDb; +pub(crate) use sled_editor::EditedSled; +pub(crate) use sled_editor::SledEditError; +pub(crate) use sled_editor::SledEditor; diff --git a/nexus/reconfigurator/planning/src/blueprint_editor/sled_editor.rs b/nexus/reconfigurator/planning/src/blueprint_editor/sled_editor.rs new file mode 100644 index 0000000000..13094b97a4 --- /dev/null +++ b/nexus/reconfigurator/planning/src/blueprint_editor/sled_editor.rs @@ -0,0 +1,329 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! Support for editing the blueprint details of a single sled. 
+ +use crate::blueprint_builder::SledEditCounts; +use crate::planner::PlannerRng; +use illumos_utils::zpool::ZpoolName; +use nexus_types::deployment::blueprint_zone_type; +use nexus_types::deployment::BlueprintDatasetsConfig; +use nexus_types::deployment::BlueprintPhysicalDiskConfig; +use nexus_types::deployment::BlueprintPhysicalDisksConfig; +use nexus_types::deployment::BlueprintZoneConfig; +use nexus_types::deployment::BlueprintZoneFilter; +use nexus_types::deployment::BlueprintZoneType; +use nexus_types::deployment::BlueprintZonesConfig; +use nexus_types::deployment::DiskFilter; +use nexus_types::external_api::views::SledState; +use omicron_uuid_kinds::OmicronZoneUuid; +use omicron_uuid_kinds::PhysicalDiskUuid; + +mod datasets; +mod disks; +mod zones; + +pub(crate) use self::datasets::DatasetIdsBackfillFromDb; + +pub use self::datasets::DatasetsEditError; +pub use self::datasets::MultipleDatasetsOfKind; +pub use self::disks::DisksEditError; +pub use self::disks::DuplicateDiskId; +pub use self::zones::DuplicateZoneId; +pub use self::zones::ZonesEditError; + +use self::datasets::DatasetsEditor; +use self::datasets::PartialDatasetConfig; +use self::disks::DisksEditor; +use self::zones::ZonesEditor; + +#[derive(Debug, thiserror::Error)] +pub enum SledInputError { + #[error(transparent)] + DuplicateZoneId(#[from] DuplicateZoneId), + #[error(transparent)] + DuplicateDiskId(#[from] DuplicateDiskId), + #[error(transparent)] + MultipleDatasetsOfKind(#[from] MultipleDatasetsOfKind), +} + +#[derive(Debug, thiserror::Error)] +pub enum SledEditError { + #[error("failed to edit disks")] + EditDisks(#[from] DisksEditError), + #[error("failed to edit datasets")] + EditDatasetsError(#[from] DatasetsEditError), + #[error("failed to edit zones")] + EditZones(#[from] ZonesEditError), + #[error( + "invalid configuration for zone {zone_id}: \ + filesystem root zpool ({fs_zpool}) and durable dataset zpool \ + ({dur_zpool}) should be the same" + )] + ZoneInvalidZpoolCombination { + zone_id: OmicronZoneUuid, + fs_zpool: ZpoolName, + dur_zpool: ZpoolName, + }, + #[error( + "invalid configuration for zone {zone_id}: \ + zpool ({zpool}) is not present in this sled's disks" + )] + ZoneOnNonexistentZpool { zone_id: OmicronZoneUuid, zpool: ZpoolName }, +} + +#[derive(Debug)] +pub(crate) struct SledEditor { + state: SledState, + zones: ZonesEditor, + disks: DisksEditor, + datasets: DatasetsEditor, +} + +#[derive(Debug)] +pub(crate) struct EditedSled { + pub state: SledState, + pub zones: BlueprintZonesConfig, + pub disks: BlueprintPhysicalDisksConfig, + pub datasets: BlueprintDatasetsConfig, + pub edit_counts: SledEditCounts, +} + +impl SledEditor { + pub fn new( + state: SledState, + zones: BlueprintZonesConfig, + disks: BlueprintPhysicalDisksConfig, + datasets: BlueprintDatasetsConfig, + preexisting_dataset_ids: DatasetIdsBackfillFromDb, + ) -> Result { + Ok(Self { + state, + zones: zones.try_into()?, + disks: disks.try_into()?, + datasets: DatasetsEditor::new(datasets, preexisting_dataset_ids)?, + }) + } + + pub fn new_empty( + state: SledState, + preexisting_dataset_ids: DatasetIdsBackfillFromDb, + ) -> Self { + Self { + state, + zones: ZonesEditor::empty(), + disks: DisksEditor::empty(), + datasets: DatasetsEditor::empty(preexisting_dataset_ids), + } + } + + pub fn finalize(self) -> EditedSled { + let (disks, disks_counts) = self.disks.finalize(); + let (datasets, datasets_counts) = self.datasets.finalize(); + let (zones, zones_counts) = self.zones.finalize(); + EditedSled { + state: self.state, + zones, + 
disks, + datasets, + edit_counts: SledEditCounts { + disks: disks_counts, + datasets: datasets_counts, + zones: zones_counts, + }, + } + } + + pub fn edit_counts(&self) -> SledEditCounts { + SledEditCounts { + disks: self.disks.edit_counts(), + datasets: self.datasets.edit_counts(), + zones: self.zones.edit_counts(), + } + } + + pub fn set_state(&mut self, new_state: SledState) { + self.state = new_state; + } + + pub fn disks( + &self, + filter: DiskFilter, + ) -> impl Iterator { + self.disks.disks(filter) + } + + pub fn zones( + &self, + filter: BlueprintZoneFilter, + ) -> impl Iterator { + self.zones.zones(filter) + } + + pub fn ensure_disk( + &mut self, + disk: BlueprintPhysicalDiskConfig, + rng: &mut PlannerRng, + ) { + let zpool = ZpoolName::new_external(disk.pool_id); + + self.disks.ensure(disk); + + // Every disk also gets a Debug and Transient Zone Root dataset; ensure + // both of those exist as well. + let debug = PartialDatasetConfig::for_debug(zpool.clone()); + let zone_root = PartialDatasetConfig::for_transient_zone_root(zpool); + + self.datasets.ensure_in_service(debug, rng); + self.datasets.ensure_in_service(zone_root, rng); + } + + pub fn expunge_disk( + &mut self, + disk_id: &PhysicalDiskUuid, + ) -> Result<(), SledEditError> { + let zpool_id = self.disks.expunge(disk_id)?; + + // When we expunge a disk, we must also expunge any datasets on it, and + // any zones that relied on those datasets. + self.datasets.expunge_all_on_zpool(&zpool_id); + self.zones.expunge_all_on_zpool(&zpool_id); + + Ok(()) + } + + pub fn add_zone( + &mut self, + zone: BlueprintZoneConfig, + rng: &mut PlannerRng, + ) -> Result<(), SledEditError> { + // Ensure we can construct the configs for the datasets for this zone. + let datasets = ZoneDatasetConfigs::new(&self.disks, &zone)?; + + // Actually add the zone and its datasets. + self.zones.add_zone(zone)?; + datasets.ensure_in_service(&mut self.datasets, rng); + + Ok(()) + } + + pub fn expunge_zone( + &mut self, + zone_id: &OmicronZoneUuid, + ) -> Result<(), SledEditError> { + let (did_expunge, config) = self.zones.expunge(zone_id)?; + + // If we didn't actually expunge the zone in this edit, we don't + // move on and expunge its datasets. This is to guard against + // accidentally exposing a different zone's datasets (if that zone has + // happens to have the same dataset kind as us and is running on the + // same zpool as us, which is only possible if we were previously + // expunged). + // + // This wouldn't be necessary if `config` tracked its dataset IDs + // explicitly instead of only recording its zpool; once we fix that we + // should be able to remove this check. + if !did_expunge { + return Ok(()); + } + + if let Some(dataset) = config.filesystem_dataset() { + self.datasets.expunge(&dataset.pool().id(), dataset.dataset())?; + } + if let Some(dataset) = config.zone_type.durable_dataset() { + self.datasets + .expunge(&dataset.dataset.pool_name.id(), &dataset.kind)?; + } + + Ok(()) + } + + /// Backwards compatibility / test helper: If we're given a blueprint that + /// has zones but wasn't created via `SledEditor`, it might not have + /// datasets for all its zones. This method backfills them. + pub fn ensure_datasets_for_running_zones( + &mut self, + rng: &mut PlannerRng, + ) -> Result<(), SledEditError> { + for zone in self.zones.zones(BlueprintZoneFilter::ShouldBeRunning) { + ZoneDatasetConfigs::new(&self.disks, zone)? 
+                .ensure_in_service(&mut self.datasets, rng);
+        }
+        Ok(())
+    }
+}
+
+#[derive(Debug)]
+struct ZoneDatasetConfigs {
+    filesystem: Option<PartialDatasetConfig>,
+    durable: Option<PartialDatasetConfig>,
+}
+
+impl ZoneDatasetConfigs {
+    fn new(
+        disks: &DisksEditor,
+        zone: &BlueprintZoneConfig,
+    ) -> Result<Self, SledEditError> {
+        let filesystem_dataset = zone
+            .filesystem_dataset()
+            .map(|dataset| PartialDatasetConfig::for_transient_zone(dataset));
+        let durable_dataset = zone.zone_type.durable_dataset().map(|dataset| {
+            // `dataset` records include an optional socket address, which is
+            // only applicable for durable datasets backing crucible. This
+            // is a little fishy and might go away with
+            // https://github.com/oxidecomputer/omicron/issues/6998.
+            let address = match &zone.zone_type {
+                BlueprintZoneType::Crucible(
+                    blueprint_zone_type::Crucible { address, .. },
+                ) => Some(*address),
+                _ => None,
+            };
+            PartialDatasetConfig::for_durable_zone(
+                dataset.dataset.pool_name.clone(),
+                dataset.kind,
+                address,
+            )
+        });
+
+        // Ensure that if this zone has both kinds of datasets, they reside on
+        // the same zpool.
+        if let (Some(fs), Some(dur)) = (&filesystem_dataset, &durable_dataset) {
+            if fs.zpool() != dur.zpool() {
+                return Err(SledEditError::ZoneInvalidZpoolCombination {
+                    zone_id: zone.id,
+                    fs_zpool: fs.zpool().clone(),
+                    dur_zpool: dur.zpool().clone(),
+                });
+            }
+        }
+
+        // Ensure that if we have a zpool, we have a matching disk (i.e., a zone
+        // can't be added if it has a dataset on a zpool that we don't have)
+        if let Some(dataset) =
+            filesystem_dataset.as_ref().or(durable_dataset.as_ref())
+        {
+            if !disks.contains_zpool(&dataset.zpool().id()) {
+                return Err(SledEditError::ZoneOnNonexistentZpool {
+                    zone_id: zone.id,
+                    zpool: dataset.zpool().clone(),
+                });
+            }
+        }
+
+        Ok(Self { filesystem: filesystem_dataset, durable: durable_dataset })
+    }
+
+    fn ensure_in_service(
+        self,
+        datasets: &mut DatasetsEditor,
+        rng: &mut PlannerRng,
+    ) {
+        if let Some(dataset) = self.filesystem {
+            datasets.ensure_in_service(dataset, rng);
+        }
+        if let Some(dataset) = self.durable {
+            datasets.ensure_in_service(dataset, rng);
+        }
+    }
+}
diff --git a/nexus/reconfigurator/planning/src/blueprint_editor/sled_editor/datasets.rs b/nexus/reconfigurator/planning/src/blueprint_editor/sled_editor/datasets.rs
new file mode 100644
index 0000000000..3830f02233
--- /dev/null
+++ b/nexus/reconfigurator/planning/src/blueprint_editor/sled_editor/datasets.rs
@@ -0,0 +1,398 @@
+// This Source Code Form is subject to the terms of the Mozilla Public
+// License, v. 2.0. If a copy of the MPL was not distributed with this
+// file, You can obtain one at https://mozilla.org/MPL/2.0/.
+ +use crate::blueprint_builder::EditCounts; +use crate::planner::PlannerRng; +use illumos_utils::zpool::ZpoolName; +use nexus_types::deployment::BlueprintDatasetConfig; +use nexus_types::deployment::BlueprintDatasetDisposition; +use nexus_types::deployment::BlueprintDatasetsConfig; +use nexus_types::deployment::SledResources; +use nexus_types::deployment::ZpoolFilter; +use omicron_common::api::external::ByteCount; +use omicron_common::api::external::Generation; +use omicron_common::disk::CompressionAlgorithm; +use omicron_common::disk::DatasetKind; +use omicron_common::disk::DatasetName; +use omicron_common::disk::GzipLevel; +use omicron_uuid_kinds::DatasetUuid; +use omicron_uuid_kinds::ZpoolUuid; +use std::collections::btree_map::Entry; +use std::collections::BTreeMap; +use std::net::SocketAddrV6; + +#[derive(Debug, thiserror::Error)] +#[error( + "invalid blueprint input: multiple datasets with kind {kind:?} \ + on zpool {zpool_id}: {id1}, {id2}" +)] +pub struct MultipleDatasetsOfKind { + zpool_id: ZpoolUuid, + kind: DatasetKind, + id1: DatasetUuid, + id2: DatasetUuid, +} + +#[derive(Debug, thiserror::Error)] +pub enum DatasetsEditError { + #[error( + "tried to expunge nonexistent dataset: \ + zpool {zpool_id}, kind {kind}" + )] + ExpungeNonexistentDataset { zpool_id: ZpoolUuid, kind: DatasetKind }, +} + +/// TODO(): In between +/// the addition of datasets to blueprints and knowing all deployed system +/// have _generated_ a blueprint that populates datasets, we are in a sticky +/// situation where a dataset might have already existed in CRDB with an ID, +/// but the blueprint system doesn't know about it. We accept a map of all +/// existing dataset IDs, and then when determining the ID of a dataset, +/// we'll try these in order: +/// +/// 1. Is the dataset in our blueprint already? If so, use its ID. +/// 2. Is the dataset in `preexisting_database_ids`? If so, use that ID. +/// 3. Generate a new random ID. +#[derive(Debug)] +pub(crate) struct DatasetIdsBackfillFromDb( + BTreeMap>, +); + +impl DatasetIdsBackfillFromDb { + pub fn build( + resources: &SledResources, + ) -> Result { + let iter = resources.all_datasets(ZpoolFilter::InService).flat_map( + |(&zpool_id, configs)| { + configs.iter().map(move |config| { + (zpool_id, config.name.dataset().clone(), config.id) + }) + }, + ); + + let mut kind_id_map: BTreeMap< + ZpoolUuid, + BTreeMap, + > = BTreeMap::new(); + + for (zpool_id, kind, dataset_id) in iter { + let dataset_ids_by_kind = kind_id_map.entry(zpool_id).or_default(); + match dataset_ids_by_kind.entry(kind) { + Entry::Vacant(slot) => { + slot.insert(dataset_id); + } + Entry::Occupied(prev) => { + return Err(MultipleDatasetsOfKind { + zpool_id, + kind: prev.key().clone(), + id1: *prev.get(), + id2: dataset_id, + }); + } + } + } + Ok(Self(kind_id_map)) + } + + pub fn empty() -> Self { + Self(BTreeMap::new()) + } +} + +impl DatasetIdsBackfillFromDb { + fn get( + &self, + zpool_id: &ZpoolUuid, + kind: &DatasetKind, + ) -> Option { + self.0.get(zpool_id).and_then(|by_kind| by_kind.get(kind).copied()) + } +} + +/// Container for most of the information needed to construct a +/// `BlueprintDatasetConfig`. +/// +/// Omitted from this set are the disposition (in practice, this will typically +/// be "in service", as one constructs a `PartialDatasetConfig` to describe a +/// dataset that should be in service) and the ID. Dataset IDs are a little +/// tricky at the moment (see `DatasetIdsBackfillFromDb` above), so they're +/// determined internally by `DatasetsEditor`. 
+#[derive(Debug)] +pub(crate) struct PartialDatasetConfig { + pub name: DatasetName, + pub address: Option, + pub quota: Option, + pub reservation: Option, + pub compression: CompressionAlgorithm, +} + +impl PartialDatasetConfig { + pub fn zpool(&self) -> &ZpoolName { + self.name.pool() + } + + pub fn for_debug(zpool: ZpoolName) -> Self { + const DEBUG_QUOTA_SIZE_GB: u32 = 100; + + Self { + name: DatasetName::new(zpool, DatasetKind::Debug), + address: None, + quota: Some(ByteCount::from_gibibytes_u32(DEBUG_QUOTA_SIZE_GB)), + reservation: None, + compression: CompressionAlgorithm::GzipN { + level: GzipLevel::new::<9>(), + }, + } + } + + pub fn for_transient_zone_root(zpool: ZpoolName) -> Self { + Self { + name: DatasetName::new(zpool, DatasetKind::TransientZoneRoot), + address: None, + quota: None, + reservation: None, + compression: CompressionAlgorithm::Off, + } + } + + pub fn for_transient_zone(name: DatasetName) -> Self { + assert!( + matches!(name.dataset(), DatasetKind::TransientZone { .. }), + "for_transient_zone called with incorrect dataset kind: {name:?}" + ); + Self { + name, + address: None, + quota: None, + reservation: None, + compression: CompressionAlgorithm::Off, + } + } + + pub fn for_durable_zone( + zpool: ZpoolName, + kind: DatasetKind, + address: Option, + ) -> Self { + Self { + name: DatasetName::new(zpool, kind), + address, + quota: None, + reservation: None, + compression: CompressionAlgorithm::Off, + } + } +} + +#[derive(Debug)] +pub(super) struct DatasetsEditor { + preexisting_dataset_ids: DatasetIdsBackfillFromDb, + config: BlueprintDatasetsConfig, + by_zpool_and_kind: BTreeMap>, + counts: EditCounts, +} + +impl DatasetsEditor { + pub fn new( + config: BlueprintDatasetsConfig, + preexisting_dataset_ids: DatasetIdsBackfillFromDb, + ) -> Result { + let mut by_zpool_and_kind = BTreeMap::new(); + for dataset in config.datasets.values() { + let by_kind: &mut BTreeMap<_, _> = + by_zpool_and_kind.entry(dataset.pool.id()).or_default(); + match by_kind.entry(dataset.kind.clone()) { + Entry::Vacant(slot) => { + slot.insert(dataset.id); + } + Entry::Occupied(prev) => { + return Err(MultipleDatasetsOfKind { + zpool_id: dataset.pool.id(), + kind: dataset.kind.clone(), + id1: *prev.get(), + id2: dataset.id, + }); + } + } + } + Ok(Self { + preexisting_dataset_ids, + config, + by_zpool_and_kind, + counts: EditCounts::zeroes(), + }) + } + + pub fn empty(preexisting_dataset_ids: DatasetIdsBackfillFromDb) -> Self { + Self { + preexisting_dataset_ids, + config: BlueprintDatasetsConfig { + generation: Generation::new(), + datasets: BTreeMap::new(), + }, + by_zpool_and_kind: BTreeMap::new(), + counts: EditCounts::zeroes(), + } + } + + pub fn finalize(self) -> (BlueprintDatasetsConfig, EditCounts) { + let mut config = self.config; + if self.counts.has_nonzero_counts() { + config.generation = config.generation.next(); + } + (config, self.counts) + } + + pub fn edit_counts(&self) -> EditCounts { + self.counts + } + + // If there is a dataset of the given `kind` on the given `zpool`, return + // its ID. + // + // This prefers IDs we already have; if we don't have one, it falls back to + // backfilling based on IDs recorded in the database from before blueprints + // tracked datasets (see `DatasetIdsBackfillFromDb` above). 
+ fn get_id( + &self, + zpool: &ZpoolUuid, + kind: &DatasetKind, + ) -> Option { + if let Some(blueprint_id) = self + .by_zpool_and_kind + .get(zpool) + .and_then(|by_kind| by_kind.get(kind).copied()) + { + return Some(blueprint_id); + }; + if let Some(preexisting_database_id) = + self.preexisting_dataset_ids.get(zpool, kind) + { + return Some(preexisting_database_id); + }; + None + } + + fn expunge_impl( + dataset: &mut BlueprintDatasetConfig, + counts: &mut EditCounts, + ) { + match dataset.disposition { + BlueprintDatasetDisposition::InService => { + dataset.disposition = BlueprintDatasetDisposition::Expunged; + counts.expunged += 1; + } + BlueprintDatasetDisposition::Expunged => { + // already expunged; nothing to do + } + } + } + + /// Expunge a dataset identified by its zpool + kind combo. + /// + /// TODO-cleanup This seems fishy. We require that there is at most one + /// dataset of a given `DatasetKind` on a given zpool at a time, but over + /// time we might have had multiple. For example: + /// + /// * Blueprint A: Nexus 1 is on zpool 12 + /// * Blueprint B: Nexus 1 is expunged + /// * Blueprint C: Nexus 2 is added and is placed on zpool 12 + /// + /// When we go to plan Blueprint D, if Nexus 1 is still being carried + /// forward, it will already be expunged (which is fine). If we then try to + /// expunge it again, which should be idempotent, expunging its + /// datasets would incorrectly expunge Nexus 2's datasets (because we'd look + /// up "the dataset with kind Nexus on zpool 12"). We should probably take + /// an explicit dataset ID here, but that would require + /// `BlueprintZoneConfig` to track its dataset IDs explicitly instead of + /// only tracking their zpools. + pub fn expunge( + &mut self, + zpool: &ZpoolUuid, + kind: &DatasetKind, + ) -> Result<(), DatasetsEditError> { + let Some(id) = self + .by_zpool_and_kind + .get(zpool) + .and_then(|by_kind| by_kind.get(kind)) + else { + return Err(DatasetsEditError::ExpungeNonexistentDataset { + zpool_id: *zpool, + kind: kind.clone(), + }); + }; + let dataset = self + .config + .datasets + .get_mut(id) + .expect("by_zpool_and_kind and config out of sync"); + Self::expunge_impl(dataset, &mut self.counts); + Ok(()) + } + + pub fn expunge_all_on_zpool(&mut self, zpool: &ZpoolUuid) { + let Some(by_kind) = self.by_zpool_and_kind.get(zpool) else { + return; + }; + + for id in by_kind.values() { + let dataset = self + .config + .datasets + .get_mut(id) + .expect("by_zpool_and_kind and config out of sync"); + Self::expunge_impl(dataset, &mut self.counts); + } + } + + pub fn ensure_in_service( + &mut self, + dataset: PartialDatasetConfig, + rng: &mut PlannerRng, + ) { + // Convert the partial config into a full config by finding or + // generating its ID. + let dataset = { + let PartialDatasetConfig { + name, + address, + quota, + reservation, + compression, + } = dataset; + let (pool, kind) = name.into_parts(); + let id = self + .get_id(&pool.id(), &kind) + .unwrap_or_else(|| rng.next_dataset()); + BlueprintDatasetConfig { + disposition: BlueprintDatasetDisposition::InService, + id, + pool, + kind, + address, + quota, + reservation, + compression, + } + }; + + // Add or update our config with this new dataset info. 
+ match self.config.datasets.entry(dataset.id) { + Entry::Vacant(slot) => { + self.by_zpool_and_kind + .entry(dataset.pool.id()) + .or_default() + .insert(dataset.kind.clone(), dataset.id); + slot.insert(dataset); + self.counts.added += 1; + } + Entry::Occupied(mut prev) => { + if *prev.get() != dataset { + prev.insert(dataset); + self.counts.updated += 1; + } + } + } + } +} diff --git a/nexus/reconfigurator/planning/src/blueprint_editor/sled_editor/disks.rs b/nexus/reconfigurator/planning/src/blueprint_editor/sled_editor/disks.rs new file mode 100644 index 0000000000..f7c0dcba36 --- /dev/null +++ b/nexus/reconfigurator/planning/src/blueprint_editor/sled_editor/disks.rs @@ -0,0 +1,145 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +use crate::blueprint_builder::EditCounts; +use nexus_types::deployment::BlueprintPhysicalDiskConfig; +use nexus_types::deployment::BlueprintPhysicalDiskDisposition; +use nexus_types::deployment::BlueprintPhysicalDisksConfig; +use nexus_types::deployment::DiskFilter; +use omicron_common::api::external::Generation; +use omicron_uuid_kinds::PhysicalDiskUuid; +use omicron_uuid_kinds::ZpoolUuid; +use std::collections::btree_map::Entry; +use std::collections::BTreeMap; + +#[derive(Debug, thiserror::Error)] +pub enum DisksEditError { + #[error("tried to expunge nonexistent disk {id}")] + ExpungeNonexistentDisk { id: PhysicalDiskUuid }, +} + +#[derive(Debug, thiserror::Error)] +#[error( + "invalid blueprint input: duplicate disk ID {id} \ + (zpools: {zpool1:?}, {zpool2:?})" +)] +pub struct DuplicateDiskId { + pub id: PhysicalDiskUuid, + pub zpool1: ZpoolUuid, + pub zpool2: ZpoolUuid, +} + +#[derive(Debug)] +pub(super) struct DisksEditor { + generation: Generation, + disks: BTreeMap, + counts: EditCounts, +} + +impl DisksEditor { + pub fn empty() -> Self { + Self { + generation: Generation::new(), + disks: BTreeMap::new(), + counts: EditCounts::zeroes(), + } + } + + pub fn finalize(self) -> (BlueprintPhysicalDisksConfig, EditCounts) { + let mut generation = self.generation; + if self.counts.has_nonzero_counts() { + generation = generation.next(); + } + + ( + BlueprintPhysicalDisksConfig { + generation, + disks: self.disks.into_values().collect(), + }, + self.counts, + ) + } + + pub fn edit_counts(&self) -> EditCounts { + self.counts + } + + pub fn disks( + &self, + filter: DiskFilter, + ) -> impl Iterator { + self.disks + .values() + .filter(move |config| config.disposition.matches(filter)) + } + + pub fn contains_zpool(&self, zpool_id: &ZpoolUuid) -> bool { + self.disks.values().any(|disk| disk.pool_id == *zpool_id) + } + + pub fn ensure(&mut self, disk: BlueprintPhysicalDiskConfig) { + match self.disks.entry(disk.id) { + Entry::Vacant(slot) => { + slot.insert(disk); + self.counts.added += 1; + } + Entry::Occupied(mut slot) => { + if *slot.get() != disk { + slot.insert(disk); + self.counts.updated += 1; + } + } + } + } + + pub fn expunge( + &mut self, + disk_id: &PhysicalDiskUuid, + ) -> Result { + let config = self.disks.get_mut(disk_id).ok_or_else(|| { + DisksEditError::ExpungeNonexistentDisk { id: *disk_id } + })?; + + match config.disposition { + BlueprintPhysicalDiskDisposition::InService => { + config.disposition = BlueprintPhysicalDiskDisposition::Expunged; + self.counts.expunged += 1; + } + BlueprintPhysicalDiskDisposition::Expunged => { + // expunge is idempotent; do nothing + } + } + + 
Ok(config.pool_id) + } +} + +impl TryFrom for DisksEditor { + type Error = DuplicateDiskId; + + fn try_from( + config: BlueprintPhysicalDisksConfig, + ) -> Result { + let mut disks = BTreeMap::new(); + for disk in config.disks { + match disks.entry(disk.id) { + Entry::Vacant(slot) => { + slot.insert(disk); + } + Entry::Occupied(prev) => { + return Err(DuplicateDiskId { + id: disk.id, + zpool1: disk.pool_id, + zpool2: prev.get().pool_id, + }); + } + } + } + Ok(Self { + generation: config.generation, + disks, + counts: EditCounts::zeroes(), + }) + } +} diff --git a/nexus/reconfigurator/planning/src/blueprint_editor/sled_editor/zones.rs b/nexus/reconfigurator/planning/src/blueprint_editor/sled_editor/zones.rs new file mode 100644 index 0000000000..5a5c7a1807 --- /dev/null +++ b/nexus/reconfigurator/planning/src/blueprint_editor/sled_editor/zones.rs @@ -0,0 +1,181 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +use crate::blueprint_builder::EditCounts; +use nexus_sled_agent_shared::inventory::ZoneKind; +use nexus_types::deployment::BlueprintZoneConfig; +use nexus_types::deployment::BlueprintZoneDisposition; +use nexus_types::deployment::BlueprintZoneFilter; +use nexus_types::deployment::BlueprintZonesConfig; +use omicron_common::api::external::Generation; +use omicron_uuid_kinds::OmicronZoneUuid; +use omicron_uuid_kinds::ZpoolUuid; +use std::collections::btree_map::Entry; +use std::collections::BTreeMap; + +#[derive(Debug, thiserror::Error)] +pub enum ZonesEditError { + #[error( + "tried to add duplicate zone ID {id} (kinds: {kind1:?}, {kind2:?})" + )] + AddDuplicateZoneId { id: OmicronZoneUuid, kind1: ZoneKind, kind2: ZoneKind }, + #[error("tried to expunge nonexistent zone {id}")] + ExpungeNonexistentZone { id: OmicronZoneUuid }, +} + +#[derive(Debug, thiserror::Error)] +#[error( + "invalid blueprint input: duplicate zone ID {id} \ + (kinds: {kind1:?}, {kind2:?})" +)] +pub struct DuplicateZoneId { + pub id: OmicronZoneUuid, + pub kind1: ZoneKind, + pub kind2: ZoneKind, +} + +#[derive(Debug)] +pub(super) struct ZonesEditor { + generation: Generation, + zones: BTreeMap, + counts: EditCounts, +} + +impl ZonesEditor { + pub fn empty() -> Self { + Self { + generation: Generation::new(), + zones: BTreeMap::new(), + counts: EditCounts::zeroes(), + } + } + + pub fn finalize(self) -> (BlueprintZonesConfig, EditCounts) { + let mut generation = self.generation; + if self.counts.has_nonzero_counts() { + generation = generation.next(); + } + let mut config = BlueprintZonesConfig { + generation, + zones: self.zones.into_values().collect(), + }; + config.sort(); + (config, self.counts) + } + + pub fn edit_counts(&self) -> EditCounts { + self.counts + } + + pub fn zones( + &self, + filter: BlueprintZoneFilter, + ) -> impl Iterator { + self.zones + .values() + .filter(move |config| config.disposition.matches(filter)) + } + + pub fn add_zone( + &mut self, + zone: BlueprintZoneConfig, + ) -> Result<(), ZonesEditError> { + match self.zones.entry(zone.id) { + Entry::Vacant(slot) => { + slot.insert(zone); + self.counts.added += 1; + Ok(()) + } + Entry::Occupied(prev) => { + // We shouldn't be trying to add zones that already exist -- + // something went wrong in the planner logic. 
+ Err(ZonesEditError::AddDuplicateZoneId { + id: zone.id, + kind1: zone.zone_type.kind(), + kind2: prev.get().zone_type.kind(), + }) + } + } + } + + /// Expunge a zone, returning `true` if the zone was expunged and `false` if + /// the zone was already expunged, along with the updated zone config. + pub fn expunge( + &mut self, + zone_id: &OmicronZoneUuid, + ) -> Result<(bool, &BlueprintZoneConfig), ZonesEditError> { + let config = self.zones.get_mut(zone_id).ok_or_else(|| { + ZonesEditError::ExpungeNonexistentZone { id: *zone_id } + })?; + + let did_expunge = Self::expunge_impl(config, &mut self.counts); + + Ok((did_expunge, &*config)) + } + + fn expunge_impl( + config: &mut BlueprintZoneConfig, + counts: &mut EditCounts, + ) -> bool { + match config.disposition { + BlueprintZoneDisposition::InService + | BlueprintZoneDisposition::Quiesced => { + config.disposition = BlueprintZoneDisposition::Expunged; + counts.expunged += 1; + true + } + BlueprintZoneDisposition::Expunged => { + // expunge is idempotent; do nothing + false + } + } + } + + pub fn expunge_all_on_zpool(&mut self, zpool: &ZpoolUuid) { + for config in self.zones.values_mut() { + // Expunge this zone if its filesystem or durable dataset are on + // this zpool. (If it has both, they should be on the _same_ zpool, + // but that's not strictly required by this method - we'll expunge a + // zone that depends on this zpool in any way.) + let fs_is_on_zpool = config + .filesystem_pool + .as_ref() + .map_or(false, |pool| pool.id() == *zpool); + let dd_is_on_zpool = config + .zone_type + .durable_zpool() + .map_or(false, |pool| pool.id() == *zpool); + if fs_is_on_zpool || dd_is_on_zpool { + Self::expunge_impl(config, &mut self.counts); + } + } + } +} + +impl TryFrom for ZonesEditor { + type Error = DuplicateZoneId; + + fn try_from(config: BlueprintZonesConfig) -> Result { + let mut zones = BTreeMap::new(); + for zone in config.zones { + match zones.entry(zone.id) { + Entry::Vacant(slot) => { + slot.insert(zone); + } + Entry::Occupied(prev) => { + return Err(DuplicateZoneId { + id: zone.id, + kind1: zone.zone_type.kind(), + kind2: prev.get().zone_type.kind(), + }); + } + } + } + Ok(Self { + generation: config.generation, + zones, + counts: EditCounts::zeroes(), + }) + } +} diff --git a/nexus/reconfigurator/planning/src/example.rs b/nexus/reconfigurator/planning/src/example.rs index 3848934d19..dfba3f9992 100644 --- a/nexus/reconfigurator/planning/src/example.rs +++ b/nexus/reconfigurator/planning/src/example.rs @@ -453,9 +453,7 @@ impl ExampleSystemBuilder { .unwrap(); } } - builder - .sled_ensure_zone_datasets(sled_id, &sled_resources) - .unwrap(); + builder.sled_ensure_zone_datasets(sled_id).unwrap(); } let blueprint = builder.build(); diff --git a/nexus/reconfigurator/planning/src/lib.rs b/nexus/reconfigurator/planning/src/lib.rs index a5a47c933d..f6c521c0f8 100644 --- a/nexus/reconfigurator/planning/src/lib.rs +++ b/nexus/reconfigurator/planning/src/lib.rs @@ -7,6 +7,7 @@ //! See docs/reconfigurator.adoc for an overview. 
pub mod blueprint_builder; +pub mod blueprint_editor; pub mod example; mod ip_allocator; pub mod planner; diff --git a/nexus/reconfigurator/planning/src/planner.rs b/nexus/reconfigurator/planning/src/planner.rs index 9bdb29048b..56fc671667 100644 --- a/nexus/reconfigurator/planning/src/planner.rs +++ b/nexus/reconfigurator/planning/src/planner.rs @@ -160,7 +160,7 @@ impl<'a> Planner<'a> { if all_zones_expunged && num_instances_assigned == 0 { self.blueprint - .set_sled_state(sled_id, SledState::Decommissioned); + .set_sled_state(sled_id, SledState::Decommissioned)?; } } @@ -362,17 +362,13 @@ impl<'a> Planner<'a> { } fn do_plan_datasets(&mut self) -> Result<(), Error> { - for (sled_id, sled_resources) in - self.input.all_sled_resources(SledFilter::InService) - { + for sled_id in self.input.all_sled_ids(SledFilter::InService) { if let EnsureMultiple::Changed { added, updated, expunged, removed, - } = self - .blueprint - .sled_ensure_zone_datasets(sled_id, &sled_resources)? + } = self.blueprint.sled_ensure_zone_datasets(sled_id)? { info!( &self.log, diff --git a/nexus/types/Cargo.toml b/nexus/types/Cargo.toml index 5f21652feb..8990b0b83b 100644 --- a/nexus/types/Cargo.toml +++ b/nexus/types/Cargo.toml @@ -21,6 +21,7 @@ dropshot.workspace = true futures.workspace = true http.workspace = true humantime.workspace = true +illumos-utils.workspace = true ipnetwork.workspace = true newtype_derive.workspace = true omicron-uuid-kinds.workspace = true diff --git a/nexus/types/src/deployment.rs b/nexus/types/src/deployment.rs index 3a17f69863..a487fea2ce 100644 --- a/nexus/types/src/deployment.rs +++ b/nexus/types/src/deployment.rs @@ -736,6 +736,17 @@ impl BlueprintZoneConfig { pub fn underlay_ip(&self) -> Ipv6Addr { self.zone_type.underlay_ip() } + + /// Returns the dataset used for the the zone's (transient) root filesystem. + pub fn filesystem_dataset(&self) -> Option { + let pool_name = self.filesystem_pool.clone()?; + let name = illumos_utils::zone::zone_name( + self.zone_type.kind().zone_prefix(), + Some(self.id), + ); + let kind = DatasetKind::TransientZone { name }; + Some(DatasetName::new(pool_name, kind)) + } } impl From for OmicronZoneConfig { @@ -917,6 +928,26 @@ pub enum BlueprintPhysicalDiskDisposition { Expunged, } +impl BlueprintPhysicalDiskDisposition { + /// Returns true if the disk disposition matches this filter. + pub fn matches(self, filter: DiskFilter) -> bool { + match self { + Self::InService => match filter { + DiskFilter::All => true, + DiskFilter::InService => true, + // TODO remove this variant? + DiskFilter::ExpungedButActive => false, + }, + Self::Expunged => match filter { + DiskFilter::All => true, + DiskFilter::InService => false, + // TODO remove this variant? + DiskFilter::ExpungedButActive => true, + }, + } + } +} + /// Information about an Omicron physical disk as recorded in a bluerprint. #[derive(Clone, Debug, Deserialize, Serialize, JsonSchema, PartialEq, Eq)] pub struct BlueprintPhysicalDiskConfig { From 9b662ea823d4899702fb3ee7799d843d440dd746 Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Fri, 6 Dec 2024 12:15:39 -0800 Subject: [PATCH 11/22] [nexus] Use retryable transactions more extensively (#7212) This PR finds spots where we use `transaction_async` and makes them use `transaction_retry_wrapper` instead. This means that under contention, we'll avoid wasting work, and can make use of CockroachDB's automated retry mechanisms. Additionally, this PR adds a clippy lint to help future usage avoid the "non-retryable" transaction variant. 
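As a rough illustration of the pattern being adopted (a sketch, not code lifted verbatim from this PR; the operation label and the `do_work` helper are placeholders), the retryable form looks roughly like:

```rust
// Sketch only: mirrors the shape used in the datastore changes below.
// `do_work` is a hypothetical transaction body; error plumbing follows the
// `OptionalError` helper from nexus-db-queries.
let err = OptionalError::new();
let conn = self.pool_connection_authorized(opctx).await?;
self.transaction_retry_wrapper("example_operation")
    .transaction(&conn, |conn| {
        let err = err.clone();
        async move {
            // Non-retryable failures are reported via `err.bail(..)` so the
            // wrapper can tell them apart from retryable serialization errors.
            do_work(&conn).await.map_err(|e| err.bail(e))?;
            Ok(())
        }
    })
    .await
    .map_err(|e| match err.take() {
        Some(err) => err.into(),
        None => public_error_from_diesel(e, ErrorHandler::Server),
    })?;
```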
There are some use cases where avoiding retries is still reasonable: 1. Test-only code 2. Transactions which have truly minimal contention, or which can fail with serialization errors without issue 3. Nested transactions --- clippy.toml | 6 + dev-tools/omdb/src/bin/omdb/db.rs | 2 + .../db-queries/src/db/datastore/deployment.rs | 35 +-- nexus/db-queries/src/db/datastore/dns.rs | 79 ++++--- .../db-queries/src/db/datastore/inventory.rs | 171 +++++++------- nexus/db-queries/src/db/datastore/ip_pool.rs | 141 +++++++----- nexus/db-queries/src/db/datastore/rack.rs | 5 +- .../src/db/datastore/region_replacement.rs | 211 +++++++++--------- .../datastore/region_snapshot_replacement.rs | 158 +++++++------ nexus/db-queries/src/db/datastore/role.rs | 5 + nexus/db-queries/src/db/datastore/saga.rs | 1 + nexus/db-queries/src/db/datastore/silo.rs | 26 ++- .../db-queries/src/db/datastore/silo_group.rs | 2 + .../db-queries/src/db/datastore/silo_user.rs | 18 +- nexus/db-queries/src/db/pagination.rs | 1 + .../region_snapshot_replacement_start.rs | 1 + 16 files changed, 478 insertions(+), 384 deletions(-) diff --git a/clippy.toml b/clippy.toml index ffa3ffac70..31e28d5911 100644 --- a/clippy.toml +++ b/clippy.toml @@ -10,4 +10,10 @@ disallowed-methods = [ # `IncompleteOnConflictExt::as_partial_index` in `nexus-db-queries`. # See the documentation of that method for more. "diesel::upsert::DecoratableTarget::filter_target", + + # This form of transaction is susceptible to serialization failures, + # and can fail spuriously. + # Instead, the "transaction_retry_wrapper" should be preferred, as it + # automatically retries transactions experiencing contention. + { path = "async_bb8_diesel::AsyncConnection::transaction_async", reason = "Prefer to use transaction_retry_wrapper, if possible. Feel free to override this for tests and nested transactions." }, ] diff --git a/dev-tools/omdb/src/bin/omdb/db.rs b/dev-tools/omdb/src/bin/omdb/db.rs index 4cccc3c23e..667a666375 100644 --- a/dev-tools/omdb/src/bin/omdb/db.rs +++ b/dev-tools/omdb/src/bin/omdb/db.rs @@ -14,6 +14,8 @@ // NOTE: emanates from Tabled macros #![allow(clippy::useless_vec)] +// NOTE: allowing "transaction_async" without retry +#![allow(clippy::disallowed_methods)] use crate::check_allow_destructive::DestructiveOperationToken; use crate::helpers::const_max_len; diff --git a/nexus/db-queries/src/db/datastore/deployment.rs b/nexus/db-queries/src/db/datastore/deployment.rs index 380a5c1b00..0c73ae1ae2 100644 --- a/nexus/db-queries/src/db/datastore/deployment.rs +++ b/nexus/db-queries/src/db/datastore/deployment.rs @@ -335,6 +335,11 @@ impl DataStore { // batch rather than making a bunch of round-trips to the database. // We'd do that if we had an interface for doing that with bound // parameters, etc. See oxidecomputer/omicron#973. + + // The risk of a serialization error is possible here, but low, + // as most of the operations should be insertions rather than in-place + // modifications of existing tables. + #[allow(clippy::disallowed_methods)] conn.transaction_async(|conn| async move { // Insert the row for the blueprint. { @@ -1087,6 +1092,7 @@ impl DataStore { // start removing it and we'd also need to make sure we didn't leak a // collection if we crash while deleting it. 
let conn = self.pool_connection_authorized(opctx).await?; + let err = OptionalError::new(); let ( nblueprints, @@ -1101,19 +1107,23 @@ impl DataStore { nclickhouse_cluster_configs, nclickhouse_keepers, nclickhouse_servers, - ) = conn - .transaction_async(|conn| async move { + ) = self.transaction_retry_wrapper("blueprint_delete") + .transaction(&conn, |conn| { + let err = err.clone(); + async move { // Ensure that blueprint we're about to delete is not the // current target. - let current_target = - Self::blueprint_current_target_only(&conn).await?; + let current_target = Self::blueprint_current_target_only(&conn) + .await + .map_err(|txn_err| txn_err.into_diesel(&err))?; + if current_target.target_id == blueprint_id { - return Err(TransactionError::CustomError( + return Err(err.bail(TransactionError::CustomError( Error::conflict(format!( "blueprint {blueprint_id} is the \ current target and cannot be deleted", )), - )); + ))); } // Remove the record describing the blueprint itself. @@ -1130,9 +1140,9 @@ impl DataStore { // references to it in any of the remaining tables either, since // deletion always goes through this transaction. if nblueprints == 0 { - return Err(TransactionError::CustomError( + return Err(err.bail(TransactionError::CustomError( authz_blueprint.not_found(), - )); + ))); } // Remove rows associated with sled states. @@ -1259,13 +1269,12 @@ impl DataStore { nclickhouse_keepers, nclickhouse_servers, )) + } }) .await - .map_err(|error| match error { - TransactionError::CustomError(e) => e, - TransactionError::Database(e) => { - public_error_from_diesel(e, ErrorHandler::Server) - } + .map_err(|e| match err.take() { + Some(err) => err.into(), + None => public_error_from_diesel(e, ErrorHandler::Server), })?; info!(&opctx.log, "removed blueprint"; diff --git a/nexus/db-queries/src/db/datastore/dns.rs b/nexus/db-queries/src/db/datastore/dns.rs index a691ce43aa..3f0f7828fa 100644 --- a/nexus/db-queries/src/db/datastore/dns.rs +++ b/nexus/db-queries/src/db/datastore/dns.rs @@ -19,6 +19,7 @@ use crate::db::pagination::paginated; use crate::db::pagination::Paginator; use crate::db::pool::DbConnection; use crate::db::TransactionError; +use crate::transaction_retry::OptionalError; use async_bb8_diesel::AsyncConnection; use async_bb8_diesel::AsyncRunQueryDsl; use diesel::prelude::*; @@ -363,40 +364,49 @@ impl DataStore { ) -> Result<(), Error> { opctx.authorize(authz::Action::Modify, &authz::DNS_CONFIG).await?; let conn = self.pool_connection_authorized(opctx).await?; - conn.transaction_async(|c| async move { - let zones = self - .dns_zones_list_all_on_connection(opctx, &c, update.dns_group) - .await?; - // This looks like a time-of-check-to-time-of-use race, but this - // approach works because we're inside a transaction and the - // isolation level is SERIALIZABLE. 
- let version = self - .dns_group_latest_version_conn(opctx, &c, update.dns_group) - .await?; - if version.version != old_version { - return Err(TransactionError::CustomError(Error::conflict( - format!( - "expected current DNS version to be {}, found {}", - *old_version, *version.version, - ), - ))); - } - self.dns_write_version_internal( - &c, - update, - zones, - Generation(old_version.next()), - ) + let err = OptionalError::new(); + + self.transaction_retry_wrapper("dns_update_from_version") + .transaction(&conn, |c| { + let err = err.clone(); + let update = update.clone(); + async move { + let zones = self + .dns_zones_list_all_on_connection(opctx, &c, update.dns_group) + .await + .map_err(|txn_error| txn_error.into_diesel(&err))?; + // This looks like a time-of-check-to-time-of-use race, but this + // approach works because we're inside a transaction and the + // isolation level is SERIALIZABLE. + let version = self + .dns_group_latest_version_conn(opctx, &c, update.dns_group) + .await + .map_err(|txn_error| txn_error.into_diesel(&err))?; + if version.version != old_version { + return Err(err.bail(TransactionError::CustomError(Error::conflict( + format!( + "expected current DNS version to be {}, found {}", + *old_version, *version.version, + ), + )))); + } + + self.dns_write_version_internal( + &c, + update, + zones, + Generation(old_version.next()), + ) + .await + .map_err(|txn_error| txn_error.into_diesel(&err)) + } + }) .await - }) - .await - .map_err(|e| match e { - TransactionError::CustomError(e) => e, - TransactionError::Database(e) => { - public_error_from_diesel(e, ErrorHandler::Server) - } - }) + .map_err(|e| match err.take() { + Some(err) => err.into(), + None => public_error_from_diesel(e, ErrorHandler::Server), + }) } /// Update the configuration of a DNS zone as specified in `update` @@ -441,6 +451,9 @@ impl DataStore { .dns_zones_list_all_on_connection(opctx, conn, update.dns_group) .await?; + // This method is used in nested transactions, which are not supported + // with retryable transactions. + #[allow(clippy::disallowed_methods)] conn.transaction_async(|c| async move { let version = self .dns_group_latest_version_conn(opctx, conn, update.dns_group) @@ -1724,6 +1737,8 @@ mod test { let cds = datastore.clone(); let copctx = opctx.child(std::collections::BTreeMap::new()); + + #[allow(clippy::disallowed_methods)] let mut fut = conn1 .transaction_async(|c1| async move { cds.dns_update_incremental(&copctx, &c1, update1) diff --git a/nexus/db-queries/src/db/datastore/inventory.rs b/nexus/db-queries/src/db/datastore/inventory.rs index a6e2a6cf2a..9269b233f3 100644 --- a/nexus/db-queries/src/db/datastore/inventory.rs +++ b/nexus/db-queries/src/db/datastore/inventory.rs @@ -11,7 +11,6 @@ use crate::db::error::public_error_from_diesel_lookup; use crate::db::error::ErrorHandler; use crate::db::pagination::{paginated, paginated_multicolumn, Paginator}; use crate::db::queries::ALLOW_FULL_TABLE_SCAN_SQL; -use crate::db::TransactionError; use anyhow::Context; use async_bb8_diesel::AsyncConnection; use async_bb8_diesel::AsyncRunQueryDsl; @@ -280,6 +279,11 @@ impl DataStore { // We'd do that if we had an interface for doing that with bound // parameters, etc. See oxidecomputer/omicron#973. let pool = self.pool_connection_authorized(opctx).await?; + + // The risk of a serialization error is possible here, but low, + // as most of the operations should be insertions rather than in-place + // modifications of existing tables. 
+ #[allow(clippy::disallowed_methods)] pool.transaction_async(|conn| async move { // Insert records (and generate ids) for any baseboards that do not // already exist in the database. These rows are not scoped to a @@ -1242,6 +1246,7 @@ impl DataStore { // collection if we crash while deleting it. let conn = self.pool_connection_authorized(opctx).await?; let db_collection_id = to_db_typed_uuid(collection_id); + let ( ncollections, nsps, @@ -1258,22 +1263,22 @@ impl DataStore { nzpools, nerrors, nclickhouse_keeper_membership, - ) = conn - .transaction_async(|conn| async move { - // Remove the record describing the collection itself. - let ncollections = { - use db::schema::inv_collection::dsl; - diesel::delete( - dsl::inv_collection - .filter(dsl::id.eq(db_collection_id)), - ) - .execute_async(&conn) - .await? - }; + ) = + self.transaction_retry_wrapper("inventory_delete_collection") + .transaction(&conn, |conn| async move { + // Remove the record describing the collection itself. + let ncollections = { + use db::schema::inv_collection::dsl; + diesel::delete( + dsl::inv_collection + .filter(dsl::id.eq(db_collection_id)), + ) + .execute_async(&conn) + .await? + }; - // Remove rows for service processors. - let nsps = - { + // Remove rows for service processors. + let nsps = { use db::schema::inv_service_processor::dsl; diesel::delete(dsl::inv_service_processor.filter( dsl::inv_collection_id.eq(db_collection_id), @@ -1282,9 +1287,8 @@ impl DataStore { .await? }; - // Remove rows for roots of trust. - let nrots = - { + // Remove rows for roots of trust. + let nrots = { use db::schema::inv_root_of_trust::dsl; diesel::delete(dsl::inv_root_of_trust.filter( dsl::inv_collection_id.eq(db_collection_id), @@ -1293,9 +1297,8 @@ impl DataStore { .await? }; - // Remove rows for cabooses found. - let ncabooses = - { + // Remove rows for cabooses found. + let ncabooses = { use db::schema::inv_caboose::dsl; diesel::delete(dsl::inv_caboose.filter( dsl::inv_collection_id.eq(db_collection_id), @@ -1304,9 +1307,8 @@ impl DataStore { .await? }; - // Remove rows for root of trust pages found. - let nrot_pages = - { + // Remove rows for root of trust pages found. + let nrot_pages = { use db::schema::inv_root_of_trust_page::dsl; diesel::delete(dsl::inv_root_of_trust_page.filter( dsl::inv_collection_id.eq(db_collection_id), @@ -1315,9 +1317,8 @@ impl DataStore { .await? }; - // Remove rows for sled agents found. - let nsled_agents = - { + // Remove rows for sled agents found. + let nsled_agents = { use db::schema::inv_sled_agent::dsl; diesel::delete(dsl::inv_sled_agent.filter( dsl::inv_collection_id.eq(db_collection_id), @@ -1326,9 +1327,8 @@ impl DataStore { .await? }; - // Remove rows for datasets - let ndatasets = - { + // Remove rows for datasets + let ndatasets = { use db::schema::inv_dataset::dsl; diesel::delete(dsl::inv_dataset.filter( dsl::inv_collection_id.eq(db_collection_id), @@ -1337,9 +1337,8 @@ impl DataStore { .await? }; - // Remove rows for physical disks found. - let nphysical_disks = - { + // Remove rows for physical disks found. + let nphysical_disks = { use db::schema::inv_physical_disk::dsl; diesel::delete(dsl::inv_physical_disk.filter( dsl::inv_collection_id.eq(db_collection_id), @@ -1348,9 +1347,8 @@ impl DataStore { .await? }; - // Remove rows for NVMe physical disk firmware found. - let nnvme_disk_firwmare = - { + // Remove rows for NVMe physical disk firmware found. 
+ let nnvme_disk_firwmare = { use db::schema::inv_nvme_disk_firmware::dsl; diesel::delete(dsl::inv_nvme_disk_firmware.filter( dsl::inv_collection_id.eq(db_collection_id), @@ -1359,9 +1357,8 @@ impl DataStore { .await? }; - // Remove rows associated with Omicron zones - let nsled_agent_zones = - { + // Remove rows associated with Omicron zones + let nsled_agent_zones = { use db::schema::inv_sled_omicron_zones::dsl; diesel::delete(dsl::inv_sled_omicron_zones.filter( dsl::inv_collection_id.eq(db_collection_id), @@ -1370,8 +1367,7 @@ impl DataStore { .await? }; - let nzones = - { + let nzones = { use db::schema::inv_omicron_zone::dsl; diesel::delete(dsl::inv_omicron_zone.filter( dsl::inv_collection_id.eq(db_collection_id), @@ -1380,8 +1376,7 @@ impl DataStore { .await? }; - let nnics = - { + let nnics = { use db::schema::inv_omicron_zone_nic::dsl; diesel::delete(dsl::inv_omicron_zone_nic.filter( dsl::inv_collection_id.eq(db_collection_id), @@ -1390,8 +1385,7 @@ impl DataStore { .await? }; - let nzpools = - { + let nzpools = { use db::schema::inv_zpool::dsl; diesel::delete(dsl::inv_zpool.filter( dsl::inv_collection_id.eq(db_collection_id), @@ -1400,9 +1394,8 @@ impl DataStore { .await? }; - // Remove rows for errors encountered. - let nerrors = - { + // Remove rows for errors encountered. + let nerrors = { use db::schema::inv_collection_error::dsl; diesel::delete(dsl::inv_collection_error.filter( dsl::inv_collection_id.eq(db_collection_id), @@ -1411,43 +1404,40 @@ impl DataStore { .await? }; - // Remove rows for clickhouse keeper membership - let nclickhouse_keeper_membership = { - use db::schema::inv_clickhouse_keeper_membership::dsl; - diesel::delete( - dsl::inv_clickhouse_keeper_membership.filter( - dsl::inv_collection_id.eq(db_collection_id), - ), - ) - .execute_async(&conn) - .await? - }; - - Ok(( - ncollections, - nsps, - nrots, - ncabooses, - nrot_pages, - nsled_agents, - ndatasets, - nphysical_disks, - nnvme_disk_firwmare, - nsled_agent_zones, - nzones, - nnics, - nzpools, - nerrors, - nclickhouse_keeper_membership, - )) - }) - .await - .map_err(|error| match error { - TransactionError::CustomError(e) => e, - TransactionError::Database(e) => { - public_error_from_diesel(e, ErrorHandler::Server) - } - })?; + // Remove rows for clickhouse keeper membership + let nclickhouse_keeper_membership = { + use db::schema::inv_clickhouse_keeper_membership::dsl; + diesel::delete( + dsl::inv_clickhouse_keeper_membership.filter( + dsl::inv_collection_id.eq(db_collection_id), + ), + ) + .execute_async(&conn) + .await? + }; + + Ok(( + ncollections, + nsps, + nrots, + ncabooses, + nrot_pages, + nsled_agents, + ndatasets, + nphysical_disks, + nnvme_disk_firwmare, + nsled_agent_zones, + nzones, + nnics, + nzpools, + nerrors, + nclickhouse_keeper_membership, + )) + }) + .await + .map_err(|error| { + public_error_from_diesel(error, ErrorHandler::Server) + })?; info!(&opctx.log, "removed inventory collection"; "collection_id" => collection_id.to_string(), @@ -2429,6 +2419,9 @@ impl DataStoreInventoryTest for DataStore { .pool_connection_for_tests() .await .context("getting connection")?; + + // This transaction is used by tests, and does not need to retry. + #[allow(clippy::disallowed_methods)] conn.transaction_async(|conn| async move { conn.batch_execute_async(ALLOW_FULL_TABLE_SCAN_SQL) .await @@ -2484,6 +2477,8 @@ mod test { impl CollectionCounts { async fn new(conn: &DataStoreConnection) -> anyhow::Result { + // This transaction is used by tests, and does not need to retry. 
+ #[allow(clippy::disallowed_methods)] conn.transaction_async(|conn| async move { conn.batch_execute_async(ALLOW_FULL_TABLE_SCAN_SQL) .await @@ -2933,6 +2928,8 @@ mod test { .expect("failed to delete collection"); assert!(datastore.inventory_collections().await.unwrap().is_empty()); + // This transaction is used by tests, and does not need to retry. + #[allow(clippy::disallowed_methods)] conn.transaction_async(|conn| async move { conn.batch_execute_async(ALLOW_FULL_TABLE_SCAN_SQL).await.unwrap(); let count = schema::inv_collection::dsl::inv_collection @@ -3055,6 +3052,8 @@ mod test { bail!("Tables missing from information_schema query"); } + // This transaction is used by tests, and does not need to retry. + #[allow(clippy::disallowed_methods)] conn.transaction_async(|conn| async move { // We need this to call "COUNT(*)" below. conn.batch_execute_async(ALLOW_FULL_TABLE_SCAN_SQL) diff --git a/nexus/db-queries/src/db/datastore/ip_pool.rs b/nexus/db-queries/src/db/datastore/ip_pool.rs index 9548003ee5..2409839eb4 100644 --- a/nexus/db-queries/src/db/datastore/ip_pool.rs +++ b/nexus/db-queries/src/db/datastore/ip_pool.rs @@ -30,7 +30,7 @@ use crate::db::pagination::Paginator; use crate::db::pool::DbConnection; use crate::db::queries::ip_pool::FilterOverlappingIpRanges; use crate::db::TransactionError; -use async_bb8_diesel::AsyncConnection; +use crate::transaction_retry::OptionalError; use async_bb8_diesel::AsyncRunQueryDsl; use chrono::Utc; use diesel::prelude::*; @@ -722,67 +722,90 @@ impl DataStore { } type TxnError = TransactionError; - conn.transaction_async(|conn| async move { - // note this is matching the specified silo, but could be any pool - let existing_default_for_silo = dsl::ip_pool_resource - .filter(dsl::resource_type.eq(IpPoolResourceType::Silo)) - .filter(dsl::resource_id.eq(silo_id)) - .filter(dsl::is_default.eq(true)) - .select(IpPoolResource::as_select()) - .get_result_async(&conn) - .await; - - // if there is an existing default, we need to unset it before we can - // set the new default - if let Ok(existing_default) = existing_default_for_silo { - // if the pool we're making default is already default for this - // silo, don't error: just noop - if existing_default.ip_pool_id == ip_pool_id { - return Ok(existing_default); - } + let err = OptionalError::new(); + + self.transaction_retry_wrapper("ip_pool_set_default") + .transaction(&conn, |conn| { + let err = err.clone(); + async move { + // note this is matching the specified silo, but could be any pool + let existing_default_for_silo = dsl::ip_pool_resource + .filter(dsl::resource_type.eq(IpPoolResourceType::Silo)) + .filter(dsl::resource_id.eq(silo_id)) + .filter(dsl::is_default.eq(true)) + .select(IpPoolResource::as_select()) + .get_result_async(&conn) + .await; + + // if there is an existing default, we need to unset it before we can + // set the new default + if let Ok(existing_default) = existing_default_for_silo { + // if the pool we're making default is already default for this + // silo, don't error: just noop + if existing_default.ip_pool_id == ip_pool_id { + return Ok(existing_default); + } + + let unset_default = + diesel::update(dsl::ip_pool_resource) + .filter( + dsl::resource_id + .eq(existing_default.resource_id), + ) + .filter( + dsl::ip_pool_id + .eq(existing_default.ip_pool_id), + ) + .filter( + dsl::resource_type + .eq(existing_default.resource_type), + ) + .set(dsl::is_default.eq(false)) + .execute_async(&conn) + .await; + if let Err(e) = unset_default { + return Err(err.bail(TxnError::CustomError( + 
IpPoolResourceUpdateError::FailedToUnsetDefault( + e, + ), + ))); + } + } - let unset_default = diesel::update(dsl::ip_pool_resource) - .filter(dsl::resource_id.eq(existing_default.resource_id)) - .filter(dsl::ip_pool_id.eq(existing_default.ip_pool_id)) - .filter( - dsl::resource_type.eq(existing_default.resource_type), + let updated_link = diesel::update(dsl::ip_pool_resource) + .filter(dsl::resource_id.eq(silo_id)) + .filter(dsl::ip_pool_id.eq(ip_pool_id)) + .filter(dsl::resource_type.eq(IpPoolResourceType::Silo)) + .set(dsl::is_default.eq(true)) + .returning(IpPoolResource::as_returning()) + .get_result_async(&conn) + .await?; + Ok(updated_link) + } + }) + .await + .map_err(|e| match err.take() { + Some(TxnError::CustomError( + IpPoolResourceUpdateError::FailedToUnsetDefault(err), + )) => public_error_from_diesel(err, ErrorHandler::Server), + Some(TxnError::Database(err)) => { + public_error_from_diesel(err, ErrorHandler::Server) + } + None => { + public_error_from_diesel( + e, + ErrorHandler::NotFoundByLookup( + ResourceType::IpPoolResource, + // TODO: would be nice to put the actual names and/or ids in + // here but LookupType on each of the two silos doesn't have + // a nice to_string yet or a way of composing them + LookupType::ByCompositeId( + "(pool, silo)".to_string(), + ), + ), ) - .set(dsl::is_default.eq(false)) - .execute_async(&conn) - .await; - if let Err(e) = unset_default { - return Err(TxnError::CustomError( - IpPoolResourceUpdateError::FailedToUnsetDefault(e), - )); } - } - - let updated_link = diesel::update(dsl::ip_pool_resource) - .filter(dsl::resource_id.eq(silo_id)) - .filter(dsl::ip_pool_id.eq(ip_pool_id)) - .filter(dsl::resource_type.eq(IpPoolResourceType::Silo)) - .set(dsl::is_default.eq(true)) - .returning(IpPoolResource::as_returning()) - .get_result_async(&conn) - .await?; - Ok(updated_link) - }) - .await - .map_err(|e| match e { - TransactionError::CustomError( - IpPoolResourceUpdateError::FailedToUnsetDefault(e), - ) => public_error_from_diesel(e, ErrorHandler::Server), - TransactionError::Database(e) => public_error_from_diesel( - e, - ErrorHandler::NotFoundByLookup( - ResourceType::IpPoolResource, - // TODO: would be nice to put the actual names and/or ids in - // here but LookupType on each of the two silos doesn't have - // a nice to_string yet or a way of composing them - LookupType::ByCompositeId("(pool, silo)".to_string()), - ), - ), - }) + }) } /// Ephemeral and snat IPs are associated with a silo through an instance, diff --git a/nexus/db-queries/src/db/datastore/rack.rs b/nexus/db-queries/src/db/datastore/rack.rs index 74b3440a7d..dc3175c22d 100644 --- a/nexus/db-queries/src/db/datastore/rack.rs +++ b/nexus/db-queries/src/db/datastore/rack.rs @@ -674,8 +674,9 @@ impl DataStore { let log = opctx.log.clone(); let err = Arc::new(OnceLock::new()); - // NOTE: This transaction cannot yet be made retryable, as it uses - // nested transactions. + // This method uses nested transactions, which are not supported + // with retryable transactions. + #[allow(clippy::disallowed_methods)] let rack = self .pool_connection_authorized(opctx) .await? 
diff --git a/nexus/db-queries/src/db/datastore/region_replacement.rs b/nexus/db-queries/src/db/datastore/region_replacement.rs index de047d6d0c..0fda6b46ba 100644 --- a/nexus/db-queries/src/db/datastore/region_replacement.rs +++ b/nexus/db-queries/src/db/datastore/region_replacement.rs @@ -21,7 +21,7 @@ use crate::db::pagination::Paginator; use crate::db::update_and_check::UpdateAndCheck; use crate::db::update_and_check::UpdateStatus; use crate::db::TransactionError; -use async_bb8_diesel::AsyncConnection; +use crate::transaction_retry::OptionalError; use async_bb8_diesel::AsyncRunQueryDsl; use diesel::prelude::*; use omicron_common::api::external::Error; @@ -52,21 +52,28 @@ impl DataStore { opctx: &OpContext, request: RegionReplacement, ) -> Result<(), Error> { - self.pool_connection_authorized(opctx) - .await? - .transaction_async(|conn| async move { - use db::schema::region_replacement::dsl; + let conn = self.pool_connection_authorized(opctx).await?; - Self::volume_repair_insert_query(request.volume_id, request.id) - .execute_async(&conn) - .await?; + self.transaction_retry_wrapper("insert_region_replacement_request") + .transaction(&conn, |conn| { + let request = request.clone(); + async move { + use db::schema::region_replacement::dsl; - diesel::insert_into(dsl::region_replacement) - .values(request) + Self::volume_repair_insert_query( + request.volume_id, + request.id, + ) .execute_async(&conn) .await?; - Ok(()) + diesel::insert_into(dsl::region_replacement) + .values(request) + .execute_async(&conn) + .await?; + + Ok(()) + } }) .await .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server)) @@ -666,60 +673,62 @@ impl DataStore { ) -> Result<(), Error> { type TxnError = TransactionError; - self.pool_connection_authorized(opctx) - .await? 
- .transaction_async(|conn| async move { - Self::volume_repair_delete_query( - request.volume_id, - request.id, - ) - .execute_async(&conn) - .await?; - - use db::schema::region_replacement::dsl; - - let result = diesel::update(dsl::region_replacement) - .filter(dsl::id.eq(request.id)) - .filter( - dsl::replacement_state.eq(RegionReplacementState::Completing), + let err = OptionalError::new(); + let conn = self.pool_connection_authorized(opctx).await?; + + self.transaction_retry_wrapper("set_region_replacement_complete") + .transaction(&conn, |conn| { + let err = err.clone(); + async move { + Self::volume_repair_delete_query( + request.volume_id, + request.id, ) - .filter(dsl::operating_saga_id.eq(operating_saga_id)) - .set(( - dsl::replacement_state.eq(RegionReplacementState::Complete), - dsl::operating_saga_id.eq(Option::::None), - )) - .check_if_exists::(request.id) - .execute_and_check(&conn) + .execute_async(&conn) .await?; - match result.status { - UpdateStatus::Updated => Ok(()), - UpdateStatus::NotUpdatedButExists => { - let record = result.found; - - if record.operating_saga_id == None - && record.replacement_state - == RegionReplacementState::Complete - { - Ok(()) - } else { - Err(TxnError::CustomError(Error::conflict(format!( - "region replacement {} set to {:?} (operating saga id {:?})", - request.id, - record.replacement_state, - record.operating_saga_id, - )))) + use db::schema::region_replacement::dsl; + + let result = diesel::update(dsl::region_replacement) + .filter(dsl::id.eq(request.id)) + .filter( + dsl::replacement_state.eq(RegionReplacementState::Completing), + ) + .filter(dsl::operating_saga_id.eq(operating_saga_id)) + .set(( + dsl::replacement_state.eq(RegionReplacementState::Complete), + dsl::operating_saga_id.eq(Option::::None), + )) + .check_if_exists::(request.id) + .execute_and_check(&conn) + .await?; + + match result.status { + UpdateStatus::Updated => Ok(()), + UpdateStatus::NotUpdatedButExists => { + let record = result.found; + + if record.operating_saga_id == None + && record.replacement_state + == RegionReplacementState::Complete + { + Ok(()) + } else { + Err(err.bail(TxnError::from(Error::conflict(format!( + "region replacement {} set to {:?} (operating saga id {:?})", + request.id, + record.replacement_state, + record.operating_saga_id, + ))))) + } } } } }) .await - .map_err(|e| match e { - TxnError::CustomError(error) => error, - - TxnError::Database(error) => { - public_error_from_diesel(error, ErrorHandler::Server) - } + .map_err(|e| match err.take() { + Some(err) => err.into(), + None => public_error_from_diesel(e, ErrorHandler::Server), }) } @@ -738,57 +747,59 @@ impl DataStore { RegionReplacementState::Requested, ); - self.pool_connection_authorized(opctx) - .await? 
- .transaction_async(|conn| async move { - Self::volume_repair_delete_query( - request.volume_id, - request.id, - ) - .execute_async(&conn) - .await?; - - use db::schema::region_replacement::dsl; - - let result = diesel::update(dsl::region_replacement) - .filter(dsl::id.eq(request.id)) - .filter( - dsl::replacement_state.eq(RegionReplacementState::Requested), + let err = OptionalError::new(); + let conn = self.pool_connection_authorized(opctx).await?; + + self.transaction_retry_wrapper("set_region_replacement_complete_from_requested") + .transaction(&conn, |conn| { + let err = err.clone(); + async move { + Self::volume_repair_delete_query( + request.volume_id, + request.id, ) - .filter(dsl::operating_saga_id.is_null()) - .set(( - dsl::replacement_state.eq(RegionReplacementState::Complete), - )) - .check_if_exists::(request.id) - .execute_and_check(&conn) + .execute_async(&conn) .await?; - match result.status { - UpdateStatus::Updated => Ok(()), - - UpdateStatus::NotUpdatedButExists => { - let record = result.found; - - if record.replacement_state == RegionReplacementState::Complete { - Ok(()) - } else { - Err(TxnError::CustomError(Error::conflict(format!( - "region replacement {} set to {:?} (operating saga id {:?})", - request.id, - record.replacement_state, - record.operating_saga_id, - )))) + use db::schema::region_replacement::dsl; + + let result = diesel::update(dsl::region_replacement) + .filter(dsl::id.eq(request.id)) + .filter( + dsl::replacement_state.eq(RegionReplacementState::Requested), + ) + .filter(dsl::operating_saga_id.is_null()) + .set(( + dsl::replacement_state.eq(RegionReplacementState::Complete), + )) + .check_if_exists::(request.id) + .execute_and_check(&conn) + .await?; + + match result.status { + UpdateStatus::Updated => Ok(()), + + UpdateStatus::NotUpdatedButExists => { + let record = result.found; + + if record.replacement_state == RegionReplacementState::Complete { + Ok(()) + } else { + Err(err.bail(TxnError::from(Error::conflict(format!( + "region replacement {} set to {:?} (operating saga id {:?})", + request.id, + record.replacement_state, + record.operating_saga_id, + ))))) + } } } } }) .await - .map_err(|e| match e { - TxnError::CustomError(error) => error, - - TxnError::Database(error) => { - public_error_from_diesel(error, ErrorHandler::Server) - } + .map_err(|e| match err.take() { + Some(err) => err.into(), + None => public_error_from_diesel(e, ErrorHandler::Server), }) } diff --git a/nexus/db-queries/src/db/datastore/region_snapshot_replacement.rs b/nexus/db-queries/src/db/datastore/region_snapshot_replacement.rs index 4faaf228f9..76a83cca2a 100644 --- a/nexus/db-queries/src/db/datastore/region_snapshot_replacement.rs +++ b/nexus/db-queries/src/db/datastore/region_snapshot_replacement.rs @@ -22,7 +22,7 @@ use crate::db::pagination::Paginator; use crate::db::update_and_check::UpdateAndCheck; use crate::db::update_and_check::UpdateStatus; use crate::db::TransactionError; -use async_bb8_diesel::AsyncConnection; +use crate::transaction_retry::OptionalError; use async_bb8_diesel::AsyncRunQueryDsl; use diesel::prelude::*; use omicron_common::api::external::Error; @@ -93,9 +93,14 @@ impl DataStore { request: RegionSnapshotReplacement, volume_id: Uuid, ) -> Result<(), Error> { - self.pool_connection_authorized(opctx) - .await? 
- .transaction_async(|conn| async move { + let conn = self.pool_connection_authorized(opctx).await?; + + self.transaction_retry_wrapper( + "insert_region_snapshot_replacement_request_with_volume_id", + ) + .transaction(&conn, |conn| { + let request = request.clone(); + async move { use db::schema::region_snapshot_replacement::dsl; use db::schema::volume_repair::dsl as volume_repair_dsl; @@ -116,9 +121,10 @@ impl DataStore { .await?; Ok(()) - }) - .await - .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server)) + } + }) + .await + .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server)) } pub async fn get_region_snapshot_replacement_request_by_id( @@ -563,67 +569,69 @@ impl DataStore { ) -> Result<(), Error> { type TxnError = TransactionError; - self.pool_connection_authorized(opctx) - .await? - .transaction_async(|conn| async move { - use db::schema::volume_repair::dsl as volume_repair_dsl; - - diesel::delete( - volume_repair_dsl::volume_repair.filter( - volume_repair_dsl::repair_id - .eq(region_snapshot_replacement_id), - ), - ) - .execute_async(&conn) - .await?; - - use db::schema::region_snapshot_replacement::dsl; + let err = OptionalError::new(); + let conn = self.pool_connection_authorized(opctx).await?; - let result = diesel::update(dsl::region_snapshot_replacement) - .filter(dsl::id.eq(region_snapshot_replacement_id)) - .filter( - dsl::replacement_state - .eq(RegionSnapshotReplacementState::Running), + self.transaction_retry_wrapper("set_region_snapshot_replacement_complete") + .transaction(&conn, |conn| { + let err = err.clone(); + async move { + use db::schema::volume_repair::dsl as volume_repair_dsl; + + diesel::delete( + volume_repair_dsl::volume_repair.filter( + volume_repair_dsl::repair_id + .eq(region_snapshot_replacement_id), + ), ) - .filter(dsl::operating_saga_id.is_null()) - .set((dsl::replacement_state - .eq(RegionSnapshotReplacementState::Complete),)) - .check_if_exists::( - region_snapshot_replacement_id, - ) - .execute_and_check(&conn) + .execute_async(&conn) .await?; - match result.status { - UpdateStatus::Updated => Ok(()), - UpdateStatus::NotUpdatedButExists => { - let record = result.found; + use db::schema::region_snapshot_replacement::dsl; - if record.replacement_state - == RegionSnapshotReplacementState::Complete - { - Ok(()) - } else { - Err(TxnError::CustomError(Error::conflict( - format!( - "region snapshot replacement {} set to {:?} \ - (operating saga id {:?})", - region_snapshot_replacement_id, - record.replacement_state, - record.operating_saga_id, - ), - ))) + let result = diesel::update(dsl::region_snapshot_replacement) + .filter(dsl::id.eq(region_snapshot_replacement_id)) + .filter( + dsl::replacement_state + .eq(RegionSnapshotReplacementState::Running), + ) + .filter(dsl::operating_saga_id.is_null()) + .set((dsl::replacement_state + .eq(RegionSnapshotReplacementState::Complete),)) + .check_if_exists::( + region_snapshot_replacement_id, + ) + .execute_and_check(&conn) + .await?; + + match result.status { + UpdateStatus::Updated => Ok(()), + UpdateStatus::NotUpdatedButExists => { + let record = result.found; + + if record.replacement_state + == RegionSnapshotReplacementState::Complete + { + Ok(()) + } else { + Err(err.bail(TxnError::from(Error::conflict( + format!( + "region snapshot replacement {} set to {:?} \ + (operating saga id {:?})", + region_snapshot_replacement_id, + record.replacement_state, + record.operating_saga_id, + ), + )))) + } } } } }) .await - .map_err(|e| match e { - TxnError::CustomError(error) => error, - - 
TxnError::Database(error) => { - public_error_from_diesel(error, ErrorHandler::Server) - } + .map_err(|e| match err.take() { + Some(err) => err.into(), + None => public_error_from_diesel(e, ErrorHandler::Server), }) } @@ -893,9 +901,15 @@ impl DataStore { ) -> Result<(), Error> { type TxnError = TransactionError; - self.pool_connection_authorized(opctx) - .await? - .transaction_async(|conn| async move { + let err = OptionalError::new(); + let conn = self.pool_connection_authorized(opctx).await?; + + self.transaction_retry_wrapper( + "set_region_snapshot_replacement_step_complete", + ) + .transaction(&conn, |conn| { + let err = err.clone(); + async move { use db::schema::volume_repair::dsl as volume_repair_dsl; diesel::delete( @@ -943,27 +957,25 @@ impl DataStore { { Ok(()) } else { - Err(TxnError::CustomError(Error::conflict( + Err(err.bail(TxnError::from(Error::conflict( format!( "region snapshot replacement step {} set \ - to {:?} (operating saga id {:?})", + to {:?} (operating saga id {:?})", region_snapshot_replacement_step_id, record.replacement_state, record.operating_saga_id, ), - ))) + )))) } } } - }) - .await - .map_err(|e| match e { - TxnError::CustomError(error) => error, - - TxnError::Database(error) => { - public_error_from_diesel(error, ErrorHandler::Server) - } - }) + } + }) + .await + .map_err(|e| match err.take() { + Some(err) => err.into(), + None => public_error_from_diesel(e, ErrorHandler::Server), + }) } /// Count all in-progress region snapshot replacement steps for a particular diff --git a/nexus/db-queries/src/db/datastore/role.rs b/nexus/db-queries/src/db/datastore/role.rs index b91597ad1d..ed8ec6fcd9 100644 --- a/nexus/db-queries/src/db/datastore/role.rs +++ b/nexus/db-queries/src/db/datastore/role.rs @@ -209,6 +209,11 @@ impl DataStore { // We might instead want to first-class the idea of Policies in the // database so that we can build up a whole new Policy in batches and // then flip the resource over to using it. + + // This method should probably be retryable, but this is slightly + // complicated by the cloning semantics of the queries, which + // must be Clone to be retried. + #[allow(clippy::disallowed_methods)] self.pool_connection_authorized(opctx) .await? .transaction_async(|conn| async move { diff --git a/nexus/db-queries/src/db/datastore/saga.rs b/nexus/db-queries/src/db/datastore/saga.rs index 4bc212e997..87d94e2377 100644 --- a/nexus/db-queries/src/db/datastore/saga.rs +++ b/nexus/db-queries/src/db/datastore/saga.rs @@ -654,6 +654,7 @@ mod test { .expect("failed to re-assign sagas"); // Fetch all the sagas and check their states. + #[allow(clippy::disallowed_methods)] let all_sagas: Vec<_> = datastore .pool_connection_for_tests() .await diff --git a/nexus/db-queries/src/db/datastore/silo.rs b/nexus/db-queries/src/db/datastore/silo.rs index 2b7afa3270..b862f3c461 100644 --- a/nexus/db-queries/src/db/datastore/silo.rs +++ b/nexus/db-queries/src/db/datastore/silo.rs @@ -67,10 +67,11 @@ impl DataStore { use db::schema::silo::dsl; use db::schema::silo_quotas::dsl as quotas_dsl; + let conn = self.pool_connection_authorized(opctx).await?; + let count = self - .pool_connection_authorized(opctx) - .await? 
- .transaction_async(|conn| async move { + .transaction_retry_wrapper("load_builtin_silos") + .transaction(&conn, |conn| async move { diesel::insert_into(quotas_dsl::silo_quotas) .values(SiloQuotas::arbitrarily_high_default( DEFAULT_SILO.id(), @@ -78,19 +79,17 @@ impl DataStore { .on_conflict(quotas_dsl::silo_id) .do_nothing() .execute_async(&conn) - .await - .map_err(TransactionError::CustomError) - .unwrap(); - diesel::insert_into(dsl::silo) + .await?; + let count = diesel::insert_into(dsl::silo) .values([&*DEFAULT_SILO, &*INTERNAL_SILO]) .on_conflict(dsl::id) .do_nothing() .execute_async(&conn) - .await - .map_err(TransactionError::CustomError) + .await?; + Ok(count) }) .await - .unwrap(); + .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server))?; info!(opctx.log, "created {} built-in silos", count); @@ -226,6 +225,9 @@ impl DataStore { None }; + // This method uses nested transactions, which are not supported + // with retryable transactions. + #[allow(clippy::disallowed_methods)] let silo = conn .transaction_async(|conn| async move { let silo = silo_create_query @@ -424,6 +426,10 @@ impl DataStore { let now = Utc::now(); type TxnError = TransactionError; + + // This method uses nested transactions, which are not supported + // with retryable transactions. + #[allow(clippy::disallowed_methods)] conn.transaction_async(|conn| async move { let updated_rows = diesel::update(silo::dsl::silo) .filter(silo::dsl::time_deleted.is_null()) diff --git a/nexus/db-queries/src/db/datastore/silo_group.rs b/nexus/db-queries/src/db/datastore/silo_group.rs index b8ef759116..e6168f4e42 100644 --- a/nexus/db-queries/src/db/datastore/silo_group.rs +++ b/nexus/db-queries/src/db/datastore/silo_group.rs @@ -199,6 +199,8 @@ impl DataStore { let group_id = authz_silo_group.id(); + // Prefer to use "transaction_retry_wrapper" + #[allow(clippy::disallowed_methods)] self.pool_connection_authorized(opctx) .await? .transaction_async(|conn| async move { diff --git a/nexus/db-queries/src/db/datastore/silo_user.rs b/nexus/db-queries/src/db/datastore/silo_user.rs index 2825e2a310..40f6b3f0be 100644 --- a/nexus/db-queries/src/db/datastore/silo_user.rs +++ b/nexus/db-queries/src/db/datastore/silo_user.rs @@ -21,7 +21,6 @@ use crate::db::model::UserBuiltin; use crate::db::model::UserProvisionType; use crate::db::pagination::paginated; use crate::db::update_and_check::UpdateAndCheck; -use async_bb8_diesel::AsyncConnection; use async_bb8_diesel::AsyncRunQueryDsl; use chrono::Utc; use diesel::prelude::*; @@ -92,9 +91,10 @@ impl DataStore { // TODO-robustness We might consider the RFD 192 "rcgen" pattern as well // so that people can't, say, login while we do this. let authz_silo_user_id = authz_silo_user.id(); - self.pool_connection_authorized(opctx) - .await? - .transaction_async(|mut conn| async move { + + let conn = self.pool_connection_authorized(opctx).await?; + self.transaction_retry_wrapper("silo_user_delete") + .transaction(&conn, |conn| async move { // Delete the user record. 
{ use db::schema::silo_user::dsl; @@ -103,7 +103,7 @@ impl DataStore { .filter(dsl::time_deleted.is_null()) .set(dsl::time_deleted.eq(Utc::now())) .check_if_exists::(authz_silo_user_id) - .execute_and_check(&mut conn) + .execute_and_check(&conn) .await?; } @@ -112,7 +112,7 @@ impl DataStore { use db::schema::console_session::dsl; diesel::delete(dsl::console_session) .filter(dsl::silo_user_id.eq(authz_silo_user_id)) - .execute_async(&mut conn) + .execute_async(&conn) .await?; } @@ -121,7 +121,7 @@ impl DataStore { use db::schema::device_access_token::dsl; diesel::delete(dsl::device_access_token) .filter(dsl::silo_user_id.eq(authz_silo_user_id)) - .execute_async(&mut conn) + .execute_async(&conn) .await?; } @@ -130,7 +130,7 @@ impl DataStore { use db::schema::silo_group_membership::dsl; diesel::delete(dsl::silo_group_membership) .filter(dsl::silo_user_id.eq(authz_silo_user_id)) - .execute_async(&mut conn) + .execute_async(&conn) .await?; } @@ -141,7 +141,7 @@ impl DataStore { .filter(dsl::silo_user_id.eq(authz_silo_user_id)) .filter(dsl::time_deleted.is_null()) .set(dsl::time_deleted.eq(Utc::now())) - .execute_async(&mut conn) + .execute_async(&conn) .await?; } diff --git a/nexus/db-queries/src/db/pagination.rs b/nexus/db-queries/src/db/pagination.rs index 01911eb802..1929632980 100644 --- a/nexus/db-queries/src/db/pagination.rs +++ b/nexus/db-queries/src/db/pagination.rs @@ -679,6 +679,7 @@ mod test { pagparams: &DataPageParams<'_, (i64, i64)>, ) -> Vec { let conn = pool.claim().await.unwrap(); + #[allow(clippy::disallowed_methods)] conn.transaction_async(|conn| async move { // I couldn't figure out how to make this work without requiring a full // table scan, and I just want the test to work so that I can get on diff --git a/nexus/src/app/sagas/region_snapshot_replacement_start.rs b/nexus/src/app/sagas/region_snapshot_replacement_start.rs index 55927f7de8..4855f64ac2 100644 --- a/nexus/src/app/sagas/region_snapshot_replacement_start.rs +++ b/nexus/src/app/sagas/region_snapshot_replacement_start.rs @@ -1041,6 +1041,7 @@ pub(crate) mod test { let conn = datastore.pool_connection_for_tests().await.unwrap(); + #[allow(clippy::disallowed_methods)] conn.transaction_async(|conn| async move { // Selecting all regions requires a full table scan conn.batch_execute_async(ALLOW_FULL_TABLE_SCAN_SQL).await.unwrap(); From 836deb37feff896d92d7cbd1ab5f3a47818084ae Mon Sep 17 00:00:00 2001 From: David Crespo Date: Fri, 6 Dec 2024 16:06:28 -0600 Subject: [PATCH 12/22] Bump web console (minor) (#7215) https://github.com/oxidecomputer/console/compare/fd47bee7...927c8b63 * [927c8b63](https://github.com/oxidecomputer/console/commit/927c8b63) oxidecomputer/console#2608 * [01a0ac99](https://github.com/oxidecomputer/console/commit/01a0ac99) oxidecomputer/console#2597 * [243c55d7](https://github.com/oxidecomputer/console/commit/243c55d7) oxidecomputer/console#2606 * [deb5e187](https://github.com/oxidecomputer/console/commit/deb5e187) oxidecomputer/console#2603 * [67633aac](https://github.com/oxidecomputer/console/commit/67633aac) oxidecomputer/console#2601 * [6405e0cb](https://github.com/oxidecomputer/console/commit/6405e0cb) oxidecomputer/console#2605 * [8c37bad2](https://github.com/oxidecomputer/console/commit/8c37bad2) oxidecomputer/console#2602 * [081599c3](https://github.com/oxidecomputer/console/commit/081599c3) oxidecomputer/console#2599 --- tools/console_version | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/console_version b/tools/console_version index cf50396baf..08078c264e 
100644 --- a/tools/console_version +++ b/tools/console_version @@ -1,2 +1,2 @@ -COMMIT="fd47bee7c1f2baf189661c4e44318f8a9caa3d4c" -SHA2="74fdcd4c4f102c35cee25e893873596cf26fd7bdaf7a6962371e56e786a981f2" +COMMIT="927c8b63a6f97c230cd8766a80fa1cfef6429eb4" +SHA2="96550b6e485aaee1c6ced00a4a1aeec86267c99fc79a4b2b253141cf0222d346" From 2568a2eca76dace2ae1928dd48d5bf3df32c5933 Mon Sep 17 00:00:00 2001 From: John Gallagher Date: Mon, 9 Dec 2024 12:36:06 -0500 Subject: [PATCH 13/22] [reconfigurator] Fix DatasetsEditor internal zpool,kind -> ID map (#7216) Blueprint zones don't contain explicit dataset IDs (yet; see #7214), so `DatasetsEditor` maintained an internal cache mapping `(zpool, kind) -> dataset ID`. However, that mapping is only unique for _in service_ datasets, and `DatasetsEditor` was erroneously trying to build it for _all_ datasets (both in-service and expunged). This PR adds a few property tests and fixes for the maintenance of this cache, and should ensure that we only try to maintain the "at most one dataset of a given kind on a given zpool" map for in-service datasets. --- common/src/api/internal/shared.rs | 1 + nexus/reconfigurator/planning/Cargo.toml | 1 + .../blueprint_editor/sled_editor/datasets.txt | 7 + .../blueprint_editor/sled_editor/datasets.rs | 582 +++++++++++++++--- 4 files changed, 489 insertions(+), 102 deletions(-) create mode 100644 nexus/reconfigurator/planning/proptest-regressions/blueprint_editor/sled_editor/datasets.txt diff --git a/common/src/api/internal/shared.rs b/common/src/api/internal/shared.rs index e0d6452376..94440df2d5 100644 --- a/common/src/api/internal/shared.rs +++ b/common/src/api/internal/shared.rs @@ -872,6 +872,7 @@ pub struct ExternalIpGatewayMap { /// Describes the purpose of the dataset. #[derive(Debug, Clone, PartialEq, Eq, Ord, PartialOrd, Hash, EnumCount)] +#[cfg_attr(feature = "testing", derive(test_strategy::Arbitrary))] pub enum DatasetKind { // Durable datasets for zones Cockroach, diff --git a/nexus/reconfigurator/planning/Cargo.toml b/nexus/reconfigurator/planning/Cargo.toml index 19e429dcd9..43a65ad085 100644 --- a/nexus/reconfigurator/planning/Cargo.toml +++ b/nexus/reconfigurator/planning/Cargo.toml @@ -39,6 +39,7 @@ omicron-workspace-hack.workspace = true [dev-dependencies] expectorate.workspace = true maplit.workspace = true +omicron-common = { workspace = true, features = ["testing"] } omicron-test-utils.workspace = true proptest.workspace = true test-strategy.workspace = true diff --git a/nexus/reconfigurator/planning/proptest-regressions/blueprint_editor/sled_editor/datasets.txt b/nexus/reconfigurator/planning/proptest-regressions/blueprint_editor/sled_editor/datasets.txt new file mode 100644 index 0000000000..bee50f1683 --- /dev/null +++ b/nexus/reconfigurator/planning/proptest-regressions/blueprint_editor/sled_editor/datasets.txt @@ -0,0 +1,7 @@ +# Seeds for failure cases proptest has generated in the past. It is +# automatically read and these particular cases re-run before any +# novel cases are generated. +# +# It is recommended to check this file in to source control so that +# everyone who runs the test benefits from these saved cases. 
+cc a3c842ed34d27e4c78fb52fd718cfcc038942eca49672c53e126a1062f5db3ac # shrinks to input = _ProptestNamefixmeArgs { values: [[Cockroach]] } diff --git a/nexus/reconfigurator/planning/src/blueprint_editor/sled_editor/datasets.rs b/nexus/reconfigurator/planning/src/blueprint_editor/sled_editor/datasets.rs index 3830f02233..de397b9caa 100644 --- a/nexus/reconfigurator/planning/src/blueprint_editor/sled_editor/datasets.rs +++ b/nexus/reconfigurator/planning/src/blueprint_editor/sled_editor/datasets.rs @@ -7,6 +7,7 @@ use crate::planner::PlannerRng; use illumos_utils::zpool::ZpoolName; use nexus_types::deployment::BlueprintDatasetConfig; use nexus_types::deployment::BlueprintDatasetDisposition; +use nexus_types::deployment::BlueprintDatasetFilter; use nexus_types::deployment::BlueprintDatasetsConfig; use nexus_types::deployment::SledResources; use nexus_types::deployment::ZpoolFilter; @@ -20,6 +21,7 @@ use omicron_uuid_kinds::DatasetUuid; use omicron_uuid_kinds::ZpoolUuid; use std::collections::btree_map::Entry; use std::collections::BTreeMap; +use std::collections::BTreeSet; use std::net::SocketAddrV6; #[derive(Debug, thiserror::Error)] @@ -189,7 +191,14 @@ impl PartialDatasetConfig { pub(super) struct DatasetsEditor { preexisting_dataset_ids: DatasetIdsBackfillFromDb, config: BlueprintDatasetsConfig, - by_zpool_and_kind: BTreeMap>, + // Cache of _in service only_ datasets, identified by (zpool, kind). + in_service_by_zpool_and_kind: + BTreeMap>, + // Cache of _expunged_ dataset IDs. This serves as a list of IDs from + // `preexisting_dataset_ids` to ignore, as we shouldn't reuse old IDs if + // they belong to expunged datasets. We should be able to remove this when + // we remove `preexisting_dataset_ids`. + expunged_datasets: BTreeSet, counts: EditCounts, } @@ -198,28 +207,39 @@ impl DatasetsEditor { config: BlueprintDatasetsConfig, preexisting_dataset_ids: DatasetIdsBackfillFromDb, ) -> Result { - let mut by_zpool_and_kind = BTreeMap::new(); + let mut in_service_by_zpool_and_kind = BTreeMap::new(); + let mut expunged_datasets = BTreeSet::new(); for dataset in config.datasets.values() { - let by_kind: &mut BTreeMap<_, _> = - by_zpool_and_kind.entry(dataset.pool.id()).or_default(); - match by_kind.entry(dataset.kind.clone()) { - Entry::Vacant(slot) => { - slot.insert(dataset.id); + match dataset.disposition { + BlueprintDatasetDisposition::InService => { + let by_kind: &mut BTreeMap<_, _> = + in_service_by_zpool_and_kind + .entry(dataset.pool.id()) + .or_default(); + match by_kind.entry(dataset.kind.clone()) { + Entry::Vacant(slot) => { + slot.insert(dataset.id); + } + Entry::Occupied(prev) => { + return Err(MultipleDatasetsOfKind { + zpool_id: dataset.pool.id(), + kind: dataset.kind.clone(), + id1: *prev.get(), + id2: dataset.id, + }); + } + } } - Entry::Occupied(prev) => { - return Err(MultipleDatasetsOfKind { - zpool_id: dataset.pool.id(), - kind: dataset.kind.clone(), - id1: *prev.get(), - id2: dataset.id, - }); + BlueprintDatasetDisposition::Expunged => { + expunged_datasets.insert(dataset.id); } } } Ok(Self { preexisting_dataset_ids, config, - by_zpool_and_kind, + in_service_by_zpool_and_kind, + expunged_datasets, counts: EditCounts::zeroes(), }) } @@ -231,7 +251,8 @@ impl DatasetsEditor { generation: Generation::new(), datasets: BTreeMap::new(), }, - by_zpool_and_kind: BTreeMap::new(), + in_service_by_zpool_and_kind: BTreeMap::new(), + expunged_datasets: BTreeSet::new(), counts: EditCounts::zeroes(), } } @@ -248,101 +269,69 @@ impl DatasetsEditor { self.counts } - // If there is a 
dataset of the given `kind` on the given `zpool`, return - // its ID. - // - // This prefers IDs we already have; if we don't have one, it falls back to - // backfilling based on IDs recorded in the database from before blueprints - // tracked datasets (see `DatasetIdsBackfillFromDb` above). - fn get_id( + #[allow(dead_code)] // currently only used by tests; this will change soon + pub fn datasets( &self, - zpool: &ZpoolUuid, - kind: &DatasetKind, - ) -> Option { - if let Some(blueprint_id) = self - .by_zpool_and_kind - .get(zpool) - .and_then(|by_kind| by_kind.get(kind).copied()) - { - return Some(blueprint_id); - }; - if let Some(preexisting_database_id) = - self.preexisting_dataset_ids.get(zpool, kind) - { - return Some(preexisting_database_id); - }; - None + filter: BlueprintDatasetFilter, + ) -> impl Iterator { + self.config + .datasets + .values() + .filter(move |dataset| dataset.disposition.matches(filter)) } - fn expunge_impl( - dataset: &mut BlueprintDatasetConfig, - counts: &mut EditCounts, - ) { + // Private method; panics if given an ID that isn't present in + // `self.config.datasets`. Callers must ensure the ID is valid. + fn expunge_by_known_valid_id(&mut self, id: DatasetUuid) { + let dataset = self + .config + .datasets + .get_mut(&id) + .expect("expunge_impl called with invalid ID"); match dataset.disposition { BlueprintDatasetDisposition::InService => { dataset.disposition = BlueprintDatasetDisposition::Expunged; - counts.expunged += 1; + self.counts.expunged += 1; } BlueprintDatasetDisposition::Expunged => { // already expunged; nothing to do } } + self.expunged_datasets.insert(dataset.id); } /// Expunge a dataset identified by its zpool + kind combo. /// - /// TODO-cleanup This seems fishy. We require that there is at most one - /// dataset of a given `DatasetKind` on a given zpool at a time, but over - /// time we might have had multiple. For example: - /// - /// * Blueprint A: Nexus 1 is on zpool 12 - /// * Blueprint B: Nexus 1 is expunged - /// * Blueprint C: Nexus 2 is added and is placed on zpool 12 - /// - /// When we go to plan Blueprint D, if Nexus 1 is still being carried - /// forward, it will already be expunged (which is fine). If we then try to - /// expunge it again, which should be idempotent, expunging its - /// datasets would incorrectly expunge Nexus 2's datasets (because we'd look - /// up "the dataset with kind Nexus on zpool 12"). We should probably take - /// an explicit dataset ID here, but that would require - /// `BlueprintZoneConfig` to track its dataset IDs explicitly instead of - /// only tracking their zpools. + /// TODO-cleanup This is a little fishy and should be replaced with + /// an expunge-by-ID method instead, but that requires some rework + /// (). 
pub fn expunge( &mut self, zpool: &ZpoolUuid, kind: &DatasetKind, ) -> Result<(), DatasetsEditError> { let Some(id) = self - .by_zpool_and_kind - .get(zpool) - .and_then(|by_kind| by_kind.get(kind)) + .in_service_by_zpool_and_kind + .get_mut(zpool) + .and_then(|by_kind| by_kind.remove(kind)) else { return Err(DatasetsEditError::ExpungeNonexistentDataset { zpool_id: *zpool, kind: kind.clone(), }); }; - let dataset = self - .config - .datasets - .get_mut(id) - .expect("by_zpool_and_kind and config out of sync"); - Self::expunge_impl(dataset, &mut self.counts); + self.expunge_by_known_valid_id(id); Ok(()) } pub fn expunge_all_on_zpool(&mut self, zpool: &ZpoolUuid) { - let Some(by_kind) = self.by_zpool_and_kind.get(zpool) else { + let Some(by_kind) = self.in_service_by_zpool_and_kind.remove(zpool) + else { return; }; - for id in by_kind.values() { - let dataset = self - .config - .datasets - .get_mut(id) - .expect("by_zpool_and_kind and config out of sync"); - Self::expunge_impl(dataset, &mut self.counts); + for id in by_kind.into_values() { + self.expunge_by_known_valid_id(id); } } @@ -350,49 +339,438 @@ impl DatasetsEditor { &mut self, dataset: PartialDatasetConfig, rng: &mut PlannerRng, - ) { + ) -> &BlueprintDatasetConfig { // Convert the partial config into a full config by finding or // generating its ID. - let dataset = { - let PartialDatasetConfig { - name, - address, - quota, - reservation, - compression, - } = dataset; - let (pool, kind) = name.into_parts(); - let id = self - .get_id(&pool.id(), &kind) - .unwrap_or_else(|| rng.next_dataset()); - BlueprintDatasetConfig { - disposition: BlueprintDatasetDisposition::InService, - id, - pool, - kind, - address, - quota, - reservation, - compression, + let PartialDatasetConfig { + name, + address, + quota, + reservation, + compression, + } = dataset; + let (pool, kind) = name.into_parts(); + + let id = { + // If there is a dataset of the given `kind` on the given + // `zpool`, find its ID. + // + // This prefers IDs we already have; if we don't have one, it + // falls back to backfilling based on IDs recorded in the + // database from before blueprints tracked datasets (see + // `DatasetIdsBackfillFromDb` above). + if let Some(blueprint_id) = self + .in_service_by_zpool_and_kind + .get(&pool.id()) + .and_then(|by_kind| by_kind.get(&kind).copied()) + { + blueprint_id + } else if let Some(preexisting_database_id) = + self.preexisting_dataset_ids.get(&pool.id(), &kind) + { + // Only use old database IDs if this ID hasn't been expunged. + // + // This check won't work if there's a preexisting_database_id + // for an old dataset that has been both expunged _and removed_, + // as we have no way of knowing about completely removed + // datasets. However: + // + // 1. `DatasetIdsBackfillFromDb::build()` filters to only + // in-service datasets, so we should never find a database ID + // for a removed dataset. + // 2. We don't yet ever remove datasets anyway, and hopefully + // `DatasetIdsBackfillFromDb` is entirely removed by then (it + // should be removeable after R12, once we've guaranteed all + // blueprints have datasets). + if !self.expunged_datasets.contains(&preexisting_database_id) { + preexisting_database_id + } else { + rng.next_dataset() + } + } else { + rng.next_dataset() } }; + let dataset = BlueprintDatasetConfig { + disposition: BlueprintDatasetDisposition::InService, + id, + pool, + kind, + address, + quota, + reservation, + compression, + }; + // Add or update our config with this new dataset info. 
match self.config.datasets.entry(dataset.id) { Entry::Vacant(slot) => { - self.by_zpool_and_kind + self.in_service_by_zpool_and_kind .entry(dataset.pool.id()) .or_default() .insert(dataset.kind.clone(), dataset.id); - slot.insert(dataset); self.counts.added += 1; + &*slot.insert(dataset) } Entry::Occupied(mut prev) => { if *prev.get() != dataset { - prev.insert(dataset); self.counts.updated += 1; + prev.insert(dataset); } + &*prev.into_mut() + } + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + use nexus_types::deployment::BlueprintDatasetFilter; + use omicron_uuid_kinds::GenericUuid; + use proptest::prelude::*; + use std::collections::BTreeSet; + use test_strategy::proptest; + use test_strategy::Arbitrary; + use uuid::Uuid; + + // Helper functions to "tag" an iterator (i.e., turn it into an iterator of + // tuples) for use with `build_test_config()` below. + fn all_in_service( + value: I, + ) -> impl Iterator + where + I: IntoIterator, + { + value + .into_iter() + .map(|kind| (BlueprintDatasetDisposition::InService, kind)) + } + fn all_expunged( + value: I, + ) -> impl Iterator + where + I: IntoIterator, + { + value + .into_iter() + .map(|kind| (BlueprintDatasetDisposition::Expunged, kind)) + } + + fn build_test_config(values: I) -> BlueprintDatasetsConfig + where + I: Iterator, + J: Iterator, + { + let mut datasets = BTreeMap::new(); + let mut dataset_id_index = 0; + for (zpool_id_index, disposition_kinds) in values.enumerate() { + let zpool_id = ZpoolUuid::from_untyped_uuid(Uuid::from_u128( + zpool_id_index as u128, + )); + for (disposition, kind) in disposition_kinds { + let id = { + let id = DatasetUuid::from_untyped_uuid(Uuid::from_u128( + dataset_id_index, + )); + dataset_id_index += 1; + id + }; + let dataset = BlueprintDatasetConfig { + disposition, + id, + pool: ZpoolName::new_external(zpool_id), + kind, + address: None, + quota: None, + reservation: None, + compression: CompressionAlgorithm::Off, + }; + let prev = datasets.insert(id, dataset); + assert!(prev.is_none(), "no duplicate dataset IDs"); + } + } + let mut generation = Generation::new(); + if dataset_id_index > 0 { + generation = generation.next(); + } + BlueprintDatasetsConfig { generation, datasets } + } + + #[derive(Debug, Arbitrary)] + struct DatasetKindSet { + #[strategy(prop::collection::btree_set(any::(), 0..16))] + kinds: BTreeSet, + } + + #[derive(Debug, Arbitrary)] + struct ZpoolsWithInServiceDatasets { + #[strategy(prop::collection::vec(any::(), 0..10))] + by_zpool: Vec, + } + + impl ZpoolsWithInServiceDatasets { + fn into_config(self) -> BlueprintDatasetsConfig { + build_test_config( + self.by_zpool + .into_iter() + .map(|kinds| all_in_service(kinds.kinds)), + ) + } + } + + #[derive(Debug, Arbitrary)] + struct DatasetKindVec { + #[strategy(prop::collection::vec(any::(), 0..32))] + kinds: Vec, + } + + #[derive(Debug, Arbitrary)] + struct ZpoolsWithExpungedDatasets { + #[strategy(prop::collection::vec(any::(), 0..10))] + by_zpool: Vec, + } + + impl ZpoolsWithExpungedDatasets { + fn into_config(self) -> BlueprintDatasetsConfig { + build_test_config( + self.by_zpool + .into_iter() + .map(|kinds| all_expunged(kinds.kinds)), + ) + } + } + + // Proptest helper to construct zpools with both in-service datasets (the + // first element of the tuple: a set of kinds) and expunged datasets (the + // second element of the tuple: a vec of kinds). 
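// [Illustrative sketch, not part of this patch.] The proptest helpers above
// and below use the `test_strategy` crate's `Arbitrary` derive, where a
// `#[strategy(...)]` attribute controls how each field is generated and
// `#[proptest]` turns a plain function into a property test. A minimal
// example of the same pattern, with invented names:

use proptest::prelude::*;
use test_strategy::{proptest, Arbitrary};

#[derive(Debug, Arbitrary)]
struct SmallVec {
    // Generate between 0 and 3 bytes for this field.
    #[strategy(prop::collection::vec(any::<u8>(), 0..4))]
    bytes: Vec<u8>,
}

#[proptest]
fn length_is_bounded(input: SmallVec) {
    assert!(input.bytes.len() < 4);
}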
+ #[derive(Debug, Arbitrary)] + struct ZpoolsWithMixedDatasets { + #[strategy(prop::collection::vec(any::<(DatasetKindSet, DatasetKindVec)>(), 0..10))] + by_zpool: Vec<(DatasetKindSet, DatasetKindVec)>, + } + + impl ZpoolsWithMixedDatasets { + fn into_config(self) -> BlueprintDatasetsConfig { + build_test_config(self.by_zpool.into_iter().map( + |(in_service, expunged)| { + all_in_service(in_service.kinds) + .chain(all_expunged(expunged.kinds)) + }, + )) + } + } + + #[proptest] + fn proptest_create_editor_with_in_service_datasets( + by_zpool: ZpoolsWithInServiceDatasets, + ) { + _ = DatasetsEditor::new( + by_zpool.into_config(), + DatasetIdsBackfillFromDb::empty(), + ) + .expect("built editor"); + } + + #[proptest] + fn proptest_create_editor_with_expunged_datasets( + by_zpool: ZpoolsWithExpungedDatasets, + ) { + _ = DatasetsEditor::new( + by_zpool.into_config(), + DatasetIdsBackfillFromDb::empty(), + ) + .expect("built editor"); + } + + #[proptest] + fn proptest_add_same_kind_after_expunging( + initial: ZpoolsWithMixedDatasets, + rng_seed: u32, + ) { + let config = initial.into_config(); + let mut editor = DatasetsEditor::new( + config.clone(), + DatasetIdsBackfillFromDb::empty(), + ) + .expect("built editor"); + + let mut rng = PlannerRng::from_seed(( + rng_seed, + "proptest_add_same_kind_after_expunging", + )); + + // For each originally-in-service dataset: + // + // 1. Expunge that dataset + // 2. Add a new dataset of the same kind + // 3. Ensure the new dataset ID is freshly-generated + for dataset in config.datasets.values().filter(|dataset| { + dataset.disposition.matches(BlueprintDatasetFilter::InService) + }) { + editor + .expunge(&dataset.pool.id(), &dataset.kind) + .expect("expunged dataset"); + + let new_dataset = PartialDatasetConfig { + name: DatasetName::new( + dataset.pool.clone(), + dataset.kind.clone(), + ), + address: dataset.address, + quota: dataset.quota, + reservation: dataset.reservation, + compression: dataset.compression, + }; + let new_dataset = editor.ensure_in_service(new_dataset, &mut rng); + assert_ne!(dataset.id, new_dataset.id); + } + + // Repeat the test above, but this time assume all the dataset IDs were + // also present in the backfill database map. We should not reuse IDs + // after expunging zones. 
+ let database_backfill = { + let mut by_zpool: BTreeMap<_, BTreeMap<_, _>> = BTreeMap::new(); + for dataset in config.datasets.values().filter(|dataset| { + dataset.disposition.matches(BlueprintDatasetFilter::InService) + }) { + let prev = by_zpool + .entry(dataset.pool.id()) + .or_default() + .insert(dataset.kind.clone(), dataset.id); + assert!( + prev.is_none(), + "duplicate (pool,kind) in-service input" + ); + } + DatasetIdsBackfillFromDb(by_zpool) + }; + let mut editor = DatasetsEditor::new(config.clone(), database_backfill) + .expect("built editor"); + for dataset in config.datasets.values().filter(|dataset| { + dataset.disposition.matches(BlueprintDatasetFilter::InService) + }) { + editor + .expunge(&dataset.pool.id(), &dataset.kind) + .expect("expunged dataset"); + + let new_dataset = PartialDatasetConfig { + name: DatasetName::new( + dataset.pool.clone(), + dataset.kind.clone(), + ), + address: dataset.address, + quota: dataset.quota, + reservation: dataset.reservation, + compression: dataset.compression, + }; + let new_dataset = editor.ensure_in_service(new_dataset, &mut rng); + assert_ne!(dataset.id, new_dataset.id); + } + } + + #[proptest] + fn proptest_add_same_kind_after_expunging_by_zpool( + initial: ZpoolsWithMixedDatasets, + rng_seed: u32, + ) { + let config = initial.into_config(); + let all_zpools = config + .datasets + .values() + .map(|dataset| dataset.pool.id()) + .collect::>(); + let mut editor = DatasetsEditor::new( + config.clone(), + DatasetIdsBackfillFromDb::empty(), + ) + .expect("built editor"); + + let mut rng = PlannerRng::from_seed(( + rng_seed, + "proptest_add_same_kind_after_expunging", + )); + + // Expunge all datasets on all zpools, by zpool. + for zpool_id in &all_zpools { + editor.expunge_all_on_zpool(zpool_id); + // There should no longer be any in-service datasets on this zpool. + assert!( + !editor + .datasets(BlueprintDatasetFilter::InService) + .any(|dataset| dataset.pool.id() == *zpool_id), + "in-service dataset remains after expunging zpool" + ); + } + + // For each originally-in-service dataset: + // + // 1. Add a new dataset of the same kind + // 2. Ensure the new dataset ID is freshly-generated + for dataset in config.datasets.values().filter(|dataset| { + dataset.disposition.matches(BlueprintDatasetFilter::InService) + }) { + let new_dataset = PartialDatasetConfig { + name: DatasetName::new( + dataset.pool.clone(), + dataset.kind.clone(), + ), + address: dataset.address, + quota: dataset.quota, + reservation: dataset.reservation, + compression: dataset.compression, + }; + let new_dataset = editor.ensure_in_service(new_dataset, &mut rng); + assert_ne!(dataset.id, new_dataset.id); + } + + // Repeat the test above, but this time assume all the dataset IDs were + // also present in the backfill database map. We should not reuse IDs + // after expunging zones. 
+ let database_backfill = { + let mut by_zpool: BTreeMap<_, BTreeMap<_, _>> = BTreeMap::new(); + for dataset in config.datasets.values().filter(|dataset| { + dataset.disposition.matches(BlueprintDatasetFilter::InService) + }) { + let prev = by_zpool + .entry(dataset.pool.id()) + .or_default() + .insert(dataset.kind.clone(), dataset.id); + assert!( + prev.is_none(), + "duplicate (pool,kind) in-service input" + ); } + DatasetIdsBackfillFromDb(by_zpool) + }; + let mut editor = DatasetsEditor::new(config.clone(), database_backfill) + .expect("built editor"); + for zpool_id in &all_zpools { + editor.expunge_all_on_zpool(zpool_id); + // There should no longer be any in-service datasets on this zpool. + assert!( + !editor + .datasets(BlueprintDatasetFilter::InService) + .any(|dataset| dataset.pool.id() == *zpool_id), + "in-service dataset remains after expunging zpool" + ); + } + for dataset in config.datasets.values().filter(|dataset| { + dataset.disposition.matches(BlueprintDatasetFilter::InService) + }) { + let new_dataset = PartialDatasetConfig { + name: DatasetName::new( + dataset.pool.clone(), + dataset.kind.clone(), + ), + address: dataset.address, + quota: dataset.quota, + reservation: dataset.reservation, + compression: dataset.compression, + }; + let new_dataset = editor.ensure_in_service(new_dataset, &mut rng); + assert_ne!(dataset.id, new_dataset.id); } } } From d01b2ee56a24e3ec36a75c91d7e4959c9bc87c16 Mon Sep 17 00:00:00 2001 From: Benjamin Naecker Date: Mon, 9 Dec 2024 12:18:29 -0800 Subject: [PATCH 14/22] Cleanup oximeter collector internals (#7205) - Move result sink tasks to their own module - Move collection task innards to their own module, exposing a simpler interface to the task via a new "handle" type. This makes the collection message type private, and instead uses public methods on the handle for dealing with messaging internally. - Move the producer endpoint info from the task map into the task handle itself, which simplifies a lot of the map manipulation in the agent itself. - Break up the massive `tokio::select!` in the collection task. This is now in a main `run()` method on the new collection task type. That lets us factor out each branch of the select (waiting for a timer to fire, or a message from the oximeter agent, e.g.) into their own methods. This is entirely NFC, it just makes everything much easier to understand and debug. - Closes #7202 --- oximeter/collector/src/agent.rs | 879 +-------------------- oximeter/collector/src/collection_task.rs | 898 ++++++++++++++++++++++ oximeter/collector/src/lib.rs | 6 +- oximeter/collector/src/results_sink.rs | 148 ++++ 4 files changed, 1089 insertions(+), 842 deletions(-) create mode 100644 oximeter/collector/src/collection_task.rs create mode 100644 oximeter/collector/src/results_sink.rs diff --git a/oximeter/collector/src/agent.rs b/oximeter/collector/src/agent.rs index 4c4f0f4177..ae9aef1bd9 100644 --- a/oximeter/collector/src/agent.rs +++ b/oximeter/collector/src/agent.rs @@ -4,8 +4,12 @@ //! The oximeter agent handles collection tasks for each producer. 
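// [Illustrative sketch, not part of this patch.] The commit message above
// describes wrapping the collection task behind a "handle" type: the message
// enum stays private, and callers use public methods on the handle instead.
// A self-contained sketch of that general pattern with stand-in names,
// using tokio channels:

use tokio::sync::{mpsc, oneshot};

// The message type stays private to the module that owns the task.
#[derive(Debug)]
enum Message {
    Ping(oneshot::Sender<&'static str>),
    Shutdown,
}

#[derive(Debug, Clone)]
pub struct Handle {
    tx: mpsc::Sender<Message>,
}

impl Handle {
    pub fn new() -> Self {
        let (tx, mut rx) = mpsc::channel(4);
        tokio::spawn(async move {
            while let Some(msg) = rx.recv().await {
                match msg {
                    Message::Ping(reply) => {
                        let _ = reply.send("pong");
                    }
                    Message::Shutdown => break,
                }
            }
        });
        Self { tx }
    }

    // Callers never see `Message`; they use methods like this instead.
    pub async fn ping(&self) -> Option<&'static str> {
        let (reply_tx, reply_rx) = oneshot::channel();
        self.tx.send(Message::Ping(reply_tx)).await.ok()?;
        reply_rx.await.ok()
    }

    pub async fn shutdown(&self) {
        let _ = self.tx.send(Message::Shutdown).await;
    }
}

#[tokio::main]
async fn main() {
    let handle = Handle::new();
    assert_eq!(handle.ping().await, Some("pong"));
    handle.shutdown().await;
}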
-// Copyright 2023 Oxide Computer Company +// Copyright 2024 Oxide Computer Company +use crate::collection_task::CollectionTaskHandle; +use crate::collection_task::CollectionTaskOutput; +use crate::collection_task::ForcedCollectionError; +use crate::results_sink; use crate::self_stats; use crate::DbConfig; use crate::Error; @@ -18,11 +22,7 @@ use nexus_client::types::IdSortMode; use nexus_client::Client as NexusClient; use omicron_common::backoff; use omicron_common::backoff::BackoffError; -use oximeter::types::ProducerResults; -use oximeter::types::ProducerResultsItem; -use oximeter_api::FailedCollection; use oximeter_api::ProducerDetails; -use oximeter_api::SuccessfulCollection; use oximeter_db::Client; use oximeter_db::DbWrite; use qorb::claim::Handle; @@ -43,742 +43,11 @@ use std::ops::Bound; use std::sync::Arc; use std::sync::Mutex as StdMutex; use std::time::Duration; -use std::time::Instant; use tokio::sync::mpsc; -use tokio::sync::mpsc::error::TrySendError; -use tokio::sync::oneshot; -use tokio::sync::watch; use tokio::sync::Mutex; use tokio::sync::MutexGuard; -use tokio::task::JoinHandle; -use tokio::time::interval; use uuid::Uuid; -/// A token used to force a collection. -/// -/// If the collection is successfully completed, `Ok(())` will be sent back on the -/// contained oneshot channel. Note that that "successful" means the actual -/// request completed, _not_ that results were successfully collected. I.e., it -/// means "this attempt is done". -/// -/// If the collection could not be queued because there are too many outstanding -/// force collection attempts, an `Err(ForcedCollectionQueueFull)` is returned. -type CollectionToken = oneshot::Sender>; - -/// Error returned when a forced collection fails. -#[derive(Clone, Copy, Debug)] -pub enum ForcedCollectionError { - /// The internal queue of requests is full. - QueueFull, - /// We failed to send the request because the channel was closed. - Closed, -} - -/// Timeout on any single collection from a producer. -const COLLECTION_TIMEOUT: Duration = Duration::from_secs(30); - -/// The number of forced collections queued before we start to deny them. -const N_QUEUED_FORCED_COLLECTIONS: usize = 1; - -/// The number of timer-based collections queued before we start to deny them. -const N_QUEUED_TIMER_COLLECTIONS: usize = 1; - -// Messages for controlling a collection task -#[derive(Debug)] -enum CollectionMessage { - // Explicit request that the task collect data from its producer - // - // Also sends a oneshot that is signalled once the task scrapes - // data from the Producer, and places it in the Clickhouse server. - Collect(CollectionToken), - // Request that the task update its interval and the socket address on which it collects data - // from its producer. - Update(ProducerEndpoint), - // Request that the task exit - Shutdown, - // Return the current statistics from a single task. - #[cfg(test)] - Statistics { - reply_tx: oneshot::Sender, - }, - // Request details from the collection task about its producer. - Details { - reply_tx: oneshot::Sender, - }, -} - -/// Return type for `perform_collection`. -struct SingleCollectionResult { - /// The result of the collection. - result: Result, - /// The duration the collection took. - duration: Duration, -} - -/// Run a single collection from the producer. 
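// [Illustrative sketch, not part of this patch.] `perform_collection`
// (removed here and re-added in the new collection_task module below) boils
// down to: GET the producer's metrics endpoint with a timeout, then classify
// the outcome as unreachable, a bad status, or a deserialization failure.
// A simplified, self-contained sketch of that shape; the URL and types here
// are placeholders, not the real ProducerEndpoint plumbing:

use std::time::Duration;

#[derive(Debug)]
enum FailureReason {
    Unreachable,
    Deserialization,
    Other(reqwest::StatusCode),
}

async fn scrape(
    client: &reqwest::Client,
    url: &str,
) -> Result<serde_json::Value, FailureReason> {
    let res =
        client.get(url).send().await.map_err(|_| FailureReason::Unreachable)?;
    if !res.status().is_success() {
        return Err(FailureReason::Other(res.status()));
    }
    res.json().await.map_err(|_| FailureReason::Deserialization)
}

#[tokio::main]
async fn main() {
    let client = reqwest::Client::builder()
        .timeout(Duration::from_secs(30))
        .build()
        .expect("built client");
    // Placeholder address; a real producer address and UUID would come from
    // the registered ProducerEndpoint.
    match scrape(&client, "http://127.0.0.1:12345/some-id").await {
        Ok(v) => println!("collected: {v}"),
        Err(e) => eprintln!("collection failed: {e:?}"),
    }
}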
-async fn perform_collection( - log: Logger, - client: reqwest::Client, - producer: ProducerEndpoint, -) -> SingleCollectionResult { - let start = Instant::now(); - debug!(log, "collecting from producer"); - let res = client - .get(format!("http://{}/{}", producer.address, producer.id)) - .send() - .await; - trace!(log, "sent collection request to producer"); - let result = match res { - Ok(res) => { - if res.status().is_success() { - match res.json::().await { - Ok(results) => { - debug!( - log, - "collected results from producer"; - "n_results" => results.len() - ); - Ok(results) - } - Err(e) => { - warn!( - log, - "failed to collect results from producer"; - InlineErrorChain::new(&e), - ); - Err(self_stats::FailureReason::Deserialization) - } - } - } else { - warn!( - log, - "failed to receive metric results from producer"; - "status_code" => res.status().as_u16(), - ); - Err(self_stats::FailureReason::Other(res.status())) - } - } - Err(e) => { - error!( - log, - "failed to send collection request to producer"; - InlineErrorChain::new(&e), - ); - Err(self_stats::FailureReason::Unreachable) - } - }; - SingleCollectionResult { result, duration: start.elapsed() } -} - -// The type of one collection task run to completion. -// -// An `Err(_)` means we failed to collect, and contains the reason so that we -// can bump the self-stat counter accordingly. -type CollectionResult = Result; - -/// Information about when we start a collection. -struct CollectionStartTimes { - /// UTC timestamp at which the request was started. - started_at: DateTime, - /// Instant right before we queued the response for processing. - queued_at: Instant, -} - -impl CollectionStartTimes { - fn new() -> Self { - Self { started_at: Utc::now(), queued_at: Instant::now() } - } -} - -/// Details about a forced collection. -struct ForcedCollectionRequest { - /// The collection token we signal when the collection is completed. - token: CollectionToken, - /// Start time for this collection. - start: CollectionStartTimes, -} - -impl ForcedCollectionRequest { - fn new(token: CollectionToken) -> Self { - Self { token, start: CollectionStartTimes::new() } - } -} - -/// Details about a completed collection. -struct CollectionResponse { - /// Token for a forced collection request. - token: Option, - /// The actual result of the collection. - result: CollectionResult, - /// Time when the collection started. - started_at: DateTime, - /// Time the request spent queued. - time_queued: Duration, - /// Time we spent processing the request. - time_collecting: Duration, -} - -/// Task that actually performs collections from the producer. -async fn inner_collection_loop( - log: Logger, - mut producer_info_rx: watch::Receiver, - mut forced_collection_rx: mpsc::Receiver, - mut timer_collection_rx: mpsc::Receiver, - result_tx: mpsc::Sender, -) { - let client = reqwest::Client::builder() - .timeout(COLLECTION_TIMEOUT) - .build() - // Safety: `build()` only fails if TLS couldn't be initialized or the - // system DNS configuration could not be loaded. - .unwrap(); - loop { - // Wait for notification that we have a collection to perform, from - // either the forced- or timer-collection queue. - trace!(log, "top of inner collection loop, waiting for next request"); - let (maybe_token, start_time) = tokio::select! 
{ - maybe_request = forced_collection_rx.recv() => { - let Some(ForcedCollectionRequest { token, start }) = maybe_request else { - debug!( - log, - "forced collection request queue closed, exiting" - ); - return; - }; - (Some(token), start) - } - maybe_request = timer_collection_rx.recv() => { - let Some(start) = maybe_request else { - debug!( - log, - "timer collection request queue closed, exiting" - ); - return; - }; - (None, start) - } - }; - let time_queued = start_time.queued_at.elapsed(); - - // Make a future to represent the actual collection. - let mut collection_fut = Box::pin(perform_collection( - log.clone(), - client.clone(), - *producer_info_rx.borrow_and_update(), - )); - - // Wait for that collection to complete or fail, or for an update to the - // producer's information. In the latter case, recreate the future for - // the collection itself with the new producer information. - let SingleCollectionResult { result, duration } = 'collection: loop { - tokio::select! { - biased; - - maybe_update = producer_info_rx.changed() => { - match maybe_update { - Ok(_) => { - let update = *producer_info_rx.borrow_and_update(); - debug!( - log, - "received producer info update with an outstanding \ - collection running, cancelling it and recreating \ - with the new info"; - "new_info" => ?&update, - ); - collection_fut = Box::pin(perform_collection( - log.new(o!("address" => update.address)), - client.clone(), - update, - )); - continue 'collection; - } - Err(e) => { - error!( - log, - "failed to receive on producer update \ - watch channel, exiting"; - InlineErrorChain::new(&e), - ); - return; - } - } - } - - collection_result = &mut collection_fut => { - // NOTE: This break here is intentional. We cannot just call - // `result_tx.send()` in this loop, because that moves out - // of `maybe_token`, which isn't Copy. Break the loop, and - // then send it after we know we've completed the - // collection. - break 'collection collection_result; - } - } - }; - - // Now that the collection has completed, send on the results, along - // with the timing information and any collection token we may have - // gotten with the request. - let response = CollectionResponse { - token: maybe_token, - result, - started_at: start_time.started_at, - time_queued, - time_collecting: duration, - }; - match result_tx.send(response).await { - Ok(_) => trace!(log, "forwarded results to main collection loop"), - Err(_) => { - error!( - log, - "failed to forward results to \ - collection loop, channel is closed, exiting", - ); - return; - } - } - } -} - -// Background task used to collect metrics from one producer on an interval. -// -// This function is started by the `OximeterAgent`, when a producer is registered. The task loops -// endlessly, and collects metrics from the assigned producer on a timeout. The assigned agent can -// also send a `CollectionMessage`, for example to update the collection interval. This is not -// currently used, but will likely be exposed via control plane interfaces in the future. -async fn collection_loop( - log: Logger, - collector: self_stats::OximeterCollector, - producer: ProducerEndpoint, - mut inbox: mpsc::Receiver, - outbox: mpsc::Sender<(Option, ProducerResults)>, -) { - let mut collection_timer = interval(producer.interval); - debug!( - log, - "starting oximeter collection task"; - "interval" => ?producer.interval, - ); - - // Set up the collection of self statistics about this collection task. 
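// [Illustrative sketch, not part of this patch.] The inner loop above races
// an in-flight collection future against updates on a `watch` channel, and
// restarts the future with the new producer info when an update arrives. A
// self-contained sketch of that biased-select/restart pattern, with stand-in
// work in place of the real collection future:

use std::time::Duration;
use tokio::sync::watch;

async fn work(input: u32) -> u32 {
    tokio::time::sleep(Duration::from_millis(50)).await;
    input * 2
}

#[tokio::main]
async fn main() {
    let (tx, mut rx) = watch::channel(1u32);

    // Simulate an update arriving while the first attempt is in flight.
    tokio::spawn(async move {
        tokio::time::sleep(Duration::from_millis(10)).await;
        let _ = tx.send(21);
        // Keep the sender alive so `changed()` stays pending in this sketch.
        std::future::pending::<()>().await;
    });

    let mut fut = Box::pin(work(*rx.borrow_and_update()));
    let result = loop {
        tokio::select! {
            // `biased` checks branches in order, so a pending update wins
            // over a completed future if both are ready.
            biased;

            changed = rx.changed() => {
                if changed.is_ok() {
                    // Restart the work with the new input.
                    fut = Box::pin(work(*rx.borrow_and_update()));
                } else {
                    break None; // sender dropped
                }
            }
            out = &mut fut => break Some(out),
        }
    };
    assert_eq!(result, Some(42));
}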
- let mut stats = self_stats::CollectionTaskStats::new(collector, &producer); - let mut self_collection_timer = interval(self_stats::COLLECTION_INTERVAL); - self_collection_timer.tick().await; - - // Keep track of more details about each collection, so we can expose this - // as debugging information in `oximeter`'s public API. - let mut details = ProducerDetails::new(&producer); - - // Spawn a task to run the actual collections. - // - // This is so that we can possibly interrupt and restart collections that - // are in-progress when we get an update to the producer's information. In - // that case, the collection is likely doomed, since the producer has moved - // and won't be available at the address the collection started with. This - // lets us restart that collection with the new information. - let (producer_info_tx, producer_info_rx) = watch::channel(producer); - let (forced_collection_tx, forced_collection_rx) = - mpsc::channel(N_QUEUED_FORCED_COLLECTIONS); - let (timer_collection_tx, timer_collection_rx) = - mpsc::channel(N_QUEUED_TIMER_COLLECTIONS); - let (result_tx, mut result_rx) = mpsc::channel(1); - tokio::task::spawn(inner_collection_loop( - log.clone(), - producer_info_rx, - forced_collection_rx, - timer_collection_rx, - result_tx, - )); - - loop { - tokio::select! { - message = inbox.recv() => { - match message { - None => { - debug!( - log, - "collection task inbox closed, shutting down" - ); - return; - } - Some(CollectionMessage::Shutdown) => { - debug!( - log, - "collection task received shutdown request" - ); - return; - }, - Some(CollectionMessage::Collect(token)) => { - debug!( - log, - "collection task received explicit request to collect" - ); - let request = ForcedCollectionRequest::new(token); - match forced_collection_tx.try_send(request) { - Ok(_) => trace!( - log, "forwarded explicit request to collection task" - ), - Err(e) => { - match e { - TrySendError::Closed(ForcedCollectionRequest { token, .. }) => { - debug!( - log, - "collection task forced collection \ - queue is closed. Attempting to \ - notify caller and exiting.", - ); - let _ = token.send(Err(ForcedCollectionError::Closed)); - return; - } - TrySendError::Full(ForcedCollectionRequest { token, start }) => { - error!( - log, - "collection task forced collection \ - queue is full! This should never \ - happen, and probably indicates \ - a bug in your test code, such as \ - calling `force_collection()` many \ - times" - ); - if token - .send(Err(ForcedCollectionError::QueueFull)) - .is_err() - { - warn!( - log, - "failed to notify caller of \ - force_collection(), oneshot is \ - closed" - ); - } - let failure = FailedCollection { - started_at: start.started_at, - time_queued: Duration::ZERO, - time_collecting: Duration::ZERO, - reason: String::from("forced collection queue full"), - }; - details.on_failure(failure); - } - } - } - } - }, - Some(CollectionMessage::Update(new_info)) => { - // If the collection interval is shorter than the - // interval on which we receive these update messages, - // we'll never actually collect anything! Instead, only - // do the update if the information has changed. This - // should also be guarded against by the main agent, but - // we're being cautious here. 
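// [Illustrative sketch, not part of this patch.] The forced-collection path
// above uses a bounded queue of depth 1 together with `try_send`, so a
// second force-collection request arriving while one is already queued is
// rejected (queue full) rather than buffered indefinitely. A minimal,
// self-contained sketch of that behaviour with a stand-in message type:

use tokio::sync::mpsc;
use tokio::sync::mpsc::error::TrySendError;

#[tokio::main]
async fn main() {
    let (tx, mut rx) = mpsc::channel::<&'static str>(1);

    // First request fits in the queue.
    assert!(tx.try_send("collect now").is_ok());

    // Second request is rejected because the queue is full.
    match tx.try_send("collect again") {
        Err(TrySendError::Full(msg)) => println!("queue full, dropping {msg:?}"),
        Err(TrySendError::Closed(_)) => println!("receiver gone"),
        Ok(()) => unreachable!("queue depth is 1"),
    }

    // Once the receiver drains the queue, sending succeeds again.
    assert_eq!(rx.recv().await, Some("collect now"));
    assert!(tx.try_send("collect later").is_ok());
}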
- let updated_producer_info = |info: &mut ProducerEndpoint| { - if new_info == *info { - false - } else { - *info = new_info; - true - } - }; - if !producer_info_tx.send_if_modified(updated_producer_info) { - trace!( - log, - "collection task received update with \ - identical producer information, no \ - updates will be sent to the collection task" - ); - continue; - } - - // We have an actual update to the producer information. - // - // Rebuild our timer to reflect the possibly-new - // interval. The collection task has already been - // notified above. - debug!( - log, - "collection task received request to update \ - its producer information"; - "interval" => ?new_info.interval, - "address" => new_info.address, - ); - details.update(&new_info); - stats.update(&new_info); - collection_timer = interval(new_info.interval); - collection_timer.tick().await; // completes immediately - } - #[cfg(test)] - Some(CollectionMessage::Statistics { reply_tx }) => { - // Time should be paused when using this retrieval - // mechanism. We advance time to cause a panic if this - // message were to be sent with time *not* paused. - tokio::time::advance(Duration::from_nanos(1)).await; - // The collection timer *may* be ready to go in which - // case we would do a collection right after - // processesing this message, thus changing the actual - // data. Instead we reset the timer to prevent - // additional collections (i.e. since time is paused). - collection_timer.reset(); - debug!( - log, - "received request for current task statistics" - ); - reply_tx.send(stats.clone()).expect("failed to send statistics"); - } - Some(CollectionMessage::Details { reply_tx }) => { - match reply_tx.send(details.clone()) { - Ok(_) => trace!( - log, - "sent producer details reply to oximeter agent", - ), - Err(e) => error!( - log, - "failed to send producer details reply to \ - oximeter agent"; - "error" => ?e, - ), - } - } - } - } - maybe_result = result_rx.recv() => { - let Some(response) = maybe_result else { - error!( - log, - "channel for receiving results from collection task \ - is closed, exiting", - ); - return; - }; - let CollectionResponse { - token, - result, - started_at, - time_queued, - time_collecting - } = response; - match result { - Ok(results) => { - stats.collections.datum.increment(); - let n_samples: u64 = results - .iter() - .map(|each| match each { - ProducerResultsItem::Ok(samples) => samples.len() as u64, - _ => 0, - }) - .sum(); - let success = SuccessfulCollection { - started_at, - time_queued, - time_collecting, - n_samples - }; - details.on_success(success); - if outbox.send((token, results)).await.is_err() { - error!( - log, - "failed to send results to outbox, channel is \ - closed, exiting", - ); - return; - } - } - Err(reason) => { - let failure = FailedCollection { - started_at, - time_queued, - time_collecting, - reason: reason.to_string(), - }; - details.on_failure(failure); - stats.failures_for_reason(reason).datum.increment(); - } - } - } - _ = self_collection_timer.tick() => { - debug!( - log, - "reporting oximeter self-collection statistics" - ); - outbox.send((None, stats.sample())).await.unwrap(); - } - _ = collection_timer.tick() => { - match timer_collection_tx.try_send(CollectionStartTimes::new()) { - Ok(_) => { - debug!( - log, - "sent timer-based collection request to \ - the collection task" - ); - } - Err(TrySendError::Closed(_)) => { - error!( - log, - "timer-based collection request queue is \ - closed, exiting" - ); - return; - } - Err(TrySendError::Full(start)) => { - 
let failure = FailedCollection { - started_at: start.started_at, - time_queued: Duration::ZERO, - time_collecting: Duration::ZERO, - reason: String::from("collections in progress"), - }; - details.on_failure(failure); - error!( - log, - "timer-based collection request queue is \ - full! This may indicate that the producer \ - has a sampling interval that is too fast \ - for the amount of data it generates"; - "interval" => ?producer_info_tx.borrow().interval, - ); - stats - .failures_for_reason( - self_stats::FailureReason::CollectionsInProgress - ) - .datum - .increment() - } - } - } - } - } -} - -// Struct representing a task for collecting metric data from a single producer -#[derive(Debug)] -struct CollectionTask { - // Channel used to send messages from the agent to the actual task. The task owns the other - // side. - pub inbox: mpsc::Sender, - // Handle to the actual tokio task running the collection loop. - #[allow(dead_code)] - pub task: JoinHandle<()>, -} - -// A task run by `oximeter` in standalone mode, which simply prints results as -// they're received. -async fn results_printer( - log: Logger, - mut rx: mpsc::Receiver<(Option, ProducerResults)>, -) { - loop { - match rx.recv().await { - Some((_, results)) => { - for res in results.into_iter() { - match res { - ProducerResultsItem::Ok(samples) => { - for sample in samples.into_iter() { - info!( - log, - ""; - "sample" => ?sample, - ); - } - } - ProducerResultsItem::Err(e) => { - error!( - log, - "received error from a producer"; - InlineErrorChain::new(&e), - ); - } - } - } - } - None => { - debug!(log, "result queue closed, exiting"); - return; - } - } - } -} - -// Aggregation point for all results, from all collection tasks. -async fn results_sink( - log: Logger, - client: Client, - batch_size: usize, - batch_interval: Duration, - mut rx: mpsc::Receiver<(Option, ProducerResults)>, -) { - let mut timer = interval(batch_interval); - timer.tick().await; // completes immediately - let mut batch = Vec::with_capacity(batch_size); - loop { - let mut collection_token = None; - let insert = tokio::select! { - _ = timer.tick() => { - if batch.is_empty() { - trace!(log, "batch interval expired, but no samples to insert"); - false - } else { - true - } - } - results = rx.recv() => { - match results { - Some((token, results)) => { - let flattened_results = { - let mut flattened = Vec::with_capacity(results.len()); - for inner_batch in results.into_iter() { - match inner_batch { - ProducerResultsItem::Ok(samples) => flattened.extend(samples.into_iter()), - ProducerResultsItem::Err(e) => { - debug!( - log, - "received error (not samples) from a producer: {}", - e.to_string() - ); - } - } - } - flattened - }; - batch.extend(flattened_results); - - collection_token = token; - if collection_token.is_some() { - true - } else { - batch.len() >= batch_size - } - } - None => { - warn!(log, "result queue closed, exiting"); - return; - } - } - } - }; - - if insert { - debug!(log, "inserting {} samples into database", batch.len()); - match client.insert_samples(&batch).await { - Ok(()) => trace!(log, "successfully inserted samples"), - Err(e) => { - warn!( - log, - "failed to insert some results into metric DB: {}", - e.to_string() - ); - } - } - // TODO-correctness The `insert_samples` call above may fail. The method itself needs - // better handling of partially-inserted results in that case, but we may need to retry - // or otherwise handle an error here as well. 
- // - // See https://github.com/oxidecomputer/omicron/issues/740 for a - // disucssion. - batch.clear(); - } - - if let Some(token) = collection_token { - let _ = token.send(Ok(())); - } - } -} - /// The internal agent the oximeter server uses to collect metrics from producers. #[derive(Clone, Debug)] pub struct OximeterAgent { @@ -788,10 +57,9 @@ pub struct OximeterAgent { // Oximeter target used by this agent to produce metrics about itself. collection_target: self_stats::OximeterCollector, // Handle to the TX-side of a channel for collecting results from the collection tasks - result_sender: mpsc::Sender<(Option, ProducerResults)>, - // The actual tokio tasks running the collection on a timer. - collection_tasks: - Arc>>, + result_sender: mpsc::Sender, + // Handle to each Tokio task collection from a single producer. + collection_tasks: Arc>>, // The interval on which we refresh our list of producers from Nexus refresh_interval: Duration, // Handle to the task used to periodically refresh the list of producers. @@ -860,7 +128,7 @@ impl OximeterAgent { // Spawn the task for aggregating and inserting all metrics tokio::spawn(async move { - results_sink( + crate::results_sink::database_inserter( insertion_log, client, db_config.batch_size, @@ -945,7 +213,7 @@ impl OximeterAgent { // Spawn the task for aggregating and inserting all metrics tokio::spawn(async move { - results_sink( + results_sink::database_inserter( insertion_log, client, db_config.batch_size, @@ -955,7 +223,7 @@ impl OximeterAgent { .await }); } else { - tokio::spawn(results_printer(insertion_log, result_receiver)); + tokio::spawn(results_sink::logger(insertion_log, result_receiver)); } // Set up tracking of statistics about ourselves. @@ -988,29 +256,10 @@ impl OximeterAgent { id: Uuid, ) -> Result { let tasks = self.collection_tasks.lock().await; - let Some((_info, task)) = tasks.get(&id) else { + let Some(task) = tasks.get(&id) else { return Err(Error::NoSuchProducer { id }); }; - let (reply_tx, rx) = oneshot::channel(); - task.inbox.try_send(CollectionMessage::Details { reply_tx }).map_err( - |_| { - Error::CollectionError( - id, - String::from( - "Failed to send detail request to collection task", - ), - ) - }, - )?; - drop(tasks); - rx.await.map_err(|_| { - Error::CollectionError( - id, - String::from( - "Failed to receive detail response from collection task", - ), - ) - }) + task.details().await } /// Register a new producer with this oximeter instance. @@ -1027,10 +276,7 @@ impl OximeterAgent { // the map is held. async fn register_producer_locked( &self, - tasks: &mut MutexGuard< - '_, - BTreeMap, - >, + tasks: &mut MutexGuard<'_, BTreeMap>, info: ProducerEndpoint, ) { let id = info.id; @@ -1042,26 +288,20 @@ impl OximeterAgent { "producer_id" => id.to_string(), "address" => info.address, ); - - // Build channel to control the task and receive results. 
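// [Illustrative sketch, not part of this patch.] The results sink removed
// above (and re-added as `results_sink::database_inserter`) batches samples
// and flushes either when the batch reaches `batch_size`, when the batch
// interval elapses, or immediately when a forced collection is waiting on
// the result. A simplified, synchronous sketch of just the flush decision,
// with stand-in types and the interval timer elided:

struct Batcher {
    batch: Vec<u64>,
    batch_size: usize,
}

impl Batcher {
    // Returns true when the caller should flush the batch now.
    fn push(&mut self, samples: Vec<u64>, forced: bool) -> bool {
        self.batch.extend(samples);
        // Flush right away for forced collections so the caller's oneshot
        // can be signalled; otherwise wait until the batch is large enough
        // (or, in the real task, until the interval timer fires).
        forced || self.batch.len() >= self.batch_size
    }

    fn flush(&mut self) -> Vec<u64> {
        std::mem::take(&mut self.batch)
    }
}

fn main() {
    let mut b = Batcher { batch: Vec::new(), batch_size: 4 };
    assert!(!b.push(vec![1, 2], false)); // not enough samples yet
    assert!(b.push(vec![3], true)); // forced: flush immediately
    assert_eq!(b.flush(), vec![1, 2, 3]);
}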
- let (tx, rx) = mpsc::channel(4); - let q = self.result_sender.clone(); - let log = self.log.new(o!( - "component" => "collection-task", - "producer_id" => id.to_string(), - )); - let info_clone = info; - let target = self.collection_target; - let task = tokio::spawn(async move { - collection_loop(log, target, info_clone, rx, q).await; - }); - value.insert((info, CollectionTask { inbox: tx, task })); + let handle = CollectionTaskHandle::new( + &self.log, + self.collection_target, + info, + self.result_sender.clone(), + ) + .await; + value.insert(handle); } Entry::Occupied(mut value) => { // Only update the endpoint information if it's actually // different, to avoid indefinitely delaying the collection // timer from expiring. - if value.get().0 == info { + if value.get().producer == info { trace!( self.log, "ignoring request to update existing metric \ @@ -1078,14 +318,7 @@ impl OximeterAgent { "interval" => ?info.interval, "address" => info.address, ); - value.get_mut().0 = info; - value - .get() - .1 - .inbox - .send(CollectionMessage::Update(info)) - .await - .unwrap(); + value.get_mut().update(info).await; } } } @@ -1105,10 +338,9 @@ impl OximeterAgent { ) -> Result<(), ForcedCollectionError> { let mut collection_oneshots = vec![]; let collection_tasks = self.collection_tasks.lock().await; - for (_id, (_endpoint, task)) in collection_tasks.iter() { - let (tx, rx) = oneshot::channel(); + for (_id, task) in collection_tasks.iter() { // Scrape from each producer, into oximeter... - task.inbox.send(CollectionMessage::Collect(tx)).await.unwrap(); + let rx = task.collect(); // ... and keep track of the token that indicates once the metric // has made it into ClickHouse. collection_oneshots.push(rx); @@ -1152,7 +384,7 @@ impl OximeterAgent { .await .range((start, Bound::Unbounded)) .take(limit) - .map(|(_id, (info, _t))| *info) + .map(|(_id, task)| task.producer) .collect() } @@ -1166,13 +398,10 @@ impl OximeterAgent { // the map is held. async fn delete_producer_locked( &self, - tasks: &mut MutexGuard< - '_, - BTreeMap, - >, + tasks: &mut MutexGuard<'_, BTreeMap>, id: Uuid, ) -> Result<(), Error> { - let Some((_info, task)) = tasks.remove(&id) else { + let Some(task) = tasks.remove(&id) else { // We have no such producer, so good news, we've removed it! return Ok(()); }; @@ -1181,19 +410,7 @@ impl OximeterAgent { "removed collection task from set"; "producer_id" => %id, ); - match task.inbox.send(CollectionMessage::Shutdown).await { - Ok(_) => debug!( - self.log, - "shut down collection task"; - "producer_id" => %id, - ), - Err(e) => error!( - self.log, - "failed to shut down collection task"; - "producer_id" => %id, - InlineErrorChain::new(&e), - ), - } + task.shutdown().await; Ok(()) } @@ -1367,7 +584,6 @@ async fn claim_nexus_with_backoff( #[cfg(test)] mod tests { - use super::CollectionMessage; use super::OximeterAgent; use super::ProducerEndpoint; use crate::self_stats::FailureReason; @@ -1387,7 +603,6 @@ mod tests { use std::sync::atomic::Ordering; use std::sync::Arc; use std::time::Duration; - use tokio::sync::oneshot; use tokio::time::Instant; use uuid::Uuid; @@ -1518,21 +733,15 @@ mod tests { } // Request the statistics from the task itself. 
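// [Illustrative sketch, not part of this patch.] The producer-listing code
// above pages through the task map using a `range` starting bound plus
// `take(limit)`. A minimal, self-contained sketch of that BTreeMap
// pagination pattern with stand-in key and value types:

use std::collections::BTreeMap;
use std::ops::Bound;

fn page(map: &BTreeMap<u32, &'static str>, start: Bound<u32>, limit: usize) -> Vec<u32> {
    map.range((start, Bound::Unbounded)).take(limit).map(|(k, _v)| *k).collect()
}

fn main() {
    let map: BTreeMap<u32, &'static str> =
        [(1, "a"), (2, "b"), (3, "c"), (4, "d")].into_iter().collect();
    // First page: everything from the beginning.
    assert_eq!(page(&map, Bound::Unbounded, 2), vec![1, 2]);
    // Next page: strictly after the last key we saw.
    assert_eq!(page(&map, Bound::Excluded(2), 2), vec![3, 4]);
}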
- let (reply_tx, rx) = oneshot::channel(); - collector + let stats = collector .collection_tasks .lock() .await .values() .next() .unwrap() - .1 - .inbox - .send(CollectionMessage::Statistics { reply_tx }) - .await - .expect("failed to request statistics from task"); - let stats = rx.await.expect("failed to receive statistics from task"); - + .statistics() + .await; let count = stats.collections.datum.value() as usize; assert!(count != 0); @@ -1589,20 +798,15 @@ mod tests { } // Request the statistics from the task itself. - let (reply_tx, rx) = oneshot::channel(); - collector + let stats = collector .collection_tasks .lock() .await .values() .next() .unwrap() - .1 - .inbox - .send(CollectionMessage::Statistics { reply_tx }) - .await - .expect("failed to request statistics from task"); - let stats = rx.await.expect("failed to receive statistics from task"); + .statistics() + .await; assert_eq!(stats.collections.datum.value(), 0); assert_eq!( stats @@ -1668,20 +872,15 @@ mod tests { } // Request the statistics from the task itself. - let (reply_tx, rx) = oneshot::channel(); - collector + let stats = collector .collection_tasks .lock() .await .values() .next() .unwrap() - .1 - .inbox - .send(CollectionMessage::Statistics { reply_tx }) - .await - .expect("failed to request statistics from task"); - let stats = rx.await.expect("failed to receive statistics from task"); + .statistics() + .await; let count = stats .failed_collections .get(&FailureReason::Other( diff --git a/oximeter/collector/src/collection_task.rs b/oximeter/collector/src/collection_task.rs new file mode 100644 index 0000000000..716f87421f --- /dev/null +++ b/oximeter/collector/src/collection_task.rs @@ -0,0 +1,898 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! Task responsible for collecting from a single producer. + +// Copyright 2024 Oxide Computer Company + +use crate::self_stats; +use crate::Error; +use chrono::DateTime; +use chrono::Utc; +use omicron_common::api::internal::nexus::ProducerEndpoint; +use oximeter::types::ProducerResults; +use oximeter::types::ProducerResultsItem; +use oximeter_api::FailedCollection; +use oximeter_api::ProducerDetails; +use oximeter_api::SuccessfulCollection; +use slog::debug; +use slog::error; +use slog::o; +use slog::trace; +use slog::warn; +use slog::Logger; +use slog_error_chain::InlineErrorChain; +use std::time::Duration; +use tokio::sync::mpsc; +use tokio::sync::mpsc::error::TrySendError; +use tokio::sync::oneshot; +use tokio::sync::watch; +use tokio::time::interval; +use tokio::time::Instant; +use tokio::time::Interval; + +/// A token used to force a collection. +/// +/// If the collection is successfully completed, `Ok(())` will be sent back on the +/// contained oneshot channel. Note that that "successful" means the actual +/// request completed, _not_ that results were successfully collected. I.e., it +/// means "this attempt is done". +/// +/// If the collection could not be queued because there are too many outstanding +/// force collection attempts, an `Err(ForcedCollectionQueueFull)` is returned. +type CollectionToken = oneshot::Sender>; + +/// Error returned when a forced collection fails. +#[derive(Clone, Copy, Debug)] +pub enum ForcedCollectionError { + /// The internal queue of requests is full. + QueueFull, + /// We failed to send the request because the channel was closed. 
+ Closed, +} + +/// Timeout on any single collection from a producer. +const COLLECTION_TIMEOUT: Duration = Duration::from_secs(30); + +/// The number of forced collections queued before we start to deny them. +const N_QUEUED_FORCED_COLLECTIONS: usize = 1; + +/// The number of timer-based collections queued before we start to deny them. +const N_QUEUED_TIMER_COLLECTIONS: usize = 1; + +/// The number of queued messages from the main collector agent. +const N_QUEUED_TASK_MESSAGES: usize = 4; + +/// The number of queued results from our internal collection task. +const N_QUEUED_RESULTS: usize = 1; + +// Messages for controlling a collection task +#[derive(Debug)] +enum CollectionMessage { + // Explicit request that the task collect data from its producer + // + // Also sends a oneshot that is signalled once the task scrapes + // data from the Producer, and places it in the Clickhouse server. + Collect(CollectionToken), + // Request that the task update its interval and the socket address on which it collects data + // from its producer. + Update(ProducerEndpoint), + // Request that the task exit + Shutdown, + // Return the current statistics from a single task. + #[cfg(test)] + Statistics { + reply_tx: oneshot::Sender, + }, + // Request details from the collection task about its producer. + Details { + reply_tx: oneshot::Sender, + }, +} + +/// Return type for `perform_collection`. +struct SingleCollectionResult { + /// The result of the collection. + result: Result, + /// The duration the collection took. + duration: Duration, +} + +/// Run a single collection from the producer. +async fn perform_collection( + log: Logger, + client: reqwest::Client, + producer: ProducerEndpoint, +) -> SingleCollectionResult { + let start = Instant::now(); + debug!(log, "collecting from producer"); + let res = client + .get(format!("http://{}/{}", producer.address, producer.id)) + .send() + .await; + trace!(log, "sent collection request to producer"); + let result = match res { + Ok(res) => { + if res.status().is_success() { + match res.json::().await { + Ok(results) => { + debug!( + log, + "collected results from producer"; + "n_results" => results.len() + ); + Ok(results) + } + Err(e) => { + warn!( + log, + "failed to collect results from producer"; + InlineErrorChain::new(&e), + ); + Err(self_stats::FailureReason::Deserialization) + } + } + } else { + warn!( + log, + "failed to receive metric results from producer"; + "status_code" => res.status().as_u16(), + ); + Err(self_stats::FailureReason::Other(res.status())) + } + } + Err(e) => { + error!( + log, + "failed to send collection request to producer"; + InlineErrorChain::new(&e), + ); + Err(self_stats::FailureReason::Unreachable) + } + }; + SingleCollectionResult { result, duration: start.elapsed() } +} + +// The type of one collection task run to completion. +// +// An `Err(_)` means we failed to collect, and contains the reason so that we +// can bump the self-stat counter accordingly. +type CollectionResult = Result; + +/// Information about when we start a collection. +struct CollectionStartTimes { + /// UTC timestamp at which the request was started. + started_at: DateTime, + /// Instant right before we queued the response for processing. + queued_at: Instant, +} + +impl CollectionStartTimes { + fn new() -> Self { + Self { started_at: Utc::now(), queued_at: Instant::now() } + } +} + +/// Details about a forced collection. +struct ForcedCollectionRequest { + /// The collection token we signal when the collection is completed. 
+ token: CollectionToken, + /// Start time for this collection. + start: CollectionStartTimes, +} + +impl ForcedCollectionRequest { + fn new(token: CollectionToken) -> Self { + Self { token, start: CollectionStartTimes::new() } + } +} + +/// Details about a completed collection. +struct CollectionResponse { + /// Token for a forced collection request. + token: Option, + /// The actual result of the collection. + result: CollectionResult, + /// Time when the collection started. + started_at: DateTime, + /// Time the request spent queued. + time_queued: Duration, + /// Time we spent processing the request. + time_collecting: Duration, +} + +/// Task that actually performs collections from the producer. +async fn collection_loop( + log: Logger, + mut producer_info_rx: watch::Receiver, + mut forced_collection_rx: mpsc::Receiver, + mut timer_collection_rx: mpsc::Receiver, + result_tx: mpsc::Sender, +) { + let client = reqwest::Client::builder() + .timeout(COLLECTION_TIMEOUT) + .build() + // Safety: `build()` only fails if TLS couldn't be initialized or the + // system DNS configuration could not be loaded. + .unwrap(); + loop { + // Wait for notification that we have a collection to perform, from + // either the forced- or timer-collection queue. + trace!(log, "top of inner collection loop, waiting for next request"); + let (maybe_token, start_time) = tokio::select! { + maybe_request = forced_collection_rx.recv() => { + let Some(ForcedCollectionRequest { token, start }) = maybe_request else { + debug!( + log, + "forced collection request queue closed, exiting" + ); + return; + }; + (Some(token), start) + } + maybe_request = timer_collection_rx.recv() => { + let Some(start) = maybe_request else { + debug!( + log, + "timer collection request queue closed, exiting" + ); + return; + }; + (None, start) + } + }; + + // Record the time this request was queued. We'll include this along + // with the time spent collecting, which is returned from the future + // that actually does the collection. + let CollectionStartTimes { started_at, queued_at } = start_time; + let time_queued = queued_at.elapsed(); + + // Make a future to represent the actual collection. + let mut collection_fut = Box::pin(perform_collection( + log.clone(), + client.clone(), + *producer_info_rx.borrow_and_update(), + )); + + // Wait for that collection to complete or fail, or for an update to the + // producer's information. In the latter case, recreate the future for + // the collection itself with the new producer information. + let SingleCollectionResult { result, duration } = 'collection: loop { + tokio::select! { + biased; + + maybe_update = producer_info_rx.changed() => { + match maybe_update { + Ok(_) => { + let update = *producer_info_rx.borrow_and_update(); + debug!( + log, + "received producer info update with an outstanding \ + collection running, cancelling it and recreating \ + with the new info"; + "new_info" => ?&update, + ); + collection_fut = Box::pin(perform_collection( + log.new(o!("address" => update.address)), + client.clone(), + update, + )); + continue 'collection; + } + Err(e) => { + error!( + log, + "failed to receive on producer update \ + watch channel, exiting"; + InlineErrorChain::new(&e), + ); + return; + } + } + } + + collection_result = &mut collection_fut => { + // NOTE: This break here is intentional. We cannot just call + // `result_tx.send()` in this loop, because that moves out + // of `maybe_token`, which isn't Copy. 
Break the loop, and + // then send it after we know we've completed the + // collection. + break 'collection collection_result; + } + } + }; + + // Now that the collection has completed, send on the results, along + // with the timing information and any collection token we may have + // gotten with the request. + let response = CollectionResponse { + token: maybe_token, + result, + started_at, + time_queued, + time_collecting: duration, + }; + match result_tx.send(response).await { + Ok(_) => trace!(log, "forwarded results to main collection loop"), + Err(_) => { + error!( + log, + "failed to forward results to \ + collection loop, channel is closed, exiting", + ); + return; + } + } + } +} + +/// Type of each output sent from a collection task to the results sink. +pub type CollectionTaskOutput = (Option, ProducerResults); + +/// Handle to the task which collects metric data from a single producer. +#[derive(Debug)] +pub struct CollectionTaskHandle { + /// Information about the producer we're currently collecting from. + pub producer: ProducerEndpoint, + // Channel used to send messages from the agent to the actual task. + // + // The task owns the other side. + task_tx: mpsc::Sender, + log: Logger, +} + +impl CollectionTaskHandle { + /// Create a new collection task handle. + /// + /// This spawns the actual task itself, and returns a handle to it. The + /// latter is used to send messages to the task, through the handle's + /// `inbox` field. + pub async fn new( + log: &Logger, + collector: self_stats::OximeterCollector, + producer: ProducerEndpoint, + outbox: mpsc::Sender, + ) -> Self { + let (task, task_tx) = + CollectionTask::new(log, collector, producer, outbox).await; + tokio::spawn(task.run()); + let log = log.new(o!( + "component" => "collection-task-handle", + "producer_id" => producer.id.to_string(), + )); + Self { task_tx, producer, log } + } + + /// Ask the task to update its producer endpoint information. + /// + /// # Panics + /// + /// This panics if we could not send a message to the internal collection + /// task. That only happens when that task has exited. + pub async fn update(&mut self, info: ProducerEndpoint) { + match self.task_tx.send(CollectionMessage::Update(info)).await { + Ok(_) => { + trace!( + self.log, + "sent update message to task"; + "new_info" => ?info, + ); + self.producer = info; + } + Err(e) => { + error!( + self.log, + "failed to send update message to task!"; + "error" => InlineErrorChain::new(&e), + ); + panic!("failed to send update message to task: {}", e); + } + } + } + + /// Ask the collection task to shutdown. + pub async fn shutdown(&self) { + match self.task_tx.send(CollectionMessage::Shutdown).await { + Ok(_) => trace!(self.log, "sent shutdown message to task"), + Err(e) => error!( + self.log, + "failed to send shutdown message to task!"; + "error" => InlineErrorChain::new(&e), + ), + } + } + + /// Return the current statistics from this task. + #[cfg(test)] + pub async fn statistics(&self) -> self_stats::CollectionTaskStats { + let (reply_tx, rx) = oneshot::channel(); + self.task_tx + .send(CollectionMessage::Statistics { reply_tx }) + .await + .expect("Failed to send statistics message"); + rx.await.expect("Failed to receive statistics") + } + + /// Return details about the current producer this task is collecting from. + /// + /// An error is returned if we either could not send the request to the + /// producer because its queue is full, or because the task failed to send + /// us the response. 
+ /// + /// Note that this makes collecting details best-effort -- if the task is + /// already doing lots of work and its queue is full, we fail rather than + /// block. + pub async fn details(&self) -> Result { + let (reply_tx, rx) = oneshot::channel(); + if self + .task_tx + .try_send(CollectionMessage::Details { reply_tx }) + .is_err() + { + return Err(Error::CollectionError( + self.producer.id, + String::from( + "Failed to send detail request to collection task", + ), + )); + } + rx.await.map_err(|_| { + Error::CollectionError( + self.producer.id, + String::from( + "Failed to receive detail response from collection task", + ), + ) + }) + } + + /// Explicitly request that the task collect from its producer now. + /// + /// Note that this doesn't block, instead returning a oneshot that will + /// resolve when the collection completes. + pub fn collect( + &self, + ) -> oneshot::Receiver> { + let (tx, rx) = oneshot::channel(); + match self.task_tx.try_send(CollectionMessage::Collect(tx)) { + Ok(_) => rx, + Err(err) => { + let (err, msg) = match err { + TrySendError::Full(msg) => { + (ForcedCollectionError::QueueFull, msg) + } + TrySendError::Closed(msg) => { + (ForcedCollectionError::Closed, msg) + } + }; + let CollectionMessage::Collect(tx) = msg else { + unreachable!(); + }; + // Safety: In this case, we own both sides of the channel and we + // know nothing has been sent on it. This can't fail. + tx.send(Err(err)).unwrap(); + rx + } + } + } +} + +/// Helper type used to simplify control flow in the main `CollectionTask::run` +/// method. +type TaskAction = std::ops::ControlFlow<()>; + +/// Main task used to dispatch messages from the oximeter agent and request +/// collections from the producer. +#[derive(Debug)] +struct CollectionTask { + log: Logger, + + // The details about past collections from this producer. + details: ProducerDetails, + + // Statistics about all collections we've made so far. + stats: self_stats::CollectionTaskStats, + + // Inbox for messages from the controlling task handle. + inbox: mpsc::Receiver, + + // Watch channel for broadcasting changes about the producer. + producer_info_tx: watch::Sender, + + // Channel for sending forced collection requests. + forced_collection_tx: mpsc::Sender, + + // Channel for sending timer-based collection requests. + timer_collection_tx: mpsc::Sender, + + // Channel for receiving collection responses from the inner collection + // loop. + result_rx: mpsc::Receiver, + + // Outbox for forwarding the results to the sink. + outbox: mpsc::Sender, + + // Timer for making collections periodically. + collection_timer: Interval, + + // Timer for reporting our own collection statistics to the database. + self_collection_timer: Interval, +} + +impl CollectionTask { + // Construct a new collection task. + // + // This also spawns the internal task which itself manages the collections + // from our assigned producer. It then creates all the controlling queues + // for talking to this task and the inner task. + async fn new( + log: &Logger, + collector: self_stats::OximeterCollector, + producer: ProducerEndpoint, + outbox: mpsc::Sender, + ) -> (Self, mpsc::Sender) { + // Create our own logger. + let log = log.new(o!( + "component" => "collection-task", + "producer_id" => producer.id.to_string(), + )); + + // Setup queues for talking between ourselves, our controlling task + // handle, and the spawned collection loop itself. 
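// [Illustrative sketch, not part of this patch.] `TaskAction` above is just
// `std::ops::ControlFlow<()>`: each handler decides whether the main loop
// keeps going, and `?` propagates the Break case back out of `run`. A
// minimal, self-contained sketch of that control-flow style with stand-in
// messages:

use std::ops::ControlFlow;

type Action = ControlFlow<()>;

fn handle(msg: &str) -> Action {
    match msg {
        "shutdown" => ControlFlow::Break(()),
        other => {
            println!("handled message: {other}");
            ControlFlow::Continue(())
        }
    }
}

fn run(messages: &[&str]) -> Action {
    for &msg in messages {
        // `?` returns early from `run` as soon as a handler breaks.
        handle(msg)?;
    }
    ControlFlow::Continue(())
}

fn main() {
    assert_eq!(run(&["collect", "shutdown", "collect"]), ControlFlow::Break(()));
}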
+ let (task_tx, inbox) = mpsc::channel(N_QUEUED_TASK_MESSAGES); + let (producer_info_tx, producer_info_rx) = watch::channel(producer); + let (forced_collection_tx, forced_collection_rx) = + mpsc::channel(N_QUEUED_FORCED_COLLECTIONS); + let (timer_collection_tx, timer_collection_rx) = + mpsc::channel(N_QUEUED_TIMER_COLLECTIONS); + let (result_tx, result_rx) = mpsc::channel(N_QUEUED_RESULTS); + tokio::task::spawn(collection_loop( + log.clone(), + producer_info_rx, + forced_collection_rx, + timer_collection_rx, + result_tx, + )); + + // Construct ourself, and return our controlling input queue. + let details = ProducerDetails::new(&producer); + let stats = self_stats::CollectionTaskStats::new(collector, &producer); + let collection_timer = Self::timer(producer.interval).await; + let self_collection_timer = + Self::timer(self_stats::COLLECTION_INTERVAL).await; + let self_ = Self { + log, + details, + stats, + inbox, + outbox, + producer_info_tx, + forced_collection_tx, + timer_collection_tx, + result_rx, + collection_timer, + self_collection_timer, + }; + (self_, task_tx) + } + + /// Helper to construct a timer and tick it. + /// + /// Since a `tokio::time::interval`'s first tick completes immediately, this + /// constructs the timer and then _ticks it_ once. + async fn timer(t: Duration) -> Interval { + let mut timer = interval(t); + timer.tick().await; + timer + } + + /// Run the main loop of this collection task. + /// + /// NOTE: This returns a `TaskAction`, but the value isn't used. It returns + /// that value to simplify control-flow internally, which uses `?` to + /// propagate the `TaskAction::Break` variant when we need to exit. + async fn run(mut self) -> TaskAction { + loop { + tokio::select! { + message = self.inbox.recv() => { + let Some(message) = message else { + debug!( + self.log, + "collection task inbox closed, shutting down" + ); + return TaskAction::Break(()); + }; + self.handle_inbox_message(message).await?; + } + maybe_result = self.result_rx.recv() => { + let Some(response) = maybe_result else { + error!( + self.log, + "channel for receiving results from collection task \ + is closed, exiting", + ); + return TaskAction::Break(()); + }; + self.handle_collection_response(response).await?; + } + _ = self.self_collection_timer.tick() => { + debug!( + self.log, + "reporting oximeter self-collection statistics" + ); + self.outbox.send((None, self.stats.sample())).await.unwrap(); + } + _ = self.collection_timer.tick() => { + self.handle_collection_timer_tick().await?; + } + } + } + } + + /// Handle a single message from the task handle. + /// + /// This method takes messages from the main oximeter agent, passed through + /// our controlling handle. This implements the main public API of the + /// `CollectionTaskHandle` methods that the agent uses. + async fn handle_inbox_message( + &mut self, + message: CollectionMessage, + ) -> TaskAction { + match message { + CollectionMessage::Shutdown => { + debug!(self.log, "collection task received shutdown request"); + return TaskAction::Break(()); + } + CollectionMessage::Collect(token) => { + debug!( + self.log, + "collection task received explicit request to collect" + ); + let request = ForcedCollectionRequest::new(token); + match self.forced_collection_tx.try_send(request) { + Ok(_) => { + trace!( + self.log, + "forwarded explicit request to collection task" + ); + } + Err(e) => match e { + TrySendError::Closed(ForcedCollectionRequest { + token, + .. 
+                        }) => {
+                            debug!(
+                                self.log,
+                                "collection task forced collection \
+                                queue is closed. Attempting to \
+                                notify caller and exiting.",
+                            );
+                            let _ =
+                                token.send(Err(ForcedCollectionError::Closed));
+                            return TaskAction::Break(());
+                        }
+                        TrySendError::Full(ForcedCollectionRequest {
+                            token,
+                            start,
+                        }) => {
+                            error!(
+                                self.log,
+                                "collection task forced collection \
+                                queue is full! This should never \
+                                happen, and probably indicates \
+                                a bug in your test code, such as \
+                                calling `force_collection()` many \
+                                times"
+                            );
+                            if token
+                                .send(Err(ForcedCollectionError::QueueFull))
+                                .is_err()
+                            {
+                                warn!(
+                                    self.log,
+                                    "failed to notify caller of \
+                                    force_collection(), oneshot is \
+                                    closed"
+                                );
+                            }
+                            let failure = FailedCollection {
+                                started_at: start.started_at,
+                                time_queued: Duration::ZERO,
+                                time_collecting: Duration::ZERO,
+                                reason: String::from(
+                                    "forced collection queue full",
+                                ),
+                            };
+                            self.details.on_failure(failure);
+                        }
+                    },
+                }
+            }
+            CollectionMessage::Update(new_info) => {
+                // If the collection interval is shorter than the
+                // interval on which we receive these update messages,
+                // we'll never actually collect anything! Instead, only
+                // do the update if the information has changed. This
+                // should also be guarded against by the main agent, but
+                // we're being cautious here.
+                let updated_producer_info = |info: &mut ProducerEndpoint| {
+                    if new_info == *info {
+                        false
+                    } else {
+                        *info = new_info;
+                        true
+                    }
+                };
+                if !self
+                    .producer_info_tx
+                    .send_if_modified(updated_producer_info)
+                {
+                    trace!(
+                        self.log,
+                        "collection task received update with \
+                        identical producer information, no \
+                        updates will be sent to the collection task"
+                    );
+                    return TaskAction::Continue(());
+                }
+
+                // We have an actual update to the producer information.
+                //
+                // Rebuild our timer to reflect the possibly-new
+                // interval. The collection task has already been
+                // notified above.
+                debug!(
+                    self.log,
+                    "collection task received request to update \
+                    its producer information";
+                    "interval" => ?new_info.interval,
+                    "address" => new_info.address,
+                );
+                self.details.update(&new_info);
+                self.stats.update(&new_info);
+                self.collection_timer = Self::timer(new_info.interval).await;
+            }
+            #[cfg(test)]
+            CollectionMessage::Statistics { reply_tx } => {
+                // Time should be paused when using this retrieval
+                // mechanism. We advance time to cause a panic if this
+                // message were to be sent with time *not* paused.
+                tokio::time::advance(Duration::from_nanos(1)).await;
+                // The collection timer *may* be ready to go in which
+                // case we would do a collection right after
+                // processing this message, thus changing the actual
+                // data. Instead we reset the timer to prevent
+                // additional collections (i.e. since time is paused).
+                self.collection_timer.reset();
+                debug!(
+                    self.log,
+                    "received request for current task statistics"
+                );
+                reply_tx
+                    .send(self.stats.clone())
+                    .expect("failed to send statistics");
+            }
+            CollectionMessage::Details { reply_tx } => {
+                match reply_tx.send(self.details.clone()) {
+                    Ok(_) => trace!(
+                        self.log,
+                        "sent producer details reply to oximeter agent",
+                    ),
+                    Err(e) => error!(
+                        self.log,
+                        "failed to send producer details reply to \
+                        oximeter agent";
+                        "error" => ?e,
+                    ),
+                }
+            }
+        }
+
+        // Continue unless we explicitly exit early.
+        TaskAction::Continue(())
+    }
+
+    /// Handle a single collection response from the inner collection task.
+ /// + /// This takes responses from the spawned task that actually does + /// collections, and dispatches them to the results sink. It also updates + /// our own details and collection stats accordingly. + async fn handle_collection_response( + &mut self, + response: CollectionResponse, + ) -> TaskAction { + let CollectionResponse { + token, + result, + started_at, + time_queued, + time_collecting, + } = response; + match result { + Ok(results) => { + self.stats.collections.datum.increment(); + let n_samples: u64 = results + .iter() + .map(|each| match each { + ProducerResultsItem::Ok(samples) => { + samples.len() as u64 + } + _ => 0, + }) + .sum(); + let success = SuccessfulCollection { + started_at, + time_queued, + time_collecting, + n_samples, + }; + self.details.on_success(success); + if self.outbox.send((token, results)).await.is_err() { + error!( + self.log, + "failed to send results to outbox, channel is \ + closed, exiting", + ); + return TaskAction::Break(()); + } + } + Err(reason) => { + let failure = FailedCollection { + started_at, + time_queued, + time_collecting, + reason: reason.to_string(), + }; + self.details.on_failure(failure); + self.stats.failures_for_reason(reason).datum.increment(); + } + } + TaskAction::Continue(()) + } + + async fn handle_collection_timer_tick(&mut self) -> TaskAction { + match self.timer_collection_tx.try_send(CollectionStartTimes::new()) { + Ok(_) => { + debug!( + self.log, + "sent timer-based collection request to \ + the collection task" + ); + } + Err(TrySendError::Closed(_)) => { + error!( + self.log, + "timer-based collection request queue is \ + closed, exiting" + ); + return TaskAction::Break(()); + } + Err(TrySendError::Full(start)) => { + let failure = FailedCollection { + started_at: start.started_at, + time_queued: Duration::ZERO, + time_collecting: Duration::ZERO, + reason: String::from("collections in progress"), + }; + self.details.on_failure(failure); + error!( + self.log, + "timer-based collection request queue is \ + full! This may indicate that the producer \ + has a sampling interval that is too fast \ + for the amount of data it generates"; + "interval" => ?self.producer_info_tx.borrow().interval, + ); + self.stats + .failures_for_reason( + self_stats::FailureReason::CollectionsInProgress, + ) + .datum + .increment() + } + } + TaskAction::Continue(()) + } +} diff --git a/oximeter/collector/src/lib.rs b/oximeter/collector/src/lib.rs index 3f13eb1382..54044b0068 100644 --- a/oximeter/collector/src/lib.rs +++ b/oximeter/collector/src/lib.rs @@ -4,9 +4,9 @@ //! Implementation of the `oximeter` metric collection server. -// Copyright 2023 Oxide Computer Company +// Copyright 2024 Oxide Computer Company -pub use agent::ForcedCollectionError; +pub use collection_task::ForcedCollectionError; use dropshot::ConfigDropshot; use dropshot::ConfigLogging; use dropshot::HttpError; @@ -42,7 +42,9 @@ use thiserror::Error; use uuid::Uuid; mod agent; +mod collection_task; mod http_entrypoints; +mod results_sink; mod self_stats; mod standalone; diff --git a/oximeter/collector/src/results_sink.rs b/oximeter/collector/src/results_sink.rs new file mode 100644 index 0000000000..3013d472b6 --- /dev/null +++ b/oximeter/collector/src/results_sink.rs @@ -0,0 +1,148 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! Tasks acting as sinks for results. +//! +//! 
This includes the usual task that inserts data into ClickHouse, and a
+//! printing task used in `oximeter` standalone.
+
+// Copyright 2024 Oxide Computer Company
+
+use crate::collection_task::CollectionTaskOutput;
+use oximeter::types::ProducerResultsItem;
+use oximeter_db::Client;
+use oximeter_db::DbWrite as _;
+use slog::debug;
+use slog::error;
+use slog::info;
+use slog::trace;
+use slog::warn;
+use slog::Logger;
+use slog_error_chain::InlineErrorChain;
+use std::time::Duration;
+use tokio::sync::mpsc;
+use tokio::time::interval;
+
+/// A sink that inserts all results into the ClickHouse database.
+///
+/// This sink is used in production, when running the `oximeter` collector
+/// normally. It aggregates all results, from all collection tasks, and inserts
+/// them into ClickHouse in batches.
+pub async fn database_inserter(
+    log: Logger,
+    client: Client,
+    batch_size: usize,
+    batch_interval: Duration,
+    mut rx: mpsc::Receiver<CollectionTaskOutput>,
+) {
+    let mut timer = interval(batch_interval);
+    timer.tick().await; // completes immediately
+    let mut batch = Vec::with_capacity(batch_size);
+    loop {
+        let mut collection_token = None;
+        let insert = tokio::select! {
+            _ = timer.tick() => {
+                if batch.is_empty() {
+                    trace!(log, "batch interval expired, but no samples to insert");
+                    false
+                } else {
+                    true
+                }
+            }
+            results = rx.recv() => {
+                match results {
+                    Some((token, results)) => {
+                        let flattened_results = {
+                            let mut flattened = Vec::with_capacity(results.len());
+                            for inner_batch in results.into_iter() {
+                                match inner_batch {
+                                    ProducerResultsItem::Ok(samples) => flattened.extend(samples.into_iter()),
+                                    ProducerResultsItem::Err(e) => {
+                                        debug!(
+                                            log,
+                                            "received error (not samples) from a producer: {}",
+                                            e.to_string()
+                                        );
+                                    }
+                                }
+                            }
+                            flattened
+                        };
+                        batch.extend(flattened_results);
+
+                        collection_token = token;
+                        if collection_token.is_some() {
+                            true
+                        } else {
+                            batch.len() >= batch_size
+                        }
+                    }
+                    None => {
+                        warn!(log, "result queue closed, exiting");
+                        return;
+                    }
+                }
+            }
+        };
+
+        if insert {
+            debug!(log, "inserting {} samples into database", batch.len());
+            match client.insert_samples(&batch).await {
+                Ok(()) => trace!(log, "successfully inserted samples"),
+                Err(e) => {
+                    warn!(
+                        log,
+                        "failed to insert some results into metric DB: {}",
+                        e.to_string()
+                    );
+                }
+            }
+            // TODO-correctness The `insert_samples` call above may fail. The method itself needs
+            // better handling of partially-inserted results in that case, but we may need to retry
+            // or otherwise handle an error here as well.
+            //
+            // See https://github.com/oxidecomputer/omicron/issues/740 for a
+            // discussion.
+            batch.clear();
+        }
+
+        if let Some(token) = collection_token {
+            let _ = token.send(Ok(()));
+        }
+    }
+}
+
+/// A sink run in `oximeter` standalone, that logs results on receipt.
+pub async fn logger(log: Logger, mut rx: mpsc::Receiver) { + loop { + match rx.recv().await { + Some((_, results)) => { + for res in results.into_iter() { + match res { + ProducerResultsItem::Ok(samples) => { + for sample in samples.into_iter() { + info!( + log, + ""; + "sample" => ?sample, + ); + } + } + ProducerResultsItem::Err(e) => { + error!( + log, + "received error from a producer"; + InlineErrorChain::new(&e), + ); + } + } + } + } + None => { + debug!(log, "result queue closed, exiting"); + return; + } + } + } +} From 228848edd5ec5d0a6ad883670453c59de8188ecb Mon Sep 17 00:00:00 2001 From: iliana etaoin Date: Mon, 9 Dec 2024 16:28:56 -0800 Subject: [PATCH 15/22] omicron-package: work around cargo issue 8157 (#7218) #5799 modified the `cargo build` command line omicron-package runs. Previously it built up a list of packages to be built using the `-p` flag; that PR changed it to use `--bin`. The goal was to build only the binaries that are necessary for shipping; this avoids building sled-agent-sim during releng, for instance. We did not realize it at the time, but this invited the specter of https://github.com/rust-lang/cargo/issues/8157 to wreak havoc; namely: - Without `--package`, Cargo uses the `default-members` key of the workspace Cargo.toml to determine which packages to build. `--bin` does not cause the same thing to happen; saying `--bin` does _not_ imply `--package [the package that the bin belongs to]`. - `omicron-dev` belongs to `default-members` and has a normal dependency on `nexus-test-utils`, which enables the `"testing"` feature of `nexus-db-queries`. https://github.com/oxidecomputer/omicron/issues/7208 is a known result of this problem, but there might be more. Fortunately the solution seems fairly easy, without reverting the relevant changes from #5799: use _both_ `--package` and `--bin`. With this change, the `"testing"` feature is no longer shown in the `cargo build --unit-graph` and `nm target/release/nexus | demangle | grep validate_volume_invar` no longer shows any matching testing-only symbols. --- package/src/bin/omicron-package.rs | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/package/src/bin/omicron-package.rs b/package/src/bin/omicron-package.rs index c828c90432..cc4050cbce 100644 --- a/package/src/bin/omicron-package.rs +++ b/package/src/bin/omicron-package.rs @@ -108,6 +108,7 @@ struct Args { #[derive(Debug, Default)] struct CargoPlan<'a> { command: &'a str, + packages: BTreeSet<&'a String>, bins: BTreeSet<&'a String>, features: BTreeSet<&'a String>, release: bool, @@ -123,6 +124,12 @@ impl<'a> CargoPlan<'a> { // We rely on the rust-toolchain.toml file for toolchain information, // rather than specifying one within the packaging tool. cmd.arg(self.command); + // We specify _both_ --package and --bin; --bin does not imply + // --package, and without any --package options Cargo unifies features + // across all workspace default members. See rust-lang/cargo#8157. + for package in &self.packages { + cmd.arg("--package").arg(package); + } for bin in &self.bins { cmd.arg("--bin").arg(bin); } @@ -185,9 +192,12 @@ async fn do_for_all_rust_packages( let mut debug = CargoPlan { command, release: false, ..Default::default() }; for (name, pkg) in config.packages_to_build().0 { - // If this is a Rust package... + // If this is a Rust package, `name` (the map key) is the name of the + // corresponding Rust crate. if let PackageSource::Local { rust: Some(rust_pkg), .. 
} = &pkg.source { let plan = if rust_pkg.release { &mut release } else { &mut debug }; + // Add the package name to the plan + plan.packages.insert(name); // Get the package metadata let metadata = workspace_pkgs.get(name).with_context(|| { format!("package '{name}' is not a workspace package") From c8f8332bc8941a6696ebd290360d331fd9f158cb Mon Sep 17 00:00:00 2001 From: Benjamin Naecker Date: Mon, 9 Dec 2024 19:12:43 -0800 Subject: [PATCH 16/22] Update to diesel-dtrace 0.4.0 (#7219) --- Cargo.lock | 8 ++++---- Cargo.toml | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index f5e44318bb..3d290f146b 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2302,9 +2302,9 @@ dependencies = [ [[package]] name = "diesel" -version = "2.2.4" +version = "2.2.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "158fe8e2e68695bd615d7e4f3227c0727b151330d3e253b525086c348d055d5e" +checksum = "ccf1bedf64cdb9643204a36dd15b19a6ce8e7aa7f7b105868e9f1fad5ffa7d12" dependencies = [ "bitflags 2.6.0", "byteorder", @@ -2321,9 +2321,9 @@ dependencies = [ [[package]] name = "diesel-dtrace" -version = "0.3.0" +version = "0.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5318329cce80f28564e585bb5ba4007bdf16865efa13d797a4f0fd4b1fed40f1" +checksum = "4e5130181059723aae1cfdb678d3698052a225aaadb18000f77fec4200047acc" dependencies = [ "diesel", "serde", diff --git a/Cargo.toml b/Cargo.toml index b65617f082..ec862fb722 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -361,7 +361,7 @@ derive_more = "0.99.18" derive-where = "1.2.7" # Having the i-implement-... feature here makes diesel go away from the workspace-hack diesel = { version = "2.2.4", features = ["i-implement-a-third-party-backend-and-opt-into-breaking-changes", "postgres", "r2d2", "chrono", "serde_json", "network-address", "uuid"] } -diesel-dtrace = "0.3.0" +diesel-dtrace = "0.4.0" dns-server = { path = "dns-server" } dns-server-api = { path = "dns-server-api" } dns-service-client = { path = "clients/dns-service-client" } From 7bc8572659148b67d88687c5d2a6e0f59c24b365 Mon Sep 17 00:00:00 2001 From: Benjamin Naecker Date: Tue, 10 Dec 2024 11:11:19 -0800 Subject: [PATCH 17/22] Fix test flake when verifying producers (#7223) Closes #7220 --- oximeter/collector/src/agent.rs | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/oximeter/collector/src/agent.rs b/oximeter/collector/src/agent.rs index ae9aef1bd9..e924cb2ee3 100644 --- a/oximeter/collector/src/agent.rs +++ b/oximeter/collector/src/agent.rs @@ -970,7 +970,22 @@ mod tests { // We don't manipulate time manually here, since this is pretty short // and we want to assert things about the actual timing in the test // below. - while collection_count.load(Ordering::SeqCst) < 1 { + let is_ready = || async { + // We need to check if the server has had a collection request, and + // also if we've processed it on our task side. If we don't wait for + // the second bit, updating our collection details in the task races + // with the rest of this test that checks those details. 
+ if collection_count.load(Ordering::SeqCst) < 1 { + return false; + } + collector + .producer_details(id) + .await + .expect("Should be able to get producer details") + .n_collections + > 0 + }; + while !is_ready().await { tokio::time::sleep(TICK_INTERVAL).await; } @@ -980,6 +995,7 @@ mod tests { .producer_details(id) .await .expect("Should be able to get producer details"); + println!("{details:#?}"); assert_eq!(details.id, id); assert!(details.registered > before); assert!(details.updated > before); From c9d86e27ea8fdf7dc25331057bc6fee6c4a6b678 Mon Sep 17 00:00:00 2001 From: Laura Abbott Date: Tue, 10 Dec 2024 14:54:43 -0500 Subject: [PATCH 18/22] Automatic bump of permslip manifest to sidecar-v1.0.32 (#7222) Automated bump --- tools/permslip_staging | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/permslip_staging b/tools/permslip_staging index 9c413ddc6e..c224b1ea95 100644 --- a/tools/permslip_staging +++ b/tools/permslip_staging @@ -1,5 +1,5 @@ c33a381e716127e05da928c39b3a4d5f5278e43f526ff8c5c817708c378a5c87 manifest-gimlet-v1.0.32.toml 2cda350adba506b3ab67813db932d07c7a7836b5731d5351e57d49302f41dbf4 manifest-oxide-rot-1-v1.0.30.toml 70de21757b47e3e6c15d4c8701efe80e8cc90125afdd2883ff160045aed20956 manifest-psc-v1.0.31.toml -499ee08eb77ed3600564239f3f3efdcf79f122ffc4b93b168790c24358ae1e3c manifest-sidecar-v1.0.31.toml +222ae9df38699037b75e98eb7a8b441f6cda958b8a79e57e72e410b054f1d8eb manifest-sidecar-v1.0.32.toml 6f8459afe22c27d5920356878e4d8d639464f39a15ce7b5b040c2d908d52a570 manifest-bootleby-v1.3.1.toml From cfd5a8973742cee1ca78a96f98f3231ec864ae55 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 10 Dec 2024 20:24:46 +0000 Subject: [PATCH 19/22] Bump hashbrown from 0.15.0 to 0.15.1 (#7203) --- Cargo.lock | 10 +++++----- workspace-hack/Cargo.toml | 4 ++-- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 3d290f146b..8c374f5609 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3692,9 +3692,9 @@ dependencies = [ [[package]] name = "hashbrown" -version = "0.15.0" +version = "0.15.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1e087f84d4f86bf4b218b927129862374b72199ae7d8657835f1e89000eea4fb" +checksum = "3a9bfc1af68b1726ea47d3d5109de126281def866b33970e10fbab11b5dafab3" dependencies = [ "allocator-api2", "equivalent", @@ -4524,7 +4524,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "707907fe3c25f5424cce2cb7e1cbcafee6bdbe735ca90ef77c29e84591e5b9da" dependencies = [ "equivalent", - "hashbrown 0.15.0", + "hashbrown 0.15.1", "serde", ] @@ -5339,7 +5339,7 @@ version = "0.12.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "234cf4f4a04dc1f57e24b96cc0cd600cf2af460d4161ac5ecdd0af8e1f3b2a38" dependencies = [ - "hashbrown 0.15.0", + "hashbrown 0.15.1", ] [[package]] @@ -7395,7 +7395,7 @@ dependencies = [ "generic-array", "getrandom", "group", - "hashbrown 0.15.0", + "hashbrown 0.15.1", "hex", "hickory-proto", "hmac", diff --git a/workspace-hack/Cargo.toml b/workspace-hack/Cargo.toml index b0bf8858d5..9bd648132a 100644 --- a/workspace-hack/Cargo.toml +++ b/workspace-hack/Cargo.toml @@ -62,7 +62,7 @@ gateway-messages = { git = "https://github.com/oxidecomputer/management-gateway- generic-array = { version = "0.14.7", default-features = false, features = ["more_lengths", "zeroize"] } getrandom = { version = "0.2.15", default-features = false, features = ["js", "rdrand", "std"] } group 
= { version = "0.13.0", default-features = false, features = ["alloc"] } -hashbrown = { version = "0.15.0" } +hashbrown = { version = "0.15.1" } hex = { version = "0.4.3", features = ["serde"] } hickory-proto = { version = "0.24.1", features = ["text-parsing"] } hmac = { version = "0.12.1", default-features = false, features = ["reset"] } @@ -182,7 +182,7 @@ gateway-messages = { git = "https://github.com/oxidecomputer/management-gateway- generic-array = { version = "0.14.7", default-features = false, features = ["more_lengths", "zeroize"] } getrandom = { version = "0.2.15", default-features = false, features = ["js", "rdrand", "std"] } group = { version = "0.13.0", default-features = false, features = ["alloc"] } -hashbrown = { version = "0.15.0" } +hashbrown = { version = "0.15.1" } hex = { version = "0.4.3", features = ["serde"] } hickory-proto = { version = "0.24.1", features = ["text-parsing"] } hmac = { version = "0.12.1", default-features = false, features = ["reset"] } From dcc0df384de6b1c708f715f66cc013c6fc3cde5c Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Wed, 11 Dec 2024 13:47:01 -0800 Subject: [PATCH 20/22] Fix get_dataset_properties to avoid propagating inherited UUIDs (#7232) Fixes https://github.com/oxidecomputer/omicron/issues/7231 Rather than relying on `zfs list`, uses `zfs get` and parses the `source` field to decide how properties should be used. This method is used when reporting inventory. --- illumos-utils/src/zfs.rs | 368 ++++++++++++++++++++++++++++----------- 1 file changed, 271 insertions(+), 97 deletions(-) diff --git a/illumos-utils/src/zfs.rs b/illumos-utils/src/zfs.rs index fa09fb22c5..f9edb8de86 100644 --- a/illumos-utils/src/zfs.rs +++ b/illumos-utils/src/zfs.rs @@ -5,14 +5,16 @@ //! Utilities for poking at ZFS. use crate::{execute, PFEXEC}; +use anyhow::anyhow; +use anyhow::bail; use anyhow::Context; use camino::{Utf8Path, Utf8PathBuf}; use omicron_common::api::external::ByteCount; use omicron_common::disk::CompressionAlgorithm; use omicron_common::disk::DiskIdentity; use omicron_uuid_kinds::DatasetUuid; +use std::collections::BTreeMap; use std::fmt; -use std::str::FromStr; // These locations in the ramdisk must only be used by the switch zone. // @@ -236,56 +238,118 @@ pub struct DatasetProperties { } impl DatasetProperties { - // care about. - const ZFS_LIST_STR: &'static str = + const ZFS_GET_PROPS: &'static str = "oxide:uuid,name,avail,used,quota,reservation,compression"; } -// An inner parsing function, so that the FromStr implementation can always emit -// the string 's' that failed to parse in the error message. -fn dataset_properties_parse( - s: &str, -) -> Result { - let mut iter = s.split_whitespace(); - - let id = match iter.next().context("Missing UUID")? { - "-" => None, - anything_else => Some(anything_else.parse::()?), - }; - - let name = iter.next().context("Missing 'name'")?.to_string(); - let avail = - iter.next().context("Missing 'avail'")?.parse::()?.try_into()?; - let used = - iter.next().context("Missing 'used'")?.parse::()?.try_into()?; - let quota = match iter.next().context("Missing 'quota'")?.parse::()? { - 0 => None, - q => Some(q.try_into()?), - }; - let reservation = - match iter.next().context("Missing 'reservation'")?.parse::()? 
{ - 0 => None, - r => Some(r.try_into()?), - }; - let compression = iter.next().context("Missing 'compression'")?.to_string(); - - Ok(DatasetProperties { - id, - name, - avail, - used, - quota, - reservation, - compression, - }) -} +impl DatasetProperties { + /// Parses dataset properties, assuming that the caller is providing the + /// output of the following command as stdout: + /// + /// zfs get -rpo name,property,value,source $ZFS_GET_PROPS $DATASETS + fn parse_many( + stdout: &str, + ) -> Result, anyhow::Error> { + let name_prop_val_source_list = stdout.trim().split('\n'); + + let mut datasets: BTreeMap<&str, BTreeMap<&str, _>> = BTreeMap::new(); + for name_prop_val_source in name_prop_val_source_list { + // "-H" indicates that these columns are tab-separated; + // each column may internally have whitespace. + let mut iter = name_prop_val_source.split('\t'); + + let (name, prop, val, source) = ( + iter.next().context("Missing 'name'")?, + iter.next().context("Missing 'property'")?, + iter.next().context("Missing 'value'")?, + iter.next().context("Missing 'source'")?, + ); + if let Some(extra) = iter.next() { + bail!("Unexpected column data: '{extra}'"); + } -impl FromStr for DatasetProperties { - type Err = anyhow::Error; + let props = datasets.entry(name).or_default(); + props.insert(prop, (val, source)); + } - fn from_str(s: &str) -> Result { - dataset_properties_parse(s) - .with_context(|| format!("Failed to parse: {s}")) + datasets + .into_iter() + .map(|(dataset_name, props)| { + let id = props + .get("oxide:uuid") + .filter(|(prop, source)| { + // Dataset UUIDs are properties that are optionally attached to + // datasets. However, some datasets are nested - to avoid them + // from propagating, we explicitly ignore this value if it is + // inherited. + // + // This can be the case for the "zone" filesystem root, which + // can propagate this property to a child zone without it set. + !source.starts_with("inherited") && *prop != "-" + }) + .map(|(prop, _source)| { + prop.parse::() + .context("Failed to parse UUID") + }) + .transpose()?; + let name = dataset_name.to_string(); + let avail = props + .get("available") + .map(|(prop, _source)| prop) + .ok_or(anyhow!("Missing 'available'"))? + .parse::() + .context("Failed to parse 'available'")? + .try_into()?; + let used = props + .get("used") + .map(|(prop, _source)| prop) + .ok_or(anyhow!("Missing 'used'"))? + .parse::() + .context("Failed to parse 'used'")? + .try_into()?; + let quota = props + .get("quota") + .filter(|(_prop, source)| { + // If a quota has not been set explicitly, it has a default + // source and a value of "zero". Rather than parsing the value + // as zero, it should be ignored. + *source != "default" + }) + .map(|(prop, _source)| { + prop.parse::().context("Failed to parse 'quota'") + }) + .transpose()? + .and_then(|v| ByteCount::try_from(v).ok()); + let reservation = props + .get("reservation") + .filter(|(_prop, source)| { + // If a reservation has not been set explicitly, it has a default + // source and a value of "zero". Rather than parsing the value + // as zero, it should be ignored. + *source != "default" + }) + .map(|(prop, _source)| { + prop.parse::() + .context("Failed to parse 'reservation'") + }) + .transpose()? 
+ .and_then(|v| ByteCount::try_from(v).ok()); + let compression = props + .get("compression") + .map(|(prop, _source)| prop.to_string()) + .ok_or_else(|| anyhow!("Missing 'compression'"))?; + + Ok(DatasetProperties { + id, + name, + avail, + used, + quota, + reservation, + compression, + }) + }) + .collect::, _>>() } } @@ -335,6 +399,7 @@ impl Zfs { } /// Get information about datasets within a list of zpools / datasets. + /// Returns properties for all input datasets and their direct children. /// /// This function is similar to [Zfs::list_datasets], but provides a more /// substantial results about the datasets found. @@ -344,26 +409,24 @@ impl Zfs { datasets: &[String], ) -> Result, anyhow::Error> { let mut command = std::process::Command::new(ZFS); - let cmd = command.args(&["list", "-d", "1", "-rHpo"]); + let cmd = command.args(&[ + "get", + "-d", + "1", + "-Hpo", + "name,property,value,source", + ]); // Note: this is tightly coupled with the layout of DatasetProperties - cmd.arg(DatasetProperties::ZFS_LIST_STR); + cmd.arg(DatasetProperties::ZFS_GET_PROPS); cmd.args(datasets); let output = execute(cmd).with_context(|| { format!("Failed to get dataset properties for {datasets:?}") })?; let stdout = String::from_utf8(output.stdout)?; - let mut datasets = stdout - .trim() - .split('\n') - .map(|row| row.parse::()) - .collect::, _>>()?; - - datasets.sort_by(|d1, d2| d1.name.partial_cmp(&d2.name).unwrap()); - datasets.dedup_by(|d1, d2| d1.name.eq(&d2.name)); - Ok(datasets) + DatasetProperties::parse_many(&stdout) } /// Return the name of a dataset for a ZFS object. @@ -859,42 +922,68 @@ mod test { #[test] fn parse_dataset_props() { - let input = - "- dataset_name 1234 5678 0 0 off"; - let props = DatasetProperties::from_str(&input) + let input = "dataset_name\tavailable\t1234\t-\n\ + dataset_name\tused\t5678\t-\n\ + dataset_name\tname\tI_AM_IGNORED\t-\n\ + dataset_name\tcompression\toff\tinherited from parent"; + let props = DatasetProperties::parse_many(&input) .expect("Should have parsed data"); + assert_eq!(props.len(), 1); + + assert_eq!(props[0].id, None); + assert_eq!(props[0].name, "dataset_name"); + assert_eq!(props[0].avail.to_bytes(), 1234); + assert_eq!(props[0].used.to_bytes(), 5678); + assert_eq!(props[0].quota, None); + assert_eq!(props[0].reservation, None); + assert_eq!(props[0].compression, "off"); + } - assert_eq!(props.id, None); - assert_eq!(props.name, "dataset_name"); - assert_eq!(props.avail.to_bytes(), 1234); - assert_eq!(props.used.to_bytes(), 5678); - assert_eq!(props.quota, None); - assert_eq!(props.reservation, None); - assert_eq!(props.compression, "off"); + #[test] + fn parse_dataset_too_many_columns() { + let input = "dataset_name\tavailable\t1234\t-\tEXTRA\n\ + dataset_name\tused\t5678\t-\n\ + dataset_name\tname\tI_AM_IGNORED\t-\n\ + dataset_name\tcompression\toff\tinherited from parent"; + let err = DatasetProperties::parse_many(&input) + .expect_err("Should have parsed data"); + assert!( + err.to_string().contains("Unexpected column data: 'EXTRA'"), + "{err}" + ); } #[test] fn parse_dataset_props_with_optionals() { - let input = "d4e1e554-7b98-4413-809e-4a42561c3d0c dataset_name 1234 5678 111 222 off"; - let props = DatasetProperties::from_str(&input) + let input = + "dataset_name\toxide:uuid\td4e1e554-7b98-4413-809e-4a42561c3d0c\tlocal\n\ + dataset_name\tavailable\t1234\t-\n\ + dataset_name\tused\t5678\t-\n\ + dataset_name\tquota\t111\t-\n\ + dataset_name\treservation\t222\t-\n\ + dataset_name\tcompression\toff\tinherited from parent"; + let props = 
DatasetProperties::parse_many(&input) .expect("Should have parsed data"); - + assert_eq!(props.len(), 1); assert_eq!( - props.id, + props[0].id, Some("d4e1e554-7b98-4413-809e-4a42561c3d0c".parse().unwrap()) ); - assert_eq!(props.name, "dataset_name"); - assert_eq!(props.avail.to_bytes(), 1234); - assert_eq!(props.used.to_bytes(), 5678); - assert_eq!(props.quota.map(|q| q.to_bytes()), Some(111)); - assert_eq!(props.reservation.map(|r| r.to_bytes()), Some(222)); - assert_eq!(props.compression, "off"); + assert_eq!(props[0].name, "dataset_name"); + assert_eq!(props[0].avail.to_bytes(), 1234); + assert_eq!(props[0].used.to_bytes(), 5678); + assert_eq!(props[0].quota.map(|q| q.to_bytes()), Some(111)); + assert_eq!(props[0].reservation.map(|r| r.to_bytes()), Some(222)); + assert_eq!(props[0].compression, "off"); } #[test] fn parse_dataset_bad_uuid() { - let input = "bad dataset_name 1234 5678 111 222 off"; - let err = DatasetProperties::from_str(&input) + let input = "dataset_name\toxide:uuid\tbad\t-\n\ + dataset_name\tavailable\t1234\t-\n\ + dataset_name\tused\t5678\t-"; + + let err = DatasetProperties::parse_many(&input) .expect_err("Should have failed to parse"); assert!( format!("{err:#}").contains("error parsing UUID (dataset)"), @@ -904,8 +993,9 @@ mod test { #[test] fn parse_dataset_bad_avail() { - let input = "- dataset_name BADAVAIL 5678 111 222 off"; - let err = DatasetProperties::from_str(&input) + let input = "dataset_name\tavailable\tBADAVAIL\t-\n\ + dataset_name\tused\t5678\t-"; + let err = DatasetProperties::parse_many(&input) .expect_err("Should have failed to parse"); assert!( format!("{err:#}").contains("invalid digit found in string"), @@ -915,8 +1005,9 @@ mod test { #[test] fn parse_dataset_bad_usage() { - let input = "- dataset_name 1234 BADUSAGE 111 222 off"; - let err = DatasetProperties::from_str(&input) + let input = "dataset_name\tavailable\t1234\t-\n\ + dataset_name\tused\tBADUSAGE\t-"; + let err = DatasetProperties::parse_many(&input) .expect_err("Should have failed to parse"); assert!( format!("{err:#}").contains("invalid digit found in string"), @@ -926,8 +1017,10 @@ mod test { #[test] fn parse_dataset_bad_quota() { - let input = "- dataset_name 1234 5678 BADQUOTA 222 off"; - let err = DatasetProperties::from_str(&input) + let input = "dataset_name\tavailable\t1234\t-\n\ + dataset_name\tused\t5678\t-\n\ + dataset_name\tquota\tBADQUOTA\t-"; + let err = DatasetProperties::parse_many(&input) .expect_err("Should have failed to parse"); assert!( format!("{err:#}").contains("invalid digit found in string"), @@ -937,8 +1030,11 @@ mod test { #[test] fn parse_dataset_bad_reservation() { - let input = "- dataset_name 1234 5678 111 BADRES off"; - let err = DatasetProperties::from_str(&input) + let input = "dataset_name\tavailable\t1234\t-\n\ + dataset_name\tused\t5678\t-\n\ + dataset_name\tquota\t111\t-\n\ + dataset_name\treservation\tBADRES\t-"; + let err = DatasetProperties::parse_many(&input) .expect_err("Should have failed to parse"); assert!( format!("{err:#}").contains("invalid digit found in string"), @@ -949,24 +1045,102 @@ mod test { #[test] fn parse_dataset_missing_fields() { let expect_missing = |input: &str, what: &str| { - let err = DatasetProperties::from_str(input) + let err = DatasetProperties::parse_many(input) .expect_err("Should have failed to parse"); let err = format!("{err:#}"); assert!(err.contains(&format!("Missing {what}")), "{err}"); }; expect_missing( - "- dataset_name 1234 5678 111 222", - "'compression'", + "dataset_name\tused\t5678\t-\n\ + 
dataset_name\tquota\t111\t-\n\ + dataset_name\treservation\t222\t-\n\ + dataset_name\tcompression\toff\tinherited", + "'available'", + ); + expect_missing( + "dataset_name\tavailable\t1234\t-\n\ + dataset_name\tquota\t111\t-\n\ + dataset_name\treservation\t222\t-\n\ + dataset_name\tcompression\toff\tinherited", + "'used'", ); expect_missing( - "- dataset_name 1234 5678 111", - "'reservation'", + "dataset_name\tavailable\t1234\t-\n\ + dataset_name\tused\t5678\t-\n\ + dataset_name\tquota\t111\t-\n\ + dataset_name\treservation\t222\t-", + "'compression'", ); - expect_missing("- dataset_name 1234 5678", "'quota'"); - expect_missing("- dataset_name 1234", "'used'"); - expect_missing("- dataset_name", "'avail'"); - expect_missing("-", "'name'"); - expect_missing("", "UUID"); + } + + #[test] + fn parse_dataset_uuid_ignored_if_inherited() { + let input = + "dataset_name\toxide:uuid\tb8698ede-60c2-4e16-b792-d28c165cfd12\tinherited from parent\n\ + dataset_name\tavailable\t1234\t-\n\ + dataset_name\tused\t5678\t-\n\ + dataset_name\tcompression\toff\t-"; + let props = DatasetProperties::parse_many(&input) + .expect("Should have parsed data"); + assert_eq!(props.len(), 1); + assert_eq!(props[0].id, None); + } + + #[test] + fn parse_dataset_uuid_ignored_if_dash() { + let input = "dataset_name\toxide:uuid\t-\t-\n\ + dataset_name\tavailable\t1234\t-\n\ + dataset_name\tused\t5678\t-\n\ + dataset_name\tcompression\toff\t-"; + let props = DatasetProperties::parse_many(&input) + .expect("Should have parsed data"); + assert_eq!(props.len(), 1); + assert_eq!(props[0].id, None); + } + + #[test] + fn parse_quota_ignored_if_default() { + let input = "dataset_name\tquota\t0\tdefault\n\ + dataset_name\tavailable\t1234\t-\n\ + dataset_name\tused\t5678\t-\n\ + dataset_name\tcompression\toff\t-"; + let props = DatasetProperties::parse_many(&input) + .expect("Should have parsed data"); + assert_eq!(props.len(), 1); + assert_eq!(props[0].quota, None); + } + + #[test] + fn parse_reservation_ignored_if_default() { + let input = "dataset_name\treservation\t0\tdefault\n\ + dataset_name\tavailable\t1234\t-\n\ + dataset_name\tused\t5678\t-\n\ + dataset_name\tcompression\toff\t-"; + let props = DatasetProperties::parse_many(&input) + .expect("Should have parsed data"); + assert_eq!(props.len(), 1); + assert_eq!(props[0].reservation, None); + } + + #[test] + fn parse_sorts_and_dedups() { + let input = "foo\tavailable\t111\t-\n\ + foo\tused\t111\t-\n\ + foo\tcompression\toff\t-\n\ + foo\tavailable\t111\t-\n\ + foo\tused\t111\t-\n\ + foo\tcompression\toff\t-\n\ + bar\tavailable\t222\t-\n\ + bar\tused\t222\t-\n\ + bar\tcompression\toff\t-"; + + let props = DatasetProperties::parse_many(&input) + .expect("Should have parsed data"); + assert_eq!(props.len(), 2); + assert_eq!(props[0].name, "bar"); + assert_eq!(props[0].used, 222.into()); + assert_eq!(props[1].name, "foo"); + assert_eq!(props[1].used, 111.into()); } } From dce80bdae6c3ffbd96fa65a7cff5c22ad5dc656f Mon Sep 17 00:00:00 2001 From: Laura Abbott Date: Thu, 12 Dec 2024 19:59:56 -0500 Subject: [PATCH 21/22] Updated bootleby (#7241) --- tools/permslip_production | 2 +- tools/permslip_staging | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/permslip_production b/tools/permslip_production index ce73c3c6da..5c84faad8f 100644 --- a/tools/permslip_production +++ b/tools/permslip_production @@ -1,2 +1,2 @@ a72a5f931bcfd3d931df407fbbba6d851165c4637adf39568a94f755966b6c9c manifest-oxide-rot-1-v1.0.30.toml 
-610ebce44b1fb622eb56591534fb2569340fdba9b5ba62ca1b02f0b2d2e973dc manifest-bootleby-v1.3.1.toml +9d5faa910e8e8e7aaeb74df972badcdf371615d4bbabdb9ddccf4d0d32517f7d manifest-bootleby-v1.3.3.toml diff --git a/tools/permslip_staging b/tools/permslip_staging index c224b1ea95..146a9d615e 100644 --- a/tools/permslip_staging +++ b/tools/permslip_staging @@ -2,4 +2,4 @@ c33a381e716127e05da928c39b3a4d5f5278e43f526ff8c5c817708c378a5c87 manifest-gimlet 2cda350adba506b3ab67813db932d07c7a7836b5731d5351e57d49302f41dbf4 manifest-oxide-rot-1-v1.0.30.toml 70de21757b47e3e6c15d4c8701efe80e8cc90125afdd2883ff160045aed20956 manifest-psc-v1.0.31.toml 222ae9df38699037b75e98eb7a8b441f6cda958b8a79e57e72e410b054f1d8eb manifest-sidecar-v1.0.32.toml -6f8459afe22c27d5920356878e4d8d639464f39a15ce7b5b040c2d908d52a570 manifest-bootleby-v1.3.1.toml +14c20540fe785dea65ef03446d5c4665a5f3d9106eb176691b35646faa54f61f manifest-bootleby-v1.3.3.toml From 7e7c3bb9e876fcdcacf5d4bd3ce9fff3814ba034 Mon Sep 17 00:00:00 2001 From: iliana etaoin Date: Fri, 13 Dec 2024 12:09:39 -0800 Subject: [PATCH 22/22] bump rustls@0.23 to 0.23.19 (#7226) --- Cargo.lock | 29 ++++++++++++++--------------- workspace-hack/Cargo.toml | 10 ++++------ 2 files changed, 18 insertions(+), 21 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 8c374f5609..65ac0fd91c 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -718,7 +718,7 @@ dependencies = [ "bitflags 2.6.0", "cexpr", "clang-sys", - "itertools 0.12.1", + "itertools 0.10.5", "lazy_static", "lazycell", "log", @@ -4162,7 +4162,7 @@ dependencies = [ "http", "hyper", "hyper-util", - "rustls 0.23.14", + "rustls 0.23.19", "rustls-pki-types", "tokio", "tokio-rustls 0.26.0", @@ -5103,7 +5103,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4979f22fdb869068da03c9f7528f8297c6fd2606bc3a4affe42e6a823fdb8da4" dependencies = [ "cfg-if", - "windows-targets 0.52.6", + "windows-targets 0.48.5", ] [[package]] @@ -7406,7 +7406,6 @@ dependencies = [ "indicatif", "inout", "itertools 0.10.5", - "itertools 0.12.1", "lalrpop-util", "lazy_static", "libc", @@ -7441,7 +7440,7 @@ dependencies = [ "reqwest", "rsa", "rustix", - "rustls 0.23.14", + "rustls 0.23.19", "rustls-webpki 0.102.8", "schemars", "scopeguard", @@ -9151,7 +9150,7 @@ dependencies = [ "quinn-proto", "quinn-udp", "rustc-hash 2.0.0", - "rustls 0.23.14", + "rustls 0.23.19", "socket2", "thiserror 1.0.69", "tokio", @@ -9168,7 +9167,7 @@ dependencies = [ "rand", "ring 0.17.8", "rustc-hash 2.0.0", - "rustls 0.23.14", + "rustls 0.23.19", "slab", "thiserror 1.0.69", "tinyvec", @@ -9582,7 +9581,7 @@ dependencies = [ "percent-encoding", "pin-project-lite", "quinn", - "rustls 0.23.14", + "rustls 0.23.19", "rustls-pemfile 2.2.0", "rustls-pki-types", "serde", @@ -9969,9 +9968,9 @@ dependencies = [ [[package]] name = "rustls" -version = "0.23.14" +version = "0.23.19" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "415d9944693cb90382053259f89fbb077ea730ad7273047ec63b19bc9b160ba8" +checksum = "934b404430bb06b3fae2cba809eb45a1ab1aecd64491213d7c3301b88393f8d1" dependencies = [ "aws-lc-rs", "log", @@ -10016,9 +10015,9 @@ dependencies = [ [[package]] name = "rustls-pki-types" -version = "1.9.0" +version = "1.10.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0e696e35370c65c9c541198af4543ccd580cf17fc25d8e05c5a242b202488c55" +checksum = "16f1201b3c9a7ee8039bcadc17b7e605e2945b27eee7631788c1bd2b0643674b" [[package]] name = "rustls-webpki" @@ -11085,7 +11084,7 @@ dependencies = [ "ed25519-dalek", 
"libipcc", "pem-rfc7468", - "rustls 0.23.14", + "rustls 0.23.19", "secrecy", "serde", "sha2", @@ -11938,7 +11937,7 @@ version = "0.26.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0c7bc40d0e5a97695bb96e27995cd3a08538541b0a846f65bba7a359f36700d4" dependencies = [ - "rustls 0.23.14", + "rustls 0.23.19", "rustls-pki-types", "tokio", ] @@ -12103,7 +12102,7 @@ dependencies = [ "pem", "percent-encoding", "reqwest", - "rustls 0.23.14", + "rustls 0.23.19", "serde", "serde_json", "serde_plain", diff --git a/workspace-hack/Cargo.toml b/workspace-hack/Cargo.toml index 9bd648132a..31677ed8c1 100644 --- a/workspace-hack/Cargo.toml +++ b/workspace-hack/Cargo.toml @@ -69,8 +69,7 @@ hmac = { version = "0.12.1", default-features = false, features = ["reset"] } hyper = { version = "1.5.0", features = ["full"] } indexmap = { version = "2.6.0", features = ["serde"] } inout = { version = "0.1.3", default-features = false, features = ["std"] } -itertools-5ef9efb8ec2df382 = { package = "itertools", version = "0.12.1" } -itertools-93f6ce9d446188ac = { package = "itertools", version = "0.10.5" } +itertools = { version = "0.10.5" } lalrpop-util = { version = "0.19.12" } lazy_static = { version = "1.5.0", default-features = false, features = ["spin_no_std"] } libc = { version = "0.2.162", features = ["extra_traits"] } @@ -101,7 +100,7 @@ regex-automata = { version = "0.4.8", default-features = false, features = ["dfa regex-syntax = { version = "0.8.5" } reqwest = { version = "0.12.9", features = ["blocking", "cookies", "json", "rustls-tls", "stream"] } rsa = { version = "0.9.6", features = ["serde", "sha2"] } -rustls = { version = "0.23.14", features = ["ring"] } +rustls = { version = "0.23.19", features = ["ring"] } rustls-webpki = { version = "0.102.8", default-features = false, features = ["aws_lc_rs", "ring", "std"] } schemars = { version = "0.8.21", features = ["bytes", "chrono", "uuid1"] } scopeguard = { version = "1.2.0" } @@ -189,8 +188,7 @@ hmac = { version = "0.12.1", default-features = false, features = ["reset"] } hyper = { version = "1.5.0", features = ["full"] } indexmap = { version = "2.6.0", features = ["serde"] } inout = { version = "0.1.3", default-features = false, features = ["std"] } -itertools-5ef9efb8ec2df382 = { package = "itertools", version = "0.12.1" } -itertools-93f6ce9d446188ac = { package = "itertools", version = "0.10.5" } +itertools = { version = "0.10.5" } lalrpop-util = { version = "0.19.12" } lazy_static = { version = "1.5.0", default-features = false, features = ["spin_no_std"] } libc = { version = "0.2.162", features = ["extra_traits"] } @@ -221,7 +219,7 @@ regex-automata = { version = "0.4.8", default-features = false, features = ["dfa regex-syntax = { version = "0.8.5" } reqwest = { version = "0.12.9", features = ["blocking", "cookies", "json", "rustls-tls", "stream"] } rsa = { version = "0.9.6", features = ["serde", "sha2"] } -rustls = { version = "0.23.14", features = ["ring"] } +rustls = { version = "0.23.19", features = ["ring"] } rustls-webpki = { version = "0.102.8", default-features = false, features = ["aws_lc_rs", "ring", "std"] } schemars = { version = "0.8.21", features = ["bytes", "chrono", "uuid1"] } scopeguard = { version = "1.2.0" }