From 06f804575ce440495a6998d51f3d01863b1f5edd Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Fri, 13 Dec 2024 12:52:15 -0800 Subject: [PATCH 01/10] Create dtrace probes for transactions (#7248) - Adds a "NonRetryHelper" struct to help instrument non-retryable transactions - Adds `transaction__start` and `transaction__done` probes which wrap our retryable (and now, non-retryable transactions) Intended to supplement https://github.com/oxidecomputer/omicron/pull/7244 , and provide transaction names --- Cargo.lock | 4 +- Cargo.toml | 2 +- clippy.toml | 2 +- .../db-queries/src/db/datastore/deployment.rs | 7 +- nexus/db-queries/src/db/datastore/dns.rs | 31 +++---- .../db-queries/src/db/datastore/inventory.rs | 6 +- nexus/db-queries/src/db/datastore/mod.rs | 8 ++ nexus/db-queries/src/db/datastore/rack.rs | 11 +-- nexus/db-queries/src/db/datastore/role.rs | 8 +- nexus/db-queries/src/db/datastore/silo.rs | 80 +++++++++---------- .../db-queries/src/db/datastore/silo_group.rs | 8 +- nexus/db-queries/src/lib.rs | 6 ++ nexus/db-queries/src/transaction_retry.rs | 48 +++++++++++ 13 files changed, 140 insertions(+), 81 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 65ac0fd91c..04cea43c75 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2321,9 +2321,9 @@ dependencies = [ [[package]] name = "diesel-dtrace" -version = "0.4.0" +version = "0.4.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4e5130181059723aae1cfdb678d3698052a225aaadb18000f77fec4200047acc" +checksum = "2750c8bd7a42381620b57f370ca5f757a71d554814e02a43c1bf54ae2656e01d" dependencies = [ "diesel", "serde", diff --git a/Cargo.toml b/Cargo.toml index ec862fb722..39d8f33ca6 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -361,7 +361,7 @@ derive_more = "0.99.18" derive-where = "1.2.7" # Having the i-implement-... feature here makes diesel go away from the workspace-hack diesel = { version = "2.2.4", features = ["i-implement-a-third-party-backend-and-opt-into-breaking-changes", "postgres", "r2d2", "chrono", "serde_json", "network-address", "uuid"] } -diesel-dtrace = "0.4.0" +diesel-dtrace = "0.4.2" dns-server = { path = "dns-server" } dns-server-api = { path = "dns-server-api" } dns-service-client = { path = "clients/dns-service-client" } diff --git a/clippy.toml b/clippy.toml index 31e28d5911..677fb67ade 100644 --- a/clippy.toml +++ b/clippy.toml @@ -15,5 +15,5 @@ disallowed-methods = [ # and can fail spuriously. # Instead, the "transaction_retry_wrapper" should be preferred, as it # automatically retries transactions experiencing contention. - { path = "async_bb8_diesel::AsyncConnection::transaction_async", reason = "Prefer to use transaction_retry_wrapper, if possible. Feel free to override this for tests and nested transactions." }, + { path = "async_bb8_diesel::AsyncConnection::transaction_async", reason = "Prefer to use transaction_retry_wrapper, if possible. 
For tests and nested transactions, use transaction_non_retry_wrapper to at least get dtrace probes" }, ] diff --git a/nexus/db-queries/src/db/datastore/deployment.rs b/nexus/db-queries/src/db/datastore/deployment.rs index 0c73ae1ae2..4210f2bee2 100644 --- a/nexus/db-queries/src/db/datastore/deployment.rs +++ b/nexus/db-queries/src/db/datastore/deployment.rs @@ -16,7 +16,6 @@ use crate::db::DbConnection; use crate::db::TransactionError; use crate::transaction_retry::OptionalError; use anyhow::Context; -use async_bb8_diesel::AsyncConnection; use async_bb8_diesel::AsyncRunQueryDsl; use chrono::DateTime; use chrono::Utc; @@ -106,7 +105,7 @@ impl DataStore { blueprint: &Blueprint, ) -> Result<(), Error> { let conn = self.pool_connection_authorized(opctx).await?; - Self::blueprint_insert_on_connection(&conn, opctx, blueprint).await + self.blueprint_insert_on_connection(&conn, opctx, blueprint).await } /// Creates a transaction iff the current blueprint is "bp_id". @@ -182,6 +181,7 @@ impl DataStore { /// Variant of [Self::blueprint_insert] which may be called from a /// transaction context. pub(crate) async fn blueprint_insert_on_connection( + &self, conn: &async_bb8_diesel::Connection, opctx: &OpContext, blueprint: &Blueprint, @@ -340,7 +340,8 @@ impl DataStore { // as most of the operations should be insertions rather than in-place // modifications of existing tables. #[allow(clippy::disallowed_methods)] - conn.transaction_async(|conn| async move { + self.transaction_non_retry_wrapper("blueprint_insert") + .transaction(&conn, |conn| async move { // Insert the row for the blueprint. { use db::schema::blueprint::dsl; diff --git a/nexus/db-queries/src/db/datastore/dns.rs b/nexus/db-queries/src/db/datastore/dns.rs index 3f0f7828fa..114d553aac 100644 --- a/nexus/db-queries/src/db/datastore/dns.rs +++ b/nexus/db-queries/src/db/datastore/dns.rs @@ -20,7 +20,6 @@ use crate::db::pagination::Paginator; use crate::db::pool::DbConnection; use crate::db::TransactionError; use crate::transaction_retry::OptionalError; -use async_bb8_diesel::AsyncConnection; use async_bb8_diesel::AsyncRunQueryDsl; use diesel::prelude::*; use futures::future::BoxFuture; @@ -453,20 +452,24 @@ impl DataStore { // This method is used in nested transactions, which are not supported // with retryable transactions. - #[allow(clippy::disallowed_methods)] - conn.transaction_async(|c| async move { - let version = self - .dns_group_latest_version_conn(opctx, conn, update.dns_group) - .await?; - self.dns_write_version_internal( - &c, - update, - zones, - Generation(version.version.next()), - ) + self.transaction_non_retry_wrapper("dns_update_incremental") + .transaction(&conn, |c| async move { + let version = self + .dns_group_latest_version_conn( + opctx, + conn, + update.dns_group, + ) + .await?; + self.dns_write_version_internal( + &c, + update, + zones, + Generation(version.version.next()), + ) + .await + }) .await - }) - .await } // This must only be used inside a transaction. Otherwise, it may make diff --git a/nexus/db-queries/src/db/datastore/inventory.rs b/nexus/db-queries/src/db/datastore/inventory.rs index 9269b233f3..08c64a3254 100644 --- a/nexus/db-queries/src/db/datastore/inventory.rs +++ b/nexus/db-queries/src/db/datastore/inventory.rs @@ -278,13 +278,13 @@ impl DataStore { // batch rather than making a bunch of round-trips to the database. // We'd do that if we had an interface for doing that with bound // parameters, etc. See oxidecomputer/omicron#973. 
- let pool = self.pool_connection_authorized(opctx).await?; + let conn = self.pool_connection_authorized(opctx).await?; // The risk of a serialization error is possible here, but low, // as most of the operations should be insertions rather than in-place // modifications of existing tables. - #[allow(clippy::disallowed_methods)] - pool.transaction_async(|conn| async move { + self.transaction_non_retry_wrapper("inventory_insert_collection") + .transaction(&conn, |conn| async move { // Insert records (and generate ids) for any baseboards that do not // already exist in the database. These rows are not scoped to a // particular collection. They contain only immutable data -- diff --git a/nexus/db-queries/src/db/datastore/mod.rs b/nexus/db-queries/src/db/datastore/mod.rs index 25f1f282f7..c90f1cc92e 100644 --- a/nexus/db-queries/src/db/datastore/mod.rs +++ b/nexus/db-queries/src/db/datastore/mod.rs @@ -320,6 +320,14 @@ impl DataStore { ) } + /// Constructs a non-retryable transaction helper + pub fn transaction_non_retry_wrapper( + &self, + name: &'static str, + ) -> crate::transaction_retry::NonRetryHelper { + crate::transaction_retry::NonRetryHelper::new(&self.log, name) + } + #[cfg(test)] pub(crate) fn transaction_retry_producer( &self, diff --git a/nexus/db-queries/src/db/datastore/rack.rs b/nexus/db-queries/src/db/datastore/rack.rs index dc3175c22d..067b8ed3ce 100644 --- a/nexus/db-queries/src/db/datastore/rack.rs +++ b/nexus/db-queries/src/db/datastore/rack.rs @@ -26,7 +26,6 @@ use crate::db::model::Zpool; use crate::db::pagination::paginated; use crate::db::pool::DbConnection; use crate::db::TransactionError; -use async_bb8_diesel::AsyncConnection; use async_bb8_diesel::AsyncRunQueryDsl; use chrono::Utc; use diesel::prelude::*; @@ -676,11 +675,9 @@ impl DataStore { // This method uses nested transactions, which are not supported // with retryable transactions. - #[allow(clippy::disallowed_methods)] - let rack = self - .pool_connection_authorized(opctx) - .await? - .transaction_async(|conn| { + let conn = self.pool_connection_authorized(opctx).await?; + let rack = self.transaction_non_retry_wrapper("rack_set_initialized") + .transaction(&conn, |conn| { let err = err.clone(); let log = log.clone(); let authz_service_pool = authz_service_pool.clone(); @@ -752,7 +749,7 @@ impl DataStore { } // Insert the RSS-generated blueprint. - Self::blueprint_insert_on_connection( + self.blueprint_insert_on_connection( &conn, opctx, &blueprint, ) .await diff --git a/nexus/db-queries/src/db/datastore/role.rs b/nexus/db-queries/src/db/datastore/role.rs index ed8ec6fcd9..757658351b 100644 --- a/nexus/db-queries/src/db/datastore/role.rs +++ b/nexus/db-queries/src/db/datastore/role.rs @@ -20,7 +20,6 @@ use crate::db::model::RoleAssignment; use crate::db::model::RoleBuiltin; use crate::db::pagination::paginated_multicolumn; use crate::db::pool::DbConnection; -use async_bb8_diesel::AsyncConnection; use async_bb8_diesel::AsyncRunQueryDsl; use diesel::prelude::*; use nexus_db_fixed_data::role_assignment::BUILTIN_ROLE_ASSIGNMENTS; @@ -213,10 +212,9 @@ impl DataStore { // This method should probably be retryable, but this is slightly // complicated by the cloning semantics of the queries, which // must be Clone to be retried. - #[allow(clippy::disallowed_methods)] - self.pool_connection_authorized(opctx) - .await? 
- .transaction_async(|conn| async move { + let conn = self.pool_connection_authorized(opctx).await?; + self.transaction_non_retry_wrapper("role_assignment_replace_visible") + .transaction(&conn, |conn| async move { delete_old_query.execute_async(&conn).await?; Ok(insert_new_query.get_results_async(&conn).await?) }) diff --git a/nexus/db-queries/src/db/datastore/silo.rs b/nexus/db-queries/src/db/datastore/silo.rs index b862f3c461..c963e13ae4 100644 --- a/nexus/db-queries/src/db/datastore/silo.rs +++ b/nexus/db-queries/src/db/datastore/silo.rs @@ -24,7 +24,6 @@ use crate::db::model::VirtualProvisioningCollection; use crate::db::pagination::paginated; use crate::db::pagination::Paginator; use crate::db::pool::DbConnection; -use async_bb8_diesel::AsyncConnection; use async_bb8_diesel::AsyncRunQueryDsl; use chrono::Utc; use diesel::prelude::*; @@ -227,9 +226,9 @@ impl DataStore { // This method uses nested transactions, which are not supported // with retryable transactions. - #[allow(clippy::disallowed_methods)] - let silo = conn - .transaction_async(|conn| async move { + let silo = self + .transaction_non_retry_wrapper("silo_create") + .transaction(&conn, |conn| async move { let silo = silo_create_query .get_result_async(&conn) .await @@ -429,48 +428,49 @@ impl DataStore { // This method uses nested transactions, which are not supported // with retryable transactions. - #[allow(clippy::disallowed_methods)] - conn.transaction_async(|conn| async move { - let updated_rows = diesel::update(silo::dsl::silo) - .filter(silo::dsl::time_deleted.is_null()) - .filter(silo::dsl::id.eq(id)) - .filter(silo::dsl::rcgen.eq(rcgen)) - .set(silo::dsl::time_deleted.eq(now)) - .execute_async(&conn) - .await - .map_err(|e| { - public_error_from_diesel( - e, - ErrorHandler::NotFoundByResource(authz_silo), - ) - })?; + self.transaction_non_retry_wrapper("silo_delete") + .transaction(&conn, |conn| async move { + let updated_rows = diesel::update(silo::dsl::silo) + .filter(silo::dsl::time_deleted.is_null()) + .filter(silo::dsl::id.eq(id)) + .filter(silo::dsl::rcgen.eq(rcgen)) + .set(silo::dsl::time_deleted.eq(now)) + .execute_async(&conn) + .await + .map_err(|e| { + public_error_from_diesel( + e, + ErrorHandler::NotFoundByResource(authz_silo), + ) + })?; - if updated_rows == 0 { - return Err(TxnError::CustomError(Error::invalid_request( - "silo deletion failed due to concurrent modification", - ))); - } + if updated_rows == 0 { + return Err(TxnError::CustomError(Error::invalid_request( + "silo deletion failed due to concurrent modification", + ))); + } - self.silo_quotas_delete(opctx, &conn, &authz_silo).await?; + self.silo_quotas_delete(opctx, &conn, &authz_silo).await?; - self.virtual_provisioning_collection_delete_on_connection( - &opctx.log, &conn, id, - ) - .await?; + self.virtual_provisioning_collection_delete_on_connection( + &opctx.log, &conn, id, + ) + .await?; - self.dns_update_incremental(dns_opctx, &conn, dns_update).await?; + self.dns_update_incremental(dns_opctx, &conn, dns_update) + .await?; - info!(opctx.log, "deleted silo {}", id); + info!(opctx.log, "deleted silo {}", id); - Ok(()) - }) - .await - .map_err(|e| match e { - TxnError::CustomError(e) => e, - TxnError::Database(e) => { - public_error_from_diesel(e, ErrorHandler::Server) - } - })?; + Ok(()) + }) + .await + .map_err(|e| match e { + TxnError::CustomError(e) => e, + TxnError::Database(e) => { + public_error_from_diesel(e, ErrorHandler::Server) + } + })?; // TODO-correctness This needs to happen in a saga or some other // mechanism that 
ensures it happens even if we crash at this point. diff --git a/nexus/db-queries/src/db/datastore/silo_group.rs b/nexus/db-queries/src/db/datastore/silo_group.rs index e6168f4e42..4d5543cc9c 100644 --- a/nexus/db-queries/src/db/datastore/silo_group.rs +++ b/nexus/db-queries/src/db/datastore/silo_group.rs @@ -16,7 +16,6 @@ use crate::db::model::SiloGroup; use crate::db::model::SiloGroupMembership; use crate::db::pagination::paginated; use crate::db::IncompleteOnConflictExt; -use async_bb8_diesel::AsyncConnection; use async_bb8_diesel::AsyncRunQueryDsl; use chrono::Utc; use diesel::prelude::*; @@ -198,12 +197,11 @@ impl DataStore { type TxnError = TransactionError; let group_id = authz_silo_group.id(); + let conn = self.pool_connection_authorized(opctx).await?; // Prefer to use "transaction_retry_wrapper" - #[allow(clippy::disallowed_methods)] - self.pool_connection_authorized(opctx) - .await? - .transaction_async(|conn| async move { + self.transaction_non_retry_wrapper("silo_group_delete") + .transaction(&conn, |conn| async move { use db::schema::silo_group_membership; // Don't delete groups that still have memberships diff --git a/nexus/db-queries/src/lib.rs b/nexus/db-queries/src/lib.rs index 003310f920..c02c781c8e 100644 --- a/nexus/db-queries/src/lib.rs +++ b/nexus/db-queries/src/lib.rs @@ -38,4 +38,10 @@ mod probes { // Fires when we fail to find a VNI in the provided range. fn vni__search__range__empty(_: &usdt::UniqueId) {} + + // Fires when a transaction has started + fn transaction__start(conn_id: uuid::Uuid, name: &str) {} + + // Fires when a transaction has completed + fn transaction__done(conn_id: uuid::Uuid, name: &str) {} } diff --git a/nexus/db-queries/src/transaction_retry.rs b/nexus/db-queries/src/transaction_retry.rs index 02d00f8215..cf8ee22376 100644 --- a/nexus/db-queries/src/transaction_retry.rs +++ b/nexus/db-queries/src/transaction_retry.rs @@ -106,9 +106,15 @@ impl RetryHelper { + Send + Sync, { + crate::probes::transaction__start!(|| { + (conn.as_sync_conn().id(), self.name) + }); let result = conn .transaction_async_with_retry(f, || self.retry_callback()) .await; + crate::probes::transaction__done!(|| { + (conn.as_sync_conn().id(), self.name) + }); let retry_info = self.inner.lock().unwrap(); if retry_info.has_retried() { @@ -179,6 +185,48 @@ impl oximeter::Producer for Producer { } } +/// Helper utility to have a similar interface to RetryHelper (also emitting +/// probes) but for non-retryable transactions. +pub struct NonRetryHelper { + log: Logger, + name: &'static str, +} + +impl NonRetryHelper { + pub(crate) fn new(log: &Logger, name: &'static str) -> Self { + Self { log: log.new(o!("transaction" => name)), name } + } + + /// Calls the function "f" in an asynchronous, non-retryable transaction. + pub async fn transaction( + self, + conn: &async_bb8_diesel::Connection, + f: Func, + ) -> Result + where + R: Send + 'static, + E: From + std::fmt::Debug + Send + 'static, + Fut: std::future::Future> + Send, + Func: FnOnce(async_bb8_diesel::Connection) -> Fut + + Send + + Sync, + { + crate::probes::transaction__start!(|| { + (conn.as_sync_conn().id(), self.name) + }); + + #[allow(clippy::disallowed_methods)] + let result = conn.transaction_async(f).await; + crate::probes::transaction__done!(|| { + (conn.as_sync_conn().id(), self.name) + }); + if let Err(err) = result.as_ref() { + warn!(self.log, "Non-retryable transaction failure"; "err" => ?err); + } + result + } +} + /// Helper utility for passing non-retryable errors out-of-band from /// transactions. 
/// From 5ef0b7d93abedec94ddbc4cc60ea562017e7f262 Mon Sep 17 00:00:00 2001 From: Michael Zeller Date: Fri, 13 Dec 2024 17:45:44 -0500 Subject: [PATCH 02/10] Pull support-bundle/queries into a standalone crate (#7193) This pulls out sled-agent/support-bundle/queries from Omicron into a standalone sled-diagnostics crate that can be reused outside of sled-agent. --- Cargo.lock | 11 ++ Cargo.toml | 3 + sled-agent/Cargo.toml | 1 + sled-agent/src/http_entrypoints.rs | 2 +- sled-agent/src/sled_agent.rs | 17 +-- sled-agent/src/support_bundle/mod.rs | 1 - sled-diagnostics/.gitignore | 1 + sled-diagnostics/Cargo.toml | 13 ++ sled-diagnostics/src/lib.rs | 52 ++++++++ .../src}/queries.rs | 120 +++++++----------- 10 files changed, 137 insertions(+), 84 deletions(-) create mode 100644 sled-diagnostics/.gitignore create mode 100644 sled-diagnostics/Cargo.toml create mode 100644 sled-diagnostics/src/lib.rs rename {sled-agent/src/support_bundle => sled-diagnostics/src}/queries.rs (69%) diff --git a/Cargo.lock b/Cargo.lock index 04cea43c75..f53a518f76 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -7265,6 +7265,7 @@ dependencies = [ "sled-agent-api", "sled-agent-client", "sled-agent-types", + "sled-diagnostics", "sled-hardware", "sled-hardware-types", "sled-storage", @@ -10734,6 +10735,16 @@ dependencies = [ "uuid", ] +[[package]] +name = "sled-diagnostics" +version = "0.1.0" +dependencies = [ + "futures", + "omicron-workspace-hack", + "thiserror 1.0.69", + "tokio", +] + [[package]] name = "sled-hardware" version = "0.1.0" diff --git a/Cargo.toml b/Cargo.toml index 39d8f33ca6..bb9ff0e80f 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -108,6 +108,7 @@ members = [ "sled-agent/bootstrap-agent-api", "sled-agent/repo-depot-api", "sled-agent/types", + "sled-diagnostics", "sled-hardware", "sled-hardware/types", "sled-storage", @@ -240,6 +241,7 @@ default-members = [ "sled-agent/bootstrap-agent-api", "sled-agent/repo-depot-api", "sled-agent/types", + "sled-diagnostics", "sled-hardware", "sled-hardware/types", "sled-storage", @@ -596,6 +598,7 @@ sled = "=0.34.7" sled-agent-api = { path = "sled-agent/api" } sled-agent-client = { path = "clients/sled-agent-client" } sled-agent-types = { path = "sled-agent/types" } +sled-diagnostics = { path = "sled-diagnostics" } sled-hardware = { path = "sled-hardware" } sled-hardware-types = { path = "sled-hardware/types" } sled-storage = { path = "sled-storage" } diff --git a/sled-agent/Cargo.toml b/sled-agent/Cargo.toml index 10b4ba1cdb..88c758dc31 100644 --- a/sled-agent/Cargo.toml +++ b/sled-agent/Cargo.toml @@ -81,6 +81,7 @@ sha3.workspace = true sled-agent-api.workspace = true sled-agent-client.workspace = true sled-agent-types.workspace = true +sled-diagnostics.workspace = true sled-hardware.workspace = true sled-hardware-types.workspace = true sled-storage.workspace = true diff --git a/sled-agent/src/http_entrypoints.rs b/sled-agent/src/http_entrypoints.rs index 844e13151a..88259537d2 100644 --- a/sled-agent/src/http_entrypoints.rs +++ b/sled-agent/src/http_entrypoints.rs @@ -6,7 +6,6 @@ use super::sled_agent::SledAgent; use crate::sled_agent::Error as SledAgentError; -use crate::support_bundle::queries::SupportBundleCommandHttpOutput; use crate::zone_bundle::BundleError; use bootstore::schemes::v0::NetworkConfig; use camino::Utf8PathBuf; @@ -53,6 +52,7 @@ use sled_agent_types::zone_bundle::{ BundleUtilization, CleanupContext, CleanupCount, CleanupPeriod, StorageLimit, ZoneBundleId, ZoneBundleMetadata, }; +use sled_diagnostics::SledDiagnosticsCommandHttpOutput; use 
std::collections::BTreeMap; type SledApiDescription = ApiDescription; diff --git a/sled-agent/src/sled_agent.rs b/sled-agent/src/sled_agent.rs index 80dbe72ea3..b9bf703933 100644 --- a/sled-agent/src/sled_agent.rs +++ b/sled-agent/src/sled_agent.rs @@ -19,10 +19,6 @@ use crate::params::OmicronZoneTypeExt; use crate::probe_manager::ProbeManager; use crate::services::{self, ServiceManager}; use crate::storage_monitor::StorageMonitorHandle; -use crate::support_bundle::queries::{ - dladm_info, ipadm_info, zoneadm_info, SupportBundleCmdError, - SupportBundleCmdOutput, -}; use crate::support_bundle::storage::SupportBundleManager; use crate::updates::{ConfigUpdates, UpdateManager}; use crate::vmm_reservoir::{ReservoirMode, VmmReservoirManager}; @@ -76,6 +72,7 @@ use sled_agent_types::zone_bundle::{ BundleUtilization, CleanupContext, CleanupCount, CleanupPeriod, PriorityOrder, StorageLimit, ZoneBundleMetadata, }; +use sled_diagnostics::{SledDiagnosticsCmdError, SledDiagnosticsCmdOutput}; use sled_hardware::{underlay, HardwareManager}; use sled_hardware_types::underlay::BootstrapInterface; use sled_hardware_types::Baseboard; @@ -1367,20 +1364,20 @@ impl SledAgent { pub(crate) async fn support_zoneadm_info( &self, - ) -> Result { - zoneadm_info().await + ) -> Result { + sled_diagnostics::zoneadm_info().await } pub(crate) async fn support_ipadm_info( &self, - ) -> Vec> { - ipadm_info().await + ) -> Vec> { + sled_diagnostics::ipadm_info().await } pub(crate) async fn support_dladm_info( &self, - ) -> Vec> { - dladm_info().await + ) -> Vec> { + sled_diagnostics::dladm_info().await } } diff --git a/sled-agent/src/support_bundle/mod.rs b/sled-agent/src/support_bundle/mod.rs index 314edfaec8..a1c4942751 100644 --- a/sled-agent/src/support_bundle/mod.rs +++ b/sled-agent/src/support_bundle/mod.rs @@ -2,5 +2,4 @@ // License, v. 2.0. If a copy of the MPL was not distributed with this // file, You can obtain one at https://mozilla.org/MPL/2.0/. -pub mod queries; pub mod storage; diff --git a/sled-diagnostics/.gitignore b/sled-diagnostics/.gitignore new file mode 100644 index 0000000000..ea8c4bf7f3 --- /dev/null +++ b/sled-diagnostics/.gitignore @@ -0,0 +1 @@ +/target diff --git a/sled-diagnostics/Cargo.toml b/sled-diagnostics/Cargo.toml new file mode 100644 index 0000000000..98ac59b674 --- /dev/null +++ b/sled-diagnostics/Cargo.toml @@ -0,0 +1,13 @@ +[package] +name = "sled-diagnostics" +version = "0.1.0" +edition = "2021" + +[lints] +workspace = true + +[dependencies] +futures.workspace = true +omicron-workspace-hack.workspace = true +thiserror.workspace = true +tokio = { workspace = true, features = ["full"] } diff --git a/sled-diagnostics/src/lib.rs b/sled-diagnostics/src/lib.rs new file mode 100644 index 0000000000..466c1ec446 --- /dev/null +++ b/sled-diagnostics/src/lib.rs @@ -0,0 +1,52 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! Diagnostics for an Oxide sled that exposes common support commands. + +use futures::{stream::FuturesUnordered, StreamExt}; + +mod queries; +pub use crate::queries::{ + SledDiagnosticsCmdError, SledDiagnosticsCmdOutput, + SledDiagnosticsCommandHttpOutput, +}; +use queries::*; + +/// List all zones on a sled. +pub async fn zoneadm_info( +) -> Result { + execute_command_with_timeout(zoneadm_list(), DEFAULT_TIMEOUT).await +} + +/// Retrieve various `ipadm` command output for the system. 
+pub async fn ipadm_info( +) -> Vec> { + [ipadm_show_interface(), ipadm_show_addr(), ipadm_show_prop()] + .into_iter() + .map(|c| async move { + execute_command_with_timeout(c, DEFAULT_TIMEOUT).await + }) + .collect::>() + .collect::>>() + .await +} + +/// Retrieve various `dladm` command output for the system. +pub async fn dladm_info( +) -> Vec> { + [ + dladm_show_phys(), + dladm_show_ether(), + dladm_show_link(), + dladm_show_vnic(), + dladm_show_linkprop(), + ] + .into_iter() + .map(|c| async move { + execute_command_with_timeout(c, DEFAULT_TIMEOUT).await + }) + .collect::>() + .collect::>>() + .await +} diff --git a/sled-agent/src/support_bundle/queries.rs b/sled-diagnostics/src/queries.rs similarity index 69% rename from sled-agent/src/support_bundle/queries.rs rename to sled-diagnostics/src/queries.rs index 2313d9e08d..9a66842cb2 100644 --- a/sled-agent/src/support_bundle/queries.rs +++ b/sled-diagnostics/src/queries.rs @@ -1,18 +1,27 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! Wrapper for command execution with timeout. + use std::{process::Command, time::Duration}; -use futures::{stream::FuturesUnordered, StreamExt}; -use illumos_utils::{dladm::DLADM, zone::IPADM, PFEXEC, ZONEADM}; use thiserror::Error; use tokio::io::AsyncReadExt; -const DEFAULT_TIMEOUT: Duration = Duration::from_secs(10); +const DLADM: &str = "/usr/sbin/dladm"; +const IPADM: &str = "/usr/sbin/ipadm"; +const PFEXEC: &str = "/usr/bin/pfexec"; +const ZONEADM: &str = "/usr/sbin/zoneadm"; + +pub const DEFAULT_TIMEOUT: Duration = Duration::from_secs(10); -pub trait SupportBundleCommandHttpOutput { +pub trait SledDiagnosticsCommandHttpOutput { fn get_output(self) -> String; } #[derive(Error, Debug)] -pub enum SupportBundleCmdError { +pub enum SledDiagnosticsCmdError { #[error("Failed to duplicate pipe for command [{command}]: {error}")] Dup { command: String, error: std::io::Error }, #[error("Failed to proccess output for command [{command}]: {error}")] @@ -32,13 +41,13 @@ pub enum SupportBundleCmdError { } #[derive(Debug)] -pub struct SupportBundleCmdOutput { +pub struct SledDiagnosticsCmdOutput { pub command: String, pub stdio: String, pub exit_status: String, } -impl std::fmt::Display for SupportBundleCmdOutput { +impl std::fmt::Display for SledDiagnosticsCmdOutput { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { writeln!(f, "Command executed [{}]:", self.command)?; writeln!(f, " ==== stdio ====\n{}", self.stdio)?; @@ -46,8 +55,8 @@ impl std::fmt::Display for SupportBundleCmdOutput { } } -impl SupportBundleCommandHttpOutput - for Result +impl SledDiagnosticsCommandHttpOutput + for Result { fn get_output(self) -> String { match self { @@ -76,16 +85,19 @@ fn command_to_string(command: &Command) -> String { /// and stderr as they occur. 
async fn execute( cmd: Command, -) -> Result { +) -> Result { let cmd_string = command_to_string(&cmd); let (sender, mut rx) = tokio::net::unix::pipe::pipe().map_err(|e| { - SupportBundleCmdError::Pipe { command: cmd_string.clone(), error: e } + SledDiagnosticsCmdError::Pipe { command: cmd_string.clone(), error: e } })?; let pipe = sender.into_nonblocking_fd().map_err(|e| { - SupportBundleCmdError::OwnedFd { command: cmd_string.clone(), error: e } + SledDiagnosticsCmdError::OwnedFd { + command: cmd_string.clone(), + error: e, + } })?; let pipe_dup = pipe.try_clone().map_err(|e| { - SupportBundleCmdError::Dup { command: cmd_string.clone(), error: e } + SledDiagnosticsCmdError::Dup { command: cmd_string.clone(), error: e } })?; // TODO MTZ: We may eventually want to reuse some of the process contract @@ -95,9 +107,8 @@ async fn execute( cmd.stdout(pipe); cmd.stderr(pipe_dup); - let mut child = cmd.spawn().map_err(|e| SupportBundleCmdError::Spawn { - command: cmd_string.clone(), - error: e, + let mut child = cmd.spawn().map_err(|e| { + SledDiagnosticsCmdError::Spawn { command: cmd_string.clone(), error: e } })?; // NB: This drop call is load-bearing and prevents a deadlock. The command // struct holds onto the write half of the pipe preventing the read side @@ -107,85 +118,88 @@ async fn execute( let mut stdio = String::new(); rx.read_to_string(&mut stdio).await.map_err(|e| { - SupportBundleCmdError::Output { command: cmd_string.clone(), error: e } + SledDiagnosticsCmdError::Output { + command: cmd_string.clone(), + error: e, + } })?; let exit_status = child.wait().await.map(|es| format!("{es}")).map_err(|e| { - SupportBundleCmdError::Wait { + SledDiagnosticsCmdError::Wait { command: cmd_string.clone(), error: e, } })?; - Ok(SupportBundleCmdOutput { command: cmd_string, stdio, exit_status }) + Ok(SledDiagnosticsCmdOutput { command: cmd_string, stdio, exit_status }) } /// Spawn a command that's allowed to execute within a given time limit. 
-async fn execute_command_with_timeout( +pub async fn execute_command_with_timeout( command: Command, duration: Duration, -) -> Result { +) -> Result { let cmd_string = command_to_string(&command); let tokio_command = execute(command); match tokio::time::timeout(duration, tokio_command).await { Ok(res) => res, - Err(_elapsed) => Err(SupportBundleCmdError::Timeout { + Err(_elapsed) => Err(SledDiagnosticsCmdError::Timeout { command: cmd_string, duration, }), } } -fn zoneadm_list() -> Command { +pub fn zoneadm_list() -> Command { let mut cmd = std::process::Command::new(PFEXEC); cmd.env_clear().arg(ZONEADM).arg("list").arg("-cip"); cmd } -fn ipadm_show_interface() -> Command { +pub fn ipadm_show_interface() -> Command { let mut cmd = std::process::Command::new(PFEXEC); cmd.env_clear().arg(IPADM).arg("show-if"); cmd } -fn ipadm_show_addr() -> Command { +pub fn ipadm_show_addr() -> Command { let mut cmd = std::process::Command::new(PFEXEC); cmd.env_clear().arg(IPADM).arg("show-addr"); cmd } -fn ipadm_show_prop() -> Command { +pub fn ipadm_show_prop() -> Command { let mut cmd = std::process::Command::new(PFEXEC); cmd.env_clear().arg(IPADM).arg("show-prop"); cmd } -fn dladm_show_phys() -> Command { +pub fn dladm_show_phys() -> Command { let mut cmd = std::process::Command::new(PFEXEC); cmd.env_clear().arg(DLADM).args(["show-phys", "-m"]); cmd } -fn dladm_show_ether() -> Command { +pub fn dladm_show_ether() -> Command { let mut cmd = std::process::Command::new(PFEXEC); cmd.env_clear().arg(DLADM).arg("show-ether"); cmd } -fn dladm_show_link() -> Command { +pub fn dladm_show_link() -> Command { let mut cmd = std::process::Command::new(PFEXEC); cmd.env_clear().arg(DLADM).arg("show-link"); cmd } -fn dladm_show_vnic() -> Command { +pub fn dladm_show_vnic() -> Command { let mut cmd = std::process::Command::new(PFEXEC); cmd.env_clear().arg(DLADM).arg("show-vnic"); cmd } -fn dladm_show_linkprop() -> Command { +pub fn dladm_show_linkprop() -> Command { let mut cmd = std::process::Command::new(PFEXEC); cmd.env_clear().arg(DLADM).arg("show-linkprop"); cmd @@ -195,44 +209,6 @@ fn dladm_show_linkprop() -> Command { * Public API */ -/// List all zones on a sled. -pub async fn zoneadm_info( -) -> Result { - execute_command_with_timeout(zoneadm_list(), DEFAULT_TIMEOUT).await -} - -/// Retrieve various `ipadm` command output for the system. -pub async fn ipadm_info( -) -> Vec> { - [ipadm_show_interface(), ipadm_show_addr(), ipadm_show_prop()] - .into_iter() - .map(|c| async move { - execute_command_with_timeout(c, DEFAULT_TIMEOUT).await - }) - .collect::>() - .collect::>>() - .await -} - -/// Retrieve various `dladm` command output for the system. -pub async fn dladm_info( -) -> Vec> { - [ - dladm_show_phys(), - dladm_show_ether(), - dladm_show_link(), - dladm_show_vnic(), - dladm_show_linkprop(), - ] - .into_iter() - .map(|c| async move { - execute_command_with_timeout(c, DEFAULT_TIMEOUT).await - }) - .collect::>() - .collect::>>() - .await -} - #[cfg(test)] mod test { use super::*; @@ -246,7 +222,7 @@ mod test { match execute_command_with_timeout(command, Duration::from_millis(500)) .await { - Err(SupportBundleCmdError::Timeout { .. }) => (), + Err(SledDiagnosticsCmdError::Timeout { .. 
}) => (), _ => panic!("command should have timed out"), } } @@ -267,7 +243,7 @@ mod test { #[tokio::test] async fn test_command_stderr_is_correct() { let mut command = Command::new("bash"); - command.env_clear().args(&["-c", "echo oxide computer > /dev/stderr"]); + command.env_clear().args(["-c", "echo oxide computer > /dev/stderr"]); let res = execute_command_with_timeout(command, Duration::from_secs(5)) .await @@ -279,7 +255,7 @@ mod test { #[tokio::test] async fn test_command_stdout_stderr_are_interleaved() { let mut command = Command::new("bash"); - command.env_clear().args(&[ + command.env_clear().args([ "-c", "echo one > /dev/stdout \ && echo two > /dev/stderr \ From de8bc93309f9eab8c431328065ea6f3d5bd0f34f Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Fri, 13 Dec 2024 15:14:49 -0800 Subject: [PATCH 03/10] Optimizations to improve dataset ensuring performance (#7236) Major part of https://github.com/oxidecomputer/omicron/issues/7217 Optimizations made: - `zfs get` is queried, so properties are samples for all datasets of interest ahead-of-time. No subsequent processes are exec'd for each dataset that needs no changes. - The "dataset ensure" process is now concurrent These optimizations should significantly improve the latency of the "no (or few) changes necessary" cases. Optimizations still left to be made: - Making blueprint execution concurrent, rather than blocking one-sled-at-a-time - Patching the illumos_utils "Zfs::ensure_filesystem" to re-use the pre-fetched properties, and minimize re-querying --- Cargo.lock | 2 + illumos-utils/Cargo.toml | 1 + illumos-utils/src/zfs.rs | 239 +++++++++--------- sled-agent/src/backing_fs.rs | 3 +- sled-agent/src/bootstrap/pre_server.rs | 1 + sled-storage/Cargo.toml | 1 + sled-storage/src/dataset.rs | 2 + sled-storage/src/error.rs | 3 + sled-storage/src/manager.rs | 320 ++++++++++++++++++++----- 9 files changed, 395 insertions(+), 177 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index f53a518f76..c9fc792f72 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4464,6 +4464,7 @@ dependencies = [ "futures", "http", "ipnetwork", + "itertools 0.13.0", "libc", "macaddr", "mockall", @@ -10816,6 +10817,7 @@ dependencies = [ "slog", "thiserror 1.0.69", "tokio", + "tokio-stream", "uuid", ] diff --git a/illumos-utils/Cargo.toml b/illumos-utils/Cargo.toml index e1421bd3ab..69276713bb 100644 --- a/illumos-utils/Cargo.toml +++ b/illumos-utils/Cargo.toml @@ -21,6 +21,7 @@ dropshot.workspace = true futures.workspace = true http.workspace = true ipnetwork.workspace = true +itertools.workspace = true libc.workspace = true macaddr.workspace = true omicron-common.workspace = true diff --git a/illumos-utils/src/zfs.rs b/illumos-utils/src/zfs.rs index f9edb8de86..ee1ac58be9 100644 --- a/illumos-utils/src/zfs.rs +++ b/illumos-utils/src/zfs.rs @@ -9,9 +9,11 @@ use anyhow::anyhow; use anyhow::bail; use anyhow::Context; use camino::{Utf8Path, Utf8PathBuf}; +use itertools::Itertools; use omicron_common::api::external::ByteCount; use omicron_common::disk::CompressionAlgorithm; use omicron_common::disk::DiskIdentity; +use omicron_common::disk::SharedDatasetConfig; use omicron_uuid_kinds::DatasetUuid; use std::collections::BTreeMap; use std::fmt; @@ -82,25 +84,19 @@ enum EnsureFilesystemErrorRaw { /// Error returned by [`Zfs::ensure_filesystem`]. 
#[derive(thiserror::Error, Debug)] -#[error( - "Failed to ensure filesystem '{name}' exists at '{mountpoint:?}': {err}" -)] +#[error("Failed to ensure filesystem '{name}': {err}")] pub struct EnsureFilesystemError { name: String, - mountpoint: Mountpoint, #[source] err: EnsureFilesystemErrorRaw, } /// Error returned by [`Zfs::set_oxide_value`] #[derive(thiserror::Error, Debug)] -#[error( - "Failed to set value '{name}={value}' on filesystem {filesystem}: {err}" -)] +#[error("Failed to set values '{values}' on filesystem {filesystem}: {err}")] pub struct SetValueError { filesystem: String, - name: String, - value: String, + values: String, err: crate::ExecutionError, } @@ -242,11 +238,27 @@ impl DatasetProperties { "oxide:uuid,name,avail,used,quota,reservation,compression"; } +impl TryFrom<&DatasetProperties> for SharedDatasetConfig { + type Error = anyhow::Error; + + fn try_from( + props: &DatasetProperties, + ) -> Result { + Ok(SharedDatasetConfig { + compression: props.compression.parse()?, + quota: props.quota, + reservation: props.reservation, + }) + } +} + impl DatasetProperties { /// Parses dataset properties, assuming that the caller is providing the /// output of the following command as stdout: /// - /// zfs get -rpo name,property,value,source $ZFS_GET_PROPS $DATASETS + /// zfs get \ + /// [maybe depth arguments] \ + /// -Hpo name,property,value,source $ZFS_GET_PROPS $DATASETS fn parse_many( stdout: &str, ) -> Result, anyhow::Error> { @@ -307,14 +319,16 @@ impl DatasetProperties { .parse::() .context("Failed to parse 'used'")? .try_into()?; + + // The values of "quota" and "reservation" can be either "-" or + // "0" when they are not actually set. To be cautious, we treat + // both of these values as "the value has not been set + // explicitly". As a result, setting either of these values + // explicitly to zero is indistinguishable from setting them + // with a value of "none". let quota = props .get("quota") - .filter(|(_prop, source)| { - // If a quota has not been set explicitly, it has a default - // source and a value of "zero". Rather than parsing the value - // as zero, it should be ignored. - *source != "default" - }) + .filter(|(prop, _source)| *prop != "-" && *prop != "0") .map(|(prop, _source)| { prop.parse::().context("Failed to parse 'quota'") }) @@ -322,12 +336,7 @@ impl DatasetProperties { .and_then(|v| ByteCount::try_from(v).ok()); let reservation = props .get("reservation") - .filter(|(_prop, source)| { - // If a reservation has not been set explicitly, it has a default - // source and a value of "zero". Rather than parsing the value - // as zero, it should be ignored. 
- *source != "default" - }) + .filter(|(prop, _source)| *prop != "-" && *prop != "0") .map(|(prop, _source)| { prop.parse::() .context("Failed to parse 'reservation'") @@ -375,7 +384,40 @@ impl fmt::Display for PropertySource { } } -#[cfg_attr(any(test, feature = "testing"), mockall::automock, allow(dead_code))] +#[derive(Copy, Clone, Debug)] +pub enum WhichDatasets { + SelfOnly, + SelfAndChildren, +} + +fn build_zfs_set_key_value_pairs( + size_details: Option, + dataset_id: Option, +) -> Vec<(&'static str, String)> { + let mut props = Vec::new(); + if let Some(SizeDetails { quota, reservation, compression }) = size_details + { + let quota = quota + .map(|q| q.to_bytes().to_string()) + .unwrap_or_else(|| String::from("none")); + props.push(("quota", quota)); + + let reservation = reservation + .map(|r| r.to_bytes().to_string()) + .unwrap_or_else(|| String::from("none")); + props.push(("reservation", reservation)); + + let compression = compression.to_string(); + props.push(("compression", compression)); + } + + if let Some(id) = dataset_id { + props.push(("oxide:uuid", id.to_string())); + } + + props +} + impl Zfs { /// Lists all datasets within a pool or existing dataset. /// @@ -399,7 +441,9 @@ impl Zfs { } /// Get information about datasets within a list of zpools / datasets. - /// Returns properties for all input datasets and their direct children. + /// Returns properties for all input datasets, and optionally, for + /// their children (depending on the value of [WhichDatasets] is provided + /// as input). /// /// This function is similar to [Zfs::list_datasets], but provides a more /// substantial results about the datasets found. @@ -407,22 +451,30 @@ impl Zfs { /// Sorts results and de-duplicates them by name. pub fn get_dataset_properties( datasets: &[String], + which: WhichDatasets, ) -> Result, anyhow::Error> { let mut command = std::process::Command::new(ZFS); - let cmd = command.args(&[ - "get", - "-d", - "1", - "-Hpo", - "name,property,value,source", - ]); + let cmd = command.arg("get"); + match which { + WhichDatasets::SelfOnly => (), + WhichDatasets::SelfAndChildren => { + cmd.args(&["-d", "1"]); + } + } + cmd.args(&["-Hpo", "name,property,value,source"]); // Note: this is tightly coupled with the layout of DatasetProperties cmd.arg(DatasetProperties::ZFS_GET_PROPS); cmd.args(datasets); - let output = execute(cmd).with_context(|| { - format!("Failed to get dataset properties for {datasets:?}") + // We are intentionally ignoring the output status of this command. + // + // If one or more dataset doesn't exist, we can still read stdout to + // see about the ones that do exist. 
+ let output = cmd.output().map_err(|err| { + anyhow!( + "Failed to get dataset properties for {datasets:?}: {err:?}" + ) })?; let stdout = String::from_utf8(output.stdout)?; @@ -490,23 +542,19 @@ impl Zfs { do_format: bool, encryption_details: Option, size_details: Option, + id: Option, additional_options: Option>, ) -> Result<(), EnsureFilesystemError> { let (exists, mounted) = Self::dataset_exists(name, &mountpoint)?; + + let props = build_zfs_set_key_value_pairs(size_details, id); if exists { - if let Some(SizeDetails { quota, reservation, compression }) = - size_details - { - // apply quota and compression mode (in case they've changed across - // sled-agent versions since creation) - Self::apply_properties( - name, - &mountpoint, - quota, - reservation, - compression, - )?; - } + Self::set_values(name, props.as_slice()).map_err(|err| { + EnsureFilesystemError { + name: name.to_string(), + err: err.err.into(), + } + })?; if encryption_details.is_none() { // If the dataset exists, we're done. Unencrypted datasets are @@ -518,14 +566,13 @@ impl Zfs { return Ok(()); } // We need to load the encryption key and mount the filesystem - return Self::mount_encrypted_dataset(name, &mountpoint); + return Self::mount_encrypted_dataset(name); } } if !do_format { return Err(EnsureFilesystemError { name: name.to_string(), - mountpoint, err: EnsureFilesystemErrorRaw::NotFoundNotFormatted, }); } @@ -561,7 +608,6 @@ impl Zfs { execute(cmd).map_err(|err| EnsureFilesystemError { name: name.to_string(), - mountpoint: mountpoint.clone(), err: err.into(), })?; @@ -574,82 +620,27 @@ impl Zfs { let cmd = command.args(["chown", "-R", &user, &mount]); execute(cmd).map_err(|err| EnsureFilesystemError { name: name.to_string(), - mountpoint: mountpoint.clone(), err: err.into(), })?; } - if let Some(SizeDetails { quota, reservation, compression }) = - size_details - { - // Apply any quota and compression mode. - Self::apply_properties( - name, - &mountpoint, - quota, - reservation, - compression, - )?; - } - - Ok(()) - } - - /// Applies the following properties to the filesystem. - /// - /// If any of the options are not supplied, a default "none" or "off" - /// value is supplied. 
- fn apply_properties( - name: &str, - mountpoint: &Mountpoint, - quota: Option, - reservation: Option, - compression: CompressionAlgorithm, - ) -> Result<(), EnsureFilesystemError> { - let quota = quota - .map(|q| q.to_bytes().to_string()) - .unwrap_or_else(|| String::from("none")); - let reservation = reservation - .map(|r| r.to_bytes().to_string()) - .unwrap_or_else(|| String::from("none")); - let compression = compression.to_string(); - - if let Err(err) = Self::set_value(name, "quota", "a) { - return Err(EnsureFilesystemError { - name: name.to_string(), - mountpoint: mountpoint.clone(), - // Take the execution error from the SetValueError - err: err.err.into(), - }); - } - if let Err(err) = Self::set_value(name, "reservation", &reservation) { - return Err(EnsureFilesystemError { - name: name.to_string(), - mountpoint: mountpoint.clone(), - // Take the execution error from the SetValueError - err: err.err.into(), - }); - } - if let Err(err) = Self::set_value(name, "compression", &compression) { - return Err(EnsureFilesystemError { + Self::set_values(name, props.as_slice()).map_err(|err| { + EnsureFilesystemError { name: name.to_string(), - mountpoint: mountpoint.clone(), - // Take the execution error from the SetValueError err: err.err.into(), - }); - } + } + })?; + Ok(()) } fn mount_encrypted_dataset( name: &str, - mountpoint: &Mountpoint, ) -> Result<(), EnsureFilesystemError> { let mut command = std::process::Command::new(PFEXEC); let cmd = command.args(&[ZFS, "mount", "-l", name]); execute(cmd).map_err(|err| EnsureFilesystemError { name: name.to_string(), - mountpoint: mountpoint.clone(), err: EnsureFilesystemErrorRaw::MountEncryptedFsFailed(err), })?; Ok(()) @@ -657,13 +648,11 @@ impl Zfs { pub fn mount_overlay_dataset( name: &str, - mountpoint: &Mountpoint, ) -> Result<(), EnsureFilesystemError> { let mut command = std::process::Command::new(PFEXEC); let cmd = command.args(&[ZFS, "mount", "-O", name]); execute(cmd).map_err(|err| EnsureFilesystemError { name: name.to_string(), - mountpoint: mountpoint.clone(), err: EnsureFilesystemErrorRaw::MountOverlayFsFailed(err), })?; Ok(()) @@ -689,7 +678,6 @@ impl Zfs { if &values[..3] != &[name, "filesystem", &mountpoint.to_string()] { return Err(EnsureFilesystemError { name: name.to_string(), - mountpoint: mountpoint.clone(), err: EnsureFilesystemErrorRaw::Output(stdout.to_string()), }); } @@ -714,13 +702,29 @@ impl Zfs { name: &str, value: &str, ) -> Result<(), SetValueError> { + Self::set_values(filesystem_name, &[(name, value)]) + } + + fn set_values( + filesystem_name: &str, + name_values: &[(K, V)], + ) -> Result<(), SetValueError> { + if name_values.is_empty() { + return Ok(()); + } + let mut command = std::process::Command::new(PFEXEC); - let value_arg = format!("{}={}", name, value); - let cmd = command.args(&[ZFS, "set", &value_arg, filesystem_name]); + let cmd = command.args(&[ZFS, "set"]); + for (name, value) in name_values { + cmd.arg(format!("{name}={value}")); + } + cmd.arg(filesystem_name); execute(cmd).map_err(|err| SetValueError { filesystem: filesystem_name.to_string(), - name: name.to_string(), - value: value.to_string(), + values: name_values + .iter() + .map(|(k, v)| format!("{k}={v}")) + .join(","), err, })?; Ok(()) @@ -808,10 +812,7 @@ impl Zfs { err, }) } -} -// These methods don't work with mockall, so they exist in a separate impl block -impl Zfs { /// Calls "zfs get" to acquire multiple values /// /// - `names`: The properties being acquired diff --git a/sled-agent/src/backing_fs.rs b/sled-agent/src/backing_fs.rs 
index 265c0152e2..d24f85ad80 100644 --- a/sled-agent/src/backing_fs.rs +++ b/sled-agent/src/backing_fs.rs @@ -150,6 +150,7 @@ pub(crate) fn ensure_backing_fs( true, // do_format None, // encryption_details, size_details, + None, Some(vec!["canmount=noauto".to_string()]), // options )?; @@ -180,7 +181,7 @@ pub(crate) fn ensure_backing_fs( info!(log, "Mounting {} on {}", dataset, mountpoint); - Zfs::mount_overlay_dataset(&dataset, &mountpoint)?; + Zfs::mount_overlay_dataset(&dataset)?; if let Some(subdirs) = bfs.subdirs { for dir in subdirs { diff --git a/sled-agent/src/bootstrap/pre_server.rs b/sled-agent/src/bootstrap/pre_server.rs index 5b89506242..19a63cb71d 100644 --- a/sled-agent/src/bootstrap/pre_server.rs +++ b/sled-agent/src/bootstrap/pre_server.rs @@ -293,6 +293,7 @@ fn ensure_zfs_ramdisk_dataset() -> Result<(), StartError> { encryption_details, quota, None, + None, ) .map_err(StartError::EnsureZfsRamdiskDataset) } diff --git a/sled-storage/Cargo.toml b/sled-storage/Cargo.toml index 2439c52aa7..d47a89e0ae 100644 --- a/sled-storage/Cargo.toml +++ b/sled-storage/Cargo.toml @@ -28,6 +28,7 @@ sled-hardware.workspace = true slog.workspace = true thiserror.workspace = true tokio.workspace = true +tokio-stream.workspace = true uuid.workspace = true omicron-workspace-hack.workspace = true diff --git a/sled-storage/src/dataset.rs b/sled-storage/src/dataset.rs index e90a4c0092..717595ad9e 100644 --- a/sled-storage/src/dataset.rs +++ b/sled-storage/src/dataset.rs @@ -269,6 +269,7 @@ pub(crate) async fn ensure_zpool_has_datasets( Some(encryption_details), None, None, + None, ); keyfile.zero_and_unlink().await.map_err(|error| { @@ -331,6 +332,7 @@ pub(crate) async fn ensure_zpool_has_datasets( encryption_details, size_details, None, + None, )?; if dataset.wipe { diff --git a/sled-storage/src/error.rs b/sled-storage/src/error.rs index 351dd7f353..f00e35e654 100644 --- a/sled-storage/src/error.rs +++ b/sled-storage/src/error.rs @@ -77,6 +77,9 @@ pub enum Error { #[error("Not ready to manage U.2s (key manager is not ready)")] KeyManagerNotReady, + #[error("Physical disk configuration changed for the same generation number: {generation}")] + PhysicalDiskConfigurationChanged { generation: Generation }, + #[error("Physical disk configuration out-of-date (asked for {requested}, but latest is {current})")] PhysicalDiskConfigurationOutdated { requested: Generation, diff --git a/sled-storage/src/manager.rs b/sled-storage/src/manager.rs index a760285d3f..b1832ca92b 100644 --- a/sled-storage/src/manager.rs +++ b/sled-storage/src/manager.rs @@ -14,7 +14,9 @@ use camino::Utf8Path; use camino::Utf8PathBuf; use debug_ignore::DebugIgnore; use futures::future::FutureExt; -use illumos_utils::zfs::{DatasetProperties, Mountpoint, Zfs}; +use futures::Stream; +use futures::StreamExt; +use illumos_utils::zfs::{DatasetProperties, Mountpoint, WhichDatasets, Zfs}; use illumos_utils::zpool::{ZpoolName, ZPOOL_MOUNTPOINT_ROOT}; use key_manager::StorageKeyRequester; use omicron_common::disk::{ @@ -26,6 +28,7 @@ use omicron_common::ledger::Ledger; use omicron_uuid_kinds::DatasetUuid; use omicron_uuid_kinds::GenericUuid; use slog::{error, info, o, warn, Logger}; +use std::collections::BTreeMap; use std::collections::HashSet; use std::future::Future; use tokio::sync::{mpsc, oneshot, watch}; @@ -929,21 +932,107 @@ impl StorageManager { // includes details about all possible errors that may occur on // a per-dataset granularity. 
async fn datasets_ensure_internal( - &mut self, + &self, log: &Logger, config: &DatasetsConfig, ) -> DatasetsManagementResult { - let mut status = vec![]; - for dataset in config.datasets.values() { - status.push(self.dataset_ensure_internal(log, dataset).await); - } + // Gather properties about these datasets, if they exist. + // + // This pre-fetching lets us avoid individually querying them later. + let old_datasets = Zfs::get_dataset_properties( + config + .datasets + .values() + .map(|config| config.name.full_name()) + .collect::>() + .as_slice(), + WhichDatasets::SelfOnly, + ) + .unwrap_or_default() + .into_iter() + .map(|props| (props.name.clone(), props)) + .collect::>(); + + let futures = config.datasets.values().map(|dataset| async { + self.dataset_ensure_internal( + log, + dataset, + old_datasets.get(&dataset.name.full_name()), + ) + .await + }); + + // This "Box::pin" is a workaround for: https://github.com/rust-lang/rust/issues/64552 + // + // Ideally, we would just use: + // + // ``` + // let status: Vec<_> = futures::stream::iter(futures) + // .buffered(...) + // .collect() + // .await; + // ``` + const DATASET_ENSURE_CONCURRENCY_LIMIT: usize = 16; + let results: std::pin::Pin + Send>> = Box::pin( + futures::stream::iter(futures) + .buffered(DATASET_ENSURE_CONCURRENCY_LIMIT), + ); + + let status: Vec = results.collect().await; + DatasetsManagementResult { status } } + fn should_skip_dataset_ensure( + log: &Logger, + config: &DatasetConfig, + old_dataset: Option<&DatasetProperties>, + ) -> Result { + let Some(old_dataset) = old_dataset else { + info!(log, "This dataset did not exist"); + return Ok(false); + }; + + let Some(old_id) = old_dataset.id else { + info!(log, "Old properties missing UUID"); + return Ok(false); + }; + + if old_id != config.id { + return Err(Error::UuidMismatch { + name: config.name.full_name(), + old: old_id.into_untyped_uuid(), + new: config.id.into_untyped_uuid(), + }); + } + + let old_props = match SharedDatasetConfig::try_from(old_dataset) { + Ok(old_props) => old_props, + Err(err) => { + warn!(log, "Failed to parse old properties"; "err" => #%err); + return Ok(false); + } + }; + + info!(log, "Parsed old dataset properties"; "props" => ?old_props); + if old_props != config.inner { + info!( + log, + "Dataset properties changed"; + "old_props" => ?old_props, + "requested_props" => ?config.inner, + ); + return Ok(false); + } + info!(log, "No changes necessary, returning early"); + return Ok(true); + } + async fn dataset_ensure_internal( - &mut self, + &self, log: &Logger, config: &DatasetConfig, + old_dataset: Option<&DatasetProperties>, ) -> DatasetManagementStatus { let log = log.new(o!("name" => config.name.full_name())); info!(log, "Ensuring dataset"); @@ -952,6 +1041,15 @@ impl StorageManager { err: None, }; + match Self::should_skip_dataset_ensure(&log, config, old_dataset) { + Ok(true) => return status, + Ok(false) => (), + Err(err) => { + status.err = Some(err.to_string()); + return status; + } + } + let mountpoint_root = &self.resources.disks().mount_config().root; let mountpoint_path = config.name.mountpoint(mountpoint_root); let details = DatasetCreationDetails { @@ -961,9 +1059,9 @@ impl StorageManager { }; if let Err(err) = self - .ensure_dataset_with_id( + .ensure_dataset( config.name.pool(), - config.id, + Some(config.id), &config.inner, &details, ) @@ -1017,8 +1115,11 @@ impl StorageManager { ]; info!(log, "Listing datasets within zpool"; "zpool" => zpool.to_string()); - Zfs::get_dataset_properties(datasets_of_interest.as_slice()) - 
.map_err(Error::Other) + Zfs::get_dataset_properties( + datasets_of_interest.as_slice(), + WhichDatasets::SelfAndChildren, + ) + .map_err(Error::Other) } // Ensures that a dataset exists, nested somewhere arbitrary within @@ -1039,8 +1140,13 @@ impl StorageManager { full_name: config.name.full_name(), }; - self.ensure_dataset(config.name.root.pool(), &config.inner, &details) - .await?; + self.ensure_dataset( + config.name.root.pool(), + None, + &config.inner, + &details, + ) + .await?; Ok(()) } @@ -1073,15 +1179,18 @@ impl StorageManager { info!(log, "Listing nested datasets"); let full_name = name.full_name(); - let properties = - Zfs::get_dataset_properties(&[full_name]).map_err(|e| { - warn!( - log, - "Failed to access nested dataset"; - "name" => ?name - ); - crate::dataset::DatasetError::Other(e) - })?; + let properties = Zfs::get_dataset_properties( + &[full_name], + WhichDatasets::SelfAndChildren, + ) + .map_err(|e| { + warn!( + log, + "Failed to access nested dataset"; + "name" => ?name + ); + crate::dataset::DatasetError::Other(e) + })?; let root_path = name.root.full_name(); Ok(properties @@ -1159,12 +1268,29 @@ impl StorageManager { requested: config.generation, current: ledger_data.generation, }); - } - - // TODO: If the generation is equal, check that the values are - // also equal. + } else if config.generation == ledger_data.generation { + info!( + log, + "Requested generation number matches prior request", + ); - info!(log, "Request looks newer than prior requests"); + if ledger_data != &config { + error!( + log, + "Requested configuration changed (with the same generation)"; + "generation" => ?config.generation + ); + return Err(Error::PhysicalDiskConfigurationChanged { + generation: config.generation, + }); + } + info!( + log, + "Request looks identical to last request, re-sending" + ); + } else { + info!(log, "Request looks newer than prior requests"); + } ledger } None => { @@ -1291,40 +1417,13 @@ impl StorageManager { } } - // Invokes [Self::ensure_dataset] and also ensures the dataset has an - // expected UUID as a ZFS property. - async fn ensure_dataset_with_id( - &mut self, - zpool: &ZpoolName, - id: DatasetUuid, - config: &SharedDatasetConfig, - details: &DatasetCreationDetails, - ) -> Result<(), Error> { - self.ensure_dataset(zpool, config, details).await?; - - // Ensure the dataset has a usable UUID. - if let Ok(id_str) = Zfs::get_oxide_value(&details.full_name, "uuid") { - if let Ok(found_id) = id_str.parse::() { - if found_id != id { - return Err(Error::UuidMismatch { - name: details.full_name.clone(), - old: found_id.into_untyped_uuid(), - new: id.into_untyped_uuid(), - }); - } - return Ok(()); - } - } - Zfs::set_oxide_value(&details.full_name, "uuid", &id.to_string())?; - Ok(()) - } - // Ensures a dataset exists within a zpool. // // Confirms that the zpool exists and is managed by this sled. async fn ensure_dataset( - &mut self, + &self, zpool: &ZpoolName, + dataset_id: Option, config: &SharedDatasetConfig, details: &DatasetCreationDetails, ) -> Result<(), Error> { @@ -1365,6 +1464,7 @@ impl StorageManager { do_format, encryption_details, size_details, + dataset_id, None, )?; @@ -1401,6 +1501,7 @@ impl StorageManager { do_format, encryption_details, size_details, + Some(DatasetUuid::from_untyped_uuid(request.dataset_id)), None, )?; // Ensure the dataset has a usable UUID. 
@@ -1725,7 +1826,7 @@ mod tests { ); let mut harness = StorageManagerTestHarness::new(&logctx.log).await; - // Queue up a disks, as we haven't told the `StorageManager` that + // Queue up a disk, as we haven't told the `StorageManager` that // the `KeyManager` is ready yet. let raw_disks = harness.add_vdevs(&["u2_under_test.vdev", "m2_helping.vdev"]).await; @@ -1996,6 +2097,111 @@ mod tests { logctx.cleanup_successful(); } + #[tokio::test] + async fn ensure_many_datasets() { + illumos_utils::USE_MOCKS.store(false, Ordering::SeqCst); + let logctx = test_setup_log("ensure_many_datasets"); + let mut harness = StorageManagerTestHarness::new(&logctx.log).await; + + // Test setup: Add U.2s and an M.2, adopt them into the "control plane" + // for usage. + harness.handle().key_manager_ready().await; + let raw_disks = harness + .add_vdevs(&[ + "u2_0.vdev", + "u2_1.vdev", + "u2_2.vdev", + "u2_3.vdev", + "u2_4.vdev", + "u2_5.vdev", + "u2_6.vdev", + "u2_7.vdev", + "u2_8.vdev", + "u2_9.vdev", + "m2_helping.vdev", + ]) + .await; + let config = harness.make_config(1, &raw_disks); + + let result = harness + .handle() + .omicron_physical_disks_ensure(config.clone()) + .await + .expect("Ensuring disks should work after key manager is ready"); + assert!(!result.has_error(), "{:?}", result); + + let result = harness + .handle() + .omicron_physical_disks_ensure(config.clone()) + .await + .expect("Ensuring disks should work after key manager is ready"); + assert!(!result.has_error(), "{:?}", result); + + // Create datasets on the newly formatted U.2s + let mut datasets = BTreeMap::new(); + for i in 0..10 { + let zpool_name = ZpoolName::new_external(config.disks[i].pool_id); + + let id = DatasetUuid::new_v4(); + let name = + DatasetName::new(zpool_name.clone(), DatasetKind::Crucible); + datasets.insert( + id, + DatasetConfig { + id, + name, + inner: SharedDatasetConfig::default(), + }, + ); + + let id = DatasetUuid::new_v4(); + let name = DatasetName::new(zpool_name.clone(), DatasetKind::Debug); + datasets.insert( + id, + DatasetConfig { + id, + name, + inner: SharedDatasetConfig::default(), + }, + ); + + let id = DatasetUuid::new_v4(); + let name = DatasetName::new( + zpool_name.clone(), + DatasetKind::TransientZoneRoot, + ); + datasets.insert( + id, + DatasetConfig { + id, + name, + inner: SharedDatasetConfig::default(), + }, + ); + } + // "Generation = 1" is reserved as "no requests seen yet", so we jump + // past it. + let generation = Generation::new().next(); + let config = DatasetsConfig { generation, datasets }; + + let status = + harness.handle().datasets_ensure(config.clone()).await.unwrap(); + assert!(!status.has_error()); + + // List datasets, expect to see what we just created + let observed_config = + harness.handle().datasets_config_list().await.unwrap(); + assert_eq!(config, observed_config); + + // Calling "datasets_ensure" with the same input should succeed. 
+ let status = + harness.handle().datasets_ensure(config.clone()).await.unwrap(); + assert!(!status.has_error()); + + harness.cleanup().await; + logctx.cleanup_successful(); + } + #[tokio::test] async fn nested_dataset() { illumos_utils::USE_MOCKS.store(false, Ordering::SeqCst); From c7663bd36aa28cb17a43acfe97a2f9b9265e48bd Mon Sep 17 00:00:00 2001 From: David Crespo Date: Fri, 13 Dec 2024 19:19:27 -0600 Subject: [PATCH 04/10] Bump web console (gateways UI, SSH key view, fixes) (#7250) https://github.com/oxidecomputer/console/compare/927c8b63...c1ebd8d9 * [c1ebd8d9](https://github.com/oxidecomputer/console/commit/c1ebd8d9) take inventory switches tab back out, DB is empty * [2a139028](https://github.com/oxidecomputer/console/commit/2a139028) tone down empty switches table message * [2b0c3f12](https://github.com/oxidecomputer/console/commit/2b0c3f12) oxidecomputer/console#2621 * [c1fbf631](https://github.com/oxidecomputer/console/commit/c1fbf631) oxidecomputer/console#2622 * [1dd25f63](https://github.com/oxidecomputer/console/commit/1dd25f63) oxidecomputer/console#2615 * [11486f8b](https://github.com/oxidecomputer/console/commit/11486f8b) oxidecomputer/console#2620 * [ada302ce](https://github.com/oxidecomputer/console/commit/ada302ce) oxidecomputer/console#2488 * [bc3161ae](https://github.com/oxidecomputer/console/commit/bc3161ae) minor: remove stub e2e test * [9e1d53c6](https://github.com/oxidecomputer/console/commit/9e1d53c6) sentence case idp form heading * [aaf1154f](https://github.com/oxidecomputer/console/commit/aaf1154f) add extra assert for instance create with additional disks test flake * [79d610dc](https://github.com/oxidecomputer/console/commit/79d610dc) oxidecomputer/console#2618 * [bdbc02b7](https://github.com/oxidecomputer/console/commit/bdbc02b7) oxidecomputer/console#2589 * [7a8ee0ab](https://github.com/oxidecomputer/console/commit/7a8ee0ab) oxidecomputer/console#2614 * [0b5220a1](https://github.com/oxidecomputer/console/commit/0b5220a1) npm audit fix * [0c873cf4](https://github.com/oxidecomputer/console/commit/0c873cf4) oxidecomputer/console#2610 * [d031c8ff](https://github.com/oxidecomputer/console/commit/d031c8ff) bump playwright for navigation hang fix * [dbd8545e](https://github.com/oxidecomputer/console/commit/dbd8545e) oxidecomputer/console#2609 * [dc5562fe](https://github.com/oxidecomputer/console/commit/dc5562fe) move error log to avoid failing to log certain errors --- tools/console_version | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/console_version b/tools/console_version index 08078c264e..85ca41f755 100644 --- a/tools/console_version +++ b/tools/console_version @@ -1,2 +1,2 @@ -COMMIT="927c8b63a6f97c230cd8766a80fa1cfef6429eb4" -SHA2="96550b6e485aaee1c6ced00a4a1aeec86267c99fc79a4b2b253141cf0222d346" +COMMIT="c1ebd8d9acae4ff7a09b2517265fba52ebdfe82e" +SHA2="840dbfda1c0def66212e7602d7be6e8acf1b26ba218f10ce3e627df49f5ce9e2" From 0fa1bc2bc447d63c2ed1bc3979a1298f2ae3ce4c Mon Sep 17 00:00:00 2001 From: Rain Date: Fri, 13 Dec 2024 17:29:57 -0800 Subject: [PATCH 05/10] update nextest to 0.9.86 (#7239) Includes some fixes to JUnit support, including representing setup scripts in the output. 
--- .config/nextest.toml | 2 +- .github/buildomat/build-and-test.sh | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.config/nextest.toml b/.config/nextest.toml index b1699e6717..cc21a442b3 100644 --- a/.config/nextest.toml +++ b/.config/nextest.toml @@ -3,7 +3,7 @@ # # The required version should be bumped up if we need new features, performance # improvements or bugfixes that are present in newer versions of nextest. -nextest-version = { required = "0.9.77", recommended = "0.9.78" } +nextest-version = { required = "0.9.77", recommended = "0.9.86" } experimental = ["setup-scripts"] diff --git a/.github/buildomat/build-and-test.sh b/.github/buildomat/build-and-test.sh index 28c1dbc0e6..605bf91b01 100755 --- a/.github/buildomat/build-and-test.sh +++ b/.github/buildomat/build-and-test.sh @@ -18,7 +18,7 @@ target_os=$1 # NOTE: This version should be in sync with the recommended version in # .config/nextest.toml. (Maybe build an automated way to pull the recommended # version in the future.) -NEXTEST_VERSION='0.9.78' +NEXTEST_VERSION='0.9.86' cargo --version rustc --version From 5ceea0f4f16d87294e3043459e11c2d05b51ce6d Mon Sep 17 00:00:00 2001 From: Benjamin Naecker Date: Fri, 13 Dec 2024 19:47:02 -0800 Subject: [PATCH 06/10] Check for correct error message in timeseries retry loop (#7253) --- nexus/tests/integration_tests/metrics.rs | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/nexus/tests/integration_tests/metrics.rs b/nexus/tests/integration_tests/metrics.rs index 7e5441c16a..5e5268fb55 100644 --- a/nexus/tests/integration_tests/metrics.rs +++ b/nexus/tests/integration_tests/metrics.rs @@ -399,8 +399,7 @@ pub async fn execute_timeseries_query( if rsp.status.is_client_error() { let text = std::str::from_utf8(&rsp.body) .expect("Timeseries query response body should be UTF-8"); - if text.contains("Schema for timeseries") && text.contains("not found") - { + if text.starts_with("Timeseries not found for: ") { return None; } } From 89d6c764f48589798c0ee552e857f51f83fb94b0 Mon Sep 17 00:00:00 2001 From: Benjamin Naecker Date: Fri, 13 Dec 2024 20:51:57 -0800 Subject: [PATCH 07/10] Add DTrace scripts to Nexus zone (#7244) - Adds a script for tracing all transactions run by Nexus, including their overall latency and the number of statements in each one. - Move all existing scripts to a Nexus subdirectory, and then include that whole directory in the Omicron zone for Nexus itself. 
- Closes #7224 --- Cargo.lock | 3 +- package-manifest.toml | 1 + .../{ => nexus}/aggregate-query-latency.d | 0 tools/dtrace/{ => nexus}/slowest-queries.d | 2 +- tools/dtrace/{ => nexus}/trace-db-queries.d | 0 tools/dtrace/nexus/trace-transactions.d | 67 +++++++++++++++++++ workspace-hack/Cargo.toml | 2 + 7 files changed, 73 insertions(+), 2 deletions(-) rename tools/dtrace/{ => nexus}/aggregate-query-latency.d (100%) rename tools/dtrace/{ => nexus}/slowest-queries.d (99%) rename tools/dtrace/{ => nexus}/trace-db-queries.d (100%) create mode 100755 tools/dtrace/nexus/trace-transactions.d diff --git a/Cargo.lock b/Cargo.lock index c9fc792f72..6a44c7af3d 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -7398,6 +7398,7 @@ dependencies = [ "getrandom", "group", "hashbrown 0.15.1", + "heck 0.4.1", "hex", "hickory-proto", "hmac", @@ -11020,7 +11021,7 @@ version = "0.8.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "03c3c6b7927ffe7ecaa769ee0e3994da3b8cafc8f444578982c83ecb161af917" dependencies = [ - "heck 0.5.0", + "heck 0.4.1", "proc-macro2", "quote", "syn 2.0.87", diff --git a/package-manifest.toml b/package-manifest.toml index 809c1ce6ca..83b1ba8168 100644 --- a/package-manifest.toml +++ b/package-manifest.toml @@ -125,6 +125,7 @@ source.paths = [ { from = "smf/nexus/{{rack-topology}}", to = "/var/svc/manifest/site/nexus" }, { from = "out/console-assets", to = "/var/nexus/static" }, { from = "schema/crdb", to = "/var/nexus/schema/crdb" }, + { from = "tools/dtrace/nexus", to = "/opt/oxide/dtrace/nexus" }, ] output.type = "zone" setup_hint = """ diff --git a/tools/dtrace/aggregate-query-latency.d b/tools/dtrace/nexus/aggregate-query-latency.d similarity index 100% rename from tools/dtrace/aggregate-query-latency.d rename to tools/dtrace/nexus/aggregate-query-latency.d diff --git a/tools/dtrace/slowest-queries.d b/tools/dtrace/nexus/slowest-queries.d similarity index 99% rename from tools/dtrace/slowest-queries.d rename to tools/dtrace/nexus/slowest-queries.d index 76e22de22f..08b40d4d79 100755 --- a/tools/dtrace/slowest-queries.d +++ b/tools/dtrace/nexus/slowest-queries.d @@ -35,7 +35,7 @@ diesel_db$target:::query-done query[this->conn_id] = NULL; } -tick-5s +tick-10s { printf("\n%Y\n", walltimestamp); trunc(@, 5); diff --git a/tools/dtrace/trace-db-queries.d b/tools/dtrace/nexus/trace-db-queries.d similarity index 100% rename from tools/dtrace/trace-db-queries.d rename to tools/dtrace/nexus/trace-db-queries.d diff --git a/tools/dtrace/nexus/trace-transactions.d b/tools/dtrace/nexus/trace-transactions.d new file mode 100755 index 0000000000..baf7818f72 --- /dev/null +++ b/tools/dtrace/nexus/trace-transactions.d @@ -0,0 +1,67 @@ +#!/usr/sbin/dtrace -qs + +/* Trace all transactions to the control plane database with their latency */ + +dtrace:::BEGIN +{ + printf("Tracing all database transactions for nexus PID %d, use Ctrl-C to exit\n", $target); +} + +/* + * Record the start and number of statements for each transaction. + * + * Note that we're using the Nexus-provided transaction start / done probes. + * This lets us associate the other data we might collect (number of statements, + * ustacks, etc) with the Nexus code itself. Although there are transaction + * start / done probes in `diesel-dtrace`, the existing way we run transactions + * with `async-bb8-diesel` involves spawning a future to run the transactions on + * a blocking thread-pool. 
That spawning makes it impossible to associate the + * context in which the `diesel-dtrace` probes fire with the Nexus code that + * actually spawned the transaction itself. + */ +nexus_db_queries$target:::transaction-start +{ + this->key = copyinstr(arg0); + transaction_names[this->key] = copyinstr(arg1); + ts[this->key] = timestamp; + n_statements[this->key] = 0; + printf( + "Started transaction '%s' on conn %s\n", + transaction_names[this->key], + json(this->key, "ok") + ); +} + +/* + * When a query runs in the context of a transaction (on the same connection), + * bump the statement counter. + */ +diesel_db$target:::query-start +/ts[copyinstr(arg1)]/ +{ + n_statements[copyinstr(arg1)] += 1 +} + +/* + * As transactions complete, print the number of statements we ran and the + * duration. + */ +nexus_db_queries$target:::transaction-done +/ts[copyinstr(arg0)]/ +{ + this->key = copyinstr(arg0); + this->conn_id = json(this->key, "ok"); + this->latency = (timestamp - ts[this->key]) / 1000; + this->n_statements = n_statements[this->key]; + printf( + "%s %d statement(s) in transaction '%s' on connection %s (%d us)\n", + arg2 ? "COMMIT" : "ROLLBACK", + n_statements[this->key], + transaction_names[this->key], + this->conn_id, + this->latency + ); + ts[this->key] = 0; + n_statements[this->key] = 0; + transaction_names[this->key] = 0; +} diff --git a/workspace-hack/Cargo.toml b/workspace-hack/Cargo.toml index 31677ed8c1..678170b25e 100644 --- a/workspace-hack/Cargo.toml +++ b/workspace-hack/Cargo.toml @@ -63,6 +63,7 @@ generic-array = { version = "0.14.7", default-features = false, features = ["mor getrandom = { version = "0.2.15", default-features = false, features = ["js", "rdrand", "std"] } group = { version = "0.13.0", default-features = false, features = ["alloc"] } hashbrown = { version = "0.15.1" } +heck = { version = "0.4.1" } hex = { version = "0.4.3", features = ["serde"] } hickory-proto = { version = "0.24.1", features = ["text-parsing"] } hmac = { version = "0.12.1", default-features = false, features = ["reset"] } @@ -182,6 +183,7 @@ generic-array = { version = "0.14.7", default-features = false, features = ["mor getrandom = { version = "0.2.15", default-features = false, features = ["js", "rdrand", "std"] } group = { version = "0.13.0", default-features = false, features = ["alloc"] } hashbrown = { version = "0.15.1" } +heck = { version = "0.4.1" } hex = { version = "0.4.3", features = ["serde"] } hickory-proto = { version = "0.24.1", features = ["text-parsing"] } hmac = { version = "0.12.1", default-features = false, features = ["reset"] } From 3449a645add39ea8d857556c49d4bbb522cd92b6 Mon Sep 17 00:00:00 2001 From: David Pacheco Date: Sat, 14 Dec 2024 12:14:46 -0800 Subject: [PATCH 08/10] decide whether each API should be server- or client-side-versioned (#7138) --- .github/buildomat/build-and-test.sh | 3 +- dev-tools/ls-apis/README.adoc | 24 ++ dev-tools/ls-apis/api-manifest.toml | 76 ++++ dev-tools/ls-apis/src/api_metadata.rs | 34 ++ dev-tools/ls-apis/src/bin/ls-apis.rs | 99 ++++- dev-tools/ls-apis/src/lib.rs | 1 + dev-tools/ls-apis/src/system_apis.rs | 364 ++++++++++++++++++- dev-tools/ls-apis/tests/api_dependencies.out | 1 - 8 files changed, 591 insertions(+), 11 deletions(-) diff --git a/.github/buildomat/build-and-test.sh b/.github/buildomat/build-and-test.sh index 605bf91b01..369274eb7a 100755 --- a/.github/buildomat/build-and-test.sh +++ b/.github/buildomat/build-and-test.sh @@ -69,7 +69,8 @@ banner ls-apis source ./tools/include/force-git-over-https.sh; ptime -m cargo xtask 
ls-apis apis && ptime -m cargo xtask ls-apis deployment-units && - ptime -m cargo xtask ls-apis servers + ptime -m cargo xtask ls-apis servers && + ptime -m cargo xtask ls-apis check ) # diff --git a/dev-tools/ls-apis/README.adoc b/dev-tools/ls-apis/README.adoc index 00c229bff2..6b1aa1cdea 100644 --- a/dev-tools/ls-apis/README.adoc +++ b/dev-tools/ls-apis/README.adoc @@ -104,6 +104,30 @@ You can generate a PNG image of the graph like this: $ dot -T png -o deployment-units.png deployment-units.dot ``` +=== Checking the upgrade DAG + +As of this writing: + +* All Dropshot/Progenitor APIs in the system are identified with this tool (as far as we know). +* Every API is annotated with metadata in `api-manifest.toml`. +* The API metadata specifies whether versioning for the API is managed on the server side exclusively or using client-side versioning as well. We've made this choice for every existing API. +* There are no cycles in the server-side-managed-API dependency graph. + +This tool verifies these properties. If you add a new API or a new API dependency without adding valid metadata, or if that metadata breaks these constraints, you will have to fix this up before you can land the change to Omicron. + +You can verify the versioning metadata using: + +``` +$ cargo xtask ls-apis check +``` + +You might see any of the following errors: + +* `+missing field `versioned_how`+`: this probably means that you added an API to the metadata but neglected to specify the `versioned_how` field. This must be either `client` or `server`. In general, prefer `server`. The tool should tell you if you try to use `server` but that can't work. If in doubt, ask the update team. +* `at least one API has unknown version strategy (see above)`: this probably means you added an API with `version_how = "unknown"`. (The specific API(s) with this problem are listed immediately above this error.) You need to instead decide if it should be client-side or server-side versioned. If in doubt, ask the update team. +* `+missing field `versioned_how_reason`+`: this probably means that you added an API with `versioned_how = "client"`, but did not add a `versioned_how_reason`. This field is required. It's a free-text field that explains why we couldn't use server-side versioning for this API. +* `graph of server-managed components has a cycle (includes node: "omicron-nexus")`: this probably means that you added an API with `versioned_how = "server"`, but that created a circular dependency with other server-side-versioned components. You'll need to break the cycle by changing one of these components (probably your new one) to be client-managed instead. +* `API identified by client package "nexus-client" (Nexus Internal API) is the "client" in a "non-dag" dependency rule, but its "versioned_how" is not "client"` (the specific APIs may be different): this (unlikely) condition means just what it says: the API manifest file contains a dependency rule saying that some API is *not* part of the update DAG, but that API is not marked accordingly. See the documentation below on filter rules. == Details diff --git a/dev-tools/ls-apis/api-manifest.toml b/dev-tools/ls-apis/api-manifest.toml index 766708f478..75f4777a48 100644 --- a/dev-tools/ls-apis/api-manifest.toml +++ b/dev-tools/ls-apis/api-manifest.toml @@ -132,6 +132,28 @@ packages = [ "oximeter-collector" ] # # `server_package_name`: the package that contains the Dropshot API definition. 
# +# `versioned_how`: one of: +# +# "server": this component uses server-side versioning only, meaning that the +# update system can ensure that servers are always updated before +# clients. +# +# "client": this component uses client-side versioning, meaning that the +# update system cannot ensure that servers are always updated +# before clients, so clients need to deal with multiple different +# server versions. +# +# You must also specify `versioned_how_reason` if you use "client". +# +# "unknown": this has not been determined. This will break tests. But the +# tooling supports this value so that during development you can +# relax these constraints. +# +# [`versioned_how_reason`]: free text string explaining why `versioned_how` must +# be "client" for this API. This is printed in documentation and command output +# and serves as documentation for developers to understand why we made this +# choice. This should be present only if `versioned_how = "client"`. +# # [`notes`]: optional free-form human-readable summary documentation about this # API ################################################################################ @@ -140,11 +162,14 @@ packages = [ "oximeter-collector" ] client_package_name = "bootstrap-agent-client" label = "Bootstrap Agent" server_package_name = "bootstrap-agent-api" +versioned_how = "client" +versioned_how_reason = "depends on itself (i.e., instances call each other)" [[apis]] client_package_name = "clickhouse-admin-keeper-client" label = "Clickhouse Cluster Admin for Keepers" server_package_name = "clickhouse-admin-api" +versioned_how = "server" notes = """ This is the server running inside multi-node Clickhouse keeper zones that's \ responsible for local configuration and monitoring. @@ -154,6 +179,7 @@ responsible for local configuration and monitoring. client_package_name = "clickhouse-admin-server-client" label = "Clickhouse Cluster Admin for Servers" server_package_name = "clickhouse-admin-api" +versioned_how = "server" notes = """ This is the server running inside multi-node Clickhouse server zones that's \ responsible for local configuration and monitoring. @@ -163,6 +189,7 @@ responsible for local configuration and monitoring. client_package_name = "clickhouse-admin-single-client" label = "Clickhouse Single-Node Cluster Admin" server_package_name = "clickhouse-admin-api" +versioned_how = "server" notes = """ This is the server running inside single-node Clickhouse server zones that's \ responsible for local configuration and monitoring. @@ -172,6 +199,7 @@ responsible for local configuration and monitoring. client_package_name = "cockroach-admin-client" label = "CockroachDB Cluster Admin" server_package_name = "cockroach-admin-api" +versioned_how = "server" notes = """ This is the server running inside CockroachDB zones that performs \ configuration and monitoring that requires the `cockroach` CLI. @@ -181,11 +209,14 @@ configuration and monitoring that requires the `cockroach` CLI. 
client_package_name = "crucible-agent-client" label = "Crucible Agent" server_package_name = "crucible-agent" +versioned_how = "server" [[apis]] client_package_name = "repair-client" label = "Crucible Repair" server_package_name = "crucible-downstairs" +versioned_how = "client" +versioned_how_reason = "depends on itself (i.e., instances call each other)" notes = """ The repair service offered by a crucible-downstairs supports both repairing \ one downstairs from another, and making a clone of a read-only downstairs \ @@ -196,11 +227,13 @@ when creating a new region in the crucible agent. client_package_name = "crucible-pantry-client" label = "Crucible Pantry" server_package_name = "crucible-pantry" +versioned_how = "server" [[apis]] client_package_name = "ddm-admin-client" label = "Maghemite DDM Admin" server_package_name = "ddmd" +versioned_how = "server" notes = """ The `ddmd` server runs in each sled GZ and each switch zone. These daemons \ provide an interface for advertising network prefixes, and observing what \ @@ -216,11 +249,17 @@ observability APIs in the future. client_package_name = "dns-service-client" label = "DNS Server" server_package_name = "dns-server-api" +versioned_how = "server" [[apis]] client_package_name = "dpd-client" label = "Dendrite DPD" server_package_name = "dpd" +# TODO We might need to pick this apart more carefully. DPD invokes and is +# invoked by several different services. It's possible that some of them can +# treat it as server-managed. +versioned_how = "client" +versioned_how_reason = "Sled Agent calling DPD creates a circular dependency" notes = """ Dendrite's data plane daemon (`dpd`) is the interface to configure and manage \ the rack switches. It's consumed by sled-agent to get the rack off the \ @@ -235,17 +274,29 @@ repo is not currently open source. client_package_name = "gateway-client" label = "Management Gateway Service" server_package_name = "gateway-api" +versioned_how = "server" notes = "Wicketd is deployed in a unit with MGS so we can ignore that one." [[apis]] client_package_name = "installinator-client" label = "Wicketd Installinator" server_package_name = "installinator-api" +# The installinator-client is used only by Installinator. This is part of the +# recovery OS image. Today, this is not really "shipped" in the traditional +# sense. The only way it gets used today is by an operator uploading it to +# Wicket and then performing a MUPdate. In this sense, we do not control the +# client, so we mark this client-managed -- it's up to the operator to make sure +# they're using a TUF repo whose recovery image contains an Installinator that's +# compatible with the API served by Wicket on the deployed system. In practice, +# we're almost certainly just going to avoid changing this API. +versioned_how = "client" +versioned_how_reason = "client is provided implicitly by the operator" [[apis]] client_package_name = "mg-admin-client" label = "Maghemite MG Admin" server_package_name = "mgd" +versioned_how = "server" notes = """ The `mgd` daemon runs in each switch zone. This daemon is responsible for all \ external route management for a switch. It provides interfaces for static \ @@ -258,17 +309,25 @@ bring the rack up. client_package_name = "nexus-client" label = "Nexus Internal API" server_package_name = "nexus-internal-api" +# nexus-client has to be client-versioned today because it's got a cyclic +# dependency with sled-agent-client, which is server-versioned. 
+versioned_how = "client" +versioned_how_reason = "Circular dependencies between Nexus and other services" [[apis]] client_package_name = "oxide-client" label = "External API" server_package_name = "nexus-external-api" +# The versioning strategy for the external API is outside the scope of this +# tool. It doesn't matter what we put here. +versioned_how = "server" notes = "Special case, since we don't fully control all clients" [[apis]] client_package_name = "oximeter-client" label = "Oximeter" server_package_name = "oximeter-api" +versioned_how = "server" notes = """ Shared types for this interface are in `omicron-common`. The producer makes \ requests to Nexus, and receives them from `oximeter-collector`. \ @@ -281,6 +340,7 @@ and makes the periodic renewal requests to `oximeter-collector`. client_package_name = "propolis-client" label = "Propolis" server_package_name = "propolis-server" +versioned_how = "server" notes = """ Sled Agent is deployed in a unit with Propolis so we can ignore that one. """ @@ -289,16 +349,20 @@ Sled Agent is deployed in a unit with Propolis so we can ignore that one. client_package_name = "sled-agent-client" label = "Sled Agent" server_package_name = "sled-agent-api" +versioned_how = "server" [[apis]] client_package_name = "repo-depot-client" label = "Repo Depot API" server_package_name = "repo-depot-api" +versioned_how = "client" +versioned_how_reason = "depends on itself (i.e., instances call each other)" [[apis]] client_package_name = "wicketd-client" label = "Wicketd" server_package_name = "wicketd-api" +versioned_how = "server" notes = """ wicketd-client is only used by wicket, which is deployed in a unit with wicketd. """ @@ -307,6 +371,7 @@ wicketd-client is only used by wicket, which is deployed in a unit with wicketd. client_package_name = "crucible-control-client" label = "Crucible Control (for testing only)" server_package_name = "crucible" +versioned_how = "server" notes = """ Exposed by Crucible upstairs for debugging via the `cmon` debugging tool. """ @@ -315,6 +380,7 @@ Exposed by Crucible upstairs for debugging via the `cmon` debugging tool. client_package_name = "dsc-client" label = "Downstairs Controller (debugging only)" server_package_name = "dsc" +versioned_how = "server" dev_only = true notes = """ `dsc` is a control program for spinning up and controlling instances of Crucible @@ -400,6 +466,16 @@ DNS server depends on itself only to provide TransientServer, which is not a deployed component. """ +[[dependency_filter_rules]] +ancestor = "omicron-sled-agent" +client = "sled-agent-client" +evaluation = "not-deployed" +note = """ +Sled Agent uses a Sled Agent client in two ways: RSS, which we don't care about +for the purpose of upgrade, and in the `zone-bundle` binary, which is not +deployed. +""" + # # "Bogus" dependencies. See above for details. # diff --git a/dev-tools/ls-apis/src/api_metadata.rs b/dev-tools/ls-apis/src/api_metadata.rs index 311e95549f..01f133debe 100644 --- a/dev-tools/ls-apis/src/api_metadata.rs +++ b/dev-tools/ls-apis/src/api_metadata.rs @@ -108,6 +108,20 @@ impl AllApiMetadata { Ok(which_rules[0].evaluation) } + + /// Returns the list of APIs that have non-DAG dependency rules + pub(crate) fn non_dag_apis(&self) -> impl Iterator { + self.dependency_rules.iter().filter_map(|(client_pkgname, rules)| { + rules.iter().any(|r| r.evaluation == Evaluation::NonDag).then( + || { + // unwrap(): we previously verified that the "client" for + // all dependency rules corresponds to an API that we have + // metadata for. 
+ self.apis.get(client_pkgname).unwrap() + }, + ) + }) + } } /// Format of the `api-manifest.toml` file @@ -186,6 +200,23 @@ impl TryFrom for AllApiMetadata { } } +/// Describes how an API in the system manages drift between client and server +#[derive(Deserialize, PartialEq, Eq)] +#[serde(rename_all = "snake_case")] +#[serde(tag = "versioned_how", content = "versioned_how_reason")] +pub enum VersionedHow { + /// We have not yet determined how this API will be versioned. + Unknown, + + /// This API will be versioned solely on the server. (The update system + /// will ensure that servers are always updated before clients.) + Server, + + /// This API will be versioned on the client. (The update system cannot + /// ensure that servers are always updated before clients.) + Client(String), +} + /// Describes one API in the system #[derive(Deserialize)] pub struct ApiMetadata { @@ -199,6 +230,9 @@ pub struct ApiMetadata { pub server_package_name: ServerPackageName, /// human-readable notes about this API pub notes: Option, + /// describes how we've decided this API will be versioned + #[serde(default, flatten)] + pub versioned_how: VersionedHow, /// If `dev_only` is true, then this API's server is not deployed in a /// production system. It's only used in development environments. The /// default is that APIs *are* deployed. diff --git a/dev-tools/ls-apis/src/bin/ls-apis.rs b/dev-tools/ls-apis/src/bin/ls-apis.rs index 9d2a5b5349..ea0ae48cd4 100644 --- a/dev-tools/ls-apis/src/bin/ls-apis.rs +++ b/dev-tools/ls-apis/src/bin/ls-apis.rs @@ -4,12 +4,12 @@ //! Show information about Progenitor-based APIs -use anyhow::{Context, Result}; +use anyhow::{bail, Context, Result}; use camino::Utf8PathBuf; use clap::{Args, Parser, Subcommand}; use omicron_ls_apis::{ AllApiMetadata, ApiDependencyFilter, LoadArgs, ServerComponentName, - SystemApis, + SystemApis, VersionedHow, }; use parse_display::{Display, FromStr}; @@ -34,6 +34,8 @@ enum Cmds { Adoc, /// print out each API, what exports it, and what consumes it Apis(ShowDepsArgs), + /// check the update DAG and propose changes + Check, /// print out APIs exported and consumed by each deployment unit DeploymentUnits(DotArgs), /// print out APIs exported and consumed, by server component @@ -81,6 +83,7 @@ fn main() -> Result<()> { match cli_args.cmd { Cmds::Adoc => run_adoc(&apis), Cmds::Apis(args) => run_apis(&apis, args), + Cmds::Check => run_check(&apis), Cmds::DeploymentUnits(args) => run_deployment_units(&apis, args), Cmds::Servers(args) => run_servers(&apis, args), } @@ -93,12 +96,13 @@ fn run_adoc(apis: &SystemApis) -> Result<()> { println!( ".List of OpenAPI/Progenitor-based interfaces for online upgrade." 
); - println!(r#"[cols="1h,2,2,2a,2", options="header"]"#); + println!(r#"[cols="1h,2,2,2a,2,2", options="header"]"#); println!("|==="); println!("|API"); println!("|Server location (`repo:path`)"); println!("|Client packages (`repo:path`)"); println!("|Consumers (`repo:path`; excluding omdb and tests)"); + println!("|Versioning"); println!("|Notes"); println!(""); @@ -122,6 +126,14 @@ fn run_adoc(apis: &SystemApis) -> Result<()> { println!("* {}", apis.adoc_label(c)?); } + match &api.versioned_how { + VersionedHow::Unknown => println!("|TBD"), + VersionedHow::Server => println!("|Server-side only"), + VersionedHow::Client(reason) => { + println!("|Client-side ({})", reason); + } + }; + print!("|{}", api.notes.as_deref().unwrap_or("-\n")); println!(""); } @@ -261,3 +273,84 @@ impl TryFrom<&LsApis> for LoadArgs { Ok(LoadArgs { api_manifest_path }) } } + +fn run_check(apis: &SystemApis) -> Result<()> { + let dag_check = apis.dag_check()?; + + for (pkg, reasons) in dag_check.proposed_server_managed() { + println!( + "proposal: make {:?} server-managed: {}", + pkg, + reasons.join(", ") + ); + } + + for (pkg, reasons) in dag_check.proposed_client_managed() { + println!( + "proposal: make {:?} client-managed: {}", + pkg, + reasons.join(", ") + ); + } + + for (pkg1, pkg2) in dag_check.proposed_upick() { + println!( + "proposal: choose either {:?} or {:?} to be client-managed \ + (they directly depend on each other)", + pkg1, pkg2, + ); + } + + println!("\n"); + println!("Server-managed APIs:\n"); + for api in apis + .api_metadata() + .apis() + .filter(|f| f.deployed() && f.versioned_how == VersionedHow::Server) + { + println!( + " {} ({}, exposed by {})", + api.label, + api.client_package_name, + apis.api_producer(&api.client_package_name).unwrap() + ); + } + + println!("\n"); + println!("Client-managed API:\n"); + for api in apis.api_metadata().apis().filter(|f| f.deployed()) { + if let VersionedHow::Client(reason) = &api.versioned_how { + println!( + " {} ({}, exposed by {})", + api.label, + api.client_package_name, + apis.api_producer(&api.client_package_name).unwrap() + ); + println!(" reason: {}", reason); + } + } + + println!("\n"); + print!("APIs with unknown version management:"); + let unknown: Vec<_> = apis + .api_metadata() + .apis() + .filter(|f| f.versioned_how == VersionedHow::Unknown) + .collect(); + if unknown.is_empty() { + println!(" none"); + } else { + println!("\n"); + for api in unknown { + println!( + " {} ({}, exposed by {})", + api.label, + api.client_package_name, + apis.api_producer(&api.client_package_name).unwrap() + ); + } + bail!("at least one API has unknown version strategy (see above)"); + } + + Ok(()) +} diff --git a/dev-tools/ls-apis/src/lib.rs b/dev-tools/ls-apis/src/lib.rs index 8d4f760c8b..c27606c25f 100644 --- a/dev-tools/ls-apis/src/lib.rs +++ b/dev-tools/ls-apis/src/lib.rs @@ -10,6 +10,7 @@ mod system_apis; mod workspaces; pub use api_metadata::AllApiMetadata; +pub use api_metadata::VersionedHow; pub use system_apis::ApiDependencyFilter; pub use system_apis::SystemApis; diff --git a/dev-tools/ls-apis/src/system_apis.rs b/dev-tools/ls-apis/src/system_apis.rs index 5d4a84160a..ef6dae0e43 100644 --- a/dev-tools/ls-apis/src/system_apis.rs +++ b/dev-tools/ls-apis/src/system_apis.rs @@ -8,6 +8,7 @@ use crate::api_metadata::AllApiMetadata; use crate::api_metadata::ApiMetadata; use crate::api_metadata::Evaluation; +use crate::api_metadata::VersionedHow; use crate::cargo::DepPath; use crate::parse_toml_file; use crate::workspaces::Workspaces; @@ -16,11 +17,13 @@ 
use crate::DeploymentUnitName; use crate::LoadArgs; use crate::ServerComponentName; use crate::ServerPackageName; -use anyhow::{anyhow, bail, Context, Result}; +use anyhow::Result; +use anyhow::{anyhow, bail, Context}; use camino::Utf8PathBuf; use cargo_metadata::Package; use parse_display::{Display, FromStr}; use petgraph::dot::Dot; +use petgraph::graph::NodeIndex; use std::collections::BTreeMap; use std::collections::BTreeSet; @@ -369,12 +372,27 @@ impl SystemApis { &self, filter: ApiDependencyFilter, ) -> Result { + let (graph, _nodes) = self.make_component_graph(filter, false)?; + Ok(Dot::new(&graph).to_string()) + } + + // The complex type below is only used in this one place: the return value + // of an internal helper function. A type alias doesn't seem better. + #[allow(clippy::type_complexity)] + fn make_component_graph( + &self, + dependency_filter: ApiDependencyFilter, + versioned_on_server_only: bool, + ) -> Result<( + petgraph::graph::Graph<&ServerComponentName, &ClientPackageName>, + BTreeMap<&ServerComponentName, NodeIndex>, + )> { let mut graph = petgraph::graph::Graph::new(); let nodes: BTreeMap<_, _> = self .server_component_units .keys() .map(|server_component| { - (server_component.clone(), graph.add_node(server_component)) + (server_component, graph.add_node(server_component)) }) .collect(); @@ -383,16 +401,350 @@ impl SystemApis { for server_component in self.apis_consumed.keys() { // unwrap(): we created a node for each server component above. let my_node = nodes.get(server_component).unwrap(); - let consumed_apis = - self.component_apis_consumed(server_component, filter)?; + let consumed_apis = self + .component_apis_consumed(server_component, dependency_filter)?; for (client_pkg, _) in consumed_apis { + if versioned_on_server_only { + let api = self + .api_metadata + .client_pkgname_lookup(client_pkg) + .unwrap(); + if api.versioned_how != VersionedHow::Server { + continue; + } + } + let other_component = self.api_producer(client_pkg).unwrap(); let other_node = nodes.get(other_component).unwrap(); - graph.add_edge(*my_node, *other_node, client_pkg.clone()); + graph.add_edge(*my_node, *other_node, client_pkg); } } - Ok(Dot::new(&graph).to_string()) + Ok((graph, nodes)) + } + + /// Verifies various important properties about the assignment of which APIs + /// are server-managed vs. client-managed. + /// + /// Returns a structure with proposals for how to assign APIs that are + /// currently unassigned. + pub fn dag_check(&self) -> Result> { + // In this function, we'll use the following ApiDependencyFilter a bunch + // when walking the component dependency graph. "Default" is the + // correct filter to use here. This excludes relationships that are + // totally bogus, only affect components that are never actually + // deployed, or are part of an edge that we've already determined will + // be "non-DAG". + // + // This last case might be a little confusing. The whole point of this + // function is to help developers figure out which edges should be part + // of the DAG or not. Why would we ignore edges based on whether + // they're already in the DAG or not? + // + // Recall that there are two ways that the metadata can specify that a + // particular API is "not part of the update DAG" (which is equivalent + // to client-side-managed): + // + // - a specific class of Cargo dependencies can be marked "non-DAG" via + // a dependency filter rule. 
The only case of this today is where we + // say that Cargo dependencies from "oximeter-producer" to + // "nexus-client" are "non-DAG". This means we promise to make the + // Nexus internal API client-managed (i.e., not part of the update + // DAG). We verify this promise below. + // - a specific API can be marked as server-managed (meaning it's part + // of the update DAG) or not. That's most of what this function deals + // with and proposes changes to. + // + // In the long term, it might be nice to combine these. But that's more + // work than it sounds like: we'd probably want to convert everything + // to filter rules, but that requires (tediously) writing out every + // single edge that we care about. An alternative would be to eliminate + // the non-DAG dependency filter rules and only use the property at the + // API level. However right now it seems quite possible that we do want + // this on a per-edge basis, rather than a per-API basis (i.e., there + // are some client-side-versioned APIs that have consumers that could + // treat them as server-side-versioned). + // + // Anyway, what we're talking about here is ignoring the first category + // of information and looking only at the second. This is *safe* (i.e., + // correct) because we verify below that the second category (the + // API-level `versioned_for` property) contains the same information + // provided by the first category (the non-DAG dependency filter rules). + // We *choose* to do this because it makes the heuristics below more + // useful. For example, excluding the non-DAG edges makes it easy for + // the heuristic below to tell that crucible-pantry ought to be + // server-side-managed because it has no (other) dependencies and so + // can't be part of a cycle. + let filter = ApiDependencyFilter::Default; + + // Construct a graph where: + // + // - nodes are all the API producer and consumer components + // - we only include edges *to* components that produce server-managed + // APIs + // + // Check if this DAG is cyclic. This can't be made to work. + let (graph, nodes) = self.make_component_graph(filter, true)?; + let reverse_nodes: BTreeMap<_, _> = + nodes.iter().map(|(s_c, node)| (node, s_c)).collect(); + if let Err(error) = petgraph::algo::toposort(&graph, None) { + bail!( + "graph of server-managed components has a cycle (includes \ + node: {:?})", + reverse_nodes.get(&error.node_id()).unwrap() + ); + } + + // Verify that the targets of any "non-dag" dependency filter rules are + // indeed not part of the server-side-versioned DAG. + for api in self.api_metadata.non_dag_apis() { + if !matches!(api.versioned_how, VersionedHow::Client(..)) { + bail!( + "API identified by client package {:?} ({}) is the \ + \"client\" in a \"non-dag\" dependency rule, but its \ + \"versioned_how\" is not \"client\"", + api.client_package_name, + api.label, + ); + } + } + + // Use some heuristics to propose next steps. + // + // We're only looking for possible next steps here -- we don't have to + // programmatically figure out the whole graph. + + let mut dag_check = DagCheck::new(); + + for api in self.api_metadata.apis() { + if !api.deployed() { + if api.versioned_how == VersionedHow::Unknown { + dag_check.propose_server( + &api.client_package_name, + String::from("not produced by a deployed component"), + ); + } + continue; + } + let producer = self.api_producer(&api.client_package_name).unwrap(); + let apis_consumed: BTreeSet<_> = self + .component_apis_consumed(producer, filter)? 
+ .map(|(client_pkgname, _dep_path)| client_pkgname) + .collect(); + let dependents: BTreeSet<_> = self + .api_consumers(&api.client_package_name, filter) + .unwrap() + .map(|(dependent, _dep_path)| dependent) + .collect(); + + if api.versioned_how == VersionedHow::Unknown { + // If we haven't determined how to manage versioning on this + // API, and it has no dependencies on "unknown" or + // client-managed APIs, then it can be made server-managed. + if !apis_consumed.iter().any(|client_pkgname| { + let api = self + .api_metadata + .client_pkgname_lookup(*client_pkgname) + .unwrap(); + api.versioned_how != VersionedHow::Server + }) { + dag_check.propose_server( + &api.client_package_name, + String::from( + "has no unknown or client-managed dependencies", + ), + ); + } else if apis_consumed.contains(&api.client_package_name) { + // If this thing depends on itself, it must be + // client-managed. + dag_check.propose_client( + &api.client_package_name, + String::from("depends on itself"), + ); + } else if dependents.is_empty() { + // If something has no consumers in deployed components, it + // can be server-managed. (These are generally debug APIs.) + dag_check.propose_server( + &api.client_package_name, + String::from( + "has no consumers among deployed components", + ), + ); + } + + continue; + } + + let dependencies: BTreeMap<_, _> = apis_consumed + .iter() + .map(|dependency_clientpkg| { + ( + self.api_producer(dependency_clientpkg).unwrap(), + *dependency_clientpkg, + ) + }) + .collect(); + + // Look for one-step circular dependencies (i.e., API API A1 is + // produced by component C1, which uses API A2 produced by C2, which + // also uses A1). In such cases, either A1 or A2 must be + // client-managed (or both). + for other_pkgname in dependents { + if let Some(dependency_clientpkg) = + dependencies.get(other_pkgname) + { + let dependency_api = self + .api_metadata + .client_pkgname_lookup(*dependency_clientpkg) + .unwrap(); + + // If we're looking at a server-managed dependency and the + // other is unknown, then that one should be client-managed. + // + // Without loss of generality, we can ignore the reverse + // case (because we will catch that case when we're + // iterating over the dependency API). + if api.versioned_how == VersionedHow::Server + && dependency_api.versioned_how == VersionedHow::Unknown + { + dag_check.propose_client( + dependency_clientpkg, + format!( + "has cyclic dependency on {:?}, which is \ + server-managed", + api.client_package_name, + ), + ) + } + + // If both are Unknown, tell the user to pick one. 
+ if api.versioned_how == VersionedHow::Unknown + && dependency_api.versioned_how == VersionedHow::Unknown + { + dag_check.propose_upick( + &api.client_package_name, + dependency_clientpkg, + ); + } + } + } + } + + Ok(dag_check) + } +} + +/// Describes proposals for assigning how APIs should be versioned, based on +/// heuristics applied while checking the DAG +pub struct DagCheck<'a> { + /// set of APIs (identified by client package name) that we propose should + /// be server-managed, along with a list of reasons why we think so + proposed_server_managed: BTreeMap<&'a ClientPackageName, Vec>, + /// set of APIs (identified by client package name) that we propose should + /// be client-managed, along with a list of reasons why we think so + proposed_client_managed: BTreeMap<&'a ClientPackageName, Vec>, + /// set of pairs of APIs where we propose that the user must pick one + /// package in each pair to be client-managed (because the two packages have + /// a mutual dependency) + /// + /// The ordering in these pairs is not semantically significant. The + /// implementation will ensure that each pair of packages is represented at + /// most once in this structure. + proposed_upick: + BTreeMap<&'a ClientPackageName, BTreeSet<&'a ClientPackageName>>, +} + +impl<'a> DagCheck<'a> { + fn new() -> DagCheck<'a> { + DagCheck { + proposed_server_managed: BTreeMap::new(), + proposed_client_managed: BTreeMap::new(), + proposed_upick: BTreeMap::new(), + } + } + + fn propose_client( + &mut self, + client_pkgname: &'a ClientPackageName, + reason: String, + ) { + self.proposed_client_managed + .entry(client_pkgname) + .or_insert_with(Vec::new) + .push(reason); + } + + fn propose_server( + &mut self, + client_pkgname: &'a ClientPackageName, + reason: String, + ) { + self.proposed_server_managed + .entry(client_pkgname) + .or_insert_with(Vec::new) + .push(reason); + } + + /// Propose that one of these two packages should be client-managed (because + /// they depend on each other, so they can't both be server-managed). + fn propose_upick( + &mut self, + client_pkgname1: &'a ClientPackageName, + client_pkgname2: &'a ClientPackageName, + ) { + // A "upick" is a situation where you (the person running the tool) + // should choose either of `pkg1` or `pkg2` to be client-managed. The + // caller will identify this situation twice: once when looking at + // `pkg1` and once when looking at `pkg2`. But we only want to report + // it once. So we'll ignore duplicates here because it's easier here + // than in the caller. + // + // To do that, first check whether the caller has already proposed this + // "upick" with the packages in the other order. If so, do nothing. + if let Some(other_pkg_upicks) = self.proposed_upick.get(client_pkgname2) + { + if other_pkg_upicks.contains(client_pkgname1) { + return; + } + } + + // Now go ahead and insert the pair in this order. This construction + // will also do nothing if this same pair has already been inserted in + // this order. 
+ self.proposed_upick + .entry(client_pkgname1) + .or_insert_with(BTreeSet::new) + .insert(client_pkgname2); + } + + /// Returns a list of APIs (identified by client package name) that look + /// like they could use server-side versioning, along with reasons + pub fn proposed_server_managed( + &self, + ) -> impl Iterator)> { + self.proposed_server_managed.iter().map(|(c, r)| (*c, r)) + } + + /// Returns a list of APIs (identified by client package name) that look + /// like they should use client-side versioning, along with reasons + pub fn proposed_client_managed( + &self, + ) -> impl Iterator)> { + self.proposed_client_managed.iter().map(|(c, r)| (*c, r)) + } + + /// Returns a list of pairs of APIs (identified by client package names) for + /// which we have not yet picked client-side or server-side versioning and + /// where there is a direct mutual dependency + /// + /// At least one of these APIs will need to be marked client-managed. + pub fn proposed_upick( + &self, + ) -> impl Iterator + { + self.proposed_upick + .iter() + .flat_map(|(c, others)| others.iter().map(|o| (*c, *o))) } } diff --git a/dev-tools/ls-apis/tests/api_dependencies.out b/dev-tools/ls-apis/tests/api_dependencies.out index 8cd2f0a085..8e061f2906 100644 --- a/dev-tools/ls-apis/tests/api_dependencies.out +++ b/dev-tools/ls-apis/tests/api_dependencies.out @@ -79,7 +79,6 @@ Repo Depot API (client: repo-depot-client) Sled Agent (client: sled-agent-client) consumed by: dpd (dendrite/dpd) via 1 path consumed by: omicron-nexus (omicron/nexus) via 7 paths - consumed by: omicron-sled-agent (omicron/sled-agent) via 1 path Wicketd (client: wicketd-client) From 85feb0952e1fde1ef322d970e3f41145f9a0caf4 Mon Sep 17 00:00:00 2001 From: Andy Fiddaman Date: Mon, 16 Dec 2024 11:21:35 +0000 Subject: [PATCH 09/10] Ensure services do not start in /root (#7166) By default, an SMF service which has a non-trivial context will start in the user's home directory. For services started as 'root' this means /root. While a lot of services will subsequently chdir elsewhere, we want to avoid anything using /root, even briefly, as it will soon be a separate ZFS dataset on an Oxide system. 
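Concretely, the change pins the start directory via the SMF method context. The snippet below is an illustrative sketch, not the verbatim manifest diff: the `working_directory` attribute on `method_context` overrides the default of starting in the method user's home directory, and the path and exec values shown here are example placeholders only.

```xml
<!--
  Illustrative sketch only: pinning working_directory keeps a root-owned
  service from starting in /root. The attribute value and exec path are
  examples, not necessarily the ones used in this change.
-->
<exec_method type="method" name="start"
             exec="/opt/oxide/example/start.sh" timeout_seconds="0">
  <method_context working_directory="/var/tmp">
    <method_credential user="root" group="root" />
  </method_context>
</exec_method>
```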
--- smf/chrony-setup/manifest.xml | 2 +- smf/cockroachdb/manifest.xml | 2 +- smf/ntp/manifest/manifest.xml | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/smf/chrony-setup/manifest.xml b/smf/chrony-setup/manifest.xml index 7882693495..8127ca80df 100644 --- a/smf/chrony-setup/manifest.xml +++ b/smf/chrony-setup/manifest.xml @@ -14,7 +14,7 @@ - + diff --git a/smf/cockroachdb/manifest.xml b/smf/cockroachdb/manifest.xml index 67ddbe48b8..6ed18b1bca 100644 --- a/smf/cockroachdb/manifest.xml +++ b/smf/cockroachdb/manifest.xml @@ -19,7 +19,7 @@ - + diff --git a/smf/ntp/manifest/manifest.xml b/smf/ntp/manifest/manifest.xml index 5b45406a47..8fa1b4ebc4 100644 --- a/smf/ntp/manifest/manifest.xml +++ b/smf/ntp/manifest/manifest.xml @@ -66,7 +66,7 @@ - + From ca21fe769b0f1c3f08fd8858bc53d5c207b2f7dc Mon Sep 17 00:00:00 2001 From: John Gallagher Date: Mon, 16 Dec 2024 09:04:00 -0500 Subject: [PATCH 10/10] [reconfigurator] SledEditor: be more strict about decommissioned sleds (#7234) This is a followup from https://github.com/oxidecomputer/omicron/pull/7204#discussion_r1870465378 and makes two changes, neither of which should affect behavior: * `SledEditor` will now fail if a caller attempts to make changes to a decommissioned sled (internally, this is statically enforced by a type state enum - a sled in the decommissioned state does not have any methods that support editing, so we're forced to return an error) * `SledEditor::set_state()` is now `SledEditor::decommission()`, and it performs some checks that the sled looks decommissionable The second bullet is more questionable than I expected it to be: 1. There are some arguments that `SledEditor` shouldn't do any checks here; in particular, it doesn't have the full context (e.g., any checks on "should we decommission this sled" that depend on the `PlanningInput` can't be performed here, because `SledEditor` intentionally doesn't have access to `PlanningInput`). 2. I wanted to check zones + disks + datasets, but in practice it can only check zones today; I left a comment (and the commented-out disks + datasets checks we should do) about why. I think we will eventually be able to turn these on; the current behavior of removing disks/datasets from the blueprint for expunged sleds will have to change to fix #7078, at which point these checks should be valid. I don't feel super strongly about the checks in `decommission()` or even this PR as a whole; if this doesn't look like a useful direction, I'd be fine with discarding it. Please review with a pretty critical eye. 
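The type-state enum mentioned in the first bullet works roughly as sketched below: only the active variant hands out an editor with mutating methods, so every edit funnels through one chokepoint and a decommissioned sled uniformly yields `SledEditError::EditDecommissioned`. This is a condensed illustration; the real editor in the diff below carries zones, disks, and datasets editors.

```rust
// Condensed sketch of the type-state pattern; payload types are simplified
// placeholders for the real editor state.
struct ActiveSledEditor; // would hold zones, disks, and datasets editors
struct EditedSled; // frozen, read-only result for a decommissioned sled

#[derive(Debug)]
enum SledEditError {
    EditDecommissioned,
}

enum SledEditor {
    Active(ActiveSledEditor),
    Decommissioned(EditedSled),
}

impl SledEditor {
    // Every mutating operation funnels through this accessor, so there is a
    // single place where edits to a decommissioned sled are rejected.
    fn as_active_mut(&mut self) -> Result<&mut ActiveSledEditor, SledEditError> {
        match self {
            SledEditor::Active(editor) => Ok(editor),
            SledEditor::Decommissioned(_) => Err(SledEditError::EditDecommissioned),
        }
    }

    fn add_zone(&mut self /* , zone: BlueprintZoneConfig */) -> Result<(), SledEditError> {
        let _active = self.as_active_mut()?;
        // ... perform the edit on the active editor ...
        Ok(())
    }
}
```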
--- Cargo.lock | 1 + nexus/reconfigurator/planning/Cargo.toml | 1 + .../planning/src/blueprint_builder/builder.rs | 46 ++-- .../src/blueprint_editor/sled_editor.rs | 255 +++++++++++++++++- .../blueprint_editor/sled_editor/datasets.rs | 6 +- nexus/reconfigurator/planning/src/planner.rs | 3 +- 6 files changed, 278 insertions(+), 34 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 6a44c7af3d..44ef4eccc5 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -6071,6 +6071,7 @@ dependencies = [ "indexmap 2.6.0", "internal-dns-resolver", "ipnet", + "itertools 0.13.0", "maplit", "nexus-config", "nexus-inventory", diff --git a/nexus/reconfigurator/planning/Cargo.toml b/nexus/reconfigurator/planning/Cargo.toml index 43a65ad085..efed173a71 100644 --- a/nexus/reconfigurator/planning/Cargo.toml +++ b/nexus/reconfigurator/planning/Cargo.toml @@ -16,6 +16,7 @@ illumos-utils.workspace = true indexmap.workspace = true internal-dns-resolver.workspace = true ipnet.workspace = true +itertools.workspace = true nexus-config.workspace = true nexus-inventory.workspace = true nexus-sled-agent-shared.workspace = true diff --git a/nexus/reconfigurator/planning/src/blueprint_builder/builder.rs b/nexus/reconfigurator/planning/src/blueprint_builder/builder.rs index 394133132b..74e91ff943 100644 --- a/nexus/reconfigurator/planning/src/blueprint_builder/builder.rs +++ b/nexus/reconfigurator/planning/src/blueprint_builder/builder.rs @@ -228,6 +228,14 @@ pub struct SledEditCounts { } impl SledEditCounts { + pub fn zeroes() -> Self { + Self { + disks: EditCounts::zeroes(), + datasets: EditCounts::zeroes(), + zones: EditCounts::zeroes(), + } + } + fn has_nonzero_counts(&self) -> bool { let Self { disks, datasets, zones } = self; disks.has_nonzero_counts() @@ -548,7 +556,7 @@ impl<'a> BlueprintBuilder<'a> { generation: Generation::new(), datasets: BTreeMap::new(), }); - let editor = SledEditor::new( + let editor = SledEditor::for_existing( state, zones.clone(), disks, @@ -565,8 +573,7 @@ impl<'a> BlueprintBuilder<'a> { // that weren't in the parent blueprint. (These are newly-added sleds.) for sled_id in input.all_sled_ids(SledFilter::Commissioned) { if let Entry::Vacant(slot) = sled_editors.entry(sled_id) { - slot.insert(SledEditor::new_empty( - SledState::Active, + slot.insert(SledEditor::for_new_active( build_preexisting_dataset_ids(sled_id)?, )); } @@ -735,8 +742,8 @@ impl<'a> BlueprintBuilder<'a> { .retain(|sled_id, _| in_service_sled_ids.contains(sled_id)); // If we have the clickhouse cluster setup enabled via policy and we - // don't yet have a `ClickhouseClusterConfiguration`, then we must create - // one and feed it to our `ClickhouseAllocator`. + // don't yet have a `ClickhouseClusterConfiguration`, then we must + // create one and feed it to our `ClickhouseAllocator`. let clickhouse_allocator = if self.input.clickhouse_cluster_enabled() { let parent_config = self .parent_blueprint @@ -815,18 +822,18 @@ impl<'a> BlueprintBuilder<'a> { } /// Set the desired state of the given sled. - pub fn set_sled_state( + pub fn set_sled_decommissioned( &mut self, sled_id: SledUuid, - desired_state: SledState, ) -> Result<(), Error> { let editor = self.sled_editors.get_mut(&sled_id).ok_or_else(|| { Error::Planner(anyhow!( "tried to set sled state for unknown sled {sled_id}" )) })?; - editor.set_state(desired_state); - Ok(()) + editor + .decommission() + .map_err(|err| Error::SledEditError { sled_id, err }) } /// Within tests, set an RNG for deterministic results. 
@@ -1046,15 +1053,18 @@ impl<'a> BlueprintBuilder<'a> { // blueprint for (disk_id, (zpool, disk)) in database_disks { database_disk_ids.insert(disk_id); - editor.ensure_disk( - BlueprintPhysicalDiskConfig { - disposition: BlueprintPhysicalDiskDisposition::InService, - identity: disk.disk_identity.clone(), - id: disk_id, - pool_id: *zpool, - }, - &mut self.rng, - ); + editor + .ensure_disk( + BlueprintPhysicalDiskConfig { + disposition: + BlueprintPhysicalDiskDisposition::InService, + identity: disk.disk_identity.clone(), + id: disk_id, + pool_id: *zpool, + }, + &mut self.rng, + ) + .map_err(|err| Error::SledEditError { sled_id, err })?; } // Remove any disks that appear in the blueprint, but not the database diff --git a/nexus/reconfigurator/planning/src/blueprint_editor/sled_editor.rs b/nexus/reconfigurator/planning/src/blueprint_editor/sled_editor.rs index 13094b97a4..995cc306a5 100644 --- a/nexus/reconfigurator/planning/src/blueprint_editor/sled_editor.rs +++ b/nexus/reconfigurator/planning/src/blueprint_editor/sled_editor.rs @@ -7,18 +7,25 @@ use crate::blueprint_builder::SledEditCounts; use crate::planner::PlannerRng; use illumos_utils::zpool::ZpoolName; +use itertools::Either; +use nexus_sled_agent_shared::inventory::ZoneKind; use nexus_types::deployment::blueprint_zone_type; use nexus_types::deployment::BlueprintDatasetsConfig; use nexus_types::deployment::BlueprintPhysicalDiskConfig; use nexus_types::deployment::BlueprintPhysicalDisksConfig; use nexus_types::deployment::BlueprintZoneConfig; +use nexus_types::deployment::BlueprintZoneDisposition; use nexus_types::deployment::BlueprintZoneFilter; use nexus_types::deployment::BlueprintZoneType; use nexus_types::deployment::BlueprintZonesConfig; use nexus_types::deployment::DiskFilter; use nexus_types::external_api::views::SledState; +use omicron_common::disk::DatasetKind; +use omicron_uuid_kinds::DatasetUuid; use omicron_uuid_kinds::OmicronZoneUuid; use omicron_uuid_kinds::PhysicalDiskUuid; +use omicron_uuid_kinds::ZpoolUuid; +use std::mem; mod datasets; mod disks; @@ -50,6 +57,32 @@ pub enum SledInputError { #[derive(Debug, thiserror::Error)] pub enum SledEditError { + #[error("editing a decommissioned sled is not allowed")] + EditDecommissioned, + #[error( + "sled is not decommissionable: \ + disk {disk_id} (zpool {zpool_id}) is in service" + )] + NonDecommissionableDiskInService { + disk_id: PhysicalDiskUuid, + zpool_id: ZpoolUuid, + }, + #[error( + "sled is not decommissionable: \ + dataset {dataset_id} (kind {kind:?}) is in service" + )] + NonDecommissionableDatasetInService { + dataset_id: DatasetUuid, + kind: DatasetKind, + }, + #[error( + "sled is not decommissionable: \ + zone {zone_id} (kind {kind:?}) is not expunged" + )] + NonDecommissionableZoneNotExpunged { + zone_id: OmicronZoneUuid, + kind: ZoneKind, + }, #[error("failed to edit disks")] EditDisks(#[from] DisksEditError), #[error("failed to edit datasets")] @@ -74,8 +107,196 @@ pub enum SledEditError { } #[derive(Debug)] -pub(crate) struct SledEditor { - state: SledState, +pub(crate) struct SledEditor(InnerSledEditor); + +#[derive(Debug)] +enum InnerSledEditor { + // Internally, `SledEditor` has a variant for each variant of `SledState`, + // as the operations allowed in different states are substantially different + // (i.e., an active sled allows any edit; a decommissioned sled allows + // none). 
+ Active(ActiveSledEditor), + Decommissioned(EditedSled), +} + +impl SledEditor { + pub fn for_existing( + state: SledState, + zones: BlueprintZonesConfig, + disks: BlueprintPhysicalDisksConfig, + datasets: BlueprintDatasetsConfig, + preexisting_dataset_ids: DatasetIdsBackfillFromDb, + ) -> Result { + match state { + SledState::Active => { + let inner = ActiveSledEditor::new( + zones, + disks, + datasets, + preexisting_dataset_ids, + )?; + Ok(Self(InnerSledEditor::Active(inner))) + } + SledState::Decommissioned => { + let inner = EditedSled { + state, + zones, + disks, + datasets, + edit_counts: SledEditCounts::zeroes(), + }; + Ok(Self(InnerSledEditor::Decommissioned(inner))) + } + } + } + + pub fn for_new_active( + preexisting_dataset_ids: DatasetIdsBackfillFromDb, + ) -> Self { + Self(InnerSledEditor::Active(ActiveSledEditor::new_empty( + preexisting_dataset_ids, + ))) + } + + pub fn finalize(self) -> EditedSled { + match self.0 { + InnerSledEditor::Active(editor) => editor.finalize(), + InnerSledEditor::Decommissioned(edited) => edited, + } + } + + pub fn edit_counts(&self) -> SledEditCounts { + match &self.0 { + InnerSledEditor::Active(editor) => editor.edit_counts(), + InnerSledEditor::Decommissioned(edited) => edited.edit_counts, + } + } + + pub fn decommission(&mut self) -> Result<(), SledEditError> { + match &mut self.0 { + InnerSledEditor::Active(editor) => { + // Decommissioning a sled is a one-way trip that has many + // preconditions. We can't check all of them here (e.g., we + // should kick the sled out of trust quorum before + // decommissioning, which is entirely outside the realm of + // `SledEditor`. But we can do some basic checks: all of the + // disks, datasets, and zones for this sled should be expunged. + editor.validate_decommisionable()?; + + // We can't take ownership of `editor` from the `&mut self` + // reference we have, and we need ownership to call + // `finalize()`. Steal the contents via `mem::swap()` with an + // empty editor. This isn't panic safe (i.e., if we panic + // between the `mem::swap()` and the reassignment to `self.0` + // below, we'll be left in the active state with an empty sled + // editor), but omicron in general is not panic safe and aborts + // on panic. Plus `finalize()` should never panic. + let mut stolen = ActiveSledEditor::new_empty( + DatasetIdsBackfillFromDb::empty(), + ); + mem::swap(editor, &mut stolen); + + let mut finalized = stolen.finalize(); + finalized.state = SledState::Decommissioned; + self.0 = InnerSledEditor::Decommissioned(finalized); + } + // If we're already decommissioned, there's nothing to do. 
+ InnerSledEditor::Decommissioned(_) => (), + } + Ok(()) + } + + pub fn disks( + &self, + filter: DiskFilter, + ) -> impl Iterator { + match &self.0 { + InnerSledEditor::Active(editor) => { + Either::Left(editor.disks(filter)) + } + InnerSledEditor::Decommissioned(edited) => Either::Right( + edited + .disks + .disks + .iter() + .filter(move |disk| disk.disposition.matches(filter)), + ), + } + } + + pub fn zones( + &self, + filter: BlueprintZoneFilter, + ) -> impl Iterator { + match &self.0 { + InnerSledEditor::Active(editor) => { + Either::Left(editor.zones(filter)) + } + InnerSledEditor::Decommissioned(edited) => Either::Right( + edited + .zones + .zones + .iter() + .filter(move |zone| zone.disposition.matches(filter)), + ), + } + } + + fn as_active_mut( + &mut self, + ) -> Result<&mut ActiveSledEditor, SledEditError> { + match &mut self.0 { + InnerSledEditor::Active(editor) => Ok(editor), + InnerSledEditor::Decommissioned(_) => { + Err(SledEditError::EditDecommissioned) + } + } + } + + pub fn ensure_disk( + &mut self, + disk: BlueprintPhysicalDiskConfig, + rng: &mut PlannerRng, + ) -> Result<(), SledEditError> { + self.as_active_mut()?.ensure_disk(disk, rng); + Ok(()) + } + + pub fn expunge_disk( + &mut self, + disk_id: &PhysicalDiskUuid, + ) -> Result<(), SledEditError> { + self.as_active_mut()?.expunge_disk(disk_id) + } + + pub fn add_zone( + &mut self, + zone: BlueprintZoneConfig, + rng: &mut PlannerRng, + ) -> Result<(), SledEditError> { + self.as_active_mut()?.add_zone(zone, rng) + } + + pub fn expunge_zone( + &mut self, + zone_id: &OmicronZoneUuid, + ) -> Result<(), SledEditError> { + self.as_active_mut()?.expunge_zone(zone_id) + } + + /// Backwards compatibility / test helper: If we're given a blueprint that + /// has zones but wasn't created via `SledEditor`, it might not have + /// datasets for all its zones. This method backfills them. + pub fn ensure_datasets_for_running_zones( + &mut self, + rng: &mut PlannerRng, + ) -> Result<(), SledEditError> { + self.as_active_mut()?.ensure_datasets_for_running_zones(rng) + } +} + +#[derive(Debug)] +struct ActiveSledEditor { zones: ZonesEditor, disks: DisksEditor, datasets: DatasetsEditor, @@ -90,16 +311,14 @@ pub(crate) struct EditedSled { pub edit_counts: SledEditCounts, } -impl SledEditor { +impl ActiveSledEditor { pub fn new( - state: SledState, zones: BlueprintZonesConfig, disks: BlueprintPhysicalDisksConfig, datasets: BlueprintDatasetsConfig, preexisting_dataset_ids: DatasetIdsBackfillFromDb, ) -> Result { Ok(Self { - state, zones: zones.try_into()?, disks: disks.try_into()?, datasets: DatasetsEditor::new(datasets, preexisting_dataset_ids)?, @@ -107,11 +326,9 @@ impl SledEditor { } pub fn new_empty( - state: SledState, preexisting_dataset_ids: DatasetIdsBackfillFromDb, ) -> Self { Self { - state, zones: ZonesEditor::empty(), disks: DisksEditor::empty(), datasets: DatasetsEditor::empty(preexisting_dataset_ids), @@ -123,7 +340,7 @@ impl SledEditor { let (datasets, datasets_counts) = self.datasets.finalize(); let (zones, zones_counts) = self.zones.finalize(); EditedSled { - state: self.state, + state: SledState::Active, zones, disks, datasets, @@ -135,6 +352,24 @@ impl SledEditor { } } + fn validate_decommisionable(&self) -> Result<(), SledEditError> { + // ... and all zones are expunged. 
+ if let Some(zone) = self.zones(BlueprintZoneFilter::All).find(|zone| { + match zone.disposition { + BlueprintZoneDisposition::InService + | BlueprintZoneDisposition::Quiesced => true, + BlueprintZoneDisposition::Expunged => false, + } + }) { + return Err(SledEditError::NonDecommissionableZoneNotExpunged { + zone_id: zone.id, + kind: zone.zone_type.kind(), + }); + } + + Ok(()) + } + pub fn edit_counts(&self) -> SledEditCounts { SledEditCounts { disks: self.disks.edit_counts(), @@ -143,10 +378,6 @@ impl SledEditor { } } - pub fn set_state(&mut self, new_state: SledState) { - self.state = new_state; - } - pub fn disks( &self, filter: DiskFilter, diff --git a/nexus/reconfigurator/planning/src/blueprint_editor/sled_editor/datasets.rs b/nexus/reconfigurator/planning/src/blueprint_editor/sled_editor/datasets.rs index de397b9caa..211af59aab 100644 --- a/nexus/reconfigurator/planning/src/blueprint_editor/sled_editor/datasets.rs +++ b/nexus/reconfigurator/planning/src/blueprint_editor/sled_editor/datasets.rs @@ -7,7 +7,6 @@ use crate::planner::PlannerRng; use illumos_utils::zpool::ZpoolName; use nexus_types::deployment::BlueprintDatasetConfig; use nexus_types::deployment::BlueprintDatasetDisposition; -use nexus_types::deployment::BlueprintDatasetFilter; use nexus_types::deployment::BlueprintDatasetsConfig; use nexus_types::deployment::SledResources; use nexus_types::deployment::ZpoolFilter; @@ -24,6 +23,9 @@ use std::collections::BTreeMap; use std::collections::BTreeSet; use std::net::SocketAddrV6; +#[cfg(test)] +use nexus_types::deployment::BlueprintDatasetFilter; + #[derive(Debug, thiserror::Error)] #[error( "invalid blueprint input: multiple datasets with kind {kind:?} \ @@ -269,7 +271,7 @@ impl DatasetsEditor { self.counts } - #[allow(dead_code)] // currently only used by tests; this will change soon + #[cfg(test)] pub fn datasets( &self, filter: BlueprintDatasetFilter, diff --git a/nexus/reconfigurator/planning/src/planner.rs b/nexus/reconfigurator/planning/src/planner.rs index 56fc671667..24ef80b253 100644 --- a/nexus/reconfigurator/planning/src/planner.rs +++ b/nexus/reconfigurator/planning/src/planner.rs @@ -159,8 +159,7 @@ impl<'a> Planner<'a> { let num_instances_assigned = 0; if all_zones_expunged && num_instances_assigned == 0 { - self.blueprint - .set_sled_state(sled_id, SledState::Decommissioned)?; + self.blueprint.set_sled_decommissioned(sled_id)?; } }
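The comment inside `decommission()` above describes stealing the active editor's contents with `mem::swap()` so that a consuming `finalize()` can run even though the method only holds `&mut self`. The following is a minimal, self-contained sketch of that pattern with hypothetical types (it is not code from this patch and assumes only the standard library):

    use std::mem;

    #[derive(Debug)]
    struct ActiveEditor {
        edits: usize,
    }

    impl ActiveEditor {
        fn new_empty() -> Self {
            ActiveEditor { edits: 0 }
        }

        // Consumes the editor, producing the finished record.
        fn finalize(self) -> Finished {
            Finished { edits: self.edits }
        }
    }

    #[derive(Debug)]
    struct Finished {
        edits: usize,
    }

    #[derive(Debug)]
    enum Editor {
        Active(ActiveEditor),
        Done(Finished),
    }

    impl Editor {
        fn finish(&mut self) {
            match self {
                Editor::Active(active) => {
                    // We only hold `&mut self`, but `finalize()` needs
                    // ownership. Swap an empty editor into place, then
                    // overwrite `self` with the finished state built from
                    // the stolen contents.
                    let mut stolen = ActiveEditor::new_empty();
                    mem::swap(active, &mut stolen);
                    *self = Editor::Done(stolen.finalize());
                }
                // Already finished: nothing to do.
                Editor::Done(_) => (),
            }
        }
    }

    fn main() {
        let mut editor = Editor::Active(ActiveEditor { edits: 3 });
        editor.finish();
        println!("{editor:?}"); // prints: Done(Finished { edits: 3 })
    }

As in the patch, the swap leaves a throwaway value behind for a moment, which is acceptable here because nothing observes the intermediate state before the reassignment.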
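Relatedly, the new `disks()` and `zones()` methods return a single `impl Iterator` from two match arms whose concrete iterator types differ, by wrapping each arm in `itertools::Either`. A small sketch of the same technique with hypothetical types (assuming only the `itertools` crate, which this file already imports):

    use itertools::Either;

    enum Numbers {
        Few(Vec<u32>),
        UpTo(u32),
    }

    impl Numbers {
        // Both arms must yield the same return type; Either::Left/Right
        // unify the two distinct iterator types into one.
        fn iter(&self) -> impl Iterator<Item = u32> + '_ {
            match self {
                Numbers::Few(values) => Either::Left(values.iter().copied()),
                Numbers::UpTo(n) => Either::Right(0..*n),
            }
        }
    }

    fn main() {
        let few = Numbers::Few(vec![7, 8, 9]);
        let up_to = Numbers::UpTo(3);
        assert_eq!(few.iter().collect::<Vec<_>>(), vec![7, 8, 9]);
        assert_eq!(up_to.iter().collect::<Vec<_>>(), vec![0, 1, 2]);
    }

`Either` implements `Iterator` whenever both sides do and their items match, which is what lets the decommissioned and active branches share one signature without boxing.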