From 3d3ba51ad13da7286ab8c6637564a31cf9abd8c2 Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Mon, 18 Nov 2024 13:15:45 -0800 Subject: [PATCH] [sled-agent] PUT zones doesn't create datasets (#7006) Calls to `PUT` zones will now check that datasets exist -- as they should be initialized, with prior calls -- rather than forcefully creating them. This PR checks that datasets exist in the "configuration", rather than querying inventory. Fixes https://github.com/oxidecomputer/omicron/issues/6991 --- common/src/ledger.rs | 1 + nexus-sled-agent-shared/src/inventory.rs | 18 ++++++- sled-agent/src/services.rs | 66 ++++++++++++++++++++++++ sled-agent/src/sled_agent.rs | 19 ------- sled-storage/src/manager.rs | 25 +++++---- 5 files changed, 100 insertions(+), 29 deletions(-) diff --git a/common/src/ledger.rs b/common/src/ledger.rs index a52c2441ca..289bf72081 100644 --- a/common/src/ledger.rs +++ b/common/src/ledger.rs @@ -130,6 +130,7 @@ impl Ledger { warn!(self.log, "Failed to write ledger"; "path" => ?path, "err" => ?e); failed_paths.push((path.to_path_buf(), e)); } else { + info!(self.log, "Successfully wrote to ledger"; "path" => ?path); one_successful_write = true; } } diff --git a/nexus-sled-agent-shared/src/inventory.rs b/nexus-sled-agent-shared/src/inventory.rs index ebc683a264..da6edfda1f 100644 --- a/nexus-sled-agent-shared/src/inventory.rs +++ b/nexus-sled-agent-shared/src/inventory.rs @@ -11,7 +11,7 @@ use omicron_common::{ external::{ByteCount, Generation}, internal::shared::{NetworkInterface, SourceNatConfig}, }, - disk::DiskVariant, + disk::{DatasetKind, DatasetName, DiskVariant}, zpool_name::ZpoolName, }; use omicron_uuid_kinds::{DatasetUuid, OmicronZoneUuid}; @@ -168,6 +168,22 @@ impl OmicronZoneConfig { pub fn underlay_ip(&self) -> Ipv6Addr { self.zone_type.underlay_ip() } + + /// If this zone has a transient filesystem, return the dataset's name. + /// Otherwise, return `None`. + pub fn transient_zone_dataset_name(&self) -> Option { + self.filesystem_pool.as_ref().map(|pool| { + DatasetName::new( + pool.clone(), + DatasetKind::TransientZone { + name: illumos_utils::zone::zone_name( + self.zone_type.kind().zone_prefix(), + Some(self.id), + ), + }, + ) + }) + } } /// Describes a persistent ZFS dataset associated with an Omicron zone diff --git a/sled-agent/src/services.rs b/sled-agent/src/services.rs index e73773bcc8..da88d94103 100644 --- a/sled-agent/src/services.rs +++ b/sled-agent/src/services.rs @@ -104,6 +104,7 @@ use sled_storage::dataset::{CONFIG_DATASET, INSTALL_DATASET, ZONE_DATASET}; use sled_storage::manager::StorageHandle; use slog::Logger; use std::collections::BTreeMap; +use std::collections::BTreeSet; use std::collections::HashSet; use std::net::{IpAddr, Ipv6Addr, SocketAddr}; use std::str::FromStr; @@ -275,6 +276,12 @@ pub enum Error { #[error("Error querying simnet devices")] Simnet(#[from] GetSimnetError), + #[error("Failed to access storage")] + Storage(#[from] sled_storage::error::Error), + + #[error("Missing datasets: {datasets:?}")] + MissingDatasets { datasets: BTreeSet }, + #[error( "Requested generation ({requested}) is older than current ({current})" )] @@ -3471,6 +3478,50 @@ impl ServiceManager { Ok(()) } + // Compares the incoming set of zones with the datasets this sled is + // configured to use. + // + // If the requested zones contain any datasets not configured on this sled, + // an error is returned. + async fn check_requested_zone_datasets_exist( + &self, + request: &OmicronZonesConfig, + ) -> Result<(), Error> { + // Before we provision these zones, inspect the request, and + // check all the datasets we're trying to make. + let mut requested_datasets = BTreeSet::new(); + for zone in &request.zones { + if let Some(dataset) = zone.dataset_name() { + requested_datasets.insert(dataset); + } + if let Some(dataset) = zone.transient_zone_dataset_name() { + requested_datasets.insert(dataset); + } + } + + // These datasets are configured to be part of the control plane. + let datasets_config = self.inner.storage.datasets_config_list().await?; + let existing_datasets: BTreeSet<_> = datasets_config + .datasets + .into_values() + .map(|config| config.name) + .collect(); + + // Check that the request is no larger than the configured set. + // + // (It's fine to have "existing_datasets" not contained in the + // request set). + let missing_datasets: BTreeSet<_> = requested_datasets + .difference(&existing_datasets) + .cloned() + .collect(); + if !missing_datasets.is_empty() { + warn!(self.inner.log, "Missing datasets: {missing_datasets:?}"); + return Err(Error::MissingDatasets { datasets: missing_datasets }); + } + Ok(()) + } + // Ensures that only the following Omicron zones are running. // // This method strives to be idempotent. @@ -3494,6 +3545,8 @@ impl ServiceManager { new_request: OmicronZonesConfig, fake_install_dir: Option<&String>, ) -> Result<(), Error> { + self.check_requested_zone_datasets_exist(&new_request).await?; + // Do some data-normalization to ensure we can compare the "requested // set" vs the "existing set" as HashSets. let old_zone_configs: HashSet = existing_zones @@ -4695,6 +4748,7 @@ mod illumos_tests { zone::MockZones, }; + use omicron_common::disk::DatasetsConfig; use omicron_uuid_kinds::OmicronZoneUuid; use sled_storage::manager_test_harness::StorageManagerTestHarness; use std::os::unix::process::ExitStatusExt; @@ -5040,6 +5094,18 @@ mod illumos_tests { .await .expect("Failed to ensure disks"); assert!(!result.has_error(), "{:?}", result); + let result = harness + .handle() + .datasets_ensure(DatasetsConfig { + generation: Generation::new(), + datasets: BTreeMap::new(), + }) + .await + .unwrap(); + assert!( + !result.has_error(), + "Failed to ensure empty dataset ledger: {result:?}" + ); harness } diff --git a/sled-agent/src/sled_agent.rs b/sled-agent/src/sled_agent.rs index f1b2f910e7..80c0cdda21 100644 --- a/sled-agent/src/sled_agent.rs +++ b/sled-agent/src/sled_agent.rs @@ -15,7 +15,6 @@ use crate::metrics::MetricsManager; use crate::nexus::{ NexusClient, NexusNotifierHandle, NexusNotifierInput, NexusNotifierTask, }; -use crate::params::OmicronZoneTypeExt; use crate::probe_manager::ProbeManager; use crate::services::{self, ServiceManager}; use crate::storage_monitor::StorageMonitorHandle; @@ -943,24 +942,6 @@ impl SledAgent { &self, requested_zones: OmicronZonesConfig, ) -> Result<(), Error> { - // TODO(https://github.com/oxidecomputer/omicron/issues/6043): - // - If these are the set of filesystems, we should also consider - // removing the ones which are not listed here. - // - It's probably worth sending a bulk request to the storage system, - // rather than requesting individual datasets. - for zone in &requested_zones.zones { - let Some(dataset_name) = zone.dataset_name() else { - continue; - }; - - // First, ensure the dataset exists - let dataset_id = zone.id.into_untyped_uuid(); - self.inner - .storage - .upsert_filesystem(dataset_id, dataset_name) - .await?; - } - self.inner .services .ensure_all_omicron_zones_persistent(requested_zones, None) diff --git a/sled-storage/src/manager.rs b/sled-storage/src/manager.rs index a760285d3f..62f5834209 100644 --- a/sled-storage/src/manager.rs +++ b/sled-storage/src/manager.rs @@ -856,6 +856,7 @@ impl StorageManager { let ledger_paths = self.all_omicron_dataset_ledgers().await; let maybe_ledger = Ledger::::new(&log, ledger_paths.clone()).await; + let had_old_ledger = maybe_ledger.is_some(); let mut ledger = match maybe_ledger { Some(ledger) => { @@ -914,12 +915,14 @@ impl StorageManager { let result = self.datasets_ensure_internal(&log, &config).await; let ledger_data = ledger.data_mut(); - if *ledger_data == config { - return Ok(result); - } - *ledger_data = config; - ledger.commit().await?; + // Commit the ledger to storage if either: + // - No ledger exists in storage, or + // - The ledger has changed + if !had_old_ledger || *ledger_data != config { + *ledger_data = config; + ledger.commit().await?; + } Ok(result) } @@ -1142,6 +1145,7 @@ impl StorageManager { ledger_paths.clone(), ) .await; + let had_old_ledger = maybe_ledger.is_some(); let mut ledger = match maybe_ledger { Some(ledger) => { @@ -1181,11 +1185,14 @@ impl StorageManager { self.omicron_physical_disks_ensure_internal(&log, &config).await?; let ledger_data = ledger.data_mut(); - if *ledger_data == config { - return Ok(result); + + // Commit the ledger to storage if either: + // - No ledger exists in storage, or + // - The ledger has changed + if !had_old_ledger || *ledger_data != config { + *ledger_data = config; + ledger.commit().await?; } - *ledger_data = config; - ledger.commit().await?; Ok(result) }