Skip to content

Commit

Permalink
[sled-agent] PUT zones doesn't create datasets (#7006)
Browse files Browse the repository at this point in the history
Calls to `PUT` zones will now check that datasets exist -- as they
should be initialized, with prior calls -- rather than
forcefully creating them.

This PR checks that datasets exist in the "configuration", rather than
querying inventory.

Fixes #6991
  • Loading branch information
smklein committed Nov 25, 2024
1 parent 106e722 commit 3d3ba51
Show file tree
Hide file tree
Showing 5 changed files with 100 additions and 29 deletions.
1 change: 1 addition & 0 deletions common/src/ledger.rs
Original file line number Diff line number Diff line change
Expand Up @@ -130,6 +130,7 @@ impl<T: Ledgerable> Ledger<T> {
warn!(self.log, "Failed to write ledger"; "path" => ?path, "err" => ?e);
failed_paths.push((path.to_path_buf(), e));
} else {
info!(self.log, "Successfully wrote to ledger"; "path" => ?path);
one_successful_write = true;
}
}
Expand Down
18 changes: 17 additions & 1 deletion nexus-sled-agent-shared/src/inventory.rs
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ use omicron_common::{
external::{ByteCount, Generation},
internal::shared::{NetworkInterface, SourceNatConfig},
},
disk::DiskVariant,
disk::{DatasetKind, DatasetName, DiskVariant},
zpool_name::ZpoolName,
};
use omicron_uuid_kinds::{DatasetUuid, OmicronZoneUuid};
Expand Down Expand Up @@ -168,6 +168,22 @@ impl OmicronZoneConfig {
pub fn underlay_ip(&self) -> Ipv6Addr {
self.zone_type.underlay_ip()
}

/// If this zone has a transient filesystem, return the dataset's name.
/// Otherwise, return `None`.
pub fn transient_zone_dataset_name(&self) -> Option<DatasetName> {
self.filesystem_pool.as_ref().map(|pool| {
DatasetName::new(
pool.clone(),
DatasetKind::TransientZone {
name: illumos_utils::zone::zone_name(
self.zone_type.kind().zone_prefix(),
Some(self.id),
),
},
)
})
}
}

/// Describes a persistent ZFS dataset associated with an Omicron zone
Expand Down
66 changes: 66 additions & 0 deletions sled-agent/src/services.rs
Original file line number Diff line number Diff line change
Expand Up @@ -104,6 +104,7 @@ use sled_storage::dataset::{CONFIG_DATASET, INSTALL_DATASET, ZONE_DATASET};
use sled_storage::manager::StorageHandle;
use slog::Logger;
use std::collections::BTreeMap;
use std::collections::BTreeSet;
use std::collections::HashSet;
use std::net::{IpAddr, Ipv6Addr, SocketAddr};
use std::str::FromStr;
Expand Down Expand Up @@ -275,6 +276,12 @@ pub enum Error {
#[error("Error querying simnet devices")]
Simnet(#[from] GetSimnetError),

#[error("Failed to access storage")]
Storage(#[from] sled_storage::error::Error),

#[error("Missing datasets: {datasets:?}")]
MissingDatasets { datasets: BTreeSet<DatasetName> },

#[error(
"Requested generation ({requested}) is older than current ({current})"
)]
Expand Down Expand Up @@ -3471,6 +3478,50 @@ impl ServiceManager {
Ok(())
}

// Compares the incoming set of zones with the datasets this sled is
// configured to use.
//
// If the requested zones contain any datasets not configured on this sled,
// an error is returned.
async fn check_requested_zone_datasets_exist(
&self,
request: &OmicronZonesConfig,
) -> Result<(), Error> {
// Before we provision these zones, inspect the request, and
// check all the datasets we're trying to make.
let mut requested_datasets = BTreeSet::new();
for zone in &request.zones {
if let Some(dataset) = zone.dataset_name() {
requested_datasets.insert(dataset);
}
if let Some(dataset) = zone.transient_zone_dataset_name() {
requested_datasets.insert(dataset);
}
}

// These datasets are configured to be part of the control plane.
let datasets_config = self.inner.storage.datasets_config_list().await?;
let existing_datasets: BTreeSet<_> = datasets_config
.datasets
.into_values()
.map(|config| config.name)
.collect();

// Check that the request is no larger than the configured set.
//
// (It's fine to have "existing_datasets" not contained in the
// request set).
let missing_datasets: BTreeSet<_> = requested_datasets
.difference(&existing_datasets)
.cloned()
.collect();
if !missing_datasets.is_empty() {
warn!(self.inner.log, "Missing datasets: {missing_datasets:?}");
return Err(Error::MissingDatasets { datasets: missing_datasets });
}
Ok(())
}

// Ensures that only the following Omicron zones are running.
//
// This method strives to be idempotent.
Expand All @@ -3494,6 +3545,8 @@ impl ServiceManager {
new_request: OmicronZonesConfig,
fake_install_dir: Option<&String>,
) -> Result<(), Error> {
self.check_requested_zone_datasets_exist(&new_request).await?;

// Do some data-normalization to ensure we can compare the "requested
// set" vs the "existing set" as HashSets.
let old_zone_configs: HashSet<OmicronZoneConfig> = existing_zones
Expand Down Expand Up @@ -4695,6 +4748,7 @@ mod illumos_tests {
zone::MockZones,
};

use omicron_common::disk::DatasetsConfig;
use omicron_uuid_kinds::OmicronZoneUuid;
use sled_storage::manager_test_harness::StorageManagerTestHarness;
use std::os::unix::process::ExitStatusExt;
Expand Down Expand Up @@ -5040,6 +5094,18 @@ mod illumos_tests {
.await
.expect("Failed to ensure disks");
assert!(!result.has_error(), "{:?}", result);
let result = harness
.handle()
.datasets_ensure(DatasetsConfig {
generation: Generation::new(),
datasets: BTreeMap::new(),
})
.await
.unwrap();
assert!(
!result.has_error(),
"Failed to ensure empty dataset ledger: {result:?}"
);
harness
}

Expand Down
19 changes: 0 additions & 19 deletions sled-agent/src/sled_agent.rs
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,6 @@ use crate::metrics::MetricsManager;
use crate::nexus::{
NexusClient, NexusNotifierHandle, NexusNotifierInput, NexusNotifierTask,
};
use crate::params::OmicronZoneTypeExt;
use crate::probe_manager::ProbeManager;
use crate::services::{self, ServiceManager};
use crate::storage_monitor::StorageMonitorHandle;
Expand Down Expand Up @@ -943,24 +942,6 @@ impl SledAgent {
&self,
requested_zones: OmicronZonesConfig,
) -> Result<(), Error> {
// TODO(https://github.com/oxidecomputer/omicron/issues/6043):
// - If these are the set of filesystems, we should also consider
// removing the ones which are not listed here.
// - It's probably worth sending a bulk request to the storage system,
// rather than requesting individual datasets.
for zone in &requested_zones.zones {
let Some(dataset_name) = zone.dataset_name() else {
continue;
};

// First, ensure the dataset exists
let dataset_id = zone.id.into_untyped_uuid();
self.inner
.storage
.upsert_filesystem(dataset_id, dataset_name)
.await?;
}

self.inner
.services
.ensure_all_omicron_zones_persistent(requested_zones, None)
Expand Down
25 changes: 16 additions & 9 deletions sled-storage/src/manager.rs
Original file line number Diff line number Diff line change
Expand Up @@ -856,6 +856,7 @@ impl StorageManager {
let ledger_paths = self.all_omicron_dataset_ledgers().await;
let maybe_ledger =
Ledger::<DatasetsConfig>::new(&log, ledger_paths.clone()).await;
let had_old_ledger = maybe_ledger.is_some();

let mut ledger = match maybe_ledger {
Some(ledger) => {
Expand Down Expand Up @@ -914,12 +915,14 @@ impl StorageManager {
let result = self.datasets_ensure_internal(&log, &config).await;

let ledger_data = ledger.data_mut();
if *ledger_data == config {
return Ok(result);
}
*ledger_data = config;
ledger.commit().await?;

// Commit the ledger to storage if either:
// - No ledger exists in storage, or
// - The ledger has changed
if !had_old_ledger || *ledger_data != config {
*ledger_data = config;
ledger.commit().await?;
}
Ok(result)
}

Expand Down Expand Up @@ -1142,6 +1145,7 @@ impl StorageManager {
ledger_paths.clone(),
)
.await;
let had_old_ledger = maybe_ledger.is_some();

let mut ledger = match maybe_ledger {
Some(ledger) => {
Expand Down Expand Up @@ -1181,11 +1185,14 @@ impl StorageManager {
self.omicron_physical_disks_ensure_internal(&log, &config).await?;

let ledger_data = ledger.data_mut();
if *ledger_data == config {
return Ok(result);

// Commit the ledger to storage if either:
// - No ledger exists in storage, or
// - The ledger has changed
if !had_old_ledger || *ledger_data != config {
*ledger_data = config;
ledger.commit().await?;
}
*ledger_data = config;
ledger.commit().await?;

Ok(result)
}
Expand Down

0 comments on commit 3d3ba51

Please sign in to comment.