Skip to content

Commit

Permalink
Remove retry_until_known_result() (#6868)
Browse files Browse the repository at this point in the history
This builds on #6866. After its changes, there was only caller of
`retry_until_known_result` left; this PR removes it. We keep the retry
loop, but instead of retrying forever, we bail out if the sled we're
trying to reach is gone, as determined by "is it no longer in-service",
which in practice means it's been expunged.
  • Loading branch information
jgallagher authored Nov 14, 2024
1 parent df6d11b commit 41b78f1
Show file tree
Hide file tree
Showing 4 changed files with 40 additions and 53 deletions.
38 changes: 0 additions & 38 deletions common/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -61,44 +61,6 @@ impl slog::KV for FileKv {

pub const OMICRON_DPD_TAG: &str = "omicron";

use crate::api::external::Error;
use crate::progenitor_operation_retry::ProgenitorOperationRetry;
use crate::progenitor_operation_retry::ProgenitorOperationRetryError;
use std::future::Future;

/// Retry a progenitor client operation until a known result is returned.
///
/// See [`ProgenitorOperationRetry`] for more information.
// TODO mark this deprecated, `never_bail` is a bad idea
pub async fn retry_until_known_result<F, T, E, Fut>(
log: &slog::Logger,
f: F,
) -> Result<T, progenitor_client::Error<E>>
where
F: FnMut() -> Fut,
Fut: Future<Output = Result<T, progenitor_client::Error<E>>>,
E: std::fmt::Debug,
{
match ProgenitorOperationRetry::new(f, never_bail).run(log).await {
Ok(v) => Ok(v),

Err(e) => match e {
ProgenitorOperationRetryError::ProgenitorError(e) => Err(e),

ProgenitorOperationRetryError::Gone
| ProgenitorOperationRetryError::GoneCheckError(_) => {
// ProgenitorOperationRetry::new called with `never_bail` as the
// bail check should never return these variants!
unreachable!();
}
},
}
}

async fn never_bail() -> Result<bool, Error> {
Ok(false)
}

/// A wrapper struct that does nothing other than elide the inner value from
/// [`std::fmt::Debug`] output.
///
Expand Down
4 changes: 2 additions & 2 deletions nexus/db-queries/src/db/datastore/physical_disk.rs
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@ use omicron_common::api::external::ResourceType;
use omicron_uuid_kinds::CollectionUuid;
use omicron_uuid_kinds::GenericUuid;
use omicron_uuid_kinds::PhysicalDiskUuid;
use omicron_uuid_kinds::SledUuid;
use uuid::Uuid;

impl DataStore {
Expand Down Expand Up @@ -66,7 +67,7 @@ impl DataStore {
// expunged.
Self::check_sled_in_service_on_connection(
&conn,
disk.sled_id,
SledUuid::from_untyped_uuid(disk.sled_id),
)
.await
.map_err(|txn_error| txn_error.into_diesel(&err))?;
Expand Down Expand Up @@ -336,7 +337,6 @@ mod test {
use omicron_common::api::external::ByteCount;
use omicron_common::disk::{DiskIdentity, DiskVariant};
use omicron_test_utils::dev;
use omicron_uuid_kinds::SledUuid;
use std::net::{Ipv6Addr, SocketAddrV6};
use std::num::NonZeroU32;

Expand Down
18 changes: 16 additions & 2 deletions nexus/db-queries/src/db/datastore/sled.rs
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,8 @@ use omicron_common::api::external::Error;
use omicron_common::api::external::ListResultVec;
use omicron_common::api::external::ResourceType;
use omicron_common::bail_unless;
use omicron_uuid_kinds::GenericUuid;
use omicron_uuid_kinds::SledUuid;
use std::fmt;
use strum::IntoEnumIterator;
use thiserror::Error;
Expand Down Expand Up @@ -99,18 +101,30 @@ impl DataStore {
Ok((sled, was_modified))
}

/// Confirms that a sled exists and is in-service.
pub async fn check_sled_in_service(
&self,
opctx: &OpContext,
sled_id: SledUuid,
) -> Result<(), Error> {
let conn = &*self.pool_connection_authorized(&opctx).await?;
Self::check_sled_in_service_on_connection(conn, sled_id)
.await
.map_err(From::from)
}

/// Confirms that a sled exists and is in-service.
///
/// This function may be called from a transaction context.
pub async fn check_sled_in_service_on_connection(
conn: &async_bb8_diesel::Connection<DbConnection>,
sled_id: Uuid,
sled_id: SledUuid,
) -> Result<(), TransactionError<Error>> {
use db::schema::sled::dsl;
let sled_exists_and_in_service = diesel::select(diesel::dsl::exists(
dsl::sled
.filter(dsl::time_deleted.is_null())
.filter(dsl::id.eq(sled_id))
.filter(dsl::id.eq(sled_id.into_untyped_uuid()))
.sled_filter(SledFilter::InService),
))
.get_result_async::<bool>(conn)
Expand Down
33 changes: 22 additions & 11 deletions nexus/src/app/sagas/snapshot_create.rs
Original file line number Diff line number Diff line change
Expand Up @@ -103,14 +103,11 @@ use anyhow::anyhow;
use nexus_db_model::Generation;
use nexus_db_queries::db::identity::{Asset, Resource};
use nexus_db_queries::db::lookup::LookupPath;
use omicron_common::retry_until_known_result;
use omicron_common::api::external::Error;
use omicron_common::progenitor_operation_retry::ProgenitorOperationRetryError;
use omicron_common::{
api::external, progenitor_operation_retry::ProgenitorOperationRetry,
};
use omicron_common::{
api::external::Error,
progenitor_operation_retry::ProgenitorOperationRetryError,
};
use omicron_uuid_kinds::{GenericUuid, PropolisUuid, SledUuid};
use rand::{rngs::StdRng, RngCore, SeedableRng};
use serde::Deserialize;
Expand Down Expand Up @@ -851,6 +848,7 @@ async fn ssc_send_snapshot_request_to_sled_agent(
"instance no longer has an active VMM!",
)));
};
let sled_id = SledUuid::from_untyped_uuid(sled_id);

info!(log, "asking for disk snapshot from Propolis via sled agent";
"disk_id" => %params.disk_id,
Expand All @@ -860,22 +858,35 @@ async fn ssc_send_snapshot_request_to_sled_agent(

let sled_agent_client = osagactx
.nexus()
.sled_client(&SledUuid::from_untyped_uuid(sled_id))
.sled_client(&sled_id)
.await
.map_err(ActionError::action_failed)?;

retry_until_known_result(log, || async {
let snapshot_operation = || async {
sled_agent_client
.vmm_issue_disk_snapshot_request(
&PropolisUuid::from_untyped_uuid(propolis_id),
&params.disk_id,
&VmmIssueDiskSnapshotRequestBody { snapshot_id },
)
.await
})
.await
.map_err(|e| e.to_string())
.map_err(ActionError::action_failed)?;
};
let gone_check = || async {
osagactx.datastore().check_sled_in_service(&opctx, sled_id).await?;
// `check_sled_in_service` returns an error if the sled is no longer in
// service; if it succeeds, the sled is not gone.
Ok(false)
};

ProgenitorOperationRetry::new(snapshot_operation, gone_check)
.run(log)
.await
.map_err(|e| {
ActionError::action_failed(format!(
"failed to issue VMM disk snapshot request: {}",
InlineErrorChain::new(&e)
))
})?;

Ok(())
}
Expand Down

0 comments on commit 41b78f1

Please sign in to comment.