[repo depot 3/n] nexus background task to replicate TUF artifacts across sleds #7129

Open · wants to merge 3 commits into main
28 changes: 28 additions & 0 deletions dev-tools/omdb/src/bin/omdb/nexus.rs
@@ -51,6 +51,7 @@ use nexus_types::internal_api::background::RegionSnapshotReplacementFinishStatus
use nexus_types::internal_api::background::RegionSnapshotReplacementGarbageCollectStatus;
use nexus_types::internal_api::background::RegionSnapshotReplacementStartStatus;
use nexus_types::internal_api::background::RegionSnapshotReplacementStepStatus;
use nexus_types::internal_api::background::TufArtifactReplicationStatus;
use nexus_types::inventory::BaseboardId;
use omicron_uuid_kinds::CollectionUuid;
use omicron_uuid_kinds::DemoSagaUuid;
@@ -1928,6 +1929,33 @@ fn print_task_details(bgtask: &BackgroundTask, details: &serde_json::Value) {
}
}
};
} else if name == "tuf_artifact_replication" {
match serde_json::from_value::<TufArtifactReplicationStatus>(
details.clone(),
) {
Err(error) => eprintln!(
"warning: failed to interpret task details: {:?}: {:?}",
error, details
),
Ok(status) => {
const ROWS: &[&str] = &[
"requests ok:",
"requests errored:",
"requests outstanding:",
"local repos:",
];
const WIDTH: usize = const_max_len(ROWS);
println!(" last execution:");
for (label, value) in ROWS.iter().zip([
status.requests_ok,
status.requests_err,
status.requests_outstanding,
status.local_repos,
]) {
println!(" {label:<WIDTH$} {value:>3}");
}
}
}
} else {
println!(
"warning: unknown background task: {:?} \
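For context on the `const_max_len(ROWS)` call above: omdb computes the label column width at compile time from the row labels. A minimal sketch of how such a helper can be written — this is an illustration, not necessarily the actual omdb implementation:

```rust
// Sketch of a `const_max_len`-style helper: returns the length of the
// longest string in a slice, evaluated at compile time. `const fn` cannot
// use iterators, so this uses an index-based while loop.
const fn const_max_len(strs: &[&str]) -> usize {
    let mut max = 0;
    let mut i = 0;
    while i < strs.len() {
        let len = strs[i].len();
        if len > max {
            max = len;
        }
        i += 1;
    }
    max
}

fn main() {
    const ROWS: &[&str] = &["requests ok:", "requests errored:"];
    const WIDTH: usize = const_max_len(ROWS);
    assert_eq!(WIDTH, "requests errored:".len());
}
```

Because `WIDTH` is a `const`, the `{label:<WIDTH$}` format specifier in the task output stays aligned with the labels without any runtime scan.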
12 changes: 12 additions & 0 deletions dev-tools/omdb/tests/env.out
@@ -166,6 +166,10 @@ task: "switch_port_config_manager"
manages switch port settings for rack switches


task: "tuf_artifact_replication"
replicate update repo artifacts across sleds


task: "v2p_manager"
manages opte v2p mappings for vpc networking

@@ -337,6 +341,10 @@ task: "switch_port_config_manager"
manages switch port settings for rack switches


task: "tuf_artifact_replication"
replicate update repo artifacts across sleds


task: "v2p_manager"
manages opte v2p mappings for vpc networking

@@ -495,6 +503,10 @@ task: "switch_port_config_manager"
manages switch port settings for rack switches


task: "tuf_artifact_replication"
replicate update repo artifacts across sleds


task: "v2p_manager"
manages opte v2p mappings for vpc networking

26 changes: 26 additions & 0 deletions dev-tools/omdb/tests/successes.out
@@ -384,6 +384,10 @@ task: "switch_port_config_manager"
manages switch port settings for rack switches


task: "tuf_artifact_replication"
replicate update repo artifacts across sleds


task: "v2p_manager"
manages opte v2p mappings for vpc networking

@@ -698,6 +702,17 @@ task: "switch_port_config_manager"
started at <REDACTED_TIMESTAMP> (<REDACTED DURATION>s ago) and ran for <REDACTED DURATION>ms
warning: unknown background task: "switch_port_config_manager" (don't know how to interpret details: Object {})

task: "tuf_artifact_replication"
configured period: every <REDACTED_DURATION>m
currently executing: no
last completed activation: <REDACTED ITERATIONS>, triggered by a periodic timer firing
started at <REDACTED_TIMESTAMP> (<REDACTED DURATION>s ago) and ran for <REDACTED DURATION>ms
last execution:
requests ok: 0
requests errored: 0
requests outstanding: 0
local repos: 0

task: "v2p_manager"
configured period: every <REDACTED_DURATION>s
currently executing: no
@@ -1141,6 +1156,17 @@ task: "switch_port_config_manager"
started at <REDACTED_TIMESTAMP> (<REDACTED DURATION>s ago) and ran for <REDACTED DURATION>ms
warning: unknown background task: "switch_port_config_manager" (don't know how to interpret details: Object {})

task: "tuf_artifact_replication"
configured period: every <REDACTED_DURATION>m
currently executing: no
last completed activation: <REDACTED ITERATIONS>, triggered by a periodic timer firing
started at <REDACTED_TIMESTAMP> (<REDACTED DURATION>s ago) and ran for <REDACTED DURATION>ms
last execution:
requests ok: 0
requests errored: 0
requests outstanding: 0
local repos: 0

task: "v2p_manager"
configured period: every <REDACTED_DURATION>s
currently executing: no
16 changes: 16 additions & 0 deletions nexus-config/src/nexus_config.rs
@@ -418,6 +418,8 @@ pub struct BackgroundTaskConfig {
/// configuration for region snapshot replacement finisher task
pub region_snapshot_replacement_finish:
RegionSnapshotReplacementFinishConfig,
/// configuration for TUF artifact replication task
pub tuf_artifact_replication: TufArtifactReplicationConfig,
}

#[serde_as]
@@ -703,6 +705,14 @@ pub struct RegionSnapshotReplacementFinishConfig {
pub period_secs: Duration,
}

#[serde_as]
#[derive(Clone, Debug, Deserialize, Eq, PartialEq, Serialize)]
pub struct TufArtifactReplicationConfig {
/// period (in seconds) for periodic activations of this background task
#[serde_as(as = "DurationSeconds<u64>")]
pub period_secs: Duration,
}

/// Configuration for a nexus server
#[derive(Clone, Debug, Deserialize, PartialEq, Serialize)]
pub struct PackageConfig {
@@ -958,6 +968,7 @@ mod test {
region_snapshot_replacement_garbage_collection.period_secs = 30
region_snapshot_replacement_step.period_secs = 30
region_snapshot_replacement_finish.period_secs = 30
tuf_artifact_replication.period_secs = 60
[default_region_allocation_strategy]
type = "random"
seed = 0
@@ -1156,6 +1167,10 @@
RegionSnapshotReplacementFinishConfig {
period_secs: Duration::from_secs(30),
},
tuf_artifact_replication:
TufArtifactReplicationConfig {
period_secs: Duration::from_secs(60)
},
},
default_region_allocation_strategy:
crate::nexus_config::RegionAllocationStrategy::Random {
@@ -1237,6 +1252,7 @@
region_snapshot_replacement_garbage_collection.period_secs = 30
region_snapshot_replacement_step.period_secs = 30
region_snapshot_replacement_finish.period_secs = 30
tuf_artifact_replication.period_secs = 60
[default_region_allocation_strategy]
type = "random"
"##,
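The new config knob follows the same pattern as the other background-task entries: an integer `period_secs` in TOML deserialized into a `Duration` via `serde_with`. A self-contained sketch of that round trip, assuming the `toml`, `serde`, and `serde_with` crates (this mirrors the struct added above, not Nexus's actual config loader):

```rust
use serde::Deserialize;
use serde_with::{serde_as, DurationSeconds};
use std::time::Duration;

// Mirrors TufArtifactReplicationConfig from nexus_config.rs above.
#[serde_as]
#[derive(Deserialize)]
struct TufArtifactReplicationConfig {
    /// period (in seconds) for periodic activations of this background task
    #[serde_as(as = "DurationSeconds<u64>")]
    period_secs: Duration,
}

fn main() {
    // The same setting that appears in the example configs:
    // `tuf_artifact_replication.period_secs = 60`
    let config: TufArtifactReplicationConfig =
        toml::from_str("period_secs = 60").expect("valid config");
    assert_eq!(config.period_secs, Duration::from_secs(60));
}
```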
1 change: 1 addition & 0 deletions nexus/db-model/src/schema.rs
@@ -907,6 +907,7 @@ table! {
sled_policy -> crate::sled_policy::SledPolicyEnum,
sled_state -> crate::SledStateEnum,
sled_agent_gen -> Int8,
repo_depot_port -> Int4,
}
}

3 changes: 2 additions & 1 deletion nexus/db-model/src/schema_versions.rs
@@ -17,7 +17,7 @@ use std::collections::BTreeMap;
///
/// This must be updated when you change the database schema. Refer to
/// schema/crdb/README.adoc in the root of this repository for details.
pub const SCHEMA_VERSION: SemverVersion = SemverVersion::new(114, 0, 0);
pub const SCHEMA_VERSION: SemverVersion = SemverVersion::new(115, 0, 0);

/// List of all past database schema versions, in *reverse* order
///
@@ -29,6 +29,7 @@ static KNOWN_VERSIONS: Lazy<Vec<KnownVersion>> = Lazy::new(|| {
// | leaving the first copy as an example for the next person.
// v
// KnownVersion::new(next_int, "unique-dirname-with-the-sql-files"),
KnownVersion::new(115, "tuf-artifact-replication"),
KnownVersion::new(114, "crucible-ref-count-records"),
KnownVersion::new(113, "add-tx-eq"),
KnownVersion::new(112, "blueprint-dataset"),
10 changes: 10 additions & 0 deletions nexus/db-model/src/sled.rs
@@ -81,6 +81,9 @@ pub struct Sled {
/// This is specifically distinct from `rcgen`, which is incremented by
/// child resources as part of `DatastoreCollectionConfig`.
pub sled_agent_gen: Generation,

// ServiceAddress (Repo Depot API). Uses `ip`.
pub repo_depot_port: SqlU16,
}

impl Sled {
@@ -169,6 +172,7 @@ impl From<Sled> for params::SledAgentInfo {
};
Self {
sa_address: sled.address(),
repo_depot_port: sled.repo_depot_port.into(),
role,
baseboard: Baseboard {
serial: sled.serial_number.clone(),
@@ -220,6 +224,9 @@ pub struct SledUpdate {
pub ip: ipv6::Ipv6Addr,
pub port: SqlU16,

// ServiceAddress (Repo Depot API). Uses `ip`.
pub repo_depot_port: SqlU16,

// Generation number - owned and incremented by sled-agent.
pub sled_agent_gen: Generation,
}
@@ -228,6 +235,7 @@ impl SledUpdate {
pub fn new(
id: Uuid,
addr: SocketAddrV6,
repo_depot_port: u16,
baseboard: SledBaseboard,
hardware: SledSystemHardware,
rack_id: Uuid,
@@ -247,6 +255,7 @@
reservoir_size: hardware.reservoir_size,
ip: addr.ip().into(),
port: addr.port().into(),
repo_depot_port: repo_depot_port.into(),
sled_agent_gen,
}
}
@@ -282,6 +291,7 @@ impl SledUpdate {
reservoir_size: self.reservoir_size,
ip: self.ip,
port: self.port,
repo_depot_port: self.repo_depot_port,
last_used_address,
sled_agent_gen: self.sled_agent_gen,
}
1 change: 1 addition & 0 deletions nexus/db-queries/src/db/datastore/dataset.rs
@@ -375,6 +375,7 @@ mod test {
let sled = SledUpdate::new(
*sled_id.as_untyped_uuid(),
"[::1]:0".parse().unwrap(),
0,
Collaborator

There are quite a lot of places where we use 0 for the repo depot port. I assume this is a sentinel value? It might be nice to use Option instead here. (See also the discussion about whether the field should be NULLable, but I think this applies regardless.)

Contributor (Author)

In general, in these test functions I tried to follow how the sled agent port was already specified. In this case, you can see the sled agent SocketAddr is localhost, port 0.

Collaborator

Yeah, that makes sense.

The end result is that there are many callers repeating the same values which, if I'm understanding right, can't actually be right -- they're just unused. This makes me wonder if both of those ought to be optional. Maybe this should be a SledUpdateBuilder? But anyway, it's fine to say that's out of scope here.

SledBaseboard {
serial_number: "test-sn".to_string(),
part_number: "test-pn".to_string(),
5 changes: 5 additions & 0 deletions nexus/db-queries/src/db/datastore/mod.rs
@@ -467,6 +467,7 @@ mod test {
use nexus_db_model::{to_db_typed_uuid, Generation};
use nexus_types::external_api::params;
use nexus_types::silo::DEFAULT_SILO_ID;
use omicron_common::address::REPO_DEPOT_PORT;
use omicron_common::api::external::{
ByteCount, Error, IdentityMetadataCreateParams, LookupType, Name,
};
@@ -684,12 +685,14 @@
0,
0,
);
let bogus_repo_depot_port = 8081;
let rack_id = Uuid::new_v4();
let sled_id = SledUuid::new_v4();

let sled_update = SledUpdate::new(
sled_id.into_untyped_uuid(),
bogus_addr,
bogus_repo_depot_port,
sled_baseboard_for_test(),
sled_system_hardware_for_test(),
rack_id,
@@ -1691,6 +1694,7 @@
let sled1 = db::model::SledUpdate::new(
sled1_id,
addr1,
REPO_DEPOT_PORT,
sled_baseboard_for_test(),
sled_system_hardware_for_test(),
rack_id,
@@ -1703,6 +1707,7 @@
let sled2 = db::model::SledUpdate::new(
sled2_id,
addr2,
REPO_DEPOT_PORT,
sled_baseboard_for_test(),
sled_system_hardware_for_test(),
rack_id,
2 changes: 2 additions & 0 deletions nexus/db-queries/src/db/datastore/physical_disk.rs
@@ -343,10 +343,12 @@ mod test {
async fn create_test_sled(db: &DataStore) -> Sled {
let sled_id = Uuid::new_v4();
let addr = SocketAddrV6::new(Ipv6Addr::LOCALHOST, 0, 0, 0);
let repo_depot_port = 0;
let rack_id = Uuid::new_v4();
let sled_update = SledUpdate::new(
sled_id,
addr,
repo_depot_port,
sled_baseboard_for_test(),
sled_system_hardware_for_test(),
rack_id,
2 changes: 2 additions & 0 deletions nexus/db-queries/src/db/datastore/rack.rs
@@ -1239,9 +1239,11 @@

async fn create_test_sled(db: &DataStore, sled_id: Uuid) -> Sled {
let addr = SocketAddrV6::new(Ipv6Addr::LOCALHOST, 0, 0, 0);
let repo_depot_port = 0;
let sled_update = SledUpdate::new(
sled_id,
addr,
repo_depot_port,
sled_baseboard_for_test(),
sled_system_hardware_for_test(),
rack_id(),
3 changes: 3 additions & 0 deletions nexus/db-queries/src/db/datastore/sled.rs
@@ -72,6 +72,7 @@ impl DataStore {
dsl::time_modified.eq(now),
dsl::ip.eq(sled_update.ip),
dsl::port.eq(sled_update.port),
dsl::repo_depot_port.eq(sled_update.repo_depot_port),
dsl::rack_id.eq(sled_update.rack_id),
dsl::is_scrimlet.eq(sled_update.is_scrimlet()),
dsl::usable_hardware_threads
@@ -1489,9 +1490,11 @@ pub(in crate::db::datastore) mod test {
pub(crate) fn test_new_sled_update() -> SledUpdate {
let sled_id = Uuid::new_v4();
let addr = SocketAddrV6::new(Ipv6Addr::LOCALHOST, 0, 0, 0);
let repo_depot_port = 0;
SledUpdate::new(
sled_id,
addr,
repo_depot_port,
sled_baseboard_for_test(),
sled_system_hardware_for_test(),
rack_id(),
22 changes: 20 additions & 2 deletions nexus/db-queries/src/db/datastore/update.rs
@@ -12,14 +12,15 @@ use crate::context::OpContext;
use crate::db;
use crate::db::error::{public_error_from_diesel, ErrorHandler};
use crate::db::model::SemverVersion;
use crate::db::pagination::paginated;
use crate::transaction_retry::OptionalError;
use async_bb8_diesel::AsyncRunQueryDsl;
use diesel::prelude::*;
use diesel::result::Error as DieselError;
use nexus_db_model::{ArtifactHash, TufArtifact, TufRepo, TufRepoDescription};
use omicron_common::api::external::{
self, CreateResult, LookupResult, LookupType, ResourceType,
TufRepoInsertStatus,
self, CreateResult, DataPageParams, ListResultVec, LookupResult,
LookupType, ResourceType, TufRepoInsertStatus,
};
use omicron_uuid_kinds::TufRepoKind;
use omicron_uuid_kinds::TypedUuid;
@@ -147,6 +148,23 @@
.map_err(|e| public_error_from_diesel(e, ErrorHandler::Server))?;
Ok(TufRepoDescription { repo, artifacts })
}

/// Returns the list of all TUF repo artifacts known to the system.
pub async fn update_tuf_artifact_list(
&self,
opctx: &OpContext,
pagparams: &DataPageParams<'_, ArtifactHash>,
) -> ListResultVec<TufArtifact> {
opctx.authorize(authz::Action::Read, &authz::FLEET).await?;

use db::schema::tuf_artifact::dsl;

paginated(dsl::tuf_artifact, dsl::sha256, pagparams)
.select(TufArtifact::as_select())
.load_async(&*self.pool_connection_authorized(opctx).await?)
.await
.map_err(|e| public_error_from_diesel(e, ErrorHandler::Server))
}
}

// This is a separate method mostly to make rustfmt not bail out on long lines
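`update_tuf_artifact_list` uses keyset pagination over `dsl::sha256`: a caller (such as the replication background task) fetches a page, then passes the last hash it saw as the marker for the next query. A self-contained sketch of that loop, where `PageParams` and `list_page` are hypothetical stand-ins for Nexus's `DataPageParams` and the datastore method, and plain `[u8; 32]` arrays stand in for `ArtifactHash`:

```rust
use std::num::NonZeroU32;

// Stand-in for DataPageParams-style keyset pagination: a marker (the last
// key seen) plus a page-size limit.
struct PageParams<'a> {
    marker: Option<&'a [u8; 32]>,
    limit: NonZeroU32,
}

// One "page" query: everything strictly after the marker, up to `limit`
// rows, over an already-sorted key space (as the query over dsl::sha256 is).
fn list_page<'a>(all: &'a [[u8; 32]], p: &PageParams<'_>) -> &'a [[u8; 32]] {
    let start = match p.marker {
        Some(m) => all.partition_point(|h| h <= m),
        None => 0,
    };
    let end = (start + p.limit.get() as usize).min(all.len());
    &all[start..end]
}

fn main() {
    // Pretend these are artifact hashes, sorted ascending.
    let all: Vec<[u8; 32]> = (0u8..10).map(|i| [i; 32]).collect();
    let limit = NonZeroU32::new(3).unwrap();

    let mut marker: Option<[u8; 32]> = None;
    let mut seen = 0;
    loop {
        let params = PageParams { marker: marker.as_ref(), limit };
        let batch = list_page(&all, &params);
        if batch.is_empty() {
            break; // past the last page
        }
        seen += batch.len();
        marker = Some(*batch.last().unwrap()); // next page starts after this
    }
    assert_eq!(seen, all.len());
}
```

Because the marker is the key itself rather than an offset, pages stay consistent even if rows are inserted between queries.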
1 change: 1 addition & 0 deletions nexus/db-queries/src/db/datastore/vpc.rs
@@ -3058,6 +3058,7 @@ mod tests {
.sled_upsert(SledUpdate::new(
sled_id.into_untyped_uuid(),
"[::1]:0".parse().unwrap(),
0,
sled_baseboard_for_test(),
sled_system_hardware_for_test(),
rack_id,
1 change: 1 addition & 0 deletions nexus/examples/config-second.toml
@@ -145,6 +145,7 @@ region_snapshot_replacement_start.period_secs = 30
region_snapshot_replacement_garbage_collection.period_secs = 30
region_snapshot_replacement_step.period_secs = 30
region_snapshot_replacement_finish.period_secs = 30
tuf_artifact_replication.period_secs = 60

[default_region_allocation_strategy]
# allocate region on 3 random distinct zpools, on 3 random distinct sleds.