diff --git a/Cargo.lock b/Cargo.lock index 71ead293b1..2f36116a16 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -7264,6 +7264,7 @@ dependencies = [ "sled-agent-api", "sled-agent-client", "sled-agent-types", + "sled-diagnostics", "sled-hardware", "sled-hardware-types", "sled-storage", @@ -10677,6 +10678,7 @@ dependencies = [ "schemars", "serde", "sled-agent-types", + "sled-diagnostics", "sled-hardware-types", "uuid", ] @@ -10734,6 +10736,22 @@ dependencies = [ "uuid", ] +[[package]] +name = "sled-diagnostics" +version = "0.1.0" +dependencies = [ + "cfg-if", + "fs-err", + "futures", + "libc", + "omicron-workspace-hack", + "schemars", + "serde", + "slog", + "thiserror", + "tokio", +] + [[package]] name = "sled-hardware" version = "0.1.0" diff --git a/Cargo.toml b/Cargo.toml index c0f4be8bae..c9fc5e300d 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -108,6 +108,7 @@ members = [ "sled-agent/bootstrap-agent-api", "sled-agent/repo-depot-api", "sled-agent/types", + "sled-diagnostics", "sled-hardware", "sled-hardware/types", "sled-storage", @@ -240,6 +241,7 @@ default-members = [ "sled-agent/bootstrap-agent-api", "sled-agent/repo-depot-api", "sled-agent/types", + "sled-diagnostics", "sled-hardware", "sled-hardware/types", "sled-storage", @@ -596,6 +598,7 @@ sled = "=0.34.7" sled-agent-api = { path = "sled-agent/api" } sled-agent-client = { path = "clients/sled-agent-client" } sled-agent-types = { path = "sled-agent/types" } +sled-diagnostics = { path = "sled-diagnostics" } sled-hardware = { path = "sled-hardware" } sled-hardware-types = { path = "sled-hardware/types" } sled-storage = { path = "sled-storage" } diff --git a/openapi/sled-agent.json b/openapi/sled-agent.json index 14a4c92692..1e872e8264 100644 --- a/openapi/sled-agent.json +++ b/openapi/sled-agent.json @@ -678,10 +678,16 @@ "operationId": "support_dladm_info", "responses": { "200": { - "description": "", + "description": "successful operation", "content": { - "*/*": { - "schema": {} + "application/json": { + 
"schema": { + "title": "Array_of_SledDiagnosticsQueryOutput", + "type": "array", + "items": { + "$ref": "#/components/schemas/SledDiagnosticsQueryOutput" + } + } } } }, @@ -699,10 +705,70 @@ "operationId": "support_ipadm_info", "responses": { "200": { - "description": "", + "description": "successful operation", "content": { - "*/*": { - "schema": {} + "application/json": { + "schema": { + "title": "Array_of_SledDiagnosticsQueryOutput", + "type": "array", + "items": { + "$ref": "#/components/schemas/SledDiagnosticsQueryOutput" + } + } + } + } + }, + "4XX": { + "$ref": "#/components/responses/Error" + }, + "5XX": { + "$ref": "#/components/responses/Error" + } + } + } + }, + "/support/pargs-info": { + "get": { + "operationId": "support_pargs_info", + "responses": { + "200": { + "description": "successful operation", + "content": { + "application/json": { + "schema": { + "title": "Array_of_SledDiagnosticsQueryOutput", + "type": "array", + "items": { + "$ref": "#/components/schemas/SledDiagnosticsQueryOutput" + } + } + } + } + }, + "4XX": { + "$ref": "#/components/responses/Error" + }, + "5XX": { + "$ref": "#/components/responses/Error" + } + } + } + }, + "/support/pstack-info": { + "get": { + "operationId": "support_pstack_info", + "responses": { + "200": { + "description": "successful operation", + "content": { + "application/json": { + "schema": { + "title": "Array_of_SledDiagnosticsQueryOutput", + "type": "array", + "items": { + "$ref": "#/components/schemas/SledDiagnosticsQueryOutput" + } + } } } }, @@ -720,10 +786,12 @@ "operationId": "support_zoneadm_info", "responses": { "200": { - "description": "", + "description": "successful operation", "content": { - "*/*": { - "schema": {} + "application/json": { + "schema": { + "$ref": "#/components/schemas/SledDiagnosticsQueryOutput" + } } } }, @@ -5415,6 +5483,68 @@ "type": "string", "pattern": 
"^(0|[1-9]\\d*)\\.(0|[1-9]\\d*)\\.(0|[1-9]\\d*)(?:-((?:0|[1-9]\\d*|\\d*[a-zA-Z-][0-9a-zA-Z-]*)(?:\\.(?:0|[1-9]\\d*|\\d*[a-zA-Z-][0-9a-zA-Z-]*))*))?(?:\\+([0-9a-zA-Z-]+(?:\\.[0-9a-zA-Z-]+)*))?$" }, + "SledDiagnosticsQueryOutput": { + "oneOf": [ + { + "type": "object", + "properties": { + "success": { + "type": "object", + "properties": { + "command": { + "description": "The command and it's arguments.", + "type": "string" + }, + "exit_code": { + "nullable": true, + "description": "The exit code if one was present when the comman exited.", + "type": "integer", + "format": "int32" + }, + "exit_status": { + "description": "The exit status of the command. This will be the exit code (if any) and exit reason such as from a signal.", + "type": "string" + }, + "stdio": { + "description": "Any stdout/stderr produced by the command.", + "type": "string" + } + }, + "required": [ + "command", + "exit_status", + "stdio" + ] + } + }, + "required": [ + "success" + ], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "failure": { + "type": "object", + "properties": { + "error": { + "description": "The reason the command failed to execute.", + "type": "string" + } + }, + "required": [ + "error" + ] + } + }, + "required": [ + "failure" + ], + "additionalProperties": false + } + ] + }, "SledIdentifiers": { "description": "Identifiers for a single sled.\n\nThis is intended primarily to be used in timeseries, to identify sled from which metric data originates.", "type": "object", diff --git a/sled-agent/Cargo.toml b/sled-agent/Cargo.toml index 10b4ba1cdb..88c758dc31 100644 --- a/sled-agent/Cargo.toml +++ b/sled-agent/Cargo.toml @@ -81,6 +81,7 @@ sha3.workspace = true sled-agent-api.workspace = true sled-agent-client.workspace = true sled-agent-types.workspace = true +sled-diagnostics.workspace = true sled-hardware.workspace = true sled-hardware-types.workspace = true sled-storage.workspace = true diff --git a/sled-agent/api/Cargo.toml 
b/sled-agent/api/Cargo.toml index 95e9552f53..e53dfb6948 100644 --- a/sled-agent/api/Cargo.toml +++ b/sled-agent/api/Cargo.toml @@ -19,4 +19,5 @@ schemars.workspace = true serde.workspace = true sled-agent-types.workspace = true sled-hardware-types.workspace = true +sled-diagnostics.workspace = true uuid.workspace = true diff --git a/sled-agent/api/src/lib.rs b/sled-agent/api/src/lib.rs index 634640079a..3012413a45 100644 --- a/sled-agent/api/src/lib.rs +++ b/sled-agent/api/src/lib.rs @@ -54,6 +54,7 @@ use sled_agent_types::{ ZoneBundleId, ZoneBundleMetadata, }, }; +use sled_diagnostics::SledDiagnosticsQueryOutput; use uuid::Uuid; #[dropshot::api_description] @@ -592,7 +593,7 @@ pub trait SledAgentApi { }] async fn support_zoneadm_info( request_context: RequestContext, - ) -> Result, HttpError>; + ) -> Result, HttpError>; #[endpoint { method = GET, @@ -600,7 +601,7 @@ pub trait SledAgentApi { }] async fn support_ipadm_info( request_context: RequestContext, - ) -> Result, HttpError>; + ) -> Result>, HttpError>; #[endpoint { method = GET, @@ -608,7 +609,23 @@ pub trait SledAgentApi { }] async fn support_dladm_info( request_context: RequestContext, - ) -> Result, HttpError>; + ) -> Result>, HttpError>; + + #[endpoint { + method = GET, + path = "/support/pargs-info", + }] + async fn support_pargs_info( + request_context: RequestContext, + ) -> Result>, HttpError>; + + #[endpoint { + method = GET, + path = "/support/pstack-info", + }] + async fn support_pstack_info( + request_context: RequestContext, + ) -> Result>, HttpError>; } #[derive(Clone, Debug, Deserialize, JsonSchema, Serialize)] diff --git a/sled-agent/src/http_entrypoints.rs b/sled-agent/src/http_entrypoints.rs index 844e13151a..824de9cd94 100644 --- a/sled-agent/src/http_entrypoints.rs +++ b/sled-agent/src/http_entrypoints.rs @@ -6,7 +6,6 @@ use super::sled_agent::SledAgent; use crate::sled_agent::Error as SledAgentError; -use crate::support_bundle::queries::SupportBundleCommandHttpOutput; use 
crate::zone_bundle::BundleError; use bootstore::schemes::v0::NetworkConfig; use camino::Utf8PathBuf; @@ -53,6 +52,9 @@ use sled_agent_types::zone_bundle::{ BundleUtilization, CleanupContext, CleanupCount, CleanupPeriod, StorageLimit, ZoneBundleId, ZoneBundleMetadata, }; +use sled_diagnostics::{ + SledDiagnosticsCommandHttpOutput, SledDiagnosticsQueryOutput, +}; use std::collections::BTreeMap; type SledApiDescription = ApiDescription; @@ -865,41 +867,65 @@ impl SledAgentApi for SledAgentImpl { async fn support_zoneadm_info( request_context: RequestContext, - ) -> Result, HttpError> { + ) -> Result, HttpError> { let sa = request_context.context(); let res = sa.support_zoneadm_info().await; - Ok(HttpResponseOk(FreeformBody(res.get_output().into()))) + Ok(HttpResponseOk(res.get_output())) } async fn support_ipadm_info( request_context: RequestContext, - ) -> Result, HttpError> { + ) -> Result>, HttpError> + { let sa = request_context.context(); - let output = sa - .support_ipadm_info() - .await - .into_iter() - .map(|cmd| cmd.get_output()) - .collect::>() - .as_slice() - .join("\n\n"); - - Ok(HttpResponseOk(FreeformBody(output.into()))) + Ok(HttpResponseOk( + sa.support_ipadm_info() + .await + .into_iter() + .map(|cmd| cmd.get_output()) + .collect::>(), + )) } async fn support_dladm_info( request_context: RequestContext, - ) -> Result, HttpError> { + ) -> Result>, HttpError> + { let sa = request_context.context(); - let output = sa - .support_dladm_info() - .await - .into_iter() - .map(|cmd| cmd.get_output()) - .collect::>() - .as_slice() - .join("\n\n"); + Ok(HttpResponseOk( + sa.support_dladm_info() + .await + .into_iter() + .map(|cmd| cmd.get_output()) + .collect::>(), + )) + } - Ok(HttpResponseOk(FreeformBody(output.into()))) + async fn support_pargs_info( + request_context: RequestContext, + ) -> Result>, HttpError> + { + let sa = request_context.context(); + Ok(HttpResponseOk( + sa.support_pargs_info() + .await + .into_iter() + .map(|cmd| cmd.get_output()) + 
.collect::>(), + )) + } + + async fn support_pstack_info( + request_context: RequestContext, + ) -> Result>, HttpError> + { + let sa = request_context.context(); + Ok(HttpResponseOk( + sa.support_pstack_info() + .await + .into_iter() + .map(|cmd| cmd.get_output()) + .collect::>(), + )) } } diff --git a/sled-agent/src/sim/http_entrypoints.rs b/sled-agent/src/sim/http_entrypoints.rs index 2d23f9150b..f48becb6a0 100644 --- a/sled-agent/src/sim/http_entrypoints.rs +++ b/sled-agent/src/sim/http_entrypoints.rs @@ -58,6 +58,7 @@ use sled_agent_types::zone_bundle::CleanupContext; use sled_agent_types::zone_bundle::CleanupCount; use sled_agent_types::zone_bundle::ZoneBundleId; use sled_agent_types::zone_bundle::ZoneBundleMetadata; +use sled_diagnostics::SledDiagnosticsQueryOutput; use std::collections::BTreeMap; use std::sync::Arc; @@ -638,19 +639,35 @@ impl SledAgentApi for SledAgentSimImpl { async fn support_zoneadm_info( _request_context: RequestContext, - ) -> Result, HttpError> { + ) -> Result, HttpError> { method_unimplemented() } async fn support_ipadm_info( _request_context: RequestContext, - ) -> Result, HttpError> { + ) -> Result>, HttpError> + { method_unimplemented() } async fn support_dladm_info( _request_context: RequestContext, - ) -> Result, HttpError> { + ) -> Result>, HttpError> + { + method_unimplemented() + } + + async fn support_pargs_info( + _request_context: RequestContext, + ) -> Result>, HttpError> + { + method_unimplemented() + } + + async fn support_pstack_info( + _request_context: RequestContext, + ) -> Result>, HttpError> + { method_unimplemented() } } diff --git a/sled-agent/src/sled_agent.rs b/sled-agent/src/sled_agent.rs index 80dbe72ea3..d0a653268e 100644 --- a/sled-agent/src/sled_agent.rs +++ b/sled-agent/src/sled_agent.rs @@ -19,10 +19,6 @@ use crate::params::OmicronZoneTypeExt; use crate::probe_manager::ProbeManager; use crate::services::{self, ServiceManager}; use crate::storage_monitor::StorageMonitorHandle; -use 
crate::support_bundle::queries::{ - dladm_info, ipadm_info, zoneadm_info, SupportBundleCmdError, - SupportBundleCmdOutput, -}; use crate::support_bundle::storage::SupportBundleManager; use crate::updates::{ConfigUpdates, UpdateManager}; use crate::vmm_reservoir::{ReservoirMode, VmmReservoirManager}; @@ -76,6 +72,7 @@ use sled_agent_types::zone_bundle::{ BundleUtilization, CleanupContext, CleanupCount, CleanupPeriod, PriorityOrder, StorageLimit, ZoneBundleMetadata, }; +use sled_diagnostics::{SledDiagnosticsCmdError, SledDiagnosticsCmdOutput}; use sled_hardware::{underlay, HardwareManager}; use sled_hardware_types::underlay::BootstrapInterface; use sled_hardware_types::Baseboard; @@ -1367,20 +1364,32 @@ impl SledAgent { pub(crate) async fn support_zoneadm_info( &self, - ) -> Result { - zoneadm_info().await + ) -> Result { + sled_diagnostics::zoneadm_info().await } pub(crate) async fn support_ipadm_info( &self, - ) -> Vec> { - ipadm_info().await + ) -> Vec> { + sled_diagnostics::ipadm_info().await } pub(crate) async fn support_dladm_info( &self, - ) -> Vec> { - dladm_info().await + ) -> Vec> { + sled_diagnostics::dladm_info().await + } + + pub(crate) async fn support_pargs_info( + &self, + ) -> Vec> { + sled_diagnostics::pargs_oxide_processes(&self.log).await + } + + pub(crate) async fn support_pstack_info( + &self, + ) -> Vec> { + sled_diagnostics::pstack_oxide_processes(&self.log).await } } diff --git a/sled-agent/src/support_bundle/mod.rs b/sled-agent/src/support_bundle/mod.rs index 314edfaec8..a1c4942751 100644 --- a/sled-agent/src/support_bundle/mod.rs +++ b/sled-agent/src/support_bundle/mod.rs @@ -2,5 +2,4 @@ // License, v. 2.0. If a copy of the MPL was not distributed with this // file, You can obtain one at https://mozilla.org/MPL/2.0/. 
-pub mod queries; pub mod storage; diff --git a/sled-diagnostics/.gitignore b/sled-diagnostics/.gitignore new file mode 100644 index 0000000000..ea8c4bf7f3 --- /dev/null +++ b/sled-diagnostics/.gitignore @@ -0,0 +1 @@ +/target diff --git a/sled-diagnostics/Cargo.toml b/sled-diagnostics/Cargo.toml new file mode 100644 index 0000000000..afbe642d76 --- /dev/null +++ b/sled-diagnostics/Cargo.toml @@ -0,0 +1,19 @@ +[package] +name = "sled-diagnostics" +version = "0.1.0" +edition = "2021" + +[lints] +workspace = true + +[dependencies] +cfg-if.workspace = true +fs-err.workspace = true +futures.workspace = true +libc.workspace = true +omicron-workspace-hack.workspace = true +schemars.workspace = true +serde.workspace = true +slog.workspace = true +thiserror.workspace = true +tokio = { workspace = true, features = ["full"] } diff --git a/sled-diagnostics/src/contract.rs b/sled-diagnostics/src/contract.rs new file mode 100644 index 0000000000..8b090b3193 --- /dev/null +++ b/sled-diagnostics/src/contract.rs @@ -0,0 +1,173 @@ +use fs_err as fs; +use libc::{c_char, c_int, c_void, pid_t}; +use slog::{warn, Logger}; +use thiserror::Error; + +use std::{ + collections::BTreeSet, + ffi::{CStr, CString}, + os::fd::AsRawFd, + path::Path, +}; + +const CT_ALL: &str = "/system/contract/all"; +const OXIDE_FMRI: &str = "svc:/oxide/"; +const CTD_ALL: i32 = 2; + +#[allow(non_camel_case_types)] +type ct_stathdl_t = *mut c_void; + +#[link(name = "contract")] +extern "C" { + fn ct_status_read( + fd: c_int, + detail: c_int, + stathdlp: *mut ct_stathdl_t, + ) -> c_int; + fn ct_status_free(stathdlp: ct_stathdl_t); + fn ct_status_get_id(stathdlp: ct_stathdl_t) -> i32; + fn ct_pr_status_get_members( + stathdlp: ct_stathdl_t, + pidpp: *mut *mut pid_t, + n: *mut u32, + ) -> c_int; + fn ct_pr_status_get_svc_fmri( + stathdlp: ct_stathdl_t, + fmri: *mut *mut c_char, + ) -> c_int; +} + +#[derive(Error, Debug)] +pub enum ContractError { + #[error(transparent)] + FileIo(#[from] std::io::Error), + #[error( + 
"Failed to call ct_pr_status_get_svc_fmri for contract {ctid}: {error}" + )] + Fmri { ctid: i32, error: std::io::Error }, + #[error( + "Failed to call ct_pr_status_get_members for contract {ctid}: {error}" + )] + Members { ctid: i32, error: std::io::Error }, + #[error("ct_status_read returned successfully but handed back a null ptr for {0}")] + Null(std::path::PathBuf), + #[error("Failed to call ct_status_read on {path}: {error}")] + StatusRead { path: std::path::PathBuf, error: std::io::Error }, +} + +pub struct ContractStatus { + handle: ct_stathdl_t, +} + +impl Drop for ContractStatus { + fn drop(&mut self) { + unsafe { ct_status_free(self.handle) }; + } +} + +macro_rules! libcall_io { + ($fn: ident ( $($arg: expr), * $(,)*) ) => {{ + let res = unsafe { $fn($($arg, )*) }; + if res == 0 { + Ok(res) + } else { + // libcontract returns the errno value itself; errno may be stale. + Err(std::io::Error::from_raw_os_error(res)) + } + }}; + } + +impl ContractStatus { + fn new(contract_status: &Path) -> Result { + let file = fs::File::open(contract_status)?; + let mut handle: ct_stathdl_t = std::ptr::null_mut(); + libcall_io!(ct_status_read(file.as_raw_fd(), CTD_ALL, &mut handle,)) + .map_err(|error| ContractError::StatusRead { + path: contract_status.to_path_buf(), + error, + })?; + + // We don't ever expect the system to hand back a null ptr when + // returning success but let's be extra cautious anyways. 
+ if handle.is_null() { + return Err(ContractError::Null(contract_status.to_path_buf())); + } + + Ok(Self { handle }) + } + + fn get_members(&self) -> Result<&[i32], ContractError> { + let mut numpids = 0; + let mut pids: *mut pid_t = std::ptr::null_mut(); + + let pids = { + libcall_io!(ct_pr_status_get_members( + self.handle, + &mut pids, + &mut numpids, + )) + .map_err(|error| { + let ctid = unsafe { ct_status_get_id(self.handle) }; + ContractError::Members { ctid, error } + })?; + + unsafe { + if pids.is_null() { + &[] + } else { + std::slice::from_raw_parts(pids, numpids as usize) + } + } + }; + + Ok(pids) + } + + fn get_fmri(&self) -> Result, ContractError> { + // The lifetime of this string is tied to the lifetime of the status + // handle itself and will be cleaned up when the handle is freed. + let mut ptr: *mut c_char = std::ptr::null_mut(); + libcall_io!(ct_pr_status_get_svc_fmri(self.handle, &mut ptr)).map_err( + |error| { + let ctid = unsafe { ct_status_get_id(self.handle) }; + ContractError::Fmri { ctid, error } + }, + )?; + + if ptr.is_null() { + return Ok(None); + } + + let cstr = unsafe { CStr::from_ptr(ptr) }; + Ok(Some(cstr.to_owned())) + } +} + +pub fn find_oxide_pids(log: &Logger) -> Result, ContractError> { + let mut pids = BTreeSet::new(); + let ents = fs::read_dir(CT_ALL)?; + for ct in ents { + let ctid = ct?; + let mut path = ctid.path(); + path.push("status"); + + let status = match ContractStatus::new(path.as_path()) { + Ok(status) => status, + Err(e) => { + // There's a race between the time we find the contracts to the + // time we attempt to read the contract's status. We can safely + // skip all of the errors for diagnostics purposes but we should + // leave a log in our wake. 
+ warn!(log, "Failed to read contract ({:?}): {}", path, e); + continue; + } + }; + + let fmri = status.get_fmri()?.unwrap_or_default(); + if fmri.to_string_lossy().starts_with(OXIDE_FMRI) { + pids.extend(status.get_members()?); + } + } + + Ok(pids) +} diff --git a/sled-diagnostics/src/contract_stub.rs b/sled-diagnostics/src/contract_stub.rs new file mode 100644 index 0000000000..9637c3486d --- /dev/null +++ b/sled-diagnostics/src/contract_stub.rs @@ -0,0 +1,18 @@ +//! Stub implementation for platforms without libcontract(3lib). + +use std::collections::BTreeSet; + +use slog::{warn, Logger}; +use thiserror::Error; + +#[derive(Error, Debug)] +pub enum ContractError {} + +pub fn find_oxide_pids(log: &Logger) -> Result, ContractError> { + warn!( + log, + "Unable to find oxide pids on a non illumos platform, \ + returning empty set" + ); + Ok(BTreeSet::new()) +} diff --git a/sled-diagnostics/src/lib.rs b/sled-diagnostics/src/lib.rs new file mode 100644 index 0000000000..fb596977f1 --- /dev/null +++ b/sled-diagnostics/src/lib.rs @@ -0,0 +1,104 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! Diagnostics for an Oxide sled that exposes common support commands. + +use futures::{stream::FuturesUnordered, StreamExt}; +use slog::Logger; + +cfg_if::cfg_if! { + if #[cfg(target_os = "illumos")] { + mod contract; + } else { + mod contract_stub; + use contract_stub as contract; + } +} + +mod queries; +pub use crate::queries::{ + SledDiagnosticsCmdError, SledDiagnosticsCmdOutput, + SledDiagnosticsCommandHttpOutput, SledDiagnosticsQueryOutput, +}; +use queries::*; + +/// List all zones on a sled. +pub async fn zoneadm_info( +) -> Result { + execute_command_with_timeout(zoneadm_list(), DEFAULT_TIMEOUT).await +} + +/// Retrieve various `ipadm` command output for the system. 
+pub async fn ipadm_info( +) -> Vec> { + [ipadm_show_interface(), ipadm_show_addr(), ipadm_show_prop()] + .into_iter() + .map(|c| async move { + execute_command_with_timeout(c, DEFAULT_TIMEOUT).await + }) + .collect::>() + .collect::>>() + .await +} + +/// Retrieve various `dladm` command output for the system. +pub async fn dladm_info( +) -> Vec> { + [ + dladm_show_phys(), + dladm_show_ether(), + dladm_show_link(), + dladm_show_vnic(), + dladm_show_linkprop(), + ] + .into_iter() + .map(|c| async move { + execute_command_with_timeout(c, DEFAULT_TIMEOUT).await + }) + .collect::>() + .collect::>>() + .await +} + +pub async fn pargs_oxide_processes( + log: &Logger, +) -> Vec> { + // In a diagnostics context we care about looping over every pid we find, + // but on failure we should just return a single error in a vec that + // represents the entire failed operation. + let pids = match contract::find_oxide_pids(log) { + Ok(pids) => pids, + Err(e) => return vec![Err(e.into())], + }; + + pids.iter() + .map(|pid| pargs_process(*pid)) + .map(|c| async move { + execute_command_with_timeout(c, DEFAULT_TIMEOUT).await + }) + .collect::>() + .collect::>>() + .await +} + +pub async fn pstack_oxide_processes( + log: &Logger, +) -> Vec> { + // In a diagnostics context we care about looping over every pid we find, + // but on failure we should just return a single error in a vec that + // represents the entire failed operation. 
+ let pids = match contract::find_oxide_pids(log) { + Ok(pids) => pids, + Err(e) => return vec![Err(e.into())], + }; + + pids.iter() + .map(|pid| pstack_process(*pid)) + .map(|c| async move { + execute_command_with_timeout(c, DEFAULT_TIMEOUT).await + }) + .collect::>() + .collect::>>() + .await +} diff --git a/sled-agent/src/support_bundle/queries.rs b/sled-diagnostics/src/queries.rs similarity index 58% rename from sled-agent/src/support_bundle/queries.rs rename to sled-diagnostics/src/queries.rs index 2313d9e08d..5ee2650406 100644 --- a/sled-agent/src/support_bundle/queries.rs +++ b/sled-diagnostics/src/queries.rs @@ -1,18 +1,62 @@ -use std::{process::Command, time::Duration}; +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. -use futures::{stream::FuturesUnordered, StreamExt}; -use illumos_utils::{dladm::DLADM, zone::IPADM, PFEXEC, ZONEADM}; +//! Wrapper for command execution with timeout. 
+ +use std::{ + process::{Command, ExitStatus}, + time::Duration, +}; + +use schemars::JsonSchema; +use serde::{Deserialize, Serialize}; use thiserror::Error; use tokio::io::AsyncReadExt; -const DEFAULT_TIMEOUT: Duration = Duration::from_secs(10); +#[cfg(target_os = "illumos")] +use crate::contract::ContractError; + +#[cfg(not(target_os = "illumos"))] +use crate::contract_stub::ContractError; -pub trait SupportBundleCommandHttpOutput { - fn get_output(self) -> String; +const DLADM: &str = "/usr/sbin/dladm"; +const IPADM: &str = "/usr/sbin/ipadm"; +const PFEXEC: &str = "/usr/bin/pfexec"; +const PSTACK: &str = "/usr/bin/pstack"; +const PARGS: &str = "/usr/bin/pargs"; +const ZONEADM: &str = "/usr/sbin/zoneadm"; + +pub const DEFAULT_TIMEOUT: Duration = Duration::from_secs(10); + +pub trait SledDiagnosticsCommandHttpOutput { + fn get_output(self) -> SledDiagnosticsQueryOutput; +} + +#[derive(Clone, Debug, JsonSchema, Serialize, Deserialize)] +#[serde(rename_all = "snake_case")] +pub enum SledDiagnosticsQueryOutput { + Success { + /// The command and it's arguments. + command: String, + /// Any stdout/stderr produced by the command. + stdio: String, + /// The exit status of the command. This will be the exit code (if any) + /// and exit reason such as from a signal. + exit_status: String, + /// The exit code if one was present when the comman exited. + exit_code: Option, + }, + Failure { + /// The reason the command failed to execute. 
+ error: String, + }, } #[derive(Error, Debug)] -pub enum SupportBundleCmdError { +pub enum SledDiagnosticsCmdError { + #[error("libcontract error: {0}")] + Contract(#[from] ContractError), #[error("Failed to duplicate pipe for command [{command}]: {error}")] Dup { command: String, error: std::io::Error }, #[error("Failed to proccess output for command [{command}]: {error}")] @@ -32,27 +76,26 @@ pub enum SupportBundleCmdError { } #[derive(Debug)] -pub struct SupportBundleCmdOutput { +pub struct SledDiagnosticsCmdOutput { pub command: String, pub stdio: String, - pub exit_status: String, + pub exit_status: ExitStatus, } -impl std::fmt::Display for SupportBundleCmdOutput { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - writeln!(f, "Command executed [{}]:", self.command)?; - writeln!(f, " ==== stdio ====\n{}", self.stdio)?; - writeln!(f, " ==== exit status ====\n{}", self.exit_status) - } -} - -impl SupportBundleCommandHttpOutput - for Result +impl SledDiagnosticsCommandHttpOutput + for Result { - fn get_output(self) -> String { + fn get_output(self) -> SledDiagnosticsQueryOutput { match self { - Ok(output) => format!("{output}"), - Err(error) => format!("{error}"), + Ok(output) => SledDiagnosticsQueryOutput::Success { + command: output.command, + stdio: output.stdio, + exit_status: output.exit_status.to_string(), + exit_code: output.exit_status.code(), + }, + Err(error) => { + SledDiagnosticsQueryOutput::Failure { error: error.to_string() } + } } } } @@ -76,16 +119,19 @@ fn command_to_string(command: &Command) -> String { /// and stderr as they occur. 
async fn execute( cmd: Command, -) -> Result { +) -> Result { let cmd_string = command_to_string(&cmd); let (sender, mut rx) = tokio::net::unix::pipe::pipe().map_err(|e| { - SupportBundleCmdError::Pipe { command: cmd_string.clone(), error: e } + SledDiagnosticsCmdError::Pipe { command: cmd_string.clone(), error: e } })?; let pipe = sender.into_nonblocking_fd().map_err(|e| { - SupportBundleCmdError::OwnedFd { command: cmd_string.clone(), error: e } + SledDiagnosticsCmdError::OwnedFd { + command: cmd_string.clone(), + error: e, + } })?; let pipe_dup = pipe.try_clone().map_err(|e| { - SupportBundleCmdError::Dup { command: cmd_string.clone(), error: e } + SledDiagnosticsCmdError::Dup { command: cmd_string.clone(), error: e } })?; // TODO MTZ: We may eventually want to reuse some of the process contract @@ -95,9 +141,8 @@ async fn execute( cmd.stdout(pipe); cmd.stderr(pipe_dup); - let mut child = cmd.spawn().map_err(|e| SupportBundleCmdError::Spawn { - command: cmd_string.clone(), - error: e, + let mut child = cmd.spawn().map_err(|e| { + SledDiagnosticsCmdError::Spawn { command: cmd_string.clone(), error: e } })?; // NB: This drop call is load-bearing and prevents a deadlock. 
The command // struct holds onto the write half of the pipe preventing the read side @@ -107,130 +152,99 @@ async fn execute( let mut stdio = String::new(); rx.read_to_string(&mut stdio).await.map_err(|e| { - SupportBundleCmdError::Output { command: cmd_string.clone(), error: e } + SledDiagnosticsCmdError::Output { + command: cmd_string.clone(), + error: e, + } })?; - let exit_status = - child.wait().await.map(|es| format!("{es}")).map_err(|e| { - SupportBundleCmdError::Wait { - command: cmd_string.clone(), - error: e, - } - })?; + let exit_status = child.wait().await.map_err(|e| { + SledDiagnosticsCmdError::Wait { command: cmd_string.clone(), error: e } + })?; - Ok(SupportBundleCmdOutput { command: cmd_string, stdio, exit_status }) + Ok(SledDiagnosticsCmdOutput { command: cmd_string, stdio, exit_status }) } /// Spawn a command that's allowed to execute within a given time limit. -async fn execute_command_with_timeout( +pub async fn execute_command_with_timeout( command: Command, duration: Duration, -) -> Result { +) -> Result { let cmd_string = command_to_string(&command); let tokio_command = execute(command); match tokio::time::timeout(duration, tokio_command).await { Ok(res) => res, - Err(_elapsed) => Err(SupportBundleCmdError::Timeout { + Err(_elapsed) => Err(SledDiagnosticsCmdError::Timeout { command: cmd_string, duration, }), } } -fn zoneadm_list() -> Command { +pub fn zoneadm_list() -> Command { let mut cmd = std::process::Command::new(PFEXEC); cmd.env_clear().arg(ZONEADM).arg("list").arg("-cip"); cmd } -fn ipadm_show_interface() -> Command { +pub fn ipadm_show_interface() -> Command { let mut cmd = std::process::Command::new(PFEXEC); cmd.env_clear().arg(IPADM).arg("show-if"); cmd } -fn ipadm_show_addr() -> Command { +pub fn ipadm_show_addr() -> Command { let mut cmd = std::process::Command::new(PFEXEC); cmd.env_clear().arg(IPADM).arg("show-addr"); cmd } -fn ipadm_show_prop() -> Command { +pub fn ipadm_show_prop() -> Command { let mut cmd = 
std::process::Command::new(PFEXEC); cmd.env_clear().arg(IPADM).arg("show-prop"); cmd } -fn dladm_show_phys() -> Command { +pub fn dladm_show_phys() -> Command { let mut cmd = std::process::Command::new(PFEXEC); cmd.env_clear().arg(DLADM).args(["show-phys", "-m"]); cmd } -fn dladm_show_ether() -> Command { +pub fn dladm_show_ether() -> Command { let mut cmd = std::process::Command::new(PFEXEC); cmd.env_clear().arg(DLADM).arg("show-ether"); cmd } -fn dladm_show_link() -> Command { +pub fn dladm_show_link() -> Command { let mut cmd = std::process::Command::new(PFEXEC); cmd.env_clear().arg(DLADM).arg("show-link"); cmd } -fn dladm_show_vnic() -> Command { +pub fn dladm_show_vnic() -> Command { let mut cmd = std::process::Command::new(PFEXEC); cmd.env_clear().arg(DLADM).arg("show-vnic"); cmd } -fn dladm_show_linkprop() -> Command { +pub fn dladm_show_linkprop() -> Command { let mut cmd = std::process::Command::new(PFEXEC); cmd.env_clear().arg(DLADM).arg("show-linkprop"); cmd } -/* - * Public API - */ - -/// List all zones on a sled. -pub async fn zoneadm_info( -) -> Result { - execute_command_with_timeout(zoneadm_list(), DEFAULT_TIMEOUT).await -} - -/// Retrieve various `ipadm` command output for the system. -pub async fn ipadm_info( -) -> Vec> { - [ipadm_show_interface(), ipadm_show_addr(), ipadm_show_prop()] - .into_iter() - .map(|c| async move { - execute_command_with_timeout(c, DEFAULT_TIMEOUT).await - }) - .collect::>() - .collect::>>() - .await +pub fn pargs_process(pid: i32) -> Command { + let mut cmd = std::process::Command::new(PFEXEC); + cmd.env_clear().arg(PARGS).arg("-ae").arg(pid.to_string()); + cmd } -/// Retrieve various `dladm` command output for the system. 
-pub async fn dladm_info( -) -> Vec> { - [ - dladm_show_phys(), - dladm_show_ether(), - dladm_show_link(), - dladm_show_vnic(), - dladm_show_linkprop(), - ] - .into_iter() - .map(|c| async move { - execute_command_with_timeout(c, DEFAULT_TIMEOUT).await - }) - .collect::>() - .collect::>>() - .await +pub fn pstack_process(pid: i32) -> Command { + let mut cmd = std::process::Command::new(PFEXEC); + cmd.env_clear().arg(PSTACK).arg(pid.to_string()); + cmd } #[cfg(test)] @@ -246,7 +260,7 @@ mod test { match execute_command_with_timeout(command, Duration::from_millis(500)) .await { - Err(SupportBundleCmdError::Timeout { .. }) => (), + Err(SledDiagnosticsCmdError::Timeout { .. }) => (), _ => panic!("command should have timed out"), } } @@ -267,7 +281,7 @@ mod test { #[tokio::test] async fn test_command_stderr_is_correct() { let mut command = Command::new("bash"); - command.env_clear().args(&["-c", "echo oxide computer > /dev/stderr"]); + command.env_clear().args(["-c", "echo oxide computer > /dev/stderr"]); let res = execute_command_with_timeout(command, Duration::from_secs(5)) .await @@ -279,7 +293,7 @@ mod test { #[tokio::test] async fn test_command_stdout_stderr_are_interleaved() { let mut command = Command::new("bash"); - command.env_clear().args(&[ + command.env_clear().args([ "-c", "echo one > /dev/stdout \ && echo two > /dev/stderr \