Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add support for finding oxide processes on a sled #7194

Open
wants to merge 9 commits into
base: spr/papertigers/main.add-support-for-finding-oxide-processes-on-a-sled
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

42 changes: 42 additions & 0 deletions openapi/sled-agent.json
Original file line number Diff line number Diff line change
Expand Up @@ -715,6 +715,48 @@
}
}
},
"/support/pargs-info": {
"get": {
"operationId": "support_pargs_info",
"responses": {
"200": {
"description": "",
"content": {
"*/*": {
"schema": {}
}
}
},
"4XX": {
"$ref": "#/components/responses/Error"
},
"5XX": {
"$ref": "#/components/responses/Error"
}
}
}
},
"/support/pstack-info": {
"get": {
"operationId": "support_pstack_info",
"responses": {
"200": {
"description": "",
"content": {
"*/*": {
"schema": {}
}
}
},
"4XX": {
"$ref": "#/components/responses/Error"
},
"5XX": {
"$ref": "#/components/responses/Error"
}
}
}
},
"/support/zoneadm-info": {
"get": {
"operationId": "support_zoneadm_info",
Expand Down
16 changes: 16 additions & 0 deletions sled-agent/api/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -609,6 +609,22 @@ pub trait SledAgentApi {
async fn support_dladm_info(
request_context: RequestContext<Self::Context>,
) -> Result<HttpResponseOk<FreeformBody>, HttpError>;

// Support endpoint: `GET /support/pargs-info`.
// Responds with a freeform (schema-less) body on success.
#[endpoint {
method = GET,
path = "/support/pargs-info",
}]
async fn support_pargs_info(
request_context: RequestContext<Self::Context>,
) -> Result<HttpResponseOk<FreeformBody>, HttpError>;

// Support endpoint: `GET /support/pstack-info`.
// Responds with a freeform (schema-less) body on success.
#[endpoint {
method = GET,
path = "/support/pstack-info",
}]
async fn support_pstack_info(
request_context: RequestContext<Self::Context>,
) -> Result<HttpResponseOk<FreeformBody>, HttpError>;
}

#[derive(Clone, Debug, Deserialize, JsonSchema, Serialize)]
Expand Down
32 changes: 32 additions & 0 deletions sled-agent/src/http_entrypoints.rs
Original file line number Diff line number Diff line change
Expand Up @@ -902,4 +902,36 @@ impl SledAgentApi for SledAgentImpl {

Ok(HttpResponseOk(FreeformBody(output.into())))
}

// Handler for `/support/pargs-info`: asks the sled agent for its pargs
// diagnostics, turns every command result into text via `get_output()`, and
// returns all of them in one freeform body separated by blank lines.
async fn support_pargs_info(
    request_context: RequestContext<Self::Context>,
) -> Result<HttpResponseOk<FreeformBody>, HttpError> {
    let results = request_context.context().support_pargs_info().await;

    let sections: Vec<_> =
        results.into_iter().map(|cmd| cmd.get_output()).collect();
    let body = sections.join("\n\n");

    Ok(HttpResponseOk(FreeformBody(body.into())))
}

// Handler for `/support/pstack-info`: asks the sled agent for its pstack
// diagnostics, turns every command result into text via `get_output()`, and
// returns all of them in one freeform body separated by blank lines.
async fn support_pstack_info(
    request_context: RequestContext<Self::Context>,
) -> Result<HttpResponseOk<FreeformBody>, HttpError> {
    let results = request_context.context().support_pstack_info().await;

    let sections: Vec<_> =
        results.into_iter().map(|cmd| cmd.get_output()).collect();
    let body = sections.join("\n\n");

    Ok(HttpResponseOk(FreeformBody(body.into())))
}
}
12 changes: 12 additions & 0 deletions sled-agent/src/sim/http_entrypoints.rs
Original file line number Diff line number Diff line change
Expand Up @@ -653,6 +653,18 @@ impl SledAgentApi for SledAgentSimImpl {
) -> Result<HttpResponseOk<FreeformBody>, HttpError> {
method_unimplemented()
}

// The simulated sled agent does not provide pargs diagnostics; the request
// context is ignored and the result of `method_unimplemented()` is returned.
async fn support_pargs_info(
_request_context: RequestContext<Self::Context>,
) -> Result<HttpResponseOk<FreeformBody>, HttpError> {
method_unimplemented()
}

// The simulated sled agent does not provide pstack diagnostics; the request
// context is ignored and the result of `method_unimplemented()` is returned.
async fn support_pstack_info(
_request_context: RequestContext<Self::Context>,
) -> Result<HttpResponseOk<FreeformBody>, HttpError> {
method_unimplemented()
}
}

fn method_unimplemented<T>() -> Result<T, HttpError> {
Expand Down
12 changes: 12 additions & 0 deletions sled-agent/src/sled_agent.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1379,6 +1379,18 @@ impl SledAgent {
) -> Vec<Result<SledDiagnosticsCmdOutput, SledDiagnosticsCmdError>> {
sled_diagnostics::dladm_info().await
}

/// Collects pargs diagnostics by delegating to
/// `sled_diagnostics::pargs_oxide_processes`, passing this agent's logger.
///
/// Returns one `Result` per diagnostic command; failures are reported
/// per-entry rather than aborting the whole collection.
pub(crate) async fn support_pargs_info(
&self,
) -> Vec<Result<SledDiagnosticsCmdOutput, SledDiagnosticsCmdError>> {
sled_diagnostics::pargs_oxide_processes(&self.log).await
}

/// Collects pstack diagnostics by delegating to
/// `sled_diagnostics::pstack_oxide_processes`, passing this agent's logger.
///
/// Returns one `Result` per diagnostic command; failures are reported
/// per-entry rather than aborting the whole collection.
pub(crate) async fn support_pstack_info(
&self,
) -> Vec<Result<SledDiagnosticsCmdOutput, SledDiagnosticsCmdError>> {
sled_diagnostics::pstack_oxide_processes(&self.log).await
}
}

#[derive(From, thiserror::Error, Debug)]
Expand Down
4 changes: 4 additions & 0 deletions sled-diagnostics/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,11 @@ edition = "2021"
workspace = true

[dependencies]
cfg-if.workspace = true
fs-err.workspace = true
futures.workspace = true
libc.workspace = true
omicron-workspace-hack.workspace = true
slog.workspace = true
thiserror.workspace = true
tokio = { workspace = true, features = ["full"] }
182 changes: 182 additions & 0 deletions sled-diagnostics/src/contract.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,182 @@
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

//! Bindings to libcontract(3lib).

use fs_err as fs;
use libc::{c_char, c_int, c_void, pid_t};
use slog::{warn, Logger};
use thiserror::Error;

use std::{
collections::BTreeSet,
ffi::{CStr, CString},
os::fd::AsRawFd,
path::Path,
};

const CT_ALL: &str = "/system/contract/all";
// Most Oxide services
const OXIDE_FMRI: &str = "svc:/oxide/";
// NB: Used for propolis zones
const ILLUMOS_FMRI: &str = "svc:/system/illumos/";
const CTD_ALL: i32 = 2;

#[allow(non_camel_case_types)]
type ct_stathdl_t = *mut c_void;

// Raw declarations for the subset of libcontract(3lib) used by this module.
// Within this file the functions returning `c_int` are invoked through
// `libcall_io!`, which treats a return value of 0 as success.
#[link(name = "contract")]
extern "C" {
// Reads the status of the contract behind `fd` and stores a status handle
// in `*stathdlp`.
fn ct_status_read(
fd: c_int,
detail: c_int,
stathdlp: *mut ct_stathdl_t,
) -> c_int;
// Releases a status handle; called from `Drop for ContractStatus`.
fn ct_status_free(stathdlp: ct_stathdl_t);
// Returns the contract id for a status handle; used here when building
// error values.
fn ct_status_get_id(stathdlp: ct_stathdl_t) -> i32;
// Hands back a pointer to the member pid array and its length through
// `pidpp` and `n`.
// NOTE(review): presumably the array is owned by the status handle, like
// the FMRI buffer below -- confirm against the man page.
fn ct_pr_status_get_members(
stathdlp: ct_stathdl_t,
pidpp: *mut *mut pid_t,
n: *mut u32,
) -> c_int;
// Hands back a pointer to the service FMRI string through `fmri`. Per the
// man page excerpt quoted in review, that buffer is freed by
// `ct_status_free()` and must not be modified.
fn ct_pr_status_get_svc_fmri(
stathdlp: ct_stathdl_t,
fmri: *mut *mut c_char,
) -> c_int;
}

/// Errors produced while reading contract status information.
#[derive(Error, Debug)]
pub enum ContractError {
/// A plain I/O error; any `std::io::Error` converts into this variant via
/// `From`, so `?` on filesystem calls lands here.
#[error(transparent)]
FileIo(#[from] std::io::Error),
/// `ct_pr_status_get_svc_fmri` failed for the contract with id `ctid`.
#[error(
"Failed to call ct_pr_status_get_svc_fmri for contract {ctid}: {error}"
)]
Fmri { ctid: i32, error: std::io::Error },
/// `ct_pr_status_get_members` failed for the contract with id `ctid`.
#[error(
"Failed to call ct_pr_status_get_members for contract {ctid}: {error}"
)]
Members { ctid: i32, error: std::io::Error },
/// `ct_status_read` reported success for the given status path but left
/// the status handle null.
#[error("ct_status_read returned successfully but handed back a null ptr for {0}")]
Null(std::path::PathBuf),
/// `ct_status_read` failed for the status file at `path`.
#[error("Failed to call ct_status_read on {path}: {error}")]
StatusRead { path: std::path::PathBuf, error: std::io::Error },
}

/// Owning wrapper around a libcontract status handle.
///
/// The handle is obtained from `ct_status_read` in `ContractStatus::new` and
/// released with `ct_status_free` when the value is dropped.
pub struct ContractStatus {
// Raw status handle; `new` only stores it after checking it is non-null.
handle: ct_stathdl_t,
}

// Releases the underlying status handle when a `ContractStatus` goes away.
impl Drop for ContractStatus {
fn drop(&mut self) {
// SAFETY: `ContractStatus::new` only stores a handle after
// `ct_status_read` succeeded and the pointer was checked to be non-null,
// and this is the only place in this file where the handle is freed.
unsafe { ct_status_free(self.handle) };
}
}

/// Calls a libcontract function and converts its return code into an
/// `io::Result`.
///
/// A return value of `0` yields `Ok(0)`. Any other value is treated as the
/// error number itself: the libcontract routines report failure through
/// their return value, not through `errno`. The error is therefore built
/// with `from_raw_os_error(res)`, because `last_os_error()` could pick up a
/// stale or unrelated `errno`.
///
/// The call is made inside an `unsafe` block; the caller is responsible for
/// passing arguments that satisfy the foreign function's contract.
macro_rules! libcall_io {
    ($fn: ident ( $($arg: expr), * $(,)*) ) => {{
        let res = unsafe { $fn($($arg, )*) };
        if res == 0 {
            Ok(res)
        } else {
            Err(std::io::Error::from_raw_os_error(res))
        }
    }};
}

impl ContractStatus {
fn new(contract_status: &Path) -> Result<Self, ContractError> {
let file = fs::File::open(contract_status)?;
let mut handle: ct_stathdl_t = std::ptr::null_mut();
libcall_io!(ct_status_read(file.as_raw_fd(), CTD_ALL, &mut handle,))
.map_err(|error| ContractError::StatusRead {
path: contract_status.to_path_buf(),
error,
})?;

// We don't ever expect the system to hand back a null ptr when
// returning success but let's be extra cautious anyways.
if handle.is_null() {
return Err(ContractError::Null(contract_status.to_path_buf()));
}

Ok(Self { handle })
}

fn get_members(&self) -> Result<&[i32], ContractError> {
let mut numpids = 0;
let mut pids: *mut pid_t = std::ptr::null_mut();

let pids = {
libcall_io!(ct_pr_status_get_members(
self.handle,
&mut pids,
&mut numpids,
))
.map_err(|error| {
let ctid = unsafe { ct_status_get_id(self.handle) };
ContractError::Members { ctid, error }
})?;

unsafe {
if pids.is_null() {
&[]
} else {
std::slice::from_raw_parts(pids, numpids as usize)
}
}
};

Ok(pids)
}

fn get_fmri(&self) -> Result<Option<CString>, ContractError> {
// The lifetime of this string is tied to the lifetime of the status
// handle itself and will be cleaned up when the handle is freed.
let mut ptr: *mut c_char = std::ptr::null_mut();
libcall_io!(ct_pr_status_get_svc_fmri(self.handle, &mut ptr)).map_err(
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

From https://illumos.org/man/3CONTRACT/ct_pr_status_get_svc_creator#:~:text=The%20ct_pr_status_get_svc_fmri(),not%20be%20modified

  The ct_pr_status_get_svc_fmri(), ct_pr_status_get_svc_creator(), and
  ct_pr_status_get_svc_aux() functions read, respectively, the service
  FMRI, the contract's creator execname and  the creator's auxiliary field.
  The buffer pointed to by fmri, aux or creator, is freed by a call to
  ct_status_free() and should not be modified.

I'm under the impression that the ptr value here must be freed via a call to ct_status_free - is that right? or is this just saying that the lifetime of the returned result must be shorter than ContractStatus?

If it's the former (the ptr value must be freed explicitly): Are we leaking?
If it's the latter (the ptr values lives as long as ContractStatus): Is this code unsound? Would it be possible to keep using the returned CString after self has been dropped?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The ptr returned is tied to the lifetime of the contract status handle ct_stathdl_t. We read that pointer as a cstr but the function returns a CString which copies the data to own it. If I understand correctly the ptr returned from the call to ct_pr_status_get_svc_fmri will not leave anything around because it will be cleaned up when ct_status_free is called via the drop method on ContractStatus.

I will verify this, but this was at least my intention upon writing this code.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ah, gotcha, so:

        let cstr = unsafe { CStr::from_ptr(ptr) }

Would create a cstr that needs to have a shorter lifetime than the handle, but:

        cstr.to_owned()

Would clone it, and free us from that lifetime constraint when the function returns

|error| {
let ctid = unsafe { ct_status_get_id(self.handle) };
ContractError::Fmri { ctid, error }
},
)?;

if ptr.is_null() {
return Ok(None);
}

let cstr = unsafe { CStr::from_ptr(ptr) };
Ok(Some(cstr.to_owned()))
}
}

/// Collects the pids of every process belonging to a contract whose service
/// FMRI starts with `OXIDE_FMRI` or `ILLUMOS_FMRI`.
///
/// Every entry under `CT_ALL` is visited and its `status` file is read. A
/// contract whose status cannot be read is logged and skipped; a contract
/// without an FMRI is treated as having an empty one and never matches.
///
/// # Errors
///
/// Fails if the contract directory cannot be listed, or if looking up the
/// FMRI or the members of a successfully opened contract fails.
pub fn find_oxide_pids(log: &Logger) -> Result<BTreeSet<i32>, ContractError> {
    let mut found = BTreeSet::new();

    for entry in fs::read_dir(CT_ALL)? {
        let status_path = entry?.path().join("status");

        let status = match ContractStatus::new(&status_path) {
            Ok(status) => status,
            Err(e) => {
                // Contracts can vanish between listing the directory and
                // reading their status. For diagnostics it is fine to skip
                // such failures, as long as we leave a log entry behind.
                warn!(log, "Failed to read contract ({:?}): {}", status_path, e);
                continue;
            }
        };

        let raw_fmri = status.get_fmri()?.unwrap_or_default();
        let fmri = raw_fmri.to_string_lossy();
        let wanted = [OXIDE_FMRI, ILLUMOS_FMRI]
            .iter()
            .any(|prefix| fmri.starts_with(*prefix));
        if wanted {
            found.extend(status.get_members()?);
        }
    }

    Ok(found)
}
18 changes: 18 additions & 0 deletions sled-diagnostics/src/contract_stub.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
//! Stub implementation for platforms without libcontract(3lib).

use std::collections::BTreeSet;

use slog::{warn, Logger};
use thiserror::Error;

/// Error type for the stub implementation.
///
/// The enum has no variants, so no value of this type can ever be
/// constructed; it exists so that `find_oxide_pids` can keep a `Result`
/// return type.
#[derive(Error, Debug)]
pub enum ContractError {}

/// Stub lookup of Oxide process ids.
///
/// Logs a warning through `log` and always returns `Ok` with an empty set;
/// it performs no contract lookups.
pub fn find_oxide_pids(log: &Logger) -> Result<BTreeSet<i32>, ContractError> {
warn!(
log,
"Unable to find oxide pids on a non illumos platform, \
returning empty set"
);
Ok(BTreeSet::new())
}
Loading
Loading