From 752d46afc320f12cc2a8d58d26a7ec82c936f01b Mon Sep 17 00:00:00 2001 From: Alberto Faria Date: Mon, 4 Mar 2024 18:53:16 +0000 Subject: [PATCH] Add support for running bootc bootable containers We attempt to detect if a container image is bootable. We can't easily retrieve the image's labels, so we instead look for files under /usr/lib/bootc/install. If there are none, it isn't a bootable container. If it is a bootable container but we're not running under Podman, we fail with an error. Once our container's entrypoint starts running, a background process on the host (outside the container) queries Podman for the image's name and ID, which the OCI runtime does not get but bootc-install needs. It then saves the container image as an OCI archive. It then runs the original container to generate the VM image. We do this using krun [1] so that elevated privileges aren't necessary. Our entrypoint blocks until this is done, and all subsequent logic remains the same. We could potentially avoid the OCI archive creation step by mounting the host's container storage into the container running under krun. This isn't trivial to achieve due to SELinux label and context mismatches between the host and the krun environment, so we leave this optimization for a future date. Closes #26. 
[1] https://github.com/containers/crun/blob/main/krun.1.md Signed-off-by: Alberto Faria --- docs/1-installing.md | 2 +- embed/bootc/config.json | 88 +++++++++++++++++ embed/bootc/entrypoint.sh | 19 ++++ embed/bootc/prepare.sh | 62 ++++++++++++ {scripts => embed}/entrypoint.sh | 21 ++++ {scripts => embed}/exec.sh | 0 {scripts => embed}/virtiofsd.sh | 0 src/commands/create/mod.rs | 159 ++++++++++++++++++++++++------- src/util.rs | 13 ++- tests/env.sh | 7 +- tests/t/bootc-rootfs.sh | 15 +++ tests/t/cloud-init.sh | 36 ++++--- tests/t/hostname.sh | 2 +- tests/t/mount.sh | 2 +- tests/t/publish.sh | 41 ++++---- 15 files changed, 392 insertions(+), 75 deletions(-) create mode 100644 embed/bootc/config.json create mode 100644 embed/bootc/entrypoint.sh create mode 100644 embed/bootc/prepare.sh rename {scripts => embed}/entrypoint.sh (86%) rename {scripts => embed}/exec.sh (100%) rename {scripts => embed}/virtiofsd.sh (100%) create mode 100644 tests/t/bootc-rootfs.sh diff --git a/docs/1-installing.md b/docs/1-installing.md index b1c2822..44cb689 100644 --- a/docs/1-installing.md +++ b/docs/1-installing.md @@ -35,7 +35,7 @@ To also set up crun-vm for use with Docker: 1. Install crun-vm's runtime dependencies: ```console - $ dnf install bash coreutils crun genisoimage grep libselinux-devel libvirt-client libvirt-daemon-driver-qemu libvirt-daemon-log openssh-clients qemu-img qemu-system-x86-core shadow-utils util-linux virtiofsd + $ dnf install bash coreutils crun crun-krun genisoimage grep libselinux-devel libvirt-client libvirt-daemon-driver-qemu libvirt-daemon-log openssh-clients qemu-img qemu-system-x86-core sed shadow-utils util-linux virtiofsd ``` 2. 
Install Rust and Cargo if you do not already have Rust tooling available: diff --git a/embed/bootc/config.json b/embed/bootc/config.json new file mode 100644 index 0000000..a40fc6c --- /dev/null +++ b/embed/bootc/config.json @@ -0,0 +1,88 @@ +{ + "ociVersion": "1.0.0", + "process": { + "terminal": true, + "user": { "uid": 0, "gid": 0 }, + "args": ["/output/entrypoint.sh", ""], + "env": [ + "PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin", + "TERM=xterm" + ], + "cwd": "/", + "capabilities": { + "bounding": [], + "effective": [], + "inheritable": [], + "permitted": [], + "ambient": [] + }, + "rlimits": [ + { + "type": "RLIMIT_NOFILE", + "hard": 262144, + "soft": 262144 + } + ], + "noNewPrivileges": true + }, + "root": { + "path": "", + "readonly": false + }, + "hostname": "bootc-install", + "mounts": [ + { + "type": "bind", + "source": "/root/crun-vm/bootc", + "destination": "/output", + "options": ["bind", "rprivate", "rw"] + }, + { + "destination": "/proc", + "type": "proc", + "source": "proc" + }, + { + "destination": "/dev/pts", + "type": "devpts", + "source": "devpts", + "options": [ + "nosuid", + "noexec", + "newinstance", + "ptmxmode=0666", + "mode=0620", + "gid=5" + ] + } + ], + "linux": { + "namespaces": [ + { "type": "pid" }, + { "type": "network" }, + { "type": "ipc" }, + { "type": "uts" }, + { "type": "cgroup" }, + { "type": "mount" } + ], + "maskedPaths": [ + "/proc/acpi", + "/proc/asound", + "/proc/kcore", + "/proc/keys", + "/proc/latency_stats", + "/proc/timer_list", + "/proc/timer_stats", + "/proc/sched_debug", + "/sys/firmware", + "/proc/scsi" + ], + "readonlyPaths": [ + "/proc/bus", + "/proc/fs", + "/proc/irq", + "/proc/sys", + "/proc/sysrq-trigger" + ] + } +} diff --git a/embed/bootc/entrypoint.sh b/embed/bootc/entrypoint.sh new file mode 100644 index 0000000..0a5aa21 --- /dev/null +++ b/embed/bootc/entrypoint.sh @@ -0,0 +1,19 @@ +#!/bin/sh +# SPDX-License-Identifier: GPL-2.0-or-later + +set -e + +image_name=$1 + +bootc install 
to-disk \ + --source-imgref oci-archive:/output/image.oci-archive \ + --target-imgref "$image_name" \ + --skip-fetch-check \ + --generic-image \ + --via-loopback \ + --karg console=tty0 \ + --karg console=ttyS0 \ + --karg selinux=0 \ + /output/image.raw + +touch /output/success diff --git a/embed/bootc/prepare.sh b/embed/bootc/prepare.sh new file mode 100644 index 0000000..3b7dd5b --- /dev/null +++ b/embed/bootc/prepare.sh @@ -0,0 +1,62 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0-or-later + +set -o errexit -o pipefail -o nounset + +original_root=$1 +priv_dir=$2 +container_id=$3 + +__step() { + >&2 printf "\033[36m%s\033[0m\n" "$*" +} + +mkfifo "$priv_dir/root/crun-vm/bootc/progress" +exec > "$priv_dir/root/crun-vm/bootc/progress" 2>&1 + +# this blocks here until the named pipe above is opened by entrypoint.sh + +# get info about the container *image* + +__step 'Storing the container image as an OCI archive...' + +image_info=$( + podman container inspect \ + --format '{{.ImageName}}\t{{.Image}}' \ + "$container_id" + ) + +image_name=$( cut -f1 <<< "$image_info" ) +image_id=$( cut -f2 <<< "$image_info" ) + +oci_archive=$priv_dir/root/crun-vm/bootc/image.oci-archive + +# save container *image* as an OCI archive + +podman save --format oci-archive --output "$oci_archive.tmp" "$image_id" +mv "$oci_archive.tmp" "$oci_archive" + +# adjust krun config + +__step 'Generating a VM image from the container image...' 
+ +__sed() { + sed -i "s|$1|$2|" "$priv_dir/root/crun-vm/bootc/config.json" +} + +__sed "" "$image_name" +__sed "" "$original_root" +__sed "" "$priv_dir" + +# run bootc-install under krun + +truncate --size 10G "$priv_dir/root/crun-vm/bootc/image.raw" # TODO: allow adjusting disk size + +krun run \ + --config "$priv_dir/root/crun-vm/bootc/config.json" \ + "crun-vm-$container_id" \ + ]) -> Resu let config_path = bundle_path.join("config.json"); let mut spec = oci_spec::runtime::Spec::load(&config_path)?; - let original_root_path: Utf8PathBuf = spec.root_path()?.canonicalize()?.try_into()?; // ensure absolute - - if let Some(process) = spec.process().as_ref() { - if let Some(capabilities) = process.capabilities().as_ref() { - fn any_is_cap_sys_admin(caps: &Option) -> bool { - caps.as_ref() - .is_some_and(|set| set.contains(&oci_spec::runtime::Capability::SysAdmin)) - } + ensure_unprivileged(&spec)?; - ensure!( - !any_is_cap_sys_admin(capabilities.bounding()) - && !any_is_cap_sys_admin(capabilities.effective()) - && !any_is_cap_sys_admin(capabilities.inheritable()) - && !any_is_cap_sys_admin(capabilities.permitted()) - && !any_is_cap_sys_admin(capabilities.ambient()), - "crun-vm is incompatible with privileged containers" - ); - } - } + let original_root_path: Utf8PathBuf = spec.root_path()?.canonicalize()?.try_into()?; // ensure absolute let runtime_env = RuntimeEnv::current(&spec, &original_root_path)?; let custom_options = CustomOptions::from_spec(&spec, runtime_env)?; + let is_bootc_container = is_bootc_container( + &args.container_id, + bundle_path, + &original_root_path, + runtime_env, + )?; + // We include container_id in our paths to ensure no overlap with the user container's contents. 
let priv_dir_path = original_root_path.join(format!("crun-vm-{}", args.container_id)); fs::create_dir_all(&priv_dir_path)?; @@ -66,7 +59,13 @@ pub fn create(args: &liboci_cli::Create, raw_args: &[impl AsRef]) -> Resu set_file_context(&priv_dir_path, context)?; } - set_up_container_root(&mut spec, &priv_dir_path, &custom_options)?; + set_up_container_root( + &mut spec, + &priv_dir_path, + &custom_options, + is_bootc_container, + )?; + let is_first_create = is_first_create(&spec)?; let base_vm_image_info = set_up_vm_image( @@ -75,6 +74,7 @@ pub fn create(args: &liboci_cli::Create, raw_args: &[impl AsRef]) -> Resu &priv_dir_path, &custom_options, is_first_create, + is_bootc_container, )?; let mut mounts = Mounts::default(); @@ -100,9 +100,87 @@ pub fn create(args: &liboci_cli::Create, raw_args: &[impl AsRef]) -> Resu crun(raw_args)?; // actually create container + if is_first_create && is_bootc_container { + // We want to ask podman what our image name is, so we can give it to bootc-install, but we + // can't wait synchronously for a response since podman hangs until this create command + // completes. We then want to run bootc-install under krun, which already isolates the + // workload and so can be run outside of our container. We thus launch a process that + // asynchronously performs these steps, and share its progress and output with our + // container's entrypoint through a named pipe. + // + // Note that this process blocks until our container's entrypoint actually starts running, + // thus after the "start" OCI runtime command is called. 
+ + let bootc_dir = priv_dir_path.join("root/crun-vm/bootc"); + fs::create_dir_all(&bootc_dir)?; + + std::process::Command::new(bootc_dir.join("prepare.sh")) + .arg(&original_root_path) + .arg(&priv_dir_path) + .arg(&args.container_id) + .stdin(Stdio::null()) + .stdout(Stdio::null()) + .stderr(Stdio::null()) + .spawn()?; + } + + Ok(()) +} + +fn ensure_unprivileged(spec: &oci_spec::runtime::Spec) -> Result<()> { + if let Some(process) = spec.process().as_ref() { + if let Some(capabilities) = process.capabilities().as_ref() { + fn any_is_cap_sys_admin(caps: &Option) -> bool { + caps.as_ref() + .is_some_and(|set| set.contains(&oci_spec::runtime::Capability::SysAdmin)) + } + + ensure!( + !any_is_cap_sys_admin(capabilities.bounding()) + && !any_is_cap_sys_admin(capabilities.effective()) + && !any_is_cap_sys_admin(capabilities.inheritable()) + && !any_is_cap_sys_admin(capabilities.permitted()) + && !any_is_cap_sys_admin(capabilities.ambient()), + "crun-vm is incompatible with privileged containers" + ); + } + } + Ok(()) } +fn is_bootc_container( + container_id: &str, + bundle_path: &Utf8Path, + original_root_path: &Utf8Path, + env: RuntimeEnv, +) -> Result { + lazy_static! 
{ + static ref PATTERN: Regex = Regex::new(r"/overlay-containers/([^/]+)/userdata$").unwrap(); + } + + let bootc_config_dir = original_root_path.join("usr/lib/bootc/install"); + + let is_bootc_container = + bootc_config_dir.is_dir() && bootc_config_dir.read_dir()?.next().is_some(); + + if is_bootc_container { + // check as much as we can that we're running under podman + + let is_podman_bundle_path = match PATTERN.captures(bundle_path.as_str()) { + Some(captures) => &captures[1] == container_id, + None => false, + }; + + ensure!( + env == RuntimeEnv::Other && is_podman_bundle_path, + "bootc containers are only supported with Podman" + ); + } + + Ok(is_bootc_container) +} + fn is_first_create(spec: &oci_spec::runtime::Spec) -> Result { let path = spec.root_path()?.join("crun-vm/create-ran"); @@ -124,6 +202,7 @@ fn set_up_container_root( spec: &mut oci_spec::runtime::Spec, priv_dir_path: &Utf8Path, custom_options: &CustomOptions, + is_bootc_container: bool, ) -> Result<()> { let new_root_path = priv_dir_path.join("root"); fs::create_dir_all(&new_root_path)?; @@ -138,19 +217,22 @@ fn set_up_container_root( .unwrap(), )); - // set up container scripts + // set up container files #[derive(RustEmbed)] - #[folder = "scripts/"] - struct Scripts; + #[folder = "embed/"] + struct Embed; - for path in Scripts::iter() { + for path in Embed::iter() { let path_in_host = new_root_path.join("crun-vm").join(path.as_ref()); fs::create_dir_all(path_in_host.parent().unwrap())?; - let file = Scripts::get(&path).unwrap(); + let file = Embed::get(&path).unwrap(); fs::write(&path_in_host, file.data)?; - fs::set_permissions(&path_in_host, Permissions::from_mode(0o755))?; + + let is_script = path.as_ref().ends_with(".sh"); + let mode = if is_script { 0o755 } else { 0o644 }; + fs::set_permissions(&path_in_host, Permissions::from_mode(mode))?; } // configure container entrypoint @@ -160,7 +242,8 @@ fn set_up_container_root( } else if custom_options.print_config_json { vec!["cat", 
"/crun-vm/config.json"] } else { - vec!["/crun-vm/entrypoint.sh"] + let arg = if is_bootc_container { "1" } else { "0" }; + vec!["/crun-vm/entrypoint.sh", arg] }; spec.set_process({ @@ -184,7 +267,20 @@ fn set_up_vm_image( priv_dir_path: &Utf8Path, custom_options: &CustomOptions, is_first_create: bool, + is_bootc_container: bool, ) -> Result { + let mirror_vm_image_path_in_container = Utf8PathBuf::from("/crun-vm/image/image"); + let mirror_vm_image_path_in_host = spec.root_path()?.join("crun-vm/image/image"); + + if is_bootc_container { + // the image will be generated later + return Ok(VmImageInfo { + path: mirror_vm_image_path_in_container, + size: 0, + format: "raw".to_string(), + }); + } + // where inside the container to look for the VM image const VM_IMAGE_SEARCH_PATHS: [&str; 2] = ["./", "disk/"]; @@ -208,9 +304,6 @@ fn set_up_vm_image( fs::hard_link(vm_image_path_in_host, image_dir_path.join("image"))?; } - let mirror_vm_image_path_in_container = Utf8PathBuf::from("/crun-vm/image/image"); - let mirror_vm_image_path_in_host = spec.root_path()?.join("crun-vm/image/image"); - if custom_options.persistent { // Mount overlayfs to expose the user's VM image file with a different SELinux context so we // can always access it, using the file's parent as the upperdir so that writes still @@ -220,7 +313,7 @@ fn set_up_vm_image( bind_mount_dir_with_different_context( image_dir_path, mirror_vm_image_path_in_host.parent().unwrap(), - priv_dir_path.join("scratch"), + priv_dir_path.join("scratch-image"), spec.mount_label(), false, )?; @@ -243,7 +336,7 @@ fn set_up_vm_image( bind_mount_dir_with_different_context( image_dir_path, mirror_vm_image_path_in_host.parent().unwrap(), - priv_dir_path.join("scratch"), + priv_dir_path.join("scratch-image"), spec.mount_label(), true, )?; @@ -560,7 +653,7 @@ fn set_up_security(spec: &mut oci_spec::runtime::Spec) { // TODO: This doesn't seem reasonable at all. Should we just force users to use a different // seccomp profile? 
Should passt provide the option to bypass a lot of the isolation that it // does, given we're already in a container *and* under a seccomp profile? - spec.linux_seccomp_syscalls_push( + spec.linux_seccomp_syscalls_push_front( oci_spec::runtime::LinuxSyscallBuilder::default() .names(["mount", "pivot_root", "umount2", "unshare"].map(String::from)) .action(oci_spec::runtime::LinuxSeccompAction::ScmpActAllow) diff --git a/src/util.rs b/src/util.rs index fc0cde4..84ce83f 100644 --- a/src/util.rs +++ b/src/util.rs @@ -33,13 +33,13 @@ pub fn fix_selinux_label(process: &mut oci_spec::runtime::Process) { pub fn set_file_context(path: impl AsRef, context: &str) -> Result<()> { extern "C" { - fn setfilecon(path: *const c_char, con: *const c_char) -> i32; + fn lsetfilecon(path: *const c_char, con: *const c_char) -> i32; } let path = CString::new(path.as_ref().as_os_str().as_bytes())?; let context = CString::new(context.as_bytes())?; - if unsafe { setfilecon(path.as_ptr(), context.as_ptr()) } != 0 { + if unsafe { lsetfilecon(path.as_ptr(), context.as_ptr()) } != 0 { return Err(io::Error::last_os_error().into()); } @@ -198,7 +198,7 @@ pub trait SpecExt { linux_device_cgroup: oci_spec::runtime::LinuxDeviceCgroup, ); fn process_capabilities_insert_beip(&mut self, capability: oci_spec::runtime::Capability); - fn linux_seccomp_syscalls_push(&mut self, linux_syscall: oci_spec::runtime::LinuxSyscall); + fn linux_seccomp_syscalls_push_front(&mut self, linux_syscall: oci_spec::runtime::LinuxSyscall); } impl SpecExt for oci_spec::runtime::Spec { @@ -276,7 +276,10 @@ impl SpecExt for oci_spec::runtime::Spec { }); } - fn linux_seccomp_syscalls_push(&mut self, linux_syscall: oci_spec::runtime::LinuxSyscall) { + fn linux_seccomp_syscalls_push_front( + &mut self, + linux_syscall: oci_spec::runtime::LinuxSyscall, + ) { self.set_linux({ let mut linux = self.linux().clone().expect("linux config"); linux.set_seccomp({ @@ -284,7 +287,7 @@ impl SpecExt for oci_spec::runtime::Spec { if let 
Some(seccomp) = &mut seccomp { seccomp.set_syscalls({ let mut syscalls = seccomp.syscalls().clone().unwrap_or_default(); - syscalls.push(linux_syscall); + syscalls.insert(0, linux_syscall); Some(syscalls) }); } diff --git a/tests/env.sh b/tests/env.sh index 2aea024..d2b7008 100755 --- a/tests/env.sh +++ b/tests/env.sh @@ -11,20 +11,23 @@ container_name=crun-vm-test-env declare -A TEST_IMAGES TEST_IMAGES=( - [fedora]=quay.io/containerdisks/fedora:39 # uses cloud-init - [coreos]=quay.io/crun-vm/example-fedora-coreos:39 # uses Ignition + [fedora]=quay.io/containerdisks/fedora:39 # uses cloud-init + [coreos]=quay.io/crun-vm/example-fedora-coreos:39 # uses Ignition + [fedora-bootc]=quay.io/centos-bootc/fedora-bootc:eln # bootable container ) declare -A TEST_IMAGES_DEFAULT_USER TEST_IMAGES_DEFAULT_USER=( [fedora]=fedora [coreos]=core + [fedora-bootc]=cloud-user ) declare -A TEST_IMAGES_DEFAULT_USER_HOME TEST_IMAGES_DEFAULT_USER_HOME=( [fedora]=/home/fedora [coreos]=/var/home/core + [fedora-bootc]=/var/home/cloud-user ) __bad_usage() { diff --git a/tests/t/bootc-rootfs.sh b/tests/t/bootc-rootfs.sh new file mode 100644 index 0000000..5d78b04 --- /dev/null +++ b/tests/t/bootc-rootfs.sh @@ -0,0 +1,15 @@ +# SPDX-License-Identifier: GPL-2.0-or-later + +if [[ "$ENGINE" == docker ]]; then + # we only support bootc containers under Podman + __skip +fi + +"$UTIL_DIR/extract-vm-image.sh" "${TEST_IMAGES[fedora-bootc]}" "$TEMP_DIR/image" + +__run() { + __engine run --rm --detach --name bootc-rootfs "$@" --rootfs "$TEMP_DIR" +} + +! __run +! 
__run --persistent diff --git a/tests/t/cloud-init.sh b/tests/t/cloud-init.sh index 6ea51dd..0c95f02 100644 --- a/tests/t/cloud-init.sh +++ b/tests/t/cloud-init.sh @@ -1,9 +1,5 @@ # SPDX-License-Identifier: GPL-2.0-or-later -image="${TEST_IMAGES[fedora]}" -user="${TEST_IMAGES_DEFAULT_USER[fedora]}" -home="${TEST_IMAGES_DEFAULT_USER_HOME[fedora]}" - cat >"$TEMP_DIR/user-data" <"$TEMP_DIR/meta-data" </dev/null + endpoint=$( __engine port publish | tee /dev/stderr | cut -d' ' -f3 ) -__engine exec publish --as "$user" python -m http.server & -trap '__engine stop publish' EXIT + __engine exec publish --as "$user" + + __log 'Ensuring curl fails...' + ! curl "$endpoint" 2>/dev/null + + __engine exec publish --as "$user" python -m http.server & + + sleep 3 + + __log 'Ensuring curl succeeds...' + [[ "$( curl "$endpoint" 2>/dev/null | head -1 )" == "" ]] -sleep 3 + __engine stop publish -__log 'Ensuring curl succeeds...' -[[ "$( curl "$endpoint" 2>/dev/null | head -1 )" == "" ]] +done