Skip to content

Commit

Permalink
lib/sysroot-deploy: Add experimental support for automatic early prune
Browse files Browse the repository at this point in the history
During the early design of FCOS and RHCOS, we chose a value of 384M
for the boot partition. This turned out to be too small: some arches
other than x86_64 have larger initrds, kernel binaries, or additional
artifacts (like device tree blobs). We'll likely bump the boot partition
size in the future, but we don't want to abandon all the nodes deployed
with the current size.[[1]]

Because stale entries in `/boot` are cleaned up after new entries are
written, there is a window in the update process during which the bootfs
temporarily must host all the `(kernel, initrd)` pairs for the union of
current and new deployments.

This patch determines if the bootfs is capable of holding all the
pairs. If it can't but it could hold all the pairs from just the new
deployments, the outgoing deployments (e.g. rollbacks) are deleted
*before* new deployments are written. This is done by updating the
bootloader in two steps to maintain atomicity.

Since this is a lot of new logic in an important section of the
code, this feature is gated for now behind an environment variable
(`OSTREE_ENABLE_AUTO_EARLY_PRUNE`). Once we gain more experience with
it, we can consider turning it on by default.

This strategy increases the fallibility of the update system since one
would no longer be able to roll back to the previous deployment if a bug
is present in the bootloader update logic after auto-pruning (see [[2]]
and following). This is however mitigated by the fact that the heuristic
is opportunistic: the rollback is pruned *only if* it's the only way for
the system to update.

[1]: coreos/fedora-coreos-tracker#1247
[2]: #2670 (comment)

Closes: #2670
  • Loading branch information
jlebon committed Apr 14, 2023
1 parent fd3304e commit b159791
Show file tree
Hide file tree
Showing 3 changed files with 343 additions and 3 deletions.
251 changes: 248 additions & 3 deletions src/libostree/ostree-sysroot-deploy.c
Original file line number Diff line number Diff line change
Expand Up @@ -1925,8 +1925,8 @@ install_deployment_kernel (OstreeSysroot *sysroot,
}
else
{
if (!copy_dir_recurse(kernel_layout->boot_dfd, bootcsum_dfd, kernel_layout->devicetree_srcpath,
sysroot->debug_flags, cancellable, error))
if (!copy_dir_recurse (kernel_layout->boot_dfd, bootcsum_dfd, kernel_layout->devicetree_srcpath,
sysroot->debug_flags, cancellable, error))
return FALSE;
}
}
Expand Down Expand Up @@ -1959,6 +1959,8 @@ install_deployment_kernel (OstreeSysroot *sysroot,
}
}

/* NOTE: if adding more things in bootcsum_dfd, also update get_kernel_layout_size() */

g_autoptr(GPtrArray) overlay_initrds = NULL;
for (char **it = _ostree_deployment_get_overlay_initrds (deployment); it && *it; it++)
{
Expand Down Expand Up @@ -2487,6 +2489,243 @@ write_deployments_finish (OstreeSysroot *self,
return TRUE;
}

static gboolean
add_file_size_if_nonnull (int dfd,
const char *path,
guint64 *inout_size,
GError **error)
{
if (path == NULL)
return TRUE;

struct stat stbuf;
if (!glnx_fstatat (dfd, path, &stbuf, 0, error))
return FALSE;

*inout_size += stbuf.st_size;
return TRUE;
}

/* calculates the total size of the bootcsum dir in /boot after we would copy
* it. This reflects the logic in install_deployment_kernel(). */
static gboolean
get_kernel_layout_size (OstreeSysroot *self,
OstreeDeployment *deployment,
guint64 *out_size,
GCancellable *cancellable,
GError **error)
{
g_autofree char *deployment_dirpath = ostree_sysroot_get_deployment_dirpath (self, deployment);
glnx_autofd int deployment_dfd = -1;
if (!glnx_opendirat (self->sysroot_fd, deployment_dirpath, FALSE,
&deployment_dfd, error))
return FALSE;

g_autoptr(OstreeKernelLayout) kernel_layout = NULL;
if (!get_kernel_from_tree (self, deployment_dfd, &kernel_layout,
cancellable, error))
return FALSE;

guint64 bootdir_size = 0;
if (!add_file_size_if_nonnull (kernel_layout->boot_dfd, kernel_layout->kernel_srcpath, &bootdir_size, error))
return FALSE;
if (!add_file_size_if_nonnull (kernel_layout->boot_dfd, kernel_layout->initramfs_srcpath, &bootdir_size, error))
return FALSE;
if (kernel_layout->devicetree_srcpath)
{
if (kernel_layout->devicetree_namever)
{
if (!add_file_size_if_nonnull (kernel_layout->boot_dfd, kernel_layout->devicetree_srcpath, &bootdir_size, error))
return FALSE;
}
else
{
guint64 dirsize = 0;
if (!ot_get_dir_size (kernel_layout->boot_dfd, kernel_layout->devicetree_srcpath, &dirsize, cancellable, error))
return FALSE;
bootdir_size += dirsize;
}
}
if (!add_file_size_if_nonnull (kernel_layout->boot_dfd, kernel_layout->kernel_hmac_srcpath, &bootdir_size, error))
return FALSE;
if (!add_file_size_if_nonnull (kernel_layout->boot_dfd, kernel_layout->aboot_srcpath, &bootdir_size, error))
return FALSE;

*out_size = bootdir_size;
return TRUE;
}

/* Analyze /boot and figure out if the new deployments won't fit in the
* remaining space. If they won't, check if deleting the deployments that are
* getting rotated out (e.g. the current rollback) would free up sufficient
* space. If so, call ostree_sysroot_write_deployments() to delete them. */
static gboolean
auto_early_prune_old_deployments (OstreeSysroot *self,
GPtrArray *new_deployments,
GCancellable *cancellable,
GError **error)
{
/* If we're not booted into a deployment, then this is some kind of e.g. disk
* creation/provisioning. The situation isn't as dire, so let's not resort to
* auto-pruning and instead let possible ENOSPC errors naturally bubble. */
if (self->booted_deployment == NULL)
return TRUE;

{
struct stat stbuf;
if (!glnx_fstatat (self->boot_fd, ".", &stbuf, 0, error))
return FALSE;

/* if /boot is on the same filesystem as the sysroot (which must be where
* the sysroot repo is), don't do anything */
if (stbuf.st_dev == self->repo->device)
return TRUE;
}

/* pre-emptive cleanup of any cruft in /boot to free up any wasted space */
if (!_ostree_sysroot_cleanup_bootfs (self, cancellable, error))
return FALSE;

/* tracks all the bootcsums currently in /boot */
g_autoptr(GHashTable) current_bootcsums = g_hash_table_new_full (g_str_hash, g_str_equal, g_free, NULL);

/* tracks all the bootcsums of new_deployments */
g_autoptr(GHashTable) new_bootcsums = g_hash_table_new_full (g_str_hash, g_str_equal, g_free, NULL);

g_auto(GStrv) bootdirs = NULL;
if (!_ostree_sysroot_list_all_boot_directories (self, &bootdirs, cancellable, error))
return glnx_prefix_error (error, "listing bootcsum directories in bootfs");

for (char **it = bootdirs; it && *it; it++)
{
const char *bootdir = *it;

g_autofree char *bootcsum = NULL;
if (!_ostree_sysroot_parse_bootdir_name (bootdir, NULL, &bootcsum))
g_assert_not_reached (); /* checked in _ostree_sysroot_list_all_boot_directories() */

guint64 bootdir_size;
g_autofree char *ostree_bootdir = g_build_filename ("ostree", bootdir, NULL);
if (!ot_get_dir_size (self->boot_fd, ostree_bootdir, &bootdir_size, cancellable, error))
return FALSE;

/* for our purposes of sizing bootcsums, it's highly unlikely we need a
* guint64; cast it down to guint so we can more easily store it */
if (bootdir_size > G_MAXUINT)
{
/* If it somehow happens, don't make it fatal. this is all an
* optimization anyway, so let the deployment continue. But log it so
* that users report it and we tweak this code to handle this.
*
* An alternative is working with the block size instead, which would
* be easier to handle. But ideally, `ot_get_dir_size` would be block
* size aware too for better accuracy, which is awkward since the
* function itself is generic over directories and doesn't consider
* e.g. mount points from different filesystems. */
g_printerr ("bootcsum %s size exceeds %u; disabling auto-prune optimization", bootdir, G_MAXUINT);
return TRUE;
}

g_assert_cmpuint (bootdir_size, >, 0);
g_hash_table_insert (current_bootcsums, g_steal_pointer (&bootcsum), GUINT_TO_POINTER (bootdir_size));
}

/* total size of all bootcsums dirs that aren't already in /boot */
guint64 net_new_bootcsum_dirs_total_size = 0;

/* now gather all the bootcsums of the new deployments */
for (guint i = 0; i < new_deployments->len; i++)
{
OstreeDeployment *deployment = new_deployments->pdata[i];

const char *bootcsum = ostree_deployment_get_bootcsum (deployment);
gpointer bootdir_sizep = g_hash_table_lookup (current_bootcsums, bootcsum);
if (bootdir_sizep != 0)
{
g_hash_table_insert (new_bootcsums, g_strdup (bootcsum), bootdir_sizep);
continue;
}

guint64 bootdir_size;
if (!get_kernel_layout_size (self, deployment, &bootdir_size, cancellable, error))
return FALSE;

/* see similar logic in previous loop */
if (bootdir_size > G_MAXUINT)
{
g_printerr ("deployment %s kernel layout size exceeds %u; disabling auto-prune optimization",
ostree_deployment_get_csum (deployment), G_MAXUINT);
return TRUE;
}

g_hash_table_insert (new_bootcsums, g_strdup (bootcsum), GUINT_TO_POINTER (bootdir_size));

/* it wasn't in current_bootcsums; add */
net_new_bootcsum_dirs_total_size += bootdir_size;
}

/* get bootfs free space */
struct statvfs stvfsbuf;
if (TEMP_FAILURE_RETRY (fstatvfs (self->boot_fd, &stvfsbuf)) < 0)
return glnx_throw_errno_prefix (error, "fstatvfs(boot)");

guint64 available_size = stvfsbuf.f_bsize * stvfsbuf.f_bfree;

/* does the bootfs have enough free space for net-new bootdirs? */
if (net_new_bootcsum_dirs_total_size <= available_size)
return TRUE; /* nothing to do! */

/* OK, we would fail if we tried to write the new bootdirs. Is it salvageable?
* First, calculate how much space we could save with the bootcsums scheduled
* for removal. */
guint64 size_to_remove = 0;
GLNX_HASH_TABLE_FOREACH_KV (current_bootcsums, const char *, bootcsum, gpointer, sizep)
{
if (!g_hash_table_contains (new_bootcsums, bootcsum))
size_to_remove += GPOINTER_TO_UINT (sizep);
}

if (net_new_bootcsum_dirs_total_size > (available_size + size_to_remove))
{
/* Even if we auto-pruned, the new bootdirs wouldn't fit. Just let the
* code continue and let it hit ENOSPC. */
return TRUE;
}

g_printerr ("Insufficient space left in bootfs; updating bootloader in two steps");

/* Auto-pruning can salvage the situation. Calculate the set of deployments in common. */
g_autoptr(GPtrArray) common_deployments = g_ptr_array_new ();
for (guint i = 0; i < self->deployments->len; i++)
{
OstreeDeployment *deployment = self->deployments->pdata[i];
const char *bootcsum = ostree_deployment_get_bootcsum (deployment);
if (g_hash_table_contains (new_bootcsums, bootcsum))
{
g_ptr_array_add (common_deployments, deployment);
}
else if (deployment == self->booted_deployment)
g_assert_not_reached (); /* we always keep the booted deployment */
}

/* if we're here, it means that removing some deployments is possible to gain space */
g_assert_cmpuint (common_deployments->len, <, self->deployments->len);

/* Do an initial write out where we do a pure deployment pruning, keeping
* common deployments. To be safe, disable auto-pruning to make recursion
* impossible (though the logic in this function shouldn't kick in anyway in
* that recursive call). Disable cleaning since it's an intermediate stage. */
OstreeSysrootWriteDeploymentsOpts opts = { .do_postclean = FALSE, .disable_auto_early_prune = TRUE };
if (!ostree_sysroot_write_deployments_with_options (self, common_deployments, &opts, cancellable, error))
return FALSE;

/* clean up /boot */
if (!_ostree_sysroot_cleanup_bootfs (self, cancellable, error))
return FALSE;

return TRUE;
}

/**
* ostree_sysroot_write_deployments_with_options:
* @self: Sysroot
Expand Down Expand Up @@ -2516,6 +2755,12 @@ ostree_sysroot_write_deployments_with_options (OstreeSysroot *self,
if (!_ostree_sysroot_ensure_writable (self, error))
return FALSE;

/* for now, this is gated on an environment variable */
const char *opted_in = getenv ("OSTREE_ENABLE_AUTO_EARLY_PRUNE");
if (opted_in && !G_IN_SET (*opted_in, '\0', '0') && !opts->disable_auto_early_prune &&
!auto_early_prune_old_deployments (self, new_deployments, cancellable, error))
return FALSE;

/* Dealing with the staged deployment is quite tricky here. This function is
* primarily concerned with writing out "finalized" deployments which have
* bootloader entries. Originally, we simply dropped the staged deployment
Expand Down Expand Up @@ -2630,7 +2875,7 @@ ostree_sysroot_write_deployments_with_options (OstreeSysroot *self,
OstreeDeployment *deployment = new_deployments->pdata[i];
g_assert (!ostree_deployment_is_staged (deployment));

if (deployment == self->booted_deployment)
if (ostree_deployment_equal (deployment, self->booted_deployment))
found_booted_deployment = TRUE;

g_autoptr(GFile) deployment_root = ostree_sysroot_get_deployment_directory (self, deployment);
Expand Down
1 change: 1 addition & 0 deletions src/libostree/ostree-sysroot.h
Original file line number Diff line number Diff line change
Expand Up @@ -193,6 +193,7 @@ gboolean ostree_sysroot_write_deployments (OstreeSysroot *self,

typedef struct {
gboolean do_postclean;
gboolean disable_auto_early_prune;
gboolean unused_bools[7];
int unused_ints[7];
gpointer unused_ptrs[7];
Expand Down
94 changes: 94 additions & 0 deletions tests/kolainst/destructive/auto-prune.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,94 @@
#!/bin/bash
set -xeuo pipefail

# https://github.com/ostreedev/ostree/issues/2670

. "${KOLA_EXT_DATA}/libinsttest.sh"

# make two fake ostree commits with modified kernels of about the same size
cd /root
# query the running kernel release once and reuse it for every path below
kernel_release=$(uname -r)
fake_moddir="rootfs/usr/lib/modules/${kernel_release}"
mkdir -p "${fake_moddir}"
cp "/usr/lib/modules/${kernel_release}/vmlinuz" "${fake_moddir}"
# each append alters the kernel image, so the two commits carry different kernels
echo 1 >> "${fake_moddir}/vmlinuz"
ostree commit --base "${host_refspec}" -P --tree=dir=rootfs -b modkernel1
echo 1 >> "${fake_moddir}/vmlinuz"
ostree commit --base "${host_refspec}" -P --tree=dir=rootfs -b modkernel2

# Fail the test unless /boot/ostree contains exactly $1 directories named
# "${host_osname}-*". On mismatch, list /boot/ostree first to aid debugging.
assert_bootfs_has_n_bootcsum_dirs() {
    local want=$1; shift
    local found
    found=$(ls -d /boot/ostree/${host_osname}-* | wc -l)
    if [ "$want" = "$found" ]; then
        return 0
    fi
    ls -l /boot/ostree
    assert_not_reached "expected $want bootcsum dirs, found $found"
}

# Fill /boot with a zero-filled /boot/bigfile so that a single block remains
# available. The write happens inside a private mount namespace (unshare -m)
# in which /boot is remounted read-write.
consume_bootfs_space() {
    # Declare first, assign second: `local var=$(cmd)` returns the status of
    # `local`, which would hide a failing `stat` from `set -e`.
    local free_blocks
    local block_size
    free_blocks=$(stat --file-system /boot -c '%a')
    # %a is a count of fundamental blocks, whose size is %S (%s is the
    # preferred I/O transfer size and may differ)
    block_size=$(stat --file-system /boot -c '%S')
    # leave 1 block free
    unshare -m bash -c \
        "mount -o rw,remount /boot && \
         dd if=/dev/zero of=/boot/bigfile count=$((free_blocks-1)) bs=${block_size}"
}

# Undo consume_bootfs_space(): delete /boot/bigfile from within a private
# mount namespace in which /boot is remounted read-write.
unconsume_bootfs_space() {
    local cleanup_cmd="mount -o rw,remount /boot && rm /boot/bigfile"
    unshare -m bash -c "${cleanup_cmd}"
}

# Test scenario. The steps below are strictly order-dependent: each one relies
# on the deployment and /boot state left behind by the previous one.

# starting point: a single bootcsum dir is expected in /boot
assert_bootfs_has_n_bootcsum_dirs 1

# first, deploy our second deployment on a filled up bootfs
# the booted deployment is never pruned, so this is a hopeless case and auto-pruning can't save us
consume_bootfs_space
rpm-ostree rebase :modkernel1
# `set -o pipefail` is active, so the pipeline fails when finalize-staged fails
# even though `tee` itself succeeds
if OSTREE_ENABLE_AUTO_EARLY_PRUNE=1 ostree admin finalize-staged |& tee out.txt; then
assert_not_reached "successfully wrote to filled up bootfs"
fi
assert_file_has_content out.txt "No space left on device"
rm out.txt
# release the space again and clean up after the failed attempt
unconsume_bootfs_space
rpm-ostree cleanup -bpr

# OK, now deploy our second deployment for realsies on a bootfs with ample space
# and sanity-check that auto-pruning doesn't kick in
assert_bootfs_has_n_bootcsum_dirs 1

rpm-ostree rebase :modkernel1
OSTREE_ENABLE_AUTO_EARLY_PRUNE=1 ostree admin finalize-staged |& tee out.txt
assert_not_file_has_content out.txt "updating bootloader in two steps"
rm out.txt

# and put it in rollback position; this is the deployment that'll get auto-pruned
rpm-ostree rollback

assert_bootfs_has_n_bootcsum_dirs 2
# snapshot the bootloader entries so later steps can tell whether they changed
bootloader_orig=$(sha256sum /boot/loader/entries/*)

# now try to deploy a third deployment without early pruning; we should hit ENOSPC
consume_bootfs_space
rpm-ostree rebase :modkernel2
# note: OSTREE_ENABLE_AUTO_EARLY_PRUNE is deliberately not set for this attempt
if ostree admin finalize-staged |& tee out.txt; then
assert_not_reached "successfully wrote kernel without auto-pruning"
fi
assert_file_has_content out.txt "No space left on device"
rm out.txt

# there's 3 bootcsums now because it'll also have the partially written
# bootcsum dir we were creating when we hit ENOSPC; this verifies that all the
# deployments have different bootcsums
assert_bootfs_has_n_bootcsum_dirs 3
# but the bootloader wasn't updated
assert_streq "$bootloader_orig" "$(sha256sum /boot/loader/entries/*)"

# now, try again but with auto-pruning enabled
# (/boot is still filled up: unconsume_bootfs_space has not been called since)
rpm-ostree rebase :modkernel2
OSTREE_ENABLE_AUTO_EARLY_PRUNE=1 ostree admin finalize-staged |& tee out.txt
assert_file_has_content out.txt "updating bootloader in two steps"
rm out.txt

# two bootcsum dirs remain and the bootloader entries differ from the snapshot
assert_bootfs_has_n_bootcsum_dirs 2
assert_not_streq "$bootloader_orig" "$(sha256sum /boot/loader/entries/*)"

echo "ok bootfs auto-prune"

0 comments on commit b159791

Please sign in to comment.