From b1597916eb1ad54f33f9d95d3e3c860575eae6b4 Mon Sep 17 00:00:00 2001
From: Jonathan Lebon
Date: Thu, 13 Apr 2023 17:22:43 -0400
Subject: [PATCH] lib/sysroot-deploy: Add experimental support for automatic early prune

During the early design of FCOS and RHCOS, we chose a value of 384M for
the boot partition. This turned out to be too small: some arches other
than x86_64 have larger initrds, kernel binaries, or additional
artifacts (like device tree blobs). We'll likely bump the boot partition
size in the future, but we don't want to abandon all the nodes deployed
with the current size.[[1]]

Because stale entries in `/boot` are cleaned up after new entries are
written, there is a window in the update process during which the bootfs
temporarily must host all the `(kernel, initrd)` pairs for the union of
current and new deployments.

This patch determines if the bootfs is capable of holding all the pairs.
If it can't but it could hold all the pairs from just the new
deployments, the outgoing deployments (e.g. rollbacks) are deleted
*before* new deployments are written. This is done by updating the
bootloader in two steps to maintain atomicity.

Since this is a lot of new logic in an important section of the code,
this feature is gated for now behind an environment variable
(`OSTREE_ENABLE_AUTO_EARLY_PRUNE`). Once we gain more experience with
it, we can consider turning it on by default.

This strategy increases the fallibility of the update system since one
would no longer be able to roll back to the previous deployment if a bug
is present in the bootloader update logic after auto-pruning (see [[2]]
and following). This is however mitigated by the fact that the heuristic
is opportunistic: the rollback is pruned *only if* it's the only way for
the system to update.

[1]: https://github.com/coreos/fedora-coreos-tracker/issues/1247
[2]: https://github.com/ostreedev/ostree/issues/2670#issuecomment-1179341883

Closes: #2670
---
 src/libostree/ostree-sysroot-deploy.c    | 255 ++++++++++++++++++++++-
 src/libostree/ostree-sysroot.h           |   3 +-
 tests/kolainst/destructive/auto-prune.sh |  94 +++++++++
 3 files changed, 348 insertions(+), 4 deletions(-)
 create mode 100755 tests/kolainst/destructive/auto-prune.sh

diff --git a/src/libostree/ostree-sysroot-deploy.c b/src/libostree/ostree-sysroot-deploy.c
index d2056cdf6a..c286d0c1d5 100644
--- a/src/libostree/ostree-sysroot-deploy.c
+++ b/src/libostree/ostree-sysroot-deploy.c
@@ -1925,8 +1925,8 @@ install_deployment_kernel (OstreeSysroot *sysroot,
         }
       else
         {
-          if (!copy_dir_recurse(kernel_layout->boot_dfd, bootcsum_dfd, kernel_layout->devicetree_srcpath,
-                                sysroot->debug_flags, cancellable, error))
+          if (!copy_dir_recurse (kernel_layout->boot_dfd, bootcsum_dfd, kernel_layout->devicetree_srcpath,
+                                 sysroot->debug_flags, cancellable, error))
             return FALSE;
         }
     }
@@ -1959,6 +1959,8 @@ install_deployment_kernel (OstreeSysroot *sysroot,
         }
     }
 
+  /* NOTE: if adding more things in bootcsum_dfd, also update get_kernel_layout_size() */
+
   g_autoptr(GPtrArray) overlay_initrds = NULL;
   for (char **it = _ostree_deployment_get_overlay_initrds (deployment); it && *it; it++)
     {
@@ -2487,6 +2489,247 @@ write_deployments_finish (OstreeSysroot *self,
   return TRUE;
 }
 
+/* Adds the size (st_size) of @path, resolved relative to @dfd, to
+ * @inout_size. A NULL @path is a no-op so that optional kernel layout
+ * members can be passed unconditionally. */
+static gboolean
+add_file_size_if_nonnull (int dfd,
+                          const char *path,
+                          guint64 *inout_size,
+                          GError **error)
+{
+  if (path == NULL)
+    return TRUE;
+
+  struct stat stbuf;
+  if (!glnx_fstatat (dfd, path, &stbuf, 0, error))
+    return FALSE;
+
+  *inout_size += stbuf.st_size;
+  return TRUE;
+}
+
+/* calculates the total size of the bootcsum dir in /boot after we would copy
+ * it. This reflects the logic in install_deployment_kernel(). */
+static gboolean
+get_kernel_layout_size (OstreeSysroot *self,
+                        OstreeDeployment *deployment,
+                        guint64 *out_size,
+                        GCancellable *cancellable,
+                        GError **error)
+{
+  g_autofree char *deployment_dirpath = ostree_sysroot_get_deployment_dirpath (self, deployment);
+  glnx_autofd int deployment_dfd = -1;
+  if (!glnx_opendirat (self->sysroot_fd, deployment_dirpath, FALSE,
+                       &deployment_dfd, error))
+    return FALSE;
+
+  g_autoptr(OstreeKernelLayout) kernel_layout = NULL;
+  if (!get_kernel_from_tree (self, deployment_dfd, &kernel_layout,
+                             cancellable, error))
+    return FALSE;
+
+  guint64 bootdir_size = 0;
+  if (!add_file_size_if_nonnull (kernel_layout->boot_dfd, kernel_layout->kernel_srcpath, &bootdir_size, error))
+    return FALSE;
+  if (!add_file_size_if_nonnull (kernel_layout->boot_dfd, kernel_layout->initramfs_srcpath, &bootdir_size, error))
+    return FALSE;
+  if (kernel_layout->devicetree_srcpath)
+    {
+      if (kernel_layout->devicetree_namever)
+        {
+          if (!add_file_size_if_nonnull (kernel_layout->boot_dfd, kernel_layout->devicetree_srcpath, &bootdir_size, error))
+            return FALSE;
+        }
+      else
+        {
+          guint64 dirsize = 0;
+          if (!ot_get_dir_size (kernel_layout->boot_dfd, kernel_layout->devicetree_srcpath, &dirsize, cancellable, error))
+            return FALSE;
+          bootdir_size += dirsize;
+        }
+    }
+  if (!add_file_size_if_nonnull (kernel_layout->boot_dfd, kernel_layout->kernel_hmac_srcpath, &bootdir_size, error))
+    return FALSE;
+  if (!add_file_size_if_nonnull (kernel_layout->boot_dfd, kernel_layout->aboot_srcpath, &bootdir_size, error))
+    return FALSE;
+
+  *out_size = bootdir_size;
+  return TRUE;
+}
+
+/* Analyze /boot and figure out if the new deployments won't fit in the
+ * remaining space. If they won't, check if deleting the deployments that are
+ * getting rotated out (e.g. the current rollback) would free up sufficient
+ * space. If so, call ostree_sysroot_write_deployments() to delete them. */
+static gboolean
+auto_early_prune_old_deployments (OstreeSysroot *self,
+                                  GPtrArray *new_deployments,
+                                  GCancellable *cancellable,
+                                  GError **error)
+{
+  /* If we're not booted into a deployment, then this is some kind of e.g. disk
+   * creation/provisioning. The situation isn't as dire, so let's not resort to
+   * auto-pruning and instead let possible ENOSPC errors naturally bubble. */
+  if (self->booted_deployment == NULL)
+    return TRUE;
+
+  {
+    struct stat stbuf;
+    if (!glnx_fstatat (self->boot_fd, ".", &stbuf, 0, error))
+      return FALSE;
+
+    /* if /boot is on the same filesystem as the sysroot (which must be where
+     * the sysroot repo is), don't do anything */
+    if (stbuf.st_dev == self->repo->device)
+      return TRUE;
+  }
+
+  /* pre-emptive cleanup of any cruft in /boot to free up any wasted space */
+  if (!_ostree_sysroot_cleanup_bootfs (self, cancellable, error))
+    return FALSE;
+
+  /* tracks all the bootcsums currently in /boot */
+  g_autoptr(GHashTable) current_bootcsums = g_hash_table_new_full (g_str_hash, g_str_equal, g_free, NULL);
+
+  /* tracks all the bootcsums of new_deployments */
+  g_autoptr(GHashTable) new_bootcsums = g_hash_table_new_full (g_str_hash, g_str_equal, g_free, NULL);
+
+  g_auto(GStrv) bootdirs = NULL;
+  if (!_ostree_sysroot_list_all_boot_directories (self, &bootdirs, cancellable, error))
+    return glnx_prefix_error (error, "listing bootcsum directories in bootfs");
+
+  for (char **it = bootdirs; it && *it; it++)
+    {
+      const char *bootdir = *it;
+
+      g_autofree char *bootcsum = NULL;
+      if (!_ostree_sysroot_parse_bootdir_name (bootdir, NULL, &bootcsum))
+        g_assert_not_reached (); /* checked in _ostree_sysroot_list_all_boot_directories() */
+
+      guint64 bootdir_size;
+      g_autofree char *ostree_bootdir = g_build_filename ("ostree", bootdir, NULL);
+      if (!ot_get_dir_size (self->boot_fd, ostree_bootdir, &bootdir_size, cancellable, error))
+        return FALSE;
+
+      /* for our purposes of sizing bootcsums, it's highly unlikely we need a
+       * guint64; cast it down to guint so we can more easily store it */
+      if (bootdir_size > G_MAXUINT)
+        {
+          /* If it somehow happens, don't make it fatal. this is all an
+           * optimization anyway, so let the deployment continue. But log it so
+           * that users report it and we tweak this code to handle this.
+           *
+           * An alternative is working with the block size instead, which would
+           * be easier to handle. But ideally, `ot_get_dir_size` would be block
+           * size aware too for better accuracy, which is awkward since the
+           * function itself is generic over directories and doesn't consider
+           * e.g. mount points from different filesystems. */
+          g_printerr ("bootcsum %s size exceeds %u; disabling auto-prune optimization\n", bootdir, G_MAXUINT);
+          return TRUE;
+        }
+
+      g_assert_cmpuint (bootdir_size, >, 0);
+      g_hash_table_insert (current_bootcsums, g_steal_pointer (&bootcsum), GUINT_TO_POINTER (bootdir_size));
+    }
+
+  /* total size of all bootcsums dirs that aren't already in /boot */
+  guint64 net_new_bootcsum_dirs_total_size = 0;
+
+  /* now gather all the bootcsums of the new deployments */
+  for (guint i = 0; i < new_deployments->len; i++)
+    {
+      OstreeDeployment *deployment = new_deployments->pdata[i];
+
+      const char *bootcsum = ostree_deployment_get_bootcsum (deployment);
+      gpointer bootdir_sizep = g_hash_table_lookup (current_bootcsums, bootcsum);
+      if (bootdir_sizep != NULL)
+        {
+          g_hash_table_insert (new_bootcsums, g_strdup (bootcsum), bootdir_sizep);
+          continue;
+        }
+
+      guint64 bootdir_size;
+      if (!get_kernel_layout_size (self, deployment, &bootdir_size, cancellable, error))
+        return FALSE;
+
+      /* see similar logic in previous loop */
+      if (bootdir_size > G_MAXUINT)
+        {
+          g_printerr ("deployment %s kernel layout size exceeds %u; disabling auto-prune optimization\n",
+                      ostree_deployment_get_csum (deployment), G_MAXUINT);
+          return TRUE;
+        }
+
+      g_hash_table_insert (new_bootcsums, g_strdup (bootcsum), GUINT_TO_POINTER (bootdir_size));
+
+      /* it wasn't in current_bootcsums; add */
+      net_new_bootcsum_dirs_total_size += bootdir_size;
+    }
+
+  /* get bootfs free space; POSIX counts f_bfree in f_frsize-sized units, and
+   * the cast widens before multiplying so the product can't wrap on 32-bit */
+  struct statvfs stvfsbuf;
+  if (TEMP_FAILURE_RETRY (fstatvfs (self->boot_fd, &stvfsbuf)) < 0)
+    return glnx_throw_errno_prefix (error, "fstatvfs(boot)");
+
+  guint64 available_size = (guint64) stvfsbuf.f_frsize * stvfsbuf.f_bfree;
+
+  /* does the bootfs have enough free space for net-new bootdirs? */
+  if (net_new_bootcsum_dirs_total_size <= available_size)
+    return TRUE; /* nothing to do! */
+
+  /* OK, we would fail if we tried to write the new bootdirs. Is it salvageable?
+   * First, calculate how much space we could save with the bootcsums scheduled
+   * for removal. */
+  guint64 size_to_remove = 0;
+  GLNX_HASH_TABLE_FOREACH_KV (current_bootcsums, const char *, bootcsum, gpointer, sizep)
+    {
+      if (!g_hash_table_contains (new_bootcsums, bootcsum))
+        size_to_remove += GPOINTER_TO_UINT (sizep);
+    }
+
+  if (net_new_bootcsum_dirs_total_size > (available_size + size_to_remove))
+    {
+      /* Even if we auto-pruned, the new bootdirs wouldn't fit. Just let the
+       * code continue and let it hit ENOSPC. */
+      return TRUE;
+    }
+
+  g_printerr ("Insufficient space left in bootfs; updating bootloader in two steps\n");
+
+  /* Auto-pruning can salvage the situation. Calculate the set of deployments in common. */
+  g_autoptr(GPtrArray) common_deployments = g_ptr_array_new ();
+  for (guint i = 0; i < self->deployments->len; i++)
+    {
+      OstreeDeployment *deployment = self->deployments->pdata[i];
+      const char *bootcsum = ostree_deployment_get_bootcsum (deployment);
+      if (g_hash_table_contains (new_bootcsums, bootcsum))
+        {
+          g_ptr_array_add (common_deployments, deployment);
+        }
+      else if (deployment == self->booted_deployment)
+        g_assert_not_reached (); /* we always keep the booted deployment */
+    }
+
+  /* if we're here, it means that removing some deployments is possible to gain space */
+  g_assert_cmpuint (common_deployments->len, <, self->deployments->len);
+
+  /* Do an initial write out where we do a pure deployment pruning, keeping
+   * common deployments. To be safe, disable auto-pruning to make recursion
+   * impossible (though the logic in this function shouldn't kick in anyway in
+   * that recursive call). Disable cleaning since it's an intermediate stage. */
+  OstreeSysrootWriteDeploymentsOpts opts = { .do_postclean = FALSE, .disable_auto_early_prune = TRUE };
+  if (!ostree_sysroot_write_deployments_with_options (self, common_deployments, &opts, cancellable, error))
+    return FALSE;
+
+  /* clean up /boot */
+  if (!_ostree_sysroot_cleanup_bootfs (self, cancellable, error))
+    return FALSE;
+
+  return TRUE;
+}
+
 /**
  * ostree_sysroot_write_deployments_with_options:
  * @self: Sysroot
@@ -2516,6 +2759,12 @@ ostree_sysroot_write_deployments_with_options (OstreeSysroot *self,
   if (!_ostree_sysroot_ensure_writable (self, error))
     return FALSE;
 
+  /* for now, this is gated on an environment variable */
+  const char *opted_in = getenv ("OSTREE_ENABLE_AUTO_EARLY_PRUNE");
+  if (opted_in && !G_IN_SET (*opted_in, '\0', '0') && !opts->disable_auto_early_prune &&
+      !auto_early_prune_old_deployments (self, new_deployments, cancellable, error))
+    return FALSE;
+
   /* Dealing with the staged deployment is quite tricky here. This function is
    * primarily concerned with writing out "finalized" deployments which have
    * bootloader entries. Originally, we simply dropped the staged deployment
@@ -2630,7 +2879,7 @@ ostree_sysroot_write_deployments_with_options (OstreeSysroot *self,
       OstreeDeployment *deployment = new_deployments->pdata[i];
       g_assert (!ostree_deployment_is_staged (deployment));
 
-      if (deployment == self->booted_deployment)
+      if (ostree_deployment_equal (deployment, self->booted_deployment))
         found_booted_deployment = TRUE;
 
       g_autoptr(GFile) deployment_root = ostree_sysroot_get_deployment_directory (self, deployment);
diff --git a/src/libostree/ostree-sysroot.h b/src/libostree/ostree-sysroot.h
index 23c7139aaa..b159020d1e 100644
--- a/src/libostree/ostree-sysroot.h
+++ b/src/libostree/ostree-sysroot.h
@@ -193,6 +193,7 @@ gboolean ostree_sysroot_write_deployments (OstreeSysroot *self,
 
 typedef struct {
   gboolean do_postclean;
-  gboolean unused_bools[7];
+  gboolean disable_auto_early_prune; /* takes over one unused_bools slot so later fields don't move */
+  gboolean unused_bools[6];
   int unused_ints[7];
   gpointer unused_ptrs[7];
diff --git a/tests/kolainst/destructive/auto-prune.sh b/tests/kolainst/destructive/auto-prune.sh
new file mode 100755
index 0000000000..5d5e5207b5
--- /dev/null
+++ b/tests/kolainst/destructive/auto-prune.sh
@@ -0,0 +1,94 @@
+#!/bin/bash
+set -xeuo pipefail
+
+# https://github.com/ostreedev/ostree/issues/2670
+
+. ${KOLA_EXT_DATA}/libinsttest.sh
+
+# make two fake ostree commits with modified kernels of about the same size
+cd /root
+mkdir -p rootfs/usr/lib/modules/`uname -r`
+cp /usr/lib/modules/`uname -r`/vmlinuz rootfs/usr/lib/modules/`uname -r`
+echo 1 >> rootfs/usr/lib/modules/`uname -r`/vmlinuz
+ostree commit --base "${host_refspec}" -P --tree=dir=rootfs -b modkernel1
+echo 1 >> rootfs/usr/lib/modules/`uname -r`/vmlinuz
+ostree commit --base "${host_refspec}" -P --tree=dir=rootfs -b modkernel2
+
+assert_bootfs_has_n_bootcsum_dirs() {
+  local expected=$1; shift
+  local actual
+  actual=$(ls -d /boot/ostree/${host_osname}-* | wc -l)
+  if [ "$expected" != "$actual" ]; then
+    ls -l /boot/ostree
+    assert_not_reached "expected $expected bootcsum dirs, found $actual"
+  fi
+}
+
+consume_bootfs_space() {
+  local free_blocks=$(stat --file-system /boot -c '%a')
+  local block_size=$(stat --file-system /boot -c '%s')
+  # leave 1 block free
+  unshare -m bash -c \
+    "mount -o rw,remount /boot && \
+     dd if=/dev/zero of=/boot/bigfile count=$((free_blocks-1)) bs=${block_size}"
+}
+
+unconsume_bootfs_space() {
+  unshare -m bash -c "mount -o rw,remount /boot && rm /boot/bigfile"
+}
+
+assert_bootfs_has_n_bootcsum_dirs 1
+
+# first, deploy our second deployment on a filled up bootfs
+# the booted deployment is never pruned, so this is a hopeless case and auto-pruning can't save us
+consume_bootfs_space
+rpm-ostree rebase :modkernel1
+if OSTREE_ENABLE_AUTO_EARLY_PRUNE=1 ostree admin finalize-staged |& tee out.txt; then
+  assert_not_reached "successfully wrote to filled up bootfs"
+fi
+assert_file_has_content out.txt "No space left on device"
+rm out.txt
+unconsume_bootfs_space
+rpm-ostree cleanup -bpr
+
+# OK, now deploy our second deployment for realsies on a bootfs with ample space
+# and sanity-check that auto-pruning doesn't kick in
+assert_bootfs_has_n_bootcsum_dirs 1
+
+rpm-ostree rebase :modkernel1
+OSTREE_ENABLE_AUTO_EARLY_PRUNE=1 ostree admin finalize-staged |& tee out.txt
+assert_not_file_has_content out.txt "updating bootloader in two steps"
+rm out.txt
+
+# and put it in rollback position; this is the deployment that'll get auto-pruned
+rpm-ostree rollback
+
+assert_bootfs_has_n_bootcsum_dirs 2
+bootloader_orig=$(sha256sum /boot/loader/entries/*)
+
+# now try to deploy a third deployment without early pruning; we should hit ENOSPC
+consume_bootfs_space
+rpm-ostree rebase :modkernel2
+if ostree admin finalize-staged |& tee out.txt; then
+  assert_not_reached "successfully wrote kernel without auto-pruning"
+fi
+assert_file_has_content out.txt "No space left on device"
+rm out.txt
+
+# there's 3 bootcsums now because it'll also have the partially written
+# bootcsum dir we were creating when we hit ENOSPC; this verifies that all the
+# deployments have different bootcsums
+assert_bootfs_has_n_bootcsum_dirs 3
+# but the bootloader wasn't updated
+assert_streq "$bootloader_orig" "$(sha256sum /boot/loader/entries/*)"
+
+# now, try again but with auto-pruning enabled
+rpm-ostree rebase :modkernel2
+OSTREE_ENABLE_AUTO_EARLY_PRUNE=1 ostree admin finalize-staged |& tee out.txt
+assert_file_has_content out.txt "updating bootloader in two steps"
+rm out.txt
+
+assert_bootfs_has_n_bootcsum_dirs 2
+assert_not_streq "$bootloader_orig" "$(sha256sum /boot/loader/entries/*)"
+
+echo "ok bootfs auto-prune"