From 335d9e98dcd8adb811cb7a0ea0b77c160db2e3d1 Mon Sep 17 00:00:00 2001 From: Conor Schaefer Date: Mon, 20 May 2024 09:40:04 -0700 Subject: [PATCH] ci: automate chain upgrades on k8s deployments Here's long-needed automation for deploying chain upgrades on Penumbra testnets deployed to k8s. Ideally we'd use an operator pattern to manage the lifecycle, but we're trying to move quickly, so bash it'll be for now. Also included are some lower thresholds for the devnet environment, since it's frequently used for this kind of testing. Might make more sense to create a dedicated "upgrade-testing" environment, but again, prioritizing speed and reliability, so we can circle back on fleshing things out once we hit our deadlines. Tacked on some helper scripts, such as "enable-maintenance-mode" and "deploy-point-release", which are helpful for managing and debugging devnets. --- .../charts/penumbra-network/values.yaml | 3 +- deployments/helmfile.d/penumbra-devnet.yaml | 6 +- deployments/scripts/k8s-deploy-point-release | 46 +++++++ .../scripts/k8s-disable-maintenance-mode | 42 ++++++ .../scripts/k8s-enable-maintenance-mode | 43 +++++++ .../k8s-perform-chain-upgrade-via-pd-migrate | 121 ++++++++++++++++++ 6 files changed, 258 insertions(+), 3 deletions(-) create mode 100755 deployments/scripts/k8s-deploy-point-release create mode 100755 deployments/scripts/k8s-disable-maintenance-mode create mode 100755 deployments/scripts/k8s-enable-maintenance-mode create mode 100755 deployments/scripts/k8s-perform-chain-upgrade-via-pd-migrate diff --git a/deployments/charts/penumbra-network/values.yaml b/deployments/charts/penumbra-network/values.yaml index b10555dba6..6f9537353a 100644 --- a/deployments/charts/penumbra-network/values.yaml +++ b/deployments/charts/penumbra-network/values.yaml @@ -28,7 +28,8 @@ network: # Customization of the voting period for governance proposals. # Dial this down if you want faster voting for testing. 
proposal_voting_blocks: - + # Set the length of an epoch, in blocks. If not set, uses pd's default. + epoch_duration: # How many validators are present at genesis. This number must # match the count in the JSON file used to define the validators. num_validators: 2 diff --git a/deployments/helmfile.d/penumbra-devnet.yaml b/deployments/helmfile.d/penumbra-devnet.yaml index fe718f8d51..4cb4f0d128 100644 --- a/deployments/helmfile.d/penumbra-devnet.yaml +++ b/deployments/helmfile.d/penumbra-devnet.yaml @@ -9,9 +9,11 @@ releases: - preserve_lb_svc: true - only_lb_svc: false - image: - tag: main + tag: "main" - network: external_addresses: "104.198.226.117:26656,34.134.110.25:26656" + proposal_voting_blocks: "50" + epoch_duration: "20" - part_of: penumbra-devnet # Sidecar vars file for storing external ips. The "penumbra-network" chart # doesn't read these vars, but the "get-lb-ips" script writes them there, @@ -32,7 +34,7 @@ releases: - preserve_lb_svc: true - only_lb_svc: false - image: - tag: main + tag: "main" # Communicate intra-cluster to the private validator rpc address. - penumbra_bootstrap_node_cometbft_rpc_url: "http://penumbra-devnet-val-0:26657" - persistence: diff --git a/deployments/scripts/k8s-deploy-point-release b/deployments/scripts/k8s-deploy-point-release new file mode 100755 index 0000000000..e2e1561f00 --- /dev/null +++ b/deployments/scripts/k8s-deploy-point-release @@ -0,0 +1,46 @@ +#!/bin/bash +# CI script to deploy a point-release to a Penumbra network, modifying +# existing versions while preserving state. Does *not* perform a migration. +# At a fundamental level, this script represents logic broken out from the +# catch-all 'ci.sh' script. +set -euo pipefail + +# Reference the usual vars. 
+IMAGE="${IMAGE:-ghcr.io/penumbra-zone/penumbra}" +# Force explicit version declaration +TO_VERSION="${TO_VERSION:-}" +if [[ -z "$TO_VERSION" ]] ; then + >&2 echo "ERROR: TO_VERSION must be set with point-release version to deploy" + exit 1 +fi + +HELM_RELEASE="${HELM_RELEASE:-penumbra-devnet}" + +# Check that the network we're trying to configure has a valid config. +HELMFILE_MANIFEST="./helmfile.d/${HELM_RELEASE}.yaml" +if [[ ! -e "$HELMFILE_MANIFEST" ]]; then + >&2 echo "ERROR: helm release name '$HELM_RELEASE' not supported" + >&2 echo "Consider creating '$HELMFILE_MANIFEST'" + exit 1 +fi + +# Bump the version of pd running for the deployment, across all +# fullnodes and validators. Allow the cluster to reconcile the changes +# by terminating and creating pods to match. Does *not* alter chain state. +# Allows us to handle "patch" versions. +function update_image_for_running_deployment() { + kubectl set image statefulset \ + -l "app.kubernetes.io/part-of=${HELM_RELEASE}, app.kubernetes.io/component in (fullnode, genesis-validator)" \ + "pd=${IMAGE}:${TO_VERSION}" + # Wait for rollout to complete. Will block until pods are marked Ready. + kubectl rollout status statefulset \ + -l "app.kubernetes.io/part-of=${HELM_RELEASE}, app.kubernetes.io/component in (fullnode, genesis-validator)" +} + +function main() { + >&2 echo "Performing point-release of '$HELM_RELEASE' to ${TO_VERSION}..." + sleep 2 + update_image_for_running_deployment +} + +main diff --git a/deployments/scripts/k8s-disable-maintenance-mode b/deployments/scripts/k8s-disable-maintenance-mode new file mode 100755 index 0000000000..3eb93c1d90 --- /dev/null +++ b/deployments/scripts/k8s-disable-maintenance-mode @@ -0,0 +1,42 @@ +#!/usr/bin/env bash +# CI script to take a given testnet deployment environment out of "maintenanceMode", +# so that the stopped services (pd and cometbft) resume running at the requested +# version once any interactive maintenance work is finished. 
+set -euo pipefail + +# script expects to be in deployments/ dir +if [[ ! -e ci.sh ]] ; then + >&2 echo "ERROR: script should be run from inside 'deployments/' dir" + exit 1 +fi + +TO_VERSION="${TO_VERSION:-}" +if [[ -z "$TO_VERSION" ]] ; then + >&2 echo "ERROR: TO_VERSION must be set with an explicit version" + exit 1 +fi + +# Default to devnet to avoid touching testnet unless explicitly requested. +HELM_RELEASE="${HELM_RELEASE:-penumbra-devnet}" + +# Set maintenanceMode=false. +function disable_maintenance_mode() { + >&2 echo "Disabling maintenance mode..." + helmfile --quiet apply -f "helmfile.d/${HELM_RELEASE}.yaml" --args \ + --set=maintenanceMode=false \ + --set="image.tag=${TO_VERSION}" + + >&2 echo "Waiting for services to be running again..." + kubectl rollout status statefulset \ + -l "app.kubernetes.io/part-of=${HELM_RELEASE}, app.kubernetes.io/component in (fullnode, genesis-validator)" + + >&2 echo "Done, the statefulsets are running again" +} + +# Main entrypoint +function main() { + disable_maintenance_mode +} + +main +exit 0 diff --git a/deployments/scripts/k8s-enable-maintenance-mode b/deployments/scripts/k8s-enable-maintenance-mode new file mode 100755 index 0000000000..814e7d22ac --- /dev/null +++ b/deployments/scripts/k8s-enable-maintenance-mode @@ -0,0 +1,43 @@ +#!/usr/bin/env bash +# CI script to set a given testnet deployment environment in "maintenanceMode", +# essentially stopping both pd and cometbft, so that an interactive environment +# can be created, without the services writing to local state. +set -euo pipefail + +# script expects to be in deployments/ dir +if [[ ! -e ci.sh ]] ; then + >&2 echo "ERROR: script should be run from inside 'deployments/' dir" + exit 1 +fi + +TO_VERSION="${TO_VERSION:-}" +if [[ -z "$TO_VERSION" ]] ; then + >&2 echo "ERROR: TO_VERSION must be set with an explicit version" + exit 1 +fi + +# Default to devnet to avoid touching testnet unless explicitly requested. 
+HELM_RELEASE="${HELM_RELEASE:-penumbra-devnet}" + +# Set maintenanceMode=true. +function enable_maintenance_mode() { + >&2 echo "Enabling maintenance mode on ${HELM_RELEASE}..." + helmfile --quiet apply -f "helmfile.d/${HELM_RELEASE}.yaml" --args \ + --set=maintenanceMode=true \ + --set="image.tag=${TO_VERSION}" + + >&2 echo "Waiting for maintenance mode..." + kubectl rollout status statefulset \ + -l "app.kubernetes.io/part-of=${HELM_RELEASE}, app.kubernetes.io/component in (fullnode, genesis-validator)" + + >&2 echo "Done, the statefulsets are paused now" +} + +# Main entrypoint +function main() { + # uncomment at will + enable_maintenance_mode +} + +main +exit 0 diff --git a/deployments/scripts/k8s-perform-chain-upgrade-via-pd-migrate b/deployments/scripts/k8s-perform-chain-upgrade-via-pd-migrate new file mode 100755 index 0000000000..06feb77268 --- /dev/null +++ b/deployments/scripts/k8s-perform-chain-upgrade-via-pd-migrate @@ -0,0 +1,121 @@ +#!/usr/bin/env bash +# CI script to perform a chain migration via `pd migrate` on a testnet deployment. +# The general flow is: +# +# 1. determine target environment +# 2. determine end-state version +# 3. set environment to maintenance mode (i.e. stop penumbra & cometbft) +# 4. backup +# 5. migrate +# 6. export archives (required for testnet join later) +# 7. disable maint mode. +# +# Eventually we can make this logic durable enough to run in CI, but for now +# we expect this script to be run on an admin's workstation, to be responsive +# to migration behavior. +set -euo pipefail + +# script expects to be in deployments/ dir +if [[ ! -e ci.sh ]] ; then + >&2 echo "ERROR: script should be run from inside 'deployments/' dir" + exit 1 +fi + +TO_VERSION="${TO_VERSION:-}" +if [[ -z "$TO_VERSION" ]] ; then + >&2 echo "ERROR: TO_VERSION must be set with post-upgrade version" + exit 1 +fi + +# Default to devnet to avoid touching testnet unless explicitly requested. 
+HELM_RELEASE="${HELM_RELEASE:-penumbra-devnet}" + +# Get the pod names for the genesis-validators in the target environment. +function get_validators() { + kubectl get pods -l "app.kubernetes.io/part-of=${HELM_RELEASE}, app.kubernetes.io/component=genesis-validator" -o name +} + +# Get the pod names for all fullnodes in the target environment. +function get_fullnodes() { + kubectl get pods -l "app.kubernetes.io/part-of=${HELM_RELEASE}, app.kubernetes.io/name=penumbra-node" -o name +} + +# Perform chain migration. Generic over fullnode/validator, +# which have slightly different mount points for their data. +# Assumes that service has already been paused! +function perform_migration() { + local podname + local testnet_dir + podname="${1:-}" + testnet_dir="${2:-}" + shift 2 + + >&2 echo "Backing up node state for '$podname'..." + backup_tarball="${testnet_dir}/node0-state-backup.tar" + kubectl exec -it "$podname" -c pd -- rm -f "$backup_tarball" + kubectl exec -it "$podname" -c pd -- tar -C "$testnet_dir" -cf "$backup_tarball" node0 + + >&2 echo "Performing migration for '$podname'..." + kubectl exec -it "$podname" -c pd -- pd migrate \ + --force \ + --home "${testnet_dir}/node0/pd" \ + --comet-home "${testnet_dir}/node0/cometbft" + + >&2 echo "Exporting state archive for '$podname'..." + migration_archive="${testnet_dir}/node0-migration-archive.tar.gz" + kubectl exec -it "$podname" -c pd -- rm -f "$migration_archive" + kubectl exec -it "$podname" -c pd -- tar -C "$testnet_dir" \ + --transform='s#node0/pd/##;s#node0/cometbft/config/##;s#node0/cometbft/data/##' \ + -czf "$migration_archive" \ + node0/pd/rocksdb \ + node0/cometbft/config/genesis.json \ + node0/cometbft/data/priv_validator_state.json + >&2 echo "Migration complete! Archive available at: ${podname}:${migration_archive}" +} + +# Fetch pre-upgrade export archive, and post-export migration archive, +# locally, for rehosting on snapshots server. 
New-joining nodes will need +# a post-migration snapshot, and archive nodes (e.g. for Hermes) will need +# the pre-migration state. +function fetch_archives() { + # pick any node + pod_name="${HELM_RELEASE}-nodes-1" + + >&2 echo "Fetching archives from $pod_name ..." + # N.B. these filepaths are hardcoded and brittle, any refactor must be careful to update throughout. + for f in "node0-migration-archive.tar.gz" "node0-state-backup.tar" ; do + rm -f "$f" + kubectl cp -c pd "${pod_name}:/penumbra-config/testnet_data/${f}" "$f" + done + mv -v "node0-migration-archive.tar.gz" "${pod_name}-${TO_VERSION}-migration-archive.tar.gz" + mv -v "node0-state-backup.tar" "${pod_name}-state-backup-pre-${TO_VERSION}.tar" +} + +function main() { + >&2 echo "Upgrading environment '${HELM_RELEASE}' to version ${TO_VERSION}..." + + # sleep for a chance to ctrl+c if wrong environment specified + sleep 5 + + export HELM_RELEASE + export TO_VERSION + scripts/k8s-enable-maintenance-mode + # validators and fullnodes have a slightly different mount path + for v in $(get_validators) ; do + testnet_dir="/penumbra-config/${HELM_RELEASE}-val" + perform_migration "$v" "$testnet_dir" + done + + for n in $(get_fullnodes) ; do + testnet_dir="/penumbra-config/testnet_data" + perform_migration "$n" "$testnet_dir" + done + + fetch_archives + scripts/k8s-disable-maintenance-mode + >&2 echo "Migration complete! ${HELM_RELEASE} is now running version ${TO_VERSION}" +} + +main + +exit 0