Skip to content

Commit

Permalink
ci: automate chain upgrades on k8s deployments
Browse files Browse the repository at this point in the history
Here's long-needed automation for deploying chain upgrades on Penumbra
testnets deployed to k8s. Ideally we'd use an operator pattern to manage
the lifecycle, but we're trying to move quickly, so bash it'll be for
now.

Also included are some lower thresholds for the devnet environment,
since it's frequently used for this kind of testing. Might make more
sense to create a dedicated "upgrade-testing" environment, but again,
prioritizing speed and reliability, so we can circle back on fleshing
things out once we hit our deadlines.

Tacked on some helper scripts, such as "enable-maintenance-mode"
and "perform-point-release", which are useful for managing and
debugging devnets.
  • Loading branch information
conorsch committed May 24, 2024
1 parent 75c5c94 commit 335d9e9
Show file tree
Hide file tree
Showing 6 changed files with 258 additions and 3 deletions.
3 changes: 2 additions & 1 deletion deployments/charts/penumbra-network/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,8 @@ network:
# Customization of the voting period for governance proposals.
# Dial this down if you want faster voting for testing.
proposal_voting_blocks:

# Set the length of an epoch, in blocks. If not set, uses pd's default.
epoch_duration:
# How many validators are present at genesis. This number must
# match the count in the JSON file used to define the validators.
num_validators: 2
Expand Down
6 changes: 4 additions & 2 deletions deployments/helmfile.d/penumbra-devnet.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -9,9 +9,11 @@ releases:
- preserve_lb_svc: true
- only_lb_svc: false
- image:
tag: main
tag: "main"
- network:
external_addresses: "104.198.226.117:26656,34.134.110.25:26656"
proposal_voting_blocks: "50"
epoch_duration: "20"
- part_of: penumbra-devnet
# Sidecar vars file for storing external ips. The "penumbra-network" chart
# doesn't read these vars, but the "get-lb-ips" script writes them there,
Expand All @@ -32,7 +34,7 @@ releases:
- preserve_lb_svc: true
- only_lb_svc: false
- image:
tag: main
tag: "main"
# Communicate intra-cluster to the private validator rpc address.
- penumbra_bootstrap_node_cometbft_rpc_url: "http://penumbra-devnet-val-0:26657"
- persistence:
Expand Down
46 changes: 46 additions & 0 deletions deployments/scripts/k8s-deploy-point-release
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
#!/bin/bash
# CI script to deploy a point-release to a Penumbra network, modifying
# existing versions while preserving state. Does *not* perform a migration.
# At a fundamental level, this script represents logic broken out from the
# catch-all 'ci.sh' script.
set -euo pipefail

# Container image repository; the point-release tag is appended below.
IMAGE="${IMAGE:-ghcr.io/penumbra-zone/penumbra}"
# Force explicit version declaration: there is no sane default for a release.
TO_VERSION="${TO_VERSION:-}"
if [[ -z "$TO_VERSION" ]] ; then
  >&2 echo "ERROR: TO_VERSION must be set with point-release version to deploy"
  exit 1
fi

# Target environment; defaults to devnet to avoid touching testnet.
HELM_RELEASE="${HELM_RELEASE:-penumbra-devnet}"

# Check that the network we're trying to configure has a valid config.
HELMFILE_MANIFEST="./helmfile.d/${HELM_RELEASE}.yaml"
if [[ ! -e "$HELMFILE_MANIFEST" ]]; then
  >&2 echo "ERROR: helm release name '$HELM_RELEASE' not supported"
  >&2 echo "Consider creating '$HELMFILE_MANIFEST'"
  exit 1
fi

# Label selector matching every pd statefulset in the environment,
# across both fullnodes and genesis-validators.
pod_selector="app.kubernetes.io/part-of=${HELM_RELEASE}, app.kubernetes.io/component in (fullnode, genesis-validator)"

# Bump the version of pd running for the deployment, across all
# fullnodes and validators. Allow the cluster to reconcile the changes
# by terminating and creating pods to match. Does *not* alter chain state.
# Allows us to handle "patch" versions.
roll_out_new_image() {
  kubectl set image statefulset -l "$pod_selector" "pd=${IMAGE}:${TO_VERSION}"
  # Wait for rollout to complete. Will block until pods are marked Ready.
  kubectl rollout status statefulset -l "$pod_selector"
}

main() {
  >&2 echo "Performing point-release of '$HELM_RELEASE' to ${TO_VERSION}..."
  # Brief pause so an operator can ctrl+c if the wrong env was specified.
  sleep 2
  roll_out_new_image
}

main
42 changes: 42 additions & 0 deletions deployments/scripts/k8s-disable-maintenance-mode
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
#!/usr/bin/env bash
# CI script to take a given testnet deployment environment *out of*
# "maintenanceMode", restarting both pd and cometbft so the deployment
# resumes normal operation after interactive maintenance has finished.
# Counterpart to the 'k8s-enable-maintenance-mode' script.
set -euo pipefail

# script expects to be in deployments/ dir
if [[ ! -e ci.sh ]] ; then
>&2 echo "ERROR: script should be run from inside 'deployments/' dir"
exit 1
fi

# Require an explicit version: the helmfile apply below re-pins 'image.tag',
# so the caller must state which version the environment should run.
TO_VERSION="${TO_VERSION:-}"
if [[ -z "$TO_VERSION" ]] ; then
>&2 echo "ERROR: TO_VERSION must be set with an explicit version"
exit 1
fi

# Default to devnet to avoid touching testnet unless explicitly requested.
HELM_RELEASE="${HELM_RELEASE:-penumbra-devnet}"

# Set maintenanceMode=false and block until the statefulsets are Ready again.
function disable_maintenance_mode() {
>&2 echo "Disabling maintenance mode..."
helmfile --quiet apply -f "helmfile.d/${HELM_RELEASE}.yaml" --args \
--set=maintenanceMode=false \
--set="image.tag=${TO_VERSION}"

>&2 echo "Waiting for services to be running again..."
kubectl rollout status statefulset \
-l "app.kubernetes.io/part-of=${HELM_RELEASE}, app.kubernetes.io/component in (fullnode, genesis-validator)"

>&2 echo "Done, the statefulsets are running again"
}

# Main entrypoint
function main() {
disable_maintenance_mode
}

main
exit 0
43 changes: 43 additions & 0 deletions deployments/scripts/k8s-enable-maintenance-mode
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
#!/usr/bin/env bash
# CI script to set a given testnet deployment environment in "maintenanceMode",
# essentially stopping both pd and cometbft, so that an interactive environment
# can be created, without the services writing to local state.
set -euo pipefail

# script expects to be in deployments/ dir
if [[ ! -e ci.sh ]] ; then
>&2 echo "ERROR: script should be run from inside 'deployments/' dir"
exit 1
fi

# Require an explicit version: the helmfile apply below re-pins 'image.tag',
# so the caller must state which version the environment should run.
TO_VERSION="${TO_VERSION:-}"
if [[ -z "$TO_VERSION" ]] ; then
>&2 echo "ERROR: TO_VERSION must be set with an explicit version"
exit 1
fi

# Default to devnet to avoid touching testnet unless explicitly requested.
HELM_RELEASE="${HELM_RELEASE:-penumbra-devnet}"

# Set maintenanceMode=true and block until the statefulsets have reconciled.
function enable_maintenance_mode() {
>&2 echo "Enabling maintenance mode on ${HELM_RELEASE}..."
helmfile --quiet apply -f "helmfile.d/${HELM_RELEASE}.yaml" --args \
--set=maintenanceMode=true \
--set="image.tag=${TO_VERSION}"

>&2 echo "Waiting for maintenance mode..."
kubectl rollout status statefulset \
-l "app.kubernetes.io/part-of=${HELM_RELEASE}, app.kubernetes.io/component in (fullnode, genesis-validator)"

>&2 echo "Done, the statefulsets are paused now"
}

# Main entrypoint
function main() {
# Pause the environment; resume later via 'k8s-disable-maintenance-mode'.
enable_maintenance_mode
}

main
exit 0
121 changes: 121 additions & 0 deletions deployments/scripts/k8s-perform-chain-upgrade-via-pd-migrate
Original file line number Diff line number Diff line change
@@ -0,0 +1,121 @@
#!/usr/bin/env bash
# CI script to perform a chain migration via `pd migrate` on testnet deployment.
# The general flow is:
#
# 1. determine target environment
# 2. determine end-state version
# 3. set environment to maintenance mode (i.e. stop penumbra & cometbft)
# 4. backup
# 5. migrate
# 6. export archives (required for testnet join later)
# 7. disable maint mode.
#
# Eventually we can make this logic durable enough to run in CI, but for now
# we expect this script to be run on an admin's workstation, to be responsive
# to migration behavior.
set -euo pipefail

# script expects to be in deployments/ dir
if [[ ! -e ci.sh ]] ; then
>&2 echo "ERROR: script should be run from inside 'deployments/' dir"
exit 1
fi

# Post-upgrade version; required, since it names the resulting archives
# and is passed through to the maintenance-mode scripts as 'image.tag'.
TO_VERSION="${TO_VERSION:-}"
if [[ -z "$TO_VERSION" ]] ; then
>&2 echo "ERROR: TO_VERSION must be set with post-upgrade version"
exit 1
fi

# Default to devnet to avoid touching testnet unless explicitly requested.
HELM_RELEASE="${HELM_RELEASE:-penumbra-devnet}"

# Get the pod names for the genesis-validators in the target environment.
function get_validators() {
kubectl get pods -l "app.kubernetes.io/part-of=${HELM_RELEASE}, app.kubernetes.io/component=genesis-validator" -o name
}

# Get the pod names for all fullnodes in the target environment.
function get_fullnodes() {
kubectl get pods -l "app.kubernetes.io/part-of=${HELM_RELEASE}, app.kubernetes.io/name=penumbra-node" -o name
}

# Perform chain migration. Generic over fullnode/validator,
# which have slightly different mount points for their data.
# Assumes that service has already been paused!
#
# Arguments:
#   $1 - pod name (as emitted by 'kubectl get pods -o name')
#   $2 - testnet data directory inside the pod's pd container
#
# NOTE(review): the 'kubectl exec -it' calls below request a TTY; fine from
# an admin workstation, but likely to fail in non-interactive CI — confirm
# before wiring this script into automation.
function perform_migration() {
local podname
local testnet_dir
podname="${1:-}"
testnet_dir="${2:-}"
# NOTE(review): nothing reads the remaining args after this shift —
# presumably a leftover; confirm it can be dropped.
shift 2

# Snapshot the entire node0 state dir first, so the pre-migration chain
# state can be restored (or rehosted for archive nodes) if needed.
>&2 echo "Backing up node state for '$podname'..."
backup_tarball="${testnet_dir}/node0-state-backup.tar"
kubectl exec -it "$podname" -c pd -- rm -f "$backup_tarball"
kubectl exec -it "$podname" -c pd -- tar -C "$testnet_dir" -cf "$backup_tarball" node0

# Run the actual state migration in-place, against both pd and cometbft homes.
>&2 echo "Performing migration for '$podname'..."
kubectl exec -it "$podname" -c pd -- pd migrate \
--force \
--home "${testnet_dir}/node0/pd" \
--comet-home "${testnet_dir}/node0/cometbft"

# Bundle only what a new-joining node needs: rocksdb, genesis, and the
# validator signing state. The --transform flattens the paths in the archive.
>&2 echo "Exporting state archive for '$podname'..."
migration_archive="${testnet_dir}/node0-migration-archive.tar.gz"
kubectl exec -it "$podname" -c pd -- rm -f "$migration_archive"
kubectl exec -it "$podname" -c pd -- tar -C "$testnet_dir" \
--transform='s#node0/pd/##;s#node0/cometbft/config/##;s#node0/cometbft/data/##' \
-czf "$migration_archive" \
node0/pd/rocksdb \
node0/cometbft/config/genesis.json \
node0/cometbft/data/priv_validator_state.json
>&2 echo "Migration complete! Archive available at: ${podname}:${migration_archive}"
}

# Fetch pre-upgrade export archive, and post-export migration archive,
# locally, for rehosting on snapshots server. New-joining nodes will need
# a post-migration snapshot, and archive nodes (e.g. for Hermes) will need
# the pre-migration state.
function fetch_archives() {
# pick any node
# NOTE(review): pod name and filepaths are hardcoded and brittle — assumes
# a fullnode named '<release>-nodes-1' exists; verify against the chart.
pod_name="${HELM_RELEASE}-nodes-1"

>&2 echo "Fetching archives from $pod_name ..."
# N.B. these filepaths are hardcoded and brittle, any refactor must be careful to update throughout.
for f in "node0-migration-archive.tar.gz" "node0-state-backup.tar" ; do
rm -f "$f"
kubectl cp -c pd "${pod_name}:/penumbra-config/testnet_data/${f}" "$f"
done
# Rename local copies so they're tagged with pod and target version.
mv -v "node0-migration-archive.tar.gz" "${pod_name}-${TO_VERSION}-migration-archive.tar.gz"
mv -v "node0-state-backup.tar" "${pod_name}-state-backup-pre-${TO_VERSION}.tar"
}

# Main entrypoint: pause the env, migrate every validator and fullnode,
# pull the archives locally, then resume the env.
function main() {
>&2 echo "Upgrading environment '${HELM_RELEASE}' to version ${TO_VERSION}..."

# sleep for a chance to ctrl+c if wrong environment specified
sleep 5

# Exported so the maintenance-mode child scripts target the same env/version.
export HELM_RELEASE
export TO_VERSION
scripts/k8s-enable-maintenance-mode
# validators and fullnodes have a slightly different mount path
for v in $(get_validators) ; do
testnet_dir="/penumbra-config/${HELM_RELEASE}-val"
perform_migration "$v" "$testnet_dir"
done

for n in $(get_fullnodes) ; do
testnet_dir="/penumbra-config/testnet_data"
perform_migration "$n" "$testnet_dir"
done

fetch_archives
scripts/k8s-disable-maintenance-mode
>&2 echo "Migration complete! ${HELM_RELEASE} is now running version ${TO_VERSION}"
}

main

exit 0

0 comments on commit 335d9e9

Please sign in to comment.