Skip to content

Commit

Permalink
ci: automate chain upgrades on k8s deployments
Browse files Browse the repository at this point in the history
Here's long-needed automation for deploying chain upgrades on Penumbra
testnets deployed to k8s. Ideally we'd use an operator pattern to manage
the lifecycle, but we're trying to move quickly, so bash it'll be for
now.

Also included are some lower thresholds for the devnet environment,
since it's frequently used for this kind of testing. Might make more
sense to create a dedicated "upgrade-testing" environment, but again,
prioritizing speed and reliability, so we can circle back on fleshing
things out once we hit our deadlines.

Tacked on some helper scripts, such as "enable-maintenance-mode"
and "perform-point-release", which are useful for managing and
debugging devnets.
  • Loading branch information
conorsch committed May 24, 2024
1 parent 75c5c94 commit 335d9e9
Show file tree
Hide file tree
Showing 6 changed files with 258 additions and 3 deletions.
3 changes: 2 additions & 1 deletion deployments/charts/penumbra-network/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,8 @@ network:
# Customization of the voting period for governance proposals.
# Dial this down if you want faster voting for testing.
proposal_voting_blocks:

# Set the length of an epoch, in blocks. If not set, uses pd's default.
epoch_duration:
# How many validators are present at genesis. This number must
# match the count in the JSON file used to define the validators.
num_validators: 2
Expand Down
6 changes: 4 additions & 2 deletions deployments/helmfile.d/penumbra-devnet.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -9,9 +9,11 @@ releases:
- preserve_lb_svc: true
- only_lb_svc: false
- image:
tag: main
tag: "main"
- network:
external_addresses: "104.198.226.117:26656,34.134.110.25:26656"
proposal_voting_blocks: "50"
epoch_duration: "20"
- part_of: penumbra-devnet
# Sidecar vars file for storing external ips. The "penumbra-network" chart
# doesn't read these vars, but the "get-lb-ips" script writes them there,
Expand All @@ -32,7 +34,7 @@ releases:
- preserve_lb_svc: true
- only_lb_svc: false
- image:
tag: main
tag: "main"
# Communicate intra-cluster to the private validator rpc address.
- penumbra_bootstrap_node_cometbft_rpc_url: "http://penumbra-devnet-val-0:26657"
- persistence:
Expand Down
46 changes: 46 additions & 0 deletions deployments/scripts/k8s-deploy-point-release
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
#!/bin/bash
# CI script to deploy a point-release to a Penumbra network, modifying
# existing versions while preserving state. Does *not* perform a migration.
# At a fundamental level, this script represents logic broken out from the
# catch-all 'ci.sh' script.
set -euo pipefail

# Container image repository; the point-release tag is appended below.
IMAGE="${IMAGE:-ghcr.io/penumbra-zone/penumbra}"
# Force explicit version declaration: there is no sane default for a release.
TO_VERSION="${TO_VERSION:-}"
if [[ -z "$TO_VERSION" ]] ; then
  >&2 echo "ERROR: TO_VERSION must be set with point-release version to deploy"
  exit 1
fi

# Target environment; defaults to devnet to avoid touching testnet.
HELM_RELEASE="${HELM_RELEASE:-penumbra-devnet}"

# Check that the network we're trying to configure has a valid config.
HELMFILE_MANIFEST="./helmfile.d/${HELM_RELEASE}.yaml"
if [[ ! -e "$HELMFILE_MANIFEST" ]]; then
  >&2 echo "ERROR: helm release name '$HELM_RELEASE' not supported"
  >&2 echo "Consider creating '$HELMFILE_MANIFEST'"
  exit 1
fi

# Label selector matching every pd statefulset in the environment,
# across both fullnodes and genesis-validators.
pod_selector="app.kubernetes.io/part-of=${HELM_RELEASE}, app.kubernetes.io/component in (fullnode, genesis-validator)"

# Bump the version of pd running for the deployment, across all
# fullnodes and validators. Allow the cluster to reconcile the changes
# by terminating and creating pods to match. Does *not* alter chain state.
# Allows us to handle "patch" versions.
roll_out_new_image() {
  kubectl set image statefulset -l "$pod_selector" "pd=${IMAGE}:${TO_VERSION}"
  # Wait for rollout to complete. Will block until pods are marked Ready.
  kubectl rollout status statefulset -l "$pod_selector"
}

main() {
  >&2 echo "Performing point-release of '$HELM_RELEASE' to ${TO_VERSION}..."
  # Brief pause so an operator can ctrl+c if the wrong env was specified.
  sleep 2
  roll_out_new_image
}

main
42 changes: 42 additions & 0 deletions deployments/scripts/k8s-disable-maintenance-mode
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
#!/usr/bin/env bash
# CI script to take a given testnet deployment environment *out of*
# "maintenanceMode", restarting both pd and cometbft so the deployment
# resumes normal operation after interactive maintenance has finished.
# Counterpart to the 'k8s-enable-maintenance-mode' script.
set -euo pipefail

# script expects to be in deployments/ dir
if [[ ! -e ci.sh ]] ; then
>&2 echo "ERROR: script should be run from inside 'deployments/' dir"
exit 1
fi

# Require an explicit version: the helmfile apply below re-pins 'image.tag',
# so the caller must state which version the environment should run.
TO_VERSION="${TO_VERSION:-}"
if [[ -z "$TO_VERSION" ]] ; then
>&2 echo "ERROR: TO_VERSION must be set with an explicit version"
exit 1
fi

# Default to devnet to avoid touching testnet unless explicitly requested.
HELM_RELEASE="${HELM_RELEASE:-penumbra-devnet}"

# Set maintenanceMode=false and block until the statefulsets are Ready again.
function disable_maintenance_mode() {
>&2 echo "Disabling maintenance mode..."
helmfile --quiet apply -f "helmfile.d/${HELM_RELEASE}.yaml" --args \
--set=maintenanceMode=false \
--set="image.tag=${TO_VERSION}"

>&2 echo "Waiting for services to be running again..."
kubectl rollout status statefulset \
-l "app.kubernetes.io/part-of=${HELM_RELEASE}, app.kubernetes.io/component in (fullnode, genesis-validator)"

>&2 echo "Done, the statefulsets are running again"
}

# Main entrypoint
function main() {
disable_maintenance_mode
}

main
exit 0
43 changes: 43 additions & 0 deletions deployments/scripts/k8s-enable-maintenance-mode
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
#!/usr/bin/env bash
# CI script to set a given testnet deployment environment in "maintenanceMode",
# essentially stopping both pd and cometbft, so that an interactive environment
# can be created, without the services writing to local state.
set -euo pipefail

# script expects to be in deployments/ dir
if [[ ! -e ci.sh ]] ; then
>&2 echo "ERROR: script should be run from inside 'deployments/' dir"
exit 1
fi

# Require an explicit version: the helmfile apply below re-pins 'image.tag',
# so the caller must state which version the environment should run.
TO_VERSION="${TO_VERSION:-}"
if [[ -z "$TO_VERSION" ]] ; then
>&2 echo "ERROR: TO_VERSION must be set with an explicit version"
exit 1
fi

# Default to devnet to avoid touching testnet unless explicitly requested.
HELM_RELEASE="${HELM_RELEASE:-penumbra-devnet}"

# Set maintenanceMode=true and block until the statefulsets have reconciled.
function enable_maintenance_mode() {
>&2 echo "Enabling maintenance mode on ${HELM_RELEASE}..."
helmfile --quiet apply -f "helmfile.d/${HELM_RELEASE}.yaml" --args \
--set=maintenanceMode=true \
--set="image.tag=${TO_VERSION}"

>&2 echo "Waiting for maintenance mode..."
kubectl rollout status statefulset \
-l "app.kubernetes.io/part-of=${HELM_RELEASE}, app.kubernetes.io/component in (fullnode, genesis-validator)"

>&2 echo "Done, the statefulsets are paused now"
}

# Main entrypoint
function main() {
# Pause the environment; resume later via 'k8s-disable-maintenance-mode'.
enable_maintenance_mode
}

main
exit 0
121 changes: 121 additions & 0 deletions deployments/scripts/k8s-perform-chain-upgrade-via-pd-migrate
Original file line number Diff line number Diff line change
@@ -0,0 +1,121 @@
#!/usr/bin/env bash
# CI script to perform a chain migration via `pd migrate` on testnet deployment.
# The general flow is:
#
# 1. determine target environment
# 2. determine end-state version
# 3. set environment to maintenance mode (i.e. stop penumbra & cometbft)
# 4. backup
# 5. migrate
# 6. export archives (required for testnet join later)
# 7. disable maint mode.
#
# Eventually we can make this logic durable enough to run in CI, but for now
# we expect this script to be run on an admin's workstation, to be responsive
# to migration behavior.
set -euo pipefail

# script expects to be in deployments/ dir
if [[ ! -e ci.sh ]] ; then
>&2 echo "ERROR: script should be run from inside 'deployments/' dir"
exit 1
fi

# Post-upgrade version; required, since it names the resulting archives
# and is passed through to the maintenance-mode scripts as 'image.tag'.
TO_VERSION="${TO_VERSION:-}"
if [[ -z "$TO_VERSION" ]] ; then
>&2 echo "ERROR: TO_VERSION must be set with post-upgrade version"
exit 1
fi

# Default to devnet to avoid touching testnet unless explicitly requested.
HELM_RELEASE="${HELM_RELEASE:-penumbra-devnet}"

# Get the pod names for the genesis-validators in the target environment.
function get_validators() {
kubectl get pods -l "app.kubernetes.io/part-of=${HELM_RELEASE}, app.kubernetes.io/component=genesis-validator" -o name
}

# Get the pod names for all fullnodes in the target environment.
function get_fullnodes() {
kubectl get pods -l "app.kubernetes.io/part-of=${HELM_RELEASE}, app.kubernetes.io/name=penumbra-node" -o name
}

# Perform chain migration. Generic over fullnode/validator,
# which have slightly different mount points for their data.
# Assumes that service has already been paused!
#
# Arguments:
#   $1 - pod name (as emitted by 'kubectl get pods -o name')
#   $2 - testnet data directory inside the pod's pd container
#
# NOTE(review): the 'kubectl exec -it' calls below request a TTY; fine from
# an admin workstation, but likely to fail in non-interactive CI — confirm
# before wiring this script into automation.
function perform_migration() {
local podname
local testnet_dir
podname="${1:-}"
testnet_dir="${2:-}"
# NOTE(review): nothing reads the remaining args after this shift —
# presumably a leftover; confirm it can be dropped.
shift 2

# Snapshot the entire node0 state dir first, so the pre-migration chain
# state can be restored (or rehosted for archive nodes) if needed.
>&2 echo "Backing up node state for '$podname'..."
backup_tarball="${testnet_dir}/node0-state-backup.tar"
kubectl exec -it "$podname" -c pd -- rm -f "$backup_tarball"
kubectl exec -it "$podname" -c pd -- tar -C "$testnet_dir" -cf "$backup_tarball" node0

# Run the actual state migration in-place, against both pd and cometbft homes.
>&2 echo "Performing migration for '$podname'..."
kubectl exec -it "$podname" -c pd -- pd migrate \
--force \
--home "${testnet_dir}/node0/pd" \
--comet-home "${testnet_dir}/node0/cometbft"

# Bundle only what a new-joining node needs: rocksdb, genesis, and the
# validator signing state. The --transform flattens the paths in the archive.
>&2 echo "Exporting state archive for '$podname'..."
migration_archive="${testnet_dir}/node0-migration-archive.tar.gz"
kubectl exec -it "$podname" -c pd -- rm -f "$migration_archive"
kubectl exec -it "$podname" -c pd -- tar -C "$testnet_dir" \
--transform='s#node0/pd/##;s#node0/cometbft/config/##;s#node0/cometbft/data/##' \
-czf "$migration_archive" \
node0/pd/rocksdb \
node0/cometbft/config/genesis.json \
node0/cometbft/data/priv_validator_state.json
>&2 echo "Migration complete! Archive available at: ${podname}:${migration_archive}"
}

# Fetch pre-upgrade export archive, and post-export migration archive,
# locally, for rehosting on snapshots server. New-joining nodes will need
# a post-migration snapshot, and archive nodes (e.g. for Hermes) will need
# the pre-migration state.
function fetch_archives() {
# pick any node
# NOTE(review): pod name and filepaths are hardcoded and brittle — assumes
# a fullnode named '<release>-nodes-1' exists; verify against the chart.
pod_name="${HELM_RELEASE}-nodes-1"

>&2 echo "Fetching archives from $pod_name ..."
# N.B. these filepaths are hardcoded and brittle, any refactor must be careful to update throughout.
for f in "node0-migration-archive.tar.gz" "node0-state-backup.tar" ; do
rm -f "$f"
kubectl cp -c pd "${pod_name}:/penumbra-config/testnet_data/${f}" "$f"
done
# Rename local copies so they're tagged with pod and target version.
mv -v "node0-migration-archive.tar.gz" "${pod_name}-${TO_VERSION}-migration-archive.tar.gz"
mv -v "node0-state-backup.tar" "${pod_name}-state-backup-pre-${TO_VERSION}.tar"
}

# Main entrypoint: pause the env, migrate every validator and fullnode,
# pull the archives locally, then resume the env.
function main() {
>&2 echo "Upgrading environment '${HELM_RELEASE}' to version ${TO_VERSION}..."

# sleep for a chance to ctrl+c if wrong environment specified
sleep 5

# Exported so the maintenance-mode child scripts target the same env/version.
export HELM_RELEASE
export TO_VERSION
scripts/k8s-enable-maintenance-mode
# validators and fullnodes have a slightly different mount path
for v in $(get_validators) ; do
testnet_dir="/penumbra-config/${HELM_RELEASE}-val"
perform_migration "$v" "$testnet_dir"
done

for n in $(get_fullnodes) ; do
testnet_dir="/penumbra-config/testnet_data"
perform_migration "$n" "$testnet_dir"
done

fetch_archives
scripts/k8s-disable-maintenance-mode
>&2 echo "Migration complete! ${HELM_RELEASE} is now running version ${TO_VERSION}"
}

main

exit 0

0 comments on commit 335d9e9

Please sign in to comment.