Skip to content

Commit

Permalink
Add backup and restore test
Browse files Browse the repository at this point in the history
  • Loading branch information
mkjpryor committed Jul 10, 2024
1 parent 6e1b5fa commit b15f472
Show file tree
Hide file tree
Showing 8 changed files with 293 additions and 60 deletions.
10 changes: 8 additions & 2 deletions .github/actions/destroy/action.yml
Original file line number Diff line number Diff line change
Expand Up @@ -9,10 +9,13 @@ runs:
set -e
source ./ci.env
source ./bin/activate "$AZIMUTH_CONFIG_ENVIRONMENT" "$AZIMUTH_ENVIRONMENT"
ansible-playbook stackhpc.azimuth_ops.destroy -e @extra-vars.yml -e force_destroy=true
ansible-playbook stackhpc.azimuth_ops.destroy \
-e @extra-vars.yml \
-e force_destroy=true \
-e capi_cluster_volumes_policy=delete
if: ${{ always() }}

- name: Release ingress floating IP
- name: Release floating IPs
shell: bash
run: |
set -eo pipefail
Expand All @@ -21,6 +24,9 @@ runs:
if [ -n "$INGRESS_IP" ]; then
openstack floating ip delete $INGRESS_IP
fi
if [ -n "$ZENITH_SSHD_IP" ]; then
openstack floating ip delete $ZENITH_SSHD_IP
fi
if: ${{ always() }}

- name: Configure S3 lock
Expand Down
38 changes: 27 additions & 11 deletions .github/actions/setup/action.yml
Original file line number Diff line number Diff line change
Expand Up @@ -143,25 +143,41 @@ runs:
action: acquire
if: ${{ steps.s3-lock-config.outputs.host != '' }}

- name: Allocate floating IP for ingress
- name: Allocate floating IPs
shell: bash
run: |
set -eo pipefail
source ci.env
source ./bin/activate "$AZIMUTH_CONFIG_ENVIRONMENT" "$AZIMUTH_ENVIRONMENT"
EXTNET_ID="$(
ansible -m debug -a "var=infra_external_network_id" -e @extra-vars.yml all |
jq -r ".plays[0].tasks[0].hosts.localhost.infra_external_network_id"
)"
IP_ADDRESS="$(
ansible_var() {
ANSIBLE_LOAD_CALLBACK_PLUGINS=true \
ANSIBLE_STDOUT_CALLBACK=json \
ansible -m debug -a "var=$1" -e @extra-vars.yml all | \
jq -r ".plays[0].tasks[0].hosts.localhost.$1"
}
EXTNET_ID="$(ansible_var infra_external_network_id)"
INSTALL_MODE="$(ansible_var install_mode)"
INGRESS_IP="$(
openstack floating ip create $EXTNET_ID \
--description "ingress IP for $AZIMUTH_ENVIRONMENT" \
--format value \
--column floating_ip_address
)"
cat >> ci.env <<EOF
export INGRESS_IP="$IP_ADDRESS"
cat >> ci.env <<-EOF
export INGRESS_IP="$INGRESS_IP"
EOF
env:
ANSIBLE_LOAD_CALLBACK_PLUGINS: "true"
ANSIBLE_STDOUT_CALLBACK: json
if [ "$INSTALL_MODE" = "ha" ]; then
ZENITH_SSHD_IP="$(
openstack floating ip create $EXTNET_ID \
--description "zenith SSHD IP for $AZIMUTH_ENVIRONMENT" \
--format value \
--column floating_ip_address
)"
cat >> ci.env <<-EOF
export ZENITH_SSHD_IP="$ZENITH_SSHD_IP"
EOF
fi
Original file line number Diff line number Diff line change
Expand Up @@ -39,10 +39,8 @@ consul_server_replicas: 1
zenith_sshd_service_load_balancer_ip: "{{ lookup('env', 'ZENITH_SSHD_IP') }}"

# Configure Velero backups
velero_enabled: true
velero_enabled: "{{ not (not velero_aws_access_key_id) }}"
velero_s3_url: https://leafcloud.store
velero_bucket_name: "azimuth-{{ azimuth_environment }}-backups"
# Create the bucket if it doesn't exist
velero_bucket_create: true
velero_aws_access_key_id:
velero_aws_secret_access_key :
velero_bucket_name: azimuth-ci-backups
velero_aws_access_key_id: "{{ lookup('env', 'VELERO_S3_ACCESS_KEY') }}"
velero_aws_secret_access_key: "{{ lookup('env', 'VELERO_S3_SECRET_KEY') }}"
191 changes: 191 additions & 0 deletions .github/workflows/test-backup-restore.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,191 @@
name: Backup and restore test

on:
  # Allow manual execution on any branch
  workflow_dispatch:
    inputs:
      target-cloud:
        description: >-
          The cloud to target for the run.
          Leave blank to use the default cloud.
        type: choice
        options:
          - ""
          - arcus
          - leafcloud

jobs:
  # Tests that a backup and restore re-adopts all the existing platforms correctly
  #
  # Note that success() and failure() consider *all previous steps*, and continue-on-error
  # prevents the job from being marked as failed if that step fails
  # This means that in order to get the execution flow that we want while still resulting in a
  # failed job when required, we need to use step ids and the conclusions of specific steps
  test_backup_restore:
    runs-on: ubuntu-latest
    steps:
      # We need to check out the code under test first in order to use local actions
      - name: Checkout code under test
        uses: actions/checkout@v3

      - name: Set up Azimuth environment
        uses: ./.github/actions/setup
        with:
          os-clouds: ${{ secrets.OS_CLOUDS }}
          repository: ${{ github.repository }}
          ref: ${{ github.ref }}
          target-cloud: ${{ inputs.target-cloud || vars.TARGET_CLOUD }}
          install-mode: ha
          environment-prefix: ci-restore
        # GitHub terminates jobs after 6 hours
        # We don't want jobs to acquire the lock then get timed out before they can finish
        # So wait a maximum of 3 hours to acquire the lock, leaving 3 hours for other tasks in the job
        timeout-minutes: 180

      # Velero needs S3 credentials; mint ephemeral EC2-style credentials and persist
      # them in ci.env so later steps (and the final cleanup) can read them
      - name: Generate S3 credentials for Velero
        run: |
          set -e
          source ci.env
          source ./bin/activate "$AZIMUTH_CONFIG_ENVIRONMENT" "$AZIMUTH_ENVIRONMENT"
          VELERO_S3_ACCESS_KEY="$(openstack ec2 credentials create -f value -c access)"
          VELERO_S3_SECRET_KEY="$(openstack ec2 credentials show -f value -c secret $VELERO_S3_ACCESS_KEY)"
          cat >> ci.env <<EOF
          export VELERO_S3_ACCESS_KEY="$VELERO_S3_ACCESS_KEY"
          export VELERO_S3_SECRET_KEY="$VELERO_S3_SECRET_KEY"
          EOF

      - name: Provision Azimuth
        uses: ./.github/actions/provision

      - name: Generate test suite
        id: generate-tests
        run: |
          set -e
          source ./ci.env
          source ./bin/activate "$AZIMUTH_CONFIG_ENVIRONMENT" "$AZIMUTH_ENVIRONMENT"
          ansible-playbook stackhpc.azimuth_ops.generate_tests -e @extra-vars.yml

      - name: Create test platforms
        id: tests-create
        run: |
          set -e
          source ./ci.env
          source ./bin/activate "$AZIMUTH_CONFIG_ENVIRONMENT" "$AZIMUTH_ENVIRONMENT"
          ./bin/run-tests --include create --outputdir reports/create

      # Run the verification even if creating the platforms failed, as long as the
      # create step actually ran (conclusion is success or failure, not skipped)
      - name: Verify test platforms
        run: |
          set -e
          source ./ci.env
          source ./bin/activate "$AZIMUTH_CONFIG_ENVIRONMENT" "$AZIMUTH_ENVIRONMENT"
          ./bin/run-tests --include verify --outputdir reports/verify-create
        if: ${{ !cancelled() && contains(fromJSON('["success", "failure"]'), steps.tests-create.conclusion) }}

      - name: Create a backup
        id: backup-create
        run: |
          set -e
          source ./ci.env
          source ./bin/activate "$AZIMUTH_CONFIG_ENVIRONMENT" "$AZIMUTH_ENVIRONMENT"
          ./bin/seed-ssh -- \
            velero backup create $AZIMUTH_ENVIRONMENT \
              --kubeconfig ./kubeconfig-azimuth-$AZIMUTH_ENVIRONMENT \
              --from-schedule default \
              --wait
        if: ${{ !cancelled() && steps.generate-tests.conclusion == 'success' }}

      # Capture the pre-restore state so a failed restore can be debugged against it
      - name: Create pre-restore debug bundle
        run: |
          set -e
          source ./ci.env
          source ./bin/activate "$AZIMUTH_CONFIG_ENVIRONMENT" "$AZIMUTH_ENVIRONMENT"
          ./bin/create-debug-bundle
        if: ${{ !cancelled() }}

      - name: Upload pre-restore debug bundle
        uses: actions/upload-artifact@v3
        with:
          name: azimuth-pre-restore-debug-bundle
          path: debug-bundle.tar.gz
        if: ${{ !cancelled() }}

      # Only tear down once we know we have a backup to restore from
      - name: Destroy Azimuth
        id: azimuth-destroy
        run: |
          set -e
          source ./ci.env
          source ./bin/activate "$AZIMUTH_CONFIG_ENVIRONMENT" "$AZIMUTH_ENVIRONMENT"
          ansible-playbook stackhpc.azimuth_ops.destroy -e @extra-vars.yml
        if: ${{ !cancelled() && steps.backup-create.conclusion == 'success' }}

      - name: Restore from backup
        id: backup-restore
        run: |
          set -e
          source ./ci.env
          source ./bin/activate "$AZIMUTH_CONFIG_ENVIRONMENT" "$AZIMUTH_ENVIRONMENT"
          ansible-playbook stackhpc.azimuth_ops.restore \
            -e @extra-vars.yml \
            -e velero_restore_backup_name=$AZIMUTH_ENVIRONMENT
        if: ${{ !cancelled() && steps.azimuth-destroy.conclusion == 'success' }}

      - name: Verify test platforms post restore
        run: |
          set -e
          source ./ci.env
          source ./bin/activate "$AZIMUTH_CONFIG_ENVIRONMENT" "$AZIMUTH_ENVIRONMENT"
          ./bin/run-tests --include verify --outputdir reports/verify-post-restore
        if: ${{ !cancelled() && steps.backup-restore.conclusion == 'success' }}

      # Cleanup steps below use always() so they run regardless of earlier failures
      - name: Delete test platforms
        run: |
          set -e
          source ./ci.env
          source ./bin/activate "$AZIMUTH_CONFIG_ENVIRONMENT" "$AZIMUTH_ENVIRONMENT"
          ./bin/run-tests --include delete --outputdir reports/delete
        if: ${{ always() }}

      - name: Delete backup
        run: |
          set -e
          source ./ci.env
          source ./bin/activate "$AZIMUTH_CONFIG_ENVIRONMENT" "$AZIMUTH_ENVIRONMENT"
          ./bin/seed-ssh -- \
            velero backup delete $AZIMUTH_ENVIRONMENT \
              --kubeconfig ./kubeconfig-azimuth-$AZIMUTH_ENVIRONMENT \
              --confirm
        if: ${{ always() }}

      - name: Upload test report artifacts
        uses: actions/upload-artifact@v3
        with:
          name: azimuth-restore-test-reports
          path: reports/*
        if: ${{ always() }}

      - name: Create debug bundle
        run: |
          set -e
          source ./ci.env
          source ./bin/activate "$AZIMUTH_CONFIG_ENVIRONMENT" "$AZIMUTH_ENVIRONMENT"
          ./bin/create-debug-bundle
        if: ${{ always() }}

      - name: Upload debug bundle
        uses: actions/upload-artifact@v3
        with:
          name: azimuth-restore-debug-bundle
          path: debug-bundle.tar.gz
        if: ${{ always() }}

      - name: Destroy Azimuth
        uses: ./.github/actions/destroy
        if: ${{ always() }}

      # VELERO_S3_ACCESS_KEY comes from ci.env, written by the generate step above
      - name: Delete Velero S3 credentials
        run: |
          set -e
          source ./ci.env
          source ./bin/activate "$AZIMUTH_CONFIG_ENVIRONMENT" "$AZIMUTH_ENVIRONMENT"
          openstack ec2 credentials delete $VELERO_S3_ACCESS_KEY
        if: ${{ always() }}
48 changes: 48 additions & 0 deletions .github/workflows/test-ha.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
name: HA test

on:
  # Allow manual execution on any branch
  workflow_dispatch:
    inputs:
      target-cloud:
        description: >-
          The cloud to target for the run.
          Leave blank to use the default cloud.
        type: choice
        options:
          - ""
          - arcus
          - leafcloud

jobs:
  # Tests a clean HA deployment + all appliances
  test_ha:
    runs-on: ubuntu-latest
    steps:
      # We need to check out the code under test first in order to use local actions
      - name: Checkout code under test
        uses: actions/checkout@v3

      - name: Set up Azimuth environment
        uses: ./.github/actions/setup
        with:
          os-clouds: ${{ secrets.OS_CLOUDS }}
          repository: ${{ github.repository }}
          ref: ${{ github.ref }}
          target-cloud: ${{ inputs.target-cloud || vars.TARGET_CLOUD }}
          install-mode: ha
          environment-prefix: ci-ha
        # GitHub terminates jobs after 6 hours
        # We don't want jobs to acquire the lock then get timed out before they can finish
        # So wait a maximum of 3 hours to acquire the lock, leaving 3 hours for other tasks in the job
        timeout-minutes: 180

      - name: Provision Azimuth
        uses: ./.github/actions/provision

      - name: Run Azimuth tests
        uses: ./.github/actions/test

      # Always tear down the environment, even when earlier steps failed
      - name: Destroy Azimuth
        uses: ./.github/actions/destroy
        if: ${{ always() }}
7 changes: 5 additions & 2 deletions .github/workflows/test-singlenode.yml
Original file line number Diff line number Diff line change
Expand Up @@ -29,15 +29,18 @@ on:
- requirements.txt
- requirements.yml
- .github/actions/**
- .github/environments/common
- .github/environments/arcus
- .github/environments/leafcloud
- .github/workflows/test-singlenode.yml
- bin/**
- "!bin/ci-exec"
- "!bin/ci-setup"
- "!bin/create-merge-branch"
- "!bin/port-forward"
- "!bin/tilt-*"
- environments/base/**
- environments/singlenode/**
- environments/demo/**
- environments/ci/**

# Use the head ref for workflow concurrency, with cancellation
# This should mean that any previous runs of this workflow for the same PR
Expand Down
Loading

0 comments on commit b15f472

Please sign in to comment.