From b15f472ec477d4ffd7bf30f9099918e585e278ce Mon Sep 17 00:00:00 2001 From: Matt Pryor Date: Wed, 10 Jul 2024 15:46:27 +0100 Subject: [PATCH] Add backup and restore test --- .github/actions/destroy/action.yml | 10 +- .github/actions/setup/action.yml | 38 +++- .../inventory/group_vars/all/variables.yml | 10 +- .github/workflows/test-backup-restore.yml | 191 ++++++++++++++++++ .github/workflows/test-ha.yml | 48 +++++ .github/workflows/test-singlenode.yml | 7 +- .../{test-full.yml => test-upgrade.yml} | 37 +--- docs/configuration/15-disaster-recovery.md | 12 +- 8 files changed, 293 insertions(+), 60 deletions(-) create mode 100644 .github/workflows/test-backup-restore.yml create mode 100644 .github/workflows/test-ha.yml rename .github/workflows/{test-full.yml => test-upgrade.yml} (82%) diff --git a/.github/actions/destroy/action.yml b/.github/actions/destroy/action.yml index 62ccf5ff..b0c97a48 100644 --- a/.github/actions/destroy/action.yml +++ b/.github/actions/destroy/action.yml @@ -9,10 +9,13 @@ runs: set -e source ./ci.env source ./bin/activate "$AZIMUTH_CONFIG_ENVIRONMENT" "$AZIMUTH_ENVIRONMENT" - ansible-playbook stackhpc.azimuth_ops.destroy -e @extra-vars.yml -e force_destroy=true + ansible-playbook stackhpc.azimuth_ops.destroy \ + -e @extra-vars.yml \ + -e force_destroy=true \ + -e capi_cluster_volumes_policy=delete if: ${{ always() }} - - name: Release ingress floating IP + - name: Release floating IPs shell: bash run: | set -eo pipefail @@ -21,6 +24,9 @@ runs: if [ -n "$INGRESS_IP" ]; then openstack floating ip delete $INGRESS_IP fi + if [ -n "$ZENITH_SSHD_IP" ]; then + openstack floating ip delete $ZENITH_SSHD_IP + fi if: ${{ always() }} - name: Configure S3 lock diff --git a/.github/actions/setup/action.yml b/.github/actions/setup/action.yml index e6611d73..08758dce 100644 --- a/.github/actions/setup/action.yml +++ b/.github/actions/setup/action.yml @@ -143,25 +143,41 @@ runs: action: acquire if: ${{ steps.s3-lock-config.outputs.host != '' }} - - name: Allocate floating IP for ingress + - name: Allocate floating IPs shell: bash run: | set -eo pipefail source ci.env source ./bin/activate "$AZIMUTH_CONFIG_ENVIRONMENT" "$AZIMUTH_ENVIRONMENT" - EXTNET_ID="$( - ansible -m debug -a "var=infra_external_network_id" -e @extra-vars.yml all | - jq -r ".plays[0].tasks[0].hosts.localhost.infra_external_network_id" - )" - IP_ADDRESS="$( + + ansible_var() { + ANSIBLE_LOAD_CALLBACK_PLUGINS=true \ + ANSIBLE_STDOUT_CALLBACK=json \ + ansible -m debug -a "var=$1" -e @extra-vars.yml all | \ + jq -r ".plays[0].tasks[0].hosts.localhost.$1" + } + + EXTNET_ID="$(ansible_var infra_external_network_id)" + INSTALL_MODE="$(ansible_var install_mode)" + + INGRESS_IP="$( openstack floating ip create $EXTNET_ID \ --description "ingress IP for $AZIMUTH_ENVIRONMENT" \ --format value \ --column floating_ip_address )" - cat >> ci.env <> ci.env <<-EOF + export INGRESS_IP="$INGRESS_IP" EOF - env: - ANSIBLE_LOAD_CALLBACK_PLUGINS: "true" - ANSIBLE_STDOUT_CALLBACK: json + + if [ "$INSTALL_MODE" = "ha" ]; then + ZENITH_SSHD_IP="$( + openstack floating ip create $EXTNET_ID \ + --description "zenith SSHD IP for $AZIMUTH_ENVIRONMENT" \ + --format value \ + --column floating_ip_address + )" + cat >> ci.env <<-EOF + export ZENITH_SSHD_IP="$ZENITH_SSHD_IP" + EOF + fi diff --git a/.github/environments/leafcloud-ha/inventory/group_vars/all/variables.yml b/.github/environments/leafcloud-ha/inventory/group_vars/all/variables.yml index 851c28fb..a7faa87c 100644 --- a/.github/environments/leafcloud-ha/inventory/group_vars/all/variables.yml +++ b/.github/environments/leafcloud-ha/inventory/group_vars/all/variables.yml @@ -39,10 +39,8 @@ consul_server_replicas: 1 zenith_sshd_service_load_balancer_ip: "{{ lookup('env', 'ZENITH_SSHD_IP') }}" # Configure Velero backups -velero_enabled: true +velero_enabled: "{{ not (not velero_aws_access_key_id) }}" velero_s3_url: https://leafcloud.store -velero_bucket_name: "azimuth-{{ azimuth_environment }}-backups" -# Create the bucket if it doesn't exist -velero_bucket_create: true -velero_aws_access_key_id: -velero_aws_secret_access_key : +velero_bucket_name: azimuth-ci-backups +velero_aws_access_key_id: "{{ lookup('env', 'VELERO_S3_ACCESS_KEY') }}" +velero_aws_secret_access_key: "{{ lookup('env', 'VELERO_S3_SECRET_KEY') }}" diff --git a/.github/workflows/test-backup-restore.yml b/.github/workflows/test-backup-restore.yml new file mode 100644 index 00000000..7b8bdb53 --- /dev/null +++ b/.github/workflows/test-backup-restore.yml @@ -0,0 +1,191 @@ +name: Backup and restore test + +on: + # Allow manual execution on any branch + workflow_dispatch: + inputs: + target-cloud: + description: >- + The cloud to target for the run. + Leave blank to use the default cloud. + type: choice + options: + - "" + - arcus + - leafcloud + +jobs: + # Tests that a backup and restore re-adopts all the existing platforms correctly + # + # Note that success() and failure() consider *all previous steps*, and continue-on-failure + # prevents the job from being marked as failed if that step fails + # This means that in order to get the execution flow that we want while still resulting in a + # failed job when required, we need to use step ids and the conclusions of specific steps + test_backup_restore: + runs-on: ubuntu-latest + steps: + # We need to check out the code under test first in order to use local actions + - name: Checkout code under test + uses: actions/checkout@v3 + + - name: Set up Azimuth environment + uses: ./.github/actions/setup + with: + os-clouds: ${{ secrets.OS_CLOUDS }} + repository: ${{ github.repository }} + ref: ${{ github.ref }} + target-cloud: ${{ inputs.target-cloud || vars.TARGET_CLOUD }} + install-mode: ha + environment-prefix: ci-restore + # GitHub terminates jobs after 6 hours + # We don't want jobs to acquire the lock then get timed out before they can finish + # So wait a maximum of 3 hours to acquire the lock, leaving 3 hours for other tasks in the job + timeout-minutes: 180 + + - name: Generate S3 credentials for Velero + run: | + set -e + source ci.env + source ./bin/activate "$AZIMUTH_CONFIG_ENVIRONMENT" "$AZIMUTH_ENVIRONMENT" + VELERO_S3_ACCESS_KEY="$(openstack ec2 credentials create -f value -c access)" + VELERO_S3_SECRET_KEY="$(openstack ec2 credentials show -f value -c secret $VELERO_S3_ACCESS_KEY)" + cat >> ci.env <- + The cloud to target for the run. + Leave blank to use the default cloud. + type: choice + options: + - "" + - arcus + - leafcloud + +jobs: + # Tests a clean HA deployment + all appliances + test_ha: + runs-on: ubuntu-latest + steps: + # We need to check out the code under test first in order to use local actions + - name: Checkout code under test + uses: actions/checkout@v3 + + - name: Set up Azimuth environment + uses: ./.github/actions/setup + with: + os-clouds: ${{ secrets.OS_CLOUDS }} + repository: ${{ github.repository }} + ref: ${{ github.ref }} + target-cloud: ${{ inputs.target-cloud || vars.TARGET_CLOUD }} + install-mode: ha + environment-prefix: ci-ha + # GitHub terminates jobs after 6 hours + # We don't want jobs to acquire the lock then get timed out before they can finish + # So wait a maximum of 3 hours to acquire the lock, leaving 3 hours for other tasks in the job + timeout-minutes: 180 + + - name: Provision Azimuth + uses: ./.github/actions/provision + + - name: Run Azimuth tests + uses: ./.github/actions/test + + - name: Destroy Azimuth + uses: ./.github/actions/destroy + if: ${{ always() }} diff --git a/.github/workflows/test-singlenode.yml b/.github/workflows/test-singlenode.yml index ef1f8eaf..2ab4160d 100644 --- a/.github/workflows/test-singlenode.yml +++ b/.github/workflows/test-singlenode.yml @@ -29,15 +29,18 @@ on: - requirements.txt - requirements.yml - .github/actions/** + - .github/environments/common + - .github/environments/arcus + - .github/environments/leafcloud - .github/workflows/test-singlenode.yml - bin/** - - "!bin/ci-exec" + - "!bin/ci-setup" - "!bin/create-merge-branch" + - "!bin/port-forward" - "!bin/tilt-*" - environments/base/** - environments/singlenode/** - environments/demo/** - - environments/ci/** # Use the head ref for workflow concurrency, with cancellation # This should mean that any previous runs of this workflow for the same PR diff --git a/.github/workflows/test-full.yml b/.github/workflows/test-upgrade.yml similarity index 82% rename from .github/workflows/test-full.yml rename to .github/workflows/test-upgrade.yml index 1aacf221..5dba20e8 100644 --- a/.github/workflows/test-full.yml +++ b/.github/workflows/test-upgrade.yml @@ -1,4 +1,4 @@ -name: Full test suite +name: Upgrade test on: # Allow manual execution on any branch @@ -15,47 +15,14 @@ on: - leafcloud jobs: - # Tests a clean HA deployment + all appliances - test_clean_ha: - runs-on: ubuntu-latest - steps: - # We need to check out the code under test first in order to use local actions - - name: Checkout code under test - uses: actions/checkout@v3 - - - name: Set up Azimuth environment - uses: ./.github/actions/setup - with: - os-clouds: ${{ secrets.OS_CLOUDS }} - repository: ${{ github.repository }} - ref: ${{ github.ref }} - target-cloud: ${{ inputs.target-cloud || vars.TARGET_CLOUD }} - install-mode: ha - environment-prefix: ci-ha - # GitHub terminates jobs after 6 hours - # We don't want jobs to acquire the lock then get timed out before they can finish - # So wait a maximum of 3 hours to acquire the lock, leaving 3 hours for other tasks in the job - timeout-minutes: 180 - - - name: Provision Azimuth - uses: ./.github/actions/provision - - - name: Run Azimuth tests - uses: ./.github/actions/test - - - name: Destroy Azimuth - uses: ./.github/actions/destroy - if: ${{ always() }} - # Tests an Azimuth upgrade from the current latest release to the code under test # # Note that success() and failure() consider *all previous steps*, and continue-on-failure # prevents the job from being marked as failed if that step fails # This means that in order to get the execution flow that we want while still resulting in a # failed job when required, we need to use step ids and the conclusions of specific steps - test_azimuth_upgrade: + test_upgrade: runs-on: ubuntu-latest - needs: [test_clean_ha] steps: - name: Get latest tag id: latest-tag diff --git a/docs/configuration/15-disaster-recovery.md b/docs/configuration/15-disaster-recovery.md index 725eb61e..5dc758c4 100644 --- a/docs/configuration/15-disaster-recovery.md +++ b/docs/configuration/15-disaster-recovery.md @@ -33,10 +33,14 @@ velero_enabled: true # The URL of the S3 storage endpoint velero_s3_url: -# The name of the bucket to use for backups - the bucket must already exist +# The name of the bucket to use for backups velero_bucket_name: ``` +!!! warning "Bucket must already exist" + + The specified bucket must already exist - neither azimuth-ops nor Velero will create it. + You will also need to consult the documentation for your S3 provider to obtain S3 credentials for the bucket, and add the access key ID and secret to the following variables: @@ -46,7 +50,7 @@ velero_aws_access_key_id: velero_aws_secret_access_key: ``` -!!! tip +!!! tip "Generating credentials for Keystone-integrated Ceph Object Gateway" If the S3 target is [Ceph Object Gateway integrated with Keystone](https://docs.ceph.com/en/latest/radosgw/keystone/), @@ -120,10 +124,10 @@ velero_backup_schedule_enabled: true velero_backup_schedule_name: default # Schedule to use for backups (defaults to every day at midnight) # See https://en.wikipedia.org/wiki/Cron for format options -velero_backup_schedule_timings: "0 0 * * *" +velero_backup_schedule: "0 0 * * *" # Time-to-live for existing backups (defaults to 1 week) # See https://pkg.go.dev/time#ParseDuration for duration format options -velero_backup_schedule_ttl: "168h" +velero_backup_ttl: "168h" ``` !!! note