From fc74b354aa927a223652e93234ef11e2b93274ac Mon Sep 17 00:00:00 2001
From: Matt Pryor
Date: Wed, 31 Jan 2024 12:29:39 +0000
Subject: [PATCH 01/27] Add release notes config

---
 .github/release.yml | 23 +++++++++++++++++++++++
 1 file changed, 23 insertions(+)
 create mode 100644 .github/release.yml

diff --git a/.github/release.yml b/.github/release.yml
new file mode 100644
index 00000000..a2d78e53
--- /dev/null
+++ b/.github/release.yml
@@ -0,0 +1,23 @@
+---
+
+changelog:
+  categories:
+    - title: Breaking changes
+      labels:
+        - breaking
+
+    - title: New features and enhancements
+      labels:
+        - "*"
+      exclude:
+        labels:
+          - bug
+          - automation
+
+    - title: Bugs fixed
+      labels:
+        - bug
+
+    - title: Dependency updates
+      labels:
+        - automation

From d8438c3039d4efb11a2ec7a7bf2402752875653e Mon Sep 17 00:00:00 2001
From: Matt Pryor
Date: Mon, 5 Feb 2024 11:21:17 +0000
Subject: [PATCH 02/27] Config changes + extra Python deps to support admin
 dashboards (#86)

---
 environments/base/inventory/group_vars/all.yml   | 10 ++++++++++
 .../demo/inventory/group_vars/all/variables.yml  |  1 +
 requirements.txt                                 |  2 ++
 3 files changed, 13 insertions(+)

diff --git a/environments/base/inventory/group_vars/all.yml b/environments/base/inventory/group_vars/all.yml
index 923ec5fa..f1249358 100644
--- a/environments/base/inventory/group_vars/all.yml
+++ b/environments/base/inventory/group_vars/all.yml
@@ -76,6 +76,16 @@ ingress_harbor_core_subdomain: registry
 ingress_harbor_notary_subdomain: notary
 # The subdomain that should be used for Keycloak
 ingress_keycloak_subdomain: identity
+# The subdomain that should be used for Grafana
+ingress_grafana_subdomain: grafana
+# The subdomain that should be used for Prometheus
+ingress_prometheus_subdomain: prometheus
+# The subdomain that should be used for Alertmanager
+ingress_alertmanager_subdomain: alertmanager
+# The subdomain that should be used for Consul
+ingress_consul_subdomain: consul
+# The subdomain that should be used for ARA
+ingress_ara_subdomain: ara
 
 # Annotations for Azimuth ingress resources
 ingress_annotations:
diff --git a/environments/demo/inventory/group_vars/all/variables.yml b/environments/demo/inventory/group_vars/all/variables.yml
index e2d4d356..88d2e8a0 100644
--- a/environments/demo/inventory/group_vars/all/variables.yml
+++ b/environments/demo/inventory/group_vars/all/variables.yml
@@ -70,6 +70,7 @@ azimuth_openstack_verify_ssl: false
 azimuth_current_cloud_name: demo
 
 # Use secrets that are not really secret for ease
+admin_dashboard_ingress_basic_auth_password: admin
 harbor_admin_password: admin
 harbor_secret_key: abcdefghijklmnop
 keycloak_admin_password: admin
diff --git a/requirements.txt b/requirements.txt
index 8750c455..14e55399 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,7 +1,9 @@
 ansible-core>=2.12  # require undef filter
 azimuth-robotframework
+bcrypt==4.0.1  # this is the latest version that doesn't emit a warning
 jmespath
 munch
+passlib
 python-octaviaclient
 python-openstackclient
 robotframework

From c68d167c0b4469df1ecc74993d9ccc6a228036e3 Mon Sep 17 00:00:00 2001
From: Scott Davidson <49713135+sd109@users.noreply.github.com>
Date: Mon, 5 Feb 2024 11:25:28 +0000
Subject: [PATCH 03/27] Add example GitHub CD workflows (#83)

* Add example GitHub CD workflows

* Update comment

---------

Co-authored-by: sd109

---
 .github-deploy-prod.yml.sample    | 53 ++++++++++++++++++++++++++++++
 .github-deploy-staging.yml.sample | 40 ++++++++++++++++++++++
 docs/deployment/automation.md     | 12 +++++++
 3 files changed, 105 insertions(+)
 create mode 100644 .github-deploy-prod.yml.sample
 create mode 100644 .github-deploy-staging.yml.sample

diff --git a/.github-deploy-prod.yml.sample b/.github-deploy-prod.yml.sample
new file mode 100644
index 00000000..d516e69c
--- /dev/null
+++ b/.github-deploy-prod.yml.sample
@@ -0,0 +1,53 @@
+# This example workflow can be used to perform manually-triggered Azimuth deployments targeting production environments.
+# The workflow requires a GitHub environment (https://docs.github.com/en/actions/using-jobs/using-environments-for-jobs) to
+# be created in the site-specific config repo with a name which exactly matches the azimuth-config environment to be used
+# for production deployments. For security, this GitHub environment should also have a deployment protection rule which
+# restricts the environment workflows to only run on the main/default branch. This ensures that production deployments
+# cannot be executed from arbitrary branches which could contain incorrect or unreviewed configuration.
+#
+# A manually-triggered workflow is used here since GitHub does not allow deployment approval rules for environments in
+# private GitHub repos without a GitHub Enterprise subscription. If the site-specific config repo is public, or if an enterprise
+# subscription is available, then triggering the workflow on push to main with additional approval rules in the environment is
+# the recommended approach.
+#
+# The site-specific config repo must also define a repository secret named GIT_CRYPT_KEY_B64 which contains the base64 encoded
+# git-crypt key which was used to encrypt the repository's secrets. This can be obtained by running `git-crypt export-key - | base64`
+# from within an unlocked checkout of the repository. For information on defining GitHub repo secrets, see:
+# https://docs.github.com/en/actions/security-guides/using-secrets-in-github-actions
+
+name: Azimuth deployment
+on:
+  workflow_dispatch:
+    inputs:
+      environment:
+        description: "The Azimuth config environment to deploy"
+        type: environment
+        required: true
+
+jobs:
+  deploy_azimuth:
+    environment: ${{ inputs.environment }}
+    runs-on: self-hosted
+    steps:
+
+      - name: Ensure required host packages are installed
+        run: |
+          set -xe
+          sudo apt update
+          sudo apt install -y python3-venv python3-dev build-essential unzip git-crypt
+
+      - name: Checkout the config repo
+        uses: actions/checkout@v3
+
+      - name: Deploy Azimuth
+        shell: bash
+        # Here we just decrypt the repo checkout then follow the steps from the Azimuth deployment docs.
+        # The GitHub repo should have an environment configured with a name which matches the Azimuth config environment.
+        # This GitHub environment should also have a branch protection rule which only allows deployments on the chosen production branch (e.g. main).
+        # https://docs.github.com/en/actions/deployment/targeting-different-environments/using-environments-for-deployment
+        run: |
+          set -e
+          echo ${{ secrets.GIT_CRYPT_KEY_B64 }} | base64 -d | git-crypt unlock -
+          ./bin/ensure-venv
+          source ./bin/activate ${{ inputs.environment }}
+          ansible-galaxy install -fr ./requirements.yml
+          ansible-playbook stackhpc.azimuth_ops.provision
diff --git a/.github-deploy-staging.yml.sample b/.github-deploy-staging.yml.sample
new file mode 100644
index 00000000..b067269c
--- /dev/null
+++ b/.github-deploy-staging.yml.sample
@@ -0,0 +1,40 @@
+# This example workflow can be adapted to perform automated Azimuth deployments targeting staging or test environments.
+# The `azimuth-config-env-name` variable in the `env` section below should be set to the name of the Azimuth config environment
+# to be deployed.
+#
+# The site-specific config repo must also define a repository secret named GIT_CRYPT_KEY_B64 which contains the base64 encoded
+# git-crypt key which was used to encrypt the repository's secrets. This can be obtained by running `git-crypt export-key - | base64`
+# from within an unlocked checkout of the repository. For information on defining GitHub repo secrets, see:
+# https://docs.github.com/en/actions/security-guides/using-secrets-in-github-actions
+
+name: Azimuth deployment
+on:
+  push:
+    branches:
+      - main
+env:
+  azimuth-config-env-name:
+
+jobs:
+  deploy_azimuth:
+    runs-on: self-hosted
+    steps:
+
+      - name: Ensure required host packages are installed
+        run: |
+          set -xe
+          sudo apt update
+          sudo apt install -y python3-venv python3-dev build-essential unzip git-crypt
+
+      - name: Checkout the config repo
+        uses: actions/checkout@v3
+
+      - name: Deploy Azimuth
+        shell: bash
+        # Here we just decrypt the repo checkout then follow the steps from the Azimuth deployment docs.
+        run: |
+          set -e
+          echo ${{ secrets.GIT_CRYPT_KEY_B64 }} | base64 -d | git-crypt unlock -
+          ./bin/ensure-venv
+          source ./bin/activate ${{ env.azimuth-config-env-name }}
+          ansible-galaxy install -fr ./requirements.yml
+          ansible-playbook stackhpc.azimuth_ops.provision
diff --git a/docs/deployment/automation.md b/docs/deployment/automation.md
index a8784b59..da333024 100644
--- a/docs/deployment/automation.md
+++ b/docs/deployment/automation.md
@@ -212,3 +212,15 @@ Unfortunately, this is a paid feature and the only real alternative is to use a
 If you do not want to pay for Project access tokens, then you could register a
 separate service account that only belongs to your configuration project and
 issue a personal access token from that account instead.
+
+
+## GitHub CI/CD
+
+For site-specific configuration repositories hosted on GitHub, `azimuth-config` provides two sample workflows
+for automated deployments to a test or staging environment
+([example workflow](https://github.com/stackhpc/azimuth-config/blob/main/.github-deploy-staging.yml.sample))
+and manually-triggered deployment to a production environment
+([example workflow](https://github.com/stackhpc/azimuth-config/blob/main/.github-deploy-prod.yml.sample)).
+These can be used with [GitHub Actions](https://docs.github.com/en/actions) to mimic some of the GitLab
+functionality described above. Each sample file contains a top-level comment describing how to tailor these
+workflows to a site-specific configuration repository.
\ No newline at end of file

From 952398d7af4d6b9cc231ab88d9fc6ae8d1398e63 Mon Sep 17 00:00:00 2001
From: Matt Pryor
Date: Mon, 5 Feb 2024 14:14:07 +0000
Subject: [PATCH 04/27] Changes for Slurm quota checks (#90)

* Set Slurm control + login flavors as test case params instead of extra vars

* Restore old configuration

---
 environments/ci/inventory/group_vars/all/variables.yml | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/environments/ci/inventory/group_vars/all/variables.yml b/environments/ci/inventory/group_vars/all/variables.yml
index d4afce76..be093cc0 100644
--- a/environments/ci/inventory/group_vars/all/variables.yml
+++ b/environments/ci/inventory/group_vars/all/variables.yml
@@ -8,6 +8,7 @@ infra_network_id: "{{ lookup('pipe', 'openstack network show portal-internal -f
 # The flavor to use for the Azimuth AIO VM (vm.ska.cpu.general.eighth)
 infra_flavor_id: 5f9def81-c93f-4c1f-a521-3b810061ff6c
 # The flavors to use for the Slurm login and control nodes
+# TODO(mkjpryor) remove these once azimuth-ops has been updated
 azimuth_caas_stackhpc_slurm_appliance_login_flavor_name: vm.ska.cpu.general.small
 azimuth_caas_stackhpc_slurm_appliance_control_flavor_name: "{{ azimuth_caas_stackhpc_slurm_appliance_login_flavor_name }}"
 # The flavor to use for the workstation test case (vm.ska.cpu.general.small)
@@ -16,8 +17,11 @@ generate_tests_caas_test_case_workstation_param_cluster_flavor: c8b72062-5d52-45
 generate_tests_caas_test_case_repo2docker_param_cluster_flavor: "{{ generate_tests_caas_test_case_workstation_param_cluster_flavor }}"
 # The flavor to use for the R-Studio test case
 generate_tests_caas_test_case_rstudio_param_cluster_flavor: "{{ generate_tests_caas_test_case_workstation_param_cluster_flavor }}"
-# The flavor to use for the compute nodes in the Slurm test case
-generate_tests_caas_test_case_slurm_param_compute_flavor: "{{ generate_tests_caas_test_case_workstation_param_cluster_flavor }}"
+# The flavors to use for the nodes in the Slurm test case
+generate_tests_caas_test_case_slurm_param_login_flavor: "{{ generate_tests_caas_test_case_workstation_param_cluster_flavor }}"
+generate_tests_caas_test_case_slurm_param_control_flavor: "{{ generate_tests_caas_test_case_slurm_param_login_flavor }}"
+generate_tests_caas_test_case_slurm_param_compute_flavor: "{{ generate_tests_caas_test_case_slurm_param_login_flavor }}"
 # The flavors to use for the control plane and workers in Kubernetes test cases
 generate_tests_kubernetes_test_case_control_plane_size: "{{ generate_tests_caas_test_case_workstation_param_cluster_flavor }}"
 generate_tests_kubernetes_test_case_worker_size: "{{ generate_tests_kubernetes_test_case_control_plane_size }}"

From 1ee4f24ae137fbccee63e31f98438f0f731fbfbf Mon Sep 17 00:00:00 2001
From: Matt Pryor
Date: Mon, 5 Feb 2024 14:14:38 +0000
Subject: [PATCH 05/27] Changes to support Helm dashboard (#87)

---
 environments/base/inventory/group_vars/all.yml | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/environments/base/inventory/group_vars/all.yml b/environments/base/inventory/group_vars/all.yml
index f1249358..3631e4c4 100644
--- a/environments/base/inventory/group_vars/all.yml
+++ b/environments/base/inventory/group_vars/all.yml
@@ -86,6 +86,8 @@ ingress_alertmanager_subdomain: alertmanager
 ingress_consul_subdomain: consul
 # The subdomain that should be used for ARA
 ingress_ara_subdomain: ara
+# The subdomain that should be used for the Helm dashboard
+ingress_helm_dashboard_subdomain: helm
 
 # Annotations for Azimuth ingress resources
 ingress_annotations:

From 5ffa5ca3c28efd25cc8fd7fada293b1591e24e8d Mon Sep 17 00:00:00 2001
From: Matt Pryor
Date: Mon, 5 Feb 2024 14:16:41 +0000
Subject: [PATCH 06/27] Config changes for Kubernetes dashboard (#89)

---
 environments/base/inventory/group_vars/all.yml | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/environments/base/inventory/group_vars/all.yml b/environments/base/inventory/group_vars/all.yml
index 3631e4c4..816553af 100644
--- a/environments/base/inventory/group_vars/all.yml
+++ b/environments/base/inventory/group_vars/all.yml
@@ -88,6 +88,8 @@ ingress_consul_subdomain: consul
 ingress_ara_subdomain: ara
 # The subdomain that should be used for the Helm dashboard
 ingress_helm_dashboard_subdomain: helm
+# The subdomain that should be used for the Kubernetes dashboard
+ingress_kubernetes_dashboard_subdomain: kubernetes
 
 # Annotations for Azimuth ingress resources
 ingress_annotations:

From c3b023ee025bd929d8109a1f704be8e5e98397e3 Mon Sep 17 00:00:00 2001
From: Matt Pryor
Date: Tue, 6 Feb 2024 08:18:35 +0000
Subject: [PATCH 07/27] Changes to tests for branch renaming (#92)

* Allow tests to be manually executed on any branch

* Changes to tests for branch renaming

---
 .github/actions/setup/action.yml              |  2 +-
 .../workflows/{test-tag.yml => test-full.yml} | 30 +++++++--
 .../{test-pr.yml => test-singlenode.yml}      | 15 ++---
 .github/workflows/update-dependencies.yml     | 65 -------------------
 docs/best-practice.md                         |  2 +-
 docs/debugging/access-k3s.md                  |  2 +-
 docs/debugging/access-monitoring.md           |  2 +-
 docs/deployment/automation.md                 |  6 +-
 docs/deployment/index.md                      |  2 +-
 docs/environments.md                          |  4 +-
 docs/repository/terraform.md                  |  2 +-
 11 files changed, 43 insertions(+), 89 deletions(-)
 rename .github/workflows/{test-tag.yml => test-full.yml} (62%)
 rename .github/workflows/{test-pr.yml => test-singlenode.yml} (83%)
 delete mode 100644 .github/workflows/update-dependencies.yml

diff --git a/.github/actions/setup/action.yml b/.github/actions/setup/action.yml
index 2ee7f04c..a04e2419 100644
--- a/.github/actions/setup/action.yml
+++ b/.github/actions/setup/action.yml
@@ -8,7 +8,7 @@ inputs:
   ref:
     description: The ref to use for the Azimuth configuration.
     required: true
-    default: main
+    default: devel
   config-environment:
     description: The config environment to use.
     required: true

diff --git a/.github/workflows/test-tag.yml b/.github/workflows/test-full.yml
similarity index 62%
rename from .github/workflows/test-tag.yml
rename to .github/workflows/test-full.yml
index 934f382d..fd5c5bf8 100644
--- a/.github/workflows/test-tag.yml
+++ b/.github/workflows/test-full.yml
@@ -1,16 +1,35 @@
-name: Test Azimuth deployment
+name: Full test suite
 
 on:
-  push:
-    tags:
-      - "*"
+  # Allow manual execution on any branch
+  workflow_dispatch:
+  # Execute by default on pull requests to the stable branch
+  pull_request:
+    types:
+      - opened
+      - synchronize
+      - ready_for_review
+      - reopened
+    branches:
+      - stable
+    paths-ignore:
+      - 'docs/**'
 
 jobs:
+  # This job exists so that PRs from outside the main repo are rejected
+  fail_on_remote:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Code under test must be from a branch in the azimuth-config repo
+        run: exit ${{ github.repository == 'stackhpc/azimuth-config' && '0' || '1' }}
+
   # We want jobs to wait in a queue for a slot to run, so as not to overload the test infra
   # GitHub concurrency _almost_ does this, except the queue length is one :-(
   # There is a feature request for what we need https://github.com/orgs/community/discussions/12835
   # Until that is implemented, the only other viable option is a busy wait
   wait_in_queue:
+    needs: [fail_on_remote]
+    if: ${{ github.event_name == 'workflow_dispatch' || !github.event.pull_request.draft }}
     runs-on: ubuntu-latest
     steps:
       - name: Wait for an available slot
@@ -18,7 +37,8 @@ jobs:
         with:
           max-concurrency: 1
 
-  # For tags, we run a full HA test (for now)
+  # Run the full test suite
+  # Currently, this is just a HA install
   run_azimuth_tests:
     needs: [wait_in_queue]
     runs-on: ubuntu-latest
diff --git a/.github/workflows/test-pr.yml b/.github/workflows/test-singlenode.yml
similarity index 83%
rename from .github/workflows/test-pr.yml
rename to .github/workflows/test-singlenode.yml
index fa3a3fa1..d8bcc22d 100644
--- a/.github/workflows/test-pr.yml
+++ b/.github/workflows/test-singlenode.yml
@@ -1,6 +1,9 @@
-name: Test Azimuth deployment
+name: Single node test
 
 on:
+  # Allow manual execution on any branch
+  workflow_dispatch:
+  # Execute by default on pull requests to the devel branch
   pull_request:
     types:
       - opened
@@ -8,20 +11,16 @@ on:
       - ready_for_review
       - reopened
     branches:
-      - main
+      - devel
     paths-ignore:
      - 'docs/**'
 
-concurrency:
-  group: ${{ github.head_ref }}
-  cancel-in-progress: true
-
 jobs:
   # This job exists so that PRs from outside the main repo are rejected
   fail_on_remote:
     runs-on: ubuntu-latest
     steps:
-      - name: PR must be from a branch in the azimuth-config repo
+      - name: Code under test must be from a branch in the azimuth-config repo
         run: exit ${{ github.repository == 'stackhpc/azimuth-config' && '0' || '1' }}
 
   # We want jobs to wait in a queue for a slot to run, so as not to overload the test infra
@@ -30,6 +29,7 @@ jobs:
   # Until that is implemented, the only other viable option is a busy wait
   wait_in_queue:
     needs: [fail_on_remote]
+    if: ${{ github.event_name == 'workflow_dispatch' || !github.event.pull_request.draft }}
     runs-on: ubuntu-latest
     steps:
       - name: Wait for an available slot
@@ -39,7 +39,6 @@ jobs:
 
   run_azimuth_tests:
     needs: [wait_in_queue]
-    if: ${{ !github.event.pull_request.draft }}
     runs-on: ubuntu-latest
     steps:
       # We need to check out the code under test first in order to use local actions
diff --git a/.github/workflows/update-dependencies.yml b/.github/workflows/update-dependencies.yml
deleted file mode 100644
index 3ec31b5a..00000000
--- a/.github/workflows/update-dependencies.yml
+++ /dev/null
@@ -1,65 +0,0 @@
-# This workflow proposes updates to the dependencies that dependabot cannot
-name: Update dependencies
-
-on:
-  # Allow manual executions
-  workflow_dispatch:
-  # Run nightly
-  schedule:
-    - cron: '0 0 * * *'
-
-jobs:
-  propose_github_release_updates:
-    runs-on: ubuntu-latest
-    strategy:
-      matrix:
-        include:
-          - key: azimuth-ops
-            path: ./requirements.yml
-            repository: stackhpc/ansible-collection-azimuth-ops
-            prereleases: "yes"
-            version_jsonpath: collections[0].version
-
-    name: ${{ matrix.key }}
-    steps:
-      - name: Checkout
-        uses: actions/checkout@v4
-
-      - name: Check for most recent GitHub release
-        id: next
-        uses: stackhpc/github-actions/github-latest-release@master
-        with:
-          repository: ${{ matrix.repository }}
-          prereleases: ${{ matrix.prereleases || 'no' }}
-
-      - name: Update dependency key
-        uses: stackhpc/github-actions/config-update@master
-        with:
-          path: ${{ matrix.path }}
-          updates: |
-            ${{ matrix.version_jsonpath }}=${{ steps.next.outputs.version }}
-
-      - name: Generate app token for PR
-        uses: stackhpc/github-actions/generate-app-token@master
-        id: generate-app-token
-        with:
-          repository: ${{ github.repository }}
-          app-id: ${{ secrets.APP_ID }}
-          app-private-key: ${{ secrets.APP_PRIVATE_KEY }}
-
-      - name: Propose changes via PR if required
-        uses: peter-evans/create-pull-request@v5
-        with:
-          token: ${{ steps.generate-app-token.outputs.token }}
-          commit-message: >-
-            Update ${{ matrix.key }} to ${{ steps.next.outputs.version }}
-          branch: update-dependency/${{ matrix.key }}
-          delete-branch: true
-          title: >-
-            Update ${{ matrix.key }} to ${{ steps.next.outputs.version }}
-          body: >
-            This PR was created automatically to update
-            ${{ matrix.key }} to ${{ steps.next.outputs.version }}.
-          labels: |
-            automation
-            dependency-update
diff --git a/docs/best-practice.md b/docs/best-practice.md
index 0ab30e2e..4feeef94 100644
--- a/docs/best-practice.md
+++ b/docs/best-practice.md
@@ -100,7 +100,7 @@ manual trigger.
 Once the change has been validated in `staging`, the job to deploy to `production` can be actioned.
 
 A
-[sample GitLab CI/CD configuration](https://github.com/stackhpc/azimuth-config/tree/main/.gitlab-ci.yml.sample)
+[sample GitLab CI/CD configuration](https://github.com/stackhpc/azimuth-config/tree/stable/.gitlab-ci.yml.sample)
 is provided that implements this workflow for GitLab-hosted repositories.
 
 ## Disaster recovery
diff --git a/docs/debugging/access-k3s.md b/docs/debugging/access-k3s.md
index d6968cdb..0a500d15 100644
--- a/docs/debugging/access-k3s.md
+++ b/docs/debugging/access-k3s.md
@@ -8,7 +8,7 @@ Cluster API management cluster for the HA cluster.
 
 In both cases, the K3S node is deployed using Terraform and the IP address and SSH key for
 accessing the node are in the Terraform state for the environment. The `azimuth-config`
 repository contains a utility script -
-[seed-ssh](https://github.com/stackhpc/azimuth-config/tree/main/bin/seed-ssh) - that will
+[seed-ssh](https://github.com/stackhpc/azimuth-config/tree/stable/bin/seed-ssh) - that will
 extract these details from the Terraform state for the active environment and use them to
 execute an SSH command to access the provisioned node.
diff --git a/docs/debugging/access-monitoring.md b/docs/debugging/access-monitoring.md
index 4f429836..75c5b48f 100644
--- a/docs/debugging/access-monitoring.md
+++ b/docs/debugging/access-monitoring.md
@@ -8,7 +8,7 @@ internet, an
 be used from your local machine to the K3S node as well.
 
 To simplify this process, the `azimuth-config` repository contains a utility script -
-[port-forward](https://github.com/stackhpc/azimuth-config/tree/main/bin/port-forward) -
+[port-forward](https://github.com/stackhpc/azimuth-config/tree/stable/bin/port-forward) -
 that can be used to set up the double port-forward for particular cluster services.
 
diff --git a/docs/deployment/automation.md b/docs/deployment/automation.md
index da333024..78a1e975 100644
--- a/docs/deployment/automation.md
+++ b/docs/deployment/automation.md
@@ -29,7 +29,7 @@ environments, although deployments to production typically include a manual appr
 ## GitLab CI/CD
 
 `azimuth-config` provides a
-[sample configuration](https://github.com/stackhpc/azimuth-config/blob/main/.gitlab-ci.yml.sample)
+[sample configuration](https://github.com/stackhpc/azimuth-config/blob/stable/.gitlab-ci.yml.sample)
 for use with [GitLab CI/CD](https://docs.gitlab.com/ee/ci/) that demonstrates how to set up
 continuous delivery for an Azimuth configuration repository.
 
@@ -218,9 +218,9 @@ Unfortunately, this is a paid feature and the only real alternative is to use a
 
 For site-specific configuration repositories hosted on GitHub, `azimuth-config` provides two sample workflows
 for automated deployments to a test or staging environment
-([example workflow](https://github.com/stackhpc/azimuth-config/blob/main/.github-deploy-staging.yml.sample))
+([example workflow](https://github.com/stackhpc/azimuth-config/blob/stable/.github-deploy-staging.yml.sample))
 and manually-triggered deployment to a production environment
-([example workflow](https://github.com/stackhpc/azimuth-config/blob/main/.github-deploy-prod.yml.sample)).
+([example workflow](https://github.com/stackhpc/azimuth-config/blob/stable/.github-deploy-prod.yml.sample)).
 These can be used with [GitHub Actions](https://docs.github.com/en/actions) to mimic some of the GitLab
 functionality described above. Each sample file contains a top-level comment describing how to tailor these
 workflows to a site-specific configuration repository.
\ No newline at end of file
diff --git a/docs/deployment/index.md b/docs/deployment/index.md
index 21e10a7d..8425b839 100644
--- a/docs/deployment/index.md
+++ b/docs/deployment/index.md
@@ -4,7 +4,7 @@
 
 The Python requirements for an Azimuth deployment host, including Ansible itself, are
 contained in
-[requirements.txt](https://github.com/stackhpc/azimuth-config/blob/main/requirements.txt)
+[requirements.txt](https://github.com/stackhpc/azimuth-config/blob/stable/requirements.txt)
 and must be installed before you can proceed with a deployment. It is recommended to use a
 [virtual environment](https://docs.python.org/3/library/venv.html) in order to keep the
 dependencies isolated from other Python applications on the host.
diff --git a/docs/environments.md b/docs/environments.md
index 1226ad33..0ce816be 100644
--- a/docs/environments.md
+++ b/docs/environments.md
@@ -50,7 +50,7 @@ By keeping the `azimuth-config` repository as an upstream of your site configura
 you can rebase onto or merge the latest configuration to pick up changes to these mixins.
 
 The `azimuth-config` repository contains an example of a concrete environment in
-[environments/example](https://github.com/stackhpc/azimuth-config/tree/main/environments/example)
+[environments/example](https://github.com/stackhpc/azimuth-config/tree/stable/environments/example)
 that should be used as a basis for your own concrete environment(s).
 
 Depending how many concrete environments you have, you may wish to define mixin environments
@@ -76,7 +76,7 @@ MY_VAR="some value"
 ```
 
 The
-[azimuth-config activate script](https://github.com/stackhpc/azimuth-config/tree/main/bin/activate)
+[azimuth-config activate script](https://github.com/stackhpc/azimuth-config/tree/stable/bin/activate)
 exports environment variables defined in the following files: `env` and `env.secret`
diff --git a/docs/repository/terraform.md b/docs/repository/terraform.md
index e0c0fda3..48a2091f 100644
--- a/docs/repository/terraform.md
+++ b/docs/repository/terraform.md
@@ -102,7 +102,7 @@ variables respectively.
 If you are [using GitLab CI/CD to automate deployments](../deployment/automation.md#gitlab-cicd),
 then the pipeline will be issued with a suitable token. The
-[sample configuration](https://github.com/stackhpc/azimuth-config/blob/main/.gitlab-ci.yml.sample)
+[sample configuration](https://github.com/stackhpc/azimuth-config/blob/stable/.gitlab-ci.yml.sample)
 includes configuration to populate these variables using this token.
 
 If you are not using automation but your GitLab installation has

From 5255187a84e65e3dde2da0b4f3b4f210e3947712 Mon Sep 17 00:00:00 2001
From: John Garbutt
Date: Tue, 6 Feb 2024 10:52:23 +0000
Subject: [PATCH 08/27] Encourage sync from most recent tag (#91)

* Document syncing from a tag

* Add a note about how to list the available tags

* Link to the releases page in github

* Fix up branch names

* Fix up garbled sentence

* Move to use the stable branch

* Rework upgrade docs

---------

Co-authored-by: Matt Pryor

---
 docs/repository/index.md | 71 ++++++++++++++++++++++++++++------------
 1 file changed, 50 insertions(+), 21 deletions(-)

diff --git a/docs/repository/index.md b/docs/repository/index.md
index 84015350..edc0810d 100644
--- a/docs/repository/index.md
+++ b/docs/repository/index.md
@@ -2,9 +2,8 @@
 
 The `azimuth-config` repository provides best-practice configuration for Azimuth deployments
 that can be inherited by site-specific configuration repositories using
-[Git](https://git-scm.com/). Using Git makes it easy to periodically incorporate changes to
-the best practice into the configuration for your site, e.g. to pick up new Azimuth versions,
-updated images, CaaS appliance versions or Kubernetes versions.
+[Git](https://git-scm.com/). Using Git makes it easy to pick up new Azimuth releases when
+they become available.
 
 ## Initial repository setup
 
@@ -16,23 +15,39 @@ repository:
 ```sh
 # Clone the azimuth-config repository
 git clone https://github.com/stackhpc/azimuth-config.git my-azimuth-config
-
 cd my-azimuth-config
 
-# Maintain the existing origin remote so that we can periodically sync changes,
-# but rename it to upstream
+# Maintain the existing origin remote as upstream
 git remote rename origin upstream
 
-# Create a new origin remote for the new repository location
+# Create a new origin remote for the repository location
 git remote add origin git@/my-azimuth-config.git
 
-# Push the main branch to the new origin
-git push -u origin main
+# Checkout stable to get the latest release
+git checkout stable
+
+# Create a new main branch from stable
+# This will be the branch that is deployed into production
+git checkout -b main
+
+# Push the main branch to the origin
+git push --set-upstream origin main
 ```
 
 You now have an independent copy of the `azimuth-config` repository that has a link back
 to the source repository via the `upstream` remote.
 
+!!! tip "Branch protection rules"
+
+    It is a good idea to apply branch protection rules to the `main` branch that enforce
+    that all changes are made via a merge (or pull) request. This should ensure that changes
+    are not accidentally pushed into production without being reviewed.
+
+    Instructions are available on how to set this up for
+    [GitHub](https://docs.github.com/en/repositories/configuring-branches-and-merges-in-your-repository/managing-protected-branches/managing-a-branch-protection-rule) or
+    [GitLab](https://docs.gitlab.com/ee/user/project/protected_branches.html).
+
 ## Creating a new environment
 
 Your new repository does not yet contain any site-specific configuration. The best way
@@ -76,22 +91,33 @@ changes to `main`. These changes can then be reviewed before being merged to `ma
 If you have automated deployments, the branch may even get a dynamic environment created
 for it where the result of the changes can be verified before the merge takes place.
 
-## Synchronising changes from upstream
+## Upgrading to a new Azimuth release
 
-Over time, as Azimuth changes, the best-practice configuration will also change to point
-at new Azimuth versions, upgraded dependencies and new images.
+When a new Azimuth release becomes available, you will need to synchronise the changes
+from `azimuth-config` into your site configuration repository in order to pick up new
+component versions, upgraded dependencies and new images.
 
-!!! tip
-
-    This process
-    [can be automated](../deployment/automation.md#automated-synchronisation-of-upstream-changes)
-    if you have the tooling available.
+!!! info "Choosing a release"
+
+    The available releases, with associated release notes, can be reviewed on the
+    [Azimuth releases page](https://github.com/stackhpc/azimuth-config/releases).
 
-To incorporate the latest changes into your site-specific repository, use the following:
+To upgrade your Azimuth configuration to a new release, use the following steps to create
+a new branch containing the upgrade:
 
 ```sh
-git fetch upstream
-git merge upstream/main
+# Make sure the local checkout is up to date with any site-specific changes
+git checkout main
+git pull
+
+# Fetch the tags from the upstream repo
+git remote update
+
+# Create a new branch to contain the Azimuth upgrade
+git checkout -b upgrade/$RELEASE_TAG
+
+# Merge in the tag for the new release
+git merge $RELEASE_TAG
 ```
 
 At this point, you will need to fix any conflicts where you have made changes to the same
 files that have been changed by `azimuth-config`.
 
 Once any conflicts have been resolved, you can commit and push the changes:
 
 ```sh
-git commit -m "Merge changes from upstream"
-git push
+git commit -m "Upgrade Azimuth to $RELEASE_TAG"
+git push --set-upstream origin upgrade/$RELEASE_TAG
 ```
+
+You can now open a merge (or pull) request proposing the upgrade to your `main` branch
+that can be reviewed like any other.
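
[Editorial aside] Relating to the upgrade steps documented in the patch above: after `git remote update` has fetched the upstream tags, one way to see which release tags are available locally, assuming the tags sort sensibly under version ordering, is:

```sh
# List the fetched tags, newest version first, and pick a release to upgrade to
git tag --sort=-v:refname | head
RELEASE_TAG=<chosen tag>   # placeholder - substitute the tag you selected
```

`$RELEASE_TAG` then feeds the `git checkout -b upgrade/$RELEASE_TAG` and `git merge $RELEASE_TAG` commands from the documentation.
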
-To incorporate the latest changes into your site-specific repository, use the following: +To upgrade your Azimuth configuration to a new release, use the following steps to create +a new branch containing the upgrade: ```sh -git fetch upstream -git merge upstream/main +# Make sure the local checkout is up to date with any site-specific changes +git checkout main +git pull + +# Fetch the tags from the upstream repo +git remote update + +# Create a new branch to contain the Azimuth upgrade +git checkout -b upgrade/$RELEASE_TAG + +# Merge in the tag for the new release +git merge $RELEASE_TAG ``` At this point, you will need to fix any conflicts where you have made changes to the same @@ -106,6 +132,9 @@ files that have been changed by `azimuth-config`. Once any conflicts have been resolved, you can commit and push the changes: ```sh -git commit -m "Merge changes from upstream" -git push +git commit -m "Upgrade Azimuth to $RELEASE_TAG" +git push --set-upstream origin upgrade/$RELEASE_TAG ``` + +You can now open a merge (or pull) request proposing the upgrade to your `main` branch +that can be reviewed like any other. From 96389f47c3b70cb0b8e4e28be31f4caed9edce76 Mon Sep 17 00:00:00 2001 From: Matt Pryor Date: Tue, 6 Feb 2024 12:37:12 +0000 Subject: [PATCH 09/27] [skip ci] update workflow triggers for new release process (#94) --- .github/workflows/publish-docs.yml | 6 +++--- .github/workflows/test-full.yml | 11 ----------- 2 files changed, 3 insertions(+), 14 deletions(-) diff --git a/.github/workflows/publish-docs.yml b/.github/workflows/publish-docs.yml index 7ab52a29..aacde52f 100644 --- a/.github/workflows/publish-docs.yml +++ b/.github/workflows/publish-docs.yml @@ -1,9 +1,9 @@ name: Publish docs via GitHub Pages on: - push: - tags: - - "*" + release: + types: + - published jobs: deploy: diff --git a/.github/workflows/test-full.yml b/.github/workflows/test-full.yml index fd5c5bf8..b162265e 100644 --- a/.github/workflows/test-full.yml +++ b/.github/workflows/test-full.yml @@ -3,17 +3,6 @@ name: Full test suite on: # Allow manual execution on any branch workflow_dispatch: - # Execute by default on pull requests to the stable branch - pull_request: - types: - - opened - - synchronize - - ready_for_review - - reopened - branches: - - stable - paths-ignore: - - 'docs/**' jobs: # This job exists so that PRs from outside the main repo are rejected From d9779867c49fdc90306d78a6f8ce80e3c4b787df Mon Sep 17 00:00:00 2001 From: Matt Pryor Date: Tue, 6 Feb 2024 14:22:47 +0000 Subject: [PATCH 10/27] Update azimuth-ops to 0.4.0 (#95) --- requirements.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.yml b/requirements.yml index f7c40bad..0b8e1ef9 100644 --- a/requirements.yml +++ b/requirements.yml @@ -3,7 +3,7 @@ collections: - name: https://github.com/stackhpc/ansible-collection-azimuth-ops.git type: git - version: 0.3.3 + version: 0.4.0 # For local development # - type: dir # source: ../ansible-collection-azimuth-ops From a2ac74af8ea3f2015dfa6aeefe2c213cb1539a0f Mon Sep 17 00:00:00 2001 From: Matt Pryor Date: Tue, 6 Feb 2024 20:52:49 +0000 Subject: [PATCH 11/27] Generate and publish consolidated release notes (#96) * Generate and publish consolidated release notes * Fix typo in action name --- .github/actions/release-notes/Dockerfile | 11 + .github/actions/release-notes/action.yml | 23 ++ .../actions/release-notes/release-notes.py | 308 ++++++++++++++++++ .github/workflows/publish-docs.yml | 20 -- .github/workflows/publish-release.yml | 36 ++ 5 files changed, 378 
 create mode 100644 .github/actions/release-notes/Dockerfile
 create mode 100644 .github/actions/release-notes/action.yml
 create mode 100755 .github/actions/release-notes/release-notes.py
 delete mode 100644 .github/workflows/publish-docs.yml
 create mode 100644 .github/workflows/publish-release.yml

diff --git a/.github/actions/release-notes/Dockerfile b/.github/actions/release-notes/Dockerfile
new file mode 100644
index 00000000..d8813963
--- /dev/null
+++ b/.github/actions/release-notes/Dockerfile
@@ -0,0 +1,11 @@
+FROM python:3.9
+
+ENV PYTHONUNBUFFERED 1
+
+# Install the requirements
+RUN pip install --no-cache-dir -U pip && \
+    pip install --no-cache-dir easysemver requests pyyaml
+
+COPY release-notes.py /usr/local/bin/release-notes
+
+ENTRYPOINT ["release-notes"]
diff --git a/.github/actions/release-notes/action.yml b/.github/actions/release-notes/action.yml
new file mode 100644
index 00000000..fb60edb0
--- /dev/null
+++ b/.github/actions/release-notes/action.yml
@@ -0,0 +1,23 @@
+name: Generate release notes
+description: >-
+  Generates consolidated release notes and uploads them to the target release.
+inputs:
+  token:
+    description: The GitHub token for interacting with the API.
+    default: ${{ github.token }}
+  repository:
+    description: The GitHub repository.
+    default: ${{ github.repository }}
+  tag:
+    description: The tag to update release notes for.
+    # Assume we are running under a release event
+    default: ${{ github.event.release.tag_name }}
+runs:
+  using: docker
+  image: Dockerfile
+  args:
+    - --token
+    - ${{ inputs.token }}
+    - --repo
+    - ${{ inputs.repository }}
+    - ${{ inputs.tag }}
diff --git a/.github/actions/release-notes/release-notes.py b/.github/actions/release-notes/release-notes.py
new file mode 100755
index 00000000..f258b9ca
--- /dev/null
+++ b/.github/actions/release-notes/release-notes.py
@@ -0,0 +1,308 @@
+#!/usr/bin/env python3
+
+import argparse
+import base64
+import os
+
+import requests
+import yaml
+
+import easysemver
+
+
+API_URL = "https://api.github.com"
+COMPONENTS = [
+    {
+        "name": "azimuth-images",
+        # These keys define how to extract the version from azimuth-ops
+        "path": "roles/community_images/defaults/main.yml",
+        "version_key": "community_images_azimuth_images_version",
+    },
+    {
+        "name": "azimuth",
+        "path": "roles/azimuth/defaults/main.yml",
+        "version_key": "azimuth_chart_version",
+    },
+    {
+        "name": "azimuth-caas-operator",
+        "path": "roles/azimuth_caas_operator/defaults/main.yml",
+        "version_key": "azimuth_caas_operator_chart_version",
+    },
+    {
+        "name": "azimuth-capi-operator",
+        "path": "roles/azimuth_capi_operator/defaults/main.yml",
+        "version_key": "azimuth_capi_operator_chart_version",
+    },
+    {
+        "name": "azimuth-identity-operator",
+        "path": "roles/azimuth_identity_operator/defaults/main.yml",
+        "version_key": "azimuth_identity_operator_chart_version",
+    },
+    {
+        "name": "zenith",
+        "path": "roles/zenith/defaults/main.yml",
+        "version_key": "zenith_chart_version",
+    },
+    {
+        "name": "cluster-api-addon-provider",
+        "path": "roles/clusterapi/defaults/main.yml",
+        "version_key": "clusterapi_addon_provider_chart_version",
+    },
+    {
+        "name": "cluster-api-janitor-openstack",
+        "path": "roles/clusterapi/defaults/main.yml",
+        "version_key": "clusterapi_janitor_openstack_chart_version",
+    },
+    {
+        "name": "capi-helm-charts",
+        "path": "roles/capi_cluster/defaults/main.yml",
+        "version_key": "capi_cluster_chart_version",
+    },
+    {
+        "name": "caas-workstation",
+        "path": "roles/azimuth_caas_operator/defaults/main.yml",
"roles/azimuth_caas_operator/defaults/main.yml", + "version_key": "azimuth_caas_stackhpc_workstation_git_version", + }, + { + "name": "caas-repo2docker", + "path": "roles/azimuth_caas_operator/defaults/main.yml", + "version_key": "azimuth_caas_stackhpc_repo2docker_git_version", + }, + { + "name": "caas-r-studio-server", + "path": "roles/azimuth_caas_operator/defaults/main.yml", + "version_key": "azimuth_caas_stackhpc_rstudio_git_version", + }, + { + "name": "ansible-slurm-appliance", + "path": "roles/azimuth_caas_operator/defaults/main.yml", + "version_key": "azimuth_caas_stackhpc_slurm_appliance_git_version", + }, +] + + +def github_session(token): + """ + Initialises a requests session for interacting with GitHub. + """ + session = requests.Session() + session.headers["Content-Type"] = "application/json" + if token: + session.headers["Authorization"] = f"Bearer {token}" + return session + + +def github_fetch_list(session, url): + """ + Generator that yields items from paginating a GitHub URL. + """ + next_url = url + while next_url: + response = session.get(next_url) + response.raise_for_status() + yield from response.json() + next_url = response.links.get("next", {}).get("url") + + +def is_stable(version): + """ + Returns true unless the version is SemVer and has a prerelease. + """ + try: + semver = easysemver.Version(version) + except TypeError: + return True + else: + return not semver.prerelease + + +def fetch_releases( + session, + repo, + min = None, + inclusive_min = True, + max = None, + inclusive_max = True +): + """ + Returns the stable releases for the specified repo. It assumes that the GitHub + API produces the releases in order (which it does). + """ + seen_max = False + for release in github_fetch_list(session, f"{API_URL}/repos/{repo}/releases"): + version = release['tag_name'] + # Deal with hitting the min version + if min and version == min: + if seen_max and inclusive_min and is_stable(version): + yield release + break + # Deal with hitting the max version + if not max or version == max: + seen_max = True + if max and not inclusive_max: + continue + # Only yield stable versions once we have seen the max + if seen_max and is_stable(version): + yield release + + +def fetch_release_by_tag(session, repo, tag): + """ + Fetch the release for the specified repository and tag. + """ + response = session.get(f"{API_URL}/repos/{repo}/releases/tags/{tag}") + response.raise_for_status() + return response.json() + + +def fetch_ops_tag_for_release(session, repo, tag): + """ + Returns the azimuth-ops tag used by the specified release. + """ + response = session.get( + f"{API_URL}/repos/{repo}/contents/requirements.yml", + params = { "ref": tag }, + headers = { "Content-Type": "application/vnd.github.raw+json" } + ) + response.raise_for_status() + content = base64.b64decode(response.json()["content"]) + return yaml.safe_load(content)["collections"][0]["version"] + + +def fetch_component_version_for_ops_tag(session, tag, component): + """ + Returns the version of the specified component that is used in the specified azimuth-ops tag. 
+    """
+    response = session.get(
+        f"{API_URL}/repos/stackhpc/ansible-collection-azimuth-ops/contents/{component['path']}",
+        params = { "ref": tag },
+        headers = { "Content-Type": "application/vnd.github.raw+json" }
+    )
+    response.raise_for_status()
+    content = base64.b64decode(response.json()["content"])
+    return yaml.safe_load(content)[component["version_key"]]
+
+
+def release_notes_for_component(session, name, from_version, to_version):
+    """
+    Produces the release notes for a component between the specified versions.
+    """
+    print(f"[INFO] collecting release notes for {name}")
+    release_notes = []
+    for release in fetch_releases(
+        session,
+        f"stackhpc/{name}",
+        min = from_version,
+        inclusive_min = False,
+        max = to_version,
+        inclusive_max = True
+    ):
+        print(f"[INFO] found release - {release['tag_name']}")
+        release_notes.extend([
+            "<details>",
", + f"{name} @ {release['tag_name']}", + "", + # Knock the headers down by two levels for formatting + *[ + f"##{line}" if line.startswith("#") else line + for line in release["body"].splitlines() + ], + "", + "
", + ]) + return release_notes + + +def main(): + parser = argparse.ArgumentParser( + description = "Gets the latest release in a GitHub repository." + ) + # Allow the token to come from an environment variable + # We use this particular form so that the empty string becomes None + env_token = os.environ.get("GITHUB_TOKEN") or None + parser.add_argument( + "--token", + help = "The GitHub token to use (can be set using GITHUB_TOKEN envvar).", + default = env_token + ) + parser.add_argument( + "--repo", + help = "The config repository to target.", + default = "stackhpc/azimuth-config" + ) + parser.add_argument("tag", help = "The tag to generate release notes for.") + args = parser.parse_args() + + session = github_session(args.token) + + print(f"[INFO] fetching release for tag - {args.tag}") + current = fetch_release_by_tag(session, args.repo, args.tag) + current_ops_tag = fetch_ops_tag_for_release(session, args.repo, current["tag_name"]) + print(f"[INFO] found azimuth-ops tag - {current_ops_tag}") + + print("[INFO] fetching previous stable release") + previous = next( + fetch_releases( + session, + args.repo, + max = current["tag_name"], + inclusive_max = False + ) + ) + print(f"[INFO] found release - {previous['tag_name']}") + previous_ops_tag = fetch_ops_tag_for_release(session, args.repo, previous["tag_name"]) + print(f"[INFO] found azimuth-ops tag - {previous_ops_tag}") + + print("[INFO] collecting release notes") + release_notes = [] + # Start with the release notes that are attached to the release + release_notes.append(current["body"]) + if current_ops_tag == previous_ops_tag: + print("[WARN] azimuth-ops version has not changed - skipping") + else: + # Add a new header to start the components section + release_notes.append("### Changes to components") + # Produce release notes for azimuth-ops changes + release_notes.extend( + release_notes_for_component( + session, + "ansible-collection-azimuth-ops", + previous_ops_tag, + current_ops_tag + ) + ) + # Produce release notes for each component in azimuth-ops + for component in COMPONENTS: + print(f"[INFO] fetching versions for component - {component['name']}") + component_vn_current = fetch_component_version_for_ops_tag( + session, + current_ops_tag, + component + ) + component_vn_previous = fetch_component_version_for_ops_tag( + session, + previous_ops_tag, + component + ) + if component_vn_current == component_vn_previous: + print(f"[WARN] found same version at both releases - skipping") + release_notes.extend( + release_notes_for_component( + session, + component['name'], + component_vn_previous, + component_vn_current + ) + ) + + print(f"[INFO] updating release notes for release - {current['tag_name']}") + response = session.patch( + f"{API_URL}/repos/{args.repo}/releases/{current['id']}", + json = { "body": "\r\n".join(release_notes) } + ) + response.raise_for_status() + print("[INFO] release notes updated successfully") + + +if __name__ == "__main__": + main() diff --git a/.github/workflows/publish-docs.yml b/.github/workflows/publish-docs.yml deleted file mode 100644 index aacde52f..00000000 --- a/.github/workflows/publish-docs.yml +++ /dev/null @@ -1,20 +0,0 @@ -name: Publish docs via GitHub Pages - -on: - release: - types: - - published - -jobs: - deploy: - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v2 - - - uses: actions/setup-python@v2 - with: - python-version: 3.x - - - run: pip install -r requirements-docs.txt - - - run: mkdocs gh-deploy --force diff --git a/.github/workflows/publish-release.yml 
diff --git a/.github/workflows/publish-docs.yml b/.github/workflows/publish-docs.yml
deleted file mode 100644
index aacde52f..00000000
--- a/.github/workflows/publish-docs.yml
+++ /dev/null
@@ -1,20 +0,0 @@
-name: Publish docs via GitHub Pages
-
-on:
-  release:
-    types:
-      - published
-
-jobs:
-  deploy:
-    runs-on: ubuntu-latest
-    steps:
-      - uses: actions/checkout@v2
-
-      - uses: actions/setup-python@v2
-        with:
-          python-version: 3.x
-
-      - run: pip install -r requirements-docs.txt
-
-      - run: mkdocs gh-deploy --force
diff --git a/.github/workflows/publish-release.yml b/.github/workflows/publish-release.yml
new file mode 100644
index 00000000..6aa27b71
--- /dev/null
+++ b/.github/workflows/publish-release.yml
@@ -0,0 +1,36 @@
+name: Publish release
+
+on:
+  release:
+    types:
+      - published
+
+jobs:
+  generate_release_notes:
+    name: Generate consolidated release notes
+    runs-on: ubuntu-latest
+    steps:
+      - name: Check out the repository
+        uses: actions/checkout@v3
+
+      - name: Generate and update release notes
+        uses: ./.github/actions/release-notes
+
+  publish_docs:
+    name: Publish docs to GitHub pages
+    runs-on: ubuntu-latest
+    steps:
+      - name: Check out the repository
+        uses: actions/checkout@v2
+
+      - name: Setup Python
+        uses: actions/setup-python@v2
+        with:
+          python-version: 3.x
+
+      - name: Install docs requirements
+        run: pip install -r requirements-docs.txt
+
+      - name: Publish docs
+        run: mkdocs gh-deploy --force

From d2ec23bb82f09b5085ea017d019adb68af647359 Mon Sep 17 00:00:00 2001
From: Matt Pryor
Date: Wed, 7 Feb 2024 13:47:52 +0000
Subject: [PATCH 12/27] Reinstate dependency update workflow

---
 .github/workflows/update-dependencies.yml | 65 +++++++++++++++++++++++
 1 file changed, 65 insertions(+)
 create mode 100644 .github/workflows/update-dependencies.yml

diff --git a/.github/workflows/update-dependencies.yml b/.github/workflows/update-dependencies.yml
new file mode 100644
index 00000000..3ec31b5a
--- /dev/null
+++ b/.github/workflows/update-dependencies.yml
@@ -0,0 +1,65 @@
+# This workflow proposes updates to the dependencies that dependabot cannot
+name: Update dependencies
+
+on:
+  # Allow manual executions
+  workflow_dispatch:
+  # Run nightly
+  schedule:
+    - cron: '0 0 * * *'
+
+jobs:
+  propose_github_release_updates:
+    runs-on: ubuntu-latest
+    strategy:
+      matrix:
+        include:
+          - key: azimuth-ops
+            path: ./requirements.yml
+            repository: stackhpc/ansible-collection-azimuth-ops
+            prereleases: "yes"
+            version_jsonpath: collections[0].version
+
+    name: ${{ matrix.key }}
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+
+      - name: Check for most recent GitHub release
+        id: next
+        uses: stackhpc/github-actions/github-latest-release@master
+        with:
+          repository: ${{ matrix.repository }}
+          prereleases: ${{ matrix.prereleases || 'no' }}
+
+      - name: Update dependency key
+        uses: stackhpc/github-actions/config-update@master
+        with:
+          path: ${{ matrix.path }}
+          updates: |
+            ${{ matrix.version_jsonpath }}=${{ steps.next.outputs.version }}
+
+      - name: Generate app token for PR
+        uses: stackhpc/github-actions/generate-app-token@master
+        id: generate-app-token
+        with:
+          repository: ${{ github.repository }}
+          app-id: ${{ secrets.APP_ID }}
+          app-private-key: ${{ secrets.APP_PRIVATE_KEY }}
+
+      - name: Propose changes via PR if required
+        uses: peter-evans/create-pull-request@v5
+        with:
+          token: ${{ steps.generate-app-token.outputs.token }}
+          commit-message: >-
+            Update ${{ matrix.key }} to ${{ steps.next.outputs.version }}
+          branch: update-dependency/${{ matrix.key }}
+          delete-branch: true
+          title: >-
+            Update ${{ matrix.key }} to ${{ steps.next.outputs.version }}
+          body: >
+            This PR was created automatically to update
+            ${{ matrix.key }} to ${{ steps.next.outputs.version }}.
+          labels: |
+            automation
+            dependency-update
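
[Editorial aside] For reference on the `config-update` step above: `version_jsonpath: collections[0].version` is a JSONPath into the file named by `path`. Against the `requirements.yml` shown elsewhere in this series, that path resolves to the pinned `azimuth-ops` version, sketched here:

```yaml
# requirements.yml - collections[0].version points at the "version" key below
collections:
  - name: https://github.com/stackhpc/ansible-collection-azimuth-ops.git
    type: git
    version: 0.4.0  # <- the key rewritten by the update-dependencies workflow
```
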

From e4e4f4c14a316bdba457e73183ed81ff8f7d910d Mon Sep 17 00:00:00 2001
From: Matt Pryor
Date: Wed, 7 Feb 2024 16:54:46 +0000
Subject: [PATCH 13/27] Documentation updates for admin dashboard ingress (#99)

---
 docs/configuration/06-ingress.md    | 12 +++---
 docs/configuration/13-monitoring.md | 44 +++++++++++++++++++
 docs/debugging/access-k3s.md        |  2 +-
 docs/debugging/access-monitoring.md | 66 +++++++++++++++--------------
 docs/debugging/caas.md              | 11 ++---
 docs/debugging/zenith-services.md   | 16 +++----
 6 files changed, 95 insertions(+), 56 deletions(-)

diff --git a/docs/configuration/06-ingress.md b/docs/configuration/06-ingress.md
index f8d1d8c2..6f8ce472 100644
--- a/docs/configuration/06-ingress.md
+++ b/docs/configuration/06-ingress.md
@@ -1,6 +1,6 @@
 # Ingress
 
-As mentioned in the prerequisites, Azimuth and Zenith expect to be given control of entire
+As mentioned in the prerequisites, Azimuth and Zenith expect to be given control of an entire
 subdomain, e.g. `*.azimuth.example.org`, and this domain must be assigned to a pre-allocated
 floating IP using a wildcard DNS entry.
 
@@ -10,10 +10,12 @@ To tell `azimuth-ops` what domain it should use, simply set the following variab
 ingress_base_domain: azimuth.example.org
 ```
 
-This will result in `azimuth-ops` using `portal.azimuth.example.org` for Azimuth and
-`registrar.azimuth.example.org` for the Zenith registrar. If Harbor is enabled,
-`registry.azimuth.example.org` will be used for the Harbor registry. Zenith will use domains
-of the form `<subdomain>.azimuth.example.org` for its services.
+This will result in `azimuth-ops` using `portal.azimuth.example.org` for the Azimuth portal
+interface, and Zenith will use domains of the form `<subdomain>.azimuth.example.org`
+for user-facing services. Other services deployed by Azimuth, such as
+[Harbor](./10-kubernetes-clusters.md#harbor-registry) and the
+[monitoring and alerting dashboards](./13-monitoring.md#accessing-web-interfaces) will
+also be allocated subdomains under this domain.
 
 ## Transport Layer Security (TLS)
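
[Editorial aside] As an illustration of the wildcard DNS prerequisite discussed in the ingress documentation above, the record might look roughly like this in zone-file form (the IP address is a documentation placeholder):

```
; Wildcard record pointing the whole Azimuth subdomain at the pre-allocated floating IP
*.azimuth.example.org.  300  IN  A  203.0.113.10
```

With this in place, `portal.azimuth.example.org`, the Zenith service domains and the dashboard subdomains added by this patch all resolve to the same ingress IP.
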
diff --git a/docs/configuration/13-monitoring.md b/docs/configuration/13-monitoring.md
index a24342bc..b470498b 100644
--- a/docs/configuration/13-monitoring.md
+++ b/docs/configuration/13-monitoring.md
@@ -12,6 +12,50 @@ and [Promtail](https://grafana.com/docs/loki/latest/clients/promtail/) that coll
 from all the pods running on the cluster and the systemd services on each cluster node.
 These logs are available in a dashboard in Grafana, where they can be filtered and searched.
 
+In addition to the monitoring and alerting stack, several additional dashboards are installed:
+
+  * The [Kubernetes dashboard](https://kubernetes.io/docs/tasks/access-application-cluster/web-ui-dashboard/)
+    for browsing the current state of Kubernetes resources.
+  * The [Helm dashboard](https://github.com/komodorio/helm-dashboard) for browsing the current
+    state of Helm releases.
+  * The [Consul UI](https://developer.hashicorp.com/consul/tutorials/certification-associate-tutorials/get-started-explore-the-ui)
+    for browsing the Consul state (used by Cluster-as-a-Service and Zenith).
+  * The [ARA Records Ansible (ARA)](https://ara.recordsansible.org/) web interface for browsing the
+    Ansible playbook runs that have been recorded for operations on Cluster-as-a-Service appliances.
+
+All the dashboards that access Kubernetes resources are configured to be read-only.
+
+## Accessing web interfaces
+
+The monitoring and alerting web dashboards are exposed as subdomains under the `ingress_base_domain`:
+
+  * `grafana` for the Grafana dashboards
+  * `prometheus` for the Prometheus web interface
+  * `alertmanager` for the Alertmanager web interface
+  * `consul` for the Consul UI
+  * `ara` for the ARA web interface
+  * `helm` for the Helm dashboard
+  * `kubernetes` for the Kubernetes dashboard
+
+The dashboards are protected by a username and password (using
+[HTTP Basic Auth](https://en.wikipedia.org/wiki/Basic_access_authentication)).
+The username is `admin` and a strong password must be set in your configuration:
+
+```yaml title="environments/my-site/inventory/group_vars/all/secrets.yml"
+admin_dashboard_ingress_basic_auth_password: ""
+```
+
+!!! warning "Sensitive information"
+
+    The dashboards allow read-only access to the internals of your Azimuth installation.
+    As such you should ensure that a strong password is used, and take care when sharing
+    it.
+
+!!! danger
+
+    This password should be kept secret. If you want to keep the password in Git - which is
+    recommended - then it [must be encrypted](../repository/secrets.md).
+
 ## Persistence and retention
 
 !!! note
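
[Editorial aside] A quick way to sanity-check the basic-auth protection described above, once deployed, is to request each dashboard subdomain and confirm an HTTP 401 without credentials (the domain is illustrative):

```sh
# Each admin dashboard should demand credentials (401) when none are supplied
for sub in grafana prometheus alertmanager consul ara helm kubernetes; do
  curl -s -o /dev/null -w "$sub: %{http_code}\n" "https://$sub.azimuth.example.org/"
done
```

Supplying `-u admin:<password>` should then return a non-401 response for each subdomain.
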
diff --git a/docs/debugging/access-k3s.md b/docs/debugging/access-k3s.md
index 0a500d15..a17785f3 100644
--- a/docs/debugging/access-k3s.md
+++ b/docs/debugging/access-k3s.md
@@ -3,7 +3,7 @@
 Both the single node and high-availability (HA) deployment methods have a K3S node that
 is provisioned using Terraform. In the single node case, this is the cluster that actually
 hosts Azimuth and all its dependencies. In the HA case, this cluster is configured as a
-Cluster API management cluster for the HA cluster.
+Cluster API management cluster for the HA cluster that actually runs Azimuth.
 
 In both cases, the K3S node is deployed using Terraform and the IP address and SSH key for
 accessing the node are in the Terraform state for the environment. The `azimuth-config`
diff --git a/docs/debugging/access-monitoring.md b/docs/debugging/access-monitoring.md
index 75c5b48f..d464af0c 100644
--- a/docs/debugging/access-monitoring.md
+++ b/docs/debugging/access-monitoring.md
@@ -1,34 +1,36 @@
 # Accessing the monitoring
 
-The monitoring is currently only exposed inside the cluster, so it can only be accessed using
-[kubectl port-forward](https://kubernetes.io/docs/tasks/access-application-cluster/port-forward-access-application-cluster/)
-from the K3S node. However because the API is not accessible to the
-internet, an
-[SSH local forward](https://www.ssh.com/academy/ssh/tunneling/example#local-forwarding) must
-be used from your local machine to the K3S node as well.
-
-To simplify this process, the `azimuth-config` repository contains a utility script -
-[port-forward](https://github.com/stackhpc/azimuth-config/tree/stable/bin/port-forward) -
-that can be used to set up the double port-forward for particular cluster services.
-
-To view monitoring dashboards in Grafana, use the following command to expose the Grafana
-interface on a local port:
-
-```sh
-./bin/port-forward grafana 3000
-```
-
-This will make the Grafana interface available at <http://localhost:3000>. Log in with the default
-credentials - `admin/prom-operator` - to access the dashboards.
-
-In order to view firing alerts or configure silences, you can also access the Prometheus and
-Alertmanager interfaces using the same method:
-
-```sh
-./bin/port-forward prometheus 9090
-./bin/port-forward alertmanager 9093
-```
-
-These commands will expose the Prometheus and Alertmanager interfaces at <http://localhost:9090>
-and <http://localhost:9093> respectively. Both these interfaces are unauthenticated, although
-you must have sufficient access to set up the port forward via the K3S node.
+As discussed in [Monitoring and alerting](../configuration/13-monitoring.md), the monitoring
+dashboards are exposed as subdomains under `ingress_base_domain` and protected by a username
+and password.
+
+## Grafana
+
+Grafana is accessed as `grafana.<ingress_base_domain>`, e.g. `grafana.azimuth.example.org`,
+and can be used to access various dashboards showing the health of the Azimuth installation
+and its underlying Kubernetes cluster. For example, there are dashboards for resource
+usage, network traffic, etcd, tenant Kubernetes and CaaS clusters, Zenith services,
+pod logs and systemd logs.
+
+## Prometheus
+
+Prometheus is accessed as `prometheus.<ingress_base_domain>`, and can be used to browse the
+configured alerts and see which are firing or pending. It can also be used to make ad-hoc
+queries of the metrics for the installation.
+
+## Alertmanager
+
+Alertmanager is accessed as `alertmanager.<ingress_base_domain>`, and can be used to manage
+the firing alerts and configure silences if required.
+
+## Kubernetes dashboard
+
+The Kubernetes dashboard is accessed as `kubernetes.<ingress_base_domain>`, and can be used to
+browse the current state of Kubernetes resources in the cluster. This includes streaming the
+logs of current pods.
+
+## Helm dashboard
+
+The Helm dashboard is accessed as `helm.<ingress_base_domain>`, and can be used to browse the
+current state of the Helm releases on the cluster. The dashboard does also attempt to infer
+the health of the resources deployed by Helm, however this does sometimes report false-positives.
diff --git a/docs/debugging/caas.md b/docs/debugging/caas.md
index b904f2c8..b1dffa21 100644
--- a/docs/debugging/caas.md
+++ b/docs/debugging/caas.md
@@ -103,14 +103,9 @@ Ansible playbook executions as they are run by the CaaS operator. If the job is
 as far as starting to run Ansible, then ARA is a much easier way to debug the Ansible
 for an appliance than wading through the Ansible logs from the job.
 
-Similar to the monitoring, ARA is only accessible inside the cluster. To access it,
-use the following command:
-
-```sh
-./bin/port-forward ara 8000
-```
-
-The ARA UI will then be available at <http://localhost:8000>.
+As discussed in [Monitoring and alerting](../configuration/13-monitoring.md), the ARA
+web interface is exposed as `ara.<ingress_base_domain>`, e.g. `ara.azimuth.example.org`,
+and is protected by a username and password.
 
 Once inside, you can look at the details of the recently executed jobs, see which tasks
 failed and what variables were set at the time.
diff --git a/docs/debugging/zenith-services.md b/docs/debugging/zenith-services.md
index ca5fcd9e..99e56073 100644
--- a/docs/debugging/zenith-services.md
+++ b/docs/debugging/zenith-services.md
@@ -25,17 +25,13 @@ kubectl -n azimuth rollout restart deployment/zenith-server-sshd
 
 Once a client has connected to SSHD successfully, it should get registered in
 [Consul](https://www.consul.io/).
-To determine if this is the case, it is useful to access the Consul
-UI. Similar to the monitoring, this interface is only accessible inside the cluster. To
-access it, use the following command:
-
-```sh
-./bin/port-forward consul 3000
-```
-
-The Consul UI will then be available at <http://localhost:3000>. The default view shows
-Consul's view of the services, where you can check if the service is being registered
-correctly.
+To determine if this is the case, it is useful to access the Consul UI. As discussed
As discussed
+in [Monitoring and alerting](../configuration/13-monitoring.md), the Consul UI
+is exposed as `consul.<ingress_base_domain>`, e.g. `consul.azimuth.example.org`,
+and is protected by a username and password.
 
-```sh
-./bin/port-forward consul 3000
-```
-
-The Consul UI will then be available at <http://localhost:3000>. The default view shows
-Consul's view of the services, where you can check if the service is being registered
-correctly.
+The default view shows Consul's view of the services, where you can check if the
+service is being registered correctly.
 
 Clients not registering correctly in Consul usually indicates an issue with Consul
 itself. Further information for debugging Consul issues is provided in
From 1b86d340c937485be476b1e944f4d1ba2da2d300 Mon Sep 17 00:00:00 2001
From: "azimuth-ci-bot[bot]" <142236172+azimuth-ci-bot[bot]@users.noreply.github.com>
Date: Wed, 7 Feb 2024 19:38:19 +0000
Subject: [PATCH 14/27] Update azimuth-ops to 0.4.1 (#98)

Co-authored-by: mkjpryor
---
 requirements.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/requirements.yml b/requirements.yml
index 0b8e1ef9..985bfa95 100644
--- a/requirements.yml
+++ b/requirements.yml
@@ -3,7 +3,7 @@
 collections:
   - name: https://github.com/stackhpc/ansible-collection-azimuth-ops.git
     type: git
-    version: 0.4.0
+    version: 0.4.1
 # For local development
 # - type: dir
 #   source: ../ansible-collection-azimuth-ops
From 417cafa2a9b0731c3f69f030d08900fdf1288427 Mon Sep 17 00:00:00 2001
From: Matt Pryor
Date: Wed, 7 Feb 2024 19:39:58 +0000
Subject: [PATCH 15/27] Add Azimuth upgrade test (#97)

* Start job for upgrade test
* Restore singlenode triggers
* Fix typo
* Remove dependency
* Test reconfiguration of Azimuth environment
* Add provision and destroy to upgrade test
* Use different prefixes for different jobs
* Make sure not to remove TF state on second checkout
* Add setup step to pre-allocate ingress IP, as in a real install
* Try using SHA in action name
* Use a separate actions checkout to get the latest actions
* Add debugging step
* Move the actions checkout to after the main checkout
* Remove debug statement
* Add tag to FIP for easy deletion
* Add destroy step to release ingress IP
* Fix typo in destroy action
* Only set the required ingress IP variable
* Use azimuth-ops version with force_destroy
* Reinstate full workflow
* Remove unnecessary conditional
* azimuth-ops version with force_destroy tagged
* Use correct variable name for single-node ingress IP
---
 .github/actions/destroy/action.yml | 15 ++++++-
 .github/actions/setup/action.yml   | 43 +++++++++++++++++-
 .github/workflows/test-full.yml    | 72 ++++++++++++++++++++++++++++--
 3 files changed, 123 insertions(+), 7 deletions(-)

diff --git a/.github/actions/destroy/action.yml b/.github/actions/destroy/action.yml
index 3c9faec0..48b95c37 100644
--- a/.github/actions/destroy/action.yml
+++ b/.github/actions/destroy/action.yml
@@ -9,4 +9,17 @@ runs:
         set -e
         source ./ci.env
         source ./bin/activate "$AZIMUTH_CONFIG_ENVIRONMENT" "$AZIMUTH_ENVIRONMENT"
-        ansible-playbook stackhpc.azimuth_ops.destroy -e @extra-vars.yml
+        ansible-playbook stackhpc.azimuth_ops.destroy -e @extra-vars.yml -e force_destroy=true
+    if: ${{ always() }}
+
+  - name: Release ingress floating IP
+    shell: bash
+    run: |
+      set -eo pipefail
+      source ci.env
+      source ./bin/activate "$AZIMUTH_CONFIG_ENVIRONMENT" "$AZIMUTH_ENVIRONMENT"
+      FIP_ID="$(openstack floating ip list --tags "$AZIMUTH_ENVIRONMENT" -f json | jq -r '.[0].ID // ""')"
+      [ -n "$FIP_ID" ] && openstack floating ip delete $FIP_ID
+    env:
+      INGRESS_IP: ${{
steps.ingress-ip.outputs.ip-address }} + if: ${{ always() }} diff --git a/.github/actions/setup/action.yml b/.github/actions/setup/action.yml index a04e2419..c48c979f 100644 --- a/.github/actions/setup/action.yml +++ b/.github/actions/setup/action.yml @@ -22,7 +22,7 @@ inputs: description: > YAML-formatted extra vars for the deployment, if required. required: true - default: "{}" + default: "" os-clouds: description: The contents of the clouds.yaml to use. required: true @@ -36,6 +36,11 @@ inputs: separated by a hyphen. required: true default: ci + allocate-ingress-ip: + description: > + Indicates whether a floating IP should be allocated for ingress. + required: true + default: "yes" runs: using: composite steps: @@ -44,6 +49,7 @@ runs: with: repository: ${{ inputs.repository }} ref: ${{ inputs.ref }} + clean: false - name: Write clouds.yaml shell: bash @@ -79,7 +85,10 @@ runs: shell: bash run: cat > extra-vars.yml <<< "$EXTRA_VARS" env: - EXTRA_VARS: ${{ inputs.extra-vars }} + # Use a dummy variable so that Ansible treats the file as YAML even if the input is empty + EXTRA_VARS: | + this_variable_is_never_used: ever + ${{ inputs.extra-vars }} - name: Ensure Python 3.9 uses: actions/setup-python@v4 @@ -98,3 +107,33 @@ runs: source ci.env source ./bin/activate "$AZIMUTH_CONFIG_ENVIRONMENT" "$AZIMUTH_ENVIRONMENT" ansible-galaxy install -f -r requirements.yml + + - name: Allocate floating IP for ingress + shell: bash + run: | + set -eo pipefail + source ci.env + source ./bin/activate "$AZIMUTH_CONFIG_ENVIRONMENT" "$AZIMUTH_ENVIRONMENT" + ansible_variable() { + ansible -m debug -a "var=$1" -e @extra-vars.yml all | + jq -r ".plays[0].tasks[0].hosts.localhost.$1" + } + INSTALL_MODE="$(ansible_variable install_mode)" + EXTNET_ID="$(ansible_variable infra_external_network_id)" + IP_ADDRESS="$( + openstack floating ip create $EXTNET_ID \ + --description "ingress IP for $AZIMUTH_ENVIRONMENT" \ + --tag "$AZIMUTH_ENVIRONMENT" \ + --format value \ + --column floating_ip_address + )" + VAR_NAME="$([ "$INSTALL_MODE" = "ha" ] && echo "capi_cluster_addons_ingress_load_balancer_ip" || echo "infra_fixed_floatingip")" + echo "$VAR_NAME: $IP_ADDRESS" >> extra-vars.yml + env: + ANSIBLE_LOAD_CALLBACK_PLUGINS: "true" + ANSIBLE_STDOUT_CALLBACK: json + if: ${{ inputs.allocate-ingress-ip == 'yes' }} + + - name: Output extra-vars.yml for debugging + shell: bash + run: cat extra-vars.yml diff --git a/.github/workflows/test-full.yml b/.github/workflows/test-full.yml index b162265e..1e70fc2d 100644 --- a/.github/workflows/test-full.yml +++ b/.github/workflows/test-full.yml @@ -18,7 +18,6 @@ jobs: # Until that is implemented, the only other viable option is a busy wait wait_in_queue: needs: [fail_on_remote] - if: ${{ github.event_name == 'workflow_dispatch' || !github.event.pull_request.draft }} runs-on: ubuntu-latest steps: - name: Wait for an available slot @@ -26,9 +25,8 @@ jobs: with: max-concurrency: 1 - # Run the full test suite - # Currently, this is just a HA install - run_azimuth_tests: + # Tests a clean HA deployment + all appliances + test_clean_ha: needs: [wait_in_queue] runs-on: ubuntu-latest steps: @@ -43,6 +41,7 @@ jobs: repository: ${{ github.repository }} ref: ${{ github.ref }} config-environment: ci-ha + environment-prefix: ci-ha - name: Provision Azimuth uses: ./.github/actions/provision @@ -53,3 +52,68 @@ jobs: - name: Destroy Azimuth uses: ./.github/actions/destroy if: ${{ always() }} + + # Tests an Azimuth upgrade from the current latest release + # Currently, this just tests the Azimuth upgrade 
itself with no appliances + # TODO(mkjpryor) add appliance provisioning and verification before and after upgrade + test_azimuth_upgrade: + needs: [test_clean_ha] + runs-on: ubuntu-latest + steps: + - name: Get latest tag + id: latest-tag + run: | + set -eo pipefail + TAG_NAME="$(curl -fsSL "$GITHUB_API_URL/repos/$GITHUB_REPOSITORY/releases/latest" | jq -r '.tag_name')" + echo "tag-name=${TAG_NAME}" >> "$GITHUB_OUTPUT" + + - name: Checkout latest tag + uses: actions/checkout@v3 + with: + ref: ${{ steps.latest-tag.outputs.tag-name }} + + # We want to use the actions as defined in the code under test + # So check them out separately + - name: Checkout code under test into .actions directory + uses: actions/checkout@v3 + with: + path: .actions + + - name: Set up Azimuth environment + uses: ./.actions/.github/actions/setup + with: + os-clouds: ${{ secrets.CLOUD }} + repository: ${{ github.repository }} + ref: ${{ steps.latest-tag.outputs.tag-name }} + config-environment: ci-ha + environment-prefix: ci-az-upgrade + + - name: Provision Azimuth + uses: ./.actions/.github/actions/provision + + - name: Checkout code under test + uses: actions/checkout@v3 + with: + # Make sure not to remove working directories + clean: false + + - name: Install updated Python dependencies + run: | + set -e + source ci.env + source ./bin/activate "$AZIMUTH_CONFIG_ENVIRONMENT" "$AZIMUTH_ENVIRONMENT" + python -m pip install -r requirements.txt + + - name: Upgrade Ansible dependencies + run: | + set -e + source ci.env + source ./bin/activate "$AZIMUTH_CONFIG_ENVIRONMENT" "$AZIMUTH_ENVIRONMENT" + ansible-galaxy install -f -r requirements.yml + + - name: Upgrade Azimuth + uses: ./.actions/.github/actions/provision + + - name: Destroy Azimuth + uses: ./.actions/.github/actions/destroy + if: ${{ always() }} From 89bce591a608b8c510421c59df8eb11e2722d7b9 Mon Sep 17 00:00:00 2001 From: Matt Pryor Date: Wed, 7 Feb 2024 20:26:31 +0000 Subject: [PATCH 16/27] Remove queue from full test suite workflow --- .github/workflows/test-full.yml | 21 --------------------- 1 file changed, 21 deletions(-) diff --git a/.github/workflows/test-full.yml b/.github/workflows/test-full.yml index 1e70fc2d..03117607 100644 --- a/.github/workflows/test-full.yml +++ b/.github/workflows/test-full.yml @@ -5,29 +5,8 @@ on: workflow_dispatch: jobs: - # This job exists so that PRs from outside the main repo are rejected - fail_on_remote: - runs-on: ubuntu-latest - steps: - - name: Code under test must be from a branch in the azimuth-config repo - run: exit ${{ github.repository == 'stackhpc/azimuth-config' && '0' || '1' }} - - # We want jobs to wait in a queue for a slot to run, so as not to overload the test infra - # GitHub concurrency _almost_ does this, except the queue length is one :-( - # There is a feature request for what we need https://github.com/orgs/community/discussions/12835 - # Until that is implemented, the only other viable option is a busy wait - wait_in_queue: - needs: [fail_on_remote] - runs-on: ubuntu-latest - steps: - - name: Wait for an available slot - uses: stackhpc/github-actions/workflow-concurrency@master - with: - max-concurrency: 1 - # Tests a clean HA deployment + all appliances test_clean_ha: - needs: [wait_in_queue] runs-on: ubuntu-latest steps: # We need to check out the code under test first in order to use local actions From 9d70fe5a8f68da99a3a883e4c97049ba300cd9c5 Mon Sep 17 00:00:00 2001 From: "azimuth-ci-bot[bot]" <142236172+azimuth-ci-bot[bot]@users.noreply.github.com> Date: Wed, 7 Feb 2024 23:54:25 +0000 Subject: 
[PATCH 17/27] Update azimuth-ops to 0.4.2 (#102) Co-authored-by: mkjpryor --- requirements.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.yml b/requirements.yml index 985bfa95..f4b59a6f 100644 --- a/requirements.yml +++ b/requirements.yml @@ -3,7 +3,7 @@ collections: - name: https://github.com/stackhpc/ansible-collection-azimuth-ops.git type: git - version: 0.4.1 + version: 0.4.2 # For local development # - type: dir # source: ../ansible-collection-azimuth-ops From 393125a087c74e436903f44511c0e019f4a3009e Mon Sep 17 00:00:00 2001 From: Matt Pryor Date: Wed, 7 Feb 2024 23:54:51 +0000 Subject: [PATCH 18/27] Make auth_url detection more robust (#101) * Make auth_url detection more robust * Fix quotes --- environments/base/inventory/group_vars/all.yml | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/environments/base/inventory/group_vars/all.yml b/environments/base/inventory/group_vars/all.yml index 816553af..5da92e85 100644 --- a/environments/base/inventory/group_vars/all.yml +++ b/environments/base/inventory/group_vars/all.yml @@ -148,9 +148,10 @@ __os_auth_url: >- {{- lookup('file', __os_clouds_file) | from_yaml | - json_query('clouds.' + __os_cloud + '.auth.auth_url') + json_query('clouds.' + __os_cloud + '.auth.auth_url') | + trim('/') }} -azimuth_openstack_auth_url: "{{ __os_auth_url | trim('/') }}/v3" +azimuth_openstack_auth_url: "{{ __os_auth_url.removesuffix('/v3') }}/v3" # Use the current project ID for the HA CAPI cluster capi_cluster_openstack_project_id: >- From f06894dc56f3f09d76cb7ac3e59f0e3c5c240042 Mon Sep 17 00:00:00 2001 From: Bartosz Bezak Date: Fri, 9 Feb 2024 12:41:27 +0100 Subject: [PATCH 19/27] add admin dashboards secret to the example environment (#103) --- environments/example/inventory/group_vars/all/secrets.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/environments/example/inventory/group_vars/all/secrets.yml b/environments/example/inventory/group_vars/all/secrets.yml index 0a5b3169..c713ad15 100644 --- a/environments/example/inventory/group_vars/all/secrets.yml +++ b/environments/example/inventory/group_vars/all/secrets.yml @@ -16,6 +16,8 @@ keycloak_admin_password: "" azimuth_secret_key: "" # The secret key for signing Zenith registrar tokens zenith_registrar_subdomain_token_signing_key: "" +# The admin password for Azimuth administrative dashboards +admin_dashboard_ingress_basic_auth_password: "" # The Slack webhook URL for monitoring alerts (optional) # alertmanager_config_slack_webhook_url: https://hooks.slack.com/services/T00000000/B00000000/XXXXXXXXXXXXXXXXXXXXXXXX From 4aeab5bb7819bb16a3e89fd05786cf305ce51ed8 Mon Sep 17 00:00:00 2001 From: Matt Pryor Date: Wed, 21 Feb 2024 18:19:52 +0000 Subject: [PATCH 20/27] Use updated chart defaults during a Tilt deploy (#104) --- bin/tilt-images-apply | 23 +++++++++++++++++++---- 1 file changed, 19 insertions(+), 4 deletions(-) diff --git a/bin/tilt-images-apply b/bin/tilt-images-apply index 9a716256..c01e505a 100755 --- a/bin/tilt-images-apply +++ b/bin/tilt-images-apply @@ -1,10 +1,8 @@ #!/usr/bin/env python3 -import contextlib import json import os import pathlib -import re import subprocess import sys @@ -61,6 +59,22 @@ if not revision_file.exists(): exec_cmd(["helm", "dependency", "update", os.environ["TILT_CHART_PATH"]]) +# Get the current user values +helm_get_values_proc = exec_cmd( + [ + "helm", + "get", + "values", + os.environ["TILT_RELEASE_NAME"], + "--namespace", + os.environ["TILT_RELEASE_NAMESPACE"], + ], + stdout = subprocess.PIPE +) 
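+# NB: the output of `helm get values` begins with a "USER-SUPPLIED VALUES:" header
+# line, which YAML parses as an extra top-level key - hence it is dropped from the
+# parsed mapping immediately below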
+current_values = yaml.safe_load(helm_get_values_proc.stdout)
+current_values.pop("USER-SUPPLIED VALUES")
+
+
 # Build and run the Helm upgrade command
 # We reuse the values from the previous installation, but overwrite any images
 # specified in the Tiltfile
@@ -71,7 +85,8 @@ helm_upgrade_command = [
     os.environ["TILT_CHART_PATH"],
     "--namespace",
     os.environ["TILT_RELEASE_NAMESPACE"],
-    "--reuse-values",
+    "--values",
+    "-",
 ]
 
 idx = 0
@@ -91,7 +106,7 @@ while True:
         ])
         idx = idx + 1
 
-exec_cmd(helm_upgrade_command)
+exec_cmd(helm_upgrade_command, input = json.dumps(current_values).encode())
 
 
 # Finally, print the currently installed manifest so Tilt knows about the resources
From 0da3ebe83dcab1a30e85ae99bce3b324d02d180e Mon Sep 17 00:00:00 2001
From: "azimuth-ci-bot[bot]" <142236172+azimuth-ci-bot[bot]@users.noreply.github.com>
Date: Thu, 22 Feb 2024 11:02:38 +0000
Subject: [PATCH 21/27] Update azimuth-ops to 0.5.0-rc.1 (#106)

Co-authored-by: mkjpryor
---
 requirements.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/requirements.yml b/requirements.yml
index f4b59a6f..1779057b 100644
--- a/requirements.yml
+++ b/requirements.yml
@@ -3,7 +3,7 @@
 collections:
   - name: https://github.com/stackhpc/ansible-collection-azimuth-ops.git
     type: git
-    version: 0.4.2
+    version: 0.5.0-rc.1
 # For local development
 # - type: dir
 #   source: ../ansible-collection-azimuth-ops
From 5145d8b0b9a506e57a4dc2c45523442b6becd568 Mon Sep 17 00:00:00 2001
From: Matt Pryor
Date: Fri, 23 Feb 2024 13:51:36 +0000
Subject: [PATCH 22/27] Tweak variables to get HA CI to run more easily

---
 .../ci-ha/inventory/group_vars/all/variables.yml | 16 ++++++++++------
 1 file changed, 10 insertions(+), 6 deletions(-)

diff --git a/environments/ci-ha/inventory/group_vars/all/variables.yml b/environments/ci-ha/inventory/group_vars/all/variables.yml
index 83be0d39..7c7a3267 100644
--- a/environments/ci-ha/inventory/group_vars/all/variables.yml
+++ b/environments/ci-ha/inventory/group_vars/all/variables.yml
@@ -6,13 +6,17 @@ infra_network_id:
 infra_flavor_id: c8b72062-5d52-4590-9d7a-68a670b44442
 # The flavor to use for the control plane nodes
 capi_cluster_control_plane_flavor: vm.ska.cpu.general.small
-capi_cluster_worker_flavor: vm.ska.cpu.general.eighth
+# The flavor to use for worker nodes
+capi_cluster_worker_flavor: vm.ska.cpu.general.small
+
 # Although this is a "HA" test, what we are really testing is the spawning
 # of the CAPI cluster and deployment of Azimuth onto that
+# We have also preferred to use 3 small workers rather than 1 or 2 eighth workers,
+# as they are more likely to fit in the gaps between other workloads
 # So one control plane node and three small workers is sufficient for that
 capi_cluster_control_plane_count: 1
-capi_cluster_worker_count: 2
-# Disable affinity for the Consul server so we can have 3 pods on two nodes
-consul_release_overrides:
-  server:
-    affinity: ""
+capi_cluster_worker_count: 3
+
+# Use a single replica for Consul
+# The risk of failed upgrades is too great, and it is going away soon
+consul_server_replicas: 1
From 9d8172184d03a09f9b0f4c8e2f4712130098add6 Mon Sep 17 00:00:00 2001
From: Matt Pryor
Date: Thu, 29 Feb 2024 16:28:00 +0000
Subject: [PATCH 23/27] Restructure CI to support multiple clouds (#108)

* Restructure CI to support multiple clouds
* Debug setup
* More debugging
* Use correct context for target cloud
* Wait for a FIP instead of failing
---
 .github/actions/setup/action.yml              | 47 +++++++++---------
 .github/environments/arcus-ha/ansible.cfg     |  9 ++++
 .../inventory/group_vars/all/variables.yml    |  0
 .../environments/arcus-ha}/inventory/hosts    |  0
 .github/environments/arcus/ansible.cfg        |  9 ++++
 .../inventory/group_vars/all/variables.yml    |  0
 .../environments/arcus}/inventory/hosts       |  0
 .github/environments/leafcloud-ha/ansible.cfg |  9 ++++
 .../inventory/group_vars/all/variables.yml    | 32 +++++++++++++
 .../environments/leafcloud-ha/inventory/hosts |  2 +
 .github/environments/leafcloud/ansible.cfg    |  9 ++++
 .../inventory/group_vars/all/variables.yml    | 48 +++++++++++++++++++
 .../environments/leafcloud/inventory/hosts    |  2 +
 .github/workflows/test-full.yml               | 22 +++++++--
 .github/workflows/test-singlenode.yml         | 13 ++++-
 bin/activate                                  | 13 +++--
 environments/ci-ha/ansible.cfg                |  9 ----
 environments/ci/ansible.cfg                   |  9 ----
 18 files changed, 183 insertions(+), 50 deletions(-)
 create mode 100644 .github/environments/arcus-ha/ansible.cfg
 rename {environments/ci-ha => .github/environments/arcus-ha}/inventory/group_vars/all/variables.yml (100%)
 rename {environments/ci-ha => .github/environments/arcus-ha}/inventory/hosts (100%)
 create mode 100644 .github/environments/arcus/ansible.cfg
 rename {environments/ci => .github/environments/arcus}/inventory/group_vars/all/variables.yml (100%)
 rename {environments/ci => .github/environments/arcus}/inventory/hosts (100%)
 create mode 100644 .github/environments/leafcloud-ha/ansible.cfg
 create mode 100644 .github/environments/leafcloud-ha/inventory/group_vars/all/variables.yml
 create mode 100644 .github/environments/leafcloud-ha/inventory/hosts
 create mode 100644 .github/environments/leafcloud/ansible.cfg
 create mode 100644 .github/environments/leafcloud/inventory/group_vars/all/variables.yml
 create mode 100644 .github/environments/leafcloud/inventory/hosts
 delete mode 100644 environments/ci-ha/ansible.cfg
 delete mode 100644 environments/ci/ansible.cfg

diff --git a/.github/actions/setup/action.yml b/.github/actions/setup/action.yml
index c48c979f..817b0bc3 100644
--- a/.github/actions/setup/action.yml
+++ b/.github/actions/setup/action.yml
@@ -9,10 +9,18 @@ inputs:
     description: The ref to use for the Azimuth configuration.
     required: true
     default: devel
-  config-environment:
-    description: The config environment to use.
+  target-cloud:
+    description: |-
+      The name of the cloud to target.
+      This is used both as the name of the cloud within the clouds.yaml
+      and to determine the config environment to use.
+      Currently, arcus and leafcloud are supported.
     required: true
-    default: ci
+    default: arcus
+  install-mode:
+    description: The install mode to use. Either singlenode or ha.
+    required: true
+    default: singlenode
   azimuth-ops-version:
     description: >
       The azimuth-ops version to use. If not given, the default version is used.
@@ -26,21 +34,12 @@ inputs:
   os-clouds:
     description: The contents of the clouds.yaml to use.
     required: true
-  os-cloud-name:
-    description: The name of the cloud within the clouds.yaml to use.
-    required: true
-    default: openstack
   environment-prefix:
     description: >
       The environment prefix to use. The run ID will be appended to this,
       separated by a hyphen.
     required: true
     default: ci
-  allocate-ingress-ip:
-    description: >
-      Indicates whether a floating IP should be allocated for ingress.
- required: true - default: "yes" runs: using: composite steps: @@ -63,9 +62,9 @@ runs: run: cat > ./ci.env <<< "$CI_ENV" env: CI_ENV: | - export OS_CLOUD="${{ inputs.os-cloud-name }}" + export OS_CLOUD="${{ inputs.target-cloud }}" export OS_CLIENT_CONFIG_FILE="$PWD/clouds.yaml" - export AZIMUTH_CONFIG_ENVIRONMENT=${{ inputs.config-environment }} + export AZIMUTH_CONFIG_ENVIRONMENT=${{ inputs.target-cloud }}${{ inputs.install-mode == 'ha' && '-ha' || '' }} export AZIMUTH_ENVIRONMENT="${{ inputs.environment-prefix }}-${{ github.run_id }}" export ANSIBLE_FORCE_COLOR=true @@ -120,19 +119,23 @@ runs: } INSTALL_MODE="$(ansible_variable install_mode)" EXTNET_ID="$(ansible_variable infra_external_network_id)" - IP_ADDRESS="$( - openstack floating ip create $EXTNET_ID \ - --description "ingress IP for $AZIMUTH_ENVIRONMENT" \ - --tag "$AZIMUTH_ENVIRONMENT" \ - --format value \ - --column floating_ip_address - )" + IP_ADDRESS= + until \ + IP_ADDRESS="$( + openstack floating ip create $EXTNET_ID \ + --description "ingress IP for $AZIMUTH_ENVIRONMENT" \ + --tag "$AZIMUTH_ENVIRONMENT" \ + --format value \ + --column floating_ip_address + )" + do + sleep 30 + done VAR_NAME="$([ "$INSTALL_MODE" = "ha" ] && echo "capi_cluster_addons_ingress_load_balancer_ip" || echo "infra_fixed_floatingip")" echo "$VAR_NAME: $IP_ADDRESS" >> extra-vars.yml env: ANSIBLE_LOAD_CALLBACK_PLUGINS: "true" ANSIBLE_STDOUT_CALLBACK: json - if: ${{ inputs.allocate-ingress-ip == 'yes' }} - name: Output extra-vars.yml for debugging shell: bash diff --git a/.github/environments/arcus-ha/ansible.cfg b/.github/environments/arcus-ha/ansible.cfg new file mode 100644 index 00000000..c32a7817 --- /dev/null +++ b/.github/environments/arcus-ha/ansible.cfg @@ -0,0 +1,9 @@ +[defaults] +inventory = ../../../environments/base/inventory,../../../environments/ha/inventory,../../../environments/demo/inventory,../arcus/inventory,./inventory +roles_path = ../../../.ansible/roles +collections_path = ../../../.ansible/collections + +host_key_checking = False + +[ssh_connection] +retries = 3 diff --git a/environments/ci-ha/inventory/group_vars/all/variables.yml b/.github/environments/arcus-ha/inventory/group_vars/all/variables.yml similarity index 100% rename from environments/ci-ha/inventory/group_vars/all/variables.yml rename to .github/environments/arcus-ha/inventory/group_vars/all/variables.yml diff --git a/environments/ci-ha/inventory/hosts b/.github/environments/arcus-ha/inventory/hosts similarity index 100% rename from environments/ci-ha/inventory/hosts rename to .github/environments/arcus-ha/inventory/hosts diff --git a/.github/environments/arcus/ansible.cfg b/.github/environments/arcus/ansible.cfg new file mode 100644 index 00000000..422d3b3d --- /dev/null +++ b/.github/environments/arcus/ansible.cfg @@ -0,0 +1,9 @@ +[defaults] +inventory = ../../../environments/base/inventory,../../../environments/singlenode/inventory,../../../environments/demo/inventory,./inventory +roles_path = ../../../.ansible/roles +collections_path = ../../../.ansible/collections + +host_key_checking = False + +[ssh_connection] +retries = 3 diff --git a/environments/ci/inventory/group_vars/all/variables.yml b/.github/environments/arcus/inventory/group_vars/all/variables.yml similarity index 100% rename from environments/ci/inventory/group_vars/all/variables.yml rename to .github/environments/arcus/inventory/group_vars/all/variables.yml diff --git a/environments/ci/inventory/hosts b/.github/environments/arcus/inventory/hosts similarity index 100% rename from 
environments/ci/inventory/hosts rename to .github/environments/arcus/inventory/hosts diff --git a/.github/environments/leafcloud-ha/ansible.cfg b/.github/environments/leafcloud-ha/ansible.cfg new file mode 100644 index 00000000..c1289154 --- /dev/null +++ b/.github/environments/leafcloud-ha/ansible.cfg @@ -0,0 +1,9 @@ +[defaults] +inventory = ../../../environments/base/inventory,../../../environments/ha/inventory,../../../environments/demo/inventory,../leafcloud/inventory,./inventory +roles_path = ../../../.ansible/roles +collections_path = ../../../.ansible/collections + +host_key_checking = False + +[ssh_connection] +retries = 3 diff --git a/.github/environments/leafcloud-ha/inventory/group_vars/all/variables.yml b/.github/environments/leafcloud-ha/inventory/group_vars/all/variables.yml new file mode 100644 index 00000000..48e258f4 --- /dev/null +++ b/.github/environments/leafcloud-ha/inventory/group_vars/all/variables.yml @@ -0,0 +1,32 @@ +# Unset the network ID so that a network + router are provisioned +infra_network_id: + +# Make sure we pick flavors that keep the costs down +# The flavor to use for the seed VM +infra_flavor_id: ec1.medium # 2 vCPUs, 4GB RAM @ leaf site +# The flavor to use for the control plane nodes +capi_cluster_control_plane_flavor: ec1.medium # 2 vCPUs, 4GB RAM @ leaf site +# The flavor to use for worker nodes +capi_cluster_worker_flavor: en1.medium # 2 vCPUs, 8GB RAM @ leaf site + +# Although this is a "HA" test, what we are really testing is the spawning +# of the CAPI cluster and deployment of Azimuth onto that +# So one control plane node is sufficient for that +capi_cluster_control_plane_count: 1 +capi_cluster_worker_count: 2 + +# Don't use explicit AZs for Kubernetes nodes +capi_cluster_control_plane_omit_failure_domain: true +capi_cluster_worker_failure_domain: null + +# Leafcloud doesn't use the default 'nova' AZ for volumes +capi_cluster_root_volume_availability_zone: europe-nl-ams1 +capi_cluster_addons_csi_cinder_availability_zone: europe-nl-ams1 + +# Use the unencrypted volume type for Kubernetes volumes +capi_cluster_root_volume_type: unencrypted +capi_cluster_addons_csi_cinder_volume_type: unencrypted + +# Use a single replica for Consul +# The risk of failed upgrades is too great, and it is going away soon +consul_server_replicas: 1 diff --git a/.github/environments/leafcloud-ha/inventory/hosts b/.github/environments/leafcloud-ha/inventory/hosts new file mode 100644 index 00000000..9dcf1df6 --- /dev/null +++ b/.github/environments/leafcloud-ha/inventory/hosts @@ -0,0 +1,2 @@ +[terraform_provision] +localhost ansible_connection=local ansible_python_interpreter="{{ ansible_playbook_python }}" diff --git a/.github/environments/leafcloud/ansible.cfg b/.github/environments/leafcloud/ansible.cfg new file mode 100644 index 00000000..422d3b3d --- /dev/null +++ b/.github/environments/leafcloud/ansible.cfg @@ -0,0 +1,9 @@ +[defaults] +inventory = ../../../environments/base/inventory,../../../environments/singlenode/inventory,../../../environments/demo/inventory,./inventory +roles_path = ../../../.ansible/roles +collections_path = ../../../.ansible/collections + +host_key_checking = False + +[ssh_connection] +retries = 3 diff --git a/.github/environments/leafcloud/inventory/group_vars/all/variables.yml b/.github/environments/leafcloud/inventory/group_vars/all/variables.yml new file mode 100644 index 00000000..98fa716a --- /dev/null +++ b/.github/environments/leafcloud/inventory/group_vars/all/variables.yml @@ -0,0 +1,48 @@ +# Use the external-facing network 
+infra_external_network_id: "{{ lookup('pipe', 'openstack network show external -f value -c id') }}"
+
+# Use the pre-existing portal-internal network so that we don't need to steal a router
+infra_network_id: "{{ lookup('pipe', 'openstack network show portal-internal -f value -c id') }}"
+
+# The flavors only have 20GB root disks, which is not enough to unpack images for uploading
+# So we need to use a Cinder root volume
+# We also don't need the encrypted volume type
+infra_root_volume_enabled: yes
+infra_root_volume_size: 40
+infra_root_volume_type: unencrypted
+# But we can decrease the size of the infra data volume
+infra_data_volume_size: 40
+infra_data_volume_type: unencrypted
+
+# Make sure we pick flavors that keep the costs down
+# The flavor to use for the Azimuth AIO VM
+infra_flavor_id: en1.medium # 2 vCPUs, 8GB RAM @ leaf site
+# The flavor to use for the workstation test case
+generate_tests_caas_test_case_workstation_param_cluster_flavor: ec1.medium # 2 vCPUs, 4GB RAM @ leaf site
+# The flavor to use for the repo2docker test case
+generate_tests_caas_test_case_repo2docker_param_cluster_flavor: "{{ generate_tests_caas_test_case_workstation_param_cluster_flavor }}"
+# The flavor to use for the R-Studio test case
+generate_tests_caas_test_case_rstudio_param_cluster_flavor: "{{ generate_tests_caas_test_case_workstation_param_cluster_flavor }}"
+# The flavors to use for the nodes in the Slurm test case
+# (the login, control and compute nodes all use the same flavor here)
+generate_tests_caas_test_case_slurm_param_login_flavor: "{{ generate_tests_caas_test_case_workstation_param_cluster_flavor }}"
+generate_tests_caas_test_case_slurm_param_control_flavor: "{{ generate_tests_caas_test_case_slurm_param_login_flavor }}"
+generate_tests_caas_test_case_slurm_param_compute_flavor: "{{ generate_tests_caas_test_case_slurm_param_login_flavor }}"
+# The flavors to use for the control plane and workers in Kubernetes test cases
+generate_tests_kubernetes_test_case_control_plane_size: "{{ generate_tests_caas_test_case_workstation_param_cluster_flavor }}"
+generate_tests_kubernetes_test_case_worker_size: "{{ generate_tests_kubernetes_test_case_control_plane_size }}"
+# The flavors to use for the control plane and workers in the Kubernetes apps test cases
+generate_tests_kubernetes_apps_k8s_control_plane_size: "{{ generate_tests_kubernetes_test_case_control_plane_size }}"
+generate_tests_kubernetes_apps_k8s_worker_size: "{{ generate_tests_kubernetes_test_case_worker_size }}"
+
+# Don't use explicit AZs for Kubernetes nodes
+azimuth_capi_operator_capi_helm_control_plane_omit_failure_domain: true
+azimuth_capi_operator_capi_helm_worker_failure_domain: null
+
+# Leafcloud doesn't use the default 'nova' AZ for volumes
+azimuth_capi_operator_capi_helm_root_volume_availability_zone: europe-nl-ams1
+azimuth_capi_operator_capi_helm_csi_cinder_default_availability_zone: europe-nl-ams1
+
+# Use the unencrypted volume type for Kubernetes volumes
+azimuth_capi_operator_capi_helm_root_volume_type: unencrypted
+azimuth_capi_operator_capi_helm_csi_cinder_default_volume_type: unencrypted
diff --git a/.github/environments/leafcloud/inventory/hosts b/.github/environments/leafcloud/inventory/hosts
new file mode 100644
index 00000000..9dcf1df6
--- /dev/null
+++ b/.github/environments/leafcloud/inventory/hosts
@@ -0,0 +1,2 @@
+[terraform_provision]
+localhost ansible_connection=local ansible_python_interpreter="{{ ansible_playbook_python }}"
diff --git a/.github/workflows/test-full.yml b/.github/workflows/test-full.yml
index
03117607..3bd29a95 100644 --- a/.github/workflows/test-full.yml +++ b/.github/workflows/test-full.yml @@ -3,6 +3,16 @@ name: Full test suite on: # Allow manual execution on any branch workflow_dispatch: + inputs: + target-cloud: + description: >- + The cloud to target for the run. + Leave blank to use the default cloud. + type: choice + options: + - "" + - arcus + - leafcloud jobs: # Tests a clean HA deployment + all appliances @@ -16,10 +26,11 @@ jobs: - name: Set up Azimuth environment uses: ./.github/actions/setup with: - os-clouds: ${{ secrets.CLOUD }} + os-clouds: ${{ secrets.OS_CLOUDS }} repository: ${{ github.repository }} ref: ${{ github.ref }} - config-environment: ci-ha + target-cloud: ${{ inputs.target-cloud || vars.TARGET_CLOUD }} + install-mode: ha environment-prefix: ci-ha - name: Provision Azimuth @@ -61,11 +72,12 @@ jobs: - name: Set up Azimuth environment uses: ./.actions/.github/actions/setup with: - os-clouds: ${{ secrets.CLOUD }} + os-clouds: ${{ secrets.OS_CLOUDS }} repository: ${{ github.repository }} ref: ${{ steps.latest-tag.outputs.tag-name }} - config-environment: ci-ha - environment-prefix: ci-az-upgrade + target-cloud: ${{ inputs.target-cloud || vars.TARGET_CLOUD }} + install-mode: ha + environment-prefix: ci-upgrade - name: Provision Azimuth uses: ./.actions/.github/actions/provision diff --git a/.github/workflows/test-singlenode.yml b/.github/workflows/test-singlenode.yml index d8bcc22d..36de605d 100644 --- a/.github/workflows/test-singlenode.yml +++ b/.github/workflows/test-singlenode.yml @@ -3,6 +3,16 @@ name: Single node test on: # Allow manual execution on any branch workflow_dispatch: + inputs: + target-cloud: + description: >- + The cloud to target for the run. + Leave blank to use the default cloud. + type: choice + options: + - "" + - arcus + - leafcloud # Execute by default on pull requests to the devel branch pull_request: types: @@ -48,9 +58,10 @@ jobs: - name: Set up Azimuth environment uses: ./.github/actions/setup with: - os-clouds: ${{ secrets.CLOUD }} + os-clouds: ${{ secrets.OS_CLOUDS }} repository: ${{ github.repository }} ref: ${{ github.ref }} + target-cloud: ${{ inputs.target-cloud || vars.TARGET_CLOUD }} - name: Provision Azimuth uses: ./.github/actions/provision diff --git a/bin/activate b/bin/activate index a87003f1..3affc7f2 100644 --- a/bin/activate +++ b/bin/activate @@ -17,11 +17,16 @@ AZIMUTH_CONFIG_ENVIRONMENT="$1" AZIMUTH_ENVIRONMENT="${2:-"$AZIMUTH_CONFIG_ENVIRONMENT"}" AZIMUTH_CONFIG_ROOT="$(dirname $(dirname $(realpath ${BASH_SOURCE[0]:-${(%):-%x}})))" -AZIMUTH_CONFIG_ENVIRONMENT_ROOT="$AZIMUTH_CONFIG_ROOT/environments/$AZIMUTH_CONFIG_ENVIRONMENT" -if [ ! 
-d "$AZIMUTH_CONFIG_ENVIRONMENT_ROOT" ]; then - echo "Unrecognised config environment '$AZIMUTH_CONFIG_ENVIRONMENT'" >&2 - return 1 +# Most environments should exist in a top-level "environments" directory, but CI +# environments live under .github to avoid cluttering the main directory +if [ -d "$AZIMUTH_CONFIG_ROOT/environments/$AZIMUTH_CONFIG_ENVIRONMENT" ]; then + AZIMUTH_CONFIG_ENVIRONMENT_ROOT="$AZIMUTH_CONFIG_ROOT/environments/$AZIMUTH_CONFIG_ENVIRONMENT" +elif [ -d "$AZIMUTH_CONFIG_ROOT/.github/environments/$AZIMUTH_CONFIG_ENVIRONMENT" ]; then + AZIMUTH_CONFIG_ENVIRONMENT_ROOT="$AZIMUTH_CONFIG_ROOT/.github/environments/$AZIMUTH_CONFIG_ENVIRONMENT" +else + echo "Unrecognised config environment '$AZIMUTH_CONFIG_ENVIRONMENT'" >&2 + return 1 fi ANSIBLE_CONFIG="$AZIMUTH_CONFIG_ENVIRONMENT_ROOT/ansible.cfg" diff --git a/environments/ci-ha/ansible.cfg b/environments/ci-ha/ansible.cfg deleted file mode 100644 index 3a71ec09..00000000 --- a/environments/ci-ha/ansible.cfg +++ /dev/null @@ -1,9 +0,0 @@ -[defaults] -inventory = ../base/inventory,../ha/inventory,../demo/inventory,../ci/inventory,./inventory -roles_path = ../../.ansible/roles -collections_path = ../../.ansible/collections - -host_key_checking = False - -[ssh_connection] -retries = 3 diff --git a/environments/ci/ansible.cfg b/environments/ci/ansible.cfg deleted file mode 100644 index ecb90adc..00000000 --- a/environments/ci/ansible.cfg +++ /dev/null @@ -1,9 +0,0 @@ -[defaults] -inventory = ../base/inventory,../singlenode/inventory,../demo/inventory,./inventory -roles_path = ../../.ansible/roles -collections_path = ../../.ansible/collections - -host_key_checking = False - -[ssh_connection] -retries = 3 From c3d302fe8151135d21821d4003fe1be3844cbb18 Mon Sep 17 00:00:00 2001 From: Matt Pryor Date: Thu, 29 Feb 2024 17:59:00 +0000 Subject: [PATCH 24/27] Fix seed-ssh for OpenTofu (#111) --- bin/seed-ssh | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/bin/seed-ssh b/bin/seed-ssh index 6a8e55a8..4ce7a57b 100755 --- a/bin/seed-ssh +++ b/bin/seed-ssh @@ -28,6 +28,18 @@ ansible_variable() { terraform_binary_directory="$(ansible_variable terraform_binary_directory)" export PATH="$terraform_binary_directory:$PATH" +# If tofu is available in the path, use that +if which tofu >/dev/null; then + terraform_binary_path="$(which tofu)" +elif which terraform >/dev/null; then + echo "OpenTofu is not installed - falling back to Terraform" >&2 + echo "This may cause issues, especially when downloading providers" >&2 + terraform_binary_path="$(which terraform)" +else + echo "Unable to find OpenTofu or Terraform" >&2 + exit 1 +fi + # Make a working directory for seed-ssh related stuff work_dir="$(ansible_variable work_directory)/seed-ssh" mkdir -p "$work_dir" @@ -48,7 +60,7 @@ terraform { } EOF ansible_variable terraform_backend_config > "$terraform_dir/backend_config.json" - terraform \ + $terraform_binary_path \ -chdir="$terraform_dir" \ init \ -input=false \ @@ -58,7 +70,7 @@ fi # Read the required variables from the Terraform state tfstate_file="$work_dir/tfstate" -terraform -chdir="$terraform_dir" state pull > "$tfstate_file" +$terraform_binary_path -chdir="$terraform_dir" state pull > "$tfstate_file" node_ip="$(jq -r '.outputs.cluster_gateway_ip.value // ""' "$tfstate_file")" deploy_key="$work_dir/deploy-key" jq -r '.outputs.cluster_ssh_private_key.value // ""' "$tfstate_file" > "$deploy_key" From dc24759da037982a38c6a81eadef59e657c7bb99 Mon Sep 17 00:00:00 2001 From: Matt Pryor Date: Fri, 1 Mar 2024 09:46:47 
+0000
Subject: [PATCH 25/27] Ignore HA-only and docs-only changes in PR workflow (#110)

---
 .github/workflows/test-singlenode.yml | 18 ++++++++++++++++--
 1 file changed, 16 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/test-singlenode.yml b/.github/workflows/test-singlenode.yml
index 36de605d..9309f881 100644
--- a/.github/workflows/test-singlenode.yml
+++ b/.github/workflows/test-singlenode.yml
@@ -22,8 +22,22 @@
       - reopened
     branches:
       - devel
-    paths-ignore:
-      - 'docs/**'
+    # Only run the tests when something changes that affects us
+    paths:
+      - .gitattributes
+      - .gitignore
+      - requirements.txt
+      - requirements.yml
+      - .github/actions/**
+      - .github/workflows/test-singlenode.yml
+      - bin/**
+      - "!bin/ci-exec"
+      - "!bin/create-merge-branch"
+      - "!bin/tilt-*"
+      - environments/base/**
+      - environments/singlenode/**
+      - environments/demo/**
+      - environments/ci/**
 
 jobs:
   # This job exists so that PRs from outside the main repo are rejected
From fa3faaf51b358b1d24735d14b93b6dfb019943f1 Mon Sep 17 00:00:00 2001
From: Matt Pryor
Date: Fri, 1 Mar 2024 12:03:07 +0000
Subject: [PATCH 26/27] Fix broken Kubernetes docs (#112)

---
 docs/configuration/03-kubernetes-config.md | 44 ++++++++++++----------
 1 file changed, 24 insertions(+), 20 deletions(-)

diff --git a/docs/configuration/03-kubernetes-config.md b/docs/configuration/03-kubernetes-config.md
index 0e47b9a9..4c61cf1a 100644
--- a/docs/configuration/03-kubernetes-config.md
+++ b/docs/configuration/03-kubernetes-config.md
@@ -138,7 +138,6 @@
 capi_cluster_addons_openstack_loadbalancer_provider: ovn
 # This variable applies to load-balancers created for LoadBalancer services
 azimuth_capi_operator_capi_helm_openstack_loadbalancer_provider: ovn
 ```
-```
 
 !!! tip
 
@@ -171,30 +170,12 @@ available.
 
 Cluster API refers to "failure domains" which, in the OpenStack provider, correspond
 to availability zones (AZs).
 
-### Use specific availability zones
-
-To specify the availability zones for Kubernetes nodes, the following variables can be used:
-
-```yaml title="environments/my-site/inventory/group_vars/all/variables.yml"
-#### For the HA cluster ####
-
-# A list of failure domains that should be considered for control plane nodes
-capi_cluster_control_plane_failure_domains: [az1, az2]
-# The failure domain for worker nodes
-capi_cluster_worker_failure_domain: az1
-
-#### For tenant clusters ####
-
-azimuth_capi_operator_capi_helm_control_plane_failure_domains: [az1, az2]
-azimuth_capi_operator_capi_helm_worker_failure_domain: az1
-```
 
 ### Ignore availability zones
 
 It is possible to configure Cluster API clusters in such a way that AZs are *not specified
 at all* for Kubernetes nodes. This allows other placement constraints such as
 [flavor traits](https://docs.openstack.org/nova/latest/user/flavors.html#extra-specs-required-traits)
-and [host aggregate](https://docs.openstack.org/nova/latest/admin/aggregates.html) to
+and [host aggregates](https://docs.openstack.org/nova/latest/admin/aggregates.html) to
 be used, and a suitable AZ to be selected by OpenStack.
 
 ```yaml title="environments/my-site/inventory/group_vars/all/variables.yml"
@@ -209,3 +190,26 @@ capi_cluster_worker_failure_domain: null
 azimuth_capi_operator_capi_helm_control_plane_omit_failure_domain: true
 azimuth_capi_operator_capi_helm_worker_failure_domain: null
 ```
+
+!!! tip
+
+    This is the recommended configuration for new deployments, unless you have a
+    specific need to target particular availability zones.
+ +### Use specific availability zones + +To use specific availability zones for Kubernetes nodes, the following variables can be used: + +```yaml title="environments/my-site/inventory/group_vars/all/variables.yml" +#### For the HA cluster #### + +# A list of failure domains that should be considered for control plane nodes +capi_cluster_control_plane_failure_domains: [az1, az2] +# The failure domain for worker nodes +capi_cluster_worker_failure_domain: az1 + +#### For tenant clusters #### + +azimuth_capi_operator_capi_helm_control_plane_failure_domains: [az1, az2] +azimuth_capi_operator_capi_helm_worker_failure_domain: az1 +``` From 19053f1226c3eff8ee0e55fa72084dfa4a285e36 Mon Sep 17 00:00:00 2001 From: Matt Pryor Date: Fri, 1 Mar 2024 12:03:54 +0000 Subject: [PATCH 27/27] Add TLS section to prerequisites (#113) --- docs/configuration/01-prerequisites.md | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/docs/configuration/01-prerequisites.md b/docs/configuration/01-prerequisites.md index b2c920cb..0909ca13 100644 --- a/docs/configuration/01-prerequisites.md +++ b/docs/configuration/01-prerequisites.md @@ -116,3 +116,15 @@ being deployed with a floating IP attached that routes traffic to the ingress co In order for traffic to be routed correctly for these domains, a **wildcard** DNS record must exist for `*.azimuth.example.org` that points at the floating IP of the load-balancer for the ingress controller. **Azimuth does not manage this DNS record.** + +## Transport Layer Security (TLS) + +In order to provide secure connections to users, Azimuth needs to be able to obtain a TLS +certificate and private key for any of the subdomains under its wildcard domain. + +This can be achieved in two ways: + + 1. Using a pre-existing wildcard TLS certificate for all subdomains + 2. Using an ACME server (e.g. Let's Encrypt) to issue certificates dynamically + +These approaches are discussed in more detail in the [Ingress section](../configuration/06-ingress.md).
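+
+As a rough sketch, the first option might look like the following. Note that the
+variable names shown here are illustrative assumptions rather than definitive
+settings - the [Ingress section](../configuration/06-ingress.md) is the
+authoritative reference:
+
+```yaml title="environments/my-site/inventory/group_vars/all/secrets.yml"
+# Assumed variable names for illustration - check the Ingress documentation before using
+# The PEM-encoded wildcard certificate, including any intermediate certificates
+ingress_tls_certificate: |
+  -----BEGIN CERTIFICATE-----
+  ...
+  -----END CERTIFICATE-----
+# The PEM-encoded private key - this is sensitive, so keep it encrypted if stored in Git
+ingress_tls_key: |
+  -----BEGIN PRIVATE KEY-----
+  ...
+  -----END PRIVATE KEY-----
+```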