From 66fa1824559a435423db5496236de9074d2bb991 Mon Sep 17 00:00:00 2001
From: ytimocin
Date: Tue, 9 Jan 2024 23:21:05 -0800
Subject: [PATCH] Updating the Radius installation step of the long-haul workflow

Signed-off-by: ytimocin
---
Review notes (text between the three-dash line and the diffstat is not part
of the commit message):

 * The lock check now tests for non-empty output. `kubectl get` with
   `--ignore-not-found` exits 0 even when the configmap does not exist, so
   the previous exit-status check reported "already running" on every run.
 * The lock configmap lives in the `default` namespace. This patch deletes
   and recreates `radius-system` during installation, which would remove a
   lock stored there and make the final lock deletion fail.
 * "Clear test run flag" only runs when this run created the lock, so a run
   that was blocked by an existing lock cannot release another run's lock.
 * Not changed here: the existing "Clean up cluster" step still runs under
   `if: always()`, including for a run that was blocked by the lock.
 * Hunk headers and the diffstat were recomputed for the added lines. The
   blob ids on the index line still refer to the original commit.

 .github/workflows/long-running-azure.yaml | 64 +++++++++++++++++++----
 1 file changed, 53 insertions(+), 11 deletions(-)

diff --git a/.github/workflows/long-running-azure.yaml b/.github/workflows/long-running-azure.yaml
index 5a73d95ef06..e1a119ea5af 100644
--- a/.github/workflows/long-running-azure.yaml
+++ b/.github/workflows/long-running-azure.yaml
@@ -45,11 +45,11 @@ on:
     branches:
       - main
     paths:
-      - '.github/workflows/long-running-azure.yaml'
+      - ".github/workflows/long-running-azure.yaml"
 
 env:
   # Go version
-  GOVER: '^1.21'
+  GOVER: "^1.21"
   GOPROXY: https://proxy.golang.org
 
   # gotestsum version - see: https://github.com/gotestyourself/gotestsum
@@ -70,21 +70,24 @@ env:
   # The region for AWS resources
   AWS_REGION: us-west-2
   # The AWS account ID
-  AWS_ACCOUNT_ID: '${{ secrets.FUNCTEST_AWS_ACCOUNT_ID }}'
+  AWS_ACCOUNT_ID: "${{ secrets.FUNCTEST_AWS_ACCOUNT_ID }}"
 
   # The valid radius build time window in seconds to rebuild radius. 24 hours = 24 * 60 * 60 = 86400
   VALID_RADIUS_BUILD_WINDOW: 86400
 
   # The AKS cluster name
-  AKS_CLUSTER_NAME: 'radiuse2e00-aks'
+  AKS_CLUSTER_NAME: "radiuse2e00-aks"
   # The resource group for AKS_CLUSTER_NAME resource.
-  AKS_RESOURCE_GROUP: 'radiuse2e00'
+  AKS_RESOURCE_GROUP: "radiuse2e00"
 
   # Server where terraform test modules are deployed
-  TF_RECIPE_MODULE_SERVER_URL: 'http://tf-module-server.radius-test-tf-module-server.svc.cluster.local'
+  TF_RECIPE_MODULE_SERVER_URL: "http://tf-module-server.radius-test-tf-module-server.svc.cluster.local"
 
   # Radius test environment name
-  RADIUS_TEST_ENVIRONMENT_NAME: 'kind-radius'
+  RADIUS_TEST_ENVIRONMENT_NAME: "kind-radius"
+
+  # The current GitHub action link
+  ACTION_LINK: "${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}"
 
 jobs:
   build:
@@ -349,6 +352,18 @@ jobs:
             --name ${{ env.AKS_CLUSTER_NAME }} --admin
         env:
           RESOURCE_GROUP: ${{ env.AZURE_TEST_RESOURCE_GROUP }}
+      - name: Check if tests are already running
+        run: |
+          # `kubectl get --ignore-not-found` exits 0 even when the lock is absent,
+          # so test for non-empty output instead of the exit status.
+          if [ -n "$(kubectl get configmap long-running-test-lock -n default --ignore-not-found -o name)" ]; then
+            echo "Tests are already running. Exiting..."
+            exit 1
+          fi
+      - name: Set test run flag
+        id: set_test_lock
+        # The lock lives in `default` because radius-system is deleted later in this job.
+        run: kubectl create configmap long-running-test-lock -n default
       - name: Clean up cluster
         run: ./.github/scripts/cleanup-cluster.sh
       - name: Download Bicep
@@ -368,6 +383,12 @@ jobs:
           export PATH=$GITHUB_WORKSPACE/bin:$PATH
           which rad || { echo "cannot find rad"; exit 1; }
 
+          echo "*** Uninstalling existing Radius installation ***"
+          rad uninstall kubernetes
+
+          echo "*** Deleting radius-system namespace ***"
+          kubectl delete namespace radius-system --ignore-not-found
+
           echo "*** Installing Radius to Kubernetes ***"
           rad install kubernetes --reinstall \
             --chart ${{ env.RADIUS_CHART_LOCATION }} \
@@ -388,10 +409,6 @@ jobs:
           rad env create ${{ env.RADIUS_TEST_ENVIRONMENT_NAME }} --namespace default
           rad env switch ${{ env.RADIUS_TEST_ENVIRONMENT_NAME }}
 
-          # Temporary workaround to fix the x509 certificate error in the controller.
-          # https://github.com/radius-project/radius/issues/6989
-          kubectl delete secrets controller-cert -n radius-system --ignore-not-found
-
           echo "*** Configuring Azure provider ***"
           rad env update ${{ env.RADIUS_TEST_ENVIRONMENT_NAME }} --azure-subscription-id ${{ secrets.INTEGRATION_TEST_SUBSCRIPTION_ID }} \
             --azure-resource-group ${{ env.AZURE_TEST_RESOURCE_GROUP }}
@@ -480,3 +497,28 @@ jobs:
       - name: Clean up cluster
         if: always()
         run: ./.github/scripts/cleanup-cluster.sh
+      - name: Clear test run flag
+        # Release the lock only if this run created it, so a run that was
+        # blocked by an existing lock does not remove another run's lock.
+        if: always() && steps.set_test_lock.outcome == 'success'
+        run: |
+          # At this point, there has to be a lock, so this deletion command should not fail.
+          # If it fails, then there is a bug in the workflow.
+          kubectl delete configmap long-running-test-lock -n default
+  report-failure:
+    name: Report test failure
+    needs: [build, tests]
+    runs-on: ubuntu-latest
+    if: failure() && github.repository == 'radius-project/radius'
+    steps:
+      - name: Create failure issue for failing long running test run
+        uses: actions/github-script@v6
+        with:
+          github-token: ${{ secrets.GH_RAD_CI_BOT_PAT }}
+          script: |
+            github.rest.issues.create({
+              ...context.repo,
+              title: `Scheduled long running test failed - Run ID: ${context.runId}`,
+              labels: ['bug', 'test-failure'],
+              body: `## Bug information \n\nThis bug is generated automatically if the scheduled long running test fails. The Radius long running test operates on a schedule of every 2 hours everyday. It's important to understand that the test may fail due to workflow infrastructure issues, like network problems, rather than the flakiness of the test itself. For the further investigation, please visit [here](${process.env.ACTION_LINK}).`
+            })