Updating the Radius installation step of the long-haul workflow

Signed-off-by: ytimocin <[email protected]>
radius-project · Jan 10, 2024 · ca5b247 · ca5b247
1 parent be2bc0c
commit ca5b247
Showing 1 changed file with 52 additions and 11 deletions.
diff --git a/.github/workflows/long-running-azure.yaml b/.github/workflows/long-running-azure.yaml
@@ -45,11 +45,11 @@ on:
     branches:
       - main
     paths:
-      - '.github/workflows/long-running-azure.yaml'
+      - ".github/workflows/long-running-azure.yaml"
 
 env:
   # Go version
-  GOVER: '^1.21'
+  GOVER: "^1.21"
   GOPROXY: https://proxy.golang.org
 
   # gotestsum version - see: https://github.com/gotestyourself/gotestsum
@@ -70,23 +70,38 @@ env:
   # The region for AWS resources
   AWS_REGION: us-west-2
   # The AWS account ID
-  AWS_ACCOUNT_ID: '${{ secrets.FUNCTEST_AWS_ACCOUNT_ID }}'
+  AWS_ACCOUNT_ID: "${{ secrets.FUNCTEST_AWS_ACCOUNT_ID }}"
 
   # The valid radius build time window in seconds to rebuild radius. 24 hours = 24 * 60 * 60 = 86400
   VALID_RADIUS_BUILD_WINDOW: 86400
 
   # The AKS cluster name
-  AKS_CLUSTER_NAME: 'radiuse2e00-aks'
+  AKS_CLUSTER_NAME: "radiuse2e00-aks"
   # The resource group for AKS_CLUSTER_NAME resource.
-  AKS_RESOURCE_GROUP: 'radiuse2e00'
+  AKS_RESOURCE_GROUP: "radiuse2e00"
 
   # Server where terraform test modules are deployed
-  TF_RECIPE_MODULE_SERVER_URL: 'http://tf-module-server.radius-test-tf-module-server.svc.cluster.local'
+  TF_RECIPE_MODULE_SERVER_URL: "http://tf-module-server.radius-test-tf-module-server.svc.cluster.local"
 
   # Radius test environment name
-  RADIUS_TEST_ENVIRONMENT_NAME: 'kind-radius'
+  RADIUS_TEST_ENVIRONMENT_NAME: "kind-radius"
+
+  # The current GitHub action link
+  ACTION_LINK: "${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}"
 
 jobs:
+  # Run guard: the long-running-test-lock ConfigMap marks that a test run is in progress.
+  initial-checks:
+    name: Initial checks before running the tests
+    runs-on: ubuntu-latest  # NOTE(review): no cluster login step visible here — confirm kubectl access
+    steps:
+      - name: Check if tests are already running
+        run: |
+          # With --ignore-not-found, `kubectl get` exits 0 even when nothing matches, so test its output instead.
+          lock="$(kubectl get configmap long-running-test-lock -n radius-system --ignore-not-found -o name)"
+          [ -z "$lock" ] || { echo "Tests are already running. Exiting..."; exit 1; }
+      - name: Set test run flag
+        run: kubectl create configmap long-running-test-lock -n radius-system
   build:
     name: Build Radius for test
     runs-on: ubuntu-latest
@@ -368,6 +383,12 @@ jobs:
           export PATH=$GITHUB_WORKSPACE/bin:$PATH
           which rad || { echo "cannot find rad"; exit 1; }
 
+          echo "*** Uninstalling existing Radius installation ***"
+          rad uninstall kubernetes
+
+          echo "*** Deleting radius-system namespace ***"
+          kubectl delete namespace radius-system --ignore-not-found
+
           echo "*** Installing Radius to Kubernetes ***"
           rad install kubernetes --reinstall \
             --chart ${{ env.RADIUS_CHART_LOCATION }} \
@@ -388,10 +409,6 @@ jobs:
           rad env create ${{ env.RADIUS_TEST_ENVIRONMENT_NAME }} --namespace default
           rad env switch ${{ env.RADIUS_TEST_ENVIRONMENT_NAME }}
 
-          # Temporary workaround to fix the x509 certificate error in the controller.
-          # https://github.com/radius-project/radius/issues/6989
-          kubectl delete secrets controller-cert -n radius-system --ignore-not-found
-
           echo "*** Configuring Azure provider ***"
           rad env update ${{ env.RADIUS_TEST_ENVIRONMENT_NAME }} --azure-subscription-id ${{ secrets.INTEGRATION_TEST_SUBSCRIPTION_ID }} \
             --azure-resource-group ${{ env.AZURE_TEST_RESOURCE_GROUP }}
@@ -480,3 +497,32 @@ jobs:
       - name: Clean up cluster
         if: always()
         run: ./.github/scripts/cleanup-cluster.sh
+  # Releases the long-running-test-lock ConfigMap once the tests job has finished.
+  post-test-checks:
+    name: Checks to be applied after running the tests
+    # Without `needs`, this job is scheduled at workflow start rather than after the tests.
+    needs: [tests]
+    runs-on: ubuntu-latest
+    # Job-level always(): release the lock even when the tests job failed or was cancelled.
+    if: always()
+    steps:
+      - name: Clear test run flag
+        # The lock may already be gone: the workflow deletes the radius-system namespace before installing Radius.
+        run: kubectl delete configmap long-running-test-lock -n radius-system --ignore-not-found
+  report-failure:  # Opens a GitHub issue when the build or tests job fails.
+    name: Report test failure
+    needs: [build, tests]
+    runs-on: ubuntu-latest
+    if: failure() && github.repository == 'radius-project/radius'  # never file issues from forks
+    steps:
+      - name: Create failure issue for failing long running test run
+        uses: actions/github-script@v6
+        with:
+          github-token: ${{ secrets.GH_RAD_CI_BOT_PAT }}
+          script: |
+            await github.rest.issues.create({ // await so the step completes only after the request settles
+              ...context.repo,
+              title: `Scheduled long running test failed - Run ID: ${context.runId}`,
+              labels: ['bug', 'test-failure'],
+              body: `## Bug information \n\nThis bug is generated automatically if the scheduled long running test fails. The Radius long running test operates on a schedule of every 2 hours everyday. It's important to understand that the test may fail due to workflow infrastructure issues, like network problems, rather than the flakiness of the test itself. For the further investigation, please visit [here](${process.env.ACTION_LINK}).`
+            })