From 1a06994b9537ba43eeec3b60603adf69cc8807a6 Mon Sep 17 00:00:00 2001 From: Young Bu Park Date: Thu, 21 Mar 2024 21:17:34 -0700 Subject: [PATCH] Migrate long-running test to new cluster to enable monitoring (#7357) --- .github/scripts/cleanup-cluster.sh | 28 ++++++------ .github/workflows/long-running-azure.yaml | 12 +++--- test/infra/azure/bicepconfig.json | 8 ++-- test/infra/azure/main.bicep | 2 +- test/infra/azure/modules/akscluster.bicep | 52 +---------------------- 5 files changed, 30 insertions(+), 72 deletions(-) diff --git a/.github/scripts/cleanup-cluster.sh b/.github/scripts/cleanup-cluster.sh index 21868b638f..32150caba9 100755 --- a/.github/scripts/cleanup-cluster.sh +++ b/.github/scripts/cleanup-cluster.sh @@ -21,22 +21,26 @@ set -e echo "cleaning up cluster" # Delete all test resources in queuemessages. -echo "delete all resources in queuemessages.ucp.dev" -kubectl delete queuemessages.ucp.dev -n radius-system --all +if kubectl get crd queuemessages.ucp.dev > /dev/null 2>&1; then + echo "delete all resources in queuemessages.ucp.dev" + kubectl delete queuemessages.ucp.dev -n radius-system --all +fi # Testing deletion of deployment.apps. # Delete all test resources in resources without proxy resource. -echo "delete all resources in resources.ucp.dev" -resources=$(kubectl get resources.ucp.dev -n radius-system --no-headers -o custom-columns=":metadata.name") -for r in $resources; do - if [[ $r == scope.local.* || $r == scope.aws.* || -z "$r" ]]; then - echo "skip deletion: $r" - else - echo "delete resource: $r" - kubectl delete resources.ucp.dev $r -n radius-system --ignore-not-found=true - fi -done +if kubectl get crd resources.ucp.dev > /dev/null 2>&1; then + echo "delete all resources in resources.ucp.dev" + resources=$(kubectl get resources.ucp.dev -n radius-system --no-headers -o custom-columns=":metadata.name") + for r in $resources; do + if [[ $r == scope.local.* || $r == scope.aws.* || -z "$r" ]]; then + echo "skip deletion: $r" + else + echo "delete resource: $r" + kubectl delete resources.ucp.dev $r -n radius-system --ignore-not-found=true + fi + done +fi # Delete all test namespaces. echo "delete all test namespaces" diff --git a/.github/workflows/long-running-azure.yaml b/.github/workflows/long-running-azure.yaml index e43c7013d7..b633255c1c 100644 --- a/.github/workflows/long-running-azure.yaml +++ b/.github/workflows/long-running-azure.yaml @@ -34,7 +34,7 @@ # the previous build is still valid. If valid, the workflow skips the build steps # and uses the cached 'rad cli' for testing. # -# Grafana dashboard URL: https://radiuse2e00-dashboard-audycmffgberbghy.wus3.grafana.azure.com/ +# Grafana dashboard URL: https://radlrtest00-dashboard-e4ffc0cwggchdhba.wus3.grafana.azure.com name: Long-running test on Azure @@ -44,6 +44,8 @@ permissions: packages: write # Required for uploading the package on: + # Enable manual trigger to deploy the latest changes from main. + workflow_dispatch: schedule: # Run every 2 hours - cron: "0 */2 * * *" @@ -82,9 +84,9 @@ env: VALID_RADIUS_BUILD_WINDOW: 86400 # The AKS cluster name - AKS_CLUSTER_NAME: "radiuse2e00-aks" + AKS_CLUSTER_NAME: "radlrtest00-aks" # The resource group for AKS_CLUSTER_NAME resource. - AKS_RESOURCE_GROUP: "radiuse2e00" + AKS_RESOURCE_GROUP: "radlrtest00" # Server where terraform test modules are deployed TF_RECIPE_MODULE_SERVER_URL: "http://tf-module-server.radius-test-tf-module-server.svc.cluster.local" @@ -115,7 +117,7 @@ jobs: path: ./dist/cache key: radius-test-latest- - name: Skip build if build is still valid - if: github.event_name != 'pull_request' + if: github.event_name != 'pull_request' && github.event_name != 'workflow_dispatch' id: skip-build run: | # check if the last build time to see if we need to build again @@ -499,7 +501,7 @@ jobs: name: Report test failure needs: [build, tests] runs-on: ubuntu-latest - if: failure() && github.repository == 'radius-project/radius' + if: failure() && github.repository == 'radius-project/radius' && github.event_name == 'schedule' steps: - name: Create failure issue for failing long running test run uses: actions/github-script@v6 diff --git a/test/infra/azure/bicepconfig.json b/test/infra/azure/bicepconfig.json index 3b5f42eeef..fcebadba45 100644 --- a/test/infra/azure/bicepconfig.json +++ b/test/infra/azure/bicepconfig.json @@ -1,5 +1,5 @@ { - "experimentalFeaturesEnabled": { - "extensibility": true - } - } \ No newline at end of file + "experimentalFeaturesEnabled": { + "extensibility": true + } +} \ No newline at end of file diff --git a/test/infra/azure/main.bicep b/test/infra/azure/main.bicep index 080d9813a0..963ffcad72 100644 --- a/test/infra/azure/main.bicep +++ b/test/infra/azure/main.bicep @@ -90,7 +90,7 @@ module aksCluster './modules/akscluster.bicep' = { params: { name: aksClusterName location: location - kubernetesVersion: '1.28.3' + kubernetesVersion: '1.28.5' logAnalyticsWorkspaceId: logAnalyticsWorkspace.outputs.id systemAgentPoolName: 'agentpool' systemAgentPoolVmSize: 'Standard_D4as_v5' diff --git a/test/infra/azure/modules/akscluster.bicep b/test/infra/azure/modules/akscluster.bicep index cc298d7eaf..9b33443009 100644 --- a/test/infra/azure/modules/akscluster.bicep +++ b/test/infra/azure/modules/akscluster.bicep @@ -307,9 +307,6 @@ param autoScalerProfileMaxGracefulTerminationSec string = '600' @description('Specifies the resource id of the Log Analytics workspace.') param logAnalyticsWorkspaceId string -@description('Specifies the workspace data retention in days.') -param retentionInDays int = 30 - @description('Specifies the location.') param location string = resourceGroup().location @@ -340,42 +337,8 @@ param imageCleanerIntervalHours int = 24 @description('Specifies whether to enable Workload Identity. The default value is false.') param workloadIdentityEnabled bool = false -// Variables -var diagnosticSettingsName = 'diagnosticSettings' -var logCategories = [ - 'kube-apiserver' - 'kube-audit' - 'kube-audit-admin' - 'kube-controller-manager' - 'kube-scheduler' - 'cluster-autoscaler' - 'cloud-controller-manager' - 'guard' - 'csi-azuredisk-controller' - 'csi-azurefile-controller' - 'csi-snapshot-controller' -] -var metricCategories = [ - 'AllMetrics' -] -var logs = [for category in logCategories: { - category: category - enabled: true - retentionPolicy: { - enabled: true - days: retentionInDays - } -}] -var metrics = [for category in metricCategories: { - category: category - enabled: true - retentionPolicy: { - enabled: true - days: retentionInDays - } -}] -resource aksCluster 'Microsoft.ContainerService/managedClusters@2023-05-01' = { +resource aksCluster 'Microsoft.ContainerService/managedClusters@2023-10-01' = { name: name location: location tags: tags @@ -515,7 +478,7 @@ resource aksCluster 'Microsoft.ContainerService/managedClusters@2023-05-01' = { } // Dapr Extension -resource daprExtension 'Microsoft.KubernetesConfiguration/extensions@2022-04-02-preview' = if (daprEnabled) { +resource daprExtension 'Microsoft.KubernetesConfiguration/extensions@2022-07-01' = if (daprEnabled) { name: 'dapr' scope: aksCluster properties: { @@ -534,17 +497,6 @@ resource daprExtension 'Microsoft.KubernetesConfiguration/extensions@2022-04-02- } } -// Diagnostic Settings -resource diagnosticSettings 'Microsoft.Insights/diagnosticSettings@2021-05-01-preview' = { - name: diagnosticSettingsName - scope: aksCluster - properties: { - workspaceId: logAnalyticsWorkspaceId - logs: logs - metrics: metrics - } -} - // Output output id string = aksCluster.id output name string = aksCluster.name