review phoenix alerts

giantswarm · Jun 10, 2024 · c64ff05 · c64ff05
1 parent 6a20ebf
commit c64ff05
Show file tree

Hide file tree

Showing 12 changed files with 111 additions and 57 deletions.
diff --git a/...etheus-rules/templates/kaas/phoenix/alerting-rules/aws-load-balancer-controller.rules.yml b/...etheus-rules/templates/kaas/phoenix/alerting-rules/aws-load-balancer-controller.rules.yml
@@ -1,4 +1,4 @@
-{{- if eq .Values.managementCluster.provider.kind "aws" }}
+# This rule applies to vintage aws and capa workload clusters
 apiVersion: monitoring.coreos.com/v1
 kind: PrometheusRule
 metadata:
@@ -18,10 +18,10 @@ spec:
       annotations:
         description: '{{`AWS load balancer controller pod {{ $labels.namespace}}/{{ $labels.pod }} on {{ $labels.cluster_id}} is throwing {{ $labels.error_code }} errors when contacting AWS API.`}}'
         opsrecipe: alb-errors
-      expr: sum(increase(aws_api_calls_total{error_code != ""}[20m])) by (error_code,namespace,pod,cluster_id) > 0
+      expr: sum(increase(aws_api_calls_total{cluster_type="workload_cluster", error_code != "", provider=~"aws|capa|eks"}[20m])) by (cluster_id, error_code, installation, namespace, pipeline, provider, pod) > 0
       for: 40m
       labels:
-        area: managedservices
+        area: kaas
         cancel_if_cluster_status_creating: "true"
         cancel_if_cluster_status_deleting: "true"
         cancel_if_cluster_status_updating: "true"
@@ -33,15 +33,14 @@ spec:
       annotations:
         description: '{{`AWS load balancer controller pod {{ $labels.namespace }}/{{ $labels.pod }} on {{ $labels.cluster_id }} is throwing errors while reconciling the {{ $labels.controller }} controller.`}}'
         opsrecipe: alb-errors
-      expr: sum(increase(controller_runtime_reconcile_total{service="aws-load-balancer-controller", result = "error"}[20m])) by (controller,namespace,pod,cluster_id) > 0
+      expr: sum(increase(controller_runtime_reconcile_total{cluster_type="workload_cluster", provider=~"aws|capa|eks", result = "error", service="aws-load-balancer-controller"}[20m])) by (cluster_id, controller, installation, namespace, pipeline, provider, pod) > 0
       for: 40m
       labels:
-        area: managedservices
+        area: kaas
         cancel_if_cluster_status_creating: "true"
         cancel_if_cluster_status_deleting: "true"
         cancel_if_cluster_status_updating: "true"
         cancel_if_outside_working_hours: {{ include "workingHoursOnly" . }}
         severity: page
         team: phoenix
         topic: alb
-{{- end }}
diff --git a/helm/prometheus-rules/templates/kaas/phoenix/alerting-rules/aws.management-cluster.rules.yml b/helm/prometheus-rules/templates/kaas/phoenix/alerting-rules/aws.management-cluster.rules.yml
@@ -1,4 +1,6 @@
-{{- if eq .Values.managementCluster.provider.kind "aws" }}
+## TODO Remove with vintage
+# This rule applies to vintage aws management clusters
+{{- if eq .Values.managementCluster.provider.flavor "vintage" }}
 apiVersion: monitoring.coreos.com/v1
 kind: PrometheusRule
 metadata:
@@ -161,32 +163,4 @@ spec:
         severity: page
         team: phoenix
         topic: kubernetes
-    - alert: IRSATooManyErrors
-      annotations:
-        description: '{{`IRSA bootstrap for Cluster {{ $labels.cluster_id }} failed.`}}'
-        opsrecipe: irsa-operator-error/
-      expr: irsa_operator_cluster_errors > 0
-      for: 10m
-      labels:
-        area: kaas
-        cancel_if_cluster_status_creating: "true"
-        cancel_if_cluster_status_deleting: "true"
-        cancel_if_outside_working_hours: "true"
-        severity: page
-        team: phoenix
-        topic: aws
-    - alert: IRSAACMCertificateExpiringInLessThan60Days
-      annotations:
-        description: '{{`IRSA ACM certificate for Cluster {{ $labels.cluster_id }} ({{ $labels.certificate_name }}) will expire in less than 2 months.`}}'
-        opsrecipe: irsa-acm-certificate-expiring/
-      expr: min(irsa_operator_acm_certificate_not_after) by (cluster_id, installation, pipeline, provider, certificate_name) - time() < 5184000
-      for: 10m
-      labels:
-        area: kaas
-        cancel_if_cluster_status_creating: "true"
-        cancel_if_cluster_status_deleting: "true"
-        cancel_if_outside_working_hours: "true"
-        severity: page
-        team: phoenix
-        topic: aws
 {{- end }}
diff --git a/helm/prometheus-rules/templates/kaas/phoenix/alerting-rules/aws.workload-cluster.rules.yml b/helm/prometheus-rules/templates/kaas/phoenix/alerting-rules/aws.workload-cluster.rules.yml
@@ -1,4 +1,4 @@
-{{- if eq .Values.managementCluster.provider.kind "aws" }}
+{{- if or (eq .Values.managementCluster.provider.kind "aws") (eq .Values.managementCluster.provider.kind "capa") }}
 apiVersion: monitoring.coreos.com/v1
 kind: PrometheusRule
 metadata:
@@ -13,11 +13,12 @@ spec:
   groups:
   - name: aws
     rules:
+    ## TODO review this one (cluster-autoscaler, etcd-kubernetes-resources-count-exporter should be turtles)
     - alert: WorkloadClusterContainerIsRestartingTooFrequentlyAWS
       annotations:
         description: '{{`Container {{ $labels.container }} in pod {{ $labels.namespace }}/{{ $labels.pod }} is restarting too often.`}}'
         opsrecipe: container-is-restarting-too-often/
-      expr: label_join(increase(kube_pod_container_status_restarts_total{container=~"aws-node.*|kiam-agent.*|kiam-server.*|cluster-autoscaler.*|ebs-plugin.*|aws-pod-identity-webhook.*|etcd-kubernetes-resources-count-exporter.*"}[1h]),"service","/","namespace","pod") > 10
+      expr: label_join(increase(kube_pod_container_status_restarts_total{container=~"aws-node.*|kiam-agent.*|kiam-server.*|cluster-autoscaler.*|ebs-(plugin|csi).*|aws-pod-identity-webhook.*|etcd-kubernetes-resources-count-exporter.*"}[1h]),"service","/","namespace","pod") > 10
       for: 10m
       labels:
         area: kaas
@@ -29,10 +30,13 @@ spec:
         severity: page
         team: phoenix
         topic: kubernetes
+    ## TODO Remove with Vintage
+    {{- if eq .Values.managementCluster.provider.kind "aws"}}
     - alert: WorkloadClusterCriticalPodNotRunningAWS
       annotations:
         description: '{{`Critical pod {{ $labels.namespace }}/{{ $labels.pod }} is not running.`}}'
         opsrecipe: critical-pod-is-not-running/
+      ### Those pods only exist in vintage
       expr: kube_pod_container_status_running{namespace="kube-system",container=~"(k8s-api-server|k8s-controller-manager|k8s-scheduler)"} != 1 or label_replace(absent(kube_pod_container_status_running{namespace="kube-system",container="k8s-api-server"}), "pod", "$1", "container", "(.+)") == 1 or label_replace(absent(kube_pod_container_status_running{namespace="kube-system",container="k8s-controller-manager"}), "pod", "$1", "container", "(.+)") == 1 or label_replace(absent(kube_pod_container_status_running{namespace="kube-system",container="k8s-scheduler"}), "pod", "$1", "container", "(.+)") == 1
       for: 20m
       labels:
@@ -43,6 +47,8 @@ spec:
         severity: page
         team: phoenix
         topic: kubernetes
+    {{- end }}
+    ## TODO review this one. Should it be turtles?
     - alert: WorkloadClusterControlPlaneNodeMissingAWS
       annotations:
         description: '{{`Control plane node is missing.`}}'
@@ -57,6 +63,7 @@ spec:
         severity: page
         team: phoenix
         topic: kubernetes
+    ## TODO review this one. Should it be turtles?
     - alert: WorkloadClusterHAControlPlaneDownForTooLong
       annotations:
         description: '{{`Control plane node in HA setup is down for a long time.`}}'
@@ -72,11 +79,12 @@ spec:
         severity: page
         team: phoenix
         topic: kubernetes
+    ## TODO review this one (cluster-autoscaler, etcd-kubernetes-resources-count-exporter should be turtles)
     - alert: WorkloadClusterPodPendingAWS
       annotations:
         description: '{{`Pod {{ $labels.namespace }}/{{ $labels.pod }} is stuck in Pending.`}}'
         opsrecipe: pod-stuck-in-pending/
-      expr: kube_pod_status_phase{namespace="kube-system",pod=~"(aws-node.*|kiam-agent.*|kiam-server.*|cluster-autoscaler.*|ebs-csi-.*)",phase="Pending"} == 1
+      expr: kube_pod_status_phase{namespace="kube-system",pod=~"(aws-node.*|kiam-agent.*|kiam-server.*|cluster-autoscaler.*|ebs-(plugin|csi).*)",phase="Pending"} == 1
       for: 15m
       labels:
         area: kaas
@@ -88,9 +96,11 @@ spec:
         cancel_if_cluster_has_no_workers: "true"
         severity: page
         team: phoenix
+    ## TODO Remove with Vintage
+    {{- if eq .Values.managementCluster.provider.kind "aws"}}
     - alert: WorkloadClusterAWSCNIIpAlmostExhausted
       annotations:
-        description: '{{`IPs exhausted for aws-cni subnet {{ $labels.id }} in AZ {{ $labels.availabvility_zone }}.`}}'
+        description: '{{`IPs exhausted for aws-cni subnet {{ $labels.id }} in AZ {{ $labels.availability_zone }}.`}}'
         opsrecipe: aws-ips-exhausted/
       expr: min(aws_operator_subnet_available_ips_percentage{subnet_type="aws-cni"}) by (account, availability_zone, cluster_id, id) < 0.1
       for: 5m
@@ -111,4 +121,5 @@ spec:
         severity: page
         team: phoenix
         topic: workloadcluster
+    {{- end }}
 {{- end }}
diff --git a/.../prometheus-rules/templates/kaas/phoenix/alerting-rules/capa.management-cluster.rules.yml b/.../prometheus-rules/templates/kaas/phoenix/alerting-rules/capa.management-cluster.rules.yml
@@ -1,3 +1,4 @@
+# This rule applies to capa management clusters
 {{- if eq .Values.managementCluster.provider.kind "capa" }}
 apiVersion: monitoring.coreos.com/v1
 kind: PrometheusRule
@@ -12,7 +13,7 @@ metadata:
   namespace: {{ .Values.namespace  }}
 spec:
   groups:
-  - name: capa
+  - name: capa.management-cluster
     rules:
     - alert: ManagementClusterPodPendingCAPA
       annotations:
@@ -62,18 +63,4 @@ spec:
         severity: page
         team: phoenix
         topic: kubernetes
-    - alert: IRSATooManyErrors
-      annotations:
-        description: '{{`IRSA bootstrap for Cluster {{ $labels.cluster_id }} failed.`}}'
-        opsrecipe: irsa-operator-error/
-        dashboard: bdiako8tt1b7kc/capa-agregated-error-logs-for-capa-controllers
-      expr: irsa_operator_cluster_errors > 0
-      for: 10m
-      labels:
-        area: kaas
-        cancel_if_kube_state_metrics_down: "true"
-        cancel_if_outside_working_hours: "true"
-        severity: page
-        team: phoenix
-        topic: aws
 {{- end }}
diff --git a/helm/prometheus-rules/templates/kaas/phoenix/alerting-rules/credentiald.rules.yml b/helm/prometheus-rules/templates/kaas/phoenix/alerting-rules/credentiald.rules.yml
@@ -1,11 +1,13 @@
+## TODO Remove with vintage
+# This rule applies to vintage aws management clusters
 {{- if eq .Values.managementCluster.provider.flavor "vintage" }}
 apiVersion: monitoring.coreos.com/v1
 kind: PrometheusRule
 metadata:
   creationTimestamp: null
   labels:
     {{- include "labels.common" . | nindent 4 }}
-    # Kept because it will be gone with Vintage
+    # No need for .Values.mimir.enabled condition - will be gone with Vintage
     cluster_type: "management_cluster"
   name: credentiald.rules
   namespace: {{ .Values.namespace  }}

diff --git a/...heus-rules/templates/kaas/phoenix/alerting-rules/inhibit.aws.management-cluster.rules.yml b/...heus-rules/templates/kaas/phoenix/alerting-rules/inhibit.aws.management-cluster.rules.yml
@@ -1,3 +1,5 @@
+## TODO Remove with vintage
+# This rule applies to vintage aws management clusters
 {{- if eq .Values.managementCluster.provider.flavor "vintage" }}
 ## TODO Remove when all vintage installations are gone
 # This rule applies to vintage aws management clusters

diff --git a/helm/prometheus-rules/templates/kaas/phoenix/alerting-rules/inhibit.kiam.rules.yml b/helm/prometheus-rules/templates/kaas/phoenix/alerting-rules/inhibit.kiam.rules.yml
@@ -1,6 +1,9 @@
 {{- if eq .Values.managementCluster.provider.flavor "vintage" }}
+<<<<<<< HEAD
 ## TODO Remove when all vintage installations are gone
 # This rule applies to vintage aws clusters
+=======
+>>>>>>> ba55ada (review phoenix alerts)
 apiVersion: monitoring.coreos.com/v1
 kind: PrometheusRule
 metadata:

diff --git a/helm/prometheus-rules/templates/kaas/phoenix/alerting-rules/irsa.rules.yml b/helm/prometheus-rules/templates/kaas/phoenix/alerting-rules/irsa.rules.yml
@@ -0,0 +1,52 @@
+# This rule applies to vintage aws or capa management clusters
+{{- if or (eq .Values.managementCluster.provider.kind "aws") (eq .Values.managementCluster.provider.kind "capa") }}
+apiVersion: monitoring.coreos.com/v1
+kind: PrometheusRule
+metadata:
+  labels:
+    {{- include "labels.common" . | nindent 4 }}
+    # The cluster_type label is only set explicitly when Mimir is disabled
+{{- if not .Values.mimir.enabled }}
+    cluster_type: "management_cluster"
+{{- end }}
+  name: irsa.rules
+  namespace: {{ .Values.namespace  }}
+spec:
+  groups:
+  - name: irsa-operator
+    rules:
+    - alert: IRSATooManyErrors
+      annotations:
+        description: '{{`IRSA bootstrap for Cluster {{ $labels.cluster_id }} failed.`}}'
+        opsrecipe: irsa-operator-error/
+        ## TODO remove this when all clusters are migrated to capi
+        {{- if eq .Values.managementCluster.provider.flavor "capi" }}
+        dashboard: bdiako8tt1b7kc/capa-agregated-error-logs-for-capa-controllers
+        {{- end }}
+      expr: irsa_operator_cluster_errors{cluster_type="management_cluster"} > 0
+      for: 10m
+      labels:
+        area: kaas
+        cancel_if_kube_state_metrics_down: "true"
+        cancel_if_cluster_status_creating: "true"
+        cancel_if_cluster_status_deleting: "true"
+        cancel_if_outside_working_hours: "true"
+        severity: page
+        team: phoenix
+        topic: aws
+    - alert: IRSAACMCertificateExpiringInLessThan60Days
+      annotations:
+        description: '{{`IRSA ACM certificate for Cluster {{ $labels.cluster_id }} ({{ $labels.certificate_name }}) will expire in less than 2 months.`}}'
+        opsrecipe: irsa-acm-certificate-expiring/
+      # 5184000 seconds = 60 days
+      expr: min(irsa_operator_acm_certificate_not_after{cluster_type="management_cluster"}) by (cluster_id, installation, pipeline, provider, certificate_name) - time() < 5184000
+      for: 10m
+      labels:
+        area: kaas
+        cancel_if_cluster_status_creating: "true"
+        cancel_if_cluster_status_deleting: "true"
+        cancel_if_outside_working_hours: "true"
+        severity: page
+        team: phoenix
+        topic: aws
+{{- end }}
diff --git a/helm/prometheus-rules/templates/kaas/phoenix/alerting-rules/kiam.rules.yml b/helm/prometheus-rules/templates/kaas/phoenix/alerting-rules/kiam.rules.yml
@@ -1,4 +1,6 @@
-{{- if eq .Values.managementCluster.provider.kind "aws" }}
+## TODO Remove with vintage
+# This rule applies to vintage aws workload clusters
+{{- if eq .Values.managementCluster.provider.flavor "vintage" }}
 apiVersion: monitoring.coreos.com/v1
 kind: PrometheusRule
 metadata:
@@ -20,7 +22,7 @@ spec:
       expr: increase(kiam_metadata_find_role_errors_total[10m]) > 0
       for: 15m
       labels:
-        area: managedservices
+        area: kaas
         cancel_if_cluster_status_creating: "true"
         cancel_if_cluster_status_deleting: "true"
         cancel_if_cluster_status_updating: "true"

diff --git a/helm/prometheus-rules/templates/kaas/turtles/alerting-rules/inhibit.capi.rules.yml b/helm/prometheus-rules/templates/kaas/turtles/alerting-rules/inhibit.capi.rules.yml
@@ -1,4 +1,7 @@
+<<<<<<< HEAD
 {{- if eq .Values.managementCluster.provider.flavor "capi" }}
+=======
+>>>>>>> ba55ada (review phoenix alerts)
 # This rule applies to all capi management clusters
 apiVersion: monitoring.coreos.com/v1
 kind: PrometheusRule
@@ -27,4 +30,7 @@ spec:
         cluster_control_plane_unhealthy: "true"
         team: turtles
         topic: status
+<<<<<<< HEAD
 {{- end }}
+=======
+>>>>>>> ba55ada (review phoenix alerts)
diff --git a/helm/prometheus-rules/templates/kaas/turtles/alerting-rules/inhibit.kubelet.rules.yml b/helm/prometheus-rules/templates/kaas/turtles/alerting-rules/inhibit.kubelet.rules.yml
@@ -1,4 +1,7 @@
+<<<<<<< HEAD
 # This rule applies to all clusters
+=======
+>>>>>>> ba55ada (review phoenix alerts)
 apiVersion: monitoring.coreos.com/v1
 kind: PrometheusRule
 metadata:
@@ -9,14 +12,26 @@ metadata:
   namespace: {{ .Values.namespace  }}
 spec:
   groups:
+<<<<<<< HEAD
   - name: inhibit.kubelet
     rules:
     - alert: InhibitionKubeletDown
       annotations:
         description: '{{`Kubelet ({{ $labels.instance }}) is down.`}}'
+=======
+  - name: inhibit.all
+    rules:
+    - alert: InhibitionKubeletDown
+>>>>>>> ba55ada (review phoenix alerts)
       expr: label_replace(up{app="kubelet"}, "ip", "$1", "instance", "(.+):\\d+") == 0
       labels:
         kubelet_down: "true"
         area: kaas
         topic: kubernetes
         team: turtles
+<<<<<<< HEAD
+=======
+      annotations:
+        description: '{{`Kubelet ({{ $labels.instance }}) is down.`}}'
+
+>>>>>>> ba55ada (review phoenix alerts)
diff --git a/.../prometheus-rules/templates/kaas/turtles/alerting-rules/vertical-pod-autoscaler.rules.yml b/.../prometheus-rules/templates/kaas/turtles/alerting-rules/vertical-pod-autoscaler.rules.yml
@@ -27,4 +27,8 @@ spec:
         cancel_if_outside_working_hours: "true"
         severity: page
         team: turtles
+<<<<<<< HEAD
         topic: autoscaling
+=======
+        topic: observability
+>>>>>>> ba55ada (review phoenix alerts)