review-phoenix-inhibitions (#1212)

giantswarm · Jun 6, 2024 · 04969c1 · 04969c1
1 parent b45bd99
commit 04969c1
Show file tree

Hide file tree

Showing 10 changed files with 123 additions and 81 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -12,6 +12,11 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 - Added a new alerting rule to `falco.rules.yml` to fire an alert for XZ-backdoor.
 - Add `CiliumAPITooSlow`.
 
+### Changed
+
+- Review phoenix alerts towards Mimir.
+- Move cluster-autoscaler and vpa alerts to turtles.
+
 ### Fixed
 
 - Fix cabbage alerts for multi-provider wcs.

diff --git a/helm/prometheus-rules/templates/kaas/phoenix/alerting-rules/inhibit.all.rules.yml b/helm/prometheus-rules/templates/kaas/phoenix/alerting-rules/inhibit.all.rules.yml
diff --git a/...ules/inhibit.management-cluster.rules.yml → .../inhibit.aws.management-cluster.rules.yml b/...ules/inhibit.management-cluster.rules.yml → .../inhibit.aws.management-cluster.rules.yml
@@ -1,18 +1,19 @@
+## TODO: Remove together with vintage.
+# These rules apply to vintage AWS management clusters only.
 {{- if eq .Values.managementCluster.provider.flavor "vintage" }}
 apiVersion: monitoring.coreos.com/v1
 kind: PrometheusRule
 metadata:
   creationTimestamp: null
   labels:
     {{- include "labels.common" . | nindent 4 }}
-{{- if not .Values.mimir.enabled }}
+    # cluster_type is set without a .Values.mimir.enabled condition: this file will be removed together with Vintage
     cluster_type: "management_cluster"
-{{- end }}
-  name: inhibit.management-cluster.rules
+  name: inhibit.aws.management-cluster.rules
   namespace: {{ .Values.namespace  }}
 spec:
   groups:
-  - name: inhibit.management-cluster
+  - name: inhibit.aws.management-cluster
     rules:
     - alert: InhibitionClusterStatusCreating
       annotations:
@@ -95,13 +96,13 @@ spec:
         instance_state_not_running: "true"
         team: phoenix
         topic: status
-    - alert: InhibitionControlPlaneUnhealthy
+    - alert: InhibitionClusterWithoutWorkerNodes
       annotations:
-        description: '{{`Control plane of cluster {{ $labels.cluster_id }} is not healthy.`}}'
-      expr: capi_kubeadmcontrolplane_status_condition{type="ControlPlaneComponentsHealthy", status="False"} == 1 or capi_kubeadmcontrolplane_status_condition{type="EtcdClusterHealthy", status="False"} == 1 or capi_kubeadmcontrolplane_status_condition{type="Available", status="False"} == 1
+        description: '{{`Cluster ({{ $labels.cluster_id }}) has no worker nodes.`}}'
+      expr: sum(aws_operator_asg_desired_count) by (cluster_id) - on(cluster_id) sum(aws_operator_asg_desired_count{asg=~".*-tccpn-.*"}) by (cluster_id) == 0
       labels:
         area: kaas
-        cluster_control_plane_unhealthy: "true"
+        has_worker_nodes: "false"
         team: phoenix
         topic: status
 {{- end }}
diff --git a/helm/prometheus-rules/templates/kaas/phoenix/alerting-rules/inhibit.kiam.rules.yml b/helm/prometheus-rules/templates/kaas/phoenix/alerting-rules/inhibit.kiam.rules.yml
@@ -0,0 +1,25 @@
+## TODO: Remove together with vintage.
+# This rule applies to vintage AWS clusters only.
+{{- if eq .Values.managementCluster.provider.flavor "vintage" }}
+apiVersion: monitoring.coreos.com/v1
+kind: PrometheusRule  # the whole manifest is only rendered when the provider flavor is "vintage"
+metadata:
+  creationTimestamp: null
+  labels:
+    {{- include "labels.common" . | nindent 4 }}
+  name: inhibit.kiam.rules
+  namespace: {{ .Values.namespace  }}
+spec:
+  groups:
+  - name: inhibit.kiam
+    rules:
+    - alert: InhibitionKiamErrors  # expr below matches while any of the three kiam error counters increased within the last 10m; no "for:" delay is set
+      annotations:
+        description: '{{`Kiam on cluster {{ $labels.cluster_id }} has increased error rate.`}}'
+      expr: increase(kiam_metadata_credential_fetch_errors_total[10m]) > 0 or increase(kiam_metadata_find_role_errors_total[10m]) > 0 or increase(kiam_sts_issuing_errors_total[10m]) > 0
+      labels:
+        area: kaas
+        kiam_has_errors: "true"  # NOTE(review): presumably the label Alertmanager inhibit rules match on - confirm in the Alertmanager config
+        team: phoenix
+        topic: kiam
+{{- end }}
diff --git a/...erting-rules/cluster-autoscaler.rules.yml → ...erting-rules/cluster-autoscaler.rules.yml b/...erting-rules/cluster-autoscaler.rules.yml → ...erting-rules/cluster-autoscaler.rules.yml
@@ -1,4 +1,4 @@
-{{- if eq .Values.managementCluster.provider.kind "aws" }}
+# These alerts apply to workload clusters on cloud providers (provider aws, capa, capz or eks)
 apiVersion: monitoring.coreos.com/v1
 kind: PrometheusRule
 metadata:
@@ -18,31 +18,30 @@ spec:
       annotations:
         description: '{{`Cluster-Autoscaler on {{ $labels.cluster_id }} has unneeded nodes.`}}'
         opsrecipe: cluster-autoscaler-scaling/
-      expr: cluster_autoscaler_unneeded_nodes_count > 0
+      expr: cluster_autoscaler_unneeded_nodes_count{cluster_type="workload_cluster", provider=~"aws|capa|capz|eks"} > 0
       for: 240m
       labels:
-        area: managedservices
+        area: kaas
         cancel_if_cluster_status_creating: "true"
         cancel_if_cluster_status_deleting: "true"
         cancel_if_cluster_status_updating: "true"
         cancel_if_outside_working_hours: "true"
         cancel_if_cluster_has_no_workers: "true"
         severity: page
-        team: phoenix
+        team: turtles
         topic: cluster-autoscaler
     - alert: ClusterAutoscalerFailedScaling
       annotations:
         description: '{{`Cluster-Autoscaler on {{ $labels.cluster_id }} has failed scaling up.`}}'
         opsrecipe: cluster-autoscaler-scaling/
-      expr: increase(cluster_autoscaler_failed_scale_ups_total[5m]) > 1
+      expr: increase(cluster_autoscaler_failed_scale_ups_total{cluster_type="workload_cluster", provider=~"aws|capa|capz|eks"}[5m]) > 1
       for: 15m
       labels:
-        area: managedservices
+        area: kaas
         cancel_if_cluster_status_creating: "true"
         cancel_if_cluster_status_deleting: "true"
         cancel_if_cluster_status_updating: "true"
         cancel_if_outside_working_hours: "true"
         severity: page
-        team: phoenix
+        team: turtles
         topic: cluster-autoscaler
-{{- end }}
diff --git a/helm/prometheus-rules/templates/kaas/turtles/alerting-rules/inhibit.capi.rules.yml b/helm/prometheus-rules/templates/kaas/turtles/alerting-rules/inhibit.capi.rules.yml
@@ -0,0 +1,30 @@
+# This rule is only rendered on CAPI management clusters (provider flavor "capi")
+{{- if eq .Values.managementCluster.provider.flavor "capi" }}
+apiVersion: monitoring.coreos.com/v1
+kind: PrometheusRule
+metadata:
+  creationTimestamp: null
+  labels:
+    {{- include "labels.common" . | nindent 4 }}
+{{- if not .Values.mimir.enabled }}
+    cluster_type: "management_cluster"  # only rendered when .Values.mimir.enabled is false
+{{- end }}
+  name: inhibit.capi.rules
+  namespace: {{ .Values.namespace  }}
+spec:
+  groups:
+  - name: inhibit.capi
+    rules:
+    - alert: InhibitionControlPlaneUnhealthy  # matches when any of the three conditions in expr has status="False" with value 1 on cluster_type="management_cluster" series
+      annotations:
+        description: '{{`Control plane of cluster {{ $labels.cluster_id }} is not healthy.`}}'
+      expr: |-
+        capi_kubeadmcontrolplane_status_condition{cluster_type="management_cluster", type="ControlPlaneComponentsHealthy", status="False"} == 1
+        or capi_kubeadmcontrolplane_status_condition{cluster_type="management_cluster", type="EtcdClusterHealthy", status="False"} == 1
+        or capi_kubeadmcontrolplane_status_condition{cluster_type="management_cluster", type="Available", status="False"} == 1
+      labels:
+        area: kaas
+        cluster_control_plane_unhealthy: "true"  # NOTE(review): presumably the label Alertmanager inhibit rules match on - confirm in the Alertmanager config
+        team: turtles
+        topic: status
+{{- end }}
diff --git a/helm/prometheus-rules/templates/kaas/turtles/alerting-rules/inhibit.kubelet.rules.yml b/helm/prometheus-rules/templates/kaas/turtles/alerting-rules/inhibit.kubelet.rules.yml
@@ -0,0 +1,22 @@
+# This rule applies to all clusters: it is rendered unconditionally and selects every up{app="kubelet"} series
+apiVersion: monitoring.coreos.com/v1
+kind: PrometheusRule  # rendered unconditionally: no provider or flavor guard wraps this manifest
+metadata:
+  creationTimestamp: null
+  labels:
+    {{- include "labels.common" . | nindent 4 }}
+  name: inhibit.kubelet.rules
+  namespace: {{ .Values.namespace  }}
+spec:
+  groups:
+  - name: inhibit.kubelet
+    rules:
+    - alert: InhibitionKubeletDown  # expr below keeps app="kubelet" up series whose value is 0; label_replace copies the part of "instance" before the trailing ":<digits>" into an "ip" label
+      annotations:
+        description: '{{`Kubelet ({{ $labels.instance }}) is down.`}}'
+      expr: label_replace(up{app="kubelet"}, "ip", "$1", "instance", "(.+):\\d+") == 0
+      labels:
+        kubelet_down: "true"  # NOTE(review): presumably the label Alertmanager inhibit rules match on - confirm in the Alertmanager config
+        area: kaas
+        topic: kubernetes
+        team: turtles
diff --git a/.../phoenix/alerting-rules/vpa.all.rules.yml → ...g-rules/vertical-pod-autoscaler.rules.yml b/.../phoenix/alerting-rules/vpa.all.rules.yml → ...g-rules/vertical-pod-autoscaler.rules.yml
@@ -3,11 +3,11 @@ kind: PrometheusRule
 metadata:
   labels:
     {{- include "labels.common" . | nindent 4 }}
-  name: vpa.all.rules
+  name: vertical-pod-autoscaler.rules
   namespace: {{ .Values.namespace }}
 spec:
   groups:
-  - name: vpa
+  - name: vertical-pod-autoscaler
     rules:
     - alert: VpaComponentTooManyRestarts
       annotations:
@@ -27,5 +27,5 @@ spec:
         cancel_if_scrape_timeout: "true"
         cancel_if_outside_working_hours: "true"
         severity: page
-        team: phoenix
-        topic: observability
+        team: turtles
+        topic: autoscaling
diff --git a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/inhibit.oncall.rules.yml b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/inhibit.oncall.rules.yml
@@ -0,0 +1,21 @@
+apiVersion: monitoring.coreos.com/v1
+kind: PrometheusRule  # rendered unconditionally: no provider or flavor guard wraps this manifest
+metadata:
+  creationTimestamp: null
+  labels:
+    {{- include "labels.common" . | nindent 4 }}
+  name: inhibit.oncall.rules
+  namespace: {{ .Values.namespace  }}
+spec:
+  groups:
+  - name: inhibit.oncall
+    rules:
+    - alert: InhibitionOutsideWorkingHours  # expr below matches when hour() is <= 7 or >= 16, or day_of_week() is 6 or 0; per the Prometheus docs both are evaluated in UTC and 0 means Sunday
+      annotations:
+        description: '{{`Fires outside working hours.`}}'
+      expr: (hour() <= 7 or hour() >= 16) or (day_of_week() > 5 or day_of_week() < 1)
+      labels:
+        area: platform
+        outside_working_hours: "true"  # NOTE(review): presumably the counterpart of the cancel_if_outside_working_hours label used on paging alerts - confirm in the Alertmanager config
+        team: atlas
+        topic: monitoring
diff --git a/helm/prometheus-rules/templates/platform/cabbage/alerting-rules/network.all.rules.yml b/helm/prometheus-rules/templates/platform/cabbage/alerting-rules/network.all.rules.yml
@@ -59,7 +59,6 @@ spec:
         cancel_if_cluster_with_scaling_nodepools: "true"
         cancel_if_outside_working_hours: {{ include "workingHoursOnly" . }}
         cancel_if_cluster_has_no_workers: "true"
-        cancel_if_nodes_down: "true"
         severity: page
         team: {{ include "providerTeam" . }}
         topic: network