From f32066b842f20f0277eadbf0eab68000298ca3cb Mon Sep 17 00:00:00 2001
From: Daniel Simionato
Date: Thu, 2 May 2024 18:12:33 +0200
Subject: [PATCH] Add `cluster_control_plane_unhealthy` inhibition.

---
Notes (this section is not part of the commit message):

Adds the InhibitionControlPlaneUnhealthy rule. Its expression matches
capi_kubeadmcontrolplane_status_condition series with status="False" and
value 1 for any of the condition types ControlPlaneComponentsHealthy,
EtcdClusterHealthy or Available; the resulting alert carries the label
cluster_control_plane_unhealthy="true".

NOTE(review): line breaks and YAML indentation in the hunks below were
restored from the rule structure - confirm the indent depth against the
target file before applying.

 CHANGELOG.md                                            | 4 ++++
 .../alerting-rules/inhibit.management-cluster.rules.yml | 9 +++++++++
 2 files changed, 13 insertions(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 396269241..b53cdeb4a 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -15,6 +15,10 @@ Add opsrecipe to `CoreDNSMaxHPAReplicasReached`
 
 - Remove cilium entry from KAAS SLOs.
 
+### Added
+
+- Add `cluster_control_plane_unhealthy` inhibition.
+
 ## [3.13.1] - 2024-04-30
 
 ### Removed
diff --git a/helm/prometheus-rules/templates/alerting-rules/inhibit.management-cluster.rules.yml b/helm/prometheus-rules/templates/alerting-rules/inhibit.management-cluster.rules.yml
index d23d0f156..12cf66a5f 100644
--- a/helm/prometheus-rules/templates/alerting-rules/inhibit.management-cluster.rules.yml
+++ b/helm/prometheus-rules/templates/alerting-rules/inhibit.management-cluster.rules.yml
@@ -95,4 +95,13 @@ spec:
         instance_state_not_running: "true"
         team: phoenix
         topic: status
+    - alert: InhibitionControlPlaneUnhealthy
+      annotations:
+        description: '{{`Control plane of cluster {{ $labels.cluster_id }} is not healthy.`}}'
+      expr: capi_kubeadmcontrolplane_status_condition{type="ControlPlaneComponentsHealthy", status="False"} == 1 or capi_kubeadmcontrolplane_status_condition{type="EtcdClusterHealthy", status="False"} == 1 or capi_kubeadmcontrolplane_status_condition{type="Available", status="False"} == 1
+      labels:
+        area: kaas
+        cluster_control_plane_unhealthy: "true"
+        team: phoenix
+        topic: status
 {{- end }}