From 68b03046a0fce4cc8283132b9af725222ac05bd9 Mon Sep 17 00:00:00 2001
From: Quentin Bisson
Date: Mon, 6 May 2024 09:49:17 +0200
Subject: [PATCH 1/3] fix: etcd and fairness related alerts for mimir (#1164)

---
 CHANGELOG.md                                          |  1 +
 .../deployment.workload-cluster.rules.yml             |  2 +-
 .../alerting-rules/etcd.management-cluster.rules.yml  | 10 +++++-----
 .../alerting-rules/etcd.workload-cluster.rules.yml    | 12 ++++++------
 .../templates/alerting-rules/etcdbackup.rules.yml     |  2 +-
 .../templates/alerting-rules/fairness.rules.yml       |  2 +-
 6 files changed, 15 insertions(+), 14 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index d27486749..27d199c36 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -14,6 +14,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 ### Fixed
 
 - Remove cilium entry from KAAS SLOs.
+- Fix etcd alerts for Mimir.
 - Add missing labels for apiserver alerts.
 
 ## [3.13.1] - 2024-04-30
diff --git a/helm/prometheus-rules/templates/alerting-rules/deployment.workload-cluster.rules.yml b/helm/prometheus-rules/templates/alerting-rules/deployment.workload-cluster.rules.yml
index bdd35da5b..e0c8f0dcb 100644
--- a/helm/prometheus-rules/templates/alerting-rules/deployment.workload-cluster.rules.yml
+++ b/helm/prometheus-rules/templates/alerting-rules/deployment.workload-cluster.rules.yml
@@ -41,7 +41,7 @@ spec:
       annotations:
         description: '{{`Deployment {{ $labels.namespace}}/{{ $labels.deployment }} is not satisfied.`}}'
         opsrecipe: workload-cluster-deployment-not-satisfied/
-      expr: label_join(kube_deployment_status_replicas_unavailable{cluster_type="workload_cluster", deployment=~"etcd-kubernetes-resources-count-exporter"}, "service", "/", "namespace", "deployment") > 0
+      expr: label_join(kube_deployment_status_replicas_unavailable{cluster_type="workload_cluster", deployment="etcd-kubernetes-resources-count-exporter"}, "service", "/", "namespace", "deployment") > 0
       for: 30m
       labels:
         area: kaas
diff --git a/helm/prometheus-rules/templates/alerting-rules/etcd.management-cluster.rules.yml b/helm/prometheus-rules/templates/alerting-rules/etcd.management-cluster.rules.yml
index c76ce0d56..09a07bfe0 100644
--- a/helm/prometheus-rules/templates/alerting-rules/etcd.management-cluster.rules.yml
+++ b/helm/prometheus-rules/templates/alerting-rules/etcd.management-cluster.rules.yml
@@ -32,7 +32,7 @@ spec:
       annotations:
         description: '{{`Etcd ({{ $labels.instance }}) has a too high commit duration.`}}'
         opsrecipe: etcd-high-commit-duration/
-      expr: histogram_quantile(0.95, rate(etcd_disk_backend_commit_duration_seconds_bucket{cluster_type="management_cluster", provider!~"eks"}[5m])) > 1.0
+      expr: histogram_quantile(0.95, rate(etcd_disk_backend_commit_duration_seconds_bucket{cluster_type="management_cluster", provider!="eks"}[5m])) > 1.0
       for: 15m
       labels:
         area: kaas
@@ -44,7 +44,7 @@ spec:
       annotations:
         description: '{{`Etcd ({{ $labels.instance }}) has a too large database.`}}'
         opsrecipe: etcd-db-size-too-large/
-      expr: (etcd_mvcc_db_total_size_in_bytes{cluster_type="management_cluster", provider!~"eks"} / etcd_server_quota_backend_bytes{cluster_type="management_cluster", provider!~"eks"}) * 100 > 80
+      expr: (etcd_mvcc_db_total_size_in_bytes{cluster_type="management_cluster", provider!="eks"} / etcd_server_quota_backend_bytes{cluster_type="management_cluster", provider!="eks"}) * 100 > 80
       for: 90m
       labels:
         area: kaas
@@ -55,7 +55,7 @@ spec:
     - alert: ManagementClusterEtcdNumberOfLeaderChangesTooHigh
       annotations:
         description: '{{`Etcd has too many leader changes.`}}'
-      expr: increase(etcd_server_leader_changes_seen_total{cluster_type="management_cluster", provider!~"eks"}[1h]) > 8
+      expr: increase(etcd_server_leader_changes_seen_total{cluster_type="management_cluster", provider!="eks"}[1h]) > 8
       labels:
         area: kaas
         cancel_if_outside_working_hours: {{ include "workingHoursOnly" . }}
@@ -66,7 +66,7 @@ spec:
       annotations:
         description: '{{`Etcd has no leader.`}}'
         opsrecipe: etcd-has-no-leader/
-      expr: etcd_server_has_leader{cluster_type="management_cluster", provider!~"eks"} == 0
+      expr: etcd_server_has_leader{cluster_type="management_cluster", provider!="eks"} == 0
       for: 5m
       labels:
         area: kaas
@@ -78,7 +78,7 @@ spec:
       annotations:
         description: '{{`Etcd metrics missing for {{ $labels.cluster_id }}.`}}'
         opsrecipe: etcd-metrics-missing/
-      expr: count(up{cluster_type="management_cluster", provider!~"eks"}) by (cluster_id) unless count(etcd_server_id{cluster_type="management_cluster", provider!~"eks"}) by (cluster_id)
+      expr: count(up{cluster_type="management_cluster", provider!="eks"}) by (cluster_id, installation, pipeline, provider) unless count(etcd_server_id{cluster_type="management_cluster", provider!="eks"}) by (cluster_id, installation, pipeline, provider)
       for: 1h
       labels:
         area: kaas
diff --git a/helm/prometheus-rules/templates/alerting-rules/etcd.workload-cluster.rules.yml b/helm/prometheus-rules/templates/alerting-rules/etcd.workload-cluster.rules.yml
index 0cb34158d..916c8abad 100644
--- a/helm/prometheus-rules/templates/alerting-rules/etcd.workload-cluster.rules.yml
+++ b/helm/prometheus-rules/templates/alerting-rules/etcd.workload-cluster.rules.yml
@@ -17,7 +17,7 @@ spec:
       annotations:
         description: '{{`Etcd ({{ $labels.instance }}) on workload cluster {{ $labels.cluster_id }} is down.`}}'
         opsrecipe: etcd-down/
-      expr: up{cluster_type="workload_cluster", app="etcd", provider!~"eks"} == 0
+      expr: up{cluster_type="workload_cluster", app="etcd", provider!="eks"} == 0
       for: 20m
       labels:
         area: kaas
@@ -33,7 +33,7 @@ spec:
       annotations:
         description: '{{`Etcd ({{ $labels.instance }}) has a too high commit duration.`}}'
         opsrecipe: etcd-high-commit-duration/
-      expr: histogram_quantile(0.95, rate(etcd_disk_backend_commit_duration_seconds_bucket{cluster_type="workload_cluster", provider!~"eks"}[5m])) > 1.0
+      expr: histogram_quantile(0.95, rate(etcd_disk_backend_commit_duration_seconds_bucket{cluster_type="workload_cluster", provider!="eks"}[5m])) > 1.0
       for: 15m
       labels:
         area: kaas
@@ -45,7 +45,7 @@ spec:
       annotations:
         description: '{{`Etcd ({{ $labels.instance }}) has a too large database.`}}'
         opsrecipe: etcd-db-size-too-large/
-      expr: (etcd_mvcc_db_total_size_in_bytes{cluster_type="workload_cluster", provider!~"eks"} / etcd_server_quota_backend_bytes{cluster_type="workload_cluster", provider!~"eks"}) * 100 > 80
+      expr: (etcd_mvcc_db_total_size_in_bytes{cluster_type="workload_cluster", provider!="eks"} / etcd_server_quota_backend_bytes{cluster_type="workload_cluster", provider!="eks"}) * 100 > 80
       for: 15m
       labels:
         area: kaas
@@ -56,7 +56,7 @@ spec:
     - alert: WorkloadClusterEtcdNumberOfLeaderChangesTooHigh
      annotations:
         description: '{{`Etcd has too many leader changes.`}}'
-      expr: increase(etcd_server_leader_changes_seen_total{cluster_type="workload_cluster", provider!~"eks"}[1h]) > 8
+      expr: increase(etcd_server_leader_changes_seen_total{cluster_type="workload_cluster", provider!="eks"}[1h]) > 8
       labels:
         area: kaas
         severity: notify
@@ -66,7 +66,7 @@ spec:
       annotations:
         description: '{{`Etcd has no leader.`}}'
         opsrecipe: etcd-has-no-leader/
-      expr: etcd_server_has_leader{cluster_type="workload_cluster", container!~"loki|promtail", provider!~"eks"} == 0
+      expr: etcd_server_has_leader{cluster_type="workload_cluster", container!~"loki|promtail", provider!="eks"} == 0
       for: 35m
       labels:
         area: kaas
@@ -78,7 +78,7 @@ spec:
       annotations:
         description: '{{`Etcd metrics missing for {{ $labels.cluster_id }}.`}}'
         opsrecipe: etcd-metrics-missing/
-      expr: count(up{cluster_type="workload_cluster", provider!~"eks"}) by (cluster_id) unless count(etcd_server_id{cluster_type="workload_cluster", provider!~"eks"}) by (cluster_id)
+      expr: count(up{cluster_type="workload_cluster", provider!="eks"}) by (cluster_id, installation, pipeline, provider) unless count(etcd_server_id{cluster_type="workload_cluster", provider!="eks"}) by (cluster_id, installation, pipeline, provider)
       for: 1h
       labels:
         area: kaas
diff --git a/helm/prometheus-rules/templates/alerting-rules/etcdbackup.rules.yml b/helm/prometheus-rules/templates/alerting-rules/etcdbackup.rules.yml
index 4f199839c..73dd77fa7 100644
--- a/helm/prometheus-rules/templates/alerting-rules/etcdbackup.rules.yml
+++ b/helm/prometheus-rules/templates/alerting-rules/etcdbackup.rules.yml
@@ -30,7 +30,7 @@ spec:
       annotations:
         description: '{{`Latest successfull ETCD backup for {{ $labels.cluster_id }}/{{ $labels.tenant_cluster_id }} was more than 48h ago.`}}'
         opsrecipe: etcd-backup-failed/
-      expr: count(label_replace(capi_cluster_created, "tenant_cluster_id", "$1", "name", "(.*)")) by (tenant_cluster_id) > 48 * 60 * 60 unless count((time() - etcd_backup_latest_success{tenant_cluster_id!="Control Plane"}) > 48 * 60 * 60) by (tenant_cluster_id)
+      expr: count(label_replace(capi_cluster_created, "tenant_cluster_id", "$1", "name", "(.*)")) by (cluster_id, installation, pipeline, provider, tenant_cluster_id) > 48 * 60 * 60 unless count((time() - etcd_backup_latest_success{tenant_cluster_id!="Control Plane"}) > 48 * 60 * 60) by (cluster_id, installation, pipeline, provider, tenant_cluster_id)
       for: 5m
       labels:
         area: kaas
diff --git a/helm/prometheus-rules/templates/alerting-rules/fairness.rules.yml b/helm/prometheus-rules/templates/alerting-rules/fairness.rules.yml
index 1b29fb629..992a942e9 100644
--- a/helm/prometheus-rules/templates/alerting-rules/fairness.rules.yml
+++ b/helm/prometheus-rules/templates/alerting-rules/fairness.rules.yml
@@ -25,7 +25,7 @@ spec:
       annotations:
         description: '{{`Cluster {{ $labels.installation }}/{{ $labels.cluster_id }}: there are too many API requests for flow schema {{ $labels.flow_schema }}.`}}'
         opsrecipe: flowcontrol-rejected-requests/
-      expr: sum(irate(apiserver_flowcontrol_dispatched_requests_total[1m])) by (cluster_id, priority_level) > (min by(cluster_id, priority_level) (apiserver_flowcontrol_request_concurrency_limit_overwrite) OR min by(cluster_id, priority_level) (apiserver_flowcontrol_request_concurrency_limit))
+      expr: sum(irate(apiserver_flowcontrol_dispatched_requests_total[1m])) by (cluster_id, installation, pipeline, provider, flow_schema, priority_level) > (min by(cluster_id, installation, pipeline, provider, flow_schema, priority_level) (apiserver_flowcontrol_request_concurrency_limit_overwrite) OR min by(cluster_id, installation, pipeline, provider, flow_schema, priority_level) (apiserver_flowcontrol_request_concurrency_limit))
       for: 15m
       labels:
         area: kaas

From 1e34b674b16c9af424d4639cecf13029fb002336 Mon Sep 17 00:00:00 2001
From: Daniel Simionato
Date: Mon, 6 May 2024 09:52:55 +0200
Subject: [PATCH 2/3] Add `cluster_control_plane_unhealthy` inhibition.
 (#1155)

Co-authored-by: Quentin Bisson
---
 CHANGELOG.md                                            | 4 ++++
 .../alerting-rules/inhibit.management-cluster.rules.yml | 9 +++++++++
 2 files changed, 13 insertions(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 27d199c36..057155e4f 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -17,6 +17,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 - Fix etcd alerts for Mimir.
 - Add missing labels for apiserver alerts.
 
+### Added
+
+- Add `cluster_control_plane_unhealthy` inhibition.
+
 ## [3.13.1] - 2024-04-30
 
 ### Removed
diff --git a/helm/prometheus-rules/templates/alerting-rules/inhibit.management-cluster.rules.yml b/helm/prometheus-rules/templates/alerting-rules/inhibit.management-cluster.rules.yml
index d23d0f156..12cf66a5f 100644
--- a/helm/prometheus-rules/templates/alerting-rules/inhibit.management-cluster.rules.yml
+++ b/helm/prometheus-rules/templates/alerting-rules/inhibit.management-cluster.rules.yml
@@ -95,4 +95,13 @@ spec:
         instance_state_not_running: "true"
         team: phoenix
         topic: status
+    - alert: InhibitionControlPlaneUnhealthy
+      annotations:
+        description: '{{`Control plane of cluster {{ $labels.cluster_id }} is not healthy.`}}'
+      expr: capi_kubeadmcontrolplane_status_condition{type="ControlPlaneComponentsHealthy", status="False"} == 1 or capi_kubeadmcontrolplane_status_condition{type="EtcdClusterHealthy", status="False"} == 1 or capi_kubeadmcontrolplane_status_condition{type="Available", status="False"} == 1
+      labels:
+        area: kaas
+        cluster_control_plane_unhealthy: "true"
+        team: phoenix
+        topic: status
 {{- end }}

From be49a73992e972ef41477cfae0e598bb1e31c2b7 Mon Sep 17 00:00:00 2001
From: Daniel Simionato
Date: Mon, 6 May 2024 09:55:30 +0200
Subject: [PATCH 3/3] Add inhibitions expressions for CAPI clusters (#1153)

* Added inhibitions expressions for CAPI clusters.

* Add parenthesis from code review
---
 CHANGELOG.md                             |  4 ++++
 .../inhibit.management-cluster.rules.yml | 14 +++++++-------
 2 files changed, 11 insertions(+), 7 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 057155e4f..466624d27 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -21,6 +21,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 - Add `cluster_control_plane_unhealthy` inhibition.
 
+### Added
+
+- Added inhibitions expressions for CAPI clusters.
+
 ## [3.13.1] - 2024-04-30
 
 ### Removed
diff --git a/helm/prometheus-rules/templates/alerting-rules/inhibit.management-cluster.rules.yml b/helm/prometheus-rules/templates/alerting-rules/inhibit.management-cluster.rules.yml
index 12cf66a5f..0bfc3fe2e 100644
--- a/helm/prometheus-rules/templates/alerting-rules/inhibit.management-cluster.rules.yml
+++ b/helm/prometheus-rules/templates/alerting-rules/inhibit.management-cluster.rules.yml
@@ -17,7 +17,7 @@ spec:
     - alert: InhibitionClusterStatusCreating
       annotations:
         description: '{{`Cluster {{ $labels.cluster_id }} has status condition ''Creating''.`}}'
-      expr: label_replace(max_over_time(statusresource_cluster_status{status="Creating"}[30m]), "cluster_id", "$1", "exported_cluster_id", "(.+)") == 1 or max_over_time(cluster_operator_cluster_status{status="Creating"}[30m]) == 1
+      expr: label_replace(max_over_time(statusresource_cluster_status{status="Creating"}[30m]), "cluster_id", "$1", "exported_cluster_id", "(.+)") == 1 or max_over_time(cluster_operator_cluster_status{status="Creating"}[30m]) == 1 or max_over_time(capi_cluster_status_phase{phase="Provisioning"}[30m]) == 1
       labels:
         area: kaas
         cluster_status_creating: "true"
@@ -26,7 +26,7 @@ spec:
     - alert: InhibitionClusterStatusCreated
       annotations:
         description: '{{`Cluster {{ $labels.cluster_id }} has status condition ''Created''.`}}'
-      expr: label_replace(statusresource_cluster_status{status="Created"}, "cluster_id", "$1", "exported_cluster_id", "(.+)") == 1 or cluster_operator_cluster_status{status="Created"} == 1
+      expr: label_replace(statusresource_cluster_status{status="Created"}, "cluster_id", "$1", "exported_cluster_id", "(.+)") == 1 or cluster_operator_cluster_status{status="Created"} == 1 or (capi_cluster_status_phase{phase="Provisioned"} == 1 and capi_cluster_status_condition{type="Ready", status="True"} == 1)
       labels:
         area: kaas
         cluster_status_created: "true"
@@ -35,7 +35,7 @@ spec:
     - alert: InhibitionClusterStatusUpdating
       annotations:
         description: '{{`Cluster {{ $labels.cluster_id }} has status condition ''Updating''.`}}'
-      expr: label_replace(statusresource_cluster_status{status="Updating"}, "cluster_id", "$1", "exported_cluster_id", "(.+)") == 1 or cluster_operator_cluster_status{status="Updating"} == 1 or label_replace(changes(statusresource_cluster_status{status="Updating"}[10m]), "cluster_id", "$1", "exported_cluster_id", "(.+)") == 1 or changes(cluster_operator_cluster_status{status="Updating"}[10m]) == 1
+      expr: label_replace(statusresource_cluster_status{status="Updating"}, "cluster_id", "$1", "exported_cluster_id", "(.+)") == 1 or cluster_operator_cluster_status{status="Updating"} == 1 or label_replace(changes(statusresource_cluster_status{status="Updating"}[10m]), "cluster_id", "$1", "exported_cluster_id", "(.+)") == 1 or changes(cluster_operator_cluster_status{status="Updating"}[10m]) == 1 or (capi_cluster_status_condition{type="Ready", status="False"} == 1 and (capi_kubeadmcontrolplane_status_condition{type="MachinesSpecUpToDate", status="False"} == 1 or capi_kubeadmcontrolplane_status_condition{type="MachinesReady", status="False"} == 1))
       labels:
         area: kaas
         cluster_status_updating: "true"
@@ -44,7 +44,7 @@ spec:
     - alert: InhibitionClusterStatusUpdated
       annotations:
         description: '{{`Cluster {{ $labels.cluster_id }} has status condition ''Updated''.`}}'
-      expr: label_replace(statusresource_cluster_status{status="Updated"}, "cluster_id", "$1", "exported_cluster_id", "(.+)") == 1 or cluster_operator_cluster_status{status="Updated"} == 1
label_replace(statusresource_cluster_status{status="Updated"}, "cluster_id", "$1", "exported_cluster_id", "(.+)") == 1 or cluster_operator_cluster_status{status="Updated"} == 1 or (capi_cluster_status_condition{type="Ready", status="True"} == 1 and (capi_kubeadmcontrolplane_status_condition{type="MachinesSpecUpToDate", status="True"} == 1 and capi_kubeadmcontrolplane_status_condition{type="MachinesReady", status="True"} == 1)) labels: area: kaas cluster_status_updated: "true" @@ -53,7 +53,7 @@ spec: - alert: InhibitionClusterStatusDeleting annotations: description: '{{`Cluster {{ $labels.cluster_id }} has status condition ''Deleting''.`}}' - expr: label_replace(max_over_time(statusresource_cluster_status{status="Deleting"}[30m]), "cluster_id", "$1", "exported_cluster_id", "(.+)") == 1 or max_over_time(cluster_operator_cluster_status{status="Deleting"}[30m]) == 1 + expr: label_replace(max_over_time(statusresource_cluster_status{status="Deleting"}[30m]), "cluster_id", "$1", "exported_cluster_id", "(.+)") == 1 or max_over_time(cluster_operator_cluster_status{status="Deleting"}[30m]) == 1 or max_over_time(capi_cluster_status_phase{phase="Deleting"}[30m]) == 1 labels: area: kaas cluster_status_deleting: "true" @@ -71,7 +71,7 @@ spec: - alert: InhibitionClusterScalingNodePools annotations: description: '{{`Cluster {{ $labels.cluster_id }} node pools are scaling.`}}' - expr: cluster_operator_node_pool_desired_workers != cluster_operator_node_pool_ready_workers + expr: cluster_operator_node_pool_desired_workers != cluster_operator_node_pool_ready_workers or (capi_machinepool_status_phase{phase="ScalingUp"} == 1 or capi_machinepool_status_phase{phase="ScalingDown"} == 1) labels: area: kaas cluster_with_scaling_nodepools: "true" @@ -80,7 +80,7 @@ spec: - alert: InhibitionClusterNodePoolsNotReady annotations: description: '{{`Cluster {{ $labels.cluster_id }} node pools are not ready. Either they have been scaled down to 0 or they are not up yet.`}}' - expr: cluster_operator_node_pool_desired_workers == 0 and cluster_operator_node_pool_ready_workers == 0 + expr: (cluster_operator_node_pool_desired_workers == 0 and cluster_operator_node_pool_ready_workers == 0) or capi_machinepool_status_condition{type="Ready", status="False"} == 1 labels: area: kaas cluster_with_notready_nodepools: "true"