Commit 4368f1f: Merge branch 'master' into fix-cilium-rules-for-mimir

QuentinBisson authored May 6, 2024
2 parents d789a59 + be49a73
Showing 7 changed files with 39 additions and 21 deletions.
CHANGELOG.md: 9 additions & 0 deletions
@@ -15,8 +15,17 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

- Remove cilium entry from KAAS SLOs.
- Fix cilium related alerts for mimir.
- Fix etcd alerts for Mimir.
- Add missing labels for apiserver alerts.

### Added

- Add `cluster_control_plane_unhealthy` inhibition.
- Add inhibition expressions for CAPI clusters.

## [3.13.1] - 2024-04-30

### Removed
@@ -41,7 +41,7 @@ spec:
annotations:
description: '{{`Deployment {{ $labels.namespace}}/{{ $labels.deployment }} is not satisfied.`}}'
opsrecipe: workload-cluster-deployment-not-satisfied/
- expr: label_join(kube_deployment_status_replicas_unavailable{cluster_type="workload_cluster", deployment=~"etcd-kubernetes-resources-count-exporter"}, "service", "/", "namespace", "deployment") > 0
+ expr: label_join(kube_deployment_status_replicas_unavailable{cluster_type="workload_cluster", deployment="etcd-kubernetes-resources-count-exporter"}, "service", "/", "namespace", "deployment") > 0
for: 30m
labels:
area: kaas
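The recurring change in this commit swaps PromQL regex matchers (`=~`, `!~`) for exact matchers (`=`, `!=`) wherever the value is a single literal, as in the `deployment` selector above. A minimal sketch of the difference (the `up` selectors here are only for illustration):

```promql
# Regex matchers compile and evaluate a regular expression per series.
# They are fully anchored, so this matches only the literal value "eks":
up{provider!~"eks"}

# An exact matcher does a plain string comparison instead, which is
# cheaper and states the intent directly:
up{provider!="eks"}

# Regex matchers remain necessary for real alternations, e.g.:
up{container!~"loki|promtail"}
```

The `container!~"loki|promtail"` selector in the workload-cluster rules below is the counterexample: it genuinely needs a regex, so it is left untouched.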
@@ -32,7 +32,7 @@ spec:
annotations:
description: '{{`Etcd ({{ $labels.instance }}) has a too high commit duration.`}}'
opsrecipe: etcd-high-commit-duration/
- expr: histogram_quantile(0.95, rate(etcd_disk_backend_commit_duration_seconds_bucket{cluster_type="management_cluster", provider!~"eks"}[5m])) > 1.0
+ expr: histogram_quantile(0.95, rate(etcd_disk_backend_commit_duration_seconds_bucket{cluster_type="management_cluster", provider!="eks"}[5m])) > 1.0
for: 15m
labels:
area: kaas
@@ -44,7 +44,7 @@
annotations:
description: '{{`Etcd ({{ $labels.instance }}) has a too large database.`}}'
opsrecipe: etcd-db-size-too-large/
- expr: (etcd_mvcc_db_total_size_in_bytes{cluster_type="management_cluster", provider!~"eks"} / etcd_server_quota_backend_bytes{cluster_type="management_cluster", provider!~"eks"}) * 100 > 80
+ expr: (etcd_mvcc_db_total_size_in_bytes{cluster_type="management_cluster", provider!="eks"} / etcd_server_quota_backend_bytes{cluster_type="management_cluster", provider!="eks"}) * 100 > 80
for: 90m
labels:
area: kaas
@@ -55,7 +55,7 @@
- alert: ManagementClusterEtcdNumberOfLeaderChangesTooHigh
annotations:
description: '{{`Etcd has too many leader changes.`}}'
- expr: increase(etcd_server_leader_changes_seen_total{cluster_type="management_cluster", provider!~"eks"}[1h]) > 8
+ expr: increase(etcd_server_leader_changes_seen_total{cluster_type="management_cluster", provider!="eks"}[1h]) > 8
labels:
area: kaas
cancel_if_outside_working_hours: {{ include "workingHoursOnly" . }}
@@ -66,7 +66,7 @@
annotations:
description: '{{`Etcd has no leader.`}}'
opsrecipe: etcd-has-no-leader/
- expr: etcd_server_has_leader{cluster_type="management_cluster", provider!~"eks"} == 0
+ expr: etcd_server_has_leader{cluster_type="management_cluster", provider!="eks"} == 0
for: 5m
labels:
area: kaas
@@ -78,7 +78,7 @@
annotations:
description: '{{`Etcd metrics missing for {{ $labels.cluster_id }}.`}}'
opsrecipe: etcd-metrics-missing/
expr: count(up{cluster_type="management_cluster", provider!~"eks"}) by (cluster_id) unless count(etcd_server_id{cluster_type="management_cluster", provider!~"eks"}) by (cluster_id)
expr: count(up{cluster_type="management_cluster", provider!="eks"}) by (cluster_id, installation, pipeline, provider) unless count(etcd_server_id{cluster_type="management_cluster", provider!="eks"}) by (cluster_id, installation, pipeline, provider)
for: 1h
labels:
area: kaas
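The other half of the etcd fixes widens the `by (...)` clauses. Aggregation drops every label not listed there, and `unless` then matches series on their full remaining label sets, so both sides must name the same labels; the extra `installation`, `pipeline`, and `provider` labels also survive into the alert itself, where routing and inhibition can use them. A stripped-down sketch of the mechanics (matchers omitted for brevity):

```promql
# Left side: one series per cluster that reports any `up` metric.
# Right side: one series per cluster that reports etcd_server_id.
# `unless` keeps left-hand series with no exact label-set match on the
# right, i.e. clusters that are up but expose no etcd metrics.
count(up) by (cluster_id, installation, pipeline, provider)
unless
count(etcd_server_id) by (cluster_id, installation, pipeline, provider)
```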
@@ -17,7 +17,7 @@ spec:
annotations:
description: '{{`Etcd ({{ $labels.instance }}) on workload cluster {{ $labels.cluster_id }} is down.`}}'
opsrecipe: etcd-down/
expr: up{cluster_type="workload_cluster", app="etcd", provider!~"eks"} == 0
expr: up{cluster_type="workload_cluster", app="etcd", provider!="eks"} == 0
for: 20m
labels:
area: kaas
@@ -33,7 +33,7 @@
annotations:
description: '{{`Etcd ({{ $labels.instance }}) has a too high commit duration.`}}'
opsrecipe: etcd-high-commit-duration/
- expr: histogram_quantile(0.95, rate(etcd_disk_backend_commit_duration_seconds_bucket{cluster_type="workload_cluster", provider!~"eks"}[5m])) > 1.0
+ expr: histogram_quantile(0.95, rate(etcd_disk_backend_commit_duration_seconds_bucket{cluster_type="workload_cluster", provider!="eks"}[5m])) > 1.0
for: 15m
labels:
area: kaas
@@ -45,7 +45,7 @@
annotations:
description: '{{`Etcd ({{ $labels.instance }}) has a too large database.`}}'
opsrecipe: etcd-db-size-too-large/
- expr: (etcd_mvcc_db_total_size_in_bytes{cluster_type="workload_cluster", provider!~"eks"} / etcd_server_quota_backend_bytes{cluster_type="workload_cluster", provider!~"eks"}) * 100 > 80
+ expr: (etcd_mvcc_db_total_size_in_bytes{cluster_type="workload_cluster", provider!="eks"} / etcd_server_quota_backend_bytes{cluster_type="workload_cluster", provider!="eks"}) * 100 > 80
for: 15m
labels:
area: kaas
@@ -56,7 +56,7 @@
- alert: WorkloadClusterEtcdNumberOfLeaderChangesTooHigh
annotations:
description: '{{`Etcd has too many leader changes.`}}'
- expr: increase(etcd_server_leader_changes_seen_total{cluster_type="workload_cluster", provider!~"eks"}[1h]) > 8
+ expr: increase(etcd_server_leader_changes_seen_total{cluster_type="workload_cluster", provider!="eks"}[1h]) > 8
labels:
area: kaas
severity: notify
@@ -66,7 +66,7 @@
annotations:
description: '{{`Etcd has no leader.`}}'
opsrecipe: etcd-has-no-leader/
- expr: etcd_server_has_leader{cluster_type="workload_cluster", container!~"loki|promtail", provider!~"eks"} == 0
+ expr: etcd_server_has_leader{cluster_type="workload_cluster", container!~"loki|promtail", provider!="eks"} == 0
for: 35m
labels:
area: kaas
@@ -78,7 +78,7 @@
annotations:
description: '{{`Etcd metrics missing for {{ $labels.cluster_id }}.`}}'
opsrecipe: etcd-metrics-missing/
expr: count(up{cluster_type="workload_cluster", provider!~"eks"}) by (cluster_id) unless count(etcd_server_id{cluster_type="workload_cluster", provider!~"eks"}) by (cluster_id)
expr: count(up{cluster_type="workload_cluster", provider!="eks"}) by (cluster_id, installation, pipeline, provider) unless count(etcd_server_id{cluster_type="workload_cluster", provider!="eks"}) by (cluster_id, installation, pipeline, provider)
for: 1h
labels:
area: kaas
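The commit-duration alerts in both etcd files rely on the standard percentile-from-histogram pattern; nothing in it changed here apart from the matcher, but it is worth spelling out once:

```promql
# Each *_bucket series is a cumulative counter per upper bound (the `le`
# label). rate() converts those counters into per-second increase rates,
# and histogram_quantile() interpolates the 95th percentile from the
# resulting bucket distribution, treating series that differ only in
# `le` as one histogram.
histogram_quantile(
  0.95,
  rate(etcd_disk_backend_commit_duration_seconds_bucket[5m])
)
```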
@@ -30,7 +30,7 @@ spec:
annotations:
description: '{{`Latest successful etcd backup for {{ $labels.cluster_id }}/{{ $labels.tenant_cluster_id }} was more than 48h ago.`}}'
opsrecipe: etcd-backup-failed/
- expr: count(label_replace(capi_cluster_created, "tenant_cluster_id", "$1", "name", "(.*)")) by (tenant_cluster_id) > 48 * 60 * 60 unless count((time() - etcd_backup_latest_success{tenant_cluster_id!="Control Plane"}) > 48 * 60 * 60) by (tenant_cluster_id)
+ expr: count(label_replace(capi_cluster_created, "tenant_cluster_id", "$1", "name", "(.*)")) by (cluster_id, installation, pipeline, provider, tenant_cluster_id) > 48 * 60 * 60 unless count((time() - etcd_backup_latest_success{tenant_cluster_id!="Control Plane"}) > 48 * 60 * 60) by (cluster_id, installation, pipeline, provider, tenant_cluster_id)
for: 5m
labels:
area: kaas
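This backup alert joins two metrics that name clusters differently: `capi_cluster_created` carries the cluster name in its `name` label, while the backup metric uses `tenant_cluster_id`. The `label_replace` call bridges the two before the `unless`:

```promql
# label_replace() writes a new label derived from an existing one and
# leaves the series values untouched. "$1" is the first capture group of
# the fully anchored regex "(.*)" applied to the `name` label, so the
# cluster name is copied verbatim into `tenant_cluster_id`.
label_replace(capi_cluster_created, "tenant_cluster_id", "$1", "name", "(.*)")
```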
@@ -25,7 +25,7 @@ spec:
annotations:
description: '{{`Cluster {{ $labels.installation }}/{{ $labels.cluster_id }}: there are too many API requests for flow schema {{ $labels.flow_schema }}.`}}'
opsrecipe: flowcontrol-rejected-requests/
- expr: sum(irate(apiserver_flowcontrol_dispatched_requests_total[1m])) by (cluster_id, priority_level) > (min by(cluster_id, priority_level) (apiserver_flowcontrol_request_concurrency_limit_overwrite) OR min by(cluster_id, priority_level) (apiserver_flowcontrol_request_concurrency_limit))
+ expr: sum(irate(apiserver_flowcontrol_dispatched_requests_total[1m])) by (cluster_id, installation, pipeline, provider, flow_schema, priority_level) > (min by(cluster_id, installation, pipeline, provider, flow_schema, priority_level) (apiserver_flowcontrol_request_concurrency_limit_overwrite) OR min by(cluster_id, installation, pipeline, provider, flow_schema, priority_level) (apiserver_flowcontrol_request_concurrency_limit))
for: 15m
labels:
area: kaas
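The flow-control fix adds `flow_schema` (which the description interpolates) plus the usual external labels to every `by` clause, so the comparison and the `OR` fallback keep matching. PromQL's `or` is itself a fallback operator, which is how the per-cluster limit overwrite is wired in; a sketch of just that part:

```promql
# `or` returns all series from the left-hand side, plus right-hand series
# whose label sets have no match on the left. Read as a fallback: use the
# overwrite where one exists, the default limit everywhere else.
min by (cluster_id, flow_schema, priority_level) (apiserver_flowcontrol_request_concurrency_limit_overwrite)
or
min by (cluster_id, flow_schema, priority_level) (apiserver_flowcontrol_request_concurrency_limit)
```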
@@ -17,7 +17,7 @@ spec:
- alert: InhibitionClusterStatusCreating
annotations:
description: '{{`Cluster {{ $labels.cluster_id }} has status condition ''Creating''.`}}'
expr: label_replace(max_over_time(statusresource_cluster_status{status="Creating"}[30m]), "cluster_id", "$1", "exported_cluster_id", "(.+)") == 1 or max_over_time(cluster_operator_cluster_status{status="Creating"}[30m]) == 1
expr: label_replace(max_over_time(statusresource_cluster_status{status="Creating"}[30m]), "cluster_id", "$1", "exported_cluster_id", "(.+)") == 1 or max_over_time(cluster_operator_cluster_status{status="Creating"}[30m]) == 1 or max_over_time(capi_cluster_status_phase{phase="Provisioning"}[30m]) == 1
labels:
area: kaas
cluster_status_creating: "true"
@@ -26,7 +26,7 @@
- alert: InhibitionClusterStatusCreated
annotations:
description: '{{`Cluster {{ $labels.cluster_id }} has status condition ''Created''.`}}'
expr: label_replace(statusresource_cluster_status{status="Created"}, "cluster_id", "$1", "exported_cluster_id", "(.+)") == 1 or cluster_operator_cluster_status{status="Created"} == 1
expr: label_replace(statusresource_cluster_status{status="Created"}, "cluster_id", "$1", "exported_cluster_id", "(.+)") == 1 or cluster_operator_cluster_status{status="Created"} == 1 or (capi_cluster_status_phase{phase="Provisioned"} == 1 and capi_cluster_status_condition{type="Ready", status="True"} == 1)
labels:
area: kaas
cluster_status_created: "true"
@@ -35,7 +35,7 @@
- alert: InhibitionClusterStatusUpdating
annotations:
description: '{{`Cluster {{ $labels.cluster_id }} has status condition ''Updating''.`}}'
expr: label_replace(statusresource_cluster_status{status="Updating"}, "cluster_id", "$1", "exported_cluster_id", "(.+)") == 1 or cluster_operator_cluster_status{status="Updating"} == 1 or label_replace(changes(statusresource_cluster_status{status="Updating"}[10m]), "cluster_id", "$1", "exported_cluster_id", "(.+)") == 1 or changes(cluster_operator_cluster_status{status="Updating"}[10m]) == 1
expr: label_replace(statusresource_cluster_status{status="Updating"}, "cluster_id", "$1", "exported_cluster_id", "(.+)") == 1 or cluster_operator_cluster_status{status="Updating"} == 1 or label_replace(changes(statusresource_cluster_status{status="Updating"}[10m]), "cluster_id", "$1", "exported_cluster_id", "(.+)") == 1 or changes(cluster_operator_cluster_status{status="Updating"}[10m]) == 1 or (capi_cluster_status_condition{type="Ready", status="False"} == 1 and (capi_kubeadmcontrolplane_status_condition{type="MachinesSpecUpToDate", status="False"} == 1 or capi_kubeadmcontrolplane_status_condition{type="MachinesReady", status="False"} == 1))
labels:
area: kaas
cluster_status_updating: "true"
@@ -44,7 +44,7 @@
- alert: InhibitionClusterStatusUpdated
annotations:
description: '{{`Cluster {{ $labels.cluster_id }} has status condition ''Updated''.`}}'
expr: label_replace(statusresource_cluster_status{status="Updated"}, "cluster_id", "$1", "exported_cluster_id", "(.+)") == 1 or cluster_operator_cluster_status{status="Updated"} == 1
expr: label_replace(statusresource_cluster_status{status="Updated"}, "cluster_id", "$1", "exported_cluster_id", "(.+)") == 1 or cluster_operator_cluster_status{status="Updated"} == 1 or (capi_cluster_status_condition{type="Ready", status="True"} == 1 and (capi_kubeadmcontrolplane_status_condition{type="MachinesSpecUpToDate", status="True"} == 1 and capi_kubeadmcontrolplane_status_condition{type="MachinesReady", status="True"} == 1))
labels:
area: kaas
cluster_status_updated: "true"
@@ -53,7 +53,7 @@
- alert: InhibitionClusterStatusDeleting
annotations:
description: '{{`Cluster {{ $labels.cluster_id }} has status condition ''Deleting''.`}}'
expr: label_replace(max_over_time(statusresource_cluster_status{status="Deleting"}[30m]), "cluster_id", "$1", "exported_cluster_id", "(.+)") == 1 or max_over_time(cluster_operator_cluster_status{status="Deleting"}[30m]) == 1
expr: label_replace(max_over_time(statusresource_cluster_status{status="Deleting"}[30m]), "cluster_id", "$1", "exported_cluster_id", "(.+)") == 1 or max_over_time(cluster_operator_cluster_status{status="Deleting"}[30m]) == 1 or max_over_time(capi_cluster_status_phase{phase="Deleting"}[30m]) == 1
labels:
area: kaas
cluster_status_deleting: "true"
@@ -71,7 +71,7 @@
- alert: InhibitionClusterScalingNodePools
annotations:
description: '{{`Cluster {{ $labels.cluster_id }} node pools are scaling.`}}'
- expr: cluster_operator_node_pool_desired_workers != cluster_operator_node_pool_ready_workers
+ expr: cluster_operator_node_pool_desired_workers != cluster_operator_node_pool_ready_workers or (capi_machinepool_status_phase{phase="ScalingUp"} == 1 or capi_machinepool_status_phase{phase="ScalingDown"} == 1)
labels:
area: kaas
cluster_with_scaling_nodepools: "true"
@@ -80,7 +80,7 @@
- alert: InhibitionClusterNodePoolsNotReady
annotations:
description: '{{`Cluster {{ $labels.cluster_id }} node pools are not ready. Either they have been scaled down to 0 or they are not up yet.`}}'
- expr: cluster_operator_node_pool_desired_workers == 0 and cluster_operator_node_pool_ready_workers == 0
+ expr: (cluster_operator_node_pool_desired_workers == 0 and cluster_operator_node_pool_ready_workers == 0) or capi_machinepool_status_condition{type="Ready", status="False"} == 1
labels:
area: kaas
cluster_with_notready_nodepools: "true"
@@ -95,4 +95,13 @@
instance_state_not_running: "true"
team: phoenix
topic: status
- alert: InhibitionControlPlaneUnhealthy
annotations:
description: '{{`Control plane of cluster {{ $labels.cluster_id }} is not healthy.`}}'
expr: capi_kubeadmcontrolplane_status_condition{type="ControlPlaneComponentsHealthy", status="False"} == 1 or capi_kubeadmcontrolplane_status_condition{type="EtcdClusterHealthy", status="False"} == 1 or capi_kubeadmcontrolplane_status_condition{type="Available", status="False"} == 1
labels:
area: kaas
cluster_control_plane_unhealthy: "true"
team: phoenix
topic: status
{{- end }}
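None of these inhibition alerts page on their own; they exist to set labels such as `cluster_status_updating` or the new `cluster_control_plane_unhealthy` that Alertmanager inhibition rules match on. A hypothetical sketch of the consuming side, assuming the repository's usual `cancel_if_*` label convention (the actual Alertmanager configuration lives elsewhere and may differ):

```yaml
# Hypothetical Alertmanager inhibit rule: while an alert carrying
# cluster_control_plane_unhealthy="true" fires, mute any alert in the
# same cluster that opted in via the matching cancel_if_* label.
inhibit_rules:
  - source_matchers:
      - cluster_control_plane_unhealthy="true"
    target_matchers:
      - cancel_if_cluster_control_plane_unhealthy="true"
    equal: ["cluster_id"]
```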
