From 68b03046a0fce4cc8283132b9af725222ac05bd9 Mon Sep 17 00:00:00 2001
From: Quentin Bisson
Date: Mon, 6 May 2024 09:49:17 +0200
Subject: [PATCH 1/3] fix: etcd and fairness related alerts for mimir (#1164)

---
 CHANGELOG.md                                          |  1 +
 .../deployment.workload-cluster.rules.yml             |  2 +-
 .../alerting-rules/etcd.management-cluster.rules.yml  | 10 +++++-----
 .../alerting-rules/etcd.workload-cluster.rules.yml    | 12 ++++++------
 .../templates/alerting-rules/etcdbackup.rules.yml     |  2 +-
 .../templates/alerting-rules/fairness.rules.yml       |  2 +-
 6 files changed, 15 insertions(+), 14 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index d27486749..27d199c36 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -14,6 +14,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 ### Fixed
 
 - Remove cilium entry from KAAS SLOs.
+- Fix etcd alerts for Mimir.
 - Add missing labels for apiserver alerts.
 
 ## [3.13.1] - 2024-04-30
diff --git a/helm/prometheus-rules/templates/alerting-rules/deployment.workload-cluster.rules.yml b/helm/prometheus-rules/templates/alerting-rules/deployment.workload-cluster.rules.yml
index bdd35da5b..e0c8f0dcb 100644
--- a/helm/prometheus-rules/templates/alerting-rules/deployment.workload-cluster.rules.yml
+++ b/helm/prometheus-rules/templates/alerting-rules/deployment.workload-cluster.rules.yml
@@ -41,7 +41,7 @@ spec:
       annotations:
         description: '{{`Deployment {{ $labels.namespace}}/{{ $labels.deployment }} is not satisfied.`}}'
         opsrecipe: workload-cluster-deployment-not-satisfied/
-      expr: label_join(kube_deployment_status_replicas_unavailable{cluster_type="workload_cluster", deployment=~"etcd-kubernetes-resources-count-exporter"}, "service", "/", "namespace", "deployment") > 0
+      expr: label_join(kube_deployment_status_replicas_unavailable{cluster_type="workload_cluster", deployment="etcd-kubernetes-resources-count-exporter"}, "service", "/", "namespace", "deployment") > 0
       for: 30m
       labels:
         area: kaas
diff --git a/helm/prometheus-rules/templates/alerting-rules/etcd.management-cluster.rules.yml b/helm/prometheus-rules/templates/alerting-rules/etcd.management-cluster.rules.yml
index c76ce0d56..09a07bfe0 100644
--- a/helm/prometheus-rules/templates/alerting-rules/etcd.management-cluster.rules.yml
+++ b/helm/prometheus-rules/templates/alerting-rules/etcd.management-cluster.rules.yml
@@ -32,7 +32,7 @@ spec:
       annotations:
         description: '{{`Etcd ({{ $labels.instance }}) has a too high commit duration.`}}'
         opsrecipe: etcd-high-commit-duration/
-      expr: histogram_quantile(0.95, rate(etcd_disk_backend_commit_duration_seconds_bucket{cluster_type="management_cluster", provider!~"eks"}[5m])) > 1.0
+      expr: histogram_quantile(0.95, rate(etcd_disk_backend_commit_duration_seconds_bucket{cluster_type="management_cluster", provider!="eks"}[5m])) > 1.0
       for: 15m
       labels:
         area: kaas
@@ -44,7 +44,7 @@ spec:
       annotations:
         description: '{{`Etcd ({{ $labels.instance }}) has a too large database.`}}'
         opsrecipe: etcd-db-size-too-large/
-      expr: (etcd_mvcc_db_total_size_in_bytes{cluster_type="management_cluster", provider!~"eks"} / etcd_server_quota_backend_bytes{cluster_type="management_cluster", provider!~"eks"}) * 100 > 80
+      expr: (etcd_mvcc_db_total_size_in_bytes{cluster_type="management_cluster", provider!="eks"} / etcd_server_quota_backend_bytes{cluster_type="management_cluster", provider!="eks"}) * 100 > 80
       for: 90m
       labels:
         area: kaas
@@ -55,7 +55,7 @@ spec:
     - alert: ManagementClusterEtcdNumberOfLeaderChangesTooHigh
       annotations:
         description: '{{`Etcd has too many leader changes.`}}'
-      expr: increase(etcd_server_leader_changes_seen_total{cluster_type="management_cluster", provider!~"eks"}[1h]) > 8
+      expr: increase(etcd_server_leader_changes_seen_total{cluster_type="management_cluster", provider!="eks"}[1h]) > 8
       labels:
         area: kaas
         cancel_if_outside_working_hours: {{ include "workingHoursOnly" . }}
@@ -66,7 +66,7 @@ spec:
       annotations:
         description: '{{`Etcd has no leader.`}}'
         opsrecipe: etcd-has-no-leader/
-      expr: etcd_server_has_leader{cluster_type="management_cluster", provider!~"eks"} == 0
+      expr: etcd_server_has_leader{cluster_type="management_cluster", provider!="eks"} == 0
       for: 5m
       labels:
         area: kaas
@@ -78,7 +78,7 @@ spec:
       annotations:
         description: '{{`Etcd metrics missing for {{ $labels.cluster_id }}.`}}'
         opsrecipe: etcd-metrics-missing/
-      expr: count(up{cluster_type="management_cluster", provider!~"eks"}) by (cluster_id) unless count(etcd_server_id{cluster_type="management_cluster", provider!~"eks"}) by (cluster_id)
+      expr: count(up{cluster_type="management_cluster", provider!="eks"}) by (cluster_id, installation, pipeline, provider) unless count(etcd_server_id{cluster_type="management_cluster", provider!="eks"}) by (cluster_id, installation, pipeline, provider)
       for: 1h
       labels:
         area: kaas
diff --git a/helm/prometheus-rules/templates/alerting-rules/etcd.workload-cluster.rules.yml b/helm/prometheus-rules/templates/alerting-rules/etcd.workload-cluster.rules.yml
index 0cb34158d..916c8abad 100644
--- a/helm/prometheus-rules/templates/alerting-rules/etcd.workload-cluster.rules.yml
+++ b/helm/prometheus-rules/templates/alerting-rules/etcd.workload-cluster.rules.yml
@@ -17,7 +17,7 @@ spec:
       annotations:
         description: '{{`Etcd ({{ $labels.instance }}) on workload cluster {{ $labels.cluster_id }} is down.`}}'
         opsrecipe: etcd-down/
-      expr: up{cluster_type="workload_cluster", app="etcd", provider!~"eks"} == 0
+      expr: up{cluster_type="workload_cluster", app="etcd", provider!="eks"} == 0
       for: 20m
       labels:
         area: kaas
@@ -33,7 +33,7 @@ spec:
       annotations:
         description: '{{`Etcd ({{ $labels.instance }}) has a too high commit duration.`}}'
         opsrecipe: etcd-high-commit-duration/
-      expr: histogram_quantile(0.95, rate(etcd_disk_backend_commit_duration_seconds_bucket{cluster_type="workload_cluster", provider!~"eks"}[5m])) > 1.0
+      expr: histogram_quantile(0.95, rate(etcd_disk_backend_commit_duration_seconds_bucket{cluster_type="workload_cluster", provider!="eks"}[5m])) > 1.0
       for: 15m
       labels:
         area: kaas
@@ -45,7 +45,7 @@ spec:
       annotations:
         description: '{{`Etcd ({{ $labels.instance }}) has a too large database.`}}'
         opsrecipe: etcd-db-size-too-large/
-      expr: (etcd_mvcc_db_total_size_in_bytes{cluster_type="workload_cluster", provider!~"eks"} / etcd_server_quota_backend_bytes{cluster_type="workload_cluster", provider!~"eks"}) * 100 > 80
+      expr: (etcd_mvcc_db_total_size_in_bytes{cluster_type="workload_cluster", provider!="eks"} / etcd_server_quota_backend_bytes{cluster_type="workload_cluster", provider!="eks"}) * 100 > 80
       for: 15m
       labels:
         area: kaas
@@ -56,7 +56,7 @@ spec:
     - alert: WorkloadClusterEtcdNumberOfLeaderChangesTooHigh
      annotations:
         description: '{{`Etcd has too many leader changes.`}}'
-      expr: increase(etcd_server_leader_changes_seen_total{cluster_type="workload_cluster", provider!~"eks"}[1h]) > 8
+      expr: increase(etcd_server_leader_changes_seen_total{cluster_type="workload_cluster", provider!="eks"}[1h]) > 8
       labels:
         area: kaas
         severity: notify
@@ -66,7 +66,7 @@ spec:
       annotations:
         description: '{{`Etcd has no leader.`}}'
         opsrecipe: etcd-has-no-leader/
-      expr: etcd_server_has_leader{cluster_type="workload_cluster", container!~"loki|promtail", provider!~"eks"} == 0
+      expr: etcd_server_has_leader{cluster_type="workload_cluster", container!~"loki|promtail", provider!="eks"} == 0
       for: 35m
       labels:
         area: kaas
@@ -78,7 +78,7 @@ spec:
       annotations:
         description: '{{`Etcd metrics missing for {{ $labels.cluster_id }}.`}}'
         opsrecipe: etcd-metrics-missing/
-      expr: count(up{cluster_type="workload_cluster", provider!~"eks"}) by (cluster_id) unless count(etcd_server_id{cluster_type="workload_cluster", provider!~"eks"}) by (cluster_id)
+      expr: count(up{cluster_type="workload_cluster", provider!="eks"}) by (cluster_id, installation, pipeline, provider) unless count(etcd_server_id{cluster_type="workload_cluster", provider!="eks"}) by (cluster_id, installation, pipeline, provider)
       for: 1h
       labels:
         area: kaas
diff --git a/helm/prometheus-rules/templates/alerting-rules/etcdbackup.rules.yml b/helm/prometheus-rules/templates/alerting-rules/etcdbackup.rules.yml
index 4f199839c..73dd77fa7 100644
--- a/helm/prometheus-rules/templates/alerting-rules/etcdbackup.rules.yml
+++ b/helm/prometheus-rules/templates/alerting-rules/etcdbackup.rules.yml
@@ -30,7 +30,7 @@ spec:
       annotations:
         description: '{{`Latest successfull ETCD backup for {{ $labels.cluster_id }}/{{ $labels.tenant_cluster_id }} was more than 48h ago.`}}'
         opsrecipe: etcd-backup-failed/
-      expr: count(label_replace(capi_cluster_created, "tenant_cluster_id", "$1", "name", "(.*)")) by (tenant_cluster_id) > 48 * 60 * 60 unless count((time() - etcd_backup_latest_success{tenant_cluster_id!="Control Plane"}) > 48 * 60 * 60) by (tenant_cluster_id)
+      expr: count(label_replace(capi_cluster_created, "tenant_cluster_id", "$1", "name", "(.*)")) by (cluster_id, installation, pipeline, provider, tenant_cluster_id) > 48 * 60 * 60 unless count((time() - etcd_backup_latest_success{tenant_cluster_id!="Control Plane"}) > 48 * 60 * 60) by (cluster_id, installation, pipeline, provider, tenant_cluster_id)
       for: 5m
       labels:
         area: kaas
diff --git a/helm/prometheus-rules/templates/alerting-rules/fairness.rules.yml b/helm/prometheus-rules/templates/alerting-rules/fairness.rules.yml
index 1b29fb629..992a942e9 100644
--- a/helm/prometheus-rules/templates/alerting-rules/fairness.rules.yml
+++ b/helm/prometheus-rules/templates/alerting-rules/fairness.rules.yml
@@ -25,7 +25,7 @@ spec:
       annotations:
         description: '{{`Cluster {{ $labels.installation }}/{{ $labels.cluster_id }}: there are too many API requests for flow schema {{ $labels.flow_schema }}.`}}'
         opsrecipe: flowcontrol-rejected-requests/
-      expr: sum(irate(apiserver_flowcontrol_dispatched_requests_total[1m])) by (cluster_id, priority_level) > (min by(cluster_id, priority_level) (apiserver_flowcontrol_request_concurrency_limit_overwrite) OR min by(cluster_id, priority_level) (apiserver_flowcontrol_request_concurrency_limit))
+      expr: sum(irate(apiserver_flowcontrol_dispatched_requests_total[1m])) by (cluster_id, installation, pipeline, provider, flow_schema, priority_level) > (min by(cluster_id, installation, pipeline, provider, flow_schema, priority_level) (apiserver_flowcontrol_request_concurrency_limit_overwrite) OR min by(cluster_id, installation, pipeline, provider, flow_schema, priority_level) (apiserver_flowcontrol_request_concurrency_limit))
       for: 15m
       labels:
         area: kaas

From 1e34b674b16c9af424d4639cecf13029fb002336 Mon Sep 17 00:00:00 2001
From: Daniel Simionato
Date: Mon, 6 May 2024 09:52:55 +0200
Subject: [PATCH 2/3] Add `cluster_control_plane_unhealthy` inhibition.
 (#1155)

Co-authored-by: Quentin Bisson
---
 CHANGELOG.md                                            | 4 ++++
 .../alerting-rules/inhibit.management-cluster.rules.yml | 9 +++++++++
 2 files changed, 13 insertions(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 27d199c36..057155e4f 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -17,6 +17,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 - Fix etcd alerts for Mimir.
 - Add missing labels for apiserver alerts.
 
+### Added
+
+- Add `cluster_control_plane_unhealthy` inhibition.
+
 ## [3.13.1] - 2024-04-30
 
 ### Removed
diff --git a/helm/prometheus-rules/templates/alerting-rules/inhibit.management-cluster.rules.yml b/helm/prometheus-rules/templates/alerting-rules/inhibit.management-cluster.rules.yml
index d23d0f156..12cf66a5f 100644
--- a/helm/prometheus-rules/templates/alerting-rules/inhibit.management-cluster.rules.yml
+++ b/helm/prometheus-rules/templates/alerting-rules/inhibit.management-cluster.rules.yml
@@ -95,4 +95,13 @@ spec:
         instance_state_not_running: "true"
         team: phoenix
         topic: status
+    - alert: InhibitionControlPlaneUnhealthy
+      annotations:
+        description: '{{`Control plane of cluster {{ $labels.cluster_id }} is not healthy.`}}'
+      expr: capi_kubeadmcontrolplane_status_condition{type="ControlPlaneComponentsHealthy", status="False"} == 1 or capi_kubeadmcontrolplane_status_condition{type="EtcdClusterHealthy", status="False"} == 1 or capi_kubeadmcontrolplane_status_condition{type="Available", status="False"} == 1
+      labels:
+        area: kaas
+        cluster_control_plane_unhealthy: "true"
+        team: phoenix
+        topic: status
 {{- end }}

From be49a73992e972ef41477cfae0e598bb1e31c2b7 Mon Sep 17 00:00:00 2001
From: Daniel Simionato
Date: Mon, 6 May 2024 09:55:30 +0200
Subject: [PATCH 3/3] Add inhibitions expressions for CAPI clusters (#1153)

* Added inhibitions expressions for CAPI clusters.

* Add parenthesis from code review
---
 CHANGELOG.md                             |  4 ++++
 .../inhibit.management-cluster.rules.yml | 14 +++++++-------
 2 files changed, 11 insertions(+), 7 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 057155e4f..466624d27 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -21,6 +21,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 - Add `cluster_control_plane_unhealthy` inhibition.
 
+### Added
+
+- Added inhibitions expressions for CAPI clusters.
+
 ## [3.13.1] - 2024-04-30
 
 ### Removed
diff --git a/helm/prometheus-rules/templates/alerting-rules/inhibit.management-cluster.rules.yml b/helm/prometheus-rules/templates/alerting-rules/inhibit.management-cluster.rules.yml
index 12cf66a5f..0bfc3fe2e 100644
--- a/helm/prometheus-rules/templates/alerting-rules/inhibit.management-cluster.rules.yml
+++ b/helm/prometheus-rules/templates/alerting-rules/inhibit.management-cluster.rules.yml
@@ -17,7 +17,7 @@ spec:
     - alert: InhibitionClusterStatusCreating
       annotations:
         description: '{{`Cluster {{ $labels.cluster_id }} has status condition ''Creating''.`}}'
-      expr: label_replace(max_over_time(statusresource_cluster_status{status="Creating"}[30m]), "cluster_id", "$1", "exported_cluster_id", "(.+)") == 1 or max_over_time(cluster_operator_cluster_status{status="Creating"}[30m]) == 1
+      expr: label_replace(max_over_time(statusresource_cluster_status{status="Creating"}[30m]), "cluster_id", "$1", "exported_cluster_id", "(.+)") == 1 or max_over_time(cluster_operator_cluster_status{status="Creating"}[30m]) == 1 or max_over_time(capi_cluster_status_phase{phase="Provisioning"}[30m]) == 1
       labels:
         area: kaas
         cluster_status_creating: "true"
@@ -26,7 +26,7 @@ spec:
     - alert: InhibitionClusterStatusCreated
       annotations:
         description: '{{`Cluster {{ $labels.cluster_id }} has status condition ''Created''.`}}'
-      expr: label_replace(statusresource_cluster_status{status="Created"}, "cluster_id", "$1", "exported_cluster_id", "(.+)") == 1 or cluster_operator_cluster_status{status="Created"} == 1
+      expr: label_replace(statusresource_cluster_status{status="Created"}, "cluster_id", "$1", "exported_cluster_id", "(.+)") == 1 or cluster_operator_cluster_status{status="Created"} == 1 or (capi_cluster_status_phase{phase="Provisioned"} == 1 and capi_cluster_status_condition{type="Ready", status="True"} == 1)
       labels:
         area: kaas
         cluster_status_created: "true"
@@ -35,7 +35,7 @@ spec:
     - alert: InhibitionClusterStatusUpdating
       annotations:
         description: '{{`Cluster {{ $labels.cluster_id }} has status condition ''Updating''.`}}'
-      expr: label_replace(statusresource_cluster_status{status="Updating"}, "cluster_id", "$1", "exported_cluster_id", "(.+)") == 1 or cluster_operator_cluster_status{status="Updating"} == 1 or label_replace(changes(statusresource_cluster_status{status="Updating"}[10m]), "cluster_id", "$1", "exported_cluster_id", "(.+)") == 1 or changes(cluster_operator_cluster_status{status="Updating"}[10m]) == 1
+      expr: label_replace(statusresource_cluster_status{status="Updating"}, "cluster_id", "$1", "exported_cluster_id", "(.+)") == 1 or cluster_operator_cluster_status{status="Updating"} == 1 or label_replace(changes(statusresource_cluster_status{status="Updating"}[10m]), "cluster_id", "$1", "exported_cluster_id", "(.+)") == 1 or changes(cluster_operator_cluster_status{status="Updating"}[10m]) == 1 or (capi_cluster_status_condition{type="Ready", status="False"} == 1 and (capi_kubeadmcontrolplane_status_condition{type="MachinesSpecUpToDate", status="False"} == 1 or capi_kubeadmcontrolplane_status_condition{type="MachinesReady", status="False"} == 1))
       labels:
         area: kaas
         cluster_status_updating: "true"
@@ -44,7 +44,7 @@ spec:
     - alert: InhibitionClusterStatusUpdated
       annotations:
         description: '{{`Cluster {{ $labels.cluster_id }} has status condition ''Updated''.`}}'
-      expr: label_replace(statusresource_cluster_status{status="Updated"}, "cluster_id", "$1", "exported_cluster_id", "(.+)") == 1 or cluster_operator_cluster_status{status="Updated"} == 1
label_replace(statusresource_cluster_status{status="Updated"}, "cluster_id", "$1", "exported_cluster_id", "(.+)") == 1 or cluster_operator_cluster_status{status="Updated"} == 1 or (capi_cluster_status_condition{type="Ready", status="True"} == 1 and (capi_kubeadmcontrolplane_status_condition{type="MachinesSpecUpToDate", status="True"} == 1 and capi_kubeadmcontrolplane_status_condition{type="MachinesReady", status="True"} == 1)) labels: area: kaas cluster_status_updated: "true" @@ -53,7 +53,7 @@ spec: - alert: InhibitionClusterStatusDeleting annotations: description: '{{`Cluster {{ $labels.cluster_id }} has status condition ''Deleting''.`}}' - expr: label_replace(max_over_time(statusresource_cluster_status{status="Deleting"}[30m]), "cluster_id", "$1", "exported_cluster_id", "(.+)") == 1 or max_over_time(cluster_operator_cluster_status{status="Deleting"}[30m]) == 1 + expr: label_replace(max_over_time(statusresource_cluster_status{status="Deleting"}[30m]), "cluster_id", "$1", "exported_cluster_id", "(.+)") == 1 or max_over_time(cluster_operator_cluster_status{status="Deleting"}[30m]) == 1 or max_over_time(capi_cluster_status_phase{phase="Deleting"}[30m]) == 1 labels: area: kaas cluster_status_deleting: "true" @@ -71,7 +71,7 @@ spec: - alert: InhibitionClusterScalingNodePools annotations: description: '{{`Cluster {{ $labels.cluster_id }} node pools are scaling.`}}' - expr: cluster_operator_node_pool_desired_workers != cluster_operator_node_pool_ready_workers + expr: cluster_operator_node_pool_desired_workers != cluster_operator_node_pool_ready_workers or (capi_machinepool_status_phase{phase="ScalingUp"} == 1 or capi_machinepool_status_phase{phase="ScalingDown"} == 1) labels: area: kaas cluster_with_scaling_nodepools: "true" @@ -80,7 +80,7 @@ spec: - alert: InhibitionClusterNodePoolsNotReady annotations: description: '{{`Cluster {{ $labels.cluster_id }} node pools are not ready. Either they have been scaled down to 0 or they are not up yet.`}}' - expr: cluster_operator_node_pool_desired_workers == 0 and cluster_operator_node_pool_ready_workers == 0 + expr: (cluster_operator_node_pool_desired_workers == 0 and cluster_operator_node_pool_ready_workers == 0) or capi_machinepool_status_condition{type="Ready", status="False"} == 1 labels: area: kaas cluster_with_notready_nodepools: "true"