From 5e8fe73449927b2c039e38969baea21948ca40fb Mon Sep 17 00:00:00 2001
From: QuantumEnigmaa
Date: Tue, 19 Sep 2023 11:02:55 +0200
Subject: [PATCH] split ksm alerts in 2 separate ones

Move the `count({app="kube-state-metrics"}) < 10` clause out of
KubeStateMetricsDown into a new KubeStateMetricsNotRetrievingMetrics
alert that uses `for: 60m` instead of `for: 15m`.

The new alert's expression is a count() aggregation without a `by`
clause, so its result carries no labels; the description therefore does
not reference $labels.instance.

---
 CHANGELOG.md                                  |  4 +++
 .../templates/alerting-rules/up.all.rules.yml | 26 +++++++++++++++----
 2 files changed, 25 insertions(+), 5 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 3fda50a2b..646189fec 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -7,6 +7,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## [Unreleased]
 
+### Changed
+
+- Split `KubeStateMetricsDown` alert into 2 alerts: `KubeStateMetricsDown` and `KubeStateMetricsNotRetrievingMetrics`
+
 ## [2.132.0] - 2023-09-15
 
 ### Changed
diff --git a/helm/prometheus-rules/templates/alerting-rules/up.all.rules.yml b/helm/prometheus-rules/templates/alerting-rules/up.all.rules.yml
index 0ab25784b..de8abc1dd 100644
--- a/helm/prometheus-rules/templates/alerting-rules/up.all.rules.yml
+++ b/helm/prometheus-rules/templates/alerting-rules/up.all.rules.yml
@@ -58,11 +58,6 @@ spec:
           # vintage clusters without servicemonitor
           label_replace(up{app="kube-state-metrics",container=""}, "ip", "$1.$2.$3.$4", "node", "ip-(\\d+)-(\\d+)-(\\d+)-(\\d+).*") == 0 or absent(up{app="kube-state-metrics",container=""} == 1)
         )
-        or
-        (
-          # When it looks up but we don't have metrics
-          count({app="kube-state-metrics"}) < 10
-        )
       for: 15m
       labels:
         area: kaas
@@ -76,3 +71,24 @@ spec:
         severity: page
         team: atlas
         topic: observability
+
+    - alert: KubeStateMetricsNotRetrievingMetrics
+      annotations:
+        description: '{{`KubeStateMetrics is not retrieving metrics.`}}'
+        opsrecipe: kube-state-metrics-down/
+      expr: |-
+        # It looks up, but fewer than 10 series in total carry app="kube-state-metrics" (result has no labels)
+        count({app="kube-state-metrics"}) < 10
+      for: 60m
+      labels:
+        area: kaas
+        cancel_if_apiserver_down: "true"
+        cancel_if_cluster_status_creating: "true"
+        cancel_if_cluster_status_deleting: "true"
+        cancel_if_cluster_has_no_workers: "true"
+        inhibit_kube_state_metrics_down: "true"
+        cancel_if_kubelet_down: "true"
+        cancel_if_outside_working_hours: "false"
+        severity: page
+        team: atlas
+        topic: observability