Move the KubeStateMetricsDown alert from up.all.rules.yml into
kube-state-metrics.rules.yml and split it into two alerts:

  * KubeStateMetricsDown keeps the two `up`-based clauses ("modern" and
    "vintage" clusters, joined with `and`), still with `for: 15m`.
  * KubeStateMetricsNotRetrievingMetrics takes over the former
    `or count({app="kube-state-metrics"}) < 10` clause, with `for: 20m`,
    `cancel_if_outside_working_hours: "true"` and the additional
    `cancel_if_kube_state_metrics_down: "true"` label.

The unit-test file is renamed to follow the rule file it now loads.

Review changes relative to the patch as submitted:

  * The KubeStateMetricsNotRetrievingMetrics description no longer
    interpolates $labels.instance. `count(...)` without a `by`/`without`
    clause yields a single series with no labels, so that placeholder always
    rendered as an empty string. The expression itself is unchanged.
  * Removed the stray space before the colon in the CHANGELOG entry.

Both are in-line edits of added lines, so the hunk line counts are unchanged.
The `index` blob hashes are carried over from the submitted patch and do not
reflect these two edits.

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 79037e0df..66218a9d1 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -7,6 +7,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## [Unreleased]
 
+### Changed
+
+- Split `KubeStateMetricsDown` alert into 2 alerts: `KubeStateMetricsDown` and `KubeStateMetricsNotRetrievingMetrics`
+
 ## [2.133.0] - 2023-09-19
 
 ### Changed
diff --git a/helm/prometheus-rules/templates/alerting-rules/kube-state-metrics.rules.yml b/helm/prometheus-rules/templates/alerting-rules/kube-state-metrics.rules.yml
index 39a2fd571..e635ae988 100644
--- a/helm/prometheus-rules/templates/alerting-rules/kube-state-metrics.rules.yml
+++ b/helm/prometheus-rules/templates/alerting-rules/kube-state-metrics.rules.yml
@@ -10,6 +10,34 @@ spec:
   groups:
   - name: kube-state-metrics
     rules:
+    - alert: KubeStateMetricsDown
+      annotations:
+        description: '{{`KubeStateMetrics ({{ $labels.instance }}) is down.`}}'
+        opsrecipe: kube-state-metrics-down/
+      expr: |-
+        (
+          # modern clusters
+          label_replace(up{app="kube-state-metrics",instance=~".*:8080"}, "ip", "$1.$2.$3.$4", "node", "ip-(\\d+)-(\\d+)-(\\d+)-(\\d+).*") == 0 or absent(up{app="kube-state-metrics",instance=~".*:8080"} == 1)
+        )
+        and
+        (
+          # vintage clusters without servicemonitor
+          label_replace(up{app="kube-state-metrics",container=""}, "ip", "$1.$2.$3.$4", "node", "ip-(\\d+)-(\\d+)-(\\d+)-(\\d+).*") == 0 or absent(up{app="kube-state-metrics",container=""} == 1)
+        )
+      for: 15m
+      labels:
+        area: kaas
+        cancel_if_apiserver_down: "true"
+        cancel_if_cluster_status_creating: "true"
+        cancel_if_cluster_status_deleting: "true"
+        cancel_if_cluster_has_no_workers: "true"
+        inhibit_kube_state_metrics_down: "true"
+        cancel_if_prometheus_agent_down: "true"
+        cancel_if_kubelet_down: "true"
+        cancel_if_outside_working_hours: "false"
+        severity: page
+        team: atlas
+        topic: observability
     - alert: KubeStateMetricsSlow
       annotations:
         description: '{{`KubeStateMetrics ({{ $labels.instance }}) is too slow.`}}'
@@ -28,6 +56,27 @@ spec:
         severity: page
         team: atlas
         topic: observability
+    - alert: KubeStateMetricsNotRetrievingMetrics
+      annotations:
+        description: '{{`KubeStateMetrics is not retrieving metrics.`}}'
+        opsrecipe: kube-state-metrics-down/
+      expr: |-
+        # When it looks up but we don't have metrics
+        count({app="kube-state-metrics"}) < 10
+      for: 20m
+      labels:
+        area: kaas
+        cancel_if_apiserver_down: "true"
+        cancel_if_cluster_status_creating: "true"
+        cancel_if_cluster_status_deleting: "true"
+        cancel_if_cluster_has_no_workers: "true"
+        inhibit_kube_state_metrics_down: "true"
+        cancel_if_kubelet_down: "true"
+        cancel_if_kube_state_metrics_down: "true"
+        cancel_if_outside_working_hours: "true"
+        severity: page
+        team: atlas
+        topic: observability
     - alert: KubeConfigMapCreatedMetricMissing
       annotations:
         description: '{{`kube_configmap_created metric is missing for cluster {{ $labels.cluster_id }}.`}}'
diff --git a/helm/prometheus-rules/templates/alerting-rules/up.all.rules.yml b/helm/prometheus-rules/templates/alerting-rules/up.all.rules.yml
index e2990b11a..edc29786e 100644
--- a/helm/prometheus-rules/templates/alerting-rules/up.all.rules.yml
+++ b/helm/prometheus-rules/templates/alerting-rules/up.all.rules.yml
@@ -46,36 +46,3 @@ spec:
         severity: page
         team: atlas
         topic: observability
-    - alert: KubeStateMetricsDown
-      annotations:
-        description: '{{`KubeStateMetrics ({{ $labels.instance }}) is down.`}}'
-        opsrecipe: kube-state-metrics-down/
-      expr: |-
-        (
-          # modern clusters
-          label_replace(up{app="kube-state-metrics",instance=~".*:8080"}, "ip", "$1.$2.$3.$4", "node", "ip-(\\d+)-(\\d+)-(\\d+)-(\\d+).*") == 0 or absent(up{app="kube-state-metrics",instance=~".*:8080"} == 1)
-        )
-        and
-        (
-          # vintage clusters without servicemonitor
-          label_replace(up{app="kube-state-metrics",container=""}, "ip", "$1.$2.$3.$4", "node", "ip-(\\d+)-(\\d+)-(\\d+)-(\\d+).*") == 0 or absent(up{app="kube-state-metrics",container=""} == 1)
-        )
-        or
-        (
-          # When it looks up but we don't have metrics
-          count({app="kube-state-metrics"}) < 10
-        )
-      for: 15m
-      labels:
-        area: kaas
-        cancel_if_apiserver_down: "true"
-        cancel_if_cluster_status_creating: "true"
-        cancel_if_cluster_status_deleting: "true"
-        cancel_if_cluster_has_no_workers: "true"
-        inhibit_kube_state_metrics_down: "true"
-        cancel_if_kubelet_down: "true"
-        cancel_if_outside_working_hours: "false"
-        cancel_if_prometheus_agent_down: "true"
-        severity: page
-        team: atlas
-        topic: observability
diff --git a/test/tests/providers/global/up.all.rules.test.yml b/test/tests/providers/global/kube-state-metrics.rules.test.yml
similarity index 99%
rename from test/tests/providers/global/up.all.rules.test.yml
rename to test/tests/providers/global/kube-state-metrics.rules.test.yml
index ca4d0fbe2..8f5891193 100644
--- a/test/tests/providers/global/up.all.rules.test.yml
+++ b/test/tests/providers/global/kube-state-metrics.rules.test.yml
@@ -1,6 +1,6 @@
 ---
 rule_files:
-- up.all.rules.yml
+- kube-state-metrics.rules.yml
 
 tests:
   # KubeStateMetricsDown tests