Skip to content

Commit

Permalink
split ksm alerts in 2 separate ones (#912)
Browse files Browse the repository at this point in the history
* split ksm alerts in 2 separate ones

* move new alert to the adequate file

* moved KSMDown alert to the adequate file

* fix rules

* fix chart

* minor fixes

* lowered the time to trigger to 20m

* changelog
  • Loading branch information
QuantumEnigmaa authored Sep 21, 2023
1 parent 42721e2 commit 47fcf66
Show file tree
Hide file tree
Showing 4 changed files with 54 additions and 34 deletions.
4 changes: 4 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

## [Unreleased]

### Changed

- Split `KubeStateMetricsDown` alert into 2 alerts: `KubeStateMetricsDown` and `KubeStateMetricsNotRetrievingMetrics`

## [2.133.0] - 2023-09-19

### Changed
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,34 @@ spec:
groups:
- name: kube-state-metrics
rules:
# KubeStateMetricsDown: fires when the kube-state-metrics scrape target has been
# down, or no healthy target has existed, for 15 minutes (see `for` below).
- alert: KubeStateMetricsDown
annotations:
description: '{{`KubeStateMetrics ({{ $labels.instance }}) is down.`}}'
opsrecipe: kube-state-metrics-down/
# The expression is two parenthesised checks joined with `and`, so BOTH must hold:
#   1. targets whose instance ends in ":8080" ("modern clusters"),
#   2. targets with an empty `container` label ("vintage clusters without servicemonitor").
# Each check matches either an `up` series equal to 0, or - via absent() - the
# case where no `up` series equal to 1 exists for that selector.
# label_replace() builds an `ip` label ("A.B.C.D") from a `node` label shaped like
# "ip-A-B-C-D..."; series whose `node` label does not match the regex are returned unchanged.
# NOTE(review): the absent() branch yields a series without an `instance` label,
# so `{{ $labels.instance }}` in the description is presumably empty in that case - confirm.
expr: |-
(
# modern clusters
label_replace(up{app="kube-state-metrics",instance=~".*:8080"}, "ip", "$1.$2.$3.$4", "node", "ip-(\\d+)-(\\d+)-(\\d+)-(\\d+).*") == 0 or absent(up{app="kube-state-metrics",instance=~".*:8080"} == 1)
)
and
(
# vintage clusters without servicemonitor
label_replace(up{app="kube-state-metrics",container=""}, "ip", "$1.$2.$3.$4", "node", "ip-(\\d+)-(\\d+)-(\\d+)-(\\d+).*") == 0 or absent(up{app="kube-state-metrics",container=""} == 1)
)
for: 15m
# Routing / inhibition labels. All values are quoted so they stay strings.
# NOTE(review): the `cancel_if_*` and `inhibit_*` labels are presumably matched by
# Alertmanager inhibition rules defined outside this file - confirm their semantics there.
# NOTE(review): `cancel_if_outside_working_hours: "false"` presumably means this
# alert pages around the clock - confirm.
labels:
area: kaas
cancel_if_apiserver_down: "true"
cancel_if_cluster_status_creating: "true"
cancel_if_cluster_status_deleting: "true"
cancel_if_cluster_has_no_workers: "true"
inhibit_kube_state_metrics_down: "true"
cancel_if_prometheus_agent_down: "true"
cancel_if_kubelet_down: "true"
cancel_if_outside_working_hours: "false"
severity: page
team: atlas
topic: observability
- alert: KubeStateMetricsSlow
annotations:
description: '{{`KubeStateMetrics ({{ $labels.instance }}) is too slow.`}}'
Expand All @@ -28,6 +56,27 @@ spec:
severity: page
team: atlas
topic: observability
# KubeStateMetricsNotRetrievingMetrics: fires when fewer than 10 series carrying
# app="kube-state-metrics" have existed for 20 minutes, i.e. the exporter appears
# to be up but exposes almost no metrics.
- alert: KubeStateMetricsNotRetrievingMetrics
annotations:
# NOTE(review): the expression below uses count() without a `by` clause, which
# drops every label from the result; `{{ $labels.instance }}` will therefore
# render as an empty string in this description - confirm whether that is intended.
description: '{{`KubeStateMetrics ({{ $labels.instance }}) is not retrieving metrics.`}}'
opsrecipe: kube-state-metrics-down/
# The selector has no metric name, so it counts ALL series with the
# app="kube-state-metrics" label, whatever the metric.
# NOTE(review): count() over an empty selection returns no sample at all (not 0),
# so this rule cannot fire when zero such series exist; that case is presumably
# left to a separate "down" alert - confirm.
expr: |-
# When it looks up but we don't have metrics
count({app="kube-state-metrics"}) < 10
for: 20m
# Routing / inhibition labels. All values are quoted so they stay strings.
# NOTE(review): the `cancel_if_*` and `inhibit_*` labels are presumably matched by
# Alertmanager inhibition rules defined outside this file - confirm their semantics there.
# NOTE(review): this alert carries both `inhibit_kube_state_metrics_down` and
# `cancel_if_kube_state_metrics_down`, i.e. it looks like both a source and a
# target of the same inhibition - confirm this is intentional.
# NOTE(review): `cancel_if_outside_working_hours: "true"` presumably restricts
# paging to working hours - confirm.
labels:
area: kaas
cancel_if_apiserver_down: "true"
cancel_if_cluster_status_creating: "true"
cancel_if_cluster_status_deleting: "true"
cancel_if_cluster_has_no_workers: "true"
inhibit_kube_state_metrics_down: "true"
cancel_if_kubelet_down: "true"
cancel_if_kube_state_metrics_down: "true"
cancel_if_outside_working_hours: "true"
severity: page
team: atlas
topic: observability
- alert: KubeConfigMapCreatedMetricMissing
annotations:
description: '{{`kube_configmap_created metric is missing for cluster {{ $labels.cluster_id }}.`}}'
Expand Down
33 changes: 0 additions & 33 deletions helm/prometheus-rules/templates/alerting-rules/up.all.rules.yml
Original file line number Diff line number Diff line change
Expand Up @@ -46,36 +46,3 @@ spec:
severity: page
team: atlas
topic: observability
# KubeStateMetricsDown (combined form): fires after 15 minutes when the
# kube-state-metrics target is down / has no healthy `up` series, OR when fewer
# than 10 series with app="kube-state-metrics" exist.
- alert: KubeStateMetricsDown
annotations:
description: '{{`KubeStateMetrics ({{ $labels.instance }}) is down.`}}'
opsrecipe: kube-state-metrics-down/
# Operator precedence: in PromQL `and` binds tighter than `or`, so the three
# parenthesised groups evaluate as (modern and vintage) or (count < 10).
# label_replace() builds an `ip` label ("A.B.C.D") from a `node` label shaped like
# "ip-A-B-C-D..."; absent() covers the case where no `up` series equal to 1 exists.
# NOTE(review): the count() branch has no `by` clause and so returns a series with
# no labels; `{{ $labels.instance }}` is presumably empty when that branch fires - confirm.
expr: |-
(
# modern clusters
label_replace(up{app="kube-state-metrics",instance=~".*:8080"}, "ip", "$1.$2.$3.$4", "node", "ip-(\\d+)-(\\d+)-(\\d+)-(\\d+).*") == 0 or absent(up{app="kube-state-metrics",instance=~".*:8080"} == 1)
)
and
(
# vintage clusters without servicemonitor
label_replace(up{app="kube-state-metrics",container=""}, "ip", "$1.$2.$3.$4", "node", "ip-(\\d+)-(\\d+)-(\\d+)-(\\d+).*") == 0 or absent(up{app="kube-state-metrics",container=""} == 1)
)
or
(
# When it looks up but we don't have metrics
count({app="kube-state-metrics"}) < 10
)
for: 15m
# Routing / inhibition labels. All values are quoted so they stay strings.
# NOTE(review): the `cancel_if_*` and `inhibit_*` labels are presumably matched by
# Alertmanager inhibition rules defined outside this file - confirm their semantics there.
labels:
area: kaas
cancel_if_apiserver_down: "true"
cancel_if_cluster_status_creating: "true"
cancel_if_cluster_status_deleting: "true"
cancel_if_cluster_has_no_workers: "true"
inhibit_kube_state_metrics_down: "true"
cancel_if_kubelet_down: "true"
cancel_if_outside_working_hours: "false"
cancel_if_prometheus_agent_down: "true"
severity: page
team: atlas
topic: observability
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
---
rule_files:
- up.all.rules.yml
- kube-state-metrics.rules.yml

tests:
# KubeStateMetricsDown tests
Expand Down

0 comments on commit 47fcf66

Please sign in to comment.