diff --git a/helm/prometheus-rules/templates/alerting-rules/kube-state-metrics.rules.yml b/helm/prometheus-rules/templates/alerting-rules/kube-state-metrics.rules.yml index 39a2fd571..8edc2cded 100644 --- a/helm/prometheus-rules/templates/alerting-rules/kube-state-metrics.rules.yml +++ b/helm/prometheus-rules/templates/alerting-rules/kube-state-metrics.rules.yml @@ -28,6 +28,27 @@ spec: severity: page team: atlas topic: observability + - alert: KubeStateMetricsNotRetrievingMetrics + annotations: + description: '{{`KubeStateMetrics ({{ $labels.instance }}) is not retrieving metrics.`}}' + opsrecipe: kube-state-metrics-down/ + expr: |- + # When it looks up but we don't have metrics + count({app="kube-state-metrics"}) < 10 + for: 60m + labels: + area: kaas + cancel_if_apiserver_down: "true" + cancel_if_cluster_status_creating: "true" + cancel_if_cluster_status_deleting: "true" + cancel_if_cluster_has_no_workers: "true" + inhibit_kube_state_metrics_down: "true" + cancel_if_kubelet_down: "true" + cancel_if_kube_state_metrics_down: "true" + cancel_if_outside_working_hours: "false" + severity: page + team: atlas + topic: observability - alert: KubeConfigMapCreatedMetricMissing annotations: description: '{{`kube_configmap_created metric is missing for cluster {{ $labels.cluster_id }}.`}}' diff --git a/helm/prometheus-rules/templates/alerting-rules/up.all.rules.yml b/helm/prometheus-rules/templates/alerting-rules/up.all.rules.yml index de8abc1dd..5cf4e056a 100644 --- a/helm/prometheus-rules/templates/alerting-rules/up.all.rules.yml +++ b/helm/prometheus-rules/templates/alerting-rules/up.all.rules.yml @@ -72,23 +72,3 @@ spec: team: atlas topic: observability - - alert: KubeStateMetricsNotRetrievingMetrics - annotations: - description: '{{`KubeStateMetrics ({{ $labels.instance }}) is not retrieving metrics.`}}' - opsrecipe: kube-state-metrics-down/ - expr: |- - # When it looks up but we don't have metrics - count({app="kube-state-metrics"}) < 10 - for: 60m - labels: - area: kaas - cancel_if_apiserver_down: "true" - cancel_if_cluster_status_creating: "true" - cancel_if_cluster_status_deleting: "true" - cancel_if_cluster_has_no_workers: "true" - inhibit_kube_state_metrics_down: "true" - cancel_if_kubelet_down: "true" - cancel_if_outside_working_hours: "false" - severity: page - team: atlas - topic: observability