From 737a5e989ff13a86f868eb825ea0e7d71e8842fd Mon Sep 17 00:00:00 2001 From: Quentin Bisson Date: Tue, 19 Sep 2023 11:04:26 +0200 Subject: [PATCH] Add missing prometheusagentfailing inhibition (#911) --- CHANGELOG.md | 6 +++++- .../alerting-rules/aws.management-cluster.rules.yml | 2 +- .../templates/alerting-rules/up.all.rules.yml | 3 +++ .../templates/alerting-rules/vault.rules.yml | 1 + test/tests/providers/global/up.all.rules.test.yml | 5 +++++ 5 files changed, 15 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 3fda50a2b..b6d93a9ea 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,11 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +### Changed + +- Add missing prometheus-agent inhibition to `KubeStateMetricsDown` alert +- Change time duration before `ManagementClusterDeploymentMissingAWS` pages because it is dependent on the `PrometheusAgentFailing` alert. + ## [2.132.0] - 2023-09-15 ### Changed @@ -168,7 +173,6 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [2.115.0] - 2023-07-20 - ### Added - New alert `KubeStateMetricsSlow` that inhibits KSM related alerts. 
diff --git a/helm/prometheus-rules/templates/alerting-rules/aws.management-cluster.rules.yml b/helm/prometheus-rules/templates/alerting-rules/aws.management-cluster.rules.yml index f25741587..956f72321 100644 --- a/helm/prometheus-rules/templates/alerting-rules/aws.management-cluster.rules.yml +++ b/helm/prometheus-rules/templates/alerting-rules/aws.management-cluster.rules.yml @@ -149,7 +149,7 @@ spec: description: '{{`Deployment {{ $labels.deployment }} is missing.`}}' opsrecipe: management-cluster-deployment-is-missing/ expr: absent(kube_deployment_status_condition{namespace="giantswarm", condition="Available", deployment="aws-admission-controller"}) - for: 5m + for: 15m labels: area: kaas cancel_if_prometheus_agent_down: "true" diff --git a/helm/prometheus-rules/templates/alerting-rules/up.all.rules.yml b/helm/prometheus-rules/templates/alerting-rules/up.all.rules.yml index 0ab25784b..e2990b11a 100644 --- a/helm/prometheus-rules/templates/alerting-rules/up.all.rules.yml +++ b/helm/prometheus-rules/templates/alerting-rules/up.all.rules.yml @@ -25,6 +25,7 @@ spec: cancel_if_kubelet_down: "true" cancel_if_cluster_has_no_workers: "true" cancel_if_outside_working_hours: {{ include "workingHoursOnly" . 
}} + cancel_if_prometheus_agent_down: "true" severity: notify team: honeybadger topic: releng @@ -41,6 +42,7 @@ spec: cancel_if_kubelet_down: "true" cancel_if_cluster_has_no_workers: "true" cancel_if_outside_working_hours: "true" + cancel_if_prometheus_agent_down: "true" severity: page team: atlas topic: observability @@ -73,6 +75,7 @@ spec: inhibit_kube_state_metrics_down: "true" cancel_if_kubelet_down: "true" cancel_if_outside_working_hours: "false" + cancel_if_prometheus_agent_down: "true" severity: page team: atlas topic: observability diff --git a/helm/prometheus-rules/templates/alerting-rules/vault.rules.yml b/helm/prometheus-rules/templates/alerting-rules/vault.rules.yml index 13cd2a260..1707c4360 100644 --- a/helm/prometheus-rules/templates/alerting-rules/vault.rules.yml +++ b/helm/prometheus-rules/templates/alerting-rules/vault.rules.yml @@ -57,6 +57,7 @@ spec: labels: area: kaas cancel_if_outside_working_hours: {{ include "workingHoursOnly" . }} + cancel_if_prometheus_agent_down: "true" severity: page team: {{ include "providerTeam" . 
}} topic: vault diff --git a/test/tests/providers/global/up.all.rules.test.yml b/test/tests/providers/global/up.all.rules.test.yml index 88ed88926..ca4d0fbe2 100644 --- a/test/tests/providers/global/up.all.rules.test.yml +++ b/test/tests/providers/global/up.all.rules.test.yml @@ -57,6 +57,7 @@ tests: cancel_if_kubelet_down: "true" cancel_if_outside_working_hours: "false" inhibit_kube_state_metrics_down: "true" + cancel_if_prometheus_agent_down: "true" severity: "page" team: "atlas" topic: "observability" @@ -79,6 +80,7 @@ tests: cancel_if_kubelet_down: "true" cancel_if_outside_working_hours: "false" inhibit_kube_state_metrics_down: "true" + cancel_if_prometheus_agent_down: "true" severity: "page" team: "atlas" topic: "observability" @@ -107,6 +109,7 @@ tests: cancel_if_kubelet_down: "true" cancel_if_outside_working_hours: "false" inhibit_kube_state_metrics_down: "true" + cancel_if_prometheus_agent_down: "true" severity: "page" team: "atlas" topic: "observability" @@ -160,6 +163,7 @@ tests: cancel_if_kubelet_down: "true" cancel_if_outside_working_hours: "false" inhibit_kube_state_metrics_down: "true" + cancel_if_prometheus_agent_down: "true" severity: "page" team: "atlas" topic: "observability" @@ -182,6 +186,7 @@ tests: cancel_if_kubelet_down: "true" cancel_if_outside_working_hours: "false" inhibit_kube_state_metrics_down: "true" + cancel_if_prometheus_agent_down: "true" severity: "page" team: "atlas" topic: "observability"