From a10d0e54e59977d893b502de1d43aec96465ab39 Mon Sep 17 00:00:00 2001
From: Giovanni Tirloni
Date: Thu, 4 Jul 2024 13:24:33 -0300
Subject: [PATCH] ceph: Add CephHealthDetail alerts (#1495)

Appends new CephHealthDetail* alerts and disables redundant alerts
from ceph-mixin.
---
 .../files/jsonnet/mixins.libsonnet            | 39 ++++++++++++++++++-
 1 file changed, 38 insertions(+), 1 deletion(-)

diff --git a/roles/kube_prometheus_stack/files/jsonnet/mixins.libsonnet b/roles/kube_prometheus_stack/files/jsonnet/mixins.libsonnet
index cd88e7ca2..7c8c45eb4 100644
--- a/roles/kube_prometheus_stack/files/jsonnet/mixins.libsonnet
+++ b/roles/kube_prometheus_stack/files/jsonnet/mixins.libsonnet
@@ -8,6 +8,10 @@ local disabledAlerts = [
   // * Dropped `CephNodeNetworkPacketDrops` due to noisy alerts with
   //   no actionable items to fix it.
   'CephNodeNetworkPacketDrops',
+
+  // Superseded by CephHealthDetail* alerts
+  'CephHealthWarning',
+  'CephHealthError',
 ];
 
 // NOTE(mnaser): This is the default mapping for severities:
@@ -55,7 +59,40 @@ local mixins = {
       alertmanagerClusterLabels: 'namespace,service,cluster',
     },
   },
-  ceph: (import 'vendor/github.com/ceph/ceph/monitoring/ceph-mixin/mixin.libsonnet'),
+  ceph: (import 'vendor/github.com/ceph/ceph/monitoring/ceph-mixin/mixin.libsonnet') + {
+    prometheusAlerts+:: {
+      groups+: [
+        // Per-check alerts driven by `ceph_health_detail`; the failing check
+        // is exposed to the annotations through `$labels.name`.
+        {
+          name: 'cluster health detail',
+          rules: [
+            {
+              alert: 'CephHealthDetailError',
+              'for': '5m',
+              // Ceph reports the error severity as `HEALTH_ERR`.
+              expr: 'ceph_health_detail{severity="HEALTH_ERR"} == 1',
+              labels: { severity: 'critical' },
+              annotations: {
+                summary: 'Ceph is in the ERROR state',
+                description: "Health check {{ $labels.name }} has been HEALTH_ERR for more than 5 minutes. Please check 'ceph health detail' for more information.",
+              },
+            },
+            {
+              alert: 'CephHealthDetailWarning',
+              'for': '15m',
+              expr: 'ceph_health_detail{severity="HEALTH_WARN"} == 1',
+              labels: { severity: 'warning' },
+              annotations: {
+                summary: 'Ceph is in the WARNING state',
+                description: "Health check {{ $labels.name }} has been HEALTH_WARN for more than 15 minutes. Please check 'ceph health detail' for more information.",
+              },
+            },
+          ],
+        },
+      ],
+    },
+  },
   coredns: (import 'vendor/github.com/povilasv/coredns-mixin/mixin.libsonnet') + {
     _config+:: {
       corednsSelector: 'job="coredns"',