From 1e4e334e892c906f89720d360de8038e8a72d0b0 Mon Sep 17 00:00:00 2001
From: Manish
Date: Wed, 6 Dec 2023 19:08:59 +0530
Subject: [PATCH] Alert user if ceph metadata server is consuming cpu at threshold point.

Signed-off-by: Manish
---
Notes (not part of the commit message; text here is ignored by `git am`):
  - Adds the MDSCPUUsageHigh warning alert. It fires for pods matching
    rook-ceph-mds.* when pod:container_cpu_usage:sum divided by
    kube_pod_resource_limit{resource='cpu'} stays above 0.67 for 6h.
  - The rule is defined in metrics/mixin/alerts/perf.libsonnet and the same
    rule is added to metrics/deploy/prometheus-ocs-rules.yaml. The pod
    selector and the "for" duration come from the two new keys added to
    metrics/mixin/config.libsonnet.
  - NOTE(review): this patch was restored from a copy whose line breaks and
    leading whitespace had been collapsed. Indentation of the hunk lines is
    reconstructed -- confirm against the target files, or apply with
    `git apply --ignore-whitespace`.

 metrics/deploy/prometheus-ocs-rules.yaml | 13 +++++++++++++
 metrics/mixin/alerts/perf.libsonnet      | 16 ++++++++++++++++
 metrics/mixin/config.libsonnet           |  2 ++
 3 files changed, 31 insertions(+)

diff --git a/metrics/deploy/prometheus-ocs-rules.yaml b/metrics/deploy/prometheus-ocs-rules.yaml
index de6d342a66..3471044450 100644
--- a/metrics/deploy/prometheus-ocs-rules.yaml
+++ b/metrics/deploy/prometheus-ocs-rules.yaml
@@ -286,3 +286,16 @@ spec:
       for: 15m
       labels:
         severity: warning
+    - alert: MDSCPUUsageHigh
+      annotations:
+        description: |-
+          Ceph metadata server pod ({{ $labels.pod }}) has high cpu usage.
+          Please consider increasing the number of active metadata servers,
+          it can be done by increasing the number of activeMetadataServers parameter in the StorageCluster CR.
+        message: Ceph metadata server pod ({{ $labels.pod }}) has high cpu usage
+        severity_level: warning
+      expr: |
+        pod:container_cpu_usage:sum{pod=~"rook-ceph-mds.*"}/ on(pod) kube_pod_resource_limit{resource='cpu',pod=~"rook-ceph-mds.*"} > 0.67
+      for: 6h
+      labels:
+        severity: warning
diff --git a/metrics/mixin/alerts/perf.libsonnet b/metrics/mixin/alerts/perf.libsonnet
index 3f1d4de6f3..ae385de236 100644
--- a/metrics/mixin/alerts/perf.libsonnet
+++ b/metrics/mixin/alerts/perf.libsonnet
@@ -34,6 +34,22 @@
               severity_level: 'warning',
             },
           },
+
+          {
+            alert: 'MDSCPUUsageHigh',
+            expr: |||
+              pod:container_cpu_usage:sum{%(mdsSelector)s}/ on(pod) kube_pod_resource_limit{resource='cpu',%(mdsSelector)s} > 0.67
+            ||| % $._config,
+            'for': $._config.mds_cpu_usage_high_threshold_duration,
+            labels: {
+              severity: 'warning',
+            },
+            annotations: {
+              message: 'Ceph metadata server pod ({{ $labels.pod }}) has high cpu usage',
+              description: 'Ceph metadata server pod ({{ $labels.pod }}) has high cpu usage.\nPlease consider increasing the number of active metadata servers,\nit can be done by increasing the number of activeMetadataServers parameter in the StorageCluster CR.',
+              severity_level: 'warning',
+            },
+          },
         ],
       },
     ],
diff --git a/metrics/mixin/config.libsonnet b/metrics/mixin/config.libsonnet
index bcfafcb9be..2883e43b55 100644
--- a/metrics/mixin/config.libsonnet
+++ b/metrics/mixin/config.libsonnet
@@ -16,6 +16,8 @@
     ocsStorageClusterKMSConnectionAlert: '5s',
     mdsCacheUsageAlertTime: '5m',
     osdCPULoadHighAlertTime: '15m',
+    mdsSelector: 'pod=~"rook-ceph-mds.*"',
+    mds_cpu_usage_high_threshold_duration: '6h',
 
     // Constants
     objectStorageType: 'RGW',