From a2f386dfec4c4e8d0ad90cd48ff84c6a6eed6c8b Mon Sep 17 00:00:00 2001
From: Manish
Date: Wed, 6 Dec 2023 19:08:59 +0530
Subject: [PATCH] Alert user if ceph metadata server is consuming cpu at threshold point.

Signed-off-by: Manish
---
Review notes (text between the '---' line and the first 'diff --git' line is
not part of the commit message):

* What the patch adds: a warning-severity alert, MDSCPUUsageHigh. It fires
  when, for a pod whose name matches rook-ceph-mds.*, the ratio
  pod:container_cpu_usage:sum / kube_pod_resource_limit{resource='cpu'}
  stays above 0.67 for 6h. The jsonnet mixin is the source; the rule in
  metrics/deploy/prometheus-ocs-rules.yaml mirrors it and must stay in sync.
* Join labels: the division matches on (namespace, pod) rather than (pod)
  alone, so same-named pods in different namespaces are never paired and the
  firing alert keeps its namespace label. This assumes both metrics expose a
  'namespace' label -- confirm on the target cluster.
* Config naming: the 'for' duration is read from mdsCPUUsageHighAlertTime,
  following the camelCase *AlertTime convention of the neighbouring keys
  (e.g. mdsCacheUsageAlertTime).
* Leading whitespace of the diff lines was reconstructed; verify it against
  the target files before applying. The 'index' lines still carry the blob
  ids of the patch as originally submitted.

 metrics/deploy/prometheus-ocs-rules.yaml | 13 +++++++++++++
 metrics/mixin/alerts/perf.libsonnet      | 16 ++++++++++++++++
 metrics/mixin/config.libsonnet           |  2 ++
 3 files changed, 31 insertions(+)

diff --git a/metrics/deploy/prometheus-ocs-rules.yaml b/metrics/deploy/prometheus-ocs-rules.yaml
index 0aae30b74c..bb7d504c01 100644
--- a/metrics/deploy/prometheus-ocs-rules.yaml
+++ b/metrics/deploy/prometheus-ocs-rules.yaml
@@ -277,3 +277,16 @@ spec:
       for: 5m
       labels:
         severity: critical
+    - alert: MDSCPUUsageHigh
+      annotations:
+        description: |-
+          Ceph metadata server pod ({{ $labels.pod }}) has high cpu usage.
+          Please consider increasing the number of active metadata servers,
+          it can be done by increasing the number of activeMetadataServers parameter in the StorageCluster CR.
+        message: Ceph metadata server pod ({{ $labels.pod }}) has high cpu usage
+        severity_level: warning
+      expr: |
+        pod:container_cpu_usage:sum{pod=~"rook-ceph-mds.*"}/ on(namespace, pod) kube_pod_resource_limit{resource='cpu',pod=~"rook-ceph-mds.*"} > 0.67
+      for: 6h
+      labels:
+        severity: warning
diff --git a/metrics/mixin/alerts/perf.libsonnet b/metrics/mixin/alerts/perf.libsonnet
index add41743e7..0f24c3b3d1 100644
--- a/metrics/mixin/alerts/perf.libsonnet
+++ b/metrics/mixin/alerts/perf.libsonnet
@@ -19,6 +19,22 @@
               severity_level: 'error',
             },
           },
+
+          {
+            alert: 'MDSCPUUsageHigh',
+            expr: |||
+              pod:container_cpu_usage:sum{%(mdsSelector)s}/ on(namespace, pod) kube_pod_resource_limit{resource='cpu',%(mdsSelector)s} > 0.67
+            ||| % $._config,
+            'for': $._config.mdsCPUUsageHighAlertTime,
+            labels: {
+              severity: 'warning',
+            },
+            annotations: {
+              message: 'Ceph metadata server pod ({{ $labels.pod }}) has high cpu usage',
+              description: 'Ceph metadata server pod ({{ $labels.pod }}) has high cpu usage.\nPlease consider increasing the number of active metadata servers,\nit can be done by increasing the number of activeMetadataServers parameter in the StorageCluster CR.',
+              severity_level: 'warning',
+            },
+          },
         ],
       },
     ],
diff --git a/metrics/mixin/config.libsonnet b/metrics/mixin/config.libsonnet
index 1b982fcd92..bfb90ea317 100644
--- a/metrics/mixin/config.libsonnet
+++ b/metrics/mixin/config.libsonnet
@@ -14,6 +14,8 @@
     blockedRBDClientAlertTime: '10s',
     ocsStorageClusterKMSConnectionAlert: '5s',
     mdsCacheUsageAlertTime: '5m',
+    mdsSelector: 'pod=~"rook-ceph-mds.*"',
+    mdsCPUUsageHighAlertTime: '6h',
 
     // Constants
     objectStorageType: 'RGW',