diff --git a/metrics/deploy/prometheus-ocs-rules.yaml b/metrics/deploy/prometheus-ocs-rules.yaml
index 0aae30b74c..bb7d504c01 100644
--- a/metrics/deploy/prometheus-ocs-rules.yaml
+++ b/metrics/deploy/prometheus-ocs-rules.yaml
@@ -277,3 +277,16 @@ spec:
       for: 5m
       labels:
         severity: critical
+    - alert: MDSCPUUsageHigh
+      annotations:
+        description: |-
+          Ceph metadata server pod ({{ $labels.pod }}) has high cpu usage.
+          Please consider increasing the number of active metadata servers,
+          it can be done by increasing the number of activeMetadataServers parameter in the StorageCluster CR.
+        message: Ceph metadata server pod ({{ $labels.pod }}) has high cpu usage
+        severity_level: warning
+      expr: |
+        pod:container_cpu_usage:sum{pod=~"rook-ceph-mds.*"}/ on(pod) kube_pod_resource_limit{resource='cpu',pod=~"rook-ceph-mds.*"} > 0.67
+      for: 6h
+      labels:
+        severity: warning
diff --git a/metrics/mixin/alerts/perf.libsonnet b/metrics/mixin/alerts/perf.libsonnet
index add41743e7..0f24c3b3d1 100644
--- a/metrics/mixin/alerts/perf.libsonnet
+++ b/metrics/mixin/alerts/perf.libsonnet
@@ -19,6 +19,22 @@
               severity_level: 'error',
             },
           },
+
+          {
+            alert: 'MDSCPUUsageHigh',
+            expr: |||
+              pod:container_cpu_usage:sum{%(mdsSelector)s}/ on(pod) kube_pod_resource_limit{resource='cpu',%(mdsSelector)s} > 0.67
+            ||| % $._config,
+            'for': $._config.mds_cpu_usage_high_threshold_duration,
+            labels: {
+              severity: 'warning',
+            },
+            annotations: {
+              message: 'Ceph metadata server pod ({{ $labels.pod }}) has high cpu usage',
+              description: 'Ceph metadata server pod ({{ $labels.pod }}) has high cpu usage.\nPlease consider increasing the number of active metadata servers,\nit can be done by increasing the number of activeMetadataServers parameter in the StorageCluster CR.',
+              severity_level: 'warning',
+            },
+          },
         ],
       },
     ],
diff --git a/metrics/mixin/config.libsonnet b/metrics/mixin/config.libsonnet
index 1b982fcd92..bfb90ea317 100644
--- a/metrics/mixin/config.libsonnet
+++ b/metrics/mixin/config.libsonnet
@@ -14,6 +14,8 @@
     blockedRBDClientAlertTime: '10s',
     ocsStorageClusterKMSConnectionAlert: '5s',
     mdsCacheUsageAlertTime: '5m',
+    mdsSelector: 'pod=~"rook-ceph-mds.*"',
+    mds_cpu_usage_high_threshold_duration: '6h',
 
     // Constants
     objectStorageType: 'RGW',