From 43d1b0778b2a881c036c1498e5b2a9e78be187aa Mon Sep 17 00:00:00 2001
From: Manish
Date: Wed, 6 Dec 2023 19:08:59 +0530
Subject: [PATCH] Alert user if ceph metadata server is consuming cpu at
 threshold point.

Signed-off-by: Manish
---
Notes (ignored by git am, not part of the commit message):
  * alerts.libsonnet imports 'perf.libsonnet', the file created by this
    patch; no 'ceph-overload.libsonnet' exists in this change.
  * The MDS-high-cpu rule fires (severity: warning) when, for pods matching
    mdsSelector, pod:container_cpu_usage:sum divided by
    kube_pod_resource_limit{resource='cpu'} stays above 0.67 for
    mds_cpu_usage_high_threshold_duration (set to '6h' in config.libsonnet).

 metrics/mixin/alerts/alerts.libsonnet |  3 ++-
 metrics/mixin/alerts/perf.libsonnet   | 26 ++++++++++++++++++++++++++
 metrics/mixin/config.libsonnet        |  2 ++
 3 files changed, 30 insertions(+), 1 deletion(-)
 create mode 100644 metrics/mixin/alerts/perf.libsonnet

diff --git a/metrics/mixin/alerts/alerts.libsonnet b/metrics/mixin/alerts/alerts.libsonnet
index 5fe4084d96..69a8527598 100644
--- a/metrics/mixin/alerts/alerts.libsonnet
+++ b/metrics/mixin/alerts/alerts.libsonnet
@@ -3,4 +3,5 @@
 (import 'services.libsonnet') +
 (import 'blocklist.libsonnet') +
 (import 'encryption.libsonnet') +
-(import 'storage-client.libsonnet')
+(import 'storage-client.libsonnet') +
+(import 'perf.libsonnet')
diff --git a/metrics/mixin/alerts/perf.libsonnet b/metrics/mixin/alerts/perf.libsonnet
new file mode 100644
index 0000000000..5014de2a0f
--- /dev/null
+++ b/metrics/mixin/alerts/perf.libsonnet
@@ -0,0 +1,26 @@
+{
+  prometheusAlerts+:: {
+    groups+: [
+      {
+        name: 'ODF-ceph-mds-high-cpu-warnings.rules',
+        rules: [
+          {
+            alert: 'MDS-high-cpu',
+            expr: |||
+              pod:container_cpu_usage:sum{%(mdsSelector)s}/ on(pod) kube_pod_resource_limit{resource='cpu',%(mdsSelector)s} > 0.67
+            ||| % $._config,
+            'for': $._config.mds_cpu_usage_high_threshold_duration,
+            labels: {
+              severity: 'warning',
+            },
+            annotations: {
+              message: 'Ceph metadata server pod ({{ $labels.pod }}) has high cpu usage',
+              description: 'Ceph metadata server pod ({{ $labels.pod }}) has high cpu usage.\nPlease consider increasing the number of active metadata servers,\nit can be done by increasing the number of activeMetadataServers parameter in the StorageCluster CR.',
+              severity_level: 'warning',
+            },
+          },
+        ],
+      },
+    ],
+  },
+}
diff --git a/metrics/mixin/config.libsonnet b/metrics/mixin/config.libsonnet
index 46a4c368e9..340b853179 100644
--- a/metrics/mixin/config.libsonnet
+++ b/metrics/mixin/config.libsonnet
@@ -13,6 +13,8 @@
     odfPoolMirroringImageHealthCriticalAlertTime: '10s',
     blockedRBDClientAlertTime: '10s',
     ocsStorageClusterKMSConnectionAlert: '5s',
+    mdsSelector: 'pod=~"rook-ceph-mds.*"',
+    mds_cpu_usage_high_threshold_duration: '6h',
 
     // Constants
     objectStorageType: 'RGW',