From be3612be260e39f5f2e34de7418556968de5d87b Mon Sep 17 00:00:00 2001
From: sp98
Date: Tue, 5 Dec 2023 16:01:44 +0530
Subject: [PATCH] add alert for mds memory and cache over use

Add the MDSCacheUsageHigh rule in a new mds-performance-alerts.rules
group. The rule fires when ceph_mds_mem_rss stays above 95% of half the
memory request of the matching "mds" container for
mdsCacheUsageAlertTime (5m).

The ceph_daemon label used for the join is derived from the pod name
with label_replace. The pattern drops the last two dash-separated
segments of the pod name so that the remainder lines up with the
"mds.<name>" daemon label; a greedy two-group pattern would keep the
second-to-last segment and the join would never match.

Signed-off-by: sp98
---
 metrics/deploy/prometheus-ocs-rules.yaml | 12 +++++++++++
 metrics/mixin/alerts/alerts.libsonnet    |  3 ++-
 metrics/mixin/alerts/perf.libsonnet      | 30 ++++++++++++++++++++++++++++
 metrics/mixin/config.libsonnet           |  1 +
 4 files changed, 45 insertions(+), 1 deletion(-)
 create mode 100644 metrics/mixin/alerts/perf.libsonnet

diff --git a/metrics/deploy/prometheus-ocs-rules.yaml b/metrics/deploy/prometheus-ocs-rules.yaml
index f7234be2c0..0aae30b74c 100644
--- a/metrics/deploy/prometheus-ocs-rules.yaml
+++ b/metrics/deploy/prometheus-ocs-rules.yaml
@@ -265,3 +265,15 @@ spec:
         floor((ocs_storage_provider_operator_version>0)/1000) - ignoring(storage_consumer_name) group_right() floor((ocs_storage_client_operator_version>0)/1000) > 1 or floor((ocs_storage_client_operator_version>0)/1000) - ignoring(storage_consumer_name) group_left() floor((ocs_storage_provider_operator_version>0)/1000) >= 1
       labels:
         severity: critical
+  - name: mds-performance-alerts.rules
+    rules:
+    - alert: MDSCacheUsageHigh
+      annotations:
+        description: MDS cache usage for the daemon {{ $labels.ceph_daemon }} has exceeded above 95% of the requested value. Increase the memory request for {{ $labels.ceph_daemon }} pod.
+        message: High MDS cache usage for the daemon {{ $labels.ceph_daemon }}.
+        severity_level: error
+      expr: |
+        ceph_mds_mem_rss / on(ceph_daemon) group_left(job)(label_replace(kube_pod_container_resource_requests{container="mds", resource="memory"}, "ceph_daemon", "mds.$1", "pod", "rook-ceph-mds-(.*)-[^-]+-[^-]+") * .5) > .95
+      for: 5m
+      labels:
+        severity: critical
diff --git a/metrics/mixin/alerts/alerts.libsonnet b/metrics/mixin/alerts/alerts.libsonnet
index 5fe4084d96..3a263cebea 100644
--- a/metrics/mixin/alerts/alerts.libsonnet
+++ b/metrics/mixin/alerts/alerts.libsonnet
@@ -3,4 +3,5 @@
 (import 'services.libsonnet') +
 (import 'blocklist.libsonnet') +
 (import 'encryption.libsonnet') +
-(import 'storage-client.libsonnet')
+(import 'storage-client.libsonnet') +
+(import 'perf.libsonnet')
diff --git a/metrics/mixin/alerts/perf.libsonnet b/metrics/mixin/alerts/perf.libsonnet
new file mode 100644
index 0000000000..add41743e7
--- /dev/null
+++ b/metrics/mixin/alerts/perf.libsonnet
@@ -0,0 +1,30 @@
+{
+  prometheusAlerts+:: {
+    groups+: [
+      {
+        name: 'mds-performance-alerts.rules',
+        rules: [
+          {
+            alert: 'MDSCacheUsageHigh',
+            // Ratio of MDS RSS to half of the mds container's memory request.
+            // label_replace builds ceph_daemon ("mds.<name>") from the pod name and
+            // drops its last two dash-separated segments. Assumes pods are named
+            // rook-ceph-mds-<name>-<replicaset-hash>-<pod-suffix> (verify on cluster).
+            expr: |||
+              ceph_mds_mem_rss / on(ceph_daemon) group_left(job)(label_replace(kube_pod_container_resource_requests{container="mds", resource="memory"}, "ceph_daemon", "mds.$1", "pod", "rook-ceph-mds-(.*)-[^-]+-[^-]+") * .5) > .95
+            ||| % $._config,
+            'for': $._config.mdsCacheUsageAlertTime,
+            labels: {
+              severity: 'critical',
+            },
+            annotations: {
+              message: 'High MDS cache usage for the daemon {{ $labels.ceph_daemon }}.',
+              description: 'MDS cache usage for the daemon {{ $labels.ceph_daemon }} has exceeded above 95% of the requested value. Increase the memory request for {{ $labels.ceph_daemon }} pod.',
+              severity_level: 'error',
+            },
+          },
+        ],
+      },
+    ],
+  },
+}
diff --git a/metrics/mixin/config.libsonnet b/metrics/mixin/config.libsonnet
index 46a4c368e9..1b982fcd92 100644
--- a/metrics/mixin/config.libsonnet
+++ b/metrics/mixin/config.libsonnet
@@ -13,6 +13,7 @@
     odfPoolMirroringImageHealthCriticalAlertTime: '10s',
     blockedRBDClientAlertTime: '10s',
     ocsStorageClusterKMSConnectionAlert: '5s',
+    mdsCacheUsageAlertTime: '5m',
 
     // Constants
     objectStorageType: 'RGW',