# Add an OSDCPULoadHigh warning alert to the OCS Prometheus rules.
#
# What this patch does:
#   * Renames the rule group `mds-performance-alerts.rules` to
#     `ceph-daemon-performance-alerts.rules` in the jsonnet mixin and in the
#     deployed manifest; the group now holds both the MDS and the OSD alert.
#   * Adds `OSDCPULoadHigh` (severity `warning`): fires when
#     `pod:container_cpu_usage:sum` for pods matching `rook-ceph-osd-.*`
#     stays above 0.35 for 15m.
#   * Adds the two mixin config keys the alert reads: `osdSelector` and
#     `osdCPULoadHighAlertTime`.
#
# Review notes:
#   * The PromQL line inside the jsonnet `|||` text block has no trailing
#     space. A trailing space before the newline is what forced the deployed
#     manifest to carry the expression as a double-quoted, backslash-escaped
#     scalar; without it the manifest uses a literal block scalar.
#     NOTE(review): regenerate the manifest from the mixin to confirm the
#     exact emitted form.
#   * The new rule is appended after the existing MDS rule, so the closing
#     `},` of the MDS annotations object is left untouched.
#   * NOTE(review): context-line indentation was reconstructed from the usual
#     layout of these files - verify against the tree before applying. The
#     post-image blob ids on the `index` lines of prometheus-ocs-rules.yaml
#     and perf.libsonnet predate the two fixes above and will differ.
diff --git a/metrics/deploy/prometheus-ocs-rules.yaml b/metrics/deploy/prometheus-ocs-rules.yaml
index 0aae30b74c..de6d342a66 100644
--- a/metrics/deploy/prometheus-ocs-rules.yaml
+++ b/metrics/deploy/prometheus-ocs-rules.yaml
@@ -265,7 +265,7 @@ spec:
         floor((ocs_storage_provider_operator_version>0)/1000) - ignoring(storage_consumer_name) group_right() floor((ocs_storage_client_operator_version>0)/1000) > 1 or floor((ocs_storage_client_operator_version>0)/1000) - ignoring(storage_consumer_name) group_left() floor((ocs_storage_provider_operator_version>0)/1000) >= 1
       labels:
         severity: critical
-  - name: mds-performance-alerts.rules
+  - name: ceph-daemon-performance-alerts.rules
     rules:
     - alert: MDSCacheUsageHigh
       annotations:
@@ -277,3 +277,13 @@ spec:
       for: 5m
       labels:
         severity: critical
+    - alert: OSDCPULoadHigh
+      annotations:
+        description: High CPU usage in the OSD container on pod {{ $labels.pod }}. Please create more OSDs to increase performance
+        message: High CPU usage detected in OSD container on pod {{ $labels.pod}}.
+        severity_level: warning
+      expr: |
+        pod:container_cpu_usage:sum{pod=~"rook-ceph-osd-.*"} > 0.35
+      for: 15m
+      labels:
+        severity: warning
diff --git a/metrics/mixin/alerts/perf.libsonnet b/metrics/mixin/alerts/perf.libsonnet
index add41743e7..3f1d4de6f3 100644
--- a/metrics/mixin/alerts/perf.libsonnet
+++ b/metrics/mixin/alerts/perf.libsonnet
@@ -2,7 +2,7 @@
   prometheusAlerts+:: {
     groups+: [
       {
-        name: 'mds-performance-alerts.rules',
+        name: 'ceph-daemon-performance-alerts.rules',
         rules: [
           {
             alert: 'MDSCacheUsageHigh',
@@ -17,6 +17,21 @@
               message: 'High MDS cache usage for the daemon {{ $labels.ceph_daemon }}.',
               description: 'MDS cache usage for the daemon {{ $labels.ceph_daemon }} has exceeded above 95% of the requested value. Increase the memory request for {{ $labels.ceph_daemon }} pod.',
               severity_level: 'error',
             },
           },
+          {
+            alert: 'OSDCPULoadHigh',
+            expr: |||
+              pod:container_cpu_usage:sum{%(osdSelector)s} > 0.35
+            ||| % $._config,
+            'for': $._config.osdCPULoadHighAlertTime,
+            labels: {
+              severity: 'warning',
+            },
+            annotations: {
+              message: 'High CPU usage detected in OSD container on pod {{ $labels.pod}}.',
+              description: 'High CPU usage in the OSD container on pod {{ $labels.pod }}. Please create more OSDs to increase performance',
+              severity_level: 'warning',
+            },
+          },
         ],
diff --git a/metrics/mixin/config.libsonnet b/metrics/mixin/config.libsonnet
index 1b982fcd92..bcfafcb9be 100644
--- a/metrics/mixin/config.libsonnet
+++ b/metrics/mixin/config.libsonnet
@@ -2,6 +2,7 @@
   _config+:: {
     // Selectors are inserted between {} in Prometheus queries.
     ocsExporterSelector: 'job="ocs-metrics-exporter"',
+    osdSelector: 'pod=~"rook-ceph-osd-.*"',
 
     // Duration to raise various Alerts
     clusterObjectStoreStateAlertTime: '15s',
@@ -14,6 +15,7 @@
     blockedRBDClientAlertTime: '10s',
     ocsStorageClusterKMSConnectionAlert: '5s',
     mdsCacheUsageAlertTime: '5m',
+    osdCPULoadHighAlertTime: '15m',
 
     // Constants
     objectStorageType: 'RGW',