Skip to content

Commit

Permalink
Merge pull request #2309 from sp98/mds-alerts
Browse files Browse the repository at this point in the history
add alert for mds cache over use
  • Loading branch information
openshift-merge-bot[bot] authored Dec 11, 2023
2 parents 19293d0 + be3612b commit e452756
Show file tree
Hide file tree
Showing 4 changed files with 41 additions and 1 deletion.
12 changes: 12 additions & 0 deletions metrics/deploy/prometheus-ocs-rules.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -265,3 +265,15 @@ spec:
floor((ocs_storage_provider_operator_version>0)/1000) - ignoring(storage_consumer_name) group_right() floor((ocs_storage_client_operator_version>0)/1000) > 1 or floor((ocs_storage_client_operator_version>0)/1000) - ignoring(storage_consumer_name) group_left() floor((ocs_storage_provider_operator_version>0)/1000) >= 1
labels:
severity: critical
# Rule group with performance alerts for the Ceph MDS daemons.
- name: mds-performance-alerts.rules
  rules:
  - alert: MDSCacheUsageHigh
    annotations:
      description: MDS cache usage for the daemon {{ $labels.ceph_daemon }} has exceeded 95% of the requested value. Increase the memory request for {{ $labels.ceph_daemon }} pod.
      message: High MDS cache usage for the daemon {{ $labels.ceph_daemon }}.
      severity_level: error
    # Fires when the MDS resident memory exceeds 95% of half of the memory
    # request of the "mds" container (the `* .5` factor below).
    #
    # label_replace() synthesizes a `ceph_daemon` label from the pod name so the
    # kube-state-metrics series can be joined with ceph_mds_mem_rss. The regex is
    # fully anchored and `(.*)` is greedy, so the two trailing groups are needed
    # to strip BOTH the ReplicaSet hash and the per-pod suffix that a Deployment
    # appends to its pods; with a single trailing group the hash stays in $1 and
    # the join never matches.
    #
    # NOTE(review): confirm that ceph_mds_mem_rss and
    # kube_pod_container_resource_requests{resource="memory"} are reported in the
    # same unit; the ratio is only meaningful if they are.
    expr: |
      ceph_mds_mem_rss / on(ceph_daemon) group_left(job)(label_replace(kube_pod_container_resource_requests{container="mds", resource="memory"}, "ceph_daemon", "mds.$1", "pod", "rook-ceph-mds-(.*)-(.*)-(.*)") * .5) > .95
    for: 5m
    labels:
      severity: critical
3 changes: 2 additions & 1 deletion metrics/mixin/alerts/alerts.libsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -3,4 +3,5 @@
(import 'services.libsonnet') +
(import 'blocklist.libsonnet') +
(import 'encryption.libsonnet') +
(import 'storage-client.libsonnet')
(import 'storage-client.libsonnet') +
(import 'perf.libsonnet')
26 changes: 26 additions & 0 deletions metrics/mixin/alerts/perf.libsonnet
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
{
  // Appends the MDS performance alert group to the mixin's Prometheus alerts.
  prometheusAlerts+:: {
    groups+: [
      {
        name: 'mds-performance-alerts.rules',
        rules: [
          {
            alert: 'MDSCacheUsageHigh',
            // Fires when the MDS resident memory exceeds 95% of half of the
            // memory request of the "mds" container (the `* .5` factor below).
            //
            // label_replace() synthesizes a `ceph_daemon` label from the pod
            // name so the kube-state-metrics series can be joined with
            // ceph_mds_mem_rss. The regex is fully anchored and `(.*)` is
            // greedy, so the two trailing groups are needed to strip BOTH the
            // ReplicaSet hash and the per-pod suffix that a Deployment appends
            // to its pods; with a single trailing group the hash stays in $1
            // and the join never matches.
            //
            // NOTE(review): confirm that ceph_mds_mem_rss and
            // kube_pod_container_resource_requests{resource="memory"} are
            // reported in the same unit; the ratio is only meaningful if so.
            expr: |||
              ceph_mds_mem_rss / on(ceph_daemon) group_left(job)(label_replace(kube_pod_container_resource_requests{container="mds", resource="memory"}, "ceph_daemon", "mds.$1", "pod", "rook-ceph-mds-(.*)-(.*)-(.*)") * .5) > .95
            ||| % $._config,
            // How long the condition must hold before the alert fires; taken
            // from the shared mixin configuration.
            'for': $._config.mdsCacheUsageAlertTime,
            labels: {
              severity: 'critical',
            },
            annotations: {
              message: 'High MDS cache usage for the daemon {{ $labels.ceph_daemon }}.',
              description: 'MDS cache usage for the daemon {{ $labels.ceph_daemon }} has exceeded 95% of the requested value. Increase the memory request for {{ $labels.ceph_daemon }} pod.',
              severity_level: 'error',
            },
          },
        ],
      },
    ],
  },
}
1 change: 1 addition & 0 deletions metrics/mixin/config.libsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
odfPoolMirroringImageHealthCriticalAlertTime: '10s',
blockedRBDClientAlertTime: '10s',
ocsStorageClusterKMSConnectionAlert: '5s',
mdsCacheUsageAlertTime: '5m',

// Constants
objectStorageType: 'RGW',
Expand Down

0 comments on commit e452756

Please sign in to comment.