Skip to content

Commit

Permalink
Alert the user when a Ceph metadata server's CPU usage stays above the threshold.
Browse files Browse the repository at this point in the history
Signed-off-by: Manish <[email protected]>
  • Loading branch information
manishym committed Dec 11, 2023
1 parent f2da20a commit a2f386d
Show file tree
Hide file tree
Showing 3 changed files with 31 additions and 0 deletions.
13 changes: 13 additions & 0 deletions metrics/deploy/prometheus-ocs-rules.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -277,3 +277,16 @@ spec:
for: 5m
labels:
severity: critical
# MDSCPUUsageHigh: warning raised when, for a pod whose name matches
# "rook-ceph-mds.*", the ratio of pod:container_cpu_usage:sum to the pod's
# CPU kube_pod_resource_limit stays above 0.67 for 6 hours.
- alert: MDSCPUUsageHigh
  annotations:
    # Multi-line guidance; "|-" keeps the line breaks and drops the final newline.
    description: |-
      Ceph metadata server pod ({{ $labels.pod }}) has high cpu usage.
      Please consider increasing the number of active metadata servers,
      it can be done by increasing the number of activeMetadataServers parameter in the StorageCluster CR.
    message: 'Ceph metadata server pod ({{ $labels.pod }}) has high cpu usage'
    severity_level: warning
  # Both sides of the division are restricted to MDS pods and joined on the pod label.
  expr: |
    pod:container_cpu_usage:sum{pod=~"rook-ceph-mds.*"}/ on(pod) kube_pod_resource_limit{resource='cpu',pod=~"rook-ceph-mds.*"} > 0.67
  for: 6h
  labels:
    severity: warning
16 changes: 16 additions & 0 deletions metrics/mixin/alerts/perf.libsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,22 @@
severity_level: 'error',
},
},

{
  // Warning alert on sustained high CPU usage of the pods matched by
  // $._config.mdsSelector: fires when usage divided by the pod's CPU
  // resource limit stays above 0.67 for the configured duration.
  alert: 'MDSCPUUsageHigh',
  // %(mdsSelector)s is filled in from $._config on both sides of the division.
  expr: |||
    pod:container_cpu_usage:sum{%(mdsSelector)s}/ on(pod) kube_pod_resource_limit{resource='cpu',%(mdsSelector)s} > 0.67
  ||| % $._config,
  // 'for' is a Jsonnet keyword, so the field name has to be quoted.
  'for': $._config.mds_cpu_usage_high_threshold_duration,
  labels: { severity: 'warning' },
  annotations: {
    message: 'Ceph metadata server pod ({{ $labels.pod }}) has high cpu usage',
    // Split across literals for readability; the concatenated value is unchanged.
    description:
      'Ceph metadata server pod ({{ $labels.pod }}) has high cpu usage.\n' +
      'Please consider increasing the number of active metadata servers,\n' +
      'it can be done by increasing the number of activeMetadataServers parameter in the StorageCluster CR.',
    severity_level: 'warning',
  },
},
],
},
],
Expand Down
2 changes: 2 additions & 0 deletions metrics/mixin/config.libsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,8 @@
blockedRBDClientAlertTime: '10s',
ocsStorageClusterKMSConnectionAlert: '5s',
mdsCacheUsageAlertTime: '5m',
// PromQL label matcher for pods whose name matches "rook-ceph-mds.*";
// the MDSCPUUsageHigh alert expression interpolates it via %(mdsSelector)s.
mdsSelector: 'pod=~"rook-ceph-mds.*"',
// Value used as the 'for' duration of the MDSCPUUsageHigh alert.
// NOTE(review): this key is snake_case while the neighbouring keys are
// camelCase — consider aligning it in a coordinated rename with its user.
mds_cpu_usage_high_threshold_duration: '6h',

// Constants
objectStorageType: 'RGW',
Expand Down

0 comments on commit a2f386d

Please sign in to comment.