# Patch summary (explanatory preamble placed ahead of the first `diff --git` header):
#
# Files touched:
#   - metrics/deploy/prometheus-ocs-rules.yaml   (Prometheus rule definition)
#   - metrics/mixin/alerts/services.libsonnet    (jsonnet definition of the same alert)
#   presumably the YAML is the rendered form of the mixin -- TODO confirm.
#
# Change: the `ClusterObjectStoreState` alert expression gains a second arm joined
# with `or`. Besides `ocs_rgw_health_status == 2`, the expression now also returns
# series where kube_deployment_status_replicas_ready is lower than
# kube_deployment_spec_replicas for deployments whose name matches
# `rook-ceph-rgw-.*`. The `message` and `description` annotations are reworded in
# both files to mention the replica condition.
#
# Unchanged by this patch: the `for` duration (15s in the YAML,
# $._config.clusterObjectStoreStateAlertTime in the mixin), `severity: critical`,
# `severity_level` and `storage_type`.
#
# NOTE(review): the new kube_deployment_* selectors carry only a `deployment`
# matcher -- no namespace or job matcher -- so matching deployments in any
# namespace would be included. Confirm this is intended.
# NOTE(review): with the existing short `for` window, a deployment that is briefly
# below its desired replica count (e.g. during a rollout) may raise this
# critical alert -- confirm the window is still appropriate for the new arm.
# NOTE(review): the line structure of this patch appears collapsed, and the removed
# mixin `description` line is split by a line break after "%s. ". The hunk header
# `-8,14` only adds up if that removed line is a single line, so the break looks
# like an artifact -- verify against the original patch before applying.
# The patch text below is left exactly as found because whitespace is significant
# in unified diffs.
diff --git a/metrics/deploy/prometheus-ocs-rules.yaml b/metrics/deploy/prometheus-ocs-rules.yaml index ca254b769a..cae98266b7 100644 --- a/metrics/deploy/prometheus-ocs-rules.yaml +++ b/metrics/deploy/prometheus-ocs-rules.yaml @@ -181,12 +181,14 @@ spec: rules: - alert: ClusterObjectStoreState annotations: - description: RGW endpoint of the Ceph object store is in a failure state for more than 15s. Please check the health of the Ceph cluster. - message: Cluster Object Store is in unhealthy state. Please check Ceph cluster health. + description: RGW endpoint of the Ceph object store is in a failure state or one or more Rook Ceph RGW deployments have fewer ready replicas than required for more than 15s. Please check the health of the Ceph cluster and the deployments. + message: Cluster Object Store is in unhealthy state or number of ready replicas for Rook Ceph RGW deployments is less than the desired replicas. severity_level: error storage_type: RGW expr: | ocs_rgw_health_status{job="ocs-metrics-exporter"} == 2 + or + kube_deployment_status_replicas_ready{deployment=~"rook-ceph-rgw-.*"} < kube_deployment_spec_replicas{deployment=~"rook-ceph-rgw-.*"} for: 15s labels: severity: critical diff --git a/metrics/mixin/alerts/services.libsonnet b/metrics/mixin/alerts/services.libsonnet index 02800ad15a..a012033a71 100644 --- a/metrics/mixin/alerts/services.libsonnet +++ b/metrics/mixin/alerts/services.libsonnet @@ -8,14 +8,16 @@ alert: 'ClusterObjectStoreState', expr: ||| ocs_rgw_health_status{%(ocsExporterSelector)s} == 2 + or + kube_deployment_status_replicas_ready{deployment=~"rook-ceph-rgw-.*"} < kube_deployment_spec_replicas{deployment=~"rook-ceph-rgw-.*"} ||| % $._config, 'for': $._config.clusterObjectStoreStateAlertTime, labels: { severity: 'critical', }, annotations: { - message: 'Cluster Object Store is in unhealthy state. Please check Ceph cluster health.', - description: 'RGW endpoint of the Ceph object store is in a failure state for more than %s. 
Please check the health of the Ceph cluster.' % $._config.clusterObjectStoreStateAlertTime, + message: 'Cluster Object Store is in unhealthy state or number of ready replicas for Rook Ceph RGW deployments is less than the desired replicas.', + description: 'RGW endpoint of the Ceph object store is in a failure state or one or more Rook Ceph RGW deployments have fewer ready replicas than required for more than %s. Please check the health of the Ceph cluster and the deployments.' % $._config.clusterObjectStoreStateAlertTime, storage_type: $._config.objectStorageType, severity_level: 'error', },