red-hat-storage · openshift-cherrypick-robot · Aug 31, 2023
diff --git a/metrics/deploy/prometheus-ocs-rules.yaml b/metrics/deploy/prometheus-ocs-rules.yaml
@@ -181,12 +181,14 @@ spec:
     rules:
     - alert: ClusterObjectStoreState
       annotations:
-        description: RGW endpoint of the Ceph object store is in a failure state for more than 15s. Please check the health of the Ceph cluster.
-        message: Cluster Object Store is in unhealthy state. Please check Ceph cluster health.
+        description: RGW endpoint of the Ceph object store is in a failure state or one or more Rook Ceph RGW deployments have fewer ready replicas than required for more than 15s. Please check the health of the Ceph cluster and the deployments.  # "15s" is literal text here; keep it in step with the "for: 15s" field of this rule
+        message: Cluster Object Store is in unhealthy state or number of ready replicas for Rook Ceph RGW deployments is less than the desired replicas.  # names both branches of the "or" in expr: RGW health status == 2, or ready replicas < spec replicas
         severity_level: error
         storage_type: RGW
       expr: |
         ocs_rgw_health_status{job="ocs-metrics-exporter"} == 2
+        or
+        kube_deployment_status_replicas_ready{deployment=~"rook-ceph-rgw-.*"} < kube_deployment_spec_replicas{deployment=~"rook-ceph-rgw-.*"}
       for: 15s
       labels:
         severity: critical

diff --git a/metrics/mixin/alerts/services.libsonnet b/metrics/mixin/alerts/services.libsonnet
@@ -8,14 +8,16 @@
             alert: 'ClusterObjectStoreState',
             expr: |||
               ocs_rgw_health_status{%(ocsExporterSelector)s} == 2
+              or
+              kube_deployment_status_replicas_ready{deployment=~"rook-ceph-rgw-.*"} < kube_deployment_spec_replicas{deployment=~"rook-ceph-rgw-.*"}
             ||| % $._config,
             'for': $._config.clusterObjectStoreStateAlertTime,
             labels: {
               severity: 'critical',
             },
             annotations: {
-              message: 'Cluster Object Store is in unhealthy state. Please check Ceph cluster health.',
-              description: 'RGW endpoint of the Ceph object store is in a failure state for more than %s. Please check the health of the Ceph cluster.' % $._config.clusterObjectStoreStateAlertTime,
+              message: 'Cluster Object Store is in unhealthy state or number of ready replicas for Rook Ceph RGW deployments is less than the desired replicas.',
+              description: 'RGW endpoint of the Ceph object store is in a failure state or one or more Rook Ceph RGW deployments have fewer ready replicas than required for more than %s. Please check the health of the Ceph cluster and the deployments.' % $._config.clusterObjectStoreStateAlertTime,
               storage_type: $._config.objectStorageType,
               severity_level: 'error',
             },