From 42ce94e5bfbd6cbd86773c1e87309ff2166a5e22 Mon Sep 17 00:00:00 2001 From: Divyansh Kamboj Date: Thu, 31 Aug 2023 18:21:41 +0530 Subject: [PATCH] Add deployment check for rgw gateway pods This commit adds a check on the number of ready replicas of the rook-ceph-rgw-* deployments. This change is needed because ODF/Rook used to run a routine that regularly created a bucket and then wrote to and read from that bucket to test the RGW health; that status check has now been removed. We now need to reflect both the "Readiness" and the "Connected" nature of the CephObjectStore status. Signed-off-by: Divyansh Kamboj --- metrics/deploy/prometheus-ocs-rules.yaml | 6 ++++-- metrics/mixin/alerts/services.libsonnet | 6 ++++-- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/metrics/deploy/prometheus-ocs-rules.yaml b/metrics/deploy/prometheus-ocs-rules.yaml index ca254b769a..cae98266b7 100644 --- a/metrics/deploy/prometheus-ocs-rules.yaml +++ b/metrics/deploy/prometheus-ocs-rules.yaml @@ -181,12 +181,14 @@ spec: rules: - alert: ClusterObjectStoreState annotations: - description: RGW endpoint of the Ceph object store is in a failure state for more than 15s. Please check the health of the Ceph cluster. - message: Cluster Object Store is in unhealthy state. Please check Ceph cluster health. + description: RGW endpoint of the Ceph object store is in a failure state or one or more Rook Ceph RGW deployments have fewer ready replicas than required for more than 15s. Please check the health of the Ceph cluster and the deployments. + message: Cluster Object Store is in unhealthy state or number of ready replicas for Rook Ceph RGW deployments is less than the desired replicas. 
severity_level: error storage_type: RGW expr: | ocs_rgw_health_status{job="ocs-metrics-exporter"} == 2 + or + kube_deployment_status_replicas_ready{deployment=~"rook-ceph-rgw-.*"} < kube_deployment_spec_replicas{deployment=~"rook-ceph-rgw-.*"} for: 15s labels: severity: critical diff --git a/metrics/mixin/alerts/services.libsonnet b/metrics/mixin/alerts/services.libsonnet index 02800ad15a..a012033a71 100644 --- a/metrics/mixin/alerts/services.libsonnet +++ b/metrics/mixin/alerts/services.libsonnet @@ -8,14 +8,16 @@ alert: 'ClusterObjectStoreState', expr: ||| ocs_rgw_health_status{%(ocsExporterSelector)s} == 2 + or + kube_deployment_status_replicas_ready{deployment=~"rook-ceph-rgw-.*"} < kube_deployment_spec_replicas{deployment=~"rook-ceph-rgw-.*"} ||| % $._config, 'for': $._config.clusterObjectStoreStateAlertTime, labels: { severity: 'critical', }, annotations: { - message: 'Cluster Object Store is in unhealthy state. Please check Ceph cluster health.', - description: 'RGW endpoint of the Ceph object store is in a failure state for more than %s. Please check the health of the Ceph cluster.' % $._config.clusterObjectStoreStateAlertTime, + message: 'Cluster Object Store is in unhealthy state or number of ready replicas for Rook Ceph RGW deployments is less than the desired replicas.', + description: 'RGW endpoint of the Ceph object store is in a failure state or one or more Rook Ceph RGW deployments have fewer ready replicas than required for more than %s. Please check the health of the Ceph cluster and the deployments.' % $._config.clusterObjectStoreStateAlertTime, storage_type: $._config.objectStorageType, severity_level: 'error', },