From 08a5f896f93e0bb4fbed78de27240ba6ffee5523 Mon Sep 17 00:00:00 2001 From: Arun Kumar Mohan Date: Mon, 11 Dec 2023 06:58:25 +0530 Subject: [PATCH] Add namespace and cluster name to alert messages Signed-off-by: Arun Kumar Mohan --- .../deploy/prometheus-ocs-rules-external.yaml | 24 +++---- metrics/deploy/prometheus-ocs-rules.yaml | 68 +++++++++---------- metrics/mixin/alerts/blocklist.libsonnet | 4 +- .../alerts/encryption-external.libsonnet | 4 +- metrics/mixin/alerts/encryption.libsonnet | 4 +- metrics/mixin/alerts/mirroring.libsonnet | 24 +++---- metrics/mixin/alerts/obc.libsonnet | 16 ++--- .../mixin/alerts/services-external.libsonnet | 4 +- metrics/mixin/alerts/services.libsonnet | 4 +- metrics/mixin/alerts/storage-client.libsonnet | 16 ++--- 10 files changed, 84 insertions(+), 84 deletions(-) diff --git a/metrics/deploy/prometheus-ocs-rules-external.yaml b/metrics/deploy/prometheus-ocs-rules-external.yaml index 470e280885..aba6892480 100644 --- a/metrics/deploy/prometheus-ocs-rules-external.yaml +++ b/metrics/deploy/prometheus-ocs-rules-external.yaml @@ -67,8 +67,8 @@ spec: rules: - alert: ObcQuotaBytesAlert annotations: - description: ObjectBucketClaim {{ $labels.objectbucketclaim }} has crossed 80% of the size limit set by the quota(bytes) and will become read-only on reaching the quota limit. Increase the quota in the {{ $labels.objectbucketclaim }} OBC custom resource. - message: OBC has crossed 80% of the quota(bytes). + description: ObjectBucketClaim {{ $labels.objectbucketclaim }} has crossed 80% of the size limit set by the quota(bytes) and will become read-only on reaching the quota limit in namespace:cluster {{ $labels.namespace }}:{{ $labels.managedBy }}. Increase the quota in the {{ $labels.objectbucketclaim }} OBC custom resource. + message: OBC has crossed 80% of the quota(bytes) in namespace:cluster {{ $labels.namespace }}:{{ $labels.managedBy }}. 
severity_level: warning storage_type: RGW expr: | @@ -78,8 +78,8 @@ spec: severity: warning - alert: ObcQuotaObjectsAlert annotations: - description: ObjectBucketClaim {{ $labels.objectbucketclaim }} has crossed 80% of the size limit set by the quota(objects) and will become read-only on reaching the quota limit. Increase the quota in the {{ $labels.objectbucketclaim }} OBC custom resource. - message: OBC has crossed 80% of the quota(object). + description: ObjectBucketClaim {{ $labels.objectbucketclaim }} has crossed 80% of the size limit set by the quota(objects) and will become read-only on reaching the quota limit in namespace:cluster {{ $labels.namespace }}:{{ $labels.managedBy }}. Increase the quota in the {{ $labels.objectbucketclaim }} OBC custom resource. + message: OBC has crossed 80% of the quota(object) in namespace:cluster {{ $labels.namespace }}:{{ $labels.managedBy }}. severity_level: warning storage_type: RGW expr: | @@ -89,8 +89,8 @@ spec: severity: warning - alert: ObcQuotaBytesExhausedAlert annotations: - description: ObjectBucketClaim {{ $labels.objectbucketclaim }} has crossed the limit set by the quota(bytes) and will be read-only now. Increase the quota in the {{ $labels.objectbucketclaim }} OBC custom resource immediately. - message: OBC reached quota(bytes) limit. + description: ObjectBucketClaim {{ $labels.objectbucketclaim }} has crossed the limit set by the quota(bytes) and will be read-only now in namespace:cluster {{ $labels.namespace }}:{{ $labels.managedBy }}. Increase the quota in the {{ $labels.objectbucketclaim }} OBC custom resource immediately. + message: OBC reached quota(bytes) limit in namespace:cluster {{ $labels.namespace }}:{{ $labels.managedBy }}. 
severity_level: error storage_type: RGW expr: | @@ -100,8 +100,8 @@ spec: severity: critical - alert: ObcQuotaObjectsExhausedAlert annotations: - description: ObjectBucketClaim {{ $labels.objectbucketclaim }} has crossed the limit set by the quota(objects) and will be read-only now. Increase the quota in the {{ $labels.objectbucketclaim }} OBC custom resource immediately. - message: OBC reached quota(object) limit. + description: ObjectBucketClaim {{ $labels.objectbucketclaim }} has crossed the limit set by the quota(objects) and will be read-only now in namespace:cluster {{ $labels.namespace }}:{{ $labels.managedBy }}. Increase the quota in the {{ $labels.objectbucketclaim }} OBC custom resource immediately. + message: OBC reached quota(object) limit in namespace:cluster {{ $labels.namespace }}:{{ $labels.managedBy }}. severity_level: error storage_type: RGW expr: | @@ -113,8 +113,8 @@ spec: rules: - alert: ClusterObjectStoreState annotations: - description: Cluster Object Store is in unhealthy state for more than 15s. Please check Ceph cluster health or RGW connection. - message: Cluster Object Store is in unhealthy state. Please check Ceph cluster health or RGW connection. + description: Cluster Object Store is in unhealthy state for more than 15s. Please check Ceph cluster health or RGW connection in namespace:cluster {{ $labels.namespace }}:{{ $labels.managedBy }}. + message: Cluster Object Store is in unhealthy state. Please check Ceph cluster health or RGW connection in namespace:cluster {{ $labels.namespace }}:{{ $labels.managedBy }}. severity_level: error storage_type: RGW expr: | @@ -126,8 +126,8 @@ spec: rules: - alert: KMSServerConnectionAlert annotations: - description: Storage Cluster KMS Server is in un-connected state for more than 5s. Please check KMS config. - message: Storage Cluster KMS Server is in un-connected state. Please check KMS config. + description: Storage Cluster KMS Server is in un-connected state for more than 5s. 
Please check KMS config in namespace:cluster {{ $labels.namespace }}:{{ $labels.managedBy }}. + message: Storage Cluster KMS Server is in un-connected state. Please check KMS config in namespace:cluster {{ $labels.namespace }}:{{ $labels.managedBy }}. severity_level: error storage_type: ceph expr: | diff --git a/metrics/deploy/prometheus-ocs-rules.yaml b/metrics/deploy/prometheus-ocs-rules.yaml index 3471044450..7a8f27fc3c 100644 --- a/metrics/deploy/prometheus-ocs-rules.yaml +++ b/metrics/deploy/prometheus-ocs-rules.yaml @@ -67,8 +67,8 @@ spec: rules: - alert: OdfMirrorDaemonStatus annotations: - description: Mirror daemon is in unhealthy status for more than 1m. Mirroring on this cluster is not working as expected. - message: Mirror daemon is unhealthy. + description: Mirror daemon is in unhealthy status for more than 1m. Mirroring is not working as expected in namespace:cluster {{ $labels.namespace }}:{{ $labels.managedBy }}. + message: Mirror daemon is unhealthy in namespace:cluster {{ $labels.namespace }}:{{ $labels.managedBy }}. severity_level: error storage_type: ceph expr: | @@ -78,8 +78,8 @@ spec: severity: critical - alert: OdfPoolMirroringImageHealth annotations: - description: Mirroring image(s) (PV) in the pool {{ $labels.name }} are in Unknown state for more than 1m. Mirroring might not work as expected. - message: Mirroring image(s) (PV) in the pool {{ $labels.name }} are in Unknown state. + description: Mirroring image(s) (PV) in the pool {{ $labels.name }} are in Unknown state for more than 1m. Mirroring might not work as expected in namespace:cluster {{ $labels.namespace }}:{{ $labels.managedBy }}. + message: Mirroring image(s) (PV) in the pool {{ $labels.name }} are in Unknown state in namespace:cluster {{ $labels.namespace }}:{{ $labels.managedBy }}. 
severity_level: warning storage_type: ceph expr: | @@ -90,8 +90,8 @@ spec: severity: warning - alert: OdfPoolMirroringImageHealth annotations: - description: Mirroring image(s) (PV) in the pool {{ $labels.name }} are in Warning state for more than 1m. Mirroring might not work as expected. - message: Mirroring image(s) (PV) in the pool {{ $labels.name }} are in Warning state. + description: Mirroring image(s) (PV) in the pool {{ $labels.name }} are in Warning state for more than 1m. Mirroring might not work as expected in namespace:cluster {{ $labels.namespace }}:{{ $labels.managedBy }}. + message: Mirroring image(s) (PV) in the pool {{ $labels.name }} are in Warning state in namespace:cluster {{ $labels.namespace }}:{{ $labels.managedBy }}. severity_level: warning storage_type: ceph expr: | @@ -102,8 +102,8 @@ spec: severity: warning - alert: OdfPoolMirroringImageHealth annotations: - description: Mirroring image(s) (PV) in the pool {{ $labels.name }} are in Error state for more than 10s. Mirroring is not working as expected. - message: Mirroring image(s) (PV) in the pool {{ $labels.name }} are in Error state. + description: Mirroring image(s) (PV) in the pool {{ $labels.name }} are in Error state for more than 10s. Mirroring is not working as expected in namespace:cluster {{ $labels.namespace }}:{{ $labels.managedBy }}. + message: Mirroring image(s) (PV) in the pool {{ $labels.name }} are in Error state in namespace:cluster {{ $labels.namespace }}:{{ $labels.managedBy }}. severity_level: error storage_type: ceph expr: | @@ -114,8 +114,8 @@ spec: severity: critical - alert: ODFPersistentVolumeMirrorStatus annotations: - description: Persistent volume {{ $labels.name }}/{{ $labels.namespace }} is not mirrored properly to peer site {{ $labels.site_name }} for more than 1m. RBD image={{ $labels.image }} and CephBlockPool={{ $labels.pool_name }}. - message: Persistent volume {{ $labels.name }}/{{ $labels.namespace }} is not mirrored properly to peer site {{ $labels.site_name }}. 
+ description: Persistent volume {{ $labels.name }}/{{ $labels.namespace }} is not mirrored properly to peer site {{ $labels.site_name }} for more than 1m. RBD image={{ $labels.image }} and CephBlockPool={{ $labels.pool_name }}. Please check namespace:cluster {{ $labels.namespace }}:{{ $labels.managedBy }}. + message: Persistent volume {{ $labels.name }}/{{ $labels.namespace }} is not mirrored properly to peer site {{ $labels.site_name }} in namespace:cluster {{ $labels.namespace }}:{{ $labels.managedBy }}. severity_level: error storage_type: ceph expr: | @@ -125,8 +125,8 @@ spec: severity: critical - alert: ODFPersistentVolumeMirrorStatus annotations: - description: Status unknown for Persistent volume {{ $labels.name }}/{{ $labels.namespace }} to peer site {{ $labels.site_name }} for more than 1m. RBD image={{ $labels.image }} and CephBlockPool={{ $labels.pool_name }}. - message: Status unknown for Persistent volume {{ $labels.name }}/{{ $labels.namespace }} mirroring to peer site {{ $labels.site_name }}. + description: Status unknown for Persistent volume {{ $labels.name }}/{{ $labels.namespace }} to peer site {{ $labels.site_name }} for more than 1m. RBD image={{ $labels.image }} and CephBlockPool={{ $labels.pool_name }}. Please check namespace:cluster {{ $labels.namespace }}:{{ $labels.managedBy }}. + message: Status unknown for Persistent volume {{ $labels.name }}/{{ $labels.namespace }} mirroring to peer site {{ $labels.site_name }} in namespace:cluster {{ $labels.namespace }}:{{ $labels.managedBy }}. severity_level: warning storage_type: ceph expr: | @@ -138,8 +138,8 @@ spec: rules: - alert: ObcQuotaBytesAlert annotations: - description: ObjectBucketClaim {{ $labels.objectbucketclaim }} has crossed 80% of the size limit set by the quota(bytes) and will become read-only on reaching the quota limit. Increase the quota in the {{ $labels.objectbucketclaim }} OBC custom resource. - message: OBC has crossed 80% of the quota(bytes). 
+ description: ObjectBucketClaim {{ $labels.objectbucketclaim }} has crossed 80% of the size limit set by the quota(bytes) and will become read-only on reaching the quota limit in namespace:cluster {{ $labels.namespace }}:{{ $labels.managedBy }}. Increase the quota in the {{ $labels.objectbucketclaim }} OBC custom resource. + message: OBC has crossed 80% of the quota(bytes) in namespace:cluster {{ $labels.namespace }}:{{ $labels.managedBy }}. severity_level: warning storage_type: RGW expr: | @@ -149,8 +149,8 @@ spec: severity: warning - alert: ObcQuotaObjectsAlert annotations: - description: ObjectBucketClaim {{ $labels.objectbucketclaim }} has crossed 80% of the size limit set by the quota(objects) and will become read-only on reaching the quota limit. Increase the quota in the {{ $labels.objectbucketclaim }} OBC custom resource. - message: OBC has crossed 80% of the quota(object). + description: ObjectBucketClaim {{ $labels.objectbucketclaim }} has crossed 80% of the size limit set by the quota(objects) and will become read-only on reaching the quota limit in namespace:cluster {{ $labels.namespace }}:{{ $labels.managedBy }}. Increase the quota in the {{ $labels.objectbucketclaim }} OBC custom resource. + message: OBC has crossed 80% of the quota(object) in namespace:cluster {{ $labels.namespace }}:{{ $labels.managedBy }}. severity_level: warning storage_type: RGW expr: | @@ -160,8 +160,8 @@ spec: severity: warning - alert: ObcQuotaBytesExhausedAlert annotations: - description: ObjectBucketClaim {{ $labels.objectbucketclaim }} has crossed the limit set by the quota(bytes) and will be read-only now. Increase the quota in the {{ $labels.objectbucketclaim }} OBC custom resource immediately. - message: OBC reached quota(bytes) limit. + description: ObjectBucketClaim {{ $labels.objectbucketclaim }} has crossed the limit set by the quota(bytes) and will be read-only now in namespace:cluster {{ $labels.namespace }}:{{ $labels.managedBy }}. 
Increase the quota in the {{ $labels.objectbucketclaim }} OBC custom resource immediately. + message: OBC reached quota(bytes) limit in namespace:cluster {{ $labels.namespace }}:{{ $labels.managedBy }}. severity_level: error storage_type: RGW expr: | @@ -171,8 +171,8 @@ spec: severity: critical - alert: ObcQuotaObjectsExhausedAlert annotations: - description: ObjectBucketClaim {{ $labels.objectbucketclaim }} has crossed the limit set by the quota(objects) and will be read-only now. Increase the quota in the {{ $labels.objectbucketclaim }} OBC custom resource immediately. - message: OBC reached quota(object) limit. + description: ObjectBucketClaim {{ $labels.objectbucketclaim }} has crossed the limit set by the quota(objects) and will be read-only now in namespace:cluster {{ $labels.namespace }}:{{ $labels.managedBy }}. Increase the quota in the {{ $labels.objectbucketclaim }} OBC custom resource immediately. + message: OBC reached quota(object) limit in namespace:cluster {{ $labels.namespace }}:{{ $labels.managedBy }}. severity_level: error storage_type: RGW expr: | @@ -184,8 +184,8 @@ spec: rules: - alert: ClusterObjectStoreState annotations: - description: RGW endpoint of the Ceph object store is in a failure state or one or more Rook Ceph RGW deployments have fewer ready replicas than required for more than 15s. Please check the health of the Ceph cluster and the deployments. - message: Cluster Object Store is in unhealthy state or number of ready replicas for Rook Ceph RGW deployments is less than the desired replicas. + description: RGW endpoint of the Ceph object store is in a failure state or one or more Rook Ceph RGW deployments have fewer ready replicas than required for more than 15s. Please check the health of the Ceph cluster and the deployments in namespace:cluster {{ $labels.namespace }}:{{ $labels.managedBy }}. 
+ message: Cluster Object Store is in unhealthy state or number of ready replicas for Rook Ceph RGW deployments is less than the desired replicas in namespace:cluster {{ $labels.namespace }}:{{ $labels.managedBy }}. severity_level: error storage_type: RGW expr: | @@ -199,8 +199,8 @@ spec: rules: - alert: ODFRBDClientBlocked annotations: - description: An RBD client might be blocked by Ceph on node {{ $labels.node_name }}. This alert is triggered when the ocs_rbd_client_blocklisted metric reports a value of 1 for the node and there are pods in a CreateContainerError state on the node. This may cause the filesystem for the PVCs to be in a read-only state. Please check the pod description for more details. - message: An RBD client might be blocked by Ceph on node {{ $labels.node_name }}. + description: An RBD client might be blocked by Ceph on node {{ $labels.node_name }} in namespace:cluster {{ $labels.namespace }}:{{ $labels.managedBy }}. This alert is triggered when the ocs_rbd_client_blocklisted metric reports a value of 1 for the node and there are pods in a CreateContainerError state on the node. This may cause the filesystem for the PVCs to be in a read-only state. Please check the pod description for more details. + message: An RBD client might be blocked by Ceph on node {{ $labels.node_name }} in namespace:cluster {{ $labels.namespace }}:{{ $labels.managedBy }}. severity_level: error expr: | ( @@ -218,8 +218,8 @@ spec: rules: - alert: KMSServerConnectionAlert annotations: - description: Storage Cluster KMS Server is in un-connected state for more than 5s. Please check KMS config. - message: Storage Cluster KMS Server is in un-connected state. Please check KMS config. + description: Storage Cluster KMS Server is in un-connected state for more than 5s. Please check KMS config in namespace:cluster {{ $labels.namespace }}:{{ $labels.managedBy }}. + message: Storage Cluster KMS Server is in un-connected state. 
Please check KMS config in namespace:cluster {{ $labels.namespace }}:{{ $labels.managedBy }}. severity_level: error storage_type: ceph expr: | @@ -231,8 +231,8 @@ spec: rules: - alert: StorageClientHeartbeatMissed annotations: - description: Storage Client ({{ $labels.storage_consumer_name }}) heartbeat missed for more than 120 (s). Lossy network connectivity might exist - message: Storage Client ({{ $labels.storage_consumer_name }}) heartbeat missed for more than 120 (s) + description: Storage Client ({{ $labels.storage_consumer_name }}) heartbeat missed for more than 120 (s). Lossy network connectivity might exist in namespace:cluster {{ $labels.namespace }}:{{ $labels.managedBy }}. + message: Storage Client ({{ $labels.storage_consumer_name }}) heartbeat missed for more than 120 (s) in namespace:cluster {{ $labels.namespace }}:{{ $labels.managedBy }}. severity_level: warning expr: | (time() - 120) > (ocs_storage_client_last_heartbeat > 0) @@ -240,8 +240,8 @@ spec: severity: warning - alert: StorageClientHeartbeatMissed annotations: - description: Storage Client ({{ $labels.storage_consumer_name }}) heartbeat missed for more than 300 (s). Client might have lost internet connectivity - message: Storage Client ({{ $labels.storage_consumer_name }}) heartbeat missed for more than 300 (s) + description: Storage Client ({{ $labels.storage_consumer_name }}) heartbeat missed for more than 300 (s). Client might have lost internet connectivity in namespace:cluster {{ $labels.namespace }}:{{ $labels.managedBy }}. + message: Storage Client ({{ $labels.storage_consumer_name }}) heartbeat missed for more than 300 (s) in namespace:cluster {{ $labels.namespace }}:{{ $labels.managedBy }}. 
severity_level: critical expr: | (time() - 300) > (ocs_storage_client_last_heartbeat > 0) @@ -249,8 +249,8 @@ spec: severity: critical - alert: StorageClientIncompatibleOperatorVersion annotations: - description: Storage Client Operator ({{ $labels.storage_consumer_name }}) lags by 1 minor version. Client configuration may be incompatible - message: Storage Client Operator ({{ $labels.storage_consumer_name }}) lags by 1 minor version + description: Storage Client Operator ({{ $labels.storage_consumer_name }}) lags by 1 minor version. Client configuration may be incompatible in namespace:cluster {{ $labels.namespace }}:{{ $labels.managedBy }}. + message: Storage Client Operator ({{ $labels.storage_consumer_name }}) lags by 1 minor version in namespace:cluster {{ $labels.namespace }}:{{ $labels.managedBy }}. severity_level: warning expr: | floor((ocs_storage_provider_operator_version>0)/1000) - ignoring(storage_consumer_name) group_right() floor((ocs_storage_client_operator_version>0)/1000) == 1 @@ -258,8 +258,8 @@ spec: severity: warning - alert: StorageClientIncompatibleOperatorVersion annotations: - description: Storage Client Operator ({{ $labels.storage_consumer_name }}) differs by more than 1 minor version. Client configuration may be incompatible and unsupported - message: Storage Client Operator ({{ $labels.storage_consumer_name }}) differs by more than 1 minor version + description: Storage Client Operator ({{ $labels.storage_consumer_name }}) differs by more than 1 minor version. Client configuration may be incompatible and unsupported in namespace:cluster {{ $labels.namespace }}:{{ $labels.managedBy }}. + message: Storage Client Operator ({{ $labels.storage_consumer_name }}) differs by more than 1 minor version in namespace:cluster {{ $labels.namespace }}:{{ $labels.managedBy }}. 
severity_level: critical expr: | floor((ocs_storage_provider_operator_version>0)/1000) - ignoring(storage_consumer_name) group_right() floor((ocs_storage_client_operator_version>0)/1000) > 1 or floor((ocs_storage_client_operator_version>0)/1000) - ignoring(storage_consumer_name) group_left() floor((ocs_storage_provider_operator_version>0)/1000) >= 1 diff --git a/metrics/mixin/alerts/blocklist.libsonnet b/metrics/mixin/alerts/blocklist.libsonnet index bf00d9a006..ec435bf66e 100644 --- a/metrics/mixin/alerts/blocklist.libsonnet +++ b/metrics/mixin/alerts/blocklist.libsonnet @@ -21,8 +21,8 @@ severity: 'warning', }, annotations: { - message: 'An RBD client might be blocked by Ceph on node {{ $labels.node_name }}.', - description: 'An RBD client might be blocked by Ceph on node {{ $labels.node_name }}. This alert is triggered when the ocs_rbd_client_blocklisted metric reports a value of 1 for the node and there are pods in a CreateContainerError state on the node. This may cause the filesystem for the PVCs to be in a read-only state. Please check the pod description for more details.', + message: 'An RBD client might be blocked by Ceph on node {{ $labels.node_name }} in namespace:cluster {{ $labels.namespace }}:{{ $labels.managedBy }}.', + description: 'An RBD client might be blocked by Ceph on node {{ $labels.node_name }} in namespace:cluster {{ $labels.namespace }}:{{ $labels.managedBy }}. This alert is triggered when the ocs_rbd_client_blocklisted metric reports a value of 1 for the node and there are pods in a CreateContainerError state on the node. This may cause the filesystem for the PVCs to be in a read-only state. 
Please check the pod description for more details.', severity_level: 'error', }, }, diff --git a/metrics/mixin/alerts/encryption-external.libsonnet b/metrics/mixin/alerts/encryption-external.libsonnet index 775ed735f7..c32742b313 100644 --- a/metrics/mixin/alerts/encryption-external.libsonnet +++ b/metrics/mixin/alerts/encryption-external.libsonnet @@ -14,8 +14,8 @@ severity: 'critical', }, annotations: { - message: 'Storage Cluster KMS Server is in un-connected state. Please check KMS config.', - description: 'Storage Cluster KMS Server is in un-connected state for more than %s. Please check KMS config.' % $._config.ocsStorageClusterKMSConnectionAlert, + message: 'Storage Cluster KMS Server is in un-connected state. Please check KMS config in namespace:cluster {{ $labels.namespace }}:{{ $labels.managedBy }}.', + description: 'Storage Cluster KMS Server is in un-connected state for more than %s. Please check KMS config in namespace:cluster {{ $labels.namespace }}:{{ $labels.managedBy }}.' % $._config.ocsStorageClusterKMSConnectionAlert, storage_type: $._config.cephStorageType, severity_level: 'error', }, diff --git a/metrics/mixin/alerts/encryption.libsonnet b/metrics/mixin/alerts/encryption.libsonnet index 05f1a6f086..7ace0f025f 100644 --- a/metrics/mixin/alerts/encryption.libsonnet +++ b/metrics/mixin/alerts/encryption.libsonnet @@ -14,8 +14,8 @@ severity: 'critical', }, annotations: { - message: 'Storage Cluster KMS Server is in un-connected state. Please check KMS config.', - description: 'Storage Cluster KMS Server is in un-connected state for more than %s. Please check KMS config.' % $._config.ocsStorageClusterKMSConnectionAlert, + message: 'Storage Cluster KMS Server is in un-connected state. Please check KMS config in namespace:cluster {{ $labels.namespace }}:{{ $labels.managedBy }}.', + description: 'Storage Cluster KMS Server is in un-connected state for more than %s. 
Please check KMS config in namespace:cluster {{ $labels.namespace }}:{{ $labels.managedBy }}.' % $._config.ocsStorageClusterKMSConnectionAlert, storage_type: $._config.cephStorageType, severity_level: 'error', }, diff --git a/metrics/mixin/alerts/mirroring.libsonnet b/metrics/mixin/alerts/mirroring.libsonnet index 194c352429..29419cde86 100644 --- a/metrics/mixin/alerts/mirroring.libsonnet +++ b/metrics/mixin/alerts/mirroring.libsonnet @@ -14,8 +14,8 @@ severity: 'critical', }, annotations: { - message: 'Mirror daemon is unhealthy.', - description: 'Mirror daemon is in unhealthy status for more than %s. Mirroring on this cluster is not working as expected.' % $._config.odfMirrorDaemonStatusAlertTime, + message: 'Mirror daemon is unhealthy in namespace:cluster {{ $labels.namespace }}:{{ $labels.managedBy }}.', + description: 'Mirror daemon is in unhealthy status for more than %s. Mirroring is not working as expected in namespace:cluster {{ $labels.namespace }}:{{ $labels.managedBy }}.' % $._config.odfMirrorDaemonStatusAlertTime, storage_type: $._config.cephStorageType, severity_level: 'error', }, @@ -31,8 +31,8 @@ mirroring_image_status: 'unknown', }, annotations: { - message: 'Mirroring image(s) (PV) in the pool {{ $labels.name }} are in Unknown state.', - description: 'Mirroring image(s) (PV) in the pool {{ $labels.name }} are in Unknown state for more than %s. Mirroring might not work as expected.' % $._config.odfPoolMirroringImageHealthWarningAlertTime, + message: 'Mirroring image(s) (PV) in the pool {{ $labels.name }} are in Unknown state in namespace:cluster {{ $labels.namespace }}:{{ $labels.managedBy }}.', + description: 'Mirroring image(s) (PV) in the pool {{ $labels.name }} are in Unknown state for more than %s. Mirroring might not work as expected in namespace:cluster {{ $labels.namespace }}:{{ $labels.managedBy }}.' 
% $._config.odfPoolMirroringImageHealthWarningAlertTime, storage_type: $._config.cephStorageType, severity_level: 'warning', }, @@ -48,8 +48,8 @@ mirroring_image_status: 'warning', }, annotations: { - message: 'Mirroring image(s) (PV) in the pool {{ $labels.name }} are in Warning state.', - description: 'Mirroring image(s) (PV) in the pool {{ $labels.name }} are in Warning state for more than %s. Mirroring might not work as expected.' % $._config.odfPoolMirroringImageHealthWarningAlertTime, + message: 'Mirroring image(s) (PV) in the pool {{ $labels.name }} are in Warning state in namespace:cluster {{ $labels.namespace }}:{{ $labels.managedBy }}.', + description: 'Mirroring image(s) (PV) in the pool {{ $labels.name }} are in Warning state for more than %s. Mirroring might not work as expected in namespace:cluster {{ $labels.namespace }}:{{ $labels.managedBy }}.' % $._config.odfPoolMirroringImageHealthWarningAlertTime, storage_type: $._config.cephStorageType, severity_level: 'warning', }, @@ -65,8 +65,8 @@ mirroring_image_status: 'error', }, annotations: { - message: 'Mirroring image(s) (PV) in the pool {{ $labels.name }} are in Error state.', - description: 'Mirroring image(s) (PV) in the pool {{ $labels.name }} are in Error state for more than %s. Mirroring is not working as expected.' % $._config.odfPoolMirroringImageHealthCriticalAlertTime, + message: 'Mirroring image(s) (PV) in the pool {{ $labels.name }} are in Error state in namespace:cluster {{ $labels.namespace }}:{{ $labels.managedBy }}.', + description: 'Mirroring image(s) (PV) in the pool {{ $labels.name }} are in Error state for more than %s. Mirroring is not working as expected in namespace:cluster {{ $labels.namespace }}:{{ $labels.managedBy }}.' 
% $._config.odfPoolMirroringImageHealthCriticalAlertTime, storage_type: $._config.cephStorageType, severity_level: 'error', }, @@ -81,8 +81,8 @@ severity: 'critical', }, annotations: { - message: 'Persistent volume {{ $labels.name }}/{{ $labels.namespace }} is not mirrored properly to peer site {{ $labels.site_name }}.', - description: 'Persistent volume {{ $labels.name }}/{{ $labels.namespace }} is not mirrored properly to peer site {{ $labels.site_name }} for more than %s. RBD image={{ $labels.image }} and CephBlockPool={{ $labels.pool_name }}.' % $._config.odfPoolMirroringImageHealthWarningAlertTime, + message: 'Persistent volume {{ $labels.name }}/{{ $labels.namespace }} is not mirrored properly to peer site {{ $labels.site_name }} in namespace:cluster {{ $labels.namespace }}:{{ $labels.managedBy }}.', + description: 'Persistent volume {{ $labels.name }}/{{ $labels.namespace }} is not mirrored properly to peer site {{ $labels.site_name }} for more than %s. RBD image={{ $labels.image }} and CephBlockPool={{ $labels.pool_name }}. Please check namespace:cluster {{ $labels.namespace }}:{{ $labels.managedBy }}.' % $._config.odfPoolMirroringImageHealthWarningAlertTime, storage_type: $._config.cephStorageType, severity_level: 'error', }, @@ -97,8 +97,8 @@ severity: 'warning', }, annotations: { - message: 'Status unknown for Persistent volume {{ $labels.name }}/{{ $labels.namespace }} mirroring to peer site {{ $labels.site_name }}.', - description: 'Status unknown for Persistent volume {{ $labels.name }}/{{ $labels.namespace }} to peer site {{ $labels.site_name }} for more than %s. RBD image={{ $labels.image }} and CephBlockPool={{ $labels.pool_name }}.' 
% $._config.odfPoolMirroringImageHealthWarningAlertTime, + message: 'Status unknown for Persistent volume {{ $labels.name }}/{{ $labels.namespace }} mirroring to peer site {{ $labels.site_name }} in namespace:cluster {{ $labels.namespace }}:{{ $labels.managedBy }}.', + description: 'Status unknown for Persistent volume {{ $labels.name }}/{{ $labels.namespace }} to peer site {{ $labels.site_name }} for more than %s. RBD image={{ $labels.image }} and CephBlockPool={{ $labels.pool_name }}. Please check namespace:cluster {{ $labels.namespace }}:{{ $labels.managedBy }}.' % $._config.odfPoolMirroringImageHealthWarningAlertTime, storage_type: $._config.cephStorageType, severity_level: 'warning', }, diff --git a/metrics/mixin/alerts/obc.libsonnet b/metrics/mixin/alerts/obc.libsonnet index ae3744e601..a25ac02ecb 100644 --- a/metrics/mixin/alerts/obc.libsonnet +++ b/metrics/mixin/alerts/obc.libsonnet @@ -14,8 +14,8 @@ severity: 'warning', }, annotations: { - message: 'OBC has crossed 80% of the quota(bytes).', - description: 'ObjectBucketClaim {{ $labels.objectbucketclaim }} has crossed 80% of the size limit set by the quota(bytes) and will become read-only on reaching the quota limit. Increase the quota in the {{ $labels.objectbucketclaim }} OBC custom resource.', + message: 'OBC has crossed 80% of the quota(bytes) in namespace:cluster {{ $labels.namespace }}:{{ $labels.managedBy }}.', + description: 'ObjectBucketClaim {{ $labels.objectbucketclaim }} has crossed 80% of the size limit set by the quota(bytes) and will become read-only on reaching the quota limit in namespace:cluster {{ $labels.namespace }}:{{ $labels.managedBy }}. 
Increase the quota in the {{ $labels.objectbucketclaim }} OBC custom resource.', storage_type: $._config.objectStorageType, severity_level: 'warning', }, @@ -30,8 +30,8 @@ severity: 'warning', }, annotations: { - message: 'OBC has crossed 80% of the quota(object).', - description: 'ObjectBucketClaim {{ $labels.objectbucketclaim }} has crossed 80% of the size limit set by the quota(objects) and will become read-only on reaching the quota limit. Increase the quota in the {{ $labels.objectbucketclaim }} OBC custom resource.', + message: 'OBC has crossed 80% of the quota(object) in namespace:cluster {{ $labels.namespace }}:{{ $labels.managedBy }}.', + description: 'ObjectBucketClaim {{ $labels.objectbucketclaim }} has crossed 80% of the size limit set by the quota(objects) and will become read-only on reaching the quota limit in namespace:cluster {{ $labels.namespace }}:{{ $labels.managedBy }}. Increase the quota in the {{ $labels.objectbucketclaim }} OBC custom resource.', storage_type: $._config.objectStorageType, severity_level: 'warning', }, @@ -46,8 +46,8 @@ severity: 'critical', }, annotations: { - message: 'OBC reached quota(bytes) limit.', - description: 'ObjectBucketClaim {{ $labels.objectbucketclaim }} has crossed the limit set by the quota(bytes) and will be read-only now. Increase the quota in the {{ $labels.objectbucketclaim }} OBC custom resource immediately.', + message: 'OBC reached quota(bytes) limit in namespace:cluster {{ $labels.namespace }}:{{ $labels.managedBy }}.', + description: 'ObjectBucketClaim {{ $labels.objectbucketclaim }} has crossed the limit set by the quota(bytes) and will be read-only now in namespace:cluster {{ $labels.namespace }}:{{ $labels.managedBy }}. 
Increase the quota in the {{ $labels.objectbucketclaim }} OBC custom resource immediately.', storage_type: $._config.objectStorageType, severity_level: 'error', }, @@ -62,8 +62,8 @@ severity: 'critical', }, annotations: { - message: 'OBC reached quota(object) limit.', - description: 'ObjectBucketClaim {{ $labels.objectbucketclaim }} has crossed the limit set by the quota(objects) and will be read-only now. Increase the quota in the {{ $labels.objectbucketclaim }} OBC custom resource immediately.', + message: 'OBC reached quota(object) limit in namespace:cluster {{ $labels.namespace }}:{{ $labels.managedBy }}.', + description: 'ObjectBucketClaim {{ $labels.objectbucketclaim }} has crossed the limit set by the quota(objects) and will be read-only now in namespace:cluster {{ $labels.namespace }}:{{ $labels.managedBy }}. Increase the quota in the {{ $labels.objectbucketclaim }} OBC custom resource immediately.', storage_type: $._config.objectStorageType, severity_level: 'error', }, diff --git a/metrics/mixin/alerts/services-external.libsonnet b/metrics/mixin/alerts/services-external.libsonnet index 5a41d5b0ea..487b8ae733 100644 --- a/metrics/mixin/alerts/services-external.libsonnet +++ b/metrics/mixin/alerts/services-external.libsonnet @@ -14,8 +14,8 @@ severity: 'critical', }, annotations: { - message: 'Cluster Object Store is in unhealthy state. Please check Ceph cluster health or RGW connection.', - description: 'Cluster Object Store is in unhealthy state for more than %s. Please check Ceph cluster health or RGW connection.' % $._config.clusterObjectStoreStateAlertTime, + message: 'Cluster Object Store is in unhealthy state. Please check Ceph cluster health or RGW connection in namespace:cluster {{ $labels.namespace }}:{{ $labels.managedBy }}.', + description: 'Cluster Object Store is in unhealthy state for more than %s. Please check Ceph cluster health or RGW connection in namespace:cluster {{ $labels.namespace }}:{{ $labels.managedBy }}.' 
% $._config.clusterObjectStoreStateAlertTime, storage_type: $._config.objectStorageType, severity_level: 'error', }, diff --git a/metrics/mixin/alerts/services.libsonnet b/metrics/mixin/alerts/services.libsonnet index a012033a71..a9adfc8119 100644 --- a/metrics/mixin/alerts/services.libsonnet +++ b/metrics/mixin/alerts/services.libsonnet @@ -16,8 +16,8 @@ severity: 'critical', }, annotations: { - message: 'Cluster Object Store is in unhealthy state or number of ready replicas for Rook Ceph RGW deployments is less than the desired replicas.', - description: 'RGW endpoint of the Ceph object store is in a failure state or one or more Rook Ceph RGW deployments have fewer ready replicas than required for more than %s. Please check the health of the Ceph cluster and the deployments.' % $._config.clusterObjectStoreStateAlertTime, + message: 'Cluster Object Store is in unhealthy state or number of ready replicas for Rook Ceph RGW deployments is less than the desired replicas in namespace:cluster {{ $labels.namespace }}:{{ $labels.managedBy }}.', + description: 'RGW endpoint of the Ceph object store is in a failure state or one or more Rook Ceph RGW deployments have fewer ready replicas than required for more than %s. Please check the health of the Ceph cluster and the deployments in namespace:cluster {{ $labels.namespace }}:{{ $labels.managedBy }}.' 
% $._config.clusterObjectStoreStateAlertTime, storage_type: $._config.objectStorageType, severity_level: 'error', }, diff --git a/metrics/mixin/alerts/storage-client.libsonnet b/metrics/mixin/alerts/storage-client.libsonnet index 6382d6c564..af801c956e 100644 --- a/metrics/mixin/alerts/storage-client.libsonnet +++ b/metrics/mixin/alerts/storage-client.libsonnet @@ -13,8 +13,8 @@ severity: 'warning', }, annotations: { - message: 'Storage Client ({{ $labels.storage_consumer_name }}) heartbeat missed for more than %d (s)' % $._config.clientCheckinWarnSec, - description: 'Storage Client ({{ $labels.storage_consumer_name }}) heartbeat missed for more than %d (s). Lossy network connectivity might exist' % $._config.clientCheckinWarnSec, + message: 'Storage Client ({{ $labels.storage_consumer_name }}) heartbeat missed for more than %d (s) in namespace:cluster {{ $labels.namespace }}:{{ $labels.managedBy }}.' % $._config.clientCheckinWarnSec, + description: 'Storage Client ({{ $labels.storage_consumer_name }}) heartbeat missed for more than %d (s). Lossy network connectivity might exist in namespace:cluster {{ $labels.namespace }}:{{ $labels.managedBy }}.' % $._config.clientCheckinWarnSec, severity_level: 'warning', }, }, @@ -27,8 +27,8 @@ severity: 'critical', }, annotations: { - message: 'Storage Client ({{ $labels.storage_consumer_name }}) heartbeat missed for more than %d (s)' % $._config.clientCheckinCritSec, - description: 'Storage Client ({{ $labels.storage_consumer_name }}) heartbeat missed for more than %d (s). Client might have lost internet connectivity' % $._config.clientCheckinCritSec, + message: 'Storage Client ({{ $labels.storage_consumer_name }}) heartbeat missed for more than %d (s) in namespace:cluster {{ $labels.namespace }}:{{ $labels.managedBy }}.' % $._config.clientCheckinCritSec, + description: 'Storage Client ({{ $labels.storage_consumer_name }}) heartbeat missed for more than %d (s). 
Client might have lost internet connectivity in namespace:cluster {{ $labels.namespace }}:{{ $labels.managedBy }}.' % $._config.clientCheckinCritSec, severity_level: 'critical', }, }, @@ -43,8 +43,8 @@ severity: 'warning', }, annotations: { - message: 'Storage Client Operator ({{ $labels.storage_consumer_name }}) lags by %d minor version' % $._config.clientOperatorMinorVerDiff, - description: 'Storage Client Operator ({{ $labels.storage_consumer_name }}) lags by %d minor version. Client configuration may be incompatible' % $._config.clientOperatorMinorVerDiff, + message: 'Storage Client Operator ({{ $labels.storage_consumer_name }}) lags by %d minor version in namespace:cluster {{ $labels.namespace }}:{{ $labels.managedBy }}.' % $._config.clientOperatorMinorVerDiff, + description: 'Storage Client Operator ({{ $labels.storage_consumer_name }}) lags by %d minor version. Client configuration may be incompatible in namespace:cluster {{ $labels.namespace }}:{{ $labels.managedBy }}.' % $._config.clientOperatorMinorVerDiff, severity_level: 'warning', }, }, @@ -60,8 +60,8 @@ severity: 'critical', }, annotations: { - message: 'Storage Client Operator ({{ $labels.storage_consumer_name }}) differs by more than %d minor version' % $._config.clientOperatorMinorVerDiff, - description: 'Storage Client Operator ({{ $labels.storage_consumer_name }}) differs by more than %d minor version. Client configuration may be incompatible and unsupported' % $._config.clientOperatorMinorVerDiff, + message: 'Storage Client Operator ({{ $labels.storage_consumer_name }}) differs by more than %d minor version in namespace:cluster {{ $labels.namespace }}:{{ $labels.managedBy }}.' % $._config.clientOperatorMinorVerDiff, + description: 'Storage Client Operator ({{ $labels.storage_consumer_name }}) differs by more than %d minor version. Client configuration may be incompatible and unsupported in namespace:cluster {{ $labels.namespace }}:{{ $labels.managedBy }}.' 
% $._config.clientOperatorMinorVerDiff, severity_level: 'critical', }, },