From bbb9fba25c990bdfdd88e8b95cf3d84d87756a03 Mon Sep 17 00:00:00 2001 From: Arun Kumar Mohan Date: Mon, 11 Dec 2023 06:58:25 +0530 Subject: [PATCH] Add namespace and cluster name to alert messages Signed-off-by: Arun Kumar Mohan --- metrics/mixin/alerts/blocklist.libsonnet | 4 ++-- .../alerts/encryption-external.libsonnet | 4 ++-- metrics/mixin/alerts/encryption.libsonnet | 4 ++-- metrics/mixin/alerts/mirroring.libsonnet | 24 +++++++++---------- metrics/mixin/alerts/obc.libsonnet | 16 ++++++------- .../mixin/alerts/services-external.libsonnet | 4 ++-- metrics/mixin/alerts/services.libsonnet | 4 ++-- metrics/mixin/alerts/storage-client.libsonnet | 16 ++++++------- 8 files changed, 38 insertions(+), 38 deletions(-) diff --git a/metrics/mixin/alerts/blocklist.libsonnet b/metrics/mixin/alerts/blocklist.libsonnet index bf00d9a006..ec435bf66e 100644 --- a/metrics/mixin/alerts/blocklist.libsonnet +++ b/metrics/mixin/alerts/blocklist.libsonnet @@ -21,8 +21,8 @@ severity: 'warning', }, annotations: { - message: 'An RBD client might be blocked by Ceph on node {{ $labels.node_name }}.', - description: 'An RBD client might be blocked by Ceph on node {{ $labels.node_name }}. This alert is triggered when the ocs_rbd_client_blocklisted metric reports a value of 1 for the node and there are pods in a CreateContainerError state on the node. This may cause the filesystem for the PVCs to be in a read-only state. Please check the pod description for more details.', + message: 'An RBD client might be blocked by Ceph on node {{ $labels.node_name }} in namespace:cluster {{ $labels.namespace }}:{{ $labels.managedBy }}.', + description: 'An RBD client might be blocked by Ceph on node {{ $labels.node_name }} in namespace:cluster {{ $labels.namespace }}:{{ $labels.managedBy }}. This alert is triggered when the ocs_rbd_client_blocklisted metric reports a value of 1 for the node and there are pods in a CreateContainerError state on the node. 
This may cause the filesystem for the PVCs to be in a read-only state. Please check the pod description for more details.', severity_level: 'error', }, }, diff --git a/metrics/mixin/alerts/encryption-external.libsonnet b/metrics/mixin/alerts/encryption-external.libsonnet index 775ed735f7..c32742b313 100644 --- a/metrics/mixin/alerts/encryption-external.libsonnet +++ b/metrics/mixin/alerts/encryption-external.libsonnet @@ -14,8 +14,8 @@ severity: 'critical', }, annotations: { - message: 'Storage Cluster KMS Server is in un-connected state. Please check KMS config.', - description: 'Storage Cluster KMS Server is in un-connected state for more than %s. Please check KMS config.' % $._config.ocsStorageClusterKMSConnectionAlert, + message: 'Storage Cluster KMS Server is in un-connected state. Please check KMS config in namespace:cluster {{ $labels.namespace }}:{{ $labels.managedBy }}.', + description: 'Storage Cluster KMS Server is in un-connected state for more than %s. Please check KMS config in namespace:cluster {{ $labels.namespace }}:{{ $labels.managedBy }}.' % $._config.ocsStorageClusterKMSConnectionAlert, storage_type: $._config.cephStorageType, severity_level: 'error', }, diff --git a/metrics/mixin/alerts/encryption.libsonnet b/metrics/mixin/alerts/encryption.libsonnet index 05f1a6f086..7ace0f025f 100644 --- a/metrics/mixin/alerts/encryption.libsonnet +++ b/metrics/mixin/alerts/encryption.libsonnet @@ -14,8 +14,8 @@ severity: 'critical', }, annotations: { - message: 'Storage Cluster KMS Server is in un-connected state. Please check KMS config.', - description: 'Storage Cluster KMS Server is in un-connected state for more than %s. Please check KMS config.' % $._config.ocsStorageClusterKMSConnectionAlert, + message: 'Storage Cluster KMS Server is in un-connected state. Please check KMS config in namespace:cluster {{ $labels.namespace }}:{{ $labels.managedBy }}.', + description: 'Storage Cluster KMS Server is in un-connected state for more than %s. 
Please check KMS config in namespace:cluster {{ $labels.namespace }}:{{ $labels.managedBy }}.' % $._config.ocsStorageClusterKMSConnectionAlert, storage_type: $._config.cephStorageType, severity_level: 'error', }, diff --git a/metrics/mixin/alerts/mirroring.libsonnet b/metrics/mixin/alerts/mirroring.libsonnet index 194c352429..29419cde86 100644 --- a/metrics/mixin/alerts/mirroring.libsonnet +++ b/metrics/mixin/alerts/mirroring.libsonnet @@ -14,8 +14,8 @@ severity: 'critical', }, annotations: { - message: 'Mirror daemon is unhealthy.', - description: 'Mirror daemon is in unhealthy status for more than %s. Mirroring on this cluster is not working as expected.' % $._config.odfMirrorDaemonStatusAlertTime, + message: 'Mirror daemon is unhealthy in namespace:cluster {{ $labels.namespace }}:{{ $labels.managedBy }}.', + description: 'Mirror daemon is in unhealthy status for more than %s. Mirroring is not working as expected in namespace:cluster {{ $labels.namespace }}:{{ $labels.managedBy }}.' % $._config.odfMirrorDaemonStatusAlertTime, storage_type: $._config.cephStorageType, severity_level: 'error', }, @@ -31,8 +31,8 @@ mirroring_image_status: 'unknown', }, annotations: { - message: 'Mirroring image(s) (PV) in the pool {{ $labels.name }} are in Unknown state.', - description: 'Mirroring image(s) (PV) in the pool {{ $labels.name }} are in Unknown state for more than %s. Mirroring might not work as expected.' % $._config.odfPoolMirroringImageHealthWarningAlertTime, + message: 'Mirroring image(s) (PV) in the pool {{ $labels.name }} are in Unknown state in namespace:cluster {{ $labels.namespace }}:{{ $labels.managedBy }}.', + description: 'Mirroring image(s) (PV) in the pool {{ $labels.name }} are in Unknown state for more than %s. Mirroring might not work as expected in namespace:cluster {{ $labels.namespace }}:{{ $labels.managedBy }}.' 
% $._config.odfPoolMirroringImageHealthWarningAlertTime, storage_type: $._config.cephStorageType, severity_level: 'warning', }, @@ -48,8 +48,8 @@ mirroring_image_status: 'warning', }, annotations: { - message: 'Mirroring image(s) (PV) in the pool {{ $labels.name }} are in Warning state.', - description: 'Mirroring image(s) (PV) in the pool {{ $labels.name }} are in Warning state for more than %s. Mirroring might not work as expected.' % $._config.odfPoolMirroringImageHealthWarningAlertTime, + message: 'Mirroring image(s) (PV) in the pool {{ $labels.name }} are in Warning state in namespace:cluster {{ $labels.namespace }}:{{ $labels.managedBy }}.', + description: 'Mirroring image(s) (PV) in the pool {{ $labels.name }} are in Warning state for more than %s. Mirroring might not work as expected in namespace:cluster {{ $labels.namespace }}:{{ $labels.managedBy }}.' % $._config.odfPoolMirroringImageHealthWarningAlertTime, storage_type: $._config.cephStorageType, severity_level: 'warning', }, @@ -65,8 +65,8 @@ mirroring_image_status: 'error', }, annotations: { - message: 'Mirroring image(s) (PV) in the pool {{ $labels.name }} are in Error state.', - description: 'Mirroring image(s) (PV) in the pool {{ $labels.name }} are in Error state for more than %s. Mirroring is not working as expected.' % $._config.odfPoolMirroringImageHealthCriticalAlertTime, + message: 'Mirroring image(s) (PV) in the pool {{ $labels.name }} are in Error state in namespace:cluster {{ $labels.namespace }}:{{ $labels.managedBy }}.', + description: 'Mirroring image(s) (PV) in the pool {{ $labels.name }} are in Error state for more than %s. Mirroring is not working as expected in namespace:cluster {{ $labels.namespace }}:{{ $labels.managedBy }}.' 
% $._config.odfPoolMirroringImageHealthCriticalAlertTime, storage_type: $._config.cephStorageType, severity_level: 'error', }, @@ -81,8 +81,8 @@ severity: 'critical', }, annotations: { - message: 'Persistent volume {{ $labels.name }}/{{ $labels.namespace }} is not mirrored properly to peer site {{ $labels.site_name }}.', - description: 'Persistent volume {{ $labels.name }}/{{ $labels.namespace }} is not mirrored properly to peer site {{ $labels.site_name }} for more than %s. RBD image={{ $labels.image }} and CephBlockPool={{ $labels.pool_name }}.' % $._config.odfPoolMirroringImageHealthWarningAlertTime, + message: 'Persistent volume {{ $labels.name }}/{{ $labels.namespace }} is not mirrored properly to peer site {{ $labels.site_name }} in namespace:cluster {{ $labels.namespace }}:{{ $labels.managedBy }}.', + description: 'Persistent volume {{ $labels.name }}/{{ $labels.namespace }} is not mirrored properly to peer site {{ $labels.site_name }} for more than %s. RBD image={{ $labels.image }} and CephBlockPool={{ $labels.pool_name }}. Please check namespace:cluster {{ $labels.namespace }}:{{ $labels.managedBy }}.' % $._config.odfPoolMirroringImageHealthWarningAlertTime, storage_type: $._config.cephStorageType, severity_level: 'error', }, @@ -97,8 +97,8 @@ severity: 'warning', }, annotations: { - message: 'Status unknown for Persistent volume {{ $labels.name }}/{{ $labels.namespace }} mirroring to peer site {{ $labels.site_name }}.', - description: 'Status unknown for Persistent volume {{ $labels.name }}/{{ $labels.namespace }} to peer site {{ $labels.site_name }} for more than %s. RBD image={{ $labels.image }} and CephBlockPool={{ $labels.pool_name }}.' 
% $._config.odfPoolMirroringImageHealthWarningAlertTime, + message: 'Status unknown for Persistent volume {{ $labels.name }}/{{ $labels.namespace }} mirroring to peer site {{ $labels.site_name }} in namespace:cluster {{ $labels.namespace }}:{{ $labels.managedBy }}.', + description: 'Status unknown for Persistent volume {{ $labels.name }}/{{ $labels.namespace }} to peer site {{ $labels.site_name }} for more than %s. RBD image={{ $labels.image }} and CephBlockPool={{ $labels.pool_name }}. Please check namespace:cluster {{ $labels.namespace }}:{{ $labels.managedBy }}.' % $._config.odfPoolMirroringImageHealthWarningAlertTime, storage_type: $._config.cephStorageType, severity_level: 'warning', }, diff --git a/metrics/mixin/alerts/obc.libsonnet b/metrics/mixin/alerts/obc.libsonnet index ae3744e601..a25ac02ecb 100644 --- a/metrics/mixin/alerts/obc.libsonnet +++ b/metrics/mixin/alerts/obc.libsonnet @@ -14,8 +14,8 @@ severity: 'warning', }, annotations: { - message: 'OBC has crossed 80% of the quota(bytes).', - description: 'ObjectBucketClaim {{ $labels.objectbucketclaim }} has crossed 80% of the size limit set by the quota(bytes) and will become read-only on reaching the quota limit. Increase the quota in the {{ $labels.objectbucketclaim }} OBC custom resource.', + message: 'OBC has crossed 80% of the quota(bytes) in namespace:cluster {{ $labels.namespace }}:{{ $labels.managedBy }}.', + description: 'ObjectBucketClaim {{ $labels.objectbucketclaim }} has crossed 80% of the size limit set by the quota(bytes) and will become read-only on reaching the quota limit in namespace:cluster {{ $labels.namespace }}:{{ $labels.managedBy }}. 
Increase the quota in the {{ $labels.objectbucketclaim }} OBC custom resource.', storage_type: $._config.objectStorageType, severity_level: 'warning', }, @@ -30,8 +30,8 @@ severity: 'warning', }, annotations: { - message: 'OBC has crossed 80% of the quota(object).', - description: 'ObjectBucketClaim {{ $labels.objectbucketclaim }} has crossed 80% of the size limit set by the quota(objects) and will become read-only on reaching the quota limit. Increase the quota in the {{ $labels.objectbucketclaim }} OBC custom resource.', + message: 'OBC has crossed 80% of the quota(object) in namespace:cluster {{ $labels.namespace }}:{{ $labels.managedBy }}.', + description: 'ObjectBucketClaim {{ $labels.objectbucketclaim }} has crossed 80% of the size limit set by the quota(objects) and will become read-only on reaching the quota limit in namespace:cluster {{ $labels.namespace }}:{{ $labels.managedBy }}. Increase the quota in the {{ $labels.objectbucketclaim }} OBC custom resource.', storage_type: $._config.objectStorageType, severity_level: 'warning', }, @@ -46,8 +46,8 @@ severity: 'critical', }, annotations: { - message: 'OBC reached quota(bytes) limit.', - description: 'ObjectBucketClaim {{ $labels.objectbucketclaim }} has crossed the limit set by the quota(bytes) and will be read-only now. Increase the quota in the {{ $labels.objectbucketclaim }} OBC custom resource immediately.', + message: 'OBC reached quota(bytes) limit in namespace:cluster {{ $labels.namespace }}:{{ $labels.managedBy }}.', + description: 'ObjectBucketClaim {{ $labels.objectbucketclaim }} has crossed the limit set by the quota(bytes) and will be read-only now in namespace:cluster {{ $labels.namespace }}:{{ $labels.managedBy }}. 
Increase the quota in the {{ $labels.objectbucketclaim }} OBC custom resource immediately.', storage_type: $._config.objectStorageType, severity_level: 'error', }, @@ -62,8 +62,8 @@ severity: 'critical', }, annotations: { - message: 'OBC reached quota(object) limit.', - description: 'ObjectBucketClaim {{ $labels.objectbucketclaim }} has crossed the limit set by the quota(objects) and will be read-only now. Increase the quota in the {{ $labels.objectbucketclaim }} OBC custom resource immediately.', + message: 'OBC reached quota(object) limit in namespace:cluster {{ $labels.namespace }}:{{ $labels.managedBy }}.', + description: 'ObjectBucketClaim {{ $labels.objectbucketclaim }} has crossed the limit set by the quota(objects) and will be read-only now in namespace:cluster {{ $labels.namespace }}:{{ $labels.managedBy }}. Increase the quota in the {{ $labels.objectbucketclaim }} OBC custom resource immediately.', storage_type: $._config.objectStorageType, severity_level: 'error', }, diff --git a/metrics/mixin/alerts/services-external.libsonnet b/metrics/mixin/alerts/services-external.libsonnet index 5a41d5b0ea..487b8ae733 100644 --- a/metrics/mixin/alerts/services-external.libsonnet +++ b/metrics/mixin/alerts/services-external.libsonnet @@ -14,8 +14,8 @@ severity: 'critical', }, annotations: { - message: 'Cluster Object Store is in unhealthy state. Please check Ceph cluster health or RGW connection.', - description: 'Cluster Object Store is in unhealthy state for more than %s. Please check Ceph cluster health or RGW connection.' % $._config.clusterObjectStoreStateAlertTime, + message: 'Cluster Object Store is in unhealthy state. Please check Ceph cluster health or RGW connection in namespace:cluster {{ $labels.namespace }}:{{ $labels.managedBy }}.', + description: 'Cluster Object Store is in unhealthy state for more than %s. Please check Ceph cluster health or RGW connection in namespace:cluster {{ $labels.namespace }}:{{ $labels.managedBy }}.' 
% $._config.clusterObjectStoreStateAlertTime, storage_type: $._config.objectStorageType, severity_level: 'error', }, diff --git a/metrics/mixin/alerts/services.libsonnet b/metrics/mixin/alerts/services.libsonnet index a012033a71..a9adfc8119 100644 --- a/metrics/mixin/alerts/services.libsonnet +++ b/metrics/mixin/alerts/services.libsonnet @@ -16,8 +16,8 @@ severity: 'critical', }, annotations: { - message: 'Cluster Object Store is in unhealthy state or number of ready replicas for Rook Ceph RGW deployments is less than the desired replicas.', - description: 'RGW endpoint of the Ceph object store is in a failure state or one or more Rook Ceph RGW deployments have fewer ready replicas than required for more than %s. Please check the health of the Ceph cluster and the deployments.' % $._config.clusterObjectStoreStateAlertTime, + message: 'Cluster Object Store is in unhealthy state or number of ready replicas for Rook Ceph RGW deployments is less than the desired replicas in namespace:cluster {{ $labels.namespace }}:{{ $labels.managedBy }}.', + description: 'RGW endpoint of the Ceph object store is in a failure state or one or more Rook Ceph RGW deployments have fewer ready replicas than required for more than %s. Please check the health of the Ceph cluster and the deployments in namespace:cluster {{ $labels.namespace }}:{{ $labels.managedBy }}.' 
% $._config.clusterObjectStoreStateAlertTime, storage_type: $._config.objectStorageType, severity_level: 'error', }, diff --git a/metrics/mixin/alerts/storage-client.libsonnet b/metrics/mixin/alerts/storage-client.libsonnet index 6382d6c564..af801c956e 100644 --- a/metrics/mixin/alerts/storage-client.libsonnet +++ b/metrics/mixin/alerts/storage-client.libsonnet @@ -13,8 +13,8 @@ severity: 'warning', }, annotations: { - message: 'Storage Client ({{ $labels.storage_consumer_name }}) heartbeat missed for more than %d (s)' % $._config.clientCheckinWarnSec, - description: 'Storage Client ({{ $labels.storage_consumer_name }}) heartbeat missed for more than %d (s). Lossy network connectivity might exist' % $._config.clientCheckinWarnSec, + message: 'Storage Client ({{ $labels.storage_consumer_name }}) heartbeat missed for more than %d (s) in namespace:cluster {{ $labels.namespace }}:{{ $labels.managedBy }}.' % $._config.clientCheckinWarnSec, + description: 'Storage Client ({{ $labels.storage_consumer_name }}) heartbeat missed for more than %d (s). Lossy network connectivity might exist in namespace:cluster {{ $labels.namespace }}:{{ $labels.managedBy }}.' % $._config.clientCheckinWarnSec, severity_level: 'warning', }, }, @@ -27,8 +27,8 @@ severity: 'critical', }, annotations: { - message: 'Storage Client ({{ $labels.storage_consumer_name }}) heartbeat missed for more than %d (s)' % $._config.clientCheckinCritSec, - description: 'Storage Client ({{ $labels.storage_consumer_name }}) heartbeat missed for more than %d (s). Client might have lost internet connectivity' % $._config.clientCheckinCritSec, + message: 'Storage Client ({{ $labels.storage_consumer_name }}) heartbeat missed for more than %d (s) in namespace:cluster {{ $labels.namespace }}:{{ $labels.managedBy }}.' % $._config.clientCheckinCritSec, + description: 'Storage Client ({{ $labels.storage_consumer_name }}) heartbeat missed for more than %d (s). 
Client might have lost internet connectivity in namespace:cluster {{ $labels.namespace }}:{{ $labels.managedBy }}.' % $._config.clientCheckinCritSec, severity_level: 'critical', }, }, @@ -43,8 +43,8 @@ severity: 'warning', }, annotations: { - message: 'Storage Client Operator ({{ $labels.storage_consumer_name }}) lags by %d minor version' % $._config.clientOperatorMinorVerDiff, - description: 'Storage Client Operator ({{ $labels.storage_consumer_name }}) lags by %d minor version. Client configuration may be incompatible' % $._config.clientOperatorMinorVerDiff, + message: 'Storage Client Operator ({{ $labels.storage_consumer_name }}) lags by %d minor version in namespace:cluster {{ $labels.namespace }}:{{ $labels.managedBy }}.' % $._config.clientOperatorMinorVerDiff, + description: 'Storage Client Operator ({{ $labels.storage_consumer_name }}) lags by %d minor version. Client configuration may be incompatible in namespace:cluster {{ $labels.namespace }}:{{ $labels.managedBy }}.' % $._config.clientOperatorMinorVerDiff, severity_level: 'warning', }, }, @@ -60,8 +60,8 @@ severity: 'critical', }, annotations: { - message: 'Storage Client Operator ({{ $labels.storage_consumer_name }}) differs by more than %d minor version' % $._config.clientOperatorMinorVerDiff, - description: 'Storage Client Operator ({{ $labels.storage_consumer_name }}) differs by more than %d minor version. Client configuration may be incompatible and unsupported' % $._config.clientOperatorMinorVerDiff, + message: 'Storage Client Operator ({{ $labels.storage_consumer_name }}) differs by more than %d minor version in namespace:cluster {{ $labels.namespace }}:{{ $labels.managedBy }}.' % $._config.clientOperatorMinorVerDiff, + description: 'Storage Client Operator ({{ $labels.storage_consumer_name }}) differs by more than %d minor version. Client configuration may be incompatible and unsupported in namespace:cluster {{ $labels.namespace }}:{{ $labels.managedBy }}.' 
% $._config.clientOperatorMinorVerDiff, severity_level: 'critical', }, },