Skip to content

Commit

Permalink
Add namespace and cluster name to alert messages
Browse files: browse the repository at this point in the history
Signed-off-by: Arun Kumar Mohan <[email protected]>
  • Loading branch information
aruniiird committed Dec 11, 2023
1 parent 64596db commit bbb9fba
Show file tree
Hide file tree
Showing 8 changed files with 38 additions and 38 deletions.
4 changes: 2 additions & 2 deletions metrics/mixin/alerts/blocklist.libsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,8 @@
severity: 'warning',
},
annotations: {
message: 'An RBD client might be blocked by Ceph on node {{ $labels.node_name }}.',
description: 'An RBD client might be blocked by Ceph on node {{ $labels.node_name }}. This alert is triggered when the ocs_rbd_client_blocklisted metric reports a value of 1 for the node and there are pods in a CreateContainerError state on the node. This may cause the filesystem for the PVCs to be in a read-only state. Please check the pod description for more details.',
message: 'An RBD client might be blocked by Ceph on node {{ $labels.node_name }} in namespace:cluster {{ $labels.namespace }}:{{ $labels.managedBy }}.',
description: 'An RBD client might be blocked by Ceph on node {{ $labels.node_name }} in namespace:cluster {{ $labels.namespace }}:{{ $labels.managedBy }}. This alert is triggered when the ocs_rbd_client_blocklisted metric reports a value of 1 for the node and there are pods in a CreateContainerError state on the node. This may cause the filesystem for the PVCs to be in a read-only state. Please check the pod description for more details.',
severity_level: 'error',
},
},
Expand Down
4 changes: 2 additions & 2 deletions metrics/mixin/alerts/encryption-external.libsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,8 @@
severity: 'critical',
},
annotations: {
message: 'Storage Cluster KMS Server is in un-connected state. Please check KMS config.',
description: 'Storage Cluster KMS Server is in un-connected state for more than %s. Please check KMS config.' % $._config.ocsStorageClusterKMSConnectionAlert,
message: 'Storage Cluster KMS Server is in un-connected state. Please check KMS config in namespace:cluster {{ $labels.namespace }}:{{ $labels.managedBy }}.',
description: 'Storage Cluster KMS Server is in un-connected state for more than %s. Please check KMS config in namespace:cluster {{ $labels.namespace }}:{{ $labels.managedBy }}.' % $._config.ocsStorageClusterKMSConnectionAlert,
storage_type: $._config.cephStorageType,
severity_level: 'error',
},
Expand Down
4 changes: 2 additions & 2 deletions metrics/mixin/alerts/encryption.libsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,8 @@
severity: 'critical',
},
annotations: {
message: 'Storage Cluster KMS Server is in un-connected state. Please check KMS config.',
description: 'Storage Cluster KMS Server is in un-connected state for more than %s. Please check KMS config.' % $._config.ocsStorageClusterKMSConnectionAlert,
message: 'Storage Cluster KMS Server is in un-connected state. Please check KMS config in namespace:cluster {{ $labels.namespace }}:{{ $labels.managedBy }}.',
description: 'Storage Cluster KMS Server is in un-connected state for more than %s. Please check KMS config in namespace:cluster {{ $labels.namespace }}:{{ $labels.managedBy }}.' % $._config.ocsStorageClusterKMSConnectionAlert,
storage_type: $._config.cephStorageType,
severity_level: 'error',
},
Expand Down
24 changes: 12 additions & 12 deletions metrics/mixin/alerts/mirroring.libsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,8 @@
severity: 'critical',
},
annotations: {
message: 'Mirror daemon is unhealthy.',
description: 'Mirror daemon is in unhealthy status for more than %s. Mirroring on this cluster is not working as expected.' % $._config.odfMirrorDaemonStatusAlertTime,
message: 'Mirror daemon is unhealthy in namespace:cluster {{ $labels.namespace }}:{{ $labels.managedBy }}.',
description: 'Mirror daemon is in unhealthy status for more than %s. Mirroring is not working as expected in namespace:cluster {{ $labels.namespace }}:{{ $labels.managedBy }}.' % $._config.odfMirrorDaemonStatusAlertTime,
storage_type: $._config.cephStorageType,
severity_level: 'error',
},
Expand All @@ -31,8 +31,8 @@
mirroring_image_status: 'unknown',
},
annotations: {
message: 'Mirroring image(s) (PV) in the pool {{ $labels.name }} are in Unknown state.',
description: 'Mirroring image(s) (PV) in the pool {{ $labels.name }} are in Unknown state for more than %s. Mirroring might not work as expected.' % $._config.odfPoolMirroringImageHealthWarningAlertTime,
message: 'Mirroring image(s) (PV) in the pool {{ $labels.name }} are in Unknown state in namespace:cluster {{ $labels.namespace }}:{{ $labels.managedBy }}.',
description: 'Mirroring image(s) (PV) in the pool {{ $labels.name }} are in Unknown state for more than %s. Mirroring might not work as expected in namespace:cluster {{ $labels.namespace }}:{{ $labels.managedBy }}.' % $._config.odfPoolMirroringImageHealthWarningAlertTime,
storage_type: $._config.cephStorageType,
severity_level: 'warning',
},
Expand All @@ -48,8 +48,8 @@
mirroring_image_status: 'warning',
},
annotations: {
message: 'Mirroring image(s) (PV) in the pool {{ $labels.name }} are in Warning state.',
description: 'Mirroring image(s) (PV) in the pool {{ $labels.name }} are in Warning state for more than %s. Mirroring might not work as expected.' % $._config.odfPoolMirroringImageHealthWarningAlertTime,
message: 'Mirroring image(s) (PV) in the pool {{ $labels.name }} are in Warning state for cluster {{ $labels.namespace }}:{{ $labels.managedBy }}.',
description: 'Mirroring image(s) (PV) in the pool {{ $labels.name }} are in Warning state for more than %s. Mirroring might not work as expected in namespace:cluster {{ $labels.namespace }}:{{ $labels.managedBy }}.' % $._config.odfPoolMirroringImageHealthWarningAlertTime,
storage_type: $._config.cephStorageType,
severity_level: 'warning',
},
Expand All @@ -65,8 +65,8 @@
mirroring_image_status: 'error',
},
annotations: {
message: 'Mirroring image(s) (PV) in the pool {{ $labels.name }} are in Error state.',
description: 'Mirroring image(s) (PV) in the pool {{ $labels.name }} are in Error state for more than %s. Mirroring is not working as expected.' % $._config.odfPoolMirroringImageHealthCriticalAlertTime,
message: 'Mirroring image(s) (PV) in the pool {{ $labels.name }} are in Error state for cluster {{ $labels.namespace }}:{{ $labels.managedBy }}.',
description: 'Mirroring image(s) (PV) in the pool {{ $labels.name }} are in Error state for more than %s. Mirroring is not working as expected in namespace:cluster {{ $labels.namespace }}:{{ $labels.managedBy }}.' % $._config.odfPoolMirroringImageHealthCriticalAlertTime,
storage_type: $._config.cephStorageType,
severity_level: 'error',
},
Expand All @@ -81,8 +81,8 @@
severity: 'critical',
},
annotations: {
message: 'Persistent volume {{ $labels.name }}/{{ $labels.namespace }} is not mirrored properly to peer site {{ $labels.site_name }}.',
description: 'Persistent volume {{ $labels.name }}/{{ $labels.namespace }} is not mirrored properly to peer site {{ $labels.site_name }} for more than %s. RBD image={{ $labels.image }} and CephBlockPool={{ $labels.pool_name }}.' % $._config.odfPoolMirroringImageHealthWarningAlertTime,
message: 'Persistent volume {{ $labels.name }}/{{ $labels.namespace }} is not mirrored properly to peer site {{ $labels.site_name }} in namespace:cluster {{ $labels.namespace }}:{{ $labels.managedBy }}.',
description: 'Persistent volume {{ $labels.name }}/{{ $labels.namespace }} is not mirrored properly to peer site {{ $labels.site_name }} for more than %s. RBD image={{ $labels.image }} and CephBlockPool={{ $labels.pool_name }}. Please check namespace:cluster {{ $labels.namespace }}:{{ $labels.managedBy }}.' % $._config.odfPoolMirroringImageHealthWarningAlertTime,
storage_type: $._config.cephStorageType,
severity_level: 'error',
},
Expand All @@ -97,8 +97,8 @@
severity: 'warning',
},
annotations: {
message: 'Status unknown for Persistent volume {{ $labels.name }}/{{ $labels.namespace }} mirroring to peer site {{ $labels.site_name }}.',
description: 'Status unknown for Persistent volume {{ $labels.name }}/{{ $labels.namespace }} to peer site {{ $labels.site_name }} for more than %s. RBD image={{ $labels.image }} and CephBlockPool={{ $labels.pool_name }}.' % $._config.odfPoolMirroringImageHealthWarningAlertTime,
message: 'Status unknown for Persistent volume {{ $labels.name }}/{{ $labels.namespace }} mirroring to peer site {{ $labels.site_name }} in namespace:cluster {{ $labels.namespace }}:{{ $labels.managedBy }}.',
description: 'Status unknown for Persistent volume {{ $labels.name }}/{{ $labels.namespace }} to peer site {{ $labels.site_name }} for more than %s. RBD image={{ $labels.image }} and CephBlockPool={{ $labels.pool_name }}. Please check namespace:cluster {{ $labels.namespace }}:{{ $labels.managedBy }}' % $._config.odfPoolMirroringImageHealthWarningAlertTime,
storage_type: $._config.cephStorageType,
severity_level: 'warning',
},
Expand Down
16 changes: 8 additions & 8 deletions metrics/mixin/alerts/obc.libsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,8 @@
severity: 'warning',
},
annotations: {
message: 'OBC has crossed 80% of the quota(bytes).',
description: 'ObjectBucketClaim {{ $labels.objectbucketclaim }} has crossed 80% of the size limit set by the quota(bytes) and will become read-only on reaching the quota limit. Increase the quota in the {{ $labels.objectbucketclaim }} OBC custom resource.',
message: 'OBC has crossed 80% of the quota(bytes) in namespace:cluster {{ $labels.namespace }}:{{ $labels.managedBy }}.',
description: 'ObjectBucketClaim {{ $labels.objectbucketclaim }} has crossed 80% of the size limit set by the quota(bytes) and will become read-only on reaching the quota limit in namespace:cluster {{ $labels.namespace }}:{{ $labels.managedBy }}. Increase the quota in the {{ $labels.objectbucketclaim }} OBC custom resource.',
storage_type: $._config.objectStorageType,
severity_level: 'warning',
},
Expand All @@ -30,8 +30,8 @@
severity: 'warning',
},
annotations: {
message: 'OBC has crossed 80% of the quota(object).',
description: 'ObjectBucketClaim {{ $labels.objectbucketclaim }} has crossed 80% of the size limit set by the quota(objects) and will become read-only on reaching the quota limit. Increase the quota in the {{ $labels.objectbucketclaim }} OBC custom resource.',
message: 'OBC has crossed 80% of the quota(object) in namespace:cluster {{ $labels.namespace }}:{{ $labels.managedBy }}.',
description: 'ObjectBucketClaim {{ $labels.objectbucketclaim }} has crossed 80% of the size limit set by the quota(objects) and will become read-only on reaching the quota limit in namespace:cluster {{ $labels.namespace }}:{{ $labels.managedBy }}. Increase the quota in the {{ $labels.objectbucketclaim }} OBC custom resource.',
storage_type: $._config.objectStorageType,
severity_level: 'warning',
},
Expand All @@ -46,8 +46,8 @@
severity: 'critical',
},
annotations: {
message: 'OBC reached quota(bytes) limit.',
description: 'ObjectBucketClaim {{ $labels.objectbucketclaim }} has crossed the limit set by the quota(bytes) and will be read-only now. Increase the quota in the {{ $labels.objectbucketclaim }} OBC custom resource immediately.',
message: 'OBC reached quota(bytes) limit in namespace:cluster {{ $labels.namespace }}:{{ $labels.managedBy }}.',
description: 'ObjectBucketClaim {{ $labels.objectbucketclaim }} has crossed the limit set by the quota(bytes) and will be read-only now in namespace:cluster {{ $labels.namespace }}:{{ $labels.managedBy }}. Increase the quota in the {{ $labels.objectbucketclaim }} OBC custom resource immediately.',
storage_type: $._config.objectStorageType,
severity_level: 'error',
},
Expand All @@ -62,8 +62,8 @@
severity: 'critical',
},
annotations: {
message: 'OBC reached quota(object) limit.',
description: 'ObjectBucketClaim {{ $labels.objectbucketclaim }} has crossed the limit set by the quota(objects) and will be read-only now. Increase the quota in the {{ $labels.objectbucketclaim }} OBC custom resource immediately.',
message: 'OBC reached quota(object) limit in namespace:cluster {{ $labels.namespace }}:{{ $labels.managedBy }}.',
description: 'ObjectBucketClaim {{ $labels.objectbucketclaim }} has crossed the limit set by the quota(objects) and will be read-only now in namespace:cluster {{ $labels.namespace }}:{{ $labels.managedBy }}. Increase the quota in the {{ $labels.objectbucketclaim }} OBC custom resource immediately.',
storage_type: $._config.objectStorageType,
severity_level: 'error',
},
Expand Down
4 changes: 2 additions & 2 deletions metrics/mixin/alerts/services-external.libsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,8 @@
severity: 'critical',
},
annotations: {
message: 'Cluster Object Store is in unhealthy state. Please check Ceph cluster health or RGW connection.',
description: 'Cluster Object Store is in unhealthy state for more than %s. Please check Ceph cluster health or RGW connection.' % $._config.clusterObjectStoreStateAlertTime,
message: 'Cluster Object Store is in unhealthy state. Please check Ceph cluster health or RGW connection in namespace:cluster {{ $labels.namespace }}:{{ $labels.managedBy }}.',
description: 'Cluster Object Store is in unhealthy state for more than %s. Please check Ceph cluster health or RGW connection in namespace:cluster {{ $labels.namespace }}:{{ $labels.managedBy }}.' % $._config.clusterObjectStoreStateAlertTime,
storage_type: $._config.objectStorageType,
severity_level: 'error',
},
Expand Down
4 changes: 2 additions & 2 deletions metrics/mixin/alerts/services.libsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,8 @@
severity: 'critical',
},
annotations: {
message: 'Cluster Object Store is in unhealthy state or number of ready replicas for Rook Ceph RGW deployments is less than the desired replicas.',
description: 'RGW endpoint of the Ceph object store is in a failure state or one or more Rook Ceph RGW deployments have fewer ready replicas than required for more than %s. Please check the health of the Ceph cluster and the deployments.' % $._config.clusterObjectStoreStateAlertTime,
message: 'Cluster Object Store is in unhealthy state or number of ready replicas for Rook Ceph RGW deployments is less than the desired replicas in namespace:cluster {{ $labels.namespace }}:{{ $labels.managedBy }}.',
description: 'RGW endpoint of the Ceph object store is in a failure state or one or more Rook Ceph RGW deployments have fewer ready replicas than required for more than %s. Please check the health of the Ceph cluster and the deployments in namespace:cluster {{ $labels.namespace }}:{{ $labels.managedBy }}.' % $._config.clusterObjectStoreStateAlertTime,
storage_type: $._config.objectStorageType,
severity_level: 'error',
},
Expand Down
16 changes: 8 additions & 8 deletions metrics/mixin/alerts/storage-client.libsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,8 @@
severity: 'warning',
},
annotations: {
message: 'Storage Client ({{ $labels.storage_consumer_name }}) heartbeat missed for more than %d (s)' % $._config.clientCheckinWarnSec,
description: 'Storage Client ({{ $labels.storage_consumer_name }}) heartbeat missed for more than %d (s). Lossy network connectivity might exist' % $._config.clientCheckinWarnSec,
message: 'Storage Client ({{ $labels.storage_consumer_name }}) heartbeat missed for more than %d (s) in namespace:cluster {{ $labels.namespace }}:{{ $labels.managedBy }}.' % $._config.clientCheckinWarnSec,
description: 'Storage Client ({{ $labels.storage_consumer_name }}) heartbeat missed for more than %d (s). Lossy network connectivity might exist in namespace:cluster {{ $labels.namespace }}:{{ $labels.managedBy }}.' % $._config.clientCheckinWarnSec,
severity_level: 'warning',
},
},
Expand All @@ -27,8 +27,8 @@
severity: 'critical',
},
annotations: {
message: 'Storage Client ({{ $labels.storage_consumer_name }}) heartbeat missed for more than %d (s)' % $._config.clientCheckinCritSec,
description: 'Storage Client ({{ $labels.storage_consumer_name }}) heartbeat missed for more than %d (s). Client might have lost internet connectivity' % $._config.clientCheckinCritSec,
message: 'Storage Client ({{ $labels.storage_consumer_name }}) heartbeat missed for more than %d (s) in namespace:cluster {{ $labels.namespace }}:{{ $labels.managedBy }}.' % $._config.clientCheckinCritSec,
description: 'Storage Client ({{ $labels.storage_consumer_name }}) heartbeat missed for more than %d (s). Client might have lost internet connectivity in namespace:cluster {{ $labels.namespace }}:{{ $labels.managedBy }}.' % $._config.clientCheckinCritSec,
severity_level: 'critical',
},
},
Expand All @@ -43,8 +43,8 @@
severity: 'warning',
},
annotations: {
message: 'Storage Client Operator ({{ $labels.storage_consumer_name }}) lags by %d minor version' % $._config.clientOperatorMinorVerDiff,
description: 'Storage Client Operator ({{ $labels.storage_consumer_name }}) lags by %d minor version. Client configuration may be incompatible' % $._config.clientOperatorMinorVerDiff,
message: 'Storage Client Operator ({{ $labels.storage_consumer_name }}) lags by %d minor version in namespace:cluster {{ $labels.namespace }}:{{ $labels.managedBy }}.' % $._config.clientOperatorMinorVerDiff,
description: 'Storage Client Operator ({{ $labels.storage_consumer_name }}) lags by %d minor version. Client configuration may be incompatible in namespace:cluster {{ $labels.namespace }}:{{ $labels.managedBy }}.' % $._config.clientOperatorMinorVerDiff,
severity_level: 'warning',
},
},
Expand All @@ -60,8 +60,8 @@
severity: 'critical',
},
annotations: {
message: 'Storage Client Operator ({{ $labels.storage_consumer_name }}) differs by more than %d minor version' % $._config.clientOperatorMinorVerDiff,
description: 'Storage Client Operator ({{ $labels.storage_consumer_name }}) differs by more than %d minor version. Client configuration may be incompatible and unsupported' % $._config.clientOperatorMinorVerDiff,
message: 'Storage Client Operator ({{ $labels.storage_consumer_name }}) differs by more than %d minor version in namespace:cluster {{ $labels.namespace }}:{{ $labels.managedBy }}.' % $._config.clientOperatorMinorVerDiff,
description: 'Storage Client Operator ({{ $labels.storage_consumer_name }}) differs by more than %d minor version. Client configuration may be incompatible and unsupported in namespace:cluster {{ $labels.namespace }}:{{ $labels.managedBy }}.' % $._config.clientOperatorMinorVerDiff,
severity_level: 'critical',
},
},
Expand Down

0 comments on commit bbb9fba

Please sign in to comment.