Skip to content

Commit

Permalink
Merge pull request #2325 from aruniiird/alerts-changes-for-multicluster
Browse files (browse the repository at this point in the history)
Changed alert queries to support multicluster mode
  • Loading branch information
openshift-merge-bot[bot] authored Dec 18, 2023
2 parents 15b8d1c + 8f67291 commit 723ff01
Show file tree
Hide file tree
Showing 5 changed files with 32 additions and 26 deletions.
8 changes: 4 additions & 4 deletions metrics/deploy/prometheus-ocs-rules-external.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,7 @@ spec:
severity_level: warning
storage_type: RGW
expr: |
(ocs_objectbucketclaim_info * on (namespace, objectbucket) group_left() (ocs_objectbucket_used_bytes/ocs_objectbucket_max_bytes)) > 0.80
(ocs_objectbucketclaim_info * on (namespace, objectbucket, managedBy) group_left() (ocs_objectbucket_used_bytes/ocs_objectbucket_max_bytes)) > 0.80
for: 10s
labels:
severity: warning
Expand All @@ -84,7 +84,7 @@ spec:
severity_level: warning
storage_type: RGW
expr: |
(ocs_objectbucketclaim_info * on (namespace, objectbucket) group_left() (ocs_objectbucket_objects_total/ocs_objectbucket_max_objects)) > 0.80
(ocs_objectbucketclaim_info * on (namespace, managedBy, objectbucket) group_left() (ocs_objectbucket_objects_total/ocs_objectbucket_max_objects)) > 0.80
for: 10s
labels:
severity: warning
Expand All @@ -95,7 +95,7 @@ spec:
severity_level: error
storage_type: RGW
expr: |
(ocs_objectbucketclaim_info * on (namespace, objectbucket) group_left() (ocs_objectbucket_used_bytes/ocs_objectbucket_max_bytes)) >= 1
(ocs_objectbucketclaim_info * on (namespace, managedBy, objectbucket) group_left() (ocs_objectbucket_used_bytes/ocs_objectbucket_max_bytes)) >= 1
for: 0s
labels:
severity: critical
Expand All @@ -106,7 +106,7 @@ spec:
severity_level: error
storage_type: RGW
expr: |
(ocs_objectbucketclaim_info * on (namespace, objectbucket) group_left() (ocs_objectbucket_objects_total/ocs_objectbucket_max_objects)) >= 1
(ocs_objectbucketclaim_info * on (namespace, managedBy, objectbucket) group_left() (ocs_objectbucket_objects_total/ocs_objectbucket_max_objects)) >= 1
for: 0s
labels:
severity: critical
Expand Down
25 changes: 14 additions & 11 deletions metrics/deploy/prometheus-ocs-rules.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,7 @@ spec:
severity_level: error
storage_type: ceph
expr: |
((count by(namespace) (ocs_mirror_daemon_count{job="ocs-metrics-exporter"} == 0)) * on(namespace) group_left() (count by(namespace) (ocs_pool_mirroring_status{job="ocs-metrics-exporter"} == 1))) > 0
((count by(namespace, managedBy) (ocs_mirror_daemon_count{job="ocs-metrics-exporter"} == 0)) * on(namespace, managedBy) group_left() (count by(namespace, managedBy) (ocs_pool_mirroring_status{job="ocs-metrics-exporter"} == 1))) > 0
for: 1m
labels:
severity: critical
Expand All @@ -85,7 +85,7 @@ spec:
severity_level: warning
storage_type: ceph
expr: |
(ocs_pool_mirroring_image_health{job="ocs-metrics-exporter"} * on (namespace) group_left() (max by(namespace) (ocs_pool_mirroring_status{job="ocs-metrics-exporter"}))) == 1
(ocs_pool_mirroring_image_health{job="ocs-metrics-exporter"} * on (namespace, managedBy) group_left() (max by(namespace, managedBy) (ocs_pool_mirroring_status{job="ocs-metrics-exporter"}))) == 1
for: 1m
labels:
mirroring_image_status: unknown
Expand All @@ -94,10 +94,11 @@ spec:
annotations:
description: Mirroring image(s) (PV) in the pool {{ $labels.name }} are in Warning state for more than 1m. Mirroring might not work as expected in namespace:cluster {{ $labels.namespace }}:{{ $labels.managedBy }}.
message: Mirroring image(s) (PV) in the pool {{ $labels.name }} are in Warning state for cluster {{ $labels.namespace }}:{{ $labels.managedBy }}.
runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/openshift-container-storage-operator/OdfPoolMirroringImageHealth.md
severity_level: warning
storage_type: ceph
expr: |
(ocs_pool_mirroring_image_health{job="ocs-metrics-exporter"} * on (namespace) group_left() (max by(namespace) (ocs_pool_mirroring_status{job="ocs-metrics-exporter"}))) == 2
(ocs_pool_mirroring_image_health{job="ocs-metrics-exporter"} * on (namespace, managedBy) group_left() (max by(namespace, managedBy) (ocs_pool_mirroring_status{job="ocs-metrics-exporter"}))) == 2
for: 1m
labels:
mirroring_image_status: warning
Expand All @@ -106,10 +107,11 @@ spec:
annotations:
description: Mirroring image(s) (PV) in the pool {{ $labels.name }} are in Error state for more than 10s. Mirroring is not working as expected in namespace:cluster {{ $labels.namespace }}:{{ $labels.managedBy }}.
message: Mirroring image(s) (PV) in the pool {{ $labels.name }} are in Error state for cluster {{ $labels.namespace }}:{{ $labels.managedBy }}.
runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/openshift-container-storage-operator/OdfPoolMirroringImageHealth.md
severity_level: error
storage_type: ceph
expr: |
(ocs_pool_mirroring_image_health{job="ocs-metrics-exporter"} * on (namespace) group_left() (max by(namespace) (ocs_pool_mirroring_status{job="ocs-metrics-exporter"}))) == 3
(ocs_pool_mirroring_image_health{job="ocs-metrics-exporter"} * on (namespace, managedBy) group_left() (max by(namespace, managedBy) (ocs_pool_mirroring_status{job="ocs-metrics-exporter"}))) == 3
for: 10s
labels:
mirroring_image_status: error
Expand All @@ -122,18 +124,19 @@ spec:
severity_level: error
storage_type: ceph
expr: |
ocs_rbd_mirror_image_state{job="ocs-metrics-exporter"} * on(image,pool_name) group_left(name,namespace) ocs_rbd_pv_metadata{job="ocs-metrics-exporter"} == 1
ocs_rbd_mirror_image_state{job="ocs-metrics-exporter"} * on(image,pool_name) group_left(name,namespace,managedBy) ocs_rbd_pv_metadata{job="ocs-metrics-exporter"} == 1
for: 1m
labels:
severity: critical
- alert: ODFPersistentVolumeMirrorStatus
annotations:
description: Status unknown for Persistent volume {{ $labels.name }}/{{ $labels.namespace }} to peer site {{ $labels.site_name }} for more than 1m. RBD image={{ $labels.image }} and CephBlockPool={{ $labels.pool_name }}. Please check namespace:cluster {{ $labels.namespace }}:{{ $labels.managedBy }}
message: Status unknown for Persistent volume {{ $labels.name }}/{{ $labels.namespace }} mirroring to peer site {{ $labels.site_name }} in namespace:cluster {{ $labels.namespace }}:{{ $labels.managedBy }}.
runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/openshift-container-storage-operator/ODFPersistentVolumeMirrorStatus.md
severity_level: warning
storage_type: ceph
expr: |
ocs_rbd_mirror_image_state{job="ocs-metrics-exporter"} * on(image,pool_name) group_left(name,namespace) ocs_rbd_pv_metadata{job="ocs-metrics-exporter"} == 0
ocs_rbd_mirror_image_state{job="ocs-metrics-exporter"} * on(image,pool_name) group_left(name,namespace,managedBy) ocs_rbd_pv_metadata{job="ocs-metrics-exporter"} == 0
for: 1m
labels:
severity: warning
Expand All @@ -147,7 +150,7 @@ spec:
severity_level: warning
storage_type: RGW
expr: |
(ocs_objectbucketclaim_info * on (namespace, objectbucket) group_left() (ocs_objectbucket_used_bytes/ocs_objectbucket_max_bytes)) > 0.80
(ocs_objectbucketclaim_info * on (namespace, objectbucket, managedBy) group_left() (ocs_objectbucket_used_bytes/ocs_objectbucket_max_bytes)) > 0.80
for: 10s
labels:
severity: warning
Expand All @@ -158,7 +161,7 @@ spec:
severity_level: warning
storage_type: RGW
expr: |
(ocs_objectbucketclaim_info * on (namespace, objectbucket) group_left() (ocs_objectbucket_objects_total/ocs_objectbucket_max_objects)) > 0.80
(ocs_objectbucketclaim_info * on (namespace, managedBy, objectbucket) group_left() (ocs_objectbucket_objects_total/ocs_objectbucket_max_objects)) > 0.80
for: 10s
labels:
severity: warning
Expand All @@ -169,7 +172,7 @@ spec:
severity_level: error
storage_type: RGW
expr: |
(ocs_objectbucketclaim_info * on (namespace, objectbucket) group_left() (ocs_objectbucket_used_bytes/ocs_objectbucket_max_bytes)) >= 1
(ocs_objectbucketclaim_info * on (namespace, managedBy, objectbucket) group_left() (ocs_objectbucket_used_bytes/ocs_objectbucket_max_bytes)) >= 1
for: 0s
labels:
severity: critical
Expand All @@ -180,7 +183,7 @@ spec:
severity_level: error
storage_type: RGW
expr: |
(ocs_objectbucketclaim_info * on (namespace, objectbucket) group_left() (ocs_objectbucket_objects_total/ocs_objectbucket_max_objects)) >= 1
(ocs_objectbucketclaim_info * on (namespace, managedBy, objectbucket) group_left() (ocs_objectbucket_objects_total/ocs_objectbucket_max_objects)) >= 1
for: 0s
labels:
severity: critical
Expand Down Expand Up @@ -213,7 +216,7 @@ spec:
)
and on(node) (
kube_pod_container_status_waiting_reason{reason="CreateContainerError"}
* on(pod, namespace) group_left(node)
* on(pod, namespace, managedBy) group_left(node)
kube_pod_info
) > 0
for: 10s
Expand Down
2 changes: 1 addition & 1 deletion metrics/mixin/alerts/blocklist.libsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
)
and on(node) (
kube_pod_container_status_waiting_reason{reason="CreateContainerError"}
* on(pod, namespace) group_left(node)
* on(pod, namespace, managedBy) group_left(node)
kube_pod_info
) > 0
||| % $._config,
Expand Down
15 changes: 9 additions & 6 deletions metrics/mixin/alerts/mirroring.libsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
{
alert: 'OdfMirrorDaemonStatus',
expr: |||
((count by(namespace) (ocs_mirror_daemon_count{%(ocsExporterSelector)s} == 0)) * on(namespace) group_left() (count by(namespace) (ocs_pool_mirroring_status{%(ocsExporterSelector)s} == 1))) > 0
((count by(namespace, managedBy) (ocs_mirror_daemon_count{%(ocsExporterSelector)s} == 0)) * on(namespace, managedBy) group_left() (count by(namespace, managedBy) (ocs_pool_mirroring_status{%(ocsExporterSelector)s} == 1))) > 0
||| % $._config,
'for': $._config.odfMirrorDaemonStatusAlertTime,
labels: {
Expand All @@ -24,7 +24,7 @@
{
alert: 'OdfPoolMirroringImageHealth',
expr: |||
(ocs_pool_mirroring_image_health{%(ocsExporterSelector)s} * on (namespace) group_left() (max by(namespace) (ocs_pool_mirroring_status{%(ocsExporterSelector)s}))) == 1
(ocs_pool_mirroring_image_health{%(ocsExporterSelector)s} * on (namespace, managedBy) group_left() (max by(namespace, managedBy) (ocs_pool_mirroring_status{%(ocsExporterSelector)s}))) == 1
||| % $._config,
'for': $._config.odfPoolMirroringImageHealthWarningAlertTime,
labels: {
Expand All @@ -42,7 +42,7 @@
{
alert: 'OdfPoolMirroringImageHealth',
expr: |||
(ocs_pool_mirroring_image_health{%(ocsExporterSelector)s} * on (namespace) group_left() (max by(namespace) (ocs_pool_mirroring_status{%(ocsExporterSelector)s}))) == 2
(ocs_pool_mirroring_image_health{%(ocsExporterSelector)s} * on (namespace, managedBy) group_left() (max by(namespace, managedBy) (ocs_pool_mirroring_status{%(ocsExporterSelector)s}))) == 2
||| % $._config,
'for': $._config.odfPoolMirroringImageHealthWarningAlertTime,
labels: {
Expand All @@ -54,12 +54,13 @@
description: 'Mirroring image(s) (PV) in the pool {{ $labels.name }} are in Warning state for more than %s. Mirroring might not work as expected in namespace:cluster {{ $labels.namespace }}:{{ $labels.managedBy }}.' % $._config.odfPoolMirroringImageHealthWarningAlertTime,
storage_type: $._config.cephStorageType,
severity_level: 'warning',
runbook_url: 'https://github.com/openshift/runbooks/blob/master/alerts/openshift-container-storage-operator/OdfPoolMirroringImageHealth.md',
},
},
{
alert: 'OdfPoolMirroringImageHealth',
expr: |||
(ocs_pool_mirroring_image_health{%(ocsExporterSelector)s} * on (namespace) group_left() (max by(namespace) (ocs_pool_mirroring_status{%(ocsExporterSelector)s}))) == 3
(ocs_pool_mirroring_image_health{%(ocsExporterSelector)s} * on (namespace, managedBy) group_left() (max by(namespace, managedBy) (ocs_pool_mirroring_status{%(ocsExporterSelector)s}))) == 3
||| % $._config,
'for': $._config.odfPoolMirroringImageHealthCriticalAlertTime,
labels: {
Expand All @@ -71,12 +72,13 @@
description: 'Mirroring image(s) (PV) in the pool {{ $labels.name }} are in Error state for more than %s. Mirroring is not working as expected in namespace:cluster {{ $labels.namespace }}:{{ $labels.managedBy }}.' % $._config.odfPoolMirroringImageHealthCriticalAlertTime,
storage_type: $._config.cephStorageType,
severity_level: 'error',
runbook_url: 'https://github.com/openshift/runbooks/blob/master/alerts/openshift-container-storage-operator/OdfPoolMirroringImageHealth.md',
},
},
{
alert: 'ODFPersistentVolumeMirrorStatus',
expr: |||
ocs_rbd_mirror_image_state{%(ocsExporterSelector)s} * on(image,pool_name) group_left(name,namespace) ocs_rbd_pv_metadata{%(ocsExporterSelector)s} == 1
ocs_rbd_mirror_image_state{%(ocsExporterSelector)s} * on(image,pool_name) group_left(name,namespace,managedBy) ocs_rbd_pv_metadata{%(ocsExporterSelector)s} == 1
||| % $._config,
'for': $._config.odfPoolMirroringImageHealthWarningAlertTime,
labels: {
Expand All @@ -93,7 +95,7 @@
{
alert: 'ODFPersistentVolumeMirrorStatus',
expr: |||
ocs_rbd_mirror_image_state{%(ocsExporterSelector)s} * on(image,pool_name) group_left(name,namespace) ocs_rbd_pv_metadata{%(ocsExporterSelector)s} == 0
ocs_rbd_mirror_image_state{%(ocsExporterSelector)s} * on(image,pool_name) group_left(name,namespace,managedBy) ocs_rbd_pv_metadata{%(ocsExporterSelector)s} == 0
||| % $._config,
'for': $._config.odfPoolMirroringImageHealthWarningAlertTime,
labels: {
Expand All @@ -104,6 +106,7 @@
description: 'Status unknown for Persistent volume {{ $labels.name }}/{{ $labels.namespace }} to peer site {{ $labels.site_name }} for more than %s. RBD image={{ $labels.image }} and CephBlockPool={{ $labels.pool_name }}. Please check namespace:cluster {{ $labels.namespace }}:{{ $labels.managedBy }}' % $._config.odfPoolMirroringImageHealthWarningAlertTime,
storage_type: $._config.cephStorageType,
severity_level: 'warning',
runbook_url: 'https://github.com/openshift/runbooks/blob/master/alerts/openshift-container-storage-operator/ODFPersistentVolumeMirrorStatus.md'
},
},
],
Expand Down
8 changes: 4 additions & 4 deletions metrics/mixin/alerts/obc.libsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
{
alert: 'ObcQuotaBytesAlert',
expr: |||
(ocs_objectbucketclaim_info * on (namespace, objectbucket) group_left() (ocs_objectbucket_used_bytes/ocs_objectbucket_max_bytes)) > 0.80
(ocs_objectbucketclaim_info * on (namespace, objectbucket, managedBy) group_left() (ocs_objectbucket_used_bytes/ocs_objectbucket_max_bytes)) > 0.80
||| % $._config,
'for': $._config.odfObcQuotaAlertTime,
labels: {
Expand All @@ -24,7 +24,7 @@
{
alert: 'ObcQuotaObjectsAlert',
expr: |||
(ocs_objectbucketclaim_info * on (namespace, objectbucket) group_left() (ocs_objectbucket_objects_total/ocs_objectbucket_max_objects)) > 0.80
(ocs_objectbucketclaim_info * on (namespace, managedBy, objectbucket) group_left() (ocs_objectbucket_objects_total/ocs_objectbucket_max_objects)) > 0.80
||| % $._config,
'for': $._config.odfObcQuotaAlertTime,
labels: {
Expand All @@ -40,7 +40,7 @@
{
alert: 'ObcQuotaBytesExhausedAlert',
expr: |||
(ocs_objectbucketclaim_info * on (namespace, objectbucket) group_left() (ocs_objectbucket_used_bytes/ocs_objectbucket_max_bytes)) >= 1
(ocs_objectbucketclaim_info * on (namespace, managedBy, objectbucket) group_left() (ocs_objectbucket_used_bytes/ocs_objectbucket_max_bytes)) >= 1
||| % $._config,
'for': $._config.odfObcQuotaCriticalAlertTime,
labels: {
Expand All @@ -56,7 +56,7 @@
{
alert: 'ObcQuotaObjectsExhausedAlert',
expr: |||
(ocs_objectbucketclaim_info * on (namespace, objectbucket) group_left() (ocs_objectbucket_objects_total/ocs_objectbucket_max_objects)) >= 1
(ocs_objectbucketclaim_info * on (namespace, managedBy, objectbucket) group_left() (ocs_objectbucket_objects_total/ocs_objectbucket_max_objects)) >= 1
||| % $._config,
'for': $._config.odfObcQuotaCriticalAlertTime,
labels: {
Expand Down

0 comments on commit 723ff01

Please sign in to comment.