Skip to content

Commit

Permalink
Add runbook_url annotation in alerts
Browse files Browse the repository at this point in the history
  • Loading branch information
jmolmo committed Dec 11, 2023
1 parent e452756 commit 301ca34
Show file tree
Hide file tree
Showing 6 changed files with 13 additions and 1 deletion.
1 change: 1 addition & 0 deletions metrics/deploy/prometheus-ocs-rules-external.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -130,6 +130,7 @@ spec:
message: Storage Cluster KMS Server is in un-connected state. Please check KMS config.
severity_level: error
storage_type: ceph
runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/openshift-container-storage-operator/KMSServerConnectionAlert.md
expr: |
ocs_storagecluster_kms_connection_status{job="ocs-metrics-exporter"} == 1
for: 5s
Expand Down
6 changes: 6 additions & 0 deletions metrics/deploy/prometheus-ocs-rules.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,7 @@ spec:
message: Mirror daemon is unhealthy.
severity_level: error
storage_type: ceph
runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/openshift-container-storage-operator/OdfMirrorDaemonStatus.md
expr: |
((count by(namespace) (ocs_mirror_daemon_count{job="ocs-metrics-exporter"} == 0)) * on(namespace) group_left() (count by(namespace) (ocs_pool_mirroring_status{job="ocs-metrics-exporter"} == 1))) > 0
for: 1m
Expand All @@ -82,6 +83,7 @@ spec:
message: Mirroring image(s) (PV) in the pool {{ $labels.name }} are in Unknown state.
severity_level: warning
storage_type: ceph
runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/openshift-container-storage-operator/OdfPoolMirroringImageHealth.md
expr: |
(ocs_pool_mirroring_image_health{job="ocs-metrics-exporter"} * on (namespace) group_left() (max by(namespace) (ocs_pool_mirroring_status{job="ocs-metrics-exporter"}))) == 1
for: 1m
Expand All @@ -94,6 +96,7 @@ spec:
message: Mirroring image(s) (PV) in the pool {{ $labels.name }} are in Warning state.
severity_level: warning
storage_type: ceph
runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/openshift-container-storage-operator/OdfPoolMirroringImageHealth.md
expr: |
(ocs_pool_mirroring_image_health{job="ocs-metrics-exporter"} * on (namespace) group_left() (max by(namespace) (ocs_pool_mirroring_status{job="ocs-metrics-exporter"}))) == 2
for: 1m
Expand All @@ -106,6 +109,7 @@ spec:
message: Mirroring image(s) (PV) in the pool {{ $labels.name }} are in Error state.
severity_level: error
storage_type: ceph
runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/openshift-container-storage-operator/OdfPoolMirroringImageHealth.md
expr: |
(ocs_pool_mirroring_image_health{job="ocs-metrics-exporter"} * on (namespace) group_left() (max by(namespace) (ocs_pool_mirroring_status{job="ocs-metrics-exporter"}))) == 3
for: 10s
Expand Down Expand Up @@ -202,6 +206,7 @@ spec:
description: An RBD client might be blocked by Ceph on node {{ $labels.node_name }}. This alert is triggered when the ocs_rbd_client_blocklisted metric reports a value of 1 for the node and there are pods in a CreateContainerError state on the node. This may cause the filesystem for the PVCs to be in a read-only state. Please check the pod description for more details.
message: An RBD client might be blocked by Ceph on node {{ $labels.node_name }}.
severity_level: error
runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/openshift-container-storage-operator/ODFRBDClientBlocked.md
expr: |
(
ocs_rbd_client_blocklisted{node=~".+"} == 1
Expand All @@ -222,6 +227,7 @@ spec:
message: Storage Cluster KMS Server is in un-connected state. Please check KMS config.
severity_level: error
storage_type: ceph
runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/openshift-container-storage-operator/KMSServerConnectionAlert.md
expr: |
ocs_storagecluster_kms_connection_status{job="ocs-metrics-exporter"} == 1
for: 5s
Expand Down
3 changes: 2 additions & 1 deletion metrics/mixin/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,7 @@ The scope of this directory is to provide OCS specific Prometheus rule files usi
clusterRequestsAlertTime,
storage_type: $._config.storageType,
severity_level: 'error',
runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/openshift-container-storage-operator/ClusterRequests.md
runbook_url: 'https://github.com/openshift/runbooks/blob/master/alerts/openshift-container-storage-operator/ClusterRequests.md',
},
},
],
Expand All @@ -77,6 +77,7 @@ The scope of this directory is to provide OCS specific Prometheus rule files usi
```
* Define constants like clusterRequestsAlertTime, storageType in the metrics/mixin/config.libsonnet file.
* Double-check that there is a 'runbook' file that documents the alert properly. The 'runbook_url' annotation stores the link to it.
* Add this file to **metrics/mixin/alerts/alerts.libsonnet** or **metrics/mixin/alerts/alerts-external.libsonnet**, depending on the type (for an internal or external cluster).
* Test the alert/rule generation by using targets in metrics/mixin/Makefile. Eg: `make prometheus_alert_rules.yaml`. This is **optional** and can be used to isolate issues.
Expand Down
1 change: 1 addition & 0 deletions metrics/mixin/alerts/blocklist.libsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
message: 'An RBD client might be blocked by Ceph on node {{ $labels.node_name }}.',
description: 'An RBD client might be blocked by Ceph on node {{ $labels.node_name }}. This alert is triggered when the ocs_rbd_client_blocklisted metric reports a value of 1 for the node and there are pods in a CreateContainerError state on the node. This may cause the filesystem for the PVCs to be in a read-only state. Please check the pod description for more details.',
severity_level: 'error',
runbook_url: 'https://github.com/openshift/runbooks/blob/master/alerts/openshift-container-storage-operator/ODFRBDClientBlocked.md',
},
},
],
Expand Down
1 change: 1 addition & 0 deletions metrics/mixin/alerts/encryption.libsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
description: 'Storage Cluster KMS Server is in un-connected state for more than %s. Please check KMS config.' % $._config.ocsStorageClusterKMSConnectionAlert,
storage_type: $._config.cephStorageType,
severity_level: 'error',
runbook_url: 'https://github.com/openshift/runbooks/blob/master/alerts/openshift-container-storage-operator/KMSServerConnectionAlert.md',
},
},
],
Expand Down
2 changes: 2 additions & 0 deletions metrics/mixin/alerts/mirroring.libsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
description: 'Mirror daemon is in unhealthy status for more than %s. Mirroring on this cluster is not working as expected.' % $._config.odfMirrorDaemonStatusAlertTime,
storage_type: $._config.cephStorageType,
severity_level: 'error',
runbook_url: 'https://github.com/openshift/runbooks/blob/master/alerts/openshift-container-storage-operator/OdfMirrorDaemonStatus.md',
},
},
{
Expand All @@ -35,6 +36,7 @@
description: 'Mirroring image(s) (PV) in the pool {{ $labels.name }} are in Unknown state for more than %s. Mirroring might not work as expected.' % $._config.odfPoolMirroringImageHealthWarningAlertTime,
storage_type: $._config.cephStorageType,
severity_level: 'warning',
runbook_url: 'https://github.com/openshift/runbooks/blob/master/alerts/openshift-container-storage-operator/OdfPoolMirroringImageHealth.md',
},
},
{
Expand Down

0 comments on commit 301ca34

Please sign in to comment.