From 301ca34c3e20414448ce230aaac62dfd27406115 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Juan=20Miguel=20Olmo=20Mart=C3=ADnez?= Date: Tue, 5 Dec 2023 11:11:18 +0100 Subject: [PATCH] Add runbook_url annotation in alerts --- metrics/deploy/prometheus-ocs-rules-external.yaml | 1 + metrics/deploy/prometheus-ocs-rules.yaml | 6 ++++++ metrics/mixin/README.md | 3 ++- metrics/mixin/alerts/blocklist.libsonnet | 1 + metrics/mixin/alerts/encryption.libsonnet | 1 + metrics/mixin/alerts/mirroring.libsonnet | 2 ++ 6 files changed, 13 insertions(+), 1 deletion(-) diff --git a/metrics/deploy/prometheus-ocs-rules-external.yaml b/metrics/deploy/prometheus-ocs-rules-external.yaml index 470e280885..29c34f2b69 100644 --- a/metrics/deploy/prometheus-ocs-rules-external.yaml +++ b/metrics/deploy/prometheus-ocs-rules-external.yaml @@ -130,6 +130,7 @@ spec: message: Storage Cluster KMS Server is in un-connected state. Please check KMS config. severity_level: error storage_type: ceph + runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/openshift-container-storage-operator/KMSServerConnectionAlert.md expr: | ocs_storagecluster_kms_connection_status{job="ocs-metrics-exporter"} == 1 for: 5s diff --git a/metrics/deploy/prometheus-ocs-rules.yaml b/metrics/deploy/prometheus-ocs-rules.yaml index 0aae30b74c..3b67d6bc57 100644 --- a/metrics/deploy/prometheus-ocs-rules.yaml +++ b/metrics/deploy/prometheus-ocs-rules.yaml @@ -71,6 +71,7 @@ spec: message: Mirror daemon is unhealthy. severity_level: error storage_type: ceph + runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/openshift-container-storage-operator/OdfMirrorDaemonStatus.md expr: | ((count by(namespace) (ocs_mirror_daemon_count{job="ocs-metrics-exporter"} == 0)) * on(namespace) group_left() (count by(namespace) (ocs_pool_mirroring_status{job="ocs-metrics-exporter"} == 1))) > 0 for: 1m @@ -82,6 +83,7 @@ spec: message: Mirroring image(s) (PV) in the pool {{ $labels.name }} are in Unknown state. 
severity_level: warning storage_type: ceph + runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/openshift-container-storage-operator/OdfPoolMirroringImageHealth.md expr: | (ocs_pool_mirroring_image_health{job="ocs-metrics-exporter"} * on (namespace) group_left() (max by(namespace) (ocs_pool_mirroring_status{job="ocs-metrics-exporter"}))) == 1 for: 1m @@ -94,6 +96,7 @@ spec: message: Mirroring image(s) (PV) in the pool {{ $labels.name }} are in Warning state. severity_level: warning storage_type: ceph + runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/openshift-container-storage-operator/OdfPoolMirroringImageHealth.md expr: | (ocs_pool_mirroring_image_health{job="ocs-metrics-exporter"} * on (namespace) group_left() (max by(namespace) (ocs_pool_mirroring_status{job="ocs-metrics-exporter"}))) == 2 for: 1m @@ -106,6 +109,7 @@ spec: message: Mirroring image(s) (PV) in the pool {{ $labels.name }} are in Error state. severity_level: error storage_type: ceph + runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/openshift-container-storage-operator/OdfPoolMirroringImageHealth.md expr: | (ocs_pool_mirroring_image_health{job="ocs-metrics-exporter"} * on (namespace) group_left() (max by(namespace) (ocs_pool_mirroring_status{job="ocs-metrics-exporter"}))) == 3 for: 10s @@ -202,6 +206,7 @@ spec: description: An RBD client might be blocked by Ceph on node {{ $labels.node_name }}. This alert is triggered when the ocs_rbd_client_blocklisted metric reports a value of 1 for the node and there are pods in a CreateContainerError state on the node. This may cause the filesystem for the PVCs to be in a read-only state. Please check the pod description for more details. message: An RBD client might be blocked by Ceph on node {{ $labels.node_name }}. 
severity_level: error + runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/openshift-container-storage-operator/ODFRBDClientBlocked.md expr: | ( ocs_rbd_client_blocklisted{node=~".+"} == 1 @@ -222,6 +227,7 @@ spec: message: Storage Cluster KMS Server is in un-connected state. Please check KMS config. severity_level: error storage_type: ceph + runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/openshift-container-storage-operator/KMSServerConnectionAlert.md expr: | ocs_storagecluster_kms_connection_status{job="ocs-metrics-exporter"} == 1 for: 5s diff --git a/metrics/mixin/README.md b/metrics/mixin/README.md index 0d00b5a0a1..d5a54265f3 100644 --- a/metrics/mixin/README.md +++ b/metrics/mixin/README.md @@ -66,7 +66,7 @@ The scope of this directory is to provide OCS specific Prometheus rule files usi clusterRequestsAlertTime, storage_type: $._config.storageType, severity_level: 'error', - runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/openshift-container-storage-operator/ClusterRequests.md + runbook_url: 'https://github.com/openshift/runbooks/blob/master/alerts/openshift-container-storage-operator/ClusterRequests.md', }, }, ], @@ -77,6 +77,7 @@ The scope of this directory is to provide OCS specific Prometheus rule files usi ``` * Define constants like clusterRequestsAlertTime, storageType in the metrics/mixin/config.libsonnet file. + * Double check that there is a 'runbook' file that documents the alert properly. The 'runbook_url' annotation stores the link to it. * Add this file to **metrics/mixin/alerts/alerts.libsonnet** or **metrics/mixin/alerts/alerts-external.libsonnet** depending on the type(For internal or external cluster) * Test the alert/rule generation by using targets in metrics/mixin/Makefile. Eg: `make prometheus_alert_rules.yaml`. This is **optional** and can be used to isolate issues. 
diff --git a/metrics/mixin/alerts/blocklist.libsonnet b/metrics/mixin/alerts/blocklist.libsonnet index bf00d9a006..3e8572e5db 100644 --- a/metrics/mixin/alerts/blocklist.libsonnet +++ b/metrics/mixin/alerts/blocklist.libsonnet @@ -24,6 +24,7 @@ message: 'An RBD client might be blocked by Ceph on node {{ $labels.node_name }}.', description: 'An RBD client might be blocked by Ceph on node {{ $labels.node_name }}. This alert is triggered when the ocs_rbd_client_blocklisted metric reports a value of 1 for the node and there are pods in a CreateContainerError state on the node. This may cause the filesystem for the PVCs to be in a read-only state. Please check the pod description for more details.', severity_level: 'error', + runbook_url: 'https://github.com/openshift/runbooks/blob/master/alerts/openshift-container-storage-operator/ODFRBDClientBlocked.md', }, }, ], diff --git a/metrics/mixin/alerts/encryption.libsonnet b/metrics/mixin/alerts/encryption.libsonnet index 05f1a6f086..5ad205397a 100644 --- a/metrics/mixin/alerts/encryption.libsonnet +++ b/metrics/mixin/alerts/encryption.libsonnet @@ -18,6 +18,7 @@ description: 'Storage Cluster KMS Server is in un-connected state for more than %s. Please check KMS config.' % $._config.ocsStorageClusterKMSConnectionAlert, storage_type: $._config.cephStorageType, severity_level: 'error', + runbook_url: 'https://github.com/openshift/runbooks/blob/master/alerts/openshift-container-storage-operator/KMSServerConnectionAlert.md', }, }, ], diff --git a/metrics/mixin/alerts/mirroring.libsonnet b/metrics/mixin/alerts/mirroring.libsonnet index 194c352429..d37fcb4c18 100644 --- a/metrics/mixin/alerts/mirroring.libsonnet +++ b/metrics/mixin/alerts/mirroring.libsonnet @@ -18,6 +18,7 @@ description: 'Mirror daemon is in unhealthy status for more than %s. Mirroring on this cluster is not working as expected.' 
% $._config.odfMirrorDaemonStatusAlertTime, storage_type: $._config.cephStorageType, severity_level: 'error', + runbook_url: 'https://github.com/openshift/runbooks/blob/master/alerts/openshift-container-storage-operator/OdfMirrorDaemonStatus.md', }, }, { @@ -35,6 +36,7 @@ description: 'Mirroring image(s) (PV) in the pool {{ $labels.name }} are in Unknown state for more than %s. Mirroring might not work as expected.' % $._config.odfPoolMirroringImageHealthWarningAlertTime, storage_type: $._config.cephStorageType, severity_level: 'warning', + runbook_url: 'https://github.com/openshift/runbooks/blob/master/alerts/openshift-container-storage-operator/OdfPoolMirroringImageHealth.md', }, }, {