Update disk alerts to not trigger when node-problem-detector is already remediating the issue. (#1203)
giantswarm · May 30, 2024 · a943a83 · a943a83
1 parent bddf46d
commit a943a83
Show file tree

Hide file tree

Showing 3 changed files with 34 additions and 14 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -7,6 +7,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## [Unreleased]
 
+### Changed
+
+- Updated `ContainerdVolumeSpaceTooLow`, `KubeletVolumeSpaceTooLow` and `LogVolumeSpaceTooLow` alerts to not trigger when the node-problem-detector is already remediating the issue.
+
 ## [4.1.1] - 2024-05-30
 
 ### Changed

diff --git a/helm/prometheus-rules/templates/shared/alerting-rules/disk.management-cluster.rules.yml b/helm/prometheus-rules/templates/shared/alerting-rules/disk.management-cluster.rules.yml
@@ -27,9 +27,11 @@ spec:
         topic: storage
     - alert: ContainerdVolumeSpaceTooLow
       annotations:
-        description: '{{`Containerd volume {{ $labels.mountpoint}} on {{ $labels.instance }} does not have enough free space.`}}'
+        description: '{{`Containerd volume /var/lib/containerd on {{ $labels.node }} does not have enough free space.`}}'
         opsrecipe: low-disk-space/#containerd-volume
-      expr: 100 * node_filesystem_free_bytes{cluster_type="management_cluster", mountpoint=~"(/rootfs)?/var/lib/containerd"} / node_filesystem_size_bytes{cluster_type="management_cluster", mountpoint=~"(/rootfs)?/var/lib/containerd"} < 10
+      # See below comment for the KubeletVolumeSpaceTooLow alert regarding the node-problem-detector.
+      # We are also alerted if the free space is less than 10% for 10 minutes.
+      expr: (( 100 * (node_filesystem_free_bytes{cluster_type="management_cluster", mountpoint=~"(/rootfs)?/var/lib/containerd"} +1) / node_filesystem_size_bytes{cluster_type="management_cluster", mountpoint=~"(/rootfs)?/var/lib/containerd"} < 10) * on (node, cluster_type, cluster_id, installation, organization, pipeline, region, customer) (1 - problem_gauge{reason="ContainerdDiskIsFull"}) or sum ((100 * node_filesystem_free_bytes{cluster_type="management_cluster", mountpoint=~"(/rootfs)?/var/lib/containerd"} +1)/ node_filesystem_size_bytes{cluster_type="management_cluster", mountpoint=~"(/rootfs)?/var/lib/containerd"} < 10) by (node, cluster_type, cluster_id, installation, organization, pipeline, region, customer)) > 0
       for: 10m
       labels:
         area: kaas
@@ -51,9 +53,14 @@ spec:
         topic: storage
     - alert: KubeletVolumeSpaceTooLow
       annotations:
-        description: '{{`Kubelet volume {{ $labels.mountpoint}} on {{ $labels.instance }} does not have enough free space.`}}'
-        opsrecipe: low-disk-space/#root-volume
-      expr: node_filesystem_free_bytes{cluster_type="management_cluster", mountpoint=~"(/rootfs)?/var/lib/kubelet"} < (2 * 1024 * 1024 * 1024)
+        description: '{{`Kubelet volume /var/lib/kubelet on {{ $labels.node }} does not have enough free space.`}}'
+        opsrecipe: low-disk-space/#kubelet-volume
+      # In clusters where the node-problem-detector-app (https://github.com/giantswarm/node-problem-detector-app/) is installed, we don't want to get alerted if the node-problem-detector is already remediating the issue.
+      # When this happens, the problem_gauge metric has value 1, so we multiply the disk series by (1 - problem_gauge), joined on the listed labels, to get 0 when the metric is present and active, and then keep only the series whose value is > 0.
+      # The right hand side of the or is necessary because we need to be alerted in clusters without the node-problem-detector.
+      # Note that we add 1 to the disk free space so we still get alerted when the free bytes are 0.
+      # We are also alerted if the free space is less than 2GB for 10 minutes.
+      expr: (( node_filesystem_free_bytes{cluster_type="management_cluster",mountpoint=~"(/rootfs)?/var/lib/kubelet"} +1 < (2 * 1024 * 1024 * 1024)) * on (node, cluster_type, cluster_id, installation, organization, pipeline, region, customer) (1 - problem_gauge{reason="KubeletDiskIsFull"}) or sum (node_filesystem_free_bytes{cluster_type="management_cluster",mountpoint=~"(/rootfs)?/var/lib/kubelet"} +1 < (2 * 1024 * 1024 * 1024)) by (node, cluster_type, cluster_id, installation, organization, pipeline, region, customer)) > 0
       for: 10m
       labels:
         area: kaas
@@ -63,9 +70,11 @@ spec:
         topic: storage
     - alert: LogVolumeSpaceTooLow
       annotations:
-        description: '{{`Log volume {{ $labels.mountpoint}} on {{ $labels.instance }} does not have enough free space.`}}'
-        opsrecipe: low-disk-space/#root-volume
-      expr: 100 * node_filesystem_free_bytes{cluster_type="management_cluster", mountpoint=~"(/rootfs)?/var/log"} / node_filesystem_size_bytes{cluster_type="management_cluster", mountpoint=~"(/rootfs)?/var/log"} < 20
+        description: '{{`Log volume /var/log on {{ $labels.node }} does not have enough free space.`}}'
+        opsrecipe: low-disk-space/#log-volume
+      # See above comment for the KubeletVolumeSpaceTooLow alert regarding the node-problem-detector.
+      # We are also alerted if the free space is less than 10% for 30 minutes.
+      expr: (( 100 * (node_filesystem_free_bytes{cluster_type="management_cluster", mountpoint=~"(/rootfs)?/var/log"} +1) / node_filesystem_size_bytes{cluster_type="management_cluster", mountpoint=~"(/rootfs)?/var/log"} < 10) * on (node, cluster_type, cluster_id, installation, organization, pipeline, region, customer) (1 - problem_gauge{reason="VarLogDiskIsFull"}) or sum ((100 * node_filesystem_free_bytes{cluster_type="management_cluster", mountpoint=~"(/rootfs)?/var/log"} +1)/ node_filesystem_size_bytes{cluster_type="management_cluster", mountpoint=~"(/rootfs)?/var/log"} < 10) by (node, cluster_type, cluster_id, installation, organization, pipeline, region, customer)) > 0
       for: 30m
       labels:
         area: kaas

diff --git a/helm/prometheus-rules/templates/shared/alerting-rules/disk.workload-cluster.rules.yml b/helm/prometheus-rules/templates/shared/alerting-rules/disk.workload-cluster.rules.yml
@@ -38,9 +38,14 @@ spec:
         topic: storage
     - alert: KubeletVolumeSpaceTooLow
       annotations:
-        description: '{{`Kubelet volume {{ $labels.mountpoint}} on {{ $labels.instance }} does not have enough free space.`}}'
-        opsrecipe: low-disk-space/#root-volume
-      expr: node_filesystem_free_bytes{cluster_type="workload_cluster", mountpoint=~"(/rootfs)?/var/lib/kubelet"} < (2 * 1024 * 1024 * 1024)
+        description: '{{`Kubelet volume /var/lib/kubelet on {{ $labels.node }} does not have enough free space.`}}'
+        opsrecipe: low-disk-space/#kubelet-volume
+      # In clusters where the node-problem-detector-app (https://github.com/giantswarm/node-problem-detector-app/) is installed, we don't want to get alerted if the node-problem-detector is already remediating the issue.
+      # When this happens, the problem_gauge metric has value 1, so we multiply the disk series by (1 - problem_gauge), joined on the listed labels, to get 0 when the metric is present and active, and then keep only the series whose value is > 0.
+      # The right hand side of the or is necessary because we need to be alerted in clusters without the node-problem-detector.
+      # Note that we add 1 to the disk free space so we still get alerted when the free bytes are 0.
+      # We are also alerted if the free space is less than 2GB for 30 minutes.
+      expr: (( node_filesystem_free_bytes{cluster_type="workload_cluster",mountpoint=~"(/rootfs)?/var/lib/kubelet"} +1 < (2 * 1024 * 1024 * 1024)) * on (node, cluster_type, cluster_id, installation, organization, pipeline, region, customer) (1 - problem_gauge{reason="KubeletDiskIsFull"}) or sum (node_filesystem_free_bytes{cluster_type="workload_cluster",mountpoint=~"(/rootfs)?/var/lib/kubelet"} +1 < (2 * 1024 * 1024 * 1024)) by (node, cluster_type, cluster_id, installation, organization, pipeline, region, customer)) > 0
       for: 30m
       labels:
         area: kaas
@@ -49,9 +54,11 @@ spec:
         topic: storage
     - alert: LogVolumeSpaceTooLow
       annotations:
-        description: '{{`Log volume {{ $labels.mountpoint}} on {{ $labels.instance }} does not have enough free space.`}}'
-        opsrecipe: low-disk-space/#root-volume
-      expr: 100 * node_filesystem_free_bytes{cluster_type="workload_cluster", mountpoint=~"(/rootfs)?/var/log"} / node_filesystem_size_bytes{cluster_type="workload_cluster", mountpoint=~"(/rootfs)?/var/log"} < 20
+        description: '{{`Log volume /var/log on {{ $labels.node }} does not have enough free space.`}}'
+        opsrecipe: low-disk-space/#log-volume
+      # See above comment for the KubeletVolumeSpaceTooLow alert regarding the node-problem-detector.
+      # We are also alerted if the free space is less than 10% for 30 minutes.
+      expr: (( 100 * (node_filesystem_free_bytes{cluster_type="workload_cluster", mountpoint=~"(/rootfs)?/var/log"} +1) / node_filesystem_size_bytes{cluster_type="workload_cluster", mountpoint=~"(/rootfs)?/var/log"} < 10) * on (node, cluster_type, cluster_id, installation, organization, pipeline, region, customer) (1 - problem_gauge{reason="VarLogDiskIsFull"}) or sum ((100 * node_filesystem_free_bytes{cluster_type="workload_cluster", mountpoint=~"(/rootfs)?/var/log"} +1)/ node_filesystem_size_bytes{cluster_type="workload_cluster", mountpoint=~"(/rootfs)?/var/log"} < 10) by (node, cluster_type, cluster_id, installation, organization, pipeline, region, customer)) > 0
       for: 30m
       labels:
         area: kaas