From cfea8c4f4debe5807e92d5ee5db647c5d4f40cc8 Mon Sep 17 00:00:00 2001
From: Dong Ma
Date: Fri, 29 Nov 2024 13:40:29 +0800
Subject: [PATCH] [ATMOSPHERE-367] Add the NodeTimeSkewDetected alert (#2151)

Depends-On: #2150
Reviewed-by: Mohammed Naser
---
Notes:
    Adds the NodeTimeSkewDetected rule to the vendored node-mixin alerts. It
    fires once abs(timestamp(node_time_seconds) - node_time_seconds) > 1 has
    held for 5m. Two rule tests are added: one where only instance3 (offset 2)
    is expected to alert -- instance2 (offset 1) is not, because the comparison
    is strictly greater than 1 -- and one where no series is offset and no
    alert is expected.

    NOTE(review): the rule sets severity 'warning' while the test expects
    severity P3; presumably a severity mapping is applied outside this patch
    -- confirm.

 .../files/jsonnet/tests.yml                   | 33 +++++++++++++++++++
 .../docs/node-mixin/alerts/alerts.libsonnet   | 14 ++++++++
 2 files changed, 47 insertions(+)

diff --git a/roles/kube_prometheus_stack/files/jsonnet/tests.yml b/roles/kube_prometheus_stack/files/jsonnet/tests.yml
index 9ccba908e..4775bb2b8 100644
--- a/roles/kube_prometheus_stack/files/jsonnet/tests.yml
+++ b/roles/kube_prometheus_stack/files/jsonnet/tests.yml
@@ -72,3 +72,36 @@ tests:
             exp_annotations:
               summary: "Nova service group down"
               description: "All instances of a specific Nova service have been down for more than 5 minutes."
+
+  - interval: 1m
+    input_series:
+      - series: 'node_time_seconds{instance="instance1", job="node"}'
+        values: '0 60 120 180 240 300'
+      - series: 'node_time_seconds{instance="instance2", job="node"}'
+        values: '1 61 121 181 241 301'
+      - series: 'node_time_seconds{instance="instance3", job="node"}'
+        values: '2 62 122 182 242 302'
+    alert_rule_test:
+      - eval_time: 5m
+        alertname: NodeTimeSkewDetected
+        exp_alerts:
+          - exp_labels:
+              severity: P3
+              instance: instance3
+              job: node
+            exp_annotations:
+              summary: "Node instance3 has a time difference."
+              description: "Node instance3 has a time difference 2."
+
+  - interval: 1m
+    input_series:
+      - series: 'node_time_seconds{instance="instance1", job="node"}'
+        values: '0 60 120 180 240 300'
+      - series: 'node_time_seconds{instance="instance2", job="node"}'
+        values: '0 60 120 180 240 300'
+      - series: 'node_time_seconds{instance="instance3", job="node"}'
+        values: '0 60 120 180 240 300'
+    alert_rule_test:
+      - eval_time: 5m
+        alertname: NodeTimeSkewDetected
+        exp_alerts: []
diff --git a/roles/kube_prometheus_stack/files/jsonnet/vendor/github.com/prometheus/node_exporter/docs/node-mixin/alerts/alerts.libsonnet b/roles/kube_prometheus_stack/files/jsonnet/vendor/github.com/prometheus/node_exporter/docs/node-mixin/alerts/alerts.libsonnet
index 1eaedd3d2..7712033bf 100644
--- a/roles/kube_prometheus_stack/files/jsonnet/vendor/github.com/prometheus/node_exporter/docs/node-mixin/alerts/alerts.libsonnet
+++ b/roles/kube_prometheus_stack/files/jsonnet/vendor/github.com/prometheus/node_exporter/docs/node-mixin/alerts/alerts.libsonnet
@@ -407,6 +407,20 @@
               description: 'Systemd service {{ $labels.name }} has entered failed state at {{ $labels.instance }}',
             },
           },
+          {
+            alert: 'NodeTimeSkewDetected',
+            expr: |||
+              abs(timestamp(node_time_seconds{%(nodeExporterSelector)s}) - node_time_seconds{%(nodeExporterSelector)s}) > 1
+            ||| % $._config,
+            'for': '5m',
+            labels: {
+              severity: 'warning',
+            },
+            annotations: {
+              summary: 'Node {{ $labels.instance }} has a time difference.',
+              description: 'Node {{ $labels.instance }} has a time difference {{ $value }}.',
+            },
+          },
         ],
       },
     ],