diff --git a/helm/prometheus-rules/templates/alerting-rules/alertmanager.rules.yml b/helm/prometheus-rules/templates/alerting-rules/alertmanager.rules.yml
index d17a9da63..e5c1d5e3d 100644
--- a/helm/prometheus-rules/templates/alerting-rules/alertmanager.rules.yml
+++ b/helm/prometheus-rules/templates/alerting-rules/alertmanager.rules.yml
@@ -15,8 +15,11 @@ spec:
       annotations:
         description: '{{`AlertManager {{ $labels.integration }} notifications are failing.`}}'
         opsrecipe: alert-manager-notifications-failing/
-      expr: rate(alertmanager_notifications_failed_total{integration!="opsgenie"}[5m]) > 0
-      for: 10m
+      # We use a 20m rate window because the Alertmanager config currently sets `group_interval: 15m`: a failed notification
+      # is only retried after 15m, so the failure counter stays flat in between and would look flat over a shorter window.
+      # We page after 3 successive failures, so we wait 3*15m = 45m before paging.
+      expr: rate(alertmanager_notifications_failed_total{integration!="opsgenie"}[20m]) > 0
+      for: 45m
       labels:
         area: empowerment
         severity: page
@@ -27,8 +30,9 @@ spec:
       annotations:
         description: '{{`AlertManager {{ $labels.integration }} notifications are failing.`}}'
         opsrecipe: alert-manager-notifications-failing/
-      expr: rate(alertmanager_notifications_failed_total{integration="opsgenie"}[5m]) > 0
-      for: 2m
+      # We notify after 2 successive failures for Opsgenie notifications, so we wait 2*15m = 30m before notifying.
+      expr: rate(alertmanager_notifications_failed_total{integration="opsgenie"}[20m]) > 0
+      for: 30m
       labels:
         area: empowerment
         severity: notify
diff --git a/test/conf/promtool_ignore b/test/conf/promtool_ignore
index 3112fea9d..34875327a 100644
--- a/test/conf/promtool_ignore
+++ b/test/conf/promtool_ignore
@@ -1,6 +1,5 @@
 templates/alerting-rules-test/capo.rules.test.yml
 templates/alerting-rules/alertmanager-dashboard.rules.yml
-templates/alerting-rules/alertmanager.rules.yml
 templates/alerting-rules/apiserver.management-cluster.rules.yml
 templates/alerting-rules/apiserver.workload-cluster.rules.yml
 templates/alerting-rules/argocd.rules.yml
diff --git a/test/tests/providers/global/alertmanager.rules.test.yml b/test/tests/providers/global/alertmanager.rules.test.yml
new file mode 100644
index 000000000..a2e03f3c6
--- /dev/null
+++ b/test/tests/providers/global/alertmanager.rules.test.yml
@@ -0,0 +1,62 @@
+---
+rule_files:
+  - alertmanager.rules.yml
+
+tests:
+  - interval: 1m
+    input_series:
+      # after 1h, slack notifications fail for 2h then recover (group_interval=15m) => the alert fires after 3 successive failures
+      - series: 'alertmanager_notifications_failed_total{integration="slack"}'
+        values: "0x60 1+0x15 2+0x15 3+0x15 4+0x15 5+0x15 6+0x15 7+0x15 8+0x15 8+0x120"
+      # after 1h, webhook notifications fail 2 times within 30m then recover => the alert must not fire
+      - series: 'alertmanager_notifications_failed_total{integration="webhook"}'
+        values: "0x60 1+0x15 2+0x15 2+0x15 2+0x15 2+0x15 2+0x15 2+0x15 2+0x15 2+0x120"
+    alert_rule_test:
+      - alertname: AlertmanagerPageNotificationsFailing
+        eval_time: 10m
+      - alertname: AlertmanagerPageNotificationsFailing
+        eval_time: 90m
+      - alertname: AlertmanagerPageNotificationsFailing
+        eval_time: 95m
+      - alertname: AlertmanagerPageNotificationsFailing
+        eval_time: 106m
+        exp_alerts:
+          - exp_labels:
+              area: empowerment
+              severity: page
+              team: atlas
+              topic: monitoring
+              integration: slack
+              cancel_if_outside_working_hours: "true"
+            exp_annotations:
+              description: "AlertManager slack notifications are failing."
+              opsrecipe: alert-manager-notifications-failing/
+      - alertname: AlertmanagerPageNotificationsFailing
+        eval_time: 240m
+  - interval: 1m
+    input_series:
+      # after 1h, opsgenie notifications fail for 45m then recover for 1h, and finally fail once more (group_interval=15m)
+      # => the alert fires only after the 2 successive failures, not for the single final failure
+      - series: 'alertmanager_notifications_failed_total{integration="opsgenie"}'
+        values: "0x60 1+0x15 2+0x15 2+0x15 2+0x60 3+0x15 3+0x60"
+    alert_rule_test:
+      - alertname: AlertmanagerNotifyNotificationsFailing
+        eval_time: 10m
+      - alertname: AlertmanagerNotifyNotificationsFailing
+        eval_time: 75m
+      - alertname: AlertmanagerNotifyNotificationsFailing
+        eval_time: 91m
+        exp_alerts:
+          - exp_labels:
+              area: empowerment
+              severity: notify
+              team: atlas
+              topic: monitoring
+              integration: opsgenie
+            exp_annotations:
+              description: "AlertManager opsgenie notifications are failing."
+              opsrecipe: alert-manager-notifications-failing/
+      - alertname: AlertmanagerNotifyNotificationsFailing
+        eval_time: 180m
+      - alertname: AlertmanagerNotifyNotificationsFailing
+        eval_time: 210m
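Note on the timing math in the rule comments: the new `[20m]` rate windows and the `for: 45m` / `for: 30m` durations assume that the Alertmanager route delivering these notifications uses `group_interval: 15m`. That Alertmanager config is not part of this diff; the sketch below only illustrates the assumption, and the receiver name, `group_wait` and `repeat_interval` values are placeholders.

```yaml
# Illustrative Alertmanager route, NOT part of this change.
# The only value the alert timings above rely on is group_interval: 15m.
route:
  receiver: opsgenie        # placeholder receiver name
  group_wait: 30s           # placeholder value
  group_interval: 15m       # a failed notification is retried every 15m, so
                            # alertmanager_notifications_failed_total can only
                            # grow once per 15m while a receiver is broken
  repeat_interval: 4h       # placeholder value
```

With that retry cadence, `rate(alertmanager_notifications_failed_total[20m]) > 0` stays true continuously while failures are ongoing (every 20m window contains at least one counter increment), so `for: 45m` and `for: 30m` correspond to roughly 3 and 2 consecutive failed deliveries.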
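For readers unfamiliar with promtool's unit-test syntax, the `values` strings in the new test use the standard expanding notation. A small annotated excerpt of the slack series (same data as in the test, shown here only to document the notation):

```yaml
# Each sample is 1m apart (the test's `interval: 1m`).
#   "0x60"    -> 0 repeated for the first hour: no failed notifications yet
#   "1+0x15"  -> counter jumps to 1, then stays flat for ~15m:
#                one failed delivery, no retry until the next group_interval
#   "8+0x120" -> counter stays at 8 for 2h: deliveries are succeeding again
- series: 'alertmanager_notifications_failed_total{integration="slack"}'
  values: "0x60 1+0x15 2+0x15 3+0x15 4+0x15 5+0x15 6+0x15 7+0x15 8+0x15 8+0x120"
```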