From d544bf6b67b415c8f149fbe295811e1c328f1f08 Mon Sep 17 00:00:00 2001 From: Krishna Teja Puttagunta Date: Thu, 15 Jun 2023 16:24:48 -0700 Subject: [PATCH] AlertManager: include reason label in cortex_alertmanager_notifications_failed_total (#5409) Signed-off-by: Krishna Teja Puttagunta --- CHANGELOG.md | 1 + pkg/alertmanager/alertmanager_metrics.go | 4 +- pkg/alertmanager/alertmanager_metrics_test.go | 135 +++++++++--------- 3 files changed, 71 insertions(+), 69 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index fd10e20086..8d54edfc59 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,6 +1,7 @@ # Changelog ## master / unreleased +* [CHANGE] AlertManager: include reason label in cortex_alertmanager_notifications_failed_total. #5409 * [CHANGE] Query: Set CORS Origin headers for Query API #5388 * [CHANGE] Updating prometheus/alertmanager from v0.25.0 to v0.25.1-0.20230505130626-263ca5c9438e. This includes the below changes. #5276 - Validating new fields on the Webhook AM config, PushOver AM Config and Telegram AM Config. 
diff --git a/pkg/alertmanager/alertmanager_metrics.go b/pkg/alertmanager/alertmanager_metrics.go index 77432c15df..3e28d4574b 100644 --- a/pkg/alertmanager/alertmanager_metrics.go +++ b/pkg/alertmanager/alertmanager_metrics.go @@ -84,7 +84,7 @@ func newAlertmanagerMetrics() *alertmanagerMetrics { numFailedNotifications: prometheus.NewDesc( "cortex_alertmanager_notifications_failed_total", "The total number of failed notifications.", - []string{"user", "integration"}, nil), + []string{"user", "integration", "reason"}, nil), numNotificationRequestsTotal: prometheus.NewDesc( "cortex_alertmanager_notification_requests_total", "The total number of attempted notification requests.", @@ -292,7 +292,7 @@ func (m *alertmanagerMetrics) Collect(out chan<- prometheus.Metric) { data.SendSumOfCountersPerUser(out, m.alertsInvalid, "alertmanager_alerts_invalid_total") data.SendSumOfCountersPerUserWithLabels(out, m.numNotifications, "alertmanager_notifications_total", "integration") - data.SendSumOfCountersPerUserWithLabels(out, m.numFailedNotifications, "alertmanager_notifications_failed_total", "integration") + data.SendSumOfCountersPerUserWithLabels(out, m.numFailedNotifications, "alertmanager_notifications_failed_total", "integration", "reason") data.SendSumOfCountersPerUserWithLabels(out, m.numNotificationRequestsTotal, "alertmanager_notification_requests_total", "integration") data.SendSumOfCountersPerUserWithLabels(out, m.numNotificationRequestsFailedTotal, "alertmanager_notification_requests_failed_total", "integration") data.SendSumOfHistograms(out, m.notificationLatencySeconds, "alertmanager_notification_latency_seconds") diff --git a/pkg/alertmanager/alertmanager_metrics_test.go b/pkg/alertmanager/alertmanager_metrics_test.go index 35b979e6c9..c3ac8f4be9 100644 --- a/pkg/alertmanager/alertmanager_metrics_test.go +++ b/pkg/alertmanager/alertmanager_metrics_test.go @@ -21,6 +21,7 @@ var integrations = []string{ "webhook", "victorops", } +var reason = "clientError" func 
TestAlertmanagerMetricsStore(t *testing.T) { mainReg := prometheus.NewPedanticRegistry() @@ -107,30 +108,30 @@ func TestAlertmanagerMetricsStore(t *testing.T) { cortex_alertmanager_notification_latency_seconds_count 24 # HELP cortex_alertmanager_notifications_failed_total The total number of failed notifications. # TYPE cortex_alertmanager_notifications_failed_total counter - cortex_alertmanager_notifications_failed_total{integration="email",user="user1"} 0 - cortex_alertmanager_notifications_failed_total{integration="email",user="user2"} 0 - cortex_alertmanager_notifications_failed_total{integration="email",user="user3"} 0 - cortex_alertmanager_notifications_failed_total{integration="opsgenie",user="user1"} 5 - cortex_alertmanager_notifications_failed_total{integration="opsgenie",user="user2"} 50 - cortex_alertmanager_notifications_failed_total{integration="opsgenie",user="user3"} 500 - cortex_alertmanager_notifications_failed_total{integration="pagerduty",user="user1"} 1 - cortex_alertmanager_notifications_failed_total{integration="pagerduty",user="user2"} 10 - cortex_alertmanager_notifications_failed_total{integration="pagerduty",user="user3"} 100 - cortex_alertmanager_notifications_failed_total{integration="pushover",user="user1"} 3 - cortex_alertmanager_notifications_failed_total{integration="pushover",user="user2"} 30 - cortex_alertmanager_notifications_failed_total{integration="pushover",user="user3"} 300 - cortex_alertmanager_notifications_failed_total{integration="slack",user="user1"} 4 - cortex_alertmanager_notifications_failed_total{integration="slack",user="user2"} 40 - cortex_alertmanager_notifications_failed_total{integration="slack",user="user3"} 400 - cortex_alertmanager_notifications_failed_total{integration="victorops",user="user1"} 7 - cortex_alertmanager_notifications_failed_total{integration="victorops",user="user2"} 70 - cortex_alertmanager_notifications_failed_total{integration="victorops",user="user3"} 700 - 
cortex_alertmanager_notifications_failed_total{integration="webhook",user="user1"} 6 - cortex_alertmanager_notifications_failed_total{integration="webhook",user="user2"} 60 - cortex_alertmanager_notifications_failed_total{integration="webhook",user="user3"} 600 - cortex_alertmanager_notifications_failed_total{integration="wechat",user="user1"} 2 - cortex_alertmanager_notifications_failed_total{integration="wechat",user="user2"} 20 - cortex_alertmanager_notifications_failed_total{integration="wechat",user="user3"} 200 + cortex_alertmanager_notifications_failed_total{integration="email",reason="clientError",user="user1"} 0 + cortex_alertmanager_notifications_failed_total{integration="email",reason="clientError",user="user2"} 0 + cortex_alertmanager_notifications_failed_total{integration="email",reason="clientError",user="user3"} 0 + cortex_alertmanager_notifications_failed_total{integration="opsgenie",reason="clientError",user="user1"} 5 + cortex_alertmanager_notifications_failed_total{integration="opsgenie",reason="clientError",user="user2"} 50 + cortex_alertmanager_notifications_failed_total{integration="opsgenie",reason="clientError",user="user3"} 500 + cortex_alertmanager_notifications_failed_total{integration="pagerduty",reason="clientError",user="user1"} 1 + cortex_alertmanager_notifications_failed_total{integration="pagerduty",reason="clientError",user="user2"} 10 + cortex_alertmanager_notifications_failed_total{integration="pagerduty",reason="clientError",user="user3"} 100 + cortex_alertmanager_notifications_failed_total{integration="pushover",reason="clientError",user="user1"} 3 + cortex_alertmanager_notifications_failed_total{integration="pushover",reason="clientError",user="user2"} 30 + cortex_alertmanager_notifications_failed_total{integration="pushover",reason="clientError",user="user3"} 300 + cortex_alertmanager_notifications_failed_total{integration="slack",reason="clientError",user="user1"} 4 + 
cortex_alertmanager_notifications_failed_total{integration="slack",reason="clientError",user="user2"} 40 + cortex_alertmanager_notifications_failed_total{integration="slack",reason="clientError",user="user3"} 400 + cortex_alertmanager_notifications_failed_total{integration="victorops",reason="clientError",user="user1"} 7 + cortex_alertmanager_notifications_failed_total{integration="victorops",reason="clientError",user="user2"} 70 + cortex_alertmanager_notifications_failed_total{integration="victorops",reason="clientError",user="user3"} 700 + cortex_alertmanager_notifications_failed_total{integration="webhook",reason="clientError",user="user1"} 6 + cortex_alertmanager_notifications_failed_total{integration="webhook",reason="clientError",user="user2"} 60 + cortex_alertmanager_notifications_failed_total{integration="webhook",reason="clientError",user="user3"} 600 + cortex_alertmanager_notifications_failed_total{integration="wechat",reason="clientError",user="user1"} 2 + cortex_alertmanager_notifications_failed_total{integration="wechat",reason="clientError",user="user2"} 20 + cortex_alertmanager_notifications_failed_total{integration="wechat",reason="clientError",user="user3"} 200 # HELP cortex_alertmanager_notification_requests_total The total number of attempted notification requests. # TYPE cortex_alertmanager_notification_requests_total counter cortex_alertmanager_notification_requests_total{integration="email",user="user1"} 0 @@ -453,30 +454,30 @@ func TestAlertmanagerMetricsRemoval(t *testing.T) { # HELP cortex_alertmanager_notifications_failed_total The total number of failed notifications. 
# TYPE cortex_alertmanager_notifications_failed_total counter - cortex_alertmanager_notifications_failed_total{integration="email",user="user1"} 0 - cortex_alertmanager_notifications_failed_total{integration="email",user="user2"} 0 - cortex_alertmanager_notifications_failed_total{integration="email",user="user3"} 0 - cortex_alertmanager_notifications_failed_total{integration="opsgenie",user="user1"} 5 - cortex_alertmanager_notifications_failed_total{integration="opsgenie",user="user2"} 50 - cortex_alertmanager_notifications_failed_total{integration="opsgenie",user="user3"} 500 - cortex_alertmanager_notifications_failed_total{integration="pagerduty",user="user1"} 1 - cortex_alertmanager_notifications_failed_total{integration="pagerduty",user="user2"} 10 - cortex_alertmanager_notifications_failed_total{integration="pagerduty",user="user3"} 100 - cortex_alertmanager_notifications_failed_total{integration="pushover",user="user1"} 3 - cortex_alertmanager_notifications_failed_total{integration="pushover",user="user2"} 30 - cortex_alertmanager_notifications_failed_total{integration="pushover",user="user3"} 300 - cortex_alertmanager_notifications_failed_total{integration="slack",user="user1"} 4 - cortex_alertmanager_notifications_failed_total{integration="slack",user="user2"} 40 - cortex_alertmanager_notifications_failed_total{integration="slack",user="user3"} 400 - cortex_alertmanager_notifications_failed_total{integration="victorops",user="user1"} 7 - cortex_alertmanager_notifications_failed_total{integration="victorops",user="user2"} 70 - cortex_alertmanager_notifications_failed_total{integration="victorops",user="user3"} 700 - cortex_alertmanager_notifications_failed_total{integration="webhook",user="user1"} 6 - cortex_alertmanager_notifications_failed_total{integration="webhook",user="user2"} 60 - cortex_alertmanager_notifications_failed_total{integration="webhook",user="user3"} 600 - cortex_alertmanager_notifications_failed_total{integration="wechat",user="user1"} 2 
- cortex_alertmanager_notifications_failed_total{integration="wechat",user="user2"} 20 - cortex_alertmanager_notifications_failed_total{integration="wechat",user="user3"} 200 + cortex_alertmanager_notifications_failed_total{integration="email",reason="clientError",user="user1"} 0 + cortex_alertmanager_notifications_failed_total{integration="email",reason="clientError",user="user2"} 0 + cortex_alertmanager_notifications_failed_total{integration="email",reason="clientError",user="user3"} 0 + cortex_alertmanager_notifications_failed_total{integration="opsgenie",reason="clientError",user="user1"} 5 + cortex_alertmanager_notifications_failed_total{integration="opsgenie",reason="clientError",user="user2"} 50 + cortex_alertmanager_notifications_failed_total{integration="opsgenie",reason="clientError",user="user3"} 500 + cortex_alertmanager_notifications_failed_total{integration="pagerduty",reason="clientError",user="user1"} 1 + cortex_alertmanager_notifications_failed_total{integration="pagerduty",reason="clientError",user="user2"} 10 + cortex_alertmanager_notifications_failed_total{integration="pagerduty",reason="clientError",user="user3"} 100 + cortex_alertmanager_notifications_failed_total{integration="pushover",reason="clientError",user="user1"} 3 + cortex_alertmanager_notifications_failed_total{integration="pushover",reason="clientError",user="user2"} 30 + cortex_alertmanager_notifications_failed_total{integration="pushover",reason="clientError",user="user3"} 300 + cortex_alertmanager_notifications_failed_total{integration="slack",reason="clientError",user="user1"} 4 + cortex_alertmanager_notifications_failed_total{integration="slack",reason="clientError",user="user2"} 40 + cortex_alertmanager_notifications_failed_total{integration="slack",reason="clientError",user="user3"} 400 + cortex_alertmanager_notifications_failed_total{integration="victorops",reason="clientError",user="user1"} 7 + 
cortex_alertmanager_notifications_failed_total{integration="victorops",reason="clientError",user="user2"} 70 + cortex_alertmanager_notifications_failed_total{integration="victorops",reason="clientError",user="user3"} 700 + cortex_alertmanager_notifications_failed_total{integration="webhook",reason="clientError",user="user1"} 6 + cortex_alertmanager_notifications_failed_total{integration="webhook",reason="clientError",user="user2"} 60 + cortex_alertmanager_notifications_failed_total{integration="webhook",reason="clientError",user="user3"} 600 + cortex_alertmanager_notifications_failed_total{integration="wechat",reason="clientError",user="user1"} 2 + cortex_alertmanager_notifications_failed_total{integration="wechat",reason="clientError",user="user2"} 20 + cortex_alertmanager_notifications_failed_total{integration="wechat",reason="clientError",user="user3"} 200 # HELP cortex_alertmanager_notifications_total The total number of attempted notifications. # TYPE cortex_alertmanager_notifications_total counter @@ -721,22 +722,22 @@ func TestAlertmanagerMetricsRemoval(t *testing.T) { # HELP cortex_alertmanager_notifications_failed_total The total number of failed notifications. 
# TYPE cortex_alertmanager_notifications_failed_total counter - cortex_alertmanager_notifications_failed_total{integration="email",user="user1"} 0 - cortex_alertmanager_notifications_failed_total{integration="email",user="user2"} 0 - cortex_alertmanager_notifications_failed_total{integration="opsgenie",user="user1"} 5 - cortex_alertmanager_notifications_failed_total{integration="opsgenie",user="user2"} 50 - cortex_alertmanager_notifications_failed_total{integration="pagerduty",user="user1"} 1 - cortex_alertmanager_notifications_failed_total{integration="pagerduty",user="user2"} 10 - cortex_alertmanager_notifications_failed_total{integration="pushover",user="user1"} 3 - cortex_alertmanager_notifications_failed_total{integration="pushover",user="user2"} 30 - cortex_alertmanager_notifications_failed_total{integration="slack",user="user1"} 4 - cortex_alertmanager_notifications_failed_total{integration="slack",user="user2"} 40 - cortex_alertmanager_notifications_failed_total{integration="victorops",user="user1"} 7 - cortex_alertmanager_notifications_failed_total{integration="victorops",user="user2"} 70 - cortex_alertmanager_notifications_failed_total{integration="webhook",user="user1"} 6 - cortex_alertmanager_notifications_failed_total{integration="webhook",user="user2"} 60 - cortex_alertmanager_notifications_failed_total{integration="wechat",user="user1"} 2 - cortex_alertmanager_notifications_failed_total{integration="wechat",user="user2"} 20 + cortex_alertmanager_notifications_failed_total{integration="email",reason="clientError",user="user1"} 0 + cortex_alertmanager_notifications_failed_total{integration="email",reason="clientError",user="user2"} 0 + cortex_alertmanager_notifications_failed_total{integration="opsgenie",reason="clientError",user="user1"} 5 + cortex_alertmanager_notifications_failed_total{integration="opsgenie",reason="clientError",user="user2"} 50 + 
cortex_alertmanager_notifications_failed_total{integration="pagerduty",reason="clientError",user="user1"} 1 + cortex_alertmanager_notifications_failed_total{integration="pagerduty",reason="clientError",user="user2"} 10 + cortex_alertmanager_notifications_failed_total{integration="pushover",reason="clientError",user="user1"} 3 + cortex_alertmanager_notifications_failed_total{integration="pushover",reason="clientError",user="user2"} 30 + cortex_alertmanager_notifications_failed_total{integration="slack",reason="clientError",user="user1"} 4 + cortex_alertmanager_notifications_failed_total{integration="slack",reason="clientError",user="user2"} 40 + cortex_alertmanager_notifications_failed_total{integration="victorops",reason="clientError",user="user1"} 7 + cortex_alertmanager_notifications_failed_total{integration="victorops",reason="clientError",user="user2"} 70 + cortex_alertmanager_notifications_failed_total{integration="webhook",reason="clientError",user="user1"} 6 + cortex_alertmanager_notifications_failed_total{integration="webhook",reason="clientError",user="user2"} 60 + cortex_alertmanager_notifications_failed_total{integration="wechat",reason="clientError",user="user1"} 2 + cortex_alertmanager_notifications_failed_total{integration="wechat",reason="clientError",user="user2"} 20 # HELP cortex_alertmanager_notifications_total The total number of attempted notifications. 
# TYPE cortex_alertmanager_notifications_total counter @@ -872,7 +873,7 @@ func populateAlertmanager(base float64) *prometheus.Registry { nm := newNotifyMetrics(reg) for i, integration := range integrations { nm.numNotifications.WithLabelValues(integration).Add(base * float64(i)) - nm.numFailedNotifications.WithLabelValues(integration).Add(base * float64(i)) + nm.numFailedNotifications.WithLabelValues(integration, reason).Add(base * float64(i)) nm.numNotificationRequestsTotal.WithLabelValues(integration).Add(base * float64(i)) nm.numNotificationRequestsFailedTotal.WithLabelValues(integration).Add(base * float64(i)) nm.notificationLatencySeconds.WithLabelValues(integration).Observe(base * float64(i) * 0.025) @@ -1034,7 +1035,7 @@ func newNotifyMetrics(r prometheus.Registerer) *notifyMetrics { Namespace: "alertmanager", Name: "notifications_failed_total", Help: "The total number of failed notifications.", - }, []string{"integration"}), + }, []string{"integration", "reason"}), numNotificationRequestsTotal: promauto.With(r).NewCounterVec(prometheus.CounterOpts{ Namespace: "alertmanager", Name: "notification_requests_total", @@ -1054,7 +1055,7 @@ func newNotifyMetrics(r prometheus.Registerer) *notifyMetrics { } for _, integration := range integrations { m.numNotifications.WithLabelValues(integration) - m.numFailedNotifications.WithLabelValues(integration) + m.numFailedNotifications.WithLabelValues(integration, reason) m.numNotificationRequestsTotal.WithLabelValues(integration) m.numNotificationRequestsFailedTotal.WithLabelValues(integration) m.notificationLatencySeconds.WithLabelValues(integration)