Skip to content

Commit

Permalink
AlertManager: include reason label in cortex_alertmanager_notificatio…
Browse files Browse the repository at this point in the history
…ns_failed_total (#5409)

Signed-off-by: Krishna Teja Puttagunta <[email protected]>
  • Loading branch information
krishnateja325 authored Jun 15, 2023
1 parent 47c1079 commit d544bf6
Show file tree
Hide file tree
Showing 3 changed files with 71 additions and 69 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
# Changelog

## master / unreleased
* [CHANGE] AlertManager: include reason label in cortex_alertmanager_notifications_failed_total. #5409
* [CHANGE] Query: Set CORS Origin headers for Query API #5388
* [CHANGE] Updating prometheus/alertmanager from v0.25.0 to v0.25.1-0.20230505130626-263ca5c9438e. This includes the below changes. #5276
- Validating new fields on the Webhook AM config, PushOver AM Config and Telegram AM Config.
Expand Down
4 changes: 2 additions & 2 deletions pkg/alertmanager/alertmanager_metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -84,7 +84,7 @@ func newAlertmanagerMetrics() *alertmanagerMetrics {
numFailedNotifications: prometheus.NewDesc(
"cortex_alertmanager_notifications_failed_total",
"The total number of failed notifications.",
[]string{"user", "integration"}, nil),
[]string{"user", "integration", "reason"}, nil),
numNotificationRequestsTotal: prometheus.NewDesc(
"cortex_alertmanager_notification_requests_total",
"The total number of attempted notification requests.",
Expand Down Expand Up @@ -292,7 +292,7 @@ func (m *alertmanagerMetrics) Collect(out chan<- prometheus.Metric) {
data.SendSumOfCountersPerUser(out, m.alertsInvalid, "alertmanager_alerts_invalid_total")

data.SendSumOfCountersPerUserWithLabels(out, m.numNotifications, "alertmanager_notifications_total", "integration")
data.SendSumOfCountersPerUserWithLabels(out, m.numFailedNotifications, "alertmanager_notifications_failed_total", "integration")
data.SendSumOfCountersPerUserWithLabels(out, m.numFailedNotifications, "alertmanager_notifications_failed_total", "integration", "reason")
data.SendSumOfCountersPerUserWithLabels(out, m.numNotificationRequestsTotal, "alertmanager_notification_requests_total", "integration")
data.SendSumOfCountersPerUserWithLabels(out, m.numNotificationRequestsFailedTotal, "alertmanager_notification_requests_failed_total", "integration")
data.SendSumOfHistograms(out, m.notificationLatencySeconds, "alertmanager_notification_latency_seconds")
Expand Down
135 changes: 68 additions & 67 deletions pkg/alertmanager/alertmanager_metrics_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ var integrations = []string{
"webhook",
"victorops",
}
var reason = "clientError"

func TestAlertmanagerMetricsStore(t *testing.T) {
mainReg := prometheus.NewPedanticRegistry()
Expand Down Expand Up @@ -107,30 +108,30 @@ func TestAlertmanagerMetricsStore(t *testing.T) {
cortex_alertmanager_notification_latency_seconds_count 24
# HELP cortex_alertmanager_notifications_failed_total The total number of failed notifications.
# TYPE cortex_alertmanager_notifications_failed_total counter
cortex_alertmanager_notifications_failed_total{integration="email",user="user1"} 0
cortex_alertmanager_notifications_failed_total{integration="email",user="user2"} 0
cortex_alertmanager_notifications_failed_total{integration="email",user="user3"} 0
cortex_alertmanager_notifications_failed_total{integration="opsgenie",user="user1"} 5
cortex_alertmanager_notifications_failed_total{integration="opsgenie",user="user2"} 50
cortex_alertmanager_notifications_failed_total{integration="opsgenie",user="user3"} 500
cortex_alertmanager_notifications_failed_total{integration="pagerduty",user="user1"} 1
cortex_alertmanager_notifications_failed_total{integration="pagerduty",user="user2"} 10
cortex_alertmanager_notifications_failed_total{integration="pagerduty",user="user3"} 100
cortex_alertmanager_notifications_failed_total{integration="pushover",user="user1"} 3
cortex_alertmanager_notifications_failed_total{integration="pushover",user="user2"} 30
cortex_alertmanager_notifications_failed_total{integration="pushover",user="user3"} 300
cortex_alertmanager_notifications_failed_total{integration="slack",user="user1"} 4
cortex_alertmanager_notifications_failed_total{integration="slack",user="user2"} 40
cortex_alertmanager_notifications_failed_total{integration="slack",user="user3"} 400
cortex_alertmanager_notifications_failed_total{integration="victorops",user="user1"} 7
cortex_alertmanager_notifications_failed_total{integration="victorops",user="user2"} 70
cortex_alertmanager_notifications_failed_total{integration="victorops",user="user3"} 700
cortex_alertmanager_notifications_failed_total{integration="webhook",user="user1"} 6
cortex_alertmanager_notifications_failed_total{integration="webhook",user="user2"} 60
cortex_alertmanager_notifications_failed_total{integration="webhook",user="user3"} 600
cortex_alertmanager_notifications_failed_total{integration="wechat",user="user1"} 2
cortex_alertmanager_notifications_failed_total{integration="wechat",user="user2"} 20
cortex_alertmanager_notifications_failed_total{integration="wechat",user="user3"} 200
cortex_alertmanager_notifications_failed_total{integration="email",reason="clientError",user="user1"} 0
cortex_alertmanager_notifications_failed_total{integration="email",reason="clientError",user="user2"} 0
cortex_alertmanager_notifications_failed_total{integration="email",reason="clientError",user="user3"} 0
cortex_alertmanager_notifications_failed_total{integration="opsgenie",reason="clientError",user="user1"} 5
cortex_alertmanager_notifications_failed_total{integration="opsgenie",reason="clientError",user="user2"} 50
cortex_alertmanager_notifications_failed_total{integration="opsgenie",reason="clientError",user="user3"} 500
cortex_alertmanager_notifications_failed_total{integration="pagerduty",reason="clientError",user="user1"} 1
cortex_alertmanager_notifications_failed_total{integration="pagerduty",reason="clientError",user="user2"} 10
cortex_alertmanager_notifications_failed_total{integration="pagerduty",reason="clientError",user="user3"} 100
cortex_alertmanager_notifications_failed_total{integration="pushover",reason="clientError",user="user1"} 3
cortex_alertmanager_notifications_failed_total{integration="pushover",reason="clientError",user="user2"} 30
cortex_alertmanager_notifications_failed_total{integration="pushover",reason="clientError",user="user3"} 300
cortex_alertmanager_notifications_failed_total{integration="slack",reason="clientError",user="user1"} 4
cortex_alertmanager_notifications_failed_total{integration="slack",reason="clientError",user="user2"} 40
cortex_alertmanager_notifications_failed_total{integration="slack",reason="clientError",user="user3"} 400
cortex_alertmanager_notifications_failed_total{integration="victorops",reason="clientError",user="user1"} 7
cortex_alertmanager_notifications_failed_total{integration="victorops",reason="clientError",user="user2"} 70
cortex_alertmanager_notifications_failed_total{integration="victorops",reason="clientError",user="user3"} 700
cortex_alertmanager_notifications_failed_total{integration="webhook",reason="clientError",user="user1"} 6
cortex_alertmanager_notifications_failed_total{integration="webhook",reason="clientError",user="user2"} 60
cortex_alertmanager_notifications_failed_total{integration="webhook",reason="clientError",user="user3"} 600
cortex_alertmanager_notifications_failed_total{integration="wechat",reason="clientError",user="user1"} 2
cortex_alertmanager_notifications_failed_total{integration="wechat",reason="clientError",user="user2"} 20
cortex_alertmanager_notifications_failed_total{integration="wechat",reason="clientError",user="user3"} 200
# HELP cortex_alertmanager_notification_requests_total The total number of attempted notification requests.
# TYPE cortex_alertmanager_notification_requests_total counter
cortex_alertmanager_notification_requests_total{integration="email",user="user1"} 0
Expand Down Expand Up @@ -453,30 +454,30 @@ func TestAlertmanagerMetricsRemoval(t *testing.T) {
# HELP cortex_alertmanager_notifications_failed_total The total number of failed notifications.
# TYPE cortex_alertmanager_notifications_failed_total counter
cortex_alertmanager_notifications_failed_total{integration="email",user="user1"} 0
cortex_alertmanager_notifications_failed_total{integration="email",user="user2"} 0
cortex_alertmanager_notifications_failed_total{integration="email",user="user3"} 0
cortex_alertmanager_notifications_failed_total{integration="opsgenie",user="user1"} 5
cortex_alertmanager_notifications_failed_total{integration="opsgenie",user="user2"} 50
cortex_alertmanager_notifications_failed_total{integration="opsgenie",user="user3"} 500
cortex_alertmanager_notifications_failed_total{integration="pagerduty",user="user1"} 1
cortex_alertmanager_notifications_failed_total{integration="pagerduty",user="user2"} 10
cortex_alertmanager_notifications_failed_total{integration="pagerduty",user="user3"} 100
cortex_alertmanager_notifications_failed_total{integration="pushover",user="user1"} 3
cortex_alertmanager_notifications_failed_total{integration="pushover",user="user2"} 30
cortex_alertmanager_notifications_failed_total{integration="pushover",user="user3"} 300
cortex_alertmanager_notifications_failed_total{integration="slack",user="user1"} 4
cortex_alertmanager_notifications_failed_total{integration="slack",user="user2"} 40
cortex_alertmanager_notifications_failed_total{integration="slack",user="user3"} 400
cortex_alertmanager_notifications_failed_total{integration="victorops",user="user1"} 7
cortex_alertmanager_notifications_failed_total{integration="victorops",user="user2"} 70
cortex_alertmanager_notifications_failed_total{integration="victorops",user="user3"} 700
cortex_alertmanager_notifications_failed_total{integration="webhook",user="user1"} 6
cortex_alertmanager_notifications_failed_total{integration="webhook",user="user2"} 60
cortex_alertmanager_notifications_failed_total{integration="webhook",user="user3"} 600
cortex_alertmanager_notifications_failed_total{integration="wechat",user="user1"} 2
cortex_alertmanager_notifications_failed_total{integration="wechat",user="user2"} 20
cortex_alertmanager_notifications_failed_total{integration="wechat",user="user3"} 200
cortex_alertmanager_notifications_failed_total{integration="email",reason="clientError",user="user1"} 0
cortex_alertmanager_notifications_failed_total{integration="email",reason="clientError",user="user2"} 0
cortex_alertmanager_notifications_failed_total{integration="email",reason="clientError",user="user3"} 0
cortex_alertmanager_notifications_failed_total{integration="opsgenie",reason="clientError",user="user1"} 5
cortex_alertmanager_notifications_failed_total{integration="opsgenie",reason="clientError",user="user2"} 50
cortex_alertmanager_notifications_failed_total{integration="opsgenie",reason="clientError",user="user3"} 500
cortex_alertmanager_notifications_failed_total{integration="pagerduty",reason="clientError",user="user1"} 1
cortex_alertmanager_notifications_failed_total{integration="pagerduty",reason="clientError",user="user2"} 10
cortex_alertmanager_notifications_failed_total{integration="pagerduty",reason="clientError",user="user3"} 100
cortex_alertmanager_notifications_failed_total{integration="pushover",reason="clientError",user="user1"} 3
cortex_alertmanager_notifications_failed_total{integration="pushover",reason="clientError",user="user2"} 30
cortex_alertmanager_notifications_failed_total{integration="pushover",reason="clientError",user="user3"} 300
cortex_alertmanager_notifications_failed_total{integration="slack",reason="clientError",user="user1"} 4
cortex_alertmanager_notifications_failed_total{integration="slack",reason="clientError",user="user2"} 40
cortex_alertmanager_notifications_failed_total{integration="slack",reason="clientError",user="user3"} 400
cortex_alertmanager_notifications_failed_total{integration="victorops",reason="clientError",user="user1"} 7
cortex_alertmanager_notifications_failed_total{integration="victorops",reason="clientError",user="user2"} 70
cortex_alertmanager_notifications_failed_total{integration="victorops",reason="clientError",user="user3"} 700
cortex_alertmanager_notifications_failed_total{integration="webhook",reason="clientError",user="user1"} 6
cortex_alertmanager_notifications_failed_total{integration="webhook",reason="clientError",user="user2"} 60
cortex_alertmanager_notifications_failed_total{integration="webhook",reason="clientError",user="user3"} 600
cortex_alertmanager_notifications_failed_total{integration="wechat",reason="clientError",user="user1"} 2
cortex_alertmanager_notifications_failed_total{integration="wechat",reason="clientError",user="user2"} 20
cortex_alertmanager_notifications_failed_total{integration="wechat",reason="clientError",user="user3"} 200
# HELP cortex_alertmanager_notifications_total The total number of attempted notifications.
# TYPE cortex_alertmanager_notifications_total counter
Expand Down Expand Up @@ -721,22 +722,22 @@ func TestAlertmanagerMetricsRemoval(t *testing.T) {
# HELP cortex_alertmanager_notifications_failed_total The total number of failed notifications.
# TYPE cortex_alertmanager_notifications_failed_total counter
cortex_alertmanager_notifications_failed_total{integration="email",user="user1"} 0
cortex_alertmanager_notifications_failed_total{integration="email",user="user2"} 0
cortex_alertmanager_notifications_failed_total{integration="opsgenie",user="user1"} 5
cortex_alertmanager_notifications_failed_total{integration="opsgenie",user="user2"} 50
cortex_alertmanager_notifications_failed_total{integration="pagerduty",user="user1"} 1
cortex_alertmanager_notifications_failed_total{integration="pagerduty",user="user2"} 10
cortex_alertmanager_notifications_failed_total{integration="pushover",user="user1"} 3
cortex_alertmanager_notifications_failed_total{integration="pushover",user="user2"} 30
cortex_alertmanager_notifications_failed_total{integration="slack",user="user1"} 4
cortex_alertmanager_notifications_failed_total{integration="slack",user="user2"} 40
cortex_alertmanager_notifications_failed_total{integration="victorops",user="user1"} 7
cortex_alertmanager_notifications_failed_total{integration="victorops",user="user2"} 70
cortex_alertmanager_notifications_failed_total{integration="webhook",user="user1"} 6
cortex_alertmanager_notifications_failed_total{integration="webhook",user="user2"} 60
cortex_alertmanager_notifications_failed_total{integration="wechat",user="user1"} 2
cortex_alertmanager_notifications_failed_total{integration="wechat",user="user2"} 20
cortex_alertmanager_notifications_failed_total{integration="email",reason="clientError",user="user1"} 0
cortex_alertmanager_notifications_failed_total{integration="email",reason="clientError",user="user2"} 0
cortex_alertmanager_notifications_failed_total{integration="opsgenie",reason="clientError",user="user1"} 5
cortex_alertmanager_notifications_failed_total{integration="opsgenie",reason="clientError",user="user2"} 50
cortex_alertmanager_notifications_failed_total{integration="pagerduty",reason="clientError",user="user1"} 1
cortex_alertmanager_notifications_failed_total{integration="pagerduty",reason="clientError",user="user2"} 10
cortex_alertmanager_notifications_failed_total{integration="pushover",reason="clientError",user="user1"} 3
cortex_alertmanager_notifications_failed_total{integration="pushover",reason="clientError",user="user2"} 30
cortex_alertmanager_notifications_failed_total{integration="slack",reason="clientError",user="user1"} 4
cortex_alertmanager_notifications_failed_total{integration="slack",reason="clientError",user="user2"} 40
cortex_alertmanager_notifications_failed_total{integration="victorops",reason="clientError",user="user1"} 7
cortex_alertmanager_notifications_failed_total{integration="victorops",reason="clientError",user="user2"} 70
cortex_alertmanager_notifications_failed_total{integration="webhook",reason="clientError",user="user1"} 6
cortex_alertmanager_notifications_failed_total{integration="webhook",reason="clientError",user="user2"} 60
cortex_alertmanager_notifications_failed_total{integration="wechat",reason="clientError",user="user1"} 2
cortex_alertmanager_notifications_failed_total{integration="wechat",reason="clientError",user="user2"} 20
# HELP cortex_alertmanager_notifications_total The total number of attempted notifications.
# TYPE cortex_alertmanager_notifications_total counter
Expand Down Expand Up @@ -872,7 +873,7 @@ func populateAlertmanager(base float64) *prometheus.Registry {
nm := newNotifyMetrics(reg)
for i, integration := range integrations {
nm.numNotifications.WithLabelValues(integration).Add(base * float64(i))
nm.numFailedNotifications.WithLabelValues(integration).Add(base * float64(i))
nm.numFailedNotifications.WithLabelValues(integration, reason).Add(base * float64(i))
nm.numNotificationRequestsTotal.WithLabelValues(integration).Add(base * float64(i))
nm.numNotificationRequestsFailedTotal.WithLabelValues(integration).Add(base * float64(i))
nm.notificationLatencySeconds.WithLabelValues(integration).Observe(base * float64(i) * 0.025)
Expand Down Expand Up @@ -1034,7 +1035,7 @@ func newNotifyMetrics(r prometheus.Registerer) *notifyMetrics {
Namespace: "alertmanager",
Name: "notifications_failed_total",
Help: "The total number of failed notifications.",
}, []string{"integration"}),
}, []string{"integration", "reason"}),
numNotificationRequestsTotal: promauto.With(r).NewCounterVec(prometheus.CounterOpts{
Namespace: "alertmanager",
Name: "notification_requests_total",
Expand All @@ -1054,7 +1055,7 @@ func newNotifyMetrics(r prometheus.Registerer) *notifyMetrics {
}
for _, integration := range integrations {
m.numNotifications.WithLabelValues(integration)
m.numFailedNotifications.WithLabelValues(integration)
m.numFailedNotifications.WithLabelValues(integration, reason)
m.numNotificationRequestsTotal.WithLabelValues(integration)
m.numNotificationRequestsFailedTotal.WithLabelValues(integration)
m.notificationLatencySeconds.WithLabelValues(integration)
Expand Down

0 comments on commit d544bf6

Please sign in to comment.