Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Alertmanager: Skip exposing some more metrics when they have zero value #9359

Merged
merged 13 commits into from
Sep 26, 2024
9 changes: 9 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,15 @@
* [CHANGE] Querier: allow wrapping errors with context errors only when the former actually correspond to `context.Canceled` and `context.DeadlineExceeded`. #9175
* [CHANGE] Query-scheduler: Remove the experimental `-query-scheduler.use-multi-algorithm-query-queue` flag. The new multi-algorithm tree queue is always used for the scheduler. #9210
* [CHANGE] Distributor: reject incoming requests until the distributor service has started. #9317
* [CHANGE] Alertmanager: the following metrics are not exported for a given `user` when the metric value is zero: #9359
* `cortex_alertmanager_alerts_received_total`
* `cortex_alertmanager_alerts_invalid_total`
* `cortex_alertmanager_partial_state_merges_total`
* `cortex_alertmanager_partial_state_merges_failed_total`
* `cortex_alertmanager_state_replication_total`
* `cortex_alertmanager_state_replication_failed_total`
* `cortex_alertmanager_alerts`
* `cortex_alertmanager_silences`
* [FEATURE] Alertmanager: Added `-alertmanager.log-parsing-label-matchers` to control logging when parsing label matchers. This flag is intended to be used with `-alertmanager.utf8-strict-mode-enabled` to validate UTF-8 strict mode is working as intended. The default value is `false`. #9173
* [FEATURE] Alertmanager: Added `-alertmanager.utf8-migration-logging-enabled` to enable logging of tenant configurations that are incompatible with UTF-8 strict mode. The default value is `false`. #9174
* [FEATURE] Querier: add experimental streaming PromQL engine, enabled with `-querier.query-engine=mimir`. #8422 #8430 #8454 #8455 #8360 #8490 #8508 #8577 #8660 #8671 #8677 #8747 #8850 #8872 #8838 #8911 #8909 #8923 #8924 #8925 #8932 #8933 #8934 #8962 #8986 #8993 #8995 #9008 #9017 #9018 #9019 #9120 #9121 #9136 #9139 #9140 #9145 #9191 #9192 #9194 #9196 #9201 #9212 #9225 #9260 #9272 #9277 #9278 #9280 #9281 #9342 #9343 #9371
Expand Down
2 changes: 1 addition & 1 deletion go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ require (
github.com/golang/snappy v0.0.4
github.com/google/gopacket v1.1.19
github.com/gorilla/mux v1.8.1
github.com/grafana/dskit v0.0.0-20240920183844-560bb26f205e
github.com/grafana/dskit v0.0.0-20240923130221-1f324b47eaee
github.com/grafana/e2e v0.1.2-0.20240118170847-db90b84177fc
github.com/hashicorp/golang-lru v1.0.2 // indirect
github.com/json-iterator/go v1.1.12
Expand Down
4 changes: 2 additions & 2 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -1250,8 +1250,8 @@ github.com/grafana-tools/sdk v0.0.0-20220919052116-6562121319fc h1:PXZQA2WCxe85T
github.com/grafana-tools/sdk v0.0.0-20220919052116-6562121319fc/go.mod h1:AHHlOEv1+GGQ3ktHMlhuTUwo3zljV3QJbC0+8o2kn+4=
github.com/grafana/alerting v0.0.0-20240906191856-cdc634f213ea h1:AGmVRk+9ZmzuiLJl6hzQE1vBlVz9wbEb2+J52Gui2ys=
github.com/grafana/alerting v0.0.0-20240906191856-cdc634f213ea/go.mod h1:GMLi6d09Xqo96fCVUjNk//rcjP5NKEdjOzfWIffD5r4=
github.com/grafana/dskit v0.0.0-20240920183844-560bb26f205e h1:srRxyx7T2OWJzenCktxk2bCsh8hnbzRYSGRkZxMbGK4=
github.com/grafana/dskit v0.0.0-20240920183844-560bb26f205e/go.mod h1:SPLNCARd4xdjCkue0O6hvuoveuS1dGJjDnfxYe405YQ=
github.com/grafana/dskit v0.0.0-20240923130221-1f324b47eaee h1:msHhmdD1R967st2JOpLxp7tkUwRc580H/Ae193VyweY=
github.com/grafana/dskit v0.0.0-20240923130221-1f324b47eaee/go.mod h1:SPLNCARd4xdjCkue0O6hvuoveuS1dGJjDnfxYe405YQ=
github.com/grafana/e2e v0.1.2-0.20240118170847-db90b84177fc h1:BW+LjKJDz0So5LI8UZfW5neWeKpSkWqhmGjQFzcFfLM=
github.com/grafana/e2e v0.1.2-0.20240118170847-db90b84177fc/go.mod h1:JVmqPBe8A/pZWwRoJW5ZjyALeY5OXMzPl7LrVXOdZAI=
github.com/grafana/goautoneg v0.0.0-20240607115440-f335c04c58ce h1:WI1olbgS+sEl77qxEYbmt9TgRUz7iLqmjh8lYPpGlKQ=
Expand Down
7 changes: 4 additions & 3 deletions integration/alertmanager_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -566,7 +566,7 @@ func TestAlertmanagerSharding(t *testing.T) {
return alertmanagers.WaitSumMetricsWithOptions(
e2e.Equals(float64(amount)),
[]string{"cortex_alertmanager_silences"},
e2e.WaitMissingMetrics,
e2e.SkipMissingMetrics,
e2e.WithLabelMatchers(
labels.MustNewMatcher(labels.MatchEqual, "state", state),
),
Expand Down Expand Up @@ -927,9 +927,10 @@ func TestAlertmanagerShardingScaling(t *testing.T) {
e2e.Equals(float64(numUsers*expectedReplication)),
"cortex_alertmanager_config_last_reload_successful"))

require.NoError(t, ams.WaitSumMetrics(
require.NoError(t, ams.WaitSumMetricsWithOptions(
e2e.Equals(float64(expectedSilences*expectedReplication)),
"cortex_alertmanager_silences"))
[]string{"cortex_alertmanager_silences"},
e2e.SkipMissingMetrics))
}

// Start up the first instance and use it to create some silences.
Expand Down
16 changes: 8 additions & 8 deletions pkg/alertmanager/alertmanager_metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -337,16 +337,16 @@ func (m *alertmanagerMetrics) Describe(out chan<- *prometheus.Desc) {
func (m *alertmanagerMetrics) Collect(out chan<- prometheus.Metric) {
data := m.regs.BuildMetricFamiliesPerTenant()

data.SendSumOfCountersPerTenant(out, m.alertsReceived, "alertmanager_alerts_received_total")
data.SendSumOfCountersPerTenant(out, m.alertsInvalid, "alertmanager_alerts_invalid_total")
data.SendSumOfCountersPerTenant(out, m.alertsReceived, "alertmanager_alerts_received_total", dskit_metrics.WithSkipZeroValueMetrics)
data.SendSumOfCountersPerTenant(out, m.alertsInvalid, "alertmanager_alerts_invalid_total", dskit_metrics.WithSkipZeroValueMetrics)

data.SendSumOfCountersPerTenant(out, m.numNotifications, "alertmanager_notifications_total", dskit_metrics.WithLabels("integration"), dskit_metrics.WithSkipZeroValueMetrics)
data.SendSumOfCountersPerTenant(out, m.numFailedNotifications, "alertmanager_notifications_failed_total", dskit_metrics.WithLabels("integration", "reason"), dskit_metrics.WithSkipZeroValueMetrics)
data.SendSumOfCountersPerTenant(out, m.numNotificationRequestsTotal, "alertmanager_notification_requests_total", dskit_metrics.WithLabels("integration"), dskit_metrics.WithSkipZeroValueMetrics)
data.SendSumOfCountersPerTenant(out, m.numNotificationRequestsFailedTotal, "alertmanager_notification_requests_failed_total", dskit_metrics.WithLabels("integration"), dskit_metrics.WithSkipZeroValueMetrics)
data.SendSumOfCountersPerTenant(out, m.numNotificationSuppressedTotal, "alertmanager_notifications_suppressed_total", dskit_metrics.WithLabels("reason"), dskit_metrics.WithSkipZeroValueMetrics)
data.SendSumOfHistograms(out, m.notificationLatencySeconds, "alertmanager_notification_latency_seconds")
data.SendSumOfGaugesPerTenantWithLabels(out, m.markerAlerts, "alertmanager_alerts", "state")
data.SendSumOfGaugesPerTenant(out, m.markerAlerts, "alertmanager_alerts", dskit_metrics.WithLabels("state"), dskit_metrics.WithSkipZeroValueMetrics)

data.SendSumOfSummaries(out, m.nflogGCDuration, "alertmanager_nflog_gc_duration_seconds")
data.SendSumOfSummaries(out, m.nflogSnapshotDuration, "alertmanager_nflog_snapshot_duration_seconds")
Expand All @@ -367,14 +367,14 @@ func (m *alertmanagerMetrics) Collect(out chan<- prometheus.Metric) {
data.SendSumOfCounters(out, m.silencesQueryErrorsTotal, "alertmanager_silences_query_errors_total")
data.SendSumOfHistograms(out, m.silencesQueryDuration, "alertmanager_silences_query_duration_seconds")
data.SendSumOfCounters(out, m.silencesPropagatedMessagesTotal, "alertmanager_silences_gossip_messages_propagated_total")
data.SendSumOfGaugesPerTenantWithLabels(out, m.silences, "alertmanager_silences", "state")
data.SendSumOfGaugesPerTenant(out, m.silences, "alertmanager_silences", dskit_metrics.WithLabels("state"), dskit_metrics.WithSkipZeroValueMetrics)

data.SendMaxOfGaugesPerTenant(out, m.configHashValue, "alertmanager_config_hash")

data.SendSumOfCountersPerTenant(out, m.partialMerges, "alertmanager_partial_state_merges_total")
data.SendSumOfCountersPerTenant(out, m.partialMergesFailed, "alertmanager_partial_state_merges_failed_total")
data.SendSumOfCountersPerTenant(out, m.replicationTotal, "alertmanager_state_replication_total")
data.SendSumOfCountersPerTenant(out, m.replicationFailed, "alertmanager_state_replication_failed_total")
data.SendSumOfCountersPerTenant(out, m.partialMerges, "alertmanager_partial_state_merges_total", dskit_metrics.WithSkipZeroValueMetrics)
data.SendSumOfCountersPerTenant(out, m.partialMergesFailed, "alertmanager_partial_state_merges_failed_total", dskit_metrics.WithSkipZeroValueMetrics)
data.SendSumOfCountersPerTenant(out, m.replicationTotal, "alertmanager_state_replication_total", dskit_metrics.WithSkipZeroValueMetrics)
data.SendSumOfCountersPerTenant(out, m.replicationFailed, "alertmanager_state_replication_failed_total", dskit_metrics.WithSkipZeroValueMetrics)
data.SendSumOfCounters(out, m.fetchReplicaStateTotal, "alertmanager_state_fetch_replica_state_total")
data.SendSumOfCounters(out, m.fetchReplicaStateFailed, "alertmanager_state_fetch_replica_state_failed_total")
data.SendSumOfCounters(out, m.initialSyncTotal, "alertmanager_state_initial_sync_total")
Expand Down
10 changes: 5 additions & 5 deletions pkg/ruler/manager_metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -153,9 +153,9 @@ func (m *ManagerMetrics) Collect(out chan<- prometheus.Metric) {
data.SendSumOfCountersPerTenant(out, m.IterationsScheduled, "prometheus_rule_group_iterations_total", dskit_metrics.WithLabels("rule_group"))
data.SendSumOfCountersPerTenant(out, m.EvalTotal, "prometheus_rule_evaluations_total", dskit_metrics.WithLabels("rule_group"))
data.SendSumOfCountersPerTenant(out, m.EvalFailures, "prometheus_rule_evaluation_failures_total", dskit_metrics.WithLabels("rule_group"))
data.SendSumOfGaugesPerTenantWithLabels(out, m.GroupInterval, "prometheus_rule_group_interval_seconds", "rule_group")
data.SendSumOfGaugesPerTenantWithLabels(out, m.GroupLastEvalTime, "prometheus_rule_group_last_evaluation_timestamp_seconds", "rule_group")
data.SendSumOfGaugesPerTenantWithLabels(out, m.GroupLastDuration, "prometheus_rule_group_last_duration_seconds", "rule_group")
data.SendSumOfGaugesPerTenantWithLabels(out, m.GroupRules, "prometheus_rule_group_rules", "rule_group")
data.SendSumOfGaugesPerTenantWithLabels(out, m.GroupLastEvalSamples, "prometheus_rule_group_last_evaluation_samples", "rule_group")
data.SendSumOfGaugesPerTenant(out, m.GroupInterval, "prometheus_rule_group_interval_seconds", dskit_metrics.WithLabels("rule_group"))
data.SendSumOfGaugesPerTenant(out, m.GroupLastEvalTime, "prometheus_rule_group_last_evaluation_timestamp_seconds", dskit_metrics.WithLabels("rule_group"))
data.SendSumOfGaugesPerTenant(out, m.GroupLastDuration, "prometheus_rule_group_last_duration_seconds", dskit_metrics.WithLabels("rule_group"))
data.SendSumOfGaugesPerTenant(out, m.GroupRules, "prometheus_rule_group_rules", dskit_metrics.WithLabels("rule_group"))
data.SendSumOfGaugesPerTenant(out, m.GroupLastEvalSamples, "prometheus_rule_group_last_evaluation_samples", dskit_metrics.WithLabels("rule_group"))
}
18 changes: 11 additions & 7 deletions vendor/github.com/grafana/dskit/metrics/tenant_registries.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion vendor/modules.txt

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading