diff --git a/CHANGELOG.md b/CHANGELOG.md index 3d51f69e715..7f68a1bc0d7 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -34,6 +34,15 @@ * [CHANGE] Distributor: reject incoming requests until the distributor service has started. #9317 * [CHANGE] Ingester, Distributor: Remove deprecated `-ingester.limit-inflight-requests-using-grpc-method-limiter` and `-distributor.limit-inflight-requests-using-grpc-method-limiter`. The feature was deprecated and enabled by default in Mimir 2.12. #9407 * [CHANGE] Querier: Remove deprecated `-querier.max-query-into-future`. The feature was deprecated in Mimir 2.12. #9407 +* [CHANGE] Alertmanager: the following metrics are not exported for a given `user` when the metric value is zero: #9359 + * `cortex_alertmanager_alerts_received_total` + * `cortex_alertmanager_alerts_invalid_total` + * `cortex_alertmanager_partial_state_merges_total` + * `cortex_alertmanager_partial_state_merges_failed_total` + * `cortex_alertmanager_state_replication_total` + * `cortex_alertmanager_state_replication_failed_total` + * `cortex_alertmanager_alerts` + * `cortex_alertmanager_silences` * [FEATURE] Alertmanager: Added `-alertmanager.log-parsing-label-matchers` to control logging when parsing label matchers. This flag is intended to be used with `-alertmanager.utf8-strict-mode-enabled` to validate UTF-8 strict mode is working as intended. The default value is `false`. #9173 * [FEATURE] Alertmanager: Added `-alertmanager.utf8-migration-logging-enabled` to enable logging of tenant configurations that are incompatible with UTF-8 strict mode. The default value is `false`. #9174 * [FEATURE] Querier: add experimental streaming PromQL engine, enabled with `-querier.query-engine=mimir`. #8422 #8430 #8454 #8455 #8360 #8490 #8508 #8577 #8660 #8671 #8677 #8747 #8850 #8872 #8838 #8911 #8909 #8923 #8924 #8925 #8932 #8933 #8934 #8962 #8986 #8993 #8995 #9008 #9017 #9018 #9019 #9120 #9121 #9136 #9139 #9140 #9145 #9191 #9192 #9194 #9196 #9201 #9212 #9225 #9260 #9272 #9277 #9278 #9280 #9281 #9342 #9343 #9367 #9368 #9371 diff --git a/go.mod b/go.mod index 09adc1e7ab3..917b572dc5a 100644 --- a/go.mod +++ b/go.mod @@ -22,7 +22,7 @@ require ( github.com/golang/snappy v0.0.4 github.com/google/gopacket v1.1.19 github.com/gorilla/mux v1.8.1 - github.com/grafana/dskit v0.0.0-20240920183844-560bb26f205e + github.com/grafana/dskit v0.0.0-20240923130221-1f324b47eaee github.com/grafana/e2e v0.1.2-0.20240118170847-db90b84177fc github.com/hashicorp/golang-lru v1.0.2 // indirect github.com/json-iterator/go v1.1.12 diff --git a/go.sum b/go.sum index ee5939f82df..4760cd20a36 100644 --- a/go.sum +++ b/go.sum @@ -1250,8 +1250,8 @@ github.com/grafana-tools/sdk v0.0.0-20220919052116-6562121319fc h1:PXZQA2WCxe85T github.com/grafana-tools/sdk v0.0.0-20220919052116-6562121319fc/go.mod h1:AHHlOEv1+GGQ3ktHMlhuTUwo3zljV3QJbC0+8o2kn+4= github.com/grafana/alerting v0.0.0-20240906191856-cdc634f213ea h1:AGmVRk+9ZmzuiLJl6hzQE1vBlVz9wbEb2+J52Gui2ys= github.com/grafana/alerting v0.0.0-20240906191856-cdc634f213ea/go.mod h1:GMLi6d09Xqo96fCVUjNk//rcjP5NKEdjOzfWIffD5r4= -github.com/grafana/dskit v0.0.0-20240920183844-560bb26f205e h1:srRxyx7T2OWJzenCktxk2bCsh8hnbzRYSGRkZxMbGK4= -github.com/grafana/dskit v0.0.0-20240920183844-560bb26f205e/go.mod h1:SPLNCARd4xdjCkue0O6hvuoveuS1dGJjDnfxYe405YQ= +github.com/grafana/dskit v0.0.0-20240923130221-1f324b47eaee h1:msHhmdD1R967st2JOpLxp7tkUwRc580H/Ae193VyweY= +github.com/grafana/dskit v0.0.0-20240923130221-1f324b47eaee/go.mod h1:SPLNCARd4xdjCkue0O6hvuoveuS1dGJjDnfxYe405YQ= github.com/grafana/e2e v0.1.2-0.20240118170847-db90b84177fc h1:BW+LjKJDz0So5LI8UZfW5neWeKpSkWqhmGjQFzcFfLM= github.com/grafana/e2e v0.1.2-0.20240118170847-db90b84177fc/go.mod h1:JVmqPBe8A/pZWwRoJW5ZjyALeY5OXMzPl7LrVXOdZAI= github.com/grafana/goautoneg v0.0.0-20240607115440-f335c04c58ce h1:WI1olbgS+sEl77qxEYbmt9TgRUz7iLqmjh8lYPpGlKQ= diff --git a/integration/alertmanager_test.go b/integration/alertmanager_test.go index 8ef81ee45dc..f9b95b659a8 100644 --- a/integration/alertmanager_test.go +++ b/integration/alertmanager_test.go @@ -566,7 +566,7 @@ func TestAlertmanagerSharding(t *testing.T) { return alertmanagers.WaitSumMetricsWithOptions( e2e.Equals(float64(amount)), []string{"cortex_alertmanager_silences"}, - e2e.WaitMissingMetrics, + e2e.SkipMissingMetrics, e2e.WithLabelMatchers( labels.MustNewMatcher(labels.MatchEqual, "state", state), ), @@ -927,9 +927,10 @@ func TestAlertmanagerShardingScaling(t *testing.T) { e2e.Equals(float64(numUsers*expectedReplication)), "cortex_alertmanager_config_last_reload_successful")) - require.NoError(t, ams.WaitSumMetrics( + require.NoError(t, ams.WaitSumMetricsWithOptions( e2e.Equals(float64(expectedSilences*expectedReplication)), - "cortex_alertmanager_silences")) + []string{"cortex_alertmanager_silences"}, + e2e.SkipMissingMetrics)) } // Start up the first instance and use it to create some silences. diff --git a/pkg/alertmanager/alertmanager_metrics.go b/pkg/alertmanager/alertmanager_metrics.go index 496b4f7a1a6..73973ac4a97 100644 --- a/pkg/alertmanager/alertmanager_metrics.go +++ b/pkg/alertmanager/alertmanager_metrics.go @@ -337,8 +337,8 @@ func (m *alertmanagerMetrics) Describe(out chan<- *prometheus.Desc) { func (m *alertmanagerMetrics) Collect(out chan<- prometheus.Metric) { data := m.regs.BuildMetricFamiliesPerTenant() - data.SendSumOfCountersPerTenant(out, m.alertsReceived, "alertmanager_alerts_received_total") - data.SendSumOfCountersPerTenant(out, m.alertsInvalid, "alertmanager_alerts_invalid_total") + data.SendSumOfCountersPerTenant(out, m.alertsReceived, "alertmanager_alerts_received_total", dskit_metrics.WithSkipZeroValueMetrics) + data.SendSumOfCountersPerTenant(out, m.alertsInvalid, "alertmanager_alerts_invalid_total", dskit_metrics.WithSkipZeroValueMetrics) data.SendSumOfCountersPerTenant(out, m.numNotifications, "alertmanager_notifications_total", dskit_metrics.WithLabels("integration"), dskit_metrics.WithSkipZeroValueMetrics) data.SendSumOfCountersPerTenant(out, m.numFailedNotifications, "alertmanager_notifications_failed_total", dskit_metrics.WithLabels("integration", "reason"), dskit_metrics.WithSkipZeroValueMetrics) @@ -346,7 +346,7 @@ func (m *alertmanagerMetrics) Collect(out chan<- prometheus.Metric) { data.SendSumOfCountersPerTenant(out, m.numNotificationRequestsFailedTotal, "alertmanager_notification_requests_failed_total", dskit_metrics.WithLabels("integration"), dskit_metrics.WithSkipZeroValueMetrics) data.SendSumOfCountersPerTenant(out, m.numNotificationSuppressedTotal, "alertmanager_notifications_suppressed_total", dskit_metrics.WithLabels("reason"), dskit_metrics.WithSkipZeroValueMetrics) data.SendSumOfHistograms(out, m.notificationLatencySeconds, "alertmanager_notification_latency_seconds") - data.SendSumOfGaugesPerTenantWithLabels(out, m.markerAlerts, "alertmanager_alerts", "state") + data.SendSumOfGaugesPerTenant(out, m.markerAlerts, "alertmanager_alerts", dskit_metrics.WithLabels("state"), dskit_metrics.WithSkipZeroValueMetrics) data.SendSumOfSummaries(out, m.nflogGCDuration, "alertmanager_nflog_gc_duration_seconds") data.SendSumOfSummaries(out, m.nflogSnapshotDuration, "alertmanager_nflog_snapshot_duration_seconds") @@ -367,14 +367,14 @@ func (m *alertmanagerMetrics) Collect(out chan<- prometheus.Metric) { data.SendSumOfCounters(out, m.silencesQueryErrorsTotal, "alertmanager_silences_query_errors_total") data.SendSumOfHistograms(out, m.silencesQueryDuration, "alertmanager_silences_query_duration_seconds") data.SendSumOfCounters(out, m.silencesPropagatedMessagesTotal, "alertmanager_silences_gossip_messages_propagated_total") - data.SendSumOfGaugesPerTenantWithLabels(out, m.silences, "alertmanager_silences", "state") + data.SendSumOfGaugesPerTenant(out, m.silences, "alertmanager_silences", dskit_metrics.WithLabels("state"), dskit_metrics.WithSkipZeroValueMetrics) data.SendMaxOfGaugesPerTenant(out, m.configHashValue, "alertmanager_config_hash") - data.SendSumOfCountersPerTenant(out, m.partialMerges, "alertmanager_partial_state_merges_total") - data.SendSumOfCountersPerTenant(out, m.partialMergesFailed, "alertmanager_partial_state_merges_failed_total") - data.SendSumOfCountersPerTenant(out, m.replicationTotal, "alertmanager_state_replication_total") - data.SendSumOfCountersPerTenant(out, m.replicationFailed, "alertmanager_state_replication_failed_total") + data.SendSumOfCountersPerTenant(out, m.partialMerges, "alertmanager_partial_state_merges_total", dskit_metrics.WithSkipZeroValueMetrics) + data.SendSumOfCountersPerTenant(out, m.partialMergesFailed, "alertmanager_partial_state_merges_failed_total", dskit_metrics.WithSkipZeroValueMetrics) + data.SendSumOfCountersPerTenant(out, m.replicationTotal, "alertmanager_state_replication_total", dskit_metrics.WithSkipZeroValueMetrics) + data.SendSumOfCountersPerTenant(out, m.replicationFailed, "alertmanager_state_replication_failed_total", dskit_metrics.WithSkipZeroValueMetrics) data.SendSumOfCounters(out, m.fetchReplicaStateTotal, "alertmanager_state_fetch_replica_state_total") data.SendSumOfCounters(out, m.fetchReplicaStateFailed, "alertmanager_state_fetch_replica_state_failed_total") data.SendSumOfCounters(out, m.initialSyncTotal, "alertmanager_state_initial_sync_total") diff --git a/pkg/ruler/manager_metrics.go b/pkg/ruler/manager_metrics.go index 59d6af59977..9f5a3953fea 100644 --- a/pkg/ruler/manager_metrics.go +++ b/pkg/ruler/manager_metrics.go @@ -153,9 +153,9 @@ func (m *ManagerMetrics) Collect(out chan<- prometheus.Metric) { data.SendSumOfCountersPerTenant(out, m.IterationsScheduled, "prometheus_rule_group_iterations_total", dskit_metrics.WithLabels("rule_group")) data.SendSumOfCountersPerTenant(out, m.EvalTotal, "prometheus_rule_evaluations_total", dskit_metrics.WithLabels("rule_group")) data.SendSumOfCountersPerTenant(out, m.EvalFailures, "prometheus_rule_evaluation_failures_total", dskit_metrics.WithLabels("rule_group")) - data.SendSumOfGaugesPerTenantWithLabels(out, m.GroupInterval, "prometheus_rule_group_interval_seconds", "rule_group") - data.SendSumOfGaugesPerTenantWithLabels(out, m.GroupLastEvalTime, "prometheus_rule_group_last_evaluation_timestamp_seconds", "rule_group") - data.SendSumOfGaugesPerTenantWithLabels(out, m.GroupLastDuration, "prometheus_rule_group_last_duration_seconds", "rule_group") - data.SendSumOfGaugesPerTenantWithLabels(out, m.GroupRules, "prometheus_rule_group_rules", "rule_group") - data.SendSumOfGaugesPerTenantWithLabels(out, m.GroupLastEvalSamples, "prometheus_rule_group_last_evaluation_samples", "rule_group") + data.SendSumOfGaugesPerTenant(out, m.GroupInterval, "prometheus_rule_group_interval_seconds", dskit_metrics.WithLabels("rule_group")) + data.SendSumOfGaugesPerTenant(out, m.GroupLastEvalTime, "prometheus_rule_group_last_evaluation_timestamp_seconds", dskit_metrics.WithLabels("rule_group")) + data.SendSumOfGaugesPerTenant(out, m.GroupLastDuration, "prometheus_rule_group_last_duration_seconds", dskit_metrics.WithLabels("rule_group")) + data.SendSumOfGaugesPerTenant(out, m.GroupRules, "prometheus_rule_group_rules", dskit_metrics.WithLabels("rule_group")) + data.SendSumOfGaugesPerTenant(out, m.GroupLastEvalSamples, "prometheus_rule_group_last_evaluation_samples", dskit_metrics.WithLabels("rule_group")) } diff --git a/vendor/github.com/grafana/dskit/metrics/tenant_registries.go b/vendor/github.com/grafana/dskit/metrics/tenant_registries.go index e1dc153d237..c903877aea5 100644 --- a/vendor/github.com/grafana/dskit/metrics/tenant_registries.go +++ b/vendor/github.com/grafana/dskit/metrics/tenant_registries.go @@ -219,25 +219,29 @@ func (d MetricFamiliesPerTenant) SendSumOfGaugesWithLabels(out chan<- prometheus // SendSumOfGaugesPerTenant provides metrics on a per-tenant basis. // This function assumes that `tenant` is the first label on the provided metric Desc. -func (d MetricFamiliesPerTenant) SendSumOfGaugesPerTenant(out chan<- prometheus.Metric, desc *prometheus.Desc, gauge string) { - d.SendSumOfGaugesPerTenantWithLabels(out, desc, gauge) -} +func (d MetricFamiliesPerTenant) SendSumOfGaugesPerTenant(out chan<- prometheus.Metric, desc *prometheus.Desc, metric string, options ...MetricOption) { + opts := applyMetricOptions(options...) -// SendSumOfGaugesPerTenantWithLabels provides metrics with the provided label names on a per-tenant basis. This function assumes that `tenant` is the -// first label on the provided metric Desc -func (d MetricFamiliesPerTenant) SendSumOfGaugesPerTenantWithLabels(out chan<- prometheus.Metric, desc *prometheus.Desc, metric string, labelNames ...string) { for _, tenantEntry := range d { if tenantEntry.tenant == "" { continue } result := singleValueWithLabelsMap{} - tenantEntry.metrics.sumOfSingleValuesWithLabels(metric, labelNames, gaugeValue, result.aggregateFn, false) + tenantEntry.metrics.sumOfSingleValuesWithLabels(metric, opts.labelNames, gaugeValue, result.aggregateFn, opts.skipZeroValueMetrics) result.prependTenantLabelValue(tenantEntry.tenant) result.WriteToMetricChannel(out, desc, prometheus.GaugeValue) } } +// SendSumOfGaugesPerTenantWithLabels provides metrics with the provided label names on a per-tenant basis. This function assumes that `tenant` is the +// first label on the provided metric Desc +// +// Deprecated: use SendSumOfGaugesPerTenant with WithLabels option instead. +func (d MetricFamiliesPerTenant) SendSumOfGaugesPerTenantWithLabels(out chan<- prometheus.Metric, desc *prometheus.Desc, metric string, labelNames ...string) { + d.SendSumOfGaugesPerTenant(out, desc, metric, WithLabels(labelNames...)) +} + func (d MetricFamiliesPerTenant) sumOfSingleValuesWithLabels(metric string, fn func(*dto.Metric) float64, labelNames []string, skipZeroValue bool) singleValueWithLabelsMap { result := singleValueWithLabelsMap{} for _, tenantEntry := range d { diff --git a/vendor/modules.txt b/vendor/modules.txt index b9cd301bb1e..d03bee53037 100644 --- a/vendor/modules.txt +++ b/vendor/modules.txt @@ -611,7 +611,7 @@ github.com/grafana/alerting/receivers/webex github.com/grafana/alerting/receivers/webhook github.com/grafana/alerting/receivers/wecom github.com/grafana/alerting/templates -# github.com/grafana/dskit v0.0.0-20240920183844-560bb26f205e +# github.com/grafana/dskit v0.0.0-20240923130221-1f324b47eaee ## explicit; go 1.21 github.com/grafana/dskit/backoff github.com/grafana/dskit/ballast