diff --git a/docs/configuration/config-file-reference.md b/docs/configuration/config-file-reference.md index f3c4552c8c9..9c652789350 100644 --- a/docs/configuration/config-file-reference.md +++ b/docs/configuration/config-file-reference.md @@ -3513,6 +3513,9 @@ query_rejection: # CLI flag: -ruler.query-offset [ruler_query_offset: | default = 0s] +# external labels for alerting rules +[ruler_external_labels: | default = []] + # The default tenant's shard size when the shuffle-sharding strategy is used. # Must be set when the store-gateway sharding is enabled with the # shuffle-sharding strategy. When this setting is specified in the per-tenant @@ -3614,9 +3617,6 @@ query_rejection: # list of rule groups to disable [disabled_rule_groups: | default = []] - -# external labels for alerting rules -[external_labels: | default = []] ``` ### `memberlist_config` diff --git a/pkg/ruler/api_test.go b/pkg/ruler/api_test.go index 29ec2b296e1..e99357c1402 100644 --- a/pkg/ruler/api_test.go +++ b/pkg/ruler/api_test.go @@ -454,7 +454,7 @@ func TestRuler_LimitsPerGroup(t *testing.T) { r := newTestRuler(t, cfg, store, nil) defer services.StopAndAwaitTerminated(context.Background(), r) //nolint:errcheck - r.limits = ruleLimits{maxRuleGroups: 1, maxRulesPerRuleGroup: 1} + r.limits = &ruleLimits{maxRuleGroups: 1, maxRulesPerRuleGroup: 1} a := NewAPI(r, r.store, log.NewNopLogger()) @@ -508,7 +508,7 @@ func TestRuler_RulerGroupLimits(t *testing.T) { r := newTestRuler(t, cfg, store, nil) defer services.StopAndAwaitTerminated(context.Background(), r) //nolint:errcheck - r.limits = ruleLimits{maxRuleGroups: 1, maxRulesPerRuleGroup: 1} + r.limits = &ruleLimits{maxRuleGroups: 1, maxRulesPerRuleGroup: 1} a := NewAPI(r, r.store, log.NewNopLogger()) diff --git a/pkg/ruler/compat.go b/pkg/ruler/compat.go index a4ef97f271d..3e65c828d6a 100644 --- a/pkg/ruler/compat.go +++ b/pkg/ruler/compat.go @@ -153,7 +153,7 @@ type RulesLimits interface { RulerMaxRulesPerRuleGroup(userID string) int RulerQueryOffset(userID string) time.Duration DisabledRuleGroups(userID string) validation.DisabledRuleGroups - ExternalLabels(userID string) labels.Labels + RulerExternalLabels(userID string) labels.Labels } // EngineQueryFunc returns a new engine query function validating max queryLength. diff --git a/pkg/ruler/compat_test.go b/pkg/ruler/compat_test.go index 3a3d6633508..d921171ee52 100644 --- a/pkg/ruler/compat_test.go +++ b/pkg/ruler/compat_test.go @@ -287,7 +287,7 @@ func TestPusherErrors(t *testing.T) { writes := prometheus.NewCounter(prometheus.CounterOpts{}) failures := prometheus.NewCounter(prometheus.CounterOpts{}) - pa := NewPusherAppendable(pusher, "user-1", ruleLimits{}, writes, failures) + pa := NewPusherAppendable(pusher, "user-1", &ruleLimits{}, writes, failures) lbls, err := parser.ParseMetric("foo_bar") require.NoError(t, err) diff --git a/pkg/ruler/external_labels.go b/pkg/ruler/external_labels.go index e725ca776d7..886fc4d0ed8 100644 --- a/pkg/ruler/external_labels.go +++ b/pkg/ruler/external_labels.go @@ -35,7 +35,7 @@ func (e *userExternalLabels) get(userID string) (labels.Labels, bool) { } func (e *userExternalLabels) update(userID string) (labels.Labels, bool) { - lset := e.limits.ExternalLabels(userID) + lset := e.limits.RulerExternalLabels(userID) e.mtx.Lock() defer e.mtx.Unlock() diff --git a/pkg/ruler/external_labels_test.go b/pkg/ruler/external_labels_test.go new file mode 100644 index 00000000000..45ff1507c83 --- /dev/null +++ b/pkg/ruler/external_labels_test.go @@ -0,0 +1,69 @@ +package ruler + +import ( + "testing" + + "github.com/prometheus/prometheus/model/labels" + "github.com/stretchr/testify/require" +) + +func TestUserExternalLabels(t *testing.T) { + limits := ruleLimits{} + e := newUserExternalLabels(labels.FromStrings("from", "cortex"), &limits) + + tests := []struct { + name string + removeBeforeTest bool + exists bool + userExternalLabels labels.Labels + expectedExternalLabels labels.Labels + }{ + { + name: "global labels only", + removeBeforeTest: false, + exists: false, + userExternalLabels: nil, + expectedExternalLabels: labels.FromStrings("from", "cortex"), + }, + { + name: "local labels without overriding", + removeBeforeTest: true, + exists: false, + userExternalLabels: labels.FromStrings("tag", "local"), + expectedExternalLabels: labels.FromStrings("from", "cortex", "tag", "local"), + }, + { + name: "local labels that override globals", + removeBeforeTest: false, + exists: true, + userExternalLabels: labels.FromStrings("from", "cloud", "tag", "local"), + expectedExternalLabels: labels.FromStrings("from", "cloud", "tag", "local"), + }, + } + + const userID = "test-user" + for _, data := range tests { + data := data + t.Run(data.name, func(t *testing.T) { + if data.removeBeforeTest { + e.remove(userID) + } + _, exists := e.get(userID) + require.Equal(t, data.exists, exists) + + limits.externalLabels = data.userExternalLabels + lset, ok := e.update(userID) + require.True(t, ok) + require.Equal(t, data.expectedExternalLabels, lset) + lset1, ok := e.update(userID) + require.False(t, ok) // Not updated. + require.Equal(t, data.expectedExternalLabels, lset1) + }) + } + + _, ok := e.get(userID) + require.True(t, ok) + e.cleanup() + _, ok = e.get(userID) + require.False(t, ok) +} diff --git a/pkg/ruler/manager_test.go b/pkg/ruler/manager_test.go index e610e393adf..9af478b2b4e 100644 --- a/pkg/ruler/manager_test.go +++ b/pkg/ruler/manager_test.go @@ -29,7 +29,7 @@ func TestSyncRuleGroups(t *testing.T) { } ruleManagerFactory := RuleManagerFactory(nil, waitDurations) - limits := ruleLimits{externalLabels: labels.FromStrings("from", "cortex")} + limits := &ruleLimits{externalLabels: labels.FromStrings("from", "cortex")} m, err := NewDefaultMultiTenantManager(Config{RulePath: dir}, limits, ruleManagerFactory, nil, nil, log.NewNopLogger()) require.NoError(t, err) @@ -64,7 +64,7 @@ func TestSyncRuleGroups(t *testing.T) { require.True(t, ok) lset, ok := m.userExternalLabels.get(user) require.True(t, ok) - require.Equal(t, limits.externalLabels, lset) + require.Equal(t, limits.RulerExternalLabels(user), lset) } // Passing empty map / nil stops all managers. @@ -160,7 +160,7 @@ func TestSlowRuleGroupSyncDoesNotSlowdownListRules(t *testing.T) { } ruleManagerFactory := RuleManagerFactory(groupsToReturn, waitDurations) - m, err := NewDefaultMultiTenantManager(Config{RulePath: dir}, ruleLimits{}, ruleManagerFactory, nil, prometheus.NewRegistry(), log.NewNopLogger()) + m, err := NewDefaultMultiTenantManager(Config{RulePath: dir}, &ruleLimits{}, ruleManagerFactory, nil, prometheus.NewRegistry(), log.NewNopLogger()) require.NoError(t, err) m.SyncRuleGroups(context.Background(), userRules) @@ -223,7 +223,7 @@ func TestSyncRuleGroupsCleanUpPerUserMetrics(t *testing.T) { ruleManagerFactory := RuleManagerFactory(nil, waitDurations) - m, err := NewDefaultMultiTenantManager(Config{RulePath: dir}, ruleLimits{}, ruleManagerFactory, evalMetrics, reg, log.NewNopLogger()) + m, err := NewDefaultMultiTenantManager(Config{RulePath: dir}, &ruleLimits{}, ruleManagerFactory, evalMetrics, reg, log.NewNopLogger()) require.NoError(t, err) const user = "testUser" @@ -271,7 +271,7 @@ func TestBackupRules(t *testing.T) { ruleManagerFactory := RuleManagerFactory(nil, waitDurations) config := Config{RulePath: dir} config.Ring.ReplicationFactor = 3 - m, err := NewDefaultMultiTenantManager(config, ruleLimits{}, ruleManagerFactory, evalMetrics, reg, log.NewNopLogger()) + m, err := NewDefaultMultiTenantManager(config, &ruleLimits{}, ruleManagerFactory, evalMetrics, reg, log.NewNopLogger()) require.NoError(t, err) const user1 = "testUser" diff --git a/pkg/ruler/ruler_ring_test.go b/pkg/ruler/ruler_ring_test.go index 4b740eea691..7dd3cca9a98 100644 --- a/pkg/ruler/ruler_ring_test.go +++ b/pkg/ruler/ruler_ring_test.go @@ -255,7 +255,7 @@ func TestGetReplicationSetForListRule(t *testing.T) { } r, _ := buildRuler(t, cfg, nil, store, nil) - r.limits = ruleLimits{} + r.limits = &ruleLimits{} rulerRing := r.ring // We start ruler's ring, but nothing else (not even lifecycler). diff --git a/pkg/ruler/ruler_test.go b/pkg/ruler/ruler_test.go index d1a0d0d0784..7aceeeac22a 100644 --- a/pkg/ruler/ruler_test.go +++ b/pkg/ruler/ruler_test.go @@ -2,6 +2,7 @@ package ruler import ( "context" + "encoding/json" "errors" "fmt" "io" @@ -21,6 +22,7 @@ import ( "github.com/go-kit/log" "github.com/go-kit/log/level" "github.com/gorilla/mux" + "github.com/prometheus/alertmanager/api/v2/models" "github.com/prometheus/client_golang/prometheus" prom_testutil "github.com/prometheus/client_golang/prometheus/testutil" "github.com/prometheus/prometheus/model/labels" @@ -83,6 +85,7 @@ func defaultRulerConfig(t testing.TB) Config { } type ruleLimits struct { + mtx sync.RWMutex tenantShard int maxRulesPerRuleGroup int maxRuleGroups int @@ -92,29 +95,51 @@ type ruleLimits struct { externalLabels labels.Labels } -func (r ruleLimits) RulerTenantShardSize(_ string) int { +func (r *ruleLimits) setRulerExternalLabels(lset labels.Labels) { + r.mtx.Lock() + r.externalLabels = lset + r.mtx.Unlock() +} + +func (r *ruleLimits) RulerTenantShardSize(_ string) int { + r.mtx.RLock() + defer r.mtx.RUnlock() return r.tenantShard } -func (r ruleLimits) RulerMaxRuleGroupsPerTenant(_ string) int { +func (r *ruleLimits) RulerMaxRuleGroupsPerTenant(_ string) int { + r.mtx.RLock() + defer r.mtx.RUnlock() return r.maxRuleGroups } -func (r ruleLimits) RulerMaxRulesPerRuleGroup(_ string) int { +func (r *ruleLimits) RulerMaxRulesPerRuleGroup(_ string) int { + r.mtx.RLock() + defer r.mtx.RUnlock() return r.maxRulesPerRuleGroup } -func (r ruleLimits) DisabledRuleGroups(userID string) validation.DisabledRuleGroups { +func (r *ruleLimits) DisabledRuleGroups(userID string) validation.DisabledRuleGroups { + r.mtx.RLock() + defer r.mtx.RUnlock() return r.disabledRuleGroups } -func (r ruleLimits) MaxQueryLength(_ string) time.Duration { return r.maxQueryLength } +func (r *ruleLimits) MaxQueryLength(_ string) time.Duration { + r.mtx.RLock() + defer r.mtx.RUnlock() + return r.maxQueryLength +} -func (r ruleLimits) RulerQueryOffset(_ string) time.Duration { +func (r *ruleLimits) RulerQueryOffset(_ string) time.Duration { + r.mtx.RLock() + defer r.mtx.RUnlock() return r.queryOffset } -func (r ruleLimits) ExternalLabels(_ string) labels.Labels { +func (r *ruleLimits) RulerExternalLabels(_ string) labels.Labels { + r.mtx.RLock() + defer r.mtx.RUnlock() return r.externalLabels } @@ -233,14 +258,14 @@ func testSetup(t *testing.T, querierTestConfig *querier.TestConfig) (*promql.Eng reg := prometheus.NewRegistry() queryable := testQueryableFunc(querierTestConfig, reg, l) - return engine, queryable, pusher, l, ruleLimits{maxRuleGroups: 20, maxRulesPerRuleGroup: 15}, reg + return engine, queryable, pusher, l, &ruleLimits{maxRuleGroups: 20, maxRulesPerRuleGroup: 15}, reg } func newManager(t *testing.T, cfg Config) *DefaultMultiTenantManager { engine, queryable, pusher, logger, overrides, reg := testSetup(t, nil) metrics := NewRuleEvalMetrics(cfg, nil) managerFactory := DefaultTenantManagerFactory(cfg, pusher, queryable, engine, overrides, metrics, nil) - manager, err := NewDefaultMultiTenantManager(cfg, ruleLimits{}, managerFactory, metrics, reg, logger) + manager, err := NewDefaultMultiTenantManager(cfg, overrides, managerFactory, metrics, reg, logger) require.NoError(t, err) return manager @@ -298,7 +323,7 @@ func buildRuler(t *testing.T, rulerConfig Config, querierTestConfig *querier.Tes engine, queryable, pusher, logger, overrides, reg := testSetup(t, querierTestConfig) metrics := NewRuleEvalMetrics(rulerConfig, reg) managerFactory := DefaultTenantManagerFactory(rulerConfig, pusher, queryable, engine, overrides, metrics, reg) - manager, err := NewDefaultMultiTenantManager(rulerConfig, ruleLimits{}, managerFactory, metrics, reg, log.NewNopLogger()) + manager, err := NewDefaultMultiTenantManager(rulerConfig, &ruleLimits{}, managerFactory, metrics, reg, log.NewNopLogger()) require.NoError(t, err) ruler, err := newRuler( @@ -381,6 +406,101 @@ func TestNotifierSendsUserIDHeader(t *testing.T) { `), "prometheus_notifications_dropped_total")) } +func TestNotifierSendExternalLabels(t *testing.T) { + ctx, cancel := context.WithCancel(context.Background()) + t.Cleanup(cancel) + receivedLabelsCh := make(chan models.LabelSet, 1) + ts := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + alerts := models.PostableAlerts{} + err := json.NewDecoder(r.Body).Decode(&alerts) + if err != nil { + http.Error(w, err.Error(), http.StatusBadRequest) + return + } + if len(alerts) == 1 { + select { + case <-ctx.Done(): + case receivedLabelsCh <- alerts[0].Labels: + } + } + })) + t.Cleanup(ts.Close) + + cfg := defaultRulerConfig(t) + cfg.AlertmanagerURL = ts.URL + cfg.AlertmanagerDiscovery = false + cfg.ExternalLabels = []labels.Label{{Name: "region", Value: "us-east-1"}} + limits := &ruleLimits{} + engine, queryable, pusher, logger, _, reg := testSetup(t, nil) + metrics := NewRuleEvalMetrics(cfg, nil) + managerFactory := DefaultTenantManagerFactory(cfg, pusher, queryable, engine, limits, metrics, nil) + manager, err := NewDefaultMultiTenantManager(cfg, limits, managerFactory, metrics, reg, logger) + require.NoError(t, err) + t.Cleanup(manager.Stop) + + const userID = "n1" + manager.SyncRuleGroups(context.Background(), map[string]rulespb.RuleGroupList{ + userID: {&rulespb.RuleGroupDesc{Name: "group", Namespace: "ns", Interval: time.Minute, User: userID}}, + }) + + manager.notifiersMtx.Lock() + n, ok := manager.notifiers[userID] + manager.notifiersMtx.Unlock() + require.True(t, ok) + + tests := []struct { + name string + userExternalLabels []labels.Label + expectedExternalLabels []labels.Label + }{ + { + name: "global labels only", + userExternalLabels: nil, + expectedExternalLabels: []labels.Label{{Name: "region", Value: "us-east-1"}}, + }, + { + name: "local labels without overriding", + userExternalLabels: labels.FromStrings("mylabel", "local"), + expectedExternalLabels: []labels.Label{{Name: "region", Value: "us-east-1"}, {Name: "mylabel", Value: "local"}}, + }, + { + name: "local labels that override globals", + userExternalLabels: labels.FromStrings("region", "cloud", "mylabel", "local"), + expectedExternalLabels: []labels.Label{{Name: "region", Value: "cloud"}, {Name: "mylabel", Value: "local"}}, + }, + } + for _, test := range tests { + test := test + + t.Run(test.name, func(t *testing.T) { + limits.setRulerExternalLabels(test.userExternalLabels) + manager.SyncRuleGroups(context.Background(), map[string]rulespb.RuleGroupList{ + userID: {&rulespb.RuleGroupDesc{Name: "group", Namespace: "ns", Interval: time.Minute, User: userID}}, + }) + + // FIXME: we need to wait for the discoverer to sync again after applying the configuration. + // Ref: https://github.com/prometheus/prometheus/pull/14987 + require.Eventually(t, func() bool { + return len(n.notifier.Alertmanagers()) > 0 + }, 10*time.Second, 10*time.Millisecond) + + n.notifier.Send(¬ifier.Alert{ + Labels: labels.Labels{labels.Label{Name: "alertname", Value: "testalert"}}, + }) + select { + case <-time.After(5 * time.Second): + t.Fatal("timed out waiting for alert to be sent") + case receivedLabels := <-receivedLabelsCh: + for _, expectedLabel := range test.expectedExternalLabels { + value, ok := receivedLabels[expectedLabel.Name] + require.True(t, ok) + require.Equal(t, expectedLabel.Value, value) + } + } + }) + } +} + func TestRuler_TestShutdown(t *testing.T) { tests := []struct { name string @@ -1159,7 +1279,7 @@ func TestGetRules(t *testing.T) { } r, _ := buildRuler(t, cfg, nil, store, rulerAddrMap) - r.limits = ruleLimits{tenantShard: tc.shuffleShardSize} + r.limits = &ruleLimits{tenantShard: tc.shuffleShardSize} rulerAddrMap[id] = r if r.ring != nil { require.NoError(t, services.StartAndAwaitRunning(context.Background(), r.ring)) @@ -1396,7 +1516,7 @@ func TestGetRulesFromBackup(t *testing.T) { } r, _ := buildRuler(t, cfg, nil, store, rulerAddrMap) - r.limits = ruleLimits{tenantShard: 3} + r.limits = &ruleLimits{tenantShard: 3} rulerAddrMap[id] = r if r.ring != nil { require.NoError(t, services.StartAndAwaitRunning(context.Background(), r.ring)) @@ -1612,7 +1732,7 @@ func getRulesHATest(replicationFactor int) func(t *testing.T) { } r, _ := buildRuler(t, cfg, nil, store, rulerAddrMap) - r.limits = ruleLimits{tenantShard: 3} + r.limits = &ruleLimits{tenantShard: 3} rulerAddrMap[id] = r if r.ring != nil { require.NoError(t, services.StartAndAwaitRunning(context.Background(), r.ring)) @@ -2208,7 +2328,7 @@ func TestSharding(t *testing.T) { } r, _ := buildRuler(t, cfg, nil, store, nil) - r.limits = ruleLimits{tenantShard: tc.shuffleShardSize} + r.limits = &ruleLimits{tenantShard: tc.shuffleShardSize} if forceRing != nil { r.ring = forceRing @@ -2358,7 +2478,7 @@ func Test_LoadPartialGroups(t *testing.T) { } r1, manager := buildRuler(t, cfg, nil, store, nil) - r1.limits = ruleLimits{tenantShard: 1} + r1.limits = &ruleLimits{tenantShard: 1} require.NoError(t, services.StartAndAwaitRunning(context.Background(), r1)) t.Cleanup(r1.StopAsync) @@ -2882,7 +3002,7 @@ func TestRulerDisablesRuleGroups(t *testing.T) { } r, _ := buildRuler(t, cfg, nil, store, nil) - r.limits = ruleLimits{tenantShard: 3, disabledRuleGroups: tc.disabledRuleGroups} + r.limits = &ruleLimits{tenantShard: 3, disabledRuleGroups: tc.disabledRuleGroups} if forceRing != nil { r.ring = forceRing diff --git a/pkg/util/validation/limits.go b/pkg/util/validation/limits.go index f66ebda05e0..3658b3fc790 100644 --- a/pkg/util/validation/limits.go +++ b/pkg/util/validation/limits.go @@ -185,6 +185,7 @@ type Limits struct { RulerMaxRulesPerRuleGroup int `yaml:"ruler_max_rules_per_rule_group" json:"ruler_max_rules_per_rule_group"` RulerMaxRuleGroupsPerTenant int `yaml:"ruler_max_rule_groups_per_tenant" json:"ruler_max_rule_groups_per_tenant"` RulerQueryOffset model.Duration `yaml:"ruler_query_offset" json:"ruler_query_offset"` + RulerExternalLabels labels.Labels `yaml:"ruler_external_labels" json:"ruler_external_labels" doc:"nocli|description=external labels for alerting rules"` // Store-gateway. StoreGatewayTenantShardSize float64 `yaml:"store_gateway_tenant_shard_size" json:"store_gateway_tenant_shard_size"` @@ -214,8 +215,6 @@ type Limits struct { AlertmanagerMaxAlertsCount int `yaml:"alertmanager_max_alerts_count" json:"alertmanager_max_alerts_count"` AlertmanagerMaxAlertsSizeBytes int `yaml:"alertmanager_max_alerts_size_bytes" json:"alertmanager_max_alerts_size_bytes"` DisabledRuleGroups DisabledRuleGroups `yaml:"disabled_rule_groups" json:"disabled_rule_groups" doc:"nocli|description=list of rule groups to disable"` - - ExternalLabels labels.Labels `yaml:"external_labels" json:"external_labels" doc:"nocli|description=external labels for alerting rules"` } // RegisterFlags adds the flags required to config this to the given FlagSet @@ -315,7 +314,7 @@ func (l *Limits) Validate(shardByAllLabels bool) error { return errMaxGlobalSeriesPerUserValidation } - if err := l.ExternalLabels.Validate(func(l labels.Label) error { + if err := l.RulerExternalLabels.Validate(func(l labels.Label) error { if !model.LabelName(l.Name).IsValid() { return fmt.Errorf("%w: %q", errInvalidLabelName, l.Name) } @@ -965,8 +964,8 @@ func (o *Overrides) DisabledRuleGroups(userID string) DisabledRuleGroups { return DisabledRuleGroups{} } -func (o *Overrides) ExternalLabels(userID string) labels.Labels { - return o.GetOverridesForUser(userID).ExternalLabels +func (o *Overrides) RulerExternalLabels(userID string) labels.Labels { + return o.GetOverridesForUser(userID).RulerExternalLabels } // GetOverridesForUser returns the per-tenant limits with overrides. diff --git a/pkg/util/validation/limits_test.go b/pkg/util/validation/limits_test.go index aa0cc90c030..50d7cb7e3f4 100644 --- a/pkg/util/validation/limits_test.go +++ b/pkg/util/validation/limits_test.go @@ -63,10 +63,14 @@ func TestLimits_Validate(t *testing.T) { shardByAllLabels: true, expected: nil, }, - "external-labels invalid": { - limits: Limits{ExternalLabels: labels.Labels{{Name: "123dd", Value: "oo"}}}, + "external-labels invalid label name": { + limits: Limits{RulerExternalLabels: labels.Labels{{Name: "123invalid", Value: "good"}}}, expected: errInvalidLabelName, }, + "external-labels invalid label value": { + limits: Limits{RulerExternalLabels: labels.Labels{{Name: "good", Value: string([]byte{0xff, 0xfe, 0xfd})}}}, + expected: errInvalidLabelValue, + }, } for testName, testData := range tests {