Skip to content

Commit

Permalink
merge from master
Browse files Browse the repository at this point in the history
Signed-off-by: Alex Le <[email protected]>
  • Loading branch information
alexqyle committed Sep 13, 2023
2 parents 146417b + 97effe9 commit 20ff2ca
Show file tree
Hide file tree
Showing 29 changed files with 1,444 additions and 169 deletions.
4 changes: 4 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,11 +1,13 @@
# Changelog

## master / unreleased
* [FEATURE] Ruler: Add support for disabling rule groups. #5521
* [FEATURE] AlertManager: Added the flag `-alertmanager.alerts-gc-interval` to configure the alerts garbage collection interval. #5550
* [FEATURE] Ruler: Add support for Limit field on RuleGroup. #5528
* [FEATURE] AlertManager: Add support for Webex, Discord and Telegram Receiver. #5493
* [FEATURE] Ingester: added `-admin-limit-message` to customize the message contained in limit errors. #5460
* [FEATURE] AlertManager: Update version to v0.26.0 and bring in Microsoft Teams receiver. #5543
* [FEATURE] Store Gateway: Support lazy expanded posting optimization. Added new flag `-blocks-storage.bucket-store.lazy-expanded-postings-enabled` and new metrics `cortex_bucket_store_lazy_expanded_postings_total`, `cortex_bucket_store_lazy_expanded_posting_size_bytes_total` and `cortex_bucket_store_lazy_expanded_posting_series_overfetched_size_bytes_total`. #5556
* [CHANGE] AlertManager: include reason label in `cortex_alertmanager_notifications_failed_total`. #5409
* [CHANGE] Query: Set CORS Origin headers for Query API #5388
* [CHANGE] Updating prometheus/alertmanager from v0.25.0 to v0.25.1-0.20230505130626-263ca5c9438e. This includes the below changes. #5276
Expand Down Expand Up @@ -33,6 +35,7 @@
* [FEATURE] Ruler: Support for filtering rules in the API. #5417
* [FEATURE] Compactor: Add `-compactor.ring.tokens-file-path` to store generated tokens locally. #5432
* [FEATURE] Query Frontend: Add `-frontend.retry-on-too-many-outstanding-requests` to re-enqueue 429 requests if there are multiple query-schedulers available. #5496
* [FEATURE] Store Gateway: Add `-blocks-storage.bucket-store.max-inflight-requests` for store gateways to reject further requests upon reaching the limit. #5553
* [FEATURE] Compactor: Implemented partitioning compactor based on proposal #4843. #5465
* [ENHANCEMENT] Distributor/Ingester: Add span on push path #5319
* [ENHANCEMENT] Support object storage backends for runtime configuration file. #5292
Expand All @@ -58,6 +61,7 @@
* [ENHANCEMENT] Store Gateway: add metric `cortex_bucket_store_chunk_refetches_total` for number of chunk refetches. #5532
* [ENHANCEMENT] BasicLifeCycler: allow final-sleep during shutdown #5517
* [ENHANCEMENT] All: Handling CMK Access Denied errors. #5420 #5542
* [ENHANCEMENT] Querier: Retry store gateway client connection closing gRPC error. #5558
* [BUGFIX] Ruler: Validate if rule group can be safely converted back to rule group yaml from protobuf message #5265
* [BUGFIX] Querier: Convert gRPC `ResourceExhausted` status code from store gateway to 422 limit error. #5286
* [BUGFIX] Alertmanager: Route web-ui requests to the alertmanager distributor when sharding is enabled. #5293
Expand Down
10 changes: 10 additions & 0 deletions docs/blocks-storage/querier.md
Original file line number Diff line number Diff line change
Expand Up @@ -499,6 +499,11 @@ blocks_storage:
# CLI flag: -blocks-storage.bucket-store.max-concurrent
[max_concurrent: <int> | default = 100]
# Max number of inflight queries to execute against the long-term storage.
# The limit is shared across all tenants. 0 to disable.
# CLI flag: -blocks-storage.bucket-store.max-inflight-requests
[max_inflight_requests: <int> | default = 0]
# Maximum number of concurrent tenants synching blocks.
# CLI flag: -blocks-storage.bucket-store.tenant-sync-concurrency
[tenant_sync_concurrency: <int> | default = 10]
Expand Down Expand Up @@ -1101,6 +1106,11 @@ blocks_storage:
# CLI flag: -blocks-storage.bucket-store.index-header-lazy-loading-idle-timeout
[index_header_lazy_loading_idle_timeout: <duration> | default = 20m]

# If true, Store Gateway will estimate postings size and try to lazily
# expand postings if it downloads less data than expanding all postings.
# CLI flag: -blocks-storage.bucket-store.lazy-expanded-postings-enabled
[lazy_expanded_postings_enabled: <boolean> | default = false]

tsdb:
# Local directory to store TSDBs in the ingesters.
# CLI flag: -blocks-storage.tsdb.dir
Expand Down
10 changes: 10 additions & 0 deletions docs/blocks-storage/store-gateway.md
Original file line number Diff line number Diff line change
Expand Up @@ -602,6 +602,11 @@ blocks_storage:
# CLI flag: -blocks-storage.bucket-store.max-concurrent
[max_concurrent: <int> | default = 100]
# Max number of inflight queries to execute against the long-term storage.
# The limit is shared across all tenants. 0 to disable.
# CLI flag: -blocks-storage.bucket-store.max-inflight-requests
[max_inflight_requests: <int> | default = 0]
# Maximum number of concurrent tenants synching blocks.
# CLI flag: -blocks-storage.bucket-store.tenant-sync-concurrency
[tenant_sync_concurrency: <int> | default = 10]
Expand Down Expand Up @@ -1204,6 +1209,11 @@ blocks_storage:
# CLI flag: -blocks-storage.bucket-store.index-header-lazy-loading-idle-timeout
[index_header_lazy_loading_idle_timeout: <duration> | default = 20m]

# If true, Store Gateway will estimate postings size and try to lazily
# expand postings if it downloads less data than expanding all postings.
# CLI flag: -blocks-storage.bucket-store.lazy-expanded-postings-enabled
[lazy_expanded_postings_enabled: <boolean> | default = false]

tsdb:
# Local directory to store TSDBs in the ingesters.
# CLI flag: -blocks-storage.tsdb.dir
Expand Down
35 changes: 33 additions & 2 deletions docs/configuration/config-file-reference.md
Original file line number Diff line number Diff line change
Expand Up @@ -1042,6 +1042,11 @@ bucket_store:
# CLI flag: -blocks-storage.bucket-store.max-concurrent
[max_concurrent: <int> | default = 100]
# Max number of inflight queries to execute against the long-term storage. The
# limit is shared across all tenants. 0 to disable.
# CLI flag: -blocks-storage.bucket-store.max-inflight-requests
[max_inflight_requests: <int> | default = 0]
# Maximum number of concurrent tenants synching blocks.
# CLI flag: -blocks-storage.bucket-store.tenant-sync-concurrency
[tenant_sync_concurrency: <int> | default = 10]
Expand Down Expand Up @@ -1643,6 +1648,11 @@ bucket_store:
# CLI flag: -blocks-storage.bucket-store.index-header-lazy-loading-idle-timeout
[index_header_lazy_loading_idle_timeout: <duration> | default = 20m]

# If true, Store Gateway will estimate postings size and try to lazily expand
# postings if it downloads less data than expanding all postings.
# CLI flag: -blocks-storage.bucket-store.lazy-expanded-postings-enabled
[lazy_expanded_postings_enabled: <boolean> | default = false]

tsdb:
# Local directory to store TSDBs in the ingesters.
# CLI flag: -blocks-storage.tsdb.dir
Expand Down Expand Up @@ -2857,7 +2867,7 @@ The `limits_config` configures default and per-tenant limits imposed by Cortex s
# List of metric relabel configurations. Note that in most situations, it is
# more effective to use metrics relabeling directly in the Prometheus server,
# e.g. remote_write.write_relabel_configs.
[metric_relabel_configs: <relabel_config...> | default = ]
[metric_relabel_configs: <relabel_config...> | default = []]
# Enables support for exemplars in TSDB and sets the maximum number that will be
# stored. less than zero means disabled. If the value is set to zero, cortex
Expand Down Expand Up @@ -3105,6 +3115,9 @@ The `limits_config` configures default and per-tenant limits imposed by Cortex s
# alerts will fail with a log message and metric increment. 0 = no limit.
# CLI flag: -alertmanager.max-alerts-size-bytes
[alertmanager_max_alerts_size_bytes: <int> | default = 0]

# list of rule groups to disable
[disabled_rule_groups: <list of DisabledRuleGroup> | default = []]
```
### `memberlist_config`
Expand Down Expand Up @@ -3724,7 +3737,7 @@ The `ruler_config` configures the Cortex ruler.
[external_url: <url> | default = ]
# Labels to add to all alerts.
[external_labels: <map of string to string> | default = ]
[external_labels: <list of Label> | default = []]
ruler_client:
# gRPC client max receive message size (bytes).
Expand Down Expand Up @@ -4937,3 +4950,21 @@ otel:
# CLI flag: -tracing.otel.tls.tls-insecure-skip-verify
[tls_insecure_skip_verify: <boolean> | default = false]
```
### `DisabledRuleGroup`

```yaml
# namespace in which the rule group belongs
[namespace: <string> | default = ""]
# name of the rule group
[name: <string> | default = ""]
```

### `Label`

```yaml
[name: <string> | default = ""]
[value: <string> | default = ""]
```
2 changes: 1 addition & 1 deletion go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ require (
github.com/stretchr/testify v1.8.4
github.com/thanos-io/objstore v0.0.0-20230816175749-20395bffdf26
github.com/thanos-io/promql-engine v0.0.0-20230821193351-e1ae4275b96e
github.com/thanos-io/thanos v0.32.1-0.20230831143954-f75e44ac929c
github.com/thanos-io/thanos v0.32.3-0.20230911095949-f6a39507b6bd
github.com/uber/jaeger-client-go v2.30.0+incompatible
github.com/weaveworks/common v0.0.0-20221201103051-7c2720a9024d
go.etcd.io/etcd/api/v3 v3.5.9
Expand Down
4 changes: 2 additions & 2 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -1216,8 +1216,8 @@ github.com/thanos-io/objstore v0.0.0-20230816175749-20395bffdf26 h1:q1lin/af0lw+
github.com/thanos-io/objstore v0.0.0-20230816175749-20395bffdf26/go.mod h1:oJ82xgcBDzGJrEgUsjlTj6n01+ZWUMMUR8BlZzX5xDE=
github.com/thanos-io/promql-engine v0.0.0-20230821193351-e1ae4275b96e h1:kwsFCU8eSkZehbrAN3nXPw5RdMHi/Bok/y8l2C4M+gk=
github.com/thanos-io/promql-engine v0.0.0-20230821193351-e1ae4275b96e/go.mod h1:+T/ZYNCGybT6eTsGGvVtGb63nT1cvUmH6MjqRrcQoKw=
github.com/thanos-io/thanos v0.32.1-0.20230831143954-f75e44ac929c h1:d5IJk0L61FaewLnGoVLlJb206vMz8WD6ash104tsc2w=
github.com/thanos-io/thanos v0.32.1-0.20230831143954-f75e44ac929c/go.mod h1:J81dp4qaOX+GfPmRoYqu/aZXfEBri7+i3TzY2xamthg=
github.com/thanos-io/thanos v0.32.3-0.20230911095949-f6a39507b6bd h1:JAXqwb/nzY7WzijekZrhrL63m988VLyoFUEaKLU15iA=
github.com/thanos-io/thanos v0.32.3-0.20230911095949-f6a39507b6bd/go.mod h1:J81dp4qaOX+GfPmRoYqu/aZXfEBri7+i3TzY2xamthg=
github.com/themihai/gomemcache v0.0.0-20180902122335-24332e2d58ab h1:7ZR3hmisBWw77ZpO1/o86g+JV3VKlk3d48jopJxzTjU=
github.com/themihai/gomemcache v0.0.0-20180902122335-24332e2d58ab/go.mod h1:eheTFp954zcWZXCU8d0AT76ftsQOTo4DTqkN/h3k1MY=
github.com/tidwall/pretty v1.0.0/go.mod h1:XNkn88O1ChpSDQmQeStsy+sBenx6DDtFZJxhVysOjyk=
Expand Down
123 changes: 123 additions & 0 deletions integration/ruler_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
package integration

import (
"bytes"
"context"
"crypto/x509"
"crypto/x509/pkix"
Expand All @@ -29,6 +30,7 @@ import (
"github.com/prometheus/prometheus/prompb"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
"github.com/thanos-io/objstore/providers/s3"
"gopkg.in/yaml.v3"

"github.com/cortexproject/cortex/integration/ca"
Expand Down Expand Up @@ -915,6 +917,127 @@ func TestRulerMetricsWhenIngesterFails(t *testing.T) {
})
}

// TestRulerDisablesRuleGroups checks that a rule group listed under a tenant's
// disabled_rule_groups runtime-config override is not reported by the ruler.
//
// The test uploads a runtime config that disables group "bad_rule" in
// namespace "test" for tenant "user", stores both "bad_rule" and "good_rule"
// through the ruler API, and then asserts that the Prometheus rules endpoint
// returns only "good_rule".
func TestRulerDisablesRuleGroups(t *testing.T) {
	s, err := e2e.NewScenario(networkName)
	require.NoError(t, err)
	defer s.Close()

	// Start dependencies.
	consul := e2edb.NewConsul()
	minio := e2edb.NewMinio(9000, bucketName, rulestoreBucketName)
	require.NoError(t, s.StartAndWaitReady(consul, minio))

	const blockRangePeriod = 2 * time.Second
	// Configure the ruler.
	flags := mergeFlags(
		BlocksStorageFlags(),
		RulerFlags(),
		map[string]string{
			"-blocks-storage.tsdb.block-ranges-period":         blockRangePeriod.String(),
			"-blocks-storage.tsdb.ship-interval":               "1s",
			"-blocks-storage.bucket-store.sync-interval":       "1s",
			"-blocks-storage.bucket-store.index-cache.backend": tsdb.IndexCacheBackendInMemory,
			"-blocks-storage.tsdb.retention-period":            ((blockRangePeriod * 2) - 1).String(),

			// Keep the bucket index disabled for this test.
			"-blocks-storage.bucket-store.bucket-index.enabled": "false",
			// Evaluate rules often, so that we don't need to wait for metrics to show up.
			"-ruler.evaluation-interval": "2s",
			"-ruler.poll-interval":       "2s",
			// No delay
			"-ruler.evaluation-delay-duration": "0",

			// We run single ingester only, no replication.
			"-distributor.replication-factor": "1",

			// Very low limit so that ruler hits it.
			"-querier.max-fetched-chunks-per-query": "15",
			"-querier.query-store-after":            (1 * time.Second).String(),
			"-querier.query-ingesters-within":       (2 * time.Second).String(),
		},
	)

	const namespace = "test"
	const user = "user"
	configFileName := "runtime-config.yaml"
	// Shadows the outer bucketName that was passed to NewMinio above; from
	// here on the name refers to the bucket holding the runtime config.
	bucketName := "cortex"

	storeGateway := e2ecortex.NewStoreGateway("store-gateway-1", e2ecortex.RingStoreConsul, consul.NetworkHTTPEndpoint(), flags, "")

	// Point every component at the runtime config object stored in Minio.
	flags = mergeFlags(flags, map[string]string{
		"-querier.store-gateway-addresses":     storeGateway.NetworkGRPCEndpoint(),
		"-runtime-config.backend":              "s3",
		"-runtime-config.s3.access-key-id":     e2edb.MinioAccessKey,
		"-runtime-config.s3.secret-access-key": e2edb.MinioSecretKey,
		"-runtime-config.s3.bucket-name":       bucketName,
		"-runtime-config.s3.endpoint":          fmt.Sprintf("%s-minio-9000:9000", networkName),
		"-runtime-config.s3.insecure":          "true",
		"-runtime-config.file":                 configFileName,
		"-runtime-config.reload-period":        "2s",
	})

	distributor := e2ecortex.NewDistributor("distributor", e2ecortex.RingStoreConsul, consul.NetworkHTTPEndpoint(), flags, "")

	client, err := s3.NewBucketWithConfig(nil, s3.Config{
		Endpoint:  minio.HTTPEndpoint(),
		Insecure:  true,
		Bucket:    bucketName,
		AccessKey: e2edb.MinioAccessKey,
		SecretKey: e2edb.MinioSecretKey,
	}, "runtime-config-test")

	require.NoError(t, err)

	// Upload the runtime config before the Cortex services start. The YAML
	// nesting is significant: disabled_rule_groups must sit under the tenant
	// ("user"), which in turn sits under "overrides".
	newRuntimeConfig := []byte(`overrides:
  user:
    disabled_rule_groups:
      - name: bad_rule
        namespace: test`)
	require.NoError(t, client.Upload(context.Background(), configFileName, bytes.NewReader(newRuntimeConfig)))
	time.Sleep(2 * time.Second)

	ruler := e2ecortex.NewRuler("ruler", consul.NetworkHTTPEndpoint(), flags, "")

	ingester := e2ecortex.NewIngester("ingester", e2ecortex.RingStoreConsul, consul.NetworkHTTPEndpoint(), flags, "")
	require.NoError(t, s.StartAndWaitReady(distributor, ingester, ruler, storeGateway))

	// Wait until both the distributor and ruler have updated the ring. The querier will also watch
	// the store-gateway ring if blocks sharding is enabled.
	require.NoError(t, distributor.WaitSumMetrics(e2e.Equals(512), "cortex_ring_tokens_total"))
	require.NoError(t, ruler.WaitSumMetrics(e2e.Equals(1024), "cortex_ring_tokens_total"))

	c, err := e2ecortex.NewClient(distributor.HTTPEndpoint(), "", "", ruler.HTTPEndpoint(), user)
	require.NoError(t, err)

	expression := "absent(sum_over_time(metric{}[2s] offset 1h))"

	t.Run("disable_rule_group", func(t *testing.T) {
		// Store one group that the runtime config disables...
		ruleGroup := ruleGroupWithRule("bad_rule", "rule", expression)
		ruleGroup.Interval = 2
		require.NoError(t, c.SetRuleGroup(ruleGroup, namespace))

		// ...and one that it does not mention.
		ruleGroup = ruleGroupWithRule("good_rule", "rule", expression)
		ruleGroup.Interval = 2
		require.NoError(t, c.SetRuleGroup(ruleGroup, namespace))

		m1 := ruleGroupMatcher(user, namespace, "good_rule")

		// Wait until ruler has loaded the group.
		require.NoError(t, ruler.WaitSumMetricsWithOptions(e2e.GreaterOrEqual(1), []string{"cortex_ruler_sync_rules_total"}, e2e.WaitMissingMetrics))

		require.NoError(t, ruler.WaitSumMetricsWithOptions(e2e.Equals(1), []string{"cortex_prometheus_rule_group_rules"}, e2e.WithLabelMatchers(m1), e2e.WaitMissingMetrics))

		filter := e2ecortex.RuleFilter{}
		actualGroups, err := c.GetPrometheusRules(filter)
		require.NoError(t, err)
		// Fail immediately on an unexpected count; indexing actualGroups[0]
		// below would otherwise panic when the slice is empty.
		require.Len(t, actualGroups, 1)
		assert.Equal(t, "good_rule", actualGroups[0].Name)
		assert.Equal(t, "test", actualGroups[0].File)
	})
}

// ruleGroupMatcher returns an equality matcher on the "rule_group" label whose
// value is "/rules/<user>/<namespace>;<groupName>".
func ruleGroupMatcher(user, namespace, groupName string) *labels.Matcher {
	groupPath := fmt.Sprintf("/rules/%s/%s;%s", user, namespace, groupName)
	return labels.MustNewMatcher(labels.MatchEqual, "rule_group", groupPath)
}
Expand Down
6 changes: 6 additions & 0 deletions pkg/querier/blocks_store_queryable.go
Original file line number Diff line number Diff line change
Expand Up @@ -1116,6 +1116,12 @@ func isRetryableError(err error) bool {
switch status.Code(err) {
case codes.Unavailable:
return true
case codes.ResourceExhausted:
return errors.Is(err, storegateway.ErrTooManyInflightRequests)
// Client side connection closing, this error happens during store gateway deployment.
// https://github.com/grpc/grpc-go/blob/03172006f5d168fc646d87928d85cb9c4a480291/clientconn.go#L67
case codes.Canceled:
return strings.Contains(err.Error(), "grpc: the client connection is closing")
default:
return false
}
Expand Down
Loading

0 comments on commit 20ff2ca

Please sign in to comment.