From 5a7228e61ea5d1c38dde7753e3ca88f382380c02 Mon Sep 17 00:00:00 2001 From: Ben Ye Date: Wed, 15 Nov 2023 18:40:42 -0800 Subject: [PATCH] Add querier.max-subquery-steps to make subquery step size check optional (#5656) * add querier.max-subquery-steps to make subquery step size check optional Signed-off-by: Ben Ye * update Signed-off-by: Ben Ye * disable subquery step size check by default, make it optional Signed-off-by: Ben Ye * fix integ test and add changelog Signed-off-by: Ben Ye --------- Signed-off-by: Ben Ye --- CHANGELOG.md | 1 + docs/blocks-storage/querier.md | 5 ++ docs/configuration/config-file-reference.md | 5 ++ integration/query_frontend_test.go | 5 ++ pkg/cortex/modules.go | 1 + pkg/querier/querier.go | 4 ++ .../query_range_middlewares_test.go | 1 + pkg/querier/tripperware/roundtrip.go | 5 +- pkg/querier/tripperware/roundtrip_test.go | 72 ++++++++++++------- 9 files changed, 73 insertions(+), 26 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index dba1248b2c..24b9cb02f6 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,7 @@ ## master / unreleased * [CHANGE] Azure Storage: Upgraded objstore dependency and support Azure Workload Identity Authentication. Added `connection_string` to support authenticating via SAS token. Marked `msi_resource` config as deprecating. #5645 +* [CHANGE] Query Frontend: Expose `-querier.max-subquery-steps` to configure subquery max steps check. By default, the limit is set to 0, which is disabled. #5656 * [FEATURE] Ingester: Add per-tenant new metric `cortex_ingester_tsdb_data_replay_duration_seconds`. #5477 * [ENHANCEMENT] Store Gateway: Added `-store-gateway.enabled-tenants` and `-store-gateway.disabled-tenants` to explicitly enable or disable store-gateway for specific tenants. #5638 * [BUGFIX] Query Frontend: Fix query string being omitted in query stats log. #5655 diff --git a/docs/blocks-storage/querier.md b/docs/blocks-storage/querier.md index ae9360b501..b1bdd3a90c 100644 --- a/docs/blocks-storage/querier.md +++ b/docs/blocks-storage/querier.md @@ -157,6 +157,11 @@ querier: # CLI flag: -querier.default-evaluation-interval [default_evaluation_interval: | default = 1m] + # Max number of steps allowed for every subquery expression in query. Number + # of steps is calculated using subquery range / step. A value > 0 enables it. + # CLI flag: -querier.max-subquery-steps + [max_subquery_steps: | default = 0] + # Active query tracker monitors active queries, and writes them to the file in # given directory. If Cortex discovers any queries in this log during startup, # it will log them to the log file. Setting to empty value disables active diff --git a/docs/configuration/config-file-reference.md b/docs/configuration/config-file-reference.md index 2958ad71a6..e0f5871201 100644 --- a/docs/configuration/config-file-reference.md +++ b/docs/configuration/config-file-reference.md @@ -3436,6 +3436,11 @@ The `querier_config` configures the Cortex querier. # CLI flag: -querier.default-evaluation-interval [default_evaluation_interval: | default = 1m] +# Max number of steps allowed for every subquery expression in query. Number of +# steps is calculated using subquery range / step. A value > 0 enables it. +# CLI flag: -querier.max-subquery-steps +[max_subquery_steps: | default = 0] + # Active query tracker monitors active queries, and writes them to the file in # given directory. If Cortex discovers any queries in this log during startup, # it will log them to the log file. Setting to empty value disables active query diff --git a/integration/query_frontend_test.go b/integration/query_frontend_test.go index 0dbbadeb10..5b3c0029c0 100644 --- a/integration/query_frontend_test.go +++ b/integration/query_frontend_test.go @@ -220,6 +220,11 @@ func TestQueryFrontendSubQueryStepSize(t *testing.T) { minio := e2edb.NewMinio(9000, BlocksStorageFlags()["-blocks-storage.s3.bucket-name"]) require.NoError(t, s.StartAndWaitReady(minio)) + + // Enable subquery step size check. + flags = mergeFlags(e2e.EmptyFlags(), map[string]string{ + "-querier.max-subquery-steps": "11000", + }) return cortexConfigFile, flags }, }) diff --git a/pkg/cortex/modules.go b/pkg/cortex/modules.go index 64cce59870..7858483c68 100644 --- a/pkg/cortex/modules.go +++ b/pkg/cortex/modules.go @@ -486,6 +486,7 @@ func (t *Cortex) initQueryFrontendTripperware() (serv services.Service, err erro t.Overrides, queryAnalyzer, t.Cfg.Querier.DefaultEvaluationInterval, + t.Cfg.Querier.MaxSubQuerySteps, ) return services.NewIdleService(nil, func(_ error) error { diff --git a/pkg/querier/querier.go b/pkg/querier/querier.go index df13fb7ce6..d66697674d 100644 --- a/pkg/querier/querier.go +++ b/pkg/querier/querier.go @@ -64,6 +64,9 @@ type Config struct { // step if not specified. DefaultEvaluationInterval time.Duration `yaml:"default_evaluation_interval"` + // Limit of number of steps allowed for every subquery expression in a query. + MaxSubQuerySteps int64 `yaml:"max_subquery_steps"` + // Directory for ActiveQueryTracker. If empty, ActiveQueryTracker will be disabled and MaxConcurrent will not be applied (!). // ActiveQueryTracker logs queries that were active during the last crash, but logs them on the next startup. // However, we need to use active query tracker, otherwise we cannot limit Max Concurrent queries in the PromQL @@ -114,6 +117,7 @@ func (cfg *Config) RegisterFlags(f *flag.FlagSet) { f.DurationVar(&cfg.LookbackDelta, "querier.lookback-delta", 5*time.Minute, "Time since the last sample after which a time series is considered stale and ignored by expression evaluations.") f.DurationVar(&cfg.ShuffleShardingIngestersLookbackPeriod, "querier.shuffle-sharding-ingesters-lookback-period", 0, "When distributor's sharding strategy is shuffle-sharding and this setting is > 0, queriers fetch in-memory series from the minimum set of required ingesters, selecting only ingesters which may have received series since 'now - lookback period'. The lookback period should be greater or equal than the configured 'query store after' and 'query ingesters within'. If this setting is 0, queriers always query all ingesters (ingesters shuffle sharding on read path is disabled).") f.BoolVar(&cfg.ThanosEngine, "querier.thanos-engine", false, "Experimental. Use Thanos promql engine https://github.com/thanos-io/promql-engine rather than the Prometheus promql engine.") + f.Int64Var(&cfg.MaxSubQuerySteps, "querier.max-subquery-steps", 0, "Max number of steps allowed for every subquery expression in query. Number of steps is calculated using subquery range / step. A value > 0 enables it.") } // Validate the config diff --git a/pkg/querier/tripperware/queryrange/query_range_middlewares_test.go b/pkg/querier/tripperware/queryrange/query_range_middlewares_test.go index 5ade2abd52..6276be72bd 100644 --- a/pkg/querier/tripperware/queryrange/query_range_middlewares_test.go +++ b/pkg/querier/tripperware/queryrange/query_range_middlewares_test.go @@ -74,6 +74,7 @@ func TestRoundTrip(t *testing.T) { nil, qa, time.Minute, + 0, ) for i, tc := range []struct { diff --git a/pkg/querier/tripperware/roundtrip.go b/pkg/querier/tripperware/roundtrip.go index 93fa1234cd..6aefe4ccec 100644 --- a/pkg/querier/tripperware/roundtrip.go +++ b/pkg/querier/tripperware/roundtrip.go @@ -103,6 +103,7 @@ func NewQueryTripperware( limits Limits, queryAnalyzer querysharding.Analyzer, defaultSubQueryInterval time.Duration, + maxSubQuerySteps int64, ) Tripperware { // Per tenant query metrics. queriesPerTenant := promauto.With(registerer).NewCounterVec(prometheus.CounterOpts{ @@ -145,10 +146,10 @@ func NewQueryTripperware( activeUsers.UpdateUserTimestamp(userStr, time.Now()) queriesPerTenant.WithLabelValues(op, userStr).Inc() - if isQuery || isQueryRange { + if maxSubQuerySteps > 0 && (isQuery || isQueryRange) { query := r.FormValue("query") // Check subquery step size. - if err := SubQueryStepSizeCheck(query, defaultSubQueryInterval, MaxStep); err != nil { + if err := SubQueryStepSizeCheck(query, defaultSubQueryInterval, maxSubQuerySteps); err != nil { return nil, err } } diff --git a/pkg/querier/tripperware/roundtrip_test.go b/pkg/querier/tripperware/roundtrip_test.go index 8497491ba6..e52514ee8d 100644 --- a/pkg/querier/tripperware/roundtrip_test.go +++ b/pkg/querier/tripperware/roundtrip_test.go @@ -109,46 +109,69 @@ func TestRoundTrip(t *testing.T) { path, expectedBody string expectedErr error limits Limits + maxSubQuerySteps int64 }{ { - path: "/foo", - expectedBody: "bar", - limits: defaultOverrides, + path: "/foo", + expectedBody: "bar", + limits: defaultOverrides, + maxSubQuerySteps: 11000, }, { - path: queryExemplar, - expectedBody: "bar", - limits: defaultOverrides, + path: queryExemplar, + expectedBody: "bar", + limits: defaultOverrides, + maxSubQuerySteps: 11000, }, { - path: queryRange, - expectedBody: responseBody, - limits: defaultOverrides, + path: queryRange, + expectedBody: responseBody, + limits: defaultOverrides, + maxSubQuerySteps: 11000, }, { - path: query, - expectedBody: "bar", - limits: defaultOverrides, + path: query, + expectedBody: "bar", + limits: defaultOverrides, + maxSubQuerySteps: 11000, }, { - path: queryNonShardable, - expectedBody: "bar", - limits: defaultOverrides, + path: queryNonShardable, + expectedBody: "bar", + limits: defaultOverrides, + maxSubQuerySteps: 11000, }, { - path: queryNonShardable, - expectedBody: "bar", - limits: shardingOverrides, + path: queryNonShardable, + expectedBody: "bar", + limits: shardingOverrides, + maxSubQuerySteps: 11000, }, { - path: query, - expectedBody: responseBody, - limits: shardingOverrides, + path: query, + expectedBody: responseBody, + limits: shardingOverrides, + maxSubQuerySteps: 11000, }, + // Shouldn't hit subquery step limit because max steps is set to 0 so this check is disabled. { - path: querySubqueryStepSizeTooSmall, - expectedErr: httpgrpc.Errorf(http.StatusBadRequest, ErrSubQueryStepTooSmall, 11000), - limits: defaultOverrides, + path: querySubqueryStepSizeTooSmall, + expectedBody: "bar", + limits: defaultOverrides, + maxSubQuerySteps: 0, + }, + // Shouldn't hit subquery step limit because max steps is higher, which is 100K. + { + path: querySubqueryStepSizeTooSmall, + expectedBody: "bar", + limits: defaultOverrides, + maxSubQuerySteps: 100000, + }, + { + path: querySubqueryStepSizeTooSmall, + expectedErr: httpgrpc.Errorf(http.StatusBadRequest, ErrSubQueryStepTooSmall, 11000), + limits: defaultOverrides, + maxSubQuerySteps: 11000, }, } { t.Run(tc.path, func(t *testing.T) { @@ -177,6 +200,7 @@ func TestRoundTrip(t *testing.T) { tc.limits, querysharding.NewQueryAnalyzer(), time.Minute, + tc.maxSubQuerySteps, ) resp, err := tw(downstream).RoundTrip(req) if tc.expectedErr == nil {