From be1f450d3c777e0eaddbcdebed61eb9393c463d9 Mon Sep 17 00:00:00 2001
From: Marco Pracucci
Date: Thu, 21 Dec 2023 12:40:55 +0100
Subject: [PATCH] Jsonnet: improve querier HPA to scale up and down more gradually (#6971)

* Jsonnet: improve querier HPA

Signed-off-by: Marco Pracucci

* Fixed query

Signed-off-by: Marco Pracucci

* Added CHANGELOG entry

Signed-off-by: Marco Pracucci

* Addressed review comments

Signed-off-by: Marco Pracucci

---------

Signed-off-by: Marco Pracucci
---
 CHANGELOG.md                                   |  3 +-
 ...g-custom-target-utilization-generated.yaml  | 21 +++++-
 .../test-autoscaling-generated.yaml            | 21 +++++-
 ...ployment-mode-s3-autoscaled-generated.yaml  | 21 +++++-
 operations/mimir/autoscaling.libsonnet         | 75 +++++++++++++++++--
 .../autoscaling.libsonnet                      |  3 +-
 6 files changed, 128 insertions(+), 16 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 550033bf5c9..63dc2c7a773 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -34,7 +34,6 @@
 ### Jsonnet
 
 * [CHANGE] Querier: Increase `JAEGER_REPORTER_MAX_QUEUE_SIZE` from 1000 to 5000, to avoid dropping tracing spans. #6764
-* [ENHANCEMENT] Alerts: Add `MimirStoreGatewayTooManyFailedOperations` warning alert that triggers when Mimir store-gateway report error when interacting with the object storage. #6831
 * [FEATURE] Added support for the following root-level settings to configure the list of matchers to apply to node affinity: #6782 #6829
   * `alertmanager_node_affinity_matchers`
   * `compactor_node_affinity_matchers`
@@ -69,6 +68,8 @@
   * `store_gateway_zone_b_node_affinity_matchers`
   * `store_gateway_zone_c_node_affinity_matchers`
 * [FEATURE] Ingester: Allow automated zone-by-zone downscaling, that can be enabled via the `ingester_automated_downscale_enabled` flag. It is disabled by default. #6850
+* [ENHANCEMENT] Alerts: Add `MimirStoreGatewayTooManyFailedOperations` warning alert that triggers when Mimir store-gateway report error when interacting with the object storage. #6831
+* [ENHANCEMENT] Querier HPA: improved scaling metric and scaling policies, in order to scale up and down more gradually. #6971
 * [BUGFIX] Update memcached-exporter to 0.14.1 due to CVE-2023-39325. #6861
 
 ### Mimirtool
diff --git a/operations/mimir-tests/test-autoscaling-custom-target-utilization-generated.yaml b/operations/mimir-tests/test-autoscaling-custom-target-utilization-generated.yaml
index ba42cf94510..63ed7040d69 100644
--- a/operations/mimir-tests/test-autoscaling-custom-target-utilization-generated.yaml
+++ b/operations/mimir-tests/test-autoscaling-custom-target-utilization-generated.yaml
@@ -2035,9 +2035,19 @@ spec:
       behavior:
         scaleDown:
           policies:
-          - periodSeconds: 60
+          - periodSeconds: 120
             type: Percent
             value: 10
+          stabilizationWindowSeconds: 600
+        scaleUp:
+          policies:
+          - periodSeconds: 120
+            type: Percent
+            value: 50
+          - periodSeconds: 120
+            type: Pods
+            value: 15
+          stabilizationWindowSeconds: 60
   maxReplicaCount: 30
   minReplicaCount: 3
   pollingInterval: 10
@@ -2046,11 +2056,18 @@
   triggers:
   - metadata:
       metricName: cortex_querier_hpa_default
-      query: sum(max_over_time(cortex_query_scheduler_inflight_requests{container="query-scheduler",namespace="default",quantile="0.75"}[5m]))
+      query: sum(max_over_time(cortex_query_scheduler_inflight_requests{container="query-scheduler",namespace="default",quantile="0.5"}[1m]))
       serverAddress: http://prometheus.default:9090/prometheus
       threshold: "7"
     name: cortex_querier_hpa_default
     type: prometheus
+  - metadata:
+      metricName: cortex_querier_hpa_default_requests_duration
+      query: sum(rate(cortex_querier_request_duration_seconds_sum{container="querier",namespace="default"}[1m]))
+      serverAddress: http://prometheus.default:9090/prometheus
+      threshold: "7"
+    name: cortex_querier_hpa_default_requests_duration
+    type: prometheus
 ---
 apiVersion: keda.sh/v1alpha1
 kind: ScaledObject
diff --git a/operations/mimir-tests/test-autoscaling-generated.yaml b/operations/mimir-tests/test-autoscaling-generated.yaml
index 9db35f471cb..3a01e6f101f 100644
--- a/operations/mimir-tests/test-autoscaling-generated.yaml
+++ b/operations/mimir-tests/test-autoscaling-generated.yaml
@@ -2035,9 +2035,19 @@ spec:
       behavior:
         scaleDown:
           policies:
-          - periodSeconds: 60
+          - periodSeconds: 120
             type: Percent
             value: 10
+          stabilizationWindowSeconds: 600
+        scaleUp:
+          policies:
+          - periodSeconds: 120
+            type: Percent
+            value: 50
+          - periodSeconds: 120
+            type: Pods
+            value: 15
+          stabilizationWindowSeconds: 60
   maxReplicaCount: 30
   minReplicaCount: 3
   pollingInterval: 10
@@ -2046,11 +2056,18 @@
   triggers:
   - metadata:
       metricName: cortex_querier_hpa_default
-      query: sum(max_over_time(cortex_query_scheduler_inflight_requests{container="query-scheduler",namespace="default",quantile="0.75"}[5m]))
+      query: sum(max_over_time(cortex_query_scheduler_inflight_requests{container="query-scheduler",namespace="default",quantile="0.5"}[1m]))
       serverAddress: http://prometheus.default:9090/prometheus
       threshold: "6"
     name: cortex_querier_hpa_default
     type: prometheus
+  - metadata:
+      metricName: cortex_querier_hpa_default_requests_duration
+      query: sum(rate(cortex_querier_request_duration_seconds_sum{container="querier",namespace="default"}[1m]))
+      serverAddress: http://prometheus.default:9090/prometheus
+      threshold: "6"
+    name: cortex_querier_hpa_default_requests_duration
+    type: prometheus
 ---
 apiVersion: keda.sh/v1alpha1
 kind: ScaledObject
diff --git a/operations/mimir-tests/test-read-write-deployment-mode-s3-autoscaled-generated.yaml b/operations/mimir-tests/test-read-write-deployment-mode-s3-autoscaled-generated.yaml
index 1fd8f2fb88d..6e1c2c7f823 100644
--- a/operations/mimir-tests/test-read-write-deployment-mode-s3-autoscaled-generated.yaml
+++ b/operations/mimir-tests/test-read-write-deployment-mode-s3-autoscaled-generated.yaml
@@ -1848,9 +1848,19 @@ spec:
       behavior:
         scaleDown:
           policies:
-          - periodSeconds: 60
+          - periodSeconds: 120
             type: Percent
             value: 10
+          stabilizationWindowSeconds: 600
+        scaleUp:
+          policies:
+          - periodSeconds: 120
+            type: Percent
+            value: 50
+          - periodSeconds: 120
+            type: Pods
+            value: 15
+          stabilizationWindowSeconds: 60
   maxReplicaCount: 20
   minReplicaCount: 2
   pollingInterval: 10
@@ -1859,8 +1869,15 @@
   triggers:
   - metadata:
       metricName: cortex_mimir_read_hpa_default
-      query: sum(max_over_time(cortex_query_scheduler_inflight_requests{container="mimir-backend",namespace="default",quantile="0.75"}[5m]))
+      query: sum(max_over_time(cortex_query_scheduler_inflight_requests{container="mimir-backend",namespace="default",quantile="0.5"}[1m]))
       serverAddress: http://prometheus.default:9090/prometheus
       threshold: "6"
     name: cortex_mimir_read_hpa_default
     type: prometheus
+  - metadata:
+      metricName: cortex_mimir_read_hpa_default_requests_duration
+      query: sum(rate(cortex_querier_request_duration_seconds_sum{container="mimir-read",namespace="default"}[1m]))
+      serverAddress: http://prometheus.default:9090/prometheus
+      threshold: "6"
+    name: cortex_mimir_read_hpa_default_requests_duration
+    type: prometheus
diff --git a/operations/mimir/autoscaling.libsonnet b/operations/mimir/autoscaling.libsonnet
index c23f489e4e4..4de04958422 100644
--- a/operations/mimir/autoscaling.libsonnet
+++ b/operations/mimir/autoscaling.libsonnet
@@ -153,7 +153,7 @@
   // `weight` param can be used to control just a portion of the expected queriers with the generated scaled object.
   // For example, if you run multiple querier deployments on different node types, you can use the weight to control which portion of them runs on which nodes.
   // The weight is a number between 0 and 1, where 1 means 100% of the expected queriers.
-  newQuerierScaledObject(name, query_scheduler_container, querier_max_concurrent, min_replicas, max_replicas, target_utilization, weight=1):: self.newScaledObject(name, $._config.namespace, {
+  newQuerierScaledObject(name, query_scheduler_container_name, querier_container_name, querier_max_concurrent, min_replicas, max_replicas, target_utilization, weight=1):: self.newScaledObject(name, $._config.namespace, {
     min_replica_count: replicasWithWeight(min_replicas, weight),
     max_replica_count: replicasWithWeight(max_replicas, weight),
 
@@ -163,17 +163,75 @@
         // Each query scheduler tracks *at regular intervals* the number of inflight requests
         // (both enqueued and processing queries) as a summary. With the following query we target
-        // to have enough querier workers to run the max observed inflight requests 75% of time.
+        // to have enough querier workers to run the max observed inflight requests 50% of the time.
         //
-        // Instead of measuring it as instant query, we look at the max 75th percentile over the last
-        // 5 minutes. This allows us to scale up quickly, but scale down slowly (and not too early
-        // if within the next 5 minutes after a scale up we have further spikes).
-        query: metricWithWeight('sum(max_over_time(cortex_query_scheduler_inflight_requests{container="%s",namespace="%s",quantile="0.75"}[5m]))' % [query_scheduler_container, $._config.namespace], weight),
+        // This metric covers the case where queries are piling up in the query-scheduler queue.
+        query: metricWithWeight('sum(max_over_time(cortex_query_scheduler_inflight_requests{container="%s",namespace="%s",quantile="0.5"}[1m]))' % [query_scheduler_container_name, $._config.namespace], weight),
+
+        threshold: '%d' % std.floor(querier_max_concurrent * target_utilization),
+      },
+      {
+        metric_name: 'cortex_%s_hpa_%s_requests_duration' % [std.strReplace(name, '-', '_'), $._config.namespace],
+
+        // The total request duration per second is a good approximation of the number of querier workers used.
+        //
+        // This metric covers the case where queries are not necessarily piling up in the query-scheduler queue,
+        // but queriers are busy.
+        query: metricWithWeight('sum(rate(cortex_querier_request_duration_seconds_sum{container="%s",namespace="%s"}[1m]))' % [querier_container_name, $._config.namespace], weight),
 
         threshold: '%d' % std.floor(querier_max_concurrent * target_utilization),
       },
     ],
-  }),
+  }) + {
+    spec+: {
+      advanced: {
+        horizontalPodAutoscalerConfig: {
+          behavior: {
+            scaleUp: {
+              // When multiple policies are specified, the policy which allows the highest amount of change is the
+              // policy which is selected by default.
+              policies: [
+                {
+                  // Allow scaling up by at most 50% of pods every 2m. Why 2m? Because the metric looks back 1m and we
+                  // give another 1m to let new queriers start and process some backlog.
+                  //
+                  // This policy covers the case where we already have a high number of queriers running and adding +50%
+                  // in the span of 2m means adding a significant number of pods.
+                  type: 'Percent',
+                  value: 50,
+                  periodSeconds: 120,
+                },
+                {
+                  // Allow scaling up by at most 15 pods every 2m. Why 2m? Because the metric looks back 1m and we
+                  // give another 1m to let new queriers start and process some backlog.
+                  //
+                  // This policy covers the case where we currently have a small number of queriers (e.g. < 10) and
+                  // limiting the scaling by percentage may be too slow when scaling up.
+                  type: 'Pods',
+                  value: 15,
+                  periodSeconds: 120,
+                },
+              ],
+              // Scaling metrics query the last 1m, so after a scale up we should wait at least 1m before we re-evaluate
+              // them for a further scale up.
+              stabilizationWindowSeconds: 60,
+            },
+            scaleDown: {
+              policies: [{
+                // Allow scaling down by at most 10% of pods every 2m.
+                type: 'Percent',
+                value: 10,
+                periodSeconds: 120,
+              }],
+              // Reduce the likelihood of flapping replicas. When the metrics indicate that the target should be scaled
+              // down, the HPA looks at previously computed desired states and uses the highest value from the last 10m.
+              stabilizationWindowSeconds: 600,
+            },
+          },
+        },
+      },
+    },
+  },
 
   // To scale out relatively quickly, but scale in slower, we look at the average CPU utilization
   // per replica over 5m (rolling window) and then we pick the highest value over the last 15m.
@@ -309,7 +367,8 @@
   querier_scaled_object: if !$._config.autoscaling_querier_enabled then null else
     self.newQuerierScaledObject(
       name='querier',
-      query_scheduler_container='query-scheduler',
+      query_scheduler_container_name='query-scheduler',
+      querier_container_name='querier',
       querier_max_concurrent=$.querier_args['querier.max-concurrent'],
       min_replicas=$._config.autoscaling_querier_min_replicas,
       max_replicas=$._config.autoscaling_querier_max_replicas,
diff --git a/operations/mimir/read-write-deployment/autoscaling.libsonnet b/operations/mimir/read-write-deployment/autoscaling.libsonnet
index 643c1f7953e..935311c98cf 100644
--- a/operations/mimir/read-write-deployment/autoscaling.libsonnet
+++ b/operations/mimir/read-write-deployment/autoscaling.libsonnet
@@ -16,7 +16,8 @@
     // be a sufficient indication of load on the read path.
     self.newQuerierScaledObject(
       name='mimir-read',
-      query_scheduler_container='mimir-backend',
+      query_scheduler_container_name='mimir-backend',
+      querier_container_name='mimir-read',
       querier_max_concurrent=$.querier_args['querier.max-concurrent'],
       min_replicas=$._config.autoscaling_mimir_read_min_replicas,
       max_replicas=$._config.autoscaling_mimir_read_max_replicas,
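
For context, a minimal sketch of how the reworked querier autoscaling could be enabled from an environment's jsonnet entrypoint. The import path, the `_config` field names and the example values below are assumptions based on the existing autoscaling configuration this patch builds on, not something introduced by this change.

  // Hypothetical main.jsonnet for an environment; replica bounds and Prometheus URL are illustrative only.
  local mimir = import 'mimir/mimir.libsonnet';

  mimir {
    _config+:: {
      // Enables generation of the querier ScaledObject shown in this patch.
      autoscaling_querier_enabled: true,
      autoscaling_querier_min_replicas: 3,
      autoscaling_querier_max_replicas: 30,

      // Prometheus endpoint queried by KEDA to evaluate the two scaling metrics
      // (query-scheduler inflight requests and querier request duration rate).
      autoscaling_prometheus_url: 'http://prometheus.default:9090/prometheus',
    },
  }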