Jsonnet: improve querier HPA to scale up and down more gradually (#6971)
* Jsonnet: improve querier HPA

Signed-off-by: Marco Pracucci <[email protected]>

* Fixed query

Signed-off-by: Marco Pracucci <[email protected]>

* Added CHANGELOG entry

Signed-off-by: Marco Pracucci <[email protected]>

* Addressed review comments

Signed-off-by: Marco Pracucci <[email protected]>

---------

Signed-off-by: Marco Pracucci <[email protected]>
pracucci authored Dec 21, 2023
1 parent 324843d commit be1f450
Showing 6 changed files with 128 additions and 16 deletions.
CHANGELOG.md: 3 changes (2 additions & 1 deletion)
@@ -34,7 +34,6 @@
### Jsonnet

* [CHANGE] Querier: Increase `JAEGER_REPORTER_MAX_QUEUE_SIZE` from 1000 to 5000, to avoid dropping tracing spans. #6764
- * [ENHANCEMENT] Alerts: Add `MimirStoreGatewayTooManyFailedOperations` warning alert that triggers when Mimir store-gateway report error when interacting with the object storage. #6831
* [FEATURE] Added support for the following root-level settings to configure the list of matchers to apply to node affinity: #6782 #6829
* `alertmanager_node_affinity_matchers`
* `compactor_node_affinity_matchers`
@@ -69,6 +68,8 @@
* `store_gateway_zone_b_node_affinity_matchers`
* `store_gateway_zone_c_node_affinity_matchers`
* [FEATURE] Ingester: Allow automated zone-by-zone downscaling, that can be enabled via the `ingester_automated_downscale_enabled` flag. It is disabled by default. #6850
+ * [ENHANCEMENT] Alerts: Add `MimirStoreGatewayTooManyFailedOperations` warning alert that triggers when the Mimir store-gateway reports errors when interacting with the object storage. #6831
+ * [ENHANCEMENT] Querier HPA: improved scaling metric and scaling policies, to scale up and down more gradually. #6971
* [BUGFIX] Update memcached-exporter to 0.14.1 due to CVE-2023-39325. #6861

### Mimirtool
@@ -2035,9 +2035,19 @@ spec:
behavior:
scaleDown:
policies:
- - periodSeconds: 60
+ - periodSeconds: 120
type: Percent
value: 10
stabilizationWindowSeconds: 600
scaleUp:
policies:
- periodSeconds: 120
type: Percent
value: 50
- periodSeconds: 120
type: Pods
value: 15
stabilizationWindowSeconds: 60
maxReplicaCount: 30
minReplicaCount: 3
pollingInterval: 10
@@ -2046,11 +2056,18 @@
triggers:
- metadata:
metricName: cortex_querier_hpa_default
- query: sum(max_over_time(cortex_query_scheduler_inflight_requests{container="query-scheduler",namespace="default",quantile="0.75"}[5m]))
+ query: sum(max_over_time(cortex_query_scheduler_inflight_requests{container="query-scheduler",namespace="default",quantile="0.5"}[1m]))
serverAddress: http://prometheus.default:9090/prometheus
threshold: "7"
name: cortex_querier_hpa_default
type: prometheus
- metadata:
metricName: cortex_querier_hpa_default_requests_duration
query: sum(rate(cortex_querier_request_duration_seconds_sum{container="querier",namespace="default"}[1m]))
serverAddress: http://prometheus.default:9090/prometheus
threshold: "7"
name: cortex_querier_hpa_default_requests_duration
type: prometheus
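Taken together, the two triggers above drive the replica count the way the HPA treats KEDA Prometheus metrics: each query result is compared against its threshold as an average value per replica, so each trigger yields a desired replica count of ceil(value / threshold), and the HPA follows the highest of them. A minimal sketch of that arithmetic (the metric values below are hypothetical, not taken from a real cluster):

```python
import math

def desired_replicas(metric_values, threshold):
    # Per HPA semantics for AverageValue metrics: each trigger asks for
    # ceil(value / threshold) replicas; the HPA picks the highest demand.
    return max(math.ceil(v / threshold) for v in metric_values)

# e.g. 40 observed inflight requests and 21 querier-seconds/s of request
# duration, against the threshold of 7 from this manifest:
print(desired_replicas([40, 21], 7))  # -> 6
```

This is why the second trigger was added: whichever signal is higher wins, so busy queriers can still scale the fleet up even when the query-scheduler queue happens to be empty.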
---
apiVersion: keda.sh/v1alpha1
kind: ScaledObject
operations/mimir-tests/test-autoscaling-generated.yaml: 21 changes (19 additions & 2 deletions)
@@ -2035,9 +2035,19 @@ spec:
behavior:
scaleDown:
policies:
- - periodSeconds: 60
+ - periodSeconds: 120
type: Percent
value: 10
stabilizationWindowSeconds: 600
scaleUp:
policies:
- periodSeconds: 120
type: Percent
value: 50
- periodSeconds: 120
type: Pods
value: 15
stabilizationWindowSeconds: 60
maxReplicaCount: 30
minReplicaCount: 3
pollingInterval: 10
@@ -2046,11 +2056,18 @@
triggers:
- metadata:
metricName: cortex_querier_hpa_default
- query: sum(max_over_time(cortex_query_scheduler_inflight_requests{container="query-scheduler",namespace="default",quantile="0.75"}[5m]))
+ query: sum(max_over_time(cortex_query_scheduler_inflight_requests{container="query-scheduler",namespace="default",quantile="0.5"}[1m]))
serverAddress: http://prometheus.default:9090/prometheus
threshold: "6"
name: cortex_querier_hpa_default
type: prometheus
- metadata:
metricName: cortex_querier_hpa_default_requests_duration
query: sum(rate(cortex_querier_request_duration_seconds_sum{container="querier",namespace="default"}[1m]))
serverAddress: http://prometheus.default:9090/prometheus
threshold: "6"
name: cortex_querier_hpa_default_requests_duration
type: prometheus
---
apiVersion: keda.sh/v1alpha1
kind: ScaledObject
@@ -1848,9 +1848,19 @@ spec:
behavior:
scaleDown:
policies:
- - periodSeconds: 60
+ - periodSeconds: 120
type: Percent
value: 10
stabilizationWindowSeconds: 600
scaleUp:
policies:
- periodSeconds: 120
type: Percent
value: 50
- periodSeconds: 120
type: Pods
value: 15
stabilizationWindowSeconds: 60
maxReplicaCount: 20
minReplicaCount: 2
pollingInterval: 10
@@ -1859,8 +1869,15 @@ spec:
triggers:
- metadata:
metricName: cortex_mimir_read_hpa_default
- query: sum(max_over_time(cortex_query_scheduler_inflight_requests{container="mimir-backend",namespace="default",quantile="0.75"}[5m]))
+ query: sum(max_over_time(cortex_query_scheduler_inflight_requests{container="mimir-backend",namespace="default",quantile="0.5"}[1m]))
serverAddress: http://prometheus.default:9090/prometheus
threshold: "6"
name: cortex_mimir_read_hpa_default
type: prometheus
- metadata:
metricName: cortex_mimir_read_hpa_default_requests_duration
query: sum(rate(cortex_querier_request_duration_seconds_sum{container="mimir-read",namespace="default"}[1m]))
serverAddress: http://prometheus.default:9090/prometheus
threshold: "6"
name: cortex_mimir_read_hpa_default_requests_duration
type: prometheus
operations/mimir/autoscaling.libsonnet: 75 changes (67 additions & 8 deletions)
@@ -153,7 +153,7 @@
// `weight` param can be used to control just a portion of the expected queriers with the generated scaled object.
// For example, if you run multiple querier deployments on different node types, you can use the weight to control which portion of them runs on which nodes.
// The weight is a number between 0 and 1, where 1 means 100% of the expected queriers.
- newQuerierScaledObject(name, query_scheduler_container, querier_max_concurrent, min_replicas, max_replicas, target_utilization, weight=1):: self.newScaledObject(name, $._config.namespace, {
+ newQuerierScaledObject(name, query_scheduler_container_name, querier_container_name, querier_max_concurrent, min_replicas, max_replicas, target_utilization, weight=1):: self.newScaledObject(name, $._config.namespace, {
min_replica_count: replicasWithWeight(min_replicas, weight),
max_replica_count: replicasWithWeight(max_replicas, weight),

@@ -163,17 +163,75 @@

// Each query scheduler tracks *at regular intervals* the number of inflight requests
// (both enqueued and processing queries) as a summary. With the following query we target
- // to have enough querier workers to run the max observed inflight requests 75% of time.
+ // to have enough querier workers to run the max observed inflight requests 50% of time.
//
- // Instead of measuring it as instant query, we look at the max 75th percentile over the last
- // 5 minutes. This allows us to scale up quickly, but scale down slowly (and not too early
- // if within the next 5 minutes after a scale up we have further spikes).
- query: metricWithWeight('sum(max_over_time(cortex_query_scheduler_inflight_requests{container="%s",namespace="%s",quantile="0.75"}[5m]))' % [query_scheduler_container, $._config.namespace], weight),
+ // This metric covers the case where queries are piling up in the query-scheduler queue.
+ query: metricWithWeight('sum(max_over_time(cortex_query_scheduler_inflight_requests{container="%s",namespace="%s",quantile="0.5"}[1m]))' % [query_scheduler_container_name, $._config.namespace], weight),

threshold: '%d' % std.floor(querier_max_concurrent * target_utilization),
},
{
metric_name: 'cortex_%s_hpa_%s_requests_duration' % [std.strReplace(name, '-', '_'), $._config.namespace],

// The total request duration per second is a good approximation of the number of querier workers in use.
//
// This metric covers the case where queries are not necessarily piling up in the query-scheduler
// queue, but queriers are busy.
query: metricWithWeight('sum(rate(cortex_querier_request_duration_seconds_sum{container="%s",namespace="%s"}[1m]))' % [querier_container_name, $._config.namespace], weight),

threshold: '%d' % std.floor(querier_max_concurrent * target_utilization),
},
],
}),
}) + {
spec+: {
advanced: {
horizontalPodAutoscalerConfig: {
behavior: {
scaleUp: {
// When multiple policies are specified, the policy that allows the greatest amount of change is
// selected by default.
policies: [
{
// Allow scaling up by at most 50% of pods every 2m. Why 2m? Because the metric looks back 1m and we
// give another 1m to let new queriers start and process some backlog.
//
// This policy covers the case where we already have a high number of queriers running, so adding +50%
// in the span of 2m means adding a significant number of pods.
type: 'Percent',
value: 50,
periodSeconds: 120,
},
{
// Allow scaling up by at most 15 pods every 2m. Why 2m? Because the metric looks back 1m and we
// give another 1m to let new queriers start and process some backlog.
//
// This policy covers the case where we currently have a small number of queriers (e.g. < 10) and
// limiting the scaling by percentage may be too slow when scaling up.
type: 'Pods',
value: 15,
periodSeconds: 120,
},
],
// Scaling metrics query the last 1m, so after a scale up we should wait at least 1m before we re-evaluate
// them for a further scale up.
stabilizationWindowSeconds: 60,
},
scaleDown: {
policies: [{
// Allow scaling down by at most 10% of pods every 2m.
type: 'Percent',
value: 10,
periodSeconds: 120,
}],
// Reduce the likelihood of flapping replicas. When the metrics indicate that the target should be scaled
// down, HPA looks into previously computed desired states, and uses the highest value from the last 10m.
stabilizationWindowSeconds: 600,
},
},
},
},
},
},
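The effect of pairing the Percent and Pods scale-up policies above can be sketched with a small helper (hypothetical, not part of the Mimir codebase): on scale up, the HPA applies whichever policy permits the larger change within the 2m period.

```python
import math

def max_replicas_after_scale_up(current_replicas, percent=50, pods=15):
    # With multiple scale-up policies, the HPA selects the one allowing
    # the greatest change: +50% of current pods, or +15 pods flat.
    allowed = max(math.ceil(current_replicas * percent / 100), pods)
    return current_replicas + allowed

print(max_replicas_after_scale_up(6))   # -> 21 (Pods policy dominates)
print(max_replicas_after_scale_up(40))  # -> 60 (Percent policy dominates)
```

Small fleets therefore grow by a fixed step while large fleets grow proportionally, which is the rationale spelled out in the code comments.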

// To scale out relatively quickly, but scale in slower, we look at the average CPU utilization
// per replica over 5m (rolling window) and then we pick the highest value over the last 15m.
@@ -309,7 +367,8 @@
querier_scaled_object: if !$._config.autoscaling_querier_enabled then null else
self.newQuerierScaledObject(
name='querier',
- query_scheduler_container='query-scheduler',
+ query_scheduler_container_name='query-scheduler',
+ querier_container_name='querier',
querier_max_concurrent=$.querier_args['querier.max-concurrent'],
min_replicas=$._config.autoscaling_querier_min_replicas,
max_replicas=$._config.autoscaling_querier_max_replicas,
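The `threshold` emitted into the generated manifests comes from the Jsonnet above: `std.floor(querier_max_concurrent * target_utilization)`. As an illustration only (the inputs below are hypothetical values chosen to reproduce the threshold of 6 seen in one of the generated test manifests, not confirmed Mimir defaults):

```python
import math

# Hypothetical inputs; the real values come from the Mimir jsonnet config.
querier_max_concurrent = 8   # -querier.max-concurrent
target_utilization = 0.75

threshold = math.floor(querier_max_concurrent * target_utilization)
print(threshold)  # -> 6
```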
operations/mimir/read-write-deployment/autoscaling.libsonnet: 3 changes (2 additions & 1 deletion)
@@ -16,7 +16,8 @@
// be a sufficient indication of load on the read path.
self.newQuerierScaledObject(
name='mimir-read',
- query_scheduler_container='mimir-backend',
+ query_scheduler_container_name='mimir-backend',
+ querier_container_name='mimir-read',
querier_max_concurrent=$.querier_args['querier.max-concurrent'],
min_replicas=$._config.autoscaling_mimir_read_min_replicas,
max_replicas=$._config.autoscaling_mimir_read_max_replicas,
