From 2b1089a8b7191da3636812646cd4db86aaa220f2 Mon Sep 17 00:00:00 2001 From: Theo Brigitte Date: Mon, 2 Sep 2024 18:56:34 +0200 Subject: [PATCH 1/5] add MimirHPANeedsToBeScaledUp alert --- CHANGELOG.md | 4 +++ .../atlas/alerting-rules/mimir.rules.yml | 26 +++++++++++++++++++ 2 files changed, 30 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 36dd8bba..5821ac85 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +### Added + +- Add `MimirHPANeedsToBeScaledUp` alert, to detect when Mimir's HPAs have reached maximum capacity. + ### Changed - alertmanager alerts: add link to dashboard diff --git a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/mimir.rules.yml b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/mimir.rules.yml index 41535395..4c87c592 100644 --- a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/mimir.rules.yml +++ b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/mimir.rules.yml @@ -137,4 +137,30 @@ spec: severity: page team: atlas topic: observability + - alert: MimirHPANeedsToBeScaledUp + annotations: + description: '{{`Mimir {{ $labels.horizontalpodautoscaler }} HPA has reached maximum replicas and consume too much resources, it needs to be scaled up.`}}' + opsrecipe: mimir-ingester/ + expr: |- + ( + kube_horizontalpodautoscaler_status_desired_replicas{namespace="mimir"} / + on(cluster_id, customer, installation, namespace, horizontalpodautoscaler) + kube_horizontalpodautoscaler_spec_max_replicas{namespace="mimir"} >= 1.0 + ) + and on(cluster_id, customer, installation, namespace, horizontalpodautoscaler) + ( + kube_horizontalpodautoscaler_status_target_metric{namespace="mimir"} > + on(cluster_id, customer, installation, namespace, horizontalpodautoscaler, metric_name, metric_target_type) + kube_horizontalpodautoscaler_spec_target_metric{namespace="mimir"} + ) + for: 30m + labels:
+ area: platform + cancel_if_cluster_status_creating: "true" + cancel_if_cluster_status_deleting: "true" + cancel_if_cluster_status_updating: "true" + cancel_if_outside_working_hours: "true" + severity: page + team: atlas + topic: observability {{- end }} From 5276895723d59b94b7a083b80a2ced0d93be76e6 Mon Sep 17 00:00:00 2001 From: Theo Brigitte Date: Mon, 2 Sep 2024 19:03:03 +0200 Subject: [PATCH 2/5] simple comparison over percentage calculation --- .../templates/platform/atlas/alerting-rules/mimir.rules.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/mimir.rules.yml b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/mimir.rules.yml index 4c87c592..9311a937 100644 --- a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/mimir.rules.yml +++ b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/mimir.rules.yml @@ -143,9 +143,9 @@ spec: opsrecipe: mimir-ingester/ expr: |- ( - kube_horizontalpodautoscaler_status_desired_replicas{namespace="mimir"} / + kube_horizontalpodautoscaler_status_desired_replicas{namespace="mimir"} >= on(cluster_id, customer, installation, namespace, horizontalpodautoscaler) - kube_horizontalpodautoscaler_spec_max_replicas{namespace="mimir"} >= 1.0 + kube_horizontalpodautoscaler_spec_max_replicas{namespace="mimir"} ) and on(cluster_id, customer, installation, namespace, horizontalpodautoscaler) ( From 84d79640d5fa2225072c5e1bcc63ad2ab7e19bc2 Mon Sep 17 00:00:00 2001 From: Theo Brigitte Date: Tue, 3 Sep 2024 17:08:57 +0200 Subject: [PATCH 3/5] rename to MimirHPAReachedMaxReplicas --- CHANGELOG.md | 2 +- .../templates/platform/atlas/alerting-rules/mimir.rules.yml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 5821ac85..57cccbc0 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,7 +9,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ###
Added -- Add `MimirHPANeedsToBeScaledUp` alert, to detect when Mimir's HPAs have reached maximum capacity. +- Add `MimirHPAReachedMaxReplicas` alert, to detect when Mimir's HPAs have reached maximum capacity. ### Changed diff --git a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/mimir.rules.yml b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/mimir.rules.yml index 9311a937..96978384 100644 --- a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/mimir.rules.yml +++ b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/mimir.rules.yml @@ -137,7 +137,7 @@ spec: severity: page team: atlas topic: observability - - alert: MimirHPANeedsToBeScaledUp + - alert: MimirHPAReachedMaxReplicas annotations: description: '{{`Mimir {{ $labels.horizontalpodautoscaler }} HPA has reached maximum replicas and consume too much resources, it needs to be scaled up.`}}' opsrecipe: mimir-ingester/ From df35a5ce9c802a813efb3ad5d5564aa9461b31eb Mon Sep 17 00:00:00 2001 From: Theo Brigitte Date: Sun, 8 Sep 2024 14:02:42 +0200 Subject: [PATCH 4/5] add unit tests for MimirHPAReachedMaxReplicas --- .../atlas/alerting-rules/mimir.rules.test.yml | 62 +++++++++++++++++++ 1 file changed, 62 insertions(+) diff --git a/test/tests/providers/capi/capa-mimir/platform/atlas/alerting-rules/mimir.rules.test.yml b/test/tests/providers/capi/capa-mimir/platform/atlas/alerting-rules/mimir.rules.test.yml index eed18727..51234506 100644 --- a/test/tests/providers/capi/capa-mimir/platform/atlas/alerting-rules/mimir.rules.test.yml +++ b/test/tests/providers/capi/capa-mimir/platform/atlas/alerting-rules/mimir.rules.test.yml @@ -318,3 +318,65 @@ tests: opsrecipe: "mimir-ingester/" - alertname: MimirIngesterNeedsToBeScaledDown eval_time: 280m + - interval: 1m + input_series: + # HPA max replicas = 3 for the whole test + # HPA target metric = 90% for the whole test + # Cases: + # desired_replicas < max_replicas AND current_utilization < target_utilization does not fire +
# desired_replicas < max_replicas AND current_utilization = target_utilization does not fire + # desired_replicas < max_replicas AND current_utilization > target_utilization does not fire + # desired_replicas = max_replicas AND current_utilization < target_utilization does not fire + # desired_replicas = max_replicas AND current_utilization = target_utilization does not fire + # desired_replicas = max_replicas AND current_utilization > target_utilization does fire + # desired_replicas > max_replicas AND current_utilization < target_utilization does not fire + # desired_replicas > max_replicas AND current_utilization = target_utilization does not fire + # desired_replicas > max_replicas AND current_utilization > target_utilization does fire + - series: 'kube_horizontalpodautoscaler_spec_max_replicas{horizontalpodautoscaler="mimir-distributor", namespace="mimir"}' + values: '3+0x360' + - series: 'kube_horizontalpodautoscaler_status_desired_replicas{horizontalpodautoscaler="mimir-distributor", namespace="mimir"}' + values: '2+0x120 3+0x120 4+0x120' + - series: 'kube_horizontalpodautoscaler_spec_target_metric{horizontalpodautoscaler="mimir-distributor", namespace="mimir", metric_name="cpu", metric_target_type="utilization"}' + values: '90+0x360' + # HPA current metric cycles 80% -> 90% -> 100% three times, each value held for 41 samples (~40mn) + - series: 'kube_horizontalpodautoscaler_status_target_metric{horizontalpodautoscaler="mimir-distributor", namespace="mimir", metric_name="cpu", metric_target_type="utilization"}' + values: '80+0x40 90+0x40 100+0x40 80+0x40 90+0x40 100+0x40 80+0x40 90+0x40 100+0x40' + alert_rule_test: + - alertname: MimirHPAReachedMaxReplicas + eval_time: 234m + - alertname: MimirHPAReachedMaxReplicas + eval_time: 235m + exp_alerts: + - exp_labels: + area: platform + cancel_if_cluster_status_creating: "true" + cancel_if_cluster_status_deleting: "true" + cancel_if_cluster_status_updating: "true" + cancel_if_outside_working_hours: "true" + severity: page + team: atlas + topic:
observability + horizontalpodautoscaler: mimir-distributor + namespace: mimir + exp_annotations: + description: "Mimir mimir-distributor HPA has reached maximum replicas and consume too much resources, it needs to be scaled up." + opsrecipe: "mimir-ingester/" + - alertname: MimirHPAReachedMaxReplicas + eval_time: 246m + - alertname: MimirHPAReachedMaxReplicas + eval_time: 360m + exp_alerts: + - exp_labels: + area: platform + cancel_if_cluster_status_creating: "true" + cancel_if_cluster_status_deleting: "true" + cancel_if_cluster_status_updating: "true" + cancel_if_outside_working_hours: "true" + severity: page + team: atlas + topic: observability + horizontalpodautoscaler: mimir-distributor + namespace: mimir + exp_annotations: + description: "Mimir mimir-distributor HPA has reached maximum replicas and consume too much resources, it needs to be scaled up." + opsrecipe: "mimir-ingester/" From 15c8055b2edbae2b6b4ce51e89ba9e3762e6ff6b Mon Sep 17 00:00:00 2001 From: Theo Brigitte Date: Sun, 8 Sep 2024 14:24:36 +0200 Subject: [PATCH 5/5] add test comment --- .../platform/atlas/alerting-rules/mimir.rules.test.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/test/tests/providers/capi/capa-mimir/platform/atlas/alerting-rules/mimir.rules.test.yml b/test/tests/providers/capi/capa-mimir/platform/atlas/alerting-rules/mimir.rules.test.yml index 1d716d4d..8f60ac38 100644 --- a/test/tests/providers/capi/capa-mimir/platform/atlas/alerting-rules/mimir.rules.test.yml +++ b/test/tests/providers/capi/capa-mimir/platform/atlas/alerting-rules/mimir.rules.test.yml @@ -321,6 +321,7 @@ tests: opsrecipe: "mimir-ingester/" - alertname: MimirIngesterNeedsToBeScaledDown eval_time: 280m + # Test for MimirHPAReachedMaxReplicas alert - interval: 1m input_series: # HPA max replicas = 3 for the whole test