From e48e77ffc0801f5d9538c6139d6ee61419b74b87 Mon Sep 17 00:00:00 2001
From: QuentinBisson
Date: Mon, 23 Sep 2024 15:58:22 +0200
Subject: [PATCH] fix-mimir-hpa-label

---
 CHANGELOG.md                                  |  1 +
 .../atlas/alerting-rules/mimir.rules.yml      |  2 +-
 .../atlas/alerting-rules/mimir.rules.test.yml |  4 ++--
 .../atlas/alerting-rules/loki.rules.test.yml  | 16 ++++++++--------
 4 files changed, 12 insertions(+), 11 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 9a4f3ae8..95021edf 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -18,6 +18,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 - Remove deprecated app labels for `external-dns` and `ingress-nginx` alerts.
 - Remove deprecated app labels for `kube-state-metrics` alerts.
 - Fix falco events alerts node label to hostname as node does not exist.
+- Fix `MimirHPAReachedMaxReplicas` description to render the horizontalpodautoscaler label.

 ## [4.15.2] - 2024-09-17

diff --git a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/mimir.rules.yml b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/mimir.rules.yml
index 6eac71fd..d641c2c9 100644
--- a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/mimir.rules.yml
+++ b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/mimir.rules.yml
@@ -142,7 +142,7 @@ spec:
             topic: observability
         - alert: MimirHPAReachedMaxReplicas
           annotations:
-            description: '{{`Mimir ${ labels.horizontalpodautoscaler } HPA has reached maximum replicas and consume too much resources, it needs to be scaled up.`}}'
+            description: '{{`Mimir {{ $labels.horizontalpodautoscaler }} HPA has reached maximum replicas and consume too much resources, it needs to be scaled up.`}}'
             opsrecipe: mimir-hpa/
           expr: |-
             (
diff --git a/test/tests/providers/capi/capa-mimir/platform/atlas/alerting-rules/mimir.rules.test.yml b/test/tests/providers/capi/capa-mimir/platform/atlas/alerting-rules/mimir.rules.test.yml
index f1d45481..4d18a45c 100644
--- a/test/tests/providers/capi/capa-mimir/platform/atlas/alerting-rules/mimir.rules.test.yml
+++ b/test/tests/providers/capi/capa-mimir/platform/atlas/alerting-rules/mimir.rules.test.yml
@@ -363,7 +363,7 @@ tests:
               horizontalpodautoscaler: mimir-distributor
               namespace: mimir
             exp_annotations:
-              description: "Mimir ${ labels.horizontalpodautoscaler } HPA has reached maximum replicas and consume too much resources, it needs to be scaled up."
+              description: "Mimir mimir-distributor HPA has reached maximum replicas and consume too much resources, it needs to be scaled up."
               opsrecipe: "mimir-hpa/"
       - alertname: MimirHPAReachedMaxReplicas
         eval_time: 246m
@@ -382,7 +382,7 @@ tests:
               horizontalpodautoscaler: mimir-distributor
               namespace: mimir
             exp_annotations:
-              description: "Mimir ${ labels.horizontalpodautoscaler } HPA has reached maximum replicas and consume too much resources, it needs to be scaled up."
+              description: "Mimir mimir-distributor HPA has reached maximum replicas and consume too much resources, it needs to be scaled up."
opsrecipe: "mimir-hpa/" # Test for MimirCompactorFailedCompaction alert - interval: 1m diff --git a/test/tests/providers/global/platform/atlas/alerting-rules/loki.rules.test.yml b/test/tests/providers/global/platform/atlas/alerting-rules/loki.rules.test.yml index ed6e5fdc..a2e8b6bd 100644 --- a/test/tests/providers/global/platform/atlas/alerting-rules/loki.rules.test.yml +++ b/test/tests/providers/global/platform/atlas/alerting-rules/loki.rules.test.yml @@ -18,7 +18,7 @@ tests: alert_rule_test: - alertname: LokiRequestPanics eval_time: 15m # should be OK after 15 minutes - exp_alerts: + exp_alerts: [] - alertname: LokiRequestPanics eval_time: 25m # After 25 minutes, should fire an alert for the t+20 error exp_alerts: @@ -43,11 +43,11 @@ tests: opsrecipe: loki/ - alertname: LokiRequestPanics eval_time: 40m # After 40 minutes, all should be back to normal - exp_alerts: + exp_alerts: [] - alertname: LokiRequestErrors eval_time: 15m # should be OK after 15 minutes - exp_alerts: + exp_alerts: [] - alertname: LokiRequestErrors eval_time: 160m # Alert after more than 120m of incident exp_alerts: @@ -74,13 +74,13 @@ tests: - alertname: LokiRingUnhealthy eval_time: 15m # should be OK after 15 minutes - exp_alerts: + exp_alerts: [] - alertname: LokiRingUnhealthy eval_time: 25m # after 25 minutes we have an unhealthy member, but we want to filter too short events. So no alert yet. - exp_alerts: + exp_alerts: [] - alertname: LokiRingUnhealthy eval_time: 35m # special case to validate when a new pod is unhealthy (no data at the beginning) - exp_alerts: + exp_alerts: [] - alertname: LokiRingUnhealthy eval_time: 60m # now the event has been there for 20 minutes, we should have an alert. exp_alerts: @@ -115,7 +115,7 @@ tests: alert_rule_test: - alertname: LokiRestartingTooOften eval_time: 15m # should be OK after 15 minutes - exp_alerts: + exp_alerts: [] - alertname: LokiRestartingTooOften eval_time: 85m # After 85 minutes, should fire an alert for the t+85 error exp_alerts: @@ -133,7 +133,7 @@ tests: opsrecipe: loki/ - alertname: LokiRestartingTooOften eval_time: 140m # After 140m minutes, all should be back to normal - exp_alerts: + exp_alerts: [] - interval: 1m input_series: # loki-backend real memory usage gradually decreases until it goes below 30% of the memory requests.