Commit: fix-mimir-hpa-label

QuentinBisson committed Sep 23, 2024
1 parent 124dc12 commit e48e77f
Showing 4 changed files with 12 additions and 11 deletions.
CHANGELOG.md (1 addition, 0 deletions):

@@ -18,6 +18,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- Remove deprecated app labels for `external-dns` and `ingress-nginx` alerts.
- Remove deprecated app labels for `kube-state-metrics` alerts.
- Fix falco events alerts node label to hostname as node does not exist.
+ - Fix `MimirHPAReachedMaxReplicas` description to render the horizontalpodautoscaler label.

## [4.15.2] - 2024-09-17

@@ -142,7 +142,7 @@ spec:
topic: observability
- alert: MimirHPAReachedMaxReplicas
annotations:
- description: '{{`Mimir ${ labels.horizontalpodautoscaler } HPA has reached maximum replicas and consume too much resources, it needs to be scaled up.`}}'
+ description: '{{`Mimir {{ $labels.horizontalpodautoscaler }} HPA has reached maximum replicas and consume too much resources, it needs to be scaled up.`}}'
opsrecipe: mimir-hpa/
expr: |-
(
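The old annotation used `${ labels.horizontalpodautoscaler }`, which is neither Helm nor Prometheus template syntax, so the placeholder showed up literally in the alert description. The new value keeps the Helm escape (the backtick `{{` ... `}}` construct) around a real Prometheus template action, so the label is expanded when the alert fires. A minimal sketch of the rendering stages, assuming the `mimir-distributor` label value that the unit tests below expect:

```yaml
# Illustrative only; the stages below assume the HPA label value "mimir-distributor"
# used in the updated unit tests in this commit.

# 1. Chart source: the {{` ... `}} construct makes Helm emit the enclosed text verbatim.
description: '{{`Mimir {{ $labels.horizontalpodautoscaler }} HPA has reached maximum replicas and consume too much resources, it needs to be scaled up.`}}'

# 2. After Helm rendering, the PrometheusRule carries a plain Prometheus template.
description: 'Mimir {{ $labels.horizontalpodautoscaler }} HPA has reached maximum replicas and consume too much resources, it needs to be scaled up.'

# 3. When the alert fires, Prometheus expands $labels into the final annotation.
description: 'Mimir mimir-distributor HPA has reached maximum replicas and consume too much resources, it needs to be scaled up.'
```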
@@ -363,7 +363,7 @@ tests:
horizontalpodautoscaler: mimir-distributor
namespace: mimir
exp_annotations:
description: "Mimir ${ labels.horizontalpodautoscaler } HPA has reached maximum replicas and consume too much resources, it needs to be scaled up."
description: "Mimir mimir-distributor HPA has reached maximum replicas and consume too much resources, it needs to be scaled up."
opsrecipe: "mimir-hpa/"
- alertname: MimirHPAReachedMaxReplicas
eval_time: 246m
@@ -382,7 +382,7 @@
horizontalpodautoscaler: mimir-distributor
namespace: mimir
exp_annotations:
description: "Mimir ${ labels.horizontalpodautoscaler } HPA has reached maximum replicas and consume too much resources, it needs to be scaled up."
description: "Mimir mimir-distributor HPA has reached maximum replicas and consume too much resources, it needs to be scaled up."
opsrecipe: "mimir-hpa/"
# Test for MimirCompactorFailedCompaction alert
- interval: 1m
@@ -18,7 +18,7 @@ tests:
alert_rule_test:
- alertname: LokiRequestPanics
eval_time: 15m # should be OK after 15 minutes
- exp_alerts:
+ exp_alerts: []
- alertname: LokiRequestPanics
eval_time: 25m # After 25 minutes, should fire an alert for the t+20 error
exp_alerts:
@@ -43,11 +43,11 @@
opsrecipe: loki/
- alertname: LokiRequestPanics
eval_time: 40m # After 40 minutes, all should be back to normal
- exp_alerts:
+ exp_alerts: []

- alertname: LokiRequestErrors
eval_time: 15m # should be OK after 15 minutes
- exp_alerts:
+ exp_alerts: []
- alertname: LokiRequestErrors
eval_time: 160m # Alert after more than 120m of incident
exp_alerts:
@@ -74,13 +74,13 @@

- alertname: LokiRingUnhealthy
eval_time: 15m # should be OK after 15 minutes
- exp_alerts:
+ exp_alerts: []
- alertname: LokiRingUnhealthy
eval_time: 25m # after 25 minutes we have an unhealthy member, but we want to filter too short events. So no alert yet.
- exp_alerts:
+ exp_alerts: []
- alertname: LokiRingUnhealthy
eval_time: 35m # special case to validate when a new pod is unhealthy (no data at the beginning)
- exp_alerts:
+ exp_alerts: []
- alertname: LokiRingUnhealthy
eval_time: 60m # now the event has been there for 20 minutes, we should have an alert.
exp_alerts:
@@ -115,7 +115,7 @@
alert_rule_test:
- alertname: LokiRestartingTooOften
eval_time: 15m # should be OK after 15 minutes
- exp_alerts:
+ exp_alerts: []
- alertname: LokiRestartingTooOften
eval_time: 85m # After 85 minutes, should fire an alert for the t+85 error
exp_alerts:
@@ -133,7 +133,7 @@
opsrecipe: loki/
- alertname: LokiRestartingTooOften
eval_time: 140m # After 140m minutes, all should be back to normal
- exp_alerts:
+ exp_alerts: []
- interval: 1m
input_series:
# loki-backend real memory usage gradually decreases until it goes below 30% of the memory requests.
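The Loki test changes in this file replace the bare `exp_alerts:` key with an explicit empty list. In promtool alert unit tests, `exp_alerts: []` states unambiguously that no alert is expected at the given `eval_time`. A minimal, self-contained test file in the same style; the rule file name and input series are illustrative placeholders, not taken from this repository:

```yaml
# Hypothetical promtool unit test showing the explicit empty exp_alerts form.
rule_files:
  - loki.rules.yml          # placeholder rule file name

evaluation_interval: 1m

tests:
  - interval: 1m
    input_series:
      # Placeholder series: no panics recorded over the window.
      - series: 'loki_panic_total{namespace="loki", pod="loki-backend-0"}'
        values: '0x15'
    alert_rule_test:
      - alertname: LokiRequestPanics
        eval_time: 15m
        exp_alerts: []      # explicitly assert that no alert fires at t=15m
```

Such a file can be checked with `promtool test rules <file>` against the corresponding rule file.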
