From a0d0eea56272293652ae85616ae0fb5ee4025363 Mon Sep 17 00:00:00 2001
From: QuantumEnigmaa <thibaud@giantswarm.io>
Date: Tue, 3 Sep 2024 12:00:04 +0200
Subject: [PATCH 1/2] update LokiHpaReachedMaxReplicas alert

---
 CHANGELOG.md                                     |  4 ++++
 .../platform/atlas/alerting-rules/loki.rules.yml | 16 ++++++++++++----
 .../atlas/alerting-rules/loki.rules.test.yml     |  6 +++++-
 3 files changed, 21 insertions(+), 5 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index dadbb9f1..2d10878d 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -7,6 +7,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## [Unreleased]
 
+### Changed
+
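+- Updated `LokiHpaReachedMaxReplicas` alert to fire only when the HPA's desired replicas have reached `maxReplicas` while its target metric is still above the configured target.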
+
 ## [4.13.0] - 2024-09-03
 
 ### Changed
diff --git a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/loki.rules.yml b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/loki.rules.yml
index 82d3fbba..246e94f2 100644
--- a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/loki.rules.yml
+++ b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/loki.rules.yml
@@ -112,10 +112,18 @@ spec:
       annotations:
         description: '{{`Loki component {{ $labels.horizontalpodautoscaler }} has reached its maxReplicas number but still needs to be scaled up.`}}'
         opsrecipe: loki/
-      expr: |
-        sum by (cluster_id, installation, namespace, pipeline, provider, horizontalpodautoscaler) (kube_horizontalpodautoscaler_status_desired_replicas{namespace="loki", horizontalpodautoscaler=~"loki-backend|loki-write|loki-read", cluster_type="management_cluster"})
-          != 
-        sum by (cluster_id, installation, namespace, pipeline, provider, horizontalpodautoscaler) (kube_horizontalpodautoscaler_status_current_replicas{namespace="loki", horizontalpodautoscaler=~"loki-backend|loki-write|loki-read", cluster_type="management_cluster"})
+      expr: |-
+        (
+          kube_horizontalpodautoscaler_status_desired_replicas{namespace="loki"} >=
+          on(cluster_id, customer, installation, namespace, horizontalpodautoscaler)
+          kube_horizontalpodautoscaler_spec_max_replicas{namespace="loki"}
+        )
+        and on(cluster_id, customer, installation, namespace, horizontalpodautoscaler)
+        (
+          kube_horizontalpodautoscaler_status_target_metric{namespace="loki"} >
+          on(cluster_id, customer, installation, namespace, horizontalpodautoscaler, metric_name, metric_target_type)
+          kube_horizontalpodautoscaler_spec_target_metric{namespace="loki"}
+        )
       for: 4h
       labels:
         area: platform
diff --git a/test/tests/providers/global/platform/atlas/alerting-rules/loki.rules.test.yml b/test/tests/providers/global/platform/atlas/alerting-rules/loki.rules.test.yml
index 7dc53bec..3956db6a 100644
--- a/test/tests/providers/global/platform/atlas/alerting-rules/loki.rules.test.yml
+++ b/test/tests/providers/global/platform/atlas/alerting-rules/loki.rules.test.yml
@@ -196,8 +196,12 @@ tests:
       # loki-backend real memory usage gradually decreases until it goes below 30% of the memory requests.
       - series: 'kube_horizontalpodautoscaler_status_desired_replicas{horizontalpodautoscaler="loki-backend", namespace="loki", cluster_type="management_cluster", cluster_id="golem", installation="golem", pipeline="testing", provider="capa", region="eu-west-2"}'
         values: "2+0x20 3+0x250 2+0x250"
-      - series: 'kube_horizontalpodautoscaler_status_current_replicas{horizontalpodautoscaler="loki-backend", namespace="loki", cluster_type="management_cluster", cluster_id="golem", installation="golem", pipeline="testing", provider="capa", region="eu-west-2"}'
+      - series: 'kube_horizontalpodautoscaler_status_max_replicas{horizontalpodautoscaler="loki-backend", namespace="loki", cluster_type="management_cluster", cluster_id="golem", installation="golem", pipeline="testing", provider="capa", region="eu-west-2"}'
         values: "2+0x520"
+      - series: 'kube_horizontalpodautoscaler_status_target_metric{horizontalpodautoscaler="loki-backend", namespace="loki", cluster_type="management_cluster", cluster_id="golem", installation="golem", pipeline="testing", provider="capa", region="eu-west-2"}'
+        values: "60+0x20 120+0x250 60+0x250"
+      - series: 'kube_horizontalpodautoscaler_spec_target_metric{horizontalpodautoscaler="loki-backend", namespace="loki", cluster_type="management_cluster", cluster_id="golem", installation="golem", pipeline="testing", provider="capa", region="eu-west-2"}'
+        values: "90+0x520"
     alert_rule_test:
       - alertname: LokiHpaReachedMaxReplicas
         eval_time: 15m

From bafafa5568a22b9242947fe4942025934aec1e3e Mon Sep 17 00:00:00 2001
From: QuantumEnigmaa <thibaud@giantswarm.io>
Date: Tue, 3 Sep 2024 13:39:35 +0200
Subject: [PATCH 2/2] fix unit tests for LokiHpaReachedMaxReplicas alert

---
 .../global/platform/atlas/alerting-rules/loki.rules.test.yml  | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/test/tests/providers/global/platform/atlas/alerting-rules/loki.rules.test.yml b/test/tests/providers/global/platform/atlas/alerting-rules/loki.rules.test.yml
index 3956db6a..ed6e5fdc 100644
--- a/test/tests/providers/global/platform/atlas/alerting-rules/loki.rules.test.yml
+++ b/test/tests/providers/global/platform/atlas/alerting-rules/loki.rules.test.yml
@@ -196,7 +196,7 @@ tests:
       # loki-backend real memory usage gradually decreases until it goes below 30% of the memory requests.
       - series: 'kube_horizontalpodautoscaler_status_desired_replicas{horizontalpodautoscaler="loki-backend", namespace="loki", cluster_type="management_cluster", cluster_id="golem", installation="golem", pipeline="testing", provider="capa", region="eu-west-2"}'
         values: "2+0x20 3+0x250 2+0x250"
-      - series: 'kube_horizontalpodautoscaler_status_max_replicas{horizontalpodautoscaler="loki-backend", namespace="loki", cluster_type="management_cluster", cluster_id="golem", installation="golem", pipeline="testing", provider="capa", region="eu-west-2"}'
+      - series: 'kube_horizontalpodautoscaler_spec_max_replicas{horizontalpodautoscaler="loki-backend", namespace="loki", cluster_type="management_cluster", cluster_id="golem", installation="golem", pipeline="testing", provider="capa", region="eu-west-2"}'
         values: "2+0x520"
       - series: 'kube_horizontalpodautoscaler_status_target_metric{horizontalpodautoscaler="loki-backend", namespace="loki", cluster_type="management_cluster", cluster_id="golem", installation="golem", pipeline="testing", provider="capa", region="eu-west-2"}'
         values: "60+0x20 120+0x250 60+0x250"
@@ -222,8 +222,6 @@ tests:
               horizontalpodautoscaler: loki-backend
               installation: golem
               cluster_id: golem
-              pipeline: testing
-              provider: capa
             exp_annotations:
               description: Loki component loki-backend has reached its maxReplicas number but still needs to be scaled up.
               opsrecipe: loki/