Merge branch 'main' into send-slo-aggregations-to-grafana-cloud

giantswarm · Sep 4, 2024 · a1f5c3b · a1f5c3b
2 parents 4edb344 + 7197d3b
commit a1f5c3b
Show file tree

Hide file tree

Showing 6 changed files with 29 additions and 10 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -11,6 +11,12 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 - Add aggregations for SLO metrics to export them to Grafana Cloud
 
+## [4.13.2] - 2024-09-03
+
+### Changed
+
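+- Updated `LokiHpaReachedMaxReplicas` alert to fire only when a Loki HPA's desired replicas have reached `maxReplicas` and one of its target metrics is still above the configured target.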
+
 ## [4.13.1] - 2024-09-03
 
 ### Fixed
@@ -3052,7 +3058,8 @@ Fix `PromtailRequestsErrors` alerts as promtail retries after some backoff so ac
 
 - Add existing rules from https://github.com/giantswarm/prometheus-meta-operator/pull/637/commits/bc6a26759eb955de92b41ed5eb33fa37980660f2
 
-[Unreleased]: https://github.com/giantswarm/prometheus-rules/compare/v4.13.1...HEAD
+[Unreleased]: https://github.com/giantswarm/prometheus-rules/compare/v4.13.2...HEAD
+[4.13.2]: https://github.com/giantswarm/prometheus-rules/compare/v4.13.1...v4.13.2
 [4.13.1]: https://github.com/giantswarm/prometheus-rules/compare/v4.13.0...v4.13.1
 [4.13.0]: https://github.com/giantswarm/prometheus-rules/compare/v4.12.0...v4.13.0
 [4.12.0]: https://github.com/giantswarm/prometheus-rules/compare/v4.11.0...v4.12.0

diff --git a/helm/prometheus-rules/Chart.yaml b/helm/prometheus-rules/Chart.yaml
@@ -5,7 +5,7 @@ home: https://github.com/giantswarm/prometheus-rules
 icon: https://s.giantswarm.io/app-icons/1/png/default-app-light.png
 name: prometheus-rules
 appVersion: '0.1.0'
-version: '4.13.1'
+version: '4.13.2'
 annotations:
   application.giantswarm.io/team: "atlas"
   config.giantswarm.io/version: 1.x.x
diff --git a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/loki.rules.yml b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/loki.rules.yml
@@ -112,10 +112,18 @@ spec:
       annotations:
         description: '{{`Loki component {{ $labels.horizontalpodautoscaler }} has reached its maxReplicas number but still needs to be scaled up.`}}'
         opsrecipe: loki/
-      expr: |
-        sum by (cluster_id, installation, namespace, pipeline, provider, horizontalpodautoscaler) (kube_horizontalpodautoscaler_status_desired_replicas{namespace="loki", horizontalpodautoscaler=~"loki-backend|loki-write|loki-read", cluster_type="management_cluster"})
-          != 
-        sum by (cluster_id, installation, namespace, pipeline, provider, horizontalpodautoscaler) (kube_horizontalpodautoscaler_status_current_replicas{namespace="loki", horizontalpodautoscaler=~"loki-backend|loki-write|loki-read", cluster_type="management_cluster"})
+      expr: |-
+        (
+          kube_horizontalpodautoscaler_status_desired_replicas{namespace="loki"} >=
+          on(cluster_id, customer, installation, namespace, horizontalpodautoscaler)
+          kube_horizontalpodautoscaler_spec_max_replicas{namespace="loki"}
+        )
+        and on(cluster_id, customer, installation, namespace, horizontalpodautoscaler)
+        (
+          kube_horizontalpodautoscaler_status_target_metric{namespace="loki"} >
+          on(cluster_id, customer, installation, namespace, horizontalpodautoscaler, metric_name, metric_target_type)
+          kube_horizontalpodautoscaler_spec_target_metric{namespace="loki"}
+        )
       for: 4h
       labels:
         area: platform

diff --git a/test/hack/checkLabels/go.mod b/test/hack/checkLabels/go.mod
@@ -7,7 +7,7 @@ toolchain go1.23.0
 require (
 	// Try to keep version in sync with our prometheus rule CRD version.
 	// see https://github.com/giantswarm/prometheus-operator-crd/blob/master/helm/prometheus-operator-crd/Chart.yaml#L11
-	github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring v0.76.0
+	github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring v0.76.1
 	sigs.k8s.io/yaml v1.4.0
 )
 

diff --git a/test/hack/checkLabels/go.sum b/test/hack/checkLabels/go.sum
@@ -543,6 +543,8 @@ github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring v0.75.2 h
 github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring v0.75.2/go.mod h1:XYrdZw5dW12Cjkt4ndbeNZZTBp4UCHtW0ccR9+sTtPU=
 github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring v0.76.0 h1:tRwEFYFg+To2TGnibGl8dHBCh8Z/BVNKnXj2O5Za/2M=
 github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring v0.76.0/go.mod h1:Rd8YnCqz+2FYsiGmE2DMlaLjQRB4v2jFNnzCt9YY4IM=
+github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring v0.76.1 h1:QU2cs0xxKYvF1JfibP/8vs+pFy6OvIpqNR2lYC4jYNU=
+github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring v0.76.1/go.mod h1:Rd8YnCqz+2FYsiGmE2DMlaLjQRB4v2jFNnzCt9YY4IM=
 github.com/prometheus/alertmanager v0.22.2 h1:JrDZalSEMb2/2bqGAhls6ZnvOxbC5jMIu29JV+uWTC0=
 github.com/prometheus/alertmanager v0.22.2/go.mod h1:rYinOWxFuCnNssc3iOjn2oMTlhLaPcUuqV5yk5JKUAE=
 github.com/prometheus/alertmanager v0.25.0 h1:vbXKUR6PYRiZPRIKfmXaG+dmCKG52RtPL4Btl8hQGvg=

diff --git a/test/tests/providers/global/platform/atlas/alerting-rules/loki.rules.test.yml b/test/tests/providers/global/platform/atlas/alerting-rules/loki.rules.test.yml
@@ -196,8 +196,12 @@ tests:
       # loki-backend HPA: desired replicas rise from 2 to 3 (maxReplicas stays at 2) while the observed target metric (120) exceeds its configured target (90), then both fall back.
       - series: 'kube_horizontalpodautoscaler_status_desired_replicas{horizontalpodautoscaler="loki-backend", namespace="loki", cluster_type="management_cluster", cluster_id="golem", installation="golem", pipeline="testing", provider="capa", region="eu-west-2"}'
         values: "2+0x20 3+0x250 2+0x250"
-      - series: 'kube_horizontalpodautoscaler_status_current_replicas{horizontalpodautoscaler="loki-backend", namespace="loki", cluster_type="management_cluster", cluster_id="golem", installation="golem", pipeline="testing", provider="capa", region="eu-west-2"}'
+      - series: 'kube_horizontalpodautoscaler_spec_max_replicas{horizontalpodautoscaler="loki-backend", namespace="loki", cluster_type="management_cluster", cluster_id="golem", installation="golem", pipeline="testing", provider="capa", region="eu-west-2"}'
         values: "2+0x520"
+      - series: 'kube_horizontalpodautoscaler_status_target_metric{horizontalpodautoscaler="loki-backend", namespace="loki", cluster_type="management_cluster", cluster_id="golem", installation="golem", pipeline="testing", provider="capa", region="eu-west-2"}'
+        values: "60+0x20 120+0x250 60+0x250"
+      - series: 'kube_horizontalpodautoscaler_spec_target_metric{horizontalpodautoscaler="loki-backend", namespace="loki", cluster_type="management_cluster", cluster_id="golem", installation="golem", pipeline="testing", provider="capa", region="eu-west-2"}'
+        values: "90+0x520"
     alert_rule_test:
       - alertname: LokiHpaReachedMaxReplicas
         eval_time: 15m
@@ -218,8 +222,6 @@ tests:
               horizontalpodautoscaler: loki-backend
               installation: golem
               cluster_id: golem
-              pipeline: testing
-              provider: capa
             exp_annotations:
               description: Loki component loki-backend has reached its maxReplicas number but still needs to be scaled up.
               opsrecipe: loki/