From 14d28862bbe351aff9b2bd37efbfc4890c2ac604 Mon Sep 17 00:00:00 2001
From: Zirko <64951262+QuantumEnigmaa@users.noreply.github.com>
Date: Tue, 3 Sep 2024 17:47:35 +0200
Subject: [PATCH 1/3] update LokiHpaReachedMaxReplicas alert (#1342)

* update LokiHpaReachedMaxReplicas alert

* fix UTs
---
 CHANGELOG.md                                      |  4 ++++
 .../platform/atlas/alerting-rules/loki.rules.yml  | 16 ++++++++++++----
 .../atlas/alerting-rules/loki.rules.test.yml      |  8 +++++---
 3 files changed, 21 insertions(+), 7 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 8bc379b3..ae084f66 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -7,6 +7,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## [Unreleased]
 
+### Changed
+
+- Updated `LokiHpaReachedMaxReplicas` alert.
+
 ## [4.13.1] - 2024-09-03
 
 ### Fixed
diff --git a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/loki.rules.yml b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/loki.rules.yml
index 82d3fbba..246e94f2 100644
--- a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/loki.rules.yml
+++ b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/loki.rules.yml
@@ -112,10 +112,18 @@ spec:
       annotations:
         description: '{{`Loki component {{ $labels.horizontalpodautoscaler }} has reached its maxReplicas number but still needs to be scaled up.`}}'
         opsrecipe: loki/
-      expr: |
-        sum by (cluster_id, installation, namespace, pipeline, provider, horizontalpodautoscaler) (kube_horizontalpodautoscaler_status_desired_replicas{namespace="loki", horizontalpodautoscaler=~"loki-backend|loki-write|loki-read", cluster_type="management_cluster"})
-        !=
-        sum by (cluster_id, installation, namespace, pipeline, provider, horizontalpodautoscaler) (kube_horizontalpodautoscaler_status_current_replicas{namespace="loki", horizontalpodautoscaler=~"loki-backend|loki-write|loki-read", cluster_type="management_cluster"})
+      expr: |-
+        (
+          kube_horizontalpodautoscaler_status_desired_replicas{namespace="loki"} >=
+          on(cluster_id, customer, installation, namespace, horizontalpodautoscaler)
+          kube_horizontalpodautoscaler_spec_max_replicas{namespace="loki"}
+        )
+        and on(cluster_id, customer, installation, namespace, horizontalpodautoscaler)
+        (
+          kube_horizontalpodautoscaler_status_target_metric{namespace="loki"} >
+          on(cluster_id, customer, installation, namespace, horizontalpodautoscaler, metric_name, metric_target_type)
+          kube_horizontalpodautoscaler_spec_target_metric{namespace="loki"}
+        )
       for: 4h
       labels:
         area: platform
diff --git a/test/tests/providers/global/platform/atlas/alerting-rules/loki.rules.test.yml b/test/tests/providers/global/platform/atlas/alerting-rules/loki.rules.test.yml
index 7dc53bec..ed6e5fdc 100644
--- a/test/tests/providers/global/platform/atlas/alerting-rules/loki.rules.test.yml
+++ b/test/tests/providers/global/platform/atlas/alerting-rules/loki.rules.test.yml
@@ -196,8 +196,12 @@ tests:
       # loki-backend real memory usage gradually decreases until it goes below 30% of the memory requests.
       - series: 'kube_horizontalpodautoscaler_status_desired_replicas{horizontalpodautoscaler="loki-backend", namespace="loki", cluster_type="management_cluster", cluster_id="golem", installation="golem", pipeline="testing", provider="capa", region="eu-west-2"}'
         values: "2+0x20 3+0x250 2+0x250"
-      - series: 'kube_horizontalpodautoscaler_status_current_replicas{horizontalpodautoscaler="loki-backend", namespace="loki", cluster_type="management_cluster", cluster_id="golem", installation="golem", pipeline="testing", provider="capa", region="eu-west-2"}'
+      - series: 'kube_horizontalpodautoscaler_spec_max_replicas{horizontalpodautoscaler="loki-backend", namespace="loki", cluster_type="management_cluster", cluster_id="golem", installation="golem", pipeline="testing", provider="capa", region="eu-west-2"}'
         values: "2+0x520"
+      - series: 'kube_horizontalpodautoscaler_status_target_metric{horizontalpodautoscaler="loki-backend", namespace="loki", cluster_type="management_cluster", cluster_id="golem", installation="golem", pipeline="testing", provider="capa", region="eu-west-2"}'
+        values: "60+0x20 120+0x250 60+0x250"
+      - series: 'kube_horizontalpodautoscaler_spec_target_metric{horizontalpodautoscaler="loki-backend", namespace="loki", cluster_type="management_cluster", cluster_id="golem", installation="golem", pipeline="testing", provider="capa", region="eu-west-2"}'
+        values: "90+0x520"
     alert_rule_test:
       - alertname: LokiHpaReachedMaxReplicas
         eval_time: 15m
@@ -218,8 +222,6 @@ tests:
               horizontalpodautoscaler: loki-backend
               installation: golem
               cluster_id: golem
-              pipeline: testing
-              provider: capa
             exp_annotations:
               description: Loki component loki-backend has reached its maxReplicas number but still needs to be scaled up.
               opsrecipe: loki/

From 8e6d5f435b8c045a1751138601580e5b70541201 Mon Sep 17 00:00:00 2001
From: Taylor Bot
Date: Tue, 3 Sep 2024 18:52:35 +0300
Subject: [PATCH 2/3] Release v4.13.2 (#1346)

---
 CHANGELOG.md                     | 5 ++++-
 helm/prometheus-rules/Chart.yaml | 2 +-
 2 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index ae084f66..ec1d5dad 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -7,6 +7,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## [Unreleased]
 
+## [4.13.2] - 2024-09-03
+
 ### Changed
 
 - Updated `LokiHpaReachedMaxReplicas` alert.
@@ -3052,7 +3054,8 @@ Fix `PromtailRequestsErrors` alerts as promtail retries after some backoff so ac
 
 - Add existing rules from https://github.com/giantswarm/prometheus-meta-operator/pull/637/commits/bc6a26759eb955de92b41ed5eb33fa37980660f2
 
-[Unreleased]: https://github.com/giantswarm/prometheus-rules/compare/v4.13.1...HEAD
+[Unreleased]: https://github.com/giantswarm/prometheus-rules/compare/v4.13.2...HEAD
+[4.13.2]: https://github.com/giantswarm/prometheus-rules/compare/v4.13.1...v4.13.2
 [4.13.1]: https://github.com/giantswarm/prometheus-rules/compare/v4.13.0...v4.13.1
 [4.13.0]: https://github.com/giantswarm/prometheus-rules/compare/v4.12.0...v4.13.0
 [4.12.0]: https://github.com/giantswarm/prometheus-rules/compare/v4.11.0...v4.12.0
diff --git a/helm/prometheus-rules/Chart.yaml b/helm/prometheus-rules/Chart.yaml
index ce94f08c..a95482a6 100644
--- a/helm/prometheus-rules/Chart.yaml
+++ b/helm/prometheus-rules/Chart.yaml
@@ -5,7 +5,7 @@ home: https://github.com/giantswarm/prometheus-rules
 icon: https://s.giantswarm.io/app-icons/1/png/default-app-light.png
 name: prometheus-rules
 appVersion: '0.1.0'
-version: '4.13.1'
+version: '4.13.2'
 annotations:
   application.giantswarm.io/team: "atlas"
   config.giantswarm.io/version: 1.x.x

From 7197d3b911365b6eac348d1bff46178a7ac13303 Mon Sep 17 00:00:00 2001
From: "renovate[bot]" <29139614+renovate[bot]@users.noreply.github.com>
Date: Tue, 3 Sep 2024 18:39:39 +0200
Subject: [PATCH 3/3] fix(deps): update module github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring to v0.76.1 (#1343)

Co-authored-by: renovate[bot] <29139614+renovate[bot]@users.noreply.github.com>
---
 test/hack/checkLabels/go.mod | 2 +-
 test/hack/checkLabels/go.sum | 2 ++
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/test/hack/checkLabels/go.mod b/test/hack/checkLabels/go.mod
index c18e55d3..e88d3239 100644
--- a/test/hack/checkLabels/go.mod
+++ b/test/hack/checkLabels/go.mod
@@ -7,7 +7,7 @@ toolchain go1.23.0
 
 require (
 	// Try to keep version in sync with our prometheus rule CRD version.
 	// see https://github.com/giantswarm/prometheus-operator-crd/blob/master/helm/prometheus-operator-crd/Chart.yaml#L11
-	github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring v0.76.0
+	github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring v0.76.1
 	sigs.k8s.io/yaml v1.4.0
 )
diff --git a/test/hack/checkLabels/go.sum b/test/hack/checkLabels/go.sum
index 1b60a2c6..d877600d 100644
--- a/test/hack/checkLabels/go.sum
+++ b/test/hack/checkLabels/go.sum
@@ -543,6 +543,8 @@ github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring v0.75.2 h
 github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring v0.75.2/go.mod h1:XYrdZw5dW12Cjkt4ndbeNZZTBp4UCHtW0ccR9+sTtPU=
 github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring v0.76.0 h1:tRwEFYFg+To2TGnibGl8dHBCh8Z/BVNKnXj2O5Za/2M=
 github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring v0.76.0/go.mod h1:Rd8YnCqz+2FYsiGmE2DMlaLjQRB4v2jFNnzCt9YY4IM=
+github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring v0.76.1 h1:QU2cs0xxKYvF1JfibP/8vs+pFy6OvIpqNR2lYC4jYNU=
+github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring v0.76.1/go.mod h1:Rd8YnCqz+2FYsiGmE2DMlaLjQRB4v2jFNnzCt9YY4IM=
 github.com/prometheus/alertmanager v0.22.2 h1:JrDZalSEMb2/2bqGAhls6ZnvOxbC5jMIu29JV+uWTC0=
 github.com/prometheus/alertmanager v0.22.2/go.mod h1:rYinOWxFuCnNssc3iOjn2oMTlhLaPcUuqV5yk5JKUAE=
 github.com/prometheus/alertmanager v0.25.0 h1:vbXKUR6PYRiZPRIKfmXaG+dmCKG52RtPL4Btl8hQGvg=
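
Note on the updated alert (this note and the sketch below are not part of the patch series): the rewritten LokiHpaReachedMaxReplicas expression in PATCH 1/3 only fires when the HPA is pinned at its replica ceiling and the scaling metric is still above its target, rather than whenever desired and current replica counts differ. The sketch below annotates that expression with the values from the loki-backend unit test; the inline numbers are illustrative only.

    (
      kube_horizontalpodautoscaler_status_desired_replicas{namespace="loki"} >=      # 3 in the test series
      on(cluster_id, customer, installation, namespace, horizontalpodautoscaler)
      kube_horizontalpodautoscaler_spec_max_replicas{namespace="loki"}               # 2: the HPA has hit its configured ceiling
    )
    and on(cluster_id, customer, installation, namespace, horizontalpodautoscaler)
    (
      kube_horizontalpodautoscaler_status_target_metric{namespace="loki"} >          # 120: observed value of the scaling metric
      on(cluster_id, customer, installation, namespace, horizontalpodautoscaler, metric_name, metric_target_type)
      kube_horizontalpodautoscaler_spec_target_metric{namespace="loki"}              # 90: configured target, so the HPA still wants more replicas
    )

At the start of the test (desired 2 >= max 2, but metric 60 below target 90) only the first leg holds, so no alert fires; once the series step up (desired 3 >= max 2 and metric 120 > target 90) both legs hold for longer than the 4h `for` window and the alert fires, resolving again when the metric drops back to 60. The test file follows the promtool rule-unit-test format, so it should be runnable with something like `promtool test rules loki.rules.test.yml` (command shown as an assumption; the repository may wrap this in its own test harness).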