diff --git a/CHANGELOG.md b/CHANGELOG.md index 8bc379b3..ae084f66 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +### Changed + +- Updated `LokiHpaReachedMaxReplicas` alert to fire only when a Loki HPA's desired replicas have reached `maxReplicas` and its current metric value is still above the target. + ## [4.13.1] - 2024-09-03 ### Fixed diff --git a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/loki.rules.yml b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/loki.rules.yml index 82d3fbba..246e94f2 100644 --- a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/loki.rules.yml +++ b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/loki.rules.yml @@ -112,10 +112,18 @@ spec: annotations: description: '{{`Loki component {{ $labels.horizontalpodautoscaler }} has reached its maxReplicas number but still needs to be scaled up.`}}' opsrecipe: loki/ - expr: | - sum by (cluster_id, installation, namespace, pipeline, provider, horizontalpodautoscaler) (kube_horizontalpodautoscaler_status_desired_replicas{namespace="loki", horizontalpodautoscaler=~"loki-backend|loki-write|loki-read", cluster_type="management_cluster"}) - != - sum by (cluster_id, installation, namespace, pipeline, provider, horizontalpodautoscaler) (kube_horizontalpodautoscaler_status_current_replicas{namespace="loki", horizontalpodautoscaler=~"loki-backend|loki-write|loki-read", cluster_type="management_cluster"}) + expr: |- + ( + kube_horizontalpodautoscaler_status_desired_replicas{namespace="loki"} >= + on(cluster_id, customer, installation, namespace, horizontalpodautoscaler) + kube_horizontalpodautoscaler_spec_max_replicas{namespace="loki"} + ) + and on(cluster_id, customer, installation, namespace, horizontalpodautoscaler) + ( + kube_horizontalpodautoscaler_status_target_metric{namespace="loki"} > + on(cluster_id, customer, installation, namespace, horizontalpodautoscaler, metric_name, metric_target_type) 
kube_horizontalpodautoscaler_spec_target_metric{namespace="loki"} + ) for: 4h labels: area: platform diff --git a/test/tests/providers/global/platform/atlas/alerting-rules/loki.rules.test.yml b/test/tests/providers/global/platform/atlas/alerting-rules/loki.rules.test.yml index 7dc53bec..ed6e5fdc 100644 --- a/test/tests/providers/global/platform/atlas/alerting-rules/loki.rules.test.yml +++ b/test/tests/providers/global/platform/atlas/alerting-rules/loki.rules.test.yml @@ -196,8 +196,12 @@ tests: # loki-backend real memory usage gradually decreases until it goes below 30% of the memory requests. - series: 'kube_horizontalpodautoscaler_status_desired_replicas{horizontalpodautoscaler="loki-backend", namespace="loki", cluster_type="management_cluster", cluster_id="golem", installation="golem", pipeline="testing", provider="capa", region="eu-west-2"}' values: "2+0x20 3+0x250 2+0x250" - - series: 'kube_horizontalpodautoscaler_status_current_replicas{horizontalpodautoscaler="loki-backend", namespace="loki", cluster_type="management_cluster", cluster_id="golem", installation="golem", pipeline="testing", provider="capa", region="eu-west-2"}' + - series: 'kube_horizontalpodautoscaler_spec_max_replicas{horizontalpodautoscaler="loki-backend", namespace="loki", cluster_type="management_cluster", cluster_id="golem", installation="golem", pipeline="testing", provider="capa", region="eu-west-2"}' values: "2+0x520" + - series: 'kube_horizontalpodautoscaler_status_target_metric{horizontalpodautoscaler="loki-backend", namespace="loki", cluster_type="management_cluster", cluster_id="golem", installation="golem", pipeline="testing", provider="capa", region="eu-west-2"}' + values: "60+0x20 120+0x250 60+0x250" + - series: 'kube_horizontalpodautoscaler_spec_target_metric{horizontalpodautoscaler="loki-backend", namespace="loki", cluster_type="management_cluster", cluster_id="golem", installation="golem", pipeline="testing", provider="capa", region="eu-west-2"}' + values: "90+0x520" 
alert_rule_test: - alertname: LokiHpaReachedMaxReplicas eval_time: 15m @@ -218,8 +222,6 @@ tests: horizontalpodautoscaler: loki-backend installation: golem cluster_id: golem - pipeline: testing - provider: capa exp_annotations: description: Loki component loki-backend has reached its maxReplicas number but still needs to be scaled up. opsrecipe: loki/