From d2cd806f03ef11dffcccf18d1f02f5cca1e75e48 Mon Sep 17 00:00:00 2001 From: Zirko <64951262+QuantumEnigmaa@users.noreply.github.com> Date: Tue, 6 Aug 2024 13:59:42 +0200 Subject: [PATCH] remove cluster_id label from 2 loki alerting rules (#1317) * remove cluster_id label from 2 loki alerting rules * refacto * update changelog --- CHANGELOG.md | 4 ++++ .../platform/atlas/alerting-rules/loki.rules.yml | 12 ++++++------ .../atlas/alerting-rules/loki.rules.test.yml | 4 ++-- 3 files changed, 12 insertions(+), 8 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 530d6044..2be2f298 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +### Changed + +- Restricted range of `LokiHpaReachedMaxReplicas` and `LokiNeedsToBeScaledDown` rules to management clusters. + ## [4.9.0] - 2024-08-01 ### Added diff --git a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/loki.rules.yml b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/loki.rules.yml index ddd83574..82d3fbba 100644 --- a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/loki.rules.yml +++ b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/loki.rules.yml @@ -88,14 +88,14 @@ spec: description: '{{`Loki component {{ $labels.labelpod }} is consuming very few resources and needs to be scaled down.`}}' opsrecipe: loki/ expr: |- - sum by (cluster_id, installation, namespace, pipeline, provider, labelpod) (label_replace(container_memory_working_set_bytes{container="loki", namespace="loki"}, "labelpod", "$1", "pod", "(loki-[[:alnum:]]*)-.*")) + sum by (cluster_id, installation, namespace, pipeline, provider, labelpod) (label_replace(container_memory_working_set_bytes{container="loki", namespace="loki", cluster_type="management_cluster"}, "labelpod", "$1", "pod", "(loki-[[:alnum:]]*)-.*")) / - sum by(cluster_id, installation, namespace, pipeline, provider, labelpod) (label_replace(kube_pod_container_resource_requests{container="loki", namespace="loki", unit="byte"}, "labelpod", "$1", "pod", "(loki-[[:alnum:]]*)-.*")) + sum by(cluster_id, installation, namespace, pipeline, provider, labelpod) (label_replace(kube_pod_container_resource_requests{container="loki", namespace="loki", unit="byte", cluster_type="management_cluster"}, "labelpod", "$1", "pod", "(loki-[[:alnum:]]*)-.*")) <= 0.30 and - sum(label_replace(rate(container_cpu_usage_seconds_total{container="loki", namespace="loki"}[5m]), "labelpod", "$1", "pod", "(loki-[[:alnum:]]*)-.*")) by (cluster_id, installation, namespace, pipeline, provider, labelpod) + sum(label_replace(rate(container_cpu_usage_seconds_total{container="loki", namespace="loki", cluster_type="management_cluster"}[5m]), "labelpod", "$1", "pod", "(loki-[[:alnum:]]*)-.*")) by (cluster_id, installation, namespace, pipeline, provider, labelpod) / - sum by(cluster_id, installation, namespace, pipeline, provider, labelpod) (label_replace(kube_pod_container_resource_requests{container="loki", namespace="loki", unit="core"}, "labelpod", "$1", "pod", "(loki-[[:alnum:]]*)-.*")) + sum by(cluster_id, installation, namespace, pipeline, provider, labelpod) (label_replace(kube_pod_container_resource_requests{container="loki", namespace="loki", unit="core", cluster_type="management_cluster"}, "labelpod", "$1", "pod", "(loki-[[:alnum:]]*)-.*")) <= 0.30 for: 1d labels: @@ -113,9 +113,9 @@ spec: description: '{{`Loki component {{ $labels.horizontalpodautoscaler }} has reached its maxReplicas number but still needs to be scaled up.`}}' opsrecipe: loki/ expr: | - sum by (cluster_id, installation, namespace, pipeline, provider, horizontalpodautoscaler) (kube_horizontalpodautoscaler_status_desired_replicas{namespace="loki", horizontalpodautoscaler=~"loki-backend|loki-write|loki-read"}) + sum by (cluster_id, installation, namespace, pipeline, provider, horizontalpodautoscaler) (kube_horizontalpodautoscaler_status_desired_replicas{namespace="loki", horizontalpodautoscaler=~"loki-backend|loki-write|loki-read", cluster_type="management_cluster"}) != - sum by (cluster_id, installation, namespace, pipeline, provider, horizontalpodautoscaler) (kube_horizontalpodautoscaler_status_current_replicas{namespace="loki", horizontalpodautoscaler=~"loki-backend|loki-write|loki-read"}) + sum by (cluster_id, installation, namespace, pipeline, provider, horizontalpodautoscaler) (kube_horizontalpodautoscaler_status_current_replicas{namespace="loki", horizontalpodautoscaler=~"loki-backend|loki-write|loki-read", cluster_type="management_cluster"}) for: 4h labels: area: platform diff --git a/test/tests/providers/global/platform/atlas/alerting-rules/loki.rules.test.yml b/test/tests/providers/global/platform/atlas/alerting-rules/loki.rules.test.yml index aa4d043d..7dc53bec 100644 --- a/test/tests/providers/global/platform/atlas/alerting-rules/loki.rules.test.yml +++ b/test/tests/providers/global/platform/atlas/alerting-rules/loki.rules.test.yml @@ -177,8 +177,8 @@ tests: cancel_if_cluster_status_deleting: "true" cancel_if_cluster_status_updating: "true" cancel_if_outside_working_hours: "true" - cluster_id: golem installation: "golem" + cluster_id: "golem" labelpod: "loki-backend" pipeline: "testing" provider: "capa" @@ -215,9 +215,9 @@ tests: team: atlas topic: observability namespace: loki - cluster_id: golem horizontalpodautoscaler: loki-backend installation: golem + cluster_id: golem pipeline: testing provider: capa exp_annotations: