From aad83c2ae9d480af45bddd4603991126e95080b9 Mon Sep 17 00:00:00 2001
From: Herve Nicol <12008875+hervenicol@users.noreply.github.com>
Date: Wed, 2 Oct 2024 19:31:10 +0200
Subject: [PATCH] Add alert LokiMissingLogs

---
Reviewer notes (this area is not part of the commit message or the diff):

* LokiMissingLogs divides the 5m increase of
  loki_canary_missing_entries_total by the 5m increase of
  loki_canary_entries_total, both restricted to
  cluster_type="management_cluster" and namespace="loki" and summed by
  (cluster_id, pod, installation, pipeline, provider). The ratio has to
  stay above 0 for 30m before the alert fires.
* The unit test keeps the total-entries counter growing by 1 per minute
  while the missing-entries counter is flat, then grows, then is flat
  again. It expects no alert at 60m, a firing alert at 200m and no alert
  at 300m.

 CHANGELOG.md                                 |  4 +++
 .../atlas/alerting-rules/loki.rules.yml      | 24 +++++++++++++
 .../atlas/alerting-rules/loki.rules.test.yml | 35 +++++++++++++++++++
 3 files changed, 63 insertions(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index c0f7f2e6..e832ee71 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -7,6 +7,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## [Unreleased]
 
+### Added
+
+- Alerting rule for Loki missing logs at ingestion
+
 ## [4.16.1] - 2024-09-26
 
 ### Fixed
diff --git a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/loki.rules.yml b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/loki.rules.yml
index 31840cae..43b9cf3c 100644
--- a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/loki.rules.yml
+++ b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/loki.rules.yml
@@ -171,3 +171,27 @@ spec:
         severity: page
         team: atlas
         topic: observability
+    - alert: LokiMissingLogs
+      annotations:
+        dashboard: loki-canary/loki-canary
+        description: This alert checks that loki is not missing canary logs
+        opsrecipe: loki/
+      expr: |
+        (
+          sum by (cluster_id, pod, installation, pipeline, provider)
+            (increase(loki_canary_missing_entries_total{cluster_type="management_cluster",namespace="loki"}[5m]))
+          /
+          sum by (cluster_id, pod, installation, pipeline, provider)
+            (increase(loki_canary_entries_total{cluster_type="management_cluster",namespace="loki"}[5m]))
+        ) > 0
+      for: 30m
+      labels:
+        area: platform
+        cancel_if_cluster_control_plane_unhealthy: "true"
+        cancel_if_cluster_status_creating: "true"
+        cancel_if_cluster_status_deleting: "true"
+        cancel_if_cluster_status_updating: "true"
+        cancel_if_outside_working_hours: "true"
+        severity: page
+        team: atlas
+        topic: observability
diff --git a/test/tests/providers/global/platform/atlas/alerting-rules/loki.rules.test.yml b/test/tests/providers/global/platform/atlas/alerting-rules/loki.rules.test.yml
index 0504b201..81c961e3 100644
--- a/test/tests/providers/global/platform/atlas/alerting-rules/loki.rules.test.yml
+++ b/test/tests/providers/global/platform/atlas/alerting-rules/loki.rules.test.yml
@@ -289,3 +289,38 @@ tests:
               opsrecipe: "loki#lokicompactorfailedcompaction"
       - alertname: LokiCompactorFailedCompaction
         eval_time: 300m
+
+  # Test for LokiMissingLogs alert
+  - interval: 1m
+    input_series:
+      - series: 'loki_canary_entries_total{app="loki", cluster="loki", cluster_id="grizzly", cluster_type="management_cluster", container="loki-canary", customer="giantswarm", endpoint="http-metrics", installation="grizzly", namespace="loki", pod="loki-canary-5649fbcb65-lkdkq", pipeline="testing", provider="capz", service="loki-canary", service_priority="highest"}'
+        values: 0+1x1000
+      - series: 'loki_canary_missing_entries_total{app="loki", cluster="loki", cluster_id="grizzly", cluster_type="management_cluster", container="loki-canary", customer="giantswarm", endpoint="http-metrics", installation="grizzly", namespace="loki", pod="loki-canary-5649fbcb65-lkdkq", pipeline="testing", provider="capz", service="loki-canary", service_priority="highest"}'
+        values: "0+0x120 0+1x120 120+0x120"
+    alert_rule_test:
+      - alertname: LokiMissingLogs
+        eval_time: 60m
+      - alertname: LokiMissingLogs
+        eval_time: 200m
+        exp_alerts:
+          - exp_labels:
+              area: platform
+              cancel_if_cluster_control_plane_unhealthy: "true"
+              cancel_if_cluster_status_creating: "true"
+              cancel_if_cluster_status_deleting: "true"
+              cancel_if_cluster_status_updating: "true"
+              cancel_if_outside_working_hours: "true"
+              cluster_id: grizzly
+              installation: "grizzly"
+              pipeline: "testing"
+              pod: "loki-canary-5649fbcb65-lkdkq"
+              provider: "capz"
+              severity: page
+              team: atlas
+              topic: observability
+            exp_annotations:
+              dashboard: loki-canary/loki-canary
+              description: This alert checks that loki is not missing canary logs
+              opsrecipe: "loki/"
+      - alertname: LokiMissingLogs
+        eval_time: 300m