giantswarm · QuentinBisson · Sep 25, 2024 · Sep 24, 2024 · Sep 24, 2024 · Sep 24, 2024
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -7,6 +7,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## [Unreleased]
 
+### Added
+
+- Add `LokiCompactorFailedCompaction` alert to know when Loki did not manage to run a successful compaction in the last 2 hours.
+
 ### Changed
 
 - Upgrade Alloy to 0.5.2 which brings no value to this repo.

@@ -135,3 +135,22 @@ spec:
         severity: page
         team: atlas
         topic: observability
+  - name: loki.compactor
+    rules:
+    - alert: LokiCompactorFailedCompaction
+      annotations:
+        dashboard: loki-retention/loki-retention
+        description: 'Loki compactor has been failing compactions for more than 2 hours.'
+        opsrecipe: loki#lokicompactorfailedcompaction
+      # True when the compactor's last successful run timestamp is more than 2 hours (60 * 60 * 2 seconds) behind the current time.
+      expr: (time() - loki_compactor_apply_retention_last_successful_run_timestamp_seconds > 60 * 60 * 2)
+      for: 1h  # the expression above must stay true for a further hour before the alert fires
+      labels:
+        area: platform
+        cancel_if_cluster_status_creating: "true"
+        cancel_if_cluster_status_deleting: "true"
+        cancel_if_cluster_status_updating: "true"
+        cancel_if_outside_working_hours: "true"
+        severity: page
+        team: atlas
+        topic: observability
@@ -166,11 +166,13 @@ spec:
         severity: page
         team: atlas
         topic: observability
+  - name: mimir.compactor
+    rules:
     - alert: MimirCompactorFailedCompaction
       annotations:
         dashboard: 09a5c49e9cdb2f2b24c6d184574a07fd/mimir-compactor-resources
         description: 'Mimir compactor has been failing its compactions for 2 hours.'
-        opsrecipe: mimir/
+        opsrecipe: mimir#mimircompactorfailedcompaction
       # Query is based on the following upstream mixin alerting rule : https://github.com/grafana/mimir/blob/main/operations/mimir-mixin-compiled/alerts.yaml#L858
       expr: sum(increase(cortex_compactor_runs_failed_total{reason!="shutdown"}[2h])) by (cluster_id, installation, namespace, pipeline, provider) > 2
       labels:

@@ -389,7 +389,7 @@ tests:
     input_series:
       # mimir-ingester real memory usage gradually decreases until it goes below 30% of the memory requests.
       - series: 'cortex_compactor_runs_failed_total{reason="error", installation="golem", cluster_id="golem", namespace="mimir", pipeline="testing", provider="capa"}'
-        values: "8+0x20 1+0x40 0+0x20 4+0x130 0+0x190"                             
+        values: "8+0x20 1+0x40 0+0x20 4+0x130 0+0x190"
     alert_rule_test:
       - alertname: MimirCompactorFailedCompaction
         eval_time: 15m 
@@ -415,7 +415,7 @@ tests:
             exp_annotations:
               dashboard: 09a5c49e9cdb2f2b24c6d184574a07fd/mimir-compactor-resources
               description: Mimir compactor has been failing its compactions for 2 hours.
-              opsrecipe: "mimir/"
+              opsrecipe: "mimir#mimircompactorfailedcompaction"
       - alertname: MimirCompactorFailedCompaction
         eval_time: 205m 
       - alertname: MimirCompactorFailedCompaction

@@ -227,3 +227,35 @@ tests:
               opsrecipe: loki/
       - alertname: LokiHpaReachedMaxReplicas
         eval_time: 515m 
+
+  # Test for the LokiCompactorFailedCompaction alert: no alert early on, firing while the timestamp is stale, resolved once it advances again.
+  - interval: 1m
+    input_series:
+      - series: 'loki_compactor_apply_retention_last_successful_run_timestamp_seconds{cluster_id="golem", installation="golem", namespace="loki", pipeline="testing", provider="capa"}'
+        values: "0x240 14400+60x100"  # time() equals eval_time in seconds: the timestamp stays at 0 for the first 240 minutes (4 hours), then jumps to 14400 and advances by 60 every minute for 100 minutes
+    alert_rule_test:
+      - alertname: LokiCompactorFailedCompaction
+        eval_time: 15m  # timestamp is 0 but only 900 seconds have elapsed, below the 2 hour threshold: no alert expected
+      - alertname: LokiCompactorFailedCompaction
+        eval_time: 230m  # timestamp is still 0: the 2 hour threshold has been exceeded for longer than the rule's 1h `for`, so the alert fires
+        exp_alerts:
+          - exp_labels:
+              area: platform
+              cancel_if_cluster_status_creating: "true"
+              cancel_if_cluster_status_deleting: "true"
+              cancel_if_cluster_status_updating: "true"
+              cancel_if_outside_working_hours: "true"
+              cluster_id: golem
+              installation: "golem"
+              pipeline: "testing"
+              provider: "capa"
+              namespace: loki
+              severity: page
+              team: atlas
+              topic: observability
+            exp_annotations:
+              dashboard: loki-retention/loki-retention
+              description: Loki compactor has been failing compactions for more than 2 hours.
+              opsrecipe: "loki#lokicompactorfailedcompaction"
+      - alertname: LokiCompactorFailedCompaction
+        eval_time: 300m  # timestamp is advancing again and trails time() by far less than 2 hours: no alert expected