From 9e726643210d6c065e14351140a52ce4f16a4a4c Mon Sep 17 00:00:00 2001 From: QuentinBisson Date: Tue, 5 Nov 2024 11:53:41 +0100 Subject: [PATCH] Clean up some rules a bit --- CHANGELOG.md | 3 +- .../atlas/alerting-rules/alloy.rules.yml | 21 ++- ...rter.rules.yml => grafana-cloud.rules.yml} | 30 +++- .../atlas/alerting-rules/grafana.rules.yml | 4 +- .../kube-state-metrics.rules.yml | 1 - .../atlas/alerting-rules/mimir.rules.yml | 15 -- .../atlas/alerting-rules/prometheus.rules.yml | 18 -- ....rules.test.yml => grafana-cloud.test.yml} | 2 +- .../atlas/alerting-rules/mimir.rules.test.yml | 29 ---- .../atlas/alerting-rules/alloy.rules.test.yml | 154 ++++++++++++++++++ .../logging-pipeline.rules.test.yml | 118 -------------- 11 files changed, 204 insertions(+), 191 deletions(-) rename helm/prometheus-rules/templates/platform/atlas/alerting-rules/{mimir-to-grafana-cloud-exporter.rules.yml => grafana-cloud.rules.yml} (74%) rename test/tests/providers/capi/capa-mimir/platform/atlas/alerting-rules/{mimir-to-grafana-cloud-exporter.rules.test.yml => grafana-cloud.test.yml} (99%) diff --git a/CHANGELOG.md b/CHANGELOG.md index e9ef272b..92d0a37e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -14,7 +14,6 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - `LoggingAgentDown` to be alerted when the logging agent is down. - `LogForwardingErrors` to be alerted when the `loki.write` component is failing. - `LogReceivingErrors` to be alerted when the `loki.source.api` components of the gateway is failing. - - `MonitoringAgentFailing` and `InhibitionMonitoringAgentFailing` to be alerted when the monitoring agent is not able to send metrics. ### Changed @@ -22,6 +21,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - `observability-operator` - `alloy-rules` - `observability-gateway` +- Move all `grafana-cloud` related alerts to their own file. +- Move all alloy related alerts to the alloy alert file and fix alloy-logs tests. ## [4.23.0] - 2024-10-30 diff --git a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/alloy.rules.yml b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/alloy.rules.yml index aa1959de..8b3e6256 100644 --- a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/alloy.rules.yml +++ b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/alloy.rules.yml @@ -6,7 +6,7 @@ metadata: labels: {{- include "labels.common" . | nindent 4 }} name: alloy.rules - namespace: {{ .Values.namespace }} + namespace: {{ .Values.namespace }} spec: groups: # List of alerts for on the state of the alloy components. @@ -48,7 +48,24 @@ spec: cancel_if_cluster_status_creating: "true" cancel_if_cluster_status_deleting: "true" cancel_if_cluster_status_updating: "true" - - name: logging-agent + - name: alloy.rules + rules: + - alert: AlloyForPrometheusRulesDown + annotations: + description: 'Alloy sending PrometheusRules to Loki and Mimir ruler is down.' 
+ opsrecipe: prometheus-rules/ + expr: count(up{job="alloy-rules", namespace="monitoring"} == 0) by (cluster_id, installation, provider, pipeline) > 0 + for: 1h + labels: + area: platform + cancel_if_cluster_status_creating: "true" + cancel_if_cluster_status_deleting: "true" + cancel_if_cluster_status_updating: "true" + cancel_if_outside_working_hours: "true" + severity: page + team: atlas + topic: observability + - name: alloy.logs rules: # This alert lists the existing logging-agent pods (to extract the node label and inhibit if the node is not ready) # and join the pods with the not running containers diff --git a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/mimir-to-grafana-cloud-exporter.rules.yml b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/grafana-cloud.rules.yml similarity index 74% rename from helm/prometheus-rules/templates/platform/atlas/alerting-rules/mimir-to-grafana-cloud-exporter.rules.yml rename to helm/prometheus-rules/templates/platform/atlas/alerting-rules/grafana-cloud.rules.yml index 40d76d3d..9560570e 100644 --- a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/mimir-to-grafana-cloud-exporter.rules.yml +++ b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/grafana-cloud.rules.yml @@ -1,13 +1,35 @@ -{{- if .Values.mimir.enabled }} apiVersion: monitoring.coreos.com/v1 kind: PrometheusRule metadata: labels: {{- include "labels.common" . | nindent 4 }} - name: mimir-to-grafana-cloud-exporter.rules - namespace: {{ .Values.namespace }} + {{- if not .Values.mimir.enabled }} + cluster_type: "management_cluster" + {{- end }} + name: grafana-cloud.rules + namespace: {{ .Values.namespace }} spec: groups: + - name: grafana-cloud + rules: + ## Pages Atlas when prometheus fails to send samples to cortex + - alert: PrometheusMissingGrafanaCloud + annotations: + description: 'Prometheus is not sending data to Grafana Cloud.' + opsrecipe: prometheus-grafanacloud/ + {{- if .Values.mimir.enabled }} + expr: absent(prometheus_remote_storage_samples_total{remote_name="grafana-cloud", cluster_type="management_cluster", cluster_id="{{ .Values.managementCluster.name }}", installation="{{ .Values.managementCluster.name }}", provider="{{ .Values.managementCluster.provider.kind }}", pipeline="{{ .Values.managementCluster.pipeline }}"}) + {{- else }} + expr: absent(prometheus_remote_storage_samples_total{remote_name="grafana-cloud"}) + {{- end }} + for: 1h + labels: + area: platform + cancel_if_outside_working_hours: "true" + severity: page + team: atlas + topic: observability + {{- if .Values.mimir.enabled }} - name: mimir-to-grafana-cloud-exporter rules: - alert: MimirToGrafanaCloudExporterDown @@ -73,4 +95,4 @@ spec: severity: page team: atlas topic: observability -{{- end }} + {{- end }} diff --git a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/grafana.rules.yml b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/grafana.rules.yml index 39fb4a0a..97a10780 100644 --- a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/grafana.rules.yml +++ b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/grafana.rules.yml @@ -3,9 +3,9 @@ kind: PrometheusRule metadata: labels: {{- include "labels.common" . 
| nindent 4 }} -{{- if not .Values.mimir.enabled }} + {{- if not .Values.mimir.enabled }} cluster_type: "management_cluster" -{{- end }} + {{- end }} name: grafana.rules namespace: {{ .Values.namespace }} spec: diff --git a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/kube-state-metrics.rules.yml b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/kube-state-metrics.rules.yml index 6c90a4e2..83089fc3 100644 --- a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/kube-state-metrics.rules.yml +++ b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/kube-state-metrics.rules.yml @@ -85,7 +85,6 @@ spec: severity: page team: atlas topic: observability - - alert: KubeConfigMapCreatedMetricMissing annotations: description: '{{`kube_configmap_created metric is missing for cluster {{ $labels.cluster_id }}.`}}' diff --git a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/mimir.rules.yml b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/mimir.rules.yml index cd47324a..6dc13788 100644 --- a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/mimir.rules.yml +++ b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/mimir.rules.yml @@ -61,21 +61,6 @@ spec: severity: page team: atlas topic: observability - - alert: AlloyForPrometheusRulesDown - annotations: - description: 'Alloy sending PrometheusRules to Mimir ruler is down.' - opsrecipe: prometheus-rules/ - expr: count(up{job="alloy-rules", namespace="mimir"} == 0) by (cluster_id, installation, provider, pipeline) > 0 - for: 1h - labels: - area: platform - cancel_if_cluster_status_creating: "true" - cancel_if_cluster_status_deleting: "true" - cancel_if_cluster_status_updating: "true" - cancel_if_outside_working_hours: "true" - severity: page - team: atlas - topic: observability - alert: MimirRulerEventsFailed annotations: dashboard: 631e15d5d85afb2ca8e35d62984eeaa0/mimir-ruler diff --git a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/prometheus.rules.yml b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/prometheus.rules.yml index b31713f9..a0bd48fe 100644 --- a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/prometheus.rules.yml +++ b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/prometheus.rules.yml @@ -1,7 +1,6 @@ apiVersion: monitoring.coreos.com/v1 kind: PrometheusRule metadata: - creationTimestamp: null labels: {{- include "labels.common" . | nindent 4 }} name: prometheus.rules @@ -27,23 +26,6 @@ spec: severity: page team: atlas topic: observability - ## Pages Atlas when prometheus fails to send samples to cortex - - alert: PrometheusMissingGrafanaCloud - annotations: - description: 'Prometheus is not sending data to Grafana Cloud.' 
- opsrecipe: prometheus-grafanacloud/ - {{- if .Values.mimir.enabled }} - expr: absent(prometheus_remote_storage_samples_total{remote_name="grafana-cloud", cluster_type="management_cluster", cluster_id="{{ .Values.managementCluster.name }}", installation="{{ .Values.managementCluster.name }}", provider="{{ .Values.managementCluster.provider.kind }}", pipeline="{{ .Values.managementCluster.pipeline }}"}) - {{- else }} - expr: absent(prometheus_remote_storage_samples_total{remote_name="grafana-cloud"}) - {{- end }} - for: 1h - labels: - area: platform - cancel_if_outside_working_hours: "true" - severity: page - team: atlas - topic: observability - alert: PrometheusFailsToCommunicateWithRemoteStorageAPI annotations: description: '{{`Prometheus can''t communicate with Remote Storage API at {{ $labels.url }}.`}}' diff --git a/test/tests/providers/capi/capa-mimir/platform/atlas/alerting-rules/mimir-to-grafana-cloud-exporter.rules.test.yml b/test/tests/providers/capi/capa-mimir/platform/atlas/alerting-rules/grafana-cloud.test.yml similarity index 99% rename from test/tests/providers/capi/capa-mimir/platform/atlas/alerting-rules/mimir-to-grafana-cloud-exporter.rules.test.yml rename to test/tests/providers/capi/capa-mimir/platform/atlas/alerting-rules/grafana-cloud.test.yml index ee5645cf..79c5aa0f 100644 --- a/test/tests/providers/capi/capa-mimir/platform/atlas/alerting-rules/mimir-to-grafana-cloud-exporter.rules.test.yml +++ b/test/tests/providers/capi/capa-mimir/platform/atlas/alerting-rules/grafana-cloud.test.yml @@ -1,6 +1,6 @@ --- rule_files: -- mimir-to-grafana-cloud-exporter.rules.yml +- grafana-cloud.rules.yml tests: # Tests for `MimirToGrafanaCloudExporterDown` alert diff --git a/test/tests/providers/capi/capa-mimir/platform/atlas/alerting-rules/mimir.rules.test.yml b/test/tests/providers/capi/capa-mimir/platform/atlas/alerting-rules/mimir.rules.test.yml index 37d40af1..6bdfeaea 100644 --- a/test/tests/providers/capi/capa-mimir/platform/atlas/alerting-rules/mimir.rules.test.yml +++ b/test/tests/providers/capi/capa-mimir/platform/atlas/alerting-rules/mimir.rules.test.yml @@ -86,35 +86,6 @@ tests: dashboard: ffcd83628d7d4b5a03d1cafd159e6c9c/mimir-overview description: "Mimir component : mimir-ingester is down." opsrecipe: "mimir/" - - interval: 1m - input_series: - # test with 1 pod: none, up, down - - series: 'up{job="alloy-rules", cluster_type="management_cluster", cluster_id="golem", provider="capa", pipeline="testing", installation="golem", namespace="mimir"}' - values: "_x20 1+0x70 0+0x70" - alert_rule_test: - - alertname: AlloyForPrometheusRulesDown - eval_time: 10m - - alertname: AlloyForPrometheusRulesDown - eval_time: 80m - - alertname: AlloyForPrometheusRulesDown - eval_time: 160m - exp_alerts: - - exp_labels: - area: platform - cancel_if_outside_working_hours: "true" - cancel_if_cluster_status_creating: "true" - cancel_if_cluster_status_deleting: "true" - cancel_if_cluster_status_updating: "true" - cluster_id: golem - installation: golem - provider: capa - pipeline: testing - severity: page - team: atlas - topic: observability - exp_annotations: - description: "Alloy sending PrometheusRules to Mimir ruler is down." 
- opsrecipe: "prometheus-rules/" - interval: 1m input_series: # test: none, rate > 0, rate = 0 diff --git a/test/tests/providers/global/platform/atlas/alerting-rules/alloy.rules.test.yml b/test/tests/providers/global/platform/atlas/alerting-rules/alloy.rules.test.yml index 2effa82d..d8b9309a 100644 --- a/test/tests/providers/global/platform/atlas/alerting-rules/alloy.rules.test.yml +++ b/test/tests/providers/global/platform/atlas/alerting-rules/alloy.rules.test.yml @@ -72,3 +72,157 @@ tests: summary: "Unhealthy components detected." - alertname: AlloyUnhealthyComponents eval_time: 80m + + # Test AlloyForPrometheusRulesDown + - interval: 1m + input_series: + # test with 1 pod: none, up, down + - series: 'up{job="alloy-rules", cluster_type="management_cluster", cluster_id="golem", provider="capa", pipeline="testing", installation="golem", namespace="monitoring"}' + values: "_x20 1+0x70 0+0x70" + alert_rule_test: + - alertname: AlloyForPrometheusRulesDown + eval_time: 10m + - alertname: AlloyForPrometheusRulesDown + eval_time: 80m + - alertname: AlloyForPrometheusRulesDown + eval_time: 160m + exp_alerts: + - exp_labels: + area: platform + cancel_if_outside_working_hours: "true" + cancel_if_cluster_status_creating: "true" + cancel_if_cluster_status_deleting: "true" + cancel_if_cluster_status_updating: "true" + cluster_id: golem + installation: golem + provider: capa + pipeline: testing + severity: page + team: atlas + topic: observability + exp_annotations: + description: "Alloy sending PrometheusRules to Loki and Mimir ruler is down." + opsrecipe: "prometheus-rules/" + + # Test LoggingAgentDown + - interval: 1m + input_series: + # For the first 60min: test with 1 pod: none, up, down + - series: 'up{container="alloy", cluster_id="gauss", cluster_type="management_cluster", installation="gauss", job="alloy-logs", pod="alloy-logs-1xxxx", provider="aws", pipeline="testing"}' + values: "_x20 1+0x20 0+0x40" + - series: kube_pod_info{cluster_id="gauss", cluster_type="management_cluster", installation="gauss", pod="alloy-logs-1xxxx", node="ip-10-0-5-1.eu-west-1.compute.internal", provider="aws", pipeline="testing"} + values: "1x180" + # From 60min: test with 2 pods: 1 up and 1 down, 2 up, 2 down. 
+ - series: 'up{container="alloy", cluster_id="gauss", cluster_type="management_cluster", installation="gauss", job="alloy-logs", pod="alloy-logs-2xxxx", provider="aws", pipeline="testing"}' + values: "_x80 1+0x40 1+0x20 0+0x40" + - series: kube_pod_info{cluster_id="gauss", cluster_type="management_cluster", installation="gauss", pod="alloy-logs-2xxxx", node="ip-10-0-5-2.eu-west-1.compute.internal", provider="aws", pipeline="testing"} + values: "1x180" + - series: 'up{container="alloy", cluster_type="management_cluster", cluster_id="gauss", installation="gauss", job="alloy-logs", pod="alloy-logs-3xxxx", provider="aws", pipeline="testing"}' + values: "_x80 0+0x40 1+0x20 0+0x40" + - series: kube_pod_info{cluster_id="gauss", cluster_type="management_cluster", installation="gauss", pod="alloy-logs-3xxxx", node="ip-10-0-5-3.eu-west-1.compute.internal", provider="aws", pipeline="testing"} + values: "1x180" + alert_rule_test: + - alertname: LoggingAgentDown + eval_time: 10m + - alertname: LoggingAgentDown + eval_time: 30m + - alertname: LoggingAgentDown + eval_time: 71m + exp_alerts: + - exp_labels: + area: platform + cancel_if_outside_working_hours: "true" + cancel_if_cluster_status_creating: "true" + cancel_if_cluster_status_deleting: "true" + cancel_if_cluster_status_updating: "true" + cancel_if_node_unschedulable: "true" + cancel_if_node_not_ready: "true" + cluster_id: gauss + cluster_type: management_cluster + installation: gauss + node: ip-10-0-5-1.eu-west-1.compute.internal + pipeline: testing + pod: alloy-logs-1xxxx + provider: aws + severity: page + team: atlas + topic: observability + exp_annotations: + dashboard: "53c1ecddc3a1d5d4b8d6cd0c23676c31/alloy-logs-overview" + description: "Scraping of all logging-agent pods to check if one failed every 30 minutes." + opsrecipe: "alloy/" + # Tests with 2 pods + - alertname: LoggingAgentDown + eval_time: 111m + exp_alerts: + - exp_labels: + area: platform + cancel_if_outside_working_hours: "true" + cancel_if_cluster_status_creating: "true" + cancel_if_cluster_status_deleting: "true" + cancel_if_cluster_status_updating: "true" + cancel_if_node_unschedulable: "true" + cancel_if_node_not_ready: "true" + cluster_id: gauss + cluster_type: management_cluster + installation: gauss + node: ip-10-0-5-3.eu-west-1.compute.internal + pipeline: testing + pod: alloy-logs-3xxxx + provider: aws + severity: page + team: atlas + topic: observability + exp_annotations: + dashboard: "53c1ecddc3a1d5d4b8d6cd0c23676c31/alloy-logs-overview" + description: "Scraping of all logging-agent pods to check if one failed every 30 minutes." + opsrecipe: "alloy/" + - alertname: LoggingAgentDown + eval_time: 121m + - alertname: LoggingAgentDown + eval_time: 180m + exp_alerts: + - exp_labels: + area: platform + cancel_if_outside_working_hours: "true" + cancel_if_cluster_status_creating: "true" + cancel_if_cluster_status_deleting: "true" + cancel_if_cluster_status_updating: "true" + cancel_if_node_unschedulable: "true" + cancel_if_node_not_ready: "true" + cluster_id: gauss + cluster_type: management_cluster + installation: gauss + node: ip-10-0-5-2.eu-west-1.compute.internal + pipeline: testing + pod: alloy-logs-2xxxx + provider: aws + severity: page + team: atlas + topic: observability + exp_annotations: + dashboard: "53c1ecddc3a1d5d4b8d6cd0c23676c31/alloy-logs-overview" + description: "Scraping of all logging-agent pods to check if one failed every 30 minutes." 
+ opsrecipe: "alloy/" + - exp_labels: + area: platform + cancel_if_outside_working_hours: "true" + cancel_if_cluster_status_creating: "true" + cancel_if_cluster_status_deleting: "true" + cancel_if_cluster_status_updating: "true" + cancel_if_node_unschedulable: "true" + cancel_if_node_not_ready: "true" + cluster_id: gauss + cluster_type: management_cluster + installation: gauss + node: ip-10-0-5-3.eu-west-1.compute.internal + pipeline: testing + pod: alloy-logs-3xxxx + provider: aws + severity: page + team: atlas + topic: observability + exp_annotations: + dashboard: "53c1ecddc3a1d5d4b8d6cd0c23676c31/alloy-logs-overview" + description: "Scraping of all logging-agent pods to check if one failed every 30 minutes." + opsrecipe: "alloy/" diff --git a/test/tests/providers/global/platform/atlas/alerting-rules/logging-pipeline.rules.test.yml b/test/tests/providers/global/platform/atlas/alerting-rules/logging-pipeline.rules.test.yml index 31217a0a..fccbfa5a 100644 --- a/test/tests/providers/global/platform/atlas/alerting-rules/logging-pipeline.rules.test.yml +++ b/test/tests/providers/global/platform/atlas/alerting-rules/logging-pipeline.rules.test.yml @@ -3,124 +3,6 @@ rule_files: - logging-pipeline.rules.yml tests: - # Test LoggingAgentDown - - interval: 1m - input_series: - # For the first 60min: test with 1 pod: none, up, down - - series: 'up{container="alloy", cluster_id="gauss", cluster_type="management_cluster", installation="gauss", job="alloy-logs", pod="alloy-1xxxx", provider="aws", pipeline="testing"}' - values: "_x20 1+0x20 0+0x40" - - series: kube_pod_info{cluster_id="gauss", cluster_type="management_cluster", installation="gauss", pod="alloy-1xxxx", node="ip-10-0-5-1.eu-west-1.compute.internal", provider="aws", pipeline="testing"} - values: "1x180" - # From 60min: test with 2 pods: 1 up and 1 down, 2 up, 2 down. - - series: 'up{container="alloy", cluster_id="gauss", cluster_type="management_cluster", installation="gauss", job="alloy-logs", pod="alloy-2xxxx", provider="aws", pipeline="testing"}' - values: "_x80 1+0x40 1+0x20 0+0x40" - - series: kube_pod_info{cluster_id="gauss", cluster_type="management_cluster", installation="gauss", pod="alloy-2xxxx", node="ip-10-0-5-2.eu-west-1.compute.internal", provider="aws", pipeline="testing"} - values: "1x180" - - series: 'up{container="alloy", cluster_type="management_cluster", cluster_id="gauss", installation="gauss", job="alloy-logs", pod="alloy-3xxxx", provider="aws", pipeline="testing"}' - values: "_x80 0+0x40 1+0x20 0+0x40" - - series: kube_pod_info{cluster_id="gauss", cluster_type="management_cluster", installation="gauss", pod="alloy-3xxxx", node="ip-10-0-5-3.eu-west-1.compute.internal", provider="aws", pipeline="testing"} - values: "1x180" - alert_rule_test: - - alertname: LoggingAgentDown - eval_time: 10m - - alertname: LoggingAgentDown - eval_time: 30m - - alertname: LoggingAgentDown - eval_time: 71m - exp_alerts: - - exp_labels: - area: platform - cancel_if_outside_working_hours: "true" - cancel_if_cluster_status_creating: "true" - cancel_if_cluster_status_deleting: "true" - cancel_if_cluster_status_updating: "true" - cancel_if_node_unschedulable: "true" - cancel_if_node_not_ready: "true" - cluster_id: gauss - cluster_type: management_cluster - installation: gauss - node: ip-10-0-5-1.eu-west-1.compute.internal - pipeline: testing - pod: alloy-1xxxx - provider: aws - severity: page - team: atlas - topic: observability - exp_annotations: - description: "Scraping of all alloy pods to check if one failed every 30 minutes." 
- opsrecipe: "alloy/" - # Tests with 2 pods - - alertname: LoggingAgentDown - eval_time: 111m - exp_alerts: - - exp_labels: - area: platform - cancel_if_outside_working_hours: "true" - cancel_if_cluster_status_creating: "true" - cancel_if_cluster_status_deleting: "true" - cancel_if_cluster_status_updating: "true" - cancel_if_node_unschedulable: "true" - cancel_if_node_not_ready: "true" - cluster_id: gauss - cluster_type: management_cluster - installation: gauss - node: ip-10-0-5-3.eu-west-1.compute.internal - pipeline: testing - pod: alloy-3xxxx - provider: aws - severity: page - team: atlas - topic: observability - exp_annotations: - description: "Scraping of all alloy pods to check if one failed every 30 minutes." - opsrecipe: "alloy/" - - alertname: LoggingAgentDown - eval_time: 121m - - alertname: LoggingAgentDown - eval_time: 180m - exp_alerts: - - exp_labels: - area: platform - cancel_if_outside_working_hours: "true" - cancel_if_cluster_status_creating: "true" - cancel_if_cluster_status_deleting: "true" - cancel_if_cluster_status_updating: "true" - cancel_if_node_unschedulable: "true" - cancel_if_node_not_ready: "true" - cluster_id: gauss - cluster_type: management_cluster - installation: gauss - node: ip-10-0-5-2.eu-west-1.compute.internal - pipeline: testing - pod: alloy-2xxxx - provider: aws - severity: page - team: atlas - topic: observability - exp_annotations: - description: "Scraping of all alloy pods to check if one failed every 30 minutes." - opsrecipe: "alloy/" - - exp_labels: - area: platform - cancel_if_outside_working_hours: "true" - cancel_if_cluster_status_creating: "true" - cancel_if_cluster_status_deleting: "true" - cancel_if_cluster_status_updating: "true" - cancel_if_node_unschedulable: "true" - cancel_if_node_not_ready: "true" - cluster_id: gauss - cluster_type: management_cluster - installation: gauss - node: ip-10-0-5-3.eu-west-1.compute.internal - pipeline: testing - pod: alloy-3xxxx - provider: aws - severity: page - team: atlas - topic: observability - exp_annotations: - description: "Scraping of all alloy pods to check if one failed every 30 minutes." - opsrecipe: "alloy/" # Test LogForwardingErrors - interval: 1m input_series: