From 96364d1f416b8f61154b1b75633ef30679e8508f Mon Sep 17 00:00:00 2001
From: QuentinBisson
Date: Tue, 29 Oct 2024 17:01:17 +0100
Subject: [PATCH 01/24] add sensible alerts for alloy

---
 .../atlas/alerting-rules/alloy.rules.yml      |  49 +++
 .../deployment.management-cluster.rules.yml   |   2 +-
 .../atlas/alerting-rules/logging.rules.yaml   | 114 +++++++
 .../atlas/alerting-rules/monitoring.rules.yml | 136 ++++++++
 .../alerting-rules/prometheus-agent.rules.yml | 291 +++++++++---------
 .../atlas/alerting-rules/prometheus.rules.yml |  17 -
 .../atlas/alerting-rules/promtail.rules.yml   |   5 +-
 7 files changed, 448 insertions(+), 166 deletions(-)
 create mode 100644 helm/prometheus-rules/templates/platform/atlas/alerting-rules/alloy.rules.yml
 create mode 100644 helm/prometheus-rules/templates/platform/atlas/alerting-rules/logging.rules.yaml
 create mode 100644 helm/prometheus-rules/templates/platform/atlas/alerting-rules/monitoring.rules.yml

diff --git a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/alloy.rules.yml b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/alloy.rules.yml
new file mode 100644
index 000000000..148168239
--- /dev/null
+++ b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/alloy.rules.yml
@@ -0,0 +1,49 @@
+# This file describes common alloy alerting rules
+# For alerts regarding monitoring and logging agents, please go to the respective files (logging.rules.yml and monitoring.rules.yml).
+apiVersion: monitoring.coreos.com/v1
+kind: PrometheusRule
+metadata:
+  labels:
+    {{- include "labels.common" . | nindent 4 }}
+  name: alloy.rules
+  namespace: {{ .Values.namespace }}
+spec:
+  groups:
+  ## TODO(quentin) add tests for the alerts
+  ## TODO(quentin) add opsrecipe for the alerts
+  ## TODO(quentin) add dashboard annotation for the alerts
+  # List of alerts on the state of the alloy components.
+  # Alerts are coming from https://github.com/grafana/alloy/blob/ed52746567d2469a6a97a592ac5aec807646b327/operations/alloy-mixin/alerts/controller.libsonnet
+  # We added the alert labels and added the missing labels from the aggregations.
+  - name: alloy.controller
+    rules:
+    - alert: SlowComponentEvaluations
+      annotations:
+        description: Component evaluations are taking too long under job {{ $labels.job }}, component_path {{ $labels.component_path }}, component_id {{ $labels.component_id }}.
+        summary: Component evaluations are taking too long.
+      expr: sum by (cluster_id, installation, provider, pipeline, namespace, job, component_path, component_id) (rate(alloy_component_evaluation_slow_seconds[10m])) > 0
+      for: 15m
+      labels:
+        area: platform
+        severity: notify
+        team: atlas
+        topic: observability
+        cancel_if_outside_working_hours: "true"
+        cancel_if_cluster_status_creating: "true"
+        cancel_if_cluster_status_deleting: "true"
+        cancel_if_cluster_status_updating: "true"
+    - alert: UnhealthyComponents
+      annotations:
+        description: Unhealthy components detected under job {{ $labels.job }}
+        summary: Unhealthy components detected.
+      expr: sum by (cluster_id, installation, provider, pipeline, namespace, job) (alloy_component_controller_running_components{health_type!="healthy"}) > 0
+      for: 15m
+      labels:
+        area: platform
+        severity: page
+        team: atlas
+        topic: observability
+        cancel_if_outside_working_hours: "true"
+        cancel_if_cluster_status_creating: "true"
+        cancel_if_cluster_status_deleting: "true"
+        cancel_if_cluster_status_updating: "true"

diff --git a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/deployment.management-cluster.rules.yml b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/deployment.management-cluster.rules.yml
index 6d62a35bc..54a070368 100644
--- a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/deployment.management-cluster.rules.yml
+++ b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/deployment.management-cluster.rules.yml
@@ -17,7 +17,7 @@ spec:
       annotations:
         description: '{{`Deployment {{ $labels.namespace}}/{{ $labels.deployment }} is not satisfied.`}}'
         opsrecipe: deployment-not-satisfied/
-      expr: kube_deployment_status_replicas_unavailable{cluster_type="management_cluster", deployment=~"alertmanager.*|grafana.*|prometheus.*|promxy.*|mimir.*|loki.*|tempo.*|pyroscope.*|object-storage.*|logging-operator.*|silence-operator.*|sloth.*"} > 0
+      expr: kube_deployment_status_replicas_unavailable{cluster_type="management_cluster", deployment=~"alloy-rules.*|alertmanager.*|grafana.*|logging-operator.*|loki.*|mimir.*|oauth2-proxy.*|object-storage.*|observability-gateway.*|observability-operator.*|prometheus.*|promxy.*|tempo.*|pyroscope.*|silence-operator.*|sloth.*"} > 0
       for: 30m
       labels:
         area: platform
diff --git a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/logging.rules.yaml b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/logging.rules.yaml
new file mode 100644
index 000000000..71d96e782
--- /dev/null
+++ b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/logging.rules.yaml
@@ -0,0 +1,114 @@
+apiVersion: monitoring.coreos.com/v1
+kind: PrometheusRule
+metadata:
+  labels:
+    {{- include "labels.common" . | nindent 4 }}
+  name: logging.rules
+  namespace: {{ .Values.namespace }}
+spec:
+  groups:
+  ## TODO(quentin) add tests for the alerts
+  ## TODO(quentin) add opsrecipe for the alerts
+  ## TODO(quentin) add dashboard annotation for the alerts
+  - name: logging-agent
+    rules:
+    # This alert lists the existing logging-agent pods (to extract the node label and inhibit if the node is not ready)
+    # and joins the pods with the non-running containers
+    - alert: LoggingAgentDown
+      annotations:
+        description: '{{`Scraping of all logging-agent pods to check if one failed every 30 minutes.`}}'
+        opsrecipe: logging-agent/
+      expr: |-
+        kube_pod_info{pod=~"alloy-logs.*"}
+        * on(cluster_id, pod)
+        group_left ()
+        up{job="alloy-logs", container="alloy"} == 0
+      for: 30m
+      labels:
+        area: platform
+        severity: page
+        team: atlas
+        topic: observability
+        cancel_if_outside_working_hours: "true"
+        cancel_if_cluster_status_creating: "true"
+        cancel_if_cluster_status_deleting: "true"
+        cancel_if_cluster_status_updating: "true"
+        cancel_if_node_unschedulable: "true"
+        cancel_if_node_not_ready: "true"
+  - name: log-ingestion
+    rules:
+    # Any alloy component that uses the loki.write component can throw such errors.
+    # This includes alloy-logs and the observability-gateway
+    - alert: LogForwardingErrors
+      annotations:
+        description: '{{`More than 10% of the requests to Loki are failing.`}}'
+        opsrecipe: logging-errors/
+      expr: |-
+        (
+          100
+          *
+          (
+            (
+              sum by (cluster_id, installation, provider, pipeline, namespace, job, instance) (
+                rate (
+                  loki_write_request_duration_seconds_count{status_code!~"2.."}[5m:]
+                )
+              )
+            )
+            /
+            (
+              sum by (cluster_id, installation, provider, pipeline, namespace, job, instance) (
+                rate (
+                  loki_write_request_duration_seconds_count[5m:]
+                )
+              )
+            )
+          )
+        )
+        > 10
+      for: 15m
+      labels:
+        area: platform
+        severity: page
+        team: atlas
+        topic: observability
+        cancel_if_cluster_status_creating: "true"
+        cancel_if_cluster_status_deleting: "true"
+        cancel_if_cluster_status_updating: "true"
+    # This alert pages when the loki source api component of the observability gateway is throwing errors
+    - alert: LogReceivingErrors
+      annotations:
+        description: '{{`More than 10% of the loki requests to the observability gateway are failing.`}}'
+        opsrecipe: logging-errors/
+      expr: |-
+        (
+          100
+          *
+          (
+            (
+              sum by (cluster_id, installation, provider, pipeline, namespace, job, instance, route) (
+                rate (
+                  loki_source_api_request_duration_seconds_count{route=~"(loki_)?api_v1_push", status_code!~"2.."}[5m:]
+                )
+              )
+            )
+            /
+            (
+              sum by (cluster_id, installation, provider, pipeline, namespace, job, instance, route) (
+                rate (
+                  loki_source_api_request_duration_seconds_count{route=~"(loki_)?api_v1_push"}[5m:]
+                )
+              )
+            )
+          )
+        )
+        > 10
+      for: 15m
+      labels:
+        area: platform
+        severity: page
+        team: atlas
+        topic: observability
+        cancel_if_cluster_status_creating: "true"
+        cancel_if_cluster_status_deleting: "true"
+        cancel_if_cluster_status_updating: "true"
diff --git a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/monitoring.rules.yml b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/monitoring.rules.yml
new file mode 100644
index 000000000..745b86f7e
--- /dev/null
+++ b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/monitoring.rules.yml
@@ -0,0 +1,136 @@
+apiVersion: monitoring.coreos.com/v1
+kind: PrometheusRule
+metadata:
+  labels:
+    {{- include "labels.common" . | nindent 4 }}
+  name: monitoring.rules
+  namespace: {{ .Values.namespace }}
+spec:
+  groups:
+  ## TODO(quentin) add tests for the monitoring agent alerts
+  ## TODO(quentin) add opsrecipe for the monitoring agent alerts
+  ## TODO(quentin) add dashboard annotation for the monitoring agent alerts
+  ## TODO(quentin) replace MonitoringAgentShardsMissing for alloy-metrics
+  ## TODO(quentin) add component specific errors to replace the ones in the prometheus.rules.yml
+  - name: monitoring-agent
+    rules:
+    ## This alert pages if the monitoring-agent fails to send samples to its remote write endpoint.
+    - alert: MonitoringAgentFailing
+      annotations:
+        description: '{{`Monitoring agent fails to send its data via remote write.`}}'
+        summary: Monitoring agent fails to send samples to its configured remote write endpoint.
+        opsrecipe: monitoring-agent/
+      expr: |-
+        (
+          label_replace(
+            capi_cluster_status_condition{type="ControlPlaneReady", status="True"},
+            "cluster_id",
+            "$1",
+            "name",
+            "(.*)"
+          ) == 1
+        ) unless on (cluster_id) (
+          count(up{job="alloy-metrics"} > 0) by (cluster_id)
+        )
+      for: 20m
+      labels:
+        area: platform
+        severity: page
+        team: atlas
+        topic: observability
+        inhibit_monitoring_agent_down: "true"
+        cancel_if_cluster_is_not_running_monitoring_agent: "true"
+        cancel_if_cluster_status_creating: "true"
+        cancel_if_cluster_status_deleting: "true"
+        cancel_if_cluster_has_no_workers: "true"
+    ## Same as MonitoringAgentFailing, but triggers inhibition earlier and does not page.
+    - alert: MonitoringAgentFailingInhibition
+      annotations:
+        description: '{{`Monitoring agent fails to send its data via remote write.`}}'
+        summary: Monitoring agent fails to send samples to its configured remote write endpoint.
+        opsrecipe: monitoring-agent/
+      expr: |-
+        (
+          label_replace(
+            capi_cluster_status_condition{type="ControlPlaneReady", status="True"},
+            "cluster_id",
+            "$1",
+            "name",
+            "(.*)"
+          ) == 1
+        ) unless on (cluster_id) (
+          count(up{job="prometheus-agent"} > 0) by (cluster_id)
+        )
+      for: 2m
+      labels:
+        area: platform
+        severity: none
+        team: atlas
+        topic: observability
+        inhibit_monitoring_agent_down: "true"
+        cancel_if_cluster_is_not_running_monitoring_agent: "true"
+        cancel_if_cluster_status_creating: "true"
+        cancel_if_cluster_status_deleting: "true"
+    ## This alert pages if some of the monitoring agent shards are not running.
+    - alert: MonitoringAgentShardsMissing
+      annotations:
+        description: '{{`At least one of the monitoring agent shards is missing.`}}'
+        summary: Monitoring agent is missing some shards.
+        opsrecipe: monitoring-agent/
+      expr: |-
+        max_over_time(sum by (cluster_id, installation, provider, pipeline)(
+          count(
+            ## number of remotes that are not mimir or grafana-cloud
+            prometheus_remote_storage_metadata_total{remote_name!~"grafana-cloud|mimir.*"}
+          ) by (cluster_id, installation, provider, pipeline)
+          !=
+          sum(
+            ## number of shards defined in the Prometheus CR
+            prometheus_operator_spec_shards{controller="prometheus",name="prometheus-agent"}
+            # if there is only 1 shard, there is no shard metric so we use the replicas metric
+            or prometheus_operator_spec_replicas{controller="prometheus",name="prometheus-agent"}
+          ) by (cluster_id, installation, provider, pipeline)
+        )[5m:])
+      for: 40m
+      labels:
+        area: platform
+        severity: page
+        team: atlas
+        topic: observability
+        inhibit_monitoring_agent_down: "true"
+        cancel_if_cluster_is_not_running_monitoring_agent: "true"
+        cancel_if_cluster_status_creating: "true"
+        cancel_if_cluster_status_deleting: "true"
+        cancel_if_outside_working_hours: "true"
+    ## Same as MonitoringAgentShardsMissing but triggers inhibition earlier, and does not page.
+    - alert: MonitoringAgentShardsMissingInhibition
+      annotations:
+        description: '{{`At least one of the monitoring agent shards is missing.`}}'
+        summary: Monitoring agent is missing some shards.
+ opsrecipe: monitoring-agent/ + expr: |- + max_over_time(sum by (cluster_id, installation, provider, pipeline)( + count( + ## number of remotes that are not mimir or grafana-cloud + prometheus_remote_storage_metadata_total{remote_name!~"grafana-cloud|mimir.*"} + ) by (cluster_id, installation, provider, pipeline) + != + sum( + ## number of shards defined in the Prometheus CR + prometheus_operator_spec_shards{controller="prometheus",name="prometheus-agent"} + # if there is only 1 shard, there is no shard metric so we use the replicas metric + or prometheus_operator_spec_replicas{controller="prometheus",name="prometheus-agent"} + ) by (cluster_id, installation, provider, pipeline) + )[5m:]) + for: 2m + labels: + area: platform + severity: none + team: atlas + topic: observability + inhibit_monitoring_agent_down: "true" + cancel_if_cluster_is_not_running_monitoring_agent: "true" + cancel_if_cluster_status_creating: "true" + cancel_if_cluster_status_deleting: "true" + cancel_if_outside_working_hours: "true" + diff --git a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/prometheus-agent.rules.yml b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/prometheus-agent.rules.yml index 0dfbc0c91..81163c13e 100644 --- a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/prometheus-agent.rules.yml +++ b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/prometheus-agent.rules.yml @@ -1,157 +1,156 @@ apiVersion: monitoring.coreos.com/v1 kind: PrometheusRule metadata: - creationTimestamp: null labels: {{- include "labels.common" . | nindent 4 }} name: prometheus-agent.rules namespace: {{ .Values.namespace }} spec: groups: - - name: prometheus-agent - rules: - ## Page Atlas if prometheus agent fails to send samples to MC prometheus. - - alert: PrometheusAgentFailing - annotations: - description: '{{`Prometheus agent remote write is failing.`}}' - summary: Prometheus agent fails to send samples to remote write endpoint. - opsrecipe: prometheus-agent/ - dashboard: promRW001/prometheus-remote-write - {{- if not .Values.mimir.enabled }} - expr: |- - max_over_time( - sum by (cluster_type, cluster_id, installation, instance, service) + - name: prometheus-agent + rules: + ## This alert pages if prometheus-agent fails to send samples to its remote write endpoint. + - alert: PrometheusAgentFailing + annotations: + description: '{{`Prometheus agent remote write is failing.`}}' + summary: Prometheus agent fails to send samples to remote write endpoint. 
+ opsrecipe: prometheus-agent/ + dashboard: promRW001/prometheus-remote-write + {{- if not .Values.mimir.enabled }} + expr: |- + max_over_time( + sum by (cluster_type, cluster_id, installation, instance, service) + ( + up{instance="prometheus-agent"} == 0 + or + absent(up{instance="prometheus-agent"}) == 1 + )[5m:] + ) + {{- else }} + expr: |- ( - up{instance="prometheus-agent"} == 0 - or - absent(up{instance="prometheus-agent"}) == 1 - )[5m:] - ) - {{- else }} - expr: |- - ( - label_replace( - capi_cluster_status_condition{type="ControlPlaneReady", status="True"}, - "cluster_id", - "$1", - "name", - "(.*)" - ) == 1 - ) unless on (cluster_id) ( - count(up{job="prometheus-agent"} > 0) by (cluster_id) - ) - {{- end }} - for: 20m - labels: - area: platform - severity: page - team: atlas - topic: observability - inhibit_prometheus_agent_down: "true" - cancel_if_cluster_is_not_running_prometheus_agent: "true" - cancel_if_cluster_status_creating: "true" - cancel_if_cluster_status_deleting: "true" - cancel_if_cluster_has_no_workers: "true" - ## Same as PrometheusAgentFailing, but triggers inhibition earlier and does not page. - - alert: PrometheusAgentFailingInhibition - annotations: - description: '{{`Prometheus agent remote write is failing.`}}' - summary: Prometheus agent fails to send samples to remote write endpoint. - opsrecipe: prometheus-agent/ - dashboard: promRW001/prometheus-remote-write - {{- if not .Values.mimir.enabled }} - expr: |- - max_over_time( - sum by (cluster_type, cluster_id, installation, instance, service) + label_replace( + capi_cluster_status_condition{type="ControlPlaneReady", status="True"}, + "cluster_id", + "$1", + "name", + "(.*)" + ) == 1 + ) unless on (cluster_id) ( + count(up{job="prometheus-agent"} > 0) by (cluster_id) + ) + {{- end }} + for: 20m + labels: + area: platform + severity: page + team: atlas + topic: observability + inhibit_monitoring_agent_down: "true" + cancel_if_cluster_is_not_running_monitoring_agent: "true" + cancel_if_cluster_status_creating: "true" + cancel_if_cluster_status_deleting: "true" + cancel_if_cluster_has_no_workers: "true" + ## Same as PrometheusAgentFailing, but triggers inhibition earlier and does not page. + - alert: PrometheusAgentFailingInhibition + annotations: + description: '{{`Prometheus agent remote write is failing.`}}' + summary: Prometheus agent fails to send samples to remote write endpoint. + opsrecipe: prometheus-agent/ + dashboard: promRW001/prometheus-remote-write + {{- if not .Values.mimir.enabled }} + expr: |- + max_over_time( + sum by (cluster_type, cluster_id, installation, instance, service) + ( + up{instance="prometheus-agent"} == 0 + or + absent(up{instance="prometheus-agent"}) == 1 + )[5m:] + ) + {{- else }} + expr: |- ( - up{instance="prometheus-agent"} == 0 - or - absent(up{instance="prometheus-agent"}) == 1 - )[5m:] - ) - {{- else }} - expr: |- - ( - label_replace( - capi_cluster_status_condition{type="ControlPlaneReady", status="True"}, - "cluster_id", - "$1", - "name", - "(.*)" - ) == 1 - ) unless on (cluster_id) ( - count(up{job="prometheus-agent"} > 0) by (cluster_id) - ) - {{- end }} - for: 2m - labels: - area: platform - severity: none - team: atlas - topic: observability - inhibit_prometheus_agent_down: "true" - cancel_if_cluster_is_not_running_prometheus_agent: "true" - cancel_if_cluster_status_creating: "true" - cancel_if_cluster_status_deleting: "true" - ## Page Atlas if prometheus agent is missing shards to send samples to MC prometheus. 
- - alert: PrometheusAgentShardsMissing - annotations: - description: '{{`Prometheus agent is missing shards.`}}' - summary: Prometheus agent is missing shards. - opsrecipe: prometheus-agent/ - expr: |- - max_over_time(sum by (cluster_id, installation, provider, pipeline)( - count( - ## number of remotes that are not mimir or grafana-cloud - prometheus_remote_storage_metadata_total{remote_name!~"grafana-cloud|mimir.*"} - ) by (cluster_id, installation, provider, pipeline) - != - sum( - ## number of shards defined in the Prometheus CR - prometheus_operator_spec_shards{controller="prometheus",name="prometheus-agent"} - # if there is only 1 shard, there is no shard metric so we use the replicas metric - or prometheus_operator_spec_replicas{controller="prometheus",name="prometheus-agent"} - ) by (cluster_id, installation, provider, pipeline) - )[5m:]) - for: 40m - labels: - area: platform - severity: page - team: atlas - topic: observability - inhibit_prometheus_agent_down: "true" - cancel_if_cluster_is_not_running_prometheus_agent: "true" - cancel_if_cluster_status_creating: "true" - cancel_if_cluster_status_deleting: "true" - cancel_if_outside_working_hours: "true" - ## Same as PrometheusAgentShardsMissing but triggers inhibition earlier, and does not page. - - alert: PrometheusAgentShardsMissingInhibition - annotations: - description: '{{`Prometheus agent is missing shards.`}}' - summary: Prometheus agent is missing shards. - opsrecipe: prometheus-agent/ - expr: |- - max_over_time(sum by (cluster_id, installation, provider, pipeline)( - count( - ## number of remotes that are not mimir or grafana-cloud - prometheus_remote_storage_metadata_total{remote_name!~"grafana-cloud|mimir.*"} - ) by (cluster_id, installation, provider, pipeline) - != - sum( - ## number of shards defined in the Prometheus CR - prometheus_operator_spec_shards{controller="prometheus",name="prometheus-agent"} - # if there is only 1 shard, there is no shard metric so we use the replicas metric - or prometheus_operator_spec_replicas{controller="prometheus",name="prometheus-agent"} - ) by (cluster_id, installation, provider, pipeline) - )[5m:]) - for: 2m - labels: - area: platform - severity: none - team: atlas - topic: observability - inhibit_prometheus_agent_down: "true" - cancel_if_cluster_is_not_running_prometheus_agent: "true" - cancel_if_cluster_status_creating: "true" - cancel_if_cluster_status_deleting: "true" - cancel_if_outside_working_hours: "true" + label_replace( + capi_cluster_status_condition{type="ControlPlaneReady", status="True"}, + "cluster_id", + "$1", + "name", + "(.*)" + ) == 1 + ) unless on (cluster_id) ( + count(up{job="prometheus-agent"} > 0) by (cluster_id) + ) + {{- end }} + for: 2m + labels: + area: platform + severity: none + team: atlas + topic: observability + inhibit_monitoring_agent_down: "true" + cancel_if_cluster_is_not_running_monitoring_agent: "true" + cancel_if_cluster_status_creating: "true" + cancel_if_cluster_status_deleting: "true" + ## This alert pages if one of the prometheus-agent shard is not running. + - alert: PrometheusAgentShardsMissing + annotations: + description: '{{`Prometheus agent is missing shards.`}}' + summary: Prometheus agent is missing shards. 
+ opsrecipe: prometheus-agent/ + expr: |- + max_over_time(sum by (cluster_id, installation, provider, pipeline)( + count( + ## number of remotes that are not mimir or grafana-cloud + prometheus_remote_storage_metadata_total{remote_name!~"grafana-cloud|mimir.*"} + ) by (cluster_id, installation, provider, pipeline) + != + sum( + ## number of shards defined in the Prometheus CR + prometheus_operator_spec_shards{controller="prometheus",name="prometheus-agent"} + # if there is only 1 shard, there is no shard metric so we use the replicas metric + or prometheus_operator_spec_replicas{controller="prometheus",name="prometheus-agent"} + ) by (cluster_id, installation, provider, pipeline) + )[5m:]) + for: 40m + labels: + area: platform + severity: page + team: atlas + topic: observability + inhibit_monitoring_agent_down: "true" + cancel_if_cluster_is_not_running_monitoring_agent: "true" + cancel_if_cluster_status_creating: "true" + cancel_if_cluster_status_deleting: "true" + cancel_if_outside_working_hours: "true" + ## Same as PrometheusAgentShardsMissing but triggers inhibition earlier, and does not page. + - alert: PrometheusAgentShardsMissingInhibition + annotations: + description: '{{`Prometheus agent is missing shards.`}}' + summary: Prometheus agent is missing shards. + opsrecipe: prometheus-agent/ + expr: |- + max_over_time(sum by (cluster_id, installation, provider, pipeline)( + count( + ## number of remotes that are not mimir or grafana-cloud + prometheus_remote_storage_metadata_total{remote_name!~"grafana-cloud|mimir.*"} + ) by (cluster_id, installation, provider, pipeline) + != + sum( + ## number of shards defined in the Prometheus CR + prometheus_operator_spec_shards{controller="prometheus",name="prometheus-agent"} + # if there is only 1 shard, there is no shard metric so we use the replicas metric + or prometheus_operator_spec_replicas{controller="prometheus",name="prometheus-agent"} + ) by (cluster_id, installation, provider, pipeline) + )[5m:]) + for: 2m + labels: + area: platform + severity: none + team: atlas + topic: observability + inhibit_monitoring_agent_down: "true" + cancel_if_cluster_is_not_running_monitoring_agent: "true" + cancel_if_cluster_status_creating: "true" + cancel_if_cluster_status_deleting: "true" + cancel_if_outside_working_hours: "true" diff --git a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/prometheus.rules.yml b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/prometheus.rules.yml index 054d4980b..e5f68c642 100644 --- a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/prometheus.rules.yml +++ b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/prometheus.rules.yml @@ -27,23 +27,6 @@ spec: severity: page team: atlas topic: observability - ## Pages Atlas when prometheus fails to send samples to cortex - - alert: PrometheusMissingGrafanaCloud - annotations: - description: 'Prometheus is not sending data to Grafana Cloud.' 
-        opsrecipe: prometheus-grafanacloud/
-      {{- if .Values.mimir.enabled }}
-      expr: absent(prometheus_remote_storage_samples_total{remote_name="grafana-cloud", cluster_type="management_cluster", cluster_id="{{ .Values.managementCluster.name }}", installation="{{ .Values.managementCluster.name }}", provider="{{ .Values.managementCluster.provider.kind }}", pipeline="{{ .Values.managementCluster.pipeline }}"})
-      {{- else }}
-      expr: absent(prometheus_remote_storage_samples_total{remote_name="grafana-cloud"})
-      {{- end }}
-      for: 1h
-      labels:
-        area: platform
-        cancel_if_outside_working_hours: "true"
-        severity: page
-        team: atlas
-        topic: observability
     - alert: PrometheusFailsToCommunicateWithRemoteStorageAPI
       annotations:
         description: '{{`Prometheus can''t communicate with Remote Storage API at {{ $labels.url }}.`}}'
diff --git a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/promtail.rules.yml b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/promtail.rules.yml
index f48d135ab..422a9c9b1 100644
--- a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/promtail.rules.yml
+++ b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/promtail.rules.yml
@@ -9,16 +9,17 @@ spec:
   groups:
   - name: promtail
     rules:
+    # This alert lists the existing promtail pods (to extract the node label and inhibit if the node is not ready)
+    # and joins the pods with the non-running containers
     - alert: PromtailDown
       annotations:
         description: '{{`Scraping of all promtail pods to check if one failed every 30 minutes.`}}'
         opsrecipe: promtail/
       expr: |-
-        # List promtail pods to be able to get the node label and join with the node status to not alert if the node is not ready
         kube_pod_info{pod=~"promtail.*"}
         * on(cluster_id, pod)
         group_left ()
-        up{container="promtail"} == 0 # List promtail containers that are not running
+        up{container="promtail"} == 0
       for: 30m
       labels:
         area: platform

From 9c2f6553ed0e62727b29267520a4062db3e7d194 Mon Sep 17 00:00:00 2001
From: QuentinBisson
Date: Tue, 29 Oct 2024 17:19:58 +0100
Subject: [PATCH 02/24] wip - add ongoing alerts

---
 CHANGELOG.md                                  | 16 +++
 .../atlas/alerting-rules/alloy.rules.yml      | 10 +--
 .../atlas/alerting-rules/monitoring.rules.yml | 66 +------------------
 .../alerting-rules/prometheus-agent.rules.yml |  4 +-
 ...luster.rules.yml => statefulset.rules.yml} |  9 +--
 .../atlas/alerting-rules/storage.rules.yml    |  2 +-
 .../prometheus-agent.rules.test.yml           | 16 ++---
 .../prometheus-agent.rules.test.yml           | 16 ++---
 .../prometheus-agent.rules.test.yml           | 16 ++---
 .../prometheus-agent.rules.test.yml           | 16 ++---
 10 files changed, 62 insertions(+), 109 deletions(-)
 rename helm/prometheus-rules/templates/platform/atlas/alerting-rules/{statefulset.management-cluster.rules.yml => statefulset.rules.yml} (81%)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 06a4153c3..4d96737f3 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -7,6 +7,22 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## [Unreleased]
 
+### Added
+
+- Add a set of sensible alerts to monitor alloy.
+  - `AlloySlowComponentEvaluations` and `AlloyUnhealthyComponents` to report about alloy component state.
+  - `LoggingAgentDown` to be alerted when the logging agent is down.
+  - `LogForwardingErrors` to be alerted when the `loki.write` component is failing.
+  - `LogReceivingErrors` to be alerted when the `loki.source.api` component of the gateway is failing.
+  - `MonitoringAgentFailing` and `InhibitionMonitoringAgentFailing` to be alerted when the monitoring agent is not able to send metrics.
+
+### Changed
+
+- Update `DeploymentNotSatisfiedAtlas` to take into account the following components:
+  - `observability-operator`
+  - `alloy-rules`
+  - `observability-gateway`
+
 ## [4.22.0] - 2024-10-29
 
 ### Changed
diff --git a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/alloy.rules.yml b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/alloy.rules.yml
index 148168239..fae2026b7 100644
--- a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/alloy.rules.yml
+++ b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/alloy.rules.yml
@@ -17,10 +17,11 @@ spec:
   # We added the alert labels and added the missing labels from the aggregations.
   - name: alloy.controller
     rules:
-    - alert: SlowComponentEvaluations
+    - alert: AlloySlowComponentEvaluations
       annotations:
-        description: Component evaluations are taking too long under job {{ $labels.job }}, component_path {{ $labels.component_path }}, component_id {{ $labels.component_id }}.
+        description: '{{`Component evaluations are taking too long under job {{ $labels.job }}, component_path {{ $labels.component_path }}, component_id {{ $labels.component_id }}.`}}'
         summary: Component evaluations are taking too long.
+        opsrecipe: alloy-components/
       expr: sum by (cluster_id, installation, provider, pipeline, namespace, job, component_path, component_id) (rate(alloy_component_evaluation_slow_seconds[10m])) > 0
       for: 15m
       labels:
@@ -32,10 +33,11 @@ spec:
       cancel_if_cluster_status_creating: "true"
       cancel_if_cluster_status_deleting: "true"
       cancel_if_cluster_status_updating: "true"
-    - alert: UnhealthyComponents
+    - alert: AlloyUnhealthyComponents
       annotations:
-        description: Unhealthy components detected under job {{ $labels.job }}
+        description: '{{`Unhealthy components detected under job {{ $labels.job }}`}}'
         summary: Unhealthy components detected.
+        opsrecipe: alloy-components/
       expr: sum by (cluster_id, installation, provider, pipeline, namespace, job) (alloy_component_controller_running_components{health_type!="healthy"}) > 0
       for: 15m
       labels:
diff --git a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/monitoring.rules.yml b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/monitoring.rules.yml
index 745b86f7e..8ba7a3a51 100644
--- a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/monitoring.rules.yml
+++ b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/monitoring.rules.yml
@@ -10,7 +10,7 @@ spec:
   ## TODO(quentin) add tests for the monitoring agent alerts
   ## TODO(quentin) add opsrecipe for the monitoring agent alerts
   ## TODO(quentin) add dashboard annotation for the monitoring agent alerts
-  ## TODO(quentin) replace MonitoringAgentShardsMissing for alloy-metrics
+  ## TODO(quentin) replace PrometheusAgentShardsMissing for alloy-metrics
   ## TODO(quentin) add component specific errors to replace the ones in the prometheus.rules.yml
   - name: monitoring-agent
     rules:
@@ -44,7 +44,7 @@ spec:
       cancel_if_cluster_status_deleting: "true"
       cancel_if_cluster_has_no_workers: "true"
     ## Same as MonitoringAgentFailing, but triggers inhibition earlier and does not page.
-    - alert: MonitoringAgentFailingInhibition
+    - alert: InhibitionMonitoringAgentFailing
       annotations:
         description: '{{`Monitoring agent fails to send its data via remote write.`}}'
         summary: Monitoring agent fails to send samples to its configured remote write endpoint.
@@ -71,66 +71,4 @@ spec: cancel_if_cluster_is_not_running_monitoring_agent: "true" cancel_if_cluster_status_creating: "true" cancel_if_cluster_status_deleting: "true" - ## This alert pages if some of the monitoring agent shards are not running. - - alert: MonitoringAgentShardsMissing - annotations: - description: '{{`At least one of the monitoring agent shard is missing.`}}' - summary: Monitoring agent is missing some shards. - opsrecipe: monitoring-agent/ - expr: |- - max_over_time(sum by (cluster_id, installation, provider, pipeline)( - count( - ## number of remotes that are not mimir or grafana-cloud - prometheus_remote_storage_metadata_total{remote_name!~"grafana-cloud|mimir.*"} - ) by (cluster_id, installation, provider, pipeline) - != - sum( - ## number of shards defined in the Prometheus CR - prometheus_operator_spec_shards{controller="prometheus",name="prometheus-agent"} - # if there is only 1 shard, there is no shard metric so we use the replicas metric - or prometheus_operator_spec_replicas{controller="prometheus",name="prometheus-agent"} - ) by (cluster_id, installation, provider, pipeline) - )[5m:]) - for: 40m - labels: - area: platform - severity: page - team: atlas - topic: observability - inhibit_monitoring_agent_down: "true" - cancel_if_cluster_is_not_running_monitoring_agent: "true" - cancel_if_cluster_status_creating: "true" - cancel_if_cluster_status_deleting: "true" - cancel_if_outside_working_hours: "true" - ## Same as MonitoringAgentShardsMissing but triggers inhibition earlier, and does not page. - - alert: MonitoringAgentShardsMissingInhibition - annotations: - description: '{{`At least one of the monitoring agent shard is missing.`}}' - summary: Monitoring agent is missing some shards. - opsrecipe: monitoring-agent/ - expr: |- - max_over_time(sum by (cluster_id, installation, provider, pipeline)( - count( - ## number of remotes that are not mimir or grafana-cloud - prometheus_remote_storage_metadata_total{remote_name!~"grafana-cloud|mimir.*"} - ) by (cluster_id, installation, provider, pipeline) - != - sum( - ## number of shards defined in the Prometheus CR - prometheus_operator_spec_shards{controller="prometheus",name="prometheus-agent"} - # if there is only 1 shard, there is no shard metric so we use the replicas metric - or prometheus_operator_spec_replicas{controller="prometheus",name="prometheus-agent"} - ) by (cluster_id, installation, provider, pipeline) - )[5m:]) - for: 2m - labels: - area: platform - severity: none - team: atlas - topic: observability - inhibit_monitoring_agent_down: "true" - cancel_if_cluster_is_not_running_monitoring_agent: "true" - cancel_if_cluster_status_creating: "true" - cancel_if_cluster_status_deleting: "true" - cancel_if_outside_working_hours: "true" diff --git a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/prometheus-agent.rules.yml b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/prometheus-agent.rules.yml index 81163c13e..b1813bee9 100644 --- a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/prometheus-agent.rules.yml +++ b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/prometheus-agent.rules.yml @@ -52,7 +52,7 @@ spec: cancel_if_cluster_status_deleting: "true" cancel_if_cluster_has_no_workers: "true" ## Same as PrometheusAgentFailing, but triggers inhibition earlier and does not page. 
- - alert: PrometheusAgentFailingInhibition + - alert: InhibitionPrometheusAgentFailing annotations: description: '{{`Prometheus agent remote write is failing.`}}' summary: Prometheus agent fails to send samples to remote write endpoint. @@ -124,7 +124,7 @@ spec: cancel_if_cluster_status_deleting: "true" cancel_if_outside_working_hours: "true" ## Same as PrometheusAgentShardsMissing but triggers inhibition earlier, and does not page. - - alert: PrometheusAgentShardsMissingInhibition + - alert: InhibitionPrometheusAgentShardsMissing annotations: description: '{{`Prometheus agent is missing shards.`}}' summary: Prometheus agent is missing shards. diff --git a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/statefulset.management-cluster.rules.yml b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/statefulset.rules.yml similarity index 81% rename from helm/prometheus-rules/templates/platform/atlas/alerting-rules/statefulset.management-cluster.rules.yml rename to helm/prometheus-rules/templates/platform/atlas/alerting-rules/statefulset.rules.yml index 473be3186..ea72b199d 100644 --- a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/statefulset.management-cluster.rules.yml +++ b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/statefulset.rules.yml @@ -4,10 +4,7 @@ metadata: creationTimestamp: null labels: {{- include "labels.common" . | nindent 4 }} -{{- if not .Values.mimir.enabled }} - cluster_type: "management_cluster" -{{- end }} - name: deployment.management-cluster.rules + name: statefulset.rules namespace: {{ .Values.namespace }} spec: groups: @@ -18,8 +15,8 @@ spec: description: '{{`Statefulset {{ $labels.namespace}}/{{ $labels.statefulset }} is not satisfied.`}}' opsrecipe: deployment-not-satisfied/ expr: |- - kube_statefulset_status_replicas{cluster_type="management_cluster", statefulset=~"loki.*|mimir.*"} - - kube_statefulset_status_replicas_ready{cluster_type="management_cluster", statefulset=~"loki.*|mimir.*"} + kube_statefulset_status_replicas{cluster_type="management_cluster", statefulset=~"loki.*|mimir.*|pyroscope.*|tempo.*"} + - kube_statefulset_status_replicas_ready{cluster_type="management_cluster", statefulset=~"loki.*|mimir.*|pyroscope.*|tempo.*"} > 0 for: 30m labels: diff --git a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/storage.rules.yml b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/storage.rules.yml index 7b0798d5d..a1c006233 100644 --- a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/storage.rules.yml +++ b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/storage.rules.yml @@ -17,7 +17,7 @@ spec: annotations: description: '{{`The free space on the Data Disk for instance: {{ $labels.instance }} and PVC: {{ $labels.persistentvolumeclaim}} was below 10 percent for longer than 1 hour (current value {{ $value | printf "%.2f" }}).`}}' opsrecipe: low-disk-space/#persistent-volume - expr: kubelet_volume_stats_available_bytes{cluster_type="management_cluster", persistentvolumeclaim=~".*(alertmanager|loki|mimir|prometheus|pyroscope|tempo).*"}/kubelet_volume_stats_capacity_bytes{persistentvolumeclaim=~".*(alertmanager|loki|mimir|prometheus|pyroscope|tempo).*"} < 0.10 + expr: kubelet_volume_stats_available_bytes{cluster_type="management_cluster", persistentvolumeclaim=~".*(alertmanager|grafana|loki|mimir|prometheus|pyroscope|tempo).*"}/kubelet_volume_stats_capacity_bytes{persistentvolumeclaim=~".*(alertmanager|grafana|loki|mimir|prometheus|pyroscope|tempo).*"} < 
0.10 for: 1h labels: area: platform diff --git a/test/tests/providers/capi/capa-mimir/platform/atlas/alerting-rules/prometheus-agent.rules.test.yml b/test/tests/providers/capi/capa-mimir/platform/atlas/alerting-rules/prometheus-agent.rules.test.yml index 204fe5765..10b14e97f 100644 --- a/test/tests/providers/capi/capa-mimir/platform/atlas/alerting-rules/prometheus-agent.rules.test.yml +++ b/test/tests/providers/capi/capa-mimir/platform/atlas/alerting-rules/prometheus-agent.rules.test.yml @@ -142,7 +142,7 @@ tests: alert_rule_test: - alertname: PrometheusAgentShardsMissing eval_time: 40m - - alertname: PrometheusAgentShardsMissingInhibition + - alertname: InhibitionPrometheusAgentShardsMissing eval_time: 40m - alertname: PrometheusAgentShardsMissing eval_time: 120m @@ -165,7 +165,7 @@ tests: description: "Prometheus agent is missing shards." opsrecipe: "prometheus-agent/" summary: "Prometheus agent is missing shards." - - alertname: PrometheusAgentShardsMissingInhibition + - alertname: InhibitionPrometheusAgentShardsMissing eval_time: 100m exp_alerts: - exp_labels: @@ -207,7 +207,7 @@ tests: description: "Prometheus agent is missing shards." opsrecipe: "prometheus-agent/" summary: "Prometheus agent is missing shards." - - alertname: PrometheusAgentShardsMissingInhibition + - alertname: InhibitionPrometheusAgentShardsMissing eval_time: 125m exp_alerts: - exp_labels: @@ -230,7 +230,7 @@ tests: summary: "Prometheus agent is missing shards." - alertname: PrometheusAgentShardsMissing eval_time: 130m - - alertname: PrometheusAgentShardsMissingInhibition + - alertname: InhibitionPrometheusAgentShardsMissing eval_time: 130m # Tests for `PrometheusAgentShardsMissing` alert with missing `prometheus_operator_spec_shards` metric - interval: 1m @@ -246,7 +246,7 @@ tests: alert_rule_test: - alertname: PrometheusAgentShardsMissing eval_time: 40m - - alertname: PrometheusAgentShardsMissingInhibition + - alertname: InhibitionPrometheusAgentShardsMissing eval_time: 40m - alertname: PrometheusAgentShardsMissing eval_time: 120m @@ -269,7 +269,7 @@ tests: description: "Prometheus agent is missing shards." opsrecipe: "prometheus-agent/" summary: "Prometheus agent is missing shards." - - alertname: PrometheusAgentShardsMissingInhibition + - alertname: InhibitionPrometheusAgentShardsMissing eval_time: 100m exp_alerts: - exp_labels: @@ -311,7 +311,7 @@ tests: description: "Prometheus agent is missing shards." opsrecipe: "prometheus-agent/" summary: "Prometheus agent is missing shards." - - alertname: PrometheusAgentShardsMissingInhibition + - alertname: InhibitionPrometheusAgentShardsMissing eval_time: 125m exp_alerts: - exp_labels: @@ -334,5 +334,5 @@ tests: summary: "Prometheus agent is missing shards." 
- alertname: PrometheusAgentShardsMissing eval_time: 130m - - alertname: PrometheusAgentShardsMissingInhibition + - alertname: InhibitionPrometheusAgentShardsMissing eval_time: 130m diff --git a/test/tests/providers/capi/capa/platform/atlas/alerting-rules/prometheus-agent.rules.test.yml b/test/tests/providers/capi/capa/platform/atlas/alerting-rules/prometheus-agent.rules.test.yml index 79e4a1fc7..b5d92ecc3 100644 --- a/test/tests/providers/capi/capa/platform/atlas/alerting-rules/prometheus-agent.rules.test.yml +++ b/test/tests/providers/capi/capa/platform/atlas/alerting-rules/prometheus-agent.rules.test.yml @@ -112,7 +112,7 @@ tests: alert_rule_test: - alertname: PrometheusAgentShardsMissing eval_time: 40m - - alertname: PrometheusAgentShardsMissingInhibition + - alertname: InhibitionPrometheusAgentShardsMissing eval_time: 40m - alertname: PrometheusAgentShardsMissing eval_time: 120m @@ -135,7 +135,7 @@ tests: description: "Prometheus agent is missing shards." opsrecipe: "prometheus-agent/" summary: "Prometheus agent is missing shards." - - alertname: PrometheusAgentShardsMissingInhibition + - alertname: InhibitionPrometheusAgentShardsMissing eval_time: 100m exp_alerts: - exp_labels: @@ -177,7 +177,7 @@ tests: description: "Prometheus agent is missing shards." opsrecipe: "prometheus-agent/" summary: "Prometheus agent is missing shards." - - alertname: PrometheusAgentShardsMissingInhibition + - alertname: InhibitionPrometheusAgentShardsMissing eval_time: 125m exp_alerts: - exp_labels: @@ -200,7 +200,7 @@ tests: summary: "Prometheus agent is missing shards." - alertname: PrometheusAgentShardsMissing eval_time: 130m - - alertname: PrometheusAgentShardsMissingInhibition + - alertname: InhibitionPrometheusAgentShardsMissing eval_time: 130m # Tests for `PrometheusAgentShardsMissing` alert with missing `prometheus_operator_spec_shards` metric - interval: 1m @@ -216,7 +216,7 @@ tests: alert_rule_test: - alertname: PrometheusAgentShardsMissing eval_time: 40m - - alertname: PrometheusAgentShardsMissingInhibition + - alertname: InhibitionPrometheusAgentShardsMissing eval_time: 40m - alertname: PrometheusAgentShardsMissing eval_time: 120m @@ -239,7 +239,7 @@ tests: description: "Prometheus agent is missing shards." opsrecipe: "prometheus-agent/" summary: "Prometheus agent is missing shards." - - alertname: PrometheusAgentShardsMissingInhibition + - alertname: InhibitionPrometheusAgentShardsMissing eval_time: 100m exp_alerts: - exp_labels: @@ -281,7 +281,7 @@ tests: description: "Prometheus agent is missing shards." opsrecipe: "prometheus-agent/" summary: "Prometheus agent is missing shards." - - alertname: PrometheusAgentShardsMissingInhibition + - alertname: InhibitionPrometheusAgentShardsMissing eval_time: 125m exp_alerts: - exp_labels: @@ -304,5 +304,5 @@ tests: summary: "Prometheus agent is missing shards." 
- alertname: PrometheusAgentShardsMissing eval_time: 130m - - alertname: PrometheusAgentShardsMissingInhibition + - alertname: InhibitionPrometheusAgentShardsMissing eval_time: 130m diff --git a/test/tests/providers/capi/capz/platform/atlas/alerting-rules/prometheus-agent.rules.test.yml b/test/tests/providers/capi/capz/platform/atlas/alerting-rules/prometheus-agent.rules.test.yml index a2e3ed4bc..7497af50e 100644 --- a/test/tests/providers/capi/capz/platform/atlas/alerting-rules/prometheus-agent.rules.test.yml +++ b/test/tests/providers/capi/capz/platform/atlas/alerting-rules/prometheus-agent.rules.test.yml @@ -112,7 +112,7 @@ tests: alert_rule_test: - alertname: PrometheusAgentShardsMissing eval_time: 40m - - alertname: PrometheusAgentShardsMissingInhibition + - alertname: InhibitionPrometheusAgentShardsMissing eval_time: 40m - alertname: PrometheusAgentShardsMissing eval_time: 120m @@ -135,7 +135,7 @@ tests: description: "Prometheus agent is missing shards." opsrecipe: "prometheus-agent/" summary: "Prometheus agent is missing shards." - - alertname: PrometheusAgentShardsMissingInhibition + - alertname: InhibitionPrometheusAgentShardsMissing eval_time: 100m exp_alerts: - exp_labels: @@ -177,7 +177,7 @@ tests: description: "Prometheus agent is missing shards." opsrecipe: "prometheus-agent/" summary: "Prometheus agent is missing shards." - - alertname: PrometheusAgentShardsMissingInhibition + - alertname: InhibitionPrometheusAgentShardsMissing eval_time: 125m exp_alerts: - exp_labels: @@ -200,7 +200,7 @@ tests: summary: "Prometheus agent is missing shards." - alertname: PrometheusAgentShardsMissing eval_time: 130m - - alertname: PrometheusAgentShardsMissingInhibition + - alertname: InhibitionPrometheusAgentShardsMissing eval_time: 130m # Tests for `PrometheusAgentShardsMissing` alert with missing `prometheus_operator_spec_shards` metric - interval: 1m @@ -216,7 +216,7 @@ tests: alert_rule_test: - alertname: PrometheusAgentShardsMissing eval_time: 40m - - alertname: PrometheusAgentShardsMissingInhibition + - alertname: InhibitionPrometheusAgentShardsMissing eval_time: 40m - alertname: PrometheusAgentShardsMissing eval_time: 120m @@ -239,7 +239,7 @@ tests: description: "Prometheus agent is missing shards." opsrecipe: "prometheus-agent/" summary: "Prometheus agent is missing shards." - - alertname: PrometheusAgentShardsMissingInhibition + - alertname: InhibitionPrometheusAgentShardsMissing eval_time: 100m exp_alerts: - exp_labels: @@ -281,7 +281,7 @@ tests: description: "Prometheus agent is missing shards." opsrecipe: "prometheus-agent/" summary: "Prometheus agent is missing shards." - - alertname: PrometheusAgentShardsMissingInhibition + - alertname: InhibitionPrometheusAgentShardsMissing eval_time: 125m exp_alerts: - exp_labels: @@ -304,5 +304,5 @@ tests: summary: "Prometheus agent is missing shards." 
- alertname: PrometheusAgentShardsMissing eval_time: 130m - - alertname: PrometheusAgentShardsMissingInhibition + - alertname: InhibitionPrometheusAgentShardsMissing eval_time: 130m diff --git a/test/tests/providers/vintage/aws/platform/atlas/alerting-rules/prometheus-agent.rules.test.yml b/test/tests/providers/vintage/aws/platform/atlas/alerting-rules/prometheus-agent.rules.test.yml index a2e3ed4bc..7497af50e 100644 --- a/test/tests/providers/vintage/aws/platform/atlas/alerting-rules/prometheus-agent.rules.test.yml +++ b/test/tests/providers/vintage/aws/platform/atlas/alerting-rules/prometheus-agent.rules.test.yml @@ -112,7 +112,7 @@ tests: alert_rule_test: - alertname: PrometheusAgentShardsMissing eval_time: 40m - - alertname: PrometheusAgentShardsMissingInhibition + - alertname: InhibitionPrometheusAgentShardsMissing eval_time: 40m - alertname: PrometheusAgentShardsMissing eval_time: 120m @@ -135,7 +135,7 @@ tests: description: "Prometheus agent is missing shards." opsrecipe: "prometheus-agent/" summary: "Prometheus agent is missing shards." - - alertname: PrometheusAgentShardsMissingInhibition + - alertname: InhibitionPrometheusAgentShardsMissing eval_time: 100m exp_alerts: - exp_labels: @@ -177,7 +177,7 @@ tests: description: "Prometheus agent is missing shards." opsrecipe: "prometheus-agent/" summary: "Prometheus agent is missing shards." - - alertname: PrometheusAgentShardsMissingInhibition + - alertname: InhibitionPrometheusAgentShardsMissing eval_time: 125m exp_alerts: - exp_labels: @@ -200,7 +200,7 @@ tests: summary: "Prometheus agent is missing shards." - alertname: PrometheusAgentShardsMissing eval_time: 130m - - alertname: PrometheusAgentShardsMissingInhibition + - alertname: InhibitionPrometheusAgentShardsMissing eval_time: 130m # Tests for `PrometheusAgentShardsMissing` alert with missing `prometheus_operator_spec_shards` metric - interval: 1m @@ -216,7 +216,7 @@ tests: alert_rule_test: - alertname: PrometheusAgentShardsMissing eval_time: 40m - - alertname: PrometheusAgentShardsMissingInhibition + - alertname: InhibitionPrometheusAgentShardsMissing eval_time: 40m - alertname: PrometheusAgentShardsMissing eval_time: 120m @@ -239,7 +239,7 @@ tests: description: "Prometheus agent is missing shards." opsrecipe: "prometheus-agent/" summary: "Prometheus agent is missing shards." - - alertname: PrometheusAgentShardsMissingInhibition + - alertname: InhibitionPrometheusAgentShardsMissing eval_time: 100m exp_alerts: - exp_labels: @@ -281,7 +281,7 @@ tests: description: "Prometheus agent is missing shards." opsrecipe: "prometheus-agent/" summary: "Prometheus agent is missing shards." - - alertname: PrometheusAgentShardsMissingInhibition + - alertname: InhibitionPrometheusAgentShardsMissing eval_time: 125m exp_alerts: - exp_labels: @@ -304,5 +304,5 @@ tests: summary: "Prometheus agent is missing shards." 
- alertname: PrometheusAgentShardsMissing eval_time: 130m - - alertname: PrometheusAgentShardsMissingInhibition + - alertname: InhibitionPrometheusAgentShardsMissing eval_time: 130m From 55078dd91090b4a5824bfb1c4eb2602fdf505660 Mon Sep 17 00:00:00 2001 From: QuentinBisson Date: Wed, 30 Oct 2024 12:15:43 +0100 Subject: [PATCH 03/24] add dashboard annotation --- .../turtles/alerting-rules/systemd.rules.yml | 2 +- .../atlas/alerting-rules/alloy.rules.yml | 7 +- .../deployment.management-cluster.rules.yml | 2 +- .../deployment.workload-cluster.rules.yml | 2 +- .../atlas/alerting-rules/logging.rules.yaml | 9 +- .../atlas/alerting-rules/monitoring.rules.yml | 74 ----- .../alerting-rules/prometheus-agent.rules.yml | 290 +++++++++--------- .../prometheus-operator.rules.yml | 2 +- .../alerting-rules/chart.rules.yml | 2 +- .../honeybadger/alerting-rules/helm.rules.yml | 2 +- .../recording-rules/helm-operations.rules.yml | 2 +- .../alerting-rules/logging.rules.test.yml | 229 ++++++++++++++ .../helm-operations.rules.test.yml | 2 +- 13 files changed, 391 insertions(+), 234 deletions(-) delete mode 100644 helm/prometheus-rules/templates/platform/atlas/alerting-rules/monitoring.rules.yml create mode 100644 test/tests/providers/global/platform/atlas/alerting-rules/logging.rules.test.yml diff --git a/helm/prometheus-rules/templates/kaas/turtles/alerting-rules/systemd.rules.yml b/helm/prometheus-rules/templates/kaas/turtles/alerting-rules/systemd.rules.yml index 370c1a1f6..a58297b73 100644 --- a/helm/prometheus-rules/templates/kaas/turtles/alerting-rules/systemd.rules.yml +++ b/helm/prometheus-rules/templates/kaas/turtles/alerting-rules/systemd.rules.yml @@ -10,7 +10,7 @@ spec: groups: - name: systemd rules: - ## TODO(@giantswarm/team-turtles) Update those lists when all vintage clusters are gone + ## TODO(@giantswarm/team-tenet) Update those lists when all vintage clusters are gone - alert: ClusterCriticalSystemdUnitFailed annotations: description: '{{`Critical systemd unit {{ $labels.name }} is failed on {{ $labels.instance }}.`}}' diff --git a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/alloy.rules.yml b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/alloy.rules.yml index fae2026b7..edf5e61ce 100644 --- a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/alloy.rules.yml +++ b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/alloy.rules.yml @@ -11,7 +11,6 @@ spec: groups: ## TODO(quentin) add tests for the alerts ## TODO(quentin) add opsrecipe for the alerts - ## TODO(quentin) add dashboard annotation for the alerts # List of alerts for on the state of the alloy components. # Alerts are coming from https://github.com/grafana/alloy/blob/ed52746567d2469a6a97a592ac5aec807646b327/operations/alloy-mixin/alerts/controller.libsonnet # We added the alert labels and added the missing labels from the aggregations. @@ -19,9 +18,10 @@ spec: rules: - alert: AlloySlowComponentEvaluations annotations: + dashboard: bf9f456aad7108b2c808dbd9973e386f/alloy-controller description: '{{`Component evaluations are taking too long under job {{ $labels.job }}, component_path {{ $labels.component_path }}, component_id {{ $labels.component_id }}.`}}' - summary: Component evaluations are taking too long. opsrecipe: alloy-components/ + summary: Component evaluations are taking too long. 
expr: sum by (cluster_id, installation, provider, pipeline, namespace, job, component_path, component_id) (rate(alloy_component_evaluation_slow_seconds[10m])) > 0 for: 15m labels: @@ -35,9 +35,10 @@ spec: cancel_if_cluster_status_updating: "true" - alert: AlloyUnhealthyComponents annotations: + dashboard: bf9f456aad7108b2c808dbd9973e386f/alloy-controller description: '{{`Unhealthy components detected under job {{ $labels.job }}`}}' - summary: Unhealthy components detected. opsrecipe: alloy-components/ + summary: Unhealthy components detected. expr: sum by (cluster_id, installation, provider, pipeline, namespace, job) (alloy_component_controller_running_components{health_type!="healthy"}) > 0 for: 15m labels: diff --git a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/deployment.management-cluster.rules.yml b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/deployment.management-cluster.rules.yml index 54a070368..1f98fe451 100644 --- a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/deployment.management-cluster.rules.yml +++ b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/deployment.management-cluster.rules.yml @@ -95,7 +95,7 @@ spec: team: phoenix topic: managementcluster {{- if eq .Values.managementCluster.provider.flavor "vintage" }} - ## TODO Remove when all vintage clusters are gone + ## TODO(@giantswarm/team-atlas) Remove when all vintage clusters are gone - alert: AWSManagementClusterDeploymentScaledDownToZero annotations: description: '{{`Deployment {{ $labels.namespace}}/{{ $labels.deployment }} on AWS has been scaled down to zero for prolonged period of time.`}}' diff --git a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/deployment.workload-cluster.rules.yml b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/deployment.workload-cluster.rules.yml index 599682b91..3e26744c4 100644 --- a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/deployment.workload-cluster.rules.yml +++ b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/deployment.workload-cluster.rules.yml @@ -13,7 +13,7 @@ spec: groups: - name: deployment rules: - # TODO - This is only used by the chart-operator, let's get rid of it when the chart operator is gone. + # TODO(@giantswarm/team-honeybadger) - This is only used by the chart-operator, let's get rid of it when the chart operator is gone. 
- alert: WorkloadClusterDeploymentNotSatisfied annotations: description: '{{`Deployment {{ $labels.namespace}}/{{ $labels.deployment }} is not satisfied.`}}' diff --git a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/logging.rules.yaml b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/logging.rules.yaml index 71d96e782..5e34e77fc 100644 --- a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/logging.rules.yaml +++ b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/logging.rules.yaml @@ -7,15 +7,14 @@ metadata: namespace: {{ .Values.namespace }} spec: groups: - ## TODO(quentin) add tests for the alerts ## TODO(quentin) add opsrecipe for the alerts - ## TODO(quentin) add dashboard annotation for the alerts - name: logging-agent rules: # This alert lists the existing logging-agent pods (to extract the node label and inhibit if the node is not ready) # and join the pods with the not running containers - alert: LoggingAgentDown annotations: + dashboard: 53c1ecddc3a1d5d4b8d6cd0c23676c31/alloy-logs-overview description: '{{`Scraping of all logging-agent pods to check if one failed every 30 minutes.`}}' opsrecipe: logging-agent/ expr: |- @@ -41,8 +40,9 @@ spec: # This includes alloy-logs and the observability-gateway - alert: LogForwardingErrors annotations: + dashboard: 53c1ecddc3a1d5d4b8d6cd0c23676c31/alloy-logs-overview description: '{{`More that 10% of the requests to Loki are failing.`}}' - opsrecipe: logging-errors/ + opsrecipe: log-shipping-errors/ expr: |- ( 100 @@ -78,8 +78,9 @@ spec: # This alert pages when the loki source api component of the observability gateway is throwing errors - alert: LogReceivingErrors annotations: + dashboard: 53c1ecddc3a1d5d4b8d6cd0c23676c31/alloy-logs-overview description: '{{`More that 10% of the loki requests to the observability gateway are failing.`}}' - opsrecipe: logging-errors/ + opsrecipe: log-shipping-errors/ expr: |- ( 100 diff --git a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/monitoring.rules.yml b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/monitoring.rules.yml deleted file mode 100644 index 8ba7a3a51..000000000 --- a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/monitoring.rules.yml +++ /dev/null @@ -1,74 +0,0 @@ -apiVersion: monitoring.coreos.com/v1 -kind: PrometheusRule -metadata: - labels: - {{- include "labels.common" . | nindent 4 }} - name: monitoring.rules - namespace: {{ .Values.namespace }} -spec: - groups: - ## TODO(quentin) add tests for the monitoring agent alerts - ## TODO(quentin) add opsrecipe for the monitoring agent alerts - ## TODO(quentin) add dashboard annotation for the monitoring agent alerts - ## TODO(quentin) replace PrometheusAgentShardsMissing for alloy-metrics - ## TODO(quentin) add component specific errors to replace the ones in the prometheus.rules.yml - - name: monitoring-agent - rules: - ## This alert pages if the monitoring-agent fails to send samples to its remote write endpoint. - - alert: MonitoringAgentFailing - annotations: - description: '{{`Monitoring agent fails to send its data via remote write.`}}' - summary: Monitoring agent fails to send samples to its configured remote write endpoint. 
- opsrecipe: monitoring-agent/ - expr: |- - ( - label_replace( - capi_cluster_status_condition{type="ControlPlaneReady", status="True"}, - "cluster_id", - "$1", - "name", - "(.*)" - ) == 1 - ) unless on (cluster_id) ( - count(up{job="alloy-metrics"} > 0) by (cluster_id) - ) - for: 20m - labels: - area: platform - severity: page - team: atlas - topic: observability - inhibit_monitoring_agent_down: "true" - cancel_if_cluster_is_not_running_monitoring_agent: "true" - cancel_if_cluster_status_creating: "true" - cancel_if_cluster_status_deleting: "true" - cancel_if_cluster_has_no_workers: "true" - ## Same as MonitoringAgentFailing, but triggers inhibition earlier and does not page. - - alert: InhibitionMonitoringAgentFailing - annotations: - description: '{{`Monitoring agent fails to send its data via remote write.`}}' - summary: Monitoring agent fails to send samples to its configured remote write endpoint. - opsrecipe: monitoring-agent/ - expr: |- - ( - label_replace( - capi_cluster_status_condition{type="ControlPlaneReady", status="True"}, - "cluster_id", - "$1", - "name", - "(.*)" - ) == 1 - ) unless on (cluster_id) ( - count(up{job="prometheus-agent"} > 0) by (cluster_id) - ) - for: 2m - labels: - area: platform - severity: none - team: atlas - topic: observability - inhibit_monitoring_agent_down: "true" - cancel_if_cluster_is_not_running_monitoring_agent: "true" - cancel_if_cluster_status_creating: "true" - cancel_if_cluster_status_deleting: "true" - diff --git a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/prometheus-agent.rules.yml b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/prometheus-agent.rules.yml index b1813bee9..b0c8e2186 100644 --- a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/prometheus-agent.rules.yml +++ b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/prometheus-agent.rules.yml @@ -7,150 +7,150 @@ metadata: namespace: {{ .Values.namespace }} spec: groups: - - name: prometheus-agent - rules: - ## This alert pages if prometheus-agent fails to send samples to its remote write endpoint. - - alert: PrometheusAgentFailing - annotations: - description: '{{`Prometheus agent remote write is failing.`}}' - summary: Prometheus agent fails to send samples to remote write endpoint. - opsrecipe: prometheus-agent/ - dashboard: promRW001/prometheus-remote-write - {{- if not .Values.mimir.enabled }} - expr: |- - max_over_time( - sum by (cluster_type, cluster_id, installation, instance, service) - ( - up{instance="prometheus-agent"} == 0 - or - absent(up{instance="prometheus-agent"}) == 1 - )[5m:] - ) - {{- else }} - expr: |- + - name: prometheus-agent + rules: + ## This alert pages if prometheus-agent fails to send samples to its remote write endpoint. + - alert: PrometheusAgentFailing + annotations: + description: '{{`Prometheus agent remote write is failing.`}}' + summary: Prometheus agent fails to send samples to remote write endpoint. 
+ opsrecipe: prometheus-agent/ + dashboard: promRW001/prometheus-remote-write + {{- if not .Values.mimir.enabled }} + expr: |- + max_over_time( + sum by (cluster_type, cluster_id, installation, instance, service) ( - label_replace( - capi_cluster_status_condition{type="ControlPlaneReady", status="True"}, - "cluster_id", - "$1", - "name", - "(.*)" - ) == 1 - ) unless on (cluster_id) ( - count(up{job="prometheus-agent"} > 0) by (cluster_id) - ) - {{- end }} - for: 20m - labels: - area: platform - severity: page - team: atlas - topic: observability - inhibit_monitoring_agent_down: "true" - cancel_if_cluster_is_not_running_monitoring_agent: "true" - cancel_if_cluster_status_creating: "true" - cancel_if_cluster_status_deleting: "true" - cancel_if_cluster_has_no_workers: "true" - ## Same as PrometheusAgentFailing, but triggers inhibition earlier and does not page. - - alert: InhibitionPrometheusAgentFailing - annotations: - description: '{{`Prometheus agent remote write is failing.`}}' - summary: Prometheus agent fails to send samples to remote write endpoint. - opsrecipe: prometheus-agent/ - dashboard: promRW001/prometheus-remote-write - {{- if not .Values.mimir.enabled }} - expr: |- - max_over_time( - sum by (cluster_type, cluster_id, installation, instance, service) - ( - up{instance="prometheus-agent"} == 0 - or - absent(up{instance="prometheus-agent"}) == 1 - )[5m:] - ) - {{- else }} - expr: |- + up{instance="prometheus-agent"} == 0 + or + absent(up{instance="prometheus-agent"}) == 1 + )[5m:] + ) + {{- else }} + expr: |- + ( + label_replace( + capi_cluster_status_condition{type="ControlPlaneReady", status="True"}, + "cluster_id", + "$1", + "name", + "(.*)" + ) == 1 + ) unless on (cluster_id) ( + count(up{job="prometheus-agent"} > 0) by (cluster_id) + ) + {{- end }} + for: 20m + labels: + area: platform + severity: page + team: atlas + topic: observability + inhibit_monitoring_agent_down: "true" + cancel_if_cluster_is_not_running_monitoring_agent: "true" + cancel_if_cluster_status_creating: "true" + cancel_if_cluster_status_deleting: "true" + cancel_if_cluster_has_no_workers: "true" + ## Same as PrometheusAgentFailing, but triggers inhibition earlier and does not page. + - alert: InhibitionPrometheusAgentFailing + annotations: + description: '{{`Prometheus agent remote write is failing.`}}' + summary: Prometheus agent fails to send samples to remote write endpoint. + opsrecipe: prometheus-agent/ + dashboard: promRW001/prometheus-remote-write + {{- if not .Values.mimir.enabled }} + expr: |- + max_over_time( + sum by (cluster_type, cluster_id, installation, instance, service) ( - label_replace( - capi_cluster_status_condition{type="ControlPlaneReady", status="True"}, - "cluster_id", - "$1", - "name", - "(.*)" - ) == 1 - ) unless on (cluster_id) ( - count(up{job="prometheus-agent"} > 0) by (cluster_id) - ) - {{- end }} - for: 2m - labels: - area: platform - severity: none - team: atlas - topic: observability - inhibit_monitoring_agent_down: "true" - cancel_if_cluster_is_not_running_monitoring_agent: "true" - cancel_if_cluster_status_creating: "true" - cancel_if_cluster_status_deleting: "true" - ## This alert pages if one of the prometheus-agent shard is not running. - - alert: PrometheusAgentShardsMissing - annotations: - description: '{{`Prometheus agent is missing shards.`}}' - summary: Prometheus agent is missing shards. 
- opsrecipe: prometheus-agent/ - expr: |- - max_over_time(sum by (cluster_id, installation, provider, pipeline)( - count( - ## number of remotes that are not mimir or grafana-cloud - prometheus_remote_storage_metadata_total{remote_name!~"grafana-cloud|mimir.*"} - ) by (cluster_id, installation, provider, pipeline) - != - sum( - ## number of shards defined in the Prometheus CR - prometheus_operator_spec_shards{controller="prometheus",name="prometheus-agent"} - # if there is only 1 shard, there is no shard metric so we use the replicas metric - or prometheus_operator_spec_replicas{controller="prometheus",name="prometheus-agent"} - ) by (cluster_id, installation, provider, pipeline) - )[5m:]) - for: 40m - labels: - area: platform - severity: page - team: atlas - topic: observability - inhibit_monitoring_agent_down: "true" - cancel_if_cluster_is_not_running_monitoring_agent: "true" - cancel_if_cluster_status_creating: "true" - cancel_if_cluster_status_deleting: "true" - cancel_if_outside_working_hours: "true" - ## Same as PrometheusAgentShardsMissing but triggers inhibition earlier, and does not page. - - alert: InhibitionPrometheusAgentShardsMissing - annotations: - description: '{{`Prometheus agent is missing shards.`}}' - summary: Prometheus agent is missing shards. - opsrecipe: prometheus-agent/ - expr: |- - max_over_time(sum by (cluster_id, installation, provider, pipeline)( - count( - ## number of remotes that are not mimir or grafana-cloud - prometheus_remote_storage_metadata_total{remote_name!~"grafana-cloud|mimir.*"} - ) by (cluster_id, installation, provider, pipeline) - != - sum( - ## number of shards defined in the Prometheus CR - prometheus_operator_spec_shards{controller="prometheus",name="prometheus-agent"} - # if there is only 1 shard, there is no shard metric so we use the replicas metric - or prometheus_operator_spec_replicas{controller="prometheus",name="prometheus-agent"} - ) by (cluster_id, installation, provider, pipeline) - )[5m:]) - for: 2m - labels: - area: platform - severity: none - team: atlas - topic: observability - inhibit_monitoring_agent_down: "true" - cancel_if_cluster_is_not_running_monitoring_agent: "true" - cancel_if_cluster_status_creating: "true" - cancel_if_cluster_status_deleting: "true" - cancel_if_outside_working_hours: "true" + up{instance="prometheus-agent"} == 0 + or + absent(up{instance="prometheus-agent"}) == 1 + )[5m:] + ) + {{- else }} + expr: |- + ( + label_replace( + capi_cluster_status_condition{type="ControlPlaneReady", status="True"}, + "cluster_id", + "$1", + "name", + "(.*)" + ) == 1 + ) unless on (cluster_id) ( + count(up{job="prometheus-agent"} > 0) by (cluster_id) + ) + {{- end }} + for: 2m + labels: + area: platform + severity: none + team: atlas + topic: observability + inhibit_monitoring_agent_down: "true" + cancel_if_cluster_is_not_running_monitoring_agent: "true" + cancel_if_cluster_status_creating: "true" + cancel_if_cluster_status_deleting: "true" + ## This alert pages if one of the prometheus-agent shard is not running. + - alert: PrometheusAgentShardsMissing + annotations: + description: '{{`Prometheus agent is missing shards.`}}' + summary: Prometheus agent is missing shards. 
+ opsrecipe: prometheus-agent/ + expr: |- + max_over_time(sum by (cluster_id, installation, provider, pipeline)( + count( + ## number of remotes that are not mimir or grafana-cloud + prometheus_remote_storage_metadata_total{remote_name!~"grafana-cloud|mimir.*"} + ) by (cluster_id, installation, provider, pipeline) + != + sum( + ## number of shards defined in the Prometheus CR + prometheus_operator_spec_shards{controller="prometheus",name="prometheus-agent"} + # if there is only 1 shard, there is no shard metric so we use the replicas metric + or prometheus_operator_spec_replicas{controller="prometheus",name="prometheus-agent"} + ) by (cluster_id, installation, provider, pipeline) + )[5m:]) + for: 40m + labels: + area: platform + severity: page + team: atlas + topic: observability + inhibit_monitoring_agent_down: "true" + cancel_if_cluster_is_not_running_monitoring_agent: "true" + cancel_if_cluster_status_creating: "true" + cancel_if_cluster_status_deleting: "true" + cancel_if_outside_working_hours: "true" + ## Same as PrometheusAgentShardsMissing but triggers inhibition earlier, and does not page. + - alert: InhibitionPrometheusAgentShardsMissing + annotations: + description: '{{`Prometheus agent is missing shards.`}}' + summary: Prometheus agent is missing shards. + opsrecipe: prometheus-agent/ + expr: |- + max_over_time(sum by (cluster_id, installation, provider, pipeline)( + count( + ## number of remotes that are not mimir or grafana-cloud + prometheus_remote_storage_metadata_total{remote_name!~"grafana-cloud|mimir.*"} + ) by (cluster_id, installation, provider, pipeline) + != + sum( + ## number of shards defined in the Prometheus CR + prometheus_operator_spec_shards{controller="prometheus",name="prometheus-agent"} + # if there is only 1 shard, there is no shard metric so we use the replicas metric + or prometheus_operator_spec_replicas{controller="prometheus",name="prometheus-agent"} + ) by (cluster_id, installation, provider, pipeline) + )[5m:]) + for: 2m + labels: + area: platform + severity: none + team: atlas + topic: observability + inhibit_monitoring_agent_down: "true" + cancel_if_cluster_is_not_running_monitoring_agent: "true" + cancel_if_cluster_status_creating: "true" + cancel_if_cluster_status_deleting: "true" + cancel_if_outside_working_hours: "true" diff --git a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/prometheus-operator.rules.yml b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/prometheus-operator.rules.yml index 6628f6601..a9d130014 100644 --- a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/prometheus-operator.rules.yml +++ b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/prometheus-operator.rules.yml @@ -10,7 +10,7 @@ spec: groups: - name: prometheus-operator rules: - ## TODO(@giantswarm/team-atlas) remove once all clusters are passed v20 + ## TODO(@giantswarm/team-atlas) - remove once all clusters are passed v20 - alert: DuplicatePrometheusOperatorKubeletService annotations: description: '{{`Prometheus-operator in cluster {{ $labels.cluster_id }} has duplicate kubelet service.`}}' diff --git a/helm/prometheus-rules/templates/platform/honeybadger/alerting-rules/chart.rules.yml b/helm/prometheus-rules/templates/platform/honeybadger/alerting-rules/chart.rules.yml index c9bc42ce4..1a1befca4 100644 --- a/helm/prometheus-rules/templates/platform/honeybadger/alerting-rules/chart.rules.yml +++ b/helm/prometheus-rules/templates/platform/honeybadger/alerting-rules/chart.rules.yml @@ -1,4 +1,4 @@ -# TODO - This is 
only used by the chart-operator, let's get rid of it when the chart operator is gone. +# TODO(@giantswarm/team-honeybadger) - This is only used by the chart-operator, let's get rid of it when the chart operator is gone. apiVersion: monitoring.coreos.com/v1 kind: PrometheusRule metadata: diff --git a/helm/prometheus-rules/templates/platform/honeybadger/alerting-rules/helm.rules.yml b/helm/prometheus-rules/templates/platform/honeybadger/alerting-rules/helm.rules.yml index a07271c66..6ac690b2b 100644 --- a/helm/prometheus-rules/templates/platform/honeybadger/alerting-rules/helm.rules.yml +++ b/helm/prometheus-rules/templates/platform/honeybadger/alerting-rules/helm.rules.yml @@ -1,4 +1,4 @@ -# TODO - This is only used by the chart-operator, let's get rid of it when the chart operator is gone. +# TODO(@giantswarm/team-honeybadger) - This is only used by the chart-operator, let's get rid of it when the chart operator is gone. apiVersion: monitoring.coreos.com/v1 kind: PrometheusRule metadata: diff --git a/helm/prometheus-rules/templates/platform/honeybadger/recording-rules/helm-operations.rules.yml b/helm/prometheus-rules/templates/platform/honeybadger/recording-rules/helm-operations.rules.yml index a703dce91..2675857f0 100644 --- a/helm/prometheus-rules/templates/platform/honeybadger/recording-rules/helm-operations.rules.yml +++ b/helm/prometheus-rules/templates/platform/honeybadger/recording-rules/helm-operations.rules.yml @@ -1,4 +1,4 @@ -# TODO - This is only used by the chart-operator, let's get rid of it when the chart operator is gone. +# TODO(@giantswarm/team-honeybadger) - This is only used by the chart-operator, let's get rid of it when the chart operator is gone. apiVersion: monitoring.coreos.com/v1 kind: PrometheusRule metadata: diff --git a/test/tests/providers/global/platform/atlas/alerting-rules/logging.rules.test.yml b/test/tests/providers/global/platform/atlas/alerting-rules/logging.rules.test.yml new file mode 100644 index 000000000..2b0941d70 --- /dev/null +++ b/test/tests/providers/global/platform/atlas/alerting-rules/logging.rules.test.yml @@ -0,0 +1,229 @@ +--- +rule_files: + - logging.rules.yml + +tests: + # Test LoggingAgentDown + - interval: 1m + input_series: + # For the first 60min: test with 1 pod: none, up, down + - series: 'up{container="alloy", cluster_id="gauss", cluster_type="management_cluster", installation="gauss", job="alloy-logs", pod="alloy-1xxxx", provider="aws", pipeline="testing"}' + values: "_x20 1+0x20 0+0x40" + - series: kube_pod_info{cluster_id="gauss", cluster_type="management_cluster", installation="gauss", pod="alloy-1xxxx", node="ip-10-0-5-1.eu-west-1.compute.internal", provider="aws", pipeline="testing"} + values: "1x180" + # From 60min: test with 2 pods: 1 up and 1 down, 2 up, 2 down. 
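+      # Illustrative reading of promtool's `values` notation used in this test file:
+      #   `_x20`   -> a run of missing samples (the target has not been discovered yet),
+      #   `1+0x20` -> start at 1 and add 0 repeatedly, i.e. a flat run of 1s (target up),
+      #   `0+0x40` -> a flat run of 0s (target down).
+      # The series below repeat this pattern with two pods to exercise the multi-pod case.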
+ - series: 'up{container="alloy", cluster_id="gauss", cluster_type="management_cluster", installation="gauss", job="alloy-logs", pod="alloy-2xxxx", provider="aws", pipeline="testing"}' + values: "_x80 1+0x40 1+0x20 0+0x40" + - series: kube_pod_info{cluster_id="gauss", cluster_type="management_cluster", installation="gauss", pod="alloy-2xxxx", node="ip-10-0-5-2.eu-west-1.compute.internal", provider="aws", pipeline="testing"} + values: "1x180" + - series: 'up{container="alloy", cluster_type="management_cluster", cluster_id="gauss", installation="gauss", job="alloy-logs", pod="alloy-3xxxx", provider="aws", pipeline="testing"}' + values: "_x80 0+0x40 1+0x20 0+0x40" + - series: kube_pod_info{cluster_id="gauss", cluster_type="management_cluster", installation="gauss", pod="alloy-3xxxx", node="ip-10-0-5-3.eu-west-1.compute.internal", provider="aws", pipeline="testing"} + values: "1x180" + alert_rule_test: + - alertname: LoggingAgentDown + eval_time: 10m + - alertname: LoggingAgentDown + eval_time: 30m + - alertname: LoggingAgentDown + eval_time: 71m + exp_alerts: + - exp_labels: + area: platform + cancel_if_outside_working_hours: "true" + cancel_if_cluster_status_creating: "true" + cancel_if_cluster_status_deleting: "true" + cancel_if_cluster_status_updating: "true" + cancel_if_node_unschedulable: "true" + cancel_if_node_not_ready: "true" + cluster_id: gauss + cluster_type: management_cluster + installation: gauss + node: ip-10-0-5-1.eu-west-1.compute.internal + pipeline: testing + pod: alloy-1xxxx + provider: aws + severity: page + team: atlas + topic: observability + exp_annotations: + description: "Scraping of all logging-agent pods to check if one failed every 30 minutes." + opsrecipe: "logging-agent/" + # Tests with 2 pods + - alertname: LoggingAgentDown + eval_time: 111m + exp_alerts: + - exp_labels: + area: platform + cancel_if_outside_working_hours: "true" + cancel_if_cluster_status_creating: "true" + cancel_if_cluster_status_deleting: "true" + cancel_if_cluster_status_updating: "true" + cancel_if_node_unschedulable: "true" + cancel_if_node_not_ready: "true" + cluster_id: gauss + cluster_type: management_cluster + installation: gauss + node: ip-10-0-5-3.eu-west-1.compute.internal + pipeline: testing + pod: alloy-3xxxx + provider: aws + severity: page + team: atlas + topic: observability + exp_annotations: + description: "Scraping of all logging-agent pods to check if one failed every 30 minutes." + opsrecipe: "logging-agent/" + - alertname: LoggingAgentDown + eval_time: 121m + - alertname: LoggingAgentDown + eval_time: 180m + exp_alerts: + - exp_labels: + area: platform + cancel_if_outside_working_hours: "true" + cancel_if_cluster_status_creating: "true" + cancel_if_cluster_status_deleting: "true" + cancel_if_cluster_status_updating: "true" + cancel_if_node_unschedulable: "true" + cancel_if_node_not_ready: "true" + cluster_id: gauss + cluster_type: management_cluster + installation: gauss + node: ip-10-0-5-2.eu-west-1.compute.internal + pipeline: testing + pod: alloy-2xxxx + provider: aws + severity: page + team: atlas + topic: observability + exp_annotations: + description: "Scraping of all logging-agent pods to check if one failed every 30 minutes." 
+ opsrecipe: "logging-agent/" + - exp_labels: + area: platform + cancel_if_outside_working_hours: "true" + cancel_if_cluster_status_creating: "true" + cancel_if_cluster_status_deleting: "true" + cancel_if_cluster_status_updating: "true" + cancel_if_node_unschedulable: "true" + cancel_if_node_not_ready: "true" + cluster_id: gauss + cluster_type: management_cluster + installation: gauss + node: ip-10-0-5-3.eu-west-1.compute.internal + pipeline: testing + pod: alloy-3xxxx + provider: aws + severity: page + team: atlas + topic: observability + exp_annotations: + description: "Scraping of all logging-agent pods to check if one failed every 30 minutes." + opsrecipe: "logging-agent/" + # Test LogForwardingErrors + - interval: 1m + input_series: + # Tests with multiple cases: no metrics, no requests, only status_code 204 ones, 204 ones and 500 that are less than 10% of the the total, 500 request that represent more than 10% of the total, only 500 ones + - series: 'loki_write_request_duration_seconds_count{status_code="500", cluster_type="management_cluster", cluster_id="gauss", installation="gauss", provider="aws", pipeline="testing", node="ip-10-0-5-145.eu-west-1.compute.internal", pod="alloy-2j7z7"}' + values: "_x60 0+0x60 0+0x60 0+50x60 3000+100x60 9000+600x60" + - series: 'loki_write_request_duration_seconds_count{status_code="204", cluster_type="management_cluster", cluster_id="gauss", installation="gauss", provider="aws", pipeline="testing", node="ip-10-0-5-145.eu-west-1.compute.internal", pod="alloy-2j7z7"}' + values: "_x60 0+0x60 0+600x60 36000+600x60 72000+600x60 108000+0x60" + alert_rule_test: + - alertname: LogForwardingErrors + eval_time: 30m + - alertname: LogForwardingErrors + eval_time: 90m + - alertname: LogForwardingErrors + eval_time: 150m + - alertname: LogForwardingErrors + eval_time: 210m + - alertname: LogForwardingErrors + eval_time: 270m + exp_alerts: + - exp_labels: + area: platform + cancel_if_cluster_status_creating: "true" + cancel_if_cluster_status_deleting: "true" + cancel_if_cluster_status_updating: "true" + cluster_id: gauss + installation: gauss + provider: aws + pipeline: testing + severity: page + team: atlas + topic: observability + exp_annotations: + description: "More that 10% of the requests to Loki are failing." + opsrecipe: "log-shipping-errors/" + - alertname: LogForwardingErrors + eval_time: 330m + exp_alerts: + - exp_labels: + area: platform + cancel_if_cluster_status_creating: "true" + cancel_if_cluster_status_deleting: "true" + cancel_if_cluster_status_updating: "true" + cluster_id: gauss + installation: gauss + provider: aws + pipeline: testing + severity: page + team: atlas + topic: observability + exp_annotations: + description: "More that 10% of the requests to Loki are failing." 
+ opsrecipe: "log-shipping-errors/" + # Test LogReceivingErrors + - interval: 1m + input_series: + # Tests with multiple cases: no metrics, no requests, only status_code 204 ones, 204 ones and 500 that are less than 10% of the the total, 500 request that represent more than 10% of the total, only 500 ones + - series: 'loki_source_api_request_duration_seconds_count{status_code="500", cluster_type="management_cluster", cluster_id="gauss", installation="gauss", provider="aws", pipeline="testing", node="ip-10-0-5-145.eu-west-1.compute.internal", route="api_v1_push", pod="alloy-2j7z7"}' + values: "_x60 0+0x60 0+0x60 0+50x60 3000+100x60 9000+600x60" + - series: 'loki_source_api_request_duration_seconds_count{status_code="204", cluster_type="management_cluster", cluster_id="gauss", installation="gauss", provider="aws", pipeline="testing", node="ip-10-0-5-145.eu-west-1.compute.internal", route="api_v1_push", pod="alloy-2j7z7"}' + values: "_x60 0+0x60 0+600x60 36000+600x60 72000+600x60 108000+0x60" + alert_rule_test: + - alertname: LogReceivingErrors + eval_time: 30m + - alertname: LogReceivingErrors + eval_time: 90m + - alertname: LogReceivingErrors + eval_time: 150m + - alertname: LogReceivingErrors + eval_time: 210m + - alertname: LogReceivingErrors + eval_time: 270m + exp_alerts: + - exp_labels: + area: platform + cancel_if_cluster_status_creating: "true" + cancel_if_cluster_status_deleting: "true" + cancel_if_cluster_status_updating: "true" + cluster_id: gauss + installation: gauss + provider: aws + pipeline: testing + severity: page + team: atlas + topic: observability + exp_annotations: + description: "More that 10% of the loki requests to the observability gateway are failing." + opsrecipe: "log-shipping-errors/" + - alertname: LogReceivingErrors + eval_time: 330m + exp_alerts: + - exp_labels: + area: platform + cancel_if_cluster_status_creating: "true" + cancel_if_cluster_status_deleting: "true" + cancel_if_cluster_status_updating: "true" + cluster_id: gauss + installation: gauss + provider: aws + pipeline: testing + severity: page + team: atlas + topic: observability + exp_annotations: + description: "More that 10% of the loki requests to the observability gateway are failing." + opsrecipe: "log-shipping-errors/" diff --git a/test/tests/providers/global/platform/honeybadger/alerting-rules/helm-operations.rules.test.yml b/test/tests/providers/global/platform/honeybadger/alerting-rules/helm-operations.rules.test.yml index 54a65b1a6..d20c15e33 100644 --- a/test/tests/providers/global/platform/honeybadger/alerting-rules/helm-operations.rules.test.yml +++ b/test/tests/providers/global/platform/honeybadger/alerting-rules/helm-operations.rules.test.yml @@ -1,4 +1,4 @@ -# TODO - This is only used by the chart-operator, let's get rid of it when the chart operator is gone. +# TODO(@giantswarm/team-honeybadger) - This is only used by the chart-operator, let's get rid of it when the chart operator is gone. 
--- rule_files: - helm-operations.rules.yml From 759ae7f61e70b30b38c7ad25c546de540f996208 Mon Sep 17 00:00:00 2001 From: Quentin Bisson Date: Wed, 30 Oct 2024 16:38:40 +0100 Subject: [PATCH 04/24] Update helm/prometheus-rules/templates/platform/atlas/alerting-rules/storage.rules.yml --- .../templates/platform/atlas/alerting-rules/storage.rules.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/storage.rules.yml b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/storage.rules.yml index a1c006233..7b0798d5d 100644 --- a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/storage.rules.yml +++ b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/storage.rules.yml @@ -17,7 +17,7 @@ spec: annotations: description: '{{`The free space on the Data Disk for instance: {{ $labels.instance }} and PVC: {{ $labels.persistentvolumeclaim}} was below 10 percent for longer than 1 hour (current value {{ $value | printf "%.2f" }}).`}}' opsrecipe: low-disk-space/#persistent-volume - expr: kubelet_volume_stats_available_bytes{cluster_type="management_cluster", persistentvolumeclaim=~".*(alertmanager|grafana|loki|mimir|prometheus|pyroscope|tempo).*"}/kubelet_volume_stats_capacity_bytes{persistentvolumeclaim=~".*(alertmanager|grafana|loki|mimir|prometheus|pyroscope|tempo).*"} < 0.10 + expr: kubelet_volume_stats_available_bytes{cluster_type="management_cluster", persistentvolumeclaim=~".*(alertmanager|loki|mimir|prometheus|pyroscope|tempo).*"}/kubelet_volume_stats_capacity_bytes{persistentvolumeclaim=~".*(alertmanager|loki|mimir|prometheus|pyroscope|tempo).*"} < 0.10 for: 1h labels: area: platform From 9cee93af3dc58a28491e037e20084f3804c00b8e Mon Sep 17 00:00:00 2001 From: Quentin Bisson Date: Wed, 30 Oct 2024 16:39:38 +0100 Subject: [PATCH 05/24] Update prometheus.rules.yml --- .../atlas/alerting-rules/prometheus.rules.yml | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/prometheus.rules.yml b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/prometheus.rules.yml index 3a6e62302..b31713f90 100644 --- a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/prometheus.rules.yml +++ b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/prometheus.rules.yml @@ -27,6 +27,23 @@ spec: severity: page team: atlas topic: observability + ## Pages Atlas when prometheus fails to send samples to cortex + - alert: PrometheusMissingGrafanaCloud + annotations: + description: 'Prometheus is not sending data to Grafana Cloud.' 
+ opsrecipe: prometheus-grafanacloud/ + {{- if .Values.mimir.enabled }} + expr: absent(prometheus_remote_storage_samples_total{remote_name="grafana-cloud", cluster_type="management_cluster", cluster_id="{{ .Values.managementCluster.name }}", installation="{{ .Values.managementCluster.name }}", provider="{{ .Values.managementCluster.provider.kind }}", pipeline="{{ .Values.managementCluster.pipeline }}"}) + {{- else }} + expr: absent(prometheus_remote_storage_samples_total{remote_name="grafana-cloud"}) + {{- end }} + for: 1h + labels: + area: platform + cancel_if_outside_working_hours: "true" + severity: page + team: atlas + topic: observability - alert: PrometheusFailsToCommunicateWithRemoteStorageAPI annotations: description: '{{`Prometheus can''t communicate with Remote Storage API at {{ $labels.url }}.`}}' From 14d67c3b174e22be2feeebb62383c69d8de2cdaf Mon Sep 17 00:00:00 2001 From: Quentin Bisson Date: Wed, 30 Oct 2024 16:40:07 +0100 Subject: [PATCH 06/24] Update helm/prometheus-rules/templates/platform/atlas/alerting-rules/alloy.rules.yml --- .../templates/platform/atlas/alerting-rules/alloy.rules.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/alloy.rules.yml b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/alloy.rules.yml index edf5e61ce..1365fa848 100644 --- a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/alloy.rules.yml +++ b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/alloy.rules.yml @@ -9,7 +9,6 @@ metadata: namespace: {{ .Values.namespace }} spec: groups: - ## TODO(quentin) add tests for the alerts ## TODO(quentin) add opsrecipe for the alerts # List of alerts for on the state of the alloy components. # Alerts are coming from https://github.com/grafana/alloy/blob/ed52746567d2469a6a97a592ac5aec807646b327/operations/alloy-mixin/alerts/controller.libsonnet From b9c1deab0a289ec8e2f0876c9be3888f0f3ecb0f Mon Sep 17 00:00:00 2001 From: Quentin Bisson Date: Wed, 30 Oct 2024 16:40:24 +0100 Subject: [PATCH 07/24] Update helm/prometheus-rules/templates/platform/atlas/alerting-rules/alloy.rules.yml --- .../templates/platform/atlas/alerting-rules/alloy.rules.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/alloy.rules.yml b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/alloy.rules.yml index 1365fa848..7c1270285 100644 --- a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/alloy.rules.yml +++ b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/alloy.rules.yml @@ -9,7 +9,6 @@ metadata: namespace: {{ .Values.namespace }} spec: groups: - ## TODO(quentin) add opsrecipe for the alerts # List of alerts for on the state of the alloy components. # Alerts are coming from https://github.com/grafana/alloy/blob/ed52746567d2469a6a97a592ac5aec807646b327/operations/alloy-mixin/alerts/controller.libsonnet # We added the alert labels and added the missing labels from the aggregations. 
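# For comparison with the upstream alloy-mixin referenced above: the upstream controller alerts
# aggregate only over Alloy's own labels, along the lines of
#   sum by (namespace, job, component_path, component_id) (rate(alloy_component_evaluation_slow_seconds[10m])) > 0
# whereas the copies in alloy.rules.yml also keep cluster_id, installation, provider and pipeline
# in the `sum by (...)` clause so that routing and cancel_if_* labels survive the aggregation.
# (The exact upstream label set shown here is an approximation.)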
From 09929b0d2b509b505804db7b1b988ef4969326d5 Mon Sep 17 00:00:00 2001 From: Quentin Bisson Date: Wed, 30 Oct 2024 16:40:41 +0100 Subject: [PATCH 08/24] Update helm/prometheus-rules/templates/platform/atlas/alerting-rules/logging.rules.yaml --- .../templates/platform/atlas/alerting-rules/logging.rules.yaml | 1 - 1 file changed, 1 deletion(-) diff --git a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/logging.rules.yaml b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/logging.rules.yaml index 5e34e77fc..155b2cda0 100644 --- a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/logging.rules.yaml +++ b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/logging.rules.yaml @@ -7,7 +7,6 @@ metadata: namespace: {{ .Values.namespace }} spec: groups: - ## TODO(quentin) add opsrecipe for the alerts - name: logging-agent rules: # This alert lists the existing logging-agent pods (to extract the node label and inhibit if the node is not ready) From 40452a5fa4122525fb15b0df5093f9c0e742228c Mon Sep 17 00:00:00 2001 From: QuentinBisson Date: Wed, 30 Oct 2024 16:41:28 +0100 Subject: [PATCH 09/24] add missing tests --- .../atlas/alerting-rules/alloy.rules.test.yml | 74 +++++++++++++++++++ 1 file changed, 74 insertions(+) create mode 100644 test/tests/providers/global/platform/atlas/alerting-rules/alloy.rules.test.yml diff --git a/test/tests/providers/global/platform/atlas/alerting-rules/alloy.rules.test.yml b/test/tests/providers/global/platform/atlas/alerting-rules/alloy.rules.test.yml new file mode 100644 index 000000000..62ad40ae6 --- /dev/null +++ b/test/tests/providers/global/platform/atlas/alerting-rules/alloy.rules.test.yml @@ -0,0 +1,74 @@ +--- +rule_files: + - alloy.rules.yml + +tests: + # Test AlloySlowComponentEvaluations + - interval: 1m + input_series: + - series: 'alloy_component_evaluation_slow_seconds{cluster_id="gauss", installation="gauss", provider="aws", pipeline="testing", namespace="default", job="alloy-controller", component_path="path1", component_id="comp1"}' + values: "0+0x10 0+1x50 0x50" + alert_rule_test: + - alertname: AlloySlowComponentEvaluations + eval_time: 10m + - alertname: AlloySlowComponentEvaluations + eval_time: 50m + exp_alerts: + - exp_labels: + area: platform + cancel_if_outside_working_hours: "true" + cancel_if_cluster_status_creating: "true" + cancel_if_cluster_status_deleting: "true" + cancel_if_cluster_status_updating: "true" + cluster_id: gauss + installation: gauss + provider: aws + pipeline: testing + namespace: default + job: alloy-controller + component_path: path1 + component_id: comp1 + severity: notify + team: atlas + topic: observability + exp_annotations: + dashboard: bf9f456aad7108b2c808dbd9973e386f/alloy-controller + description: "Component evaluations are taking too long under job alloy-controller, component_path path1, component_id comp1." + opsrecipe: "alloy-components/" + summary: "Component evaluations are taking too long." 
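+    # Illustrative: these unit tests can be run locally with promtool, e.g.
+    #   promtool test rules alloy.rules.test.yml
+    # assuming the rule file listed under rule_files has first been rendered from its Helm template.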
+ - alertname: AlloySlowComponentEvaluations + eval_time: 80m + + # Test AlloyUnhealthyComponents + - interval: 1m + input_series: + - series: 'alloy_component_controller_running_components{health_type="unhealthy", cluster_id="gauss", installation="gauss", provider="aws", pipeline="testing", namespace="default", job="alloy-controller"}' + values: "0+0x10 1+0x50 0x50" + alert_rule_test: + - alertname: AlloyUnhealthyComponents + eval_time: 10m + - alertname: AlloyUnhealthyComponents + eval_time: 30m + exp_alerts: + - exp_labels: + area: platform + cancel_if_outside_working_hours: "true" + cancel_if_cluster_status_creating: "true" + cancel_if_cluster_status_deleting: "true" + cancel_if_cluster_status_updating: "true" + cluster_id: gauss + installation: gauss + provider: aws + pipeline: testing + namespace: default + job: alloy-controller + severity: page + team: atlas + topic: observability + exp_annotations: + dashboard: bf9f456aad7108b2c808dbd9973e386f/alloy-controller + description: "Unhealthy components detected under job alloy-controller" + opsrecipe: "alloy-components/" + summary: "Unhealthy components detected." + - alertname: AlloyUnhealthyComponents + eval_time: 80m From fbc9c8d61a0a5b43c31a215a7b9703dc84d429e4 Mon Sep 17 00:00:00 2001 From: QuentinBisson Date: Mon, 4 Nov 2024 15:20:30 +0100 Subject: [PATCH 10/24] change based on ops-recipes --- .../atlas/alerting-rules/alloy.rules.yml | 30 ++++++++++++++-- ...rules.yaml => logging-pipeline.rules.yaml} | 34 +++---------------- .../atlas/alerting-rules/alloy.rules.test.yml | 4 +-- ...st.yml => logging-pipeline.rules.test.yml} | 26 +++++++------- 4 files changed, 47 insertions(+), 47 deletions(-) rename helm/prometheus-rules/templates/platform/atlas/alerting-rules/{logging.rules.yaml => logging-pipeline.rules.yaml} (71%) rename test/tests/providers/global/platform/atlas/alerting-rules/{logging.rules.test.yml => logging-pipeline.rules.test.yml} (92%) diff --git a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/alloy.rules.yml b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/alloy.rules.yml index 7c1270285..aa1959ded 100644 --- a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/alloy.rules.yml +++ b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/alloy.rules.yml @@ -18,7 +18,7 @@ spec: annotations: dashboard: bf9f456aad7108b2c808dbd9973e386f/alloy-controller description: '{{`Component evaluations are taking too long under job {{ $labels.job }}, component_path {{ $labels.component_path }}, component_id {{ $labels.component_id }}.`}}' - opsrecipe: alloy-components/ + opsrecipe: alloy/ summary: Component evaluations are taking too long. expr: sum by (cluster_id, installation, provider, pipeline, namespace, job, component_path, component_id) (rate(alloy_component_evaluation_slow_seconds[10m])) > 0 for: 15m @@ -35,7 +35,7 @@ spec: annotations: dashboard: bf9f456aad7108b2c808dbd9973e386f/alloy-controller description: '{{`Unhealthy components detected under job {{ $labels.job }}`}}' - opsrecipe: alloy-components/ + opsrecipe: alloy/ summary: Unhealthy components detected. 
expr: sum by (cluster_id, installation, provider, pipeline, namespace, job) (alloy_component_controller_running_components{health_type!="healthy"}) > 0 for: 15m @@ -48,3 +48,29 @@ spec: cancel_if_cluster_status_creating: "true" cancel_if_cluster_status_deleting: "true" cancel_if_cluster_status_updating: "true" + - name: logging-agent + rules: + # This alert lists the existing logging-agent pods (to extract the node label and inhibit if the node is not ready) + # and join the pods with the not running containers + - alert: LoggingAgentDown + annotations: + dashboard: 53c1ecddc3a1d5d4b8d6cd0c23676c31/alloy-logs-overview + description: '{{`Scraping of all logging-agent pods to check if one failed every 30 minutes.`}}' + opsrecipe: alloy/ + expr: |- + kube_pod_info{pod=~"alloy-logs.*"} + * on(cluster_id, pod) + group_left () + up{job="alloy-logs", container="alloy"} == 0 + for: 30m + labels: + area: platform + severity: page + team: atlas + topic: observability + cancel_if_outside_working_hours: "true" + cancel_if_cluster_status_creating: "true" + cancel_if_cluster_status_deleting: "true" + cancel_if_cluster_status_updating: "true" + cancel_if_node_unschedulable: "true" + cancel_if_node_not_ready: "true" diff --git a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/logging.rules.yaml b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/logging-pipeline.rules.yaml similarity index 71% rename from helm/prometheus-rules/templates/platform/atlas/alerting-rules/logging.rules.yaml rename to helm/prometheus-rules/templates/platform/atlas/alerting-rules/logging-pipeline.rules.yaml index 155b2cda0..c45f70f42 100644 --- a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/logging.rules.yaml +++ b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/logging-pipeline.rules.yaml @@ -3,37 +3,11 @@ kind: PrometheusRule metadata: labels: {{- include "labels.common" . | nindent 4 }} - name: logging.rules + name: logging-pipeline.rules namespace: {{ .Values.namespace }} spec: groups: - - name: logging-agent - rules: - # This alert lists the existing logging-agent pods (to extract the node label and inhibit if the node is not ready) - # and join the pods with the not running containers - - alert: LoggingAgentDown - annotations: - dashboard: 53c1ecddc3a1d5d4b8d6cd0c23676c31/alloy-logs-overview - description: '{{`Scraping of all logging-agent pods to check if one failed every 30 minutes.`}}' - opsrecipe: logging-agent/ - expr: |- - kube_pod_info{pod=~"alloy-logs.*"} - * on(cluster_id, pod) - group_left () - up{job="alloy-logs", container="alloy"} == 0 - for: 30m - labels: - area: platform - severity: page - team: atlas - topic: observability - cancel_if_outside_working_hours: "true" - cancel_if_cluster_status_creating: "true" - cancel_if_cluster_status_deleting: "true" - cancel_if_cluster_status_updating: "true" - cancel_if_node_unschedulable: "true" - cancel_if_node_not_ready: "true" - - name: log-ingestion + - name: logging-pipeline rules: # Any alloy component that uses the loki.write component can throw such errors. 
# This includes alloy-logs and the observability-gateway @@ -41,7 +15,7 @@ spec: annotations: dashboard: 53c1ecddc3a1d5d4b8d6cd0c23676c31/alloy-logs-overview description: '{{`More that 10% of the requests to Loki are failing.`}}' - opsrecipe: log-shipping-errors/ + opsrecipe: logging-pipeline/ expr: |- ( 100 @@ -79,7 +53,7 @@ spec: annotations: dashboard: 53c1ecddc3a1d5d4b8d6cd0c23676c31/alloy-logs-overview description: '{{`More that 10% of the loki requests to the observability gateway are failing.`}}' - opsrecipe: log-shipping-errors/ + opsrecipe: logging-pipeline/ expr: |- ( 100 diff --git a/test/tests/providers/global/platform/atlas/alerting-rules/alloy.rules.test.yml b/test/tests/providers/global/platform/atlas/alerting-rules/alloy.rules.test.yml index 62ad40ae6..2effa82d5 100644 --- a/test/tests/providers/global/platform/atlas/alerting-rules/alloy.rules.test.yml +++ b/test/tests/providers/global/platform/atlas/alerting-rules/alloy.rules.test.yml @@ -34,7 +34,7 @@ tests: exp_annotations: dashboard: bf9f456aad7108b2c808dbd9973e386f/alloy-controller description: "Component evaluations are taking too long under job alloy-controller, component_path path1, component_id comp1." - opsrecipe: "alloy-components/" + opsrecipe: "alloy/" summary: "Component evaluations are taking too long." - alertname: AlloySlowComponentEvaluations eval_time: 80m @@ -68,7 +68,7 @@ tests: exp_annotations: dashboard: bf9f456aad7108b2c808dbd9973e386f/alloy-controller description: "Unhealthy components detected under job alloy-controller" - opsrecipe: "alloy-components/" + opsrecipe: "alloy/" summary: "Unhealthy components detected." - alertname: AlloyUnhealthyComponents eval_time: 80m diff --git a/test/tests/providers/global/platform/atlas/alerting-rules/logging.rules.test.yml b/test/tests/providers/global/platform/atlas/alerting-rules/logging-pipeline.rules.test.yml similarity index 92% rename from test/tests/providers/global/platform/atlas/alerting-rules/logging.rules.test.yml rename to test/tests/providers/global/platform/atlas/alerting-rules/logging-pipeline.rules.test.yml index 2b0941d70..31217a0a7 100644 --- a/test/tests/providers/global/platform/atlas/alerting-rules/logging.rules.test.yml +++ b/test/tests/providers/global/platform/atlas/alerting-rules/logging-pipeline.rules.test.yml @@ -1,6 +1,6 @@ --- rule_files: - - logging.rules.yml + - logging-pipeline.rules.yml tests: # Test LoggingAgentDown @@ -47,8 +47,8 @@ tests: team: atlas topic: observability exp_annotations: - description: "Scraping of all logging-agent pods to check if one failed every 30 minutes." - opsrecipe: "logging-agent/" + description: "Scraping of all alloy pods to check if one failed every 30 minutes." + opsrecipe: "alloy/" # Tests with 2 pods - alertname: LoggingAgentDown eval_time: 111m @@ -72,8 +72,8 @@ tests: team: atlas topic: observability exp_annotations: - description: "Scraping of all logging-agent pods to check if one failed every 30 minutes." - opsrecipe: "logging-agent/" + description: "Scraping of all alloy pods to check if one failed every 30 minutes." + opsrecipe: "alloy/" - alertname: LoggingAgentDown eval_time: 121m - alertname: LoggingAgentDown @@ -98,8 +98,8 @@ tests: team: atlas topic: observability exp_annotations: - description: "Scraping of all logging-agent pods to check if one failed every 30 minutes." - opsrecipe: "logging-agent/" + description: "Scraping of all alloy pods to check if one failed every 30 minutes." 
+ opsrecipe: "alloy/" - exp_labels: area: platform cancel_if_outside_working_hours: "true" @@ -119,8 +119,8 @@ tests: team: atlas topic: observability exp_annotations: - description: "Scraping of all logging-agent pods to check if one failed every 30 minutes." - opsrecipe: "logging-agent/" + description: "Scraping of all alloy pods to check if one failed every 30 minutes." + opsrecipe: "alloy/" # Test LogForwardingErrors - interval: 1m input_series: @@ -155,7 +155,7 @@ tests: topic: observability exp_annotations: description: "More that 10% of the requests to Loki are failing." - opsrecipe: "log-shipping-errors/" + opsrecipe: "logging-pipeline/" - alertname: LogForwardingErrors eval_time: 330m exp_alerts: @@ -173,7 +173,7 @@ tests: topic: observability exp_annotations: description: "More that 10% of the requests to Loki are failing." - opsrecipe: "log-shipping-errors/" + opsrecipe: "logging-pipeline/" # Test LogReceivingErrors - interval: 1m input_series: @@ -208,7 +208,7 @@ tests: topic: observability exp_annotations: description: "More that 10% of the loki requests to the observability gateway are failing." - opsrecipe: "log-shipping-errors/" + opsrecipe: "logging-pipeline/" - alertname: LogReceivingErrors eval_time: 330m exp_alerts: @@ -226,4 +226,4 @@ tests: topic: observability exp_annotations: description: "More that 10% of the loki requests to the observability gateway are failing." - opsrecipe: "log-shipping-errors/" + opsrecipe: "logging-pipeline/" From 9e726643210d6c065e14351140a52ce4f16a4a4c Mon Sep 17 00:00:00 2001 From: QuentinBisson Date: Tue, 5 Nov 2024 11:53:41 +0100 Subject: [PATCH 11/24] Clean up some rules a bit --- CHANGELOG.md | 3 +- .../atlas/alerting-rules/alloy.rules.yml | 21 ++- ...rter.rules.yml => grafana-cloud.rules.yml} | 30 +++- .../atlas/alerting-rules/grafana.rules.yml | 4 +- .../kube-state-metrics.rules.yml | 1 - .../atlas/alerting-rules/mimir.rules.yml | 15 -- .../atlas/alerting-rules/prometheus.rules.yml | 18 -- ....rules.test.yml => grafana-cloud.test.yml} | 2 +- .../atlas/alerting-rules/mimir.rules.test.yml | 29 ---- .../atlas/alerting-rules/alloy.rules.test.yml | 154 ++++++++++++++++++ .../logging-pipeline.rules.test.yml | 118 -------------- 11 files changed, 204 insertions(+), 191 deletions(-) rename helm/prometheus-rules/templates/platform/atlas/alerting-rules/{mimir-to-grafana-cloud-exporter.rules.yml => grafana-cloud.rules.yml} (74%) rename test/tests/providers/capi/capa-mimir/platform/atlas/alerting-rules/{mimir-to-grafana-cloud-exporter.rules.test.yml => grafana-cloud.test.yml} (99%) diff --git a/CHANGELOG.md b/CHANGELOG.md index e9ef272bf..92d0a37ec 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -14,7 +14,6 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - `LoggingAgentDown` to be alerted when the logging agent is down. - `LogForwardingErrors` to be alerted when the `loki.write` component is failing. - `LogReceivingErrors` to be alerted when the `loki.source.api` components of the gateway is failing. - - `MonitoringAgentFailing` and `InhibitionMonitoringAgentFailing` to be alerted when the monitoring agent is not able to send metrics. ### Changed @@ -22,6 +21,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - `observability-operator` - `alloy-rules` - `observability-gateway` +- Move all `grafana-cloud` related alerts to their own file. +- Move all alloy related alerts to the alloy alert file and fix alloy-logs tests. 
## [4.23.0] - 2024-10-30 diff --git a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/alloy.rules.yml b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/alloy.rules.yml index aa1959ded..8b3e6256c 100644 --- a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/alloy.rules.yml +++ b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/alloy.rules.yml @@ -6,7 +6,7 @@ metadata: labels: {{- include "labels.common" . | nindent 4 }} name: alloy.rules - namespace: {{ .Values.namespace }} + namespace: {{ .Values.namespace }} spec: groups: # List of alerts for on the state of the alloy components. @@ -48,7 +48,24 @@ spec: cancel_if_cluster_status_creating: "true" cancel_if_cluster_status_deleting: "true" cancel_if_cluster_status_updating: "true" - - name: logging-agent + - name: alloy.rules + rules: + - alert: AlloyForPrometheusRulesDown + annotations: + description: 'Alloy sending PrometheusRules to Loki and Mimir ruler is down.' + opsrecipe: prometheus-rules/ + expr: count(up{job="alloy-rules", namespace="monitoring"} == 0) by (cluster_id, installation, provider, pipeline) > 0 + for: 1h + labels: + area: platform + cancel_if_cluster_status_creating: "true" + cancel_if_cluster_status_deleting: "true" + cancel_if_cluster_status_updating: "true" + cancel_if_outside_working_hours: "true" + severity: page + team: atlas + topic: observability + - name: alloy.logs rules: # This alert lists the existing logging-agent pods (to extract the node label and inhibit if the node is not ready) # and join the pods with the not running containers diff --git a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/mimir-to-grafana-cloud-exporter.rules.yml b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/grafana-cloud.rules.yml similarity index 74% rename from helm/prometheus-rules/templates/platform/atlas/alerting-rules/mimir-to-grafana-cloud-exporter.rules.yml rename to helm/prometheus-rules/templates/platform/atlas/alerting-rules/grafana-cloud.rules.yml index 40d76d3d2..9560570ef 100644 --- a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/mimir-to-grafana-cloud-exporter.rules.yml +++ b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/grafana-cloud.rules.yml @@ -1,13 +1,35 @@ -{{- if .Values.mimir.enabled }} apiVersion: monitoring.coreos.com/v1 kind: PrometheusRule metadata: labels: {{- include "labels.common" . | nindent 4 }} - name: mimir-to-grafana-cloud-exporter.rules - namespace: {{ .Values.namespace }} + {{- if not .Values.mimir.enabled }} + cluster_type: "management_cluster" + {{- end }} + name: grafana-cloud.rules + namespace: {{ .Values.namespace }} spec: groups: + - name: grafana-cloud + rules: + ## Pages Atlas when prometheus fails to send samples to cortex + - alert: PrometheusMissingGrafanaCloud + annotations: + description: 'Prometheus is not sending data to Grafana Cloud.' 
+ opsrecipe: prometheus-grafanacloud/ + {{- if .Values.mimir.enabled }} + expr: absent(prometheus_remote_storage_samples_total{remote_name="grafana-cloud", cluster_type="management_cluster", cluster_id="{{ .Values.managementCluster.name }}", installation="{{ .Values.managementCluster.name }}", provider="{{ .Values.managementCluster.provider.kind }}", pipeline="{{ .Values.managementCluster.pipeline }}"}) + {{- else }} + expr: absent(prometheus_remote_storage_samples_total{remote_name="grafana-cloud"}) + {{- end }} + for: 1h + labels: + area: platform + cancel_if_outside_working_hours: "true" + severity: page + team: atlas + topic: observability + {{- if .Values.mimir.enabled }} - name: mimir-to-grafana-cloud-exporter rules: - alert: MimirToGrafanaCloudExporterDown @@ -73,4 +95,4 @@ spec: severity: page team: atlas topic: observability -{{- end }} + {{- end }} diff --git a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/grafana.rules.yml b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/grafana.rules.yml index 39fb4a0a0..97a10780b 100644 --- a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/grafana.rules.yml +++ b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/grafana.rules.yml @@ -3,9 +3,9 @@ kind: PrometheusRule metadata: labels: {{- include "labels.common" . | nindent 4 }} -{{- if not .Values.mimir.enabled }} + {{- if not .Values.mimir.enabled }} cluster_type: "management_cluster" -{{- end }} + {{- end }} name: grafana.rules namespace: {{ .Values.namespace }} spec: diff --git a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/kube-state-metrics.rules.yml b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/kube-state-metrics.rules.yml index 6c90a4e2c..83089fc33 100644 --- a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/kube-state-metrics.rules.yml +++ b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/kube-state-metrics.rules.yml @@ -85,7 +85,6 @@ spec: severity: page team: atlas topic: observability - - alert: KubeConfigMapCreatedMetricMissing annotations: description: '{{`kube_configmap_created metric is missing for cluster {{ $labels.cluster_id }}.`}}' diff --git a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/mimir.rules.yml b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/mimir.rules.yml index cd47324a8..6dc137889 100644 --- a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/mimir.rules.yml +++ b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/mimir.rules.yml @@ -61,21 +61,6 @@ spec: severity: page team: atlas topic: observability - - alert: AlloyForPrometheusRulesDown - annotations: - description: 'Alloy sending PrometheusRules to Mimir ruler is down.' 
- opsrecipe: prometheus-rules/ - expr: count(up{job="alloy-rules", namespace="mimir"} == 0) by (cluster_id, installation, provider, pipeline) > 0 - for: 1h - labels: - area: platform - cancel_if_cluster_status_creating: "true" - cancel_if_cluster_status_deleting: "true" - cancel_if_cluster_status_updating: "true" - cancel_if_outside_working_hours: "true" - severity: page - team: atlas - topic: observability - alert: MimirRulerEventsFailed annotations: dashboard: 631e15d5d85afb2ca8e35d62984eeaa0/mimir-ruler diff --git a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/prometheus.rules.yml b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/prometheus.rules.yml index b31713f90..a0bd48fe9 100644 --- a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/prometheus.rules.yml +++ b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/prometheus.rules.yml @@ -1,7 +1,6 @@ apiVersion: monitoring.coreos.com/v1 kind: PrometheusRule metadata: - creationTimestamp: null labels: {{- include "labels.common" . | nindent 4 }} name: prometheus.rules @@ -27,23 +26,6 @@ spec: severity: page team: atlas topic: observability - ## Pages Atlas when prometheus fails to send samples to cortex - - alert: PrometheusMissingGrafanaCloud - annotations: - description: 'Prometheus is not sending data to Grafana Cloud.' - opsrecipe: prometheus-grafanacloud/ - {{- if .Values.mimir.enabled }} - expr: absent(prometheus_remote_storage_samples_total{remote_name="grafana-cloud", cluster_type="management_cluster", cluster_id="{{ .Values.managementCluster.name }}", installation="{{ .Values.managementCluster.name }}", provider="{{ .Values.managementCluster.provider.kind }}", pipeline="{{ .Values.managementCluster.pipeline }}"}) - {{- else }} - expr: absent(prometheus_remote_storage_samples_total{remote_name="grafana-cloud"}) - {{- end }} - for: 1h - labels: - area: platform - cancel_if_outside_working_hours: "true" - severity: page - team: atlas - topic: observability - alert: PrometheusFailsToCommunicateWithRemoteStorageAPI annotations: description: '{{`Prometheus can''t communicate with Remote Storage API at {{ $labels.url }}.`}}' diff --git a/test/tests/providers/capi/capa-mimir/platform/atlas/alerting-rules/mimir-to-grafana-cloud-exporter.rules.test.yml b/test/tests/providers/capi/capa-mimir/platform/atlas/alerting-rules/grafana-cloud.test.yml similarity index 99% rename from test/tests/providers/capi/capa-mimir/platform/atlas/alerting-rules/mimir-to-grafana-cloud-exporter.rules.test.yml rename to test/tests/providers/capi/capa-mimir/platform/atlas/alerting-rules/grafana-cloud.test.yml index ee5645cf0..79c5aa0f1 100644 --- a/test/tests/providers/capi/capa-mimir/platform/atlas/alerting-rules/mimir-to-grafana-cloud-exporter.rules.test.yml +++ b/test/tests/providers/capi/capa-mimir/platform/atlas/alerting-rules/grafana-cloud.test.yml @@ -1,6 +1,6 @@ --- rule_files: -- mimir-to-grafana-cloud-exporter.rules.yml +- grafana-cloud.rules.yml tests: # Tests for `MimirToGrafanaCloudExporterDown` alert diff --git a/test/tests/providers/capi/capa-mimir/platform/atlas/alerting-rules/mimir.rules.test.yml b/test/tests/providers/capi/capa-mimir/platform/atlas/alerting-rules/mimir.rules.test.yml index 37d40af1d..6bdfeaeab 100644 --- a/test/tests/providers/capi/capa-mimir/platform/atlas/alerting-rules/mimir.rules.test.yml +++ b/test/tests/providers/capi/capa-mimir/platform/atlas/alerting-rules/mimir.rules.test.yml @@ -86,35 +86,6 @@ tests: dashboard: ffcd83628d7d4b5a03d1cafd159e6c9c/mimir-overview 
description: "Mimir component : mimir-ingester is down." opsrecipe: "mimir/" - - interval: 1m - input_series: - # test with 1 pod: none, up, down - - series: 'up{job="alloy-rules", cluster_type="management_cluster", cluster_id="golem", provider="capa", pipeline="testing", installation="golem", namespace="mimir"}' - values: "_x20 1+0x70 0+0x70" - alert_rule_test: - - alertname: AlloyForPrometheusRulesDown - eval_time: 10m - - alertname: AlloyForPrometheusRulesDown - eval_time: 80m - - alertname: AlloyForPrometheusRulesDown - eval_time: 160m - exp_alerts: - - exp_labels: - area: platform - cancel_if_outside_working_hours: "true" - cancel_if_cluster_status_creating: "true" - cancel_if_cluster_status_deleting: "true" - cancel_if_cluster_status_updating: "true" - cluster_id: golem - installation: golem - provider: capa - pipeline: testing - severity: page - team: atlas - topic: observability - exp_annotations: - description: "Alloy sending PrometheusRules to Mimir ruler is down." - opsrecipe: "prometheus-rules/" - interval: 1m input_series: # test: none, rate > 0, rate = 0 diff --git a/test/tests/providers/global/platform/atlas/alerting-rules/alloy.rules.test.yml b/test/tests/providers/global/platform/atlas/alerting-rules/alloy.rules.test.yml index 2effa82d5..d8b9309a5 100644 --- a/test/tests/providers/global/platform/atlas/alerting-rules/alloy.rules.test.yml +++ b/test/tests/providers/global/platform/atlas/alerting-rules/alloy.rules.test.yml @@ -72,3 +72,157 @@ tests: summary: "Unhealthy components detected." - alertname: AlloyUnhealthyComponents eval_time: 80m + + # Test AlloyForPrometheusRulesDown + - interval: 1m + input_series: + # test with 1 pod: none, up, down + - series: 'up{job="alloy-rules", cluster_type="management_cluster", cluster_id="golem", provider="capa", pipeline="testing", installation="golem", namespace="monitoring"}' + values: "_x20 1+0x70 0+0x70" + alert_rule_test: + - alertname: AlloyForPrometheusRulesDown + eval_time: 10m + - alertname: AlloyForPrometheusRulesDown + eval_time: 80m + - alertname: AlloyForPrometheusRulesDown + eval_time: 160m + exp_alerts: + - exp_labels: + area: platform + cancel_if_outside_working_hours: "true" + cancel_if_cluster_status_creating: "true" + cancel_if_cluster_status_deleting: "true" + cancel_if_cluster_status_updating: "true" + cluster_id: golem + installation: golem + provider: capa + pipeline: testing + severity: page + team: atlas + topic: observability + exp_annotations: + description: "Alloy sending PrometheusRules to Loki and Mimir ruler is down." + opsrecipe: "prometheus-rules/" + + # Test LoggingAgentDown + - interval: 1m + input_series: + # For the first 60min: test with 1 pod: none, up, down + - series: 'up{container="alloy", cluster_id="gauss", cluster_type="management_cluster", installation="gauss", job="alloy-logs", pod="alloy-logs-1xxxx", provider="aws", pipeline="testing"}' + values: "_x20 1+0x20 0+0x40" + - series: kube_pod_info{cluster_id="gauss", cluster_type="management_cluster", installation="gauss", pod="alloy-logs-1xxxx", node="ip-10-0-5-1.eu-west-1.compute.internal", provider="aws", pipeline="testing"} + values: "1x180" + # From 60min: test with 2 pods: 1 up and 1 down, 2 up, 2 down. 
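+      # Illustrative note on the LoggingAgentDown expression exercised here: the rule is, roughly,
+      #   kube_pod_info * on(cluster_id, pod) group_left() up{job="alloy-logs", container="alloy"} == 0
+      # With group_left() the result keeps kube_pod_info's labels (notably `node`), and since
+      # kube_pod_info has value 1 the product is 0 exactly when the matching `up` target is down,
+      # so `== 0` selects the down agents while preserving the node label used for inhibitions.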
+ - series: 'up{container="alloy", cluster_id="gauss", cluster_type="management_cluster", installation="gauss", job="alloy-logs", pod="alloy-logs-2xxxx", provider="aws", pipeline="testing"}' + values: "_x80 1+0x40 1+0x20 0+0x40" + - series: kube_pod_info{cluster_id="gauss", cluster_type="management_cluster", installation="gauss", pod="alloy-logs-2xxxx", node="ip-10-0-5-2.eu-west-1.compute.internal", provider="aws", pipeline="testing"} + values: "1x180" + - series: 'up{container="alloy", cluster_type="management_cluster", cluster_id="gauss", installation="gauss", job="alloy-logs", pod="alloy-logs-3xxxx", provider="aws", pipeline="testing"}' + values: "_x80 0+0x40 1+0x20 0+0x40" + - series: kube_pod_info{cluster_id="gauss", cluster_type="management_cluster", installation="gauss", pod="alloy-logs-3xxxx", node="ip-10-0-5-3.eu-west-1.compute.internal", provider="aws", pipeline="testing"} + values: "1x180" + alert_rule_test: + - alertname: LoggingAgentDown + eval_time: 10m + - alertname: LoggingAgentDown + eval_time: 30m + - alertname: LoggingAgentDown + eval_time: 71m + exp_alerts: + - exp_labels: + area: platform + cancel_if_outside_working_hours: "true" + cancel_if_cluster_status_creating: "true" + cancel_if_cluster_status_deleting: "true" + cancel_if_cluster_status_updating: "true" + cancel_if_node_unschedulable: "true" + cancel_if_node_not_ready: "true" + cluster_id: gauss + cluster_type: management_cluster + installation: gauss + node: ip-10-0-5-1.eu-west-1.compute.internal + pipeline: testing + pod: alloy-logs-1xxxx + provider: aws + severity: page + team: atlas + topic: observability + exp_annotations: + dashboard: "53c1ecddc3a1d5d4b8d6cd0c23676c31/alloy-logs-overview" + description: "Scraping of all logging-agent pods to check if one failed every 30 minutes." + opsrecipe: "alloy/" + # Tests with 2 pods + - alertname: LoggingAgentDown + eval_time: 111m + exp_alerts: + - exp_labels: + area: platform + cancel_if_outside_working_hours: "true" + cancel_if_cluster_status_creating: "true" + cancel_if_cluster_status_deleting: "true" + cancel_if_cluster_status_updating: "true" + cancel_if_node_unschedulable: "true" + cancel_if_node_not_ready: "true" + cluster_id: gauss + cluster_type: management_cluster + installation: gauss + node: ip-10-0-5-3.eu-west-1.compute.internal + pipeline: testing + pod: alloy-logs-3xxxx + provider: aws + severity: page + team: atlas + topic: observability + exp_annotations: + dashboard: "53c1ecddc3a1d5d4b8d6cd0c23676c31/alloy-logs-overview" + description: "Scraping of all logging-agent pods to check if one failed every 30 minutes." + opsrecipe: "alloy/" + - alertname: LoggingAgentDown + eval_time: 121m + - alertname: LoggingAgentDown + eval_time: 180m + exp_alerts: + - exp_labels: + area: platform + cancel_if_outside_working_hours: "true" + cancel_if_cluster_status_creating: "true" + cancel_if_cluster_status_deleting: "true" + cancel_if_cluster_status_updating: "true" + cancel_if_node_unschedulable: "true" + cancel_if_node_not_ready: "true" + cluster_id: gauss + cluster_type: management_cluster + installation: gauss + node: ip-10-0-5-2.eu-west-1.compute.internal + pipeline: testing + pod: alloy-logs-2xxxx + provider: aws + severity: page + team: atlas + topic: observability + exp_annotations: + dashboard: "53c1ecddc3a1d5d4b8d6cd0c23676c31/alloy-logs-overview" + description: "Scraping of all logging-agent pods to check if one failed every 30 minutes." 
+ opsrecipe: "alloy/" + - exp_labels: + area: platform + cancel_if_outside_working_hours: "true" + cancel_if_cluster_status_creating: "true" + cancel_if_cluster_status_deleting: "true" + cancel_if_cluster_status_updating: "true" + cancel_if_node_unschedulable: "true" + cancel_if_node_not_ready: "true" + cluster_id: gauss + cluster_type: management_cluster + installation: gauss + node: ip-10-0-5-3.eu-west-1.compute.internal + pipeline: testing + pod: alloy-logs-3xxxx + provider: aws + severity: page + team: atlas + topic: observability + exp_annotations: + dashboard: "53c1ecddc3a1d5d4b8d6cd0c23676c31/alloy-logs-overview" + description: "Scraping of all logging-agent pods to check if one failed every 30 minutes." + opsrecipe: "alloy/" diff --git a/test/tests/providers/global/platform/atlas/alerting-rules/logging-pipeline.rules.test.yml b/test/tests/providers/global/platform/atlas/alerting-rules/logging-pipeline.rules.test.yml index 31217a0a7..fccbfa5a1 100644 --- a/test/tests/providers/global/platform/atlas/alerting-rules/logging-pipeline.rules.test.yml +++ b/test/tests/providers/global/platform/atlas/alerting-rules/logging-pipeline.rules.test.yml @@ -3,124 +3,6 @@ rule_files: - logging-pipeline.rules.yml tests: - # Test LoggingAgentDown - - interval: 1m - input_series: - # For the first 60min: test with 1 pod: none, up, down - - series: 'up{container="alloy", cluster_id="gauss", cluster_type="management_cluster", installation="gauss", job="alloy-logs", pod="alloy-1xxxx", provider="aws", pipeline="testing"}' - values: "_x20 1+0x20 0+0x40" - - series: kube_pod_info{cluster_id="gauss", cluster_type="management_cluster", installation="gauss", pod="alloy-1xxxx", node="ip-10-0-5-1.eu-west-1.compute.internal", provider="aws", pipeline="testing"} - values: "1x180" - # From 60min: test with 2 pods: 1 up and 1 down, 2 up, 2 down. - - series: 'up{container="alloy", cluster_id="gauss", cluster_type="management_cluster", installation="gauss", job="alloy-logs", pod="alloy-2xxxx", provider="aws", pipeline="testing"}' - values: "_x80 1+0x40 1+0x20 0+0x40" - - series: kube_pod_info{cluster_id="gauss", cluster_type="management_cluster", installation="gauss", pod="alloy-2xxxx", node="ip-10-0-5-2.eu-west-1.compute.internal", provider="aws", pipeline="testing"} - values: "1x180" - - series: 'up{container="alloy", cluster_type="management_cluster", cluster_id="gauss", installation="gauss", job="alloy-logs", pod="alloy-3xxxx", provider="aws", pipeline="testing"}' - values: "_x80 0+0x40 1+0x20 0+0x40" - - series: kube_pod_info{cluster_id="gauss", cluster_type="management_cluster", installation="gauss", pod="alloy-3xxxx", node="ip-10-0-5-3.eu-west-1.compute.internal", provider="aws", pipeline="testing"} - values: "1x180" - alert_rule_test: - - alertname: LoggingAgentDown - eval_time: 10m - - alertname: LoggingAgentDown - eval_time: 30m - - alertname: LoggingAgentDown - eval_time: 71m - exp_alerts: - - exp_labels: - area: platform - cancel_if_outside_working_hours: "true" - cancel_if_cluster_status_creating: "true" - cancel_if_cluster_status_deleting: "true" - cancel_if_cluster_status_updating: "true" - cancel_if_node_unschedulable: "true" - cancel_if_node_not_ready: "true" - cluster_id: gauss - cluster_type: management_cluster - installation: gauss - node: ip-10-0-5-1.eu-west-1.compute.internal - pipeline: testing - pod: alloy-1xxxx - provider: aws - severity: page - team: atlas - topic: observability - exp_annotations: - description: "Scraping of all alloy pods to check if one failed every 30 minutes." 
- opsrecipe: "alloy/" - # Tests with 2 pods - - alertname: LoggingAgentDown - eval_time: 111m - exp_alerts: - - exp_labels: - area: platform - cancel_if_outside_working_hours: "true" - cancel_if_cluster_status_creating: "true" - cancel_if_cluster_status_deleting: "true" - cancel_if_cluster_status_updating: "true" - cancel_if_node_unschedulable: "true" - cancel_if_node_not_ready: "true" - cluster_id: gauss - cluster_type: management_cluster - installation: gauss - node: ip-10-0-5-3.eu-west-1.compute.internal - pipeline: testing - pod: alloy-3xxxx - provider: aws - severity: page - team: atlas - topic: observability - exp_annotations: - description: "Scraping of all alloy pods to check if one failed every 30 minutes." - opsrecipe: "alloy/" - - alertname: LoggingAgentDown - eval_time: 121m - - alertname: LoggingAgentDown - eval_time: 180m - exp_alerts: - - exp_labels: - area: platform - cancel_if_outside_working_hours: "true" - cancel_if_cluster_status_creating: "true" - cancel_if_cluster_status_deleting: "true" - cancel_if_cluster_status_updating: "true" - cancel_if_node_unschedulable: "true" - cancel_if_node_not_ready: "true" - cluster_id: gauss - cluster_type: management_cluster - installation: gauss - node: ip-10-0-5-2.eu-west-1.compute.internal - pipeline: testing - pod: alloy-2xxxx - provider: aws - severity: page - team: atlas - topic: observability - exp_annotations: - description: "Scraping of all alloy pods to check if one failed every 30 minutes." - opsrecipe: "alloy/" - - exp_labels: - area: platform - cancel_if_outside_working_hours: "true" - cancel_if_cluster_status_creating: "true" - cancel_if_cluster_status_deleting: "true" - cancel_if_cluster_status_updating: "true" - cancel_if_node_unschedulable: "true" - cancel_if_node_not_ready: "true" - cluster_id: gauss - cluster_type: management_cluster - installation: gauss - node: ip-10-0-5-3.eu-west-1.compute.internal - pipeline: testing - pod: alloy-3xxxx - provider: aws - severity: page - team: atlas - topic: observability - exp_annotations: - description: "Scraping of all alloy pods to check if one failed every 30 minutes." - opsrecipe: "alloy/" # Test LogForwardingErrors - interval: 1m input_series: From b7d53b3b9fa88f1c9adb78c2a1536e02c3365aec Mon Sep 17 00:00:00 2001 From: Quentin Bisson Date: Tue, 5 Nov 2024 11:58:44 +0100 Subject: [PATCH 12/24] Update CHANGELOG.md --- CHANGELOG.md | 1 - 1 file changed, 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index e9ef272bf..8913d9edc 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -14,7 +14,6 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - `LoggingAgentDown` to be alerted when the logging agent is down. - `LogForwardingErrors` to be alerted when the `loki.write` component is failing. - `LogReceivingErrors` to be alerted when the `loki.source.api` components of the gateway is failing. - - `MonitoringAgentFailing` and `InhibitionMonitoringAgentFailing` to be alerted when the monitoring agent is not able to send metrics. 
### Changed From 1d49161dafc4edb7e726813909be5ffa1bde2851 Mon Sep 17 00:00:00 2001 From: Quentin Bisson Date: Tue, 5 Nov 2024 11:59:12 +0100 Subject: [PATCH 13/24] Update helm-operations.rules.yml --- .../honeybadger/recording-rules/helm-operations.rules.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/helm/prometheus-rules/templates/platform/honeybadger/recording-rules/helm-operations.rules.yml b/helm/prometheus-rules/templates/platform/honeybadger/recording-rules/helm-operations.rules.yml index 2675857f0..a703dce91 100644 --- a/helm/prometheus-rules/templates/platform/honeybadger/recording-rules/helm-operations.rules.yml +++ b/helm/prometheus-rules/templates/platform/honeybadger/recording-rules/helm-operations.rules.yml @@ -1,4 +1,4 @@ -# TODO(@giantswarm/team-honeybadger) - This is only used by the chart-operator, let's get rid of it when the chart operator is gone. +# TODO - This is only used by the chart-operator, let's get rid of it when the chart operator is gone. apiVersion: monitoring.coreos.com/v1 kind: PrometheusRule metadata: From 868779ee37fb7266bb3e0d6516b12076a4e965f8 Mon Sep 17 00:00:00 2001 From: Quentin Bisson Date: Tue, 5 Nov 2024 11:59:34 +0100 Subject: [PATCH 14/24] Update systemd.rules.yml --- .../templates/kaas/turtles/alerting-rules/systemd.rules.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/helm/prometheus-rules/templates/kaas/turtles/alerting-rules/systemd.rules.yml b/helm/prometheus-rules/templates/kaas/turtles/alerting-rules/systemd.rules.yml index a58297b73..370c1a1f6 100644 --- a/helm/prometheus-rules/templates/kaas/turtles/alerting-rules/systemd.rules.yml +++ b/helm/prometheus-rules/templates/kaas/turtles/alerting-rules/systemd.rules.yml @@ -10,7 +10,7 @@ spec: groups: - name: systemd rules: - ## TODO(@giantswarm/team-tenet) Update those lists when all vintage clusters are gone + ## TODO(@giantswarm/team-turtles) Update those lists when all vintage clusters are gone - alert: ClusterCriticalSystemdUnitFailed annotations: description: '{{`Critical systemd unit {{ $labels.name }} is failed on {{ $labels.instance }}.`}}' From c15aab7116d73b073592d54f1097665d46ac1d00 Mon Sep 17 00:00:00 2001 From: Quentin Bisson Date: Tue, 5 Nov 2024 12:00:00 +0100 Subject: [PATCH 15/24] Update helm/prometheus-rules/templates/platform/atlas/alerting-rules/deployment.workload-cluster.rules.yml --- .../atlas/alerting-rules/deployment.workload-cluster.rules.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/deployment.workload-cluster.rules.yml b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/deployment.workload-cluster.rules.yml index afbec4e1a..fa9087331 100644 --- a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/deployment.workload-cluster.rules.yml +++ b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/deployment.workload-cluster.rules.yml @@ -13,7 +13,7 @@ spec: groups: - name: deployment rules: - # TODO(@giantswarm/team-honeybadger) - This is only used by the chart-operator, let's get rid of it when the chart operator is gone. + # TODO - This is only used by the chart-operator, let's get rid of it when the chart operator is gone. 
- alert: WorkloadClusterDeploymentNotSatisfied annotations: description: '{{`Deployment {{ $labels.namespace}}/{{ $labels.deployment }} is not satisfied.`}}' From 393738de9af960358b6c22b97d2594d8424f1d75 Mon Sep 17 00:00:00 2001 From: Quentin Bisson Date: Tue, 5 Nov 2024 12:00:25 +0100 Subject: [PATCH 16/24] Update helm/prometheus-rules/templates/platform/atlas/alerting-rules/prometheus-operator.rules.yml --- .../platform/atlas/alerting-rules/prometheus-operator.rules.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/prometheus-operator.rules.yml b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/prometheus-operator.rules.yml index a9d130014..6628f6601 100644 --- a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/prometheus-operator.rules.yml +++ b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/prometheus-operator.rules.yml @@ -10,7 +10,7 @@ spec: groups: - name: prometheus-operator rules: - ## TODO(@giantswarm/team-atlas) - remove once all clusters are passed v20 + ## TODO(@giantswarm/team-atlas) remove once all clusters are passed v20 - alert: DuplicatePrometheusOperatorKubeletService annotations: description: '{{`Prometheus-operator in cluster {{ $labels.cluster_id }} has duplicate kubelet service.`}}' From 54f9f7217871f6f58f8cac8a74b87f53874a83de Mon Sep 17 00:00:00 2001 From: Quentin Bisson Date: Tue, 5 Nov 2024 12:00:50 +0100 Subject: [PATCH 17/24] Update helm/prometheus-rules/templates/platform/honeybadger/alerting-rules/chart.rules.yml --- .../platform/honeybadger/alerting-rules/chart.rules.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/helm/prometheus-rules/templates/platform/honeybadger/alerting-rules/chart.rules.yml b/helm/prometheus-rules/templates/platform/honeybadger/alerting-rules/chart.rules.yml index 99048fe7b..1a584734b 100644 --- a/helm/prometheus-rules/templates/platform/honeybadger/alerting-rules/chart.rules.yml +++ b/helm/prometheus-rules/templates/platform/honeybadger/alerting-rules/chart.rules.yml @@ -1,4 +1,4 @@ -# TODO(@giantswarm/team-honeybadger) - This is only used by the chart-operator, let's get rid of it when the chart operator is gone. +# TODO - This is only used by the chart-operator, let's get rid of it when the chart operator is gone. apiVersion: monitoring.coreos.com/v1 kind: PrometheusRule metadata: From bb9abda0caf0c9cfec17e5317739f6e4020fca6e Mon Sep 17 00:00:00 2001 From: Quentin Bisson Date: Tue, 5 Nov 2024 12:01:09 +0100 Subject: [PATCH 18/24] Update helm/prometheus-rules/templates/platform/honeybadger/alerting-rules/helm.rules.yml --- .../platform/honeybadger/alerting-rules/helm.rules.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/helm/prometheus-rules/templates/platform/honeybadger/alerting-rules/helm.rules.yml b/helm/prometheus-rules/templates/platform/honeybadger/alerting-rules/helm.rules.yml index 6ac690b2b..a07271c66 100644 --- a/helm/prometheus-rules/templates/platform/honeybadger/alerting-rules/helm.rules.yml +++ b/helm/prometheus-rules/templates/platform/honeybadger/alerting-rules/helm.rules.yml @@ -1,4 +1,4 @@ -# TODO(@giantswarm/team-honeybadger) - This is only used by the chart-operator, let's get rid of it when the chart operator is gone. +# TODO - This is only used by the chart-operator, let's get rid of it when the chart operator is gone. 
apiVersion: monitoring.coreos.com/v1 kind: PrometheusRule metadata: From 068d45dfd1d6cfdef6b3793031d60618e5529697 Mon Sep 17 00:00:00 2001 From: Quentin Bisson Date: Tue, 5 Nov 2024 12:01:27 +0100 Subject: [PATCH 19/24] Update test/tests/providers/global/platform/honeybadger/alerting-rules/helm-operations.rules.test.yml --- .../honeybadger/alerting-rules/helm-operations.rules.test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/tests/providers/global/platform/honeybadger/alerting-rules/helm-operations.rules.test.yml b/test/tests/providers/global/platform/honeybadger/alerting-rules/helm-operations.rules.test.yml index d20c15e33..54a65b1a6 100644 --- a/test/tests/providers/global/platform/honeybadger/alerting-rules/helm-operations.rules.test.yml +++ b/test/tests/providers/global/platform/honeybadger/alerting-rules/helm-operations.rules.test.yml @@ -1,4 +1,4 @@ -# TODO(@giantswarm/team-honeybadger) - This is only used by the chart-operator, let's get rid of it when the chart operator is gone. +# TODO - This is only used by the chart-operator, let's get rid of it when the chart operator is gone. --- rule_files: - helm-operations.rules.yml From 2f9c07c61cb749c48219853238576e1ea271a702 Mon Sep 17 00:00:00 2001 From: QuentinBisson Date: Tue, 5 Nov 2024 22:35:36 +0100 Subject: [PATCH 20/24] add alerts for alloy-metrics --- CHANGELOG.md | 6 ++ .../atlas/alerting-rules/alloy.rules.yml | 102 +++++++++++++++++- .../monitoring-pipeline.rules.yml | 80 ++++++++++++++ .../atlas/alerting-rules/prometheus.rules.yml | 59 +--------- .../atlas/alerting-rules/alloy.rules.test.yml | 35 ++++++ ...yml => monitoring-pipeline.rules.test.yml} | 60 ++++++++--- 6 files changed, 266 insertions(+), 76 deletions(-) create mode 100644 helm/prometheus-rules/templates/platform/atlas/alerting-rules/monitoring-pipeline.rules.yml rename test/tests/providers/global/platform/atlas/alerting-rules/{prometheus.rules.test.yml => monitoring-pipeline.rules.test.yml} (58%) diff --git a/CHANGELOG.md b/CHANGELOG.md index 92d0a37ec..b90b0378a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -14,6 +14,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - `LoggingAgentDown` to be alerted when the logging agent is down. - `LogForwardingErrors` to be alerted when the `loki.write` component is failing. - `LogReceivingErrors` to be alerted when the `loki.source.api` components of the gateway is failing. + - `MonitoringAgentDown` to be alerted when the monitoring agent is down. + - `MonitoringAgentShardsNotSatisfied` to be alerted when the monitoring agent is missing any number of desired shards. ### Changed @@ -23,6 +25,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - `observability-gateway` - Move all `grafana-cloud` related alerts to their own file. - Move all alloy related alerts to the alloy alert file and fix alloy-logs tests. 
+- Rename and move the following alerts as they are not specific to Prometheus: + - `PrometheusCriticalJobScrapingFailure` => `CriticalJobScrapingFailure` + - `PrometheusJobScrapingFailure` => `JobScrapingFailure` + - `PrometheusFailsToCommunicateWithRemoteStorageAPI` => `MetricForwardingErrors` ## [4.23.0] - 2024-10-30 diff --git a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/alloy.rules.yml b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/alloy.rules.yml index 8b3e6256c..7d984b2dd 100644 --- a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/alloy.rules.yml +++ b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/alloy.rules.yml @@ -1,5 +1,5 @@ # This files describe common alloy alerting rules -# For alerts regarding monitoring and logging agents, please go to the respective files (logging.rules.yml and monitoring.rules.yml). +# For alerts regarding the monitoring pipeline and the logging pipeline, please go to the respective files (logging-pipeline.rules.yml and monitoring-pipeline.rules.yml). apiVersion: monitoring.coreos.com/v1 kind: PrometheusRule metadata: @@ -91,3 +91,103 @@ spec: cancel_if_cluster_status_updating: "true" cancel_if_node_unschedulable: "true" cancel_if_node_not_ready: "true" + - name: alloy.metrics + rules: + # This alert pages if monitoring-agent fails to send samples to its remote write endpoint. + - alert: MonitoringAgentDown + annotations: + description: '{{`Monitoring agent fails to send samples.`}}' + summary: Monitoring agent fails to send samples to remote write endpoint. + opsrecipe: alloy/#monitoring-agent-down + dashboard: promRW001/prometheus-remote-write + expr: |- + count( + label_replace( + capi_cluster_status_condition{type="ControlPlaneReady", status="True"}, + "cluster_id", + "$1", + "name", + "(.*)" + ) == 1 + ) by (cluster_id, installation, pipeline, provider) > 0 + unless on (cluster_id) ( + count(up{job="alloy-metrics"} > 0) by (cluster_id) + ) + for: 20m + labels: + area: platform + severity: page + team: atlas + topic: observability + inhibit_monitoring_agent_down: "true" + cancel_if_cluster_status_creating: "true" + cancel_if_cluster_status_deleting: "true" + cancel_if_cluster_has_no_workers: "true" + ## Same as MonitoringAgentDown, but triggers inhibition earlier and does not page. + - alert: InhibitionMonitoringAgentDown + annotations: + description: '{{`Monitoring agent fails to send samples.`}}' + summary: Monitoring agent fails to send samples to remote write endpoint. + opsrecipe: alloy/#monitoring-agent-down + dashboard: promRW001/prometheus-remote-write + expr: |- + count( + label_replace( + capi_cluster_status_condition{type="ControlPlaneReady", status="True"}, + "cluster_id", + "$1", + "name", + "(.*)" + ) == 1 + ) by (cluster_id, installation, pipeline, provider) > 0 + unless on (cluster_id) ( + count(up{job="alloy-metrics"} > 0) by (cluster_id) + ) + for: 2m + labels: + area: platform + severity: none + team: atlas + topic: observability + inhibit_monitoring_agent_down: "true" + cancel_if_cluster_status_creating: "true" + cancel_if_cluster_status_deleting: "true" + ## This alert pages if any of the monitoring-agent shard is not running. + - alert: MonitoringAgentShardsNotSatisfied + annotations: + description: '{{`At least one of the monitoring agent shard is missing.`}}' + summary: Missing agent is missing shards. 
+ opsrecipe: alloy/#monitoring-agent-down + expr: |- + kube_statefulset_status_replicas{statefulset="alloy-metrics"} + - kube_statefulset_status_replicas_ready{statefulset="alloy-metrics"} + > 0 + for: 40m + labels: + area: platform + severity: page + team: atlas + topic: observability + inhibit_monitoring_agent_down: "true" + cancel_if_cluster_status_creating: "true" + cancel_if_cluster_status_deleting: "true" + cancel_if_outside_working_hours: "true" + ## Same as MonitoringAgentShardsNotSatisfied but triggers inhibition earlier, and does not page. + - alert: InhibitionMonitoringAgentShardsNotSatisfied + annotations: + description: '{{`At least one of the monitoring agent shard is missing.`}}' + summary: Missing agent is missing shards. + opsrecipe: alloy/#monitoring-agent-down + expr: |- + kube_statefulset_status_replicas{statefulset="alloy-metrics"} + - kube_statefulset_status_replicas_ready{statefulset="alloy-metrics"} + > 0 + for: 2m + labels: + area: platform + severity: none + team: atlas + topic: observability + inhibit_monitoring_agent_down: "true" + cancel_if_cluster_status_creating: "true" + cancel_if_cluster_status_deleting: "true" diff --git a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/monitoring-pipeline.rules.yml b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/monitoring-pipeline.rules.yml new file mode 100644 index 000000000..e666ea277 --- /dev/null +++ b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/monitoring-pipeline.rules.yml @@ -0,0 +1,80 @@ +apiVersion: monitoring.coreos.com/v1 +kind: PrometheusRule +metadata: + labels: + {{- include "labels.common" . | nindent 4 }} + name: monitoring-pipeline.rules + namespace: {{ .Values.namespace }} +spec: + groups: + - name: monitoring-pipeline + rules: + - alert: MetricForwardingErrors + annotations: + description: '{{`Monitoring agent can''t communicate with Remote Storage API at {{ $labels.url }}.`}}' + opsrecipe: monitoring-pipeline/ + dashboard: promRW001/prometheus-remote-write + expr: |- + rate(prometheus_remote_storage_samples_failed_total[10m]) > 0.1 + or rate(prometheus_remote_storage_samples_total[10m]) == 0 + or rate(prometheus_remote_storage_metadata_retried_total[10m]) > 0 + for: 1h + labels: + area: platform + cancel_if_outside_working_hours: "true" + severity: page + team: atlas + topic: observability + - alert: JobScrapingFailure + annotations: + dashboard: servicemonitors-details/servicemonitors-details + description: '{{`Monitoring agents for cluster {{$labels.installation}}/{{$labels.cluster_id}} has failed to scrape all targets in {{$labels.job}} job.`}}' + summary: Monitoring agent failed to scrape all targets in a job. + opsrecipe: monitoring-job-scraping-failure/ + expr: |- + ( + count(up == 0) by (job, installation, cluster_id, provider, pipeline) + / + count(up) by (job, installation, cluster_id, provider, pipeline) + ) >= 1 + for: 1d + labels: + area: platform + severity: notify + team: atlas + topic: observability + cancel_if_outside_working_hours: "true" + - alert: CriticalJobScrapingFailure + annotations: + dashboard: servicemonitors-details/servicemonitors-details + description: '{{`Monitoring agents for cluster {{$labels.installation}}/{{$labels.cluster_id}} has failed to scrape all targets in {{$labels.job}} job.`}}' + summary: Monitoring agent failed to scrape all targets in a job. 
+ opsrecipe: monitoring-job-scraping-failure/ + ## We ignore bastion hosts node exporters + expr: |- + ( + count( + ( + up{job=~".*(apiserver|kube-controller-manager|kube-scheduler|node-exporter|kube-state-metrics).*"} + or + up{job="kubelet", metrics_path="/metrics"} + ) == 0 + ) by (job, installation, cluster_id, provider, pipeline) + / + count( + up{job=~".*(apiserver|kube-controller-manager|kube-scheduler|node-exporter|kube-state-metrics).*"} + or + up{job="kubelet", metrics_path="/metrics"} + ) by (job, installation, cluster_id, provider, pipeline) + ) >= 1 + for: 3d + labels: + area: platform + severity: page + team: atlas + topic: observability + cancel_if_outside_working_hours: "true" + cancel_if_cluster_is_not_running_monitoring_agent: "true" + cancel_if_cluster_status_creating: "true" + cancel_if_cluster_status_deleting: "true" + diff --git a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/prometheus.rules.yml b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/prometheus.rules.yml index a0bd48fe9..7b48759a8 100644 --- a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/prometheus.rules.yml +++ b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/prometheus.rules.yml @@ -1,3 +1,4 @@ +# TODO(@giantswarm/team-atlas): revisit once vintage is gone apiVersion: monitoring.coreos.com/v1 kind: PrometheusRule metadata: @@ -26,19 +27,6 @@ spec: severity: page team: atlas topic: observability - - alert: PrometheusFailsToCommunicateWithRemoteStorageAPI - annotations: - description: '{{`Prometheus can''t communicate with Remote Storage API at {{ $labels.url }}.`}}' - opsrecipe: prometheus-cant-communicate-with-remote-storage-api/ - dashboard: promRW001/prometheus-remote-write - expr: rate(prometheus_remote_storage_samples_failed_total[10m]) > 0.1 or rate(prometheus_remote_storage_samples_total[10m]) == 0 or rate(prometheus_remote_storage_metadata_retried_total[10m]) > 0 - for: 1h - labels: - area: platform - cancel_if_outside_working_hours: "true" - severity: page - team: atlas - topic: observability - alert: PrometheusRuleFailures annotations: description: {{`Prometheus {{$labels.installation}}/{{$labels.cluster_id}} has failed to evaluate rule(s) {{ printf "%.2f" $value }} time(s).`}} @@ -52,48 +40,3 @@ spec: team: atlas topic: observability cancel_if_outside_working_hours: "true" - - alert: PrometheusJobScrapingFailure - annotations: - description: {{`Prometheus {{$labels.installation}}/{{$labels.cluster_id}} has failed to scrape all targets in {{$labels.job}} job.`}} - summary: Prometheus fails to scrape all targets in a job. - opsrecipe: prometheus-job-scraping-failure/ - expr: (count(up == 0) BY (job, installation, cluster_id, provider, pipeline) / count(up) BY (job, installation, cluster_id, provider, pipeline)) == 1 - for: 1d - labels: - area: platform - severity: notify - team: atlas - topic: observability - cancel_if_outside_working_hours: "true" - - alert: PrometheusCriticalJobScrapingFailure - annotations: - description: {{`Prometheus {{$labels.installation}}/{{$labels.cluster_id}} has failed to scrape all targets in {{$labels.job}} job.`}} - summary: Prometheus fails to scrape all targets in a job. 
- opsrecipe: prometheus-job-scraping-failure/ - ## We ignore bastion hosts node exporters - expr: |- - ( - count( - ( - up{job=~"apiserver|kube-controller-manager|kube-scheduler|node-exporter|kube-state-metrics"} - or - up{job="kubelet", metrics_path="/metrics"} - ) == 0 - ) BY (job, installation, cluster_id, provider, pipeline) - / - count( - up{job=~"apiserver|kube-controller-manager|kube-scheduler|node-exporter|kube-state-metrics"} - or - up{job="kubelet", metrics_path="/metrics"} - ) BY (job, installation, cluster_id, provider, pipeline) - ) == 1 - for: 3d - labels: - area: platform - severity: page - team: atlas - topic: observability - cancel_if_outside_working_hours: "true" - cancel_if_cluster_is_not_running_monitoring_agent: "true" - cancel_if_cluster_status_creating: "true" - cancel_if_cluster_status_deleting: "true" diff --git a/test/tests/providers/global/platform/atlas/alerting-rules/alloy.rules.test.yml b/test/tests/providers/global/platform/atlas/alerting-rules/alloy.rules.test.yml index d8b9309a5..90e75a3fe 100644 --- a/test/tests/providers/global/platform/atlas/alerting-rules/alloy.rules.test.yml +++ b/test/tests/providers/global/platform/atlas/alerting-rules/alloy.rules.test.yml @@ -86,6 +86,7 @@ tests: eval_time: 80m - alertname: AlloyForPrometheusRulesDown eval_time: 160m + exp_alerts: - exp_labels: area: platform @@ -226,3 +227,37 @@ tests: dashboard: "53c1ecddc3a1d5d4b8d6cd0c23676c31/alloy-logs-overview" description: "Scraping of all logging-agent pods to check if one failed every 30 minutes." opsrecipe: "alloy/" + + # Test MonitoringAgentDown + - interval: 1m + input_series: + - series: 'up{job="alloy-metrics", cluster_id="gauss", installation="gauss", provider="aws", pipeline="testing"}' + values: "_x20 1+0x70 0+0x70" + - series: 'capi_cluster_status_condition{type="ControlPlaneReady", status="True", name="gauss", installation="gauss", provider="aws", pipeline="testing"}' + values: "1x150" + alert_rule_test: + - alertname: MonitoringAgentDown + eval_time: 10m + - alertname: MonitoringAgentDown + eval_time: 80m + - alertname: MonitoringAgentDown + eval_time: 140m + exp_alerts: + - exp_labels: + area: platform + cancel_if_cluster_has_no_workers: "true" + cancel_if_cluster_status_creating: "true" + cancel_if_cluster_status_deleting: "true" + inhibit_monitoring_agent_down: "true" + cluster_id: gauss + installation: gauss + provider: aws + pipeline: testing + severity: page + team: atlas + topic: observability + exp_annotations: + description: "Monitoring agent fails to send samples." + opsrecipe: "alloy/#monitoring-agent-down" + dashboard: "promRW001/prometheus-remote-write" + summary: "Monitoring agent fails to send samples to remote write endpoint." diff --git a/test/tests/providers/global/platform/atlas/alerting-rules/prometheus.rules.test.yml b/test/tests/providers/global/platform/atlas/alerting-rules/monitoring-pipeline.rules.test.yml similarity index 58% rename from test/tests/providers/global/platform/atlas/alerting-rules/prometheus.rules.test.yml rename to test/tests/providers/global/platform/atlas/alerting-rules/monitoring-pipeline.rules.test.yml index 77cdd2167..ad97acbb7 100644 --- a/test/tests/providers/global/platform/atlas/alerting-rules/prometheus.rules.test.yml +++ b/test/tests/providers/global/platform/atlas/alerting-rules/monitoring-pipeline.rules.test.yml @@ -1,13 +1,13 @@ --- rule_files: - - prometheus.rules.yml + - monitoring-pipeline.rules.yml # Setting evaluation interval to 1h # to make it faster on long test duration. 
evaluation_interval: 1h tests: - # Test PrometheusJobScrapingFailure and PrometheusCriticalJobScrapingFailure + # Test JobScrapingFailure and CriticalJobScrapingFailure - interval: 1h input_series: - series: 'up{job="apiserver", installation="gauss", cluster_id="gauss", provider="aws", pipeline="testing"}' @@ -30,14 +30,14 @@ tests: - series: 'up{job="app-exporter", installation="gauss", cluster_id="gauss", provider="aws", pipeline="testing"}' values: "1+0x120 0+0x120" alert_rule_test: - - alertname: PrometheusCriticalJobScrapingFailure + - alertname: CriticalJobScrapingFailure eval_time: 30m - - alertname: PrometheusJobScrapingFailure + - alertname: JobScrapingFailure eval_time: 1d - - alertname: PrometheusCriticalJobScrapingFailure + - alertname: CriticalJobScrapingFailure eval_time: 4d # This alert fires for both critical and non-critical targets - - alertname: PrometheusJobScrapingFailure + - alertname: JobScrapingFailure eval_time: 7d exp_alerts: - exp_labels: @@ -52,9 +52,10 @@ tests: pipeline: "testing" job: "kube-controller-manager" exp_annotations: - opsrecipe: "prometheus-job-scraping-failure/" - summary: "Prometheus fails to scrape all targets in a job." - description: "Prometheus gauss/gauss has failed to scrape all targets in kube-controller-manager job." + dashboard: servicemonitors-details/servicemonitors-details + opsrecipe: "monitoring-job-scraping-failure/" + summary: "Monitoring agent failed to scrape all targets in a job." + description: "Monitoring agents for cluster gauss/gauss has failed to scrape all targets in kube-controller-manager job." - exp_labels: area: platform severity: notify @@ -67,12 +68,13 @@ tests: pipeline: "testing" job: "app-exporter" exp_annotations: - opsrecipe: "prometheus-job-scraping-failure/" - summary: "Prometheus fails to scrape all targets in a job." - description: "Prometheus gauss/gauss has failed to scrape all targets in app-exporter job." - + dashboard: servicemonitors-details/servicemonitors-details + opsrecipe: "monitoring-job-scraping-failure/" + summary: "Monitoring agent failed to scrape all targets in a job." + description: "Monitoring agents for cluster gauss/gauss has failed to scrape all targets in app-exporter job." + # This fires only for critical target down. - - alertname: PrometheusCriticalJobScrapingFailure + - alertname: CriticalJobScrapingFailure eval_time: 9d exp_alerts: - exp_labels: @@ -90,6 +92,30 @@ tests: cancel_if_cluster_status_creating: "true" cancel_if_cluster_status_deleting: "true" exp_annotations: - opsrecipe: "prometheus-job-scraping-failure/" - summary: "Prometheus fails to scrape all targets in a job." - description: "Prometheus gauss/gauss has failed to scrape all targets in kube-controller-manager job." + dashboard: servicemonitors-details/servicemonitors-details + opsrecipe: "monitoring-job-scraping-failure/" + summary: "Monitoring agent failed to scrape all targets in a job." + description: "Monitoring agents for cluster gauss/gauss has failed to scrape all targets in kube-controller-manager job." 
+ + + # Test MetricForwardingErrors + - interval: 1m + input_series: + # remote write has no failure for 1 hour and then fails for 2 hours + - series: 'prometheus_remote_storage_samples_failed_total{url="http://remote-storage_samples_failed_total"}' + values: "0+0x60 0+100x120" + alert_rule_test: + - alertname: MetricForwardingErrors + eval_time: 180m + exp_alerts: + - exp_labels: + area: platform + severity: page + team: atlas + topic: observability + cancel_if_outside_working_hours: "true" + url: "http://remote-storage_samples_failed_total" + exp_annotations: + description: "Monitoring agent can't communicate with Remote Storage API at http://remote-storage_samples_failed_total." + opsrecipe: "monitoring-pipeline/" + dashboard: "promRW001/prometheus-remote-write" From 4f9e241321c55610620453610a2cd2baccdefb50 Mon Sep 17 00:00:00 2001 From: QuentinBisson Date: Thu, 7 Nov 2024 20:52:54 +0100 Subject: [PATCH 21/24] improve monitoring agent down tests --- .../atlas/alerting-rules/alloy.rules.test.yml | 85 ++++++++++++++++++- 1 file changed, 84 insertions(+), 1 deletion(-) diff --git a/test/tests/providers/global/platform/atlas/alerting-rules/alloy.rules.test.yml b/test/tests/providers/global/platform/atlas/alerting-rules/alloy.rules.test.yml index 598b51ce2..749f9d916 100644 --- a/test/tests/providers/global/platform/atlas/alerting-rules/alloy.rules.test.yml +++ b/test/tests/providers/global/platform/atlas/alerting-rules/alloy.rules.test.yml @@ -230,14 +230,77 @@ tests: - interval: 1m input_series: - series: 'up{job="alloy-metrics", cluster_id="gauss", installation="gauss", provider="aws", pipeline="testing"}' - values: "_x20 1+0x70 0+0x70" + values: "_x40 1+0x50 0+0x70" - series: 'capi_cluster_status_condition{type="ControlPlaneReady", status="True", name="gauss", installation="gauss", provider="aws", pipeline="testing"}' values: "1x150" alert_rule_test: - alertname: MonitoringAgentDown eval_time: 10m + - alertname: InhibitionMonitoringAgentDown + eval_time: 10m + exp_alerts: + - exp_labels: + area: platform + cancel_if_cluster_status_creating: "true" + cancel_if_cluster_status_deleting: "true" + inhibit_monitoring_agent_down: "true" + cluster_id: gauss + installation: gauss + provider: aws + pipeline: testing + severity: none + team: atlas + topic: observability + exp_annotations: + description: "Monitoring agent fails to send samples." + opsrecipe: "alloy/#monitoring-agent-down" + dashboard: "promRW001/prometheus-remote-write" + summary: "Monitoring agent fails to send samples to remote write endpoint." + - alertname: MonitoringAgentDown + eval_time: 30m + exp_alerts: + - exp_labels: + area: platform + cancel_if_cluster_has_no_workers: "true" + cancel_if_cluster_status_creating: "true" + cancel_if_cluster_status_deleting: "true" + inhibit_monitoring_agent_down: "true" + cluster_id: gauss + installation: gauss + provider: aws + pipeline: testing + severity: page + team: atlas + topic: observability + exp_annotations: + description: "Monitoring agent fails to send samples." + opsrecipe: "alloy/#monitoring-agent-down" + dashboard: "promRW001/prometheus-remote-write" + summary: "Monitoring agent fails to send samples to remote write endpoint." 
+ - alertname: InhibitionMonitoringAgentDown + eval_time: 30m + exp_alerts: + - exp_labels: + area: platform + cancel_if_cluster_status_creating: "true" + cancel_if_cluster_status_deleting: "true" + inhibit_monitoring_agent_down: "true" + cluster_id: gauss + installation: gauss + provider: aws + pipeline: testing + severity: none + team: atlas + topic: observability + exp_annotations: + description: "Monitoring agent fails to send samples." + opsrecipe: "alloy/#monitoring-agent-down" + dashboard: "promRW001/prometheus-remote-write" + summary: "Monitoring agent fails to send samples to remote write endpoint." - alertname: MonitoringAgentDown eval_time: 80m + - alertname: InhibitionMonitoringAgentDown + eval_time: 80m - alertname: MonitoringAgentDown eval_time: 140m exp_alerts: @@ -259,3 +322,23 @@ tests: opsrecipe: "alloy/#monitoring-agent-down" dashboard: "promRW001/prometheus-remote-write" summary: "Monitoring agent fails to send samples to remote write endpoint." + - alertname: InhibitionMonitoringAgentDown + eval_time: 140m + exp_alerts: + - exp_labels: + area: platform + cancel_if_cluster_status_creating: "true" + cancel_if_cluster_status_deleting: "true" + inhibit_monitoring_agent_down: "true" + cluster_id: gauss + installation: gauss + provider: aws + pipeline: testing + severity: none + team: atlas + topic: observability + exp_annotations: + description: "Monitoring agent fails to send samples." + opsrecipe: "alloy/#monitoring-agent-down" + dashboard: "promRW001/prometheus-remote-write" + summary: "Monitoring agent fails to send samples to remote write endpoint." From 553a1a49dd525938c3abec0c940345f1e3813d61 Mon Sep 17 00:00:00 2001 From: QuentinBisson Date: Thu, 7 Nov 2024 21:03:03 +0100 Subject: [PATCH 22/24] improve monitoring agent shards not satisfied tests --- .../atlas/alerting-rules/alloy.rules.yml | 4 +- .../atlas/alerting-rules/alloy.rules.test.yml | 80 +++++++++++++++++++ 2 files changed, 82 insertions(+), 2 deletions(-) diff --git a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/alloy.rules.yml b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/alloy.rules.yml index 13b1a3d85..fc364f285 100644 --- a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/alloy.rules.yml +++ b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/alloy.rules.yml @@ -156,7 +156,7 @@ spec: - alert: MonitoringAgentShardsNotSatisfied annotations: description: '{{`At least one of the monitoring agent shard is missing.`}}' - summary: Missing agent is missing shards. + summary: Monitoring agent is missing shards. opsrecipe: alloy/#monitoring-agent-down expr: |- kube_statefulset_status_replicas{statefulset="alloy-metrics"} @@ -176,7 +176,7 @@ spec: - alert: InhibitionMonitoringAgentShardsNotSatisfied annotations: description: '{{`At least one of the monitoring agent shard is missing.`}}' - summary: Missing agent is missing shards. + summary: Monitoring agent is missing shards. 
opsrecipe: alloy/#monitoring-agent-down expr: |- kube_statefulset_status_replicas{statefulset="alloy-metrics"} diff --git a/test/tests/providers/global/platform/atlas/alerting-rules/alloy.rules.test.yml b/test/tests/providers/global/platform/atlas/alerting-rules/alloy.rules.test.yml index 749f9d916..40aa3e248 100644 --- a/test/tests/providers/global/platform/atlas/alerting-rules/alloy.rules.test.yml +++ b/test/tests/providers/global/platform/atlas/alerting-rules/alloy.rules.test.yml @@ -342,3 +342,83 @@ tests: opsrecipe: "alloy/#monitoring-agent-down" dashboard: "promRW001/prometheus-remote-write" summary: "Monitoring agent fails to send samples to remote write endpoint." + + # Test MonitoringAgentShardsNotSatisfied + - interval: 1m + input_series: + - series: 'kube_statefulset_status_replicas{statefulset="alloy-metrics", cluster_id="gauss", installation="gauss", provider="aws", pipeline="testing"}' + values: "3+0x50 3+0x50 3+0x50" + - series: 'kube_statefulset_status_replicas_ready{statefulset="alloy-metrics", cluster_id="gauss", installation="gauss", provider="aws", pipeline="testing"}' + values: "3+0x10 2+0x90 3+0x50" + alert_rule_test: + - alertname: MonitoringAgentShardsNotSatisfied + eval_time: 10m + - alertname: MonitoringAgentShardsNotSatisfied + eval_time: 30m + - alertname: MonitoringAgentShardsNotSatisfied + eval_time: 30m + - alertname: InhibitionMonitoringAgentShardsNotSatisfied + eval_time: 30m + exp_alerts: + - exp_labels: + area: platform + cancel_if_cluster_status_creating: "true" + cancel_if_cluster_status_deleting: "true" + cluster_id: gauss + installation: gauss + provider: aws + pipeline: testing + severity: none + statefulset: alloy-metrics + team: atlas + topic: observability + inhibit_monitoring_agent_down: "true" + exp_annotations: + description: "At least one of the monitoring agent shard is missing." + summary: "Monitoring agent is missing shards." + opsrecipe: "alloy/#monitoring-agent-down" + - alertname: MonitoringAgentShardsNotSatisfied + eval_time: 60m + exp_alerts: + - exp_labels: + area: platform + cancel_if_cluster_status_creating: "true" + cancel_if_cluster_status_deleting: "true" + cancel_if_outside_working_hours: "true" + cluster_id: gauss + installation: gauss + provider: aws + pipeline: testing + severity: page + statefulset: alloy-metrics + team: atlas + topic: observability + inhibit_monitoring_agent_down: "true" + exp_annotations: + description: "At least one of the monitoring agent shard is missing." + summary: "Monitoring agent is missing shards." + opsrecipe: "alloy/#monitoring-agent-down" + - alertname: InhibitionMonitoringAgentShardsNotSatisfied + eval_time: 60m + exp_alerts: + - exp_labels: + area: platform + cancel_if_cluster_status_creating: "true" + cancel_if_cluster_status_deleting: "true" + cluster_id: gauss + installation: gauss + provider: aws + pipeline: testing + severity: none + statefulset: alloy-metrics + team: atlas + topic: observability + inhibit_monitoring_agent_down: "true" + exp_annotations: + description: "At least one of the monitoring agent shard is missing." + summary: "Monitoring agent is missing shards." 
+ opsrecipe: "alloy/#monitoring-agent-down" + - alertname: MonitoringAgentShardsNotSatisfied + eval_time: 130m + - alertname: InhibitionMonitoringAgentShardsNotSatisfied + eval_time: 130m From c7b460b7ae8b301566d6be15294c692f05f7e2f3 Mon Sep 17 00:00:00 2001 From: Quentin Bisson Date: Thu, 7 Nov 2024 23:21:58 +0100 Subject: [PATCH 23/24] Update test/tests/providers/global/platform/atlas/alerting-rules/alloy.rules.test.yml MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Hervé Nicol --- .../global/platform/atlas/alerting-rules/alloy.rules.test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/tests/providers/global/platform/atlas/alerting-rules/alloy.rules.test.yml b/test/tests/providers/global/platform/atlas/alerting-rules/alloy.rules.test.yml index 40aa3e248..36c8bc5f5 100644 --- a/test/tests/providers/global/platform/atlas/alerting-rules/alloy.rules.test.yml +++ b/test/tests/providers/global/platform/atlas/alerting-rules/alloy.rules.test.yml @@ -347,7 +347,7 @@ tests: - interval: 1m input_series: - series: 'kube_statefulset_status_replicas{statefulset="alloy-metrics", cluster_id="gauss", installation="gauss", provider="aws", pipeline="testing"}' - values: "3+0x50 3+0x50 3+0x50" + values: "3+0x10 3+0x90 3+0x50" - series: 'kube_statefulset_status_replicas_ready{statefulset="alloy-metrics", cluster_id="gauss", installation="gauss", provider="aws", pipeline="testing"}' values: "3+0x10 2+0x90 3+0x50" alert_rule_test: From b476eac9548fdda9a36aa31d168be63f1b980186 Mon Sep 17 00:00:00 2001 From: Quentin Bisson Date: Thu, 7 Nov 2024 23:22:13 +0100 Subject: [PATCH 24/24] Update test/tests/providers/global/platform/atlas/alerting-rules/alloy.rules.test.yml MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Hervé Nicol --- .../global/platform/atlas/alerting-rules/alloy.rules.test.yml | 2 -- 1 file changed, 2 deletions(-) diff --git a/test/tests/providers/global/platform/atlas/alerting-rules/alloy.rules.test.yml b/test/tests/providers/global/platform/atlas/alerting-rules/alloy.rules.test.yml index 36c8bc5f5..98549b422 100644 --- a/test/tests/providers/global/platform/atlas/alerting-rules/alloy.rules.test.yml +++ b/test/tests/providers/global/platform/atlas/alerting-rules/alloy.rules.test.yml @@ -355,8 +355,6 @@ tests: eval_time: 10m - alertname: MonitoringAgentShardsNotSatisfied eval_time: 30m - - alertname: MonitoringAgentShardsNotSatisfied - eval_time: 30m - alertname: InhibitionMonitoringAgentShardsNotSatisfied eval_time: 30m exp_alerts: