add alerts for alloy-metrics #1417

Merged
30 commits merged on Nov 12, 2024

Commits
96364d1
add sensible alerts for alloy
QuentinBisson Oct 29, 2024
9c2f655
wip - add ongoing alerts
QuentinBisson Oct 29, 2024
2f24521
Merge branch 'main' into alloy-monitoring
QuentinBisson Oct 29, 2024
55078dd
add dashboard annotation
QuentinBisson Oct 30, 2024
c111250
Merge branch 'main' into alloy-monitoring
QuentinBisson Oct 30, 2024
759ae7f
Update helm/prometheus-rules/templates/platform/atlas/alerting-rules/…
QuentinBisson Oct 30, 2024
9cee93a
Update prometheus.rules.yml
QuentinBisson Oct 30, 2024
14d67c3
Update helm/prometheus-rules/templates/platform/atlas/alerting-rules/…
QuentinBisson Oct 30, 2024
b9c1dea
Update helm/prometheus-rules/templates/platform/atlas/alerting-rules/…
QuentinBisson Oct 30, 2024
09929b0
Update helm/prometheus-rules/templates/platform/atlas/alerting-rules/…
QuentinBisson Oct 30, 2024
40452a5
add missing tests
QuentinBisson Oct 30, 2024
fbc9c8d
change based on ops-recipes
QuentinBisson Nov 4, 2024
a9713a5
Merge branch 'main' into alloy-monitoring
QuentinBisson Nov 4, 2024
9e72664
Clean up some rules a bit
QuentinBisson Nov 5, 2024
b7d53b3
Update CHANGELOG.md
QuentinBisson Nov 5, 2024
1d49161
Update helm-operations.rules.yml
QuentinBisson Nov 5, 2024
868779e
Update systemd.rules.yml
QuentinBisson Nov 5, 2024
c15aab7
Update helm/prometheus-rules/templates/platform/atlas/alerting-rules/…
QuentinBisson Nov 5, 2024
393738d
Update helm/prometheus-rules/templates/platform/atlas/alerting-rules/…
QuentinBisson Nov 5, 2024
54f9f72
Update helm/prometheus-rules/templates/platform/honeybadger/alerting-…
QuentinBisson Nov 5, 2024
bb9abda
Update helm/prometheus-rules/templates/platform/honeybadger/alerting-…
QuentinBisson Nov 5, 2024
068d45d
Update test/tests/providers/global/platform/honeybadger/alerting-rule…
QuentinBisson Nov 5, 2024
79dbfda
Merge branch 'alloy-monitoring' into alloy-monitoring-monitoring
QuentinBisson Nov 5, 2024
2f9c07c
add alerts for alloy-metrics
QuentinBisson Nov 5, 2024
263f93b
Merge branch 'alloy-monitoring-monitoring' into alerts-for-alloy-metrics
QuentinBisson Nov 5, 2024
70095ef
Merge branch 'main' into alerts-for-alloy-metrics
QuentinBisson Nov 7, 2024
4f9e241
improve monitoring agent down tests
QuentinBisson Nov 7, 2024
553a1a4
improve monitoring agent shards not satisfied tests
QuentinBisson Nov 7, 2024
c7b460b
Update test/tests/providers/global/platform/atlas/alerting-rules/allo…
QuentinBisson Nov 7, 2024
b476eac
Update test/tests/providers/global/platform/atlas/alerting-rules/allo…
QuentinBisson Nov 7, 2024

Changes from all commits

6 changes: 6 additions & 0 deletions CHANGELOG.md
@@ -14,6 +14,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- `LoggingAgentDown` to be alerted when the logging agent is down.
- `LogForwardingErrors` to be alerted when the `loki.write` component is failing.
- `LogReceivingErrors` to be alerted when the `loki.source.api` components of the gateway are failing.
- `MonitoringAgentDown` to be alerted when the monitoring agent is down.
- `MonitoringAgentShardsNotSatisfied` to be alerted when the monitoring agent is missing any number of desired shards.

### Changed

@@ -23,6 +25,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- `observability-gateway`
- Move all `grafana-cloud` related alerts to their own file.
- Move all alloy related alerts to the alloy alert file.
- Rename and move the following alerts as they are not specific to Prometheus:
- `PrometheusCriticalJobScrapingFailure` => `CriticalJobScrapingFailure`
- `PrometheusJobScrapingFailure` => `JobScrapingFailure`
- `PrometheusFailsToCommunicateWithRemoteStorageAPI` => `MetricForwardingErrors`

## [4.23.0] - 2024-10-30

@@ -1,5 +1,5 @@
# This file describes common alloy alerting rules
# For alerts regarding monitoring and logging agents, please go to the respective files (logging.rules.yml and monitoring.rules.yml).
# For alerts regarding the monitoring pipeline and the logging pipeline, please go to the respective files (logging-pipeline.rules.yml and monitoring-pipeline.rules.yml).
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
@@ -91,3 +91,103 @@ spec:
cancel_if_cluster_status_updating: "true"
cancel_if_node_unschedulable: "true"
cancel_if_node_not_ready: "true"
- name: alloy.metrics
rules:
# This alert pages if monitoring-agent fails to send samples to its remote write endpoint.
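# The first count() selects clusters whose CAPI ControlPlaneReady condition is true
# (label_replace copies the CAPI "name" label into "cluster_id" so the result can be joined on
# that label); "unless on (cluster_id)" then removes every cluster that still has at least one
# up{job="alloy-metrics"} target reporting up, leaving only clusters whose control plane is
# ready but whose monitoring agent is not sending anything.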
- alert: MonitoringAgentDown
annotations:
description: '{{`Monitoring agent fails to send samples.`}}'
summary: Monitoring agent fails to send samples to remote write endpoint.
opsrecipe: alloy/#monitoring-agent-down
dashboard: promRW001/prometheus-remote-write
expr: |-
count(
label_replace(
capi_cluster_status_condition{type="ControlPlaneReady", status="True"},
"cluster_id",
"$1",
"name",
"(.*)"
) == 1
) by (cluster_id, installation, pipeline, provider) > 0
unless on (cluster_id) (
count(up{job="alloy-metrics"} > 0) by (cluster_id)
)
for: 20m
labels:
area: platform
severity: page
team: atlas
topic: observability
inhibit_monitoring_agent_down: "true"
cancel_if_cluster_status_creating: "true"
cancel_if_cluster_status_deleting: "true"
cancel_if_cluster_has_no_workers: "true"
## Same as MonitoringAgentDown, but triggers inhibition earlier and does not page.
- alert: InhibitionMonitoringAgentDown
annotations:
description: '{{`Monitoring agent fails to send samples.`}}'
summary: Monitoring agent fails to send samples to remote write endpoint.
opsrecipe: alloy/#monitoring-agent-down
dashboard: promRW001/prometheus-remote-write
expr: |-
count(
label_replace(
capi_cluster_status_condition{type="ControlPlaneReady", status="True"},
"cluster_id",
"$1",
"name",
"(.*)"
) == 1
) by (cluster_id, installation, pipeline, provider) > 0
unless on (cluster_id) (
count(up{job="alloy-metrics"} > 0) by (cluster_id)
)
for: 2m
labels:
area: platform
severity: none
team: atlas
topic: observability
inhibit_monitoring_agent_down: "true"
cancel_if_cluster_status_creating: "true"
cancel_if_cluster_status_deleting: "true"
## This alert pages if any of the monitoring-agent shards is not running.
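## The expression is the number of desired replicas of the alloy-metrics StatefulSet minus the
## number of ready replicas; any positive difference means at least one shard is not running.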
- alert: MonitoringAgentShardsNotSatisfied

[Review thread]
Contributor: There's no unit test for this one and its associated inhibition?
Contributor (author): I might have forgotten them
Contributor (author): I added alerts for those :)

annotations:
description: '{{`At least one of the monitoring agent shards is missing.`}}'
summary: Monitoring agent is missing shards.
opsrecipe: alloy/#monitoring-agent-down
expr: |-
kube_statefulset_status_replicas{statefulset="alloy-metrics"}
- kube_statefulset_status_replicas_ready{statefulset="alloy-metrics"}
> 0
for: 40m
labels:
area: platform
severity: page
team: atlas
topic: observability
inhibit_monitoring_agent_down: "true"
cancel_if_cluster_status_creating: "true"
cancel_if_cluster_status_deleting: "true"
cancel_if_outside_working_hours: "true"
## Same as MonitoringAgentShardsNotSatisfied but triggers inhibition earlier, and does not page.
- alert: InhibitionMonitoringAgentShardsNotSatisfied
annotations:
description: '{{`At least one of the monitoring agent shards is missing.`}}'
summary: Monitoring agent is missing shards.
opsrecipe: alloy/#monitoring-agent-down
expr: |-
kube_statefulset_status_replicas{statefulset="alloy-metrics"}
- kube_statefulset_status_replicas_ready{statefulset="alloy-metrics"}
> 0
for: 2m
labels:
area: platform
severity: none
team: atlas
topic: observability
inhibit_monitoring_agent_down: "true"
cancel_if_cluster_status_creating: "true"
cancel_if_cluster_status_deleting: "true"
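
The review thread above asks about unit tests for the shard alerts. A minimal sketch of such a test in promtool's "test rules" format is shown below; the rule file name, label values, and evaluation time are illustrative assumptions, and the repository's actual tests live under the truncated test/tests/providers/global/platform/atlas/alerting-rules/… paths referenced in the commits.

# Hypothetical promtool unit test for MonitoringAgentShardsNotSatisfied (sketch only).
rule_files:
  - alloy.rules.yml  # assumed rendered rule file name
evaluation_interval: 1m
tests:
  - interval: 1m
    input_series:
      # 3 desired shards but only 2 ready, held well past the alert's "for: 40m".
      - series: 'kube_statefulset_status_replicas{statefulset="alloy-metrics", cluster_id="test01", installation="myinstall"}'
        values: "3x60"
      - series: 'kube_statefulset_status_replicas_ready{statefulset="alloy-metrics", cluster_id="test01", installation="myinstall"}'
        values: "2x60"
    alert_rule_test:
      - alertname: MonitoringAgentShardsNotSatisfied
        eval_time: 50m
        exp_alerts:
          - exp_labels:
              area: platform
              severity: page
              team: atlas
              topic: observability
              inhibit_monitoring_agent_down: "true"
              cancel_if_cluster_status_creating: "true"
              cancel_if_cluster_status_deleting: "true"
              cancel_if_outside_working_hours: "true"
              statefulset: alloy-metrics
              cluster_id: test01
              installation: myinstall

Because the difference of the two input series stays above zero for longer than the rule's 40-minute "for" duration, the alert is expected to be firing at the 50-minute evaluation point; the earlier InhibitionMonitoringAgentShardsNotSatisfied variant could be checked the same way with an eval_time just past its 2-minute window.
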
@@ -0,0 +1,80 @@
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
labels:
{{- include "labels.common" . | nindent 4 }}
name: monitoring-pipeline.rules
namespace: {{ .Values.namespace }}
spec:
groups:
- name: monitoring-pipeline
rules:
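# Pages when remote write has been unhealthy for an hour: samples are failing at a rate
# above 0.1/s, no samples are being sent at all, or metadata writes keep being retried.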
- alert: MetricForwardingErrors
annotations:
description: '{{`Monitoring agent can''t communicate with Remote Storage API at {{ $labels.url }}.`}}'
opsrecipe: monitoring-pipeline/
dashboard: promRW001/prometheus-remote-write
expr: |-
rate(prometheus_remote_storage_samples_failed_total[10m]) > 0.1
or rate(prometheus_remote_storage_samples_total[10m]) == 0
or rate(prometheus_remote_storage_metadata_retried_total[10m]) > 0
for: 1h
labels:
area: platform
cancel_if_outside_working_hours: "true"
severity: page
team: atlas
topic: observability
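# Notifies (without paging) when every target of a scrape job has been failing for a day,
# i.e. the ratio of down targets to all targets in the job reaches 1.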
- alert: JobScrapingFailure
annotations:
dashboard: servicemonitors-details/servicemonitors-details
description: '{{`Monitoring agents for cluster {{$labels.installation}}/{{$labels.cluster_id}} have failed to scrape all targets in the {{$labels.job}} job.`}}'
summary: Monitoring agent failed to scrape all targets in a job.
opsrecipe: monitoring-job-scraping-failure/
expr: |-
(
count(up == 0) by (job, installation, cluster_id, provider, pipeline)
/
count(up) by (job, installation, cluster_id, provider, pipeline)
) >= 1
for: 1d
labels:
area: platform
severity: notify
team: atlas
topic: observability
cancel_if_outside_working_hours: "true"
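# Pages when every target of a critical job (apiserver, kube-controller-manager, kube-scheduler,
# node-exporter, kube-state-metrics or kubelet) has been failing to scrape for 3 days.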
- alert: CriticalJobScrapingFailure
annotations:
dashboard: servicemonitors-details/servicemonitors-details
description: '{{`Monitoring agents for cluster {{$labels.installation}}/{{$labels.cluster_id}} have failed to scrape all targets in the {{$labels.job}} job.`}}'
summary: Monitoring agent failed to scrape all targets in a job.
opsrecipe: monitoring-job-scraping-failure/
## We ignore bastion hosts node exporters
expr: |-
(
count(
(
up{job=~".*(apiserver|kube-controller-manager|kube-scheduler|node-exporter|kube-state-metrics).*"}
or
up{job="kubelet", metrics_path="/metrics"}
) == 0
) by (job, installation, cluster_id, provider, pipeline)
/
count(
up{job=~".*(apiserver|kube-controller-manager|kube-scheduler|node-exporter|kube-state-metrics).*"}
or
up{job="kubelet", metrics_path="/metrics"}
) by (job, installation, cluster_id, provider, pipeline)
) >= 1
for: 3d
labels:
area: platform
severity: page
team: atlas
topic: observability
cancel_if_outside_working_hours: "true"
cancel_if_cluster_is_not_running_monitoring_agent: "true"
cancel_if_cluster_status_creating: "true"
cancel_if_cluster_status_deleting: "true"

@@ -1,3 +1,4 @@
# TODO(@giantswarm/team-atlas): revisit once vintage is gone
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
@@ -26,19 +27,6 @@ spec:
severity: page
team: atlas
topic: observability
- alert: PrometheusFailsToCommunicateWithRemoteStorageAPI
annotations:
description: '{{`Prometheus can''t communicate with Remote Storage API at {{ $labels.url }}.`}}'
opsrecipe: prometheus-cant-communicate-with-remote-storage-api/
dashboard: promRW001/prometheus-remote-write
expr: rate(prometheus_remote_storage_samples_failed_total[10m]) > 0.1 or rate(prometheus_remote_storage_samples_total[10m]) == 0 or rate(prometheus_remote_storage_metadata_retried_total[10m]) > 0
for: 1h
labels:
area: platform
cancel_if_outside_working_hours: "true"
severity: page
team: atlas
topic: observability
- alert: PrometheusRuleFailures
annotations:
description: {{`Prometheus {{$labels.installation}}/{{$labels.cluster_id}} has failed to evaluate rule(s) {{ printf "%.2f" $value }} time(s).`}}
@@ -52,48 +40,3 @@ spec:
team: atlas
topic: observability
cancel_if_outside_working_hours: "true"
- alert: PrometheusJobScrapingFailure
annotations:
description: {{`Prometheus {{$labels.installation}}/{{$labels.cluster_id}} has failed to scrape all targets in {{$labels.job}} job.`}}
summary: Prometheus fails to scrape all targets in a job.
opsrecipe: prometheus-job-scraping-failure/
expr: (count(up == 0) BY (job, installation, cluster_id, provider, pipeline) / count(up) BY (job, installation, cluster_id, provider, pipeline)) == 1
for: 1d
labels:
area: platform
severity: notify
team: atlas
topic: observability
cancel_if_outside_working_hours: "true"
- alert: PrometheusCriticalJobScrapingFailure
annotations:
description: {{`Prometheus {{$labels.installation}}/{{$labels.cluster_id}} has failed to scrape all targets in {{$labels.job}} job.`}}
summary: Prometheus fails to scrape all targets in a job.
opsrecipe: prometheus-job-scraping-failure/
## We ignore bastion hosts node exporters
expr: |-
(
count(
(
up{job=~"apiserver|kube-controller-manager|kube-scheduler|node-exporter|kube-state-metrics"}
or
up{job="kubelet", metrics_path="/metrics"}
) == 0
) BY (job, installation, cluster_id, provider, pipeline)
/
count(
up{job=~"apiserver|kube-controller-manager|kube-scheduler|node-exporter|kube-state-metrics"}
or
up{job="kubelet", metrics_path="/metrics"}
) BY (job, installation, cluster_id, provider, pipeline)
) == 1
for: 3d
labels:
area: platform
severity: page
team: atlas
topic: observability
cancel_if_outside_working_hours: "true"
cancel_if_cluster_is_not_running_monitoring_agent: "true"
cancel_if_cluster_status_creating: "true"
cancel_if_cluster_status_deleting: "true"