
Commit

Clean up some rules a bit
QuentinBisson committed Nov 5, 2024
1 parent fbc9c8d commit 9e72664
Showing 11 changed files with 204 additions and 191 deletions.
CHANGELOG.md (3 changes: 2 additions & 1 deletion)
@@ -14,14 +14,15 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- `LoggingAgentDown` to be alerted when the logging agent is down.
- `LogForwardingErrors` to be alerted when the `loki.write` component is failing.
- `LogReceivingErrors` to be alerted when the `loki.source.api` components of the gateway are failing.
- `MonitoringAgentFailing` and `InhibitionMonitoringAgentFailing` to be alerted when the monitoring agent is not able to send metrics.

### Changed

- Update `DeploymentNotSatisfiedAtlas` to take into account the following components:
- `observability-operator`
- `alloy-rules`
- `observability-gateway`
- Move all `grafana-cloud`-related alerts to their own file.
- Move all alloy-related alerts to the alloy alert file and fix the alloy-logs tests.

## [4.23.0] - 2024-10-30

@@ -6,7 +6,7 @@ metadata:
labels:
{{- include "labels.common" . | nindent 4 }}
name: alloy.rules
namespace: {{ .Values.namespace }}
namespace: {{ .Values.namespace }}
spec:
groups:
# List of alerts on the state of the alloy components.
@@ -48,7 +48,24 @@ spec:
cancel_if_cluster_status_creating: "true"
cancel_if_cluster_status_deleting: "true"
cancel_if_cluster_status_updating: "true"
- name: logging-agent
- name: alloy.rules
rules:
- alert: AlloyForPrometheusRulesDown
annotations:
description: 'Alloy sending PrometheusRules to Loki and Mimir ruler is down.'
opsrecipe: prometheus-rules/
expr: count(up{job="alloy-rules", namespace="monitoring"} == 0) by (cluster_id, installation, provider, pipeline) > 0
for: 1h
labels:
area: platform
cancel_if_cluster_status_creating: "true"
cancel_if_cluster_status_deleting: "true"
cancel_if_cluster_status_updating: "true"
cancel_if_outside_working_hours: "true"
severity: page
team: atlas
topic: observability
- name: alloy.logs
rules:
# This alert lists the existing logging-agent pods (to extract the node label and inhibit if the node is not ready)
# and joins the pods with the containers that are not running
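As a reading aid for the `AlloyForPrometheusRulesDown` expression shown earlier in this file: `up` is 1 for a reachable scrape target and 0 otherwise, so the rule fires per cluster once at least one `alloy-rules` target in the `monitoring` namespace has been down for the whole `for: 1h` window. A minimal sketch with hypothetical series for a single cluster:

```yaml
# Hypothetical samples for one cluster (cluster_id="golem"):
#   up{job="alloy-rules", namespace="monitoring", pod="alloy-rules-0"} = 1
#   up{job="alloy-rules", namespace="monitoring", pod="alloy-rules-1"} = 0
# The inner filter `up{...} == 0` keeps only the second series, count(...) by
# (cluster_id, installation, provider, pipeline) yields 1 for golem, and `> 0`
# turns that into a pending alert for the cluster.
expr: count(up{job="alloy-rules", namespace="monitoring"} == 0) by (cluster_id, installation, provider, pipeline) > 0
```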
@@ -1,13 +1,35 @@
{{- if .Values.mimir.enabled }}
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
labels:
{{- include "labels.common" . | nindent 4 }}
name: mimir-to-grafana-cloud-exporter.rules
namespace: {{ .Values.namespace }}
{{- if not .Values.mimir.enabled }}
cluster_type: "management_cluster"
{{- end }}
name: grafana-cloud.rules
namespace: {{ .Values.namespace }}
spec:
groups:
- name: grafana-cloud
rules:
## Pages Atlas when prometheus fails to send samples to cortex
- alert: PrometheusMissingGrafanaCloud
annotations:
description: 'Prometheus is not sending data to Grafana Cloud.'
opsrecipe: prometheus-grafanacloud/
{{- if .Values.mimir.enabled }}
expr: absent(prometheus_remote_storage_samples_total{remote_name="grafana-cloud", cluster_type="management_cluster", cluster_id="{{ .Values.managementCluster.name }}", installation="{{ .Values.managementCluster.name }}", provider="{{ .Values.managementCluster.provider.kind }}", pipeline="{{ .Values.managementCluster.pipeline }}"})
{{- else }}
expr: absent(prometheus_remote_storage_samples_total{remote_name="grafana-cloud"})
{{- end }}
for: 1h
labels:
area: platform
cancel_if_outside_working_hours: "true"
severity: page
team: atlas
topic: observability
{{- if .Values.mimir.enabled }}
- name: mimir-to-grafana-cloud-exporter
rules:
- alert: MimirToGrafanaCloudExporterDown
@@ -73,4 +95,4 @@ spec:
severity: page
team: atlas
topic: observability
{{- end }}
{{- end }}
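For reference, the `PrometheusMissingGrafanaCloud` expression above renders differently depending on `mimir.enabled`. A minimal sketch of both rendered forms, using hypothetical management-cluster values (`golem`, `capa`, `testing`) purely for illustration:

```yaml
# Hypothetical rendering with mimir.enabled=true and
# managementCluster: {name: golem, provider: {kind: capa}, pipeline: testing}
expr: absent(prometheus_remote_storage_samples_total{remote_name="grafana-cloud", cluster_type="management_cluster", cluster_id="golem", installation="golem", provider="capa", pipeline="testing"})

# Rendering with mimir.enabled=false: no per-cluster selectors are added.
# expr: absent(prometheus_remote_storage_samples_total{remote_name="grafana-cloud"})
```

In both cases `absent()` only returns a result when no matching `prometheus_remote_storage_samples_total` series exists, which is what makes the alert fire when Prometheus stops reporting its Grafana Cloud remote write.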
@@ -3,9 +3,9 @@ kind: PrometheusRule
metadata:
labels:
{{- include "labels.common" . | nindent 4 }}
{{- if not .Values.mimir.enabled }}
{{- if not .Values.mimir.enabled }}
cluster_type: "management_cluster"
{{- end }}
{{- end }}
name: grafana.rules
namespace: {{ .Values.namespace }}
spec:
@@ -85,7 +85,6 @@ spec:
severity: page
team: atlas
topic: observability

- alert: KubeConfigMapCreatedMetricMissing
annotations:
description: '{{`kube_configmap_created metric is missing for cluster {{ $labels.cluster_id }}.`}}'
@@ -61,21 +61,6 @@ spec:
severity: page
team: atlas
topic: observability
- alert: AlloyForPrometheusRulesDown
annotations:
description: 'Alloy sending PrometheusRules to Mimir ruler is down.'
opsrecipe: prometheus-rules/
expr: count(up{job="alloy-rules", namespace="mimir"} == 0) by (cluster_id, installation, provider, pipeline) > 0
for: 1h
labels:
area: platform
cancel_if_cluster_status_creating: "true"
cancel_if_cluster_status_deleting: "true"
cancel_if_cluster_status_updating: "true"
cancel_if_outside_working_hours: "true"
severity: page
team: atlas
topic: observability
- alert: MimirRulerEventsFailed
annotations:
dashboard: 631e15d5d85afb2ca8e35d62984eeaa0/mimir-ruler
@@ -1,7 +1,6 @@
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
creationTimestamp: null
labels:
{{- include "labels.common" . | nindent 4 }}
name: prometheus.rules
@@ -27,23 +26,6 @@ spec:
severity: page
team: atlas
topic: observability
## Pages Atlas when prometheus fails to send samples to cortex
- alert: PrometheusMissingGrafanaCloud
annotations:
description: 'Prometheus is not sending data to Grafana Cloud.'
opsrecipe: prometheus-grafanacloud/
{{- if .Values.mimir.enabled }}
expr: absent(prometheus_remote_storage_samples_total{remote_name="grafana-cloud", cluster_type="management_cluster", cluster_id="{{ .Values.managementCluster.name }}", installation="{{ .Values.managementCluster.name }}", provider="{{ .Values.managementCluster.provider.kind }}", pipeline="{{ .Values.managementCluster.pipeline }}"})
{{- else }}
expr: absent(prometheus_remote_storage_samples_total{remote_name="grafana-cloud"})
{{- end }}
for: 1h
labels:
area: platform
cancel_if_outside_working_hours: "true"
severity: page
team: atlas
topic: observability
- alert: PrometheusFailsToCommunicateWithRemoteStorageAPI
annotations:
description: '{{`Prometheus can''t communicate with Remote Storage API at {{ $labels.url }}.`}}'
@@ -1,6 +1,6 @@
---
rule_files:
- mimir-to-grafana-cloud-exporter.rules.yml
- grafana-cloud.rules.yml

tests:
# Tests for `MimirToGrafanaCloudExporterDown` alert
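With the rename, the unit tests for these alerts now load the generated `grafana-cloud.rules.yml`. A minimal sketch of the promtool test-file layout this follows; the `evaluation_interval` value and the empty fixtures are assumptions, not taken from this diff:

```yaml
rule_files:
  - grafana-cloud.rules.yml      # was mimir-to-grafana-cloud-exporter.rules.yml
evaluation_interval: 1m          # assumed default, not shown in this diff
tests:
  - interval: 1m
    input_series: []             # series fixtures go here
    alert_rule_test: []          # expected alerts per eval_time go here
```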
@@ -86,35 +86,6 @@ tests:
dashboard: ffcd83628d7d4b5a03d1cafd159e6c9c/mimir-overview
description: "Mimir component : mimir-ingester is down."
opsrecipe: "mimir/"
- interval: 1m
input_series:
# test with 1 pod: none, up, down
- series: 'up{job="alloy-rules", cluster_type="management_cluster", cluster_id="golem", provider="capa", pipeline="testing", installation="golem", namespace="mimir"}'
values: "_x20 1+0x70 0+0x70"
alert_rule_test:
- alertname: AlloyForPrometheusRulesDown
eval_time: 10m
- alertname: AlloyForPrometheusRulesDown
eval_time: 80m
- alertname: AlloyForPrometheusRulesDown
eval_time: 160m
exp_alerts:
- exp_labels:
area: platform
cancel_if_outside_working_hours: "true"
cancel_if_cluster_status_creating: "true"
cancel_if_cluster_status_deleting: "true"
cancel_if_cluster_status_updating: "true"
cluster_id: golem
installation: golem
provider: capa
pipeline: testing
severity: page
team: atlas
topic: observability
exp_annotations:
description: "Alloy sending PrometheusRules to Mimir ruler is down."
opsrecipe: "prometheus-rules/"
- interval: 1m
input_series:
# test: none, rate > 0, rate = 0
@@ -72,3 +72,157 @@ tests:
summary: "Unhealthy components detected."
- alertname: AlloyUnhealthyComponents
eval_time: 80m

# Test AlloyForPrometheusRulesDown
- interval: 1m
input_series:
# test with 1 pod: none, up, down
- series: 'up{job="alloy-rules", cluster_type="management_cluster", cluster_id="golem", provider="capa", pipeline="testing", installation="golem", namespace="monitoring"}'
values: "_x20 1+0x70 0+0x70"
alert_rule_test:
- alertname: AlloyForPrometheusRulesDown
eval_time: 10m
- alertname: AlloyForPrometheusRulesDown
eval_time: 80m
- alertname: AlloyForPrometheusRulesDown
eval_time: 160m
exp_alerts:
- exp_labels:
area: platform
cancel_if_outside_working_hours: "true"
cancel_if_cluster_status_creating: "true"
cancel_if_cluster_status_deleting: "true"
cancel_if_cluster_status_updating: "true"
cluster_id: golem
installation: golem
provider: capa
pipeline: testing
severity: page
team: atlas
topic: observability
exp_annotations:
description: "Alloy sending PrometheusRules to Loki and Mimir ruler is down."
opsrecipe: "prometheus-rules/"

# Test LoggingAgentDown
- interval: 1m
input_series:
# For the first 60min: test with 1 pod: none, up, down
- series: 'up{container="alloy", cluster_id="gauss", cluster_type="management_cluster", installation="gauss", job="alloy-logs", pod="alloy-logs-1xxxx", provider="aws", pipeline="testing"}'
values: "_x20 1+0x20 0+0x40"
- series: kube_pod_info{cluster_id="gauss", cluster_type="management_cluster", installation="gauss", pod="alloy-logs-1xxxx", node="ip-10-0-5-1.eu-west-1.compute.internal", provider="aws", pipeline="testing"}
values: "1x180"
# From 60min: test with 2 pods: 1 up and 1 down, 2 up, 2 down.
- series: 'up{container="alloy", cluster_id="gauss", cluster_type="management_cluster", installation="gauss", job="alloy-logs", pod="alloy-logs-2xxxx", provider="aws", pipeline="testing"}'
values: "_x80 1+0x40 1+0x20 0+0x40"
- series: kube_pod_info{cluster_id="gauss", cluster_type="management_cluster", installation="gauss", pod="alloy-logs-2xxxx", node="ip-10-0-5-2.eu-west-1.compute.internal", provider="aws", pipeline="testing"}
values: "1x180"
- series: 'up{container="alloy", cluster_type="management_cluster", cluster_id="gauss", installation="gauss", job="alloy-logs", pod="alloy-logs-3xxxx", provider="aws", pipeline="testing"}'
values: "_x80 0+0x40 1+0x20 0+0x40"
- series: kube_pod_info{cluster_id="gauss", cluster_type="management_cluster", installation="gauss", pod="alloy-logs-3xxxx", node="ip-10-0-5-3.eu-west-1.compute.internal", provider="aws", pipeline="testing"}
values: "1x180"
alert_rule_test:
- alertname: LoggingAgentDown
eval_time: 10m
- alertname: LoggingAgentDown
eval_time: 30m
- alertname: LoggingAgentDown
eval_time: 71m
exp_alerts:
- exp_labels:
area: platform
cancel_if_outside_working_hours: "true"
cancel_if_cluster_status_creating: "true"
cancel_if_cluster_status_deleting: "true"
cancel_if_cluster_status_updating: "true"
cancel_if_node_unschedulable: "true"
cancel_if_node_not_ready: "true"
cluster_id: gauss
cluster_type: management_cluster
installation: gauss
node: ip-10-0-5-1.eu-west-1.compute.internal
pipeline: testing
pod: alloy-logs-1xxxx
provider: aws
severity: page
team: atlas
topic: observability
exp_annotations:
dashboard: "53c1ecddc3a1d5d4b8d6cd0c23676c31/alloy-logs-overview"
description: "Scraping of all logging-agent pods to check if one failed every 30 minutes."
opsrecipe: "alloy/"
# Tests with 2 pods
- alertname: LoggingAgentDown
eval_time: 111m
exp_alerts:
- exp_labels:
area: platform
cancel_if_outside_working_hours: "true"
cancel_if_cluster_status_creating: "true"
cancel_if_cluster_status_deleting: "true"
cancel_if_cluster_status_updating: "true"
cancel_if_node_unschedulable: "true"
cancel_if_node_not_ready: "true"
cluster_id: gauss
cluster_type: management_cluster
installation: gauss
node: ip-10-0-5-3.eu-west-1.compute.internal
pipeline: testing
pod: alloy-logs-3xxxx
provider: aws
severity: page
team: atlas
topic: observability
exp_annotations:
dashboard: "53c1ecddc3a1d5d4b8d6cd0c23676c31/alloy-logs-overview"
description: "Scraping of all logging-agent pods to check if one failed every 30 minutes."
opsrecipe: "alloy/"
- alertname: LoggingAgentDown
eval_time: 121m
- alertname: LoggingAgentDown
eval_time: 180m
exp_alerts:
- exp_labels:
area: platform
cancel_if_outside_working_hours: "true"
cancel_if_cluster_status_creating: "true"
cancel_if_cluster_status_deleting: "true"
cancel_if_cluster_status_updating: "true"
cancel_if_node_unschedulable: "true"
cancel_if_node_not_ready: "true"
cluster_id: gauss
cluster_type: management_cluster
installation: gauss
node: ip-10-0-5-2.eu-west-1.compute.internal
pipeline: testing
pod: alloy-logs-2xxxx
provider: aws
severity: page
team: atlas
topic: observability
exp_annotations:
dashboard: "53c1ecddc3a1d5d4b8d6cd0c23676c31/alloy-logs-overview"
description: "Scraping of all logging-agent pods to check if one failed every 30 minutes."
opsrecipe: "alloy/"
- exp_labels:
area: platform
cancel_if_outside_working_hours: "true"
cancel_if_cluster_status_creating: "true"
cancel_if_cluster_status_deleting: "true"
cancel_if_cluster_status_updating: "true"
cancel_if_node_unschedulable: "true"
cancel_if_node_not_ready: "true"
cluster_id: gauss
cluster_type: management_cluster
installation: gauss
node: ip-10-0-5-3.eu-west-1.compute.internal
pipeline: testing
pod: alloy-logs-3xxxx
provider: aws
severity: page
team: atlas
topic: observability
exp_annotations:
dashboard: "53c1ecddc3a1d5d4b8d6cd0c23676c31/alloy-logs-overview"
description: "Scraping of all logging-agent pods to check if one failed every 30 minutes."
opsrecipe: "alloy/"
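The `values` strings in these tests use promtool's expanding notation; an annotated sketch of the series driving the `AlloyForPrometheusRulesDown` test above (the interval is 1m, so each sample is one minute apart):

```yaml
# '_x20'   -> roughly the first 20 minutes with no data (target not scraped yet)
# '1+0x70' -> value 1 (start at 1, add 0 seventy times): the target is up
# '0+0x70' -> value 0 for the rest of the window: the target is down
values: "_x20 1+0x70 0+0x70"
```

This matches the three `eval_time` checks: no alert at 10m (no data yet), no alert at 80m (target up), and a firing alert at 160m once the target has been down for longer than the rule's `for: 1h`.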
