change based on ops-recipes

giantswarm · Nov 4, 2024 · fbc9c8d · fbc9c8d
1 parent 40452a5
commit fbc9c8d
Show file tree

Hide file tree

Showing 4 changed files with 47 additions and 47 deletions.
diff --git a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/alloy.rules.yml b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/alloy.rules.yml
@@ -18,7 +18,7 @@ spec:
           annotations:
             dashboard: bf9f456aad7108b2c808dbd9973e386f/alloy-controller
             description: '{{`Component evaluations are taking too long under job {{ $labels.job }}, component_path {{ $labels.component_path }}, component_id {{ $labels.component_id }}.`}}'
-            opsrecipe: alloy-components/
+            opsrecipe: alloy/
             summary: Component evaluations are taking too long.
           expr: sum by (cluster_id, installation, provider, pipeline, namespace, job, component_path, component_id) (rate(alloy_component_evaluation_slow_seconds[10m])) > 0
           for: 15m
@@ -35,7 +35,7 @@ spec:
           annotations:
             dashboard: bf9f456aad7108b2c808dbd9973e386f/alloy-controller
             description: '{{`Unhealthy components detected under job {{ $labels.job }}`}}'
-            opsrecipe: alloy-components/
+            opsrecipe: alloy/
             summary: Unhealthy components detected.
           expr: sum by (cluster_id, installation, provider, pipeline, namespace, job) (alloy_component_controller_running_components{health_type!="healthy"}) > 0
           for: 15m
@@ -48,3 +48,29 @@ spec:
             cancel_if_cluster_status_creating: "true"
             cancel_if_cluster_status_deleting: "true"
             cancel_if_cluster_status_updating: "true"
+    - name: logging-agent
+      rules:
+        # Joins kube_pod_info for the alloy-logs pods (keeping its node label so the alert can be inhibited when the node is not ready)
+        # with the alloy containers whose scrape target is down (up == 0), and fires once that has lasted for 30 minutes.
+        - alert: LoggingAgentDown
+          annotations:
+            dashboard: 53c1ecddc3a1d5d4b8d6cd0c23676c31/alloy-logs-overview
+            description: '{{`Scraping of all alloy pods to check if one failed every 30 minutes.`}}'
+            opsrecipe: alloy/
+          expr: |-
+            kube_pod_info{pod=~"alloy-logs.*"}
+            * on(cluster_id, pod)
+              group_left ()
+              up{job="alloy-logs", container="alloy"} == 0
+          for: 30m
+          labels:
+            area: platform
+            severity: page
+            team: atlas
+            topic: observability
+            cancel_if_outside_working_hours: "true"
+            cancel_if_cluster_status_creating: "true"
+            cancel_if_cluster_status_deleting: "true"
+            cancel_if_cluster_status_updating: "true"
+            cancel_if_node_unschedulable: "true"
+            cancel_if_node_not_ready: "true"
diff --git a/...m/atlas/alerting-rules/logging.rules.yaml → ...lerting-rules/logging-pipeline.rules.yaml b/...m/atlas/alerting-rules/logging.rules.yaml → ...lerting-rules/logging-pipeline.rules.yaml
@@ -3,45 +3,19 @@ kind: PrometheusRule
 metadata:
   labels:
     {{- include "labels.common" . | nindent 4 }}
-  name: logging.rules
+  name: logging-pipeline.rules
   namespace: {{ .Values.namespace  }}
 spec:
   groups:
-    - name: logging-agent
-      rules:
-        # This alert lists the existing logging-agent pods (to extract the node label and inhibit if the node is not ready)
-        # and join the pods with the not running containers
-        - alert: LoggingAgentDown
-          annotations:
-            dashboard: 53c1ecddc3a1d5d4b8d6cd0c23676c31/alloy-logs-overview
-            description: '{{`Scraping of all logging-agent pods to check if one failed every 30 minutes.`}}'
-            opsrecipe: logging-agent/
-          expr: |-
-            kube_pod_info{pod=~"alloy-logs.*"}
-            * on(cluster_id, pod)
-              group_left ()
-              up{job="alloy-logs", container="alloy"} == 0
-          for: 30m
-          labels:
-            area: platform
-            severity: page
-            team: atlas
-            topic: observability
-            cancel_if_outside_working_hours: "true"
-            cancel_if_cluster_status_creating: "true"
-            cancel_if_cluster_status_deleting: "true"
-            cancel_if_cluster_status_updating: "true"
-            cancel_if_node_unschedulable: "true"
-            cancel_if_node_not_ready: "true"
-    - name: log-ingestion
+    - name: logging-pipeline
       rules:
         # Any alloy component that uses the loki.write component can throw such errors.
         # This includes alloy-logs and the observability-gateway
         - alert: LogForwardingErrors
           annotations:
             dashboard: 53c1ecddc3a1d5d4b8d6cd0c23676c31/alloy-logs-overview
             description: '{{`More that 10% of the requests to Loki are failing.`}}'
-            opsrecipe: log-shipping-errors/
+            opsrecipe: logging-pipeline/
           expr: |-
             (
               100
@@ -79,7 +53,7 @@ spec:
           annotations:
             dashboard: 53c1ecddc3a1d5d4b8d6cd0c23676c31/alloy-logs-overview
             description: '{{`More that 10% of the loki requests to the observability gateway are failing.`}}'
-            opsrecipe: log-shipping-errors/
+            opsrecipe: logging-pipeline/
           expr: |-
             (
               100

diff --git a/test/tests/providers/global/platform/atlas/alerting-rules/alloy.rules.test.yml b/test/tests/providers/global/platform/atlas/alerting-rules/alloy.rules.test.yml
@@ -34,7 +34,7 @@ tests:
             exp_annotations:
               dashboard: bf9f456aad7108b2c808dbd9973e386f/alloy-controller
               description: "Component evaluations are taking too long under job alloy-controller, component_path path1, component_id comp1."
-              opsrecipe: "alloy-components/"
+              opsrecipe: "alloy/"
               summary: "Component evaluations are taking too long."
       - alertname: AlloySlowComponentEvaluations
         eval_time: 80m
@@ -68,7 +68,7 @@ tests:
             exp_annotations:
               dashboard: bf9f456aad7108b2c808dbd9973e386f/alloy-controller
               description: "Unhealthy components detected under job alloy-controller"
-              opsrecipe: "alloy-components/"
+              opsrecipe: "alloy/"
               summary: "Unhealthy components detected."
       - alertname: AlloyUnhealthyComponents
         eval_time: 80m
diff --git a/...las/alerting-rules/logging.rules.test.yml → ...ing-rules/logging-pipeline.rules.test.yml b/...las/alerting-rules/logging.rules.test.yml → ...ing-rules/logging-pipeline.rules.test.yml
@@ -1,6 +1,6 @@
 ---
 rule_files:
-  - logging.rules.yml
+  - logging-pipeline.rules.yml
 
 tests:
   # Test LoggingAgentDown
@@ -47,8 +47,8 @@ tests:
               team: atlas
               topic: observability
             exp_annotations:
-              description: "Scraping of all logging-agent pods to check if one failed every 30 minutes."
-              opsrecipe: "logging-agent/"
+              description: "Scraping of all alloy pods to check if one failed every 30 minutes."
+              opsrecipe: "alloy/"
       # Tests with 2 pods
       - alertname: LoggingAgentDown
         eval_time: 111m
@@ -72,8 +72,8 @@ tests:
               team: atlas
               topic: observability
             exp_annotations:
-              description: "Scraping of all logging-agent pods to check if one failed every 30 minutes."
-              opsrecipe: "logging-agent/"
+              description: "Scraping of all alloy pods to check if one failed every 30 minutes."
+              opsrecipe: "alloy/"
       - alertname: LoggingAgentDown
         eval_time: 121m
       - alertname: LoggingAgentDown
@@ -98,8 +98,8 @@ tests:
               team: atlas
               topic: observability
             exp_annotations:
-              description: "Scraping of all logging-agent pods to check if one failed every 30 minutes."
-              opsrecipe: "logging-agent/"
+              description: "Scraping of all alloy pods to check if one failed every 30 minutes."
+              opsrecipe: "alloy/"
           - exp_labels:
               area: platform
               cancel_if_outside_working_hours: "true"
@@ -119,8 +119,8 @@ tests:
               team: atlas
               topic: observability
             exp_annotations:
-              description: "Scraping of all logging-agent pods to check if one failed every 30 minutes."
-              opsrecipe: "logging-agent/"
+              description: "Scraping of all alloy pods to check if one failed every 30 minutes."
+              opsrecipe: "alloy/"
   # Test LogForwardingErrors
   - interval: 1m
     input_series:
@@ -155,7 +155,7 @@ tests:
               topic: observability
             exp_annotations:
               description: "More that 10% of the requests to Loki are failing."
-              opsrecipe: "log-shipping-errors/"
+              opsrecipe: "logging-pipeline/"
       - alertname: LogForwardingErrors
         eval_time: 330m
         exp_alerts:
@@ -173,7 +173,7 @@ tests:
               topic: observability
             exp_annotations:
               description: "More that 10% of the requests to Loki are failing."
-              opsrecipe: "log-shipping-errors/"
+              opsrecipe: "logging-pipeline/"
   # Test LogReceivingErrors
   - interval: 1m
     input_series:
@@ -208,7 +208,7 @@ tests:
               topic: observability
             exp_annotations:
               description: "More that 10% of the loki requests to the observability gateway are failing."
-              opsrecipe: "log-shipping-errors/"
+              opsrecipe: "logging-pipeline/"
       - alertname: LogReceivingErrors
         eval_time: 330m
         exp_alerts:
@@ -226,4 +226,4 @@ tests:
               topic: observability
             exp_annotations:
               description: "More that 10% of the loki requests to the observability gateway are failing."
-              opsrecipe: "log-shipping-errors/"
+              opsrecipe: "logging-pipeline/"