Get rid of the app, role and node external labels in Atlas rules (#1199)
* Get rid of the app, role and node external labels in Atlas rules

* Remove extra line

* Fix KSM

* Update helm/prometheus-rules/templates/platform/atlas/alerting-rules/kube-state-metrics.rules.yml

Co-authored-by: Quentin Bisson <[email protected]>

* Fix KSM

---------

Co-authored-by: Quentin Bisson <[email protected]>
marieroque and QuentinBisson authored May 30, 2024
1 parent 6442f5b commit a5b251d
Showing 32 changed files with 300 additions and 301 deletions.
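The pattern repeated throughout the diff below: metric selectors and `by (...)` groupings that matched on the `app` external label are rewritten against stable labels such as `job` or `name`, and the Atlas alert labels are consolidated under `area: platform`. A minimal before/after sketch of the selector swap, mirroring the README change further down:

```
# Before: selector keyed on the app external label
- expr: "count(up{app=~'vertical-pod-autoscaler.*'}) by (cluster_type,cluster_id)"
# After: the same rule keyed on the job label
- expr: "count(up{job=~'vertical-pod-autoscaler.*'}) by (cluster_type,cluster_id)"
```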
CHANGELOG.md (4 additions, 0 deletions)
@@ -7,6 +7,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

## [Unreleased]

### Changed

- Get rid of the `app`, `role` and `node` external labels in Atlas rules.

## [4.1.0] - 2024-05-30

### Added
README.md (2 additions, 2 deletions)
@@ -265,7 +265,7 @@ In order to incorporate the SLO Framework in the Prometheus rules, several rules
Those rules can be written according to this template :
```
# Amount of requests for VPA
- expr: "count(up{app=~'vertical-pod-autoscaler.*'}) by (cluster_type,cluster_id)"
- expr: "count(up{job=~'vertical-pod-autoscaler.*'}) by (cluster_type,cluster_id)"
labels:
class: MEDIUM
area: platform
@@ -278,7 +278,7 @@ Those rules can be written according to this template :
# and summed with 1 so the final result is 0: no error recorded.
# If up was unsuccessful, there is an error. Up returns 0, multiplied by -1 and summed
# with 1 so the final result is 1: one error is recorded.
- expr: "sum((up{app=~'vertical-pod-autoscaler.*'} * -1) + 1) by (cluster_id, cluster_type)"
- expr: "sum((up{job=~'vertical-pod-autoscaler.*'} * -1) + 1) by (cluster_id, cluster_type)"
labels:
class: MEDIUM
area: platform
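# Illustration (not part of the template): how the error expression above evaluates.
#   scrape succeeded -> up == 1 -> (1 * -1) + 1 = 0, nothing added to the sum
#   scrape failed    -> up == 0 -> (0 * -1) + 1 = 1, one error added to the sum
# The sum by (cluster_id, cluster_type) is therefore the number of VPA targets currently down in each cluster.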
@@ -20,7 +20,7 @@ spec:
expr: rate(alertmanager_notifications_failed_total{integration!="opsgenie", cluster_type="management_cluster"}[20m]) > 0
for: 45m
labels:
area: empowerment
area: platform
severity: page
team: atlas
topic: monitoring
@@ -33,7 +33,7 @@ spec:
expr: rate(alertmanager_notifications_failed_total{integration="opsgenie", cluster_type="management_cluster"}[20m]) > 0
for: 30m
labels:
area: empowerment
area: platform
severity: notify
team: atlas
topic: monitoring
@@ -18,7 +18,7 @@ spec:
expr: rate(fluentbit_output_retries_failed_total[10m]) > 0
for: 20m
labels:
area: empowerment
area: platform
cancel_if_cluster_status_creating: "true"
cancel_if_cluster_status_deleting: "true"
cancel_if_cluster_status_updating: "true"
@@ -34,7 +34,7 @@ spec:
expr: rate(fluentbit_output_dropped_records_total[10m]) / (rate(fluentbit_output_proc_records_total[10m]) + rate(fluentbit_output_dropped_records_total[10m])) > 0.01
for: 20m
labels:
area: empowerment
area: platform
cancel_if_cluster_status_creating: "true"
cancel_if_cluster_status_deleting: "true"
cancel_if_cluster_status_updating: "true"
@@ -46,10 +46,10 @@ spec:
annotations:
description: '{{`Fluentbit is down on node ({{ $labels.node }}).`}}'
opsrecipe: fluentbit-down/
expr: sum(up{app="fluent-logshipping-app"}) by (app, cluster_id, installation, provider, pipeline, job, namespace, node) == 0
expr: sum(up{job="fluent-logshipping-app"}) by (job, cluster_id, installation, provider, pipeline, namespace, node) == 0
for: 15m
labels:
area: empowerment
area: platform
cancel_if_cluster_status_creating: "true"
cancel_if_cluster_status_deleting: "true"
cancel_if_cluster_status_updating: "true"
@@ -64,11 +64,11 @@ spec:
expr: kube_daemonset_status_number_unavailable{daemonset="fluent-logshipping-app"} > 0
for: 1h
labels:
area: kaas
area: platform
cancel_if_cluster_status_creating: "true"
cancel_if_cluster_status_deleting: "true"
cancel_if_cluster_status_updating: "true"
cancel_if_outside_working_hours: "true"
severity: page
team: atlas
topic: managementcluster
topic: observability
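As a quick check of the drop-rate expression earlier in this file (illustrative numbers only): if the outputs processed 9,900 records and dropped 100 over the 10-minute window, the ratio is 100 / (9900 + 100) = 0.01, which is not yet above the threshold; the alert only fires once the share of dropped records stays above 1% for 20 minutes.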
@@ -20,7 +20,7 @@ spec:
expr: up{service="grafana", cluster_type="management_cluster"} == 0
for: 1h
labels:
area: managedservices
area: platform
cancel_if_cluster_status_creating: "true"
cancel_if_cluster_status_deleting: "true"
cancel_if_cluster_status_updating: "true"
@@ -38,7 +38,7 @@ spec:
expr: sum by(cluster_id, installation, provider, pipeline) (increase(grafana_http_request_duration_seconds_count{handler="/api/folders/:uid/permissions/", method="POST", namespace="monitoring", service="grafana", status_code="200", cluster_type="management_cluster"}[2h])) < 1 or absent(grafana_http_request_duration_seconds_count{handler="/api/folders/:uid/permissions/", method="POST", namespace="monitoring", service="grafana", status_code="200", cluster_type="management_cluster"})
for: 6h
labels:
area: managedservices
area: platform
cancel_if_cluster_status_creating: "true"
cancel_if_cluster_status_deleting: "true"
cancel_if_cluster_status_updating: "true"
@@ -60,7 +60,7 @@ spec:
expr: sum by (cronjob, cluster_id, installation, provider, pipeline) (label_replace(avg_over_time(kube_job_status_failed{job_name=~"grafana-permissions.*", reason!="BackoffLimitExceeded", cluster_type="management_cluster"}[60m]), "cronjob", "$1", "job_name", "(grafana-permissions)-.*")) > 0
for: 6h
labels:
area: managedservices
area: platform
cancel_if_cluster_status_creating: "true"
cancel_if_cluster_status_deleting: "true"
cancel_if_cluster_status_updating: "true"
@@ -77,7 +77,7 @@ spec:
expr: (time() - kube_cronjob_status_last_schedule_time{cronjob="grafana-permissions", cluster_type="management_cluster"}) > 86400
or count by (cluster_id, cronjob, installation, namespace, provider, pipeline) (label_replace(max_over_time(kube_job_status_succeeded{job_name=~"grafana-permissions-.+", cluster_type="management_cluster"}[1d]), "cronjob", "grafana-permissions", "job_name", "grafana-permissions-.+") == 1) == 0
labels:
area: empowerment
area: platform
severity: page
team: atlas
topic: managementcluster
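For reference, 86400 seconds is 24 hours, so the first branch of the expression above fires when the grafana-permissions cronjob has not been scheduled for a full day; the second branch covers the case where no job run succeeded within the last day.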
@@ -33,13 +33,13 @@ spec:
count(
label_replace(
sum_over_time(
app_operator_app_info{app="prometheus-agent"}[5m]
app_operator_app_info{name="prometheus-agent"}[5m]
), "cluster_id", "$1", "namespace", "(.*)"
)
) by (cluster_id)
labels:
cluster_is_not_running_prometheus_agent: "true"
area: empowerment
area: platform
team: atlas
topic: monitoring
{{- end }}
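In the prometheus-agent rule above, `label_replace` copies each series' `namespace` value into a `cluster_id` label ("(.*)" captures the whole namespace, "$1" writes it back), so the count can be grouped per cluster without relying on external labels. A standalone sketch of that step, assuming `app_operator_app_info` is available on the management cluster:

```
# Derive cluster_id from the namespace label carried by the metric.
label_replace(
  sum_over_time(app_operator_app_info{name="prometheus-agent"}[5m]),
  "cluster_id", "$1", "namespace", "(.*)"
)
```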
@@ -16,7 +16,7 @@ spec:
expr: count by (cluster_id, installation, provider, pipeline) (up{container=~"keda-.*"} == 0) > 0
for: 10m
labels:
area: kaas
area: platform
cancel_if_cluster_control_plane_unhealthy: "true"
cancel_if_cluster_status_creating: "true"
cancel_if_cluster_status_deleting: "true"
@@ -31,7 +31,7 @@ spec:
expr: increase(keda_scaled_object_errors[10m]) > 0
for: 15m
labels:
area: kaas
area: platform
cancel_if_cluster_control_plane_unhealthy: "true"
cancel_if_cluster_status_creating: "true"
cancel_if_cluster_status_deleting: "true"
@@ -46,7 +46,7 @@ spec:
expr: increase(keda_webhook_scaled_object_validation_errors[10m]) > 0
for: 15m
labels:
area: kaas
area: platform
cancel_if_cluster_control_plane_unhealthy: "true"
cancel_if_cluster_status_creating: "true"
cancel_if_cluster_status_deleting: "true"
@@ -61,7 +61,7 @@ spec:
expr: increase(keda_scaler_errors[10m]) > 0
for: 15m
labels:
area: kaas
area: platform
cancel_if_cluster_control_plane_unhealthy: "true"
cancel_if_cluster_status_creating: "true"
cancel_if_cluster_status_deleting: "true"
@@ -18,16 +18,17 @@ spec:
expr: |-
(
# modern clusters
label_replace(up{app="kube-state-metrics",instance=~".*:8080"}, "ip", "$1.$2.$3.$4", "node", "ip-(\\d+)-(\\d+)-(\\d+)-(\\d+).*") == 0 or absent(up{app="kube-state-metrics",instance=~".*:8080"} == 1)
label_replace(up{job="kube-state-metrics",instance=~".*:8080"}, "ip", "$1.$2.$3.$4", "node", "ip-(\\d+)-(\\d+)-(\\d+)-(\\d+).*") == 0 or absent(up{job="kube-state-metrics",instance=~".*:8080"} == 1)
)
and
(
# vintage clusters without servicemonitor
# We need to keep the app label until all clusters are migrated to a release >= 18.2. TODO(atlas): Remove when this is the case
label_replace(up{app="kube-state-metrics",container=""}, "ip", "$1.$2.$3.$4", "node", "ip-(\\d+)-(\\d+)-(\\d+)-(\\d+).*") == 0 or absent(up{app="kube-state-metrics",container=""} == 1)
)
{{- else }}
expr: |-
count by (cluster_id, installation, provider, pipeline) (label_replace(up{app="kube-state-metrics", instance=~".*:8080"}, "ip", "$1.$2.$3.$4", "node", "ip-(\\d+)-(\\d+)-(\\d+)-(\\d+).*")) == 0
count by (cluster_id, installation, provider, pipeline) (label_replace(up{job="kube-state-metrics", instance=~".*:8080"}, "ip", "$1.$2.$3.$4", "node", "ip-(\\d+)-(\\d+)-(\\d+)-(\\d+).*")) == 0
or (
label_replace(
capi_cluster_status_condition{type="ControlPlaneReady", status="True"},
@@ -37,12 +38,12 @@
"(.*)"
) == 1
) unless on (cluster_id, customer, installation, pipeline, provider, region) (
count(up{app="kube-state-metrics", instance=~".*:8080"} == 1) by (cluster_id, customer, installation, pipeline, provider, region)
count(up{job="kube-state-metrics", instance=~".*:8080"} == 1) by (cluster_id, customer, installation, pipeline, provider, region)
)
{{- end }}
for: 15m
labels:
area: kaas
area: platform
cancel_if_cluster_control_plane_unhealthy: "true"
cancel_if_cluster_status_creating: "true"
cancel_if_cluster_status_deleting: "true"
@@ -61,7 +62,7 @@
expr: histogram_quantile(0.99, sum(rate(http_request_duration_seconds_bucket{handler="metrics", job="kube-state-metrics"}[5m])) by (le, cluster_id, installation, provider, pipeline)) > 7
for: 15m
labels:
area: kaas
area: platform
cancel_if_cluster_control_plane_unhealthy: "true"
cancel_if_cluster_status_creating: "true"
cancel_if_cluster_status_deleting: "true"
Expand All @@ -78,10 +79,11 @@ spec:
opsrecipe: kube-state-metrics-down/
expr: |-
# When kube-state-metrics looks up but we don't receive its metrics
count({app="kube-state-metrics", __name__=~"kube_.+"}) by (cluster_id, installation, provider, pipeline) <= 100
# We need to keep the app label until all clusters are migrated to a release >= 18.2. TODO(atlas): Remove when this is the case
count({job="kube-state-metrics", __name__=~"kube_.+"} or {app="kube-state-metrics", __name__=~"kube_.+"}) by (cluster_id, installation, provider, pipeline) <= 100
for: 20m
labels:
area: kaas
area: platform
cancel_if_cluster_control_plane_unhealthy: "true"
cancel_if_cluster_status_creating: "true"
cancel_if_cluster_status_deleting: "true"
@@ -116,7 +118,7 @@
{{- end }}
for: 30m
labels:
area: kaas
area: platform
cancel_if_outside_working_hours: "true"
cancel_if_kube_state_metrics_down: "true"
severity: page
@@ -144,7 +146,7 @@
{{- end }}
for: 30m
labels:
area: kaas
area: platform
cancel_if_outside_working_hours: "true"
cancel_if_kube_state_metrics_down: "true"
severity: page
@@ -172,7 +174,7 @@
{{- end }}
for: 30m
labels:
area: kaas
area: platform
cancel_if_outside_working_hours: "true"
cancel_if_kube_state_metrics_down: "true"
severity: page
@@ -200,7 +202,7 @@
{{- end }}
for: 30m
labels:
area: kaas
area: platform
cancel_if_outside_working_hours: "true"
cancel_if_kube_state_metrics_down: "true"
severity: page
@@ -228,7 +230,7 @@
{{- end }}
for: 30m
labels:
area: kaas
area: platform
cancel_if_outside_working_hours: "true"
cancel_if_kube_state_metrics_down: "true"
severity: page
@@ -256,7 +258,7 @@
{{- end }}
for: 30m
labels:
area: kaas
area: platform
cancel_if_outside_working_hours: "true"
cancel_if_kube_state_metrics_down: "true"
severity: page
@@ -284,7 +286,7 @@
{{- end }}
for: 30m
labels:
area: kaas
area: platform
cancel_if_outside_working_hours: "true"
cancel_if_kube_state_metrics_down: "true"
severity: page
@@ -312,7 +314,7 @@
{{- end }}
for: 30m
labels:
area: kaas
area: platform
cancel_if_outside_working_hours: "true"
cancel_if_kube_state_metrics_down: "true"
severity: page
@@ -340,7 +342,7 @@
{{- end }}
for: 30m
labels:
area: kaas
area: platform
cancel_if_outside_working_hours: "true"
cancel_if_kube_state_metrics_down: "true"
severity: page
@@ -368,7 +370,7 @@
{{- end }}
for: 30m
labels:
area: kaas
area: platform
cancel_if_outside_working_hours: "true"
cancel_if_kube_state_metrics_down: "true"
severity: page
@@ -22,7 +22,7 @@ spec:
) > 5
for: 5m
labels:
area: managedservices
area: platform
# This label is used to ensure the alert goes through even for non-stable installations
all_pipelines: "true"
cancel_if_outside_working_hours: "true"
@@ -41,7 +41,7 @@ spec:
> 10
for: 120m
labels:
area: managedservices
area: platform
cancel_if_cluster_control_plane_unhealthy: "true"
cancel_if_cluster_status_creating: "true"
cancel_if_cluster_status_deleting: "true"
@@ -58,7 +58,7 @@ spec:
expr: |
sum(increase(loki_panic_total[10m])) by (cluster_id, installation, provider, pipeline, namespace, job) > 0
labels:
area: managedservices
area: platform
cancel_if_cluster_control_plane_unhealthy: "true"
cancel_if_cluster_status_creating: "true"
cancel_if_cluster_status_deleting: "true"
@@ -73,10 +73,10 @@ spec:
description: '{{`Loki pod {{ $labels.pod }} (namespace {{ $labels.namespace }}) sees {{ $value }} unhealthy ring members`}}'
opsrecipe: loki/
expr: |
sum by (app, cluster_id, container, customer, installation, provider, pipeline, name, namespace, organization, pod) (cortex_ring_members{state="Unhealthy", app="loki"}) > 0
sum by (job, cluster_id, container, customer, installation, provider, pipeline, name, namespace, organization, pod) (cortex_ring_members{state="Unhealthy", job=~"loki/.*"}) > 0
for: 30m
labels:
area: managedservices
area: platform
cancel_if_cluster_control_plane_unhealthy: "true"
cancel_if_cluster_status_creating: "true"
cancel_if_cluster_status_deleting: "true"