remove deprecated app labels for ksm metrics (#1373)

giantswarm · Sep 23, 2024 · afa68df
1 parent 671c69f
commit afa68df
Show file tree

Hide file tree

Showing 6 changed files with 34 additions and 48 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -14,7 +14,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 ### Fixed
 
 - Dashboard links in alertmanager and mimir rules
-- Remove deprecated app labels for external-dns and ingress-nginx alerts.
+- Remove deprecated app labels for `external-dns` and `ingress-nginx` alerts.
+- Remove deprecated app labels for `kube-state-metrics` alerts.
 - Fix falco events alerts node label to hostname as node does not exist.
 
 ## [4.15.2] - 2024-09-17

diff --git a/helm/prometheus-rules/templates/kaas/turtles/recording-rules/kubernetes-mixins.rules.yml b/helm/prometheus-rules/templates/kaas/turtles/recording-rules/kubernetes-mixins.rules.yml
@@ -482,7 +482,7 @@ spec:
           )
         record: node_namespace_pod_container:container_memory_swap
       - expr: |
-          kube_pod_container_resource_requests{resource="memory",app="kube-state-metrics"}  * on (namespace, pod, cluster_id, installation, pipeline, provider)
+          kube_pod_container_resource_requests{resource="memory",job="kube-state-metrics"}  * on (namespace, pod, cluster_id, installation, pipeline, provider)
           group_left() max by (namespace, pod, cluster_id, installation, pipeline, provider) (
             (kube_pod_status_phase{phase=~"Pending|Running"} == 1)
           )
@@ -491,15 +491,15 @@ spec:
           sum by (namespace, cluster_id, installation, pipeline, provider) (
               sum by (namespace, pod, cluster_id, installation, pipeline, provider) (
                   max by (namespace, pod, container, cluster_id, installation, pipeline, provider) (
-                    kube_pod_container_resource_requests{resource="memory",app="kube-state-metrics"}
+                    kube_pod_container_resource_requests{resource="memory",job="kube-state-metrics"}
                   ) * on(namespace, pod, cluster_id, installation, pipeline, provider) group_left() max by (namespace, pod, cluster_id, installation, pipeline, provider) (
                     kube_pod_status_phase{phase=~"Pending|Running"} == 1
                   )
               )
           )
         record: namespace_memory:kube_pod_container_resource_requests:sum
       - expr: |
-          kube_pod_container_resource_requests{resource="cpu",app="kube-state-metrics"}  * on (namespace, pod, cluster_id, installation, pipeline, provider)
+          kube_pod_container_resource_requests{resource="cpu",job="kube-state-metrics"}  * on (namespace, pod, cluster_id, installation, pipeline, provider)
           group_left() max by (namespace, pod, cluster_id, installation, pipeline, provider) (
             (kube_pod_status_phase{phase=~"Pending|Running"} == 1)
           )
@@ -508,15 +508,15 @@ spec:
           sum by (namespace, cluster_id, installation, pipeline, provider) (
               sum by (namespace, pod, cluster_id, installation, pipeline, provider) (
                   max by (namespace, pod, container, cluster_id, installation, pipeline, provider) (
-                    kube_pod_container_resource_requests{resource="cpu",app="kube-state-metrics"}
+                    kube_pod_container_resource_requests{resource="cpu",job="kube-state-metrics"}
                   ) * on(namespace, pod, cluster_id, installation, pipeline, provider) group_left() max by (namespace, pod, cluster_id, installation, pipeline, provider) (
                     kube_pod_status_phase{phase=~"Pending|Running"} == 1
                   )
               )
           )
         record: namespace_cpu:kube_pod_container_resource_requests:sum
       - expr: |
-          kube_pod_container_resource_limits{resource="memory",app="kube-state-metrics"}  * on (namespace, pod, cluster_id, installation, pipeline, provider)
+          kube_pod_container_resource_limits{resource="memory",job="kube-state-metrics"}  * on (namespace, pod, cluster_id, installation, pipeline, provider)
           group_left() max by (namespace, pod, cluster_id, installation, pipeline, provider) (
             (kube_pod_status_phase{phase=~"Pending|Running"} == 1)
           )
@@ -525,15 +525,15 @@ spec:
           sum by (namespace, cluster_id, installation, pipeline, provider) (
               sum by (namespace, pod, cluster_id, installation, pipeline, provider) (
                   max by (namespace, pod, container, cluster_id, installation, pipeline, provider) (
-                    kube_pod_container_resource_limits{resource="memory",app="kube-state-metrics"}
+                    kube_pod_container_resource_limits{resource="memory",job="kube-state-metrics"}
                   ) * on(namespace, pod, cluster_id, installation, pipeline, provider) group_left() max by (namespace, pod, cluster_id, installation, pipeline, provider) (
                     kube_pod_status_phase{phase=~"Pending|Running"} == 1
                   )
               )
           )
         record: namespace_memory:kube_pod_container_resource_limits:sum
       - expr: |
-          kube_pod_container_resource_limits{resource="cpu",app="kube-state-metrics"}  * on (namespace, pod, cluster_id, installation, pipeline, provider)
+          kube_pod_container_resource_limits{resource="cpu",job="kube-state-metrics"}  * on (namespace, pod, cluster_id, installation, pipeline, provider)
           group_left() max by (namespace, pod, cluster_id, installation, pipeline, provider) (
            (kube_pod_status_phase{phase=~"Pending|Running"} == 1)
            )
@@ -542,7 +542,7 @@ spec:
           sum by (namespace, cluster_id, installation, pipeline, provider) (
               sum by (namespace, pod, cluster_id, installation, pipeline, provider) (
                   max by (namespace, pod, container, cluster_id, installation, pipeline, provider) (
-                    kube_pod_container_resource_limits{resource="cpu",app="kube-state-metrics"}
+                    kube_pod_container_resource_limits{resource="cpu",job="kube-state-metrics"}
                   ) * on(namespace, pod, cluster_id, installation, pipeline, provider) group_left() max by (namespace, pod, cluster_id, installation, pipeline, provider) (
                     kube_pod_status_phase{phase=~"Pending|Running"} == 1
                   )
@@ -553,11 +553,11 @@ spec:
           max by (cluster_id, installation, pipeline, provider, namespace, workload, pod) (
             label_replace(
               label_replace(
-                kube_pod_owner{app="kube-state-metrics", owner_kind="ReplicaSet"},
+                kube_pod_owner{job="kube-state-metrics", owner_kind="ReplicaSet"},
                 "replicaset", "$1", "owner_name", "(.*)"
               ) * on(replicaset, namespace) group_left(owner_name) topk by(replicaset, namespace) (
                 1, max by (replicaset, namespace, owner_name) (
-                  kube_replicaset_owner{app="kube-state-metrics"}
+                  kube_replicaset_owner{job="kube-state-metrics"}
                 )
               ),
               "workload", "$1", "owner_name", "(.*)"
@@ -569,7 +569,7 @@ spec:
       - expr: |
           max by (cluster_id, installation, pipeline, provider, namespace, workload, pod) (
             label_replace(
-              kube_pod_owner{app="kube-state-metrics", owner_kind="DaemonSet"},
+              kube_pod_owner{job="kube-state-metrics", owner_kind="DaemonSet"},
               "workload", "$1", "owner_name", "(.*)"
             )
           )
@@ -579,7 +579,7 @@ spec:
       - expr: |
           max by (cluster_id, installation, pipeline, provider, namespace, workload, pod) (
             label_replace(
-              kube_pod_owner{app="kube-state-metrics", owner_kind="StatefulSet"},
+              kube_pod_owner{job="kube-state-metrics", owner_kind="StatefulSet"},
               "workload", "$1", "owner_name", "(.*)"
             )
           )
@@ -589,7 +589,7 @@ spec:
       - expr: |
           max by (cluster_id, installation, pipeline, provider, namespace, workload, pod) (
             label_replace(
-              kube_pod_owner{app="kube-state-metrics", owner_kind="Job"},
+              kube_pod_owner{job="kube-state-metrics", owner_kind="Job"},
               "workload", "$1", "owner_name", "(.*)"
             )
           )
@@ -648,7 +648,7 @@ spec:
       - expr: |
           topk by(cluster_id, installation, pipeline, provider, namespace, pod) (1,
             max by (cluster_id, installation, pipeline, provider, node, namespace, pod) (
-              label_replace(kube_pod_info{app="kube-state-metrics",node!=""}, "pod", "$1", "pod", "(.*)")
+              label_replace(kube_pod_info{job="kube-state-metrics",node!=""}, "pod", "$1", "pod", "(.*)")
           ))
         record: 'node_namespace_pod:kube_pod_info:'
       - expr: |
@@ -697,4 +697,4 @@ spec:
           histogram_quantile(0.5, sum(rate(kubelet_pleg_relist_duration_seconds_bucket[5m])) by (cluster_id, installation, pipeline, provider, instance, le) * on(cluster_id, installation, pipeline, provider, instance) group_left(node) kubelet_node_name{app="kubelet"})
         labels:
           quantile: "0.5"
-        record: node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile
+        record: node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile
diff --git a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/kube-state-metrics.rules.yml b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/kube-state-metrics.rules.yml
@@ -16,16 +16,7 @@ spec:
         opsrecipe: kube-state-metrics-down/
       {{- if not .Values.mimir.enabled }}
       expr: |-
-        (
-          # modern clusters
-          label_replace(up{job="kube-state-metrics",instance=~".*:8080"}, "ip", "$1.$2.$3.$4", "node", "ip-(\\d+)-(\\d+)-(\\d+)-(\\d+).*") == 0 or absent(up{job="kube-state-metrics",instance=~".*:8080"} == 1)
-        )
-        and
-        (
-          # vintage clusters without servicemonitor
-          # We need to keep the app label until all clusters are migrated to a release >= 18.2. TODO(@giantswarm/team-atlas): Remove when this is the case
-          label_replace(up{app="kube-state-metrics",container=""}, "ip", "$1.$2.$3.$4", "node", "ip-(\\d+)-(\\d+)-(\\d+)-(\\d+).*") == 0 or absent(up{app="kube-state-metrics",container=""} == 1)
-        )
+        label_replace(up{job="kube-state-metrics",instance=~".*:8080"}, "ip", "$1.$2.$3.$4", "node", "ip-(\\d+)-(\\d+)-(\\d+)-(\\d+).*") == 0 or absent(up{job="kube-state-metrics",instance=~".*:8080"} == 1)
       {{- else }}
       expr: |-
         count by (cluster_id, installation, provider, pipeline) (label_replace(up{job="kube-state-metrics", instance=~".*:8080"}, "ip", "$1.$2.$3.$4", "node", "ip-(\\d+)-(\\d+)-(\\d+)-(\\d+).*")) == 0
@@ -79,8 +70,7 @@ spec:
         opsrecipe: kube-state-metrics-down/
       expr: |-
         # When it looks up but we don't have metrics
-        # We need to keep the app label until all clusters are migrated to a release >= 18.2. TODO(@giantswarm/team-atlas): Remove when this is the case
-        count({job="kube-state-metrics", __name__=~"kube_.+"} or {app="kube-state-metrics", __name__=~"kube_.+"}) by (cluster_id, installation, provider, pipeline) <= 100
+        count({job="kube-state-metrics", __name__=~"kube_.+"}) by (cluster_id, installation, provider, pipeline) <= 100
       for: 20m
       labels:
         area: platform

diff --git a/test/tests/providers/global/platform/honeybadger/alerting-rules/crossplane.rules.test.yml b/test/tests/providers/global/platform/honeybadger/alerting-rules/crossplane.rules.test.yml
@@ -5,15 +5,15 @@ rule_files:
 tests:
   - interval: 1m
     input_series:
-      - series: 'kube_deployment_status_replicas_unavailable{app="kube-state-metrics", cluster_id="gauss", cluster_type="management_cluster", container="kube-state-metrics", customer="giantswarm", deployment="crossplane", installation="gauss", instance="100.64.5.122:8080", job="gauss-prometheus/workload-gauss/0", namespace="crossplane", node="ip-10-0-5-119.eu-west-1.compute.internal", organization="giantswarm", pod="kube-state-metrics-95bbb4bd7-v6hvh", provider="aws", service_priority="highest"}'
+      - series: 'kube_deployment_status_replicas_unavailable{job="kube-state-metrics", cluster_id="gauss", cluster_type="management_cluster", container="kube-state-metrics", customer="giantswarm", deployment="crossplane", installation="gauss", instance="100.64.5.122:8080", namespace="crossplane", node="ip-10-0-5-119.eu-west-1.compute.internal", organization="giantswarm", pod="kube-state-metrics-95bbb4bd7-v6hvh", provider="aws", service_priority="highest"}'
         values: "0+0x20 1+0x100"
     alert_rule_test:
       - alertname: CrossplaneDeploymentNotSatisfied
         eval_time: 60m
         exp_alerts:
           - exp_labels:
               alertname: CrossplaneDeploymentNotSatisfied
-              app: kube-state-metrics
+              job: kube-state-metrics
               area: platform
               cancel_if_cluster_status_creating: "true"
               cancel_if_cluster_status_deleting: "true"
@@ -26,7 +26,6 @@ tests:
               deployment: crossplane
               installation: gauss
               instance: 100.64.5.122:8080
-              job: gauss-prometheus/workload-gauss/0
               namespace: crossplane
               node: ip-10-0-5-119.eu-west-1.compute.internal
               organization: giantswarm
@@ -41,15 +40,15 @@ tests:
               opsrecipe: "deployment-not-satisfied/"
   - interval: 1m
     input_series:
-      - series: 'kube_deployment_status_replicas_unavailable{app="kube-state-metrics", cluster_id="gauss", cluster_type="management_cluster", container="kube-state-metrics", customer="giantswarm", deployment="caicloud-event-exporter", installation="gauss", instance="100.64.5.122:8080", job="gauss-prometheus/workload-gauss/0", namespace="crossplane", node="ip-10-0-5-119.eu-west-1.compute.internal", organization="giantswarm", pod="kube-state-metrics-95bbb4bd7-v6hvh", provider="aws", service_priority="highest"}'
+      - series: 'kube_deployment_status_replicas_unavailable{job="kube-state-metrics", cluster_id="gauss", cluster_type="management_cluster", container="kube-state-metrics", customer="giantswarm", deployment="caicloud-event-exporter", installation="gauss", instance="100.64.5.122:8080", namespace="crossplane", node="ip-10-0-5-119.eu-west-1.compute.internal", organization="giantswarm", pod="kube-state-metrics-95bbb4bd7-v6hvh", provider="aws", service_priority="highest"}'
         values: "0+0x20 1+0x100"
     alert_rule_test:
       - alertname: CrossplaneDeploymentNotSatisfied
         eval_time: 51m
         exp_alerts:
           - exp_labels:
               alertname: CrossplaneDeploymentNotSatisfied
-              app: kube-state-metrics
+              job: kube-state-metrics
               area: platform
               cancel_if_cluster_status_creating: "true"
               cancel_if_cluster_status_deleting: "true"
@@ -62,7 +61,6 @@ tests:
               deployment: caicloud-event-exporter
               installation: gauss
               instance: 100.64.5.122:8080
-              job: gauss-prometheus/workload-gauss/0
               namespace: crossplane
               node: ip-10-0-5-119.eu-west-1.compute.internal
               organization: giantswarm