Skip to content

Commit

Permalink
remove deprecated app labels for ksm metrics (#1373)
Browse files Browse the repository at this point in the history
  • Loading branch information
QuentinBisson authored Sep 23, 2024
1 parent 671c69f commit afa68df
Show file tree
Hide file tree
Showing 6 changed files with 34 additions and 48 deletions.
3 changes: 2 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
### Fixed

- Dashboard links in alertmanager and mimir rules
- Remove deprecated app labels for external-dns and ingress-nginx alerts.
- Remove deprecated app labels for `external-dns` and `ingress-nginx` alerts.
- Remove deprecated app labels for `kube-state-metrics` alerts.
- Fix the falco events alerts to use the `hostname` label instead of `node`, as the `node` label does not exist.

## [4.15.2] - 2024-09-17
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -482,7 +482,7 @@ spec:
)
record: node_namespace_pod_container:container_memory_swap
- expr: |
kube_pod_container_resource_requests{resource="memory",app="kube-state-metrics"} * on (namespace, pod, cluster_id, installation, pipeline, provider)
kube_pod_container_resource_requests{resource="memory",job="kube-state-metrics"} * on (namespace, pod, cluster_id, installation, pipeline, provider)
group_left() max by (namespace, pod, cluster_id, installation, pipeline, provider) (
(kube_pod_status_phase{phase=~"Pending|Running"} == 1)
)
Expand All @@ -491,15 +491,15 @@ spec:
sum by (namespace, cluster_id, installation, pipeline, provider) (
sum by (namespace, pod, cluster_id, installation, pipeline, provider) (
max by (namespace, pod, container, cluster_id, installation, pipeline, provider) (
kube_pod_container_resource_requests{resource="memory",app="kube-state-metrics"}
kube_pod_container_resource_requests{resource="memory",job="kube-state-metrics"}
) * on(namespace, pod, cluster_id, installation, pipeline, provider) group_left() max by (namespace, pod, cluster_id, installation, pipeline, provider) (
kube_pod_status_phase{phase=~"Pending|Running"} == 1
)
)
)
record: namespace_memory:kube_pod_container_resource_requests:sum
- expr: |
kube_pod_container_resource_requests{resource="cpu",app="kube-state-metrics"} * on (namespace, pod, cluster_id, installation, pipeline, provider)
kube_pod_container_resource_requests{resource="cpu",job="kube-state-metrics"} * on (namespace, pod, cluster_id, installation, pipeline, provider)
group_left() max by (namespace, pod, cluster_id, installation, pipeline, provider) (
(kube_pod_status_phase{phase=~"Pending|Running"} == 1)
)
Expand All @@ -508,15 +508,15 @@ spec:
sum by (namespace, cluster_id, installation, pipeline, provider) (
sum by (namespace, pod, cluster_id, installation, pipeline, provider) (
max by (namespace, pod, container, cluster_id, installation, pipeline, provider) (
kube_pod_container_resource_requests{resource="cpu",app="kube-state-metrics"}
kube_pod_container_resource_requests{resource="cpu",job="kube-state-metrics"}
) * on(namespace, pod, cluster_id, installation, pipeline, provider) group_left() max by (namespace, pod, cluster_id, installation, pipeline, provider) (
kube_pod_status_phase{phase=~"Pending|Running"} == 1
)
)
)
record: namespace_cpu:kube_pod_container_resource_requests:sum
- expr: |
kube_pod_container_resource_limits{resource="memory",app="kube-state-metrics"} * on (namespace, pod, cluster_id, installation, pipeline, provider)
kube_pod_container_resource_limits{resource="memory",job="kube-state-metrics"} * on (namespace, pod, cluster_id, installation, pipeline, provider)
group_left() max by (namespace, pod, cluster_id, installation, pipeline, provider) (
(kube_pod_status_phase{phase=~"Pending|Running"} == 1)
)
Expand All @@ -525,15 +525,15 @@ spec:
sum by (namespace, cluster_id, installation, pipeline, provider) (
sum by (namespace, pod, cluster_id, installation, pipeline, provider) (
max by (namespace, pod, container, cluster_id, installation, pipeline, provider) (
kube_pod_container_resource_limits{resource="memory",app="kube-state-metrics"}
kube_pod_container_resource_limits{resource="memory",job="kube-state-metrics"}
) * on(namespace, pod, cluster_id, installation, pipeline, provider) group_left() max by (namespace, pod, cluster_id, installation, pipeline, provider) (
kube_pod_status_phase{phase=~"Pending|Running"} == 1
)
)
)
record: namespace_memory:kube_pod_container_resource_limits:sum
- expr: |
kube_pod_container_resource_limits{resource="cpu",app="kube-state-metrics"} * on (namespace, pod, cluster_id, installation, pipeline, provider)
kube_pod_container_resource_limits{resource="cpu",job="kube-state-metrics"} * on (namespace, pod, cluster_id, installation, pipeline, provider)
group_left() max by (namespace, pod, cluster_id, installation, pipeline, provider) (
(kube_pod_status_phase{phase=~"Pending|Running"} == 1)
)
Expand All @@ -542,7 +542,7 @@ spec:
sum by (namespace, cluster_id, installation, pipeline, provider) (
sum by (namespace, pod, cluster_id, installation, pipeline, provider) (
max by (namespace, pod, container, cluster_id, installation, pipeline, provider) (
kube_pod_container_resource_limits{resource="cpu",app="kube-state-metrics"}
kube_pod_container_resource_limits{resource="cpu",job="kube-state-metrics"}
) * on(namespace, pod, cluster_id, installation, pipeline, provider) group_left() max by (namespace, pod, cluster_id, installation, pipeline, provider) (
kube_pod_status_phase{phase=~"Pending|Running"} == 1
)
Expand All @@ -553,11 +553,11 @@ spec:
max by (cluster_id, installation, pipeline, provider, namespace, workload, pod) (
label_replace(
label_replace(
kube_pod_owner{app="kube-state-metrics", owner_kind="ReplicaSet"},
kube_pod_owner{job="kube-state-metrics", owner_kind="ReplicaSet"},
"replicaset", "$1", "owner_name", "(.*)"
) * on(replicaset, namespace) group_left(owner_name) topk by(replicaset, namespace) (
1, max by (replicaset, namespace, owner_name) (
kube_replicaset_owner{app="kube-state-metrics"}
kube_replicaset_owner{job="kube-state-metrics"}
)
),
"workload", "$1", "owner_name", "(.*)"
Expand All @@ -569,7 +569,7 @@ spec:
- expr: |
max by (cluster_id, installation, pipeline, provider, namespace, workload, pod) (
label_replace(
kube_pod_owner{app="kube-state-metrics", owner_kind="DaemonSet"},
kube_pod_owner{job="kube-state-metrics", owner_kind="DaemonSet"},
"workload", "$1", "owner_name", "(.*)"
)
)
Expand All @@ -579,7 +579,7 @@ spec:
- expr: |
max by (cluster_id, installation, pipeline, provider, namespace, workload, pod) (
label_replace(
kube_pod_owner{app="kube-state-metrics", owner_kind="StatefulSet"},
kube_pod_owner{job="kube-state-metrics", owner_kind="StatefulSet"},
"workload", "$1", "owner_name", "(.*)"
)
)
Expand All @@ -589,7 +589,7 @@ spec:
- expr: |
max by (cluster_id, installation, pipeline, provider, namespace, workload, pod) (
label_replace(
kube_pod_owner{app="kube-state-metrics", owner_kind="Job"},
kube_pod_owner{job="kube-state-metrics", owner_kind="Job"},
"workload", "$1", "owner_name", "(.*)"
)
)
Expand Down Expand Up @@ -648,7 +648,7 @@ spec:
- expr: |
topk by(cluster_id, installation, pipeline, provider, namespace, pod) (1,
max by (cluster_id, installation, pipeline, provider, node, namespace, pod) (
label_replace(kube_pod_info{app="kube-state-metrics",node!=""}, "pod", "$1", "pod", "(.*)")
label_replace(kube_pod_info{job="kube-state-metrics",node!=""}, "pod", "$1", "pod", "(.*)")
))
record: 'node_namespace_pod:kube_pod_info:'
- expr: |
Expand Down Expand Up @@ -697,4 +697,4 @@ spec:
histogram_quantile(0.5, sum(rate(kubelet_pleg_relist_duration_seconds_bucket[5m])) by (cluster_id, installation, pipeline, provider, instance, le) * on(cluster_id, installation, pipeline, provider, instance) group_left(node) kubelet_node_name{app="kubelet"})
labels:
quantile: "0.5"
record: node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile
record: node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile
Original file line number Diff line number Diff line change
Expand Up @@ -16,16 +16,7 @@ spec:
opsrecipe: kube-state-metrics-down/
{{- if not .Values.mimir.enabled }}
expr: |-
(
# modern clusters
label_replace(up{job="kube-state-metrics",instance=~".*:8080"}, "ip", "$1.$2.$3.$4", "node", "ip-(\\d+)-(\\d+)-(\\d+)-(\\d+).*") == 0 or absent(up{job="kube-state-metrics",instance=~".*:8080"} == 1)
)
and
(
# vintage clusters without servicemonitor
# We need to keep the app label until all clusters are migrated to a release >= 18.2. TODO(@giantswarm/team-atlas): Remove when this is the case
label_replace(up{app="kube-state-metrics",container=""}, "ip", "$1.$2.$3.$4", "node", "ip-(\\d+)-(\\d+)-(\\d+)-(\\d+).*") == 0 or absent(up{app="kube-state-metrics",container=""} == 1)
)
label_replace(up{job="kube-state-metrics",instance=~".*:8080"}, "ip", "$1.$2.$3.$4", "node", "ip-(\\d+)-(\\d+)-(\\d+)-(\\d+).*") == 0 or absent(up{job="kube-state-metrics",instance=~".*:8080"} == 1)
{{- else }}
expr: |-
count by (cluster_id, installation, provider, pipeline) (label_replace(up{job="kube-state-metrics", instance=~".*:8080"}, "ip", "$1.$2.$3.$4", "node", "ip-(\\d+)-(\\d+)-(\\d+)-(\\d+).*")) == 0
Expand Down Expand Up @@ -79,8 +70,7 @@ spec:
opsrecipe: kube-state-metrics-down/
expr: |-
# When it looks up but we don't have metrics
# We need to keep the app label until all clusters are migrated to a release >= 18.2. TODO(@giantswarm/team-atlas): Remove when this is the case
count({job="kube-state-metrics", __name__=~"kube_.+"} or {app="kube-state-metrics", __name__=~"kube_.+"}) by (cluster_id, installation, provider, pipeline) <= 100
count({job="kube-state-metrics", __name__=~"kube_.+"}) by (cluster_id, installation, provider, pipeline) <= 100
for: 20m
labels:
area: platform
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,15 +5,15 @@ rule_files:
tests:
- interval: 1m
input_series:
- series: 'kube_deployment_status_replicas_unavailable{app="kube-state-metrics", cluster_id="gauss", cluster_type="management_cluster", container="kube-state-metrics", customer="giantswarm", deployment="crossplane", installation="gauss", instance="100.64.5.122:8080", job="gauss-prometheus/workload-gauss/0", namespace="crossplane", node="ip-10-0-5-119.eu-west-1.compute.internal", organization="giantswarm", pod="kube-state-metrics-95bbb4bd7-v6hvh", provider="aws", service_priority="highest"}'
- series: 'kube_deployment_status_replicas_unavailable{job="kube-state-metrics", cluster_id="gauss", cluster_type="management_cluster", container="kube-state-metrics", customer="giantswarm", deployment="crossplane", installation="gauss", instance="100.64.5.122:8080", namespace="crossplane", node="ip-10-0-5-119.eu-west-1.compute.internal", organization="giantswarm", pod="kube-state-metrics-95bbb4bd7-v6hvh", provider="aws", service_priority="highest"}'
values: "0+0x20 1+0x100"
alert_rule_test:
- alertname: CrossplaneDeploymentNotSatisfied
eval_time: 60m
exp_alerts:
- exp_labels:
alertname: CrossplaneDeploymentNotSatisfied
app: kube-state-metrics
job: kube-state-metrics
area: platform
cancel_if_cluster_status_creating: "true"
cancel_if_cluster_status_deleting: "true"
Expand All @@ -26,7 +26,6 @@ tests:
deployment: crossplane
installation: gauss
instance: 100.64.5.122:8080
job: gauss-prometheus/workload-gauss/0
namespace: crossplane
node: ip-10-0-5-119.eu-west-1.compute.internal
organization: giantswarm
Expand All @@ -41,15 +40,15 @@ tests:
opsrecipe: "deployment-not-satisfied/"
- interval: 1m
input_series:
- series: 'kube_deployment_status_replicas_unavailable{app="kube-state-metrics", cluster_id="gauss", cluster_type="management_cluster", container="kube-state-metrics", customer="giantswarm", deployment="caicloud-event-exporter", installation="gauss", instance="100.64.5.122:8080", job="gauss-prometheus/workload-gauss/0", namespace="crossplane", node="ip-10-0-5-119.eu-west-1.compute.internal", organization="giantswarm", pod="kube-state-metrics-95bbb4bd7-v6hvh", provider="aws", service_priority="highest"}'
- series: 'kube_deployment_status_replicas_unavailable{job="kube-state-metrics", cluster_id="gauss", cluster_type="management_cluster", container="kube-state-metrics", customer="giantswarm", deployment="caicloud-event-exporter", installation="gauss", instance="100.64.5.122:8080", namespace="crossplane", node="ip-10-0-5-119.eu-west-1.compute.internal", organization="giantswarm", pod="kube-state-metrics-95bbb4bd7-v6hvh", provider="aws", service_priority="highest"}'
values: "0+0x20 1+0x100"
alert_rule_test:
- alertname: CrossplaneDeploymentNotSatisfied
eval_time: 51m
exp_alerts:
- exp_labels:
alertname: CrossplaneDeploymentNotSatisfied
app: kube-state-metrics
job: kube-state-metrics
area: platform
cancel_if_cluster_status_creating: "true"
cancel_if_cluster_status_deleting: "true"
Expand All @@ -62,7 +61,6 @@ tests:
deployment: caicloud-event-exporter
installation: gauss
instance: 100.64.5.122:8080
job: gauss-prometheus/workload-gauss/0
namespace: crossplane
node: ip-10-0-5-119.eu-west-1.compute.internal
organization: giantswarm
Expand Down
Loading

0 comments on commit afa68df

Please sign in to comment.