From bb1f4ea9cd334f92d0848d53fb345b667246c31b Mon Sep 17 00:00:00 2001
From: Quentin Bisson
Date: Mon, 10 Jun 2024 21:18:58 +0200
Subject: [PATCH] Fix some shared alert ownership (#1228)

* Fix some shared alert ownership

Signed-off-by: QuentinBisson

* Update helm/prometheus-rules/templates/kaas/turtles/alerting-rules/kubelet.rules.yml

---------

Signed-off-by: QuentinBisson
---
 .../phoenix/alerting-rules/aws.job.rules.yml  |  2 +-
 .../aws.node.workload-cluster.yml             | 42 ++++++++++++++++
 .../turtles}/alerting-rules/job.rules.yml     |  0
 .../turtles/alerting-rules/kubelet.rules.yml  | 19 ++++++++
 .../node.management-cluster.rules.yml}        |  0
 .../node.workload-cluster.rules.yml}          | 31 +-----------
 .../alerting-rules/chart.rules.yml            | 19 ++++++++
 .../shared/alerting-rules/up.rules.yml        | 48 -------------------
 ....rules.test.yml => kyverno.rules.test.yml} |  2 +-
 9 files changed, 83 insertions(+), 80 deletions(-)
 create mode 100644 helm/prometheus-rules/templates/kaas/phoenix/alerting-rules/aws.node.workload-cluster.yml
 rename helm/prometheus-rules/templates/{shared => kaas/turtles}/alerting-rules/job.rules.yml (100%)
 rename helm/prometheus-rules/templates/{shared/alerting-rules/node.management_cluster.rules.yml => kaas/turtles/alerting-rules/node.management-cluster.rules.yml} (100%)
 rename helm/prometheus-rules/templates/{shared/alerting-rules/node.workload_cluster.rules.yml => kaas/turtles/alerting-rules/node.workload-cluster.rules.yml} (82%)
 delete mode 100644 helm/prometheus-rules/templates/shared/alerting-rules/up.rules.yml
 rename test/tests/providers/global/platform/shield/alerting-rules/{kyverno.all.rules.test.yml => kyverno.rules.test.yml} (99%)

diff --git a/helm/prometheus-rules/templates/kaas/phoenix/alerting-rules/aws.job.rules.yml b/helm/prometheus-rules/templates/kaas/phoenix/alerting-rules/aws.job.rules.yml
index b906ad11e..37544d7ef 100644
--- a/helm/prometheus-rules/templates/kaas/phoenix/alerting-rules/aws.job.rules.yml
+++ b/helm/prometheus-rules/templates/kaas/phoenix/alerting-rules/aws.job.rules.yml
@@ -1,4 +1,4 @@
-## TODO Remove with vintage
+## TODO Remove when all vintage installations are gone
 # This rule applies to vintage aws management clusters
 {{- if eq .Values.managementCluster.provider.flavor "vintage" }}
 apiVersion: monitoring.coreos.com/v1
diff --git a/helm/prometheus-rules/templates/kaas/phoenix/alerting-rules/aws.node.workload-cluster.yml b/helm/prometheus-rules/templates/kaas/phoenix/alerting-rules/aws.node.workload-cluster.yml
new file mode 100644
index 000000000..912fd2544
--- /dev/null
+++ b/helm/prometheus-rules/templates/kaas/phoenix/alerting-rules/aws.node.workload-cluster.yml
@@ -0,0 +1,42 @@
+{{- if eq .Values.managementCluster.provider.flavor "vintage" }}
+## TODO Remove when all vintage installations are gone
+apiVersion: monitoring.coreos.com/v1
+kind: PrometheusRule
+metadata:
+  creationTimestamp: null
+  labels:
+    {{- include "labels.common" . | nindent 4 }}
+    # No need for .Values.mimir.enabled condition - will be gone with Vintage
+    cluster_type: "workload_cluster"
+  name: aws.node.workload-cluster.rules
+  namespace: {{ .Values.namespace }}
+spec:
+  groups:
+  - name: aws.node
+    rules:
+    - alert: AWSWorkloadClusterNodeTooManyAutoTermination
+      annotations:
+        description: '{{`Cluster {{ $labels.cluster_id }} has too many nodes terminated by node auto termination feature in a short time.`}}'
+        opsrecipe: node-too-many-auto-termination-aws/
+      expr: increase(aws_operator_unhealthy_node_termination_count[60m]) > 10
+      for: 15m
+      labels:
+        area: kaas
+        cancel_if_cluster_status_creating: "true"
+        cancel_if_cluster_status_deleting: "true"
+        cancel_if_cluster_status_updating: "true"
+        severity: page
+        team: phoenix
+        topic: kubernetes
+    - alert: WorkloadClusterNodeUnexpectedTaintNodeWithImpairedVolumes
+      annotations:
+        description: '{{`Node {{ $labels.node }} has unexpected taint NodeWithImpairedVolumes`}}'
+        opsrecipe: aws-node-taint-NodeWithImpairedVolumes/
+      expr: kube_node_spec_taint{key="NodeWithImpairedVolumes"}
+      for: 30m
+      labels:
+        area: kaas
+        severity: notify
+        team: {{ include "providerTeam" . }}
+        topic: kubernetes
+{{- end }}
diff --git a/helm/prometheus-rules/templates/shared/alerting-rules/job.rules.yml b/helm/prometheus-rules/templates/kaas/turtles/alerting-rules/job.rules.yml
similarity index 100%
rename from helm/prometheus-rules/templates/shared/alerting-rules/job.rules.yml
rename to helm/prometheus-rules/templates/kaas/turtles/alerting-rules/job.rules.yml
diff --git a/helm/prometheus-rules/templates/kaas/turtles/alerting-rules/kubelet.rules.yml b/helm/prometheus-rules/templates/kaas/turtles/alerting-rules/kubelet.rules.yml
index 3c3c003f4..ea0163a88 100644
--- a/helm/prometheus-rules/templates/kaas/turtles/alerting-rules/kubelet.rules.yml
+++ b/helm/prometheus-rules/templates/kaas/turtles/alerting-rules/kubelet.rules.yml
@@ -8,6 +8,25 @@ metadata:
   namespace: {{ .Values.namespace }}
 spec:
   groups:
+  - name: cadvisor
+    rules:
+    - alert: CadvisorDown
+      annotations:
+        description: '{{`Cadvisor ({{ $labels.instance }}) is down.`}}'
+        opsrecipe: kubelet-is-down/
+      expr: label_replace(up{job="kubelet", metrics_path="/metrics/cadvisor"}, "ip", "$1", "instance", "(.+):\\d+") == 0
+      for: 1h
+      labels:
+        area: kaas
+        cancel_if_cluster_status_creating: "true"
+        cancel_if_cluster_status_deleting: "true"
+        cancel_if_kubelet_down: "true"
+        cancel_if_cluster_has_no_workers: "true"
+        cancel_if_outside_working_hours: "true"
+        cancel_if_prometheus_agent_down: "true"
+        severity: page
+        team: {{ include "providerTeam" . }}
+        topic: kubernetes
   - name: kubelet
     rules:
     - alert: KubeletConditionBad
diff --git a/helm/prometheus-rules/templates/shared/alerting-rules/node.management_cluster.rules.yml b/helm/prometheus-rules/templates/kaas/turtles/alerting-rules/node.management-cluster.rules.yml
similarity index 100%
rename from helm/prometheus-rules/templates/shared/alerting-rules/node.management_cluster.rules.yml
rename to helm/prometheus-rules/templates/kaas/turtles/alerting-rules/node.management-cluster.rules.yml
diff --git a/helm/prometheus-rules/templates/shared/alerting-rules/node.workload_cluster.rules.yml b/helm/prometheus-rules/templates/kaas/turtles/alerting-rules/node.workload-cluster.rules.yml
similarity index 82%
rename from helm/prometheus-rules/templates/shared/alerting-rules/node.workload_cluster.rules.yml
rename to helm/prometheus-rules/templates/kaas/turtles/alerting-rules/node.workload-cluster.rules.yml
index 17148054b..2299ab01a 100644
--- a/helm/prometheus-rules/templates/shared/alerting-rules/node.workload_cluster.rules.yml
+++ b/helm/prometheus-rules/templates/kaas/turtles/alerting-rules/node.workload-cluster.rules.yml
@@ -7,7 +7,7 @@ metadata:
     {{- if not .Values.mimir.enabled }}
     cluster_type: "workload_cluster"
     {{- end }}
-  name: node.rules
+  name: node.workload-cluster.rules
   namespace: {{ .Values.namespace }}
 spec:
   groups:
@@ -27,22 +27,6 @@ spec:
         severity: notify
         team: {{ include "providerTeam" . }}
         topic: kubernetes
-    {{- if eq .Values.managementCluster.provider.kind "aws" }}
-    - alert: AWSWorkloadClusterNodeTooManyAutoTermination
-      annotations:
-        description: '{{`Cluster {{ $labels.cluster_id }} has too many nodes terminated by node auto termination feature in a short time.`}}'
-        opsrecipe: node-too-many-auto-termination-aws/
-      expr: increase(aws_operator_unhealthy_node_termination_count[60m]) > 10
-      for: 15m
-      labels:
-        area: kaas
-        cancel_if_cluster_status_creating: "true"
-        cancel_if_cluster_status_deleting: "true"
-        cancel_if_cluster_status_updating: "true"
-        severity: page
-        team: phoenix
-        topic: kubernetes
-    {{- end }}
     - alert: NodeStateFlappingUnderLoad
       # Check if the kubelet status is flapping, unless the node is under load.
       # It helps to read this rule from the bottom upwards.
@@ -126,19 +110,6 @@ spec:
         severity: notify
         team: {{ include "providerTeam" . }}
         topic: infrastructure
-    {{- if eq .Values.managementCluster.provider.kind "aws" }}
-    - alert: WorkloadClusterNodeUnexpectedTaintNodeWithImpairedVolumes
-      annotations:
-        description: '{{`Node {{ $labels.node }} has unexpected taint NodeWithImpairedVolumes`}}'
-        opsrecipe: aws-node-taint-NodeWithImpairedVolumes/
-      expr: kube_node_spec_taint{key="NodeWithImpairedVolumes"}
-      for: 30m
-      labels:
-        area: kaas
-        severity: notify
-        team: {{ include "providerTeam" . }}
-        topic: kubernetes
-    {{- end }}
     - alert: WorkloadClusterMasterMemoryUsageTooHigh
       annotations:
         description: '{{`Machine {{ $labels.instance }} memory usage is too high (less than 10% and 2G of allocatable memory).`}}'
diff --git a/helm/prometheus-rules/templates/platform/honeybadger/alerting-rules/chart.rules.yml b/helm/prometheus-rules/templates/platform/honeybadger/alerting-rules/chart.rules.yml
index 0616a0149..e5038175e 100644
--- a/helm/prometheus-rules/templates/platform/honeybadger/alerting-rules/chart.rules.yml
+++ b/helm/prometheus-rules/templates/platform/honeybadger/alerting-rules/chart.rules.yml
@@ -10,6 +10,25 @@ spec:
   groups:
   - name: chart
     rules:
+    - alert: ChartOperatorDown
+      annotations:
+        description: '{{`ChartOperator ({{ $labels.instance }}) is down.`}}'
+        opsrecipe: chart-operator-down/
+      expr: label_replace(up{app=~"chart-operator.*"}, "ip", "$1.$2.$3.$4", "node", "ip-(\\d+)-(\\d+)-(\\d+)-(\\d+).*") == 0
+      for: 15m
+      labels:
+        area: platform
+        cancel_if_cluster_control_plane_unhealthy: "true"
+        cancel_if_cluster_status_creating: "true"
+        cancel_if_cluster_status_deleting: "true"
+        cancel_if_cluster_status_updating: "true"
+        cancel_if_kubelet_down: "true"
+        cancel_if_cluster_has_no_workers: "true"
+        cancel_if_outside_working_hours: {{ include "workingHoursOnly" . }}
+        cancel_if_prometheus_agent_down: "true"
+        severity: notify
+        team: honeybadger
+        topic: releng
     - alert: ChartOrphanConfigMap
       annotations:
         description: '{{`Chart configmaps have not been deleted.`}}'
diff --git a/helm/prometheus-rules/templates/shared/alerting-rules/up.rules.yml b/helm/prometheus-rules/templates/shared/alerting-rules/up.rules.yml
deleted file mode 100644
index f0cf0bdd5..000000000
--- a/helm/prometheus-rules/templates/shared/alerting-rules/up.rules.yml
+++ /dev/null
@@ -1,48 +0,0 @@
-apiVersion: monitoring.coreos.com/v1
-kind: PrometheusRule
-metadata:
-  creationTimestamp: null
-  labels:
-    {{- include "labels.common" . | nindent 4 }}
-  name: up.all.rules
-  namespace: {{ .Values.namespace }}
-spec:
-  groups:
-  - name: up.all
-    rules:
-    - alert: ChartOperatorDown
-      annotations:
-        description: '{{`ChartOperator ({{ $labels.instance }}) is down.`}}'
-        opsrecipe: chart-operator-down/
-      expr: label_replace(up{app=~"chart-operator.*"}, "ip", "$1.$2.$3.$4", "node", "ip-(\\d+)-(\\d+)-(\\d+)-(\\d+).*") == 0
-      for: 15m
-      labels:
-        area: platform
-        cancel_if_cluster_control_plane_unhealthy: "true"
-        cancel_if_cluster_status_creating: "true"
-        cancel_if_cluster_status_deleting: "true"
-        cancel_if_cluster_status_updating: "true"
-        cancel_if_kubelet_down: "true"
-        cancel_if_cluster_has_no_workers: "true"
-        cancel_if_outside_working_hours: {{ include "workingHoursOnly" . }}
-        cancel_if_prometheus_agent_down: "true"
-        severity: notify
-        team: honeybadger
-        topic: releng
-    - alert: CadvisorDown
-      annotations:
-        description: '{{`Cadvisor ({{ $labels.instance }}) is down.`}}'
-        opsrecipe: kubelet-is-down/
-      expr: label_replace(up{job="kubelet", metrics_path="/metrics/cadvisor"}, "ip", "$1", "instance", "(.+):\\d+") == 0
-      for: 1h
-      labels:
-        area: kaas
-        cancel_if_cluster_status_creating: "true"
-        cancel_if_cluster_status_deleting: "true"
-        cancel_if_kubelet_down: "true"
-        cancel_if_cluster_has_no_workers: "true"
-        cancel_if_outside_working_hours: "true"
-        cancel_if_prometheus_agent_down: "true"
-        severity: page
-        team: phoenix
-        topic: kubernetes
diff --git a/test/tests/providers/global/platform/shield/alerting-rules/kyverno.all.rules.test.yml b/test/tests/providers/global/platform/shield/alerting-rules/kyverno.rules.test.yml
similarity index 99%
rename from test/tests/providers/global/platform/shield/alerting-rules/kyverno.all.rules.test.yml
rename to test/tests/providers/global/platform/shield/alerting-rules/kyverno.rules.test.yml
index aff336e2d..46bf9b7c3 100644
--- a/test/tests/providers/global/platform/shield/alerting-rules/kyverno.all.rules.test.yml
+++ b/test/tests/providers/global/platform/shield/alerting-rules/kyverno.rules.test.yml
@@ -1,6 +1,6 @@
 ---
 rule_files:
-  - kyverno.all.rules.yml
+  - kyverno.rules.yml
 tests:
   - interval: 1m
     input_series:
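
Note (not part of the patch): since CadvisorDown now lives in kubelet.rules.yml, a promtool unit test in the style of the existing files under test/tests/providers/ could cover the moved alert. The sketch below is illustrative only: the test file name, the input series, the cluster_id/instance values, and the assumption that "providerTeam" renders to "turtles" are all hypothetical, not taken from the repository.

rule_files:
  - kubelet.rules.yml
tests:
  - interval: 1m
    input_series:
      # cadvisor target on the kubelet stays down (up == 0) for 70 minutes
      - series: 'up{job="kubelet", metrics_path="/metrics/cadvisor", instance="10.0.5.11:10250", cluster_id="test01"}'
        values: '0+0x70'
    alert_rule_test:
      # before the 1h "for" duration has elapsed, nothing fires
      - alertname: CadvisorDown
        eval_time: 30m
        exp_alerts: []
      # after 1h of up == 0 the alert fires with the rule labels plus the
      # series labels and the "ip" label added by label_replace
      - alertname: CadvisorDown
        eval_time: 65m
        exp_alerts:
          - exp_labels:
              area: kaas
              cancel_if_cluster_status_creating: "true"
              cancel_if_cluster_status_deleting: "true"
              cancel_if_kubelet_down: "true"
              cancel_if_cluster_has_no_workers: "true"
              cancel_if_outside_working_hours: "true"
              cancel_if_prometheus_agent_down: "true"
              cluster_id: test01
              instance: 10.0.5.11:10250
              ip: 10.0.5.11
              job: kubelet
              metrics_path: /metrics/cadvisor
              severity: page
              team: turtles
              topic: kubernetes
            exp_annotations:
              description: 'Cadvisor (10.0.5.11:10250) is down.'
              opsrecipe: kubelet-is-down/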