Commit d48f106
Fix some shared alert ownership
Signed-off-by: QuentinBisson <[email protected]>
QuentinBisson committed Jun 10, 2024
1 parent 90fb609 commit d48f106
Showing 10 changed files with 83 additions and 80 deletions.
@@ -1,4 +1,4 @@
-## TODO Remove with vintage
+## TODO Remove when all vintage installations are gone
 # This rule applies to vintage aws management clusters
 {{- if eq .Values.managementCluster.provider.flavor "vintage" }}
 apiVersion: monitoring.coreos.com/v1
@@ -0,0 +1,42 @@
+{{- if eq .Values.managementCluster.provider.flavor "vintage" }}
+## TODO Remove when all vintage installations are gone
+apiVersion: monitoring.coreos.com/v1
+kind: PrometheusRule
+metadata:
+  creationTimestamp: null
+  labels:
+    {{- include "labels.common" . | nindent 4 }}
+    # No need for .Values.mimir.enabled condition - will be gone with Vintage
+    cluster_type: "workload_cluster"
+  name: aws.node.workload-cluster.rules
+  namespace: {{ .Values.namespace }}
+spec:
+  groups:
+  - name: aws.node
+    rules:
+    - alert: AWSWorkloadClusterNodeTooManyAutoTermination
+      annotations:
+        description: '{{`Cluster {{ $labels.cluster_id }} has too many nodes terminated by node auto termination feature in a short time.`}}'
+        opsrecipe: node-too-many-auto-termination-aws/
+      expr: increase(aws_operator_unhealthy_node_termination_count[60m]) > 10
+      for: 15m
+      labels:
+        area: kaas
+        cancel_if_cluster_status_creating: "true"
+        cancel_if_cluster_status_deleting: "true"
+        cancel_if_cluster_status_updating: "true"
+        severity: page
+        team: phoenix
+        topic: kubernetes
+    - alert: WorkloadClusterNodeUnexpectedTaintNodeWithImpairedVolumes
+      annotations:
+        description: '{{`Node {{ $labels.node }} has unexpected taint NodeWithImpairedVolumes`}}'
+        opsrecipe: aws-node-taint-NodeWithImpairedVolumes/
+      expr: kube_node_spec_taint{key="NodeWithImpairedVolumes"}
+      for: 30m
+      labels:
+        area: kaas
+        severity: notify
+        team: {{ include "providerTeam" . }}
+        topic: kubernetes
+{{- end }}
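
Because the moved alert is unchanged, its behavior can be pinned down with a promtool unit test in the style of the test file touched at the end of this commit. A minimal sketch, assuming the Helm template has been rendered to plain YAML first; the rule file name, cluster_id, and series values are illustrative, not part of this commit:

rule_files:
  - aws.node.workload-cluster.rules.yml
tests:
  - interval: 1m
    input_series:
      # One unhealthy-node termination per minute, so increase(...[60m]) stays well above 10.
      - series: 'aws_operator_unhealthy_node_termination_count{cluster_id="test01"}'
        values: '0+1x120'
    alert_rule_test:
      - alertname: AWSWorkloadClusterNodeTooManyAutoTermination
        eval_time: 90m
        exp_alerts:
          - exp_labels:
              area: kaas
              cancel_if_cluster_status_creating: "true"
              cancel_if_cluster_status_deleting: "true"
              cancel_if_cluster_status_updating: "true"
              cluster_id: test01
              severity: page
              team: phoenix
              topic: kubernetes
            exp_annotations:
              description: 'Cluster test01 has too many nodes terminated by node auto termination feature in a short time.'
              opsrecipe: node-too-many-auto-termination-aws/

At eval_time 90m the expression has been above 10 for longer than the 15m for clause, so the alert is firing; such a file runs with promtool test rules <file>.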
@@ -8,6 +8,25 @@ metadata:
   namespace: {{ .Values.namespace }}
 spec:
   groups:
+  - name: cadvisor
+    rules:
+    - alert: CadvisorDown
+      annotations:
+        description: '{{`Cadvisor ({{ $labels.instance }}) is down.`}}'
+        opsrecipe: kubelet-is-down/
+      expr: label_replace(up{app="cadvisor"}, "ip", "$1", "instance", "(.+):\\d+") == 0
+      for: 1h
+      labels:
+        area: kaas
+        cancel_if_cluster_status_creating: "true"
+        cancel_if_cluster_status_deleting: "true"
+        cancel_if_kubelet_down: "true"
+        cancel_if_cluster_has_no_workers: "true"
+        cancel_if_outside_working_hours: "true"
+        cancel_if_prometheus_agent_down: "true"
+        severity: page
+        team: {{ include "providerTeam" . }}
+        topic: kubernetes
   - name: kubelet
     rules:
     - alert: KubeletConditionBad
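
CadvisorDown leans on PromQL's label_replace, which copies a regex capture group from one label into a new label without touching the original. The doubled backslash is escaping inside the PromQL string literal, so the regex engine sees (.+):\d+, anchored across the whole instance value. On a hypothetical series:

label_replace(up{app="cadvisor", instance="10.0.3.7:10250"}, "ip", "$1", "instance", "(.+):\\d+")
  => up{app="cadvisor", instance="10.0.3.7:10250", ip="10.0.3.7"}

The trailing == 0 then keeps only targets that are down, with the bare host IP exposed as an ip label.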
@@ -7,7 +7,7 @@ metadata:
     {{- if not .Values.mimir.enabled }}
     cluster_type: "workload_cluster"
     {{- end }}
-  name: node.rules
+  name: node.workload-cluster.rules
   namespace: {{ .Values.namespace }}
 spec:
   groups:
@@ -27,22 +27,6 @@ spec:
         severity: notify
         team: {{ include "providerTeam" . }}
         topic: kubernetes
-    {{- if eq .Values.managementCluster.provider.kind "aws" }}
-    - alert: AWSWorkloadClusterNodeTooManyAutoTermination
-      annotations:
-        description: '{{`Cluster {{ $labels.cluster_id }} has too many nodes terminated by node auto termination feature in a short time.`}}'
-        opsrecipe: node-too-many-auto-termination-aws/
-      expr: increase(aws_operator_unhealthy_node_termination_count[60m]) > 10
-      for: 15m
-      labels:
-        area: kaas
-        cancel_if_cluster_status_creating: "true"
-        cancel_if_cluster_status_deleting: "true"
-        cancel_if_cluster_status_updating: "true"
-        severity: page
-        team: phoenix
-        topic: kubernetes
-    {{- end }}
     - alert: NodeStateFlappingUnderLoad
       # Check if the kubelet status is flapping, unless the node is under load.
       # It helps to read this rule from the bottom upwards.
@@ -126,19 +110,6 @@ spec:
         severity: notify
         team: {{ include "providerTeam" . }}
         topic: infrastructure
-    {{- if eq .Values.managementCluster.provider.kind "aws" }}
-    - alert: WorkloadClusterNodeUnexpectedTaintNodeWithImpairedVolumes
-      annotations:
-        description: '{{`Node {{ $labels.node }} has unexpected taint NodeWithImpairedVolumes`}}'
-        opsrecipe: aws-node-taint-NodeWithImpairedVolumes/
-      expr: kube_node_spec_taint{key="NodeWithImpairedVolumes"}
-      for: 30m
-      labels:
-        area: kaas
-        severity: notify
-        team: {{ include "providerTeam" . }}
-        topic: kubernetes
-    {{- end }}
     - alert: WorkloadClusterMasterMemoryUsageTooHigh
       annotations:
         description: '{{`Machine {{ $labels.instance }} memory usage is too high (less than 10% and 2G of allocatable memory).`}}'
@@ -10,6 +10,25 @@ spec:
   groups:
   - name: chart
     rules:
+    - alert: ChartOperatorDown
+      annotations:
+        description: '{{`ChartOperator ({{ $labels.instance }}) is down.`}}'
+        opsrecipe: chart-operator-down/
+      expr: label_replace(up{app=~"chart-operator.*"}, "ip", "$1.$2.$3.$4", "node", "ip-(\\d+)-(\\d+)-(\\d+)-(\\d+).*") == 0
+      for: 15m
+      labels:
+        area: platform
+        cancel_if_cluster_control_plane_unhealthy: "true"
+        cancel_if_cluster_status_creating: "true"
+        cancel_if_cluster_status_deleting: "true"
+        cancel_if_cluster_status_updating: "true"
+        cancel_if_kubelet_down: "true"
+        cancel_if_cluster_has_no_workers: "true"
+        cancel_if_outside_working_hours: {{ include "workingHoursOnly" . }}
+        cancel_if_prometheus_agent_down: "true"
+        severity: notify
+        team: honeybadger
+        topic: releng
     - alert: ChartOrphanConfigMap
       annotations:
         description: '{{`Chart configmaps have not been deleted.`}}'
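
ChartOperatorDown derives its ip label differently: the node label carries an EC2-style private DNS name, so the regex captures the four octets individually and the replacement "$1.$2.$3.$4" stitches them back together. With a hypothetical node name:

label_replace(up{node="ip-10-0-3-7.eu-west-1.compute.internal"}, "ip", "$1.$2.$3.$4", "node", "ip-(\\d+)-(\\d+)-(\\d+)-(\\d+).*")
  => up{node="ip-10-0-3-7.eu-west-1.compute.internal", ip="10.0.3.7"}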
helm/prometheus-rules/templates/shared/alerting-rules/up.rules.yml (48 changes: 0 additions & 48 deletions)

This file was deleted.

@@ -1,6 +1,6 @@
 ---
 rule_files:
-  - kyverno.all.rules.yml
+  - kyverno.rules.yml
 tests:
   - interval: 1m
     input_series: