Skip to content

Commit

Permalink
review phoenix alerts
Browse files Browse the repository at this point in the history
  • Loading branch information
QuentinBisson committed Jun 10, 2024
1 parent 6a20ebf commit c64ff05
Show file tree
Hide file tree
Showing 12 changed files with 111 additions and 57 deletions.
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
{{- if eq .Values.managementCluster.provider.kind "aws" }}
# This rule applies to vintage aws and capa workload clusters
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
Expand All @@ -18,10 +18,10 @@ spec:
annotations:
description: '{{`AWS load balancer controller pod {{ $labels.namespace}}/{{ $labels.pod }} on {{ $labels.cluster_id}} is throwing {{ $labels.error_code }} errors when contacting AWS API.`}}'
opsrecipe: alb-errors
expr: sum(increase(aws_api_calls_total{error_code != ""}[20m])) by (error_code,namespace,pod,cluster_id) > 0
expr: sum(increase(aws_api_calls_total{cluster_type="workload_cluster", error_code != "", provider=~"aws|capa|eks"}[20m])) by (cluster_id, error_code, installation, namespace, pipeline, provider, pod) > 0
for: 40m
labels:
area: managedservices
area: kaas
cancel_if_cluster_status_creating: "true"
cancel_if_cluster_status_deleting: "true"
cancel_if_cluster_status_updating: "true"
Expand All @@ -33,15 +33,14 @@ spec:
annotations:
description: '{{`AWS load balancer controller pod {{ $labels.namespace }}/{{ $labels.pod }} on {{ $labels.cluster_id }} is throwing errors while reconciling the {{ $labels.controller }} controller.`}}'
opsrecipe: alb-errors
expr: sum(increase(controller_runtime_reconcile_total{service="aws-load-balancer-controller", result = "error"}[20m])) by (controller,namespace,pod,cluster_id) > 0
expr: sum(increase(controller_runtime_reconcile_total{cluster_type="workload_cluster", provider=~"aws|capa|eks", result = "error", service="aws-load-balancer-controller"}[20m])) by (cluster_id, controller, installation, namespace, pipeline, provider, pod) > 0
for: 40m
labels:
area: managedservices
area: kaas
cancel_if_cluster_status_creating: "true"
cancel_if_cluster_status_deleting: "true"
cancel_if_cluster_status_updating: "true"
cancel_if_outside_working_hours: {{ include "workingHoursOnly" . }}
severity: page
team: phoenix
topic: alb
{{- end }}
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
{{- if eq .Values.managementCluster.provider.kind "aws" }}
## TODO Remove with vintage
# This rule applies to vintage aws management clusters
{{- if eq .Values.managementCluster.provider.flavor "vintage" }}
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
Expand Down Expand Up @@ -161,32 +163,4 @@ spec:
severity: page
team: phoenix
topic: kubernetes
- alert: IRSATooManyErrors
annotations:
description: '{{`IRSA bootstrap for Cluster {{ $labels.cluster_id }} failed.`}}'
opsrecipe: irsa-operator-error/
expr: irsa_operator_cluster_errors > 0
for: 10m
labels:
area: kaas
cancel_if_cluster_status_creating: "true"
cancel_if_cluster_status_deleting: "true"
cancel_if_outside_working_hours: "true"
severity: page
team: phoenix
topic: aws
- alert: IRSAACMCertificateExpiringInLessThan60Days
annotations:
description: '{{`IRSA ACM certificate for Cluster {{ $labels.cluster_id }} ({{ $labels.certificate_name }}) will expire in less than 2 months.`}}'
opsrecipe: irsa-acm-certificate-expiring/
expr: min(irsa_operator_acm_certificate_not_after) by (cluster_id, installation, pipeline, provider, certificate_name) - time() < 5184000
for: 10m
labels:
area: kaas
cancel_if_cluster_status_creating: "true"
cancel_if_cluster_status_deleting: "true"
cancel_if_outside_working_hours: "true"
severity: page
team: phoenix
topic: aws
{{- end }}
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
{{- if eq .Values.managementCluster.provider.kind "aws" }}
{{- if or (eq .Values.managementCluster.provider.kind "aws") (eq .Values.managementCluster.provider.kind "capa") }}
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
Expand All @@ -13,11 +13,12 @@ spec:
groups:
- name: aws
rules:
## TODO review this one (cluster-autoscaler, etcd-kubernetes-resources-count-exporter should be turtles)
- alert: WorkloadClusterContainerIsRestartingTooFrequentlyAWS
annotations:
description: '{{`Container {{ $labels.container }} in pod {{ $labels.namespace }}/{{ $labels.pod }} is restarting too often.`}}'
opsrecipe: container-is-restarting-too-often/
expr: label_join(increase(kube_pod_container_status_restarts_total{container=~"aws-node.*|kiam-agent.*|kiam-server.*|cluster-autoscaler.*|ebs-plugin.*|aws-pod-identity-webhook.*|etcd-kubernetes-resources-count-exporter.*"}[1h]),"service","/","namespace","pod") > 10
expr: label_join(increase(kube_pod_container_status_restarts_total{container=~"aws-node.*|kiam-agent.*|kiam-server.*|cluster-autoscaler.*|ebs-(plugin|csi).*|aws-pod-identity-webhook.*|etcd-kubernetes-resources-count-exporter.*"}[1h]),"service","/","namespace","pod") > 10
for: 10m
labels:
area: kaas
Expand All @@ -29,10 +30,13 @@ spec:
severity: page
team: phoenix
topic: kubernetes
## TODO Remove with Vintage
{{- if eq .Values.managementCluster.provider.kind "aws"}}
- alert: WorkloadClusterCriticalPodNotRunningAWS
annotations:
description: '{{`Critical pod {{ $labels.namespace }}/{{ $labels.pod }} is not running.`}}'
opsrecipe: critical-pod-is-not-running/
### Those pods only exists in vintage
expr: kube_pod_container_status_running{namespace="kube-system",container=~"(k8s-api-server|k8s-controller-manager|k8s-scheduler)"} != 1 or label_replace(absent(kube_pod_container_status_running{namespace="kube-system",container="k8s-api-server"}), "pod", "$1", "container", "(.+)") == 1 or label_replace(absent(kube_pod_container_status_running{namespace="kube-system",container="k8s-controller-manager"}), "pod", "$1", "container", "(.+)") == 1 or label_replace(absent(kube_pod_container_status_running{namespace="kube-system",container="k8s-scheduler"}), "pod", "$1", "container", "(.+)") == 1
for: 20m
labels:
Expand All @@ -43,6 +47,8 @@ spec:
severity: page
team: phoenix
topic: kubernetes
{{- end }}
## TODO review this one. Should it be turtles?
- alert: WorkloadClusterControlPlaneNodeMissingAWS
annotations:
description: '{{`Control plane node is missing.`}}'
Expand All @@ -57,6 +63,7 @@ spec:
severity: page
team: phoenix
topic: kubernetes
## TODO review this one. Should it be turtles?
- alert: WorkloadClusterHAControlPlaneDownForTooLong
annotations:
description: '{{`Control plane node in HA setup is down for a long time.`}}'
Expand All @@ -72,11 +79,12 @@ spec:
severity: page
team: phoenix
topic: kubernetes
## TODO review this one (cluster-autoscaler, etcd-kubernetes-resources-count-exporter should be turtles)
- alert: WorkloadClusterPodPendingAWS
annotations:
description: '{{`Pod {{ $labels.namespace }}/{{ $labels.pod }} is stuck in Pending.`}}'
opsrecipe: pod-stuck-in-pending/
expr: kube_pod_status_phase{namespace="kube-system",pod=~"(aws-node.*|kiam-agent.*|kiam-server.*|cluster-autoscaler.*|ebs-csi-.*)",phase="Pending"} == 1
expr: kube_pod_status_phase{namespace="kube-system",pod=~"(aws-node.*|kiam-agent.*|kiam-server.*|cluster-autoscaler.*|ebs-(plugin|csi).*)",phase="Pending"} == 1
for: 15m
labels:
area: kaas
Expand All @@ -88,9 +96,11 @@ spec:
cancel_if_cluster_has_no_workers: "true"
severity: page
team: phoenix
## TODO Remove with Vintage
{{- if eq .Values.managementCluster.provider.kind "aws"}}
- alert: WorkloadClusterAWSCNIIpAlmostExhausted
annotations:
description: '{{`IPs exhausted for aws-cni subnet {{ $labels.id }} in AZ {{ $labels.availabvility_zone }}.`}}'
description: '{{`IPs exhausted for aws-cni subnet {{ $labels.id }} in AZ {{ $labels.availability_zone }}.`}}'
opsrecipe: aws-ips-exhausted/
expr: min(aws_operator_subnet_available_ips_percentage{subnet_type="aws-cni"}) by (account, availability_zone, cluster_id, id) < 0.1
for: 5m
Expand All @@ -111,4 +121,5 @@ spec:
severity: page
team: phoenix
topic: workloadcluster
{{- end }}
{{- end }}
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
# This rule applies to capa management clusters
{{- if eq .Values.managementCluster.provider.kind "capa" }}
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
Expand All @@ -12,7 +13,7 @@ metadata:
namespace: {{ .Values.namespace }}
spec:
groups:
- name: capa
- name: capa.management-cluster
rules:
- alert: ManagementClusterPodPendingCAPA
annotations:
Expand Down Expand Up @@ -62,18 +63,4 @@ spec:
severity: page
team: phoenix
topic: kubernetes
- alert: IRSATooManyErrors
annotations:
description: '{{`IRSA bootstrap for Cluster {{ $labels.cluster_id }} failed.`}}'
opsrecipe: irsa-operator-error/
dashboard: bdiako8tt1b7kc/capa-agregated-error-logs-for-capa-controllers
expr: irsa_operator_cluster_errors > 0
for: 10m
labels:
area: kaas
cancel_if_kube_state_metrics_down: "true"
cancel_if_outside_working_hours: "true"
severity: page
team: phoenix
topic: aws
{{- end }}
Original file line number Diff line number Diff line change
@@ -1,11 +1,13 @@
## TODO Remove with vintage
# This rule applies to vintage aws management clusters
{{- if eq .Values.managementCluster.provider.flavor "vintage" }}
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
creationTimestamp: null
labels:
{{- include "labels.common" . | nindent 4 }}
# Kept because it will be gone with Vintage
# No need for .Values.mimir.enabled condition - will be gone with Vintage
cluster_type: "management_cluster"
name: credentiald.rules
namespace: {{ .Values.namespace }}
Expand Down
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
{{- if eq .Values.managementCluster.provider.flavor "vintage" }}
## TODO Remove when all vintage installations are gone
# This rule applies to vintage aws management clusters
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,9 @@
{{- if eq .Values.managementCluster.provider.flavor "vintage" }}
## TODO Remove when all vintage installations are gone
# This rule applies to vintage aws clusters
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
# This rule applies to vintage aws or capa management clusters
{{- if or (eq .Values.managementCluster.provider.kind "aws") (eq .Values.managementCluster.provider.kind "capa") }}
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  labels:
    {{- include "labels.common" . | nindent 4 }}
    {{- if not .Values.mimir.enabled }}
    cluster_type: "management_cluster"
    {{- end }}
  name: irsa.rules
  namespace: {{ .Values.namespace }}
spec:
  groups:
  - name: irsa-operator
    rules:
    # Pages when irsa-operator reports at least one error for a cluster for 10 minutes.
    - alert: IRSATooManyErrors
      annotations:
        description: '{{`IRSA bootstrap for Cluster {{ $labels.cluster_id }} failed.`}}'
        opsrecipe: irsa-operator-error/
        ## TODO remove this when all clusters are migrated to capi
        {{- if eq .Values.managementCluster.provider.flavor "capi" }}
        dashboard: bdiako8tt1b7kc/capa-agregated-error-logs-for-capa-controllers
        {{- end }}
      expr: irsa_operator_cluster_errors{cluster_type="management_cluster"} > 0
      for: 10m
      labels:
        area: kaas
        cancel_if_kube_state_metrics_down: "true"
        cancel_if_cluster_status_creating: "true"
        cancel_if_cluster_status_deleting: "true"
        cancel_if_outside_working_hours: "true"
        severity: page
        team: phoenix
        topic: aws
    # Pages when the earliest certificate expiry per cluster/certificate is
    # less than 5184000 seconds (60 days * 86400 s) away.
    - alert: IRSAACMCertificateExpiringInLessThan60Days
      annotations:
        description: '{{`IRSA ACM certificate for Cluster {{ $labels.cluster_id }} ({{ $labels.certificate_name }}) will expire in less than 2 months.`}}'
        opsrecipe: irsa-acm-certificate-expiring/
      expr: min(irsa_operator_acm_certificate_not_after{cluster_type="management_cluster"}) by (cluster_id, installation, pipeline, provider, certificate_name) - time() < 5184000
      for: 10m
      labels:
        area: kaas
        cancel_if_cluster_status_creating: "true"
        cancel_if_cluster_status_deleting: "true"
        cancel_if_outside_working_hours: "true"
        severity: page
        team: phoenix
        topic: aws
{{- end }}
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
{{- if eq .Values.managementCluster.provider.kind "aws" }}
## TODO Remove with vintage
# This rule applies to vintage aws workload clusters
{{- if eq .Values.managementCluster.provider.flavor "vintage" }}
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
Expand All @@ -20,7 +22,7 @@ spec:
expr: increase(kiam_metadata_find_role_errors_total[10m]) > 0
for: 15m
labels:
area: managedservices
area: kaas
cancel_if_cluster_status_creating: "true"
cancel_if_cluster_status_deleting: "true"
cancel_if_cluster_status_updating: "true"
Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,7 @@
{{- if eq .Values.managementCluster.provider.flavor "capi" }}
# This rule applies to all capi management clusters
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
Expand Down Expand Up @@ -27,4 +30,7 @@ spec:
cluster_control_plane_unhealthy: "true"
team: turtles
topic: status
{{- end }}
Original file line number Diff line number Diff line change
@@ -1,4 +1,7 @@
# This rule applies to all clusters
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
Expand All @@ -9,14 +12,26 @@ metadata:
namespace: {{ .Values.namespace }}
spec:
groups:
- name: inhibit.kubelet
  rules:
  # Fires for every kubelet scrape target whose `up` value is 0; the `ip`
  # label is the `instance` label with the trailing `:<port>` stripped.
  - alert: InhibitionKubeletDown
    annotations:
      description: '{{`Kubelet ({{ $labels.instance }}) is down.`}}'
    expr: label_replace(up{app="kubelet"}, "ip", "$1", "instance", "(.+):\\d+") == 0
    labels:
      kubelet_down: "true"
      area: kaas
      topic: kubernetes
      team: turtles
Original file line number Diff line number Diff line change
Expand Up @@ -27,4 +27,8 @@ spec:
cancel_if_outside_working_hours: "true"
severity: page
team: turtles
topic: autoscaling

0 comments on commit c64ff05

Please sign in to comment.