diff --git a/helm/prometheus-rules/templates/kaas/phoenix/alerting-rules/aws-load-balancer-controller.rules.yml b/helm/prometheus-rules/templates/kaas/phoenix/alerting-rules/aws-load-balancer-controller.rules.yml
index 042f8440..2f5e080f 100644
--- a/helm/prometheus-rules/templates/kaas/phoenix/alerting-rules/aws-load-balancer-controller.rules.yml
+++ b/helm/prometheus-rules/templates/kaas/phoenix/alerting-rules/aws-load-balancer-controller.rules.yml
@@ -1,4 +1,4 @@
-{{- if eq .Values.managementCluster.provider.kind "aws" }}
+# This rule applies to vintage aws and capa workload clusters
 apiVersion: monitoring.coreos.com/v1
 kind: PrometheusRule
 metadata:
@@ -18,10 +18,10 @@ spec:
       annotations:
         description: '{{`AWS load balancer controller pod {{ $labels.namespace}}/{{ $labels.pod }} on {{ $labels.cluster_id}} is throwing {{ $labels.error_code }} errors when contacting AWS API.`}}'
         opsrecipe: alb-errors
-      expr: sum(increase(aws_api_calls_total{error_code != ""}[20m])) by (error_code,namespace,pod,cluster_id) > 0
+      expr: sum(increase(aws_api_calls_total{cluster_type="workload_cluster", error_code != "", provider=~"aws|capa|eks"}[20m])) by (cluster_id, error_code, installation, namespace, pipeline, provider, pod) > 0
       for: 40m
       labels:
-        area: managedservices
+        area: kaas
         cancel_if_cluster_status_creating: "true"
         cancel_if_cluster_status_deleting: "true"
         cancel_if_cluster_status_updating: "true"
@@ -33,10 +33,10 @@ spec:
       annotations:
         description: '{{`AWS load balancer controller pod {{ $labels.namespace }}/{{ $labels.pod }} on {{ $labels.cluster_id }} is throwing errors while reconciling the {{ $labels.controller }} controller.`}}'
         opsrecipe: alb-errors
-      expr: sum(increase(controller_runtime_reconcile_total{service="aws-load-balancer-controller", result = "error"}[20m])) by (controller,namespace,pod,cluster_id) > 0
+      expr: sum(increase(controller_runtime_reconcile_total{cluster_type="workload_cluster", provider=~"aws|capa|eks", result = "error", service="aws-load-balancer-controller"}[20m])) by (cluster_id, controller, installation, namespace, pipeline, provider, pod) > 0
       for: 40m
       labels:
-        area: managedservices
+        area: kaas
         cancel_if_cluster_status_creating: "true"
         cancel_if_cluster_status_deleting: "true"
         cancel_if_cluster_status_updating: "true"
@@ -44,4 +44,3 @@ spec:
         severity: page
         team: phoenix
         topic: alb
-{{- end }}
diff --git a/helm/prometheus-rules/templates/kaas/phoenix/alerting-rules/aws.job.rules.yml b/helm/prometheus-rules/templates/kaas/phoenix/alerting-rules/aws.job.rules.yml
index b906ad11..24f64133 100644
--- a/helm/prometheus-rules/templates/kaas/phoenix/alerting-rules/aws.job.rules.yml
+++ b/helm/prometheus-rules/templates/kaas/phoenix/alerting-rules/aws.job.rules.yml
@@ -1,6 +1,6 @@
-## TODO Remove with vintage
-# This rule applies to vintage aws management clusters
 {{- if eq .Values.managementCluster.provider.flavor "vintage" }}
+## TODO Remove when all vintage installations are gone
+# This rule applies to vintage aws management clusters
 apiVersion: monitoring.coreos.com/v1
 kind: PrometheusRule
 metadata:
diff --git a/helm/prometheus-rules/templates/kaas/phoenix/alerting-rules/aws.management-cluster.rules.yml b/helm/prometheus-rules/templates/kaas/phoenix/alerting-rules/aws.management-cluster.rules.yml
index fa11b5a5..4f2fdf5f 100644
--- a/helm/prometheus-rules/templates/kaas/phoenix/alerting-rules/aws.management-cluster.rules.yml
+++ b/helm/prometheus-rules/templates/kaas/phoenix/alerting-rules/aws.management-cluster.rules.yml
@@ -1,4 +1,6 @@
-{{- if eq .Values.managementCluster.provider.kind "aws" }}
+{{- if eq .Values.managementCluster.provider.flavor "vintage" }}
+## TODO Remove when all vintage installations are gone
+# This rule applies to vintage aws management clusters
 apiVersion: monitoring.coreos.com/v1
 kind: PrometheusRule
 metadata:
@@ -161,32 +163,4 @@ spec:
         severity: page
         team: phoenix
         topic: kubernetes
-    - alert: IRSATooManyErrors
-      annotations:
-        description: '{{`IRSA bootstrap for Cluster {{ $labels.cluster_id }} failed.`}}'
-        opsrecipe: irsa-operator-error/
-      expr: irsa_operator_cluster_errors > 0
-      for: 10m
-      labels:
-        area: kaas
-        cancel_if_cluster_status_creating: "true"
-        cancel_if_cluster_status_deleting: "true"
-        cancel_if_outside_working_hours: "true"
-        severity: page
-        team: phoenix
-        topic: aws
-    - alert: IRSAACMCertificateExpiringInLessThan60Days
-      annotations:
-        description: '{{`IRSA ACM certificate for Cluster {{ $labels.cluster_id }} ({{ $labels.certificate_name }}) will expire in less than 2 months.`}}'
-        opsrecipe: irsa-acm-certificate-expiring/
-      expr: min(irsa_operator_acm_certificate_not_after) by (cluster_id, installation, pipeline, provider, certificate_name) - time() < 5184000
-      for: 10m
-      labels:
-        area: kaas
-        cancel_if_cluster_status_creating: "true"
-        cancel_if_cluster_status_deleting: "true"
-        cancel_if_outside_working_hours: "true"
-        severity: page
-        team: phoenix
-        topic: aws
 {{- end }}
diff --git a/helm/prometheus-rules/templates/kaas/phoenix/alerting-rules/aws.workload-cluster.rules.yml b/helm/prometheus-rules/templates/kaas/phoenix/alerting-rules/aws.workload-cluster.rules.yml
index 8812acd7..06391995 100644
--- a/helm/prometheus-rules/templates/kaas/phoenix/alerting-rules/aws.workload-cluster.rules.yml
+++ b/helm/prometheus-rules/templates/kaas/phoenix/alerting-rules/aws.workload-cluster.rules.yml
@@ -1,4 +1,4 @@
-{{- if eq .Values.managementCluster.provider.kind "aws" }}
+{{- if or (eq .Values.managementCluster.provider.kind "aws") (eq .Values.managementCluster.provider.kind "capa") }}
 apiVersion: monitoring.coreos.com/v1
 kind: PrometheusRule
 metadata:
@@ -13,11 +13,12 @@ spec:
   groups:
   - name: aws
     rules:
+    ## TODO review this one (cluster-autoscaler, etcd-kubernetes-resources-count-exporter should be turtles)
     - alert: WorkloadClusterContainerIsRestartingTooFrequentlyAWS
       annotations:
         description: '{{`Container {{ $labels.container }} in pod {{ $labels.namespace }}/{{ $labels.pod }} is restarting too often.`}}'
         opsrecipe: container-is-restarting-too-often/
-      expr: label_join(increase(kube_pod_container_status_restarts_total{container=~"aws-node.*|kiam-agent.*|kiam-server.*|cluster-autoscaler.*|ebs-plugin.*|aws-pod-identity-webhook.*|etcd-kubernetes-resources-count-exporter.*"}[1h]),"service","/","namespace","pod") > 10
+      expr: label_join(increase(kube_pod_container_status_restarts_total{container=~"aws-node.*|kiam-agent.*|kiam-server.*|cluster-autoscaler.*|ebs-(plugin|csi).*|aws-pod-identity-webhook.*|etcd-kubernetes-resources-count-exporter.*"}[1h]),"service","/","namespace","pod") > 10
       for: 10m
       labels:
         area: kaas
@@ -29,10 +30,13 @@ spec:
         severity: page
         team: phoenix
         topic: kubernetes
+    ## TODO Remove when all vintage installations are gone
+    {{- if eq .Values.managementCluster.provider.kind "aws"}}
     - alert: WorkloadClusterCriticalPodNotRunningAWS
       annotations:
         description: '{{`Critical pod {{ $labels.namespace }}/{{ $labels.pod }} is not running.`}}'
         opsrecipe: critical-pod-is-not-running/
+      ### These pods only exist in vintage
       expr: kube_pod_container_status_running{namespace="kube-system",container=~"(k8s-api-server|k8s-controller-manager|k8s-scheduler)"} != 1 or label_replace(absent(kube_pod_container_status_running{namespace="kube-system",container="k8s-api-server"}), "pod", "$1", "container", "(.+)") == 1 or label_replace(absent(kube_pod_container_status_running{namespace="kube-system",container="k8s-controller-manager"}), "pod", "$1", "container", "(.+)") == 1 or label_replace(absent(kube_pod_container_status_running{namespace="kube-system",container="k8s-scheduler"}), "pod", "$1", "container", "(.+)") == 1
       for: 20m
       labels:
@@ -43,6 +47,8 @@ spec:
         severity: page
         team: phoenix
         topic: kubernetes
+    {{- end }}
+    ## TODO review this one. Should it be turtles?
     - alert: WorkloadClusterControlPlaneNodeMissingAWS
       annotations:
         description: '{{`Control plane node is missing.`}}'
@@ -57,6 +63,7 @@ spec:
         severity: page
         team: phoenix
         topic: kubernetes
+    ## TODO review this one. Should it be turtles?
     - alert: WorkloadClusterHAControlPlaneDownForTooLong
       annotations:
         description: '{{`Control plane node in HA setup is down for a long time.`}}'
@@ -72,11 +79,12 @@ spec:
         severity: page
         team: phoenix
         topic: kubernetes
+    ## TODO review this one (cluster-autoscaler, etcd-kubernetes-resources-count-exporter should be turtles)
     - alert: WorkloadClusterPodPendingAWS
       annotations:
         description: '{{`Pod {{ $labels.namespace }}/{{ $labels.pod }} is stuck in Pending.`}}'
         opsrecipe: pod-stuck-in-pending/
-      expr: kube_pod_status_phase{namespace="kube-system",pod=~"(aws-node.*|kiam-agent.*|kiam-server.*|cluster-autoscaler.*|ebs-csi-.*)",phase="Pending"} == 1
+      expr: kube_pod_status_phase{namespace="kube-system",pod=~"(aws-node.*|kiam-agent.*|kiam-server.*|cluster-autoscaler.*|ebs-(plugin|csi).*)",phase="Pending"} == 1
       for: 15m
       labels:
         area: kaas
@@ -88,9 +96,11 @@ spec:
         cancel_if_cluster_has_no_workers: "true"
         severity: page
         team: phoenix
+    ## TODO Remove when all vintage installations are gone
+    {{- if eq .Values.managementCluster.provider.kind "aws"}}
     - alert: WorkloadClusterAWSCNIIpAlmostExhausted
       annotations:
-        description: '{{`IPs exhausted for aws-cni subnet {{ $labels.id }} in AZ {{ $labels.availabvility_zone }}.`}}'
+        description: '{{`IPs exhausted for aws-cni subnet {{ $labels.id }} in AZ {{ $labels.availability_zone }}.`}}'
         opsrecipe: aws-ips-exhausted/
       expr: min(aws_operator_subnet_available_ips_percentage{subnet_type="aws-cni"}) by (account, availability_zone, cluster_id, id) < 0.1
       for: 5m
@@ -111,4 +121,5 @@ spec:
         severity: page
         team: phoenix
         topic: workloadcluster
+    {{- end }}
 {{- end }}
diff --git a/helm/prometheus-rules/templates/kaas/phoenix/alerting-rules/capa.management-cluster.rules.yml b/helm/prometheus-rules/templates/kaas/phoenix/alerting-rules/capa.management-cluster.rules.yml
index 088624c8..aacc3ddf 100644
--- a/helm/prometheus-rules/templates/kaas/phoenix/alerting-rules/capa.management-cluster.rules.yml
+++ b/helm/prometheus-rules/templates/kaas/phoenix/alerting-rules/capa.management-cluster.rules.yml
@@ -1,3 +1,4 @@
+# This rule applies to capa management clusters
 {{- if eq .Values.managementCluster.provider.kind "capa" }}
 apiVersion: monitoring.coreos.com/v1
 kind: PrometheusRule
@@ -12,7 +13,7 @@ metadata:
   namespace: {{ .Values.namespace }}
 spec:
   groups:
-  - name: capa
+  - name: capa.management-cluster
     rules:
     - alert: ManagementClusterPodPendingCAPA
       annotations:
@@ -62,18 +63,4 @@ spec:
         severity: page
         team: phoenix
         topic: kubernetes
-    - alert: IRSATooManyErrors
-      annotations:
-        description: '{{`IRSA bootstrap for Cluster {{ $labels.cluster_id }} failed.`}}'
-        opsrecipe: irsa-operator-error/
-        dashboard: bdiako8tt1b7kc/capa-agregated-error-logs-for-capa-controllers
-      expr: irsa_operator_cluster_errors > 0
-      for: 10m
-      labels:
-        area: kaas
-        cancel_if_kube_state_metrics_down: "true"
-        cancel_if_outside_working_hours: "true"
-        severity: page
-        team: phoenix
-        topic: aws
 {{- end }}
diff --git a/helm/prometheus-rules/templates/kaas/phoenix/alerting-rules/credentiald.rules.yml b/helm/prometheus-rules/templates/kaas/phoenix/alerting-rules/credentiald.rules.yml
index 7d2039dc..183e69cf 100644
--- a/helm/prometheus-rules/templates/kaas/phoenix/alerting-rules/credentiald.rules.yml
+++ b/helm/prometheus-rules/templates/kaas/phoenix/alerting-rules/credentiald.rules.yml
@@ -1,11 +1,13 @@
 {{- if eq .Values.managementCluster.provider.flavor "vintage" }}
+## TODO Remove when all vintage installations are gone
+# This rule applies to vintage aws management clusters
 apiVersion: monitoring.coreos.com/v1
 kind: PrometheusRule
 metadata:
   creationTimestamp: null
   labels:
     {{- include "labels.common" . | nindent 4 }}
-    # Kept because it will be gone with Vintage
+    # No need for .Values.mimir.enabled condition - will be gone with Vintage
     cluster_type: "management_cluster"
   name: credentiald.rules
   namespace: {{ .Values.namespace }}
diff --git a/helm/prometheus-rules/templates/kaas/phoenix/alerting-rules/inhibit.kiam.rules.yml b/helm/prometheus-rules/templates/kaas/phoenix/alerting-rules/inhibit.kiam.rules.yml
index a35d7f50..90d64f13 100644
--- a/helm/prometheus-rules/templates/kaas/phoenix/alerting-rules/inhibit.kiam.rules.yml
+++ b/helm/prometheus-rules/templates/kaas/phoenix/alerting-rules/inhibit.kiam.rules.yml
@@ -1,6 +1,5 @@
 {{- if eq .Values.managementCluster.provider.flavor "vintage" }}
 ## TODO Remove when all vintage installations are gone
-# This rule applies to vintage aws clusters
 apiVersion: monitoring.coreos.com/v1
 kind: PrometheusRule
 metadata:
diff --git a/helm/prometheus-rules/templates/kaas/phoenix/alerting-rules/irsa.rules.yml b/helm/prometheus-rules/templates/kaas/phoenix/alerting-rules/irsa.rules.yml
new file mode 100644
index 00000000..b482c783
--- /dev/null
+++ b/helm/prometheus-rules/templates/kaas/phoenix/alerting-rules/irsa.rules.yml
@@ -0,0 +1,50 @@
+# This rule applies to vintage aws or capa management clusters
+{{- if or (eq .Values.managementCluster.provider.kind "aws") (eq .Values.managementCluster.provider.kind "capa") }}
+apiVersion: monitoring.coreos.com/v1
+kind: PrometheusRule
+metadata:
+  labels:
+    {{- include "labels.common" . | nindent 4 }}
+{{- if not .Values.mimir.enabled }}
+    cluster_type: "management_cluster"
+{{- end }}
+  name: irsa.rules
+  namespace: {{ .Values.namespace }}
+spec:
+  groups:
+  - name: irsa-operator
+    rules:
+    - alert: IRSATooManyErrors
+      annotations:
+        description: '{{`IRSA bootstrap for Cluster {{ $labels.cluster_id }} failed.`}}'
+        opsrecipe: irsa-operator-error/
+        ## TODO Remove when all vintage installations are gone
+        {{- if eq .Values.managementCluster.provider.flavor "capi" }}
+        dashboard: bdiako8tt1b7kc/capa-agregated-error-logs-for-capa-controllers
+        {{- end }}
+      expr: irsa_operator_cluster_errors{cluster_type="management_cluster"} > 0
+      for: 10m
+      labels:
+        area: kaas
+        cancel_if_kube_state_metrics_down: "true"
+        cancel_if_cluster_status_creating: "true"
+        cancel_if_cluster_status_deleting: "true"
+        cancel_if_outside_working_hours: "true"
+        severity: page
+        team: phoenix
+        topic: aws
+    - alert: IRSAACMCertificateExpiringInLessThan60Days
+      annotations:
+        description: '{{`IRSA ACM certificate for Cluster {{ $labels.cluster_id }} ({{ $labels.certificate_name }}) will expire in less than 2 months.`}}'
+        opsrecipe: irsa-acm-certificate-expiring/
+      expr: min(irsa_operator_acm_certificate_not_after{cluster_type="management_cluster"}) by (cluster_id, installation, pipeline, provider, certificate_name) - time() < 5184000
+      for: 10m
+      labels:
+        area: kaas
+        cancel_if_cluster_status_creating: "true"
+        cancel_if_cluster_status_deleting: "true"
+        cancel_if_outside_working_hours: "true"
+        severity: page
+        team: phoenix
+        topic: aws
+{{- end }}
diff --git a/helm/prometheus-rules/templates/kaas/phoenix/alerting-rules/kiam.rules.yml b/helm/prometheus-rules/templates/kaas/phoenix/alerting-rules/kiam.rules.yml
index 9d72328c..772d6c7c 100644
--- a/helm/prometheus-rules/templates/kaas/phoenix/alerting-rules/kiam.rules.yml
+++ b/helm/prometheus-rules/templates/kaas/phoenix/alerting-rules/kiam.rules.yml
@@ -1,4 +1,6 @@
-{{- if eq .Values.managementCluster.provider.kind "aws" }}
+## TODO Remove when all vintage installations are gone
+# This rule applies to vintage aws workload clusters
+{{- if eq .Values.managementCluster.provider.flavor "vintage" }}
 apiVersion: monitoring.coreos.com/v1
 kind: PrometheusRule
 metadata:
@@ -20,7 +22,7 @@ spec:
       expr: increase(kiam_metadata_find_role_errors_total[10m]) > 0
       for: 15m
       labels:
-        area: managedservices
+        area: kaas
         cancel_if_cluster_status_creating: "true"
         cancel_if_cluster_status_deleting: "true"
         cancel_if_cluster_status_updating: "true"