From a9e519879e13c38dcc44a835c3948a508668363d Mon Sep 17 00:00:00 2001 From: calvix Date: Wed, 17 Apr 2024 10:55:37 +0200 Subject: [PATCH] capi-add-dashboard-to-alerts (#1121) * capi-add-dashboard-to-alerts --- CHANGELOG.md | 4 ++++ .../alerting-rules/capa.management-cluster.rules.yml | 2 ++ .../templates/alerting-rules/capi-cluster.rules.yml | 3 +++ .../alerting-rules/capi-kubeadmcontrolplane.rules.yml | 2 ++ .../templates/alerting-rules/capi-machine.rules.yml | 2 ++ .../templates/alerting-rules/capi-machinedeployment.rules.yml | 2 ++ .../templates/alerting-rules/capi-machinepool.rules.yml | 2 ++ .../templates/alerting-rules/capi-machineset.rules.yml | 1 + .../alerting-rules/capi.management-cluster.rules.yml | 1 + test/tests/providers/capi/capz/capi-cluster.rules.test.yml | 3 +++ .../capi/capz/capi-kubeadmcontrolplane.rules.test.yml | 2 ++ test/tests/providers/capi/capz/capi-machine.rules.test.yml | 2 ++ .../providers/capi/capz/capi-machinedeployment.rules.test.yml | 2 ++ .../tests/providers/capi/capz/capi-machinepool.rules.test.yml | 2 ++ test/tests/providers/capi/capz/capi-machineset.rules.test.yml | 1 + 15 files changed, 31 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 5bf79344a..4cb1b80a3 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +### Added + +- Add CAPI and CAPA dashboards to the corresponding alerts. 
+ ### Fixed - link to `PrometheusMissingGrafanaCloud` opsrecipe diff --git a/helm/prometheus-rules/templates/alerting-rules/capa.management-cluster.rules.yml b/helm/prometheus-rules/templates/alerting-rules/capa.management-cluster.rules.yml index c5213bab8..088624c81 100644 --- a/helm/prometheus-rules/templates/alerting-rules/capa.management-cluster.rules.yml +++ b/helm/prometheus-rules/templates/alerting-rules/capa.management-cluster.rules.yml @@ -32,6 +32,7 @@ spec: annotations: description: '{{`Container {{ $labels.container }} in pod {{ $labels.namespace }}/{{ $labels.pod }} is restarting too often.`}}' opsrecipe: container-is-restarting-too-often/ + dashboard: bdiako8tt1b7kc/capa-agregated-error-logs-for-capa-controllers expr: label_join(increase(kube_pod_container_status_restarts_total{container=~"(aws.*|capa-.*|irsa-operator.*)", cluster_type="management_cluster"}[1h]), "service", "/", "namespace", "pod") > 6 for: 5m labels: @@ -65,6 +66,7 @@ spec: annotations: description: '{{`IRSA bootstrap for Cluster {{ $labels.cluster_id }} failed.`}}' opsrecipe: irsa-operator-error/ + dashboard: bdiako8tt1b7kc/capa-agregated-error-logs-for-capa-controllers expr: irsa_operator_cluster_errors > 0 for: 10m labels: diff --git a/helm/prometheus-rules/templates/alerting-rules/capi-cluster.rules.yml b/helm/prometheus-rules/templates/alerting-rules/capi-cluster.rules.yml index 36b11d623..369b34b35 100644 --- a/helm/prometheus-rules/templates/alerting-rules/capi-cluster.rules.yml +++ b/helm/prometheus-rules/templates/alerting-rules/capi-cluster.rules.yml @@ -23,6 +23,7 @@ spec: description: |- {{`Cluster {{ $labels.exported_namespace }}/{{ $labels.name }} stuck in {{ $labels.phase }} phase.`}} opsrecipe: capi-cluster/ + dashboard: bdi7iswg81czkcasd/capi-agregated-error-logs-for-capi-controllers - alert: ClusterStatusNotReady expr: capi_cluster_status_condition{status="False", type="Ready"} > 0 @@ -38,6 +39,7 @@ spec: description: |- {{`Cluster {{ $labels.exported_namespace 
}}/{{ $labels.name }} is not ready.`}} opsrecipe: capi-cluster/ + dashboard: bdi7iswg81czkcasd/capi-agregated-error-logs-for-capi-controllers - alert: ClusterPaused expr: capi_cluster_annotation_paused{paused_value="true"} > 0 @@ -53,4 +55,5 @@ spec: description: |- {{`The cluster {{ $labels.exported_namespace }}/{{ $labels.name }} is paused.`}} opsrecipe: capi-cluster/ + dashboard: bdi7iswg81czkcasd/capi-agregated-error-logs-for-capi-controllers {{- end }} diff --git a/helm/prometheus-rules/templates/alerting-rules/capi-kubeadmcontrolplane.rules.yml b/helm/prometheus-rules/templates/alerting-rules/capi-kubeadmcontrolplane.rules.yml index dadc322cd..23f7e8933 100644 --- a/helm/prometheus-rules/templates/alerting-rules/capi-kubeadmcontrolplane.rules.yml +++ b/helm/prometheus-rules/templates/alerting-rules/capi-kubeadmcontrolplane.rules.yml @@ -24,6 +24,7 @@ spec: description: |- {{`The clusters {{$labels.cluster_name}} kubeadmcontrolplane {{$labels.exported_namespace}}/{{$labels.name}} does not match the expected number of replicas for longer than 90 minutes.`}} opsrecipe: capi-kubeadmcontrolplane/ + dashboard: bdi7iswg81czkcasd/capi-agregated-error-logs-for-capi-controllers - alert: KubeadmControlPlanePaused expr: capi_kubeadmcontrolplane_annotation_paused{paused_value="true"} > 0 @@ -39,4 +40,5 @@ spec: description: |- {{`The clusters {{$labels.cluster_name}} kubeadmcontrolplane {{$labels.exported_namespace}}/{{$labels.name}} is paused.`}} opsrecipe: capi-kubeadmcontrolplane/ + dashboard: bdi7iswg81czkcasd/capi-agregated-error-logs-for-capi-controllers {{- end }} diff --git a/helm/prometheus-rules/templates/alerting-rules/capi-machine.rules.yml b/helm/prometheus-rules/templates/alerting-rules/capi-machine.rules.yml index 1d0003947..49688d57f 100644 --- a/helm/prometheus-rules/templates/alerting-rules/capi-machine.rules.yml +++ b/helm/prometheus-rules/templates/alerting-rules/capi-machine.rules.yml @@ -14,6 +14,7 @@ spec: description: |- {{`Machine {{ 
$labels.exported_namespace}}/{{ $labels.name }} stuck in phase {{ $labels.phase }} for more than 30 minutes.`}} opsrecipe: capi-machine/ + dashboard: bdi7iswg81czkcasd/capi-agregated-error-logs-for-capi-controllers expr: capi_machine_status_phase{phase!~"Running", name!~".*bastion.*"} > 0 for: 30m labels: @@ -38,4 +39,5 @@ spec: description: |- {{`Machine {{ $labels.exported_namespace}}/{{ $labels.name }} is paused.`}} opsrecipe: capi-machine/ + dashboard: bdi7iswg81czkcasd/capi-agregated-error-logs-for-capi-controllers {{- end }} diff --git a/helm/prometheus-rules/templates/alerting-rules/capi-machinedeployment.rules.yml b/helm/prometheus-rules/templates/alerting-rules/capi-machinedeployment.rules.yml index eb2b90f9e..d2992db35 100644 --- a/helm/prometheus-rules/templates/alerting-rules/capi-machinedeployment.rules.yml +++ b/helm/prometheus-rules/templates/alerting-rules/capi-machinedeployment.rules.yml @@ -23,6 +23,7 @@ spec: description: |- {{`The clusters {{$labels.cluster_name}} machinedeployment {{$labels.exported_namespace}}/{{$labels.name}} is not healthy.`}} opsrecipe: capi-machinedeployment/ + dashboard: bdi7iswg81czkcasd/capi-agregated-error-logs-for-capi-controllers - alert: MachineDeploymentPaused expr: capi_machinedeployment_annotation_paused{paused_value="true"} > 0 for: 1h @@ -37,4 +38,5 @@ spec: description: |- {{`The clusters {{$labels.cluster_name}} machinedeployment {{$labels.exported_namespace}}/{{$labels.name}} is paused.`}} opsrecipe: capi-machinedeployment/ + dashboard: bdi7iswg81czkcasd/capi-agregated-error-logs-for-capi-controllers {{- end }} diff --git a/helm/prometheus-rules/templates/alerting-rules/capi-machinepool.rules.yml b/helm/prometheus-rules/templates/alerting-rules/capi-machinepool.rules.yml index 1367962f9..924af2252 100644 --- a/helm/prometheus-rules/templates/alerting-rules/capi-machinepool.rules.yml +++ b/helm/prometheus-rules/templates/alerting-rules/capi-machinepool.rules.yml @@ -23,6 +23,7 @@ spec: description: |- {{`The 
clusters {{ $labels.cluster_name }} machinepool {{ $labels.exported_namespace }}/{{ $labels.name }} is not healthy.`}} opsrecipe: capi-machinepool/ + dashboard: bdi7iswg81czkcasd/capi-agregated-error-logs-for-capi-controllers - alert: MachinePoolPaused expr: capi_machinepool_annotation_paused{paused_value="true"} > 0 @@ -38,4 +39,5 @@ spec: description: |- {{`The clusters {{ $labels.cluster_name }} machinepool {{ $labels.exported_namespace }}/{{ $labels.name }} is paused.`}} opsrecipe: capi-machinepool/ + dashboard: bdi7iswg81czkcasd/capi-agregated-error-logs-for-capi-controllers {{- end }} diff --git a/helm/prometheus-rules/templates/alerting-rules/capi-machineset.rules.yml b/helm/prometheus-rules/templates/alerting-rules/capi-machineset.rules.yml index 41392c847..e17edad61 100644 --- a/helm/prometheus-rules/templates/alerting-rules/capi-machineset.rules.yml +++ b/helm/prometheus-rules/templates/alerting-rules/capi-machineset.rules.yml @@ -23,4 +23,5 @@ spec: description: |- {{`Machineset {{ $labels.exported_namespace }}/{{ $labels.name }} is paused.`}} opsrecipe: capi-machineset/ + dashboard: bdi7iswg81czkcasd/capi-agregated-error-logs-for-capi-controllers {{- end }} diff --git a/helm/prometheus-rules/templates/alerting-rules/capi.management-cluster.rules.yml b/helm/prometheus-rules/templates/alerting-rules/capi.management-cluster.rules.yml index c75d7862e..71b1729ea 100644 --- a/helm/prometheus-rules/templates/alerting-rules/capi.management-cluster.rules.yml +++ b/helm/prometheus-rules/templates/alerting-rules/capi.management-cluster.rules.yml @@ -31,6 +31,7 @@ spec: annotations: description: '{{`Container {{ $labels.container }} in pod {{ $labels.namespace }}/{{ $labels.pod }} is restarting too often.`}}' opsrecipe: container-is-restarting-too-often/ + dashboard: bdi7iswg81czkcasd/capi-agregated-error-logs-for-capi-controllers expr: label_join(increase(kube_pod_container_status_restarts_total{container=~"capi.*", cluster_type="management_cluster"}[1h]), 
"service", "/", "namespace", "pod") > 6 for: 5m labels: diff --git a/test/tests/providers/capi/capz/capi-cluster.rules.test.yml b/test/tests/providers/capi/capz/capi-cluster.rules.test.yml index 881c8fe3f..5b53e552d 100644 --- a/test/tests/providers/capi/capz/capi-cluster.rules.test.yml +++ b/test/tests/providers/capi/capz/capi-cluster.rules.test.yml @@ -31,6 +31,7 @@ tests: exp_annotations: description: "Cluster giantswarm/clippaxy stuck in Pending phase." opsrecipe: capi-cluster/ + dashboard: bdi7iswg81czkcasd/capi-agregated-error-logs-for-capi-controllers - alertname: ClusterStatusNotReady eval_time: 75m exp_alerts: @@ -48,6 +49,7 @@ tests: exp_annotations: description: "Cluster giantswarm/grumpy is not ready." opsrecipe: capi-cluster/ + dashboard: bdi7iswg81czkcasd/capi-agregated-error-logs-for-capi-controllers - alertname: ClusterPaused eval_time: 75m exp_alerts: @@ -64,3 +66,4 @@ tests: exp_annotations: description: "The cluster giantswarm/grumpy is paused." opsrecipe: capi-cluster/ + dashboard: bdi7iswg81czkcasd/capi-agregated-error-logs-for-capi-controllers diff --git a/test/tests/providers/capi/capz/capi-kubeadmcontrolplane.rules.test.yml b/test/tests/providers/capi/capz/capi-kubeadmcontrolplane.rules.test.yml index 7fab46d5d..9dd6f3aa6 100644 --- a/test/tests/providers/capi/capz/capi-kubeadmcontrolplane.rules.test.yml +++ b/test/tests/providers/capi/capz/capi-kubeadmcontrolplane.rules.test.yml @@ -31,6 +31,7 @@ tests: exp_annotations: description: "The clusters clippaxy kubeadmcontrolplane giantswarm/clippaxy-72jzy does not match the expected number of replicas for longer than 90 minutes." opsrecipe: capi-kubeadmcontrolplane/ + dashboard: bdi7iswg81czkcasd/capi-agregated-error-logs-for-capi-controllers - alertname: KubeadmControlPlanePaused eval_time: 75m exp_alerts: @@ -48,3 +49,4 @@ tests: exp_annotations: description: "The clusters grumpy kubeadmcontrolplane giantswarm/grumpy-72r5c is paused." 
opsrecipe: capi-kubeadmcontrolplane/ + dashboard: bdi7iswg81czkcasd/capi-agregated-error-logs-for-capi-controllers diff --git a/test/tests/providers/capi/capz/capi-machine.rules.test.yml b/test/tests/providers/capi/capz/capi-machine.rules.test.yml index b166d6800..50b6d2888 100644 --- a/test/tests/providers/capi/capz/capi-machine.rules.test.yml +++ b/test/tests/providers/capi/capz/capi-machine.rules.test.yml @@ -28,6 +28,7 @@ tests: exp_annotations: description: "Machine giantswarm/clippaxy-72jq5 stuck in phase Failed for more than 30 minutes." opsrecipe: capi-machine/ + dashboard: bdi7iswg81czkcasd/capi-agregated-error-logs-for-capi-controllers - alertname: MachinePaused eval_time: 75m exp_alerts: @@ -45,3 +46,4 @@ tests: exp_annotations: description: "Machine giantswarm/grumpy-72r5c is paused." opsrecipe: capi-machine/ + dashboard: bdi7iswg81czkcasd/capi-agregated-error-logs-for-capi-controllers diff --git a/test/tests/providers/capi/capz/capi-machinedeployment.rules.test.yml b/test/tests/providers/capi/capz/capi-machinedeployment.rules.test.yml index a6f0bb225..58f719cd3 100644 --- a/test/tests/providers/capi/capz/capi-machinedeployment.rules.test.yml +++ b/test/tests/providers/capi/capz/capi-machinedeployment.rules.test.yml @@ -26,6 +26,7 @@ tests: exp_annotations: description: "The clusters clippaxy machinedeployment giantswarm/clippaxy-def00 is not healthy." opsrecipe: capi-machinedeployment/ + dashboard: bdi7iswg81czkcasd/capi-agregated-error-logs-for-capi-controllers - alertname: MachineDeploymentPaused eval_time: 75m exp_alerts: @@ -43,3 +44,4 @@ tests: exp_annotations: description: "The clusters grumpy machinedeployment giantswarm/grumpy-def99 is paused." 
opsrecipe: capi-machinedeployment/ + dashboard: bdi7iswg81czkcasd/capi-agregated-error-logs-for-capi-controllers diff --git a/test/tests/providers/capi/capz/capi-machinepool.rules.test.yml b/test/tests/providers/capi/capz/capi-machinepool.rules.test.yml index dbb6a2952..7e7709628 100644 --- a/test/tests/providers/capi/capz/capi-machinepool.rules.test.yml +++ b/test/tests/providers/capi/capz/capi-machinepool.rules.test.yml @@ -26,6 +26,7 @@ tests: exp_annotations: description: "The clusters clippaxy machinepool giantswarm/clippaxy-def00 is not healthy." opsrecipe: capi-machinepool/ + dashboard: bdi7iswg81czkcasd/capi-agregated-error-logs-for-capi-controllers - alertname: MachinePoolPaused eval_time: 75m exp_alerts: @@ -43,3 +44,4 @@ tests: exp_annotations: description: "The clusters grumpy machinepool giantswarm/grumpy-72r5c is paused." opsrecipe: capi-machinepool/ + dashboard: bdi7iswg81czkcasd/capi-agregated-error-logs-for-capi-controllers diff --git a/test/tests/providers/capi/capz/capi-machineset.rules.test.yml b/test/tests/providers/capi/capz/capi-machineset.rules.test.yml index 729f72676..5d75ec758 100644 --- a/test/tests/providers/capi/capz/capi-machineset.rules.test.yml +++ b/test/tests/providers/capi/capz/capi-machineset.rules.test.yml @@ -24,3 +24,4 @@ tests: exp_annotations: description: "Machineset giantswarm/grumpy-def99 is paused." opsrecipe: capi-machineset/ + dashboard: bdi7iswg81czkcasd/capi-agregated-error-logs-for-capi-controllers