Skip to content

Commit

Permalink
capi-add-dashboard-to-alerts (#1121)
Browse files Browse the repository at this point in the history
* capi-add-dashboard-to-alerts
  • Loading branch information
calvix authored Apr 17, 2024
1 parent a9b1c63 commit a9e5198
Show file tree
Hide file tree
Showing 15 changed files with 31 additions and 0 deletions.
4 changes: 4 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

## [Unreleased]

### Added

- Add CAPI and CAPA dashboards to the corresponding alerts.

### Fixed

- Fix link to `PrometheusMissingGrafanaCloud` opsrecipe
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ spec:
annotations:
description: '{{`Container {{ $labels.container }} in pod {{ $labels.namespace }}/{{ $labels.pod }} is restarting too often.`}}'
opsrecipe: container-is-restarting-too-often/
dashboard: bdiako8tt1b7kc/capa-agregated-error-logs-for-capa-controllers
expr: label_join(increase(kube_pod_container_status_restarts_total{container=~"(aws.*|capa-.*|irsa-operator.*)", cluster_type="management_cluster"}[1h]), "service", "/", "namespace", "pod") > 6
for: 5m
labels:
Expand Down Expand Up @@ -65,6 +66,7 @@ spec:
annotations:
description: '{{`IRSA bootstrap for Cluster {{ $labels.cluster_id }} failed.`}}'
opsrecipe: irsa-operator-error/
dashboard: bdiako8tt1b7kc/capa-agregated-error-logs-for-capa-controllers
expr: irsa_operator_cluster_errors > 0
for: 10m
labels:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ spec:
description: |-
{{`Cluster {{ $labels.exported_namespace }}/{{ $labels.name }} stuck in {{ $labels.phase }} phase.`}}
opsrecipe: capi-cluster/
dashboard: bdi7iswg81czkcasd/capi-agregated-error-logs-for-capi-controllers

- alert: ClusterStatusNotReady
expr: capi_cluster_status_condition{status="False", type="Ready"} > 0
Expand All @@ -38,6 +39,7 @@ spec:
description: |-
{{`Cluster {{ $labels.exported_namespace }}/{{ $labels.name }} is not ready.`}}
opsrecipe: capi-cluster/
dashboard: bdi7iswg81czkcasd/capi-agregated-error-logs-for-capi-controllers

- alert: ClusterPaused
expr: capi_cluster_annotation_paused{paused_value="true"} > 0
Expand All @@ -53,4 +55,5 @@ spec:
description: |-
{{`The cluster {{ $labels.exported_namespace }}/{{ $labels.name }} is paused.`}}
opsrecipe: capi-cluster/
dashboard: bdi7iswg81czkcasd/capi-agregated-error-logs-for-capi-controllers
{{- end }}
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ spec:
description: |-
{{`The clusters {{$labels.cluster_name}} kubeadmcontrolplane {{$labels.exported_namespace}}/{{$labels.name}} does not match the expected number of replicas for longer than 90 minutes.`}}
opsrecipe: capi-kubeadmcontrolplane/
dashboard: bdi7iswg81czkcasd/capi-agregated-error-logs-for-capi-controllers

- alert: KubeadmControlPlanePaused
expr: capi_kubeadmcontrolplane_annotation_paused{paused_value="true"} > 0
Expand All @@ -39,4 +40,5 @@ spec:
description: |-
{{`The clusters {{$labels.cluster_name}} kubeadmcontrolplane {{$labels.exported_namespace}}/{{$labels.name}} is paused.`}}
opsrecipe: capi-kubeadmcontrolplane/
dashboard: bdi7iswg81czkcasd/capi-agregated-error-logs-for-capi-controllers
{{- end }}
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ spec:
description: |-
{{`Machine {{ $labels.exported_namespace}}/{{ $labels.name }} stuck in phase {{ $labels.phase }} for more than 30 minutes.`}}
opsrecipe: capi-machine/
dashboard: bdi7iswg81czkcasd/capi-agregated-error-logs-for-capi-controllers
expr: capi_machine_status_phase{phase!~"Running", name!~".*bastion.*"} > 0
for: 30m
labels:
Expand All @@ -38,4 +39,5 @@ spec:
description: |-
{{`Machine {{ $labels.exported_namespace}}/{{ $labels.name }} is paused.`}}
opsrecipe: capi-machine/
dashboard: bdi7iswg81czkcasd/capi-agregated-error-logs-for-capi-controllers
{{- end }}
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ spec:
description: |-
{{`The clusters {{$labels.cluster_name}} machinedeployment {{$labels.exported_namespace}}/{{$labels.name}} is not healthy.`}}
opsrecipe: capi-machinedeployment/
dashboard: bdi7iswg81czkcasd/capi-agregated-error-logs-for-capi-controllers
- alert: MachineDeploymentPaused
expr: capi_machinedeployment_annotation_paused{paused_value="true"} > 0
for: 1h
Expand All @@ -37,4 +38,5 @@ spec:
description: |-
{{`The clusters {{$labels.cluster_name}} machinedeployment {{$labels.exported_namespace}}/{{$labels.name}} is paused.`}}
opsrecipe: capi-machinedeployment/
dashboard: bdi7iswg81czkcasd/capi-agregated-error-logs-for-capi-controllers
{{- end }}
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ spec:
description: |-
{{`The clusters {{ $labels.cluster_name }} machinepool {{ $labels.exported_namespace }}/{{ $labels.name }} is not healthy.`}}
opsrecipe: capi-machinepool/
dashboard: bdi7iswg81czkcasd/capi-agregated-error-logs-for-capi-controllers

- alert: MachinePoolPaused
expr: capi_machinepool_annotation_paused{paused_value="true"} > 0
Expand All @@ -38,4 +39,5 @@ spec:
description: |-
{{`The clusters {{ $labels.cluster_name }} machinepool {{ $labels.exported_namespace }}/{{ $labels.name }} is paused.`}}
opsrecipe: capi-machinepool/
dashboard: bdi7iswg81czkcasd/capi-agregated-error-logs-for-capi-controllers
{{- end }}
Original file line number Diff line number Diff line change
Expand Up @@ -23,4 +23,5 @@ spec:
description: |-
{{`Machineset {{ $labels.exported_namespace }}/{{ $labels.name }} is paused.`}}
opsrecipe: capi-machineset/
dashboard: bdi7iswg81czkcasd/capi-agregated-error-logs-for-capi-controllers
{{- end }}
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ spec:
annotations:
description: '{{`Container {{ $labels.container }} in pod {{ $labels.namespace }}/{{ $labels.pod }} is restarting too often.`}}'
opsrecipe: container-is-restarting-too-often/
dashboard: bdi7iswg81czkcasd/capi-agregated-error-logs-for-capi-controllers
expr: label_join(increase(kube_pod_container_status_restarts_total{container=~"capi.*", cluster_type="management_cluster"}[1h]), "service", "/", "namespace", "pod") > 6
for: 5m
labels:
Expand Down
3 changes: 3 additions & 0 deletions test/tests/providers/capi/capz/capi-cluster.rules.test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ tests:
exp_annotations:
description: "Cluster giantswarm/clippaxy stuck in Pending phase."
opsrecipe: capi-cluster/
dashboard: bdi7iswg81czkcasd/capi-agregated-error-logs-for-capi-controllers
- alertname: ClusterStatusNotReady
eval_time: 75m
exp_alerts:
Expand All @@ -48,6 +49,7 @@ tests:
exp_annotations:
description: "Cluster giantswarm/grumpy is not ready."
opsrecipe: capi-cluster/
dashboard: bdi7iswg81czkcasd/capi-agregated-error-logs-for-capi-controllers
- alertname: ClusterPaused
eval_time: 75m
exp_alerts:
Expand All @@ -64,3 +66,4 @@ tests:
exp_annotations:
description: "The cluster giantswarm/grumpy is paused."
opsrecipe: capi-cluster/
dashboard: bdi7iswg81czkcasd/capi-agregated-error-logs-for-capi-controllers
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ tests:
exp_annotations:
description: "The clusters clippaxy kubeadmcontrolplane giantswarm/clippaxy-72jzy does not match the expected number of replicas for longer than 90 minutes."
opsrecipe: capi-kubeadmcontrolplane/
dashboard: bdi7iswg81czkcasd/capi-agregated-error-logs-for-capi-controllers
- alertname: KubeadmControlPlanePaused
eval_time: 75m
exp_alerts:
Expand All @@ -48,3 +49,4 @@ tests:
exp_annotations:
description: "The clusters grumpy kubeadmcontrolplane giantswarm/grumpy-72r5c is paused."
opsrecipe: capi-kubeadmcontrolplane/
dashboard: bdi7iswg81czkcasd/capi-agregated-error-logs-for-capi-controllers
2 changes: 2 additions & 0 deletions test/tests/providers/capi/capz/capi-machine.rules.test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ tests:
exp_annotations:
description: "Machine giantswarm/clippaxy-72jq5 stuck in phase Failed for more than 30 minutes."
opsrecipe: capi-machine/
dashboard: bdi7iswg81czkcasd/capi-agregated-error-logs-for-capi-controllers
- alertname: MachinePaused
eval_time: 75m
exp_alerts:
Expand All @@ -45,3 +46,4 @@ tests:
exp_annotations:
description: "Machine giantswarm/grumpy-72r5c is paused."
opsrecipe: capi-machine/
dashboard: bdi7iswg81czkcasd/capi-agregated-error-logs-for-capi-controllers
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ tests:
exp_annotations:
description: "The clusters clippaxy machinedeployment giantswarm/clippaxy-def00 is not healthy."
opsrecipe: capi-machinedeployment/
dashboard: bdi7iswg81czkcasd/capi-agregated-error-logs-for-capi-controllers
- alertname: MachineDeploymentPaused
eval_time: 75m
exp_alerts:
Expand All @@ -43,3 +44,4 @@ tests:
exp_annotations:
description: "The clusters grumpy machinedeployment giantswarm/grumpy-def99 is paused."
opsrecipe: capi-machinedeployment/
dashboard: bdi7iswg81czkcasd/capi-agregated-error-logs-for-capi-controllers
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ tests:
exp_annotations:
description: "The clusters clippaxy machinepool giantswarm/clippaxy-def00 is not healthy."
opsrecipe: capi-machinepool/
dashboard: bdi7iswg81czkcasd/capi-agregated-error-logs-for-capi-controllers
- alertname: MachinePoolPaused
eval_time: 75m
exp_alerts:
Expand All @@ -43,3 +44,4 @@ tests:
exp_annotations:
description: "The clusters grumpy machinepool giantswarm/grumpy-72r5c is paused."
opsrecipe: capi-machinepool/
dashboard: bdi7iswg81czkcasd/capi-agregated-error-logs-for-capi-controllers
Original file line number Diff line number Diff line change
Expand Up @@ -24,3 +24,4 @@ tests:
exp_annotations:
description: "Machineset giantswarm/grumpy-def99 is paused."
opsrecipe: capi-machineset/
dashboard: bdi7iswg81czkcasd/capi-agregated-error-logs-for-capi-controllers

0 comments on commit a9e5198

Please sign in to comment.