From 1a163b2e153e388f9bc2e77070641a83eae5592d Mon Sep 17 00:00:00 2001
From: Jose Armesto
Date: Mon, 9 Dec 2024 15:23:39 +0100
Subject: [PATCH] Add karpenter alerts (#1449)

* Add karpenter alerts

* Add cluster_id label
---
 CHANGELOG.md                                  |  4 ++
 .../alerting-rules/karpenter.rules.yml        | 57 +++++++++++++++++++
 2 files changed, 61 insertions(+)
 create mode 100644 helm/prometheus-rules/templates/kaas/phoenix/alerting-rules/karpenter.rules.yml

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 3e54912b..37c8ce4a 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -7,6 +7,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## [Unreleased]
 
+### Added
+
+- Add alerts for `karpenter` issues.
+
 ## [4.29.0] - 2024-12-09
 
 ### Changed
diff --git a/helm/prometheus-rules/templates/kaas/phoenix/alerting-rules/karpenter.rules.yml b/helm/prometheus-rules/templates/kaas/phoenix/alerting-rules/karpenter.rules.yml
new file mode 100644
index 00000000..75e9a4cd
--- /dev/null
+++ b/helm/prometheus-rules/templates/kaas/phoenix/alerting-rules/karpenter.rules.yml
@@ -0,0 +1,57 @@
+# Alerting rules for Karpenter, shipped as a single PrometheusRule resource.
+apiVersion: monitoring.coreos.com/v1
+kind: PrometheusRule
+metadata:
+  labels:
+    {{- include "labels.common" . | nindent 4 }}
+  name: karpenter.rules
+  namespace: {{ .Values.namespace }}
+spec:
+  groups:
+  - name: karpenter
+    rules:
+    # Fires when the launched and registered machine totals differ for 1h.
+    - alert: KarpenterCanNotRegisterNewNodes
+      annotations:
+        description: |
+          Karpenter provisioner {{`{{ $labels.provisioner }}`}} on cluster {{`{{ $labels.cluster_id }}`}} launched new nodes, but some of the nodes did not register in the cluster.
+        opsrecipe: karpenter/
+      expr: sum by (provisioner, cluster_id, installation, pipeline, provider) (karpenter_machines_launched) - sum by (provisioner, cluster_id, installation, pipeline, provider)(karpenter_machines_registered) != 0
+      for: 1h
+      labels:
+        area: kaas
+        cancel_if_monitoring_agent_down: "true"
+        cancel_if_outside_working_hours: "true"
+        severity: page
+        team: {{ include "providerTeam" . }}
+        topic: karpenter
+    # Fires when karpenter_provisioner_usage_pct stays above 90 for 72h.
+    - alert: KarpenterProvisionerAlmostFull
+      annotations:
+        description: |
+          Provisioner {{`{{ $labels.provisioner }}`}} on cluster {{`{{ $labels.cluster_id }}`}} is almost full.
+        opsrecipe: karpenter/
+      expr: karpenter_provisioner_usage_pct > 90
+      for: 72h
+      labels:
+        area: kaas
+        cancel_if_monitoring_agent_down: "true"
+        cancel_if_outside_working_hours: "true"
+        severity: page
+        team: {{ include "providerTeam" . }}
+        topic: karpenter
+    # Fires when the 5m rate of cloud provider errors stays above 0.1 for 10m.
+    - alert: KarpenterCloudproviderErrors
+      annotations:
+        description: |
+          Karpenter on cluster {{`{{ $labels.cluster_id }}`}} is getting errors during API calls to the cloud provider.
+        opsrecipe: karpenter/
+      expr: rate(karpenter_cloudprovider_errors_total{}[5m]) > 0.1
+      for: 10m
+      labels:
+        area: kaas
+        cancel_if_monitoring_agent_down: "true"
+        cancel_if_outside_working_hours: "true"
+        severity: page
+        team: {{ include "providerTeam" . }}
+        topic: karpenter