From ba332efcf16fb4fb157974fd4eda5f141e9f1caa Mon Sep 17 00:00:00 2001
From: QuentinBisson
Date: Mon, 6 Nov 2023 15:23:20 +0100
Subject: [PATCH 1/2] Add keda alerting rules

Signed-off-by: QuentinBisson
---
Reviewer notes (this section is ignored by git am):
* Adds the PrometheusRule "keda.rules" with one group, "Keda", holding four
  alerts: KedaDown, KedaScaledObjectErrors,
  KedaWebhookScaledObjectValidationErrors and KedaScalerErrors.
* KedaDown fires when at least one "up" series whose container label matches
  keda-.* has been 0 for 10m; the three error alerts fire when the
  corresponding counter increased over a 10m window, sustained for 15m.
* Touch-ups relative to the submitted patch: the comparison in
  KedaScaledObjectErrors is spaced like its siblings (") > 0"), and the
  namespace placeholder in the three templated descriptions is spaced like
  the other placeholders. The abbreviated blob ids on the "index" lines were
  not regenerated.

 CHANGELOG.md                                |  4 ++
 .../templates/alerting-rules/keda.rules.yml | 71 +++++++++++++++++++
 2 files changed, 75 insertions(+)
 create mode 100644 helm/prometheus-rules/templates/alerting-rules/keda.rules.yml

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 9248f56d6..463467c46 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -7,6 +7,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## [Unreleased]
 
+### Added
+
+- Add KEDA alerting rules.
+
 ### Changed
 
 - Added `namespace` label to Flux helm release related alerts
diff --git a/helm/prometheus-rules/templates/alerting-rules/keda.rules.yml b/helm/prometheus-rules/templates/alerting-rules/keda.rules.yml
new file mode 100644
index 000000000..bf602f6f6
--- /dev/null
+++ b/helm/prometheus-rules/templates/alerting-rules/keda.rules.yml
@@ -0,0 +1,71 @@
+apiVersion: monitoring.coreos.com/v1
+kind: PrometheusRule
+metadata:
+  creationTimestamp: null
+  labels:
+    {{- include "labels.common" . | nindent 4 }}
+  name: keda.rules
+  namespace: {{ .Values.namespace }}
+spec:
+  groups:
+  - name: Keda
+    rules:
+    - alert: KedaDown
+      annotations:
+        description: 'Keda is down.'
+      expr: count (up{container=~"keda-.*"} == 0) > 0
+      for: 10m
+      labels:
+        area: kaas
+        cancel_if_apiserver_down: "true"
+        cancel_if_cluster_status_creating: "true"
+        cancel_if_cluster_status_deleting: "true"
+        cancel_if_cluster_status_updating: "true"
+        severity: page
+        team: phoenix
+        topic: autoscaling
+    - alert: KedaScaledObjectErrors
+      annotations:
+        description: '{{`Errors detected in scaled object {{ $labels.scaledObject }} in namespace {{ $labels.namespace }}.`}}'
+      expr: increase(keda_scaled_object_errors[10m]) > 0
+      for: 15m
+      labels:
+        area: kaas
+        cancel_if_apiserver_down: "true"
+        cancel_if_cluster_status_creating: "true"
+        cancel_if_cluster_status_deleting: "true"
+        cancel_if_cluster_status_updating: "true"
+        cancel_if_outside_working_hours: "true"
+        severity: notify
+        team: phoenix
+        topic: autoscaling
+    - alert: KedaWebhookScaledObjectValidationErrors
+      annotations:
+        description: '{{`Validation errors detected in webhook for scaled object {{ $labels.scaledObject }} in namespace {{ $labels.namespace }}.`}}'
+      expr: increase(keda_webhook_scaled_object_validation_errors[10m]) > 0
+      for: 15m
+      labels:
+        area: kaas
+        cancel_if_apiserver_down: "true"
+        cancel_if_cluster_status_creating: "true"
+        cancel_if_cluster_status_deleting: "true"
+        cancel_if_cluster_status_updating: "true"
+        cancel_if_outside_working_hours: "true"
+        severity: notify
+        team: phoenix
+        topic: autoscaling
+    - alert: KedaScalerErrors
+      annotations:
+        description: '{{`Errors detected in scaler {{ $labels.scaler }} for scaled object {{ $labels.scaledObject }} in namespace {{ $labels.namespace }}.`}}'
+      expr: increase(keda_scaler_errors[10m]) > 0
+      for: 15m
+      labels:
+        area: kaas
+        cancel_if_apiserver_down: "true"
+        cancel_if_cluster_status_creating: "true"
+        cancel_if_cluster_status_deleting: "true"
+        cancel_if_cluster_status_updating: "true"
+        cancel_if_outside_working_hours: "true"
+        severity: notify
+        team: phoenix
+        topic: autoscaling

From 6f0b44c23ecb706e4b2392233ab473a01a4abfcd Mon Sep 17 00:00:00 2001
From: QuentinBisson
Date: Mon, 6 Nov 2023 17:18:37 +0100
Subject: [PATCH 2/2] move to atlas and make alerts notify only

---
Reviewer notes (this section is ignored by git am):
* KedaDown: severity changes from page to notify and
  cancel_if_outside_working_hours is added.
* All four alerts: team changes from phoenix to atlas.

 .../templates/alerting-rules/keda.rules.yml | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/helm/prometheus-rules/templates/alerting-rules/keda.rules.yml b/helm/prometheus-rules/templates/alerting-rules/keda.rules.yml
index bf602f6f6..abd0b8880 100644
--- a/helm/prometheus-rules/templates/alerting-rules/keda.rules.yml
+++ b/helm/prometheus-rules/templates/alerting-rules/keda.rules.yml
@@ -21,8 +21,9 @@ spec:
         cancel_if_cluster_status_creating: "true"
         cancel_if_cluster_status_deleting: "true"
         cancel_if_cluster_status_updating: "true"
-        severity: page
-        team: phoenix
+        cancel_if_outside_working_hours: "true"
+        severity: notify
+        team: atlas
         topic: autoscaling
     - alert: KedaScaledObjectErrors
       annotations:
@@ -37,7 +38,7 @@ spec:
         cancel_if_cluster_status_updating: "true"
         cancel_if_outside_working_hours: "true"
         severity: notify
-        team: phoenix
+        team: atlas
         topic: autoscaling
     - alert: KedaWebhookScaledObjectValidationErrors
       annotations:
@@ -52,7 +53,7 @@ spec:
         cancel_if_cluster_status_updating: "true"
         cancel_if_outside_working_hours: "true"
         severity: notify
-        team: phoenix
+        team: atlas
         topic: autoscaling
     - alert: KedaScalerErrors
       annotations:
@@ -67,5 +68,5 @@ spec:
         cancel_if_cluster_status_updating: "true"
         cancel_if_outside_working_hours: "true"
         severity: notify
-        team: phoenix
+        team: atlas
         topic: autoscaling