diff --git a/helm/prometheus-rules/templates/alerting-rules/kyverno.all.rules.yml b/helm/prometheus-rules/templates/alerting-rules/kyverno.all.rules.yml
index ece840e2d..a7cbb8f9d 100644
--- a/helm/prometheus-rules/templates/alerting-rules/kyverno.all.rules.yml
+++ b/helm/prometheus-rules/templates/alerting-rules/kyverno.all.rules.yml
@@ -25,7 +25,7 @@ spec:
         severity: page
         team: shield
         topic: kyverno
-  - name: reports
+  - name: resources
     rules:
     - alert: KyvernoAdmissionReportCountTooHigh
       annotations:
@@ -42,6 +42,21 @@ spec:
         severity: page
         team: shield
         topic: kyverno
+    - alert: KyvernoUpdateRequestsCountTooHigh
+      annotations:
+        description: "{{`Kyverno {{ $labels.kind }} are too high. This is an indicator that Kyverno's background controller may not be able to create some resources.`}}"
+        opsrecipe: kyverno-updaterequests/
+      expr: aggregation:kyverno_resource_counts{kind=~"(generate|update)requests.kyverno.io"} > 5000
+      for: 15m
+      labels:
+        area: managedservices
+        cancel_if_cluster_status_creating: "true"
+        cancel_if_cluster_status_deleting: "true"
+        cancel_if_cluster_status_updating: "true"
+        cancel_if_outside_working_hours: "false"
+        severity: notify
+        team: shield
+        topic: kyverno
   - name: replicas
     rules:
     - alert: KyvernoScaledDownTooLong
diff --git a/test/tests/providers/global/kyverno.all.rules.test.yml b/test/tests/providers/global/kyverno.all.rules.test.yml
index b855bb76b..68e803f71 100644
--- a/test/tests/providers/global/kyverno.all.rules.test.yml
+++ b/test/tests/providers/global/kyverno.all.rules.test.yml
@@ -20,6 +20,9 @@ tests:
       # Kyverno admission reports
       - series: 'aggregation:kyverno_resource_counts{kind="admissionreports.kyverno.io"}'
         values: "0+1000x30 30000+1500x30"
+      # Kyverno updaterequests
+      - series: 'aggregation:kyverno_resource_counts{kind="updaterequests.kyverno.io"}'
+        values: "0+100x15 5000+1500x30"
     alert_rule_test:
       # Webhooks alert
       - alertname: KyvernoWebhookHasNoAvailableReplicas
@@ -54,6 +57,23 @@ tests:
             exp_annotations:
               description: "Kyverno admissionreports.kyverno.io are too high. This is an indicator that Kyverno's report processing may not be keeping up with cluster demand."
               opsrecipe: "kyverno-reports/"
+      # Kyverno update requests too high alert
+      - alertname: KyvernoUpdateRequestsCountTooHigh
+        eval_time: 45m
+        exp_alerts:
+          - exp_labels:
+              area: managedservices
+              severity: notify
+              team: shield
+              topic: kyverno
+              cancel_if_cluster_status_creating: "true"
+              cancel_if_cluster_status_deleting: "true"
+              cancel_if_cluster_status_updating: "true"
+              cancel_if_outside_working_hours: "false"
+              kind: "updaterequests.kyverno.io"
+            exp_annotations:
+              description: "Kyverno updaterequests.kyverno.io are too high. This is an indicator that Kyverno's background controller may not be able to create some resources."
+              opsrecipe: "kyverno-updaterequests/"
       # Kyverno scaled down alert
       - alertname: KyvernoScaledDownTooLong
         eval_time: 240m