From 7da3cb3762c7dd007cc53cd778bbf721d1e91949 Mon Sep 17 00:00:00 2001
From: QuentinBisson
Date: Tue, 5 Mar 2024 12:17:22 +0100
Subject: [PATCH] fix grafana and silence operator alerts for mimir

---
 ...nt-cluster.rules.yml => grafana.rules.yml} | 29 +++++++++-------
 .../templates/alerting-rules/job.rules.yml    | 33 ++-----------------
 .../alerting-rules/silence-operator.rules.yml | 17 ++++++++--
 .../global/silence-operator.rules.test.yml    |  2 +-
 4 files changed, 37 insertions(+), 44 deletions(-)
 rename helm/prometheus-rules/templates/alerting-rules/{grafana.management-cluster.rules.yml => grafana.rules.yml} (65%)

diff --git a/helm/prometheus-rules/templates/alerting-rules/grafana.management-cluster.rules.yml b/helm/prometheus-rules/templates/alerting-rules/grafana.rules.yml
similarity index 65%
rename from helm/prometheus-rules/templates/alerting-rules/grafana.management-cluster.rules.yml
rename to helm/prometheus-rules/templates/alerting-rules/grafana.rules.yml
index 0d7fe747f..c0b40f317 100644
--- a/helm/prometheus-rules/templates/alerting-rules/grafana.management-cluster.rules.yml
+++ b/helm/prometheus-rules/templates/alerting-rules/grafana.rules.yml
@@ -3,8 +3,7 @@ kind: PrometheusRule
 metadata:
   labels:
     {{- include "labels.common" . | nindent 4 }}
-    cluster_type: "management_cluster"
-  name: grafana.management-cluster.rules
+  name: grafana.rules
   namespace: {{ .Values.namespace }}
 spec:
   groups:
@@ -15,15 +14,13 @@ spec:
         description: '{{`Grafana ({{ $labels.instance }}) is down.`}}'
         opsrecipe: grafana-down/
         dashboard: qRQXmRnik/Grafana
-      expr: up{service="grafana"} == 0
+      expr: up{service="grafana", cluster_type="management_cluster"} == 0
       for: 1h
       labels:
         area: managedservices
-        cancel_if_apiserver_down: "true"
         cancel_if_cluster_status_creating: "true"
         cancel_if_cluster_status_deleting: "true"
         cancel_if_cluster_status_updating: "true"
-        cancel_if_scrape_timeout: "true"
         cancel_if_outside_working_hours: "true"
         severity: page
         team: atlas
@@ -35,15 +32,13 @@ spec:
       annotations:
         description: '{{`Grafana Folder not updated for ({{ $labels.instance }}).`}}'
         opsrecipe: grafana-perms/
-      expr: sum(increase(grafana_http_request_duration_seconds_count{handler="/api/folders/:uid/permissions/", method="POST", namespace="monitoring", service="grafana", status_code="200"}[2h])) < 1 or absent(grafana_http_request_duration_seconds_count{handler="/api/folders/:uid/permissions/", method="POST", namespace="monitoring", service="grafana", status_code="200"})
+      expr: sum(increase(grafana_http_request_duration_seconds_count{handler="/api/folders/:uid/permissions/", method="POST", namespace="monitoring", service="grafana", status_code="200", cluster_type="management_cluster"}[2h])) < 1 or absent(grafana_http_request_duration_seconds_count{handler="/api/folders/:uid/permissions/", method="POST", namespace="monitoring", service="grafana", status_code="200", cluster_type="management_cluster"})
       for: 6h
       labels:
         area: managedservices
-        cancel_if_apiserver_down: "true"
         cancel_if_cluster_status_creating: "true"
         cancel_if_cluster_status_deleting: "true"
         cancel_if_cluster_status_updating: "true"
-        cancel_if_scrape_timeout: "true"
         cancel_if_outside_working_hours: "true"
         severity: page
         team: atlas
@@ -59,16 +54,28 @@ spec:
       # - we create cronjob label from cron name (label_replace)
       # - we sum number of failed to have one global value
       # - we avg_over_time to avoid 0 value when a cron was skipped for whatever reason
-      expr: sum(label_replace(avg_over_time(kube_job_status_failed{job_name=~"grafana-permissions.*", reason!="BackoffLimitExceeded"}[60m]), "cronjob", "$1", "job_name", "(grafana-permissions)-.*")) by (cronjob) > 0
+      expr: sum(label_replace(avg_over_time(kube_job_status_failed{job_name=~"grafana-permissions.*", reason!="BackoffLimitExceeded", cluster_type="management_cluster"}[60m]), "cronjob", "$1", "job_name", "(grafana-permissions)-.*")) by (cronjob) > 0
       for: 6h
       labels:
         area: managedservices
-        cancel_if_apiserver_down: "true"
         cancel_if_cluster_status_creating: "true"
         cancel_if_cluster_status_deleting: "true"
         cancel_if_cluster_status_updating: "true"
-        cancel_if_scrape_timeout: "true"
         cancel_if_outside_working_hours: "true"
         severity: page
         team: atlas
         topic: observability
+    - alert: GrafanaPermissionJobHasNotBeenScheduledForTooLong
+      annotations:
+        description: '{{`CronJob {{ $labels.namespace }}/{{ $labels.cronjob }} has not been scheduled for more than 1 day.`}}'
+        opsrecipe: job-has-not-been-scheduled-for-too-long/
+      # This alert triggers when the grafana permission job did not schedule for more than 1 day
+      # or if the job did not run successfully at least once in the last day
+      expr: (time() - kube_cronjob_status_last_schedule_time{cronjob="grafana-permissions", cluster_type="management_cluster"}) > 86400
+        or count(max_over_time(kube_job_status_succeeded{job_name=~"grafana-permission.+", cluster_type="management_cluster"}[1d]) == 1) == 0
+      labels:
+        area: empowerment
+        severity: page
+        team: atlas
+        topic: managementcluster
+        cancel_if_outside_working_hours: "true"
diff --git a/helm/prometheus-rules/templates/alerting-rules/job.rules.yml b/helm/prometheus-rules/templates/alerting-rules/job.rules.yml
index 1bb6afc2f..533343618 100644
--- a/helm/prometheus-rules/templates/alerting-rules/job.rules.yml
+++ b/helm/prometheus-rules/templates/alerting-rules/job.rules.yml
@@ -4,7 +4,6 @@ metadata:
   creationTimestamp: null
   labels:
     {{- include "labels.common" . | nindent 4 }}
-    cluster_type: "management_cluster"
   name: job.rules
   namespace: {{ .Values.namespace }}
 spec:
@@ -22,42 +21,16 @@ spec:
         severity: notify
         team: {{ include "providerTeam" . }}
         topic: managementcluster
-    - alert: SilenceOperatorSyncJobHasNotBeenScheduledForTooLong
-      annotations:
-        description: '{{`CronJob {{ $labels.namespace }}/{{ $labels.cronjob }} has not been scheduled for more than 1 day.`}}'
-        opsrecipe: job-has-not-been-scheduled-for-too-long/
-      # This alert triggers when the silence operator sync job did not schedule for more than 1 day
-      # or if the job did not run successfully at least once in the last day
-      expr: (time() - kube_cronjob_status_last_schedule_time{cronjob="silence-operator-sync"} ) > 86400
-        or count(max_over_time(kube_job_status_succeeded{job_name=~"silence-operator-sync.+"}[1d]) == 1) == 0
-      labels:
-        area: empowerment
-        severity: page
-        team: atlas
-        topic: managementcluster
-    - alert: GrafanaPermissionJobHasNotBeenScheduledForTooLong
-      annotations:
-        description: '{{`CronJob {{ $labels.namespace }}/{{ $labels.cronjob }} has not been scheduled for more than 1 day.`}}'
-        opsrecipe: job-has-not-been-scheduled-for-too-long/
-      # This alert triggers when the grafana permission job did not schedule for more than 1 day
-      # or if the job did not run successfully at least once in the last day
-      expr: (time() - kube_cronjob_status_last_schedule_time{cronjob="grafana-permissions"} ) > 86400
-        or count(max_over_time(kube_job_status_succeeded{job_name=~"grafana-permission.+"}[1d]) == 1) == 0
-      labels:
-        area: empowerment
-        severity: page
-        team: atlas
-        topic: managementcluster
-    {{- if eq .Values.managementCluster.provider.kind "aws" }}
+{{- if eq .Values.managementCluster.provider.kind "aws" }}
     - alert: JobHasNotBeenScheduledForTooLong
       annotations:
         description: '{{`CronJob {{ $labels.namespace }}/{{ $labels.cronjob }} has not been scheduled for more than 2 hours.`}}'
         opsrecipe: job-has-not-been-scheduled-for-too-long/
-      expr: (time() - kube_cronjob_status_last_schedule_time{cronjob="route53-manager"} ) > 7200
+      expr: (time() - kube_cronjob_status_last_schedule_time{cronjob="route53-manager"}) > 7200
       for: 15m
       labels:
         area: kaas
         severity: page
         team: phoenix
         topic: managementcluster
-    {{- end }}
+{{- end }}
diff --git a/helm/prometheus-rules/templates/alerting-rules/silence-operator.rules.yml b/helm/prometheus-rules/templates/alerting-rules/silence-operator.rules.yml
index 0c0b038a7..ce09c1305 100644
--- a/helm/prometheus-rules/templates/alerting-rules/silence-operator.rules.yml
+++ b/helm/prometheus-rules/templates/alerting-rules/silence-operator.rules.yml
@@ -7,14 +7,14 @@ metadata:
   namespace: {{ .Values.namespace }}
 spec:
   groups:
-  - name: silences
+  - name: silence-operator
     rules:
     - alert: "SilenceOperatorReconcileErrors"
       annotations:
         description: '{{`silence-operator controller {{ $labels.controller }} too many reconcile errors.`}}'
         opsrecipe: "silence-operator-reconcile-errors/"
       expr: |
-        avg_over_time(operatorkit_controller_errors_total{app="silence-operator"}[20m]) > 0
+        avg_over_time(operatorkit_controller_errors_total{app="silence-operator", cluster_type="management_cluster"}[20m]) > 0
       for: 1h
       labels:
         area: "empowerment"
@@ -23,3 +23,16 @@ spec:
         severity: "page"
         team: "atlas"
         topic: "observability"
+    - alert: SilenceOperatorSyncJobHasNotBeenScheduledForTooLong
+      annotations:
+        description: '{{`CronJob {{ $labels.namespace }}/{{ $labels.cronjob }} has not been scheduled for more than 1 day.`}}'
+        opsrecipe: job-has-not-been-scheduled-for-too-long/
+      # This alert triggers when the silence operator sync job did not schedule for more than 1 day
+      # or if the job did not run successfully at least once in the last day
+      expr: (time() - kube_cronjob_status_last_schedule_time{cronjob="silence-operator-sync", cluster_type="management_cluster"}) > 86400
+        or count(max_over_time(kube_job_status_succeeded{job_name=~"silence-operator-sync.+", cluster_type="management_cluster"}[1d]) == 1) == 0
+      labels:
+        area: empowerment
+        severity: page
+        team: atlas
+        topic: managementcluster
\ No newline at end of file
diff --git a/test/tests/providers/global/silence-operator.rules.test.yml b/test/tests/providers/global/silence-operator.rules.test.yml
index f6556027d..3efa91c42 100644
--- a/test/tests/providers/global/silence-operator.rules.test.yml
+++ b/test/tests/providers/global/silence-operator.rules.test.yml
@@ -5,7 +5,7 @@ rule_files:
 tests:
   - interval: 1m
     input_series:
-      - series: 'operatorkit_controller_errors_total{app="silence-operator", controller="silence-controller"}'
+      - series: 'operatorkit_controller_errors_total{app="silence-operator", controller="silence-controller", cluster_type="management_cluster"}'
         values: "0x30 1+0x20 20x45 20-1x20 0x100"
     alert_rule_test:
       - alertname: SilenceOperatorReconcileErrors