Commit
fix grafana and silence operator alerts for mimir
QuentinBisson committed Mar 5, 2024
1 parent c417a5b commit 7da3cb3
Showing 4 changed files with 37 additions and 44 deletions.
@@ -3,8 +3,7 @@ kind: PrometheusRule
metadata:
labels:
{{- include "labels.common" . | nindent 4 }}
cluster_type: "management_cluster"
name: grafana.management-cluster.rules
name: grafana.rules
namespace: {{ .Values.namespace }}
spec:
groups:
@@ -15,15 +14,13 @@ spec:
description: '{{`Grafana ({{ $labels.instance }}) is down.`}}'
opsrecipe: grafana-down/
dashboard: qRQXmRnik/Grafana
expr: up{service="grafana"} == 0
expr: up{service="grafana", cluster_type="management_cluster"} == 0
for: 1h
labels:
area: managedservices
cancel_if_apiserver_down: "true"
cancel_if_cluster_status_creating: "true"
cancel_if_cluster_status_deleting: "true"
cancel_if_cluster_status_updating: "true"
cancel_if_scrape_timeout: "true"
cancel_if_outside_working_hours: "true"
severity: page
team: atlas
@@ -35,15 +32,13 @@ spec:
annotations:
description: '{{`Grafana Folder not updated for ({{ $labels.instance }}).`}}'
opsrecipe: grafana-perms/
expr: sum(increase(grafana_http_request_duration_seconds_count{handler="/api/folders/:uid/permissions/", method="POST", namespace="monitoring", service="grafana", status_code="200"}[2h])) < 1 or absent(grafana_http_request_duration_seconds_count{handler="/api/folders/:uid/permissions/", method="POST", namespace="monitoring", service="grafana", status_code="200"})
expr: sum(increase(grafana_http_request_duration_seconds_count{handler="/api/folders/:uid/permissions/", method="POST", namespace="monitoring", service="grafana", status_code="200", cluster_type="management_cluster"}[2h])) < 1 or absent(grafana_http_request_duration_seconds_count{handler="/api/folders/:uid/permissions/", method="POST", namespace="monitoring", service="grafana", status_code="200", cluster_type="management_cluster"})
for: 6h
labels:
area: managedservices
cancel_if_apiserver_down: "true"
cancel_if_cluster_status_creating: "true"
cancel_if_cluster_status_deleting: "true"
cancel_if_cluster_status_updating: "true"
cancel_if_scrape_timeout: "true"
cancel_if_outside_working_hours: "true"
severity: page
team: atlas
@@ -59,16 +54,28 @@ spec:
# - we create cronjob label from cron name (label_replace)
# - we sum number of failed to have one global value
# - we avg_over_time to avoid 0 value when a cron was skipped for whatever reason
expr: sum(label_replace(avg_over_time(kube_job_status_failed{job_name=~"grafana-permissions.*", reason!="BackoffLimitExceeded"}[60m]), "cronjob", "$1", "job_name", "(grafana-permissions)-.*")) by (cronjob) > 0
expr: sum(label_replace(avg_over_time(kube_job_status_failed{job_name=~"grafana-permissions.*", reason!="BackoffLimitExceeded", cluster_type="management_cluster"}[60m]), "cronjob", "$1", "job_name", "(grafana-permissions)-.*")) by (cronjob) > 0
for: 6h
labels:
area: managedservices
cancel_if_apiserver_down: "true"
cancel_if_cluster_status_creating: "true"
cancel_if_cluster_status_deleting: "true"
cancel_if_cluster_status_updating: "true"
cancel_if_scrape_timeout: "true"
cancel_if_outside_working_hours: "true"
severity: page
team: atlas
topic: observability
- alert: GrafanaPermissionJobHasNotBeenScheduledForTooLong
annotations:
description: '{{`CronJob {{ $labels.namespace }}/{{ $labels.cronjob }} has not been scheduled for more than 1 day.`}}'
opsrecipe: job-has-not-been-scheduled-for-too-long/
# This alert triggers when the grafana permission job did not schedule for more than 1 day
# or if the job did not run successfully at least once in the last day
expr: (time() - kube_cronjob_status_last_schedule_time{cronjob="grafana-permissions", cluster_type="management_cluster"}) > 86400
or count(max_over_time(kube_job_status_succeeded{job_name=~"grafana-permission.+", cluster_type="management_cluster"}[1d]) == 1) == 0
labels:
area: empowerment
severity: page
team: atlas
topic: managementcluster
cancel_if_outside_working_hours: "true"
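The newly added GrafanaPermissionJobHasNotBeenScheduledForTooLong alert ships without a promtool unit test in this commit. A minimal sketch of one, following the format of the silence-operator test file changed at the bottom of this commit, could look as follows; the rule_files path, the monitoring namespace label and the eval times are illustrative assumptions, and the Helm template has to be rendered to plain YAML before promtool can load it:

rule_files:
  - grafana.rules.yml  # assumed path to the rendered (non-templated) rule file
tests:
  - interval: 1h
    input_series:
      # The CronJob is scheduled once at t=0 and never again. Note the
      # cluster_type label, which the new expression now selects on.
      - series: 'kube_cronjob_status_last_schedule_time{cronjob="grafana-permissions", cluster_type="management_cluster", namespace="monitoring"}'
        values: "0x48"
    alert_rule_test:
      # Less than 86400s after the last schedule: no alert expected.
      - alertname: GrafanaPermissionJobHasNotBeenScheduledForTooLong
        eval_time: 12h
        exp_alerts: []
      # More than one day after the last schedule: the alert fires with the
      # series labels plus the rule labels.
      - alertname: GrafanaPermissionJobHasNotBeenScheduledForTooLong
        eval_time: 30h
        exp_alerts:
          - exp_labels:
              cronjob: grafana-permissions
              cluster_type: management_cluster
              namespace: monitoring
              area: empowerment
              severity: page
              team: atlas
              topic: managementcluster
              cancel_if_outside_working_hours: "true"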
helm/prometheus-rules/templates/alerting-rules/job.rules.yml: 33 changes (3 additions, 30 deletions)
@@ -4,7 +4,6 @@ metadata:
creationTimestamp: null
labels:
{{- include "labels.common" . | nindent 4 }}
cluster_type: "management_cluster"
name: job.rules
namespace: {{ .Values.namespace }}
spec:
@@ -22,42 +21,16 @@ spec:
severity: notify
team: {{ include "providerTeam" . }}
topic: managementcluster
- alert: SilenceOperatorSyncJobHasNotBeenScheduledForTooLong
annotations:
description: '{{`CronJob {{ $labels.namespace }}/{{ $labels.cronjob }} has not been scheduled for more than 1 day.`}}'
opsrecipe: job-has-not-been-scheduled-for-too-long/
# This alert triggers when the silence operator sync job did not schedule for more than 1 day
# or if the job did not run successfully at least once in the last day
expr: (time() - kube_cronjob_status_last_schedule_time{cronjob="silence-operator-sync"} ) > 86400
or count(max_over_time(kube_job_status_succeeded{job_name=~"silence-operator-sync.+"}[1d]) == 1) == 0
labels:
area: empowerment
severity: page
team: atlas
topic: managementcluster
- alert: GrafanaPermissionJobHasNotBeenScheduledForTooLong
annotations:
description: '{{`CronJob {{ $labels.namespace }}/{{ $labels.cronjob }} has not been scheduled for more than 1 day.`}}'
opsrecipe: job-has-not-been-scheduled-for-too-long/
# This alert triggers when the grafana permission job did not schedule for more than 1 day
# or if the job did not run successfully at least once in the last day
expr: (time() - kube_cronjob_status_last_schedule_time{cronjob="grafana-permissions"} ) > 86400
or count(max_over_time(kube_job_status_succeeded{job_name=~"grafana-permission.+"}[1d]) == 1) == 0
labels:
area: empowerment
severity: page
team: atlas
topic: managementcluster
{{- if eq .Values.managementCluster.provider.kind "aws" }}
{{- if eq .Values.managementCluster.provider.kind "aws" }}
- alert: JobHasNotBeenScheduledForTooLong
annotations:
description: '{{`CronJob {{ $labels.namespace }}/{{ $labels.cronjob }} has not been scheduled for more than 2 hours.`}}'
opsrecipe: job-has-not-been-scheduled-for-too-long/
expr: (time() - kube_cronjob_status_last_schedule_time{cronjob="route53-manager"} ) > 7200
expr: (time() - kube_cronjob_status_last_schedule_time{cronjob="route53-manager"}) > 7200
for: 15m
labels:
area: kaas
severity: page
team: phoenix
topic: managementcluster
{{- end }}
{{- end }}
@@ -7,14 +7,14 @@ metadata:
namespace: {{ .Values.namespace }}
spec:
groups:
- name: silences
- name: silence-operator
rules:
- alert: "SilenceOperatorReconcileErrors"
annotations:
description: '{{`silence-operator controller {{ $labels.controller }} too many reconcile errors.`}}'
opsrecipe: "silence-operator-reconcile-errors/"
expr: |
avg_over_time(operatorkit_controller_errors_total{app="silence-operator"}[20m]) > 0
avg_over_time(operatorkit_controller_errors_total{app="silence-operator", cluster_type="management_cluster"}[20m]) > 0
for: 1h
labels:
area: "empowerment"
@@ -23,3 +23,16 @@
severity: "page"
team: "atlas"
topic: "observability"
- alert: SilenceOperatorSyncJobHasNotBeenScheduledForTooLong
annotations:
description: '{{`CronJob {{ $labels.namespace }}/{{ $labels.cronjob }} has not been scheduled for more than 1 day.`}}'
opsrecipe: job-has-not-been-scheduled-for-too-long/
# This alert triggers when the silence operator sync job did not schedule for more than 1 day
# or if the job did not run successfully at least once in the last day
expr: (time() - kube_cronjob_status_last_schedule_time{cronjob="silence-operator-sync", cluster_type="management_cluster"}) > 86400
or count(max_over_time(kube_job_status_succeeded{job_name=~"silence-operator-sync.+", cluster_type="management_cluster"}[1d]) == 1) == 0
labels:
area: empowerment
severity: page
team: atlas
topic: managementcluster
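Because this alert (like the grafana one above) was moved rather than newly written, a regression test could be added to the existing unit-test file shown next in this diff, as one more item under its tests: list. The sketch below covers the healthy case, where the CronJob keeps being scheduled and one of its jobs succeeded within the last day, so no alert is expected; the job_name value and the eval time are illustrative assumptions:

  - interval: 1h
    input_series:
      # last_schedule_time tracks the evaluation time, i.e. the CronJob fires every hour.
      - series: 'kube_cronjob_status_last_schedule_time{cronjob="silence-operator-sync", cluster_type="management_cluster", namespace="monitoring"}'
        values: "0+3600x48"
      # A job created by the CronJob reports success throughout the window.
      - series: 'kube_job_status_succeeded{job_name="silence-operator-sync-28490000", cluster_type="management_cluster", namespace="monitoring"}'
        values: "1x48"
    alert_rule_test:
      - alertname: SilenceOperatorSyncJobHasNotBeenScheduledForTooLong
        eval_time: 36h
        exp_alerts: []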
@@ -5,7 +5,7 @@ rule_files:
tests:
- interval: 1m
input_series:
- series: 'operatorkit_controller_errors_total{app="silence-operator", controller="silence-controller"}'
- series: 'operatorkit_controller_errors_total{app="silence-operator", controller="silence-controller", cluster_type="management_cluster"}'
values: "0x30 1+0x20 20x45 20-1x20 0x100"
alert_rule_test:
- alertname: SilenceOperatorReconcileErrors