Skip to content

Commit

Permalink
Fix monitoring alerts (#1050)
Browse files: browse the repository at this point in the history
  • Loading branch information
QuentinBisson authored Mar 5, 2024
1 parent 213e9b1 commit e6b011c
Show file tree
Hide file tree
Showing 2 changed files with 10 additions and 10 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,7 @@ spec:
max_over_time(sum(
count(
## number of remotes that are not mimir or grafana-cloud
prometheus_remote_storage_metadata_total{remote_name!~"grafana-cloud|mimir"}
prometheus_remote_storage_metadata_total{remote_name!~"grafana-cloud|mimir.*"}
)
!=
sum(
Expand Down Expand Up @@ -111,7 +111,7 @@ spec:
max_over_time(sum(
count(
## number of remotes that are not mimir or grafana-cloud
prometheus_remote_storage_metadata_total{remote_name!~"grafana-cloud|mimir"}
prometheus_remote_storage_metadata_total{remote_name!~"grafana-cloud|mimir.*"}
)
!=
sum(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ spec:
- alert: PrometheusOperatorDown
annotations:
description: '{{`Prometheus-operator ({{ $labels.instance }}) is down.`}}'
expr: up{app="prometheus-operator"} == 0
expr: up{app=~"prometheus-operator.*|kube-prometheus-.*"} == 0
for: 15m
labels:
area: empowerment
Expand All @@ -31,7 +31,7 @@ spec:
- alert: PrometheusOperatorListErrors
annotations:
description: Errors while performing List operations in controller {{`{{`}}$labels.controller{{`}}`}} in {{`{{`}}$labels.namespace{{`}}`}} namespace.
expr: (sum by (controller,namespace) (rate(prometheus_operator_list_operations_failed_total{app="prometheus-operator",namespace="{{ .Values.namespace }}"}[10m])) / sum by (controller,namespace) (rate(prometheus_operator_list_operations_total{app="prometheus-operator",namespace="{{ .Values.namespace }}"}[10m]))) > 0.4
expr: (sum by (cluster_id, controller,namespace) (rate(prometheus_operator_list_operations_failed_total{app=~"prometheus-operator.*|kube-prometheus-.*"}[10m])) / sum by (cluster_id, controller,namespace) (rate(prometheus_operator_list_operations_total{app=~"prometheus-operator.*|kube-prometheus-.*"}[10m]))) > 0.4
for: 15m
labels:
area: empowerment
Expand All @@ -42,7 +42,7 @@ spec:
- alert: PrometheusOperatorWatchErrors
annotations:
description: Errors while performing watch operations in controller {{`{{`}}$labels.controller{{`}}`}} in {{`{{`}}$labels.namespace{{`}}`}} namespace.
expr: (sum by (controller,namespace) (rate(prometheus_operator_watch_operations_failed_total{app="prometheus-operator",namespace="{{ .Values.namespace }}"}[10m])) / sum by (controller,namespace) (rate(prometheus_operator_watch_operations_total{app="prometheus-operator",namespace="{{ .Values.namespace }}"}[10m]))) > 0.4
expr: (sum by (cluster_id, controller, namespace) (rate(prometheus_operator_watch_operations_failed_total{app=~"prometheus-operator.*|kube-prometheus-.*"}[10m])) / sum by (cluster_id, controller,namespace) (rate(prometheus_operator_watch_operations_total{app=~"prometheus-operator.*|kube-prometheus-.*"}[10m]))) > 0.4
for: 15m
labels:
area: empowerment
Expand All @@ -53,7 +53,7 @@ spec:
- alert: PrometheusOperatorSyncFailed
annotations:
description: Controller {{`{{`}} $labels.controller {{`}}`}} in {{`{{`}} $labels.namespace {{`}}`}} namespace fails to reconcile {{`{{`}} $value {{`}}`}} objects.
expr: min_over_time(prometheus_operator_syncs{status="failed",app="prometheus-operator",namespace="{{ .Values.namespace }}"}[5m]) > 0
expr: min_over_time(prometheus_operator_syncs{status="failed",app=~"prometheus-operator.*|kube-prometheus-.*"}[5m]) > 0
for: 10m
labels:
area: empowerment
Expand All @@ -64,7 +64,7 @@ spec:
- alert: PrometheusOperatorReconcileErrors
annotations:
description: '{{`{{`}} $value | humanizePercentage {{`}}`}} of reconciling operations failed for {{`{{`}} $labels.controller {{`}}`}} controller in {{`{{`}} $labels.namespace {{`}}`}} namespace.'
expr: (sum by (controller,namespace) (rate(prometheus_operator_reconcile_errors_total{app="prometheus-operator",namespace="{{ .Values.namespace }}"}[5m]))) / (sum by (controller,namespace) (rate(prometheus_operator_reconcile_operations_total{app="prometheus-operator",namespace="{{ .Values.namespace }}"}[5m]))) > 0.1
expr: (sum by (cluster_id, controller,namespace) (rate(prometheus_operator_reconcile_errors_total{app=~"prometheus-operator.*|kube-prometheus-.*"}[5m]))) / (sum by (cluster_id, controller,namespace) (rate(prometheus_operator_reconcile_operations_total{app=~"prometheus-operator.*|kube-prometheus-.*"}[5m]))) > 0.1
for: 10m
labels:
area: empowerment
Expand All @@ -75,7 +75,7 @@ spec:
- alert: PrometheusOperatorNodeLookupErrors
annotations:
description: Errors while reconciling Prometheus in {{`{{`}} $labels.namespace {{`}}`}} Namespace.
expr: rate(prometheus_operator_node_address_lookup_errors_total{app="prometheus-operator",namespace="{{ .Values.namespace }}"}[5m]) > 0.1
expr: rate(prometheus_operator_node_address_lookup_errors_total{app=~"prometheus-operator.*|kube-prometheus-.*"}[5m]) > 0.1
for: 10m
labels:
area: empowerment
Expand All @@ -86,7 +86,7 @@ spec:
- alert: PrometheusOperatorNotReady
annotations:
description: Prometheus operator in {{`{{`}} $labels.namespace {{`}}`}} namespace isn't ready to reconcile {{`{{`}} $labels.controller {{`}}`}} resources.
expr: min by(namespace, controller) (max_over_time(prometheus_operator_ready{app="prometheus-operator",namespace="{{ .Values.namespace }}"}[5m]) == 0)
expr: min by(cluster_id, namespace, controller) (max_over_time(prometheus_operator_ready{app=~"prometheus-operator.*|kube-prometheus-.*"}[5m]) == 0)
for: 5m
labels:
area: empowerment
Expand All @@ -97,7 +97,7 @@ spec:
- alert: PrometheusOperatorRejectedResources
annotations:
description: Prometheus operator in {{`{{`}} $labels.namespace {{`}}`}} namespace rejected {{`{{`}} printf "%0.0f" $value {{`}}`}} {{`{{`}} $labels.controller {{`}}`}}/{{`{{`}} $labels.resource {{`}}`}} resources.
expr: min_over_time(prometheus_operator_managed_resources{state="rejected",app="prometheus-operator",namespace="{{ .Values.namespace }}"}[5m]) > 0
expr: min_over_time(prometheus_operator_managed_resources{state="rejected",app=~"prometheus-operator.*|kube-prometheus-.*"}[5m]) > 0
for: 5m
labels:
area: empowerment
Expand Down

0 comments on commit e6b011c

Please sign in to comment.