Skip to content

Commit

Permalink
Fix some missing ops-recipes (#1107)
Browse files Browse the repository at this point in the history
* Fix some missing ops-recipes

* Fix tests
  • Loading branch information
QuentinBisson authored Apr 8, 2024
1 parent 953aa3a commit a96e9cd
Show file tree
Hide file tree
Showing 8 changed files with 20 additions and 4 deletions.
4 changes: 4 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

## [Unreleased]

### Fixed

- Fix missing ops-recipes.

### Changed

- Make Atlas rules compatible with Mimir:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,7 @@ spec:
- alert: WorkloadClusterDeploymentNotSatisfiedBigMac
annotations:
description: '{{`Deployment {{ $labels.namespace}}/{{ $labels.deployment }} has been scaled down to zero for prolonged period of time.`}}'
opsrecipe: workload-cluster-deployment-not-satisfied/
expr: kube_deployment_status_replicas_unavailable{cluster_type="workload_cluster", deployment=~"cert-manager-*|teleport-*|dex*|athena*|rbac-operator|credentiald"} > 0
for: 30m
labels:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,7 @@ spec:
- alert: FluentbitDaemonSetNotSatisfied
annotations:
description: '{{`Daemonset {{ $labels.namespace}}/{{ $labels.daemonset }} is not satisfied.`}}'
opsrecipe: daemonset-not-satisfied/
expr: kube_daemonset_status_number_unavailable{daemonset="fluent-logshipping-app"} > 0
for: 1h
labels:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ spec:
- alert: PrometheusOperatorDown
annotations:
description: '{{`Prometheus-operator ({{ $labels.instance }}) is down.`}}'
opsrecipe: "prometheus-operator/"
expr: up{app=~"prometheus-operator.*|kube-prometheus-.*"} == 0
for: 15m
labels:
Expand All @@ -33,6 +34,7 @@ spec:
- alert: PrometheusOperatorListErrors
annotations:
description: Errors while performing List operations in controller {{`{{`}}$labels.controller{{`}}`}} in {{`{{`}}$labels.namespace{{`}}`}} namespace.
opsrecipe: "prometheus-operator/"
expr: (sum by (cluster_id, installation, provider, pipeline, controller, namespace) (rate(prometheus_operator_list_operations_failed_total{app=~"prometheus-operator.*|kube-prometheus-.*"}[10m])) / sum by (cluster_id, installation, provider, pipeline, controller,namespace) (rate(prometheus_operator_list_operations_total{app=~"prometheus-operator.*|kube-prometheus-.*"}[10m]))) > 0.4
for: 15m
labels:
Expand All @@ -44,6 +46,7 @@ spec:
- alert: PrometheusOperatorWatchErrors
annotations:
description: Errors while performing watch operations in controller {{`{{`}}$labels.controller{{`}}`}} in {{`{{`}}$labels.namespace{{`}}`}} namespace.
opsrecipe: "prometheus-operator/"
expr: (sum by (cluster_id, installation, provider, pipeline, controller, namespace) (rate(prometheus_operator_watch_operations_failed_total{app=~"prometheus-operator.*|kube-prometheus-.*"}[10m])) / sum by (cluster_id, installation, provider, pipeline, controller,namespace) (rate(prometheus_operator_watch_operations_total{app=~"prometheus-operator.*|kube-prometheus-.*"}[10m]))) > 0.4
for: 15m
labels:
Expand All @@ -55,6 +58,7 @@ spec:
- alert: PrometheusOperatorSyncFailed
annotations:
description: Controller {{`{{`}} $labels.controller {{`}}`}} in {{`{{`}} $labels.namespace {{`}}`}} namespace fails to reconcile {{`{{`}} $value {{`}}`}} objects.
opsrecipe: "prometheus-operator/"
expr: min_over_time(prometheus_operator_syncs{status="failed",app=~"prometheus-operator.*|kube-prometheus-.*"}[5m]) > 0
for: 10m
labels:
Expand All @@ -66,6 +70,7 @@ spec:
- alert: PrometheusOperatorReconcileErrors
annotations:
description: '{{`{{`}} $value | humanizePercentage {{`}}`}} of reconciling operations failed for {{`{{`}} $labels.controller {{`}}`}} controller in {{`{{`}} $labels.namespace {{`}}`}} namespace.'
opsrecipe: "prometheus-operator/"
expr: (sum by (cluster_id, installation, provider, pipeline, controller,namespace) (rate(prometheus_operator_reconcile_errors_total{app=~"prometheus-operator.*|kube-prometheus-.*"}[5m]))) / (sum by (cluster_id, installation, provider, pipeline, controller,namespace) (rate(prometheus_operator_reconcile_operations_total{app=~"prometheus-operator.*|kube-prometheus-.*"}[5m]))) > 0.1
for: 10m
labels:
Expand All @@ -77,6 +82,7 @@ spec:
- alert: PrometheusOperatorNodeLookupErrors
annotations:
description: Errors while reconciling Prometheus in {{`{{`}} $labels.namespace {{`}}`}} Namespace.
opsrecipe: "prometheus-operator/"
expr: rate(prometheus_operator_node_address_lookup_errors_total{app=~"prometheus-operator.*|kube-prometheus-.*"}[5m]) > 0.1
for: 10m
labels:
Expand All @@ -88,6 +94,7 @@ spec:
- alert: PrometheusOperatorNotReady
annotations:
description: Prometheus operator in {{`{{`}} $labels.namespace {{`}}`}} namespace isn't ready to reconcile {{`{{`}} $labels.controller {{`}}`}} resources.
opsrecipe: "prometheus-operator/"
expr: min by (cluster_id, installation, provider, pipeline, namespace, controller) (max_over_time(prometheus_operator_ready{app=~"prometheus-operator.*|kube-prometheus-.*"}[5m]) == 0)
for: 5m
labels:
Expand All @@ -99,6 +106,7 @@ spec:
- alert: PrometheusOperatorRejectedResources
annotations:
description: Prometheus operator in {{`{{`}} $labels.namespace {{`}}`}} namespace rejected {{`{{`}} printf "%0.0f" $value {{`}}`}} {{`{{`}} $labels.controller {{`}}`}}/{{`{{`}} $labels.resource {{`}}`}} resources.
opsrecipe: "prometheus-operator/"
expr: min_over_time(prometheus_operator_managed_resources{state="rejected",app=~"prometheus-operator.*|kube-prometheus-.*"}[5m]) > 0
for: 5m
labels:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ spec:
- alert: "SilenceOperatorReconcileErrors"
annotations:
description: '{{`silence-operator controller {{ $labels.controller }} too many reconcile errors.`}}'
-opsrecipe: "silence-operator-reconcile-errors/"
+opsrecipe: "operator-not-reconciling/"
expr: |
avg_over_time(operatorkit_controller_errors_total{app="silence-operator", cluster_type="management_cluster"}[20m]) > 0
for: 1h
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ spec:
- alert: SlothDown
annotations:
description: 'Sloth is down.'
opsrecipe: sloth-down/
expr: count(up{app="sloth"} == 0) by (cluster_id, installation, provider, pipeline) > 0
for: 5m
labels:
Expand All @@ -21,7 +22,7 @@ spec:
cancel_if_cluster_status_deleting: "true"
cancel_if_cluster_status_updating: "true"
cancel_if_scrape_timeout: "true"
-cancel_if_outside_working_hours: {{ include "workingHoursOnly" . }}
+cancel_if_outside_working_hours: "true"
severity: page
team: atlas
topic: observability
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,6 @@ tests:
topic: "observability"
exp_annotations:
description: "silence-operator controller silence-controller too many reconcile errors."
-opsrecipe: "silence-operator-reconcile-errors/"
+opsrecipe: "operator-not-reconciling/"
- alertname: SilenceOperatorReconcileErrors
eval_time: 215m
3 changes: 2 additions & 1 deletion test/tests/providers/global/sloth.rules.test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ tests:
cancel_if_cluster_status_deleting: "true"
cancel_if_cluster_status_updating: "true"
cancel_if_scrape_timeout: "true"
-cancel_if_outside_working_hours: "false"
+cancel_if_outside_working_hours: "true"
exp_annotations:
description: "Sloth is down."
opsrecipe: sloth-down/

0 comments on commit a96e9cd

Please sign in to comment.