From a96e9cdeb190cb8b13f643dd99940884dedbccd0 Mon Sep 17 00:00:00 2001 From: Quentin Bisson Date: Mon, 8 Apr 2024 10:39:16 +0200 Subject: [PATCH] Fix some missing ops-recipes (#1107) * Fix some missing ops-recipes * Fix tests --- CHANGELOG.md | 4 ++++ .../alerting-rules/deployment.workload-cluster.rules.yml | 1 + .../templates/alerting-rules/fluentbit.rules.yml | 1 + .../alerting-rules/prometheus-operator.rules.yml | 8 ++++++++ .../templates/alerting-rules/silence-operator.rules.yml | 2 +- .../templates/alerting-rules/sloth.rules.yml | 3 ++- .../providers/global/silence-operator.rules.test.yml | 2 +- test/tests/providers/global/sloth.rules.test.yml | 3 ++- 8 files changed, 20 insertions(+), 4 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 7ccc3d1b6..b127c1baf 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +### Fixed + +- Fix missing ops-recipes. + ### Changed - Make Atlas rules compatible with Mimir: diff --git a/helm/prometheus-rules/templates/alerting-rules/deployment.workload-cluster.rules.yml b/helm/prometheus-rules/templates/alerting-rules/deployment.workload-cluster.rules.yml index 1f7bfb6c9..4295569b6 100644 --- a/helm/prometheus-rules/templates/alerting-rules/deployment.workload-cluster.rules.yml +++ b/helm/prometheus-rules/templates/alerting-rules/deployment.workload-cluster.rules.yml @@ -75,6 +75,7 @@ spec: - alert: WorkloadClusterDeploymentNotSatisfiedBigMac annotations: description: '{{`Deployment {{ $labels.namespace}}/{{ $labels.deployment }} has been scaled down to zero for prolonged period of time.`}}' + opsrecipe: workload-cluster-deployment-not-satisfied/ expr: kube_deployment_status_replicas_unavailable{cluster_type="workload_cluster", deployment=~"cert-manager-*|teleport-*|dex*|athena*|rbac-operator|credentiald"} > 0 for: 30m labels: diff --git a/helm/prometheus-rules/templates/alerting-rules/fluentbit.rules.yml b/helm/prometheus-rules/templates/alerting-rules/fluentbit.rules.yml index 383e9579c..b0090193a 100644 --- a/helm/prometheus-rules/templates/alerting-rules/fluentbit.rules.yml +++ b/helm/prometheus-rules/templates/alerting-rules/fluentbit.rules.yml @@ -60,6 +60,7 @@ spec: - alert: FluentbitDaemonSetNotSatisfied annotations: description: '{{`Daemonset {{ $labels.namespace}}/{{ $labels.daemonset }} is not satisfied.`}}' + opsrecipe: daemonset-not-satisfied/ expr: kube_daemonset_status_number_unavailable{daemonset="fluent-logshipping-app"} > 0 for: 1h labels: diff --git a/helm/prometheus-rules/templates/alerting-rules/prometheus-operator.rules.yml b/helm/prometheus-rules/templates/alerting-rules/prometheus-operator.rules.yml index f3eb598f4..032bf13fe 100644 --- a/helm/prometheus-rules/templates/alerting-rules/prometheus-operator.rules.yml +++ b/helm/prometheus-rules/templates/alerting-rules/prometheus-operator.rules.yml @@ -16,6 +16,7 @@ spec: - alert: PrometheusOperatorDown annotations: description: '{{`Prometheus-operator ({{ $labels.instance }}) is down.`}}' + opsrecipe: "prometheus-operator/" expr: up{app=~"prometheus-operator.*|kube-prometheus-.*"} == 0 for: 15m labels: @@ -33,6 +34,7 @@ spec: - alert: PrometheusOperatorListErrors annotations: description: Errors while performing List operations in controller {{`{{`}}$labels.controller{{`}}`}} in {{`{{`}}$labels.namespace{{`}}`}} namespace. + opsrecipe: "prometheus-operator/" expr: (sum by (cluster_id, installation, provider, pipeline, controller, namespace) (rate(prometheus_operator_list_operations_failed_total{app=~"prometheus-operator.*|kube-prometheus-.*"}[10m])) / sum by (cluster_id, installation, provider, pipeline, controller,namespace) (rate(prometheus_operator_list_operations_total{app=~"prometheus-operator.*|kube-prometheus-.*"}[10m]))) > 0.4 for: 15m labels: @@ -44,6 +46,7 @@ spec: - alert: PrometheusOperatorWatchErrors annotations: description: Errors while performing watch operations in controller {{`{{`}}$labels.controller{{`}}`}} in {{`{{`}}$labels.namespace{{`}}`}} namespace. + opsrecipe: "prometheus-operator/" expr: (sum by (cluster_id, installation, provider, pipeline, controller, namespace) (rate(prometheus_operator_watch_operations_failed_total{app=~"prometheus-operator.*|kube-prometheus-.*"}[10m])) / sum by (cluster_id, installation, provider, pipeline, controller,namespace) (rate(prometheus_operator_watch_operations_total{app=~"prometheus-operator.*|kube-prometheus-.*"}[10m]))) > 0.4 for: 15m labels: @@ -55,6 +58,7 @@ spec: - alert: PrometheusOperatorSyncFailed annotations: description: Controller {{`{{`}} $labels.controller {{`}}`}} in {{`{{`}} $labels.namespace {{`}}`}} namespace fails to reconcile {{`{{`}} $value {{`}}`}} objects. + opsrecipe: "prometheus-operator/" expr: min_over_time(prometheus_operator_syncs{status="failed",app=~"prometheus-operator.*|kube-prometheus-.*"}[5m]) > 0 for: 10m labels: @@ -66,6 +70,7 @@ spec: - alert: PrometheusOperatorReconcileErrors annotations: description: '{{`{{`}} $value | humanizePercentage {{`}}`}} of reconciling operations failed for {{`{{`}} $labels.controller {{`}}`}} controller in {{`{{`}} $labels.namespace {{`}}`}} namespace.' + opsrecipe: "prometheus-operator/" expr: (sum by (cluster_id, installation, provider, pipeline, controller,namespace) (rate(prometheus_operator_reconcile_errors_total{app=~"prometheus-operator.*|kube-prometheus-.*"}[5m]))) / (sum by (cluster_id, installation, provider, pipeline, controller,namespace) (rate(prometheus_operator_reconcile_operations_total{app=~"prometheus-operator.*|kube-prometheus-.*"}[5m]))) > 0.1 for: 10m labels: @@ -77,6 +82,7 @@ spec: - alert: PrometheusOperatorNodeLookupErrors annotations: description: Errors while reconciling Prometheus in {{`{{`}} $labels.namespace {{`}}`}} Namespace. + opsrecipe: "prometheus-operator/" expr: rate(prometheus_operator_node_address_lookup_errors_total{app=~"prometheus-operator.*|kube-prometheus-.*"}[5m]) > 0.1 for: 10m labels: @@ -88,6 +94,7 @@ spec: - alert: PrometheusOperatorNotReady annotations: description: Prometheus operator in {{`{{`}} $labels.namespace {{`}}`}} namespace isn't ready to reconcile {{`{{`}} $labels.controller {{`}}`}} resources. + opsrecipe: "prometheus-operator/" expr: min by (cluster_id, installation, provider, pipeline, namespace, controller) (max_over_time(prometheus_operator_ready{app=~"prometheus-operator.*|kube-prometheus-.*"}[5m]) == 0) for: 5m labels: @@ -99,6 +106,7 @@ spec: - alert: PrometheusOperatorRejectedResources annotations: description: Prometheus operator in {{`{{`}} $labels.namespace {{`}}`}} namespace rejected {{`{{`}} printf "%0.0f" $value {{`}}`}} {{`{{`}} $labels.controller {{`}}`}}/{{`{{`}} $labels.resource {{`}}`}} resources. + opsrecipe: "prometheus-operator/" expr: min_over_time(prometheus_operator_managed_resources{state="rejected",app=~"prometheus-operator.*|kube-prometheus-.*"}[5m]) > 0 for: 5m labels: diff --git a/helm/prometheus-rules/templates/alerting-rules/silence-operator.rules.yml b/helm/prometheus-rules/templates/alerting-rules/silence-operator.rules.yml index 0f1b7c68e..60efbe21b 100644 --- a/helm/prometheus-rules/templates/alerting-rules/silence-operator.rules.yml +++ b/helm/prometheus-rules/templates/alerting-rules/silence-operator.rules.yml @@ -12,7 +12,7 @@ spec: - alert: "SilenceOperatorReconcileErrors" annotations: description: '{{`silence-operator controller {{ $labels.controller }} too many reconcile errors.`}}' - opsrecipe: "silence-operator-reconcile-errors/" + opsrecipe: "operator-not-reconciling/" expr: | avg_over_time(operatorkit_controller_errors_total{app="silence-operator", cluster_type="management_cluster"}[20m]) > 0 for: 1h diff --git a/helm/prometheus-rules/templates/alerting-rules/sloth.rules.yml b/helm/prometheus-rules/templates/alerting-rules/sloth.rules.yml index 1aaab0de0..120326263 100644 --- a/helm/prometheus-rules/templates/alerting-rules/sloth.rules.yml +++ b/helm/prometheus-rules/templates/alerting-rules/sloth.rules.yml @@ -12,6 +12,7 @@ spec: - alert: SlothDown annotations: description: 'Sloth is down.' + opsrecipe: sloth-down/ expr: count(up{app="sloth"} == 0) by (cluster_id, installation, provider, pipeline) > 0 for: 5m labels: @@ -21,7 +22,7 @@ spec: cancel_if_cluster_status_deleting: "true" cancel_if_cluster_status_updating: "true" cancel_if_scrape_timeout: "true" - cancel_if_outside_working_hours: {{ include "workingHoursOnly" . }} + cancel_if_outside_working_hours: "true" severity: page team: atlas topic: observability diff --git a/test/tests/providers/global/silence-operator.rules.test.yml b/test/tests/providers/global/silence-operator.rules.test.yml index 23d092d3b..799f41e38 100644 --- a/test/tests/providers/global/silence-operator.rules.test.yml +++ b/test/tests/providers/global/silence-operator.rules.test.yml @@ -25,6 +25,6 @@ tests: topic: "observability" exp_annotations: description: "silence-operator controller silence-controller too many reconcile errors." - opsrecipe: "silence-operator-reconcile-errors/" + opsrecipe: "operator-not-reconciling/" - alertname: SilenceOperatorReconcileErrors eval_time: 215m diff --git a/test/tests/providers/global/sloth.rules.test.yml b/test/tests/providers/global/sloth.rules.test.yml index 76a14bd1a..813eff327 100644 --- a/test/tests/providers/global/sloth.rules.test.yml +++ b/test/tests/providers/global/sloth.rules.test.yml @@ -30,6 +30,7 @@ tests: cancel_if_cluster_status_deleting: "true" cancel_if_cluster_status_updating: "true" cancel_if_scrape_timeout: "true" - cancel_if_outside_working_hours: "false" + cancel_if_outside_working_hours: "true" exp_annotations: description: "Sloth is down." + opsrecipe: sloth-down/