From a96e9cdeb190cb8b13f643dd99940884dedbccd0 Mon Sep 17 00:00:00 2001
From: Quentin Bisson <quentin@giantswarm.io>
Date: Mon, 8 Apr 2024 10:39:16 +0200
Subject: [PATCH] Fix some missing ops-recipes (#1107)

* Fix some missing ops-recipes

* Fix tests
---
 CHANGELOG.md                                              | 4 ++++
 .../alerting-rules/deployment.workload-cluster.rules.yml  | 1 +
 .../templates/alerting-rules/fluentbit.rules.yml          | 1 +
 .../alerting-rules/prometheus-operator.rules.yml          | 8 ++++++++
 .../templates/alerting-rules/silence-operator.rules.yml   | 2 +-
 .../templates/alerting-rules/sloth.rules.yml              | 3 ++-
 .../providers/global/silence-operator.rules.test.yml      | 2 +-
 test/tests/providers/global/sloth.rules.test.yml          | 3 ++-
 8 files changed, 20 insertions(+), 4 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 7ccc3d1b6..b127c1baf 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -7,6 +7,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## [Unreleased]
 
+### Fixed
+
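+- Add missing ops-recipes to the workload cluster deployment, fluentbit, prometheus-operator and sloth alerts, and point `SilenceOperatorReconcileErrors` to the `operator-not-reconciling/` recipe.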
+
 ### Changed
 
 - Make Atlas rules compatible with Mimir:
diff --git a/helm/prometheus-rules/templates/alerting-rules/deployment.workload-cluster.rules.yml b/helm/prometheus-rules/templates/alerting-rules/deployment.workload-cluster.rules.yml
index 1f7bfb6c9..4295569b6 100644
--- a/helm/prometheus-rules/templates/alerting-rules/deployment.workload-cluster.rules.yml
+++ b/helm/prometheus-rules/templates/alerting-rules/deployment.workload-cluster.rules.yml
@@ -75,6 +75,7 @@ spec:
     - alert: WorkloadClusterDeploymentNotSatisfiedBigMac
       annotations:
         description: '{{`Deployment {{ $labels.namespace}}/{{ $labels.deployment }} has been scaled down to zero for prolonged period of time.`}}'
+        opsrecipe: workload-cluster-deployment-not-satisfied/
       expr: kube_deployment_status_replicas_unavailable{cluster_type="workload_cluster", deployment=~"cert-manager-*|teleport-*|dex*|athena*|rbac-operator|credentiald"} > 0
       for: 30m
       labels:
diff --git a/helm/prometheus-rules/templates/alerting-rules/fluentbit.rules.yml b/helm/prometheus-rules/templates/alerting-rules/fluentbit.rules.yml
index 383e9579c..b0090193a 100644
--- a/helm/prometheus-rules/templates/alerting-rules/fluentbit.rules.yml
+++ b/helm/prometheus-rules/templates/alerting-rules/fluentbit.rules.yml
@@ -60,6 +60,7 @@ spec:
     - alert: FluentbitDaemonSetNotSatisfied
       annotations:
         description: '{{`Daemonset {{ $labels.namespace}}/{{ $labels.daemonset }} is not satisfied.`}}'
+        opsrecipe: daemonset-not-satisfied/
       expr: kube_daemonset_status_number_unavailable{daemonset="fluent-logshipping-app"} > 0
       for: 1h
       labels:
diff --git a/helm/prometheus-rules/templates/alerting-rules/prometheus-operator.rules.yml b/helm/prometheus-rules/templates/alerting-rules/prometheus-operator.rules.yml
index f3eb598f4..032bf13fe 100644
--- a/helm/prometheus-rules/templates/alerting-rules/prometheus-operator.rules.yml
+++ b/helm/prometheus-rules/templates/alerting-rules/prometheus-operator.rules.yml
@@ -16,6 +16,7 @@ spec:
     - alert: PrometheusOperatorDown
       annotations:
         description: '{{`Prometheus-operator ({{ $labels.instance }}) is down.`}}'
+        opsrecipe: "prometheus-operator/"
       expr: up{app=~"prometheus-operator.*|kube-prometheus-.*"} == 0
       for: 15m
       labels:
@@ -33,6 +34,7 @@ spec:
     - alert: PrometheusOperatorListErrors
       annotations:
         description: Errors while performing List operations in controller {{`{{`}}$labels.controller{{`}}`}} in {{`{{`}}$labels.namespace{{`}}`}} namespace.
+        opsrecipe: "prometheus-operator/"
       expr: (sum by (cluster_id, installation, provider, pipeline, controller, namespace) (rate(prometheus_operator_list_operations_failed_total{app=~"prometheus-operator.*|kube-prometheus-.*"}[10m])) / sum by (cluster_id, installation, provider, pipeline, controller,namespace) (rate(prometheus_operator_list_operations_total{app=~"prometheus-operator.*|kube-prometheus-.*"}[10m]))) > 0.4
       for: 15m
       labels:
@@ -44,6 +46,7 @@ spec:
     - alert: PrometheusOperatorWatchErrors
       annotations:
         description: Errors while performing watch operations in controller {{`{{`}}$labels.controller{{`}}`}} in {{`{{`}}$labels.namespace{{`}}`}} namespace.
+        opsrecipe: "prometheus-operator/"
       expr: (sum by (cluster_id, installation, provider, pipeline, controller, namespace) (rate(prometheus_operator_watch_operations_failed_total{app=~"prometheus-operator.*|kube-prometheus-.*"}[10m])) / sum by (cluster_id, installation, provider, pipeline, controller,namespace) (rate(prometheus_operator_watch_operations_total{app=~"prometheus-operator.*|kube-prometheus-.*"}[10m]))) > 0.4
       for: 15m
       labels:
@@ -55,6 +58,7 @@ spec:
     - alert: PrometheusOperatorSyncFailed
       annotations:
         description: Controller {{`{{`}} $labels.controller {{`}}`}} in {{`{{`}} $labels.namespace {{`}}`}} namespace fails to reconcile {{`{{`}} $value {{`}}`}} objects.
+        opsrecipe: "prometheus-operator/"
       expr: min_over_time(prometheus_operator_syncs{status="failed",app=~"prometheus-operator.*|kube-prometheus-.*"}[5m]) > 0
       for: 10m
       labels:
@@ -66,6 +70,7 @@ spec:
     - alert: PrometheusOperatorReconcileErrors
       annotations:
         description: '{{`{{`}} $value | humanizePercentage {{`}}`}} of reconciling operations failed for {{`{{`}} $labels.controller {{`}}`}} controller in {{`{{`}} $labels.namespace {{`}}`}} namespace.'
+        opsrecipe: "prometheus-operator/"
       expr: (sum by (cluster_id, installation, provider, pipeline, controller,namespace) (rate(prometheus_operator_reconcile_errors_total{app=~"prometheus-operator.*|kube-prometheus-.*"}[5m]))) / (sum by (cluster_id, installation, provider, pipeline, controller,namespace) (rate(prometheus_operator_reconcile_operations_total{app=~"prometheus-operator.*|kube-prometheus-.*"}[5m]))) > 0.1
       for: 10m
       labels:
@@ -77,6 +82,7 @@ spec:
     - alert: PrometheusOperatorNodeLookupErrors
       annotations:
         description: Errors while reconciling Prometheus in {{`{{`}} $labels.namespace {{`}}`}} Namespace.
+        opsrecipe: "prometheus-operator/"
       expr: rate(prometheus_operator_node_address_lookup_errors_total{app=~"prometheus-operator.*|kube-prometheus-.*"}[5m]) > 0.1
       for: 10m
       labels:
@@ -88,6 +94,7 @@ spec:
     - alert: PrometheusOperatorNotReady
       annotations:
         description: Prometheus operator in {{`{{`}} $labels.namespace {{`}}`}} namespace isn't ready to reconcile {{`{{`}} $labels.controller {{`}}`}} resources.
+        opsrecipe: "prometheus-operator/"
       expr: min by (cluster_id, installation, provider, pipeline, namespace, controller) (max_over_time(prometheus_operator_ready{app=~"prometheus-operator.*|kube-prometheus-.*"}[5m]) == 0)
       for: 5m
       labels:
@@ -99,6 +106,7 @@ spec:
     - alert: PrometheusOperatorRejectedResources
       annotations:
         description: Prometheus operator in {{`{{`}} $labels.namespace {{`}}`}} namespace rejected {{`{{`}} printf "%0.0f" $value {{`}}`}} {{`{{`}} $labels.controller {{`}}`}}/{{`{{`}} $labels.resource {{`}}`}} resources.
+        opsrecipe: "prometheus-operator/"
       expr: min_over_time(prometheus_operator_managed_resources{state="rejected",app=~"prometheus-operator.*|kube-prometheus-.*"}[5m]) > 0
       for: 5m
       labels:
diff --git a/helm/prometheus-rules/templates/alerting-rules/silence-operator.rules.yml b/helm/prometheus-rules/templates/alerting-rules/silence-operator.rules.yml
index 0f1b7c68e..60efbe21b 100644
--- a/helm/prometheus-rules/templates/alerting-rules/silence-operator.rules.yml
+++ b/helm/prometheus-rules/templates/alerting-rules/silence-operator.rules.yml
@@ -12,7 +12,7 @@ spec:
     - alert: "SilenceOperatorReconcileErrors"
       annotations:
         description: '{{`silence-operator controller {{ $labels.controller }} too many reconcile errors.`}}'
-        opsrecipe: "silence-operator-reconcile-errors/"
+        opsrecipe: "operator-not-reconciling/"
       expr: |
         avg_over_time(operatorkit_controller_errors_total{app="silence-operator", cluster_type="management_cluster"}[20m]) > 0
       for: 1h
diff --git a/helm/prometheus-rules/templates/alerting-rules/sloth.rules.yml b/helm/prometheus-rules/templates/alerting-rules/sloth.rules.yml
index 1aaab0de0..120326263 100644
--- a/helm/prometheus-rules/templates/alerting-rules/sloth.rules.yml
+++ b/helm/prometheus-rules/templates/alerting-rules/sloth.rules.yml
@@ -12,6 +12,7 @@ spec:
     - alert: SlothDown
       annotations:
         description: 'Sloth is down.'
+        opsrecipe: sloth-down/
       expr: count(up{app="sloth"} == 0) by (cluster_id, installation, provider, pipeline) > 0
       for: 5m
       labels:
@@ -21,7 +22,7 @@ spec:
         cancel_if_cluster_status_deleting: "true"
         cancel_if_cluster_status_updating: "true"
         cancel_if_scrape_timeout: "true"
-        cancel_if_outside_working_hours: {{ include "workingHoursOnly" . }}
+        cancel_if_outside_working_hours: "true"
         severity: page
         team: atlas
         topic: observability
diff --git a/test/tests/providers/global/silence-operator.rules.test.yml b/test/tests/providers/global/silence-operator.rules.test.yml
index 23d092d3b..799f41e38 100644
--- a/test/tests/providers/global/silence-operator.rules.test.yml
+++ b/test/tests/providers/global/silence-operator.rules.test.yml
@@ -25,6 +25,6 @@ tests:
               topic: "observability"
             exp_annotations:
               description: "silence-operator controller silence-controller too many reconcile errors."
-              opsrecipe: "silence-operator-reconcile-errors/"
+              opsrecipe: "operator-not-reconciling/"
       - alertname: SilenceOperatorReconcileErrors
         eval_time: 215m
diff --git a/test/tests/providers/global/sloth.rules.test.yml b/test/tests/providers/global/sloth.rules.test.yml
index 76a14bd1a..813eff327 100644
--- a/test/tests/providers/global/sloth.rules.test.yml
+++ b/test/tests/providers/global/sloth.rules.test.yml
@@ -30,6 +30,7 @@ tests:
               cancel_if_cluster_status_deleting: "true"
               cancel_if_cluster_status_updating: "true"
               cancel_if_scrape_timeout: "true"
-              cancel_if_outside_working_hours: "false"
+              cancel_if_outside_working_hours: "true"
             exp_annotations:
               description: "Sloth is down."
+              opsrecipe: sloth-down/