diff --git a/CHANGELOG.md b/CHANGELOG.md index baa667dc1..da710126d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +### Added + +- Add ops recipe for the `FluxSuspendedForTooLong` alert. + ## [3.11.1] - 2024-04-17 ### Added diff --git a/README.md b/README.md index d485a7718..30572e5ac 100644 --- a/README.md +++ b/README.md @@ -185,7 +185,7 @@ tests: ``` Let's breakdown the above example: -* For the first input series, the prometheus timesies returns an `empty query result` for 20 minutes (20*interval), then it is returning the value `1` for 20 minutes. Finally, it is returning the value `0` for 20 minutes. +* For the first input series, the prometheus timeseries returns an `empty query result` for 20 minutes (20*interval), then it is returning the value `1` for 20 minutes. Finally, it is returning the value `0` for 20 minutes. This is a good example of an input series for testing an `up` query. * The second series introduce a timeseries which first returns a `0` value and which adds `600` every minutes (=interval) for 40 minutes. After 40 minutes it has reached a value of `24000` (600x40) and goes on by adding `400` every minutes for 40 more minutes. This is a good example of an input series for testing a `range` query. 
diff --git a/helm/prometheus-rules/templates/alerting-rules/flux.rules.yml b/helm/prometheus-rules/templates/alerting-rules/flux.rules.yml index 8dda9022d..b431ea688 100644 --- a/helm/prometheus-rules/templates/alerting-rules/flux.rules.yml +++ b/helm/prometheus-rules/templates/alerting-rules/flux.rules.yml @@ -33,7 +33,7 @@ spec: expr: gotk_reconcile_condition{type="Ready", status="False", kind="HelmRelease", cluster_type="management_cluster", namespace="flux-giantswarm", exported_namespace=~".*giantswarm.*"} > 0 for: 10m labels: - area: kaas + area: empowerment cancel_if_outside_working_hours: "true" severity: page team: honeybadger @@ -48,7 +48,7 @@ spec: expr: gotk_reconcile_condition{type="Ready", status="False", kind="HelmRelease", cluster_type="workload_cluster", organization="giantswarm"} > 0 for: 2h labels: - area: kaas + area: empowerment severity: page cancel_if_outside_working_hours: "true" team: honeybadger @@ -63,7 +63,7 @@ spec: expr: gotk_reconcile_condition{type="Ready", status="False", kind="Kustomization", cluster_type="management_cluster", namespace="flux-giantswarm", exported_namespace=~".*giantswarm.*"} > 0 for: 20m labels: - area: kaas + area: empowerment cancel_if_outside_working_hours: "true" severity: page team: honeybadger @@ -76,7 +76,7 @@ spec: expr: gotk_reconcile_condition{type="Ready", status="False", kind="Kustomization", cluster_type="workload_cluster", organization="giantswarm"} > 0 for: 2h labels: - area: kaas + area: empowerment severity: page cancel_if_outside_working_hours: "true" team: honeybadger @@ -89,7 +89,7 @@ spec: expr: gotk_reconcile_condition{type="Ready", status="False", kind=~"GitRepository|HelmRepository|Bucket", cluster_type="management_cluster", namespace="flux-giantswarm", exported_namespace=~".*giantswarm.*"} > 0 for: 2h labels: - area: kaas + area: empowerment cancel_if_outside_working_hours: "true" severity: page team: honeybadger @@ -102,7 +102,7 @@ spec: expr: gotk_reconcile_condition{type="Ready", 
status="False", kind=~"GitRepository|HelmRepository|Bucket", cluster_type="workload_cluster", organization="giantswarm"} > 0 for: 2h labels: - area: kaas + area: empowerment severity: page cancel_if_outside_working_hours: "true" team: honeybadger @@ -125,7 +125,7 @@ spec: / (7*24*6) < 0.97 for: 10m labels: - area: kaas + area: empowerment cancel_if_outside_working_hours: "true" severity: page team: honeybadger @@ -134,10 +134,11 @@ spec: annotations: description: |- {{`Flux {{ $labels.kind }} {{ $labels.name }} in ns {{ $labels.exported_namespace }} on {{ $labels.installation }} has been suspended for 24h.`}} + opsrecipe: fluxcd-suspended-for-too-long/ expr: gotk_suspend_status{namespace="flux-giantswarm", exported_namespace="flux-giantswarm"} > 0 for: 24h labels: - area: kaas + area: empowerment cancel_if_outside_working_hours: "true" severity: page team: honeybadger @@ -167,7 +168,7 @@ spec: expr: gotk_reconcile_condition{type="Ready", status="False", kind="HelmRelease", cluster_type="management_cluster", exported_namespace!~".*giantswarm.*"} > 0 for: 10m labels: - area: kaas + area: empowerment cancel_if_outside_working_hours: {{ include "workingHoursOnly" . }} severity: notify team: honeybadger @@ -180,7 +181,7 @@ spec: expr: gotk_reconcile_condition{type="Ready", status="False", kind="HelmRelease", cluster_type="workload_cluster", organization!="giantswarm"} > 0 for: 2h labels: - area: kaas + area: empowerment severity: notify cancel_if_outside_working_hours: "true" team: honeybadger @@ -193,7 +194,7 @@ spec: expr: gotk_reconcile_condition{type="Ready", status="False", kind="Kustomization", cluster_type="management_cluster", exported_namespace!~".*giantswarm.*"} > 0 for: 10m labels: - area: kaas + area: empowerment cancel_if_outside_working_hours: {{ include "workingHoursOnly" . 
}} severity: notify team: honeybadger @@ -206,7 +207,7 @@ spec: expr: gotk_reconcile_condition{type="Ready", status="False", kind="Kustomization", cluster_type="workload_cluster", organization!="giantswarm"} > 0 for: 2h labels: - area: kaas + area: empowerment severity: notify cancel_if_outside_working_hours: "true" team: honeybadger @@ -219,7 +220,7 @@ spec: expr: gotk_reconcile_condition{type="Ready", status="False", kind=~"GitRepository|HelmRepository|Bucket", cluster_type="management_cluster", exported_namespace!~".*giantswarm.*"} > 0 for: 2h labels: - area: kaas + area: empowerment cancel_if_outside_working_hours: {{ include "workingHoursOnly" . }} severity: notify team: honeybadger @@ -232,7 +233,7 @@ spec: expr: gotk_reconcile_condition{type="Ready", status="False", kind=~"GitRepository|HelmRepository|Bucket", cluster_type="workload_cluster", organization!="giantswarm"} > 0 for: 2h labels: - area: kaas + area: empowerment severity: notify cancel_if_outside_working_hours: "true" team: honeybadger @@ -247,7 +248,7 @@ spec: sum(rate(controller_runtime_reconcile_time_seconds_count{app=~".*flux.*", namespace!~".*giantswarm.*"}[5m])) by (installation, cluster_id, controller)) > 60 for: 10m labels: - area: kaas + area: empowerment cancel_if_outside_working_hours: "true" severity: notify team: honeybadger @@ -261,7 +262,7 @@ spec: sum by (name, namespace) (workqueue_unfinished_work_seconds{namespace=~"flux-giantswarm|flux-system"}) > 3600.0 for: 10m labels: - area: kaas + area: empowerment cancel_if_outside_working_hours: "true" severity: page team: honeybadger diff --git a/test/conf/promtool_ignore b/test/conf/promtool_ignore index 39fd58127..cfc7305df 100644 --- a/test/conf/promtool_ignore +++ b/test/conf/promtool_ignore @@ -27,7 +27,6 @@ templates/alerting-rules/external-dns.rules.yml templates/alerting-rules/fairness.rules.yml templates/alerting-rules/falco.rules.yml templates/alerting-rules/fluentbit.rules.yml -templates/alerting-rules/flux.rules.yml 
templates/alerting-rules/helm.rules.yml templates/alerting-rules/ingress-controller.rules.yml templates/alerting-rules/inhibit.all.rules.yml diff --git a/test/tests/providers/global/flux.rules.test.yml b/test/tests/providers/global/flux.rules.test.yml new file mode 100644 index 000000000..62b3ccfe1 --- /dev/null +++ b/test/tests/providers/global/flux.rules.test.yml @@ -0,0 +1,28 @@ +--- +rule_files: + - flux.rules.yml + +tests: + - interval: 1m + input_series: + - series: 'gotk_suspend_status{installation="test", namespace="flux-giantswarm", exported_namespace="flux-giantswarm", kind="Kustomization", name="flux"}' + values: "1x60 0+1x60 1+0x1500" + alert_rule_test: + - alertname: FluxSuspendedForTooLong + eval_time: 1560m + exp_alerts: + - exp_labels: + alertname: "FluxSuspendedForTooLong" + area: "empowerment" + cancel_if_outside_working_hours: "true" + exported_namespace: "flux-giantswarm" + installation: "test" + kind: "Kustomization" + name: "flux" + namespace: "flux-giantswarm" + severity: "page" + team: "honeybadger" + topic: "releng" + exp_annotations: + description: "Flux Kustomization flux in ns flux-giantswarm on test has been suspended for 24h." + opsrecipe: "fluxcd-suspended-for-too-long/"