Add the CiliumNetworkPolicyFailed alert, its changelog entry and its unit test.

The alert pages when Cilium reports network-policy failures, using
cilium_policy_change_total{outcome=~"fail.*"} (cilium >=1.15) or
cilium_policy_import_errors_total (cilium <1.15).

NOTE(review): the "index" blob hashes below are carried over unchanged and were
not recomputed after the changelog/comment text fixes.
NOTE(review): the removed/added "alertname" pairs in the test file differ only
in whitespace; the removed side is shown with a doubled space after
"alertname:" as an assumption - confirm against the target file.

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 6a60e1598..2207a90bc 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -7,6 +7,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## [Unreleased]
 
+### Added
+
+- Add CiliumNetworkPolicyFailed alert.
+
 ## [3.10.1] - 2024-04-12
 
 ### Fixed
diff --git a/helm/prometheus-rules/templates/alerting-rules/cilium.rules.yml b/helm/prometheus-rules/templates/alerting-rules/cilium.rules.yml
index 7708c2656..0f3ec9f40 100644
--- a/helm/prometheus-rules/templates/alerting-rules/cilium.rules.yml
+++ b/helm/prometheus-rules/templates/alerting-rules/cilium.rules.yml
@@ -33,3 +33,18 @@ spec:
         severity: page
         team: cabbage
         topic: cilium
+    - alert: CiliumNetworkPolicyFailed
+      annotations:
+        description: '{{`Too many Cilium Network Policy errors.`}}'
+        opsrecipe: unsupported-cilium-network-policy/
+      # cilium_policy_change_total - for cilium >=1.15
+      # cilium_policy_import_errors_total - for cilium <1.15
+      expr: max(rate(cilium_policy_change_total{outcome=~"fail.*"}[20m]) OR rate(cilium_policy_import_errors_total[20m])) > 0
+      for: 10m
+      labels:
+        area: managedservices
+        cancel_if_outside_working_hours: "true"
+        severity: page
+        team: cabbage
+        topic: cilium
+
diff --git a/test/tests/providers/global/cilium.rules.test.yml b/test/tests/providers/global/cilium.rules.test.yml
index 7f14344c3..4f7a547d0 100644
--- a/test/tests/providers/global/cilium.rules.test.yml
+++ b/test/tests/providers/global/cilium.rules.test.yml
@@ -9,11 +9,11 @@ tests:
       - series: 'cilium_bpf_map_pressure{map_name="policy_00001"}'
         values: "_x20 20+0x20 90+0x20"
     alert_rule_test:
-      - alertname:  CiliumBPFMapAlmostFull
+      - alertname: CiliumBPFMapAlmostFull
         eval_time: 10m
-      - alertname:  CiliumBPFMapAlmostFull
+      - alertname: CiliumBPFMapAlmostFull
         eval_time: 30m
-      - alertname:  CiliumBPFMapAlmostFull
+      - alertname: CiliumBPFMapAlmostFull
         eval_time: 50m
         exp_alerts:
           - exp_labels:
@@ -31,11 +31,11 @@ tests:
       - series: 'cilium_bpf_map_pressure{map_name="policy_00001"}'
         values: "_x20 20+0x20 90+0x20 98+0x20"
     alert_rule_test:
-      - alertname:  CiliumBPFMapFull
+      - alertname: CiliumBPFMapFull
         eval_time: 10m
-      - alertname:  CiliumBPFMapFull
+      - alertname: CiliumBPFMapFull
         eval_time: 30m
-      - alertname:  CiliumBPFMapFull
+      - alertname: CiliumBPFMapFull
         eval_time: 70m
         exp_alerts:
           - exp_labels:
@@ -47,3 +47,55 @@ tests:
             exp_annotations:
               description: "Cilium BPF map is about filled up."
               opsrecipe: "cilium-bpf-map/"
+  # CiliumNetworkPolicyFailed: cilium_policy_change_total{outcome=~"fail.*"} (cilium >=1.15) and cilium_policy_import_errors_total (cilium <1.15)
+  - interval: 1m
+    input_series:
+      # Each error counter goes: absent, flat at 0, increasing by 100 per step, absent again
+      - series: 'cilium_policy_change_total{outcome="fail"}'
+        values: "_x20 0+0x20 0+100x30 _x1000"
+      - series: 'cilium_policy_change_total{outcome="success"}'
+        values: "_x120 1+10000x50 _x1000"
+      - series: 'cilium_policy_import_errors_total{}'
+        values: "_x220 0+0x20 0+100x30 _x1000"
+    alert_rule_test:
+      # cilium_policy_change_total{outcome="fail"}: fires only once the counter is increasing
+      - alertname: CiliumNetworkPolicyFailed
+        eval_time: 10m
+      - alertname: CiliumNetworkPolicyFailed
+        eval_time: 30m
+      - alertname: CiliumNetworkPolicyFailed
+        eval_time: 60m
+        exp_alerts:
+          - exp_labels:
+              area: managedservices
+              severity: page
+              team: cabbage
+              topic: cilium
+              cancel_if_outside_working_hours: "true"
+            exp_annotations:
+              description: "Too many Cilium Network Policy errors."
+              opsrecipe: "unsupported-cilium-network-policy/"
+      # cilium_policy_change_total{outcome="success"}: no alert expected
+      - alertname: CiliumNetworkPolicyFailed
+        eval_time: 110m
+      - alertname: CiliumNetworkPolicyFailed
+        eval_time: 130m
+      - alertname: CiliumNetworkPolicyFailed
+        eval_time: 160m
+      # cilium_policy_import_errors_total{}: fires only once the counter is increasing
+      - alertname: CiliumNetworkPolicyFailed
+        eval_time: 210m
+      - alertname: CiliumNetworkPolicyFailed
+        eval_time: 230m
+      - alertname: CiliumNetworkPolicyFailed
+        eval_time: 260m
+        exp_alerts:
+          - exp_labels:
+              area: managedservices
+              severity: page
+              team: cabbage
+              topic: cilium
+              cancel_if_outside_working_hours: "true"
+            exp_annotations:
+              description: "Too many Cilium Network Policy errors."
+              opsrecipe: "unsupported-cilium-network-policy/"