From c4b0db38847dd873656579bf85c1d302cd5d9ddc Mon Sep 17 00:00:00 2001 From: Marcus Noble Date: Mon, 22 Jan 2024 10:03:12 +0000 Subject: [PATCH] Use workingHoursOnly on more alerts (#1009) * Use workingHoursOnly on more alerts Signed-off-by: Marcus Noble * Removed old openstack unit tests Signed-off-by: Marcus Noble * No longer silence all CAPA and CAPZ alerts out of hours by default Signed-off-by: Marcus Noble * Updated unit tests Signed-off-by: Marcus Noble --------- Signed-off-by: Marcus Noble --- CHANGELOG.md | 4 +- helm/prometheus-rules/templates/_helpers.tpl | 2 +- .../aws-load-balancer-controller.rules.yml | 4 +- .../templates/alerting-rules/kong.rules.yml | 4 +- .../kube-state-metrics.rules.yml | 2 +- .../alerting-rules/kyverno.all.rules.yml | 6 +- .../linkerd.deployment.rules.yml | 2 +- .../alerting-rules/prometheus.rules.yml | 2 +- .../templates/alerting-rules/sloth.rules.yml | 2 +- test/conf/providers | 1 - .../capi/capz/capi-cluster.rules.test.yml | 6 +- .../capi-kubeadmcontrolplane.rules.test.yml | 4 +- .../capi/capz/capi-machine.rules.test.yml | 4 +- .../capi-machinedeployment.rules.test.yml | 4 +- .../capi/capz/capi-machinepool.rules.test.yml | 4 +- .../capi/capz/capi-machineset.rules.test.yml | 2 +- .../capz/dns-operator-azure.rules.test.yml | 4 +- .../capi/openstack/capi.rules.test.yml | 97 ------------------- .../capi/openstack/capo.rules.test.yml | 74 -------------- .../openstack/cert-manager.rules.test.yml | 48 --------- .../openstack/certificate.all.rules.test.yml | 94 ------------------ .../node-exporter.all.rules.test.yml | 67 ------------- 22 files changed, 29 insertions(+), 408 deletions(-) delete mode 100644 test/tests/providers/capi/openstack/capi.rules.test.yml delete mode 100644 test/tests/providers/capi/openstack/capo.rules.test.yml delete mode 100644 test/tests/providers/capi/openstack/cert-manager.rules.test.yml delete mode 100644 test/tests/providers/capi/openstack/certificate.all.rules.test.yml delete mode 100644 test/tests/providers/capi/openstack/node-exporter.all.rules.test.yml diff --git a/CHANGELOG.md b/CHANGELOG.md index a2ba069d1..f5394a072 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -14,6 +14,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Changed - Changed teleport alerts to take into accont only `Provisioned` clusters +- Made use of `workingHoursOnly` template on more alerts to ensure `stable-testing` MCs don't page out of hours +- No longer silence all CAPA and CAPZ alerts out of hours by default ## [2.148.0] - 2024-01-17 @@ -150,7 +152,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Changed -- fixed `aggregation:kyverno_policy_job_status_team` expression. +- fixed `aggregation:kyverno_policy_job_status_team` expression. ### Added diff --git a/helm/prometheus-rules/templates/_helpers.tpl b/helm/prometheus-rules/templates/_helpers.tpl index b735b6664..987d805e8 100644 --- a/helm/prometheus-rules/templates/_helpers.tpl +++ b/helm/prometheus-rules/templates/_helpers.tpl @@ -40,7 +40,7 @@ phoenix {{- end -}} {{- define "workingHoursOnly" -}} -{{- if has .Values.managementCluster.provider.kind (list "openstack" "capz" "capa") -}} +{{- if has .Values.managementCluster.provider.kind (list "openstack") -}} "true" {{- else if eq .Values.managementCluster.pipeline "stable-testing" -}} "true" diff --git a/helm/prometheus-rules/templates/alerting-rules/aws-load-balancer-controller.rules.yml b/helm/prometheus-rules/templates/alerting-rules/aws-load-balancer-controller.rules.yml index 8eef7b0e5..557748349 100644 --- a/helm/prometheus-rules/templates/alerting-rules/aws-load-balancer-controller.rules.yml +++ b/helm/prometheus-rules/templates/alerting-rules/aws-load-balancer-controller.rules.yml @@ -23,7 +23,7 @@ spec: cancel_if_cluster_status_creating: "true" cancel_if_cluster_status_deleting: "true" cancel_if_cluster_status_updating: "true" - cancel_if_outside_working_hours: "false" + cancel_if_outside_working_hours: {{ include "workingHoursOnly" . }} severity: page team: phoenix topic: alb @@ -38,7 +38,7 @@ spec: cancel_if_cluster_status_creating: "true" cancel_if_cluster_status_deleting: "true" cancel_if_cluster_status_updating: "true" - cancel_if_outside_working_hours: "false" + cancel_if_outside_working_hours: {{ include "workingHoursOnly" . }} severity: page team: phoenix topic: alb diff --git a/helm/prometheus-rules/templates/alerting-rules/kong.rules.yml b/helm/prometheus-rules/templates/alerting-rules/kong.rules.yml index 14a3927ad..706c0f293 100644 --- a/helm/prometheus-rules/templates/alerting-rules/kong.rules.yml +++ b/helm/prometheus-rules/templates/alerting-rules/kong.rules.yml @@ -22,7 +22,7 @@ spec: area: managedservices cancel_if_cluster_status_creating: "true" cancel_if_cluster_status_deleting: "true" - cancel_if_outside_working_hours: "false" + cancel_if_outside_working_hours: {{ include "workingHoursOnly" . }} severity: page team: cabbage topic: kong @@ -36,7 +36,7 @@ spec: area: managedservices cancel_if_cluster_status_creating: "true" cancel_if_cluster_status_deleting: "true" - cancel_if_outside_working_hours: "false" + cancel_if_outside_working_hours: {{ include "workingHoursOnly" . }} severity: page team: cabbage topic: kong diff --git a/helm/prometheus-rules/templates/alerting-rules/kube-state-metrics.rules.yml b/helm/prometheus-rules/templates/alerting-rules/kube-state-metrics.rules.yml index e635ae988..8b15812f2 100644 --- a/helm/prometheus-rules/templates/alerting-rules/kube-state-metrics.rules.yml +++ b/helm/prometheus-rules/templates/alerting-rules/kube-state-metrics.rules.yml @@ -34,7 +34,7 @@ spec: inhibit_kube_state_metrics_down: "true" cancel_if_prometheus_agent_down: "true" cancel_if_kubelet_down: "true" - cancel_if_outside_working_hours: "false" + cancel_if_outside_working_hours: {{ include "workingHoursOnly" . }} severity: page team: atlas topic: observability diff --git a/helm/prometheus-rules/templates/alerting-rules/kyverno.all.rules.yml b/helm/prometheus-rules/templates/alerting-rules/kyverno.all.rules.yml index 17b7ce063..9cfab2cbb 100644 --- a/helm/prometheus-rules/templates/alerting-rules/kyverno.all.rules.yml +++ b/helm/prometheus-rules/templates/alerting-rules/kyverno.all.rules.yml @@ -21,7 +21,7 @@ spec: cancel_if_cluster_status_creating: "true" cancel_if_cluster_status_deleting: "true" cancel_if_cluster_status_updating: "true" - cancel_if_outside_working_hours: "false" + cancel_if_outside_working_hours: {{ include "workingHoursOnly" . }} severity: page team: shield topic: kyverno @@ -38,7 +38,7 @@ spec: cancel_if_cluster_status_creating: "true" cancel_if_cluster_status_deleting: "true" cancel_if_cluster_status_updating: "true" - cancel_if_outside_working_hours: "false" + cancel_if_outside_working_hours: {{ include "workingHoursOnly" . }} severity: page team: shield topic: kyverno @@ -53,7 +53,7 @@ spec: cancel_if_cluster_status_creating: "true" cancel_if_cluster_status_deleting: "true" cancel_if_cluster_status_updating: "true" - cancel_if_outside_working_hours: "false" + cancel_if_outside_working_hours: {{ include "workingHoursOnly" . }} severity: notify team: shield topic: kyverno diff --git a/helm/prometheus-rules/templates/alerting-rules/linkerd.deployment.rules.yml b/helm/prometheus-rules/templates/alerting-rules/linkerd.deployment.rules.yml index 833448c9a..4d4c0db67 100644 --- a/helm/prometheus-rules/templates/alerting-rules/linkerd.deployment.rules.yml +++ b/helm/prometheus-rules/templates/alerting-rules/linkerd.deployment.rules.yml @@ -18,7 +18,7 @@ spec: for: 30m labels: area: managedservices - cancel_if_outside_working_hours: "false" + cancel_if_outside_working_hours: {{ include "workingHoursOnly" . }} severity: page team: cabbage topic: linkerd diff --git a/helm/prometheus-rules/templates/alerting-rules/prometheus.rules.yml b/helm/prometheus-rules/templates/alerting-rules/prometheus.rules.yml index 0fb422a1d..bb3592245 100644 --- a/helm/prometheus-rules/templates/alerting-rules/prometheus.rules.yml +++ b/helm/prometheus-rules/templates/alerting-rules/prometheus.rules.yml @@ -23,7 +23,7 @@ spec: cancel_if_cluster_status_updating: "true" cancel_if_cluster_status_deleting: "true" cancel_if_cluster_has_no_workers: "true" - cancel_if_outside_working_hours: "false" + cancel_if_outside_working_hours: {{ include "workingHoursOnly" . }} severity: page team: atlas topic: observability diff --git a/helm/prometheus-rules/templates/alerting-rules/sloth.rules.yml b/helm/prometheus-rules/templates/alerting-rules/sloth.rules.yml index 6e4b1967c..0daee50a3 100644 --- a/helm/prometheus-rules/templates/alerting-rules/sloth.rules.yml +++ b/helm/prometheus-rules/templates/alerting-rules/sloth.rules.yml @@ -21,7 +21,7 @@ spec: cancel_if_cluster_status_deleting: "true" cancel_if_cluster_status_updating: "true" cancel_if_scrape_timeout: "true" - cancel_if_outside_working_hours: "false" + cancel_if_outside_working_hours: {{ include "workingHoursOnly" . }} severity: page team: atlas topic: observability diff --git a/test/conf/providers b/test/conf/providers index 48f4f81a3..cc1b11d09 100644 --- a/test/conf/providers +++ b/test/conf/providers @@ -1,3 +1,2 @@ vintage/aws -capi/openstack capi/capz diff --git a/test/tests/providers/capi/capz/capi-cluster.rules.test.yml b/test/tests/providers/capi/capz/capi-cluster.rules.test.yml index ff2c8c1a6..656d813cb 100644 --- a/test/tests/providers/capi/capz/capi-cluster.rules.test.yml +++ b/test/tests/providers/capi/capz/capi-cluster.rules.test.yml @@ -20,7 +20,7 @@ tests: exp_alerts: - exp_labels: area: kaas - cancel_if_outside_working_hours: "true" + cancel_if_outside_working_hours: "false" severity: page team: phoenix topic: managementcluster @@ -35,7 +35,7 @@ tests: exp_alerts: - exp_labels: area: kaas - cancel_if_outside_working_hours: "true" + cancel_if_outside_working_hours: "false" severity: notify team: phoenix topic: managementcluster @@ -51,7 +51,7 @@ tests: exp_alerts: - exp_labels: area: kaas - cancel_if_outside_working_hours: "true" + cancel_if_outside_working_hours: "false" severity: notify team: phoenix topic: managementcluster diff --git a/test/tests/providers/capi/capz/capi-kubeadmcontrolplane.rules.test.yml b/test/tests/providers/capi/capz/capi-kubeadmcontrolplane.rules.test.yml index fd03e4afb..ab21dcac8 100644 --- a/test/tests/providers/capi/capz/capi-kubeadmcontrolplane.rules.test.yml +++ b/test/tests/providers/capi/capz/capi-kubeadmcontrolplane.rules.test.yml @@ -20,7 +20,7 @@ tests: exp_alerts: - exp_labels: area: kaas - cancel_if_outside_working_hours: "true" + cancel_if_outside_working_hours: "false" severity: notify team: phoenix topic: managementcluster @@ -35,7 +35,7 @@ tests: exp_alerts: - exp_labels: area: kaas - cancel_if_outside_working_hours: "true" + cancel_if_outside_working_hours: "false" severity: notify team: phoenix topic: managementcluster diff --git a/test/tests/providers/capi/capz/capi-machine.rules.test.yml b/test/tests/providers/capi/capz/capi-machine.rules.test.yml index 25d2694bb..44b1785bb 100644 --- a/test/tests/providers/capi/capz/capi-machine.rules.test.yml +++ b/test/tests/providers/capi/capz/capi-machine.rules.test.yml @@ -16,7 +16,7 @@ tests: exp_alerts: - exp_labels: area: kaas - cancel_if_outside_working_hours: "true" + cancel_if_outside_working_hours: "false" severity: notify team: phoenix topic: managementcluster @@ -32,7 +32,7 @@ tests: exp_alerts: - exp_labels: area: kaas - cancel_if_outside_working_hours: "true" + cancel_if_outside_working_hours: "false" severity: notify team: phoenix topic: managementcluster diff --git a/test/tests/providers/capi/capz/capi-machinedeployment.rules.test.yml b/test/tests/providers/capi/capz/capi-machinedeployment.rules.test.yml index c2df07936..0f008514d 100644 --- a/test/tests/providers/capi/capz/capi-machinedeployment.rules.test.yml +++ b/test/tests/providers/capi/capz/capi-machinedeployment.rules.test.yml @@ -20,7 +20,7 @@ tests: exp_alerts: - exp_labels: area: kaas - cancel_if_outside_working_hours: "true" + cancel_if_outside_working_hours: "false" severity: notify team: phoenix topic: managementcluster @@ -35,7 +35,7 @@ tests: exp_alerts: - exp_labels: area: kaas - cancel_if_outside_working_hours: "true" + cancel_if_outside_working_hours: "false" severity: notify team: phoenix topic: managementcluster diff --git a/test/tests/providers/capi/capz/capi-machinepool.rules.test.yml b/test/tests/providers/capi/capz/capi-machinepool.rules.test.yml index 5d3fba71c..bc236e2de 100644 --- a/test/tests/providers/capi/capz/capi-machinepool.rules.test.yml +++ b/test/tests/providers/capi/capz/capi-machinepool.rules.test.yml @@ -20,7 +20,7 @@ tests: exp_alerts: - exp_labels: area: kaas - cancel_if_outside_working_hours: "true" + cancel_if_outside_working_hours: "false" severity: notify team: phoenix topic: managementcluster @@ -35,7 +35,7 @@ tests: exp_alerts: - exp_labels: area: kaas - cancel_if_outside_working_hours: "true" + cancel_if_outside_working_hours: "false" severity: notify team: phoenix topic: managementcluster diff --git a/test/tests/providers/capi/capz/capi-machineset.rules.test.yml b/test/tests/providers/capi/capz/capi-machineset.rules.test.yml index 8d6119275..a2d5f9d45 100644 --- a/test/tests/providers/capi/capz/capi-machineset.rules.test.yml +++ b/test/tests/providers/capi/capz/capi-machineset.rules.test.yml @@ -12,7 +12,7 @@ tests: exp_alerts: - exp_labels: area: kaas - cancel_if_outside_working_hours: "true" + cancel_if_outside_working_hours: "false" severity: notify team: phoenix topic: managementcluster diff --git a/test/tests/providers/capi/capz/dns-operator-azure.rules.test.yml b/test/tests/providers/capi/capz/dns-operator-azure.rules.test.yml index d31e3b317..10673efc6 100644 --- a/test/tests/providers/capi/capz/dns-operator-azure.rules.test.yml +++ b/test/tests/providers/capi/capz/dns-operator-azure.rules.test.yml @@ -20,7 +20,7 @@ tests: exp_alerts: - exp_labels: area: kaas - cancel_if_outside_working_hours: "true" + cancel_if_outside_working_hours: "false" severity: notify team: phoenix topic: managementcluster @@ -36,7 +36,7 @@ tests: exp_alerts: - exp_labels: area: kaas - cancel_if_outside_working_hours: "true" + cancel_if_outside_working_hours: "false" severity: notify team: phoenix topic: managementcluster diff --git a/test/tests/providers/capi/openstack/capi.rules.test.yml b/test/tests/providers/capi/openstack/capi.rules.test.yml deleted file mode 100644 index b54ff6702..000000000 --- a/test/tests/providers/capi/openstack/capi.rules.test.yml +++ /dev/null @@ -1,97 +0,0 @@ -rule_files: - - capi.rules.yml - -tests: - - interval: 1m - input_series: - - series: 'capi_machine_status_phase{cluster_name="galaxy", name="galaxy-72jq5", exported_namespace="giantswarm", phase="Running"}' - values: "1+0x10 0+0x35" - - series: 'capi_machine_status_phase{cluster_name="galaxy", name="galaxy-72jq5", exported_namespace="giantswarm", phase="Failed"}' - values: "0+0x10 1+0x35" - alert_rule_test: - - alertname: MachineUnhealthyPhase - eval_time: 45m - exp_alerts: - - exp_labels: - area: kaas - cancel_if_outside_working_hours: "true" - severity: notify - team: rocket - topic: managementcluster - cluster_name: galaxy - name: galaxy-72jq5 - exported_namespace: giantswarm - phase: Failed - exp_annotations: - description: "Machine giantswarm/galaxy-72jq5 stuck in phase Failed for more than 30 minutes." - - interval: 1m - input_series: - - series: 'capi_machinedeployment_spec_replicas{cluster_name="galaxy", name="galaxy-72jq5", exported_namespace="giantswarm"}' - values: "0+3x75" - - series: 'capi_machinedeployment_status_replicas_available{cluster_name="galaxy", name="galaxy-72jq5", exported_namespace="giantswarm"}' - values: "0+3x75" - - series: 'capi_machinedeployment_spec_replicas{cluster_name="galaxy", name="galaxy-72jzy", exported_namespace="giantswarm"}' - values: "0+3x75" - - series: 'capi_machinedeployment_status_replicas_available{cluster_name="galaxy", name="galaxy-72jzy", exported_namespace="giantswarm"}' - values: "0+2x75" - alert_rule_test: - - alertname: MachineDeploymentReplicasMismatch - eval_time: 75m - exp_alerts: - - exp_labels: - area: kaas - cancel_if_outside_working_hours: "true" - severity: notify - team: rocket - topic: managementcluster - cluster_name: galaxy - name: galaxy-72jzy - exported_namespace: giantswarm - exp_annotations: - description: "The clusters galaxy machinedeployment giantswarm/galaxy-72jzy does not match the expected number of replicas for longer than 1h." - - interval: 1m - input_series: - - series: 'capi_kubeadmcontrolplane_spec_replicas{cluster_name="galaxy", name="galaxy-72jq5", exported_namespace="giantswarm"}' - values: "0+3x100" - - series: 'capi_kubeadmcontrolplane_status_replicas_ready{cluster_name="galaxy", name="galaxy-72jq5", exported_namespace="giantswarm"}' - values: "0+3x100" - - series: 'capi_kubeadmcontrolplane_spec_replicas{cluster_name="galaxy", name="galaxy-72jzy", exported_namespace="giantswarm"}' - values: "0+3x100" - - series: 'capi_kubeadmcontrolplane_status_replicas_ready{cluster_name="galaxy", name="galaxy-72jzy", exported_namespace="giantswarm"}' - values: "0+2x100" - alert_rule_test: - - alertname: KubeadmControlPlaneReplicasMismatch - eval_time: 100m - exp_alerts: - - exp_labels: - area: kaas - cancel_if_outside_working_hours: "true" - severity: notify - team: rocket - topic: managementcluster - cluster_name: galaxy - name: galaxy-72jzy - exported_namespace: giantswarm - exp_annotations: - description: "The clusters galaxy kubeadmcontrolplane giantswarm/galaxy-72jzy does not match the expected number of replicas for longer than 90 minutes." - - interval: 1m - input_series: - - series: 'capi_cluster_status_phase{name="galaxy", exported_namespace="giantswarm", phase="Provisioned"}' - values: "1+0x75" - - series: 'capi_cluster_status_phase{name="galaxy", exported_namespace="giantswarm", phase="Pending"}' - values: "1+0x75" - alert_rule_test: - - alertname: ClusterUnhealthyPhase - eval_time: 75m - exp_alerts: - - exp_labels: - area: kaas - cancel_if_outside_working_hours: "true" - severity: notify - team: rocket - topic: managementcluster - name: galaxy - exported_namespace: giantswarm - phase: Pending - exp_annotations: - description: "Cluster giantswarm/galaxy is in a non healthy phase." diff --git a/test/tests/providers/capi/openstack/capo.rules.test.yml b/test/tests/providers/capi/openstack/capo.rules.test.yml deleted file mode 100644 index 916266f8c..000000000 --- a/test/tests/providers/capi/openstack/capo.rules.test.yml +++ /dev/null @@ -1,74 +0,0 @@ -rule_files: - - capo.rules.yml - -tests: - - interval: 1m - input_series: - - series: 'capi_openstackmachine_status_instance_state{cluster="galaxy", name="galaxy-72jq5", exported_namespace="giantswarm", state="ACTIVE"}' - values: "1+0x10 0+0x35" - - series: 'capi_openstackmachine_status_instance_state{cluster="galaxy", name="galaxy-72jq5", exported_namespace="giantswarm", state="ERROR"}' - values: "0+0x10 1+0x35" - - series: 'capi_openstackmachine_status_instance_state{cluster="galaxy", name="galaxy-9xjq5", exported_namespace="giantswarm", state="ERROR"}' - values: "1+0x60" - alert_rule_test: - - alertname: OpenStackMachineUnexpectedInstanceState - eval_time: 1h - exp_alerts: - - exp_labels: - area: kaas - cancel_if_outside_working_hours: "true" - severity: notify - team: rocket - topic: managementcluster - cluster: galaxy - name: galaxy-9xjq5 - exported_namespace: giantswarm - state: ERROR - exp_annotations: - description: "OpenStackMachine giantswarm/galaxy-9xjq5 instance state is ERROR." - - interval: 1m - input_series: - - series: 'capi_openstackmachine_status_failure{cluster="galaxy", name="galaxy-72jq5", exported_namespace="giantswarm"}' - values: "0+0x45" - - series: 'capi_openstackmachine_status_failure{cluster="galaxy", name="galaxy-98jq5", exported_namespace="giantswarm", reason="transient error reason"}' - values: "1+0x45" - alert_rule_test: - - alertname: OpenStackMachineFailure - eval_time: 45m - exp_alerts: - - exp_labels: - area: kaas - cancel_if_outside_working_hours: "true" - severity: notify - team: rocket - topic: managementcluster - cluster: galaxy - name: galaxy-98jq5 - exported_namespace: giantswarm - reason: transient error reason - exp_annotations: - description: "OpenStackMachine giantswarm/galaxy-98jq5 failure reason is transient error reason." - opsrecipe: "remove-errors-from-capi-capo-crs/" - - interval: 1m - input_series: - - series: 'capi_openstackcluster_status_failure{cluster="galaxy", name="galaxy-72jq5", exported_namespace="giantswarm"}' - values: "0+0x45" - - series: 'capi_openstackcluster_status_failure{cluster="galaxy", name="galaxy-9xjq5", exported_namespace="giantswarm", reason="LB stuck in PENDING_UPDATE"}' - values: "1+0x45" - alert_rule_test: - - alertname: OpenStackClusterFailure - eval_time: 45m - exp_alerts: - - exp_labels: - area: kaas - cancel_if_outside_working_hours: "true" - severity: notify - team: rocket - topic: managementcluster - cluster: galaxy - name: galaxy-9xjq5 - exported_namespace: giantswarm - reason: LB stuck in PENDING_UPDATE - exp_annotations: - description: "OpenStackCluster giantswarm/galaxy-9xjq5 failure reason is LB stuck in PENDING_UPDATE." - opsrecipe: "remove-errors-from-capi-capo-crs/" diff --git a/test/tests/providers/capi/openstack/cert-manager.rules.test.yml b/test/tests/providers/capi/openstack/cert-manager.rules.test.yml deleted file mode 100644 index 58b88a18a..000000000 --- a/test/tests/providers/capi/openstack/cert-manager.rules.test.yml +++ /dev/null @@ -1,48 +0,0 @@ ---- -rule_files: - - cert-manager.rules.yml - -tests: - - interval: 1m - input_series: - - series: 'up{app="cert-manager-app", cluster_id="12345", cluster_type="workload_cluster", container="cert-manager", customer="giantswarm", installation="gollem", instance="10.0.0.0:1234", job="12345-prometheus/workload-12345/0", namespace="kube-system", node="ip-10-0-0-0.eu-central-1.compute.internal", organization="giantswarm", pod="cert-manager-controller-7fcc585578-gnprd", provider="openstack", service_priority="highest"}' - values: "0+0x60" - alert_rule_test: - - alertname: CertManagerDown - eval_time: 15m - exp_alerts: - - exp_labels: - alertname: CertManagerDown - app: cert-manager-app - area: kaas - cancel_if_cluster_status_creating: "true" - cancel_if_cluster_status_deleting: "true" - cancel_if_kubelet_down: "true" - cancel_if_outside_working_hours: "true" - cluster_id: 12345 - cluster_type: workload_cluster - container: cert-manager - customer: giantswarm - instance: 10.0.0.0:1234 - ip: 10.0.0.0 - job: 12345-prometheus/workload-12345/0 - namespace: kube-system - node: ip-10-0-0-0.eu-central-1.compute.internal - organization: giantswarm - pod: cert-manager-controller-7fcc585578-gnprd - provider: openstack - installation: gollem - service_priority: highest - severity: page - team: bigmac - topic: cert-manager - exp_annotations: - description: "cert-manager in namespace kube-system is down." - opsrecipe: "cert-manager-down/" - - interval: 1m - input_series: - - series: 'up{app="cert-manager-app", cluster_id="12345", cluster_type="workload_cluster", container="cert-manager", customer="giantswarm", installation="gollem", instance="10.0.0.0:1234", job="12345-prometheus/workload-12345/0", namespace="kube-system", node="ip-10-0-0-0.eu-central-1.compute.internal", organization="giantswarm", pod="cert-manager-controller-7fcc585578-gnprd", provider="openstack", service_priority="highest"}' - values: "1+0x60" - alert_rule_test: - - alertname: CertManagerDown - eval_time: 15m diff --git a/test/tests/providers/capi/openstack/certificate.all.rules.test.yml b/test/tests/providers/capi/openstack/certificate.all.rules.test.yml deleted file mode 100644 index c316e427d..000000000 --- a/test/tests/providers/capi/openstack/certificate.all.rules.test.yml +++ /dev/null @@ -1,94 +0,0 @@ ---- -rule_files: - - certificate.all.rules.yml - -tests: - # CertificateSecretWillExpireInLessThanTwoWeeks within 2 weeks of expiration - - interval: 1d - input_series: - - series: 'cert_exporter_secret_not_after{app="cert-exporter-deployment", cluster_id="gollem", cluster_type="management_cluster", container="cert-exporter", customer="giantswarm", installation="gollem", instance="10.0.0.0:1234", job="gollem-prometheus/workload-gollem/0", name="athena-certs-secret", namespace="giantswarm", exported_namespace="giantswarm", node="10.0.0.0", organization="giantswarm", pod="cert-exporter-deployment-5c47b4c55c-49wt9", provider="kvm", secretkey="tls.crt", service_priority="highest"}' - values: "2678400x60" - alert_rule_test: - - alertname: CertificateSecretWillExpireInLessThanTwoWeeks - eval_time: 20d - exp_alerts: - - exp_labels: - alertname: CertificateSecretWillExpireInLessThanTwoWeeks - app: cert-exporter-deployment - area: kaas - cancel_if_outside_working_hours: "true" - cluster_id: gollem - cluster_type: management_cluster - container: cert-exporter - customer: giantswarm - exported_namespace: giantswarm - instance: 10.0.0.0:1234 - job: gollem-prometheus/workload-gollem/0 - namespace: giantswarm - node: 10.0.0.0 - organization: giantswarm - pod: cert-exporter-deployment-5c47b4c55c-49wt9 - provider: kvm - name: athena-certs-secret - installation: gollem - service_priority: highest - severity: page - secretkey: tls.crt - team: rocket - topic: cert-manager - exp_annotations: - description: "Certificate stored in Secret giantswarm/athena-certs-secret on gollem will expire in less than two weeks." - opsrecipe: "managed-app-cert-manager/certificate-secret-will-expire-in-less-than-two-weeks/" - # CertificateSecretWillExpireInLessThanTwoWeeks not within 2 weeks of expiration - - interval: 1d - input_series: - - series: 'cert_exporter_secret_not_after{app="cert-exporter-deployment", cluster_id="gollem", cluster_type="management_cluster", container="cert-exporter", customer="giantswarm", installation="gollem", instance="10.0.0.0:1234", job="gollem-prometheus/workload-gollem/0", name="athena-certs-secret", namespace="giantswarm", exported_namespace="giantswarm", node="10.0.0.0", organization="giantswarm", pod="cert-exporter-deployment-5c47b4c55c-49wt9", provider="kvm", secretkey="tls.crt", service_priority="highest"}' - values: "2678400x60" - alert_rule_test: - - alertname: CertificateSecretWillExpireInLessThanTwoWeeks - eval_time: 10d - # GiantswarmManagedCertificateCRWillExpireInLessThanTwoWeeks within 2 weeks of expiration - - interval: 1d - input_series: - - series: 'cert_exporter_certificate_cr_not_after{app="cert-exporter-deployment", cluster_id="12345", cluster_type="workload_cluster", container="cert-exporter", customer="giantswarm", installation="gollem", instance="10.0.0.0:1234", issuer_ref="kiam-ca-issuer", job="12345-prometheus/workload-12345/0", managed_issuer="true", name="kiam-agent", namespace="kube-system", exported_namespace="giantswarm", node="10.0.0.0", organization="giantswarm", pod="cert-exporter-deployment-57bbbfd856-8r8dr", provider="kvm", service_priority="highest"}' - values: "2678400x60" - alert_rule_test: - - alertname: GiantswarmManagedCertificateCRWillExpireInLessThanTwoWeeks - eval_time: 20d - exp_alerts: - - exp_labels: - alertname: GiantswarmManagedCertificateCRWillExpireInLessThanTwoWeeks - app: cert-exporter-deployment - area: kaas - cancel_if_outside_working_hours: "true" - cluster_id: 12345 - cluster_type: workload_cluster - container: cert-exporter - customer: giantswarm - exported_namespace: giantswarm - instance: 10.0.0.0:1234 - job: 12345-prometheus/workload-12345/0 - namespace: kube-system - node: 10.0.0.0 - organization: giantswarm - pod: cert-exporter-deployment-57bbbfd856-8r8dr - provider: kvm - name: kiam-agent - installation: gollem - service_priority: highest - severity: page - team: rocket - topic: cert-manager - issuer_ref: kiam-ca-issuer - managed_issuer: "true" - exp_annotations: - description: "Certificate CR kube-system/kiam-agent on 12345 will expire in less than two weeks." - opsrecipe: "managed-app-cert-manager/certificate-secret-will-expire-in-less-than-two-weeks/" - # GiantswarmManagedCertificateCRWillExpireInLessThanTwoWeeks not within 2 weeks of expiration - - interval: 1d - input_series: - - series: 'cert_exporter_certificate_cr_not_after{app="cert-exporter-deployment", cluster_id="12345", cluster_type="workload_cluster", container="cert-exporter", customer="giantswarm", installation="gollem", instance="10.0.0.0:1234", issuer_ref="kiam-ca-issuer", job="12345-prometheus/workload-12345/0", managed_issuer="true", name="kiam-agent", namespace="kube-system", exported_namespace="giantswarm", node="10.0.0.0", organization="giantswarm", pod="cert-exporter-deployment-57bbbfd856-8r8dr", provider="kvm", service_priority="highest"}' - values: "2678400x60" - alert_rule_test: - - alertname: GiantswarmManagedCertificateCRWillExpireInLessThanTwoWeeks - eval_time: 10d diff --git a/test/tests/providers/capi/openstack/node-exporter.all.rules.test.yml b/test/tests/providers/capi/openstack/node-exporter.all.rules.test.yml deleted file mode 100644 index 7c4c8da71..000000000 --- a/test/tests/providers/capi/openstack/node-exporter.all.rules.test.yml +++ /dev/null @@ -1,67 +0,0 @@ ---- -rule_files: - - node-exporter.all.rules.yml - -tests: - # NodeExporterCollectorFailed tests - - interval: 1m - input_series: - # No data for 20 minutes, then all good, then cpu collector fails, then bonding collector fails - - series: 'node_scrape_collector_success{app="node-exporter", collector="cpu", instance="10.0.5.111:10300"}' - values: "_x20 1+0x20 0+0x20 1+0x20" - - series: 'node_scrape_collector_success{app="node-exporter", collector="bonding", instance="10.0.5.111:10300"}' - values: "_x20 1+0x20 1+0x20 0+0x20" - alert_rule_test: - - alertname: NodeExporterCollectorFailed - eval_time: 10m - - alertname: NodeExporterCollectorFailed - eval_time: 30m - - alertname: NodeExporterCollectorFailed - eval_time: 50m - exp_alerts: - - exp_labels: - alertname: NodeExporterCollectorFailed - app: "node-exporter" - area: "kaas" - cancel_if_outside_working_hours: "true" - collector: "cpu" - instance: "10.0.5.111:10300" - severity: "page" - team: "rocket" - topic: "observability" - exp_annotations: - description: "NodeExporter Collector cpu on 10.0.5.111:10300 is failed." - opsrecipe: "node-exporter-device-error/" - - alertname: NodeExporterCollectorFailed - eval_time: 70m - - # NodeExporterDeviceError tests - - interval: 1m - input_series: - - series: 'node_filesystem_device_error{device="/dev/mapper/usr", fstype="ext4", instance="10.0.5.111:10300", mountpoint="/var/lib/kubelet", cluster_type="workload_cluster"}' - values: "_x20 1+0x20 0+0x20" - alert_rule_test: - - alertname: NodeExporterDeviceError - eval_time: 10m - exp_alerts: - - alertname: NodeExporterDeviceError - eval_time: 20m - exp_alerts: - - alertname: NodeExporterDeviceError - eval_time: 30m - exp_alerts: - - exp_labels: - alertname: NodeExporterDeviceError - area: "kaas" - cancel_if_outside_working_hours: "true" - cluster_type: "workload_cluster" - device: "/dev/mapper/usr" - fstype: "ext4" - instance: "10.0.5.111:10300" - mountpoint: "/var/lib/kubelet" - severity: "page" - team: "rocket" - topic: "observability" - exp_annotations: - description: "NodeExporter Mountpoint /var/lib/kubelet on device /dev/mapper/usr on 10.0.5.111:10300 is erroring." - opsrecipe: "node-exporter-device-error/"