From de44204245bb1e0790ef24400a8b116433b68c39 Mon Sep 17 00:00:00 2001
From: Quentin Bisson
Date: Tue, 12 Nov 2024 13:22:28 +0100
Subject: [PATCH] replace mimir.enabled property with mc provider flavor as
 mimir is everywhere in capi (#1424)

* replace mimir.enabled property with mc provider flavor as mimir is everywhere in capi

* Update test/conf/providers
---
 CHANGELOG.md                                  |   4 +
 README.md                                     |   6 +-
 .../templates/alloy-rules-configmap.yaml      |   2 +-
 .../templates/alloy-rules.yaml                |   2 +-
 .../aws-load-balancer-controller.rules.yml    |   4 +-
 .../aws.node.workload-cluster.rules.yml       |   4 +-
 .../aws.workload-cluster.rules.yml            |   2 +-
 .../capa.management-cluster.rules.yml         |   4 +-
 .../phoenix/alerting-rules/irsa.rules.yml     |   4 +-
 .../apiserver.management-cluster.rules.yml    |   4 +-
 .../apiserver.workload-cluster.rules.yml      |   4 +-
 .../capi.management-cluster.rules.yml         |   4 +-
 .../certificate.management-cluster.rules.yml  |   4 +-
 .../certificate.workload-cluster.rules.yml    |   4 +-
 .../cluster-autoscaler.rules.yml              |   4 +-
 .../etcd.management-cluster.rules.yml         |   4 +-
 .../etcd.workload-cluster.rules.yml           |   4 +-
 .../alerting-rules/etcdbackup.rules.yml       |   4 +-
 .../alerting-rules/inhibit.nodes.rules.yml    |   4 +-
 .../management-cluster.rules.yml              |   4 +-
 .../node.management-cluster.rules.yml         |   4 +-
 .../node.workload-cluster.rules.yml           |   4 +-
 .../alerting-rules/pods.core.rules.yml        |   2 +-
 .../storage.management-cluster.rules.yml      |   4 +-
 .../storage.workload-cluster.rules.yml        |   4 +-
 .../loki-ruler-datasource-configmap.yaml      |   2 +-
 .../deployment.management-cluster.rules.yml   |   4 +-
 .../deployment.workload-cluster.rules.yml     |   4 +-
 .../alerting-rules/grafana-cloud.rules.yml    |   6 +-
 .../atlas/alerting-rules/grafana.rules.yml    |   2 +-
 .../kube-state-metrics.rules.yml              |  22 +-
 .../atlas/alerting-rules/mimir.rules.yml      |   2 +-
 .../alerting-rules/prometheus-agent.rules.yml |   2 +-
 .../prometheus-meta-operator.rules.yml        |   2 +-
 .../alerting-rules/statefulset.rules.yml      |   4 +-
 .../atlas/alerting-rules/storage.rules.yml    |   4 +-
 .../recording-rules/grafana-cloud.rules.yml   |   4 +-
 .../recording-rules/mimir-mixins.rules.yml    |   2 +-
 ...oring.resource-usage-estimation.rules.yaml |   2 +-
 .../alerting-rules/external-dns.rules.yml     |   4 +-
 .../honeybadger/alerting-rules/app.rules.yml  |   4 +-
 .../shield/alerting-rules/dex.rules.yml       |   6 +-
 .../shield/alerting-rules/falco.rules.yml     |   4 +-
 helm/prometheus-rules/values.schema.json      |   8 -
 helm/prometheus-rules/values.yaml             |   3 -
 mimir/update.sh                               |   2 +-
 test/conf/providers                           |   1 -
 test/hack/bin/run-pint.sh                     |   6 +-
 test/hack/bin/template-chart.sh               |   2 -
 .../prometheus-agent.rules.test.yml           | 216 ----------
 .../alerting-rules/zot.rules.test.yml         |  54 ---
 .../capa.inhibition.rules.test.yml            |   0
 .../capi-cluster.rules.test.yml               |  69 +++
 .../capi-kubeadmcontrolplane.rules.test.yml   |  52 +++
 .../capi-machine.rules.test.yml               |  49 +++
 .../capi-machinedeployment.rules.test.yml     |  47 +++
 .../capi-machinepool.rules.test.yml           |  47 +++
 .../capi-machineset.rules.test.yml            |  27 ++
 .../alerting-rules/capi.rules.test.yml        |  91 ++++
 .../certificate.all.rules.test.yml            |  94 +++++
 .../node-exporter.rules.test.yml              |  38 ++
 .../alerting-rules/grafana-cloud.test.yml     |   0
 .../atlas/alerting-rules/mimir.rules.test.yml |   0
 .../prometheus-agent.rules.test.yml           |  92 ----
 .../cert-manager.rules.test.yml               |  46 ++
 .../alerting-rules/teleport.rules.test.yml    |  59 +++
 .../alerting-rules/grafana-cloud.test.yml     | 156 +++++++
 .../atlas/alerting-rules/mimir.rules.test.yml | 392 ++++++++++++++++++
 .../prometheus-agent.rules.test.yml           |  92 ----
 69 files changed, 1261 insertions(+), 556 deletions(-)
 delete mode 100644 test/tests/providers/capi/capa-mimir/platform/atlas/alerting-rules/prometheus-agent.rules.test.yml
 delete mode 100644 test/tests/providers/capi/capa-mimir/platform/honeybadger/alerting-rules/zot.rules.test.yml
 rename test/tests/providers/capi/{capa-mimir => capa}/kaas/phoenix/alerting-rules/capa.inhibition.rules.test.yml (100%)
 create mode 100644 test/tests/providers/capi/capa/kaas/turtles/alerting-rules/capi-cluster.rules.test.yml
 create mode 100644 test/tests/providers/capi/capa/kaas/turtles/alerting-rules/capi-kubeadmcontrolplane.rules.test.yml
 create mode 100644 test/tests/providers/capi/capa/kaas/turtles/alerting-rules/capi-machine.rules.test.yml
 create mode 100644 test/tests/providers/capi/capa/kaas/turtles/alerting-rules/capi-machinedeployment.rules.test.yml
 create mode 100644 test/tests/providers/capi/capa/kaas/turtles/alerting-rules/capi-machinepool.rules.test.yml
 create mode 100644 test/tests/providers/capi/capa/kaas/turtles/alerting-rules/capi-machineset.rules.test.yml
 create mode 100644 test/tests/providers/capi/capa/kaas/turtles/alerting-rules/capi.rules.test.yml
 create mode 100644 test/tests/providers/capi/capa/kaas/turtles/alerting-rules/certificate.all.rules.test.yml
 create mode 100644 test/tests/providers/capi/capa/kaas/turtles/alerting-rules/node-exporter.rules.test.yml
 rename test/tests/providers/capi/{capa-mimir => capa}/platform/atlas/alerting-rules/grafana-cloud.test.yml (100%)
 rename test/tests/providers/capi/{capa-mimir => capa}/platform/atlas/alerting-rules/mimir.rules.test.yml (100%)
 create mode 100644 test/tests/providers/capi/capa/platform/shield/alerting-rules/cert-manager.rules.test.yml
 create mode 100644 test/tests/providers/capi/capa/platform/shield/alerting-rules/teleport.rules.test.yml
 create mode 100644 test/tests/providers/capi/capz/platform/atlas/alerting-rules/grafana-cloud.test.yml
 create mode 100644 test/tests/providers/capi/capz/platform/atlas/alerting-rules/mimir.rules.test.yml

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 8c84e2421..485250efb 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -7,6 +7,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## [Unreleased]
 
+### Removed
+
+- Remove the `mimir.enabled` property to replace it with the MC flavor as all CAPI MCs now run Mimir.
+
 ## [4.24.1] - 2024-11-12
 
 ### Fixed
diff --git a/README.md b/README.md
index a704e0a98..c6e50b285 100644
--- a/README.md
+++ b/README.md
@@ -168,11 +168,11 @@ There are 2 kinds of tests on rules:
 ```
 [...]
 ### Testing platform/atlas/alerting-rules/prometheus-operator.rules.yml
-### promtool check rules /home/marie/github-repo/prometheus-rules/test/hack/output/generated/capi/capa-mimir/platform/atlas/alerting-rules/prometheus-operator.rules.yml
+### promtool check rules /home/marie/github-repo/prometheus-rules/test/hack/output/generated/capi/capa/platform/atlas/alerting-rules/prometheus-operator.rules.yml
 ### Skipping platform/atlas/alerting-rules/prometheus-operator.rules.yml: listed in test/conf/promtool_ignore
 ### Testing platform/atlas/alerting-rules/prometheus.rules.yml
-### promtool check rules /home/marie/github-repo/prometheus-rules/test/hack/output/generated/capi/capa-mimir/platform/atlas/alerting-rules/prometheus.rules.yml
-### promtool test rules prometheus.rules.test.yml - capi/capa-mimir
+### promtool check rules /home/marie/github-repo/prometheus-rules/test/hack/output/generated/capi/capa/platform/atlas/alerting-rules/prometheus.rules.yml
+### promtool test rules prometheus.rules.test.yml - capi/capa
 [...]
 09:06:29 promtool: end (Elapsed time: 1s)
 Congratulations! Prometheus rules have been promtool checked and tested
diff --git a/helm/prometheus-rules/templates/alloy-rules-configmap.yaml b/helm/prometheus-rules/templates/alloy-rules-configmap.yaml
index 5bb93b84f..54d8d51f4 100644
--- a/helm/prometheus-rules/templates/alloy-rules-configmap.yaml
+++ b/helm/prometheus-rules/templates/alloy-rules-configmap.yaml
@@ -1,4 +1,4 @@
-{{- if .Values.mimir.enabled }}
+{{- if eq .Values.managementCluster.provider.flavor "capi" }}
 apiVersion: v1
 kind: ConfigMap
 metadata:
diff --git a/helm/prometheus-rules/templates/alloy-rules.yaml b/helm/prometheus-rules/templates/alloy-rules.yaml
index ef23d1911..0132c9899 100644
--- a/helm/prometheus-rules/templates/alloy-rules.yaml
+++ b/helm/prometheus-rules/templates/alloy-rules.yaml
@@ -1,4 +1,4 @@
-{{- if .Values.mimir.enabled }}
+{{- if eq .Values.managementCluster.provider.flavor "capi" }}
 apiVersion: application.giantswarm.io/v1alpha1
 kind: App
 metadata:
diff --git a/helm/prometheus-rules/templates/kaas/phoenix/alerting-rules/aws-load-balancer-controller.rules.yml b/helm/prometheus-rules/templates/kaas/phoenix/alerting-rules/aws-load-balancer-controller.rules.yml
index 2f5e080f6..24863fe14 100644
--- a/helm/prometheus-rules/templates/kaas/phoenix/alerting-rules/aws-load-balancer-controller.rules.yml
+++ b/helm/prometheus-rules/templates/kaas/phoenix/alerting-rules/aws-load-balancer-controller.rules.yml
@@ -5,9 +5,9 @@ metadata:
   creationTimestamp: null
   labels:
     {{- include "labels.common" . | nindent 4 }}
-{{- if not .Values.mimir.enabled }}
+  {{- if eq .Values.managementCluster.provider.flavor "vintage" }}
     cluster_type: "workload_cluster"
-{{- end }}
+  {{- end }}
   name: aws-load-balancer-controller.rules
   namespace: {{ .Values.namespace }}
 spec:
diff --git a/helm/prometheus-rules/templates/kaas/phoenix/alerting-rules/aws.node.workload-cluster.rules.yml b/helm/prometheus-rules/templates/kaas/phoenix/alerting-rules/aws.node.workload-cluster.rules.yml
index 104e18863..6d2ace5c3 100644
--- a/helm/prometheus-rules/templates/kaas/phoenix/alerting-rules/aws.node.workload-cluster.rules.yml
+++ b/helm/prometheus-rules/templates/kaas/phoenix/alerting-rules/aws.node.workload-cluster.rules.yml
@@ -4,9 +4,9 @@ metadata:
   creationTimestamp: null
   labels:
     {{- include "labels.common" .
| nindent 4 }} -{{- if not .Values.mimir.enabled }} + {{- if eq .Values.managementCluster.provider.flavor "vintage" }} cluster_type: "workload_cluster" -{{- end }} + {{- end }} name: node.aws.workload-cluster.rules namespace: {{ .Values.namespace }} spec: diff --git a/helm/prometheus-rules/templates/kaas/phoenix/alerting-rules/aws.workload-cluster.rules.yml b/helm/prometheus-rules/templates/kaas/phoenix/alerting-rules/aws.workload-cluster.rules.yml index 1306de635..db06f9b0b 100644 --- a/helm/prometheus-rules/templates/kaas/phoenix/alerting-rules/aws.workload-cluster.rules.yml +++ b/helm/prometheus-rules/templates/kaas/phoenix/alerting-rules/aws.workload-cluster.rules.yml @@ -5,7 +5,7 @@ metadata: creationTimestamp: null labels: {{- include "labels.common" . | nindent 4 }} - {{- if not .Values.mimir.enabled }} + {{- if eq .Values.managementCluster.provider.flavor "vintage" }} cluster_type: "workload_cluster" {{- end }} name: aws.workload-cluster.rules diff --git a/helm/prometheus-rules/templates/kaas/phoenix/alerting-rules/capa.management-cluster.rules.yml b/helm/prometheus-rules/templates/kaas/phoenix/alerting-rules/capa.management-cluster.rules.yml index 1e9cdb2e7..32d0848fb 100644 --- a/helm/prometheus-rules/templates/kaas/phoenix/alerting-rules/capa.management-cluster.rules.yml +++ b/helm/prometheus-rules/templates/kaas/phoenix/alerting-rules/capa.management-cluster.rules.yml @@ -6,9 +6,9 @@ metadata: creationTimestamp: null labels: {{- include "labels.common" . | nindent 4 }} -{{- if not .Values.mimir.enabled }} + {{- if eq .Values.managementCluster.provider.flavor "vintage" }} cluster_type: "management_cluster" -{{- end }} + {{- end }} name: capa.management-cluster.rules namespace: {{ .Values.namespace }} spec: diff --git a/helm/prometheus-rules/templates/kaas/phoenix/alerting-rules/irsa.rules.yml b/helm/prometheus-rules/templates/kaas/phoenix/alerting-rules/irsa.rules.yml index 993ca2c07..e1fd083d4 100644 --- a/helm/prometheus-rules/templates/kaas/phoenix/alerting-rules/irsa.rules.yml +++ b/helm/prometheus-rules/templates/kaas/phoenix/alerting-rules/irsa.rules.yml @@ -3,9 +3,9 @@ kind: PrometheusRule metadata: labels: {{- include "labels.common" . | nindent 4 }} -{{- if not .Values.mimir.enabled }} + {{- if eq .Values.managementCluster.provider.flavor "vintage" }} cluster_type: "management_cluster" -{{- end }} + {{- end }} name: irsa.rules namespace: {{ .Values.namespace }} spec: diff --git a/helm/prometheus-rules/templates/kaas/turtles/alerting-rules/apiserver.management-cluster.rules.yml b/helm/prometheus-rules/templates/kaas/turtles/alerting-rules/apiserver.management-cluster.rules.yml index e0877f4f5..d69bcdc10 100644 --- a/helm/prometheus-rules/templates/kaas/turtles/alerting-rules/apiserver.management-cluster.rules.yml +++ b/helm/prometheus-rules/templates/kaas/turtles/alerting-rules/apiserver.management-cluster.rules.yml @@ -4,9 +4,9 @@ metadata: creationTimestamp: null labels: {{- include "labels.common" . 
| nindent 4 }} -{{- if not .Values.mimir.enabled }} + {{- if eq .Values.managementCluster.provider.flavor "vintage" }} cluster_type: "management_cluster" -{{- end }} + {{- end }} name: apiserver.management-cluster.rules namespace: {{ .Values.namespace }} spec: diff --git a/helm/prometheus-rules/templates/kaas/turtles/alerting-rules/apiserver.workload-cluster.rules.yml b/helm/prometheus-rules/templates/kaas/turtles/alerting-rules/apiserver.workload-cluster.rules.yml index d23245c87..f26e64816 100644 --- a/helm/prometheus-rules/templates/kaas/turtles/alerting-rules/apiserver.workload-cluster.rules.yml +++ b/helm/prometheus-rules/templates/kaas/turtles/alerting-rules/apiserver.workload-cluster.rules.yml @@ -4,9 +4,9 @@ metadata: creationTimestamp: null labels: {{- include "labels.common" . | nindent 4 }} -{{- if not .Values.mimir.enabled }} + {{- if eq .Values.managementCluster.provider.flavor "vintage" }} cluster_type: "workload_cluster" -{{- end }} + {{- end }} name: apiserver.workload-cluster.rules namespace: {{ .Values.namespace }} spec: diff --git a/helm/prometheus-rules/templates/kaas/turtles/alerting-rules/capi.management-cluster.rules.yml b/helm/prometheus-rules/templates/kaas/turtles/alerting-rules/capi.management-cluster.rules.yml index aed92be3a..aba6ac4d5 100644 --- a/helm/prometheus-rules/templates/kaas/turtles/alerting-rules/capi.management-cluster.rules.yml +++ b/helm/prometheus-rules/templates/kaas/turtles/alerting-rules/capi.management-cluster.rules.yml @@ -4,9 +4,9 @@ kind: PrometheusRule metadata: labels: {{- include "labels.common" . | nindent 4}} -{{- if not .Values.mimir.enabled }} + {{- if eq .Values.managementCluster.provider.flavor "vintage" }} cluster_type: "management_cluster" -{{- end }} + {{- end }} name: capi.management-cluster.rules namespace: {{.Values.namespace}} spec: diff --git a/helm/prometheus-rules/templates/kaas/turtles/alerting-rules/certificate.management-cluster.rules.yml b/helm/prometheus-rules/templates/kaas/turtles/alerting-rules/certificate.management-cluster.rules.yml index c113c46d6..db0538d2a 100644 --- a/helm/prometheus-rules/templates/kaas/turtles/alerting-rules/certificate.management-cluster.rules.yml +++ b/helm/prometheus-rules/templates/kaas/turtles/alerting-rules/certificate.management-cluster.rules.yml @@ -4,9 +4,9 @@ metadata: creationTimestamp: null labels: {{- include "labels.common" . | nindent 4 }} -{{- if not .Values.mimir.enabled }} + {{- if eq .Values.managementCluster.provider.flavor "vintage" }} cluster_type: "management_cluster" -{{- end }} + {{- end }} name: certificate.management-cluster.rules namespace: {{ .Values.namespace }} spec: diff --git a/helm/prometheus-rules/templates/kaas/turtles/alerting-rules/certificate.workload-cluster.rules.yml b/helm/prometheus-rules/templates/kaas/turtles/alerting-rules/certificate.workload-cluster.rules.yml index 70def5eee..86027745b 100644 --- a/helm/prometheus-rules/templates/kaas/turtles/alerting-rules/certificate.workload-cluster.rules.yml +++ b/helm/prometheus-rules/templates/kaas/turtles/alerting-rules/certificate.workload-cluster.rules.yml @@ -4,9 +4,9 @@ metadata: creationTimestamp: null labels: {{- include "labels.common" . 
| nindent 4 }} -{{- if not .Values.mimir.enabled }} + {{- if eq .Values.managementCluster.provider.flavor "vintage" }} cluster_type: "workload_cluster" -{{- end }} + {{- end }} name: certificate.workload-cluster.rules namespace: {{ .Values.namespace }} spec: diff --git a/helm/prometheus-rules/templates/kaas/turtles/alerting-rules/cluster-autoscaler.rules.yml b/helm/prometheus-rules/templates/kaas/turtles/alerting-rules/cluster-autoscaler.rules.yml index c47475cb5..c44e1e9e9 100644 --- a/helm/prometheus-rules/templates/kaas/turtles/alerting-rules/cluster-autoscaler.rules.yml +++ b/helm/prometheus-rules/templates/kaas/turtles/alerting-rules/cluster-autoscaler.rules.yml @@ -5,9 +5,9 @@ metadata: creationTimestamp: null labels: {{- include "labels.common" . | nindent 4 }} -{{- if not .Values.mimir.enabled }} + {{- if eq .Values.managementCluster.provider.flavor "vintage" }} cluster_type: "workload_cluster" -{{- end }} + {{- end }} name: cluster-autoscaler.rules namespace: {{ .Values.namespace }} spec: diff --git a/helm/prometheus-rules/templates/kaas/turtles/alerting-rules/etcd.management-cluster.rules.yml b/helm/prometheus-rules/templates/kaas/turtles/alerting-rules/etcd.management-cluster.rules.yml index b28bdeceb..790646a89 100644 --- a/helm/prometheus-rules/templates/kaas/turtles/alerting-rules/etcd.management-cluster.rules.yml +++ b/helm/prometheus-rules/templates/kaas/turtles/alerting-rules/etcd.management-cluster.rules.yml @@ -4,9 +4,9 @@ metadata: creationTimestamp: null labels: {{- include "labels.common" . | nindent 4 }} -{{- if not .Values.mimir.enabled }} + {{- if eq .Values.managementCluster.provider.flavor "vintage" }} cluster_type: "management_cluster" -{{- end }} + {{- end }} name: etcd.management-cluster.rules namespace: {{ .Values.namespace }} spec: diff --git a/helm/prometheus-rules/templates/kaas/turtles/alerting-rules/etcd.workload-cluster.rules.yml b/helm/prometheus-rules/templates/kaas/turtles/alerting-rules/etcd.workload-cluster.rules.yml index 222edb370..44aa8e9fc 100644 --- a/helm/prometheus-rules/templates/kaas/turtles/alerting-rules/etcd.workload-cluster.rules.yml +++ b/helm/prometheus-rules/templates/kaas/turtles/alerting-rules/etcd.workload-cluster.rules.yml @@ -4,9 +4,9 @@ metadata: creationTimestamp: null labels: {{- include "labels.common" . | nindent 4 }} -{{- if not .Values.mimir.enabled }} + {{- if eq .Values.managementCluster.provider.flavor "vintage" }} cluster_type: "workload_cluster" -{{- end }} + {{- end }} name: etcd.workload-cluster.rules namespace: {{ .Values.namespace }} spec: diff --git a/helm/prometheus-rules/templates/kaas/turtles/alerting-rules/etcdbackup.rules.yml b/helm/prometheus-rules/templates/kaas/turtles/alerting-rules/etcdbackup.rules.yml index 7dea38eeb..4291a1a72 100644 --- a/helm/prometheus-rules/templates/kaas/turtles/alerting-rules/etcdbackup.rules.yml +++ b/helm/prometheus-rules/templates/kaas/turtles/alerting-rules/etcdbackup.rules.yml @@ -4,9 +4,9 @@ metadata: creationTimestamp: null labels: {{- include "labels.common" . 
| nindent 4 }} -{{- if not .Values.mimir.enabled }} + {{- if eq .Values.managementCluster.provider.flavor "vintage" }} cluster_type: "management_cluster" -{{- end }} + {{- end }} name: etcdbackup.rules namespace: {{ .Values.namespace }} spec: diff --git a/helm/prometheus-rules/templates/kaas/turtles/alerting-rules/inhibit.nodes.rules.yml b/helm/prometheus-rules/templates/kaas/turtles/alerting-rules/inhibit.nodes.rules.yml index 735a771dc..984fa7070 100644 --- a/helm/prometheus-rules/templates/kaas/turtles/alerting-rules/inhibit.nodes.rules.yml +++ b/helm/prometheus-rules/templates/kaas/turtles/alerting-rules/inhibit.nodes.rules.yml @@ -4,9 +4,9 @@ metadata: creationTimestamp: null labels: {{- include "labels.common" . | nindent 4 }} -{{- if not .Values.mimir.enabled }} + {{- if eq .Values.managementCluster.provider.flavor "vintage" }} cluster_type: "management_cluster" -{{- end }} + {{- end }} name: inhibit.nodes.rules namespace: {{ .Values.namespace }} spec: diff --git a/helm/prometheus-rules/templates/kaas/turtles/alerting-rules/management-cluster.rules.yml b/helm/prometheus-rules/templates/kaas/turtles/alerting-rules/management-cluster.rules.yml index 17865dc57..6f8fa87c2 100644 --- a/helm/prometheus-rules/templates/kaas/turtles/alerting-rules/management-cluster.rules.yml +++ b/helm/prometheus-rules/templates/kaas/turtles/alerting-rules/management-cluster.rules.yml @@ -4,9 +4,9 @@ metadata: creationTimestamp: null labels: {{- include "labels.common" . | nindent 4 }} -{{- if not .Values.mimir.enabled }} + {{- if eq .Values.managementCluster.provider.flavor "vintage" }} cluster_type: "management_cluster" -{{- end }} + {{- end }} name: management-cluster.rules namespace: {{ .Values.namespace }} spec: diff --git a/helm/prometheus-rules/templates/kaas/turtles/alerting-rules/node.management-cluster.rules.yml b/helm/prometheus-rules/templates/kaas/turtles/alerting-rules/node.management-cluster.rules.yml index d67f64279..5ab9ac304 100644 --- a/helm/prometheus-rules/templates/kaas/turtles/alerting-rules/node.management-cluster.rules.yml +++ b/helm/prometheus-rules/templates/kaas/turtles/alerting-rules/node.management-cluster.rules.yml @@ -4,9 +4,9 @@ metadata: creationTimestamp: null labels: {{- include "labels.common" . | nindent 4 }} -{{- if not .Values.mimir.enabled }} + {{- if eq .Values.managementCluster.provider.flavor "vintage" }} cluster_type: "management_cluster" -{{- end }} + {{- end }} name: node.management-cluster.rules namespace: {{ .Values.namespace }} spec: diff --git a/helm/prometheus-rules/templates/kaas/turtles/alerting-rules/node.workload-cluster.rules.yml b/helm/prometheus-rules/templates/kaas/turtles/alerting-rules/node.workload-cluster.rules.yml index 0507246f3..6a30a570e 100644 --- a/helm/prometheus-rules/templates/kaas/turtles/alerting-rules/node.workload-cluster.rules.yml +++ b/helm/prometheus-rules/templates/kaas/turtles/alerting-rules/node.workload-cluster.rules.yml @@ -4,9 +4,9 @@ metadata: creationTimestamp: null labels: {{- include "labels.common" . 
| nindent 4 }} -{{- if not .Values.mimir.enabled }} + {{- if eq .Values.managementCluster.provider.flavor "vintage" }} cluster_type: "workload_cluster" -{{- end }} + {{- end }} name: node.workload-cluster.rules namespace: {{ .Values.namespace }} spec: diff --git a/helm/prometheus-rules/templates/kaas/turtles/alerting-rules/pods.core.rules.yml b/helm/prometheus-rules/templates/kaas/turtles/alerting-rules/pods.core.rules.yml index 61dced935..0bd99a509 100644 --- a/helm/prometheus-rules/templates/kaas/turtles/alerting-rules/pods.core.rules.yml +++ b/helm/prometheus-rules/templates/kaas/turtles/alerting-rules/pods.core.rules.yml @@ -4,7 +4,7 @@ metadata: creationTimestamp: null labels: {{- include "labels.common" . | nindent 4 }} - {{- if not .Values.mimir.enabled }} + {{- if eq .Values.managementCluster.provider.flavor "vintage" }} cluster_type: "workload_cluster" {{- end }} name: pods.core.rules diff --git a/helm/prometheus-rules/templates/kaas/turtles/alerting-rules/storage.management-cluster.rules.yml b/helm/prometheus-rules/templates/kaas/turtles/alerting-rules/storage.management-cluster.rules.yml index 9f27fb3c1..591515777 100644 --- a/helm/prometheus-rules/templates/kaas/turtles/alerting-rules/storage.management-cluster.rules.yml +++ b/helm/prometheus-rules/templates/kaas/turtles/alerting-rules/storage.management-cluster.rules.yml @@ -4,9 +4,9 @@ metadata: creationTimestamp: null labels: {{- include "labels.common" . | nindent 4 }} -{{- if not .Values.mimir.enabled }} + {{- if eq .Values.managementCluster.provider.flavor "vintage" }} cluster_type: "management_cluster" -{{- end }} + {{- end }} name: core.storage.management-cluster.rules namespace: {{ .Values.namespace }} spec: diff --git a/helm/prometheus-rules/templates/kaas/turtles/alerting-rules/storage.workload-cluster.rules.yml b/helm/prometheus-rules/templates/kaas/turtles/alerting-rules/storage.workload-cluster.rules.yml index 72b7d6e06..a19a9035d 100644 --- a/helm/prometheus-rules/templates/kaas/turtles/alerting-rules/storage.workload-cluster.rules.yml +++ b/helm/prometheus-rules/templates/kaas/turtles/alerting-rules/storage.workload-cluster.rules.yml @@ -4,9 +4,9 @@ metadata: creationTimestamp: null labels: {{- include "labels.common" . | nindent 4 }} -{{- if not .Values.mimir.enabled }} + {{- if eq .Values.managementCluster.provider.flavor "vintage" }} cluster_type: "workload_cluster" -{{- end }} + {{- end }} name: core.storage.workload-cluster.rules namespace: {{ .Values.namespace }} spec: diff --git a/helm/prometheus-rules/templates/loki-ruler-datasource-configmap.yaml b/helm/prometheus-rules/templates/loki-ruler-datasource-configmap.yaml index 988bce7e7..60d9a16e1 100644 --- a/helm/prometheus-rules/templates/loki-ruler-datasource-configmap.yaml +++ b/helm/prometheus-rules/templates/loki-ruler-datasource-configmap.yaml @@ -1,4 +1,4 @@ -{{- if .Values.mimir.enabled }} +{{- if eq .Values.managementCluster.provider.flavor "capi" }} apiVersion: v1 kind: ConfigMap metadata: diff --git a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/deployment.management-cluster.rules.yml b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/deployment.management-cluster.rules.yml index 1f98fe451..be6a9f5a2 100644 --- a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/deployment.management-cluster.rules.yml +++ b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/deployment.management-cluster.rules.yml @@ -4,9 +4,9 @@ metadata: creationTimestamp: null labels: {{- include "labels.common" . 
| nindent 4 }} -{{- if not .Values.mimir.enabled }} + {{- if eq .Values.managementCluster.provider.flavor "vintage" }} cluster_type: "management_cluster" -{{- end }} + {{- end }} name: deployment.management-cluster.rules namespace: {{ .Values.namespace }} spec: diff --git a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/deployment.workload-cluster.rules.yml b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/deployment.workload-cluster.rules.yml index fa9087331..ca7422b1d 100644 --- a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/deployment.workload-cluster.rules.yml +++ b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/deployment.workload-cluster.rules.yml @@ -4,9 +4,9 @@ metadata: creationTimestamp: null labels: {{- include "labels.common" . | nindent 4 }} -{{- if not .Values.mimir.enabled }} + {{- if eq .Values.managementCluster.provider.flavor "vintage" }} cluster_type: "workload_cluster" -{{- end}} + {{- end }} name: deployment.workload-cluster.rules namespace: {{ .Values.namespace }} spec: diff --git a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/grafana-cloud.rules.yml b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/grafana-cloud.rules.yml index 9560570ef..2022f4fde 100644 --- a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/grafana-cloud.rules.yml +++ b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/grafana-cloud.rules.yml @@ -3,7 +3,7 @@ kind: PrometheusRule metadata: labels: {{- include "labels.common" . | nindent 4 }} - {{- if not .Values.mimir.enabled }} + {{- if eq .Values.managementCluster.provider.flavor "vintage" }} cluster_type: "management_cluster" {{- end }} name: grafana-cloud.rules @@ -17,7 +17,7 @@ spec: annotations: description: 'Prometheus is not sending data to Grafana Cloud.' opsrecipe: prometheus-grafanacloud/ - {{- if .Values.mimir.enabled }} + {{- if eq .Values.managementCluster.provider.flavor "capi" }} expr: absent(prometheus_remote_storage_samples_total{remote_name="grafana-cloud", cluster_type="management_cluster", cluster_id="{{ .Values.managementCluster.name }}", installation="{{ .Values.managementCluster.name }}", provider="{{ .Values.managementCluster.provider.kind }}", pipeline="{{ .Values.managementCluster.pipeline }}"}) {{- else }} expr: absent(prometheus_remote_storage_samples_total{remote_name="grafana-cloud"}) @@ -29,7 +29,7 @@ spec: severity: page team: atlas topic: observability - {{- if .Values.mimir.enabled }} + {{- if eq .Values.managementCluster.provider.flavor "capi" }} - name: mimir-to-grafana-cloud-exporter rules: - alert: MimirToGrafanaCloudExporterDown diff --git a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/grafana.rules.yml b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/grafana.rules.yml index 97a10780b..977840aa1 100644 --- a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/grafana.rules.yml +++ b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/grafana.rules.yml @@ -3,7 +3,7 @@ kind: PrometheusRule metadata: labels: {{- include "labels.common" . 
| nindent 4 }} - {{- if not .Values.mimir.enabled }} + {{- if eq .Values.managementCluster.provider.flavor "vintage" }} cluster_type: "management_cluster" {{- end }} name: grafana.rules diff --git a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/kube-state-metrics.rules.yml b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/kube-state-metrics.rules.yml index 83089fc33..7fa5beeb9 100644 --- a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/kube-state-metrics.rules.yml +++ b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/kube-state-metrics.rules.yml @@ -14,7 +14,7 @@ spec: annotations: description: '{{`KubeStateMetrics is down.`}}' opsrecipe: kube-state-metrics-down/ - {{- if not .Values.mimir.enabled }} + {{- if eq .Values.managementCluster.provider.flavor "vintage" }} expr: |- label_replace(up{job="kube-state-metrics",instance=~".*:8080"}, "ip", "$1.$2.$3.$4", "node", "ip-(\\d+)-(\\d+)-(\\d+)-(\\d+).*") == 0 or absent(up{job="kube-state-metrics",instance=~".*:8080"} == 1) {{- else }} @@ -89,7 +89,7 @@ spec: annotations: description: '{{`kube_configmap_created metric is missing for cluster {{ $labels.cluster_id }}.`}}' opsrecipe: kube-state-metrics-down/ - {{- if not .Values.mimir.enabled }} + {{- if eq .Values.managementCluster.provider.flavor "vintage" }} expr: absent(kube_configmap_created{}) {{- else }} expr: |- @@ -117,7 +117,7 @@ spec: annotations: description: '{{`kube_daemonset_created metric is missing for cluster {{ $labels.cluster_id }}.`}}' opsrecipe: kube-state-metrics-down/ - {{- if not .Values.mimir.enabled }} + {{- if eq .Values.managementCluster.provider.flavor "vintage" }} expr: absent(kube_daemonset_created{}) {{- else }} expr: |- @@ -145,7 +145,7 @@ spec: annotations: description: '{{`kube_deployment_created metric is missing for cluster {{ $labels.cluster_id }}.`}}' opsrecipe: kube-state-metrics-down/ - {{- if not .Values.mimir.enabled }} + {{- if eq .Values.managementCluster.provider.flavor "vintage" }} expr: absent(kube_deployment_created{}) {{- else }} expr: |- @@ -173,7 +173,7 @@ spec: annotations: description: '{{`kube_endpoint_created metric is missing for cluster {{ $labels.cluster_id }}.`}}' opsrecipe: kube-state-metrics-down/ - {{- if not .Values.mimir.enabled }} + {{- if eq .Values.managementCluster.provider.flavor "vintage" }} expr: absent(kube_endpoint_created{}) {{- else }} expr: |- @@ -201,7 +201,7 @@ spec: annotations: description: '{{`kube_namespace_created metric is missing for cluster {{ $labels.cluster_id }}.`}}' opsrecipe: kube-state-metrics-down/ - {{- if not .Values.mimir.enabled }} + {{- if eq .Values.managementCluster.provider.flavor "vintage" }} expr: absent(kube_namespace_created{}) {{- else }} expr: |- @@ -229,7 +229,7 @@ spec: annotations: description: '{{`kube_node_created metric is missing for cluster {{ $labels.cluster_id }}.`}}' opsrecipe: kube-state-metrics-down/ - {{- if not .Values.mimir.enabled }} + {{- if eq .Values.managementCluster.provider.flavor "vintage" }} expr: absent(kube_node_created{}) {{- else }} expr: |- @@ -257,7 +257,7 @@ spec: annotations: description: '{{`kube_pod_created metric is missing for cluster {{ $labels.cluster_id }}.`}}' opsrecipe: kube-state-metrics-down/ - {{- if not .Values.mimir.enabled }} + {{- if eq .Values.managementCluster.provider.flavor "vintage" }} expr: absent(kube_pod_created{}) {{- else }} expr: |- @@ -285,7 +285,7 @@ spec: annotations: description: '{{`kube_replicaset_created metric is missing for cluster {{ $labels.cluster_id }}.`}}' 
opsrecipe: kube-state-metrics-down/ - {{- if not .Values.mimir.enabled }} + {{- if eq .Values.managementCluster.provider.flavor "vintage" }} expr: absent(kube_replicaset_created{}) {{- else }} expr: |- @@ -313,7 +313,7 @@ spec: annotations: description: '{{`kube_secret_created metric is missing for cluster {{ $labels.cluster_id }}.`}}' opsrecipe: kube-state-metrics-down/ - {{- if not .Values.mimir.enabled }} + {{- if eq .Values.managementCluster.provider.flavor "vintage" }} expr: absent(kube_secret_created{}) {{- else }} expr: |- @@ -341,7 +341,7 @@ spec: annotations: description: '{{`kube_service_created metric is missing for cluster {{ $labels.cluster_id }}.`}}' opsrecipe: kube-state-metrics-down/ - {{- if not .Values.mimir.enabled }} + {{- if eq .Values.managementCluster.provider.flavor "vintage" }} expr: absent(kube_service_created{}) {{- else }} expr: |- diff --git a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/mimir.rules.yml b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/mimir.rules.yml index 6dc137889..294c5d155 100644 --- a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/mimir.rules.yml +++ b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/mimir.rules.yml @@ -1,4 +1,4 @@ -{{- if .Values.mimir.enabled }} +{{- if eq .Values.managementCluster.provider.flavor "capi" }} apiVersion: monitoring.coreos.com/v1 kind: PrometheusRule metadata: diff --git a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/prometheus-agent.rules.yml b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/prometheus-agent.rules.yml index 99a34f86c..73c749b42 100644 --- a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/prometheus-agent.rules.yml +++ b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/prometheus-agent.rules.yml @@ -9,7 +9,7 @@ spec: groups: - name: prometheus-agent rules: - {{- if not .Values.mimir.enabled }} + {{- if eq .Values.managementCluster.provider.flavor "vintage" }} ## This alert pages if prometheus-agent fails to send samples to its remote write endpoint. - alert: PrometheusAgentFailing annotations: diff --git a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/prometheus-meta-operator.rules.yml b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/prometheus-meta-operator.rules.yml index 98865562f..ff81b5e41 100644 --- a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/prometheus-meta-operator.rules.yml +++ b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/prometheus-meta-operator.rules.yml @@ -9,7 +9,7 @@ spec: groups: - name: observability rules: - {{- if not .Values.mimir.enabled }} + {{- if eq .Values.managementCluster.provider.flavor "vintage" }} - alert: "Heartbeat" expr: up{job=~".*prometheus/prometheus.*",instance!="prometheus-agent"} == 1 labels: diff --git a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/statefulset.rules.yml b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/statefulset.rules.yml index 1c546f359..439a96426 100644 --- a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/statefulset.rules.yml +++ b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/statefulset.rules.yml @@ -4,9 +4,9 @@ metadata: creationTimestamp: null labels: {{- include "labels.common" . 
| nindent 4 }} -{{- if not .Values.mimir.enabled }} + {{- if eq .Values.managementCluster.provider.flavor "vintage" }} cluster_type: "management_cluster" -{{- end }} + {{- end }} name: statefulset.rules namespace: {{ .Values.namespace }} spec: diff --git a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/storage.rules.yml b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/storage.rules.yml index 7b0798d5d..8490e4a79 100644 --- a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/storage.rules.yml +++ b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/storage.rules.yml @@ -4,9 +4,9 @@ metadata: creationTimestamp: null labels: {{- include "labels.common" . | nindent 4 }} -{{- if not .Values.mimir.enabled }} + {{- if eq .Values.managementCluster.provider.flavor "vintage" }} cluster_type: "management_cluster" -{{- end }} + {{- end }} name: observability.storage.rules namespace: {{ .Values.namespace }} spec: diff --git a/helm/prometheus-rules/templates/platform/atlas/recording-rules/grafana-cloud.rules.yml b/helm/prometheus-rules/templates/platform/atlas/recording-rules/grafana-cloud.rules.yml index 3ebe08974..20bee678b 100644 --- a/helm/prometheus-rules/templates/platform/atlas/recording-rules/grafana-cloud.rules.yml +++ b/helm/prometheus-rules/templates/platform/atlas/recording-rules/grafana-cloud.rules.yml @@ -344,7 +344,7 @@ spec: rules: - expr: sum(ALERTS{alertstate="firing"}) by (alertname, cluster_id, cluster_type, customer, installation, pipeline, provider, region, area, severity, team, topic) record: aggregation:prometheus:alerts - {{- if not .Values.mimir.enabled }} + {{- if eq .Values.managementCluster.provider.flavor "vintage" }} # Metric container_memory_working_set_bytes comes from the cAdvisor component scraped on management clusters which is then scraped by the management cluster prometheus. # This means the cluster_id label on this metric will be the cluster_id of the management cluster for all the series, not the workload cluster id. # As we want to record the memory usage of the prometheis per cluster, we need to extract the cluster id from the prometheus pod name (i.e. pod=prometheus-xyz-ordinal => cluster_id=xyz). 
@@ -353,7 +353,7 @@ spec: - expr: sum(label_replace(container_memory_working_set_bytes{container='prometheus', namespace=~'.*-prometheus'}, "cluster_id", "$2", "pod", "(prometheus-)(.+)(-.+)")) by (cluster_id, cluster_type, customer, installation, pipeline, provider, region) record: aggregation:prometheus:memory_usage {{- end }} - {{- if .Values.mimir.enabled }} + {{- if eq .Values.managementCluster.provider.flavor "capi" }} - name: mimir.grafana-cloud.recording rules: - expr: sum(container_memory_working_set_bytes{namespace='mimir', cluster_type="management_cluster", container=~'.+'}) by (cluster_id, cluster_type, customer, installation, pipeline, provider, region) diff --git a/helm/prometheus-rules/templates/platform/atlas/recording-rules/mimir-mixins.rules.yml b/helm/prometheus-rules/templates/platform/atlas/recording-rules/mimir-mixins.rules.yml index d41a406b5..7d0247b6a 100644 --- a/helm/prometheus-rules/templates/platform/atlas/recording-rules/mimir-mixins.rules.yml +++ b/helm/prometheus-rules/templates/platform/atlas/recording-rules/mimir-mixins.rules.yml @@ -1,4 +1,4 @@ -{{- if .Values.mimir.enabled }} +{{- if eq .Values.managementCluster.provider.flavor "capi" }} apiVersion: monitoring.coreos.com/v1 kind: PrometheusRule metadata: diff --git a/helm/prometheus-rules/templates/platform/atlas/recording-rules/monitoring.resource-usage-estimation.rules.yaml b/helm/prometheus-rules/templates/platform/atlas/recording-rules/monitoring.resource-usage-estimation.rules.yaml index 81a946f09..c402ff83d 100644 --- a/helm/prometheus-rules/templates/platform/atlas/recording-rules/monitoring.resource-usage-estimation.rules.yaml +++ b/helm/prometheus-rules/templates/platform/atlas/recording-rules/monitoring.resource-usage-estimation.rules.yaml @@ -9,7 +9,7 @@ spec: groups: - name: monitoring.resource-usage-estimation.recording rules: - {{- if .Values.mimir.enabled }} + {{- if eq .Values.managementCluster.provider.flavor "capi" }} - expr: (sum(scrape_samples_post_metric_relabeling) by (cluster_id, job) / on(cluster_id) group_left sum(cortex_ingester_active_series{container="ingester"}) by (cluster_id)) * on(cluster_id) group_left sum(container_memory_usage_bytes{container="ingester", namespace="mimir"}) by (cluster_id) record: giantswarm:observability:monitoring:resource_usage_estimation:memory_usage_bytes - expr: (sum(scrape_samples_post_metric_relabeling) by (cluster_id, job) / on(cluster_id) group_left sum(cortex_ingester_active_series{container="ingester"}) by (cluster_id)) * on(cluster_id) group_left sum(container_memory_working_set_bytes{container="ingester", namespace="mimir"}) by (cluster_id) diff --git a/helm/prometheus-rules/templates/platform/cabbage/alerting-rules/external-dns.rules.yml b/helm/prometheus-rules/templates/platform/cabbage/alerting-rules/external-dns.rules.yml index 6ba5a7fa2..d7557af5e 100644 --- a/helm/prometheus-rules/templates/platform/cabbage/alerting-rules/external-dns.rules.yml +++ b/helm/prometheus-rules/templates/platform/cabbage/alerting-rules/external-dns.rules.yml @@ -4,9 +4,9 @@ metadata: creationTimestamp: null labels: {{- include "labels.common" . 
| nindent 4 }} -{{- if not .Values.mimir.enabled }} + {{- if eq .Values.managementCluster.provider.flavor "vintage" }} cluster_type: "workload_cluster" -{{- end }} + {{- end }} name: external-dns.rules namespace: {{ .Values.namespace }} spec: diff --git a/helm/prometheus-rules/templates/platform/honeybadger/alerting-rules/app.rules.yml b/helm/prometheus-rules/templates/platform/honeybadger/alerting-rules/app.rules.yml index fac504908..fc7af2fc6 100644 --- a/helm/prometheus-rules/templates/platform/honeybadger/alerting-rules/app.rules.yml +++ b/helm/prometheus-rules/templates/platform/honeybadger/alerting-rules/app.rules.yml @@ -4,9 +4,9 @@ metadata: creationTimestamp: null labels: {{- include "labels.common" . | nindent 4 }} -{{- if not .Values.mimir.enabled }} + {{- if eq .Values.managementCluster.provider.flavor "vintage" }} cluster_type: "management_cluster" -{{- end }} + {{- end }} name: app.rules namespace: {{ .Values.namespace }} spec: diff --git a/helm/prometheus-rules/templates/platform/shield/alerting-rules/dex.rules.yml b/helm/prometheus-rules/templates/platform/shield/alerting-rules/dex.rules.yml index 453478048..2905ee3df 100644 --- a/helm/prometheus-rules/templates/platform/shield/alerting-rules/dex.rules.yml +++ b/helm/prometheus-rules/templates/platform/shield/alerting-rules/dex.rules.yml @@ -4,9 +4,9 @@ metadata: creationTimestamp: null labels: {{- include "labels.common" . | nindent 4 }} -{{- if not .Values.mimir.enabled }} + {{- if eq .Values.managementCluster.provider.flavor "vintage" }} cluster_type: "management_cluster" -{{- end }} + {{- end }} name: dex.rules namespace: {{ .Values.namespace }} spec: @@ -41,7 +41,7 @@ spec: annotations: description: '{{`dex-operator did not register a dex-app in giantswarm namespace.`}}' opsrecipe: dex-operator/ - {{- if .Values.mimir.enabled }} + {{- if eq .Values.managementCluster.provider.flavor "capi" }} expr: absent(dex_operator_idp_secret_expiry_time{app_namespace="giantswarm", cluster_type="management_cluster", cluster_id="{{ .Values.managementCluster.name }}", installation="{{ .Values.managementCluster.name }}", provider="{{ .Values.managementCluster.provider.kind }}", pipeline="{{ .Values.managementCluster.pipeline }}"}) {{- else }} expr: absent(dex_operator_idp_secret_expiry_time{app_namespace="giantswarm", cluster_type="management_cluster"}) == 1 diff --git a/helm/prometheus-rules/templates/platform/shield/alerting-rules/falco.rules.yml b/helm/prometheus-rules/templates/platform/shield/alerting-rules/falco.rules.yml index 61cd126f2..20349e01e 100644 --- a/helm/prometheus-rules/templates/platform/shield/alerting-rules/falco.rules.yml +++ b/helm/prometheus-rules/templates/platform/shield/alerting-rules/falco.rules.yml @@ -4,9 +4,9 @@ metadata: creationTimestamp: null labels: {{- include "labels.common" . 
| nindent 4 }}
-{{- if not .Values.mimir.enabled }}
+  {{- if eq .Values.managementCluster.provider.flavor "vintage" }}
     cluster_type: "management_cluster"
-{{- end }}
+  {{- end }}
   name: falco.rules
   namespace: {{ .Values.namespace }}
 spec:
diff --git a/helm/prometheus-rules/values.schema.json b/helm/prometheus-rules/values.schema.json
index 780796c8c..414afa24d 100644
--- a/helm/prometheus-rules/values.schema.json
+++ b/helm/prometheus-rules/values.schema.json
@@ -30,14 +30,6 @@
         }
       }
     },
-    "mimir": {
-      "type": "object",
-      "properties": {
-        "enabled": {
-          "type": "boolean"
-        }
-      }
-    },
     "name": {
       "type": "string"
     },
diff --git a/helm/prometheus-rules/values.yaml b/helm/prometheus-rules/values.yaml
index 409130af9..0388578cf 100644
--- a/helm/prometheus-rules/values.yaml
+++ b/helm/prometheus-rules/values.yaml
@@ -10,9 +10,6 @@ managementCluster:
     flavor: ""
   region: ""
 
-mimir:
-  enabled: false
-
 Installation:
   V1:
     Guest:
diff --git a/mimir/update.sh b/mimir/update.sh
index 05ddd4bbc..7980f939a 100755
--- a/mimir/update.sh
+++ b/mimir/update.sh
@@ -36,7 +36,7 @@ spec:\
   groups:' "$OUTPUT_FILE"
 
 # Add the mimir enabled helm conditional blocks
-sed -i '1i{{- if .Values.mimir.enabled }}' "$OUTPUT_FILE"
+sed -i '1i{{- if eq .Values.managementCluster.provider.flavor "capi" }}' "$OUTPUT_FILE"
 sed -i -e '$a{{- end }}' "$OUTPUT_FILE"
 
 sed -i 's/cluster_id,/cluster_id, installation, pipeline, provider,/g' "$OUTPUT_FILE"
diff --git a/test/conf/providers b/test/conf/providers
index c22316aea..5425cc445 100644
--- a/test/conf/providers
+++ b/test/conf/providers
@@ -1,4 +1,3 @@
 vintage/aws
 capi/capz
 capi/capa
-capi/capa-mimir
diff --git a/test/hack/bin/run-pint.sh b/test/hack/bin/run-pint.sh
index a5aa0150e..84520c5ed 100755
--- a/test/hack/bin/run-pint.sh
+++ b/test/hack/bin/run-pint.sh
@@ -15,9 +15,11 @@ main () {
   PINT_CONFIG="${1:-test/conf/pint/pint-config.hcl}"
 
   if [[ "${2:-}" != "" ]]; then
-    mapfile -t PINT_FILES_LIST < <(grep -lr "team:.*${PINT_TEAM_FILTER}" "test/hack/output/generated/capi/capa-mimir/" | grep -v ".test.yml")
+    mapfile -t PINT_FILES_LIST < <(grep -lr "team:.*${PINT_TEAM_FILTER}" "test/hack/output/generated/capi/capa/" | grep -v ".test.yml")
+    mapfile -t PINT_FILES_LIST < <(grep -lr "team:.*${PINT_TEAM_FILTER}" "test/hack/output/generated/capi/capz/" | grep -v ".test.yml")
   else
-    mapfile -t PINT_FILES_LIST < <(find test/hack/output/generated/capi/capa-mimir/ -name "*.rules.yml")
+    mapfile -t PINT_FILES_LIST < <(find test/hack/output/generated/capi/capa/ -name "*.rules.yml")
+    mapfile -t PINT_FILES_LIST < <(find test/hack/output/generated/capi/capz/ -name "*.rules.yml")
   fi
 
   test/hack/bin/pint -c "$PINT_CONFIG" lint "${PINT_FILES_LIST[@]}"
diff --git a/test/hack/bin/template-chart.sh b/test/hack/bin/template-chart.sh
index 57dd769d6..5f9278a97 100755
--- a/test/hack/bin/template-chart.sh
+++ b/test/hack/bin/template-chart.sh
@@ -13,7 +13,6 @@ main() {
     echo "Templating chart for provider: $provider"
 
     [[ $provider =~ ([a-z]+)/([a-z]+)([-]*[a-z]*) ]]
-    [[ "${BASH_REMATCH[3]}" == "-mimir" ]] && mimir_enabled=true || mimir_enabled=false
 
     helm template \
       "$GIT_WORKDIR"/helm/prometheus-rules \
       --set="managementCluster.provider.kind=${BASH_REMATCH[2]}" \
       --set="managementCluster.name=myinstall" \
       --set="managementCluster.pipeline=stable" \
-      --set="mimir.enabled=$mimir_enabled" \
       --output-dir "$GIT_WORKDIR"/test/hack/output/helm-chart/"$provider"
 
     # Remove useless files for tests
diff --git
a/test/tests/providers/capi/capa-mimir/platform/atlas/alerting-rules/prometheus-agent.rules.test.yml b/test/tests/providers/capi/capa-mimir/platform/atlas/alerting-rules/prometheus-agent.rules.test.yml deleted file mode 100644 index bd05e856f..000000000 --- a/test/tests/providers/capi/capa-mimir/platform/atlas/alerting-rules/prometheus-agent.rules.test.yml +++ /dev/null @@ -1,216 +0,0 @@ ---- -# These tests differ between prometheus and mimir installations: the resulting labels are different -rule_files: -- prometheus-agent.rules.yml - -tests: - # Tests for `PrometheusAgentShardsMissing` alert - - interval: 1m - input_series: - - series: 'prometheus_remote_storage_metadata_total{cluster_id="test01", installation="myinstall", provider="aws", pipeline="testing", container="prometheus", instance="prometheus-agent", job="prometheus-agent", pod="prometheus-prometheus-agent-0", remote_name="806b63", service="prometheus-agent", team="atlas", url="https://myinstall/mycluster/api/v1/write"}' - values: "10000+0x180" - - series: 'prometheus_remote_storage_metadata_total{cluster_id="test01", installation="myinstall", provider="aws", pipeline="testing", container="prometheus", instance="prometheus-agent", job="prometheus-agent", pod="prometheus-prometheus-agent-shard-1-0", remote_name="806b63", service="prometheus-agent", team="atlas", url="https://myinstall/mycluster/api/v1/write"}' - values: "10000+0x180" - - series: 'prometheus_remote_storage_metadata_total{cluster_id="test01", installation="myinstall", provider="aws", pipeline="testing", container="prometheus", instance="prometheus-agent", job="prometheus-agent", pod="prometheus-prometheus-agent-shard-2-0", remote_name="806b63", service="prometheus-agent", team="atlas", url="https://myinstall/mycluster/api/v1/write"}' - values: "10000+0x180" - - series: 'prometheus_operator_spec_shards{cluster_id="test01", installation="myinstall", provider="aws", pipeline="testing", container="prometheus-operator-app", controller="prometheus", instance="prometheus-operator-app", job="prometheus-operator-app-operator", name="prometheus-agent", pod="prometheus-operator-app-operator-76b5899558-nz8h5", service="prometheus-operator-app-operator", team="atlas"}' - values: '3+0x60 5+0x60 3+0x60' - - series: 'prometheus_operator_spec_replicas{cluster_id="test01", installation="myinstall", provider="aws", pipeline="testing", container="prometheus-operator-app", controller="prometheus", instance="prometheus-operator-app", job="prometheus-operator-app-operator", name="prometheus-agent", pod="prometheus-operator-app-operator-76b5899558-nz8h5", service="prometheus-operator-app-operator", team="atlas"}' - values: '1+0x180' - alert_rule_test: - - alertname: PrometheusAgentShardsMissing - eval_time: 40m - - alertname: InhibitionPrometheusAgentShardsMissing - eval_time: 40m - - alertname: PrometheusAgentShardsMissing - eval_time: 120m - exp_alerts: - - exp_labels: - area: platform - cluster_id: test01 - installation: myinstall - provider: aws - pipeline: testing - severity: page - team: atlas - topic: observability - inhibit_monitoring_agent_down: "true" - cancel_if_cluster_is_not_running_monitoring_agent: "true" - cancel_if_cluster_status_creating: "true" - cancel_if_cluster_status_deleting: "true" - cancel_if_outside_working_hours: "true" - exp_annotations: - description: "Prometheus agent is missing shards." - opsrecipe: "prometheus-agent/" - summary: "Prometheus agent is missing shards." 
- - alertname: InhibitionPrometheusAgentShardsMissing - eval_time: 100m - exp_alerts: - - exp_labels: - area: platform - cluster_id: test01 - installation: myinstall - provider: aws - pipeline: testing - severity: none - team: atlas - topic: observability - inhibit_monitoring_agent_down: "true" - cancel_if_cluster_is_not_running_monitoring_agent: "true" - cancel_if_cluster_status_creating: "true" - cancel_if_cluster_status_deleting: "true" - cancel_if_outside_working_hours: "true" - exp_annotations: - description: "Prometheus agent is missing shards." - opsrecipe: "prometheus-agent/" - summary: "Prometheus agent is missing shards." - - alertname: PrometheusAgentShardsMissing - eval_time: 125m - exp_alerts: - - exp_labels: - area: platform - cluster_id: test01 - installation: myinstall - provider: aws - pipeline: testing - severity: page - team: atlas - topic: observability - inhibit_monitoring_agent_down: "true" - cancel_if_cluster_is_not_running_monitoring_agent: "true" - cancel_if_cluster_status_creating: "true" - cancel_if_cluster_status_deleting: "true" - cancel_if_outside_working_hours: "true" - exp_annotations: - description: "Prometheus agent is missing shards." - opsrecipe: "prometheus-agent/" - summary: "Prometheus agent is missing shards." - - alertname: InhibitionPrometheusAgentShardsMissing - eval_time: 125m - exp_alerts: - - exp_labels: - area: platform - cluster_id: test01 - installation: myinstall - provider: aws - pipeline: testing - severity: none - team: atlas - topic: observability - inhibit_monitoring_agent_down: "true" - cancel_if_cluster_is_not_running_monitoring_agent: "true" - cancel_if_cluster_status_creating: "true" - cancel_if_cluster_status_deleting: "true" - cancel_if_outside_working_hours: "true" - exp_annotations: - description: "Prometheus agent is missing shards." - opsrecipe: "prometheus-agent/" - summary: "Prometheus agent is missing shards." 
- - alertname: PrometheusAgentShardsMissing - eval_time: 130m - - alertname: InhibitionPrometheusAgentShardsMissing - eval_time: 130m - # Tests for `PrometheusAgentShardsMissing` alert with missing `prometheus_operator_spec_shards` metric - - interval: 1m - input_series: - - series: 'prometheus_remote_storage_metadata_total{cluster_id="test01", installation="myinstall", provider="aws", pipeline="testing", container="prometheus", instance="prometheus-agent", job="prometheus-agent", pod="prometheus-prometheus-agent-0", remote_name="806b63", service="prometheus-agent", team="atlas", url="https://myinstall/mycluster/api/v1/write"}' - values: "10000+0x180" - - series: 'prometheus_remote_storage_metadata_total{cluster_id="test01", installation="myinstall", provider="aws", pipeline="testing", container="prometheus", instance="prometheus-agent", job="prometheus-agent", pod="prometheus-prometheus-agent-shard-1-0", remote_name="806b63", service="prometheus-agent", team="atlas", url="https://myinstall/mycluster/api/v1/write"}' - values: "10000+0x180" - - series: 'prometheus_remote_storage_metadata_total{cluster_id="test01", installation="myinstall", provider="aws", pipeline="testing", container="prometheus", instance="prometheus-agent", job="prometheus-agent", pod="prometheus-prometheus-agent-shard-2-0", remote_name="806b63", service="prometheus-agent", team="atlas", url="https://myinstall/mycluster/api/v1/write"}' - values: "10000+0x180" - - series: 'prometheus_operator_spec_replicas{cluster_id="test01", installation="myinstall", provider="aws", pipeline="testing", container="prometheus-operator-app", controller="prometheus", instance="prometheus-operator-app", job="prometheus-operator-app-operator", name="prometheus-agent", pod="prometheus-operator-app-operator-76b5899558-nz8h5", service="prometheus-operator-app-operator", team="atlas"}' - values: '3+0x60 5+0x60 3+0x60' - alert_rule_test: - - alertname: PrometheusAgentShardsMissing - eval_time: 40m - - alertname: InhibitionPrometheusAgentShardsMissing - eval_time: 40m - - alertname: PrometheusAgentShardsMissing - eval_time: 120m - exp_alerts: - - exp_labels: - area: platform - cluster_id: test01 - installation: myinstall - provider: aws - pipeline: testing - severity: page - team: atlas - topic: observability - inhibit_monitoring_agent_down: "true" - cancel_if_cluster_is_not_running_monitoring_agent: "true" - cancel_if_cluster_status_creating: "true" - cancel_if_cluster_status_deleting: "true" - cancel_if_outside_working_hours: "true" - exp_annotations: - description: "Prometheus agent is missing shards." - opsrecipe: "prometheus-agent/" - summary: "Prometheus agent is missing shards." - - alertname: InhibitionPrometheusAgentShardsMissing - eval_time: 100m - exp_alerts: - - exp_labels: - area: platform - cluster_id: test01 - installation: myinstall - provider: aws - pipeline: testing - severity: none - team: atlas - topic: observability - inhibit_monitoring_agent_down: "true" - cancel_if_cluster_is_not_running_monitoring_agent: "true" - cancel_if_cluster_status_creating: "true" - cancel_if_cluster_status_deleting: "true" - cancel_if_outside_working_hours: "true" - exp_annotations: - description: "Prometheus agent is missing shards." - opsrecipe: "prometheus-agent/" - summary: "Prometheus agent is missing shards." 
- - alertname: PrometheusAgentShardsMissing - eval_time: 125m - exp_alerts: - - exp_labels: - area: platform - cluster_id: test01 - installation: myinstall - provider: aws - pipeline: testing - severity: page - team: atlas - topic: observability - inhibit_monitoring_agent_down: "true" - cancel_if_cluster_is_not_running_monitoring_agent: "true" - cancel_if_cluster_status_creating: "true" - cancel_if_cluster_status_deleting: "true" - cancel_if_outside_working_hours: "true" - exp_annotations: - description: "Prometheus agent is missing shards." - opsrecipe: "prometheus-agent/" - summary: "Prometheus agent is missing shards." - - alertname: InhibitionPrometheusAgentShardsMissing - eval_time: 125m - exp_alerts: - - exp_labels: - area: platform - cluster_id: test01 - installation: myinstall - provider: aws - pipeline: testing - severity: none - team: atlas - topic: observability - inhibit_monitoring_agent_down: "true" - cancel_if_cluster_is_not_running_monitoring_agent: "true" - cancel_if_cluster_status_creating: "true" - cancel_if_cluster_status_deleting: "true" - cancel_if_outside_working_hours: "true" - exp_annotations: - description: "Prometheus agent is missing shards." - opsrecipe: "prometheus-agent/" - summary: "Prometheus agent is missing shards." - - alertname: PrometheusAgentShardsMissing - eval_time: 130m - - alertname: InhibitionPrometheusAgentShardsMissing - eval_time: 130m diff --git a/test/tests/providers/capi/capa-mimir/platform/honeybadger/alerting-rules/zot.rules.test.yml b/test/tests/providers/capi/capa-mimir/platform/honeybadger/alerting-rules/zot.rules.test.yml deleted file mode 100644 index 6b130ff88..000000000 --- a/test/tests/providers/capi/capa-mimir/platform/honeybadger/alerting-rules/zot.rules.test.yml +++ /dev/null @@ -1,54 +0,0 @@ ---- -rule_files: - - zot.rules.yml - -tests: - - interval: 1m - input_series: - - series: 'kube_deployment_status_replicas_unavailable{cluster_type="management_cluster",namespace="zot",deployment="zot-zot"}' - values: '_x5 0x10 1x45' - alert_rule_test: - - alertname: ZotDeploymentNotSatisfied - eval_time: 46m - exp_alerts: - - exp_labels: - alertname: "ZotDeploymentNotSatisfied" - area: "platform" - cancel_if_cluster_status_creating: "true" - cancel_if_cluster_status_deleting: "true" - cancel_if_cluster_status_updating: "true" - cancel_if_outside_working_hours: "true" - cluster_type: "management_cluster" - deployment: "zot-zot" - namespace: "zot" - severity: "page" - team: "honeybadger" - topic: "managementcluster" - exp_annotations: - description: "Zot deployment zot/zot-zot is not satisfied." - opsrecipe: "zot/" - - interval: 1m - input_series: - - series: 'kubelet_volume_stats_available_bytes{namespace="zot", persistentvolumeclaim="zot-zot-pvc"}' - values: '50x30 20x30 15x30 5x60' - - series: 'kubelet_volume_stats_capacity_bytes{namespace="zot", persistentvolumeclaim="zot-zot-pvc"}' - values: '100x150' - alert_rule_test: - - alertname: ZotPersistentVolumeFillingUp - eval_time: 150m - exp_alerts: - - exp_labels: - alertname: "ZotPersistentVolumeFillingUp" - area: "platform" - cancel_if_cluster_status_creating: "true" - cancel_if_cluster_status_deleting: "true" - cancel_if_cluster_status_updating: "true" - cancel_if_outside_working_hours: "true" - namespace: "zot" - persistentvolumeclaim: "zot-zot-pvc" - severity: "page" - team: "honeybadger" - topic: "managementcluster" - exp_annotations: - description: "The Zot PersistentVolume claimed by zot-zot-pvc in namespace zot is at least 80% full and projected to fill up soon." 
- opsrecipe: "zot/" diff --git a/test/tests/providers/capi/capa-mimir/kaas/phoenix/alerting-rules/capa.inhibition.rules.test.yml b/test/tests/providers/capi/capa/kaas/phoenix/alerting-rules/capa.inhibition.rules.test.yml similarity index 100% rename from test/tests/providers/capi/capa-mimir/kaas/phoenix/alerting-rules/capa.inhibition.rules.test.yml rename to test/tests/providers/capi/capa/kaas/phoenix/alerting-rules/capa.inhibition.rules.test.yml diff --git a/test/tests/providers/capi/capa/kaas/turtles/alerting-rules/capi-cluster.rules.test.yml b/test/tests/providers/capi/capa/kaas/turtles/alerting-rules/capi-cluster.rules.test.yml new file mode 100644 index 000000000..71be1d4c3 --- /dev/null +++ b/test/tests/providers/capi/capa/kaas/turtles/alerting-rules/capi-cluster.rules.test.yml @@ -0,0 +1,69 @@ +rule_files: + - capi-cluster.rules.yml + +tests: + - interval: 1m + input_series: + - series: 'capi_cluster_status_phase{name="clippaxy", exported_namespace="giantswarm", phase="Provisioned"}' + values: "1+0x75" + - series: 'capi_cluster_status_phase{name="clippaxy", exported_namespace="giantswarm", phase="Pending"}' + values: "1+0x75" + - series: 'capi_cluster_status_condition{name="grumpy", exported_namespace="giantswarm", status="False", type="Ready"}' + values: "0+0x10 0+1x65" + - series: 'capi_cluster_status_condition{name="grumpy", exported_namespace="giantswarm", status="True", type="Ready"}' + values: "0+1x10 0+0x65" + - series: 'capi_cluster_annotation_paused{name="grumpy", exported_namespace="giantswarm", paused_value="true"}' + values: "0+1x75" + alert_rule_test: + - alertname: ClusterUnhealthyPhase + eval_time: 75m + exp_alerts: + - exp_labels: + area: kaas + cancel_if_monitoring_agent_down: "true" + cancel_if_outside_working_hours: "true" + severity: page + team: phoenix + topic: managementcluster + name: clippaxy + exported_namespace: giantswarm + phase: Pending + exp_annotations: + description: "Cluster giantswarm/clippaxy stuck in Pending phase." + opsrecipe: capi-cluster/ + dashboard: bdi7iswg81czkcasd/capi-agregated-error-logs-for-capi-controllers + - alertname: ClusterStatusNotReady + eval_time: 75m + exp_alerts: + - exp_labels: + area: kaas + cancel_if_monitoring_agent_down: "true" + cancel_if_outside_working_hours: "true" + severity: notify + team: phoenix + topic: managementcluster + name: grumpy + exported_namespace: giantswarm + status: "False" + type: Ready + exp_annotations: + description: "Cluster giantswarm/grumpy is not ready." + opsrecipe: capi-cluster/ + dashboard: bdi7iswg81czkcasd/capi-agregated-error-logs-for-capi-controllers + - alertname: ClusterPaused + eval_time: 75m + exp_alerts: + - exp_labels: + area: kaas + cancel_if_monitoring_agent_down: "true" + cancel_if_outside_working_hours: "true" + severity: notify + team: phoenix + topic: managementcluster + name: grumpy + exported_namespace: giantswarm + paused_value: "true" + exp_annotations: + description: "The cluster giantswarm/grumpy is paused." 
+ opsrecipe: capi-cluster/ + dashboard: bdi7iswg81czkcasd/capi-agregated-error-logs-for-capi-controllers diff --git a/test/tests/providers/capi/capa/kaas/turtles/alerting-rules/capi-kubeadmcontrolplane.rules.test.yml b/test/tests/providers/capi/capa/kaas/turtles/alerting-rules/capi-kubeadmcontrolplane.rules.test.yml new file mode 100644 index 000000000..2bcb3c23d --- /dev/null +++ b/test/tests/providers/capi/capa/kaas/turtles/alerting-rules/capi-kubeadmcontrolplane.rules.test.yml @@ -0,0 +1,52 @@ +rule_files: + - capi-kubeadmcontrolplane.rules.yml + +tests: + - interval: 1m + input_series: + - series: 'capi_kubeadmcontrolplane_spec_replicas{cluster_name="clippaxy", name="clippaxy-72jq5", exported_namespace="giantswarm"}' + values: "0+3x100" + - series: 'capi_kubeadmcontrolplane_status_replicas_ready{cluster_name="clippaxy", name="clippaxy-72jq5", exported_namespace="giantswarm"}' + values: "0+3x100" + - series: 'capi_kubeadmcontrolplane_spec_replicas{cluster_name="clippaxy", name="clippaxy-72jzy", exported_namespace="giantswarm"}' + values: "0+3x100" + - series: 'capi_kubeadmcontrolplane_status_replicas_ready{cluster_name="clippaxy", name="clippaxy-72jzy", exported_namespace="giantswarm"}' + values: "0+2x100" + - series: 'capi_kubeadmcontrolplane_annotation_paused{paused_value="true",cluster_name="grumpy", name="grumpy-72r5c", exported_namespace="giantswarm"}' + values: "0+1x75" + alert_rule_test: + - alertname: KubeadmControlPlaneReplicasMismatch + eval_time: 100m + exp_alerts: + - exp_labels: + area: kaas + cancel_if_monitoring_agent_down: "true" + cancel_if_outside_working_hours: "true" + severity: notify + team: phoenix + topic: managementcluster + cluster_name: clippaxy + name: clippaxy-72jzy + exported_namespace: giantswarm + exp_annotations: + description: "The clusters clippaxy kubeadmcontrolplane giantswarm/clippaxy-72jzy does not match the expected number of replicas for longer than 90 minutes." + opsrecipe: capi-kubeadmcontrolplane/ + dashboard: bdi7iswg81czkcasd/capi-agregated-error-logs-for-capi-controllers + - alertname: KubeadmControlPlanePaused + eval_time: 75m + exp_alerts: + - exp_labels: + area: kaas + cancel_if_monitoring_agent_down: "true" + cancel_if_outside_working_hours: "true" + severity: notify + team: phoenix + topic: managementcluster + cluster_name: grumpy + name: grumpy-72r5c + exported_namespace: giantswarm + paused_value: "true" + exp_annotations: + description: "The clusters grumpy kubeadmcontrolplane giantswarm/grumpy-72r5c is paused." 
+ opsrecipe: capi-kubeadmcontrolplane/ + dashboard: bdi7iswg81czkcasd/capi-agregated-error-logs-for-capi-controllers diff --git a/test/tests/providers/capi/capa/kaas/turtles/alerting-rules/capi-machine.rules.test.yml b/test/tests/providers/capi/capa/kaas/turtles/alerting-rules/capi-machine.rules.test.yml new file mode 100644 index 000000000..e85606129 --- /dev/null +++ b/test/tests/providers/capi/capa/kaas/turtles/alerting-rules/capi-machine.rules.test.yml @@ -0,0 +1,49 @@ +rule_files: + - capi-machine.rules.yml + +tests: + - interval: 1m + input_series: + - series: 'capi_machine_status_phase{cluster_name="clippaxy", name="clippaxy-72jq5", exported_namespace="giantswarm", phase="Running"}' + values: "1+0x10 0+0x35" + - series: 'capi_machine_status_phase{cluster_name="clippaxy", name="clippaxy-72jq5", exported_namespace="giantswarm", phase="Failed"}' + values: "0+0x10 1+0x35" + - series: 'capi_machine_annotation_paused{paused_value="true",cluster_name="grumpy", name="grumpy-72r5c", exported_namespace="giantswarm"}' + values: "0+1x75" + alert_rule_test: + - alertname: MachineUnhealthyPhase + eval_time: 45m + exp_alerts: + - exp_labels: + area: kaas + cancel_if_monitoring_agent_down: "true" + cancel_if_outside_working_hours: "true" + severity: page + team: phoenix + topic: managementcluster + cluster_name: clippaxy + name: clippaxy-72jq5 + exported_namespace: giantswarm + phase: Failed + exp_annotations: + description: "Machine giantswarm/clippaxy-72jq5 stuck in phase Failed for more than 30 minutes." + opsrecipe: capi-machine/ + dashboard: bdi7iswg81czkcasd/capi-agregated-error-logs-for-capi-controllers + - alertname: MachinePaused + eval_time: 75m + exp_alerts: + - exp_labels: + area: kaas + cancel_if_monitoring_agent_down: "true" + cancel_if_outside_working_hours: "true" + severity: notify + team: phoenix + topic: managementcluster + cluster_name: grumpy + name: grumpy-72r5c + exported_namespace: giantswarm + paused_value: "true" + exp_annotations: + description: "Machine giantswarm/grumpy-72r5c is paused." + opsrecipe: capi-machine/ + dashboard: bdi7iswg81czkcasd/capi-agregated-error-logs-for-capi-controllers diff --git a/test/tests/providers/capi/capa/kaas/turtles/alerting-rules/capi-machinedeployment.rules.test.yml b/test/tests/providers/capi/capa/kaas/turtles/alerting-rules/capi-machinedeployment.rules.test.yml new file mode 100644 index 000000000..9d9c1d913 --- /dev/null +++ b/test/tests/providers/capi/capa/kaas/turtles/alerting-rules/capi-machinedeployment.rules.test.yml @@ -0,0 +1,47 @@ +rule_files: + - capi-machinedeployment.rules.yml + +tests: + - interval: 1m + input_series: + - series: 'capi_machinedeployment_status_phase{phase="Failed", cluster_name="clippaxy", name="clippaxy-def00", exported_namespace="giantswarm"}' + values: "0+3x75" + - series: 'capi_machinedeployment_annotation_paused{paused_value="true",cluster_name="grumpy", name="grumpy-def99", exported_namespace="giantswarm"}' + values: "0+1x75" + alert_rule_test: + - alertname: MachineDeploymentIsNotHealthy + eval_time: 25m + exp_alerts: + - exp_labels: + area: kaas + cancel_if_monitoring_agent_down: "true" + cancel_if_outside_working_hours: "true" + severity: notify + phase: Failed + team: phoenix + topic: managementcluster + cluster_name: clippaxy + name: clippaxy-def00 + exported_namespace: giantswarm + exp_annotations: + description: "The clusters clippaxy machinedeployment giantswarm/clippaxy-def00 is not healthy." 
+ opsrecipe: capi-machinedeployment/ + dashboard: bdi7iswg81czkcasd/capi-agregated-error-logs-for-capi-controllers + - alertname: MachineDeploymentPaused + eval_time: 75m + exp_alerts: + - exp_labels: + area: kaas + cancel_if_monitoring_agent_down: "true" + cancel_if_outside_working_hours: "true" + severity: notify + team: phoenix + topic: managementcluster + cluster_name: grumpy + name: grumpy-def99 + exported_namespace: giantswarm + paused_value: "true" + exp_annotations: + description: "The clusters grumpy machinedeployment giantswarm/grumpy-def99 is paused." + opsrecipe: capi-machinedeployment/ + dashboard: bdi7iswg81czkcasd/capi-agregated-error-logs-for-capi-controllers diff --git a/test/tests/providers/capi/capa/kaas/turtles/alerting-rules/capi-machinepool.rules.test.yml b/test/tests/providers/capi/capa/kaas/turtles/alerting-rules/capi-machinepool.rules.test.yml new file mode 100644 index 000000000..70f519087 --- /dev/null +++ b/test/tests/providers/capi/capa/kaas/turtles/alerting-rules/capi-machinepool.rules.test.yml @@ -0,0 +1,47 @@ +rule_files: + - capi-machinepool.rules.yml + +tests: + - interval: 1m + input_series: + - series: 'capi_machinepool_status_phase{phase="Failed", cluster_name="clippaxy", name="clippaxy-def00", exported_namespace="giantswarm"}' + values: "0+3x75" + - series: 'capi_machinepool_annotation_paused{paused_value="true",cluster_name="grumpy", name="grumpy-72r5c", exported_namespace="giantswarm"}' + values: "0+1x75" + alert_rule_test: + - alertname: MachinePoolIsNotHealthy + eval_time: 25m + exp_alerts: + - exp_labels: + area: kaas + cancel_if_monitoring_agent_down: "true" + cancel_if_outside_working_hours: "true" + severity: page + phase: Failed + team: phoenix + topic: managementcluster + cluster_name: clippaxy + name: clippaxy-def00 + exported_namespace: giantswarm + exp_annotations: + description: "The clusters clippaxy machinepool giantswarm/clippaxy-def00 is not healthy." + opsrecipe: capi-machinepool/ + dashboard: bdi7iswg81czkcasd/capi-agregated-error-logs-for-capi-controllers + - alertname: MachinePoolPaused + eval_time: 75m + exp_alerts: + - exp_labels: + area: kaas + cancel_if_monitoring_agent_down: "true" + cancel_if_outside_working_hours: "true" + severity: notify + team: phoenix + topic: managementcluster + cluster_name: grumpy + name: grumpy-72r5c + exported_namespace: giantswarm + paused_value: "true" + exp_annotations: + description: "The clusters grumpy machinepool giantswarm/grumpy-72r5c is paused." 
+ opsrecipe: capi-machinepool/ + dashboard: bdi7iswg81czkcasd/capi-agregated-error-logs-for-capi-controllers diff --git a/test/tests/providers/capi/capa/kaas/turtles/alerting-rules/capi-machineset.rules.test.yml b/test/tests/providers/capi/capa/kaas/turtles/alerting-rules/capi-machineset.rules.test.yml new file mode 100644 index 000000000..d41639d87 --- /dev/null +++ b/test/tests/providers/capi/capa/kaas/turtles/alerting-rules/capi-machineset.rules.test.yml @@ -0,0 +1,27 @@ +rule_files: + - capi-machineset.rules.yml + +tests: + - interval: 1m + input_series: + - series: 'capi_machineset_annotation_paused{paused_value="true",cluster_name="grumpy", name="grumpy-def99", exported_namespace="giantswarm"}' + values: "0+1x75" + alert_rule_test: + - alertname: MachineSetPaused + eval_time: 75m + exp_alerts: + - exp_labels: + area: kaas + cancel_if_monitoring_agent_down: "true" + cancel_if_outside_working_hours: "true" + severity: notify + team: phoenix + topic: managementcluster + cluster_name: grumpy + name: grumpy-def99 + exported_namespace: giantswarm + paused_value: "true" + exp_annotations: + description: "Machineset giantswarm/grumpy-def99 is paused." + opsrecipe: capi-machineset/ + dashboard: bdi7iswg81czkcasd/capi-agregated-error-logs-for-capi-controllers diff --git a/test/tests/providers/capi/capa/kaas/turtles/alerting-rules/capi.rules.test.yml b/test/tests/providers/capi/capa/kaas/turtles/alerting-rules/capi.rules.test.yml new file mode 100644 index 000000000..c07f91b54 --- /dev/null +++ b/test/tests/providers/capi/capa/kaas/turtles/alerting-rules/capi.rules.test.yml @@ -0,0 +1,91 @@ +rule_files: + - capi.rules.yml + +tests: + - interval: 1m + input_series: + - series: 'capi_machine_status_phase{cluster_name="clippaxy", name="clippaxy-72jq5", exported_namespace="giantswarm", phase="Running"}' + values: "1+0x10 0+0x35" + - series: 'capi_machine_status_phase{cluster_name="clippaxy", name="clippaxy-72jq5", exported_namespace="giantswarm", phase="Failed"}' + values: "0+0x10 1+0x35" + alert_rule_test: + - alertname: MachineUnhealthyPhase + eval_time: 45m + exp_alerts: + - exp_labels: + area: kaas + cancel_if_outside_working_hours: "true" + severity: notify + team: phoenix + topic: managementcluster + cluster_name: clippaxy + name: clippaxy-72jq5 + exported_namespace: giantswarm + phase: Failed + exp_annotations: + description: "Machine giantswarm/clippaxy-72jq5 stuck in phase Failed for more than 30 minutes." + - interval: 1m + input_series: + - series: 'capi_machinepool_status_phase{phase="Failed", cluster_name="clippaxy", name="clippaxy-def00", exported_namespace="giantswarm"}' + values: "0+3x75" + alert_rule_test: + - alertname: MachinePoolIsNotHealthy + eval_time: 25m + exp_alerts: + - exp_labels: + area: kaas + cancel_if_outside_working_hours: "true" + severity: notify + team: phoenix + topic: managementcluster + cluster_name: clippaxy + name: clippaxy-def00 + exported_namespace: giantswarm + exp_annotations: + description: "The clusters clippaxy machinepool giantswarm/clippaxy-def00 is not healthy." 
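+ # The next case feeds two kubeadmcontrolplanes for cluster clippaxy:
+ # clippaxy-72jq5 keeps its ready replica count equal to the spec, while
+ # clippaxy-72jzy's ready count falls behind, so only the latter should
+ # raise KubeadmControlPlaneReplicasMismatch once the 90 minute window passes.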
+ - interval: 1m + input_series: + - series: 'capi_kubeadmcontrolplane_spec_replicas{cluster_name="clippaxy", name="clippaxy-72jq5", exported_namespace="giantswarm"}' + values: "0+3x100" + - series: 'capi_kubeadmcontrolplane_status_replicas_ready{cluster_name="clippaxy", name="clippaxy-72jq5", exported_namespace="giantswarm"}' + values: "0+3x100" + - series: 'capi_kubeadmcontrolplane_spec_replicas{cluster_name="clippaxy", name="clippaxy-72jzy", exported_namespace="giantswarm"}' + values: "0+3x100" + - series: 'capi_kubeadmcontrolplane_status_replicas_ready{cluster_name="clippaxy", name="clippaxy-72jzy", exported_namespace="giantswarm"}' + values: "0+2x100" + alert_rule_test: + - alertname: KubeadmControlPlaneReplicasMismatch + eval_time: 100m + exp_alerts: + - exp_labels: + area: kaas + cancel_if_outside_working_hours: "true" + severity: notify + team: phoenix + topic: managementcluster + cluster_name: clippaxy + name: clippaxy-72jzy + exported_namespace: giantswarm + exp_annotations: + description: "The clusters clippaxy kubeadmcontrolplane giantswarm/clippaxy-72jzy does not match the expected number of replicas for longer than 90 minutes." + - interval: 1m + input_series: + - series: 'capi_cluster_status_phase{name="clippaxy", exported_namespace="giantswarm", phase="Provisioned"}' + values: "1+0x75" + - series: 'capi_cluster_status_phase{name="clippaxy", exported_namespace="giantswarm", phase="Pending"}' + values: "1+0x75" + alert_rule_test: + - alertname: ClusterUnhealthyPhase + eval_time: 75m + exp_alerts: + - exp_labels: + area: kaas + cancel_if_outside_working_hours: "true" + severity: notify + team: phoenix + topic: managementcluster + name: clippaxy + exported_namespace: giantswarm + phase: Pending + exp_annotations: + description: "Cluster giantswarm/clippaxy is in a non healthy phase." 
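+ # Unlike the per-resource capi-*.rules tests in this directory, the aggregated
+ # capi.rules alerts above are expected to fire at severity notify only, without
+ # the opsrecipe/dashboard annotations or the cancel_if_monitoring_agent_down label.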
diff --git a/test/tests/providers/capi/capa/kaas/turtles/alerting-rules/certificate.all.rules.test.yml b/test/tests/providers/capi/capa/kaas/turtles/alerting-rules/certificate.all.rules.test.yml new file mode 100644 index 000000000..078f75d79 --- /dev/null +++ b/test/tests/providers/capi/capa/kaas/turtles/alerting-rules/certificate.all.rules.test.yml @@ -0,0 +1,94 @@ +--- +rule_files: + - certificate.all.rules.yml + +tests: + # CertificateSecretWillExpireInLessThanTwoWeeks within 2 weeks of expiration + - interval: 1d + input_series: + - series: 'cert_exporter_secret_not_after{app="cert-exporter-deployment", cluster_id="gollem", cluster_type="management_cluster", container="cert-exporter", customer="giantswarm", installation="gollem", instance="10.0.0.0:1234", job="gollem-prometheus/workload-gollem/0", name="athena-certs-secret", namespace="giantswarm", exported_namespace="giantswarm", node="10.0.0.0", organization="giantswarm", pod="cert-exporter-deployment-5c47b4c55c-49wt9", provider="aws", secretkey="tls.crt", service_priority="highest"}' + values: "2678400x60" + alert_rule_test: + - alertname: CertificateSecretWillExpireInLessThanTwoWeeks + eval_time: 20d + exp_alerts: + - exp_labels: + alertname: CertificateSecretWillExpireInLessThanTwoWeeks + app: cert-exporter-deployment + area: kaas + cancel_if_outside_working_hours: "true" + cluster_id: gollem + cluster_type: management_cluster + container: cert-exporter + customer: giantswarm + exported_namespace: giantswarm + instance: 10.0.0.0:1234 + job: gollem-prometheus/workload-gollem/0 + namespace: giantswarm + node: 10.0.0.0 + organization: giantswarm + pod: cert-exporter-deployment-5c47b4c55c-49wt9 + provider: aws + name: athena-certs-secret + installation: gollem + service_priority: highest + severity: page + secretkey: tls.crt + team: phoenix + topic: cert-manager + exp_annotations: + description: "Certificate stored in Secret giantswarm/athena-certs-secret on gollem will expire in less than two weeks." 
+ opsrecipe: "managed-app-cert-manager/certificate-secret-will-expire-in-less-than-two-weeks/" + # CertificateSecretWillExpireInLessThanTwoWeeks not within 2 weeks of expiration + - interval: 1d + input_series: + - series: 'cert_exporter_secret_not_after{app="cert-exporter-deployment", cluster_id="gollem", cluster_type="management_cluster", container="cert-exporter", customer="giantswarm", installation="gollem", instance="10.0.0.0:1234", job="gollem-prometheus/workload-gollem/0", name="athena-certs-secret", namespace="giantswarm", exported_namespace="giantswarm", node="10.0.0.0", organization="giantswarm", pod="cert-exporter-deployment-5c47b4c55c-49wt9", provider="aws", secretkey="tls.crt", service_priority="highest"}' + values: "2678400x60" + alert_rule_test: + - alertname: CertificateSecretWillExpireInLessThanTwoWeeks + eval_time: 10d + # GiantswarmManagedCertificateCRWillExpireInLessThanTwoWeeks within 2 weeks of expiration + - interval: 1d + input_series: + - series: 'cert_exporter_certificate_cr_not_after{app="cert-exporter-deployment", cluster_id="12345", cluster_type="workload_cluster", container="cert-exporter", customer="giantswarm", installation="gollem", instance="10.0.0.0:1234", issuer_ref="kiam-ca-issuer", job="12345-prometheus/workload-12345/0", managed_issuer="true", name="kiam-agent", namespace="kube-system", exported_namespace="kube-system", node="10.0.0.0", organization="giantswarm", pod="cert-exporter-deployment-57bbbfd856-8r8dr", provider="aws", service_priority="highest"}' + values: "2678400x60" + alert_rule_test: + - alertname: GiantswarmManagedCertificateCRWillExpireInLessThanTwoWeeks + eval_time: 20d + exp_alerts: + - exp_labels: + alertname: GiantswarmManagedCertificateCRWillExpireInLessThanTwoWeeks + app: cert-exporter-deployment + area: kaas + cancel_if_outside_working_hours: "true" + cluster_id: 12345 + cluster_type: workload_cluster + container: cert-exporter + customer: giantswarm + exported_namespace: kube-system + instance: 10.0.0.0:1234 + job: 12345-prometheus/workload-12345/0 + namespace: kube-system + node: 10.0.0.0 + organization: giantswarm + pod: cert-exporter-deployment-57bbbfd856-8r8dr + provider: aws + name: kiam-agent + installation: gollem + service_priority: highest + severity: page + team: phoenix + topic: cert-manager + issuer_ref: kiam-ca-issuer + managed_issuer: "true" + exp_annotations: + description: "Certificate CR kube-system/kiam-agent on 12345 will expire in less than two weeks." 
+ opsrecipe: "managed-app-cert-manager/certificate-secret-will-expire-in-less-than-two-weeks/" + # GiantswarmManagedCertificateCRWillExpireInLessThanTwoWeeks not within 2 weeks of expiration + - interval: 1d + input_series: + - series: 'cert_exporter_certificate_cr_not_after{app="cert-exporter-deployment", cluster_id="12345", cluster_type="workload_cluster", container="cert-exporter", customer="giantswarm", installation="gollem", instance="10.0.0.0:1234", issuer_ref="kiam-ca-issuer", job="12345-prometheus/workload-12345/0", managed_issuer="true", name="kiam-agent", namespace="kube-system", exported_namespace="kube-system", node="10.0.0.0", organization="giantswarm", pod="cert-exporter-deployment-57bbbfd856-8r8dr", provider="aws", service_priority="highest"}' + values: "2678400x60" + alert_rule_test: + - alertname: GiantswarmManagedCertificateCRWillExpireInLessThanTwoWeeks + eval_time: 10d diff --git a/test/tests/providers/capi/capa/kaas/turtles/alerting-rules/node-exporter.rules.test.yml b/test/tests/providers/capi/capa/kaas/turtles/alerting-rules/node-exporter.rules.test.yml new file mode 100644 index 000000000..786acc105 --- /dev/null +++ b/test/tests/providers/capi/capa/kaas/turtles/alerting-rules/node-exporter.rules.test.yml @@ -0,0 +1,38 @@ +--- +rule_files: + - node-exporter.rules.yml + +tests: + # NodeExporterCollectorFailed tests + - interval: 1m + input_series: + # No data for 20 minutes, then all good, then cpu collector fails, then bonding collector fails + - series: 'node_scrape_collector_success{app="node-exporter", collector="cpu", instance="10.0.5.111:10300"}' + values: "_x20 1+0x20 0+0x20 1+0x20" + - series: 'node_scrape_collector_success{app="node-exporter", collector="bonding", instance="10.0.5.111:10300"}' + values: "_x20 1+0x20 1+0x20 0+0x20" + alert_rule_test: + - alertname: NodeExporterCollectorFailed + eval_time: 10m + - alertname: NodeExporterCollectorFailed + eval_time: 30m + - alertname: NodeExporterCollectorFailed + eval_time: 50m + exp_alerts: + - exp_labels: + alertname: NodeExporterCollectorFailed + app: "node-exporter" + area: "kaas" + cancel_if_outside_working_hours: "true" + collector: "cpu" + instance: "10.0.5.111:10300" + severity: "page" + team: "phoenix" + topic: "observability" + exp_annotations: + description: "NodeExporter Collector cpu on 10.0.5.111:10300 is failed." 
+ opsrecipe: "node-exporter-device-error/" + - alertname: NodeExporterCollectorFailed + eval_time: 70m + + diff --git a/test/tests/providers/capi/capa-mimir/platform/atlas/alerting-rules/grafana-cloud.test.yml b/test/tests/providers/capi/capa/platform/atlas/alerting-rules/grafana-cloud.test.yml similarity index 100% rename from test/tests/providers/capi/capa-mimir/platform/atlas/alerting-rules/grafana-cloud.test.yml rename to test/tests/providers/capi/capa/platform/atlas/alerting-rules/grafana-cloud.test.yml diff --git a/test/tests/providers/capi/capa-mimir/platform/atlas/alerting-rules/mimir.rules.test.yml b/test/tests/providers/capi/capa/platform/atlas/alerting-rules/mimir.rules.test.yml similarity index 100% rename from test/tests/providers/capi/capa-mimir/platform/atlas/alerting-rules/mimir.rules.test.yml rename to test/tests/providers/capi/capa/platform/atlas/alerting-rules/mimir.rules.test.yml diff --git a/test/tests/providers/capi/capa/platform/atlas/alerting-rules/prometheus-agent.rules.test.yml b/test/tests/providers/capi/capa/platform/atlas/alerting-rules/prometheus-agent.rules.test.yml index f539b2347..bd05e856f 100644 --- a/test/tests/providers/capi/capa/platform/atlas/alerting-rules/prometheus-agent.rules.test.yml +++ b/test/tests/providers/capi/capa/platform/atlas/alerting-rules/prometheus-agent.rules.test.yml @@ -4,98 +4,6 @@ rule_files: - prometheus-agent.rules.yml tests: - # Tests for `PrometheusAgentFailing` alert - - interval: 1m - input_series: - - series: 'up{instance="prometheus-agent",cluster_id="gauss", cluster_type="workload_cluster", installation="myinstall", customer="giantswarm", pipeline="testing", provider="capa", region="eu-west-2", job="prometheus-agent"}' - values: "_x60 0+0x60 1+0x60" - - series: 'capi_cluster_status_condition{cluster_id="gauss", cluster_type="workload_cluster", installation="myinstall", customer="giantswarm", pipeline="testing", provider="capa", region="eu-west-2", status="True", type="ControlPlaneReady", name="gauss"}' - values: "1+0x180" - alert_rule_test: - - alertname: PrometheusAgentFailing - eval_time: 30m - exp_alerts: - - exp_labels: - area: platform - severity: page - team: atlas - topic: observability - inhibit_monitoring_agent_down: "true" - instance: prometheus-agent - cancel_if_cluster_has_no_workers: "true" - cancel_if_cluster_is_not_running_monitoring_agent: "true" - cancel_if_cluster_status_creating: "true" - cancel_if_cluster_status_deleting: "true" - exp_annotations: - dashboard: "promRW001/prometheus-remote-write" - description: "Prometheus agent remote write is failing." - opsrecipe: "prometheus-agent/" - summary: "Prometheus agent fails to send samples to remote write endpoint." - - alertname: InhibitionPrometheusAgentFailing - eval_time: 30m - exp_alerts: - - exp_labels: - area: platform - severity: none - team: atlas - topic: observability - inhibit_monitoring_agent_down: "true" - instance: prometheus-agent - cancel_if_cluster_is_not_running_monitoring_agent: "true" - cancel_if_cluster_status_creating: "true" - cancel_if_cluster_status_deleting: "true" - exp_annotations: - dashboard: "promRW001/prometheus-remote-write" - description: "Prometheus agent remote write is failing." - opsrecipe: "prometheus-agent/" - summary: "Prometheus agent fails to send samples to remote write endpoint." 
- - alertname: PrometheusAgentFailing - eval_time: 90m - exp_alerts: - - exp_labels: - area: platform - cluster_id: gauss - cluster_type: workload_cluster - severity: page - team: atlas - topic: observability - inhibit_monitoring_agent_down: "true" - installation: myinstall - instance: prometheus-agent - cancel_if_cluster_has_no_workers: "true" - cancel_if_cluster_is_not_running_monitoring_agent: "true" - cancel_if_cluster_status_creating: "true" - cancel_if_cluster_status_deleting: "true" - exp_annotations: - dashboard: "promRW001/prometheus-remote-write" - description: "Prometheus agent remote write is failing." - opsrecipe: "prometheus-agent/" - summary: "Prometheus agent fails to send samples to remote write endpoint." - - alertname: InhibitionPrometheusAgentFailing - eval_time: 90m - exp_alerts: - - exp_labels: - area: platform - cluster_id: gauss - cluster_type: workload_cluster - severity: none - team: atlas - topic: observability - inhibit_monitoring_agent_down: "true" - installation: myinstall - instance: prometheus-agent - cancel_if_cluster_is_not_running_monitoring_agent: "true" - cancel_if_cluster_status_creating: "true" - cancel_if_cluster_status_deleting: "true" - exp_annotations: - dashboard: "promRW001/prometheus-remote-write" - description: "Prometheus agent remote write is failing." - opsrecipe: "prometheus-agent/" - summary: "Prometheus agent fails to send samples to remote write endpoint." - - alertname: PrometheusAgentFailing - eval_time: 150m - - alertname: InhibitionPrometheusAgentFailing - eval_time: 150m # Tests for `PrometheusAgentShardsMissing` alert - interval: 1m input_series: diff --git a/test/tests/providers/capi/capa/platform/shield/alerting-rules/cert-manager.rules.test.yml b/test/tests/providers/capi/capa/platform/shield/alerting-rules/cert-manager.rules.test.yml new file mode 100644 index 000000000..00167d085 --- /dev/null +++ b/test/tests/providers/capi/capa/platform/shield/alerting-rules/cert-manager.rules.test.yml @@ -0,0 +1,46 @@ +--- +rule_files: + - cert-manager.rules.yml + +tests: + - interval: 1m + input_series: + - series: 'up{cluster_id="12345", cluster_type="workload_cluster", container="cert-manager", customer="giantswarm", installation="golem", instance="10.0.0.0:1234", job="12345-prometheus/workload-12345/0", namespace="kube-system", organization="giantswarm", pod="cert-manager-controller-7fcc585578-gnprd", provider="capa", service_priority="highest"}' + values: "0+0x60" + alert_rule_test: + - alertname: CertManagerDown + eval_time: 15m + exp_alerts: + - exp_labels: + alertname: CertManagerDown + area: platform + cancel_if_cluster_status_creating: "true" + cancel_if_cluster_status_deleting: "true" + cancel_if_kubelet_down: "true" + cancel_if_outside_working_hours: "true" + cluster_id: 12345 + cluster_type: workload_cluster + container: cert-manager + customer: giantswarm + instance: 10.0.0.0:1234 + ip: 10.0.0.0 + job: 12345-prometheus/workload-12345/0 + namespace: kube-system + organization: giantswarm + pod: cert-manager-controller-7fcc585578-gnprd + provider: capa + installation: golem + service_priority: highest + severity: page + team: shield + topic: cert-manager + exp_annotations: + description: "cert-manager in namespace kube-system is down." 
+ opsrecipe: "cert-manager-down/" + - interval: 1m + input_series: + - series: 'up{cluster_id="12345", cluster_type="workload_cluster", container="cert-manager", customer="giantswarm", installation="golem", instance="10.0.0.0:1234", job="12345-prometheus/workload-12345/0", namespace="kube-system", organization="giantswarm", pod="cert-manager-controller-7fcc585578-gnprd", provider="capa", service_priority="highest"}' + values: "1+0x60" + alert_rule_test: + - alertname: CertManagerDown + eval_time: 15m diff --git a/test/tests/providers/capi/capa/platform/shield/alerting-rules/teleport.rules.test.yml b/test/tests/providers/capi/capa/platform/shield/alerting-rules/teleport.rules.test.yml new file mode 100644 index 000000000..2ab1f7c20 --- /dev/null +++ b/test/tests/providers/capi/capa/platform/shield/alerting-rules/teleport.rules.test.yml @@ -0,0 +1,59 @@ +--- +rule_files: + - 'teleport.rules.yml' + +tests: + - interval: 1m + input_series: + - series: 'kube_secret_created{cluster_id="my-cluster", installation="golem", secret="my-cluster-teleport-join-token"}' + values: "1+0x150" + - series: 'capi_cluster_status_phase{cluster_id="my-cluster", installation="golem", phase="Provisioned"}' + values: "2+0x150" + alert_rule_test: + - alertname: TeleportJoinTokenSecretMismatch + eval_time: 30m + exp_alerts: [] + - alertname: TeleportJoinTokenSecretMismatch + eval_time: 140m + exp_alerts: + - exp_labels: + alertname: TeleportJoinTokenSecretMismatch + area: platform + cancel_if_cluster_status_creating: "true" + cancel_if_cluster_status_updating: "true" + cancel_if_cluster_status_deleting: "true" + cancel_if_outside_working_hours: "true" + cluster_id: my-cluster + installation: golem + severity: notify + team: shield + topic: teleport + exp_annotations: + description: "Mismatch in number of teleport-join-token secrets and clusters" + - interval: 1m + input_series: + - series: 'kube_configmap_info{cluster_id="my-cluster", installation="grizzly", configmap="my-cluster-teleport-kube-agent-config"}' + values: "1+0x150" + - series: 'capi_cluster_status_phase{cluster_id="my-cluster", installation="grizzly", phase="Provisioned"}' + values: "2+0x150" + alert_rule_test: + - alertname: TeleportKubeAgentConfigMapMismatch + eval_time: 30m + exp_alerts: [] + - alertname: TeleportKubeAgentConfigMapMismatch + eval_time: 140m + exp_alerts: + - exp_labels: + alertname: TeleportKubeAgentConfigMapMismatch + area: platform + cancel_if_cluster_status_creating: "true" + cancel_if_cluster_status_updating: "true" + cancel_if_cluster_status_deleting: "true" + cancel_if_outside_working_hours: "true" + cluster_id: my-cluster + installation: grizzly + severity: notify + team: shield + topic: teleport + exp_annotations: + description: "Mismatch in number of teleport-kube-agent-config secrets and clusters" diff --git a/test/tests/providers/capi/capz/platform/atlas/alerting-rules/grafana-cloud.test.yml b/test/tests/providers/capi/capz/platform/atlas/alerting-rules/grafana-cloud.test.yml new file mode 100644 index 000000000..79c5aa0f1 --- /dev/null +++ b/test/tests/providers/capi/capz/platform/atlas/alerting-rules/grafana-cloud.test.yml @@ -0,0 +1,156 @@ +--- +rule_files: +- grafana-cloud.rules.yml + +tests: + # Tests for `MimirToGrafanaCloudExporterDown` alert + - interval: 1m + input_series: + - series: 'up{job="mimir/mimir-to-grafana-cloud", cluster_id="myinstall", cluster_type="management_cluster", installation="myinstall", namespace="mimir", customer="giantswarm", pipeline="stable", provider="capa", region="eu-west-2"}' + values: 
"_x60 1+0x60 0+0x60 1+0x60" + alert_rule_test: + - alertname: MimirToGrafanaCloudExporterDown + eval_time: 50m + exp_alerts: + - exp_labels: + area: platform + cancel_if_cluster_status_creating: "true" + cancel_if_cluster_status_deleting: "true" + cancel_if_cluster_status_updating: "true" + cancel_if_outside_working_hours: "true" + cluster_id: myinstall + cluster_type: management_cluster + installation: myinstall + job: mimir/mimir-to-grafana-cloud + pipeline: stable + provider: capa + severity: page + team: atlas + topic: observability + exp_annotations: + dashboard: "iWowmlSmk/prometheus?var-cluster=mimir-to-grafana-cloud" + description: "Prometheus Mimir to Grafana-Cloud is down." + opsrecipe: "mimir-grafana-cloud-exporter-failing/" + - alertname: MimirToGrafanaCloudExporterDown + eval_time: 70m + - alertname: MimirToGrafanaCloudExporterDown + eval_time: 160m + exp_alerts: + - exp_labels: + area: platform + cancel_if_cluster_status_creating: "true" + cancel_if_cluster_status_deleting: "true" + cancel_if_cluster_status_updating: "true" + cancel_if_outside_working_hours: "true" + cluster_id: myinstall + cluster_type: management_cluster + customer: giantswarm + installation: myinstall + job: mimir/mimir-to-grafana-cloud + namespace: mimir + pipeline: stable + provider: capa + region: eu-west-2 + severity: page + team: atlas + topic: observability + exp_annotations: + dashboard: "iWowmlSmk/prometheus?var-cluster=mimir-to-grafana-cloud" + description: "Prometheus Mimir to Grafana-Cloud is down." + opsrecipe: "mimir-grafana-cloud-exporter-failing/" + - alertname: MimirToGrafanaCloudExporterDown + eval_time: 200m + # Tests for `MimirToGrafanaCloudExporterFailures` alert + - interval: 1m + input_series: + # remote read is working for 2 hours and then fails for 1 hour + - series: 'prometheus_remote_storage_read_queries_total{code="200", job="mimir/mimir-to-grafana-cloud", cluster_id="myinstall", customer="giantswarm", installation="myinstall", namespace="mimir", pipeline="testing", provider="capa", region="eu-west-2"}' + values: "_x60 0+10x60 0+0x60 0+10x180" + # remote write has no failure for 4 hours and then fails for 2 hours + - series: 'prometheus_remote_storage_samples_failed_total{job="mimir/mimir-to-grafana-cloud", cluster_id="myinstall", customer="giantswarm", installation="myinstall", namespace="mimir", pipeline="testing", provider="capa", region="eu-west-2"}' + values: "_x60 0+0x180 0+10x120" + alert_rule_test: + - alertname: MimirToGrafanaCloudExporterFailures + eval_time: 70m + - alertname: MimirToGrafanaCloudExporterFailures + eval_time: 160m + exp_alerts: + - exp_labels: + area: platform + severity: page + team: atlas + topic: observability + cancel_if_cluster_status_creating: "true" + cancel_if_cluster_status_deleting: "true" + cancel_if_cluster_status_updating: "true" + cancel_if_outside_working_hours: "true" + cluster_id: "myinstall" + installation: "myinstall" + pipeline: "testing" + provider: "capa" + exp_annotations: + dashboard: "promRW001/prometheus-remote-write" + description: "Prometheus Mimir to Grafana-Cloud is failing to read or write data." 
+ opsrecipe: "mimir-grafana-cloud-exporter-failing/" + - alertname: MimirToGrafanaCloudExporterFailures + eval_time: 200m + - alertname: MimirToGrafanaCloudExporterFailures + eval_time: 280m + exp_alerts: + - exp_labels: + area: platform + severity: page + team: atlas + topic: observability + cancel_if_cluster_status_creating: "true" + cancel_if_cluster_status_deleting: "true" + cancel_if_cluster_status_updating: "true" + cancel_if_outside_working_hours: "true" + cluster_id: "myinstall" + installation: "myinstall" + pipeline: "testing" + provider: "capa" + exp_annotations: + dashboard: "promRW001/prometheus-remote-write" + description: "Prometheus Mimir to Grafana-Cloud is failing to read or write data." + opsrecipe: "mimir-grafana-cloud-exporter-failing/" + # Tests for `MimirToGrafanaCloudExporterTooManyRestarts` alert + - interval: 1m + input_series: + # remote read is working for 2 hours and then fails for 1 hour + - series: 'kube_pod_status_ready{condition="true", uid="0bb4e0cc-12df-4085-8d39-8e08b9c64ea5", pod="prometheus-mimir-to-grafana-cloud-0", cluster_id="myinstall", customer="giantswarm", installation="myinstall", namespace="mimir", pipeline="testing", provider="capa", region="eu-west-2"}' + values: "_x60 1+0x60 _x80" + - series: 'kube_pod_status_ready{condition="true", uid="0bb4e0cc-12df-4085-8d39-8e08b9c64ea6", pod="prometheus-mimir-to-grafana-cloud-0", cluster_id="myinstall", customer="giantswarm", installation="myinstall", namespace="mimir", pipeline="testing", provider="capa", region="eu-west-2"}' + values: "_x122 1+0x2 _x78" + - series: 'kube_pod_status_ready{condition="true", uid="0bb4e0cc-12df-4085-8d39-8e08b9c64ea7", pod="prometheus-mimir-to-grafana-cloud-0", cluster_id="myinstall", customer="giantswarm", installation="myinstall", namespace="mimir", pipeline="testing", provider="capa", region="eu-west-2"}' + values: "_x124 1+0x2 _x76" + - series: 'kube_pod_status_ready{condition="true", uid="0bb4e0cc-12df-4085-8d39-8e08b9c64ea8", pod="prometheus-mimir-to-grafana-cloud-0", cluster_id="myinstall", customer="giantswarm", installation="myinstall", namespace="mimir", pipeline="testing", provider="capa", region="eu-west-2"}' + values: "_x126 1+0x2 _x74" + - series: 'kube_pod_status_ready{condition="true", uid="0bb4e0cc-12df-4085-8d39-8e08b9c64ea9", pod="prometheus-mimir-to-grafana-cloud-0", cluster_id="myinstall", customer="giantswarm", installation="myinstall", namespace="mimir", pipeline="testing", provider="capa", region="eu-west-2"}' + values: "_x128 1+0x72" + alert_rule_test: + - alertname: MimirToGrafanaCloudExporterTooManyRestarts + eval_time: 70m + - alertname: MimirToGrafanaCloudExporterTooManyRestarts + eval_time: 140m + exp_alerts: + - exp_labels: + area: platform + severity: page + team: atlas + topic: observability + cancel_if_cluster_status_creating: "true" + cancel_if_cluster_status_deleting: "true" + cancel_if_cluster_status_updating: "true" + cancel_if_outside_working_hours: "true" + pod: "prometheus-mimir-to-grafana-cloud-0" + cluster_id: "myinstall" + installation: "myinstall" + pipeline: "testing" + provider: "capa" + exp_annotations: + dashboard: "promRW001/prometheus-remote-write" + description: "Prometheus Mimir to Grafana-Cloud is restarting too much." 
+ opsrecipe: "mimir-grafana-cloud-exporter-failing/" + - alertname: MimirToGrafanaCloudExporterTooManyRestarts + eval_time: 180m diff --git a/test/tests/providers/capi/capz/platform/atlas/alerting-rules/mimir.rules.test.yml b/test/tests/providers/capi/capz/platform/atlas/alerting-rules/mimir.rules.test.yml new file mode 100644 index 000000000..6bdfeaeab --- /dev/null +++ b/test/tests/providers/capi/capz/platform/atlas/alerting-rules/mimir.rules.test.yml @@ -0,0 +1,392 @@ +--- +rule_files: + - mimir.rules.yml + +tests: + - interval: 1m + input_series: + # For the first 60min: test with 1 pod: up, none, up, down, up + - series: 'up{job="mimir/ingester", container="ingester"}' + values: "1+0x60 _x30 1+0x30 0+0x30 1+0x30" + alert_rule_test: + - alertname: Heartbeat + eval_time: 20m + exp_alerts: + - exp_labels: + area: platform + job: mimir/ingester + container: ingester + installation: myinstall + team: atlas + topic: observability + type: mimir-heartbeat + exp_annotations: + description: "This alert is used to ensure the entire alerting pipeline is functional." + opsrecipe: "mimir/" + - alertname: Heartbeat + eval_time: 70m + - alertname: Heartbeat + eval_time: 95m + exp_alerts: + - exp_labels: + area: platform + job: mimir/ingester + container: ingester + installation: myinstall + team: atlas + topic: observability + type: mimir-heartbeat + exp_annotations: + description: "This alert is used to ensure the entire alerting pipeline is functional." + opsrecipe: "mimir/" + - alertname: Heartbeat + eval_time: 140m + - alertname: Heartbeat + eval_time: 165m + exp_alerts: + - exp_labels: + area: platform + job: mimir/ingester + container: ingester + installation: myinstall + team: atlas + topic: observability + type: mimir-heartbeat + exp_annotations: + description: "This alert is used to ensure the entire alerting pipeline is functional." + opsrecipe: "mimir/" + - interval: 1m + input_series: + # For the first 60min: test with 1 pod: none, up, down + - series: 'up{job="mimir/ingester", container="ingester", cluster_type="management_cluster", cluster_id="gauss", installation="gauss", provider="aws", pipeline="testing", service="mimir-ingester"}' + values: "_x20 1+0x20 0+0x20" + alert_rule_test: + - alertname: MimirComponentDown + eval_time: 10m + - alertname: MimirComponentDown + eval_time: 30m + - alertname: MimirComponentDown + eval_time: 50m + exp_alerts: + - exp_labels: + service: mimir-ingester + area: platform + severity: page + team: atlas + topic: observability + cancel_if_cluster_status_creating: "true" + cancel_if_cluster_status_deleting: "true" + cancel_if_cluster_status_updating: "true" + cancel_if_outside_working_hours: "true" + cluster_id: gauss + installation: gauss + provider: aws + pipeline: testing + exp_annotations: + dashboard: ffcd83628d7d4b5a03d1cafd159e6c9c/mimir-overview + description: "Mimir component : mimir-ingester is down." 
+ opsrecipe: "mimir/" + - interval: 1m + input_series: + # test: none, rate > 0, rate = 0 + - series: 'mimir_rules_events_failed_total{cluster_type="management_cluster", cluster_id="golem", installation="golem", namespace="mimir"}' + values: "_x20 1+1x80 0+0x70" + alert_rule_test: + - alertname: MimirRulerEventsFailed + eval_time: 40m + - alertname: MimirRulerEventsFailed + eval_time: 95m + exp_alerts: + - exp_labels: + area: platform + cancel_if_outside_working_hours: "true" + cancel_if_cluster_status_creating: "true" + cancel_if_cluster_status_deleting: "true" + cancel_if_cluster_status_updating: "true" + cluster_id: golem + cluster_type: management_cluster + installation: golem + namespace: mimir + severity: page + team: atlas + topic: observability + exp_annotations: + dashboard: 631e15d5d85afb2ca8e35d62984eeaa0/mimir-ruler + description: "Mimir ruler is failing to process PrometheusRules." + opsrecipe: "mimir/" + - alertname: MimirRulerEventsFailed + eval_time: 160m + - interval: 1m + input_series: + - series: 'kube_pod_container_status_restarts_total{cluster_type="management_cluster", namespace="mimir", container="mimir-ingester"}' + values: "0+0x20 0+5x20 100+0x140" # 0 restarts after 20 minutes then we restart 5 times per minute for 20 minutes then we stop restarting for 140 minutes + - series: 'kube_pod_container_status_restarts_total{cluster_type="management_cluster", namespace="mimir", container="prometheus"}' + values: "0+5x180" # prometheus container restarts 5 times per minute for 180 minutes + alert_rule_test: + - alertname: MimirRestartingTooOften + eval_time: 15m # should be OK after 15 minutes + - alertname: MimirRestartingTooOften + eval_time: 85m # After 85 minutes, should fire an alert for the t+85 error + exp_alerts: + - exp_labels: + all_pipelines: "true" + area: platform + cancel_if_outside_working_hours: "true" + cluster_type: management_cluster + container: mimir-ingester + namespace: mimir + severity: page + team: atlas + topic: observability + exp_annotations: + dashboard: ffcd83628d7d4b5a03d1cafd159e6c9c/mimir-overview + description: Mimir containers are restarting too often. + opsrecipe: "mimir/" + - alertname: MimirRestartingTooOften + eval_time: 140m # After 140m minutes, all should be back to normal + # Test for MimirIngesterNeedsToBeScaledUp alert + - interval: 1m + input_series: + # mimir-ingester real memory usage gradually increases until it goes beyond 90% of the memory requests. + - series: 'container_memory_working_set_bytes{pod="mimir-ingester-0", container="ingester", namespace="mimir", cluster_type="management_cluster", cluster_id="golem", installation="golem", pipeline="testing", provider="capa", region="eu-west-2"}' + values: "8+0x20 11+0x70 8+0x140 11+0x70 8+0x60" + - series: 'container_memory_working_set_bytes{pod="mimir-ingester-1", container="ingester", namespace="mimir", cluster_type="management_cluster", cluster_id="golem", installation="golem", pipeline="testing", provider="capa", region="eu-west-2"}' + values: "8+0x20 11+0x70 8+0x140 11+0x70 8+0x60" + # mimir-ingester memory requests stay the same for the entire duration of the test. 
+ - series: 'kube_pod_container_resource_requests{pod="mimir-ingester-0", container="ingester", namespace="mimir", unit="byte", cluster_type="management_cluster", cluster_id="golem", installation="golem", pipeline="testing", provider="capa", region="eu-west-2"}' + values: "12+0x400" + - series: 'kube_pod_container_resource_requests{pod="mimir-ingester-1", container="ingester", namespace="mimir", unit="byte", cluster_type="management_cluster", cluster_id="golem", installation="golem", pipeline="testing", provider="capa", region="eu-west-2"}' + values: "12+0x400" + # mimir-ingester real cpu usage gradually increases until it goes beyond 90% of the cpu requests. + - series: 'container_cpu_usage_seconds_total{pod="mimir-ingester-0", container="ingester", namespace="mimir", cluster_type="management_cluster", cluster_id="golem", installation="golem", pipeline="testing", provider="capa", region="eu-west-2"}' + values: "0+60x100 6000+110x70 10400+60x60 14000+110x70 18400+60x60" + - series: 'container_cpu_usage_seconds_total{pod="mimir-ingester-1", container="ingester", namespace="mimir", cluster_type="management_cluster", cluster_id="golem", installation="golem", pipeline="testing", provider="capa", region="eu-west-2"}' + values: "0+60x400" + # mimir-ingester cpu requests stay the same for the entire duration of the test. + - series: 'kube_pod_container_resource_requests{pod="mimir-ingester-0", container="ingester", namespace="mimir", unit="core", cluster_type="management_cluster", cluster_id="golem", installation="golem", pipeline="testing", provider="capa", region="eu-west-2"}' + values: "1.5+0x400" + - series: 'kube_pod_container_resource_requests{pod="mimir-ingester-1", container="ingester", namespace="mimir", unit="core", cluster_type="management_cluster", cluster_id="golem", installation="golem", pipeline="testing", provider="capa", region="eu-west-2"}' + values: "1.5+0x400" + alert_rule_test: + - alertname: MimirIngesterNeedsToBeScaledUp + eval_time: 15m + - alertname: MimirIngesterNeedsToBeScaledUp + eval_time: 85m + exp_alerts: + - exp_labels: + area: platform + cancel_if_cluster_status_creating: "true" + cancel_if_cluster_status_deleting: "true" + cancel_if_cluster_status_updating: "true" + cancel_if_outside_working_hours: "true" + cluster_id: golem + installation: "golem" + pipeline: "testing" + provider: "capa" + namespace: mimir + severity: page + team: atlas + topic: observability + exp_annotations: + description: Mimir ingester is consuming too much resources and needs to be scaled up. + opsrecipe: "mimir-ingester/" + - alertname: MimirIngesterNeedsToBeScaledUp + eval_time: 130m + - alertname: MimirIngesterNeedsToBeScaledUp + eval_time: 170m + exp_alerts: + - exp_labels: + area: platform + cancel_if_cluster_status_creating: "true" + cancel_if_cluster_status_deleting: "true" + cancel_if_cluster_status_updating: "true" + cancel_if_outside_working_hours: "true" + cluster_id: golem + installation: "golem" + pipeline: "testing" + provider: "capa" + namespace: mimir + severity: page + team: atlas + topic: observability + exp_annotations: + description: Mimir ingester is consuming too much resources and needs to be scaled up. 
+ opsrecipe: "mimir-ingester/" + - alertname: MimirIngesterNeedsToBeScaledUp + eval_time: 210m + - alertname: MimirIngesterNeedsToBeScaledUp + eval_time: 295m + exp_alerts: + - exp_labels: + area: platform + cancel_if_cluster_status_creating: "true" + cancel_if_cluster_status_deleting: "true" + cancel_if_cluster_status_updating: "true" + cancel_if_outside_working_hours: "true" + cluster_id: golem + installation: "golem" + pipeline: "testing" + provider: "capa" + namespace: mimir + severity: page + team: atlas + topic: observability + exp_annotations: + description: Mimir ingester is consuming too much resources and needs to be scaled up. + opsrecipe: "mimir-ingester/" + - alertname: MimirIngesterNeedsToBeScaledUp + eval_time: 350m + # Test for MimirIngesterNeedsToBeScaledDown alert + - interval: 1m + input_series: + # mimir-ingester real memory usage gradually decreases until it goes below 30% of the memory requests. + - series: 'container_memory_working_set_bytes{pod="mimir-ingester-0", container="ingester", namespace="mimir", cluster_type="management_cluster", cluster_id="golem", installation="golem", pipeline="testing", provider="capa", region="eu-west-2"}' + values: "8+0x20 2+0x40 8+0x140 2+0x40 8+0x60" + - series: 'container_memory_working_set_bytes{pod="mimir-ingester-1", container="ingester", namespace="mimir", cluster_type="management_cluster", cluster_id="golem", installation="golem", pipeline="testing", provider="capa", region="eu-west-2"}' + values: "8+0x20 2+0x40 8+0x140 2+0x40 8+0x60" + # mimir-ingester memory requests stay the same for the entire duration of the test. + - series: 'kube_pod_container_resource_requests{pod="mimir-ingester-0", container="ingester", namespace="mimir", unit="byte", cluster_type="management_cluster", cluster_id="golem", installation="golem", pipeline="testing", provider="capa", region="eu-west-2"}' + values: "12+0x300" + - series: 'kube_pod_container_resource_requests{pod="mimir-ingester-1", container="ingester", namespace="mimir", unit="byte", cluster_type="management_cluster", cluster_id="golem", installation="golem", pipeline="testing", provider="capa", region="eu-west-2"}' + values: "12+0x300" + # mimir-ingester real cpu usage gradually increases until it goes below 30% of the cpu requests. 
+ - series: 'container_cpu_usage_seconds_total{pod="mimir-ingester-0", container="ingester", namespace="mimir", cluster_type="management_cluster", cluster_id="golem", installation="golem", pipeline="testing", provider="capa", region="eu-west-2"}' + values: "0+60x100 6000+10x40 6400+60x60 10000+10x40 10400+60x60" + - series: 'container_cpu_usage_seconds_total{pod="mimir-ingester-1", container="ingester", namespace="mimir", cluster_type="management_cluster", cluster_id="golem", installation="golem", pipeline="testing", provider="capa", region="eu-west-2"}' + values: "0+30x300" + # mimir-ingester cpu requests stay the same for the entire duration of the test + - series: 'kube_pod_container_resource_requests{pod="mimir-ingester-0", container="ingester", namespace="mimir", unit="core", cluster_type="management_cluster", cluster_id="golem", installation="golem", pipeline="testing", provider="capa", region="eu-west-2"}' + values: "1.5+0x300" + - series: 'kube_pod_container_resource_requests{pod="mimir-ingester-1", container="ingester", namespace="mimir", unit="core", cluster_type="management_cluster", cluster_id="golem", installation="golem", pipeline="testing", provider="capa", region="eu-west-2"}' + values: "1.5+0x300" + alert_rule_test: + - alertname: MimirIngesterNeedsToBeScaledDown + eval_time: 15m + - alertname: MimirIngesterNeedsToBeScaledDown + eval_time: 55m + - alertname: MimirIngesterNeedsToBeScaledDown + eval_time: 100m + - alertname: MimirIngesterNeedsToBeScaledDown + eval_time: 135m + - alertname: MimirIngesterNeedsToBeScaledDown + eval_time: 180m + - alertname: MimirIngesterNeedsToBeScaledDown + eval_time: 240m + exp_alerts: + - exp_labels: + area: platform + cancel_if_cluster_status_creating: "true" + cancel_if_cluster_status_deleting: "true" + cancel_if_cluster_status_updating: "true" + cancel_if_outside_working_hours: "true" + cluster_id: golem + installation: "golem" + pipeline: "testing" + provider: "capa" + namespace: mimir + severity: page + team: atlas + topic: observability + exp_annotations: + description: Mimir ingester is consuming very few resources and needs to be scaled down. 
+ opsrecipe: "mimir-ingester/" + - alertname: MimirIngesterNeedsToBeScaledDown + eval_time: 280m + # Test for MimirHPAReachedMaxReplicas alert + - interval: 1m + input_series: + # HPA max replicas = 3 for the whole test + # HPA target metric = 90% for the whole test + # Cases: + # desired_replicas < max_replicas AND current_utilization < target_utilization does not fire + # desired_replicas < max_replicas AND current_utilization = target_utilization does not fire + # desired_replicas < max_replicas AND current_utilization > target_utilization does not fire + # desired_replicas = max_replicas AND current_utilization < target_utilization does not fire + # desired_replicas = max_replicas AND current_utilization = target_utilization does not fire + # desired_replicas = max_replicas AND current_utilization > target_utilization does fire + # desired_replicas > max_replicas AND current_utilization < target_utilization does not fire + # desired_replicas > max_replicas AND current_utilization = target_utilization does not fire + # desired_replicas > max_replicas AND current_utilization > target_utilization does fire + - series: 'kube_horizontalpodautoscaler_spec_max_replicas{horizontalpodautoscaler="mimir-distributor", namespace="mimir"}' + values: '3+0x360' + - series: 'kube_horizontalpodautoscaler_status_desired_replicas{horizontalpodautoscaler="mimir-distributor", namespace="mimir"}' + values: '2+0x120 3+0x120 4+0x120' + - series: 'kube_horizontalpodautoscaler_spec_target_metric{horizontalpodautoscaler="mimir-distributor", namespace="mimir", metric_name="cpu", metric_target_type="utilization"}' + values: '90+0x360' + # HPA current metric = 80% for 10mn, then increase to 90% for 10mn + - series: 'kube_horizontalpodautoscaler_status_target_metric{horizontalpodautoscaler="mimir-distributor", namespace="mimir", metric_name="cpu", metric_target_type="utilization"}' + values: '80+0x40 90+0x40 100+0x40 80+0x40 90+0x40 100+0x40 80+0x40 90+0x40 100+0x40' + alert_rule_test: + - alertname: MimirHPAReachedMaxReplicas + eval_time: 234m + - alertname: MimirHPAReachedMaxReplicas + eval_time: 235m + exp_alerts: + - exp_labels: + area: platform + cancel_if_cluster_status_creating: "true" + cancel_if_cluster_status_deleting: "true" + cancel_if_cluster_status_updating: "true" + cancel_if_outside_working_hours: "true" + severity: page + team: atlas + topic: observability + horizontalpodautoscaler: mimir-distributor + namespace: mimir + exp_annotations: + description: "Mimir mimir-distributor HPA has reached maximum replicas and consume too much resources, it needs to be scaled up." + opsrecipe: "mimir-hpa/" + - alertname: MimirHPAReachedMaxReplicas + eval_time: 246m + - alertname: MimirHPAReachedMaxReplicas + eval_time: 360m + exp_alerts: + - exp_labels: + area: platform + cancel_if_cluster_status_creating: "true" + cancel_if_cluster_status_deleting: "true" + cancel_if_cluster_status_updating: "true" + cancel_if_outside_working_hours: "true" + severity: page + team: atlas + topic: observability + horizontalpodautoscaler: mimir-distributor + namespace: mimir + exp_annotations: + description: "Mimir mimir-distributor HPA has reached maximum replicas and consume too much resources, it needs to be scaled up." 
+ opsrecipe: "mimir-hpa/" + # Test for MimirCompactorFailedCompaction alert + - interval: 1m + input_series: + - series: 'cortex_compactor_runs_failed_total{reason="error", installation="golem", cluster_id="golem", namespace="mimir", pipeline="testing", provider="capa"}' + values: "8+0x20 1+0x40 0+0x20 4+0x130 0+0x190" + alert_rule_test: + - alertname: MimirCompactorFailedCompaction + eval_time: 15m + - alertname: MimirCompactorFailedCompaction + eval_time: 55m + - alertname: MimirCompactorFailedCompaction + eval_time: 120m + exp_alerts: + - exp_labels: + area: platform + cancel_if_cluster_status_creating: "true" + cancel_if_cluster_status_deleting: "true" + cancel_if_cluster_status_updating: "true" + cancel_if_outside_working_hours: "true" + cluster_id: golem + installation: "golem" + pipeline: "testing" + provider: "capa" + namespace: mimir + severity: page + team: atlas + topic: observability + exp_annotations: + dashboard: 09a5c49e9cdb2f2b24c6d184574a07fd/mimir-compactor-resources + description: Mimir compactor has been failing its compactions for 2 hours. + opsrecipe: "mimir#mimircompactorfailedcompaction" + - alertname: MimirCompactorFailedCompaction + eval_time: 205m + - alertname: MimirCompactorFailedCompaction + eval_time: 350m diff --git a/test/tests/providers/capi/capz/platform/atlas/alerting-rules/prometheus-agent.rules.test.yml b/test/tests/providers/capi/capz/platform/atlas/alerting-rules/prometheus-agent.rules.test.yml index e8ec81346..bd05e856f 100644 --- a/test/tests/providers/capi/capz/platform/atlas/alerting-rules/prometheus-agent.rules.test.yml +++ b/test/tests/providers/capi/capz/platform/atlas/alerting-rules/prometheus-agent.rules.test.yml @@ -4,98 +4,6 @@ rule_files: - prometheus-agent.rules.yml tests: - # Tests for `PrometheusAgentFailing` alert - - interval: 1m - input_series: - - series: 'up{instance="prometheus-agent",cluster_id="gauss", cluster_type="workload_cluster", installation="myinstall", customer="giantswarm", pipeline="testing", provider="capa", region="eu-west-2", job="prometheus-agent"}' - values: "_x60 0+0x60 1+0x60" - - series: 'capi_cluster_status_condition{ cluster_id="gauss", cluster_type="workload_cluster", installation="myinstall", customer="giantswarm", pipeline="testing", provider="capa", region="eu-west-2", status="True", type="ControlPlaneReady", name="gauss"}' - values: "1+0x180" - alert_rule_test: - - alertname: PrometheusAgentFailing - eval_time: 30m - exp_alerts: - - exp_labels: - area: platform - severity: page - team: atlas - topic: observability - inhibit_monitoring_agent_down: "true" - instance: prometheus-agent - cancel_if_cluster_has_no_workers: "true" - cancel_if_cluster_is_not_running_monitoring_agent: "true" - cancel_if_cluster_status_creating: "true" - cancel_if_cluster_status_deleting: "true" - exp_annotations: - dashboard: "promRW001/prometheus-remote-write" - description: "Prometheus agent remote write is failing." - opsrecipe: "prometheus-agent/" - summary: "Prometheus agent fails to send samples to remote write endpoint." - - alertname: InhibitionPrometheusAgentFailing - eval_time: 30m - exp_alerts: - - exp_labels: - area: platform - severity: none - team: atlas - topic: observability - inhibit_monitoring_agent_down: "true" - instance: prometheus-agent - cancel_if_cluster_is_not_running_monitoring_agent: "true" - cancel_if_cluster_status_creating: "true" - cancel_if_cluster_status_deleting: "true" - exp_annotations: - dashboard: "promRW001/prometheus-remote-write" - description: "Prometheus agent remote write is failing." 
- opsrecipe: "prometheus-agent/" - summary: "Prometheus agent fails to send samples to remote write endpoint." - - alertname: PrometheusAgentFailing - eval_time: 90m - exp_alerts: - - exp_labels: - area: platform - cluster_id: gauss - cluster_type: workload_cluster - severity: page - team: atlas - topic: observability - inhibit_monitoring_agent_down: "true" - installation: myinstall - instance: prometheus-agent - cancel_if_cluster_has_no_workers: "true" - cancel_if_cluster_is_not_running_monitoring_agent: "true" - cancel_if_cluster_status_creating: "true" - cancel_if_cluster_status_deleting: "true" - exp_annotations: - dashboard: "promRW001/prometheus-remote-write" - description: "Prometheus agent remote write is failing." - opsrecipe: "prometheus-agent/" - summary: "Prometheus agent fails to send samples to remote write endpoint." - - alertname: InhibitionPrometheusAgentFailing - eval_time: 90m - exp_alerts: - - exp_labels: - area: platform - cluster_id: gauss - cluster_type: workload_cluster - severity: none - team: atlas - topic: observability - inhibit_monitoring_agent_down: "true" - installation: myinstall - instance: prometheus-agent - cancel_if_cluster_is_not_running_monitoring_agent: "true" - cancel_if_cluster_status_creating: "true" - cancel_if_cluster_status_deleting: "true" - exp_annotations: - dashboard: "promRW001/prometheus-remote-write" - description: "Prometheus agent remote write is failing." - opsrecipe: "prometheus-agent/" - summary: "Prometheus agent fails to send samples to remote write endpoint." - - alertname: PrometheusAgentFailing - eval_time: 150m - - alertname: InhibitionPrometheusAgentFailing - eval_time: 150m # Tests for `PrometheusAgentShardsMissing` alert - interval: 1m input_series: