From 380d5a06b93c4ae76aecba8fda661d8f708d9ebf Mon Sep 17 00:00:00 2001 From: QuentinBisson Date: Tue, 11 Jun 2024 12:22:25 +0200 Subject: [PATCH] Sort out shared alerts ownership --- CHANGELOG.md | 12 +-- CODEOWNERS | 19 ++-- .../dns-operator-azure.rules.yml | 0 .../alerting-rules/certificate.all.rules.yml | 2 +- .../certificate.management-cluster.rules.yml | 0 .../certificate.workload-cluster.rules.yml | 0 .../management-cluster.rules.yml | 13 +-- .../kubernetes-mixins.rules.yml | 0 .../deployment.management-cluster.rules.yml | 24 ++--- .../deployment.workload-cluster.rules.yml | 12 +-- .../alerting-rules/operatorkit.rules.yml | 5 +- .../recording-rules/grafana-cloud.rules.yml | 0 .../recording-rules/service-level.rules.yml | 0 .../alerting-rules/microendpoint.rules.yml | 72 -------------- scripts/sync-kube-mixin.sh | 2 +- test/conf/promtool_ignore | 98 ++++++++++++------- 16 files changed, 109 insertions(+), 150 deletions(-) rename helm/prometheus-rules/templates/{shared => kaas/phoenix}/alerting-rules/dns-operator-azure.rules.yml (100%) rename helm/prometheus-rules/templates/{shared => kaas/turtles}/alerting-rules/certificate.all.rules.yml (99%) rename helm/prometheus-rules/templates/{shared => kaas/turtles}/alerting-rules/certificate.management-cluster.rules.yml (100%) rename helm/prometheus-rules/templates/{shared => kaas/turtles}/alerting-rules/certificate.workload-cluster.rules.yml (100%) rename helm/prometheus-rules/templates/{shared => kaas/turtles}/alerting-rules/management-cluster.rules.yml (95%) rename helm/prometheus-rules/templates/{shared => kaas/turtles}/recording-rules/kubernetes-mixins.rules.yml (100%) rename helm/prometheus-rules/templates/{shared => platform/atlas}/alerting-rules/deployment.management-cluster.rules.yml (97%) rename helm/prometheus-rules/templates/{shared => platform/atlas}/alerting-rules/deployment.workload-cluster.rules.yml (96%) rename helm/prometheus-rules/templates/{shared => 
platform/atlas}/alerting-rules/operatorkit.rules.yml (98%) rename helm/prometheus-rules/templates/{shared => platform/atlas}/recording-rules/grafana-cloud.rules.yml (100%) rename helm/prometheus-rules/templates/{shared => platform/atlas}/recording-rules/service-level.rules.yml (100%) delete mode 100644 helm/prometheus-rules/templates/shared/alerting-rules/microendpoint.rules.yml diff --git a/CHANGELOG.md b/CHANGELOG.md index 1c33c5dbc..5e480db9b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -23,20 +23,20 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Moves cluster-autoscaler and vpa alerts to turtles. - Reviewed turtles alerts labels. - Use `ready` replicas for Kyverno webhooks alert. -- Moves ownership of alerts for shared components to turtles. - +- Sort out shared alert ownership by distributing them all to teams. ### Fixed - Fixed usage of yq, and jq in check-opsrecipes.sh - Fetch jq with make install-tools -- Fix and improve the check-opsrecipes.sh script so support /_index.md based ops-recipes. -- Fix cabbage alerts for multi-provider wcs. -- Fix a few area labels. -- Fix `cert-exporter` alerting. +- Fixed and improved the check-opsrecipes.sh script to support /_index.md based ops-recipes. +- Fixed cabbage alerts for multi-provider MCs. +- Fixed all area alert labels. +- Fixed `cert-exporter` alerts to page on all providers.
### Removed +- cleanup: get rid of microendpoint alerts as they never fired and probably never will - cleanup: remove scrape timeout inhibition leftovers (documentation and labels) ## [4.1.2] - 2024-05-31 diff --git a/CODEOWNERS b/CODEOWNERS index 28f07ef17..bd081f495 100644 --- a/CODEOWNERS +++ b/CODEOWNERS @@ -1,9 +1,12 @@ * @giantswarm/team-atlas -/helm/prometheus-rules/templates/kaas/bigmac/ @team-bigmac -/helm/prometheus-rules/templates/kaas/phoenix/ @team-phoenix -/helm/prometheus-rules/templates/kaas/rocket/ @team-rocket -/helm/prometheus-rules/templates/kaas/turtles/ @team-turtles -/helm/prometheus-rules/templates/platform/atlas/ @team-atlas -/helm/prometheus-rules/templates/platform/cabbage/ @team-cabbage -/helm/prometheus-rules/templates/platform/honeybadger/ @team-honeybadger -/helm/prometheus-rules/templates/platform/shield/ @team-shield +/helm/prometheus-rules/templates/kaas/bigmac/ @giantswarm/team-bigmac +/helm/prometheus-rules/templates/kaas/phoenix/ @giantswarm/team-phoenix +/helm/prometheus-rules/templates/kaas/rocket/ @giantswarm/team-rocket +/helm/prometheus-rules/templates/kaas/turtles/ @giantswarm/team-turtles +/helm/prometheus-rules/templates/platform/atlas/ @giantswarm/team-atlas +/helm/prometheus-rules/templates/platform/cabbage/ @giantswarm/team-cabbage +/helm/prometheus-rules/templates/platform/honeybadger/ @giantswarm/team-honeybadger +/helm/prometheus-rules/templates/platform/shield/ @giantswarm/team-shield + +# No owners for changelog +/CHANGELOG.md diff --git a/helm/prometheus-rules/templates/shared/alerting-rules/dns-operator-azure.rules.yml b/helm/prometheus-rules/templates/kaas/phoenix/alerting-rules/dns-operator-azure.rules.yml similarity index 100% rename from helm/prometheus-rules/templates/shared/alerting-rules/dns-operator-azure.rules.yml rename to helm/prometheus-rules/templates/kaas/phoenix/alerting-rules/dns-operator-azure.rules.yml diff --git a/helm/prometheus-rules/templates/shared/alerting-rules/certificate.all.rules.yml
b/helm/prometheus-rules/templates/kaas/turtles/alerting-rules/certificate.all.rules.yml similarity index 99% rename from helm/prometheus-rules/templates/shared/alerting-rules/certificate.all.rules.yml rename to helm/prometheus-rules/templates/kaas/turtles/alerting-rules/certificate.all.rules.yml index a04fda923..2ed94092d 100644 --- a/helm/prometheus-rules/templates/shared/alerting-rules/certificate.all.rules.yml +++ b/helm/prometheus-rules/templates/kaas/turtles/alerting-rules/certificate.all.rules.yml @@ -42,7 +42,7 @@ spec: opsrecipe: kyverno-certificate-secret-will-expire-in-less-than-two-days/ expr: (cert_exporter_secret_not_after{name=~".*kyverno.*"} - time()) < 2 * 24 * 60 * 60 labels: - area: managedservices + area: kaas cancel_if_outside_working_hours: "true" severity: notify team: shield diff --git a/helm/prometheus-rules/templates/shared/alerting-rules/certificate.management-cluster.rules.yml b/helm/prometheus-rules/templates/kaas/turtles/alerting-rules/certificate.management-cluster.rules.yml similarity index 100% rename from helm/prometheus-rules/templates/shared/alerting-rules/certificate.management-cluster.rules.yml rename to helm/prometheus-rules/templates/kaas/turtles/alerting-rules/certificate.management-cluster.rules.yml diff --git a/helm/prometheus-rules/templates/shared/alerting-rules/certificate.workload-cluster.rules.yml b/helm/prometheus-rules/templates/kaas/turtles/alerting-rules/certificate.workload-cluster.rules.yml similarity index 100% rename from helm/prometheus-rules/templates/shared/alerting-rules/certificate.workload-cluster.rules.yml rename to helm/prometheus-rules/templates/kaas/turtles/alerting-rules/certificate.workload-cluster.rules.yml diff --git a/helm/prometheus-rules/templates/shared/alerting-rules/management-cluster.rules.yml b/helm/prometheus-rules/templates/kaas/turtles/alerting-rules/management-cluster.rules.yml similarity index 95% rename from 
helm/prometheus-rules/templates/shared/alerting-rules/management-cluster.rules.yml rename to helm/prometheus-rules/templates/kaas/turtles/alerting-rules/management-cluster.rules.yml index 0576aacd3..52550477f 100644 --- a/helm/prometheus-rules/templates/shared/alerting-rules/management-cluster.rules.yml +++ b/helm/prometheus-rules/templates/kaas/turtles/alerting-rules/management-cluster.rules.yml @@ -13,7 +13,6 @@ spec: groups: - name: management-cluster rules: - {{- if (eq .Values.managementCluster.provider.kind "aws") }} - alert: ManagementClusterHasLessThanThreeNodes annotations: description: '{{`Management cluster {{ $labels.cluster_id }} has less than 3 nodes.`}}' @@ -24,7 +23,7 @@ spec: area: kaas cancel_if_outside_working_hours: {{ include "workingHoursOnly" . }} severity: page - team: phoenix + team: {{ include "providerTeam" . }} topic: managementcluster - alert: ManagementClusterMissingNodes annotations: @@ -34,7 +33,7 @@ spec: labels: area: kaas severity: notify - team: phoenix + team: {{ include "providerTeam" . }} topic: managementcluster - alert: ManagementClusterCPUUsageTooHigh annotations: @@ -46,7 +45,7 @@ spec: - area: kass + area: kaas cancel_if_outside_working_hours: "true" severity: page - team: phoenix + team: {{ include "providerTeam" . }} topic: managementcluster - alert: ManagementClusterMemoryUsageTooHigh annotations: @@ -58,7 +57,7 @@ spec: - area: kass + area: kaas cancel_if_outside_working_hours: "true" severity: page - team: phoenix + team: {{ include "providerTeam" . }} topic: managementcluster - alert: ManagementClusterPodLimitAlmostReached annotations: @@ -72,8 +71,10 @@ spec: cancel_if_cluster_status_updating: "true" cancel_if_outside_working_hours: {{ include "workingHoursOnly" . }} severity: notify - team: phoenix + team: {{ include "providerTeam" .
}} topic: managementcluster + {{- if (eq .Values.managementCluster.provider.kind "aws") }} + ## TODO Remove when all vintage clusters are gone - alert: ManagementClusterCriticalPodNotRunning annotations: description: '{{`Critical pod {{ $labels.namespace }}/{{ $labels.pod }} is not running.`}}' diff --git a/helm/prometheus-rules/templates/shared/recording-rules/kubernetes-mixins.rules.yml b/helm/prometheus-rules/templates/kaas/turtles/recording-rules/kubernetes-mixins.rules.yml similarity index 100% rename from helm/prometheus-rules/templates/shared/recording-rules/kubernetes-mixins.rules.yml rename to helm/prometheus-rules/templates/kaas/turtles/recording-rules/kubernetes-mixins.rules.yml diff --git a/helm/prometheus-rules/templates/shared/alerting-rules/deployment.management-cluster.rules.yml b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/deployment.management-cluster.rules.yml similarity index 97% rename from helm/prometheus-rules/templates/shared/alerting-rules/deployment.management-cluster.rules.yml rename to helm/prometheus-rules/templates/platform/atlas/alerting-rules/deployment.management-cluster.rules.yml index 5a587e527..36ac26281 100644 --- a/helm/prometheus-rules/templates/shared/alerting-rules/deployment.management-cluster.rules.yml +++ b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/deployment.management-cluster.rules.yml @@ -20,7 +20,7 @@ spec: expr: kube_deployment_status_replicas_unavailable{cluster_type="management_cluster", deployment=~"alertmanager.*|grafana.*|prometheus.*|promxy.*|mimir.*|loki.*|object-storage.*|logging-operator.*|silence-operator.*|sloth.*"} > 0 for: 30m labels: - area: kaas + area: platform cancel_if_cluster_status_creating: "true" cancel_if_cluster_status_deleting: "true" cancel_if_cluster_status_updating: "true" @@ -35,7 +35,7 @@ spec: expr: kube_deployment_status_replicas_unavailable{cluster_type="management_cluster", 
deployment=~"app-admission-controller-.+|app-operator-.+|chart-operator-.+", cluster_id!~"argali|giraffe"} > 0 for: 30m labels: - area: managedservices + area: platform cancel_if_cluster_status_creating: "true" cancel_if_cluster_status_deleting: "true" cancel_if_cluster_status_updating: "true" @@ -50,7 +50,7 @@ spec: expr: kube_deployment_status_replicas_unavailable{cluster_type="management_cluster", deployment=~"app-admission-controller-.+|app-operator-.+|chart-operator-.+", cluster_id=~"argali|giraffe"} > 0 for: 3h labels: - area: managedservices + area: platform severity: page team: honeybadger topic: managementcluster @@ -61,7 +61,7 @@ spec: expr: kube_deployment_status_replicas_unavailable{cluster_type="management_cluster", deployment=~"kyverno.*", cluster_id!~"argali|giraffe"} > 0 for: 30m labels: - area: managedservices + area: platform cancel_if_cluster_status_creating: "true" cancel_if_cluster_status_deleting: "true" cancel_if_cluster_status_updating: "true" @@ -76,7 +76,7 @@ spec: expr: kube_deployment_status_replicas_unavailable{cluster_type="management_cluster", deployment=~"kyverno.*", cluster_id=~"argali|giraffe"} > 0 for: 30m labels: - area: managedservices + area: platform cancel_if_cluster_status_creating: "true" cancel_if_cluster_status_deleting: "true" cancel_if_cluster_status_updating: "true" @@ -91,7 +91,7 @@ spec: expr: label_join(kube_deployment_status_replicas_unavailable{cluster_type="management_cluster", deployment=~"aws-admission-controller.*|aws-operator-.+|cluster-operator-.+|cluster-api-core-webhook.*|event-exporter-.*|etcd-kubernetes-resources-count-exporter-.*|upgrade-schedule-operator.*|worker-.+|master-.+", cluster_id!~"argali|giraffe"}, "service", "/", "namespace", "deployment") > 0 for: 30m labels: - area: kaas + area: platform cancel_if_cluster_status_creating: "true" cancel_if_cluster_status_deleting: "true" cancel_if_cluster_status_updating: "true" @@ -105,7 +105,7 @@ spec: expr: 
kube_deployment_status_replicas_unavailable{cluster_type="management_cluster", deployment=~"aws-operator-.+|cluster-operator-.+|coredns-.+|event-exporter-.+|etcd-kubernetes-resources-count-exporter.*", cluster_id=~"argali|giraffe"} > 0 for: 3h labels: - area: kaas + area: platform severity: page team: {{ include "providerTeam" . }} topic: managementcluster @@ -116,7 +116,7 @@ spec: expr: kube_deployment_status_replicas_available{cluster_type="management_cluster", deployment=~"([a-z]*)-operator([a-z,-]*)",provider="aws"} + kube_deployment_status_replicas_unavailable{cluster_type="management_cluster", deployment=~"([a-z]*)-operator([a-z,-]*)",provider="aws"} == 0 for: 4h labels: - area: kaas + area: platform severity: notify team: phoenix topic: managementcluster @@ -127,7 +127,7 @@ spec: expr: kube_deployment_status_replicas_unavailable{cluster_type="management_cluster", deployment=~"aws-admission-controller.*|aws-operator.*|cluster-operator.*|cluster-api-core-webhook.*|event-exporter-.*|upgrade-schedule-operator.*|event-exporter-app.*", cluster_id=~"argali|giraffe"} > 0 for: 3h labels: - area: kaas + area: platform cancel_if_cluster_status_creating: "true" cancel_if_cluster_status_deleting: "true" cancel_if_cluster_status_updating: "true" @@ -142,7 +142,7 @@ spec: expr: kube_deployment_status_replicas_unavailable{cluster_type="management_cluster", deployment=~"(ingress-nginx|nginx-ingress-controller)-.+", cluster_id!~"argali|giraffe"} > 0 for: 30m labels: - area: kaas + area: platform cancel_if_cluster_status_creating: "true" cancel_if_cluster_status_deleting: "true" cancel_if_cluster_status_updating: "true" @@ -156,7 +156,7 @@ spec: expr: kube_deployment_status_replicas_unavailable{cluster_type="management_cluster", deployment=~"(ingress-nginx|nginx-ingress-controller|coredns)-.+", cluster_id=~"argali|giraffe"} > 0 for: 3h labels: - area: kaas + area: platform severity: page team: cabbage topic: managementcluster @@ -171,7 +171,7 @@ spec: {{- end }} for: 30m 
labels: - area: kaas + area: platform cancel_if_cluster_status_creating: "true" cancel_if_cluster_status_deleting: "true" cancel_if_cluster_status_updating: "true" diff --git a/helm/prometheus-rules/templates/shared/alerting-rules/deployment.workload-cluster.rules.yml b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/deployment.workload-cluster.rules.yml similarity index 96% rename from helm/prometheus-rules/templates/shared/alerting-rules/deployment.workload-cluster.rules.yml rename to helm/prometheus-rules/templates/platform/atlas/alerting-rules/deployment.workload-cluster.rules.yml index e0c8f0dcb..776df0011 100644 --- a/helm/prometheus-rules/templates/shared/alerting-rules/deployment.workload-cluster.rules.yml +++ b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/deployment.workload-cluster.rules.yml @@ -20,7 +20,7 @@ spec: expr: kube_deployment_status_replicas_unavailable{cluster_type="workload_cluster", deployment="chart-operator"} > 0 for: 30m labels: - area: managedservices + area: platform cancel_if_outside_working_hours: {{ include "workingHoursOnly" . }} severity: page team: honeybadger @@ -32,7 +32,7 @@ spec: expr: label_join(kube_deployment_status_replicas_unavailable{cluster_type="workload_cluster", deployment=~"metrics-server|vertical-pod-autoscaler(-app)?-admission-controller|vertical-pod-autoscaler(-app)?-recommender|vertical-pod-autoscaler(-app)?-updater|aws-pod-identity-webhook.*|cluster-autoscaler|aws-load-balancer-controller"}, "service", "/", "namespace", "deployment") > 0 for: 30m labels: - area: kaas + area: platform cancel_if_outside_working_hours: {{ include "workingHoursOnly" . }} severity: page team: {{ include "providerTeam" . 
}} @@ -44,7 +44,7 @@ spec: expr: label_join(kube_deployment_status_replicas_unavailable{cluster_type="workload_cluster", deployment="etcd-kubernetes-resources-count-exporter"}, "service", "/", "namespace", "deployment") > 0 for: 30m labels: - area: kaas + area: platform cancel_if_prometheus_agent_down: "true" cancel_if_outside_working_hours: "true" severity: page @@ -56,7 +56,7 @@ spec: expr: kube_deployment_status_replicas_available{cluster_type="workload_cluster", deployment="chart-operator"} + kube_deployment_status_replicas_unavailable{cluster_type="workload_cluster", deployment="chart-operator"} == 0 for: 4h labels: - area: managedservices + area: platform cancel_if_outside_working_hours: {{ include "workingHoursOnly" . }} severity: notify team: honeybadger @@ -67,7 +67,7 @@ spec: expr: kube_deployment_spec_replicas{cluster_type="workload_cluster", deployment=~"trivy-operator|starboard-exporter|jiralert"} == 0 for: 4h labels: - area: managedservices + area: platform cancel_if_outside_working_hours: "true" severity: notify team: shield @@ -79,7 +79,7 @@ spec: expr: kube_deployment_status_replicas_unavailable{cluster_type="workload_cluster", deployment=~"cert-manager-*|teleport-*|dex*|athena*|rbac-operator|credentiald"} > 0 for: 30m labels: - area: kaas + area: platform cancel_if_cluster_status_creating: "true" cancel_if_cluster_status_deleting: "true" cancel_if_cluster_status_updating: "true" diff --git a/helm/prometheus-rules/templates/shared/alerting-rules/operatorkit.rules.yml b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/operatorkit.rules.yml similarity index 98% rename from helm/prometheus-rules/templates/shared/alerting-rules/operatorkit.rules.yml rename to helm/prometheus-rules/templates/platform/atlas/alerting-rules/operatorkit.rules.yml index 4e1805ff2..a571c99af 100644 --- a/helm/prometheus-rules/templates/shared/alerting-rules/operatorkit.rules.yml +++ 
b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/operatorkit.rules.yml @@ -1,3 +1,4 @@ +# Atlas is the team responsible for the operatorkit apiVersion: monitoring.coreos.com/v1 kind: PrometheusRule metadata: @@ -17,7 +18,7 @@ spec: expr: operatorkit_controller_error_total{pod=~"app-operator.*|chart-operator.*"} > 5 for: 1m labels: - area: kaas + area: platform severity: notify team: honeybadger topic: qa @@ -27,7 +28,7 @@ spec: expr: (time() - operatorkit_controller_last_reconciled{pod=~"app-operator.*|chart-operator.*"}) / 60 > 30 for: 10m labels: - area: managedservices + area: platform severity: notify team: honeybadger topic: releng diff --git a/helm/prometheus-rules/templates/shared/recording-rules/grafana-cloud.rules.yml b/helm/prometheus-rules/templates/platform/atlas/recording-rules/grafana-cloud.rules.yml similarity index 100% rename from helm/prometheus-rules/templates/shared/recording-rules/grafana-cloud.rules.yml rename to helm/prometheus-rules/templates/platform/atlas/recording-rules/grafana-cloud.rules.yml diff --git a/helm/prometheus-rules/templates/shared/recording-rules/service-level.rules.yml b/helm/prometheus-rules/templates/platform/atlas/recording-rules/service-level.rules.yml similarity index 100% rename from helm/prometheus-rules/templates/shared/recording-rules/service-level.rules.yml rename to helm/prometheus-rules/templates/platform/atlas/recording-rules/service-level.rules.yml diff --git a/helm/prometheus-rules/templates/shared/alerting-rules/microendpoint.rules.yml b/helm/prometheus-rules/templates/shared/alerting-rules/microendpoint.rules.yml deleted file mode 100644 index 4577db4d5..000000000 --- a/helm/prometheus-rules/templates/shared/alerting-rules/microendpoint.rules.yml +++ /dev/null @@ -1,72 +0,0 @@ -apiVersion: monitoring.coreos.com/v1 -kind: PrometheusRule -metadata: - creationTimestamp: null - labels: - {{- include "labels.common" . 
| nindent 4 }} - name: microendpoint.rules - namespace: {{ .Values.namespace }} -spec: - groups: - - name: microendpoint - rules: - # replacing `version` with `reconciled_version` is only done if the latter - # is non-empty and is done to work with old operator versions using - # microendpoint < 0.1.0 (i.e. before VOO) - - alert: CollidingOperatorsAtlas - annotations: - description: '{{`CR version {{ $labels.version }} in cluster {{ $labels.cluster_id }} is reconciled by multiple apps including {{ $labels.app }}.`}}' - opsrecipe: multiple-operators-running-same-version/ - expr: sum(label_replace(giantswarm_build_info{app=~"prometheus-meta-operator.*"}, "version", "$1", "reconciled_version", "(.+)")) by (app, cluster_id, installation, provider, pipeline, version) > 1 - for: 5m - labels: - area: empowerment - cancel_if_outside_working_hours: {{ include "workingHoursOnly" . }} - severity: page - team: atlas - topic: releng - # replacing `version` with `reconciled_version` is only done if the latter - # is non-empty and is done to work with old operator versions using - # microendpoint < 0.1.0 (i.e. before VOO) - - alert: CollidingOperatorsHoneybadger - annotations: - description: '{{`CR version {{ $labels.version }} in cluster {{ $labels.cluster_id }} is reconciled by multiple apps including {{ $labels.app }}.`}}' - opsrecipe: multiple-operators-running-same-version/ - expr: sum(label_replace(giantswarm_build_info{app=~"app-operator.*|chart-operator.*"}, "version", "$1", "reconciled_version", "(.+)")) by (app, cluster_id, installation, provider, pipeline, version) > 1 - for: 5m - labels: - area: managedservices - cancel_if_outside_working_hours: {{ include "workingHoursOnly" . }} - severity: page - team: honeybadger - topic: releng - # replacing `version` with `reconciled_version` is only done if the latter - # is non-empty and is done to work with old operator versions using - # microendpoint < 0.1.0 (i.e. 
before VOO) - - alert: CollidingOperatorsAWS - annotations: - description: '{{`CR version {{ $labels.version }} in cluster {{ $labels.cluster_id }} is reconciled by multiple apps including {{ $labels.app }}.`}}' - opsrecipe: multiple-operators-running-same-version/ - expr: sum(label_replace(giantswarm_build_info{app=~"aws-operator.*|cluster-operator.*"}, "version", "$1", "reconciled_version", "(.+)")) by (app, cluster_id, installation, provider, pipeline, version) > 1 - for: 5m - labels: - area: kaas - cancel_if_outside_working_hours: {{ include "workingHoursOnly" . }} - severity: page - team: phoenix - topic: releng - # replacing `version` with `reconciled_version` is only done if the latter - # is non-empty and is done to work with old operator versions using - # microendpoint < 0.1.0 (i.e. before VOO) - - alert: CollidingOperatorsRocket - annotations: - description: '{{`CR version {{ $labels.version }} in cluster {{ $labels.cluster_id }} is reconciled by multiple apps including {{ $labels.app }}.`}}' - opsrecipe: multiple-operators-running-same-version/ - expr: sum(label_replace(giantswarm_build_info{app=~"ignition-operator|cert-operator|node-operator"}, "version", "$1", "reconciled_version", "(.+)")) by (app, cluster_id, installation, provider, pipeline, version) > 1 - for: 5m - labels: - area: kaas - cancel_if_outside_working_hours: {{ include "workingHoursOnly" . 
}} - severity: page - team: rocket - topic: releng diff --git a/scripts/sync-kube-mixin.sh b/scripts/sync-kube-mixin.sh index 85bf5490e..5850ab3c4 100755 --- a/scripts/sync-kube-mixin.sh +++ b/scripts/sync-kube-mixin.sh @@ -5,7 +5,7 @@ set -o nounset set -o pipefail TMPDIR="$(mktemp -d -t 'tmp.XXXXXXXXXX')" -RULESFILE="helm/prometheus-rules/templates/shared/recording-rules/kubernetes-mixins.rules.yml" +RULESFILE="helm/prometheus-rules/templates/kaas/turtles/recording-rules/kubernetes-mixins.rules.yml" trap 'cleanup' EXIT diff --git a/test/conf/promtool_ignore b/test/conf/promtool_ignore index fd715736d..b7c8062ed 100644 --- a/test/conf/promtool_ignore +++ b/test/conf/promtool_ignore @@ -1,52 +1,76 @@ +kaas/bigmac/alerting-rules/cert-manager.rules.yml +kaas/bigmac/alerting-rules/certificate.all.rules.yml +kaas/bigmac/alerting-rules/certificate.management-cluster.rules.yml +kaas/bigmac/alerting-rules/certificate.workload-cluster.rules.yml kaas/bigmac/alerting-rules/dex.rules.yml +kaas/bigmac/alerting-rules/teleport.rules.yml +kaas/phoenix/alerting-rules/aws-load-balancer-controller.rules.yml +kaas/phoenix/alerting-rules/aws.job.rules.yml kaas/phoenix/alerting-rules/aws.management-cluster.rules.yml kaas/phoenix/alerting-rules/aws.workload-cluster.rules.yml -kaas/phoenix/alerting-rules/certificate.management-cluster.rules.yml -kaas/phoenix/alerting-rules/cluster-autoscaler.rules.yml +kaas/phoenix/alerting-rules/calico.rules.yml +kaas/phoenix/alerting-rules/capa.management-cluster.rules.yml +kaas/phoenix/alerting-rules/cluster-service.rules.yml kaas/phoenix/alerting-rules/credentiald.rules.yml -kaas/phoenix/alerting-rules/inhibit.all.rules.yml -kaas/phoenix/alerting-rules/inhibit.management-cluster.rules.yml -kaas/phoenix/alerting-rules/job.rules.yml +kaas/phoenix/alerting-rules/dns-operator-azure.rules.yml +kaas/phoenix/alerting-rules/inhibit.aws.management-cluster.rules.yml +kaas/phoenix/alerting-rules/inhibit.kiam.rules.yml kaas/phoenix/alerting-rules/kiam.rules.yml
-kaas/rocket/alerting-rules/falco.rules.yml +kaas/phoenix/alerting-rules/vault.rules.yml +kaas/turtles/alerting-rules/apiserver.management-cluster.rules.yml +kaas/turtles/alerting-rules/apiserver.workload-cluster.rules.yml +kaas/turtles/alerting-rules/bastions.rules.yml +kaas/turtles/alerting-rules/capi-cluster.rules.yml +kaas/turtles/alerting-rules/capi-kubeadmcontrolplane.rules.yml +kaas/turtles/alerting-rules/capi-machine.rules.yml +kaas/turtles/alerting-rules/capi-machinedeployment.rules.yml +kaas/turtles/alerting-rules/capi-machinepool.rules.yml +kaas/turtles/alerting-rules/capi-machineset.rules.yml +kaas/turtles/alerting-rules/capi.management-cluster.rules.yml +kaas/turtles/alerting-rules/cluster-autoscaler.rules.yml +kaas/turtles/alerting-rules/docker.rules.yml +kaas/turtles/alerting-rules/etcd.management-cluster.rules.yml +kaas/turtles/alerting-rules/etcd.workload-cluster.rules.yml +kaas/turtles/alerting-rules/etcdbackup.rules.yml +kaas/turtles/alerting-rules/fairness.rules.yml +kaas/turtles/alerting-rules/inhibit.capi.rules.yml +kaas/turtles/alerting-rules/inhibit.kubelet.rules.yml +kaas/turtles/alerting-rules/job.rules.yml +kaas/turtles/alerting-rules/kubelet.rules.yml +kaas/turtles/alerting-rules/management-cluster.rules.yml +kaas/turtles/alerting-rules/net-exporter.rules.yml +kaas/turtles/alerting-rules/node-exporter.rules.yml +kaas/turtles/alerting-rules/node.management-cluster.rules.yml +kaas/turtles/alerting-rules/node.workload-cluster.rules.yml +kaas/turtles/alerting-rules/storage.management-cluster.rules.yml +kaas/turtles/alerting-rules/storage.workload-cluster.rules.yml +kaas/turtles/alerting-rules/systemd.rules.yml +kaas/turtles/alerting-rules/timesync.rules.yml +kaas/turtles/alerting-rules/vertical-pod-autoscaler.rules.yml +kaas/turtles/recording-rules/kubernetes-mixins.rules.yml +platform/atlas/alerting-rules/deployment.management-cluster.rules.yml +platform/atlas/alerting-rules/deployment.workload-cluster.rules.yml
platform/atlas/alerting-rules/fluentbit.rules.yml +platform/atlas/alerting-rules/inhibit.oncall.rules.yml +platform/atlas/alerting-rules/keda.rules.yml platform/atlas/alerting-rules/kube-state-metrics.rules.yml +platform/atlas/alerting-rules/operatorkit.rules.yml platform/atlas/alerting-rules/prometheus-meta-operator.rules.yml platform/atlas/alerting-rules/prometheus-operator.rules.yml +platform/atlas/alerting-rules/service-level.rules.yml +platform/atlas/alerting-rules/storage.rules.yml +platform/atlas/recording-rules/grafana-cloud.rules.yml +platform/atlas/recording-rules/loki-mixins.rules.yml +platform/atlas/recording-rules/mimir-mixins.rules.yml +platform/atlas/recording-rules/service-level.rules.yml platform/cabbage/alerting-rules/coredns.rules.yml platform/cabbage/alerting-rules/external-dns.rules.yml platform/cabbage/alerting-rules/ingress-controller.rules.yml -platform/cabbage/alerting-rules/network.all.rules.yml +platform/cabbage/alerting-rules/network.rules.yml +platform/cabbage/recording-rules/gs-managed-app-deployment-status.rules.yml platform/honeybadger/alerting-rules/chart.rules.yml platform/honeybadger/alerting-rules/helm.rules.yml platform/honeybadger/alerting-rules/release.rules.yml platform/honeybadger/alerting-rules/secret.rules.yml -platform/shield/alerting-rules/kyverno.rules.yml -shared/alerting-rules/apiserver.management-cluster.rules.yml -shared/alerting-rules/apiserver.workload-cluster.rules.yml -shared/alerting-rules/calico.rules.yml -shared/alerting-rules/certificate.workload-cluster.rules.yml -shared/alerting-rules/cluster-service.rules.yml -shared/alerting-rules/deployment.management-cluster.rules.yml -shared/alerting-rules/deployment.workload-cluster.rules.yml -shared/alerting-rules/disk.management-cluster.rules.yml -shared/alerting-rules/disk.workload-cluster.rules.yml -shared/alerting-rules/etcd.management-cluster.rules.yml -shared/alerting-rules/etcd.workload-cluster.rules.yml
-shared/alerting-rules/etcdbackup.rules.yml -shared/alerting-rules/fairness.rules.yml -shared/alerting-rules/management-cluster.rules.yml -shared/alerting-rules/microendpoint.rules.yml -shared/alerting-rules/node.management_cluster.rules.yml -shared/alerting-rules/node.workload_cluster.rules.yml -shared/alerting-rules/operatorkit.rules.yml -shared/alerting-rules/service-level.rules.yml -shared/alerting-rules/timesync.rules.yml -shared/alerting-rules/up.rules.yml -shared/alerting-rules/vault.rules.yml -shared/recording-rules/grafana-cloud.rules.yml -shared/recording-rules/gs-managed-app-deployment-status.rules.yml -shared/recording-rules/kubernetes-mixins.rules.yml -shared/recording-rules/service-level.rules.yml -shared/recording-rules/mimir-mixins.rules.yml -shared/recording-rules/loki-mixins.rules.yml +platform/honeybadger/recording-rules/helm-operations.rules.yml +platform/shield/alerting-rules/falco.rules.yml