From a1161ea3737d40c4ea30361b003c3b650723b002 Mon Sep 17 00:00:00 2001
From: Quentin Bisson
Date: Tue, 11 Jun 2024 14:49:44 +0200
Subject: [PATCH] Sort out shared alerts ownership (#1232)

Signed-off-by: QuentinBisson
---
 CHANGELOG.md | 11 +--
 Makefile.custom.mk | 1 -
 .../dns-operator-azure.rules.yml | 0
 .../alerting-rules/certificate.all.rules.yml | 2 +-
 .../certificate.management-cluster.rules.yml | 0
 .../certificate.workload-cluster.rules.yml | 0
 .../management-cluster.rules.yml | 19 ++---
 .../kubernetes-mixins.rules.yml | 0
 .../deployment.management-cluster.rules.yml | 24 +++----
 .../deployment.workload-cluster.rules.yml | 12 ++--
 .../alerting-rules/operatorkit.rules.yml | 5 +-
 .../recording-rules/grafana-cloud.rules.yml | 0
 .../recording-rules/service-level.rules.yml | 0
 .../alerting-rules/microendpoint.rules.yml | 72 -------------------
 scripts/sync-kube-mixin.sh | 2 +-
 test/conf/promtool_ignore | 24 ++++---
 test/hack/bin/template-chart.sh | 2 +
 17 files changed, 52 insertions(+), 122 deletions(-)
 rename helm/prometheus-rules/templates/{shared => kaas/phoenix}/alerting-rules/dns-operator-azure.rules.yml (100%)
 rename helm/prometheus-rules/templates/{shared => kaas/turtles}/alerting-rules/certificate.all.rules.yml (99%)
 rename helm/prometheus-rules/templates/{shared => kaas/turtles}/alerting-rules/certificate.management-cluster.rules.yml (100%)
 rename helm/prometheus-rules/templates/{shared => kaas/turtles}/alerting-rules/certificate.workload-cluster.rules.yml (100%)
 rename helm/prometheus-rules/templates/{shared => kaas/turtles}/alerting-rules/management-cluster.rules.yml (88%)
 rename helm/prometheus-rules/templates/{shared => kaas/turtles}/recording-rules/kubernetes-mixins.rules.yml (100%)
 rename helm/prometheus-rules/templates/{shared => platform/atlas}/alerting-rules/deployment.management-cluster.rules.yml (97%)
 rename helm/prometheus-rules/templates/{shared => platform/atlas}/alerting-rules/deployment.workload-cluster.rules.yml (96%)
 rename helm/prometheus-rules/templates/{shared => platform/atlas}/alerting-rules/operatorkit.rules.yml (98%)
 rename helm/prometheus-rules/templates/{shared => platform/atlas}/recording-rules/grafana-cloud.rules.yml (100%)
 rename helm/prometheus-rules/templates/{shared => platform/atlas}/recording-rules/service-level.rules.yml (100%)
 delete mode 100644 helm/prometheus-rules/templates/shared/alerting-rules/microendpoint.rules.yml

diff --git a/CHANGELOG.md b/CHANGELOG.md
index ddfd70779..399918773 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -23,20 +23,21 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 - Moves cluster-autoscaler and vpa alerts to turtles.
 - Reviewed turtles alerts labels.
 - Use `ready` replicas for Kyverno webhooks alert.
-- Moves ownership of alerts for shared components to turtles.
+- Sort out shared alerts ownership by distributing them all to teams.
 
 ### Fixed
 
 - Fixed usage of yq, and jq in check-opsrecipes.sh
 - Fetch jq with make install-tools
-- Fix and improve the check-opsrecipes.sh script so support /_index.md based ops-recipes.
-- Fix cabbage alerts for multi-provider wcs.
-- Fix a few area labels in alerts.
-- Fix `cert-exporter` alerting.
+- Fixed and improved the check-opsrecipes.sh script to support /_index.md based ops-recipes.
+- Fixed cabbage alerts for multi-provider MCs.
+- Fixed all area alert labels.
+- Fixed `cert-exporter` alerts to page on all providers.
 - Fix `ManagementClusterDexAppMissing` use of absent for mimir.
 
 ### Removed
 
+- cleanup: get rid of microendpoint alerts as they never fired and probably never will
 - cleanup: remove scrape timeout inhibition leftovers (documentation and labels)
 
 ## [4.1.2] - 2024-05-31
diff --git a/Makefile.custom.mk b/Makefile.custom.mk
index deaad8df3..f7147898d 100644
--- a/Makefile.custom.mk
+++ b/Makefile.custom.mk
@@ -16,7 +16,6 @@ install-tools:
 	./test/hack/bin/fetch-tools.sh
 
 template-chart: install-tools ## prepare the helm chart
-	test/hack/bin/architect helm template --dir helm/prometheus-rules --dry-run
 	bash ./test/hack/bin/template-chart.sh
 
 test-rules: install-tools template-chart ## run unit tests for alerting rules
diff --git a/helm/prometheus-rules/templates/shared/alerting-rules/dns-operator-azure.rules.yml b/helm/prometheus-rules/templates/kaas/phoenix/alerting-rules/dns-operator-azure.rules.yml
similarity index 100%
rename from helm/prometheus-rules/templates/shared/alerting-rules/dns-operator-azure.rules.yml
rename to helm/prometheus-rules/templates/kaas/phoenix/alerting-rules/dns-operator-azure.rules.yml
diff --git a/helm/prometheus-rules/templates/shared/alerting-rules/certificate.all.rules.yml b/helm/prometheus-rules/templates/kaas/turtles/alerting-rules/certificate.all.rules.yml
similarity index 99%
rename from helm/prometheus-rules/templates/shared/alerting-rules/certificate.all.rules.yml
rename to helm/prometheus-rules/templates/kaas/turtles/alerting-rules/certificate.all.rules.yml
index a04fda923..2ed94092d 100644
--- a/helm/prometheus-rules/templates/shared/alerting-rules/certificate.all.rules.yml
+++ b/helm/prometheus-rules/templates/kaas/turtles/alerting-rules/certificate.all.rules.yml
@@ -42,7 +42,7 @@ spec:
         opsrecipe: kyverno-certificate-secret-will-expire-in-less-than-two-days/
       expr: (cert_exporter_secret_not_after{name=~".*kyverno.*"} - time()) < 2 * 24 * 60 * 60
       labels:
-        area: managedservices
+        area: kaas
         cancel_if_outside_working_hours: "true"
         severity: notify
         team: shield
diff --git a/helm/prometheus-rules/templates/shared/alerting-rules/certificate.management-cluster.rules.yml b/helm/prometheus-rules/templates/kaas/turtles/alerting-rules/certificate.management-cluster.rules.yml
similarity index 100%
rename from helm/prometheus-rules/templates/shared/alerting-rules/certificate.management-cluster.rules.yml
rename to helm/prometheus-rules/templates/kaas/turtles/alerting-rules/certificate.management-cluster.rules.yml
diff --git a/helm/prometheus-rules/templates/shared/alerting-rules/certificate.workload-cluster.rules.yml b/helm/prometheus-rules/templates/kaas/turtles/alerting-rules/certificate.workload-cluster.rules.yml
similarity index 100%
rename from helm/prometheus-rules/templates/shared/alerting-rules/certificate.workload-cluster.rules.yml
rename to helm/prometheus-rules/templates/kaas/turtles/alerting-rules/certificate.workload-cluster.rules.yml
diff --git a/helm/prometheus-rules/templates/shared/alerting-rules/management-cluster.rules.yml b/helm/prometheus-rules/templates/kaas/turtles/alerting-rules/management-cluster.rules.yml
similarity index 88%
rename from helm/prometheus-rules/templates/shared/alerting-rules/management-cluster.rules.yml
rename to helm/prometheus-rules/templates/kaas/turtles/alerting-rules/management-cluster.rules.yml
index 0576aacd3..dd3e05eed 100644
--- a/helm/prometheus-rules/templates/shared/alerting-rules/management-cluster.rules.yml
+++ b/helm/prometheus-rules/templates/kaas/turtles/alerting-rules/management-cluster.rules.yml
@@ -13,28 +13,27 @@ spec:
   groups:
   - name: management-cluster
     rules:
-    {{- if (eq .Values.managementCluster.provider.kind "aws") }}
     - alert: ManagementClusterHasLessThanThreeNodes
       annotations:
         description: '{{`Management cluster {{ $labels.cluster_id }} has less than 3 nodes.`}}'
         opsrecipe: management-cluster-less-than-three-workers/
-      expr: sum(kubelet_node_name{cluster_type="management_cluster"} * on (cluster_id, node) kube_node_role{role="worker", cluster_type="management_cluster"}) by (cluster_id) < 3
+      expr: sum(kubelet_node_name{cluster_type="management_cluster"} * on (cluster_id, node) kube_node_role{role="worker", cluster_type="management_cluster"}) by (cluster_id, installation, pipeline, provider) < 3
       for: 1h
       labels:
         area: kaas
         cancel_if_outside_working_hours: {{ include "workingHoursOnly" . }}
         severity: page
-        team: phoenix
+        team: {{ include "providerTeam" . }}
         topic: managementcluster
     - alert: ManagementClusterMissingNodes
       annotations:
         description: '{{`Management cluster {{ $labels.cluster_id }} has less than 4 minimum nodes.`}}'
-      expr: sum(kube_node_status_condition{cluster_type="management_cluster", condition="Ready", status="true"}) by (cluster_id) < 4
+      expr: sum(kube_node_status_condition{cluster_type="management_cluster", condition="Ready", status="true"}) by (cluster_id, installation, pipeline, provider) < 4
       for: 15m
       labels:
         area: kaas
         severity: notify
-        team: phoenix
+        team: {{ include "providerTeam" . }}
         topic: managementcluster
     - alert: ManagementClusterCPUUsageTooHigh
       annotations:
@@ -46,7 +45,7 @@ spec:
         area: kass
         cancel_if_outside_working_hours: "true"
         severity: page
-        team: phoenix
+        team: {{ include "providerTeam" . }}
         topic: managementcluster
     - alert: ManagementClusterMemoryUsageTooHigh
       annotations:
@@ -58,12 +57,12 @@ spec:
         area: kass
         cancel_if_outside_working_hours: "true"
         severity: page
-        team: phoenix
+        team: {{ include "providerTeam" . }}
         topic: managementcluster
     - alert: ManagementClusterPodLimitAlmostReached
       annotations:
         description: '{{`Cluster {{ $labels.cluster_id }} is almost exceeding its pod limit.`}}'
-      expr: (sum(kube_pod_info{cluster_type="management_cluster"}) by (cluster_id) / sum(kube_node_status_capacity{resource="pods", cluster_type="management_cluster"}) by (cluster_id)) > 0.8
+      expr: (sum(kube_pod_info{cluster_type="management_cluster"}) by (cluster_id, installation, pipeline, provider) / sum(kube_node_status_capacity{resource="pods", cluster_type="management_cluster"}) by (cluster_id, installation, pipeline, provider)) > 0.8
       for: 5m
       labels:
         area: kaas
@@ -72,8 +71,10 @@ spec:
         cancel_if_cluster_status_updating: "true"
         cancel_if_outside_working_hours: {{ include "workingHoursOnly" . }}
         severity: notify
-        team: phoenix
+        team: {{ include "providerTeam" . }}
         topic: managementcluster
+    {{- if (eq .Values.managementCluster.provider.kind "aws") }}
+    ## TODO Remove when all vintage clusters are gone
     - alert: ManagementClusterCriticalPodNotRunning
       annotations:
         description: '{{`Critical pod {{ $labels.namespace }}/{{ $labels.pod }} is not running.`}}'
diff --git a/helm/prometheus-rules/templates/shared/recording-rules/kubernetes-mixins.rules.yml b/helm/prometheus-rules/templates/kaas/turtles/recording-rules/kubernetes-mixins.rules.yml
similarity index 100%
rename from helm/prometheus-rules/templates/shared/recording-rules/kubernetes-mixins.rules.yml
rename to helm/prometheus-rules/templates/kaas/turtles/recording-rules/kubernetes-mixins.rules.yml
diff --git a/helm/prometheus-rules/templates/shared/alerting-rules/deployment.management-cluster.rules.yml b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/deployment.management-cluster.rules.yml
similarity index 97%
rename from helm/prometheus-rules/templates/shared/alerting-rules/deployment.management-cluster.rules.yml
rename to helm/prometheus-rules/templates/platform/atlas/alerting-rules/deployment.management-cluster.rules.yml
index 5a587e527..36ac26281 100644
--- a/helm/prometheus-rules/templates/shared/alerting-rules/deployment.management-cluster.rules.yml
+++ b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/deployment.management-cluster.rules.yml
@@ -20,7 +20,7 @@ spec:
       expr: kube_deployment_status_replicas_unavailable{cluster_type="management_cluster", deployment=~"alertmanager.*|grafana.*|prometheus.*|promxy.*|mimir.*|loki.*|object-storage.*|logging-operator.*|silence-operator.*|sloth.*"} > 0
       for: 30m
       labels:
-        area: kaas
+        area: platform
         cancel_if_cluster_status_creating: "true"
         cancel_if_cluster_status_deleting: "true"
         cancel_if_cluster_status_updating: "true"
@@ -35,7 +35,7 @@ spec:
       expr: kube_deployment_status_replicas_unavailable{cluster_type="management_cluster", deployment=~"app-admission-controller-.+|app-operator-.+|chart-operator-.+", cluster_id!~"argali|giraffe"} > 0
       for: 30m
       labels:
-        area: managedservices
+        area: platform
         cancel_if_cluster_status_creating: "true"
         cancel_if_cluster_status_deleting: "true"
         cancel_if_cluster_status_updating: "true"
@@ -50,7 +50,7 @@ spec:
       expr: kube_deployment_status_replicas_unavailable{cluster_type="management_cluster", deployment=~"app-admission-controller-.+|app-operator-.+|chart-operator-.+", cluster_id=~"argali|giraffe"} > 0
       for: 3h
       labels:
-        area: managedservices
+        area: platform
         severity: page
         team: honeybadger
         topic: managementcluster
@@ -61,7 +61,7 @@ spec:
       expr: kube_deployment_status_replicas_unavailable{cluster_type="management_cluster", deployment=~"kyverno.*", cluster_id!~"argali|giraffe"} > 0
       for: 30m
       labels:
-        area: managedservices
+        area: platform
         cancel_if_cluster_status_creating: "true"
         cancel_if_cluster_status_deleting: "true"
         cancel_if_cluster_status_updating: "true"
@@ -76,7 +76,7 @@ spec:
       expr: kube_deployment_status_replicas_unavailable{cluster_type="management_cluster", deployment=~"kyverno.*", cluster_id=~"argali|giraffe"} > 0
       for: 30m
       labels:
-        area: managedservices
+        area: platform
         cancel_if_cluster_status_creating: "true"
         cancel_if_cluster_status_deleting: "true"
         cancel_if_cluster_status_updating: "true"
@@ -91,7 +91,7 @@ spec:
       expr: label_join(kube_deployment_status_replicas_unavailable{cluster_type="management_cluster", deployment=~"aws-admission-controller.*|aws-operator-.+|cluster-operator-.+|cluster-api-core-webhook.*|event-exporter-.*|etcd-kubernetes-resources-count-exporter-.*|upgrade-schedule-operator.*|worker-.+|master-.+", cluster_id!~"argali|giraffe"}, "service", "/", "namespace", "deployment") > 0
deployment=~"aws-admission-controller.*|aws-operator-.+|cluster-operator-.+|cluster-api-core-webhook.*|event-exporter-.*|etcd-kubernetes-resources-count-exporter-.*|upgrade-schedule-operator.*|worker-.+|master-.+", cluster_id!~"argali|giraffe"}, "service", "/", "namespace", "deployment") > 0 for: 30m labels: - area: kaas + area: platform cancel_if_cluster_status_creating: "true" cancel_if_cluster_status_deleting: "true" cancel_if_cluster_status_updating: "true" @@ -105,7 +105,7 @@ spec: expr: kube_deployment_status_replicas_unavailable{cluster_type="management_cluster", deployment=~"aws-operator-.+|cluster-operator-.+|coredns-.+|event-exporter-.+|etcd-kubernetes-resources-count-exporter.*", cluster_id=~"argali|giraffe"} > 0 for: 3h labels: - area: kaas + area: platform severity: page team: {{ include "providerTeam" . }} topic: managementcluster @@ -116,7 +116,7 @@ spec: expr: kube_deployment_status_replicas_available{cluster_type="management_cluster", deployment=~"([a-z]*)-operator([a-z,-]*)",provider="aws"} + kube_deployment_status_replicas_unavailable{cluster_type="management_cluster", deployment=~"([a-z]*)-operator([a-z,-]*)",provider="aws"} == 0 for: 4h labels: - area: kaas + area: platform severity: notify team: phoenix topic: managementcluster @@ -127,7 +127,7 @@ spec: expr: kube_deployment_status_replicas_unavailable{cluster_type="management_cluster", deployment=~"aws-admission-controller.*|aws-operator.*|cluster-operator.*|cluster-api-core-webhook.*|event-exporter-.*|upgrade-schedule-operator.*|event-exporter-app.*", cluster_id=~"argali|giraffe"} > 0 for: 3h labels: - area: kaas + area: platform cancel_if_cluster_status_creating: "true" cancel_if_cluster_status_deleting: "true" cancel_if_cluster_status_updating: "true" @@ -142,7 +142,7 @@ spec: expr: kube_deployment_status_replicas_unavailable{cluster_type="management_cluster", deployment=~"(ingress-nginx|nginx-ingress-controller)-.+", cluster_id!~"argali|giraffe"} > 0 for: 30m labels: - area: kaas + area: platform cancel_if_cluster_status_creating: "true" cancel_if_cluster_status_deleting: "true" cancel_if_cluster_status_updating: "true" @@ -156,7 +156,7 @@ spec: expr: kube_deployment_status_replicas_unavailable{cluster_type="management_cluster", deployment=~"(ingress-nginx|nginx-ingress-controller|coredns)-.+", cluster_id=~"argali|giraffe"} > 0 for: 3h labels: - area: kaas + area: platform severity: page team: cabbage topic: managementcluster @@ -171,7 +171,7 @@ spec: {{- end }} for: 30m labels: - area: kaas + area: platform cancel_if_cluster_status_creating: "true" cancel_if_cluster_status_deleting: "true" cancel_if_cluster_status_updating: "true" diff --git a/helm/prometheus-rules/templates/shared/alerting-rules/deployment.workload-cluster.rules.yml b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/deployment.workload-cluster.rules.yml similarity index 96% rename from helm/prometheus-rules/templates/shared/alerting-rules/deployment.workload-cluster.rules.yml rename to helm/prometheus-rules/templates/platform/atlas/alerting-rules/deployment.workload-cluster.rules.yml index e0c8f0dcb..776df0011 100644 --- a/helm/prometheus-rules/templates/shared/alerting-rules/deployment.workload-cluster.rules.yml +++ b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/deployment.workload-cluster.rules.yml @@ -20,7 +20,7 @@ spec: expr: kube_deployment_status_replicas_unavailable{cluster_type="workload_cluster", deployment="chart-operator"} > 0 for: 30m labels: - area: managedservices + area: platform 
         cancel_if_outside_working_hours: {{ include "workingHoursOnly" . }}
         severity: page
         team: honeybadger
@@ -32,7 +32,7 @@ spec:
       expr: label_join(kube_deployment_status_replicas_unavailable{cluster_type="workload_cluster", deployment=~"metrics-server|vertical-pod-autoscaler(-app)?-admission-controller|vertical-pod-autoscaler(-app)?-recommender|vertical-pod-autoscaler(-app)?-updater|aws-pod-identity-webhook.*|cluster-autoscaler|aws-load-balancer-controller"}, "service", "/", "namespace", "deployment") > 0
       for: 30m
       labels:
-        area: kaas
+        area: platform
         cancel_if_outside_working_hours: {{ include "workingHoursOnly" . }}
         severity: page
         team: {{ include "providerTeam" . }}
@@ -44,7 +44,7 @@ spec:
       expr: label_join(kube_deployment_status_replicas_unavailable{cluster_type="workload_cluster", deployment="etcd-kubernetes-resources-count-exporter"}, "service", "/", "namespace", "deployment") > 0
       for: 30m
       labels:
-        area: kaas
+        area: platform
         cancel_if_prometheus_agent_down: "true"
         cancel_if_outside_working_hours: "true"
         severity: page
@@ -56,7 +56,7 @@ spec:
       expr: kube_deployment_status_replicas_available{cluster_type="workload_cluster", deployment="chart-operator"} + kube_deployment_status_replicas_unavailable{cluster_type="workload_cluster", deployment="chart-operator"} == 0
       for: 4h
       labels:
-        area: managedservices
+        area: platform
         cancel_if_outside_working_hours: {{ include "workingHoursOnly" . }}
         severity: notify
         team: honeybadger
@@ -67,7 +67,7 @@ spec:
      expr: kube_deployment_spec_replicas{cluster_type="workload_cluster", deployment=~"trivy-operator|starboard-exporter|jiralert"} == 0
       for: 4h
       labels:
-        area: managedservices
+        area: platform
         cancel_if_outside_working_hours: "true"
         severity: notify
         team: shield
@@ -79,7 +79,7 @@ spec:
       expr: kube_deployment_status_replicas_unavailable{cluster_type="workload_cluster", deployment=~"cert-manager-*|teleport-*|dex*|athena*|rbac-operator|credentiald"} > 0
       for: 30m
       labels:
-        area: kaas
+        area: platform
         cancel_if_cluster_status_creating: "true"
         cancel_if_cluster_status_deleting: "true"
         cancel_if_cluster_status_updating: "true"
diff --git a/helm/prometheus-rules/templates/shared/alerting-rules/operatorkit.rules.yml b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/operatorkit.rules.yml
similarity index 98%
rename from helm/prometheus-rules/templates/shared/alerting-rules/operatorkit.rules.yml
rename to helm/prometheus-rules/templates/platform/atlas/alerting-rules/operatorkit.rules.yml
index 4e1805ff2..a571c99af 100644
--- a/helm/prometheus-rules/templates/shared/alerting-rules/operatorkit.rules.yml
+++ b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/operatorkit.rules.yml
@@ -1,3 +1,4 @@
+# Atlas is the team responsible for operatorkit
 apiVersion: monitoring.coreos.com/v1
 kind: PrometheusRule
 metadata:
@@ -17,7 +18,7 @@ spec:
       expr: operatorkit_controller_error_total{pod=~"app-operator.*|chart-operator.*"} > 5
       for: 1m
       labels:
-        area: kaas
+        area: platform
         severity: notify
         team: honeybadger
         topic: qa
@@ -27,7 +28,7 @@ spec:
       expr: (time() - operatorkit_controller_last_reconciled{pod=~"app-operator.*|chart-operator.*"}) / 60 > 30
       for: 10m
       labels:
-        area: managedservices
+        area: platform
         severity: notify
         team: honeybadger
         topic: releng
diff --git a/helm/prometheus-rules/templates/shared/recording-rules/grafana-cloud.rules.yml b/helm/prometheus-rules/templates/platform/atlas/recording-rules/grafana-cloud.rules.yml
similarity index 100%
rename from helm/prometheus-rules/templates/shared/recording-rules/grafana-cloud.rules.yml
rename to helm/prometheus-rules/templates/platform/atlas/recording-rules/grafana-cloud.rules.yml
diff --git a/helm/prometheus-rules/templates/shared/recording-rules/service-level.rules.yml b/helm/prometheus-rules/templates/platform/atlas/recording-rules/service-level.rules.yml
similarity index 100%
rename from helm/prometheus-rules/templates/shared/recording-rules/service-level.rules.yml
rename to helm/prometheus-rules/templates/platform/atlas/recording-rules/service-level.rules.yml
diff --git a/helm/prometheus-rules/templates/shared/alerting-rules/microendpoint.rules.yml b/helm/prometheus-rules/templates/shared/alerting-rules/microendpoint.rules.yml
deleted file mode 100644
index 4577db4d5..000000000
--- a/helm/prometheus-rules/templates/shared/alerting-rules/microendpoint.rules.yml
+++ /dev/null
@@ -1,72 +0,0 @@
-apiVersion: monitoring.coreos.com/v1
-kind: PrometheusRule
-metadata:
-  creationTimestamp: null
-  labels:
-    {{- include "labels.common" . | nindent 4 }}
-  name: microendpoint.rules
-  namespace: {{ .Values.namespace }}
-spec:
-  groups:
-  - name: microendpoint
-    rules:
-    # replacing `version` with `reconciled_version` is only done if the latter
-    # is non-empty and is done to work with old operator versions using
-    # microendpoint < 0.1.0 (i.e. before VOO)
-    - alert: CollidingOperatorsAtlas
-      annotations:
-        description: '{{`CR version {{ $labels.version }} in cluster {{ $labels.cluster_id }} is reconciled by multiple apps including {{ $labels.app }}.`}}'
-        opsrecipe: multiple-operators-running-same-version/
-      expr: sum(label_replace(giantswarm_build_info{app=~"prometheus-meta-operator.*"}, "version", "$1", "reconciled_version", "(.+)")) by (app, cluster_id, installation, provider, pipeline, version) > 1
-      for: 5m
-      labels:
-        area: empowerment
-        cancel_if_outside_working_hours: {{ include "workingHoursOnly" . }}
-        severity: page
-        team: atlas
-        topic: releng
-    # replacing `version` with `reconciled_version` is only done if the latter
-    # is non-empty and is done to work with old operator versions using
-    # microendpoint < 0.1.0 (i.e. before VOO)
-    - alert: CollidingOperatorsHoneybadger
-      annotations:
-        description: '{{`CR version {{ $labels.version }} in cluster {{ $labels.cluster_id }} is reconciled by multiple apps including {{ $labels.app }}.`}}'
-        opsrecipe: multiple-operators-running-same-version/
-      expr: sum(label_replace(giantswarm_build_info{app=~"app-operator.*|chart-operator.*"}, "version", "$1", "reconciled_version", "(.+)")) by (app, cluster_id, installation, provider, pipeline, version) > 1
-      for: 5m
-      labels:
-        area: managedservices
-        cancel_if_outside_working_hours: {{ include "workingHoursOnly" . }}
-        severity: page
-        team: honeybadger
-        topic: releng
-    # replacing `version` with `reconciled_version` is only done if the latter
-    # is non-empty and is done to work with old operator versions using
-    # microendpoint < 0.1.0 (i.e. before VOO)
-    - alert: CollidingOperatorsAWS
-      annotations:
-        description: '{{`CR version {{ $labels.version }} in cluster {{ $labels.cluster_id }} is reconciled by multiple apps including {{ $labels.app }}.`}}'
-        opsrecipe: multiple-operators-running-same-version/
-      expr: sum(label_replace(giantswarm_build_info{app=~"aws-operator.*|cluster-operator.*"}, "version", "$1", "reconciled_version", "(.+)")) by (app, cluster_id, installation, provider, pipeline, version) > 1
-      for: 5m
-      labels:
-        area: kaas
-        cancel_if_outside_working_hours: {{ include "workingHoursOnly" . }}
-        severity: page
-        team: phoenix
-        topic: releng
-    # replacing `version` with `reconciled_version` is only done if the latter
-    # is non-empty and is done to work with old operator versions using
-    # microendpoint < 0.1.0 (i.e. before VOO)
-    - alert: CollidingOperatorsRocket
-      annotations:
-        description: '{{`CR version {{ $labels.version }} in cluster {{ $labels.cluster_id }} is reconciled by multiple apps including {{ $labels.app }}.`}}'
-        opsrecipe: multiple-operators-running-same-version/
-      expr: sum(label_replace(giantswarm_build_info{app=~"ignition-operator|cert-operator|node-operator"}, "version", "$1", "reconciled_version", "(.+)")) by (app, cluster_id, installation, provider, pipeline, version) > 1
-      for: 5m
-      labels:
-        area: kaas
-        cancel_if_outside_working_hours: {{ include "workingHoursOnly" . }}
-        severity: page
-        team: rocket
-        topic: releng
diff --git a/scripts/sync-kube-mixin.sh b/scripts/sync-kube-mixin.sh
index 85bf5490e..5850ab3c4 100755
--- a/scripts/sync-kube-mixin.sh
+++ b/scripts/sync-kube-mixin.sh
@@ -5,7 +5,7 @@ set -o nounset
 set -o pipefail
 
 TMPDIR="$(mktemp -d -t 'tmp.XXXXXXXXXX')"
-RULESFILE="helm/prometheus-rules/templates/shared/recording-rules/kubernetes-mixins.rules.yml"
+RULESFILE="helm/prometheus-rules/templates/kaas/turtles/recording-rules/kubernetes-mixins.rules.yml"
 
 trap 'cleanup' EXIT
diff --git a/test/conf/promtool_ignore b/test/conf/promtool_ignore
index 71a57dac8..2d3d2a071 100644
--- a/test/conf/promtool_ignore
+++ b/test/conf/promtool_ignore
@@ -9,6 +9,7 @@ kaas/phoenix/alerting-rules/calico.rules.yml
 kaas/phoenix/alerting-rules/capa.management-cluster.rules.yml
 kaas/phoenix/alerting-rules/cluster-service.rules.yml
 kaas/phoenix/alerting-rules/credentiald.rules.yml
+kaas/phoenix/alerting-rules/dns-operator-azure.rules.yml
 kaas/phoenix/alerting-rules/inhibit.aws.management-cluster.rules.yml
 kaas/phoenix/alerting-rules/inhibit.kiam.rules.yml
 kaas/phoenix/alerting-rules/kiam.rules.yml
@@ -23,6 +24,9 @@ kaas/turtles/alerting-rules/capi-machinedeployment.rules.yml
 kaas/turtles/alerting-rules/capi-machinepool.rules.yml
 kaas/turtles/alerting-rules/capi-machineset.rules.yml
 kaas/turtles/alerting-rules/capi.management-cluster.rules.yml
+kaas/turtles/alerting-rules/certificate.all.rules.yml
+kaas/turtles/alerting-rules/certificate.management-cluster.rules.yml
+kaas/turtles/alerting-rules/certificate.workload-cluster.rules.yml
 kaas/turtles/alerting-rules/cluster-autoscaler.rules.yml
 kaas/turtles/alerting-rules/docker.rules.yml
 kaas/turtles/alerting-rules/etcd.management-cluster.rules.yml
@@ -33,6 +37,7 @@ kaas/turtles/alerting-rules/inhibit.capi.rules.yml
 kaas/turtles/alerting-rules/inhibit.kubelet.rules.yml
 kaas/turtles/alerting-rules/job.rules.yml
 kaas/turtles/alerting-rules/kubelet.rules.yml
+kaas/turtles/alerting-rules/management-cluster.rules.yml
 kaas/turtles/alerting-rules/net-exporter.rules.yml
 kaas/turtles/alerting-rules/node-exporter.rules.yml
 kaas/turtles/alerting-rules/node.management-cluster.rules.yml
@@ -42,16 +47,22 @@ kaas/turtles/alerting-rules/storage.workload-cluster.rules.yml
 kaas/turtles/alerting-rules/systemd.rules.yml
 kaas/turtles/alerting-rules/timesync.rules.yml
 kaas/turtles/alerting-rules/vertical-pod-autoscaler.rules.yml
+kaas/turtles/recording-rules/kubernetes-mixins.rules.yml
+platform/atlas/alerting-rules/deployment.management-cluster.rules.yml
+platform/atlas/alerting-rules/deployment.workload-cluster.rules.yml
 platform/atlas/alerting-rules/fluentbit.rules.yml
 platform/atlas/alerting-rules/inhibit.oncall.rules.yml
 platform/atlas/alerting-rules/keda.rules.yml
 platform/atlas/alerting-rules/kube-state-metrics.rules.yml
+platform/atlas/alerting-rules/operatorkit.rules.yml
 platform/atlas/alerting-rules/prometheus-meta-operator.rules.yml
 platform/atlas/alerting-rules/prometheus-operator.rules.yml
 platform/atlas/alerting-rules/service-level.rules.yml
 platform/atlas/alerting-rules/storage.rules.yml
+platform/atlas/recording-rules/grafana-cloud.rules.yml
 platform/atlas/recording-rules/loki-mixins.rules.yml
 platform/atlas/recording-rules/mimir-mixins.rules.yml
+platform/atlas/recording-rules/service-level.rules.yml
 platform/cabbage/alerting-rules/coredns.rules.yml
 platform/cabbage/alerting-rules/external-dns.rules.yml
 platform/cabbage/alerting-rules/ingress-controller.rules.yml
@@ -64,16 +75,3 @@ platform/honeybadger/alerting-rules/secret.rules.yml
 platform/honeybadger/recording-rules/helm-operations.rules.yml
 platform/honeybadger/recording-rules/helm-operations.rules.yml
 platform/shield/alerting-rules/falco.rules.yml
-shared/alerting-rules/certificate.all.rules.yml
-shared/alerting-rules/certificate.management-cluster.rules.yml
-shared/alerting-rules/certificate.workload-cluster.rules.yml
-shared/alerting-rules/deployment.management-cluster.rules.yml
-shared/alerting-rules/deployment.workload-cluster.rules.yml
-shared/alerting-rules/dns-operator-azure.rules.yml
-shared/alerting-rules/management-cluster.rules.yml
-shared/alerting-rules/microendpoint.rules.yml
-shared/alerting-rules/operatorkit.rules.yml
-shared/alerting-rules/service-level.rules.yml
-shared/recording-rules/grafana-cloud.rules.yml
-shared/recording-rules/kubernetes-mixins.rules.yml
-shared/recording-rules/service-level.rules.yml
diff --git a/test/hack/bin/template-chart.sh b/test/hack/bin/template-chart.sh
index ab8edf5f8..155e858bf 100755
--- a/test/hack/bin/template-chart.sh
+++ b/test/hack/bin/template-chart.sh
@@ -7,6 +7,8 @@ main() {
   local -a providers
   mapfile -t providers <"$GIT_WORKDIR/test/conf/providers"
 
+  rm -rf "$GIT_WORKDIR"/test/hack/output/helm-chart/
+
   for provider in "${providers[@]}"; do
     echo "Templating chart for provider: $provider"