Sort out shared alerts ownership
Signed-off-by: QuentinBisson <[email protected]>
QuentinBisson committed Jun 11, 2024
1 parent ff29140 commit 9848718
Showing 17 changed files with 50 additions and 119 deletions.
11 changes: 6 additions & 5 deletions CHANGELOG.md
@@ -23,20 +23,21 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- Moves cluster-autoscaler and vpa alerts to turtles.
- Reviewed turtles alerts labels.
- Use `ready` replicas for Kyverno webhooks alert.
- Moves ownership of alerts for shared components to turtles.
- Sort out shared alert ownership by distributing them all to teams.

### Fixed

- Fixed usage of yq, and jq in check-opsrecipes.sh
- Fetch jq with make install-tools
- Fix and improve the check-opsrecipes.sh script so support <directory>/_index.md based ops-recipes.
- Fix cabbage alerts for multi-provider wcs.
- Fix a few area labels in alerts.
- Fix `cert-exporter` alerting.
- Fixed and improve the check-opsrecipes.sh script to support <directory>/_index.md based ops-recipes.
- Fixed cabbage alerts for multi-provider MCs.
- Fixed all area alert labels.
- Fixed `cert-exporter` alerts to page on all providers.
- Fix `ManagementClusterDexAppMissing` use of absent for mimir.

### Removed

- cleanup: get rid of microendpoint alerts as it never fired and probably never will
- cleanup: remove scrape timeout inhibition leftovers (documentation and labels)

## [4.1.2] - 2024-05-31
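
One of the CHANGELOG entries above fixes the use of `absent()` in `ManagementClusterDexAppMissing` for Mimir. `absent(selector)` returns a single synthetic series only when nothing matches the selector, so the selector needs enough labels to stay meaningful once rules are evaluated centrally in Mimir rather than on a per-cluster Prometheus. A minimal sketch of such a rule follows; the metric name, labels and duration are assumptions, not the expression shipped by this chart.

```yaml
# Hypothetical sketch - the real ManagementClusterDexAppMissing rule is not part of this diff.
groups:
  - name: dex.example
    rules:
      - alert: ManagementClusterDexAppMissing
        annotations:
          description: 'dex-app is not installed on the management cluster.'
        # absent() yields one synthetic series (value 1) only when the selector matches nothing,
        # so the selector carries explicit labels to stay unambiguous in a central Mimir ruler.
        expr: absent(app_operator_app_info{name="dex-app", cluster_type="management_cluster"})
        for: 1h
        labels:
          area: platform
          severity: notify
          team: atlas
          topic: dex
```
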
1 change: 0 additions & 1 deletion Makefile.custom.mk
@@ -16,7 +16,6 @@ install-tools:
./test/hack/bin/fetch-tools.sh

template-chart: install-tools ## prepare the helm chart
test/hack/bin/architect helm template --dir helm/prometheus-rules --dry-run
bash ./test/hack/bin/template-chart.sh

test-rules: install-tools template-chart ## run unit tests for alerting rules
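
The `test-rules` target above runs unit tests against the templated rules. The test harness itself is not part of this diff, but if it drives Prometheus' `promtool test rules`, a minimal rule-plus-test pair could look like the sketch below; the alert name, metric values and file names are invented for illustration.

```yaml
# Hypothetical example - neither file is taken from this repository.
# example.rules.yml
groups:
  - name: deployment.example
    rules:
      - alert: ExampleDeploymentMissingReplicas
        expr: kube_deployment_status_replicas_unavailable{cluster_type="management_cluster", deployment=~"kyverno.*"} > 0
        for: 30m
        labels:
          area: platform
          severity: page
          team: shield
---
# example.test.yml (run with: promtool test rules example.test.yml)
rule_files:
  - example.rules.yml
evaluation_interval: 1m
tests:
  - interval: 1m
    input_series:
      # One Kyverno replica stays unavailable for the whole window.
      - series: 'kube_deployment_status_replicas_unavailable{cluster_type="management_cluster", deployment="kyverno-admission-controller"}'
        values: '1+0x45'
    alert_rule_test:
      - eval_time: 35m   # past the 30m "for" duration
        alertname: ExampleDeploymentMissingReplicas
        exp_alerts:
          - exp_labels:
              cluster_type: management_cluster
              deployment: kyverno-admission-controller
              area: platform
              severity: page
              team: shield
```
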
@@ -42,7 +42,7 @@ spec:
opsrecipe: kyverno-certificate-secret-will-expire-in-less-than-two-days/
expr: (cert_exporter_secret_not_after{name=~".*kyverno.*"} - time()) < 2 * 24 * 60 * 60
labels:
area: managedservices
area: kaas
cancel_if_outside_working_hours: "true"
severity: notify
team: shield
@@ -13,7 +13,6 @@ spec:
groups:
- name: management-cluster
rules:
{{- if (eq .Values.managementCluster.provider.kind "aws") }}
- alert: ManagementClusterHasLessThanThreeNodes
annotations:
description: '{{`Management cluster {{ $labels.cluster_id }} has less than 3 nodes.`}}'
@@ -24,7 +23,7 @@ spec:
area: kaas
cancel_if_outside_working_hours: {{ include "workingHoursOnly" . }}
severity: page
team: phoenix
team: {{ include "providerTeam" . }}
topic: managementcluster
- alert: ManagementClusterMissingNodes
annotations:
@@ -34,7 +33,7 @@
labels:
area: kaas
severity: notify
team: phoenix
team: {{ include "providerTeam" . }}
topic: managementcluster
- alert: ManagementClusterCPUUsageTooHigh
annotations:
@@ -46,7 +45,7 @@
area: kaas
cancel_if_outside_working_hours: "true"
severity: page
team: phoenix
team: {{ include "providerTeam" . }}
topic: managementcluster
- alert: ManagementClusterMemoryUsageTooHigh
annotations:
@@ -58,7 +57,7 @@
area: kaas
cancel_if_outside_working_hours: "true"
severity: page
team: phoenix
team: {{ include "providerTeam" . }}
topic: managementcluster
- alert: ManagementClusterPodLimitAlmostReached
annotations:
@@ -72,8 +71,10 @@
cancel_if_cluster_status_updating: "true"
cancel_if_outside_working_hours: {{ include "workingHoursOnly" . }}
severity: notify
team: phoenix
team: {{ include "providerTeam" . }}
topic: managementcluster
{{- if (eq .Values.managementCluster.provider.kind "aws") }}
## TODO Remove when all vintage clusters are gone
- alert: ManagementClusterCriticalPodNotRunning
annotations:
description: '{{`Critical pod {{ $labels.namespace }}/{{ $labels.pod }} is not running.`}}'
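
In the hunks above, several hard-coded `team: phoenix` labels become `team: {{ include "providerTeam" . }}`, and the node and capacity alerts no longer sit behind an AWS-only guard. The helper template itself is not included in this commit; below is a minimal sketch of what such a named template could look like (e.g. in the chart's `_helpers.tpl`), with an assumed provider-to-team mapping.

```
{{/* Hypothetical sketch - the actual providerTeam helper and its mapping are not shown in this commit. */}}
{{- define "providerTeam" -}}
{{- if eq .Values.managementCluster.provider.kind "aws" -}}
phoenix
{{- else -}}
turtles
{{- end -}}
{{- end -}}
```

With a definition like this, `team: {{ include "providerTeam" . }}` would render to `team: phoenix` on an AWS management cluster and fall back to the assumed default on other providers.
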
@@ -20,7 +20,7 @@ spec:
expr: kube_deployment_status_replicas_unavailable{cluster_type="management_cluster", deployment=~"alertmanager.*|grafana.*|prometheus.*|promxy.*|mimir.*|loki.*|object-storage.*|logging-operator.*|silence-operator.*|sloth.*"} > 0
for: 30m
labels:
area: kaas
area: platform
cancel_if_cluster_status_creating: "true"
cancel_if_cluster_status_deleting: "true"
cancel_if_cluster_status_updating: "true"
@@ -35,7 +35,7 @@
expr: kube_deployment_status_replicas_unavailable{cluster_type="management_cluster", deployment=~"app-admission-controller-.+|app-operator-.+|chart-operator-.+", cluster_id!~"argali|giraffe"} > 0
for: 30m
labels:
area: managedservices
area: platform
cancel_if_cluster_status_creating: "true"
cancel_if_cluster_status_deleting: "true"
cancel_if_cluster_status_updating: "true"
@@ -50,7 +50,7 @@
expr: kube_deployment_status_replicas_unavailable{cluster_type="management_cluster", deployment=~"app-admission-controller-.+|app-operator-.+|chart-operator-.+", cluster_id=~"argali|giraffe"} > 0
for: 3h
labels:
area: managedservices
area: platform
severity: page
team: honeybadger
topic: managementcluster
@@ -61,7 +61,7 @@
expr: kube_deployment_status_replicas_unavailable{cluster_type="management_cluster", deployment=~"kyverno.*", cluster_id!~"argali|giraffe"} > 0
for: 30m
labels:
area: managedservices
area: platform
cancel_if_cluster_status_creating: "true"
cancel_if_cluster_status_deleting: "true"
cancel_if_cluster_status_updating: "true"
@@ -76,7 +76,7 @@
expr: kube_deployment_status_replicas_unavailable{cluster_type="management_cluster", deployment=~"kyverno.*", cluster_id=~"argali|giraffe"} > 0
for: 30m
labels:
area: managedservices
area: platform
cancel_if_cluster_status_creating: "true"
cancel_if_cluster_status_deleting: "true"
cancel_if_cluster_status_updating: "true"
@@ -91,7 +91,7 @@
expr: label_join(kube_deployment_status_replicas_unavailable{cluster_type="management_cluster", deployment=~"aws-admission-controller.*|aws-operator-.+|cluster-operator-.+|cluster-api-core-webhook.*|event-exporter-.*|etcd-kubernetes-resources-count-exporter-.*|upgrade-schedule-operator.*|worker-.+|master-.+", cluster_id!~"argali|giraffe"}, "service", "/", "namespace", "deployment") > 0
for: 30m
labels:
area: kaas
area: platform
cancel_if_cluster_status_creating: "true"
cancel_if_cluster_status_deleting: "true"
cancel_if_cluster_status_updating: "true"
@@ -105,7 +105,7 @@
expr: kube_deployment_status_replicas_unavailable{cluster_type="management_cluster", deployment=~"aws-operator-.+|cluster-operator-.+|coredns-.+|event-exporter-.+|etcd-kubernetes-resources-count-exporter.*", cluster_id=~"argali|giraffe"} > 0
for: 3h
labels:
area: kaas
area: platform
severity: page
team: {{ include "providerTeam" . }}
topic: managementcluster
@@ -116,7 +116,7 @@
expr: kube_deployment_status_replicas_available{cluster_type="management_cluster", deployment=~"([a-z]*)-operator([a-z,-]*)",provider="aws"} + kube_deployment_status_replicas_unavailable{cluster_type="management_cluster", deployment=~"([a-z]*)-operator([a-z,-]*)",provider="aws"} == 0
for: 4h
labels:
area: kaas
area: platform
severity: notify
team: phoenix
topic: managementcluster
@@ -127,7 +127,7 @@
expr: kube_deployment_status_replicas_unavailable{cluster_type="management_cluster", deployment=~"aws-admission-controller.*|aws-operator.*|cluster-operator.*|cluster-api-core-webhook.*|event-exporter-.*|upgrade-schedule-operator.*|event-exporter-app.*", cluster_id=~"argali|giraffe"} > 0
for: 3h
labels:
area: kaas
area: platform
cancel_if_cluster_status_creating: "true"
cancel_if_cluster_status_deleting: "true"
cancel_if_cluster_status_updating: "true"
@@ -142,7 +142,7 @@
expr: kube_deployment_status_replicas_unavailable{cluster_type="management_cluster", deployment=~"(ingress-nginx|nginx-ingress-controller)-.+", cluster_id!~"argali|giraffe"} > 0
for: 30m
labels:
area: kaas
area: platform
cancel_if_cluster_status_creating: "true"
cancel_if_cluster_status_deleting: "true"
cancel_if_cluster_status_updating: "true"
@@ -156,7 +156,7 @@
expr: kube_deployment_status_replicas_unavailable{cluster_type="management_cluster", deployment=~"(ingress-nginx|nginx-ingress-controller|coredns)-.+", cluster_id=~"argali|giraffe"} > 0
for: 3h
labels:
area: kaas
area: platform
severity: page
team: cabbage
topic: managementcluster
@@ -171,7 +171,7 @@
{{- end }}
for: 30m
labels:
area: kaas
area: platform
cancel_if_cluster_status_creating: "true"
cancel_if_cluster_status_deleting: "true"
cancel_if_cluster_status_updating: "true"
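
These management-cluster deployment alerts keep their `cancel_if_cluster_status_*` and `cancel_if_outside_working_hours` labels while moving to `area: platform`. The labels are only a convention; whatever honours them lives outside this commit. One common implementation is an Alertmanager inhibition rule, sketched below with an assumed status-alert name.

```yaml
# Hypothetical Alertmanager snippet - not this installation's actual configuration.
inhibit_rules:
  # While an "updating" status alert fires for a cluster, mute every alert on the
  # same cluster that opted in via cancel_if_cluster_status_updating="true".
  - source_matchers:
      - alertname="ClusterStatusUpdating"   # assumed name of the status alert
    target_matchers:
      - cancel_if_cluster_status_updating="true"
    equal:
      - cluster_id
```
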
@@ -20,7 +20,7 @@ spec:
expr: kube_deployment_status_replicas_unavailable{cluster_type="workload_cluster", deployment="chart-operator"} > 0
for: 30m
labels:
area: managedservices
area: platform
cancel_if_outside_working_hours: {{ include "workingHoursOnly" . }}
severity: page
team: honeybadger
@@ -32,7 +32,7 @@ spec:
expr: label_join(kube_deployment_status_replicas_unavailable{cluster_type="workload_cluster", deployment=~"metrics-server|vertical-pod-autoscaler(-app)?-admission-controller|vertical-pod-autoscaler(-app)?-recommender|vertical-pod-autoscaler(-app)?-updater|aws-pod-identity-webhook.*|cluster-autoscaler|aws-load-balancer-controller"}, "service", "/", "namespace", "deployment") > 0
for: 30m
labels:
area: kaas
area: platform
cancel_if_outside_working_hours: {{ include "workingHoursOnly" . }}
severity: page
team: {{ include "providerTeam" . }}
@@ -44,7 +44,7 @@ spec:
expr: label_join(kube_deployment_status_replicas_unavailable{cluster_type="workload_cluster", deployment="etcd-kubernetes-resources-count-exporter"}, "service", "/", "namespace", "deployment") > 0
for: 30m
labels:
area: kaas
area: platform
cancel_if_prometheus_agent_down: "true"
cancel_if_outside_working_hours: "true"
severity: page
@@ -56,7 +56,7 @@ spec:
expr: kube_deployment_status_replicas_available{cluster_type="workload_cluster", deployment="chart-operator"} + kube_deployment_status_replicas_unavailable{cluster_type="workload_cluster", deployment="chart-operator"} == 0
for: 4h
labels:
area: managedservices
area: platform
cancel_if_outside_working_hours: {{ include "workingHoursOnly" . }}
severity: notify
team: honeybadger
@@ -67,7 +67,7 @@ spec:
expr: kube_deployment_spec_replicas{cluster_type="workload_cluster", deployment=~"trivy-operator|starboard-exporter|jiralert"} == 0
for: 4h
labels:
area: managedservices
area: platform
cancel_if_outside_working_hours: "true"
severity: notify
team: shield
@@ -79,7 +79,7 @@ spec:
expr: kube_deployment_status_replicas_unavailable{cluster_type="workload_cluster", deployment=~"cert-manager-*|teleport-*|dex*|athena*|rbac-operator|credentiald"} > 0
for: 30m
labels:
area: kaas
area: platform
cancel_if_cluster_status_creating: "true"
cancel_if_cluster_status_deleting: "true"
cancel_if_cluster_status_updating: "true"
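
As in the management-cluster file, the workload-cluster alerts are assigned to owners purely through `team` labels. That is what lets the routing layer deliver each alert to the right team without per-alert wiring; the snippet below is an illustrative Alertmanager routing tree keyed on that label, with placeholder receiver names rather than this installation's real configuration.

```yaml
# Hypothetical routing sketch - receiver names are placeholders.
route:
  receiver: fallback
  group_by: [alertname, cluster_id]
  routes:
    - matchers:
        - team="honeybadger"
      receiver: team-honeybadger
    - matchers:
        - team="shield"
      receiver: team-shield
    - matchers:
        - team=~"phoenix|turtles"
      receiver: team-providers
receivers:
  - name: fallback
  - name: team-honeybadger
  - name: team-shield
  - name: team-providers
```
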
@@ -1,3 +1,4 @@
# Atlas is the team responsible for the operatorkit
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
@@ -17,7 +18,7 @@ spec:
expr: operatorkit_controller_error_total{pod=~"app-operator.*|chart-operator.*"} > 5
for: 1m
labels:
area: kaas
area: platform
severity: notify
team: honeybadger
topic: qa
Expand All @@ -27,7 +28,7 @@ spec:
expr: (time() - operatorkit_controller_last_reconciled{pod=~"app-operator.*|chart-operator.*"}) / 60 > 30
for: 10m
labels:
area: managedservices
area: platform
severity: notify
team: honeybadger
topic: releng

This file was deleted.

2 changes: 1 addition & 1 deletion scripts/sync-kube-mixin.sh
@@ -5,7 +5,7 @@ set -o nounset
set -o pipefail

TMPDIR="$(mktemp -d -t 'tmp.XXXXXXXXXX')"
RULESFILE="helm/prometheus-rules/templates/shared/recording-rules/kubernetes-mixins.rules.yml"
RULESFILE="helm/prometheus-rules/templates/kaas/turtles/recording-rules/kubernetes-mixins.rules.yml"

trap 'cleanup' EXIT
