Commit 4368f1f: Merge branch 'master' into fix-cilium-rules-for-mimir

QuentinBisson authored May 6, 2024
2 parents d789a59 + be49a73
Showing 7 changed files with 39 additions and 21 deletions.
CHANGELOG.md: 9 additions & 0 deletions
@@ -15,8 +15,17 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

- Remove cilium entry from KAAS SLOs.
- Fix cilium related alerts for mimir.
- Fix etcd alerts for Mimir.
- Add missing labels for apiserver alerts.

### Added

- Add `cluster_control_plane_unhealthy` inhibition.
- Add inhibition expressions for CAPI clusters.

## [3.13.1] - 2024-04-30

### Removed
@@ -41,7 +41,7 @@ spec:
annotations:
description: '{{`Deployment {{ $labels.namespace}}/{{ $labels.deployment }} is not satisfied.`}}'
opsrecipe: workload-cluster-deployment-not-satisfied/
- expr: label_join(kube_deployment_status_replicas_unavailable{cluster_type="workload_cluster", deployment=~"etcd-kubernetes-resources-count-exporter"}, "service", "/", "namespace", "deployment") > 0
+ expr: label_join(kube_deployment_status_replicas_unavailable{cluster_type="workload_cluster", deployment="etcd-kubernetes-resources-count-exporter"}, "service", "/", "namespace", "deployment") > 0
for: 30m
labels:
area: kaas
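The recurring change in this commit swaps PromQL regex matchers (`=~`, `!~`) for exact matchers (`=`, `!=`) wherever the value is a single literal, as in the `deployment` selector above. A minimal sketch of the difference (the `up` selectors here are only for illustration):

```promql
# Regex matchers compile and evaluate a regular expression per series.
# They are fully anchored, so this matches only the literal value "eks":
up{provider!~"eks"}

# An exact matcher does a plain string comparison instead, which is
# cheaper and states the intent directly:
up{provider!="eks"}

# Regex matchers remain necessary for real alternations, e.g.:
up{container!~"loki|promtail"}
```

The `container!~"loki|promtail"` selector in the workload-cluster rules below is the counterexample: it genuinely needs a regex, so it is left untouched.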
@@ -32,7 +32,7 @@ spec:
annotations:
description: '{{`Etcd ({{ $labels.instance }}) has a too high commit duration.`}}'
opsrecipe: etcd-high-commit-duration/
- expr: histogram_quantile(0.95, rate(etcd_disk_backend_commit_duration_seconds_bucket{cluster_type="management_cluster", provider!~"eks"}[5m])) > 1.0
+ expr: histogram_quantile(0.95, rate(etcd_disk_backend_commit_duration_seconds_bucket{cluster_type="management_cluster", provider!="eks"}[5m])) > 1.0
for: 15m
labels:
area: kaas
@@ -44,7 +44,7 @@
annotations:
description: '{{`Etcd ({{ $labels.instance }}) has a too large database.`}}'
opsrecipe: etcd-db-size-too-large/
- expr: (etcd_mvcc_db_total_size_in_bytes{cluster_type="management_cluster", provider!~"eks"} / etcd_server_quota_backend_bytes{cluster_type="management_cluster", provider!~"eks"}) * 100 > 80
+ expr: (etcd_mvcc_db_total_size_in_bytes{cluster_type="management_cluster", provider!="eks"} / etcd_server_quota_backend_bytes{cluster_type="management_cluster", provider!="eks"}) * 100 > 80
for: 90m
labels:
area: kaas
@@ -55,7 +55,7 @@
- alert: ManagementClusterEtcdNumberOfLeaderChangesTooHigh
annotations:
description: '{{`Etcd has too many leader changes.`}}'
- expr: increase(etcd_server_leader_changes_seen_total{cluster_type="management_cluster", provider!~"eks"}[1h]) > 8
+ expr: increase(etcd_server_leader_changes_seen_total{cluster_type="management_cluster", provider!="eks"}[1h]) > 8
labels:
area: kaas
cancel_if_outside_working_hours: {{ include "workingHoursOnly" . }}
@@ -66,7 +66,7 @@
annotations:
description: '{{`Etcd has no leader.`}}'
opsrecipe: etcd-has-no-leader/
- expr: etcd_server_has_leader{cluster_type="management_cluster", provider!~"eks"} == 0
+ expr: etcd_server_has_leader{cluster_type="management_cluster", provider!="eks"} == 0
for: 5m
labels:
area: kaas
@@ -78,7 +78,7 @@
annotations:
description: '{{`Etcd metrics missing for {{ $labels.cluster_id }}.`}}'
opsrecipe: etcd-metrics-missing/
expr: count(up{cluster_type="management_cluster", provider!~"eks"}) by (cluster_id) unless count(etcd_server_id{cluster_type="management_cluster", provider!~"eks"}) by (cluster_id)
expr: count(up{cluster_type="management_cluster", provider!="eks"}) by (cluster_id, installation, pipeline, provider) unless count(etcd_server_id{cluster_type="management_cluster", provider!="eks"}) by (cluster_id, installation, pipeline, provider)
for: 1h
labels:
area: kaas
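The other half of the etcd fixes widens the `by (...)` clauses. Aggregation drops every label not listed there, and `unless` then matches series on their full remaining label sets, so both sides must name the same labels; the extra `installation`, `pipeline`, and `provider` labels also survive into the alert itself, where routing and inhibition can use them. A stripped-down sketch of the mechanics (matchers omitted for brevity):

```promql
# Left side: one series per cluster that reports any `up` metric.
# Right side: one series per cluster that reports etcd_server_id.
# `unless` keeps left-hand series with no exact label-set match on the
# right, i.e. clusters that are up but expose no etcd metrics.
count(up) by (cluster_id, installation, pipeline, provider)
unless
count(etcd_server_id) by (cluster_id, installation, pipeline, provider)
```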
@@ -17,7 +17,7 @@ spec:
annotations:
description: '{{`Etcd ({{ $labels.instance }}) on workload cluster {{ $labels.cluster_id }} is down.`}}'
opsrecipe: etcd-down/
expr: up{cluster_type="workload_cluster", app="etcd", provider!~"eks"} == 0
expr: up{cluster_type="workload_cluster", app="etcd", provider!="eks"} == 0
for: 20m
labels:
area: kaas
@@ -33,7 +33,7 @@
annotations:
description: '{{`Etcd ({{ $labels.instance }}) has a too high commit duration.`}}'
opsrecipe: etcd-high-commit-duration/
- expr: histogram_quantile(0.95, rate(etcd_disk_backend_commit_duration_seconds_bucket{cluster_type="workload_cluster", provider!~"eks"}[5m])) > 1.0
+ expr: histogram_quantile(0.95, rate(etcd_disk_backend_commit_duration_seconds_bucket{cluster_type="workload_cluster", provider!="eks"}[5m])) > 1.0
for: 15m
labels:
area: kaas
@@ -45,7 +45,7 @@
annotations:
description: '{{`Etcd ({{ $labels.instance }}) has a too large database.`}}'
opsrecipe: etcd-db-size-too-large/
- expr: (etcd_mvcc_db_total_size_in_bytes{cluster_type="workload_cluster", provider!~"eks"} / etcd_server_quota_backend_bytes{cluster_type="workload_cluster", provider!~"eks"}) * 100 > 80
+ expr: (etcd_mvcc_db_total_size_in_bytes{cluster_type="workload_cluster", provider!="eks"} / etcd_server_quota_backend_bytes{cluster_type="workload_cluster", provider!="eks"}) * 100 > 80
for: 15m
labels:
area: kaas
@@ -56,7 +56,7 @@
- alert: WorkloadClusterEtcdNumberOfLeaderChangesTooHigh
annotations:
description: '{{`Etcd has too many leader changes.`}}'
- expr: increase(etcd_server_leader_changes_seen_total{cluster_type="workload_cluster", provider!~"eks"}[1h]) > 8
+ expr: increase(etcd_server_leader_changes_seen_total{cluster_type="workload_cluster", provider!="eks"}[1h]) > 8
labels:
area: kaas
severity: notify
@@ -66,7 +66,7 @@
annotations:
description: '{{`Etcd has no leader.`}}'
opsrecipe: etcd-has-no-leader/
- expr: etcd_server_has_leader{cluster_type="workload_cluster", container!~"loki|promtail", provider!~"eks"} == 0
+ expr: etcd_server_has_leader{cluster_type="workload_cluster", container!~"loki|promtail", provider!="eks"} == 0
for: 35m
labels:
area: kaas
@@ -78,7 +78,7 @@
annotations:
description: '{{`Etcd metrics missing for {{ $labels.cluster_id }}.`}}'
opsrecipe: etcd-metrics-missing/
expr: count(up{cluster_type="workload_cluster", provider!~"eks"}) by (cluster_id) unless count(etcd_server_id{cluster_type="workload_cluster", provider!~"eks"}) by (cluster_id)
expr: count(up{cluster_type="workload_cluster", provider!="eks"}) by (cluster_id, installation, pipeline, provider) unless count(etcd_server_id{cluster_type="workload_cluster", provider!="eks"}) by (cluster_id, installation, pipeline, provider)
for: 1h
labels:
area: kaas
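The commit-duration alerts in both etcd files rely on the standard percentile-from-histogram pattern; nothing in it changed here apart from the matcher, but it is worth spelling out once:

```promql
# Each *_bucket series is a cumulative counter per upper bound (the `le`
# label). rate() converts those counters into per-second increase rates,
# and histogram_quantile() interpolates the 95th percentile from the
# resulting bucket distribution, treating series that differ only in
# `le` as one histogram.
histogram_quantile(
  0.95,
  rate(etcd_disk_backend_commit_duration_seconds_bucket[5m])
)
```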
@@ -30,7 +30,7 @@ spec:
annotations:
description: '{{`Latest successful etcd backup for {{ $labels.cluster_id }}/{{ $labels.tenant_cluster_id }} was more than 48h ago.`}}'
opsrecipe: etcd-backup-failed/
- expr: count(label_replace(capi_cluster_created, "tenant_cluster_id", "$1", "name", "(.*)")) by (tenant_cluster_id) > 48 * 60 * 60 unless count((time() - etcd_backup_latest_success{tenant_cluster_id!="Control Plane"}) > 48 * 60 * 60) by (tenant_cluster_id)
+ expr: count(label_replace(capi_cluster_created, "tenant_cluster_id", "$1", "name", "(.*)")) by (cluster_id, installation, pipeline, provider, tenant_cluster_id) > 48 * 60 * 60 unless count((time() - etcd_backup_latest_success{tenant_cluster_id!="Control Plane"}) > 48 * 60 * 60) by (cluster_id, installation, pipeline, provider, tenant_cluster_id)
for: 5m
labels:
area: kaas
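This backup alert joins two metrics that name clusters differently: `capi_cluster_created` carries the cluster name in its `name` label, while the backup metric uses `tenant_cluster_id`. The `label_replace` call bridges the two before the `unless`:

```promql
# label_replace() writes a new label derived from an existing one and
# leaves the series values untouched. "$1" is the first capture group of
# the fully anchored regex "(.*)" applied to the `name` label, so the
# cluster name is copied verbatim into `tenant_cluster_id`.
label_replace(capi_cluster_created, "tenant_cluster_id", "$1", "name", "(.*)")
```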
@@ -25,7 +25,7 @@ spec:
annotations:
description: '{{`Cluster {{ $labels.installation }}/{{ $labels.cluster_id }}: there are too many API requests for flow schema {{ $labels.flow_schema }}.`}}'
opsrecipe: flowcontrol-rejected-requests/
- expr: sum(irate(apiserver_flowcontrol_dispatched_requests_total[1m])) by (cluster_id, priority_level) > (min by(cluster_id, priority_level) (apiserver_flowcontrol_request_concurrency_limit_overwrite) OR min by(cluster_id, priority_level) (apiserver_flowcontrol_request_concurrency_limit))
+ expr: sum(irate(apiserver_flowcontrol_dispatched_requests_total[1m])) by (cluster_id, installation, pipeline, provider, flow_schema, priority_level) > (min by(cluster_id, installation, pipeline, provider, flow_schema, priority_level) (apiserver_flowcontrol_request_concurrency_limit_overwrite) OR min by(cluster_id, installation, pipeline, provider, flow_schema, priority_level) (apiserver_flowcontrol_request_concurrency_limit))
for: 15m
labels:
area: kaas
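The flow-control fix adds `flow_schema` (which the description interpolates) plus the usual external labels to every `by` clause, so the comparison and the `OR` fallback keep matching. PromQL's `or` is itself a fallback operator, which is how the per-cluster limit overwrite is wired in; a sketch of just that part:

```promql
# `or` returns all series from the left-hand side, plus right-hand series
# whose label sets have no match on the left. Read as a fallback: use the
# overwrite where one exists, the default limit everywhere else.
min by (cluster_id, flow_schema, priority_level) (apiserver_flowcontrol_request_concurrency_limit_overwrite)
or
min by (cluster_id, flow_schema, priority_level) (apiserver_flowcontrol_request_concurrency_limit)
```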
@@ -17,7 +17,7 @@ spec:
- alert: InhibitionClusterStatusCreating
annotations:
description: '{{`Cluster {{ $labels.cluster_id }} has status condition ''Creating''.`}}'
expr: label_replace(max_over_time(statusresource_cluster_status{status="Creating"}[30m]), "cluster_id", "$1", "exported_cluster_id", "(.+)") == 1 or max_over_time(cluster_operator_cluster_status{status="Creating"}[30m]) == 1
expr: label_replace(max_over_time(statusresource_cluster_status{status="Creating"}[30m]), "cluster_id", "$1", "exported_cluster_id", "(.+)") == 1 or max_over_time(cluster_operator_cluster_status{status="Creating"}[30m]) == 1 or max_over_time(capi_cluster_status_phase{phase="Provisioning"}[30m]) == 1
labels:
area: kaas
cluster_status_creating: "true"
@@ -26,7 +26,7 @@
- alert: InhibitionClusterStatusCreated
annotations:
description: '{{`Cluster {{ $labels.cluster_id }} has status condition ''Created''.`}}'
expr: label_replace(statusresource_cluster_status{status="Created"}, "cluster_id", "$1", "exported_cluster_id", "(.+)") == 1 or cluster_operator_cluster_status{status="Created"} == 1
expr: label_replace(statusresource_cluster_status{status="Created"}, "cluster_id", "$1", "exported_cluster_id", "(.+)") == 1 or cluster_operator_cluster_status{status="Created"} == 1 or (capi_cluster_status_phase{phase="Provisioned"} == 1 and capi_cluster_status_condition{type="Ready", status="True"} == 1)
labels:
area: kaas
cluster_status_created: "true"
@@ -35,7 +35,7 @@
- alert: InhibitionClusterStatusUpdating
annotations:
description: '{{`Cluster {{ $labels.cluster_id }} has status condition ''Updating''.`}}'
expr: label_replace(statusresource_cluster_status{status="Updating"}, "cluster_id", "$1", "exported_cluster_id", "(.+)") == 1 or cluster_operator_cluster_status{status="Updating"} == 1 or label_replace(changes(statusresource_cluster_status{status="Updating"}[10m]), "cluster_id", "$1", "exported_cluster_id", "(.+)") == 1 or changes(cluster_operator_cluster_status{status="Updating"}[10m]) == 1
expr: label_replace(statusresource_cluster_status{status="Updating"}, "cluster_id", "$1", "exported_cluster_id", "(.+)") == 1 or cluster_operator_cluster_status{status="Updating"} == 1 or label_replace(changes(statusresource_cluster_status{status="Updating"}[10m]), "cluster_id", "$1", "exported_cluster_id", "(.+)") == 1 or changes(cluster_operator_cluster_status{status="Updating"}[10m]) == 1 or (capi_cluster_status_condition{type="Ready", status="False"} == 1 and (capi_kubeadmcontrolplane_status_condition{type="MachinesSpecUpToDate", status="False"} == 1 or capi_kubeadmcontrolplane_status_condition{type="MachinesReady", status="False"} == 1))
labels:
area: kaas
cluster_status_updating: "true"
@@ -44,7 +44,7 @@
- alert: InhibitionClusterStatusUpdated
annotations:
description: '{{`Cluster {{ $labels.cluster_id }} has status condition ''Updated''.`}}'
expr: label_replace(statusresource_cluster_status{status="Updated"}, "cluster_id", "$1", "exported_cluster_id", "(.+)") == 1 or cluster_operator_cluster_status{status="Updated"} == 1
expr: label_replace(statusresource_cluster_status{status="Updated"}, "cluster_id", "$1", "exported_cluster_id", "(.+)") == 1 or cluster_operator_cluster_status{status="Updated"} == 1 or (capi_cluster_status_condition{type="Ready", status="True"} == 1 and (capi_kubeadmcontrolplane_status_condition{type="MachinesSpecUpToDate", status="True"} == 1 and capi_kubeadmcontrolplane_status_condition{type="MachinesReady", status="True"} == 1))
labels:
area: kaas
cluster_status_updated: "true"
@@ -53,7 +53,7 @@
- alert: InhibitionClusterStatusDeleting
annotations:
description: '{{`Cluster {{ $labels.cluster_id }} has status condition ''Deleting''.`}}'
expr: label_replace(max_over_time(statusresource_cluster_status{status="Deleting"}[30m]), "cluster_id", "$1", "exported_cluster_id", "(.+)") == 1 or max_over_time(cluster_operator_cluster_status{status="Deleting"}[30m]) == 1
expr: label_replace(max_over_time(statusresource_cluster_status{status="Deleting"}[30m]), "cluster_id", "$1", "exported_cluster_id", "(.+)") == 1 or max_over_time(cluster_operator_cluster_status{status="Deleting"}[30m]) == 1 or max_over_time(capi_cluster_status_phase{phase="Deleting"}[30m]) == 1
labels:
area: kaas
cluster_status_deleting: "true"
@@ -71,7 +71,7 @@
- alert: InhibitionClusterScalingNodePools
annotations:
description: '{{`Cluster {{ $labels.cluster_id }} node pools are scaling.`}}'
- expr: cluster_operator_node_pool_desired_workers != cluster_operator_node_pool_ready_workers
+ expr: cluster_operator_node_pool_desired_workers != cluster_operator_node_pool_ready_workers or (capi_machinepool_status_phase{phase="ScalingUp"} == 1 or capi_machinepool_status_phase{phase="ScalingDown"} == 1)
labels:
area: kaas
cluster_with_scaling_nodepools: "true"
@@ -80,7 +80,7 @@
- alert: InhibitionClusterNodePoolsNotReady
annotations:
description: '{{`Cluster {{ $labels.cluster_id }} node pools are not ready. Either they have been scaled down to 0 or they are not up yet.`}}'
- expr: cluster_operator_node_pool_desired_workers == 0 and cluster_operator_node_pool_ready_workers == 0
+ expr: (cluster_operator_node_pool_desired_workers == 0 and cluster_operator_node_pool_ready_workers == 0) or capi_machinepool_status_condition{type="Ready", status="False"} == 1
labels:
area: kaas
cluster_with_notready_nodepools: "true"
@@ -95,4 +95,13 @@
instance_state_not_running: "true"
team: phoenix
topic: status
- alert: InhibitionControlPlaneUnhealthy
annotations:
description: '{{`Control plane of cluster {{ $labels.cluster_id }} is not healthy.`}}'
expr: capi_kubeadmcontrolplane_status_condition{type="ControlPlaneComponentsHealthy", status="False"} == 1 or capi_kubeadmcontrolplane_status_condition{type="EtcdClusterHealthy", status="False"} == 1 or capi_kubeadmcontrolplane_status_condition{type="Available", status="False"} == 1
labels:
area: kaas
cluster_control_plane_unhealthy: "true"
team: phoenix
topic: status
{{- end }}
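None of these inhibition alerts page on their own; they exist to set labels such as `cluster_status_updating` or the new `cluster_control_plane_unhealthy` that Alertmanager inhibition rules match on. A hypothetical sketch of the consuming side, assuming the repository's usual `cancel_if_*` label convention (the actual Alertmanager configuration lives elsewhere and may differ):

```yaml
# Hypothetical Alertmanager inhibit rule: while an alert carrying
# cluster_control_plane_unhealthy="true" fires, mute any alert in the
# same cluster that opted in via the matching cancel_if_* label.
inhibit_rules:
  - source_matchers:
      - cluster_control_plane_unhealthy="true"
    target_matchers:
      - cancel_if_cluster_control_plane_unhealthy="true"
    equal: ["cluster_id"]
```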
