Skip to content

Commit

Permalink
review-phoenix-inhibitions (#1212)
Browse files Browse the repository at this point in the history
  • Loading branch information
QuentinBisson authored Jun 6, 2024
1 parent b45bd99 commit 04969c1
Show file tree
Hide file tree
Showing 10 changed files with 123 additions and 81 deletions.
5 changes: 5 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,11 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- Add a new alerting rule to `falco.rules.yml` that fires an alert for the XZ backdoor.
- Add `CiliumAPITooSlow`.

### Changed

- Review phoenix alerts towards Mimir.
- Move cluster-autoscaler and VPA alerts to team turtles.

### Fixed

- Fix cabbage alerts for multi-provider wcs.
Expand Down

This file was deleted.

Original file line number Diff line number Diff line change
@@ -1,18 +1,19 @@
## TODO Remove with vintage
# This rule applies to vintage aws management clusters
{{- if eq .Values.managementCluster.provider.flavor "vintage" }}
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
creationTimestamp: null
labels:
{{- include "labels.common" . | nindent 4 }}
{{- if not .Values.mimir.enabled }}
# No need for .Values.mimir.enabled condition - will be gone with Vintage
cluster_type: "management_cluster"
{{- end }}
name: inhibit.management-cluster.rules
name: inhibit.aws.management-cluster.rules
namespace: {{ .Values.namespace }}
spec:
groups:
- name: inhibit.management-cluster
- name: inhibit.aws.management-cluster
rules:
- alert: InhibitionClusterStatusCreating
annotations:
Expand Down Expand Up @@ -95,13 +96,13 @@ spec:
instance_state_not_running: "true"
team: phoenix
topic: status
- alert: InhibitionControlPlaneUnhealthy
- alert: InhibitionClusterWithoutWorkerNodes
annotations:
description: '{{`Control plane of cluster {{ $labels.cluster_id }} is not healthy.`}}'
expr: capi_kubeadmcontrolplane_status_condition{type="ControlPlaneComponentsHealthy", status="False"} == 1 or capi_kubeadmcontrolplane_status_condition{type="EtcdClusterHealthy", status="False"} == 1 or capi_kubeadmcontrolplane_status_condition{type="Available", status="False"} == 1
description: '{{`Cluster ({{ $labels.cluster_id }}) has no worker nodes.`}}'
expr: sum(aws_operator_asg_desired_count) by (cluster_id) - on(cluster_id) sum(aws_operator_asg_desired_count{asg=~".*-tccpn-.*"}) by (cluster_id) == 0
labels:
area: kaas
cluster_control_plane_unhealthy: "true"
has_worker_nodes: "false"
team: phoenix
topic: status
{{- end }}
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
## TODO Remove with vintage
# This rule applies to vintage aws clusters
#
# Defines the InhibitionKiamErrors alert, which fires while any of the three
# kiam error counters queried below has increased over the last 10 minutes.
# NOTE(review): the `kiam_has_errors` label is presumably the source label
# matched by Alertmanager inhibit rules defined elsewhere - confirm there.
{{- if eq .Values.managementCluster.provider.flavor "vintage" }}
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  creationTimestamp: null
  labels:
    {{- include "labels.common" . | nindent 4 }}
  name: inhibit.kiam.rules
  namespace: {{ .Values.namespace }}
spec:
  groups:
  - name: inhibit.kiam
    rules:
    - alert: InhibitionKiamErrors
      annotations:
        description: '{{`Kiam on cluster {{ $labels.cluster_id }} has increased error rate.`}}'
      # Folded scalar: the equally indented lines below are joined with single
      # spaces, so the rendered query is still one line.
      expr: >-
        increase(kiam_metadata_credential_fetch_errors_total[10m]) > 0
        or increase(kiam_metadata_find_role_errors_total[10m]) > 0
        or increase(kiam_sts_issuing_errors_total[10m]) > 0
      labels:
        area: kaas
        kiam_has_errors: "true"
        team: phoenix
        topic: kiam
{{- end }}
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
{{- if eq .Values.managementCluster.provider.kind "aws" }}
# This rule applies to all cloud workload clusters
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
Expand All @@ -18,31 +18,30 @@ spec:
annotations:
description: '{{`Cluster-Autoscaler on {{ $labels.cluster_id }} has unneeded nodes.`}}'
opsrecipe: cluster-autoscaler-scaling/
expr: cluster_autoscaler_unneeded_nodes_count > 0
expr: cluster_autoscaler_unneeded_nodes_count{cluster_type="workload_cluster", provider=~"aws|capa|capz|eks"} > 0
for: 240m
labels:
area: managedservices
area: kaas
cancel_if_cluster_status_creating: "true"
cancel_if_cluster_status_deleting: "true"
cancel_if_cluster_status_updating: "true"
cancel_if_outside_working_hours: "true"
cancel_if_cluster_has_no_workers: "true"
severity: page
team: phoenix
team: turtles
topic: cluster-autoscaler
- alert: ClusterAutoscalerFailedScaling
annotations:
description: '{{`Cluster-Autoscaler on {{ $labels.cluster_id }} has failed scaling up.`}}'
opsrecipe: cluster-autoscaler-scaling/
expr: increase(cluster_autoscaler_failed_scale_ups_total[5m]) > 1
expr: increase(cluster_autoscaler_failed_scale_ups_total{cluster_type="workload_cluster", provider=~"aws|capa|capz|eks"}[5m]) > 1
for: 15m
labels:
area: managedservices
area: kaas
cancel_if_cluster_status_creating: "true"
cancel_if_cluster_status_deleting: "true"
cancel_if_cluster_status_updating: "true"
cancel_if_outside_working_hours: "true"
severity: page
team: phoenix
team: turtles
topic: cluster-autoscaler
{{- end }}
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
# This rule applies to all capi management clusters
#
# Defines the InhibitionControlPlaneUnhealthy alert, which fires while the
# management cluster's KubeadmControlPlane reports status "False" for any of
# the ControlPlaneComponentsHealthy, EtcdClusterHealthy or Available
# conditions.
# NOTE(review): the `cluster_control_plane_unhealthy` label is presumably the
# source label matched by Alertmanager inhibit rules defined elsewhere -
# confirm there.
{{- if eq .Values.managementCluster.provider.flavor "capi" }}
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  creationTimestamp: null
  labels:
    {{- include "labels.common" . | nindent 4 }}
    # Only set on the resource when Mimir is disabled.
    {{- if not .Values.mimir.enabled }}
    cluster_type: "management_cluster"
    {{- end }}
  name: inhibit.capi.rules
  namespace: {{ .Values.namespace }}
spec:
  groups:
  - name: inhibit.capi
    rules:
    - alert: InhibitionControlPlaneUnhealthy
      annotations:
        description: '{{`Control plane of cluster {{ $labels.cluster_id }} is not healthy.`}}'
      # One selector per condition type, combined with `or` so that a single
      # failing condition is enough to fire.
      expr: |-
        capi_kubeadmcontrolplane_status_condition{cluster_type="management_cluster", type="ControlPlaneComponentsHealthy", status="False"} == 1
        or capi_kubeadmcontrolplane_status_condition{cluster_type="management_cluster", type="EtcdClusterHealthy", status="False"} == 1
        or capi_kubeadmcontrolplane_status_condition{cluster_type="management_cluster", type="Available", status="False"} == 1
      labels:
        area: kaas
        cluster_control_plane_unhealthy: "true"
        team: turtles
        topic: status
{{- end }}
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
# This rule applies to all clusters
#
# Defines the InhibitionKubeletDown alert, which fires for every kubelet
# scrape target whose `up` series is 0.
# NOTE(review): the `kubelet_down` label is presumably the source label
# matched by Alertmanager inhibit rules defined elsewhere - confirm there.
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  creationTimestamp: null
  labels:
    {{- include "labels.common" . | nindent 4 }}
  name: inhibit.kubelet.rules
  namespace: {{ .Values.namespace }}
spec:
  groups:
  - name: inhibit.kubelet
    rules:
    - alert: InhibitionKubeletDown
      annotations:
        description: '{{`Kubelet ({{ $labels.instance }}) is down.`}}'
      # label_replace copies the part of `instance` before the trailing
      # ":<port>" into a new `ip` label; `== 0` then keeps only the targets
      # whose last scrape failed.
      expr: label_replace(up{app="kubelet"}, "ip", "$1", "instance", "(.+):\\d+") == 0
      labels:
        area: kaas
        kubelet_down: "true"
        team: turtles
        topic: kubernetes
Original file line number Diff line number Diff line change
Expand Up @@ -3,11 +3,11 @@ kind: PrometheusRule
metadata:
labels:
{{- include "labels.common" . | nindent 4 }}
name: vpa.all.rules
name: vertical-pod-autoscaler.rules
namespace: {{ .Values.namespace }}
spec:
groups:
- name: vpa
- name: vertical-pod-autoscaler
rules:
- alert: VpaComponentTooManyRestarts
annotations:
Expand All @@ -27,5 +27,5 @@ spec:
cancel_if_scrape_timeout: "true"
cancel_if_outside_working_hours: "true"
severity: page
team: phoenix
topic: observability
team: turtles
topic: autoscaling
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
# This rule is not gated on any provider or flavor value.
#
# Defines the InhibitionOutsideWorkingHours alert, which fires whenever the
# evaluation time falls outside the working-hours window encoded in the
# expression below.
# NOTE(review): the `outside_working_hours` label is presumably the source
# label behind the `cancel_if_outside_working_hours` labels on paging alerts -
# confirm against the Alertmanager inhibit rules.
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  creationTimestamp: null
  labels:
    {{- include "labels.common" . | nindent 4 }}
  name: inhibit.oncall.rules
  namespace: {{ .Values.namespace }}
spec:
  groups:
  - name: inhibit.oncall
    rules:
    - alert: InhibitionOutsideWorkingHours
      annotations:
        description: '{{`Fires outside working hours.`}}'
      # hour() and day_of_week() are evaluated in UTC. day_of_week() returns
      # 0 for Sunday through 6 for Saturday, so the second clause matches the
      # weekend and the first matches 00:00-07:59 and 16:00-23:59 UTC.
      expr: (hour() <= 7 or hour() >= 16) or (day_of_week() > 5 or day_of_week() < 1)
      labels:
        area: platform
        outside_working_hours: "true"
        team: atlas
        topic: monitoring
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,6 @@ spec:
cancel_if_cluster_with_scaling_nodepools: "true"
cancel_if_outside_working_hours: {{ include "workingHoursOnly" . }}
cancel_if_cluster_has_no_workers: "true"
cancel_if_nodes_down: "true"
severity: page
team: {{ include "providerTeam" . }}
topic: network
Expand Down

0 comments on commit 04969c1

Please sign in to comment.