From 20d15aa5dc36573ae64028a8bed8aa1dcc784e02 Mon Sep 17 00:00:00 2001
From: Spyros Synodinos
Date: Thu, 11 Jan 2024 16:08:41 +0200
Subject: [PATCH] teleport-operator specifics

Add `credentiald` to the deployments watched by the "deployment not
satisfied" alerts on management and workload clusters, and fix the
`area: kass` label typo on the workload-cluster alert.

Add a `teleport` rule group with two alerts that compare the number of
`*-teleport-join-token` secrets and `*-teleport-kube-agent-config`
ConfigMaps against the number of CAPI clusters whose control plane is a
KubeadmControlPlane. Both alerts fire after the counts have differed for
30 minutes.

---
NOTE(review): Prometheus fully anchors `=~` matchers, so `teleport-*`
matches only `teleport` followed by zero or more dashes; it does not
match `teleport-operator`. The same applies to `cert-manager-*`, `dex*`
and `athena*`. The pre-existing pattern is kept unchanged here; please
confirm whether prefix matching (`teleport-.*` etc.) was intended.

 .../deployment.management-cluster.rules.yml   |  2 +-
 .../deployment.workload-cluster.rules.yml     |  4 +-
 .../alerting-rules/teleport.rules.yaml        | 40 +++++++++++++++++++
 3 files changed, 43 insertions(+), 3 deletions(-)
 create mode 100644 helm/prometheus-rules/templates/alerting-rules/teleport.rules.yaml

diff --git a/helm/prometheus-rules/templates/alerting-rules/deployment.management-cluster.rules.yml b/helm/prometheus-rules/templates/alerting-rules/deployment.management-cluster.rules.yml
index b5e9cfcf6..4c253a30d 100644
--- a/helm/prometheus-rules/templates/alerting-rules/deployment.management-cluster.rules.yml
+++ b/helm/prometheus-rules/templates/alerting-rules/deployment.management-cluster.rules.yml
@@ -215,7 +215,7 @@ spec:
       annotations:
         description: '{{`Deployment {{ $labels.namespace}}/{{ $labels.deployment }} is not satisfied.`}}'
         opsrecipe: deployment-not-satisfied/
-      expr: kube_deployment_status_replicas_unavailable{cluster_type="management_cluster", deployment=~"cert-manager-*|teleport-*|dex*|athena*|rbac-operator"} > 0
+      expr: kube_deployment_status_replicas_unavailable{cluster_type="management_cluster", deployment=~"cert-manager-*|teleport-*|dex*|athena*|rbac-operator|credentiald"} > 0
       for: 30m
       labels:
         area: kaas
diff --git a/helm/prometheus-rules/templates/alerting-rules/deployment.workload-cluster.rules.yml b/helm/prometheus-rules/templates/alerting-rules/deployment.workload-cluster.rules.yml
index e3d8c7ffb..891675428 100644
--- a/helm/prometheus-rules/templates/alerting-rules/deployment.workload-cluster.rules.yml
+++ b/helm/prometheus-rules/templates/alerting-rules/deployment.workload-cluster.rules.yml
@@ -73,10 +73,10 @@ spec:
     - alert: WorkloadClusterDeploymentNotSatisfiedBigMac
       annotations:
         description: '{{`Deployment {{ $labels.namespace}}/{{ $labels.deployment }} has been scaled down to zero for prolonged period of time.`}}'
-      expr: kube_deployment_status_replicas_unavailable{cluster_type="workload_cluster", deployment=~"cert-manager-*|teleport-*|dex*|athena*|rbac-operator"} > 0
+      expr: kube_deployment_status_replicas_unavailable{cluster_type="workload_cluster", deployment=~"cert-manager-*|teleport-*|dex*|athena*|rbac-operator|credentiald"} > 0
       for: 30m
       labels:
-        area: kass
+        area: kaas
         cancel_if_cluster_status_creating: "true"
         cancel_if_cluster_status_deleting: "true"
         cancel_if_cluster_status_updating: "true"
diff --git a/helm/prometheus-rules/templates/alerting-rules/teleport.rules.yaml b/helm/prometheus-rules/templates/alerting-rules/teleport.rules.yaml
new file mode 100644
index 000000000..7a8eee683
--- /dev/null
+++ b/helm/prometheus-rules/templates/alerting-rules/teleport.rules.yaml
@@ -0,0 +1,40 @@
+apiVersion: monitoring.coreos.com/v1
+kind: PrometheusRule
+metadata:
+  creationTimestamp: null
+  labels:
+    {{- include "labels.common" . | nindent 4 }}
+  name: teleport.rules
+  namespace: {{ .Values.namespace }}
+spec:
+  groups:
+  - name: teleport
+    rules:
+    - alert: TeleportJoinTokenSecretMismatch
+      annotations:
+        description: '{{`Mismatch in number of teleport-join-token secrets and clusters`}}'
+      expr: count(kube_secret_created{secret=~".*-teleport-join-token"}) != count(capi_cluster_info{control_plane_reference_kind="KubeadmControlPlane"})
+      for: 30m
+      labels:
+        area: kaas
+        cancel_if_cluster_status_creating: "true"
+        cancel_if_cluster_status_updating: "true"
+        cancel_if_cluster_status_deleting: "true"
+        cancel_if_outside_working_hours: "true"
+        severity: page
+        team: bigmac
+        topic: teleport
+    - alert: TeleportKubeAgentConfigMapMismatch
+      annotations:
+        description: '{{`Mismatch in number of teleport-kube-agent-config configmaps and clusters`}}'
+      expr: count(kube_configmap_info{configmap=~".*-teleport-kube-agent-config"}) != count(capi_cluster_info{control_plane_reference_kind="KubeadmControlPlane"})
+      for: 30m
+      labels:
+        area: kaas
+        cancel_if_cluster_status_creating: "true"
+        cancel_if_cluster_status_updating: "true"
+        cancel_if_cluster_status_deleting: "true"
+        cancel_if_outside_working_hours: "true"
+        severity: page
+        team: bigmac
+        topic: teleport