From 4fb94aa5bc96848c726a496abc13fdab3286ccbf Mon Sep 17 00:00:00 2001 From: Quentin Bisson Date: Wed, 15 Nov 2023 08:54:42 +0100 Subject: [PATCH] Support multiple KSM pods (#950) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Hervé Nicol --- CHANGELOG.md | 1 + ...gs-managed-app-deployment-status.rules.yml | 45 ++++++++++++------- .../recording-rules/service-level.rules.yml | 32 +++++++------ 3 files changed, 50 insertions(+), 28 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index ab8aabcb8..78a68cdec 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,6 +9,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Changed +- Support multiple KSM pods in our alerts. - Split prometheus-agent alerts (`PrometheusAgentFailing` and `PrometheusAgentShardsMissing`) in 2: - existing alerts will fire later - new inhibitions alerts will fire earlier diff --git a/helm/prometheus-rules/templates/recording-rules/gs-managed-app-deployment-status.rules.yml b/helm/prometheus-rules/templates/recording-rules/gs-managed-app-deployment-status.rules.yml index 9ff700eb0..e73c05f89 100644 --- a/helm/prometheus-rules/templates/recording-rules/gs-managed-app-deployment-status.rules.yml +++ b/helm/prometheus-rules/templates/recording-rules/gs-managed-app-deployment-status.rules.yml @@ -9,21 +9,36 @@ spec: groups: - name: gs-managed-app-deployments.recording rules: - - expr: label_replace( - kube_deployment_status_replicas_available * - on (namespace, deployment) group_left(label_app_kubernetes_io_name) - kube_deployment_labels{label_giantswarm_io_service_type="managed"}, - "managed_app", "$1", "label_app_kubernetes_io_name", "(.*)" ) + - expr: | + label_replace( + kube_deployment_status_replicas_available + * on (pod, namespace, deployment) group_left (label_app_kubernetes_io_name) + kube_deployment_labels{label_giantswarm_io_service_type="managed"}, + "managed_app", + "$1", + 
"label_app_kubernetes_io_name", + "(.*)" + ) record: managed_app_deployment_status_replicas_available - - expr: label_replace( - kube_deployment_status_replicas_unavailable * - on (namespace, deployment) group_left(label_app_kubernetes_io_name) - kube_deployment_labels{label_giantswarm_io_service_type="managed"}, - "managed_app", "$1", "label_app_kubernetes_io_name", "(.*)" ) + - expr: | + label_replace( + kube_deployment_status_replicas_unavailable + * on (pod, namespace, deployment) group_left (label_app_kubernetes_io_name) + kube_deployment_labels{label_giantswarm_io_service_type="managed"}, + "managed_app", + "$1", + "label_app_kubernetes_io_name", + "(.*)" + ) record: managed_app_deployment_status_replicas_unavailable - - expr: label_replace( - kube_deployment_spec_replicas * - on (namespace, deployment) group_left(label_app_kubernetes_io_name) - kube_deployment_labels{label_giantswarm_io_service_type="managed"}, - "managed_app", "$1", "label_app_kubernetes_io_name", "(.*)" ) + - expr: | + label_replace( + kube_deployment_spec_replicas + * on (pod, namespace, deployment) group_left (label_app_kubernetes_io_name) + kube_deployment_labels{label_giantswarm_io_service_type="managed"}, + "managed_app", + "$1", + "label_app_kubernetes_io_name", + "(.*)" + ) record: managed_app_deployment_spec_replicas diff --git a/helm/prometheus-rules/templates/recording-rules/service-level.rules.yml b/helm/prometheus-rules/templates/recording-rules/service-level.rules.yml index 8ab9e1a53..f2d4143ac 100644 --- a/helm/prometheus-rules/templates/recording-rules/service-level.rules.yml +++ b/helm/prometheus-rules/templates/recording-rules/service-level.rules.yml @@ -174,12 +174,14 @@ spec: # -- error recording rules # record when pods of a daemonset with label "label_giantswarm_io_monitoring_basic_sli" are down - expr: | - kube_daemonset_labels * on (daemonset, namespace) group_right(label_application_giantswarm_io_team) ( - label_replace( + label_replace( 
kube_daemonset_status_number_unavailable - and on(daemonset,cluster_id,cluster_type,namespace) - kube_daemonset_labels{label_giantswarm_io_monitoring_basic_sli='true'}, - "service", "$1", "daemonset", "(.*)" ) + * on (pod, daemonset, cluster_id, cluster_type, namespace) group_left (label_application_giantswarm_io_team) + kube_daemonset_labels{label_giantswarm_io_monitoring_basic_sli='true'}, + "service", + "$1", + "daemonset", + "(.*)" ) labels: class: MEDIUM @@ -187,12 +189,14 @@ spec: record: raw_slo_errors # record when pods of a deployment with label "label_giantswarm_io_monitoring_basic_sli" are down - expr: | - kube_deployment_labels * on (deployment, namespace) group_right(label_application_giantswarm_io_team) ( - label_replace( + label_replace( kube_deployment_status_replicas_unavailable - and on(deployment,cluster_id,cluster_type,namespace) + * on (pod, deployment, cluster_id, cluster_type, namespace) group_left (label_application_giantswarm_io_team) kube_deployment_labels{label_giantswarm_io_monitoring_basic_sli='true'}, - "service", "$1", "deployment", "(.*)" ) + "service", + "$1", + "deployment", + "(.*)" ) labels: class: MEDIUM @@ -200,12 +204,14 @@ spec: record: raw_slo_errors # record when pods of a statefulset with label "label_giantswarm_io_monitoring_basic_sli" are down - expr: | - kube_statefulset_labels * on (statefulset, namespace) group_right(label_application_giantswarm_io_team) ( - label_replace( + label_replace( kube_statefulset_status_replicas - kube_statefulset_status_replicas_current - and on(statefulset,cluster_id,cluster_type,namespace) + * on (pod, statefulset, cluster_id, cluster_type, namespace) group_left (label_application_giantswarm_io_team) kube_statefulset_labels{label_giantswarm_io_monitoring_basic_sli='true'}, - "service", "$1", "statefulset", "(.*)" ) + "service", + "$1", + "statefulset", + "(.*)" ) labels: class: MEDIUM