diff --git a/CHANGELOG.md b/CHANGELOG.md
index d5eb649a2..1204178af 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -7,6 +7,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## [Unreleased]
 
+### Changed
+
+- Support multiple KSM pods in our alerts.
+
 ## [2.140.0] - 2023-11-13
 
 ### Added
diff --git a/helm/prometheus-rules/templates/recording-rules/gs-managed-app-deployment-status.rules.yml b/helm/prometheus-rules/templates/recording-rules/gs-managed-app-deployment-status.rules.yml
index 9ff700eb0..e73c05f89 100644
--- a/helm/prometheus-rules/templates/recording-rules/gs-managed-app-deployment-status.rules.yml
+++ b/helm/prometheus-rules/templates/recording-rules/gs-managed-app-deployment-status.rules.yml
@@ -9,21 +9,36 @@ spec:
   groups:
   - name: gs-managed-app-deployments.recording
     rules:
-    - expr: label_replace(
-            kube_deployment_status_replicas_available *
-            on (namespace, deployment) group_left(label_app_kubernetes_io_name)
-            kube_deployment_labels{label_giantswarm_io_service_type="managed"},
-            "managed_app", "$1", "label_app_kubernetes_io_name", "(.*)" )
+    - expr: |
+        label_replace(
+          kube_deployment_status_replicas_available
+          * on (pod, namespace, deployment) group_left (label_app_kubernetes_io_name)
+          kube_deployment_labels{label_giantswarm_io_service_type="managed"},
+          "managed_app",
+          "$1",
+          "label_app_kubernetes_io_name",
+          "(.*)"
+        )
       record: managed_app_deployment_status_replicas_available
-    - expr: label_replace(
-            kube_deployment_status_replicas_unavailable *
-            on (namespace, deployment) group_left(label_app_kubernetes_io_name)
-            kube_deployment_labels{label_giantswarm_io_service_type="managed"},
-            "managed_app", "$1", "label_app_kubernetes_io_name", "(.*)" )
+    - expr: |
+        label_replace(
+          kube_deployment_status_replicas_unavailable
+          * on (pod, namespace, deployment) group_left (label_app_kubernetes_io_name)
+          kube_deployment_labels{label_giantswarm_io_service_type="managed"},
+          "managed_app",
+          "$1",
+          "label_app_kubernetes_io_name",
+          "(.*)"
+        )
       record: managed_app_deployment_status_replicas_unavailable
-    - expr: label_replace(
-            kube_deployment_spec_replicas *
-            on (namespace, deployment) group_left(label_app_kubernetes_io_name)
-            kube_deployment_labels{label_giantswarm_io_service_type="managed"},
-            "managed_app", "$1", "label_app_kubernetes_io_name", "(.*)" )
+    - expr: |
+        label_replace(
+          kube_deployment_spec_replicas
+          * on (pod, namespace, deployment) group_left (label_app_kubernetes_io_name)
+          kube_deployment_labels{label_giantswarm_io_service_type="managed"},
+          "managed_app",
+          "$1",
+          "label_app_kubernetes_io_name",
+          "(.*)"
+        )
       record: managed_app_deployment_spec_replicas
diff --git a/helm/prometheus-rules/templates/recording-rules/service-level.rules.yml b/helm/prometheus-rules/templates/recording-rules/service-level.rules.yml
index 6c1b732cf..4fcd958a7 100644
--- a/helm/prometheus-rules/templates/recording-rules/service-level.rules.yml
+++ b/helm/prometheus-rules/templates/recording-rules/service-level.rules.yml
@@ -174,12 +174,14 @@ spec:
     # -- error recording rules
     # record when pods of a daemonset with label "label_giantswarm_io_monitoring_basic_sli" are down
     - expr: |
-        kube_daemonset_labels * on (daemonset, namespace) group_right(label_application_giantswarm_io_team) (
-          label_replace(
+        label_replace(
             kube_daemonset_status_number_unavailable
-            and on(daemonset,cluster_id,cluster_type,namespace)
+            * on (pod, daemonset, cluster_id, cluster_type, namespace) group_left (label_application_giantswarm_io_team)
             kube_daemonset_labels{label_giantswarm_io_monitoring_basic_sli='true'},
-            "service", "$1", "daemonset", "(.*)" )
+            "service",
+            "$1",
+            "daemonset",
+            "(.*)"
         )
       labels:
         class: MEDIUM
@@ -187,12 +189,14 @@ spec:
       record: raw_slo_errors
     # record when pods of a deployment with label "label_giantswarm_io_monitoring_basic_sli" are down
     - expr: |
-        kube_deployment_labels * on (deployment, namespace) group_right(label_application_giantswarm_io_team) (
-          label_replace(
+        label_replace(
             kube_deployment_status_replicas_unavailable
-            and on(deployment,cluster_id,cluster_type,namespace)
+            * on (pod, deployment, cluster_id, cluster_type, namespace) group_left (label_application_giantswarm_io_team)
             kube_deployment_labels{label_giantswarm_io_monitoring_basic_sli='true'},
-            "service", "$1", "deployment", "(.*)" )
+            "service",
+            "$1",
+            "deployment",
+            "(.*)"
         )
       labels:
         class: MEDIUM
@@ -200,12 +204,15 @@ spec:
       record: raw_slo_errors
     # record when pods of a statefulset with label "label_giantswarm_io_monitoring_basic_sli" are down
     - expr: |
-        kube_statefulset_labels * on (statefulset, namespace) group_right(label_application_giantswarm_io_team) (
-          label_replace(
-            kube_statefulset_status_replicas - kube_statefulset_status_replicas_current
-            and on(statefulset,cluster_id,cluster_type,namespace)
+        label_replace(
+            # parentheses are required: `*` binds tighter than `-` in PromQL
+            (kube_statefulset_status_replicas - kube_statefulset_status_replicas_current)
+            * on (pod, statefulset, cluster_id, cluster_type, namespace) group_left (label_application_giantswarm_io_team)
             kube_statefulset_labels{label_giantswarm_io_monitoring_basic_sli='true'},
-            "service", "$1", "statefulset", "(.*)" )
+            "service",
+            "$1",
+            "statefulset",
+            "(.*)"
         )
       labels:
         class: MEDIUM