Skip to content

Commit

Permalink
Support multiple KSM pods (#950)
Browse files Browse the repository at this point in the history
Co-authored-by: Hervé Nicol <[email protected]>
  • Loading branch information
QuentinBisson and hervenicol authored Nov 15, 2023
1 parent 87d9570 commit 4fb94aa
Show file tree
Hide file tree
Showing 3 changed files with 50 additions and 28 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

### Changed

- Support multiple KSM pods in our alerts.
- Split prometheus-agent alerts (`PrometheusAgentFailing` and `PrometheusAgentShardsMissing`) in 2:
  - existing alerts will fire later
  - new inhibition alerts will fire earlier
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,21 +9,36 @@ spec:
groups:
- name: gs-managed-app-deployments.recording
rules:
- expr: label_replace(
kube_deployment_status_replicas_available *
on (namespace, deployment) group_left(label_app_kubernetes_io_name)
kube_deployment_labels{label_giantswarm_io_service_type="managed"},
"managed_app", "$1", "label_app_kubernetes_io_name", "(.*)" )
- expr: |
label_replace(
kube_deployment_status_replicas_available
* on (pod, namespace, deployment) group_left (label_app_kubernetes_io_name)
kube_deployment_labels{label_giantswarm_io_service_type="managed"},
"managed_app",
"$1",
"label_app_kubernetes_io_name",
"(.*)"
)
record: managed_app_deployment_status_replicas_available
- expr: label_replace(
kube_deployment_status_replicas_unavailable *
on (namespace, deployment) group_left(label_app_kubernetes_io_name)
kube_deployment_labels{label_giantswarm_io_service_type="managed"},
"managed_app", "$1", "label_app_kubernetes_io_name", "(.*)" )
- expr: |
label_replace(
kube_deployment_status_replicas_unavailable
* on (pod, namespace, deployment) group_left (label_app_kubernetes_io_name)
kube_deployment_labels{label_giantswarm_io_service_type="managed"},
"managed_app",
"$1",
"label_app_kubernetes_io_name",
"(.*)"
)
record: managed_app_deployment_status_replicas_unavailable
- expr: label_replace(
kube_deployment_spec_replicas *
on (namespace, deployment) group_left(label_app_kubernetes_io_name)
kube_deployment_labels{label_giantswarm_io_service_type="managed"},
"managed_app", "$1", "label_app_kubernetes_io_name", "(.*)" )
- expr: |
label_replace(
kube_deployment_spec_replicas
* on (pod, namespace, deployment) group_left (label_app_kubernetes_io_name)
kube_deployment_labels{label_giantswarm_io_service_type="managed"},
"managed_app",
"$1",
"label_app_kubernetes_io_name",
"(.*)"
)
record: managed_app_deployment_spec_replicas
Original file line number Diff line number Diff line change
Expand Up @@ -174,38 +174,44 @@ spec:
# -- error recording rules
# record when pods of a daemonset with label "label_giantswarm_io_monitoring_basic_sli" are down
- expr: |
kube_daemonset_labels * on (daemonset, namespace) group_right(label_application_giantswarm_io_team) (
label_replace(
label_replace(
kube_daemonset_status_number_unavailable
and on(daemonset,cluster_id,cluster_type,namespace)
kube_daemonset_labels{label_giantswarm_io_monitoring_basic_sli='true'},
"service", "$1", "daemonset", "(.*)" )
* on (pod, daemonset, cluster_id, cluster_type, namespace) group_left (label_application_giantswarm_io_team)
kube_daemonset_labels{label_giantswarm_io_monitoring_basic_sli='true'},
"service",
"$1",
"daemonset",
"(.*)"
)
labels:
class: MEDIUM
area: managed-apps
record: raw_slo_errors
# record when pods of a deployment with label "label_giantswarm_io_monitoring_basic_sli" are down
- expr: |
kube_deployment_labels * on (deployment, namespace) group_right(label_application_giantswarm_io_team) (
label_replace(
label_replace(
kube_deployment_status_replicas_unavailable
and on(deployment,cluster_id,cluster_type,namespace)
* on (pod, deployment, cluster_id, cluster_type, namespace) group_left (label_application_giantswarm_io_team)
kube_deployment_labels{label_giantswarm_io_monitoring_basic_sli='true'},
"service", "$1", "deployment", "(.*)" )
"service",
"$1",
"deployment",
"(.*)"
)
labels:
class: MEDIUM
area: managed-apps
record: raw_slo_errors
# record when pods of a statefulset with label "label_giantswarm_io_monitoring_basic_sli" are down
- expr: |
kube_statefulset_labels * on (statefulset, namespace) group_right(label_application_giantswarm_io_team) (
label_replace(
label_replace(
(kube_statefulset_status_replicas - kube_statefulset_status_replicas_current)
and on(statefulset,cluster_id,cluster_type,namespace)
* on (pod, statefulset, cluster_id, cluster_type, namespace) group_left (label_application_giantswarm_io_team)
kube_statefulset_labels{label_giantswarm_io_monitoring_basic_sli='true'},
"service", "$1", "statefulset", "(.*)" )
"service",
"$1",
"statefulset",
"(.*)"
)
labels:
class: MEDIUM
Expand Down

0 comments on commit 4fb94aa

Please sign in to comment.