Skip to content

Commit

Permalink
Add missing team to slo alerts
Browse files Browse the repository at this point in the history
Signed-off-by: QuentinBisson <[email protected]>
  • Loading branch information
QuentinBisson committed Oct 5, 2023
1 parent 85bd281 commit aaff797
Show file tree
Hide file tree
Showing 2 changed files with 32 additions and 2 deletions.
4 changes: 4 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

## [Unreleased]

### Fixed

- Add missing team label to SLO alerts.

## [2.137.0] - 2023-10-04

### Removed
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ spec:
class: HIGH
area: kaas
service: api-server
label_application_giantswarm_io_team: {{ include "providerTeam" . }}
record: raw_slo_requests
# The first statement ensures that an api-server error is counted if the Kubernetes API is not up for a specific cluster.
# The next statement returns 1 for a cluster with "updated", "created" or unknown (absent) status.
Expand All @@ -36,6 +37,7 @@ spec:
labels:
area: kaas
service: api-server
label_application_giantswarm_io_team: {{ include "providerTeam" . }}
record: slo_target

# -- daemonset
Expand All @@ -46,6 +48,7 @@ spec:
labels:
class: MEDIUM
area: kaas
label_application_giantswarm_io_team: {{ include "providerTeam" . }}
record: raw_slo_requests
# -- the errors are counted as follows:
# -- pods in a daemonset that are UNAVAILABLE NOW and have been UNAVAILABLE 10 MINUTES AGO
Expand Down Expand Up @@ -75,6 +78,7 @@ spec:
# Recording rule `slo_target` for the services named in the selector (area kaas).
# `raw_slo_errors - raw_slo_errors` uses the same selector on both sides, so it is 0 for
# each matching series (for finite sample values); after `sum by (service, area)` the
# `+ 1-0.99` leaves the constant 1 - 0.99 per (service, area) pair.
- expr: sum by (service, area) (raw_slo_errors{area="kaas", service=~"aws-node|aws-cloud-controller-manager|ebs-csi-node|calico-node|cert-exporter|kube-proxy|net-exporter|node-exporter|azure-cloud-controller-manager|azure-cloud-node-manager|azure-scheduled-events|csi-azuredisk-node|cilium"} - raw_slo_errors{area="kaas", service=~"aws-node|aws-cloud-controller-manager|ebs-csi-node|calico-node|cert-exporter|kube-proxy|net-exporter|node-exporter|azure-cloud-controller-manager|azure-cloud-node-manager|azure-scheduled-events|csi-azuredisk-node|cilium"}) + 1-0.99
labels:
area: kaas
# Team label value comes from the "providerTeam" template helper.
label_application_giantswarm_io_team: {{ include "providerTeam" . }}
record: slo_target

# -- kubelet whole cluster
Expand All @@ -83,6 +87,7 @@ spec:
class: MEDIUM
area: kaas
service: kubelet
label_application_giantswarm_io_team: {{ include "providerTeam" . }}
record: raw_slo_requests
- expr: |
(
Expand All @@ -96,19 +101,22 @@ spec:
area: kaas
class: MEDIUM
service: kubelet
label_application_giantswarm_io_team: {{ include "providerTeam" . }}
record: raw_slo_errors
# -- 99% availability
# Recording rule `slo_target` for service kubelet: a single constant sample of (1 - 0.99).
# Presumably this is the error ratio tolerated by the 99% target — confirm against the alerts
# that consume slo_target.
- expr: "vector((1 - 0.99))"
labels:
area: kaas
service: kubelet
# Team label value comes from the "providerTeam" template helper.
label_application_giantswarm_io_team: {{ include "providerTeam" . }}
record: slo_target

# kubelet - single nodepool
# Recording rule `raw_slo_requests`: selects every kube_node_labels series with a non-empty
# `nodepool` label and adds a `service` label of the form "kubelet nodepool <nodepool>".
- expr: label_replace(kube_node_labels{nodepool=~".+"}, "service", "kubelet nodepool $1", "nodepool", "(.+)")
labels:
area: kaas
class: MEDIUM
# Team label value comes from the "providerTeam" template helper.
label_application_giantswarm_io_team: {{ include "providerTeam" . }}
record: raw_slo_requests

- expr: |
Expand All @@ -122,12 +130,14 @@ spec:
labels:
area: kaas
class: MEDIUM
label_application_giantswarm_io_team: {{ include "providerTeam" . }}
record: raw_slo_errors
# Recording rule `slo_target` per nodepool: `max by (nodepool)` reduces kube_node_labels to one
# series per nodepool, label_replace adds service="kubelet nodepool <nodepool>", and the
# result is multiplied by (1 - 0.99).
# NOTE(review): the target equals 1 - 0.99 only if kube_node_labels samples are 1 — confirm.
- expr: |
label_replace(max by (nodepool) (kube_node_labels{nodepool=~".+"}), "service", "kubelet nodepool $1", "nodepool", "(.+)") * (1 - 0.99)
labels:
area: kaas
label_application_giantswarm_io_team: {{ include "providerTeam" . }}
record: slo_target
# -- node-exporter
Expand All @@ -137,6 +147,7 @@ spec:
class: MEDIUM
area: kaas
service: node-exporter
label_application_giantswarm_io_team: {{ include "providerTeam" . }}
record: raw_slo_requests
# record of number of node-exporters that are down.
# up == 1 when node-exporters are up, and up == 0 when node-exporters are down -
Expand All @@ -149,12 +160,14 @@ spec:
area: kaas
class: MEDIUM
service: node-exporter
label_application_giantswarm_io_team: {{ include "providerTeam" . }}
record: raw_slo_errors
# -- 99% availability
# Recording rule `slo_target` for service node-exporter: a single constant sample of (1 - 0.99).
# Presumably this is the error ratio tolerated by the 99% target — confirm against the alerts
# that consume slo_target.
- expr: "vector((1 - 0.99))"
labels:
area: kaas
service: node-exporter
# Team label value comes from the "providerTeam" template helper.
label_application_giantswarm_io_team: {{ include "providerTeam" . }}
record: slo_target

# -- managed-apps
Expand Down Expand Up @@ -246,17 +259,20 @@ spec:
labels:
class: MEDIUM
area: kaas
label_application_giantswarm_io_team: {{ include "providerTeam" . }}
record: raw_slo_requests
# record number of errors.
# Recording rule `raw_slo_errors`: sums rest_client_requests_total for the apps
# kube-controller-manager and kube-scheduler whose `code` label matches `5..` or the literal
# `<error>`, grouped by (cluster_id, app); label_replace then copies `app` into `service`.
- expr: label_replace(sum(rest_client_requests_total{app=~"kube-controller-manager|kube-scheduler", code=~"5..|<error>"}) by (cluster_id,app), "service", "$1", "app", "(.*)")
labels:
area: kaas
class: MEDIUM
# Team label value comes from the "providerTeam" template helper.
label_application_giantswarm_io_team: {{ include "providerTeam" . }}
record: raw_slo_errors
# -- 99% availability
# Recording rule `slo_target`: `group(...) by (app)` yields one series per app,
# label_replace copies `app` into `service`, and `* 0 + 1 - 0.99` turns each sample into the
# constant 1 - 0.99 regardless of its original value.
- expr: label_replace(group(rest_client_requests_total{app=~"kube-controller-manager|kube-scheduler"}) by (app), "service", "$1", "app", "(.*)") * 0 + 1 - 0.99
labels:
area: kaas
# Team label value comes from the "providerTeam" template helper.
label_application_giantswarm_io_team: {{ include "providerTeam" . }}
record: slo_target

# core k8s components azure API requests
Expand All @@ -265,17 +281,20 @@ spec:
labels:
class: MEDIUM
area: kaas
label_application_giantswarm_io_team: phoenix
record: raw_slo_requests
# record number of errors.
# Recording rule `raw_slo_errors`: sums cloudprovider_azure_api_request_errors for the apps
# kube-controller-manager and kube-scheduler, grouped by (cluster_id, app); label_replace
# then copies `app` into `service`.
- expr: label_replace(sum(cloudprovider_azure_api_request_errors{app=~"kube-controller-manager|kube-scheduler"}) by (cluster_id,app), "service", "$1", "app", "(.*)")
labels:
area: kaas
class: MEDIUM
# Team label is a fixed value here rather than the output of a template helper.
label_application_giantswarm_io_team: phoenix
record: raw_slo_errors
# -- 99% availability
# Recording rule `slo_target`: `group(...) by (app)` yields one series per app that exposes
# cloudprovider_azure_api_request_duration_seconds_count, label_replace copies `app` into
# `service`, and `* 0 + 1 - 0.99` turns each sample into the constant 1 - 0.99.
- expr: label_replace(group(cloudprovider_azure_api_request_duration_seconds_count{app=~"kube-controller-manager|kube-scheduler"}) by (app), "service", "$1", "app", "(.*)") * 0 + 1 - 0.99
labels:
area: kaas
# Team label is a fixed value here rather than the output of a template helper.
label_application_giantswarm_io_team: phoenix
record: slo_target

# core k8s components aws API requests
Expand All @@ -284,17 +303,20 @@ spec:
labels:
class: MEDIUM
area: kaas
label_application_giantswarm_io_team: phoenix
record: raw_slo_requests
# record number of errors.
# Recording rule `raw_slo_errors`: sums cloudprovider_aws_api_request_errors for the apps
# kube-controller-manager and kube-scheduler, grouped by (cluster_id, app); label_replace
# then copies `app` into `service`.
- expr: label_replace(sum(cloudprovider_aws_api_request_errors{app=~"kube-controller-manager|kube-scheduler"}) by (cluster_id,app), "service", "$1", "app", "(.*)")
labels:
area: kaas
class: MEDIUM
# Team label is a fixed value here rather than the output of a template helper.
label_application_giantswarm_io_team: phoenix
record: raw_slo_errors
# -- 99% availability
# Recording rule `slo_target`: `group(...) by (app)` yields one series per app that exposes
# cloudprovider_aws_api_request_duration_seconds_count, label_replace copies `app` into
# `service`, and `* 0 + 1 - 0.99` turns each sample into the constant 1 - 0.99.
- expr: label_replace(group(cloudprovider_aws_api_request_duration_seconds_count{app=~"kube-controller-manager|kube-scheduler"}) by (app), "service", "$1", "app", "(.*)") * 0 + 1 - 0.99
labels:
area: kaas
# Team label is a fixed value here rather than the output of a template helper.
label_application_giantswarm_io_team: phoenix
record: slo_target

# -- Managed Prometheus
Expand All @@ -304,13 +326,15 @@ spec:
area: managed-apps
class: MEDIUM
service: managed-prometheus
label_application_giantswarm_io_team: atlas
record: raw_slo_requests
# Set SLO error to be 1 when a managed prometheus is down.
# `(up * -1) + 1` maps up == 1 to 0 and up == 0 to 1; the trailing `== 1` is a filter, so
# only series whose `up` sample is 0 are recorded (with value 1).
- expr: (up{app="prometheus-operator-app-prometheus",container="prometheus"}*-1)+1 == 1
labels:
area: managed-apps
class: MEDIUM
service: managed-prometheus
# Team label is a fixed value here rather than the output of a template helper.
label_application_giantswarm_io_team: atlas
record: raw_slo_errors

# -- Managed Alertmanager
Expand All @@ -320,13 +344,15 @@ spec:
area: managed-apps
class: MEDIUM
service: managed-alertmanager
label_application_giantswarm_io_team: atlas
record: raw_slo_requests
# Set SLO error to be 1 when a managed alertmanager is down.
# `(up * -1) + 1` maps up == 1 to 0 and up == 0 to 1; the trailing `== 1` is a filter, so
# only series whose `up` sample is 0 are recorded (with value 1).
- expr: (up{app="prometheus-operator-app-alertmanager",container="alertmanager"}*-1)+1 == 1
labels:
area: managed-apps
class: MEDIUM
service: managed-alertmanager
# Team label is a fixed value here rather than the output of a template helper.
label_application_giantswarm_io_team: atlas
record: raw_slo_errors

# -- VPA
Expand Down Expand Up @@ -364,9 +390,9 @@ spec:
record: slo_burnrate_high
# Recording rule `slo_burnrate_low`: a single constant sample with value 12.
- expr: "vector(12)"
record: slo_burnrate_low
# Recording rules `slo_requests` and `slo_errors`: sums of the raw series per label set.
# NOTE(review): each rule is preceded by a second `- expr` line that has no `record` of its own
# and differs only by lacking label_application_giantswarm_io_team in the `by` clause; these
# look like the pre-change lines of a diff — confirm which expression is the live one.
- expr: sum(raw_slo_requests) by (service, cluster_type, cluster_id, area)
- expr: sum(raw_slo_requests) by (service, cluster_type, cluster_id, area, label_application_giantswarm_io_team)
record: slo_requests
- expr: sum(raw_slo_errors) by (service, cluster_type, cluster_id, area)
- expr: sum(raw_slo_errors) by (service, cluster_type, cluster_id, area, label_application_giantswarm_io_team)
record: slo_errors
# Recording rule `slo_errors_per_request:ratio_rate5m`: the 5-minute sum of raw_slo_errors
# divided by the 5-minute sum of raw_slo_requests. Both sides are aggregated by the same labels
# (cluster_type, cluster_id, service, class, area, label_application_giantswarm_io_team), so
# the division pairs series one-to-one on that label set.
- expr: sum(sum_over_time(raw_slo_errors[5m])) by (cluster_type, cluster_id, service, class, area, label_application_giantswarm_io_team) / sum(sum_over_time(raw_slo_requests[5m])) by (cluster_type, cluster_id, service, class, area, label_application_giantswarm_io_team)
record: slo_errors_per_request:ratio_rate5m
Expand Down

0 comments on commit aaff797

Please sign in to comment.