From 162e1ed4bd9784301d4c15ba810c6b6098e1fb24 Mon Sep 17 00:00:00 2001 From: Quentin Bisson Date: Mon, 10 Jun 2024 15:26:12 +0200 Subject: [PATCH 1/2] Add codeowners to prometheus alerts (#1226) Signed-off-by: QuentinBisson --- CHANGELOG.md | 12 +++++------- CODEOWNERS | 9 ++++++++- 2 files changed, 13 insertions(+), 8 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 32a5b64b3..87f29cf5b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,15 +7,11 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] -### Fixed - -- Fixed usage of yq, and jq in check-opsrecipes.sh -- Fetch jq with make install-tools - ### Added - Added a new alerting rule to `falco.rules.yml` to fire an alert for XZ-backdoor. -- Add `CiliumAPITooSlow`. +- Added `CiliumAPITooSlow`. +- Added `CODEOWNERS` file. ### Changed @@ -30,7 +26,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Fixed -- Fix and improve the ops-recipe test script. +- Fixed usage of yq, and jq in check-opsrecipes.sh +- Fetch jq with make install-tools +- Fix and improve the check-opsrecipes.sh script to support /_index.md based ops-recipes. - Fix cabbage alerts for multi-provider wcs. - Fix shield alert area labels. - Fix `cert-exporter` alerting. 
diff --git a/CODEOWNERS b/CODEOWNERS index 0200516ff..28f07ef17 100644 --- a/CODEOWNERS +++ b/CODEOWNERS @@ -1,2 +1,9 @@ -# generated by giantswarm/github actions - changes will be overwritten * @giantswarm/team-atlas +/helm/prometheus-rules/templates/kaas/bigmac/ @team-bigmac +/helm/prometheus-rules/templates/kaas/phoenix/ @team-phoenix +/helm/prometheus-rules/templates/kaas/rocket/ @team-rocket +/helm/prometheus-rules/templates/kaas/turtles/ @team-turtles +/helm/prometheus-rules/templates/platform/atlas/ @team-atlas +/helm/prometheus-rules/templates/platform/cabbage/ @team-cabbage +/helm/prometheus-rules/templates/platform/honeybadger/ @team-honeybadger +/helm/prometheus-rules/templates/platform/shield/ @team-shield From 6a20ebf1d9e73f6a1e09ae2843c90b76c18df44e Mon Sep 17 00:00:00 2001 From: Daniel Simionato Date: Mon, 10 Jun 2024 15:33:24 +0200 Subject: [PATCH 2/2] Review turtles alerts labels (#1218) * Reviewed turtles alerts labels * Merge branch 'main' into rework-app-role-node-labels --------- Co-authored-by: Quentin Bisson --- CHANGELOG.md | 1 + .../apiserver.management-cluster.rules.yml | 2 +- .../apiserver.workload-cluster.rules.yml | 10 +++--- .../turtles/alerting-rules/docker.rules.yml | 2 +- .../node.management_cluster.rules.yml | 8 ++--- .../alerting-rules/operatorkit.rules.yml | 32 +++++++++---------- .../shared/alerting-rules/up.rules.yml | 2 +- .../recording-rules/service-level.rules.yml | 22 ++++++------- 8 files changed, 40 insertions(+), 39 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 87f29cf5b..3566cff7d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -21,6 +21,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Move the management cluster certificate alerts into the shared alerts because it is provider independent - Review and fix phoenix alerts towards Mimir and multi-provider MCs. - Moves cluster-autoscaler and vpa alerts to turtles. +- Reviewed turtles alerts labels. 
- Use `ready` replicas for Kyverno webhooks alert. - Moves ownership of alerts for shared components to turtles. diff --git a/helm/prometheus-rules/templates/kaas/turtles/alerting-rules/apiserver.management-cluster.rules.yml b/helm/prometheus-rules/templates/kaas/turtles/alerting-rules/apiserver.management-cluster.rules.yml index fce397cd5..47288b9ad 100644 --- a/helm/prometheus-rules/templates/kaas/turtles/alerting-rules/apiserver.management-cluster.rules.yml +++ b/helm/prometheus-rules/templates/kaas/turtles/alerting-rules/apiserver.management-cluster.rules.yml @@ -48,7 +48,7 @@ spec: annotations: description: '{{`Kubernetes API Server admission webhook {{ $labels.name }} is timing out.`}}' opsrecipe: apiserver-admission-webhook-errors/ - expr: histogram_quantile(0.95, sum(rate(apiserver_admission_webhook_admission_duration_seconds_bucket{cluster_type="management_cluster"}[5m])) by (cluster_id, installation, pipeline, provider, name, app, le)) > 5 + expr: histogram_quantile(0.95, sum(rate(apiserver_admission_webhook_admission_duration_seconds_bucket{cluster_type="management_cluster"}[5m])) by (cluster_id, installation, pipeline, provider, name, job, le)) > 5 for: 15m labels: area: kaas diff --git a/helm/prometheus-rules/templates/kaas/turtles/alerting-rules/apiserver.workload-cluster.rules.yml b/helm/prometheus-rules/templates/kaas/turtles/alerting-rules/apiserver.workload-cluster.rules.yml index 62dae1822..518d8723f 100644 --- a/helm/prometheus-rules/templates/kaas/turtles/alerting-rules/apiserver.workload-cluster.rules.yml +++ b/helm/prometheus-rules/templates/kaas/turtles/alerting-rules/apiserver.workload-cluster.rules.yml @@ -49,7 +49,7 @@ spec: annotations: description: '{{`Kubernetes API Server admission webhook {{ $labels.name }} is timing out.`}}' opsrecipe: apiserver-admission-webhook-errors/ - expr: histogram_quantile(0.95, sum(rate(apiserver_admission_webhook_admission_duration_seconds_bucket{cluster_type="workload_cluster", 
name!~".*(prometheus|vpa.k8s.io|linkerd|validate.nginx.ingress.kubernetes.io|kong.konghq.com|cert-manager.io|kyverno|app-admission-controller).*"}[5m])) by (cluster_id, installation, pipeline, provider, name, app, le)) > 5 + expr: histogram_quantile(0.95, sum(rate(apiserver_admission_webhook_admission_duration_seconds_bucket{cluster_type="workload_cluster", name!~".*(prometheus|vpa.k8s.io|linkerd|validate.nginx.ingress.kubernetes.io|kong.konghq.com|cert-manager.io|kyverno|app-admission-controller).*"}[5m])) by (cluster_id, installation, pipeline, provider, name, job, le)) > 5 for: 15m labels: area: kaas @@ -63,7 +63,7 @@ spec: annotations: description: '{{`Kubernetes API Server admission webhook {{ $labels.name }} is timing out.`}}' opsrecipe: apiserver-admission-webhook-errors/ - expr: histogram_quantile(0.95, sum(rate(apiserver_admission_webhook_admission_duration_seconds_bucket{cluster_type="workload_cluster", name=~".*(kyverno|app-admission-controller).*"}[5m])) by (cluster_id, installation, pipeline, provider, name, app, le)) > 5 + expr: histogram_quantile(0.95, sum(rate(apiserver_admission_webhook_admission_duration_seconds_bucket{cluster_type="workload_cluster", name=~".*(kyverno|app-admission-controller).*"}[5m])) by (cluster_id, installation, pipeline, provider, name, job, le)) > 5 for: 15m labels: area: kaas @@ -77,7 +77,7 @@ spec: annotations: description: '{{`Kubernetes API Server admission webhook {{ $labels.name }} is timing out.`}}' opsrecipe: apiserver-admission-webhook-errors/ - expr: histogram_quantile(0.95, sum(rate(apiserver_admission_webhook_admission_duration_seconds_bucket{cluster_type="workload_cluster", name=~".*(linkerd|validate.nginx.ingress.kubernetes.io|kong.konghq.com|cert-manager.io).*"}[5m])) by (cluster_id, installation, pipeline, provider, name, app, le)) > 5 + expr: histogram_quantile(0.95, sum(rate(apiserver_admission_webhook_admission_duration_seconds_bucket{cluster_type="workload_cluster", 
name=~".*(linkerd|validate.nginx.ingress.kubernetes.io|kong.konghq.com|cert-manager.io).*"}[5m])) by (cluster_id, installation, pipeline, provider, name, job, le)) > 5 for: 15m labels: area: kaas @@ -91,7 +91,7 @@ spec: annotations: description: '{{`Kubernetes API Server admission webhook {{ $labels.name }} is timing out.`}}' opsrecipe: apiserver-admission-webhook-errors/ - expr: histogram_quantile(0.95, sum(rate(apiserver_admission_webhook_admission_duration_seconds_bucket{cluster_type="workload_cluster", name=~".*(vpa.k8s.io).*"}[5m])) by (cluster_id, installation, pipeline, provider, name, app, le)) > 5 + expr: histogram_quantile(0.95, sum(rate(apiserver_admission_webhook_admission_duration_seconds_bucket{cluster_type="workload_cluster", name=~".*(vpa.k8s.io).*"}[5m])) by (cluster_id, installation, pipeline, provider, name, job, le)) > 5 for: 15m labels: area: kaas @@ -105,7 +105,7 @@ spec: annotations: description: '{{`Kubernetes API Server admission webhook {{ $labels.name }} is timing out.`}}' opsrecipe: apiserver-admission-webhook-errors/ - expr: histogram_quantile(0.95, sum(rate(apiserver_admission_webhook_admission_duration_seconds_bucket{cluster_type="workload_cluster", name=~".*(prometheus).*"}[5m])) by (cluster_id, installation, pipeline, provider, name, app, le)) > 5 + expr: histogram_quantile(0.95, sum(rate(apiserver_admission_webhook_admission_duration_seconds_bucket{cluster_type="workload_cluster", name=~".*(prometheus).*"}[5m])) by (cluster_id, installation, pipeline, provider, name, job, le)) > 5 for: 15m labels: area: kaas diff --git a/helm/prometheus-rules/templates/kaas/turtles/alerting-rules/docker.rules.yml b/helm/prometheus-rules/templates/kaas/turtles/alerting-rules/docker.rules.yml index 56978f785..95e144c23 100644 --- a/helm/prometheus-rules/templates/kaas/turtles/alerting-rules/docker.rules.yml +++ b/helm/prometheus-rules/templates/kaas/turtles/alerting-rules/docker.rules.yml @@ -17,7 +17,7 @@ spec: annotations: description: '{{`Docker 
memory usage on {{ $labels.instance }} is too high.`}}' opsrecipe: docker-memory-usage-high/ - expr: process_resident_memory_bytes{app="docker"} > (5 * 1024 * 1024 * 1024) + expr: process_resident_memory_bytes{job=~".*/docker-.*"} > (5 * 1024 * 1024 * 1024) for: 15m labels: area: kaas diff --git a/helm/prometheus-rules/templates/shared/alerting-rules/node.management_cluster.rules.yml b/helm/prometheus-rules/templates/shared/alerting-rules/node.management_cluster.rules.yml index a191bcd13..d67f64279 100644 --- a/helm/prometheus-rules/templates/shared/alerting-rules/node.management_cluster.rules.yml +++ b/helm/prometheus-rules/templates/shared/alerting-rules/node.management_cluster.rules.yml @@ -26,9 +26,9 @@ spec: description: '{{`Node {{ $labels.node }} status is flapping under load.`}}' expr: | ( - sum(node_load15{cluster_type="management_cluster", app!="vault", role!="bastion"}) + sum(node_load15{cluster_type="management_cluster", service="node-exporter"}) by (cluster_id, installation, node, pipeline, provider) - / count(rate(node_cpu_seconds_total{cluster_type="management_cluster", app!="vault", role!="bastion", mode="idle"}[5m])) + / count(rate(node_cpu_seconds_total{cluster_type="management_cluster", service="node-exporter", mode="idle"}[5m])) by (cluster_id, installation, node, pipeline, provider) ) >= 2 unless on (cluster_id, installation, node, pipeline, provider) ( @@ -101,9 +101,9 @@ spec: annotations: description: '{{`Machine {{ $labels.node }} CPU load is too high.`}}' expr: | - sum(node_load5{cluster_type="management_cluster", app!="vault", role!="bastion"}) + sum(node_load5{cluster_type="management_cluster", service="node-exporter"}) by (node, cluster_id, installation, pipeline, provider) > 2 - * count(rate(node_cpu_seconds_total{cluster_type="management_cluster", mode="idle", app!="vault", role!="bastion"}[5m])) + * count(rate(node_cpu_seconds_total{cluster_type="management_cluster", mode="idle", service="node-exporter"}[5m])) by (node, cluster_id, 
installation, pipeline, provider) for: 3m labels: diff --git a/helm/prometheus-rules/templates/shared/alerting-rules/operatorkit.rules.yml b/helm/prometheus-rules/templates/shared/alerting-rules/operatorkit.rules.yml index dac351db3..4e1805ff2 100644 --- a/helm/prometheus-rules/templates/shared/alerting-rules/operatorkit.rules.yml +++ b/helm/prometheus-rules/templates/shared/alerting-rules/operatorkit.rules.yml @@ -12,9 +12,9 @@ spec: rules: - alert: OperatorkitErrorRateTooHighHoneybadger annotations: - description: '{{`{{ $labels.namespace }}/{{ $labels.app }} has reported errors. Please check logs.`}}' + description: '{{`{{ $labels.namespace }}/{{ $labels.pod }} has reported errors. Please check logs.`}}' opsrecipe: check-operator-error-rate-high/ - expr: operatorkit_controller_error_total{app=~"app-operator.*|chart-operator.*"} > 5 + expr: operatorkit_controller_error_total{pod=~"app-operator.*|chart-operator.*"} > 5 for: 1m labels: area: kaas @@ -23,8 +23,8 @@ spec: topic: qa - alert: OperatorNotReconcilingHoneybadger annotations: - description: '{{`{{ $labels.namespace }}/{{ $labels.app }} not reconciling controller {{$labels.controller}}. Please check logs.`}}' - expr: (time() - operatorkit_controller_last_reconciled{app=~"app-operator.*|chart-operator.*"}) / 60 > 30 + description: '{{`{{ $labels.namespace }}/{{ $labels.pod }} not reconciling controller {{$labels.controller}}. Please check logs.`}}' + expr: (time() - operatorkit_controller_last_reconciled{pod=~"app-operator.*|chart-operator.*"}) / 60 > 30 for: 10m labels: area: managedservices @@ -33,9 +33,9 @@ spec: topic: releng - alert: OperatorkitErrorRateTooHighPhoenix annotations: - description: '{{`{{ $labels.namespace }}/{{ $labels.app }}@{{ $labels.app_version }} has reported errors. Please check the logs.`}}' + description: '{{`{{ $labels.namespace }}/{{ $labels.pod }}@{{ $labels.app_version }} has reported errors. 
Please check the logs.`}}' opsrecipe: check-operator-error-rate-high/ - expr: rate(operatorkit_controller_error_total{app=~"aws-.*"}[5m]) > 1 + expr: rate(operatorkit_controller_error_total{pod=~"aws-.*"}[5m]) > 1 for: 10m labels: area: kaas @@ -47,9 +47,9 @@ spec: # be paged to be able to fix the issue immediately. - alert: OperatorkitErrorRateTooHighAWS annotations: - description: '{{`{{ $labels.namespace }}/{{ $labels.app }}@{{ $labels.app_version }} has reported errors. Please check the logs.`}}' + description: '{{`{{ $labels.namespace }}/{{ $labels.pod }}@{{ $labels.app_version }} has reported errors. Please check the logs.`}}' opsrecipe: check-operator-error-rate-high/ - expr: operatorkit_controller_error_total{app=~"aws-operator.+|cluster-operator.+"} > 5 + expr: operatorkit_controller_error_total{pod=~"aws-operator.+|cluster-operator.+"} > 5 for: 1m labels: area: kaas @@ -62,9 +62,9 @@ spec: # wrong to fix the root cause eventually. - alert: OperatorkitCRNotDeletedAWS annotations: - description: '{{`{{ $labels.namespace }}/{{ $labels.app }}@{{ $labels.app_version }} has not deleted object {{ $labels.namespace }}/{{ $labels.name }} of type {{ $labels.kind }} for too long.`}}' + description: '{{`{{ $labels.namespace }}/{{ $labels.pod }}@{{ $labels.app_version }} has not deleted object {{ $labels.namespace }}/{{ $labels.name }} of type {{ $labels.kind }} for too long.`}}' opsrecipe: check-not-deleted-object/ - expr: (time() - operatorkit_controller_deletion_timestamp{app=~"aws-operator.+|cluster-operator.+", provider="aws"}) > 18000 + expr: (time() - operatorkit_controller_deletion_timestamp{pod=~"aws-operator.+|cluster-operator.+", provider="aws"}) > 18000 for: 5m labels: area: kaas @@ -75,9 +75,9 @@ spec: # be paged to be able to fix the issue immediately. - alert: OperatorNotReconcilingAWS annotations: - description: '{{`{{ $labels.namespace }}/{{ $labels.app }}@{{ $labels.app_version }} has stopped the reconciliation. 
Please check logs.`}}' + description: '{{`{{ $labels.namespace }}/{{ $labels.pod }}@{{ $labels.app_version }} has stopped the reconciliation. Please check logs.`}}' opsrecipe: operator-not-reconciling/ - expr: (sum by (cluster_id, installation, pipeline, provider, instance, app, app_version, namespace)(increase(operatorkit_controller_event_count{app=~"aws-operator.+|cluster-operator.+"}[10m])) == 0 and on (cluster_id, instance) (operatorkit_controller_deletion_timestamp or operatorkit_controller_creation_timestamp)) + expr: (sum by (cluster_id, installation, pipeline, provider, instance, pod, app_version, namespace)(increase(operatorkit_controller_event_count{pod=~"aws-operator.+|cluster-operator.+"}[10m])) == 0 and on (cluster_id, instance) (operatorkit_controller_deletion_timestamp or operatorkit_controller_creation_timestamp)) for: 20m labels: area: kaas @@ -90,9 +90,9 @@ spec: # be paged to be able to fix the issue immediately. - alert: OperatorkitErrorRateTooHighKaas annotations: - description: '{{`{{ $labels.namespace }}/{{ $labels.app }}@{{ $labels.app_version }} has reported errors. Please check the logs.`}}' + description: '{{`{{ $labels.namespace }}/{{ $labels.pod }}@{{ $labels.app_version }} has reported errors. Please check the logs.`}}' opsrecipe: check-operator-error-rate-high/ - expr: operatorkit_controller_error_total{app=~"ignition-operator|cert-operator|node-operator"} > 5 + expr: operatorkit_controller_error_total{pod=~"ignition-operator.*|cert-operator.*|node-operator.*"} > 5 for: 1m labels: area: kaas @@ -103,9 +103,9 @@ spec: # be paged to be able to fix the issue immediately. - alert: OperatorNotReconcilingProviderTeam annotations: - description: '{{`{{ $labels.namespace }}/{{ $labels.app }}@{{ $labels.app_version }} has stopped the reconciliation. Please check logs.`}}' + description: '{{`{{ $labels.namespace }}/{{ $labels.pod }}@{{ $labels.app_version }} has stopped the reconciliation. 
Please check logs.`}}' opsrecipe: operator-not-reconciling/ - expr: (sum by (cluster_id, installation, pipeline, provider, instance, app, app_version, namespace)(increase(operatorkit_controller_event_count{app="node-operator"}[10m])) == 0 and on (cluster_id, instance) (operatorkit_controller_deletion_timestamp or operatorkit_controller_creation_timestamp)) + expr: (sum by (cluster_id, installation, pipeline, provider, instance, pod, app_version, namespace)(increase(operatorkit_controller_event_count{pod=~"node-operator.*"}[10m])) == 0 and on (cluster_id, instance) (operatorkit_controller_deletion_timestamp or operatorkit_controller_creation_timestamp)) for: 20m labels: area: kaas diff --git a/helm/prometheus-rules/templates/shared/alerting-rules/up.rules.yml b/helm/prometheus-rules/templates/shared/alerting-rules/up.rules.yml index f39cf8ef1..f56a4bd55 100644 --- a/helm/prometheus-rules/templates/shared/alerting-rules/up.rules.yml +++ b/helm/prometheus-rules/templates/shared/alerting-rules/up.rules.yml @@ -33,7 +33,7 @@ spec: annotations: description: '{{`Cadvisor ({{ $labels.instance }}) is down.`}}' opsrecipe: kubelet-is-down/ - expr: label_replace(up{app="cadvisor"}, "ip", "$1", "instance", "(.+):\\d+") == 0 + expr: label_replace(up{job="kubelet", metrics_path="/metrics/cadvisor"}, "ip", "$1", "instance", "(.+):\\d+") == 0 for: 1h labels: area: kaas diff --git a/helm/prometheus-rules/templates/shared/recording-rules/service-level.rules.yml b/helm/prometheus-rules/templates/shared/recording-rules/service-level.rules.yml index cda5ccb24..a4b7f846e 100644 --- a/helm/prometheus-rules/templates/shared/recording-rules/service-level.rules.yml +++ b/helm/prometheus-rules/templates/shared/recording-rules/service-level.rules.yml @@ -154,7 +154,7 @@ spec: # -- node-exporter # record of number of node-exporters. 
- - expr: count(up{app="node-exporter", role!="bastion"}) by (cluster_id, cluster_type, customer, installation, pipeline, provider, region) + - expr: count(up{job="node-exporter"}) by (cluster_id, cluster_type, customer, installation, pipeline, provider, region) labels: class: MEDIUM area: kaas @@ -167,7 +167,7 @@ spec: # multiply by -1 to get -1 for node-exporters that are up, and 0 for node-exporters that are down, # then add 1 to get 0 for node-exporters that are up, and 1 for node-exporters that are down, # then sum. - - expr: sum((up{app='node-exporter', role!="bastion"} * -1) + 1) by (cluster_id, cluster_type, customer, installation, pipeline, provider, region) + - expr: sum((up{job='node-exporter'} * -1) + 1) by (cluster_id, cluster_type, customer, installation, pipeline, provider, region) labels: area: kaas class: MEDIUM @@ -273,21 +273,21 @@ spec: # core k8s components internal API requests # record number of requests. - - expr: label_replace(sum(rest_client_requests_total{app=~"kube-controller-manager|kube-scheduler"}) by (cluster_id, cluster_type, customer, installation, pipeline, provider, region, app), "service", "$1", "app", "(.*)") + - expr: label_replace(sum(rest_client_requests_total{job=~"kube-controller-manager|kube-scheduler"}) by (cluster_id, cluster_type, customer, installation, pipeline, provider, region, job), "service", "$1", "job", "(.*)") labels: class: MEDIUM area: kaas label_application_giantswarm_io_team: {{ include "providerTeam" . }} record: raw_slo_requests # record number of errors. 
- - expr: label_replace(sum(rest_client_requests_total{app=~"kube-controller-manager|kube-scheduler", code=~"5..|"}) by (cluster_id, cluster_type, customer, installation, pipeline, provider, region, app), "service", "$1", "app", "(.*)") + - expr: label_replace(sum(rest_client_requests_total{job=~"kube-controller-manager|kube-scheduler", code=~"5..|"}) by (cluster_id, cluster_type, customer, installation, pipeline, provider, region, job), "service", "$1", "job", "(.*)") labels: area: kaas class: MEDIUM label_application_giantswarm_io_team: {{ include "providerTeam" . }} record: raw_slo_errors # -- 99% availability - - expr: label_replace(group(rest_client_requests_total{app=~"kube-controller-manager|kube-scheduler"}) by (cluster_id, cluster_type, customer, installation, pipeline, provider, region, app), "service", "$1", "app", "(.*)") * 0 + 1 - 0.99 + - expr: label_replace(group(rest_client_requests_total{job=~"kube-controller-manager|kube-scheduler"}) by (cluster_id, cluster_type, customer, installation, pipeline, provider, region, job), "service", "$1", "job", "(.*)") * 0 + 1 - 0.99 labels: area: kaas label_application_giantswarm_io_team: {{ include "providerTeam" . }} @@ -295,21 +295,21 @@ spec: # core k8s components azure API requests # record number of requests. - - expr: label_replace(sum(cloudprovider_azure_api_request_duration_seconds_count{app=~"kube-controller-manager|kube-scheduler"}) by (cluster_id, cluster_type, customer, installation, pipeline, provider, region, app), "service", "$1", "app", "(.*)") + - expr: label_replace(sum(cloudprovider_azure_api_request_duration_seconds_count{job=~"kube-controller-manager|kube-scheduler"}) by (cluster_id, cluster_type, customer, installation, pipeline, provider, region, job), "service", "$1", "job", "(.*)") labels: class: MEDIUM area: kaas label_application_giantswarm_io_team: phoenix record: raw_slo_requests # record number of errors. 
- - expr: label_replace(sum(cloudprovider_azure_api_request_errors{app=~"kube-controller-manager|kube-scheduler"}) by (cluster_id, cluster_type, customer, installation, pipeline, provider, region, app), "service", "$1", "app", "(.*)") + - expr: label_replace(sum(cloudprovider_azure_api_request_errors{job=~"kube-controller-manager|kube-scheduler"}) by (cluster_id, cluster_type, customer, installation, pipeline, provider, region, job), "service", "$1", "job", "(.*)") labels: area: kaas class: MEDIUM label_application_giantswarm_io_team: phoenix record: raw_slo_errors # -- 99% availability - - expr: label_replace(group(cloudprovider_azure_api_request_duration_seconds_count{app=~"kube-controller-manager|kube-scheduler"}) by (cluster_id, cluster_type, customer, installation, pipeline, provider, region, app), "service", "$1", "app", "(.*)") * 0 + 1 - 0.99 + - expr: label_replace(group(cloudprovider_azure_api_request_duration_seconds_count{job=~"kube-controller-manager|kube-scheduler"}) by (cluster_id, cluster_type, customer, installation, pipeline, provider, region, job), "service", "$1", "job", "(.*)") * 0 + 1 - 0.99 labels: area: kaas label_application_giantswarm_io_team: phoenix @@ -317,21 +317,21 @@ spec: # core k8s components aws API requests # record number of requests. - - expr: label_replace(sum(cloudprovider_aws_api_request_duration_seconds_count{app=~"kube-controller-manager|kube-scheduler"}) by (cluster_id, cluster_type, customer, installation, pipeline, provider, region, app), "service", "$1", "app", "(.*)") + - expr: label_replace(sum(cloudprovider_aws_api_request_duration_seconds_count{job=~"kube-controller-manager|kube-scheduler"}) by (cluster_id, cluster_type, customer, installation, pipeline, provider, region, job), "service", "$1", "job", "(.*)") labels: class: MEDIUM area: kaas label_application_giantswarm_io_team: phoenix record: raw_slo_requests # record number of errors. 
- - expr: label_replace(sum(cloudprovider_aws_api_request_errors{app=~"kube-controller-manager|kube-scheduler"}) by (cluster_id, cluster_type, customer, installation, pipeline, provider, region, app), "service", "$1", "app", "(.*)") + - expr: label_replace(sum(cloudprovider_aws_api_request_errors{job=~"kube-controller-manager|kube-scheduler"}) by (cluster_id, cluster_type, customer, installation, pipeline, provider, region, job), "service", "$1", "job", "(.*)") labels: area: kaas class: MEDIUM label_application_giantswarm_io_team: phoenix record: raw_slo_errors # -- 99% availability - - expr: label_replace(group(cloudprovider_aws_api_request_duration_seconds_count{app=~"kube-controller-manager|kube-scheduler"}) by (cluster_id, cluster_type, customer, installation, pipeline, provider, region, app), "service", "$1", "app", "(.*)") * 0 + 1 - 0.99 + - expr: label_replace(group(cloudprovider_aws_api_request_duration_seconds_count{job=~"kube-controller-manager|kube-scheduler"}) by (cluster_id, cluster_type, customer, installation, pipeline, provider, region, job), "service", "$1", "job", "(.*)") * 0 + 1 - 0.99 labels: area: kaas label_application_giantswarm_io_team: phoenix