diff --git a/CHANGELOG.md b/CHANGELOG.md
index 32a5b64b3..3566cff7d 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -7,15 +7,11 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## [Unreleased]
 
-### Fixed
-
-- Fixed usage of yq, and jq in check-opsrecipes.sh
-- Fetch jq with make install-tools
-
 ### Added
 
 - Added a new alerting rule to `falco.rules.yml` to fire an alert for XZ-backdoor.
-- Add `CiliumAPITooSlow`.
+- Added `CiliumAPITooSlow`.
+- Added `CODEOWNERS` files.
 
 ### Changed
 
@@ -25,12 +21,15 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 - Move the management cluster certificate alerts into the shared alerts because it is provider independent
 - Review and fix phoenix alerts towards Mimir and multi-provider MCs.
 - Moves cluster-autoscaler and vpa alerts to turtles.
+- Reviewed turtles alert labels.
 - Use `ready` replicas for Kyverno webhooks alert.
 - Moves ownership of alerts for shared components to turtles.
 
 ### Fixed
 
-- Fix and improve the ops-recipe test script.
+- Fixed usage of yq, and jq in check-opsrecipes.sh
+- Fetch jq with make install-tools
+- Fix and improve the check-opsrecipes.sh script to support /_index.md based ops-recipes.
 - Fix cabbage alerts for multi-provider wcs.
 - Fix shield alert area labels.
 - Fix `cert-exporter` alerting.
diff --git a/CODEOWNERS b/CODEOWNERS
index 0200516ff..28f07ef17 100644
--- a/CODEOWNERS
+++ b/CODEOWNERS
@@ -1,2 +1,9 @@
-# generated by giantswarm/github actions - changes will be overwritten
 * @giantswarm/team-atlas
+/helm/prometheus-rules/templates/kaas/bigmac/ @giantswarm/team-bigmac
+/helm/prometheus-rules/templates/kaas/phoenix/ @giantswarm/team-phoenix
+/helm/prometheus-rules/templates/kaas/rocket/ @giantswarm/team-rocket
+/helm/prometheus-rules/templates/kaas/turtles/ @giantswarm/team-turtles
+/helm/prometheus-rules/templates/platform/atlas/ @giantswarm/team-atlas
+/helm/prometheus-rules/templates/platform/cabbage/ @giantswarm/team-cabbage
+/helm/prometheus-rules/templates/platform/honeybadger/ @giantswarm/team-honeybadger
+/helm/prometheus-rules/templates/platform/shield/ @giantswarm/team-shield
diff --git a/helm/prometheus-rules/templates/kaas/turtles/alerting-rules/apiserver.management-cluster.rules.yml b/helm/prometheus-rules/templates/kaas/turtles/alerting-rules/apiserver.management-cluster.rules.yml
index fce397cd5..47288b9ad 100644
--- a/helm/prometheus-rules/templates/kaas/turtles/alerting-rules/apiserver.management-cluster.rules.yml
+++ b/helm/prometheus-rules/templates/kaas/turtles/alerting-rules/apiserver.management-cluster.rules.yml
@@ -48,7 +48,7 @@ spec:
       annotations:
         description: '{{`Kubernetes API Server admission webhook {{ $labels.name }} is timing out.`}}'
         opsrecipe: apiserver-admission-webhook-errors/
-      expr: histogram_quantile(0.95, sum(rate(apiserver_admission_webhook_admission_duration_seconds_bucket{cluster_type="management_cluster"}[5m])) by (cluster_id, installation, pipeline, provider, name, app, le)) > 5
+      expr: histogram_quantile(0.95, sum(rate(apiserver_admission_webhook_admission_duration_seconds_bucket{cluster_type="management_cluster"}[5m])) by (cluster_id, installation, pipeline, provider, name, job, le)) > 5
       for: 15m
       labels:
         area: kaas
diff --git a/helm/prometheus-rules/templates/kaas/turtles/alerting-rules/apiserver.workload-cluster.rules.yml b/helm/prometheus-rules/templates/kaas/turtles/alerting-rules/apiserver.workload-cluster.rules.yml
index 62dae1822..518d8723f 100644
--- a/helm/prometheus-rules/templates/kaas/turtles/alerting-rules/apiserver.workload-cluster.rules.yml
+++ b/helm/prometheus-rules/templates/kaas/turtles/alerting-rules/apiserver.workload-cluster.rules.yml
@@ -49,7 +49,7 @@ spec:
       annotations:
         description: '{{`Kubernetes API Server admission webhook {{ $labels.name }} is timing out.`}}'
         opsrecipe: apiserver-admission-webhook-errors/
-      expr: histogram_quantile(0.95, sum(rate(apiserver_admission_webhook_admission_duration_seconds_bucket{cluster_type="workload_cluster", name!~".*(prometheus|vpa.k8s.io|linkerd|validate.nginx.ingress.kubernetes.io|kong.konghq.com|cert-manager.io|kyverno|app-admission-controller).*"}[5m])) by (cluster_id, installation, pipeline, provider, name, app, le)) > 5
+      expr: histogram_quantile(0.95, sum(rate(apiserver_admission_webhook_admission_duration_seconds_bucket{cluster_type="workload_cluster", name!~".*(prometheus|vpa.k8s.io|linkerd|validate.nginx.ingress.kubernetes.io|kong.konghq.com|cert-manager.io|kyverno|app-admission-controller).*"}[5m])) by (cluster_id, installation, pipeline, provider, name, job, le)) > 5
       for: 15m
       labels:
         area: kaas
@@ -63,7 +63,7 @@ spec:
       annotations:
         description: '{{`Kubernetes API Server admission webhook {{ $labels.name }} is timing out.`}}'
         opsrecipe: apiserver-admission-webhook-errors/
-      expr: histogram_quantile(0.95, sum(rate(apiserver_admission_webhook_admission_duration_seconds_bucket{cluster_type="workload_cluster", name=~".*(kyverno|app-admission-controller).*"}[5m])) by (cluster_id, installation, pipeline, provider, name, app, le)) > 5
+      expr: histogram_quantile(0.95, sum(rate(apiserver_admission_webhook_admission_duration_seconds_bucket{cluster_type="workload_cluster", name=~".*(kyverno|app-admission-controller).*"}[5m])) by (cluster_id, installation, pipeline, provider, name, job, le)) > 5
       for: 15m
       labels:
         area: kaas
@@ -77,7 +77,7 @@ spec:
       annotations:
         description: '{{`Kubernetes API Server admission webhook {{ $labels.name }} is timing out.`}}'
         opsrecipe: apiserver-admission-webhook-errors/
-      expr: histogram_quantile(0.95, sum(rate(apiserver_admission_webhook_admission_duration_seconds_bucket{cluster_type="workload_cluster", name=~".*(linkerd|validate.nginx.ingress.kubernetes.io|kong.konghq.com|cert-manager.io).*"}[5m])) by (cluster_id, installation, pipeline, provider, name, app, le)) > 5
+      expr: histogram_quantile(0.95, sum(rate(apiserver_admission_webhook_admission_duration_seconds_bucket{cluster_type="workload_cluster", name=~".*(linkerd|validate.nginx.ingress.kubernetes.io|kong.konghq.com|cert-manager.io).*"}[5m])) by (cluster_id, installation, pipeline, provider, name, job, le)) > 5
       for: 15m
       labels:
         area: kaas
@@ -91,7 +91,7 @@ spec:
       annotations:
         description: '{{`Kubernetes API Server admission webhook {{ $labels.name }} is timing out.`}}'
         opsrecipe: apiserver-admission-webhook-errors/
-      expr: histogram_quantile(0.95, sum(rate(apiserver_admission_webhook_admission_duration_seconds_bucket{cluster_type="workload_cluster", name=~".*(vpa.k8s.io).*"}[5m])) by (cluster_id, installation, pipeline, provider, name, app, le)) > 5
+      expr: histogram_quantile(0.95, sum(rate(apiserver_admission_webhook_admission_duration_seconds_bucket{cluster_type="workload_cluster", name=~".*(vpa.k8s.io).*"}[5m])) by (cluster_id, installation, pipeline, provider, name, job, le)) > 5
       for: 15m
       labels:
         area: kaas
@@ -105,7 +105,7 @@ spec:
       annotations:
         description: '{{`Kubernetes API Server admission webhook {{ $labels.name }} is timing out.`}}'
        opsrecipe: apiserver-admission-webhook-errors/
-      expr: histogram_quantile(0.95, sum(rate(apiserver_admission_webhook_admission_duration_seconds_bucket{cluster_type="workload_cluster", name=~".*(prometheus).*"}[5m])) by (cluster_id, installation, pipeline, provider, name, app, le)) > 5
+      expr: histogram_quantile(0.95, sum(rate(apiserver_admission_webhook_admission_duration_seconds_bucket{cluster_type="workload_cluster", name=~".*(prometheus).*"}[5m])) by (cluster_id, installation, pipeline, provider, name, job, le)) > 5
       for: 15m
       labels:
         area: kaas
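Every hunk in the two apiserver rule files above makes the same change: the admission-webhook latency quantile is now aggregated by the scrape `job` label instead of the `app` label. A quick, hypothetical spot check before merging is to run the same quantile grouped by `job` against a live installation and confirm the label is actually present on the metric (the `by` clause is trimmed here for readability; adjust it to whatever labels the installation's scrape configuration exposes):

```promql
# p95 admission-webhook latency per webhook, grouped by the new `job` label.
# If `job` is missing from the series, this returns nothing and the relabelled rule would never fire.
histogram_quantile(
  0.95,
  sum(rate(apiserver_admission_webhook_admission_duration_seconds_bucket{cluster_type="management_cluster"}[5m]))
    by (cluster_id, name, job, le)
)
```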
sum(rate(apiserver_admission_webhook_admission_duration_seconds_bucket{cluster_type="workload_cluster", name=~".*(prometheus).*"}[5m])) by (cluster_id, installation, pipeline, provider, name, app, le)) > 5 + expr: histogram_quantile(0.95, sum(rate(apiserver_admission_webhook_admission_duration_seconds_bucket{cluster_type="workload_cluster", name=~".*(prometheus).*"}[5m])) by (cluster_id, installation, pipeline, provider, name, job, le)) > 5 for: 15m labels: area: kaas diff --git a/helm/prometheus-rules/templates/kaas/turtles/alerting-rules/docker.rules.yml b/helm/prometheus-rules/templates/kaas/turtles/alerting-rules/docker.rules.yml index 56978f785..95e144c23 100644 --- a/helm/prometheus-rules/templates/kaas/turtles/alerting-rules/docker.rules.yml +++ b/helm/prometheus-rules/templates/kaas/turtles/alerting-rules/docker.rules.yml @@ -17,7 +17,7 @@ spec: annotations: description: '{{`Docker memory usage on {{ $labels.instance }} is too high.`}}' opsrecipe: docker-memory-usage-high/ - expr: process_resident_memory_bytes{app="docker"} > (5 * 1024 * 1024 * 1024) + expr: process_resident_memory_bytes{job=~".*/docker-.*"} > (5 * 1024 * 1024 * 1024) for: 15m labels: area: kaas diff --git a/helm/prometheus-rules/templates/kaas/turtles/alerting-rules/kubelet.rules.yml b/helm/prometheus-rules/templates/kaas/turtles/alerting-rules/kubelet.rules.yml index eaa7c6804..ea0163a88 100644 --- a/helm/prometheus-rules/templates/kaas/turtles/alerting-rules/kubelet.rules.yml +++ b/helm/prometheus-rules/templates/kaas/turtles/alerting-rules/kubelet.rules.yml @@ -14,7 +14,7 @@ spec: annotations: description: '{{`Cadvisor ({{ $labels.instance }}) is down.`}}' opsrecipe: kubelet-is-down/ - expr: label_replace(up{app="cadvisor"}, "ip", "$1", "instance", "(.+):\\d+") == 0 + expr: label_replace(up{job="kubelet", metrics_path="/metrics/cadvisor"}, "ip", "$1", "instance", "(.+):\\d+") == 0 for: 1h labels: area: kaas diff --git a/helm/prometheus-rules/templates/kaas/turtles/alerting-rules/node.management-cluster.rules.yml b/helm/prometheus-rules/templates/kaas/turtles/alerting-rules/node.management-cluster.rules.yml index a191bcd13..d67f64279 100644 --- a/helm/prometheus-rules/templates/kaas/turtles/alerting-rules/node.management-cluster.rules.yml +++ b/helm/prometheus-rules/templates/kaas/turtles/alerting-rules/node.management-cluster.rules.yml @@ -26,9 +26,9 @@ spec: description: '{{`Node {{ $labels.node }} status is flapping under load.`}}' expr: | ( - sum(node_load15{cluster_type="management_cluster", app!="vault", role!="bastion"}) + sum(node_load15{cluster_type="management_cluster", service="node-exporter"}) by (cluster_id, installation, node, pipeline, provider) - / count(rate(node_cpu_seconds_total{cluster_type="management_cluster", app!="vault", role!="bastion", mode="idle"}[5m])) + / count(rate(node_cpu_seconds_total{cluster_type="management_cluster", service="node-exporter", mode="idle"}[5m])) by (cluster_id, installation, node, pipeline, provider) ) >= 2 unless on (cluster_id, installation, node, pipeline, provider) ( @@ -101,9 +101,9 @@ spec: annotations: description: '{{`Machine {{ $labels.node }} CPU load is too high.`}}' expr: | - sum(node_load5{cluster_type="management_cluster", app!="vault", role!="bastion"}) + sum(node_load5{cluster_type="management_cluster", service="node-exporter"}) by (node, cluster_id, installation, pipeline, provider) > 2 - * count(rate(node_cpu_seconds_total{cluster_type="management_cluster", mode="idle", app!="vault", role!="bastion"}[5m])) + * 
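The kubelet hunk above replaces the old `app="cadvisor"` selector with a `job`/`metrics_path` pair. As a hedge against a silent mismatch, a query along these lines (labels assumed; they depend on how the installation scrapes kubelets) shows whether the new selector still matches any targets per cluster:

```promql
# Number of cadvisor scrape targets matched by the new selector, per cluster.
# Zero on a healthy cluster would suggest the job/metrics_path labels differ on this installation.
count by (cluster_id) (up{job="kubelet", metrics_path="/metrics/cadvisor"})
```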
count(rate(node_cpu_seconds_total{cluster_type="management_cluster", mode="idle", service="node-exporter"}[5m])) by (node, cluster_id, installation, pipeline, provider) for: 3m labels: diff --git a/helm/prometheus-rules/templates/platform/shield/alerting-rules/kyverno.all.rules.yml b/helm/prometheus-rules/templates/platform/shield/alerting-rules/kyverno.rules.yml similarity index 81% rename from helm/prometheus-rules/templates/platform/shield/alerting-rules/kyverno.all.rules.yml rename to helm/prometheus-rules/templates/platform/shield/alerting-rules/kyverno.rules.yml index d9697e767..42fb673b7 100644 --- a/helm/prometheus-rules/templates/platform/shield/alerting-rules/kyverno.all.rules.yml +++ b/helm/prometheus-rules/templates/platform/shield/alerting-rules/kyverno.rules.yml @@ -4,11 +4,24 @@ metadata: creationTimestamp: null labels: {{- include "labels.common" . | nindent 4 }} - name: kyverno.all.rules + name: kyverno.rules namespace: {{ .Values.namespace }} spec: groups: - - name: webhooks + - name: kyverno.certificates + rules: + - alert: KyvernoCertificateSecretWillExpireInLessThanTwoDays + annotations: + description: '{{`Kyverno Certificate stored in Secret {{ $labels.namespace }}/{{ $labels.name }} on {{ $labels.cluster_id }} will expire in less than two days.`}}' + opsrecipe: kyverno-certificate-secret-will-expire-in-less-than-two-days/ + expr: (cert_exporter_secret_not_after{name=~".*kyverno.*"} - time()) < 2 * 24 * 60 * 60 + labels: + area: managedservices + cancel_if_outside_working_hours: "true" + severity: notify + team: shield + topic: kyverno + - name: kyverno.webhooks rules: - alert: KyvernoWebhookHasNoAvailableReplicas annotations: @@ -25,7 +38,7 @@ spec: severity: page team: shield topic: kyverno - - name: resources + - name: kyverno.resources rules: - alert: KyvernoUpdateRequestsCountTooHigh annotations: @@ -42,7 +55,7 @@ spec: severity: notify team: shield topic: kyverno - - name: replicas + - name: kyverno.replicas rules: - alert: KyvernoScaledDownTooLong annotations: diff --git a/helm/prometheus-rules/templates/shared/alerting-rules/certificate.all.rules.yml b/helm/prometheus-rules/templates/shared/alerting-rules/certificate.all.rules.yml index a04fda923..259504573 100644 --- a/helm/prometheus-rules/templates/shared/alerting-rules/certificate.all.rules.yml +++ b/helm/prometheus-rules/templates/shared/alerting-rules/certificate.all.rules.yml @@ -36,17 +36,6 @@ spec: severity: page team: phoenix topic: cert-manager - - alert: KyvernoCertificateSecretWillExpireInLessThanTwoDays - annotations: - description: '{{`Kyverno Certificate stored in Secret {{ $labels.namespace }}/{{ $labels.name }} on {{ $labels.cluster_id }} will expire in less than two days.`}}' - opsrecipe: kyverno-certificate-secret-will-expire-in-less-than-two-days/ - expr: (cert_exporter_secret_not_after{name=~".*kyverno.*"} - time()) < 2 * 24 * 60 * 60 - labels: - area: managedservices - cancel_if_outside_working_hours: "true" - severity: notify - team: shield - topic: kyverno - alert: CertificateSecretWillExpireInLessThanTwoWeeks annotations: description: '{{`Certificate stored in Secret {{ $labels.namespace }}/{{ $labels.name }} on {{ $labels.cluster_id }} will expire in less than two weeks.`}}' diff --git a/helm/prometheus-rules/templates/shared/alerting-rules/deployment.management-cluster.rules.yml b/helm/prometheus-rules/templates/shared/alerting-rules/deployment.management-cluster.rules.yml index 5a587e527..e3c268378 100644 --- 
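The Kyverno certificate alert is moved verbatim from the shared certificate rules into the new `kyverno.certificates` group; its threshold is `2 * 24 * 60 * 60` seconds, i.e. two days. When tuning or debugging it, a query of this shape (the same series the relocated alert evaluates, assuming `cert-exporter` is installed) reports the remaining lifetime in days instead of a boolean:

```promql
# Days until expiry for Kyverno-related certificate secrets; the alert fires below 2.
(cert_exporter_secret_not_after{name=~".*kyverno.*"} - time()) / 86400
```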
diff --git a/helm/prometheus-rules/templates/shared/alerting-rules/deployment.management-cluster.rules.yml b/helm/prometheus-rules/templates/shared/alerting-rules/deployment.management-cluster.rules.yml
index 5a587e527..e3c268378 100644
--- a/helm/prometheus-rules/templates/shared/alerting-rules/deployment.management-cluster.rules.yml
+++ b/helm/prometheus-rules/templates/shared/alerting-rules/deployment.management-cluster.rules.yml
@@ -17,17 +17,17 @@ spec:
       annotations:
         description: '{{`Deployment {{ $labels.namespace}}/{{ $labels.deployment }} is not satisfied.`}}'
         opsrecipe: deployment-not-satisfied/
-      expr: kube_deployment_status_replicas_unavailable{cluster_type="management_cluster", deployment=~"alertmanager.*|grafana.*|prometheus.*|promxy.*|mimir.*|loki.*|object-storage.*|logging-operator.*|silence-operator.*|sloth.*"} > 0
+      expr: kube_deployment_status_replicas_unavailable{cluster_type="management_cluster", deployment=~"alertmanager.*|grafana.*|prometheus.*|promxy.*|mimir.*|loki.*|tempo.*|pyroscope.*|object-storage-operator.*|observability-operator.*|logging-operator.*|silence-operator.*|sloth.*"} > 0
       for: 30m
       labels:
-        area: kaas
+        area: platform
         cancel_if_cluster_status_creating: "true"
         cancel_if_cluster_status_deleting: "true"
         cancel_if_cluster_status_updating: "true"
         cancel_if_outside_working_hours: {{ include "workingHoursOnly" . }}
         severity: page
         team: atlas
-        topic: managementcluster
+        topic: observability
     - alert: DeploymentNotSatisfiedHoneybadger
       annotations:
         description: '{{`Deployment {{ $labels.namespace}}/{{ $labels.deployment }} is not satisfied.`}}'
         opsrecipe: deployment-not-satisfied/
diff --git a/helm/prometheus-rules/templates/shared/alerting-rules/operatorkit.rules.yml b/helm/prometheus-rules/templates/shared/alerting-rules/operatorkit.rules.yml
index dac351db3..4e1805ff2 100644
--- a/helm/prometheus-rules/templates/shared/alerting-rules/operatorkit.rules.yml
+++ b/helm/prometheus-rules/templates/shared/alerting-rules/operatorkit.rules.yml
@@ -12,9 +12,9 @@ spec:
     rules:
     - alert: OperatorkitErrorRateTooHighHoneybadger
       annotations:
-        description: '{{`{{ $labels.namespace }}/{{ $labels.app }} has reported errors. Please check logs.`}}'
+        description: '{{`{{ $labels.namespace }}/{{ $labels.pod }} has reported errors. Please check logs.`}}'
         opsrecipe: check-operator-error-rate-high/
-      expr: operatorkit_controller_error_total{app=~"app-operator.*|chart-operator.*"} > 5
+      expr: operatorkit_controller_error_total{pod=~"app-operator.*|chart-operator.*"} > 5
       for: 1m
       labels:
         area: kaas
@@ -23,8 +23,8 @@ spec:
         topic: qa
     - alert: OperatorNotReconcilingHoneybadger
       annotations:
-        description: '{{`{{ $labels.namespace }}/{{ $labels.app }} not reconciling controller {{$labels.controller}}. Please check logs.`}}'
-      expr: (time() - operatorkit_controller_last_reconciled{app=~"app-operator.*|chart-operator.*"}) / 60 > 30
+        description: '{{`{{ $labels.namespace }}/{{ $labels.pod }} not reconciling controller {{$labels.controller}}. Please check logs.`}}'
+      expr: (time() - operatorkit_controller_last_reconciled{pod=~"app-operator.*|chart-operator.*"}) / 60 > 30
       for: 10m
       labels:
         area: managedservices
@@ -33,9 +33,9 @@ spec:
         topic: releng
     - alert: OperatorkitErrorRateTooHighPhoenix
       annotations:
-        description: '{{`{{ $labels.namespace }}/{{ $labels.app }}@{{ $labels.app_version }} has reported errors. Please check the logs.`}}'
+        description: '{{`{{ $labels.namespace }}/{{ $labels.pod }}@{{ $labels.app_version }} has reported errors. Please check the logs.`}}'
         opsrecipe: check-operator-error-rate-high/
-      expr: rate(operatorkit_controller_error_total{app=~"aws-.*"}[5m]) > 1
+      expr: rate(operatorkit_controller_error_total{pod=~"aws-.*"}[5m]) > 1
       for: 10m
       labels:
         area: kaas
@@ -47,9 +47,9 @@ spec:
     # be paged to be able to fix the issue immediately.
     - alert: OperatorkitErrorRateTooHighAWS
       annotations:
-        description: '{{`{{ $labels.namespace }}/{{ $labels.app }}@{{ $labels.app_version }} has reported errors. Please check the logs.`}}'
+        description: '{{`{{ $labels.namespace }}/{{ $labels.pod }}@{{ $labels.app_version }} has reported errors. Please check the logs.`}}'
         opsrecipe: check-operator-error-rate-high/
-      expr: operatorkit_controller_error_total{app=~"aws-operator.+|cluster-operator.+"} > 5
+      expr: operatorkit_controller_error_total{pod=~"aws-operator.+|cluster-operator.+"} > 5
       for: 1m
       labels:
         area: kaas
@@ -62,9 +62,9 @@ spec:
     # wrong to fix the root cause eventually.
     - alert: OperatorkitCRNotDeletedAWS
       annotations:
-        description: '{{`{{ $labels.namespace }}/{{ $labels.app }}@{{ $labels.app_version }} has not deleted object {{ $labels.namespace }}/{{ $labels.name }} of type {{ $labels.kind }} for too long.`}}'
+        description: '{{`{{ $labels.namespace }}/{{ $labels.pod }}@{{ $labels.app_version }} has not deleted object {{ $labels.namespace }}/{{ $labels.name }} of type {{ $labels.kind }} for too long.`}}'
         opsrecipe: check-not-deleted-object/
-      expr: (time() - operatorkit_controller_deletion_timestamp{app=~"aws-operator.+|cluster-operator.+", provider="aws"}) > 18000
+      expr: (time() - operatorkit_controller_deletion_timestamp{pod=~"aws-operator.+|cluster-operator.+", provider="aws"}) > 18000
       for: 5m
       labels:
         area: kaas
@@ -75,9 +75,9 @@ spec:
     # be paged to be able to fix the issue immediately.
     - alert: OperatorNotReconcilingAWS
       annotations:
-        description: '{{`{{ $labels.namespace }}/{{ $labels.app }}@{{ $labels.app_version }} has stopped the reconciliation. Please check logs.`}}'
+        description: '{{`{{ $labels.namespace }}/{{ $labels.pod }}@{{ $labels.app_version }} has stopped the reconciliation. Please check logs.`}}'
         opsrecipe: operator-not-reconciling/
-      expr: (sum by (cluster_id, installation, pipeline, provider, instance, app, app_version, namespace)(increase(operatorkit_controller_event_count{app=~"aws-operator.+|cluster-operator.+"}[10m])) == 0 and on (cluster_id, instance) (operatorkit_controller_deletion_timestamp or operatorkit_controller_creation_timestamp))
+      expr: (sum by (cluster_id, installation, pipeline, provider, instance, pod, app_version, namespace)(increase(operatorkit_controller_event_count{pod=~"aws-operator.+|cluster-operator.+"}[10m])) == 0 and on (cluster_id, instance) (operatorkit_controller_deletion_timestamp or operatorkit_controller_creation_timestamp))
       for: 20m
       labels:
         area: kaas
@@ -90,9 +90,9 @@ spec:
     # be paged to be able to fix the issue immediately.
     - alert: OperatorkitErrorRateTooHighKaas
       annotations:
-        description: '{{`{{ $labels.namespace }}/{{ $labels.app }}@{{ $labels.app_version }} has reported errors. Please check the logs.`}}'
+        description: '{{`{{ $labels.namespace }}/{{ $labels.pod }}@{{ $labels.app_version }} has reported errors. Please check the logs.`}}'
         opsrecipe: check-operator-error-rate-high/
-      expr: operatorkit_controller_error_total{app=~"ignition-operator|cert-operator|node-operator"} > 5
+      expr: operatorkit_controller_error_total{pod=~"ignition-operator.*|cert-operator.*|node-operator.*"} > 5
       for: 1m
       labels:
         area: kaas
@@ -103,9 +103,9 @@ spec:
     # be paged to be able to fix the issue immediately.
     - alert: OperatorNotReconcilingProviderTeam
       annotations:
-        description: '{{`{{ $labels.namespace }}/{{ $labels.app }}@{{ $labels.app_version }} has stopped the reconciliation. Please check logs.`}}'
+        description: '{{`{{ $labels.namespace }}/{{ $labels.pod }}@{{ $labels.app_version }} has stopped the reconciliation. Please check logs.`}}'
         opsrecipe: operator-not-reconciling/
-      expr: (sum by (cluster_id, installation, pipeline, provider, instance, app, app_version, namespace)(increase(operatorkit_controller_event_count{app="node-operator"}[10m])) == 0 and on (cluster_id, instance) (operatorkit_controller_deletion_timestamp or operatorkit_controller_creation_timestamp))
+      expr: (sum by (cluster_id, installation, pipeline, provider, instance, pod, app_version, namespace)(increase(operatorkit_controller_event_count{pod=~"node-operator.*"}[10m])) == 0 and on (cluster_id, instance) (operatorkit_controller_deletion_timestamp or operatorkit_controller_creation_timestamp))
       for: 20m
       labels:
         area: kaas
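All of the operatorkit hunks make the same substitution: the `app` matcher becomes a `pod` matcher, with the regexes widened (for example `node-operator` to `node-operator.*`) because pod names carry replica-set and pod hash suffixes. A hypothetical way to confirm the widened regexes still catch the running operators on an installation (label availability assumed) is to list the matching `pod` values:

```promql
# Which operator pods currently expose operatorkit error counters under the new matchers?
count by (pod) (operatorkit_controller_error_total{pod=~"app-operator.*|chart-operator.*|aws-operator.+|cluster-operator.+"})
```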
diff --git a/helm/prometheus-rules/templates/shared/recording-rules/service-level.rules.yml b/helm/prometheus-rules/templates/shared/recording-rules/service-level.rules.yml
index cda5ccb24..a4b7f846e 100644
--- a/helm/prometheus-rules/templates/shared/recording-rules/service-level.rules.yml
+++ b/helm/prometheus-rules/templates/shared/recording-rules/service-level.rules.yml
@@ -154,7 +154,7 @@ spec:
 
     # -- node-exporter
     # record of number of node-exporters.
-    - expr: count(up{app="node-exporter", role!="bastion"}) by (cluster_id, cluster_type, customer, installation, pipeline, provider, region)
+    - expr: count(up{job="node-exporter"}) by (cluster_id, cluster_type, customer, installation, pipeline, provider, region)
       labels:
         class: MEDIUM
         area: kaas
@@ -167,7 +167,7 @@
     # multiply by -1 to get -1 for node-exporters that are up, and 0 for node-exporters that are down,
     # then add 1 to get 0 for node-exporters that are up, and 1 for node-exporters that are down,
     # then sum.
-    - expr: sum((up{app='node-exporter', role!="bastion"} * -1) + 1) by (cluster_id, cluster_type, customer, installation, pipeline, provider, region)
+    - expr: sum((up{job='node-exporter'} * -1) + 1) by (cluster_id, cluster_type, customer, installation, pipeline, provider, region)
       labels:
         area: kaas
         class: MEDIUM
@@ -273,21 +273,21 @@
 
     # core k8s components internal API requests
     # record number of requests.
-    - expr: label_replace(sum(rest_client_requests_total{app=~"kube-controller-manager|kube-scheduler"}) by (cluster_id, cluster_type, customer, installation, pipeline, provider, region, app), "service", "$1", "app", "(.*)")
+    - expr: label_replace(sum(rest_client_requests_total{job=~"kube-controller-manager|kube-scheduler"}) by (cluster_id, cluster_type, customer, installation, pipeline, provider, region, job), "service", "$1", "job", "(.*)")
       labels:
         class: MEDIUM
         area: kaas
         label_application_giantswarm_io_team: {{ include "providerTeam" . }}
       record: raw_slo_requests
     # record number of errors.
-    - expr: label_replace(sum(rest_client_requests_total{app=~"kube-controller-manager|kube-scheduler", code=~"5..|"}) by (cluster_id, cluster_type, customer, installation, pipeline, provider, region, app), "service", "$1", "app", "(.*)")
+    - expr: label_replace(sum(rest_client_requests_total{job=~"kube-controller-manager|kube-scheduler", code=~"5..|"}) by (cluster_id, cluster_type, customer, installation, pipeline, provider, region, job), "service", "$1", "job", "(.*)")
       labels:
         area: kaas
         class: MEDIUM
         label_application_giantswarm_io_team: {{ include "providerTeam" . }}
       record: raw_slo_errors
     # -- 99% availability
-    - expr: label_replace(group(rest_client_requests_total{app=~"kube-controller-manager|kube-scheduler"}) by (cluster_id, cluster_type, customer, installation, pipeline, provider, region, app), "service", "$1", "app", "(.*)") * 0 + 1 - 0.99
+    - expr: label_replace(group(rest_client_requests_total{job=~"kube-controller-manager|kube-scheduler"}) by (cluster_id, cluster_type, customer, installation, pipeline, provider, region, job), "service", "$1", "job", "(.*)") * 0 + 1 - 0.99
       labels:
         area: kaas
         label_application_giantswarm_io_team: {{ include "providerTeam" . }}
@@ -295,21 +295,21 @@
 
     # core k8s components azure API requests
     # record number of requests.
-    - expr: label_replace(sum(cloudprovider_azure_api_request_duration_seconds_count{app=~"kube-controller-manager|kube-scheduler"}) by (cluster_id, cluster_type, customer, installation, pipeline, provider, region, app), "service", "$1", "app", "(.*)")
+    - expr: label_replace(sum(cloudprovider_azure_api_request_duration_seconds_count{job=~"kube-controller-manager|kube-scheduler"}) by (cluster_id, cluster_type, customer, installation, pipeline, provider, region, job), "service", "$1", "job", "(.*)")
       labels:
         class: MEDIUM
         area: kaas
         label_application_giantswarm_io_team: phoenix
       record: raw_slo_requests
     # record number of errors.
-    - expr: label_replace(sum(cloudprovider_azure_api_request_errors{app=~"kube-controller-manager|kube-scheduler"}) by (cluster_id, cluster_type, customer, installation, pipeline, provider, region, app), "service", "$1", "app", "(.*)")
+    - expr: label_replace(sum(cloudprovider_azure_api_request_errors{job=~"kube-controller-manager|kube-scheduler"}) by (cluster_id, cluster_type, customer, installation, pipeline, provider, region, job), "service", "$1", "job", "(.*)")
       labels:
         area: kaas
         class: MEDIUM
         label_application_giantswarm_io_team: phoenix
       record: raw_slo_errors
     # -- 99% availability
-    - expr: label_replace(group(cloudprovider_azure_api_request_duration_seconds_count{app=~"kube-controller-manager|kube-scheduler"}) by (cluster_id, cluster_type, customer, installation, pipeline, provider, region, app), "service", "$1", "app", "(.*)") * 0 + 1 - 0.99
+    - expr: label_replace(group(cloudprovider_azure_api_request_duration_seconds_count{job=~"kube-controller-manager|kube-scheduler"}) by (cluster_id, cluster_type, customer, installation, pipeline, provider, region, job), "service", "$1", "job", "(.*)") * 0 + 1 - 0.99
       labels:
         area: kaas
         label_application_giantswarm_io_team: phoenix
@@ -317,21 +317,21 @@
 
     # core k8s components aws API requests
    # record number of requests.
-    - expr: label_replace(sum(cloudprovider_aws_api_request_duration_seconds_count{app=~"kube-controller-manager|kube-scheduler"}) by (cluster_id, cluster_type, customer, installation, pipeline, provider, region, app), "service", "$1", "app", "(.*)")
+    - expr: label_replace(sum(cloudprovider_aws_api_request_duration_seconds_count{job=~"kube-controller-manager|kube-scheduler"}) by (cluster_id, cluster_type, customer, installation, pipeline, provider, region, job), "service", "$1", "job", "(.*)")
       labels:
         class: MEDIUM
         area: kaas
         label_application_giantswarm_io_team: phoenix
       record: raw_slo_requests
     # record number of errors.
-    - expr: label_replace(sum(cloudprovider_aws_api_request_errors{app=~"kube-controller-manager|kube-scheduler"}) by (cluster_id, cluster_type, customer, installation, pipeline, provider, region, app), "service", "$1", "app", "(.*)")
+    - expr: label_replace(sum(cloudprovider_aws_api_request_errors{job=~"kube-controller-manager|kube-scheduler"}) by (cluster_id, cluster_type, customer, installation, pipeline, provider, region, job), "service", "$1", "job", "(.*)")
       labels:
         area: kaas
         class: MEDIUM
         label_application_giantswarm_io_team: phoenix
       record: raw_slo_errors
     # -- 99% availability
-    - expr: label_replace(group(cloudprovider_aws_api_request_duration_seconds_count{app=~"kube-controller-manager|kube-scheduler"}) by (cluster_id, cluster_type, customer, installation, pipeline, provider, region, app), "service", "$1", "app", "(.*)") * 0 + 1 - 0.99
+    - expr: label_replace(group(cloudprovider_aws_api_request_duration_seconds_count{job=~"kube-controller-manager|kube-scheduler"}) by (cluster_id, cluster_type, customer, installation, pipeline, provider, region, job), "service", "$1", "job", "(.*)") * 0 + 1 - 0.99
       labels:
         area: kaas
         label_application_giantswarm_io_team: phoenix
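Throughout the recording rules above, `label_replace` now derives the `service` label from `job` rather than from `app`. As a reminder of the mechanics (illustrative query with a minimal `by` clause, not the rendered rule): `label_replace` only adds the destination label, so the source `job` label is kept and one series per component is produced with a matching `service` value.

```promql
# Copies each series' `job` value into a new `service` label; `job` itself is left untouched.
label_replace(
  sum(rest_client_requests_total{job=~"kube-controller-manager|kube-scheduler"}) by (cluster_id, job),
  "service", "$1", "job", "(.*)"
)
```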