diff --git a/CHANGELOG.md b/CHANGELOG.md
index e832ee71..12509847 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -11,6 +11,12 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 - Alerting rule for Loki missing logs at ingestion
 
+## [4.17.0] - 2024-10-03
+
+### Removed
+
+- Remove legacy in-house SLO framework.
+
 ## [4.16.1] - 2024-09-26
 
 ### Fixed
@@ -3127,7 +3133,8 @@ Fix `PromtailRequestsErrors` alerts as promtail retries after some backoff so ac
 
 - Add existing rules from https://github.com/giantswarm/prometheus-meta-operator/pull/637/commits/bc6a26759eb955de92b41ed5eb33fa37980660f2
 
-[Unreleased]: https://github.com/giantswarm/prometheus-rules/compare/v4.16.1...HEAD
+[Unreleased]: https://github.com/giantswarm/prometheus-rules/compare/v4.17.0...HEAD
+[4.17.0]: https://github.com/giantswarm/prometheus-rules/compare/v4.16.1...v4.17.0
 [4.16.1]: https://github.com/giantswarm/prometheus-rules/compare/v4.16.0...v4.16.1
 [4.16.0]: https://github.com/giantswarm/prometheus-rules/compare/v4.15.2...v4.16.0
 [4.15.2]: https://github.com/giantswarm/prometheus-rules/compare/v4.15.1...v4.15.2
diff --git a/helm/prometheus-rules/Chart.yaml b/helm/prometheus-rules/Chart.yaml
index 39b6c315..82992b6a 100644
--- a/helm/prometheus-rules/Chart.yaml
+++ b/helm/prometheus-rules/Chart.yaml
@@ -5,7 +5,7 @@ home: https://github.com/giantswarm/prometheus-rules
 icon: https://s.giantswarm.io/app-icons/1/png/default-app-light.png
 name: prometheus-rules
 appVersion: '0.1.0'
-version: '4.16.1'
+version: '4.17.0'
 annotations:
   application.giantswarm.io/team: "atlas"
   config.giantswarm.io/version: 1.x.x
diff --git a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/service-level.rules.yml b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/service-level.rules.yml
deleted file mode 100644
index 957e64d5..00000000
--- a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/service-level.rules.yml
+++ /dev/null
@@ -1,52 +0,0 @@
-apiVersion: monitoring.coreos.com/v1
-kind: PrometheusRule
-metadata:
-  creationTimestamp: null
-  labels:
-    {{- include "labels.common" . | nindent 4 }}
-  name: service-level.rules
-  namespace: {{ .Values.namespace }}
-spec:
-  groups:
-  - name: service-level
-    rules:
-    - alert: ServiceLevelBurnRateTooHigh
-      annotations:
-        description: '{{`Service level burn rate is too high for {{ $labels.service }} service.`}}'
-        opsrecipe: service-level-burn-rate-too-high/
-        dashboard: https://giantswarm.grafana.net/d/service-level/service-level?orgId=1
-      expr: |
-        label_replace(
-          (
-            slo_errors_per_request:ratio_rate1h{service!~".*external-dns.*|kong-.*|.*(ingress-nginx|nginx-ingress-controller).*", namespace!~"linkerd.*"}
-            > on (cluster_id, service) group_left ()
-            slo_threshold_high
-            and
-            slo_errors_per_request:ratio_rate5m{service!~".*external-dns.*|kong-.*|.*(ingress-nginx|nginx-ingress-controller).*", namespace!~"linkerd.*"}
-            > on (cluster_id, service) group_left ()
-            slo_threshold_high
-          )
-          or
-          (
-            slo_errors_per_request:ratio_rate6h{service!~".*external-dns.*|kong-.*|.*(ingress-nginx|nginx-ingress-controller).*", namespace!~"linkerd.*"}
-            > on (cluster_id, service) group_left ()
-            slo_threshold_low
-            and
-            slo_errors_per_request:ratio_rate30m{service!~".*external-dns.*|kong-.*|.*(ingress-nginx|nginx-ingress-controller).*", namespace!~"linkerd.*"}
-            > on (cluster_id, service) group_left ()
-            slo_threshold_low
-          ),
-          "team",
-          "$1",
-          "label_application_giantswarm_io_team",
-          "(.*)"
-        )
-      for: 5m
-      labels:
-        cancel_if_cluster_status_creating: "true"
-        cancel_if_cluster_status_deleting: "true"
-        cancel_if_cluster_status_updating: "true"
-        cancel_if_cluster_has_no_workers: "true"
-        cancel_if_outside_working_hours: {{ include "workingHoursOnly" . }}
-        severity: page
-        team: '{{`{{ $labels.team }}`}}'
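Note for reviewers: the alert removed above implemented the multiwindow, multi-burn-rate pattern from the Google SRE Workbook (https://sre.google/workbook/alerting-on-slos/#6-multiwindow-multi-burn-rate-alerts). It paged only when a long window (1h or 6h) and its short control window (5m or 30m) both exceeded the burn-rate threshold, so pages fired quickly during a real incident but also stopped quickly once the error rate recovered. A minimal PromQL sketch of the same pattern, assuming a 99% target (0.01 error budget) and hypothetical slo:error_ratio:rateXX recording rules standing in for the deleted slo_errors_per_request:ratio_rateXX series:

    (slo:error_ratio:rate1h > 36 * 0.01 and slo:error_ratio:rate5m > 36 * 0.01)
    or
    (slo:error_ratio:rate6h > 12 * 0.01 and slo:error_ratio:rate30m > 12 * 0.01)

The factors 36 and 12 match the slo_burnrate_high and slo_burnrate_low constants recorded by the file deleted below.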
diff --git a/helm/prometheus-rules/templates/platform/atlas/recording-rules/service-level.rules.yml b/helm/prometheus-rules/templates/platform/atlas/recording-rules/service-level.rules.yml
deleted file mode 100644
index 8f7b5950..00000000
--- a/helm/prometheus-rules/templates/platform/atlas/recording-rules/service-level.rules.yml
+++ /dev/null
@@ -1,241 +0,0 @@
-apiVersion: monitoring.coreos.com/v1
-kind: PrometheusRule
-metadata:
-  labels:
-    {{- include "labels.common" . | nindent 4 }}
-    giantswarm.io/remote-write-target: grafana-cloud
-  name: service-level.recording.rules
-  namespace: {{ .Values.namespace }}
-spec:
-  groups:
-  - name: service-level.recording
-    rules:
-    # -- kubelet whole cluster
-    - expr: kube_node_status_condition{condition="Ready"}
-      labels:
-        class: MEDIUM
-        area: kaas
-        service: kubelet
-        label_application_giantswarm_io_team: {{ include "providerTeam" . }}
-      record: raw_slo_requests
-    - expr: |
-        (
-          kube_node_status_condition{condition="Ready", status!="true"}
-          and
-          on (cluster_id, cluster_type, customer, installation, pipeline, provider, region, node) kube_node_spec_unschedulable == 0
-        )
-        and
-        on (cluster_id, cluster_type, customer, installation, pipeline, provider, region, node) time() - kube_node_created > 10 * 60
-      labels:
-        area: kaas
-        class: MEDIUM
-        service: kubelet
-        label_application_giantswarm_io_team: {{ include "providerTeam" . }}
-      record: raw_slo_errors
-    # -- 99% availability
-    - expr: "vector((1 - 0.99))"
-      labels:
-        area: kaas
-        service: kubelet
-        label_application_giantswarm_io_team: {{ include "providerTeam" . }}
-      record: slo_target
-
-    # kubelet - single nodepool
-    - expr: label_replace(kube_node_labels{nodepool=~".+"}, "service", "kubelet nodepool $1", "nodepool", "(.+)")
-      labels:
-        area: kaas
-        class: MEDIUM
-        label_application_giantswarm_io_team: {{ include "providerTeam" . }}
-      record: raw_slo_requests
-
-    - expr: |
-        (
-          label_replace((kube_node_status_condition{condition="Ready", status!="true"} * on (cluster_id, cluster_type, customer, installation, pipeline, provider, region, node) group_left(nodepool) kube_node_labels{nodepool=~".+"}), "service", "kubelet nodepool $1", "nodepool", "(.*)")
-          and
-          on (cluster_id, cluster_type, customer, installation, pipeline, provider, region, node) kube_node_spec_unschedulable == 0
-        )
-        and
-        on (cluster_id, cluster_type, customer, installation, pipeline, provider, region, node) time() - kube_node_created > 10 * 60
-      labels:
-        area: kaas
-        class: MEDIUM
-        label_application_giantswarm_io_team: {{ include "providerTeam" . }}
-      record: raw_slo_errors
-
-    - expr: |
-        label_replace(max by (cluster_id, cluster_type, customer, installation, pipeline, provider, region, nodepool) (kube_node_labels{nodepool=~".+"}), "service", "kubelet nodepool $1", "nodepool", "(.+)") * (1 - 0.99)
-      labels:
-        area: kaas
-        label_application_giantswarm_io_team: {{ include "providerTeam" . }}
-      record: slo_target
-
-    # -- node-exporter
-    # record of number of node-exporters.
-    - expr: count(up{job="node-exporter"}) by (cluster_id, cluster_type, customer, installation, pipeline, provider, region)
-      labels:
-        class: MEDIUM
-        area: kaas
-        service: node-exporter
-        label_application_giantswarm_io_team: {{ include "providerTeam" . }}
-      record: raw_slo_requests
-    # record of number of node-exporters that are down.
-    # up == 1 when node-exporters are up, and up == 0 when node-exporters are down
-    # to get a count of the number of node-exporters that are down,
-    # multiply by -1 to get -1 for node-exporters that are up, and 0 for node-exporters that are down,
-    # then add 1 to get 0 for node-exporters that are up, and 1 for node-exporters that are down,
-    # then sum.
-    - expr: sum((up{job='node-exporter'} * -1) + 1) by (cluster_id, cluster_type, customer, installation, pipeline, provider, region)
-      labels:
-        area: kaas
-        class: MEDIUM
-        service: node-exporter
-        label_application_giantswarm_io_team: {{ include "providerTeam" . }}
-      record: raw_slo_errors
-    # -- 99% availability
-    - expr: "vector((1 - 0.99))"
-      labels:
-        area: kaas
-        service: node-exporter
-        label_application_giantswarm_io_team: {{ include "providerTeam" . }}
-      record: slo_target
-
-    # -- managed-apps
-    # -- error recording rules
-    # record when pods of a daemonset with label "label_giantswarm_io_monitoring_basic_sli" are down
-    - expr: |
-        label_replace(
-          kube_daemonset_status_number_unavailable
-          * on (cluster_id, cluster_type, customer, installation, pipeline, provider, region, pod, daemonset, namespace) group_left (label_application_giantswarm_io_team)
-          kube_statefulset_labels{label_giantswarm_io_monitoring_basic_sli='true'},
-          "service",
-          "$1",
-          "daemonset",
-          "(.*)"
-        )
-      labels:
-        class: MEDIUM
-        area: managed-apps
-      record: raw_slo_errors
-    # record when pods of a deployment with label "label_giantswarm_io_monitoring_basic_sli" are down
-    - expr: |
-        label_replace(
-          kube_deployment_status_replicas_unavailable
-          * on (cluster_id, cluster_type, customer, installation, pipeline, provider, region, pod, deployment, namespace) group_left (label_application_giantswarm_io_team)
-          kube_deployment_labels{label_giantswarm_io_monitoring_basic_sli='true'},
-          "service",
-          "$1",
-          "deployment",
-          "(.*)"
-        )
-      labels:
-        class: MEDIUM
-        area: managed-apps
-      record: raw_slo_errors
-    # record when pods of a statefulset with label "label_giantswarm_io_monitoring_basic_sli" are down
-    - expr: |
-        label_replace(
-          kube_statefulset_status_replicas - kube_statefulset_status_replicas_current
-          * on (cluster_id, cluster_type, customer, installation, pipeline, provider, region, pod, statefulset, namespace) group_left (label_application_giantswarm_io_team)
-          kube_statefulset_labels{label_giantswarm_io_monitoring_basic_sli='true'},
-          "service",
-          "$1",
-          "statefulset",
-          "(.*)"
-        )
-      labels:
-        class: MEDIUM
-        area: managed-apps
-      record: raw_slo_errors
-    # -- present pods recording rules
-    - expr: |
-        kube_daemonset_labels * on (cluster_id, cluster_type, customer, installation, pipeline, provider, region, daemonset, namespace) group_right(label_application_giantswarm_io_team) (
-          label_replace(
-            kube_daemonset_status_desired_number_scheduled
-            and on (cluster_id, cluster_type, customer, installation, pipeline, provider, region, daemonset, namespace)
-            kube_daemonset_labels{label_giantswarm_io_monitoring_basic_sli='true'},
-          "service", "$1", "daemonset", "(.*)" )
-        )
-      labels:
-        class: MEDIUM
-        area: managed-apps
-      record: raw_slo_requests
-    - expr: |
-        kube_deployment_labels * on (cluster_id, cluster_type, customer, installation, pipeline, provider, region, deployment, namespace) group_right(label_application_giantswarm_io_team) (
-          label_replace(
-            kube_deployment_status_replicas
-            and on (cluster_id, cluster_type, customer, installation, pipeline, provider, region, deployment, namespace)
-            kube_deployment_labels{label_giantswarm_io_monitoring_basic_sli='true'},
-          "service", "$1", "deployment", "(.*)" )
-        )
-      labels:
-        class: MEDIUM
-        area: managed-apps
-      record: raw_slo_requests
-    - expr: |
-        kube_statefulset_labels * on (cluster_id, cluster_type, customer, installation, pipeline, provider, region, statefulset, namespace) group_right(label_application_giantswarm_io_team) (
-          label_replace(
-            kube_statefulset_status_replicas
-            and on (cluster_id, cluster_type, customer, installation, pipeline, provider, region, statefulset, namespace)
-            kube_statefulset_labels{label_giantswarm_io_monitoring_basic_sli='true'},
-          "service", "$1", "statefulset", "(.*)" )
-        )
-      labels:
-        class: MEDIUM
-        area: managed-apps
-      record: raw_slo_requests
-    # -- 99% availability
-    # -- this expression collects all the services from the area managed apps and assigns the same slo target to all of them
-    - expr: sum by (cluster_id, cluster_type, customer, installation, pipeline, provider, region, service, area, class) (raw_slo_errors{area="managed-apps"} - raw_slo_errors{area="managed-apps"}) + 1-0.99
-      record: slo_target
-
-    # core k8s components internal API requests
-    # record number of requests.
-    - expr: label_replace(sum(rest_client_requests_total{job=~"kube-controller-manager|kube-scheduler"}) by (cluster_id, cluster_type, customer, installation, pipeline, provider, region, job), "service", "$1", "job", "(.*)")
-      labels:
-        class: MEDIUM
-        area: kaas
-        label_application_giantswarm_io_team: {{ include "providerTeam" . }}
-      record: raw_slo_requests
-    # record number of errors.
-    - expr: label_replace(sum(rest_client_requests_total{job=~"kube-controller-manager|kube-scheduler", code=~"5..|"}) by (cluster_id, cluster_type, customer, installation, pipeline, provider, region, job), "service", "$1", "job", "(.*)")
-      labels:
-        area: kaas
-        class: MEDIUM
-        label_application_giantswarm_io_team: {{ include "providerTeam" . }}
-      record: raw_slo_errors
-    # -- 99% availability
-    - expr: label_replace(group(rest_client_requests_total{job=~"kube-controller-manager|kube-scheduler"}) by (cluster_id, cluster_type, customer, installation, pipeline, provider, region, job), "service", "$1", "job", "(.*)") * 0 + 1 - 0.99
-      labels:
-        area: kaas
-        label_application_giantswarm_io_team: {{ include "providerTeam" . }}
-      record: slo_target
-
-    # -- generic stuff
-    # -- standard burnrates based on https://sre.google/workbook/alerting-on-slos/#6-multiwindow-multi-burn-rate-alerts
-    - expr: "vector(36)"
-      record: slo_burnrate_high
-    - expr: "vector(12)"
-      record: slo_burnrate_low
-    - expr: sum(raw_slo_requests) by (service, cluster_id, cluster_type, customer, installation, pipeline, provider, region, area, label_application_giantswarm_io_team)
-      record: slo_requests
-    - expr: sum(raw_slo_errors) by (service, cluster_id, cluster_type, customer, installation, pipeline, provider, region, area, label_application_giantswarm_io_team)
-      record: slo_errors
-    - expr: sum(sum_over_time(raw_slo_errors[5m])) by (cluster_id, cluster_type, customer, installation, pipeline, provider, region, service, class, area, label_application_giantswarm_io_team) / sum(sum_over_time(raw_slo_requests[5m])) by (cluster_id, cluster_type, customer, installation, pipeline, provider, region, service, class, area, label_application_giantswarm_io_team)
-      record: slo_errors_per_request:ratio_rate5m
-    - expr: sum(sum_over_time(raw_slo_errors[30m])) by (cluster_id, cluster_type, customer, installation, pipeline, provider, region, service, class, area, label_application_giantswarm_io_team) / sum(sum_over_time(raw_slo_requests[30m])) by (cluster_id, cluster_type, customer, installation, pipeline, provider, region, service, class, area, label_application_giantswarm_io_team)
-      record: slo_errors_per_request:ratio_rate30m
-    - expr: sum(sum_over_time(raw_slo_errors[1h])) by (cluster_id, cluster_type, customer, installation, pipeline, provider, region, service, class, area, label_application_giantswarm_io_team) / sum(sum_over_time(raw_slo_requests[1h])) by (cluster_id, cluster_type, customer, installation, pipeline, provider, region, service, class, area, label_application_giantswarm_io_team)
-      record: slo_errors_per_request:ratio_rate1h
-    - expr: sum(sum_over_time(raw_slo_errors[6h])) by (cluster_id, cluster_type, customer, installation, pipeline, provider, region, service, class, area, label_application_giantswarm_io_team) / sum(sum_over_time(raw_slo_requests[6h])) by (cluster_id, cluster_type, customer, installation, pipeline, provider, region, service, class, area, label_application_giantswarm_io_team)
-      record: slo_errors_per_request:ratio_rate6h
-    - expr: sum(sum_over_time(raw_slo_errors[2h])) by (cluster_id, cluster_type, customer, installation, pipeline, provider, region, service, class, area, label_application_giantswarm_io_team) / sum(sum_over_time(raw_slo_requests[2h])) by (cluster_id, cluster_type, customer, installation, pipeline, provider, region, service, class, area, label_application_giantswarm_io_team)
      record: slo_errors_per_request:ratio_rate2h
-    - expr: sum(sum_over_time(raw_slo_errors[24h])) by (cluster_id, cluster_type, customer, installation, pipeline, provider, region, service, class, area, label_application_giantswarm_io_team) / sum(sum_over_time(raw_slo_requests[24h])) by (cluster_id, cluster_type, customer, installation, pipeline, provider, region, service, class, area, label_application_giantswarm_io_team)
-      record: slo_errors_per_request:ratio_rate24h
-    - expr: sum(sum_over_time(raw_slo_errors[3d])) by (cluster_id, cluster_type, customer, installation, pipeline, provider, region, service, class, area, label_application_giantswarm_io_team) / sum(sum_over_time(raw_slo_requests[3d])) by (cluster_id, cluster_type, customer, installation, pipeline, provider, region, service, class, area, label_application_giantswarm_io_team)
-      record: slo_errors_per_request:ratio_rate3d
-    # -- We use the `min` function here because we have `slo_burnrate_high` for each installation and cluster_id, even though the metric value is always the same.
-    - expr: slo_target*scalar(min(slo_burnrate_high) by (cluster_id, cluster_type, customer, installation, pipeline, provider, region, service, class, area, label_application_giantswarm_io_team))
-      record: slo_threshold_high
-    - expr: slo_target*scalar(min(slo_burnrate_low) by (cluster_id, cluster_type, customer, installation, pipeline, provider, region, service, class, area, label_application_giantswarm_io_team))
-      record: slo_threshold_low
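Note for reviewers: the paging thresholds in the file deleted above were derived rather than hand-set, via slo_threshold_high = slo_target * slo_burnrate_high and slo_threshold_low = slo_target * slo_burnrate_low. With the 99% targets used throughout the file, that works out to:

    slo_threshold_high = (1 - 0.99) * 36 = 0.36    # 36x burn: a 30-day error budget exhausted in 20h
    slo_threshold_low  = (1 - 0.99) * 12 = 0.12    # 12x burn: a 30-day error budget exhausted in 2.5d

So the deleted alert paged on a 36x budget burn sustained over 1h/5m, or a 12x burn over 6h/30m.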
diff --git a/helm/prometheus-rules/templates/platform/honeybadger/alerting-rules/app.rules.yml b/helm/prometheus-rules/templates/platform/honeybadger/alerting-rules/app.rules.yml
index d8d1241c..33c535c1 100644
--- a/helm/prometheus-rules/templates/platform/honeybadger/alerting-rules/app.rules.yml
+++ b/helm/prometheus-rules/templates/platform/honeybadger/alerting-rules/app.rules.yml
@@ -149,7 +149,7 @@ spec:
       annotations:
         description: '{{`App-exporter ({{ $labels.instance }}) is down.`}}'
         opsrecipe: app-exporter-down/
-      expr: (up{app="app-exporter"} == 0) and (up{service="app-exporter"} == 0)
+      expr: up{job="app-exporter"} == 0
       for: 15m
       labels:
         area: platform
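Note on the hunk above: in PromQL, `a and b` keeps only samples from `a` whose entire label set also occurs in `b`. The old expression therefore matched only `up` series carrying both an `app` and a `service` label equal to "app-exporter"; a target exposing just one of the two labels left one side of the `and` empty, so the alert could never fire. Matching on `job`, a label Prometheus attaches to every scraped target by default, removes that failure mode:

    up{job="app-exporter"} == 0    # one result per down target, regardless of app/service labels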