From 4edb344a750be1743d69257e178827e627d982a4 Mon Sep 17 00:00:00 2001
From: QuentinBisson
Date: Wed, 4 Sep 2024 16:15:53 +0200
Subject: [PATCH] Send sloth slos to grafana cloud

---
 CHANGELOG.md                                  |   4 +
 .../recording-rules/grafana-cloud.rules.yml   | 209 ++++++++++++++++++
 2 files changed, 213 insertions(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 8bc379b3..797f9847 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -7,6 +7,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## [Unreleased]
 
+### Added
+
+- Add aggregations for slo metrics to export them to grafana cloud
+
 ## [4.13.1] - 2024-09-03
 
 ### Fixed
diff --git a/helm/prometheus-rules/templates/platform/atlas/recording-rules/grafana-cloud.rules.yml b/helm/prometheus-rules/templates/platform/atlas/recording-rules/grafana-cloud.rules.yml
index 9dc23629..add98a63 100644
--- a/helm/prometheus-rules/templates/platform/atlas/recording-rules/grafana-cloud.rules.yml
+++ b/helm/prometheus-rules/templates/platform/atlas/recording-rules/grafana-cloud.rules.yml
@@ -547,3 +547,212 @@ spec:
     - expr: sum(capi_crd_info{resource_name=~".*infrastructure.cluster.x-k8s.io.*"}) by (cluster_id, cluster_type, customer, installation, pipeline, provider, version)
       record: aggregation:capi_infrastructure_crd_versions
   {{- end }}
+  - name: slos.grafana-cloud.recording
+    rules:
+    # Each rule copies sloth_id -> slo and sloth_service -> service before aggregating. slo:sli_error:ratio_rate30d is deliberately not sent to Grafana Cloud as it's not useful for the SLOs dashboard.
+    - expr: sum(label_replace(label_replace(slo:current_burn_rate:ratio, "slo", "$1", "sloth_id", "(.*)"), "service", "$1", "sloth_service", "(.*)")) by (cluster_id, cluster_type, customer, installation, pipeline, provider, region, slo, service)
+      record: aggregation:slo:current_burn_rate:ratio
+    - expr: |-
+        sum(
+          label_replace(
+            label_replace(
+              slo:error_budget:ratio,
+              "slo",
+              "$1",
+              "sloth_id",
+              "(.*)"
+            ),
+            "service",
+            "$1",
+            "sloth_service",
+            "(.*)"
+          )
+        ) by (cluster_id, cluster_type, customer, installation, pipeline, provider, region, slo, service)
+      record: aggregation:slo:error_budget:ratio
+    - expr: |-
+        sum(
+          label_replace(
+            label_replace(
+              slo:objective:ratio,
+              "slo",
+              "$1",
+              "sloth_id",
+              "(.*)"
+            ),
+            "service",
+            "$1",
+            "sloth_service",
+            "(.*)"
+          )
+        ) by (cluster_id, cluster_type, customer, installation, pipeline, provider, region, slo, service)
+      record: aggregation:slo:objective:ratio
+    - expr: |-
+        sum(
+          label_replace(
+            label_replace(
+              slo:period_burn_rate:ratio,
+              "slo",
+              "$1",
+              "sloth_id",
+              "(.*)"
+            ),
+            "service",
+            "$1",
+            "sloth_service",
+            "(.*)"
+          )
+        ) by (cluster_id, cluster_type, customer, installation, pipeline, provider, region, slo, service)
+      record: aggregation:slo:period_burn_rate:ratio
+    - expr: |-
+        sum(
+          label_replace(
+            label_replace(
+              slo:period_error_budget_remaining:ratio,
+              "slo",
+              "$1",
+              "sloth_id",
+              "(.*)"
+            ),
+            "service",
+            "$1",
+            "sloth_service",
+            "(.*)"
+          )
+        ) by (cluster_id, cluster_type, customer, installation, pipeline, provider, region, slo, service)
+      record: aggregation:slo:period_error_budget_remaining:ratio
+    - expr: |-
+        sum(
+          label_replace(
+            label_replace(
+              slo:sli_error:ratio_rate1d,
+              "slo",
+              "$1",
+              "sloth_id",
+              "(.*)"
+            ),
+            "service",
+            "$1",
+            "sloth_service",
+            "(.*)"
+          )
+        ) by (cluster_id, cluster_type, customer, installation, pipeline, provider, region, slo, service)
+      record: aggregation:slo:sli_error:ratio_rate1d
+    - expr: |-
+        sum(
+          label_replace(
+            label_replace(
+              slo:sli_error:ratio_rate1h,
+              "slo",
+              "$1",
+              "sloth_id",
+              "(.*)"
+            ),
+            "service",
+            "$1",
+            "sloth_service",
+            "(.*)"
+          )
+        ) by (cluster_id, cluster_type, customer, installation, pipeline, provider, region, slo, service)
+      record: aggregation:slo:sli_error:ratio_rate1h
+    - expr: |-
+        sum(
+          label_replace(
+            label_replace(
+              slo:sli_error:ratio_rate2h,
+              "slo",
+              "$1",
+              "sloth_id",
+              "(.*)"
+            ),
+            "service",
+            "$1",
+            "sloth_service",
+            "(.*)"
+          )
+        ) by (cluster_id, cluster_type, customer, installation, pipeline, provider, region, slo, service)
+      record: aggregation:slo:sli_error:ratio_rate2h
+    - expr: |-
+        sum(
+          label_replace(
+            label_replace(
+              slo:sli_error:ratio_rate30m,
+              "slo",
+              "$1",
+              "sloth_id",
+              "(.*)"
+            ),
+            "service",
+            "$1",
+            "sloth_service",
+            "(.*)"
+          )
+        ) by (cluster_id, cluster_type, customer, installation, pipeline, provider, region, slo, service)
+      record: aggregation:slo:sli_error:ratio_rate30m
+    - expr: |-
+        sum(
+          label_replace(
+            label_replace(
+              slo:sli_error:ratio_rate3d,
+              "slo",
+              "$1",
+              "sloth_id",
+              "(.*)"
+            ),
+            "service",
+            "$1",
+            "sloth_service",
+            "(.*)"
+          )
+        ) by (cluster_id, cluster_type, customer, installation, pipeline, provider, region, slo, service)
+      record: aggregation:slo:sli_error:ratio_rate3d
+    - expr: |-
+        sum(
+          label_replace(
+            label_replace(
+              slo:sli_error:ratio_rate5m,
+              "slo",
+              "$1",
+              "sloth_id",
+              "(.*)"
+            ),
+            "service",
+            "$1",
+            "sloth_service",
+            "(.*)"
+          )
+        ) by (cluster_id, cluster_type, customer, installation, pipeline, provider, region, slo, service)
+      record: aggregation:slo:sli_error:ratio_rate5m
+    - expr: |-
+        sum(
+          label_replace(
+            label_replace(
+              slo:sli_error:ratio_rate6h,
+              "slo",
+              "$1",
+              "sloth_id",
+              "(.*)"
+            ),
+            "service",
+            "$1",
+            "sloth_service",
+            "(.*)"
+          )
+        ) by (cluster_id, cluster_type, customer, installation, pipeline, provider, region, slo, service)
+      record: aggregation:slo:sli_error:ratio_rate6h
+    - expr: |-
+        sum(
+          label_replace(
+            label_replace(
+              slo:time_period:days,
+              "slo",
+              "$1",
+              "sloth_id",
+              "(.*)"
+            ),
+            "service",
+            "$1",
+            "sloth_service",
+            "(.*)"
+          )
+        ) by (cluster_id, cluster_type, customer, installation, pipeline, provider, region, slo, service)
+      record: aggregation:slo:time_period:days