diff --git a/CHANGELOG.md b/CHANGELOG.md index 448f709c4..a3ffca4b6 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,7 +9,6 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Added -- Add mimir and loki recording rules from upstream mixins. - Add recording rules to show prometheus scraping job memory usage. - Add `cluster_control_plane_unhealthy` inhibition. - Add inhibitions expressions for CAPI clusters. @@ -24,12 +23,12 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Fixed -- Fix shield alerts labels for Mimir. +- Fix shield alert labels for Mimir. - Fix cabbage alert labels for Mimir. - Fix honeybadger alert labels for Mimir. - Fix cert-manager alert labels for Mimir. - Fix operatorkit alert labels for Mimir. -- Fix Loki/Mimir and Tempo mixins according to `pint` recommendations +- Fix all mixins according to `pint` recommendations. - Fix etcd alert labels for Mimir. - Fix apiserver alert labels for Mimir. diff --git a/helm/prometheus-rules/templates/recording-rules/kubernetes-mixins.rules.yml b/helm/prometheus-rules/templates/recording-rules/kubernetes-mixins.rules.yml index 374e13d70..30048002c 100644 --- a/helm/prometheus-rules/templates/recording-rules/kubernetes-mixins.rules.yml +++ b/helm/prometheus-rules/templates/recording-rules/kubernetes-mixins.rules.yml @@ -13,26 +13,26 @@ spec: ( ( # too slow - sum by (cluster_id) (rate(apiserver_request_duration_seconds_count{app="kubernetes",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward"}[1d])) + sum by (cluster_id, installation, pipeline, provider) (rate(apiserver_request_duration_seconds_count{app="kubernetes",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward"}[1d])) - ( ( - sum by (cluster_id) (rate(apiserver_request_duration_seconds_bucket{app="kubernetes",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope=~"resource|",le="1"}[1d])) + sum by (cluster_id, installation, pipeline, provider) (rate(apiserver_request_duration_seconds_bucket{app="kubernetes",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope=~"resource|",le="1"}[1d])) or vector(0) ) + - sum by (cluster_id) (rate(apiserver_request_duration_seconds_bucket{app="kubernetes",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope="namespace",le="5"}[1d])) + sum by (cluster_id, installation, pipeline, provider) (rate(apiserver_request_duration_seconds_bucket{app="kubernetes",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope="namespace",le="5"}[1d])) + - sum by (cluster_id) (rate(apiserver_request_duration_seconds_bucket{app="kubernetes",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope="cluster",le="30"}[1d])) + sum by (cluster_id, installation, pipeline, provider) (rate(apiserver_request_duration_seconds_bucket{app="kubernetes",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope="cluster",le="30"}[1d])) ) ) + # errors - sum by (cluster_id) (rate(apiserver_request_total{app="kubernetes",verb=~"LIST|GET",code=~"5.."}[1d])) + sum by (cluster_id, installation, pipeline, provider) (rate(apiserver_request_total{app="kubernetes",verb=~"LIST|GET",code=~"5.."}[1d])) ) / - sum by (cluster_id) (rate(apiserver_request_total{app="kubernetes",verb=~"LIST|GET"}[1d])) + sum by (cluster_id, installation, pipeline, provider) (rate(apiserver_request_total{app="kubernetes",verb=~"LIST|GET"}[1d])) labels: verb: read record: apiserver_request:burnrate1d @@ -40,26 +40,26 @@ spec: ( ( # too slow - sum by (cluster_id) (rate(apiserver_request_duration_seconds_count{app="kubernetes",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward"}[1h])) + sum by (cluster_id, installation, pipeline, provider) (rate(apiserver_request_duration_seconds_count{app="kubernetes",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward"}[1h])) - ( ( - sum by (cluster_id) (rate(apiserver_request_duration_seconds_bucket{app="kubernetes",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope=~"resource|",le="1"}[1h])) + sum by (cluster_id, installation, pipeline, provider) (rate(apiserver_request_duration_seconds_bucket{app="kubernetes",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope=~"resource|",le="1"}[1h])) or vector(0) ) + - sum by (cluster_id) (rate(apiserver_request_duration_seconds_bucket{app="kubernetes",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope="namespace",le="5"}[1h])) + sum by (cluster_id, installation, pipeline, provider) (rate(apiserver_request_duration_seconds_bucket{app="kubernetes",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope="namespace",le="5"}[1h])) + - sum by (cluster_id) (rate(apiserver_request_duration_seconds_bucket{app="kubernetes",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope="cluster",le="30"}[1h])) + sum by (cluster_id, installation, pipeline, provider) (rate(apiserver_request_duration_seconds_bucket{app="kubernetes",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope="cluster",le="30"}[1h])) ) ) + # errors - sum by (cluster_id) (rate(apiserver_request_total{app="kubernetes",verb=~"LIST|GET",code=~"5.."}[1h])) + sum by (cluster_id, installation, pipeline, provider) (rate(apiserver_request_total{app="kubernetes",verb=~"LIST|GET",code=~"5.."}[1h])) ) / - sum by (cluster_id) (rate(apiserver_request_total{app="kubernetes",verb=~"LIST|GET"}[1h])) + sum by (cluster_id, installation, pipeline, provider) (rate(apiserver_request_total{app="kubernetes",verb=~"LIST|GET"}[1h])) labels: verb: read record: apiserver_request:burnrate1h @@ -67,26 +67,26 @@ spec: ( ( # too slow - sum by (cluster_id) (rate(apiserver_request_duration_seconds_count{app="kubernetes",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward"}[2h])) + sum by (cluster_id, installation, pipeline, provider) (rate(apiserver_request_duration_seconds_count{app="kubernetes",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward"}[2h])) - ( ( - sum by (cluster_id) (rate(apiserver_request_duration_seconds_bucket{app="kubernetes",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope=~"resource|",le="1"}[2h])) + sum by (cluster_id, installation, pipeline, provider) (rate(apiserver_request_duration_seconds_bucket{app="kubernetes",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope=~"resource|",le="1"}[2h])) or vector(0) ) + - sum by (cluster_id) (rate(apiserver_request_duration_seconds_bucket{app="kubernetes",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope="namespace",le="5"}[2h])) + sum by (cluster_id, installation, pipeline, provider) (rate(apiserver_request_duration_seconds_bucket{app="kubernetes",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope="namespace",le="5"}[2h])) + - sum by (cluster_id) (rate(apiserver_request_duration_seconds_bucket{app="kubernetes",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope="cluster",le="30"}[2h])) + sum by (cluster_id, installation, pipeline, provider) (rate(apiserver_request_duration_seconds_bucket{app="kubernetes",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope="cluster",le="30"}[2h])) ) ) + # errors - sum by (cluster_id) (rate(apiserver_request_total{app="kubernetes",verb=~"LIST|GET",code=~"5.."}[2h])) + sum by (cluster_id, installation, pipeline, provider) (rate(apiserver_request_total{app="kubernetes",verb=~"LIST|GET",code=~"5.."}[2h])) ) / - sum by (cluster_id) (rate(apiserver_request_total{app="kubernetes",verb=~"LIST|GET"}[2h])) + sum by (cluster_id, installation, pipeline, provider) (rate(apiserver_request_total{app="kubernetes",verb=~"LIST|GET"}[2h])) labels: verb: read record: apiserver_request:burnrate2h @@ -94,26 +94,26 @@ spec: ( ( # too slow - sum by (cluster_id) (rate(apiserver_request_duration_seconds_count{app="kubernetes",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward"}[30m])) + sum by (cluster_id, installation, pipeline, provider) (rate(apiserver_request_duration_seconds_count{app="kubernetes",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward"}[30m])) - ( ( - sum by (cluster_id) (rate(apiserver_request_duration_seconds_bucket{app="kubernetes",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope=~"resource|",le="1"}[30m])) + sum by (cluster_id, installation, pipeline, provider) (rate(apiserver_request_duration_seconds_bucket{app="kubernetes",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope=~"resource|",le="1"}[30m])) or vector(0) ) + - sum by (cluster_id) (rate(apiserver_request_duration_seconds_bucket{app="kubernetes",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope="namespace",le="5"}[30m])) + sum by (cluster_id, installation, pipeline, provider) (rate(apiserver_request_duration_seconds_bucket{app="kubernetes",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope="namespace",le="5"}[30m])) + - sum by (cluster_id) (rate(apiserver_request_duration_seconds_bucket{app="kubernetes",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope="cluster",le="30"}[30m])) + sum by (cluster_id, installation, pipeline, provider) (rate(apiserver_request_duration_seconds_bucket{app="kubernetes",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope="cluster",le="30"}[30m])) ) ) + # errors - sum by (cluster_id) (rate(apiserver_request_total{app="kubernetes",verb=~"LIST|GET",code=~"5.."}[30m])) + sum by (cluster_id, installation, pipeline, provider) (rate(apiserver_request_total{app="kubernetes",verb=~"LIST|GET",code=~"5.."}[30m])) ) / - sum by (cluster_id) (rate(apiserver_request_total{app="kubernetes",verb=~"LIST|GET"}[30m])) + sum by (cluster_id, installation, pipeline, provider) (rate(apiserver_request_total{app="kubernetes",verb=~"LIST|GET"}[30m])) labels: verb: read record: apiserver_request:burnrate30m @@ -121,26 +121,26 @@ spec: ( ( # too slow - sum by (cluster_id) (rate(apiserver_request_duration_seconds_count{app="kubernetes",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward"}[3d])) + sum by (cluster_id, installation, pipeline, provider) (rate(apiserver_request_duration_seconds_count{app="kubernetes",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward"}[3d])) - ( ( - sum by (cluster_id) (rate(apiserver_request_duration_seconds_bucket{app="kubernetes",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope=~"resource|",le="1"}[3d])) + sum by (cluster_id, installation, pipeline, provider) (rate(apiserver_request_duration_seconds_bucket{app="kubernetes",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope=~"resource|",le="1"}[3d])) or vector(0) ) + - sum by (cluster_id) (rate(apiserver_request_duration_seconds_bucket{app="kubernetes",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope="namespace",le="5"}[3d])) + sum by (cluster_id, installation, pipeline, provider) (rate(apiserver_request_duration_seconds_bucket{app="kubernetes",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope="namespace",le="5"}[3d])) + - sum by (cluster_id) (rate(apiserver_request_duration_seconds_bucket{app="kubernetes",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope="cluster",le="30"}[3d])) + sum by (cluster_id, installation, pipeline, provider) (rate(apiserver_request_duration_seconds_bucket{app="kubernetes",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope="cluster",le="30"}[3d])) ) ) + # errors - sum by (cluster_id) (rate(apiserver_request_total{app="kubernetes",verb=~"LIST|GET",code=~"5.."}[3d])) + sum by (cluster_id, installation, pipeline, provider) (rate(apiserver_request_total{app="kubernetes",verb=~"LIST|GET",code=~"5.."}[3d])) ) / - sum by (cluster_id) (rate(apiserver_request_total{app="kubernetes",verb=~"LIST|GET"}[3d])) + sum by (cluster_id, installation, pipeline, provider) (rate(apiserver_request_total{app="kubernetes",verb=~"LIST|GET"}[3d])) labels: verb: read record: apiserver_request:burnrate3d @@ -148,26 +148,26 @@ spec: ( ( # too slow - sum by (cluster_id) (rate(apiserver_request_duration_seconds_count{app="kubernetes",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward"}[5m])) + sum by (cluster_id, installation, pipeline, provider) (rate(apiserver_request_duration_seconds_count{app="kubernetes",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward"}[5m])) - ( ( - sum by (cluster_id) (rate(apiserver_request_duration_seconds_bucket{app="kubernetes",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope=~"resource|",le="1"}[5m])) + sum by (cluster_id, installation, pipeline, provider) (rate(apiserver_request_duration_seconds_bucket{app="kubernetes",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope=~"resource|",le="1"}[5m])) or vector(0) ) + - sum by (cluster_id) (rate(apiserver_request_duration_seconds_bucket{app="kubernetes",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope="namespace",le="5"}[5m])) + sum by (cluster_id, installation, pipeline, provider) (rate(apiserver_request_duration_seconds_bucket{app="kubernetes",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope="namespace",le="5"}[5m])) + - sum by (cluster_id) (rate(apiserver_request_duration_seconds_bucket{app="kubernetes",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope="cluster",le="30"}[5m])) + sum by (cluster_id, installation, pipeline, provider) (rate(apiserver_request_duration_seconds_bucket{app="kubernetes",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope="cluster",le="30"}[5m])) ) ) + # errors - sum by (cluster_id) (rate(apiserver_request_total{app="kubernetes",verb=~"LIST|GET",code=~"5.."}[5m])) + sum by (cluster_id, installation, pipeline, provider) (rate(apiserver_request_total{app="kubernetes",verb=~"LIST|GET",code=~"5.."}[5m])) ) / - sum by (cluster_id) (rate(apiserver_request_total{app="kubernetes",verb=~"LIST|GET"}[5m])) + sum by (cluster_id, installation, pipeline, provider) (rate(apiserver_request_total{app="kubernetes",verb=~"LIST|GET"}[5m])) labels: verb: read record: apiserver_request:burnrate5m @@ -175,26 +175,26 @@ spec: ( ( # too slow - sum by (cluster_id) (rate(apiserver_request_duration_seconds_count{app="kubernetes",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward"}[6h])) + sum by (cluster_id, installation, pipeline, provider) (rate(apiserver_request_duration_seconds_count{app="kubernetes",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward"}[6h])) - ( ( - sum by (cluster_id) (rate(apiserver_request_duration_seconds_bucket{app="kubernetes",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope=~"resource|",le="1"}[6h])) + sum by (cluster_id, installation, pipeline, provider) (rate(apiserver_request_duration_seconds_bucket{app="kubernetes",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope=~"resource|",le="1"}[6h])) or vector(0) ) + - sum by (cluster_id) (rate(apiserver_request_duration_seconds_bucket{app="kubernetes",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope="namespace",le="5"}[6h])) + sum by (cluster_id, installation, pipeline, provider) (rate(apiserver_request_duration_seconds_bucket{app="kubernetes",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope="namespace",le="5"}[6h])) + - sum by (cluster_id) (rate(apiserver_request_duration_seconds_bucket{app="kubernetes",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope="cluster",le="30"}[6h])) + sum by (cluster_id, installation, pipeline, provider) (rate(apiserver_request_duration_seconds_bucket{app="kubernetes",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope="cluster",le="30"}[6h])) ) ) + # errors - sum by (cluster_id) (rate(apiserver_request_total{app="kubernetes",verb=~"LIST|GET",code=~"5.."}[6h])) + sum by (cluster_id, installation, pipeline, provider) (rate(apiserver_request_total{app="kubernetes",verb=~"LIST|GET",code=~"5.."}[6h])) ) / - sum by (cluster_id) (rate(apiserver_request_total{app="kubernetes",verb=~"LIST|GET"}[6h])) + sum by (cluster_id, installation, pipeline, provider) (rate(apiserver_request_total{app="kubernetes",verb=~"LIST|GET"}[6h])) labels: verb: read record: apiserver_request:burnrate6h @@ -202,15 +202,15 @@ spec: ( ( # too slow - sum by (cluster_id) (rate(apiserver_request_duration_seconds_count{app="kubernetes",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward"}[1d])) + sum by (cluster_id, installation, pipeline, provider) (rate(apiserver_request_duration_seconds_count{app="kubernetes",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward"}[1d])) - - sum by (cluster_id) (rate(apiserver_request_duration_seconds_bucket{app="kubernetes",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward",le="1"}[1d])) + sum by (cluster_id, installation, pipeline, provider) (rate(apiserver_request_duration_seconds_bucket{app="kubernetes",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward",le="1"}[1d])) ) + - sum by (cluster_id) (rate(apiserver_request_total{app="kubernetes",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[1d])) + sum by (cluster_id, installation, pipeline, provider) (rate(apiserver_request_total{app="kubernetes",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[1d])) ) / - sum by (cluster_id) (rate(apiserver_request_total{app="kubernetes",verb=~"POST|PUT|PATCH|DELETE"}[1d])) + sum by (cluster_id, installation, pipeline, provider) (rate(apiserver_request_total{app="kubernetes",verb=~"POST|PUT|PATCH|DELETE"}[1d])) labels: verb: write record: apiserver_request:burnrate1d @@ -218,15 +218,15 @@ spec: ( ( # too slow - sum by (cluster_id) (rate(apiserver_request_duration_seconds_count{app="kubernetes",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward"}[1h])) + sum by (cluster_id, installation, pipeline, provider) (rate(apiserver_request_duration_seconds_count{app="kubernetes",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward"}[1h])) - - sum by (cluster_id) (rate(apiserver_request_duration_seconds_bucket{app="kubernetes",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward",le="1"}[1h])) + sum by (cluster_id, installation, pipeline, provider) (rate(apiserver_request_duration_seconds_bucket{app="kubernetes",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward",le="1"}[1h])) ) + - sum by (cluster_id) (rate(apiserver_request_total{app="kubernetes",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[1h])) + sum by (cluster_id, installation, pipeline, provider) (rate(apiserver_request_total{app="kubernetes",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[1h])) ) / - sum by (cluster_id) (rate(apiserver_request_total{app="kubernetes",verb=~"POST|PUT|PATCH|DELETE"}[1h])) + sum by (cluster_id, installation, pipeline, provider) (rate(apiserver_request_total{app="kubernetes",verb=~"POST|PUT|PATCH|DELETE"}[1h])) labels: verb: write record: apiserver_request:burnrate1h @@ -234,15 +234,15 @@ spec: ( ( # too slow - sum by (cluster_id) (rate(apiserver_request_duration_seconds_count{app="kubernetes",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward"}[2h])) + sum by (cluster_id, installation, pipeline, provider) (rate(apiserver_request_duration_seconds_count{app="kubernetes",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward"}[2h])) - - sum by (cluster_id) (rate(apiserver_request_duration_seconds_bucket{app="kubernetes",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward",le="1"}[2h])) + sum by (cluster_id, installation, pipeline, provider) (rate(apiserver_request_duration_seconds_bucket{app="kubernetes",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward",le="1"}[2h])) ) + - sum by (cluster_id) (rate(apiserver_request_total{app="kubernetes",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[2h])) + sum by (cluster_id, installation, pipeline, provider) (rate(apiserver_request_total{app="kubernetes",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[2h])) ) / - sum by (cluster_id) (rate(apiserver_request_total{app="kubernetes",verb=~"POST|PUT|PATCH|DELETE"}[2h])) + sum by (cluster_id, installation, pipeline, provider) (rate(apiserver_request_total{app="kubernetes",verb=~"POST|PUT|PATCH|DELETE"}[2h])) labels: verb: write record: apiserver_request:burnrate2h @@ -250,15 +250,15 @@ spec: ( ( # too slow - sum by (cluster_id) (rate(apiserver_request_duration_seconds_count{app="kubernetes",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward"}[30m])) + sum by (cluster_id, installation, pipeline, provider) (rate(apiserver_request_duration_seconds_count{app="kubernetes",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward"}[30m])) - - sum by (cluster_id) (rate(apiserver_request_duration_seconds_bucket{app="kubernetes",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward",le="1"}[30m])) + sum by (cluster_id, installation, pipeline, provider) (rate(apiserver_request_duration_seconds_bucket{app="kubernetes",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward",le="1"}[30m])) ) + - sum by (cluster_id) (rate(apiserver_request_total{app="kubernetes",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[30m])) + sum by (cluster_id, installation, pipeline, provider) (rate(apiserver_request_total{app="kubernetes",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[30m])) ) / - sum by (cluster_id) (rate(apiserver_request_total{app="kubernetes",verb=~"POST|PUT|PATCH|DELETE"}[30m])) + sum by (cluster_id, installation, pipeline, provider) (rate(apiserver_request_total{app="kubernetes",verb=~"POST|PUT|PATCH|DELETE"}[30m])) labels: verb: write record: apiserver_request:burnrate30m @@ -266,15 +266,15 @@ spec: ( ( # too slow - sum by (cluster_id) (rate(apiserver_request_duration_seconds_count{app="kubernetes",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward"}[3d])) + sum by (cluster_id, installation, pipeline, provider) (rate(apiserver_request_duration_seconds_count{app="kubernetes",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward"}[3d])) - - sum by (cluster_id) (rate(apiserver_request_duration_seconds_bucket{app="kubernetes",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward",le="1"}[3d])) + sum by (cluster_id, installation, pipeline, provider) (rate(apiserver_request_duration_seconds_bucket{app="kubernetes",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward",le="1"}[3d])) ) + - sum by (cluster_id) (rate(apiserver_request_total{app="kubernetes",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[3d])) + sum by (cluster_id, installation, pipeline, provider) (rate(apiserver_request_total{app="kubernetes",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[3d])) ) / - sum by (cluster_id) (rate(apiserver_request_total{app="kubernetes",verb=~"POST|PUT|PATCH|DELETE"}[3d])) + sum by (cluster_id, installation, pipeline, provider) (rate(apiserver_request_total{app="kubernetes",verb=~"POST|PUT|PATCH|DELETE"}[3d])) labels: verb: write record: apiserver_request:burnrate3d @@ -282,15 +282,15 @@ spec: ( ( # too slow - sum by (cluster_id) (rate(apiserver_request_duration_seconds_count{app="kubernetes",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward"}[5m])) + sum by (cluster_id, installation, pipeline, provider) (rate(apiserver_request_duration_seconds_count{app="kubernetes",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward"}[5m])) - - sum by (cluster_id) (rate(apiserver_request_duration_seconds_bucket{app="kubernetes",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward",le="1"}[5m])) + sum by (cluster_id, installation, pipeline, provider) (rate(apiserver_request_duration_seconds_bucket{app="kubernetes",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward",le="1"}[5m])) ) + - sum by (cluster_id) (rate(apiserver_request_total{app="kubernetes",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[5m])) + sum by (cluster_id, installation, pipeline, provider) (rate(apiserver_request_total{app="kubernetes",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[5m])) ) / - sum by (cluster_id) (rate(apiserver_request_total{app="kubernetes",verb=~"POST|PUT|PATCH|DELETE"}[5m])) + sum by (cluster_id, installation, pipeline, provider) (rate(apiserver_request_total{app="kubernetes",verb=~"POST|PUT|PATCH|DELETE"}[5m])) labels: verb: write record: apiserver_request:burnrate5m @@ -298,28 +298,28 @@ spec: ( ( # too slow - sum by (cluster_id) (rate(apiserver_request_duration_seconds_count{app="kubernetes",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward"}[6h])) + sum by (cluster_id, installation, pipeline, provider) (rate(apiserver_request_duration_seconds_count{app="kubernetes",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward"}[6h])) - - sum by (cluster_id) (rate(apiserver_request_duration_seconds_bucket{app="kubernetes",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward",le="1"}[6h])) + sum by (cluster_id, installation, pipeline, provider) (rate(apiserver_request_duration_seconds_bucket{app="kubernetes",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward",le="1"}[6h])) ) + - sum by (cluster_id) (rate(apiserver_request_total{app="kubernetes",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[6h])) + sum by (cluster_id, installation, pipeline, provider) (rate(apiserver_request_total{app="kubernetes",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[6h])) ) / - sum by (cluster_id) (rate(apiserver_request_total{app="kubernetes",verb=~"POST|PUT|PATCH|DELETE"}[6h])) + sum by (cluster_id, installation, pipeline, provider) (rate(apiserver_request_total{app="kubernetes",verb=~"POST|PUT|PATCH|DELETE"}[6h])) labels: verb: write record: apiserver_request:burnrate6h - name: kube-apiserver-histogram.rules rules: - expr: | - histogram_quantile(0.99, sum by (cluster_id, le, resource) (rate(apiserver_request_duration_seconds_bucket{app="kubernetes",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward"}[5m]))) > 0 + histogram_quantile(0.99, sum by (cluster_id, installation, pipeline, provider, le, resource) (rate(apiserver_request_duration_seconds_bucket{app="kubernetes",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward"}[5m]))) > 0 labels: quantile: "0.99" verb: read record: cluster_quantile:apiserver_request_duration_seconds:histogram_quantile - expr: | - histogram_quantile(0.99, sum by (cluster_id, le, resource) (rate(apiserver_request_duration_seconds_bucket{app="kubernetes",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward"}[5m]))) > 0 + histogram_quantile(0.99, sum by (cluster_id, installation, pipeline, provider, le, resource) (rate(apiserver_request_duration_seconds_bucket{app="kubernetes",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward"}[5m]))) > 0 labels: quantile: "0.99" verb: write @@ -331,81 +331,81 @@ spec: avg_over_time(code_verb:apiserver_request_total:increase1h[30d]) * 24 * 30 record: code_verb:apiserver_request_total:increase30d - expr: | - sum by (cluster_id, code) (code_verb:apiserver_request_total:increase30d{verb=~"LIST|GET"}) + sum by (cluster_id, installation, pipeline, provider, code) (code_verb:apiserver_request_total:increase30d{verb=~"LIST|GET"}) labels: verb: read record: code:apiserver_request_total:increase30d - expr: | - sum by (cluster_id, code) (code_verb:apiserver_request_total:increase30d{verb=~"POST|PUT|PATCH|DELETE"}) + sum by (cluster_id, installation, pipeline, provider, code) (code_verb:apiserver_request_total:increase30d{verb=~"POST|PUT|PATCH|DELETE"}) labels: verb: write record: code:apiserver_request_total:increase30d - expr: | - sum by (cluster_id, verb, scope) (increase(apiserver_request_duration_seconds_count[1h])) + sum by (cluster_id, installation, pipeline, provider, verb, scope) (increase(apiserver_request_duration_seconds_count[1h])) record: cluster_verb_scope:apiserver_request_duration_seconds_count:increase1h - expr: | - sum by (cluster_id, verb, scope) (avg_over_time(cluster_verb_scope:apiserver_request_duration_seconds_count:increase1h[30d]) * 24 * 30) + sum by (cluster_id, installation, pipeline, provider, verb, scope) (avg_over_time(cluster_verb_scope:apiserver_request_duration_seconds_count:increase1h[30d]) * 24 * 30) record: cluster_verb_scope:apiserver_request_duration_seconds_count:increase30d - expr: | - sum by (cluster_id, verb, scope, le) (increase(apiserver_request_duration_seconds_bucket[1h])) + sum by (cluster_id, installation, pipeline, provider, verb, scope, le) (increase(apiserver_request_duration_seconds_bucket[1h])) record: cluster_verb_scope_le:apiserver_request_duration_seconds_bucket:increase1h - expr: | - sum by (cluster_id, verb, scope, le) (avg_over_time(cluster_verb_scope_le:apiserver_request_duration_seconds_bucket:increase1h[30d]) * 24 * 30) + sum by (cluster_id, installation, pipeline, provider, verb, scope, le) (avg_over_time(cluster_verb_scope_le:apiserver_request_duration_seconds_bucket:increase1h[30d]) * 24 * 30) record: cluster_verb_scope_le:apiserver_request_duration_seconds_bucket:increase30d - expr: | 1 - ( ( # write too slow - sum by (cluster_id) (cluster_verb_scope:apiserver_request_duration_seconds_count:increase30d{verb=~"POST|PUT|PATCH|DELETE"}) + sum by (cluster_id, installation, pipeline, provider) (cluster_verb_scope:apiserver_request_duration_seconds_count:increase30d{verb=~"POST|PUT|PATCH|DELETE"}) - - sum by (cluster_id) (cluster_verb_scope_le:apiserver_request_duration_seconds_bucket:increase30d{verb=~"POST|PUT|PATCH|DELETE",le="1"}) + sum by (cluster_id, installation, pipeline, provider) (cluster_verb_scope_le:apiserver_request_duration_seconds_bucket:increase30d{verb=~"POST|PUT|PATCH|DELETE",le="1"}) ) + ( # read too slow - sum by (cluster_id) (cluster_verb_scope:apiserver_request_duration_seconds_count:increase30d{verb=~"LIST|GET"}) + sum by (cluster_id, installation, pipeline, provider) (cluster_verb_scope:apiserver_request_duration_seconds_count:increase30d{verb=~"LIST|GET"}) - ( ( - sum by (cluster_id) (cluster_verb_scope_le:apiserver_request_duration_seconds_bucket:increase30d{verb=~"LIST|GET",scope=~"resource|",le="1"}) + sum by (cluster_id, installation, pipeline, provider) (cluster_verb_scope_le:apiserver_request_duration_seconds_bucket:increase30d{verb=~"LIST|GET",scope=~"resource|",le="1"}) or vector(0) ) + - sum by (cluster_id) (cluster_verb_scope_le:apiserver_request_duration_seconds_bucket:increase30d{verb=~"LIST|GET",scope="namespace",le="5"}) + sum by (cluster_id, installation, pipeline, provider) (cluster_verb_scope_le:apiserver_request_duration_seconds_bucket:increase30d{verb=~"LIST|GET",scope="namespace",le="5"}) + - sum by (cluster_id) (cluster_verb_scope_le:apiserver_request_duration_seconds_bucket:increase30d{verb=~"LIST|GET",scope="cluster",le="30"}) + sum by (cluster_id, installation, pipeline, provider) (cluster_verb_scope_le:apiserver_request_duration_seconds_bucket:increase30d{verb=~"LIST|GET",scope="cluster",le="30"}) ) ) + # errors - sum by (cluster_id) (code:apiserver_request_total:increase30d{code=~"5.."} or vector(0)) + sum by (cluster_id, installation, pipeline, provider) (code:apiserver_request_total:increase30d{code=~"5.."} or vector(0)) ) / - sum by (cluster_id) (code:apiserver_request_total:increase30d) + sum by (cluster_id, installation, pipeline, provider) (code:apiserver_request_total:increase30d) labels: verb: all record: apiserver_request:availability30d - expr: | 1 - ( - sum by (cluster_id) (cluster_verb_scope:apiserver_request_duration_seconds_count:increase30d{verb=~"LIST|GET"}) + sum by (cluster_id, installation, pipeline, provider) (cluster_verb_scope:apiserver_request_duration_seconds_count:increase30d{verb=~"LIST|GET"}) - ( # too slow ( - sum by (cluster_id) (cluster_verb_scope_le:apiserver_request_duration_seconds_bucket:increase30d{verb=~"LIST|GET",scope=~"resource|",le="1"}) + sum by (cluster_id, installation, pipeline, provider) (cluster_verb_scope_le:apiserver_request_duration_seconds_bucket:increase30d{verb=~"LIST|GET",scope=~"resource|",le="1"}) or vector(0) ) + - sum by (cluster_id) (cluster_verb_scope_le:apiserver_request_duration_seconds_bucket:increase30d{verb=~"LIST|GET",scope="namespace",le="5"}) + sum by (cluster_id, installation, pipeline, provider) (cluster_verb_scope_le:apiserver_request_duration_seconds_bucket:increase30d{verb=~"LIST|GET",scope="namespace",le="5"}) + - sum by (cluster_id) (cluster_verb_scope_le:apiserver_request_duration_seconds_bucket:increase30d{verb=~"LIST|GET",scope="cluster",le="30"}) + sum by (cluster_id, installation, pipeline, provider) (cluster_verb_scope_le:apiserver_request_duration_seconds_bucket:increase30d{verb=~"LIST|GET",scope="cluster",le="30"}) ) + # errors - sum by (cluster_id) (code:apiserver_request_total:increase30d{verb="read",code=~"5.."} or vector(0)) + sum by (cluster_id, installation, pipeline, provider) (code:apiserver_request_total:increase30d{verb="read",code=~"5.."} or vector(0)) ) / - sum by (cluster_id) (code:apiserver_request_total:increase30d{verb="read"}) + sum by (cluster_id, installation, pipeline, provider) (code:apiserver_request_total:increase30d{verb="read"}) labels: verb: read record: apiserver_request:availability30d @@ -413,48 +413,48 @@ spec: 1 - ( ( # too slow - sum by (cluster_id) (cluster_verb_scope:apiserver_request_duration_seconds_count:increase30d{verb=~"POST|PUT|PATCH|DELETE"}) + sum by (cluster_id, installation, pipeline, provider) (cluster_verb_scope:apiserver_request_duration_seconds_count:increase30d{verb=~"POST|PUT|PATCH|DELETE"}) - - sum by (cluster_id) (cluster_verb_scope_le:apiserver_request_duration_seconds_bucket:increase30d{verb=~"POST|PUT|PATCH|DELETE",le="1"}) + sum by (cluster_id, installation, pipeline, provider) (cluster_verb_scope_le:apiserver_request_duration_seconds_bucket:increase30d{verb=~"POST|PUT|PATCH|DELETE",le="1"}) ) + # errors - sum by (cluster_id) (code:apiserver_request_total:increase30d{verb="write",code=~"5.."} or vector(0)) + sum by (cluster_id, installation, pipeline, provider) (code:apiserver_request_total:increase30d{verb="write",code=~"5.."} or vector(0)) ) / - sum by (cluster_id) (code:apiserver_request_total:increase30d{verb="write"}) + sum by (cluster_id, installation, pipeline, provider) (code:apiserver_request_total:increase30d{verb="write"}) labels: verb: write record: apiserver_request:availability30d - expr: | - sum by (cluster_id,code,resource) (rate(apiserver_request_total{app="kubernetes",verb=~"LIST|GET"}[5m])) + sum by (cluster_id, installation, pipeline, provider,code,resource) (rate(apiserver_request_total{app="kubernetes",verb=~"LIST|GET"}[5m])) labels: verb: read record: code_resource:apiserver_request_total:rate5m - expr: | - sum by (cluster_id,code,resource) (rate(apiserver_request_total{app="kubernetes",verb=~"POST|PUT|PATCH|DELETE"}[5m])) + sum by (cluster_id, installation, pipeline, provider,code,resource) (rate(apiserver_request_total{app="kubernetes",verb=~"POST|PUT|PATCH|DELETE"}[5m])) labels: verb: write record: code_resource:apiserver_request_total:rate5m - expr: | - sum by (cluster_id, code, verb) (increase(apiserver_request_total{app="kubernetes",verb=~"LIST|GET|POST|PUT|PATCH|DELETE",code=~"2.."}[1h])) + sum by (cluster_id, installation, pipeline, provider, code, verb) (increase(apiserver_request_total{app="kubernetes",verb=~"LIST|GET|POST|PUT|PATCH|DELETE",code=~"2.."}[1h])) record: code_verb:apiserver_request_total:increase1h - expr: | - sum by (cluster_id, code, verb) (increase(apiserver_request_total{app="kubernetes",verb=~"LIST|GET|POST|PUT|PATCH|DELETE",code=~"3.."}[1h])) + sum by (cluster_id, installation, pipeline, provider, code, verb) (increase(apiserver_request_total{app="kubernetes",verb=~"LIST|GET|POST|PUT|PATCH|DELETE",code=~"3.."}[1h])) record: code_verb:apiserver_request_total:increase1h - expr: | - sum by (cluster_id, code, verb) (increase(apiserver_request_total{app="kubernetes",verb=~"LIST|GET|POST|PUT|PATCH|DELETE",code=~"4.."}[1h])) + sum by (cluster_id, installation, pipeline, provider, code, verb) (increase(apiserver_request_total{app="kubernetes",verb=~"LIST|GET|POST|PUT|PATCH|DELETE",code=~"4.."}[1h])) record: code_verb:apiserver_request_total:increase1h - expr: | - sum by (cluster_id, code, verb) (increase(apiserver_request_total{app="kubernetes",verb=~"LIST|GET|POST|PUT|PATCH|DELETE",code=~"5.."}[1h])) + sum by (cluster_id, installation, pipeline, provider, code, verb) (increase(apiserver_request_total{app="kubernetes",verb=~"LIST|GET|POST|PUT|PATCH|DELETE",code=~"5.."}[1h])) record: code_verb:apiserver_request_total:increase1h - name: k8s.rules rules: - expr: | - sum by (cluster_id, namespace, pod, container) ( + sum by (cluster_id, installation, pipeline, provider, namespace, pod, container) ( irate(container_cpu_usage_seconds_total{app="cadvisor", image!=""}[5m]) - ) * on (cluster_id, namespace, pod) group_left(node) topk by (cluster_id, namespace, pod) ( - 1, max by(cluster_id, namespace, pod, node) (kube_pod_info{node!=""}) + ) * on (cluster_id, installation, pipeline, provider, namespace, pod) group_left(node) topk by (cluster_id, installation, pipeline, provider, namespace, pod) ( + 1, max by(cluster_id, installation, pipeline, provider, namespace, pod, node) (kube_pod_info{node!=""}) ) record: node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate - expr: | @@ -482,75 +482,75 @@ spec: ) record: node_namespace_pod_container:container_memory_swap - expr: | - kube_pod_container_resource_requests{resource="memory",app="kube-state-metrics"} * on (namespace, pod, cluster_id) - group_left() max by (namespace, pod, cluster_id) ( + kube_pod_container_resource_requests{resource="memory",app="kube-state-metrics"} * on (namespace, pod, cluster_id, installation, pipeline, provider) + group_left() max by (namespace, pod, cluster_id, installation, pipeline, provider) ( (kube_pod_status_phase{phase=~"Pending|Running"} == 1) ) record: cluster:namespace:pod_memory:active:kube_pod_container_resource_requests - expr: | - sum by (namespace, cluster_id) ( - sum by (namespace, pod, cluster_id) ( - max by (namespace, pod, container, cluster_id) ( + sum by (namespace, cluster_id, installation, pipeline, provider) ( + sum by (namespace, pod, cluster_id, installation, pipeline, provider) ( + max by (namespace, pod, container, cluster_id, installation, pipeline, provider) ( kube_pod_container_resource_requests{resource="memory",app="kube-state-metrics"} - ) * on(namespace, pod, cluster_id) group_left() max by (namespace, pod, cluster_id) ( + ) * on(namespace, pod, cluster_id, installation, pipeline, provider) group_left() max by (namespace, pod, cluster_id, installation, pipeline, provider) ( kube_pod_status_phase{phase=~"Pending|Running"} == 1 ) ) ) record: namespace_memory:kube_pod_container_resource_requests:sum - expr: | - kube_pod_container_resource_requests{resource="cpu",app="kube-state-metrics"} * on (namespace, pod, cluster_id) - group_left() max by (namespace, pod, cluster_id) ( + kube_pod_container_resource_requests{resource="cpu",app="kube-state-metrics"} * on (namespace, pod, cluster_id, installation, pipeline, provider) + group_left() max by (namespace, pod, cluster_id, installation, pipeline, provider) ( (kube_pod_status_phase{phase=~"Pending|Running"} == 1) ) record: cluster:namespace:pod_cpu:active:kube_pod_container_resource_requests - expr: | - sum by (namespace, cluster_id) ( - sum by (namespace, pod, cluster_id) ( - max by (namespace, pod, container, cluster_id) ( + sum by (namespace, cluster_id, installation, pipeline, provider) ( + sum by (namespace, pod, cluster_id, installation, pipeline, provider) ( + max by (namespace, pod, container, cluster_id, installation, pipeline, provider) ( kube_pod_container_resource_requests{resource="cpu",app="kube-state-metrics"} - ) * on(namespace, pod, cluster_id) group_left() max by (namespace, pod, cluster_id) ( + ) * on(namespace, pod, cluster_id, installation, pipeline, provider) group_left() max by (namespace, pod, cluster_id, installation, pipeline, provider) ( kube_pod_status_phase{phase=~"Pending|Running"} == 1 ) ) ) record: namespace_cpu:kube_pod_container_resource_requests:sum - expr: | - kube_pod_container_resource_limits{resource="memory",app="kube-state-metrics"} * on (namespace, pod, cluster_id) - group_left() max by (namespace, pod, cluster_id) ( + kube_pod_container_resource_limits{resource="memory",app="kube-state-metrics"} * on (namespace, pod, cluster_id, installation, pipeline, provider) + group_left() max by (namespace, pod, cluster_id, installation, pipeline, provider) ( (kube_pod_status_phase{phase=~"Pending|Running"} == 1) ) record: cluster:namespace:pod_memory:active:kube_pod_container_resource_limits - expr: | - sum by (namespace, cluster_id) ( - sum by (namespace, pod, cluster_id) ( - max by (namespace, pod, container, cluster_id) ( + sum by (namespace, cluster_id, installation, pipeline, provider) ( + sum by (namespace, pod, cluster_id, installation, pipeline, provider) ( + max by (namespace, pod, container, cluster_id, installation, pipeline, provider) ( kube_pod_container_resource_limits{resource="memory",app="kube-state-metrics"} - ) * on(namespace, pod, cluster_id) group_left() max by (namespace, pod, cluster_id) ( + ) * on(namespace, pod, cluster_id, installation, pipeline, provider) group_left() max by (namespace, pod, cluster_id, installation, pipeline, provider) ( kube_pod_status_phase{phase=~"Pending|Running"} == 1 ) ) ) record: namespace_memory:kube_pod_container_resource_limits:sum - expr: | - kube_pod_container_resource_limits{resource="cpu",app="kube-state-metrics"} * on (namespace, pod, cluster_id) - group_left() max by (namespace, pod, cluster_id) ( + kube_pod_container_resource_limits{resource="cpu",app="kube-state-metrics"} * on (namespace, pod, cluster_id, installation, pipeline, provider) + group_left() max by (namespace, pod, cluster_id, installation, pipeline, provider) ( (kube_pod_status_phase{phase=~"Pending|Running"} == 1) ) record: cluster:namespace:pod_cpu:active:kube_pod_container_resource_limits - expr: | - sum by (namespace, cluster_id) ( - sum by (namespace, pod, cluster_id) ( - max by (namespace, pod, container, cluster_id) ( + sum by (namespace, cluster_id, installation, pipeline, provider) ( + sum by (namespace, pod, cluster_id, installation, pipeline, provider) ( + max by (namespace, pod, container, cluster_id, installation, pipeline, provider) ( kube_pod_container_resource_limits{resource="cpu",app="kube-state-metrics"} - ) * on(namespace, pod, cluster_id) group_left() max by (namespace, pod, cluster_id) ( + ) * on(namespace, pod, cluster_id, installation, pipeline, provider) group_left() max by (namespace, pod, cluster_id, installation, pipeline, provider) ( kube_pod_status_phase{phase=~"Pending|Running"} == 1 ) ) ) record: namespace_cpu:kube_pod_container_resource_limits:sum - expr: | - max by (cluster_id, namespace, workload, pod) ( + max by (cluster_id, installation, pipeline, provider, namespace, workload, pod) ( label_replace( label_replace( kube_pod_owner{app="kube-state-metrics", owner_kind="ReplicaSet"}, @@ -567,7 +567,7 @@ spec: workload_type: deployment record: namespace_workload_pod:kube_pod_owner:relabel - expr: | - max by (cluster_id, namespace, workload, pod) ( + max by (cluster_id, installation, pipeline, provider, namespace, workload, pod) ( label_replace( kube_pod_owner{app="kube-state-metrics", owner_kind="DaemonSet"}, "workload", "$1", "owner_name", "(.*)" @@ -577,7 +577,7 @@ spec: workload_type: daemonset record: namespace_workload_pod:kube_pod_owner:relabel - expr: | - max by (cluster_id, namespace, workload, pod) ( + max by (cluster_id, installation, pipeline, provider, namespace, workload, pod) ( label_replace( kube_pod_owner{app="kube-state-metrics", owner_kind="StatefulSet"}, "workload", "$1", "owner_name", "(.*)" @@ -587,7 +587,7 @@ spec: workload_type: statefulset record: namespace_workload_pod:kube_pod_owner:relabel - expr: | - max by (cluster_id, namespace, workload, pod) ( + max by (cluster_id, installation, pipeline, provider, namespace, workload, pod) ( label_replace( kube_pod_owner{app="kube-state-metrics", owner_kind="Job"}, "workload", "$1", "owner_name", "(.*)" @@ -646,13 +646,13 @@ spec: - name: node.rules rules: - expr: | - topk by(cluster_id, namespace, pod) (1, - max by (cluster_id, node, namespace, pod) ( + topk by(cluster_id, installation, pipeline, provider, namespace, pod) (1, + max by (cluster_id, installation, pipeline, provider, node, namespace, pod) ( label_replace(kube_pod_info{app="kube-state-metrics",node!=""}, "pod", "$1", "pod", "(.*)") )) record: 'node_namespace_pod:kube_pod_info:' - expr: | - count by (cluster_id, node) ( + count by (cluster_id, installation, pipeline, provider, node) ( node_cpu_seconds_total{mode="idle",app="node-exporter"} * on (namespace, pod) group_left(node) topk by(namespace, pod) (1, node_namespace_pod:kube_pod_info:) @@ -667,34 +667,34 @@ spec: node_memory_MemFree_bytes{app="node-exporter"} + node_memory_Slab_bytes{app="node-exporter"} ) - ) by (cluster_id) + ) by (cluster_id, installation, pipeline, provider) record: :node_memory_MemAvailable_bytes:sum - expr: | - avg by (cluster_id, node) ( + avg by (cluster_id, installation, pipeline, provider, node) ( sum without (mode) ( rate(node_cpu_seconds_total{mode!="idle",mode!="iowait",mode!="steal",app="node-exporter"}[5m]) ) ) record: node:node_cpu_utilization:ratio_rate5m - expr: | - avg by (cluster_id) ( + avg by (cluster_id, installation, pipeline, provider) ( node:node_cpu_utilization:ratio_rate5m ) record: cluster:node_cpu:ratio_rate5m - name: kubelet.rules rules: - expr: | - histogram_quantile(0.99, sum(rate(kubelet_pleg_relist_duration_seconds_bucket[5m])) by (cluster_id, instance, le) * on(cluster_id, instance) group_left(node) kubelet_node_name{app="kubelet"}) + histogram_quantile(0.99, sum(rate(kubelet_pleg_relist_duration_seconds_bucket[5m])) by (cluster_id, installation, pipeline, provider, instance, le) * on(cluster_id, installation, pipeline, provider, instance) group_left(node) kubelet_node_name{app="kubelet"}) labels: quantile: "0.99" record: node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile - expr: | - histogram_quantile(0.9, sum(rate(kubelet_pleg_relist_duration_seconds_bucket[5m])) by (cluster_id, instance, le) * on(cluster_id, instance) group_left(node) kubelet_node_name{app="kubelet"}) + histogram_quantile(0.9, sum(rate(kubelet_pleg_relist_duration_seconds_bucket[5m])) by (cluster_id, installation, pipeline, provider, instance, le) * on(cluster_id, installation, pipeline, provider, instance) group_left(node) kubelet_node_name{app="kubelet"}) labels: quantile: "0.9" record: node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile - expr: | - histogram_quantile(0.5, sum(rate(kubelet_pleg_relist_duration_seconds_bucket[5m])) by (cluster_id, instance, le) * on(cluster_id, instance) group_left(node) kubelet_node_name{app="kubelet"}) + histogram_quantile(0.5, sum(rate(kubelet_pleg_relist_duration_seconds_bucket[5m])) by (cluster_id, installation, pipeline, provider, instance, le) * on(cluster_id, installation, pipeline, provider, instance) group_left(node) kubelet_node_name{app="kubelet"}) labels: quantile: "0.5" record: node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile \ No newline at end of file diff --git a/scripts/sync-kube-mixin.sh b/scripts/sync-kube-mixin.sh index 82aa0916a..4ad939136 100755 --- a/scripts/sync-kube-mixin.sh +++ b/scripts/sync-kube-mixin.sh @@ -19,6 +19,7 @@ function tune_rules { # Latest mixins use SLO instead of classic metrics in several places # but we dropped these SLO metrics sed -i 's/apiserver_request_slo_duration_seconds/apiserver_request_duration_seconds/g' "$RULESFILE" + sed -i 's/cluster_id/cluster_id, installation, pipeline, provider/g' "$RULESFILE" } function main {