Make Atlas rules compatible with Mimir (#1102)
* Make Atlas rules compatible with Mimir

* Update unit tests

* Fix review comments

* rewrite MatchingNumberOfPrometheusAndCluster
marieroque authored Apr 8, 2024
1 parent 71c794d commit 953aa3a
Showing 25 changed files with 193 additions and 96 deletions.
6 changes: 6 additions & 0 deletions CHANGELOG.md
@@ -7,6 +7,12 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

## [Unreleased]

### Changed

- Make Atlas rules compatible with Mimir:
  - Add the labels `cluster_id, installation, provider, pipeline` to every aggregation function
  - Rewrite some of the `absent()` functions

## [3.6.2] - 2024-04-04

### Changed
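A note on the recurring pattern in this commit: a PromQL aggregation keeps only the labels listed in its `by ()` clause, so grouping by `cluster_id` alone strips `installation`, `provider`, and `pipeline`. On a per-installation Prometheus those presumably come back as external labels, but in a shared Mimir, series from many installations live side by side, so the labels have to survive the aggregation itself. A minimal before/after, taken from the fluentbit rule in this commit:

```promql
# Before: only the listed labels survive the sum(); installation,
# provider and pipeline are dropped from the result.
sum(up{app="fluent-logshipping-app"}) by (app, cluster_id, job, namespace, node) == 0

# After: the Mimir-relevant labels are carried through the aggregation,
# so the resulting alert still identifies its origin.
sum(up{app="fluent-logshipping-app"}) by (app, cluster_id, installation, provider, pipeline, job, namespace, node) == 0
```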
@@ -17,7 +17,7 @@ spec:
annotations:
description: '{{`Deployment {{ $labels.namespace}}/{{ $labels.deployment }} is not satisfied.`}}'
opsrecipe: deployment-not-satisfied/
expr: kube_deployment_status_replicas_unavailable{cluster_type="management_cluster", deployment=~"alertmanager.*|grafana.*|prometheus.*|promxy.*"} > 0
expr: kube_deployment_status_replicas_unavailable{cluster_type="management_cluster", deployment=~"alertmanager.*|grafana.*|prometheus.*|promxy.*|mimir.*|loki.*|object-storage.*|logging-operator.*|silence-operator.*|sloth.*"} > 0
for: 30m
labels:
area: kaas
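For context on the widened matcher above: PromQL regex matchers are fully anchored, so each alternative must cover the whole label value by itself. A trimmed sketch of the rule's selector:

```promql
# deployment=~"mimir.*" matches "mimir-distributor" but not "my-mimir";
# the alternation covers each observability deployment by name prefix.
kube_deployment_status_replicas_unavailable{deployment=~"alertmanager.*|mimir.*|loki.*"} > 0
```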
@@ -46,7 +46,7 @@ spec:
annotations:
description: '{{`Fluentbit is down on node ({{ $labels.node }}).`}}'
opsrecipe: fluentbit-down/
expr: sum(up{app="fluent-logshipping-app"}) by (app, cluster_id, job, namespace, node) == 0
expr: sum(up{app="fluent-logshipping-app"}) by (app, cluster_id, installation, provider, pipeline, job, namespace, node) == 0
for: 15m
labels:
area: empowerment
@@ -31,11 +31,11 @@ spec:
- alert: GrafanaFolderPermissionsDown
# Monitors that folder permissions have been updated.
# We have a cronjob (grafana-permissions) that runs every 20 minutes.
# When it has run successfully, the folder permissions successful-updates counter increases.
annotations:
description: '{{`Grafana Folder not updated for ({{ $labels.instance }}).`}}'
opsrecipe: grafana-perms/
expr: sum(increase(grafana_http_request_duration_seconds_count{handler="/api/folders/:uid/permissions/", method="POST", namespace="monitoring", service="grafana", status_code="200", cluster_type="management_cluster"}[2h])) < 1 or absent(grafana_http_request_duration_seconds_count{handler="/api/folders/:uid/permissions/", method="POST", namespace="monitoring", service="grafana", status_code="200", cluster_type="management_cluster"})
expr: sum by(cluster_id, installation, provider, pipeline) (increase(grafana_http_request_duration_seconds_count{handler="/api/folders/:uid/permissions/", method="POST", namespace="monitoring", service="grafana", status_code="200", cluster_type="management_cluster"}[2h])) < 1 or absent(grafana_http_request_duration_seconds_count{handler="/api/folders/:uid/permissions/", method="POST", namespace="monitoring", service="grafana", status_code="200", cluster_type="management_cluster"})
for: 6h
labels:
area: managedservices
@@ -57,7 +57,7 @@ spec:
# - we create the cronjob label from the cron name (label_replace)
# - we sum the number of failures to get one global value
# - we use avg_over_time to avoid a 0 value when a cron run was skipped for whatever reason
expr: sum(label_replace(avg_over_time(kube_job_status_failed{job_name=~"grafana-permissions.*", reason!="BackoffLimitExceeded", cluster_type="management_cluster"}[60m]), "cronjob", "$1", "job_name", "(grafana-permissions)-.*")) by (cronjob) > 0
expr: sum by (cronjob, cluster_id, installation, provider, pipeline) (label_replace(avg_over_time(kube_job_status_failed{job_name=~"grafana-permissions.*", reason!="BackoffLimitExceeded", cluster_type="management_cluster"}[60m]), "cronjob", "$1", "job_name", "(grafana-permissions)-.*")) > 0
for: 6h
labels:
area: managedservices
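For readers unfamiliar with the `label_replace` trick used above: it copies a regex capture group from one label into another, leaving the series otherwise untouched (non-matching series pass through unchanged). The extraction step in isolation:

```promql
# job_name="grafana-permissions-28483200" yields cronjob="grafana-permissions".
# Signature: label_replace(vector, dst_label, replacement, src_label, regex)
label_replace(
  kube_job_status_failed{job_name=~"grafana-permissions.*"},
  "cronjob", "$1", "job_name", "(grafana-permissions)-.*"
)
```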
@@ -75,7 +75,7 @@ spec:
# This alert triggers when the grafana permission job did not schedule for more than 1 day
# or if the job did not run successfully at least once in the last day
expr: (time() - kube_cronjob_status_last_schedule_time{cronjob="grafana-permissions", cluster_type="management_cluster"}) > 86400
or count(max_over_time(kube_job_status_succeeded{job_name=~"grafana-permission.+", cluster_type="management_cluster"}[1d]) == 1) == 0
or count by (cluster_id, installation, provider, pipeline) (max_over_time(kube_job_status_succeeded{job_name=~"grafana-permission.+", cluster_type="management_cluster"}[1d]) == 1) == 0
labels:
area: empowerment
severity: page
@@ -15,7 +15,7 @@ spec:
# we retrieve the list of existing cluster IDs from `kube_namespace_created`,
# excluding the MC's, because the MC always uses prometheus-agent and its namespace is not named after the cluster name,
# then compare it with the list of deployed prometheus-agents from `app_operator_app_info`.
#
# This will only produce data (and inhibitions) on the MC, because that is where app_operator runs,
# but that's enough to have the inhibitions on the installation-global alertmanager.
- alert: InhibitionClusterIsNotRunningPrometheusAgent
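The expression itself sits outside this hunk, but the comment above describes an anti-join. A hypothetical sketch of the shape it takes (metric selectors and label names assumed from the comment, not taken from the actual rule):

```promql
# Clusters known from their namespaces, minus clusters that report a
# deployed prometheus-agent; whatever remains is not running the agent.
label_replace(kube_namespace_created, "cluster_id", "$1", "namespace", "(.*)")
unless on (cluster_id)
app_operator_app_info{name="prometheus-agent"}
```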
@@ -13,7 +13,7 @@ spec:
- alert: KedaDown
annotations:
description: 'Keda is down.'
expr: count (up{container=~"keda-.*"} == 0) > 0
expr: count by (cluster_id, installation, provider, pipeline) (up{container=~"keda-.*"} == 0) > 0
for: 10m
labels:
area: kaas
@@ -14,16 +14,23 @@ spec:
annotations:
description: '{{`KubeStateMetrics ({{ $labels.instance }}) is down.`}}'
opsrecipe: kube-state-metrics-down/
{{- if not .Values.mimir.enabled }}
expr: label_replace(up{app="kube-state-metrics",container=""}, "ip", "$1.$2.$3.$4", "node", "ip-(\\d+)-(\\d+)-(\\d+)-(\\d+).*") == 0 or absent(up{app="kube-state-metrics",container=""} == 1)
{{- else }}
expr: |-
(
# modern clusters
label_replace(up{app="kube-state-metrics",instance=~".*:8080"}, "ip", "$1.$2.$3.$4", "node", "ip-(\\d+)-(\\d+)-(\\d+)-(\\d+).*") == 0 or absent(up{app="kube-state-metrics",instance=~".*:8080"} == 1)
)
and
(
# vintage clusters without servicemonitor
label_replace(up{app="kube-state-metrics",container=""}, "ip", "$1.$2.$3.$4", "node", "ip-(\\d+)-(\\d+)-(\\d+)-(\\d+).*") == 0 or absent(up{app="kube-state-metrics",container=""} == 1)
)
count by (cluster_id, installation, provider, pipeline) (label_replace(up{app="kube-state-metrics", instance=~".*:8080"}, "ip", "$1.$2.$3.$4", "node", "ip-(\\d+)-(\\d+)-(\\d+)-(\\d+).*")) == 0
or (
label_replace(
capi_cluster_status_condition{type="ControlPlaneReady", status="True"},
"cluster_id",
"$1",
"name",
"(.*)"
) == 1
) unless on (cluster_id, customer, installation, pipeline, provider, region) (
count(up{app="kube-state-metrics", instance=~".*:8080"} == 1) by (cluster_id, customer, installation, pipeline, provider, region)
)
{{- end }}
for: 15m
labels:
area: kaas
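The Mimir branch above illustrates the second changelog item, rewriting `absent()`: `absent()` returns a single series carrying only the labels fixed by equality matchers in its selector, so in a multi-tenant Mimir it cannot say *which* cluster is missing the target. The replacement derives the expected set of clusters from an inventory metric and anti-joins it against what actually reports in, as in this condensed form of the expression above:

```promql
# Clusters whose control plane is ready...
label_replace(
  capi_cluster_status_condition{type="ControlPlaneReady", status="True"},
  "cluster_id", "$1", "name", "(.*)"
) == 1
# ...minus clusters where kube-state-metrics is up: survivors lack KSM.
unless on (cluster_id)
count(up{app="kube-state-metrics", instance=~".*:8080"} == 1) by (cluster_id)
```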
@@ -42,7 +49,7 @@ spec:
annotations:
description: '{{`KubeStateMetrics ({{ $labels.instance }}) is too slow.`}}'
opsrecipe: kube-state-metrics-down/
expr: histogram_quantile(0.99, sum(rate(http_request_duration_seconds_bucket{handler="metrics", job="kube-state-metrics"}[5m])) by (le, cluster_id)) > 7
expr: histogram_quantile(0.99, sum(rate(http_request_duration_seconds_bucket{handler="metrics", job="kube-state-metrics"}[5m])) by (le, cluster_id, installation, provider, pipeline)) > 7
for: 15m
labels:
area: kaas
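One detail worth noting in the slow-KSM rule above: `histogram_quantile` interpolates across bucket boundaries, so the `le` label must survive the inner aggregation; the new grouping simply appends the Mimir labels alongside it:

```promql
# le stays in the by() clause or the quantile has no buckets to work on.
histogram_quantile(0.99,
  sum(rate(http_request_duration_seconds_bucket{handler="metrics", job="kube-state-metrics"}[5m]))
    by (le, cluster_id, installation, provider, pipeline)
) > 7
```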
@@ -62,7 +69,7 @@ spec:
opsrecipe: kube-state-metrics-down/
expr: |-
# When the target looks up but we don't get metrics from it
count({app="kube-state-metrics"}) by (cluster_id) < 10
count({app="kube-state-metrics"}) by (cluster_id, installation, provider, pipeline) < 10
for: 20m
labels:
area: kaas
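A side note on the metadata check above: a selector with no metric name matches every series carrying those labels, which is what makes the low-series-count heuristic work:

```promql
# Counts all series exposed with app="kube-state-metrics", whatever their
# metric name; fewer than 10 per cluster means KSM is up but not reporting.
count({app="kube-state-metrics"}) by (cluster_id, installation, provider, pipeline) < 10
```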
8 changes: 4 additions & 4 deletions helm/prometheus-rules/templates/alerting-rules/loki.rules.yml
@@ -35,9 +35,9 @@ spec:
description: This alert checks that we have less than 10% errors on Loki requests.
opsrecipe: loki/
expr: |
100 * sum(rate(loki_request_duration_seconds_count{status_code=~"5.."}[1m])) by (cluster_id, namespace, job, route)
100 * sum(rate(loki_request_duration_seconds_count{status_code=~"5.."}[2m])) by (cluster_id, installation, provider, pipeline, namespace, job, route)
/
sum(rate(loki_request_duration_seconds_count[1m])) by (cluster_id, namespace, job, route)
sum(rate(loki_request_duration_seconds_count[2m])) by (cluster_id, installation, provider, pipeline, namespace, job, route)
> 10
for: 120m
labels:
@@ -56,7 +56,7 @@ spec:
description: This alert checks that we have no panic errors on Loki.
opsrecipe: loki/
expr: |
sum(increase(loki_panic_total[10m])) by (cluster_id, namespace, job) > 0
sum(increase(loki_panic_total[10m])) by (cluster_id, installation, provider, pipeline, namespace, job) > 0
labels:
area: managedservices
cancel_if_apiserver_down: "true"
@@ -73,7 +73,7 @@ spec:
description: '{{`Loki pod {{ $labels.pod }} (namespace {{ $labels.namespace }}) sees {{ $value }} unhealthy ring members`}}'
opsrecipe: loki/
expr: |
sum (min_over_time(cortex_ring_members{state="Unhealthy"}[30m])) by (app, cluster_id, container, customer, installation, name, namespace, organization, pod) > 0
sum (min_over_time(cortex_ring_members{state="Unhealthy"}[30m])) by (app, cluster_id, container, customer, installation, provider, pipeline, name, namespace, organization, pod) > 0
labels:
area: managedservices
cancel_if_apiserver_down: "true"
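Besides the added labels, the error-ratio rule in this file widened its `rate()` windows from `[1m]` to `[2m]`. The commit does not state the motivation, but the mechanics are standard PromQL: `rate()` needs at least two samples inside the range, and with a 60s scrape interval a one-minute window often holds a single sample and returns nothing. The resulting ratio:

```promql
# Percentage of 5xx requests per route; both operands aggregate by the
# same labels so the division can pair up series.
100 * sum(rate(loki_request_duration_seconds_count{status_code=~"5.."}[2m]))
        by (cluster_id, installation, provider, pipeline, namespace, job, route)
  /   sum(rate(loki_request_duration_seconds_count[2m]))
        by (cluster_id, installation, provider, pipeline, namespace, job, route)
  > 10
```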
@@ -17,7 +17,7 @@ spec:
annotations:
description: '{{`CR version {{ $labels.version }} in cluster {{ $labels.cluster_id }} is reconciled by multiple apps including {{ $labels.app }}.`}}'
opsrecipe: multiple-operators-running-same-version/
expr: sum(label_replace(giantswarm_build_info{app=~"prometheus-meta-operator.*"}, "version", "$1", "reconciled_version", "(.+)")) by (app, cluster_id, version) > 1
expr: sum(label_replace(giantswarm_build_info{app=~"prometheus-meta-operator.*"}, "version", "$1", "reconciled_version", "(.+)")) by (app, cluster_id, installation, provider, pipeline, version) > 1
for: 5m
labels:
area: empowerment
@@ -45,7 +45,7 @@ spec:
- alert: MimirComponentDown
annotations:
description: '{{`Mimir component: {{ $labels.service }} is down.`}}'
expr: count(up{app="mimir"} == 0) by (cluster_id, service) > 0
expr: count(up{app="mimir"} == 0) by (cluster_id, installation, provider, pipeline, service) > 0
for: 5m
labels:
area: managedservices
@@ -59,7 +59,7 @@ spec:
- alert: GrafanaAgentForPrometheusRulesDown
annotations:
description: 'Grafana-agent sending PrometheusRules to Mimir ruler is down.'
expr: count(up{app="grafana-agent", namespace="mimir"} == 0) by (cluster_id) > 0
expr: count(up{app="grafana-agent", namespace="mimir"} == 0) by (cluster_id, installation, provider, pipeline) > 0
for: 1h
labels:
area: managedservices
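The `MimirComponentDown` expression above leans on the fact that PromQL comparisons filter by default rather than return 0/1, so `up == 0` keeps only the down targets before counting:

```promql
# up == 0 drops healthy targets entirely; count then groups the remaining
# (down) targets per Mimir service and cluster.
count(up{app="mimir"} == 0) by (cluster_id, installation, provider, pipeline, service) > 0
```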
@@ -82,7 +82,7 @@ spec:
count(up{job="prometheus-agent"} > 0) by (cluster_id, customer, installation, pipeline, provider, region)
)
{{- end }}
for: 1m
for: 2m
labels:
area: empowerment
severity: none
@@ -99,18 +99,18 @@ spec:
summary: Prometheus agent is missing shards.
opsrecipe: prometheus-agent/
expr: |-
max_over_time(sum by (cluster_id)(
max_over_time(sum by (cluster_id, installation, provider, pipeline)(
count(
## number of remotes that are not mimir or grafana-cloud
prometheus_remote_storage_metadata_total{remote_name!~"grafana-cloud|mimir.*"}
) by (cluster_id)
) by (cluster_id, installation, provider, pipeline)
!=
sum(
## number of shards defined in the Prometheus CR
prometheus_operator_spec_shards{controller="prometheus",name="prometheus-agent"}
# if there is only 1 shard, there is no shard metric so we use the replicas metric
or prometheus_operator_spec_replicas{controller="prometheus",name="prometheus-agent"}
) by (cluster_id)
) by (cluster_id, installation, provider, pipeline)
)[5m:])
for: 20m
labels:
@@ -130,20 +130,20 @@ spec:
summary: Prometheus agent is missing shards.
opsrecipe: prometheus-agent/
expr: |-
max_over_time(sum by (cluster_id)(
max_over_time(sum by (cluster_id, installation, provider, pipeline)(
count(
## number of remotes that are not mimir or grafana-cloud
prometheus_remote_storage_metadata_total{remote_name!~"grafana-cloud|mimir.*"}
) by (cluster_id)
) by (cluster_id, installation, provider, pipeline)
!=
sum(
## number of shards defined in the Prometheus CR
prometheus_operator_spec_shards{controller="prometheus",name="prometheus-agent"}
# if there is only 1 shard, there is no shard metric so we use the replicas metric
or prometheus_operator_spec_replicas{controller="prometheus",name="prometheus-agent"}
) by (cluster_id)
) by (cluster_id, installation, provider, pipeline)
)[5m:])
for: 1m
for: 2m
labels:
area: empowerment
severity: none
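The shards-missing rules above wrap their comparison in `max_over_time(...[5m:])`, a subquery: the inner expression is re-evaluated over the last five minutes at the default resolution, and the maximum is kept, so a shard count that flaps back to normal between rule evaluations still trips the alert. A condensed form (the outer `sum by` of the original is elided here):

```promql
# != filters: series where the remote count and the configured shard count
# agree disappear; max_over_time keeps any mismatch from the last 5 minutes.
max_over_time((
    count(prometheus_remote_storage_metadata_total{remote_name!~"grafana-cloud|mimir.*"})
      by (cluster_id, installation, provider, pipeline)
  !=
    sum(prometheus_operator_spec_shards{controller="prometheus",name="prometheus-agent"}
        or prometheus_operator_spec_replicas{controller="prometheus",name="prometheus-agent"})
      by (cluster_id, installation, provider, pipeline)
)[5m:])
```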
@@ -28,20 +28,22 @@ spec:
# If a prometheus is missing, this alert fires. It does not check whether a prometheus is running when it should not be (e.g. for a deleted cluster)
expr: |
(
sum by(cluster_id) (
sum by(cluster_id, installation, provider, pipeline) (
{__name__=~"cluster_service_cluster_info|cluster_operator_cluster_status", status!="Deleting"}
) unless sum by(cluster_id) (
) unless sum by(cluster_id, installation, provider, pipeline) (
label_replace(
kube_pod_container_status_running{container="prometheus", namespace!="{{ .Values.managementCluster.name }}-prometheus", namespace=~".*-prometheus"},
"cluster_id", "$2", "pod", "(prometheus-)(.+)(-.+)"
)
)
) + (
sum by (cluster_name) (
capi_cluster_status_phase{phase!="Deleting"}
) unless sum by (cluster_name) (
label_replace(kube_pod_container_status_running{container="prometheus",namespace=~".*-prometheus"},
"cluster_name", "$2", "pod", "(prometheus-)(.+)(-.+)"
) or (
sum by (cluster_id, installation, provider, pipeline) (
label_replace(capi_cluster_status_phase{phase!="Deleting"},
"cluster_id", "$1", "name", "(.+)"
)
) unless sum by (cluster_id, installation, provider, pipeline) (
label_replace(kube_pod_container_status_running{container="prometheus",namespace=~".*-prometheus"},
"cluster_id", "$2", "pod", "(prometheus-)(.+)(-.+)"
)
)
)
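The rewritten `MatchingNumberOfPrometheusAndCluster` above also shows why the CAPI branch changed: `capi_cluster_status_phase` names the cluster in a `name` label, while the pod side derives it from the pod name, so both sides are rewritten onto `cluster_id` before the `unless` anti-join (and the two inventory branches are now combined with `or` rather than `+`, so either source alone can flag a missing prometheus). The CAPI join in isolation:

```promql
# Expected clusters (from CAPI), minus clusters with a running prometheus
# pod; both sides share cluster_id thanks to label_replace.
sum by (cluster_id, installation, provider, pipeline) (
  label_replace(capi_cluster_status_phase{phase!="Deleting"},
    "cluster_id", "$1", "name", "(.+)")
)
unless
sum by (cluster_id, installation, provider, pipeline) (
  label_replace(kube_pod_container_status_running{container="prometheus", namespace=~".*-prometheus"},
    "cluster_id", "$2", "pod", "(prometheus-)(.+)(-.+)")
)
```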
@@ -33,7 +33,7 @@ spec:
- alert: PrometheusOperatorListErrors
annotations:
description: Errors while performing List operations in controller {{`{{`}}$labels.controller{{`}}`}} in {{`{{`}}$labels.namespace{{`}}`}} namespace.
expr: (sum by (cluster_id, controller, namespace) (rate(prometheus_operator_list_operations_failed_total{app=~"prometheus-operator.*|kube-prometheus-.*"}[10m])) / sum by (cluster_id, controller,namespace) (rate(prometheus_operator_list_operations_total{app=~"prometheus-operator.*|kube-prometheus-.*"}[10m]))) > 0.4
expr: (sum by (cluster_id, installation, provider, pipeline, controller, namespace) (rate(prometheus_operator_list_operations_failed_total{app=~"prometheus-operator.*|kube-prometheus-.*"}[10m])) / sum by (cluster_id, installation, provider, pipeline, controller,namespace) (rate(prometheus_operator_list_operations_total{app=~"prometheus-operator.*|kube-prometheus-.*"}[10m]))) > 0.4
for: 15m
labels:
area: empowerment
@@ -44,7 +44,7 @@ spec:
- alert: PrometheusOperatorWatchErrors
annotations:
description: Errors while performing watch operations in controller {{`{{`}}$labels.controller{{`}}`}} in {{`{{`}}$labels.namespace{{`}}`}} namespace.
expr: (sum by (cluster_id, controller, namespace) (rate(prometheus_operator_watch_operations_failed_total{app=~"prometheus-operator.*|kube-prometheus-.*"}[10m])) / sum by (cluster_id, controller,namespace) (rate(prometheus_operator_watch_operations_total{app=~"prometheus-operator.*|kube-prometheus-.*"}[10m]))) > 0.4
expr: (sum by (cluster_id, installation, provider, pipeline, controller, namespace) (rate(prometheus_operator_watch_operations_failed_total{app=~"prometheus-operator.*|kube-prometheus-.*"}[10m])) / sum by (cluster_id, installation, provider, pipeline, controller,namespace) (rate(prometheus_operator_watch_operations_total{app=~"prometheus-operator.*|kube-prometheus-.*"}[10m]))) > 0.4
for: 15m
labels:
area: empowerment
@@ -66,7 +66,7 @@ spec:
- alert: PrometheusOperatorReconcileErrors
annotations:
description: '{{`{{`}} $value | humanizePercentage {{`}}`}} of reconciling operations failed for {{`{{`}} $labels.controller {{`}}`}} controller in {{`{{`}} $labels.namespace {{`}}`}} namespace.'
expr: (sum by (cluster_id, controller,namespace) (rate(prometheus_operator_reconcile_errors_total{app=~"prometheus-operator.*|kube-prometheus-.*"}[5m]))) / (sum by (cluster_id, controller,namespace) (rate(prometheus_operator_reconcile_operations_total{app=~"prometheus-operator.*|kube-prometheus-.*"}[5m]))) > 0.1
expr: (sum by (cluster_id, installation, provider, pipeline, controller,namespace) (rate(prometheus_operator_reconcile_errors_total{app=~"prometheus-operator.*|kube-prometheus-.*"}[5m]))) / (sum by (cluster_id, installation, provider, pipeline, controller,namespace) (rate(prometheus_operator_reconcile_operations_total{app=~"prometheus-operator.*|kube-prometheus-.*"}[5m]))) > 0.1
for: 10m
labels:
area: empowerment
@@ -88,7 +88,7 @@ spec:
- alert: PrometheusOperatorNotReady
annotations:
description: Prometheus operator in {{`{{`}} $labels.namespace {{`}}`}} namespace isn't ready to reconcile {{`{{`}} $labels.controller {{`}}`}} resources.
expr: min by (cluster_id, namespace, controller) (max_over_time(prometheus_operator_ready{app=~"prometheus-operator.*|kube-prometheus-.*"}[5m]) == 0)
expr: min by (cluster_id, installation, provider, pipeline, namespace, controller) (max_over_time(prometheus_operator_ready{app=~"prometheus-operator.*|kube-prometheus-.*"}[5m]) == 0)
for: 5m
labels:
area: empowerment
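All four prometheus-operator ratio alerts above follow the same rule of thumb: vector division matches series on their entire label sets (excluding the metric name), so numerator and denominator must aggregate by exactly the same labels or series silently drop out of the ratio. Condensed from the reconcile-errors rule:

```promql
# Identical by() clauses on both sides; any divergence would unmatch the
# series and the alert could never fire for them.
  sum by (cluster_id, installation, provider, pipeline, controller, namespace)
      (rate(prometheus_operator_reconcile_errors_total{app=~"prometheus-operator.*|kube-prometheus-.*"}[5m]))
/ sum by (cluster_id, installation, provider, pipeline, controller, namespace)
      (rate(prometheus_operator_reconcile_operations_total{app=~"prometheus-operator.*|kube-prometheus-.*"}[5m]))
> 0.1
```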