Make Atlas rules compatible with Mimir #1102

Merged: 5 commits, Apr 8, 2024
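A minimal before/after sketch of the pattern this PR applies, using the Keda rule from the diff below. The rationale given here (aggregations must keep installation-identifying labels once several installations write to a shared Mimir store) is an assumption, not wording taken from the PR:

```promql
# Before: the aggregation drops all labels, which becomes ambiguous when many
# installations share one Mimir store (assumed rationale).
count (up{container=~"keda-.*"} == 0) > 0

# After: the grouping labels are carried through the aggregation.
count by (cluster_id, installation, provider, pipeline) (up{container=~"keda-.*"} == 0) > 0
```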
6 changes: 6 additions & 0 deletions CHANGELOG.md
@@ -7,6 +7,12 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

## [Unreleased]

### Changed

- Make Atlas rules compatible with Mimir:
  - Add the labels `cluster_id`, `installation`, `provider` and `pipeline` to each aggregation function
  - Rewrite some of the `absent` functions

## [3.6.2] - 2024-04-04

### Changed
@@ -17,7 +17,7 @@ spec:
annotations:
description: '{{`Deployment {{ $labels.namespace}}/{{ $labels.deployment }} is not satisfied.`}}'
opsrecipe: deployment-not-satisfied/
expr: kube_deployment_status_replicas_unavailable{cluster_type="management_cluster", deployment=~"alertmanager.*|grafana.*|prometheus.*|promxy.*"} > 0
expr: kube_deployment_status_replicas_unavailable{cluster_type="management_cluster", deployment=~"alertmanager.*|grafana.*|prometheus.*|promxy.*|mimir.*|loki.*|object-storage.*|logging-operator.*|silence-operator.*|sloth.*"} > 0
for: 30m
labels:
area: kaas
@@ -46,7 +46,7 @@ spec:
annotations:
description: '{{`Fluentbit is down on node ({{ $labels.node }}).`}}'
opsrecipe: fluentbit-down/
expr: sum(up{app="fluent-logshipping-app"}) by (app, cluster_id, job, namespace, node) == 0
expr: sum(up{app="fluent-logshipping-app"}) by (app, cluster_id, installation, provider, pipeline, job, namespace, node) == 0
for: 15m
labels:
area: empowerment
@@ -31,11 +31,11 @@ spec:
- alert: GrafanaFolderPermissionsDown
# Monitors that folder permissions have been updated.
# We have a cronjob (grafana-permissions) that runs every 20 minutes.
# When successfully run, folders permissions successful updates counter increases.
annotations:
description: '{{`Grafana Folder not updated for ({{ $labels.instance }}).`}}'
opsrecipe: grafana-perms/
expr: sum(increase(grafana_http_request_duration_seconds_count{handler="/api/folders/:uid/permissions/", method="POST", namespace="monitoring", service="grafana", status_code="200", cluster_type="management_cluster"}[2h])) < 1 or absent(grafana_http_request_duration_seconds_count{handler="/api/folders/:uid/permissions/", method="POST", namespace="monitoring", service="grafana", status_code="200", cluster_type="management_cluster"})
expr: sum by(cluster_id, installation, provider, pipeline) (increase(grafana_http_request_duration_seconds_count{handler="/api/folders/:uid/permissions/", method="POST", namespace="monitoring", service="grafana", status_code="200", cluster_type="management_cluster"}[2h])) < 1 or absent(grafana_http_request_duration_seconds_count{handler="/api/folders/:uid/permissions/", method="POST", namespace="monitoring", service="grafana", status_code="200", cluster_type="management_cluster"})
for: 6h
labels:
area: managedservices
@@ -57,7 +57,7 @@ spec:
# - we create cronjob label from cron name (label_replace)
# - we sum number of failed to have one global value
# - we avg_over_time to avoid 0 value when a cron was skipped for whatever reason
expr: sum(label_replace(avg_over_time(kube_job_status_failed{job_name=~"grafana-permissions.*", reason!="BackoffLimitExceeded", cluster_type="management_cluster"}[60m]), "cronjob", "$1", "job_name", "(grafana-permissions)-.*")) by (cronjob) > 0
expr: sum by (cronjob, cluster_id, installation, provider, pipeline) (label_replace(avg_over_time(kube_job_status_failed{job_name=~"grafana-permissions.*", reason!="BackoffLimitExceeded", cluster_type="management_cluster"}[60m]), "cronjob", "$1", "job_name", "(grafana-permissions)-.*")) > 0
for: 6h
labels:
area: managedservices
@@ -75,7 +75,7 @@ spec:
# This alert triggers when the grafana permission job did not schedule for more than 1 day
# or if the job did not run successfully at least once in the last day
expr: (time() - kube_cronjob_status_last_schedule_time{cronjob="grafana-permissions", cluster_type="management_cluster"}) > 86400
or count(max_over_time(kube_job_status_succeeded{job_name=~"grafana-permission.+", cluster_type="management_cluster"}[1d]) == 1) == 0
or count by (cluster_id, installation, provider, pipeline) (max_over_time(kube_job_status_succeeded{job_name=~"grafana-permission.+", cluster_type="management_cluster"}[1d]) == 1) == 0
labels:
area: empowerment
severity: page
@@ -15,7 +15,7 @@ spec:
# we retrieve the list of existing cluster IDs from `kube_namespace_created`
# excluding the MC's, because it always uses prometheus-agent and its namespace is not named after the cluster name,
# then we compare it with the list of deployed prometheus-agents from `app_operator_app_info`.
#
# This will only produce data (and inhibitions) on the MC, because that is where app_operator runs,
# but that is enough to have the inhibitions on the installation-global alertmanager.
- alert: InhibitionClusterIsNotRunningPrometheusAgent
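The rule body is collapsed in this view. As a rough, hedged sketch of the comparison described in the comments above (the metric names come from those comments; the matchers, label handling and join are assumptions, not the actual expression):

```promql
# Hedged sketch only, not the real rule body.
# Clusters known from namespace creation, minus clusters that already run a
# prometheus-agent according to app_operator_app_info (label names assumed).
count by (cluster_id) (
  label_replace(kube_namespace_created, "cluster_id", "$1", "namespace", "(.+)")
)
unless
count by (cluster_id) (app_operator_app_info{name="prometheus-agent"})
```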
@@ -13,7 +13,7 @@ spec:
- alert: KedaDown
annotations:
description: 'Keda is down.'
expr: count (up{container=~"keda-.*"} == 0) > 0
expr: count by (cluster_id, installation, provider, pipeline) (up{container=~"keda-.*"} == 0) > 0
for: 10m
labels:
area: kaas
@@ -14,16 +14,23 @@ spec:
annotations:
description: '{{`KubeStateMetrics ({{ $labels.instance }}) is down.`}}'
opsrecipe: kube-state-metrics-down/
{{- if not .Values.mimir.enabled }}
expr: label_replace(up{app="kube-state-metrics",container=""}, "ip", "$1.$2.$3.$4", "node", "ip-(\\d+)-(\\d+)-(\\d+)-(\\d+).*") == 0 or absent(up{app="kube-state-metrics",container=""} == 1)
{{- else }}
expr: |-
(
# modern clusters
label_replace(up{app="kube-state-metrics",instance=~".*:8080"}, "ip", "$1.$2.$3.$4", "node", "ip-(\\d+)-(\\d+)-(\\d+)-(\\d+).*") == 0 or absent(up{app="kube-state-metrics",instance=~".*:8080"} == 1)
)
and
(
# vintage clusters without servicemonitor
label_replace(up{app="kube-state-metrics",container=""}, "ip", "$1.$2.$3.$4", "node", "ip-(\\d+)-(\\d+)-(\\d+)-(\\d+).*") == 0 or absent(up{app="kube-state-metrics",container=""} == 1)
)
count by (cluster_id, installation, provider, pipeline) (label_replace(up{app="kube-state-metrics", instance=~".*:8080"}, "ip", "$1.$2.$3.$4", "node", "ip-(\\d+)-(\\d+)-(\\d+)-(\\d+).*")) == 0
or (
label_replace(
capi_cluster_status_condition{type="ControlPlaneReady", status="True"},
"cluster_id",
"$1",
"name",
"(.*)"
) == 1
) unless on (cluster_id, customer, installation, pipeline, provider, region) (
count(up{app="kube-state-metrics", instance=~".*:8080"} == 1) by (cluster_id, customer, installation, pipeline, provider, region)
)
{{- end }}
for: 15m
labels:
area: kaas
@@ -42,7 +49,7 @@
annotations:
description: '{{`KubeStateMetrics ({{ $labels.instance }}) is too slow.`}}'
opsrecipe: kube-state-metrics-down/
expr: histogram_quantile(0.99, sum(rate(http_request_duration_seconds_bucket{handler="metrics", job="kube-state-metrics"}[5m])) by (le, cluster_id)) > 7
expr: histogram_quantile(0.99, sum(rate(http_request_duration_seconds_bucket{handler="metrics", job="kube-state-metrics"}[5m])) by (le, cluster_id, installation, provider, pipeline)) > 7
for: 15m
labels:
area: kaas
@@ -62,7 +69,7 @@
opsrecipe: kube-state-metrics-down/
expr: |-
# When it looks up but we don't have metrics
count({app="kube-state-metrics"}) by (cluster_id) < 10
count({app="kube-state-metrics"}) by (cluster_id, installation, provider, pipeline) < 10
for: 20m
labels:
area: kaas
8 changes: 4 additions & 4 deletions helm/prometheus-rules/templates/alerting-rules/loki.rules.yml
@@ -35,9 +35,9 @@ spec:
description: This alert checks that we have less than 10% errors on Loki requests.
opsrecipe: loki/
expr: |
100 * sum(rate(loki_request_duration_seconds_count{status_code=~"5.."}[1m])) by (cluster_id, namespace, job, route)
100 * sum(rate(loki_request_duration_seconds_count{status_code=~"5.."}[2m])) by (cluster_id, installation, provider, pipeline, namespace, job, route)
Review comment (PR author):
Currently the Loki ServiceMonitor scrapes every 15s, so this change is not strictly required.
But IMO we should change the scrapeInterval to 1m, so it is safer to make this change now...
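A short aside on why the wider window matters (a general PromQL property, not something stated in this thread): `rate()` needs at least two samples inside its range, so the range should comfortably exceed the scrape interval.

```promql
# With a 1m scrape interval, a [1m] window often holds only one sample and
# rate() returns nothing; [2m] leaves room for at least two samples.
sum(rate(loki_request_duration_seconds_count{status_code=~"5.."}[2m]))
  by (cluster_id, installation, provider, pipeline, namespace, job, route)
```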

/
sum(rate(loki_request_duration_seconds_count[1m])) by (cluster_id, namespace, job, route)
sum(rate(loki_request_duration_seconds_count[2m])) by (cluster_id, installation, provider, pipeline, namespace, job, route)
> 10
for: 120m
labels:
@@ -56,7 +56,7 @@ spec:
description: This alert checks that we have no panic errors on Loki.
opsrecipe: loki/
expr: |
sum(increase(loki_panic_total[10m])) by (cluster_id, namespace, job) > 0
sum(increase(loki_panic_total[10m])) by (cluster_id, installation, provider, pipeline, namespace, job) > 0
labels:
area: managedservices
cancel_if_apiserver_down: "true"
@@ -73,7 +73,7 @@ spec:
description: '{{`Loki pod {{ $labels.pod }} (namespace {{ $labels.namespace }}) sees {{ $value }} unhealthy ring members`}}'
opsrecipe: loki/
expr: |
sum (min_over_time(cortex_ring_members{state="Unhealthy"}[30m])) by (app, cluster_id, container, customer, installation, name, namespace, organization, pod) > 0
sum (min_over_time(cortex_ring_members{state="Unhealthy"}[30m])) by (app, cluster_id, container, customer, installation, provider, pipeline, name, namespace, organization, pod) > 0
labels:
area: managedservices
cancel_if_apiserver_down: "true"
@@ -17,7 +17,7 @@ spec:
annotations:
description: '{{`CR version {{ $labels.version }} in cluster {{ $labels.cluster_id }} is reconciled by multiple apps including {{ $labels.app }}.`}}'
opsrecipe: multiple-operators-running-same-version/
expr: sum(label_replace(giantswarm_build_info{app=~"prometheus-meta-operator.*"}, "version", "$1", "reconciled_version", "(.+)")) by (app, cluster_id, version) > 1
expr: sum(label_replace(giantswarm_build_info{app=~"prometheus-meta-operator.*"}, "version", "$1", "reconciled_version", "(.+)")) by (app, cluster_id, installation, provider, pipeline, version) > 1
for: 5m
labels:
area: empowerment
@@ -45,7 +45,7 @@ spec:
- alert: MimirComponentDown
annotations:
description: '{{`Mimir component : {{ $labels.service }} is down.`}}'
expr: count(up{app="mimir"} == 0) by (cluster_id, service) > 0
expr: count(up{app="mimir"} == 0) by (cluster_id, installation, provider, pipeline, service) > 0
for: 5m
labels:
area: managedservices
@@ -59,7 +59,7 @@ spec:
- alert: GrafanaAgentForPrometheusRulesDown
annotations:
description: 'Grafana-agent sending PrometheusRules to Mimir ruler is down.'
expr: count(up{app="grafana-agent", namespace="mimir"} == 0) by (cluster_id) > 0
expr: count(up{app="grafana-agent", namespace="mimir"} == 0) by (cluster_id, installation, provider, pipeline) > 0
for: 1h
labels:
area: managedservices
@@ -82,7 +82,7 @@ spec:
count(up{job="prometheus-agent"} > 0) by (cluster_id, customer, installation, pipeline, provider, region)
)
{{- end }}
for: 1m
for: 2m
labels:
area: empowerment
severity: none
@@ -99,18 +99,18 @@ spec:
summary: Prometheus agent is missing shards.
opsrecipe: prometheus-agent/
expr: |-
max_over_time(sum by (cluster_id)(
max_over_time(sum by (cluster_id, installation, provider, pipeline)(
count(
## number of remotes that are not mimir or grafana-cloud
prometheus_remote_storage_metadata_total{remote_name!~"grafana-cloud|mimir.*"}
) by (cluster_id)
) by (cluster_id, installation, provider, pipeline)
!=
sum(
## number of shards defined in the Prometheus CR
prometheus_operator_spec_shards{controller="prometheus",name="prometheus-agent"}
# if there is only 1 shard, there is no shard metric so we use the replicas metric
or prometheus_operator_spec_replicas{controller="prometheus",name="prometheus-agent"}
) by (cluster_id)
) by (cluster_id, installation, provider, pipeline)
)[5m:])
for: 20m
labels:
@@ -130,20 +130,20 @@ spec:
summary: Prometheus agent is missing shards.
opsrecipe: prometheus-agent/
expr: |-
max_over_time(sum by (cluster_id)(
max_over_time(sum by (cluster_id, installation, provider, pipeline)(
count(
## number of remotes that are not mimir or grafana-cloud
prometheus_remote_storage_metadata_total{remote_name!~"grafana-cloud|mimir.*"}
) by (cluster_id)
) by (cluster_id, installation, provider, pipeline)
!=
sum(
## number of shards defined in the Prometheus CR
prometheus_operator_spec_shards{controller="prometheus",name="prometheus-agent"}
# if there is only 1 shard, there is no shard metric so we use the replicas metric
or prometheus_operator_spec_replicas{controller="prometheus",name="prometheus-agent"}
) by (cluster_id)
) by (cluster_id, installation, provider, pipeline)
)[5m:])
for: 1m
for: 2m
labels:
area: empowerment
severity: none
@@ -28,20 +28,20 @@ spec:
# If a prometheus is missing, this alert will fire. This alert will not check if a prometheus is running when it should not (e.g. deleted cluster)
expr: |
(
sum by(cluster_id) (
sum by(cluster_id, installation, provider, pipeline) (
{__name__=~"cluster_service_cluster_info|cluster_operator_cluster_status", status!="Deleting"}
) unless sum by(cluster_id) (
) unless sum by(cluster_id, installation, provider, pipeline) (
label_replace(
kube_pod_container_status_running{container="prometheus", namespace!="{{ .Values.managementCluster.name }}-prometheus", namespace=~".*-prometheus"},
"cluster_id", "$2", "pod", "(prometheus-)(.+)(-.+)"
)
)
) + (
sum by (cluster_name) (
) or (
Review comment (PR author):
That query should not have worked, as cluster_name does not exist.

Reply (reviewer):
Why would it not work? cluster_name came from the label_replace, so it did work, and we tested it with Herve. Also, why was this changed from a + to an or?

Reply (PR author):
label_replace is missing in
sum by (cluster_name) ( capi_cluster_status_phase{phase!="Deleting"} )

Reply (PR author, @marieroque, Apr 5, 2024):
`+` does not behave like `or`: if the first part returns nothing and the second part returns something, the final result will be empty.
That is not what we want. The first case is for vintage and the second for CAPI, so we need to use `or`.
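A minimal illustration of that difference, assuming the metric `up` exists so that `absent(up)` is an empty vector (not taken from the PR):

```promql
absent(up) + vector(1)    # empty result: arithmetic needs matching series on both sides
absent(up) or vector(1)   # returns 1, since `or` falls back to the right-hand side
```

In this rule, only one of the vintage/CAPI operands has data on a given installation, so `+` would discard the populated side while `or` keeps it.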

sum by (name, installation, provider, pipeline) (
capi_cluster_status_phase{phase!="Deleting"}
) unless sum by (cluster_name) (
label_replace(kube_pod_container_status_running{container="prometheus",namespace=~".*-prometheus"},
"cluster_name", "$2", "pod", "(prometheus-)(.+)(-.+)"
) unless sum by (name, installation, provider, pipeline) (
label_replace(kube_pod_container_status_running{container="prometheus",namespace=~".*-prometheus"},
"name", "$2", "pod", "(prometheus-)(.+)(-.+)"
)
)
)
@@ -33,7 +33,7 @@ spec:
- alert: PrometheusOperatorListErrors
annotations:
description: Errors while performing List operations in controller {{`{{`}}$labels.controller{{`}}`}} in {{`{{`}}$labels.namespace{{`}}`}} namespace.
expr: (sum by (cluster_id, controller, namespace) (rate(prometheus_operator_list_operations_failed_total{app=~"prometheus-operator.*|kube-prometheus-.*"}[10m])) / sum by (cluster_id, controller,namespace) (rate(prometheus_operator_list_operations_total{app=~"prometheus-operator.*|kube-prometheus-.*"}[10m]))) > 0.4
expr: (sum by (cluster_id, installation, provider, pipeline, controller, namespace) (rate(prometheus_operator_list_operations_failed_total{app=~"prometheus-operator.*|kube-prometheus-.*"}[10m])) / sum by (cluster_id, installation, provider, pipeline, controller,namespace) (rate(prometheus_operator_list_operations_total{app=~"prometheus-operator.*|kube-prometheus-.*"}[10m]))) > 0.4
for: 15m
labels:
area: empowerment
@@ -44,7 +44,7 @@ spec:
- alert: PrometheusOperatorWatchErrors
annotations:
description: Errors while performing watch operations in controller {{`{{`}}$labels.controller{{`}}`}} in {{`{{`}}$labels.namespace{{`}}`}} namespace.
expr: (sum by (cluster_id, controller, namespace) (rate(prometheus_operator_watch_operations_failed_total{app=~"prometheus-operator.*|kube-prometheus-.*"}[10m])) / sum by (cluster_id, controller,namespace) (rate(prometheus_operator_watch_operations_total{app=~"prometheus-operator.*|kube-prometheus-.*"}[10m]))) > 0.4
expr: (sum by (cluster_id, installation, provider, pipeline, controller, namespace) (rate(prometheus_operator_watch_operations_failed_total{app=~"prometheus-operator.*|kube-prometheus-.*"}[10m])) / sum by (cluster_id, installation, provider, pipeline, controller,namespace) (rate(prometheus_operator_watch_operations_total{app=~"prometheus-operator.*|kube-prometheus-.*"}[10m]))) > 0.4
for: 15m
labels:
area: empowerment
@@ -66,7 +66,7 @@ spec:
- alert: PrometheusOperatorReconcileErrors
annotations:
description: '{{`{{`}} $value | humanizePercentage {{`}}`}} of reconciling operations failed for {{`{{`}} $labels.controller {{`}}`}} controller in {{`{{`}} $labels.namespace {{`}}`}} namespace.'
expr: (sum by (cluster_id, controller,namespace) (rate(prometheus_operator_reconcile_errors_total{app=~"prometheus-operator.*|kube-prometheus-.*"}[5m]))) / (sum by (cluster_id, controller,namespace) (rate(prometheus_operator_reconcile_operations_total{app=~"prometheus-operator.*|kube-prometheus-.*"}[5m]))) > 0.1
expr: (sum by (cluster_id, installation, provider, pipeline, controller,namespace) (rate(prometheus_operator_reconcile_errors_total{app=~"prometheus-operator.*|kube-prometheus-.*"}[5m]))) / (sum by (cluster_id, installation, provider, pipeline, controller,namespace) (rate(prometheus_operator_reconcile_operations_total{app=~"prometheus-operator.*|kube-prometheus-.*"}[5m]))) > 0.1
for: 10m
labels:
area: empowerment
@@ -88,7 +88,7 @@ spec:
- alert: PrometheusOperatorNotReady
annotations:
description: Prometheus operator in {{`{{`}} $labels.namespace {{`}}`}} namespace isn't ready to reconcile {{`{{`}} $labels.controller {{`}}`}} resources.
expr: min by (cluster_id, namespace, controller) (max_over_time(prometheus_operator_ready{app=~"prometheus-operator.*|kube-prometheus-.*"}[5m]) == 0)
expr: min by (cluster_id, installation, provider, pipeline, namespace, controller) (max_over_time(prometheus_operator_ready{app=~"prometheus-operator.*|kube-prometheus-.*"}[5m]) == 0)
for: 5m
labels:
area: empowerment