diff --git a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/mimir-to-grafana-cloud-exporter.rules.yml b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/mimir-to-grafana-cloud-exporter.rules.yml index 6e82298a5..6136b0816 100644 --- a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/mimir-to-grafana-cloud-exporter.rules.yml +++ b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/mimir-to-grafana-cloud-exporter.rules.yml @@ -15,7 +15,8 @@ spec: description: '{{`Prometheus Mimir to Grafana-Cloud is down.`}}' opsrecipe: mimir-grafana-cloud-exporter-failing/ dashboard: iWowmlSmk/prometheus?var-cluster=mimir-to-grafana-cloud - expr: up{job="mimir/mimir-to-grafana-cloud"} == 0 + # We can use the absent() function here because the mimir-to-grafana-cloud Prometheus is an MC-only component + expr: up{job="mimir/mimir-to-grafana-cloud"} == 0 or absent(up{job="mimir/mimir-to-grafana-cloud"}) for: 30m labels: area: platform @@ -32,13 +33,12 @@ spec: opsrecipe: mimir-grafana-cloud-exporter-failing/ dashboard: promRW001/prometheus-remote-write # We can have encountered failures with remote read and/or remote write - # For remote write, some errors can increased the number of samples failed (non-recoverable errors) and/or dropped (unknown errors) - # See https://grafana.com/docs/agent/latest/flow/reference/components/prometheus.remote_write/#debug-metrics + # For remote read, we check that the number of read queries keeps increasing (a rate of 0 means reads have stalled) + # For remote write, we check that the rate (over 10 minutes) of failed samples does not stay greater than 0 for 30 minutes expr: | - ( + sum by (cluster_id, installation, provider, pipeline) ( rate(prometheus_remote_storage_read_queries_total{job="mimir/mimir-to-grafana-cloud"}[10m]) == 0 or rate(prometheus_remote_storage_samples_failed_total{job="mimir/mimir-to-grafana-cloud"}[10m]) > 0 - or rate(prometheus_remote_storage_samples_dropped_total{job="mimir/mimir-to-grafana-cloud"}[10m]) > 0 ) for: 30m labels: 
diff --git a/test/tests/providers/capi/capa-mimir/platform/atlas/alerting-rules/mimir-to-grafana-cloud-exporter.rules.test.yml b/test/tests/providers/capi/capa-mimir/platform/atlas/alerting-rules/mimir-to-grafana-cloud-exporter.rules.test.yml index 44fb078fd..e9bf7e719 100644 --- a/test/tests/providers/capi/capa-mimir/platform/atlas/alerting-rules/mimir-to-grafana-cloud-exporter.rules.test.yml +++ b/test/tests/providers/capi/capa-mimir/platform/atlas/alerting-rules/mimir-to-grafana-cloud-exporter.rules.test.yml @@ -9,6 +9,23 @@ tests: - series: 'up{job="mimir/mimir-to-grafana-cloud", cluster_id="golem", installation="golem", namespace="mimir", customer="giantswarm", pipeline="testing", provider="capa", region="eu-west-2"}' values: "_x60 1+0x60 0+0x60 1+0x60" alert_rule_test: + - alertname: MimirToGrafanaCloudExporterDown + eval_time: 50m + exp_alerts: + - exp_labels: + area: platform + severity: page + team: atlas + topic: observability + cancel_if_cluster_status_creating: "true" + cancel_if_cluster_status_deleting: "true" + cancel_if_cluster_status_updating: "true" + cancel_if_outside_working_hours: "true" + job: mimir/mimir-to-grafana-cloud + exp_annotations: + dashboard: "iWowmlSmk/prometheus?var-cluster=mimir-to-grafana-cloud" + description: "Prometheus Mimir to Grafana-Cloud is down." 
+ opsrecipe: "mimir-grafana-cloud-exporter-failing/" - alertname: MimirToGrafanaCloudExporterDown eval_time: 70m - alertname: MimirToGrafanaCloudExporterDown @@ -23,14 +40,14 @@ tests: cancel_if_cluster_status_deleting: "true" cancel_if_cluster_status_updating: "true" cancel_if_outside_working_hours: "true" - cluster_id: "golem" - customer: "giantswarm" - installation: "golem" - namespace: "mimir" - job: "mimir/mimir-to-grafana-cloud" - pipeline: "testing" - provider: "capa" - region: "eu-west-2" + cluster_id: golem + customer: giantswarm + installation: golem + namespace: mimir + job: mimir/mimir-to-grafana-cloud + pipeline: testing + provider: capa + region: eu-west-2 exp_annotations: dashboard: "iWowmlSmk/prometheus?var-cluster=mimir-to-grafana-cloud" description: "Prometheus Mimir to Grafana-Cloud is down." @@ -62,13 +79,9 @@ tests: cancel_if_cluster_status_updating: "true" cancel_if_outside_working_hours: "true" cluster_id: "golem" - customer: "giantswarm" installation: "golem" - namespace: "mimir" - job: "mimir/mimir-to-grafana-cloud" pipeline: "testing" provider: "capa" - region: "eu-west-2" exp_annotations: dashboard: "promRW001/prometheus-remote-write" description: "Prometheus Mimir to Grafana-Cloud is failing to read or write data." @@ -88,13 +101,9 @@ tests: cancel_if_cluster_status_updating: "true" cancel_if_outside_working_hours: "true" cluster_id: "golem" - customer: "giantswarm" installation: "golem" - namespace: "mimir" - job: "mimir/mimir-to-grafana-cloud" pipeline: "testing" provider: "capa" - region: "eu-west-2" exp_annotations: dashboard: "promRW001/prometheus-remote-write" description: "Prometheus Mimir to Grafana-Cloud is failing to read or write data."