Skip to content

Commit

Permalink
Update queries
Browse files Browse the repository at this point in the history
  • Loading branch information
Marie Roque committed Jun 11, 2024
1 parent 64f9297 commit 56e693d
Show file tree
Hide file tree
Showing 2 changed files with 30 additions and 21 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,8 @@ spec:
description: '{{`Prometheus Mimir to Grafana-Cloud is down.`}}'
opsrecipe: mimir-grafana-cloud-exporter-failing/
dashboard: iWowmlSmk/prometheus?var-cluster=mimir-to-grafana-cloud
expr: up{job="mimir/mimir-to-grafana-cloud"} == 0
# We can use the absent() function here because the mimir-to-grafana-cloud Prometheus is an MC-only component
expr: up{job="mimir/mimir-to-grafana-cloud"} == 0 or absent(up{job="mimir/mimir-to-grafana-cloud"})
for: 30m
labels:
area: platform
Expand All @@ -32,13 +33,12 @@ spec:
opsrecipe: mimir-grafana-cloud-exporter-failing/
dashboard: promRW001/prometheus-remote-write
# Failures can occur with remote read and/or remote write
# For remote write, some errors can increase the number of failed samples (non-recoverable errors) and/or dropped samples (unknown errors)
# See https://grafana.com/docs/agent/latest/flow/reference/components/prometheus.remote_write/#debug-metrics
# For remote read, we check that the number of read queries is increasing
# For remote write, we check that the rate (over 10 minutes) of failed samples is not greater than 0 for 30 minutes
expr: |
(
sum by (cluster_id, installation, provider, pipeline) (
rate(prometheus_remote_storage_read_queries_total{job="mimir/mimir-to-grafana-cloud"}[10m]) == 0
or rate(prometheus_remote_storage_samples_failed_total{job="mimir/mimir-to-grafana-cloud"}[10m]) > 0
or rate(prometheus_remote_storage_samples_dropped_total{job="mimir/mimir-to-grafana-cloud"}[10m]) > 0
)
for: 30m
labels:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,23 @@ tests:
- series: 'up{job="mimir/mimir-to-grafana-cloud", cluster_id="golem", installation="golem", namespace="mimir", customer="giantswarm", pipeline="testing", provider="capa", region="eu-west-2"}'
values: "_x60 1+0x60 0+0x60 1+0x60"
alert_rule_test:
- alertname: MimirToGrafanaCloudExporterDown
eval_time: 50m
exp_alerts:
- exp_labels:
area: platform
severity: page
team: atlas
topic: observability
cancel_if_cluster_status_creating: "true"
cancel_if_cluster_status_deleting: "true"
cancel_if_cluster_status_updating: "true"
cancel_if_outside_working_hours: "true"
job: mimir/mimir-to-grafana-cloud
exp_annotations:
dashboard: "iWowmlSmk/prometheus?var-cluster=mimir-to-grafana-cloud"
description: "Prometheus Mimir to Grafana-Cloud is down."
opsrecipe: "mimir-grafana-cloud-exporter-failing/"
- alertname: MimirToGrafanaCloudExporterDown
eval_time: 70m
- alertname: MimirToGrafanaCloudExporterDown
Expand All @@ -23,14 +40,14 @@ tests:
cancel_if_cluster_status_deleting: "true"
cancel_if_cluster_status_updating: "true"
cancel_if_outside_working_hours: "true"
cluster_id: "golem"
customer: "giantswarm"
installation: "golem"
namespace: "mimir"
job: "mimir/mimir-to-grafana-cloud"
pipeline: "testing"
provider: "capa"
region: "eu-west-2"
cluster_id: golem
customer: giantswarm
installation: golem
namespace: mimir
job: mimir/mimir-to-grafana-cloud
pipeline: testing
provider: capa
region: eu-west-2
exp_annotations:
dashboard: "iWowmlSmk/prometheus?var-cluster=mimir-to-grafana-cloud"
description: "Prometheus Mimir to Grafana-Cloud is down."
Expand Down Expand Up @@ -62,13 +79,9 @@ tests:
cancel_if_cluster_status_updating: "true"
cancel_if_outside_working_hours: "true"
cluster_id: "golem"
customer: "giantswarm"
installation: "golem"
namespace: "mimir"
job: "mimir/mimir-to-grafana-cloud"
pipeline: "testing"
provider: "capa"
region: "eu-west-2"
exp_annotations:
dashboard: "promRW001/prometheus-remote-write"
description: "Prometheus Mimir to Grafana-Cloud is failing to read or write data."
Expand All @@ -88,13 +101,9 @@ tests:
cancel_if_cluster_status_updating: "true"
cancel_if_outside_working_hours: "true"
cluster_id: "golem"
customer: "giantswarm"
installation: "golem"
namespace: "mimir"
job: "mimir/mimir-to-grafana-cloud"
pipeline: "testing"
provider: "capa"
region: "eu-west-2"
exp_annotations:
dashboard: "promRW001/prometheus-remote-write"
description: "Prometheus Mimir to Grafana-Cloud is failing to read or write data."
Expand Down

0 comments on commit 56e693d

Please sign in to comment.