merge
QuantumEnigmaa committed Jun 5, 2024
2 parents 4873f15 + fc52fcc commit 0111d28
Showing 2 changed files with 16 additions and 25 deletions.
File 1 of 2
@@ -4,12 +4,11 @@ kind: PrometheusRule
metadata:
labels:
{{- include "labels.common" . | nindent 4 }}
- cluster_type: "management_cluster"
name: mimir-to-grafana-cloud-exporter.rules
namespace: {{ .Values.namespace }}
spec:
groups:
- - name: prometheus-mimir-to-grafana-cloud
+ - name: mimir-to-grafana-cloud-exporter
rules:
- alert: MimirToGrafanaCloudExporterDown
annotations:
@@ -32,11 +31,14 @@ spec:
description: '{{`Prometheus Mimir to Grafana-Cloud is failing to read or write data.`}}'
opsrecipe: mimir-grafana-cloud-exporter-failing/
dashboard: promRW001/prometheus-remote-write
+ # We can have encountered failures with remote read and/or remote write
+ # For remote write, some errors can increase the number of samples failed (non-recoverable errors) and/or dropped (unknown errors)
+ # See https://grafana.com/docs/agent/latest/flow/reference/components/prometheus.remote_write/#debug-metrics
expr: |
(
- rate(prometheus_remote_storage_read_queries_total{job="mimir/mimir-to-grafana-cloud"}[2m]) == 0
- or rate(prometheus_remote_storage_samples_failed_total{job="mimir/mimir-to-grafana-cloud"}[2m]) > 0
- or rate(prometheus_remote_storage_samples_dropped_total{job="mimir/mimir-to-grafana-cloud"}[2m]) > 0
+ rate(prometheus_remote_storage_read_queries_total{job="mimir/mimir-to-grafana-cloud"}[10m]) == 0
+ or rate(prometheus_remote_storage_samples_failed_total{job="mimir/mimir-to-grafana-cloud"}[10m]) > 0
+ or rate(prometheus_remote_storage_samples_dropped_total{job="mimir/mimir-to-grafana-cloud"}[10m]) > 0
)
for: 30m
labels:
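Note: the comments added above refer to the agent's remote-write debug metrics: non-recoverable errors show up in prometheus_remote_storage_samples_failed_total, unknown errors in prometheus_remote_storage_samples_dropped_total. When the alert fires, graphing the two counters separately helps tell the failure modes apart. A minimal PromQL sketch, assuming the same job selector as the rule and that the counters carry the usual remote_name/url labels (those labels are not shown in this diff):

# non-recoverable remote-write errors, per remote endpoint (sketch)
sum by (remote_name, url) (rate(prometheus_remote_storage_samples_failed_total{job="mimir/mimir-to-grafana-cloud"}[10m]))
# samples dropped for unknown reasons, per remote endpoint (sketch)
sum by (remote_name, url) (rate(prometheus_remote_storage_samples_dropped_total{job="mimir/mimir-to-grafana-cloud"}[10m]))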
File 2 of 2
@@ -6,7 +6,7 @@ tests:
# Tests for `MimirToGrafanaCloudExporterDown` alert
- interval: 1m
input_series:
- - series: 'up{job="mimir/mimir-to-grafana-cloud", cluster_id="golem", cluster_type="management_cluster", installation="golem", namespace="mimir", customer="giantswarm", pipeline="testing", provider="capa", region="eu-west-2"}'
+ - series: 'up{job="mimir/mimir-to-grafana-cloud", cluster_id="golem", installation="golem", namespace="mimir", customer="giantswarm", pipeline="testing", provider="capa", region="eu-west-2"}'
values: "_x60 1+0x60 0+0x60 1+0x60"
alert_rule_test:
- alertname: MimirToGrafanaCloudExporterDown
@@ -24,7 +24,6 @@ tests:
cancel_if_cluster_status_updating: "true"
cancel_if_outside_working_hours: "true"
cluster_id: "golem"
- cluster_type: "management_cluster"
customer: "giantswarm"
installation: "golem"
namespace: "mimir"
@@ -42,8 +41,11 @@ tests:
- interval: 1m
input_series:
# remote read is working for 2 hours and then fails for 1 hour
- - series: 'prometheus_remote_storage_read_queries_total{job="mimir/mimir-to-grafana-cloud", cluster_id="golem", cluster_type="management_cluster", customer="giantswarm", installation="golem", namespace="mimir", pipeline="testing", provider="capa", region="eu-west-2"}'
- values: "_x60 10+0x60 0+0x60 10+0x60"
+ - series: 'prometheus_remote_storage_read_queries_total{job="mimir/mimir-to-grafana-cloud", cluster_id="golem", customer="giantswarm", installation="golem", namespace="mimir", pipeline="testing", provider="capa", region="eu-west-2"}'
+ values: "_x60 0+10x60 0+0x60 0+10x180"
+ # remote write has no failure for 4 hours and then fails for 2 hours
+ - series: 'prometheus_remote_storage_samples_failed_total{job="mimir/mimir-to-grafana-cloud", cluster_id="golem", customer="giantswarm", installation="golem", namespace="mimir", pipeline="testing", provider="capa", region="eu-west-2"}'
+ values: "_x60 0+0x180 0+10x120"
alert_rule_test:
- alertname: MimirToGrafanaCloudExporterFailures
eval_time: 70m
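A note on the promtool series notation used above: with the declared 1m test interval, values uses Prometheus' expanding notation, so the new read-queries series encodes the phases described in the comment. An annotated sketch of that line (sample counts are approximate):

values: "_x60 0+10x60 0+0x60 0+10x180"
# _x60     -> about an hour with no samples (target not scraped yet)
# 0+10x60  -> counter rising by 10 per minute for about an hour (reads succeeding)
# 0+0x60   -> counter flat for about an hour, so rate() == 0 (reads failing)
# 0+10x180 -> counter rising again for about three hours (reads recovered)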
@@ -60,7 +62,6 @@
cancel_if_cluster_status_updating: "true"
cancel_if_outside_working_hours: "true"
cluster_id: "golem"
- cluster_type: "management_cluster"
customer: "giantswarm"
installation: "golem"
namespace: "mimir"
@@ -74,17 +75,8 @@
opsrecipe: "mimir-grafana-cloud-exporter-failing/"
- alertname: MimirToGrafanaCloudExporterFailures
eval_time: 200m
- # Tests for `PrometheusMimirToGrafanaCloudTooManyRestarts` alert
- - interval: 1m
- input_series:
- # prometheus-buddy is up for 1 hour and then fails for 1 hour
- - series: 'kube_pod_status_ready{status="ready", container="prometheus", pod="prometheus-mimir-to-grafana-cloud-0", cluster_id="golem", cluster_type="management_cluster", customer="giantswarm", installation="golem", namespace="mimir", pipeline="testing", provider="capa", region="eu-west-2"}'
- values: "_x30 1+0x60 0+0x2 1+0x2 0+0x2 1+0x2 0+0x2 1+0x60"
- alert_rule_test:
- - alertname: PrometheusMimirToGrafanaCloudTooManyRestarts
- eval_time: 70m
- - alertname: PrometheusMimirToGrafanaCloudTooManyRestarts
- eval_time: 110m
+ - alertname: MimirToGrafanaCloudExporterFailures
+ eval_time: 280m
exp_alerts:
- exp_labels:
area: platform
@@ -96,7 +88,6 @@
cancel_if_cluster_status_updating: "true"
cancel_if_outside_working_hours: "true"
cluster_id: "golem"
- cluster_type: "management_cluster"
customer: "giantswarm"
installation: "golem"
namespace: "mimir"
@@ -106,7 +97,5 @@
region: "eu-west-2"
exp_annotations:
dashboard: "promRW001/prometheus-remote-write"
description: "Prometheus Mimir to Grafana-Cloud is restarting too much."
description: "Prometheus Mimir to Grafana-Cloud is failing to read or write data."
opsrecipe: "mimir-grafana-cloud-exporter-failing/"
- - alertname: PrometheusMimirToGrafanaCloudTooManyRestarts
- eval_time: 150m
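These files are promtool unit tests, so the change can be verified locally. A minimal sketch, assuming the Helm template has been rendered and the rule group extracted into a plain Prometheus rule file that the test file's rule_files entry points at (the file name below is a placeholder, not the repository's actual path):

# run the rule unit tests with promtool (sketch)
promtool test rules mimir-to-grafana-cloud-exporter.rules.test.yml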
