Skip to content

Commit

Permalink
fix tests
Browse files Browse the repository at this point in the history
  • Loading branch information
QuentinBisson committed Nov 12, 2024
1 parent 5d3aa9e commit 4963d56
Show file tree
Hide file tree
Showing 3 changed files with 167 additions and 8 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -174,9 +174,9 @@ spec:
- alert: MimirContinuousTestFailingOnWrites
annotations:
dashboard: mimir-continous-test/mimir-continous-test
description: 'Mimir continous-test detected errors in the write path.'
description: '{{`Mimir continuous test {{ $labels.test }} in {{ $labels.cluster_id }}/{{ $labels.namespace }} is not effectively running because writes are failing.`}}'
opsrecipe: mimir/
# Query is based on the following upstream mixin alerting rule : https://github.com/grafana/mimir/blob/main/operations/mimir-mixin-compiled/alerts.yaml#L1097
# Query is based on the following upstream mixin alerting rule: https://github.com/grafana/mimir/blob/b873372adbf0996bff70de55934f3dd4a10c7b89/operations/mimir-mixin-compiled/alerts.yaml#L1196
expr: sum by(cluster_id, installation, namespace, pipeline, provider, test) (rate(mimir_continuous_test_writes_failed_total[5m])) > 0
for: 1h
labels:
Expand All @@ -191,9 +191,9 @@ spec:
- alert: MimirContinuousTestFailingOnReads
annotations:
dashboard: mimir-continous-test/mimir-continous-test
description: 'Mimir continous-test detected errors in the write path.'
description: '{{`Mimir continuous test {{ $labels.test }} in {{ $labels.cluster_id }}/{{ $labels.namespace }} is not effectively running because queries are failing.`}}'
opsrecipe: mimir/
# Query is based on the following upstream mixin alerting rule : https://github.com/grafana/mimir/blob/main/operations/mimir-mixin-compiled/alerts.yaml#L1097
# Query is based on the following upstream mixin alerting rule: https://github.com/grafana/mimir/blob/b873372adbf0996bff70de55934f3dd4a10c7b89/operations/mimir-mixin-compiled/alerts.yaml#L1185
expr: sum by(cluster_id, installation, namespace, pipeline, provider, test) (rate(mimir_continuous_test_queries_failed_total[5m])) > 0
for: 1h
labels:
Expand All @@ -205,4 +205,21 @@ spec:
severity: page
team: atlas
topic: observability
# Pages when the per-test rate of failed query result checks stays above zero for one hour.
- alert: MimirContinuousTestFailed
annotations:
dashboard: mimir-continous-test/mimir-continous-test
# NOTE(review): this description is the same text as the MimirContinuousTestFailingOnReads alert
# ("queries are failing"), but the expression below counts failed query result checks, not failed queries.
# Looks like a copy-paste; confirm the wording against the upstream mixin rule referenced below.
# The unit tests assert this exact string, so any rewording must be applied to them in the same change.
description: '{{`Mimir continuous test {{ $labels.test }} in {{ $labels.cluster_id }}/{{ $labels.namespace }} is not effectively running because queries are failing.`}}'
opsrecipe: mimir/
# Query is based on the following upstream mixin alerting rule: https://github.com/grafana/mimir/blob/b873372adbf0996bff70de55934f3dd4a10c7b89/operations/mimir-mixin-compiled/alerts.yaml#L1205
# Note: this rule uses a 10m rate window.
expr: sum by(cluster_id, installation, pipeline, provider, namespace, test) (rate(mimir_continuous_test_query_result_checks_failed_total[10m])) > 0
for: 1h
labels:
area: platform
cancel_if_cluster_status_creating: "true"
cancel_if_cluster_status_deleting: "true"
cancel_if_cluster_status_updating: "true"
cancel_if_outside_working_hours: "true"
severity: page
team: atlas
topic: observability
{{- end }}
Original file line number Diff line number Diff line change
Expand Up @@ -395,7 +395,7 @@ tests:
- interval: 1m
input_series:
# Test: none, rate > 0, rate = 0
- series: 'mimir_continuous_test_writes_failed_total{cluster_id="golem", installation="golem", namespace="mimir", pipeline="testing", provider="capa"}'
- series: 'mimir_continuous_test_writes_failed_total{cluster_id="golem", test="continuous-test", installation="golem", namespace="mimir", pipeline="testing", provider="capa"}'
values: "_x20 1+1x80 0+0x70"
alert_rule_test:
- alertname: MimirContinuousTestFailingOnWrites
Expand All @@ -412,12 +412,15 @@ tests:
cluster_id: golem
installation: golem
namespace: mimir
pipeline: testing
provider: capa
severity: page
team: atlas
test: continuous-test
topic: observability
exp_annotations:
dashboard: mimir-continous-test/mimir-continous-test
description: "Mimir continous-test detected errors in the write path."
description: "Mimir continuous test continuous-test in golem/mimir is not effectively running because writes are failing."
opsrecipe: "mimir/"
- alertname: MimirContinuousTestFailingOnWrites
eval_time: 160m
Expand All @@ -426,7 +429,7 @@ tests:
- interval: 1m
input_series:
# Test: none, rate > 0, rate = 0
- series: 'mimir_continuous_test_queries_failed_total{cluster_id="golem", installation="golem", namespace="mimir", pipeline="testing", provider="capa"}'
- series: 'mimir_continuous_test_queries_failed_total{cluster_id="golem", test="continuous-test", installation="golem", namespace="mimir", pipeline="testing", provider="capa"}'
values: "_x20 1+1x80 0+0x70"
alert_rule_test:
- alertname: MimirContinuousTestFailingOnReads
Expand All @@ -443,12 +446,49 @@ tests:
cluster_id: golem
installation: golem
namespace: mimir
pipeline: testing
provider: capa
severity: page
team: atlas
test: continuous-test
topic: observability
exp_annotations:
dashboard: mimir-continous-test/mimir-continous-test
description: "Mimir continous-test detected errors in the write path."
description: "Mimir continuous test continuous-test in golem/mimir is not effectively running because queries are failing."
opsrecipe: "mimir/"
- alertname: MimirContinuousTestFailingOnReads
eval_time: 160m

# Test for MimirContinuousTestFailed alert
# Feeds one failed-result-check counter through three phases and checks the alert state in each phase.
- interval: 1m
input_series:
# Test: none, rate > 0, rate = 0
- series: 'mimir_continuous_test_query_result_checks_failed_total{cluster_id="golem", test="continuous-test", installation="golem", namespace="mimir", pipeline="testing", provider="capa"}'
# Expanding notation: a run of missing samples, then a counter growing by 1 per interval, then a flat 0.
values: "_x20 1+1x80 0+0x70"
alert_rule_test:
# No exp_alerts listed: nothing is expected to be firing yet at this evaluation time.
- alertname: MimirContinuousTestFailed
eval_time: 40m
# The alert is expected to be firing here with exactly the labels and annotations below.
- alertname: MimirContinuousTestFailed
eval_time: 95m
exp_alerts:
- exp_labels:
area: platform
cancel_if_outside_working_hours: "true"
cancel_if_cluster_status_creating: "true"
cancel_if_cluster_status_deleting: "true"
cancel_if_cluster_status_updating: "true"
cluster_id: golem
installation: golem
namespace: mimir
pipeline: testing
provider: capa
severity: page
team: atlas
topic: observability
test: continuous-test
exp_annotations:
dashboard: mimir-continous-test/mimir-continous-test
# NOTE(review): this expected text says "queries are failing", the same wording as the reads alert test;
# it must stay in sync with the rule's description annotation — confirm the wording is intended.
description: "Mimir continuous test continuous-test in golem/mimir is not effectively running because queries are failing."
opsrecipe: "mimir/"
# No exp_alerts listed: the alert is expected to be gone once the counter has stopped increasing.
- alertname: MimirContinuousTestFailed
eval_time: 160m
Original file line number Diff line number Diff line change
Expand Up @@ -390,3 +390,105 @@ tests:
eval_time: 205m
- alertname: MimirCompactorFailedCompaction
eval_time: 350m

# Test for MimirContinuousTestFailingOnWrites alert
# Feeds one failed-writes counter through three phases and checks the alert state in each phase.
- interval: 1m
input_series:
# Test: none, rate > 0, rate = 0
- series: 'mimir_continuous_test_writes_failed_total{cluster_id="golem", test="continuous-test", installation="golem", namespace="mimir", pipeline="testing", provider="capa"}'
# Expanding notation: a run of missing samples, then a counter growing by 1 per interval, then a flat 0.
values: "_x20 1+1x80 0+0x70"
alert_rule_test:
# No exp_alerts listed: nothing is expected to be firing yet at this evaluation time.
- alertname: MimirContinuousTestFailingOnWrites
eval_time: 40m
# The alert is expected to be firing here with exactly the labels and annotations below.
- alertname: MimirContinuousTestFailingOnWrites
eval_time: 95m
exp_alerts:
- exp_labels:
area: platform
cancel_if_outside_working_hours: "true"
cancel_if_cluster_status_creating: "true"
cancel_if_cluster_status_deleting: "true"
cancel_if_cluster_status_updating: "true"
cluster_id: golem
installation: golem
namespace: mimir
pipeline: testing
provider: capa
severity: page
team: atlas
test: continuous-test
topic: observability
exp_annotations:
dashboard: mimir-continous-test/mimir-continous-test
# Rendered description: the test, cluster_id and namespace label values of the series are substituted in.
description: "Mimir continuous test continuous-test in golem/mimir is not effectively running because writes are failing."
opsrecipe: "mimir/"
# No exp_alerts listed: the alert is expected to be gone once the counter has stopped increasing.
- alertname: MimirContinuousTestFailingOnWrites
eval_time: 160m

# Test for MimirContinuousTestFailingOnReads alert
# Feeds one failed-queries counter through three phases and checks the alert state in each phase.
- interval: 1m
input_series:
# Test: none, rate > 0, rate = 0
- series: 'mimir_continuous_test_queries_failed_total{cluster_id="golem", test="continuous-test", installation="golem", namespace="mimir", pipeline="testing", provider="capa"}'
# Expanding notation: a run of missing samples, then a counter growing by 1 per interval, then a flat 0.
values: "_x20 1+1x80 0+0x70"
alert_rule_test:
# No exp_alerts listed: nothing is expected to be firing yet at this evaluation time.
- alertname: MimirContinuousTestFailingOnReads
eval_time: 40m
# The alert is expected to be firing here with exactly the labels and annotations below.
- alertname: MimirContinuousTestFailingOnReads
eval_time: 95m
exp_alerts:
- exp_labels:
area: platform
cancel_if_outside_working_hours: "true"
cancel_if_cluster_status_creating: "true"
cancel_if_cluster_status_deleting: "true"
cancel_if_cluster_status_updating: "true"
cluster_id: golem
installation: golem
namespace: mimir
pipeline: testing
provider: capa
severity: page
team: atlas
test: continuous-test
topic: observability
exp_annotations:
dashboard: mimir-continous-test/mimir-continous-test
# Rendered description: the test, cluster_id and namespace label values of the series are substituted in.
description: "Mimir continuous test continuous-test in golem/mimir is not effectively running because queries are failing."
opsrecipe: "mimir/"
# No exp_alerts listed: the alert is expected to be gone once the counter has stopped increasing.
- alertname: MimirContinuousTestFailingOnReads
eval_time: 160m

# Test for MimirContinuousTestFailed alert
# Feeds one failed-result-check counter through three phases and checks the alert state in each phase.
- interval: 1m
input_series:
# Test: none, rate > 0, rate = 0
- series: 'mimir_continuous_test_query_result_checks_failed_total{cluster_id="golem", test="continuous-test", installation="golem", namespace="mimir", pipeline="testing", provider="capa"}'
# Expanding notation: a run of missing samples, then a counter growing by 1 per interval, then a flat 0.
values: "_x20 1+1x80 0+0x70"
alert_rule_test:
# No exp_alerts listed: nothing is expected to be firing yet at this evaluation time.
- alertname: MimirContinuousTestFailed
eval_time: 40m
# The alert is expected to be firing here with exactly the labels and annotations below.
- alertname: MimirContinuousTestFailed
eval_time: 95m
exp_alerts:
- exp_labels:
area: platform
cancel_if_outside_working_hours: "true"
cancel_if_cluster_status_creating: "true"
cancel_if_cluster_status_deleting: "true"
cancel_if_cluster_status_updating: "true"
cluster_id: golem
installation: golem
namespace: mimir
pipeline: testing
provider: capa
severity: page
team: atlas
topic: observability
test: continuous-test
exp_annotations:
dashboard: mimir-continous-test/mimir-continous-test
# NOTE(review): this expected text says "queries are failing", the same wording as the reads alert test;
# it must stay in sync with the rule's description annotation — confirm the wording is intended.
description: "Mimir continuous test continuous-test in golem/mimir is not effectively running because queries are failing."
opsrecipe: "mimir/"
# No exp_alerts listed: the alert is expected to be gone once the counter has stopped increasing.
- alertname: MimirContinuousTestFailed
eval_time: 160m

0 comments on commit 4963d56

Please sign in to comment.