diff --git a/CHANGELOG.md b/CHANGELOG.md
index 21604b25..444294c9 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -10,6 +10,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 ### Added
 
 - Add `MimirHPAReachedMaxReplicas` alert, to detect when Mimir's HPAs have reached maximum capacity.
+- Add `MimirContinuousTestFailingOnWrites` and `MimirContinuousTestFailingOnReads` alerts.
 
 ### Changed
 
diff --git a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/mimir.rules.yml b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/mimir.rules.yml
index 2189daee..7c144bc8 100644
--- a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/mimir.rules.yml
+++ b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/mimir.rules.yml
@@ -182,4 +182,38 @@ spec:
         severity: page
         team: atlas
         topic: observability
+    - alert: MimirContinuousTestFailingOnWrites
+      annotations:
+        description: 'Mimir continuous-test detected errors in the write path.'
+        opsrecipe: mimir/
+      # Fires when the per-test rate of failed continuous-test writes stays above zero. Query is based on the following upstream mixin alerting rule: https://github.com/grafana/mimir/blob/main/operations/mimir-mixin-compiled/alerts.yaml#L1097
+      expr: sum by(cluster_id, installation, namespace, pipeline, provider, test) (rate(mimir_continuous_test_writes_failed_total[5m])) > 0
+      for: 1h
+      labels:
+        area: platform
+        cancel_if_cluster_status_creating: "true"
+        cancel_if_cluster_status_deleting: "true"
+        cancel_if_cluster_status_updating: "true"
+        cancel_if_outside_working_hours: "true"
+        # TODO dashboard:
+        severity: page
+        team: atlas
+        topic: observability
+    - alert: MimirContinuousTestFailingOnReads
+      annotations:
+        description: 'Mimir continuous-test detected errors in the read path.'
+        opsrecipe: mimir/
+      # Fires when the per-test rate of failed continuous-test queries stays above zero. Query is based on the following upstream mixin alerting rule: https://github.com/grafana/mimir/blob/main/operations/mimir-mixin-compiled/alerts.yaml#L1097
+      expr: sum by(cluster_id, installation, namespace, pipeline, provider, test) (rate(mimir_continuous_test_queries_failed_total[5m])) > 0
+      for: 1h
+      labels:
+        area: platform
+        cancel_if_cluster_status_creating: "true"
+        cancel_if_cluster_status_deleting: "true"
+        cancel_if_cluster_status_updating: "true"
+        cancel_if_outside_working_hours: "true"
+        # TODO dashboard:
+        severity: page
+        team: atlas
+        topic: observability
 {{- end }}