Skip to content

Commit

Permalink
Increase prometheus agent inhibition sensitivity
Browse files Browse the repository at this point in the history
  • Loading branch information
hervenicol committed Nov 14, 2023
1 parent 9e303c6 commit fdfe6d1
Show file tree
Hide file tree
Showing 3 changed files with 150 additions and 2 deletions.
6 changes: 6 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,12 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

## [Unreleased]

### Changed

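- Split each prometheus-agent alert (`PrometheusAgentFailing` and `PrometheusAgentShardsMissing`) in two:
- the existing paging alerts now fire later (`for` raised from 10m to 20m)
- new inhibition alerts (`PrometheusAgentFailingInhibition` and `PrometheusAgentShardsMissingInhibition`) fire earlier (`for: 1m`) and do not page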

## [2.140.2] - 2023-11-13

### Fixed
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ spec:
absent(up{instance="prometheus-agent"}) == 1
)[5m:]
)
for: 10m
for: 20m
labels:
area: empowerment
severity: page
Expand All @@ -38,6 +38,33 @@ spec:
cancel_if_cluster_is_not_running_prometheus_agent: "true"
cancel_if_cluster_status_creating: "true"
cancel_if_cluster_status_deleting: "true"
## Same as PrometheusAgentFailing, but triggers inhibition earlier and does not page.
## The expression matches when the `up` series for instance "prometheus-agent" is 0 or absent.
## The surrounding max_over_time(...[5m:]) subquery keeps a match visible for 5 minutes.
## Compared with the paging alert, this rule uses `for: 1m` (instead of 20m) and `severity: none` (instead of page).
- alert: PrometheusAgentFailingInhibition
annotations:
description: '{{`Prometheus agent remote write is failing.`}}'
summary: Prometheus agent fails to send samples to remote write endpoint.
opsrecipe: prometheus-agent-remote-write-failed/
dashboard: promRW001/prometheus-remote-write
# NOTE(review): the next line is a commented-out alternative expression; confirm whether it is still needed.
# expr: count(absent_over_time(up{instance="prometheus-agent"}[10m]))
expr: |-
max_over_time(
sum by (cluster_type, cluster_id, installation, instance, service)
(
up{instance="prometheus-agent"} == 0
or
absent(up{instance="prometheus-agent"}) == 1
)[5m:]
)
for: 1m
labels:
area: empowerment
# "none" because this alert is not meant to page.
severity: none
team: atlas
topic: observability
# NOTE(review): presumably matched as a source label by an Alertmanager inhibit rule defined elsewhere; confirm.
inhibit_prometheus_agent_down: "true"
# NOTE(review): the cancel_if_* labels presumably let other alerts inhibit this one; confirm against the Alertmanager config.
cancel_if_cluster_is_not_running_prometheus_agent: "true"
cancel_if_cluster_status_creating: "true"
cancel_if_cluster_status_deleting: "true"
## Page Atlas if prometheus agent is missing shards to send samples to MC prometheus.
- alert: PrometheusAgentShardsMissing
annotations:
Expand All @@ -63,7 +90,7 @@ spec:
)
)
)[5m:])
for: 10m
for: 20m
labels:
area: empowerment
severity: page
Expand All @@ -74,4 +101,40 @@ spec:
cancel_if_cluster_status_creating: "true"
cancel_if_cluster_status_deleting: "true"
cancel_if_outside_working_hours: "true"
## Same as PrometheusAgentShardsMissing but triggers inhibition earlier, and does not page.
## The expression compares the number of prometheus_remote_storage_metadata_total series whose
## remote_name is neither "grafana-cloud" nor "mimir" with the shard count reported for the
## "prometheus-agent" Prometheus CR, and matches when the two differ.
## The surrounding max_over_time(...[5m:]) subquery keeps a match visible for 5 minutes.
## Compared with the paging alert, this rule uses `for: 1m` (instead of 20m) and `severity: none` (instead of page).
- alert: PrometheusAgentShardsMissingInhibition
annotations:
description: '{{`Prometheus agent is missing shards.`}}'
summary: Prometheus agent is missing shards.
opsrecipe: prometheus-agent-missing-shards/
expr: |-
max_over_time(sum(
count(
## number of remotes that are not mimir or grafana-cloud
prometheus_remote_storage_metadata_total{remote_name!~"grafana-cloud|mimir"}
)
!=
sum(
## number of shards defined in the Prometheus CR
prometheus_operator_spec_shards{controller="prometheus",name="prometheus-agent"}
or
(
# if there is only 1 shard, there is no shard metric so we use the replicas metric
absent(prometheus_operator_spec_shards{controller="prometheus",name="prometheus-agent"})
and on(controller, name)
prometheus_operator_spec_replicas{controller="prometheus",name="prometheus-agent"}
)
)
)[5m:])
for: 1m
labels:
area: empowerment
# "none" because this alert is not meant to page.
severity: none
team: atlas
topic: observability
# NOTE(review): presumably matched as a source label by an Alertmanager inhibit rule defined elsewhere; confirm.
inhibit_prometheus_agent_down: "true"
# NOTE(review): the cancel_if_* labels presumably let other alerts inhibit this one; confirm against the Alertmanager config.
cancel_if_cluster_is_not_running_prometheus_agent: "true"
cancel_if_cluster_status_creating: "true"
cancel_if_cluster_status_deleting: "true"
# NOTE(review): also set on the paging PrometheusAgentShardsMissing alert; confirm it is intended on this non-paging inhibition alert.
cancel_if_outside_working_hours: "true"
{{- end }}
79 changes: 79 additions & 0 deletions test/tests/providers/global/prometheus-agent.rules.test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,24 @@ tests:
description: "Prometheus agent remote write is failing."
opsrecipe: "prometheus-agent-remote-write-failed/"
summary: "Prometheus agent fails to send samples to remote write endpoint."
- alertname: PrometheusAgentFailingInhibition
eval_time: 30m
exp_alerts:
- exp_labels:
area: empowerment
severity: none
team: atlas
topic: observability
inhibit_prometheus_agent_down: "true"
instance: prometheus-agent
cancel_if_cluster_is_not_running_prometheus_agent: "true"
cancel_if_cluster_status_creating: "true"
cancel_if_cluster_status_deleting: "true"
exp_annotations:
dashboard: "promRW001/prometheus-remote-write"
description: "Prometheus agent remote write is failing."
opsrecipe: "prometheus-agent-remote-write-failed/"
summary: "Prometheus agent fails to send samples to remote write endpoint."
- alertname: PrometheusAgentFailing
eval_time: 90m
exp_alerts:
Expand All @@ -48,8 +66,31 @@ tests:
description: "Prometheus agent remote write is failing."
opsrecipe: "prometheus-agent-remote-write-failed/"
summary: "Prometheus agent fails to send samples to remote write endpoint."
- alertname: PrometheusAgentFailingInhibition
eval_time: 90m
exp_alerts:
- exp_labels:
area: empowerment
cluster_id: gauss
cluster_type: workload_cluster
severity: none
team: atlas
topic: observability
inhibit_prometheus_agent_down: "true"
installation: myinstall
instance: prometheus-agent
cancel_if_cluster_is_not_running_prometheus_agent: "true"
cancel_if_cluster_status_creating: "true"
cancel_if_cluster_status_deleting: "true"
exp_annotations:
dashboard: "promRW001/prometheus-remote-write"
description: "Prometheus agent remote write is failing."
opsrecipe: "prometheus-agent-remote-write-failed/"
summary: "Prometheus agent fails to send samples to remote write endpoint."
- alertname: PrometheusAgentFailing
eval_time: 150m
- alertname: PrometheusAgentFailingInhibition
eval_time: 150m
# Tests for `PrometheusAgentShardsMissing` alert
- interval: 1m
input_series:
Expand All @@ -66,6 +107,8 @@ tests:
alert_rule_test:
- alertname: PrometheusAgentShardsMissing
eval_time: 40m
- alertname: PrometheusAgentShardsMissingInhibition
eval_time: 40m
- alertname: PrometheusAgentShardsMissing
eval_time: 100m
exp_alerts:
Expand All @@ -83,6 +126,23 @@ tests:
description: "Prometheus agent is missing shards."
opsrecipe: "prometheus-agent-missing-shards/"
summary: "Prometheus agent is missing shards."
- alertname: PrometheusAgentShardsMissingInhibition
eval_time: 100m
exp_alerts:
- exp_labels:
area: empowerment
severity: none
team: atlas
topic: observability
inhibit_prometheus_agent_down: "true"
cancel_if_cluster_is_not_running_prometheus_agent: "true"
cancel_if_cluster_status_creating: "true"
cancel_if_cluster_status_deleting: "true"
cancel_if_outside_working_hours: "true"
exp_annotations:
description: "Prometheus agent is missing shards."
opsrecipe: "prometheus-agent-missing-shards/"
summary: "Prometheus agent is missing shards."
- alertname: PrometheusAgentShardsMissing
eval_time: 125m
exp_alerts:
Expand All @@ -100,5 +160,24 @@ tests:
description: "Prometheus agent is missing shards."
opsrecipe: "prometheus-agent-missing-shards/"
summary: "Prometheus agent is missing shards."
- alertname: PrometheusAgentShardsMissingInhibition
eval_time: 125m
exp_alerts:
- exp_labels:
area: empowerment
severity: none
team: atlas
topic: observability
inhibit_prometheus_agent_down: "true"
cancel_if_cluster_is_not_running_prometheus_agent: "true"
cancel_if_cluster_status_creating: "true"
cancel_if_cluster_status_deleting: "true"
cancel_if_outside_working_hours: "true"
exp_annotations:
description: "Prometheus agent is missing shards."
opsrecipe: "prometheus-agent-missing-shards/"
summary: "Prometheus agent is missing shards."
- alertname: PrometheusAgentShardsMissing
eval_time: 130m
- alertname: PrometheusAgentShardsMissingInhibition
eval_time: 130m

0 comments on commit fdfe6d1

Please sign in to comment.