Update LokiRingUnhealthy query to avoid false positives (#1139)
* Update LokiRingUnhealthy query to avoid false positives when a new pod is starting

* Update unit tests for LokiRingUnhealthy

* Restrict LokiRingUnhealthy rules to the loki app
marieroque authored Apr 26, 2024
1 parent cd3bcc4 commit 452df8e
Showing 3 changed files with 13 additions and 3 deletions.

CHANGELOG.md: 4 additions & 0 deletions

@@ -7,6 +7,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

 ## [Unreleased]
 
+### Changed
+
+- Update LokiRingUnhealthy query to avoid false positive when a new pod is starting.
+
 ## [3.12.2] - 2024-04-25
 
 ### Fixed

@@ -73,7 +73,8 @@ spec:
           description: '{{`Loki pod {{ $labels.pod }} (namespace {{ $labels.namespace }}) sees {{ $value }} unhealthy ring members`}}'
           opsrecipe: loki/
         expr: |
-          sum (min_over_time(cortex_ring_members{state="Unhealthy"}[30m])) by (app, cluster_id, container, customer, installation, provider, pipeline, name, namespace, organization, pod) > 0
+          sum by (app, cluster_id, container, customer, installation, provider, pipeline, name, namespace, organization, pod) (cortex_ring_members{state="Unhealthy", app="loki"}) > 0
+        for: 30m
         labels:
           area: managedservices
           cancel_if_apiserver_down: "true"
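
Why this fixes the false positive: `min_over_time(cortex_ring_members{state="Unhealthy"}[30m])` takes the minimum over whatever samples exist in the 30-minute window. For a pod that has only existed for a few minutes, the window contains nothing but its startup Unhealthy=1 samples, so the minimum is 1 and the alert fires immediately. The new rule evaluates the instant value instead and moves the dwell time into `for: 30m`, which only fires after the expression has stayed above zero for 30 consecutive minutes of evaluations, so a member that is briefly unhealthy while joining the ring never alerts. A minimal sketch of the two variants (the aggregation label list is shortened here for readability; the real rule keeps the full list shown in the diff):

# Old: minimum over a 30m look-back. A young series (or one with gaps) can
# have a window holding only Unhealthy=1 samples, so the minimum is 1 and
# the alert fires as soon as the series appears.
- alert: LokiRingUnhealthy
  expr: |
    sum by (cluster_id, namespace, pod) (min_over_time(cortex_ring_members{state="Unhealthy"}[30m])) > 0

# New: instant value plus a `for` clause. The sum must stay > 0 for 30
# consecutive minutes before the alert fires, and the app="loki" matcher
# scopes the rule to Loki's ring members.
- alert: LokiRingUnhealthy
  expr: |
    sum by (cluster_id, namespace, pod) (cortex_ring_members{state="Unhealthy", app="loki"}) > 0
  for: 30m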

test/tests/providers/global/loki.rules.test.yml: 7 additions & 2 deletions

@@ -5,8 +5,10 @@ rule_files:
 tests:
   - interval: 1m
     input_series:
-      - series: 'cortex_ring_members{app="loki-compactor", cluster_id="zj88t", cluster_type="workload_cluster", container="compactor", customer="giantswarm", installation="gorilla", instance="10.7.116.221:3100", job="zj88t-prometheus/workload-zj88t/0", name="compactor", namespace="loki", node="ip-10-6-2-178.eu-central-1.compute.internal", organization="giantswarm-production", pod="loki-compactor-676b8c897b-rq298", provider="aws", pipeline="stable", service_priority="highest", state="Unhealthy"}'
+      - series: 'cortex_ring_members{app="loki", cluster_id="zj88t", cluster_type="workload_cluster", container="compactor", customer="giantswarm", installation="gorilla", instance="10.7.116.221:3100", job="zj88t-prometheus/workload-zj88t/0", name="compactor", namespace="loki", node="ip-10-6-2-178.eu-central-1.compute.internal", organization="giantswarm-production", pod="loki-compactor-676b8c897b-rq298", provider="aws", pipeline="stable", service_priority="highest", state="Unhealthy"}'
         values: "0+0x20 1+0x160" # 1 unhealthy value after 20 minutes
+      - series: 'cortex_ring_members{app="loki", cluster_id="zj88t", cluster_type="workload_cluster", container="loki", customer="giantswarm", installation="gorilla", instance="10.7.116.221:3100", job="zj88t-prometheus/workload-zj88t/0", name="distributor", namespace="loki", node="ip-10-6-2-178.eu-central-1.compute.internal", organization="giantswarm-production", pod="loki-write-0", provider="aws", pipeline="stable", service_priority="highest", state="Unhealthy"}'
+        values: "_x30 1+0x10 0+0x60" # no data for 30 minutes then 1 unhealthy value for 10 minutes and back to normal for 1 hour
       - series: 'loki_panic_total{app="loki-compactor", cluster_id="zj88t", cluster_type="workload_cluster", container="compactor", customer="giantswarm", installation="gorilla", instance="10.7.116.221:3100", job="zj88t-prometheus/workload-zj88t/0", namespace="loki", node="ip-10-6-2-178.eu-central-1.compute.internal", organization="giantswarm-production", pod="loki-compactor-676b8c897b-rq298", provider="aws", pipeline="stable", service_priority="highest"}'
         values: "0+0x20 1+0x160" # 1 panic after 20 minutes
       - series: 'loki_request_duration_seconds_count{app="loki-distributor", cluster_id="zj88t", cluster_type="workload_cluster", container="distributor", customer="giantswarm", installation="gorilla", instance="10.7.75.90:3100", job="zj88t-prometheus/workload-zj88t/0", method="POST", namespace="loki", node="ip-10-6-2-141.eu-central-1.compute.internal", organization="giantswarm-production", pod="loki-distributor-74b78f5559-tz6zs", provider="aws", pipeline="stable", route="loki_api_v1_push", service_priority="highest", status_code="204", ws="false"}'

@@ -78,11 +80,14 @@ tests:
       - alertname: LokiRingUnhealthy
         eval_time: 25m # after 25 minutes we have an unhealthy member, but we want to filter too short events. So no alert yet.
         exp_alerts:
+      - alertname: LokiRingUnhealthy
+        eval_time: 35m # special case to validate when a new pod is unhealthy (no data at the beginning)
+        exp_alerts:
       - alertname: LokiRingUnhealthy
         eval_time: 60m # now the event has been there for 20 minutes, we should have an alert.
         exp_alerts:
           - exp_labels:
-              app: loki-compactor
+              app: loki
               area: managedservices
               cancel_if_apiserver_down: "true"
               cancel_if_cluster_status_creating: "true"
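
For reference, the `values` strings in these unit tests use promtool's expanding notation: `0+0x20` starts at 0 and adds 0 twenty more times (one sample per 1m interval), `1+0x160` holds the value 1, and `_x30` produces 30 intervals with no sample at all, which is what models the new pod's metric not existing yet. Note the first series was also relabeled from app="loki-compactor" to app="loki" so it still matches the new app="loki" selector. Assuming promtool is available locally, the updated cases can be run from the repository root with:

promtool test rules test/tests/providers/global/loki.rules.test.yml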
