From ad2ecd3d85c50339fa0f13df3f22280ca73a0d5e Mon Sep 17 00:00:00 2001 From: Vladimir Varankin Date: Wed, 30 Oct 2024 12:26:47 +0100 Subject: [PATCH] blockbuilder: Basic alerts (#9723) * mimir-mixin: basic alerting for block-builder Signed-off-by: Vladimir Varankin * runbook Signed-off-by: Vladimir Varankin * rebuild assets Signed-off-by: Vladimir Varankin * Update docs/sources/mimir/manage/mimir-runbooks/_index.md Co-authored-by: Marco Pracucci * per-instance alerting Signed-off-by: Vladimir Varankin * rebuild assets Signed-off-by: Vladimir Varankin * Apply suggestions from code review Co-authored-by: Taylor C <41653732+tacole02@users.noreply.github.com> * add MimirBlockBuilderLaging Signed-off-by: Vladimir Varankin * fixup! rebuild assets * improve MimirBlockBuilderLagging Signed-off-by: Vladimir Varankin * fixup! rebuild assets --------- Signed-off-by: Vladimir Varankin Co-authored-by: Marco Pracucci Co-authored-by: Taylor C <41653732+tacole02@users.noreply.github.com> --- .../mimir/manage/mimir-runbooks/_index.md | 41 ++++++++++++++++ .../metamonitoring/mixin-alerts.yaml | 27 +++++++++++ .../alerts.yaml | 27 +++++++++++ operations/mimir-mixin-compiled/alerts.yaml | 27 +++++++++++ .../alerts/ingest-storage.libsonnet | 47 +++++++++++++++++++ 5 files changed, 169 insertions(+) diff --git a/docs/sources/mimir/manage/mimir-runbooks/_index.md b/docs/sources/mimir/manage/mimir-runbooks/_index.md index 0ca3da61303..1f4d3d29e71 100644 --- a/docs/sources/mimir/manage/mimir-runbooks/_index.md +++ b/docs/sources/mimir/manage/mimir-runbooks/_index.md @@ -1611,6 +1611,47 @@ How to **fix**: 1. Once ingesters are stable, revert the temporarily config applied in the previous step. +### MimirBlockBuilderNoCycleProcessing + +This alert fires when the block-builder stops reporting any processed cycles for an unexpectedly long time. 
+ +How it **works**: + +- The block-builder periodically consumes a portion of the backlog from a Kafka partition and processes the consumed data into TSDB blocks. The block-builder calls these periods "cycles". +- If the block-builder doesn't process any cycles for an extended period of time, this could indicate that a block-builder instance is stuck and cannot complete cycle processing. + +How to **investigate**: + +- Check the block-builder logs to see what its pods have been busy with. The block-builder logs the `start consuming` and `done consuming` log messages, which mark per-partition consume cycles. These log records include the details about the cycle, the Kafka topic's offsets, etc. Troubleshoot based on that. + +### MimirBlockBuilderLagging + +This alert fires when the block-builder instances report a large number of unprocessed records in the Kafka partitions. + +How it **works**: + +- When the block-builder starts a new consume cycle, it checks how many records the Kafka partition has in the backlog. This number is tracked in the `cortex_blockbuilder_consumer_lag_records` metric. +- The block-builder must consume and process these records into TSDB blocks. +- At the end of the processing, the block-builder commits the offset of the last fully processed record into Kafka. +- If the block-builder reports high values in the lag, this could indicate that a block-builder instance cannot fully process and commit Kafka records. + +How to **investigate**: + +- Check if the per-partition lag, reported by the `cortex_blockbuilder_consumer_lag_records` metric, has been growing over the past hours. +- Explore the block-builder logs for any errors reported while it processed the partition. + +### MimirBlockBuilderCompactAndUploadFailed + +How it **works**: + +- The block-builder periodically consumes data from a Kafka topic and processes the consumed data into TSDB blocks. +- It compacts and uploads the produced TSDB blocks to object storage. 
+- If the block-builder encounters issues while compacting or uploading the blocks, it reports the failure metric, which then triggers the alert. + +How to **investigate**: + +- Explore the block-builder logs to check what errors are there. + ## Errors catalog Mimir has some codified error IDs that you might see in HTTP responses or logs. diff --git a/operations/helm/tests/metamonitoring-values-generated/mimir-distributed/templates/metamonitoring/mixin-alerts.yaml b/operations/helm/tests/metamonitoring-values-generated/mimir-distributed/templates/metamonitoring/mixin-alerts.yaml index f866fb0f3dc..056f25bba7d 100644 --- a/operations/helm/tests/metamonitoring-values-generated/mimir-distributed/templates/metamonitoring/mixin-alerts.yaml +++ b/operations/helm/tests/metamonitoring-values-generated/mimir-distributed/templates/metamonitoring/mixin-alerts.yaml @@ -1163,6 +1163,33 @@ spec: for: 5m labels: severity: critical + - alert: MimirBlockBuilderNoCycleProcessing + annotations: + message: Mimir {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace }} has not processed cycles in the past hour. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirblockbuildernocycleprocessing + expr: | + max by(cluster, namespace, pod) (histogram_count(increase(cortex_blockbuilder_consume_cycle_duration_seconds[60m]))) == 0 + for: 5m + labels: + severity: warning + - alert: MimirBlockBuilderLagging + annotations: + message: Mimir {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace }} reports partition lag of {{ printf "%.0f" $value }} records. 
+ runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirblockbuilderlagging + expr: | + max by(cluster, namespace, pod) (max_over_time(cortex_blockbuilder_consumer_lag_records[10m])) > 4e6 + for: 75m + labels: + severity: warning + - alert: MimirBlockBuilderCompactAndUploadFailed + annotations: + message: Mimir {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace }} fails to compact and upload blocks. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirblockbuildercompactanduploadfailed + expr: | + sum by (cluster, namespace, pod) (rate(cortex_blockbuilder_tsdb_compact_and_upload_failed_total[1m])) > 0 + for: 5m + labels: + severity: warning - name: mimir_continuous_test rules: - alert: MimirContinuousTestNotRunningOnWrites diff --git a/operations/mimir-mixin-compiled-baremetal/alerts.yaml b/operations/mimir-mixin-compiled-baremetal/alerts.yaml index 41ff8e17768..9b605e56d6d 100644 --- a/operations/mimir-mixin-compiled-baremetal/alerts.yaml +++ b/operations/mimir-mixin-compiled-baremetal/alerts.yaml @@ -1137,6 +1137,33 @@ groups: for: 5m labels: severity: critical + - alert: MimirBlockBuilderNoCycleProcessing + annotations: + message: Mimir {{ $labels.instance }} in {{ $labels.cluster }}/{{ $labels.namespace }} has not processed cycles in the past hour. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirblockbuildernocycleprocessing + expr: | + max by(cluster, namespace, instance) (histogram_count(increase(cortex_blockbuilder_consume_cycle_duration_seconds[60m]))) == 0 + for: 5m + labels: + severity: warning + - alert: MimirBlockBuilderLagging + annotations: + message: Mimir {{ $labels.instance }} in {{ $labels.cluster }}/{{ $labels.namespace }} reports partition lag of {{ printf "%.0f" $value }} records. 
+ runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirblockbuilderlagging + expr: | + max by(cluster, namespace, instance) (max_over_time(cortex_blockbuilder_consumer_lag_records[10m])) > 4e6 + for: 75m + labels: + severity: warning + - alert: MimirBlockBuilderCompactAndUploadFailed + annotations: + message: Mimir {{ $labels.instance }} in {{ $labels.cluster }}/{{ $labels.namespace }} fails to compact and upload blocks. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirblockbuildercompactanduploadfailed + expr: | + sum by (cluster, namespace, instance) (rate(cortex_blockbuilder_tsdb_compact_and_upload_failed_total[1m])) > 0 + for: 5m + labels: + severity: warning - name: mimir_continuous_test rules: - alert: MimirContinuousTestNotRunningOnWrites diff --git a/operations/mimir-mixin-compiled/alerts.yaml b/operations/mimir-mixin-compiled/alerts.yaml index 87696d3be85..3d1a5d3352d 100644 --- a/operations/mimir-mixin-compiled/alerts.yaml +++ b/operations/mimir-mixin-compiled/alerts.yaml @@ -1151,6 +1151,33 @@ groups: for: 5m labels: severity: critical + - alert: MimirBlockBuilderNoCycleProcessing + annotations: + message: Mimir {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace }} has not processed cycles in the past hour. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirblockbuildernocycleprocessing + expr: | + max by(cluster, namespace, pod) (histogram_count(increase(cortex_blockbuilder_consume_cycle_duration_seconds[60m]))) == 0 + for: 5m + labels: + severity: warning + - alert: MimirBlockBuilderLagging + annotations: + message: Mimir {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace }} reports partition lag of {{ printf "%.0f" $value }} records. 
+ runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirblockbuilderlagging + expr: | + max by(cluster, namespace, pod) (max_over_time(cortex_blockbuilder_consumer_lag_records[10m])) > 4e6 + for: 75m + labels: + severity: warning + - alert: MimirBlockBuilderCompactAndUploadFailed + annotations: + message: Mimir {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace }} fails to compact and upload blocks. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirblockbuildercompactanduploadfailed + expr: | + sum by (cluster, namespace, pod) (rate(cortex_blockbuilder_tsdb_compact_and_upload_failed_total[1m])) > 0 + for: 5m + labels: + severity: warning - name: mimir_continuous_test rules: - alert: MimirContinuousTestNotRunningOnWrites diff --git a/operations/mimir-mixin/alerts/ingest-storage.libsonnet b/operations/mimir-mixin/alerts/ingest-storage.libsonnet index 1dd4cc90046..371865130b0 100644 --- a/operations/mimir-mixin/alerts/ingest-storage.libsonnet +++ b/operations/mimir-mixin/alerts/ingest-storage.libsonnet @@ -212,6 +212,53 @@ message: '%(product)s {{ $labels.%(per_instance_label)s }} in %(alert_aggregation_variables)s Kafka client produce buffer utilization is {{ printf "%%.2f" $value }}%%.' % $._config, }, }, + + // Alert if block-builder didn't process cycles in the past hour. + { + alert: $.alertName('BlockBuilderNoCycleProcessing'), + 'for': '5m', + expr: ||| + max by(%(alert_aggregation_labels)s, %(per_instance_label)s) (histogram_count(increase(cortex_blockbuilder_consume_cycle_duration_seconds[60m]))) == 0 + ||| % $._config, + labels: { + severity: 'warning', + }, + annotations: { + message: '%(product)s {{ $labels.%(per_instance_label)s }} in %(alert_aggregation_variables)s has not processed cycles in the past hour.' % $._config, + }, + }, + + // Alert if block-builder per partition lag is higher than the threshold. + // The value of the threshold is arbitrarily large for now. 
 We will reconsider this alert after we get the block-builder-scheduler. + // Note on "for: 75m": we assume one cycle is 1hr; with 10m lookback we expect the warning to trigger only if the metric is above the threshold for more than one cycle. + { + alert: $.alertName('BlockBuilderLagging'), + 'for': '75m', + expr: ||| + max by(%(alert_aggregation_labels)s, %(per_instance_label)s) (max_over_time(cortex_blockbuilder_consumer_lag_records[10m])) > 4e6 + ||| % $._config, + labels: { + severity: 'warning', + }, + annotations: { + message: '%(product)s {{ $labels.%(per_instance_label)s }} in %(alert_aggregation_variables)s reports partition lag of {{ printf "%%.0f" $value }} records.' % $._config, + }, + }, + + // Alert if block-builder is failing to compact and upload any blocks. + { + alert: $.alertName('BlockBuilderCompactAndUploadFailed'), + 'for': '5m', + expr: ||| + sum by (%(alert_aggregation_labels)s, %(per_instance_label)s) (rate(cortex_blockbuilder_tsdb_compact_and_upload_failed_total[1m])) > 0 + ||| % $._config, + labels: { + severity: 'warning', + }, + annotations: { + message: '%(product)s {{ $labels.%(per_instance_label)s }} in %(alert_aggregation_variables)s fails to compact and upload blocks.' % $._config, + }, + }, ], }, ],