From d1b83df28d6dc77ff6d27f2ed73fcb56ea0051fc Mon Sep 17 00:00:00 2001 From: Marco Pracucci Date: Sat, 27 Jul 2024 08:51:56 +0200 Subject: [PATCH] Dashboards: add strong consistency panels for query-frontend in 'Mimir / Queries' dashboard (#8830) Signed-off-by: Marco Pracucci --- CHANGELOG.md | 2 +- .../dashboards/dashboard-utils.libsonnet | 117 +++++++++++++++++ .../mimir-mixin/dashboards/queries.libsonnet | 124 +++--------------- 3 files changed, 138 insertions(+), 105 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 23baa4efa7c..92370e84cdf 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -81,7 +81,7 @@ * Queries dashboard: `cortex_request_duration_seconds` metric. #8800 * Remote ruler reads dashboard: `cortex_request_duration_seconds` metric. #8801 * [ENHANCEMENT] Alerts: `MimirRunningIngesterReceiveDelayTooHigh` alert has been tuned to be more reactive to high receive delay. #8538 -* [ENHANCEMENT] Dashboards: improve end-to-end latency and strong read consistency panels when experimental ingest storage is enabled. #8543 +* [ENHANCEMENT] Dashboards: improve end-to-end latency and strong read consistency panels when experimental ingest storage is enabled. #8543 #8830 * [ENHANCEMENT] Dashboards: Add panels for monitoring ingester autoscaling when not using ingest-storage. These panels are disabled by default, but can be enabled using the `autoscaling.ingester.enabled: true` config option. #8484 * [ENHANCEMENT] Dashboards: Add panels for monitoring store-gateway autoscaling. These panels are disabled by default, but can be enabled using the `autoscaling.store_gateway.enabled: true` config option. #8824 * [ENHANCEMENT] Dashboards: add panels to show writes to experimental ingest storage backend in the "Mimir / Ruler" dashboard, when `_config.show_ingest_storage_panels` is enabled. #8732 diff --git a/operations/mimir-mixin/dashboards/dashboard-utils.libsonnet b/operations/mimir-mixin/dashboards/dashboard-utils.libsonnet index 26dabc751e0..274b378cc37 100644 --- a/operations/mimir-mixin/dashboards/dashboard-utils.libsonnet +++ b/operations/mimir-mixin/dashboards/dashboard-utils.libsonnet @@ -1836,4 +1836,121 @@ local utils = import 'mixin-utils/utils.libsonnet'; defaults+: { unit: 's' }, }, }, + + ingestStorageFetchLastProducedOffsetRequestsPanel(jobName):: + $.timeseriesPanel('Fetch last produced offset requests / sec') + + $.panelDescription( + 'Fetch last produced offset requests / sec', + 'Shows rate of successful and failed requests to fetch last produced offset(s).', + ) + + $.queryPanel( + [ + ||| + sum(rate(cortex_ingest_storage_reader_last_produced_offset_requests_total{%s}[$__rate_interval])) + - + sum(rate(cortex_ingest_storage_reader_last_produced_offset_failures_total{%s}[$__rate_interval])) + ||| % [$.jobMatcher($._config.job_names[jobName]), $.jobMatcher($._config.job_names[jobName])], + ||| + sum(rate(cortex_ingest_storage_reader_last_produced_offset_failures_total{%s}[$__rate_interval])) + ||| % [$.jobMatcher($._config.job_names[jobName])], + ], + [ + 'successful', + 'failed', + ], + ) + { + fieldConfig+: { + defaults+: { unit: 'reqps' }, + }, + } + + $.aliasColors({ successful: $._colors.success, failed: $._colors.failed }) + + $.stack, + + ingestStorageFetchLastProducedOffsetLatencyPanel(jobName):: + $.timeseriesPanel('Fetch last produced offset latency') + + $.panelDescription( + 'Fetch last produced offset latency', + ||| + How long does it take to fetch "last produced offset" of partition(s). + ||| + ) + + $.queryPanel( + [ + 'histogram_avg(sum(rate(cortex_ingest_storage_reader_last_produced_offset_request_duration_seconds{%s}[$__rate_interval])))' % [$.jobMatcher($._config.job_names[jobName])], + 'histogram_quantile(0.99, sum(rate(cortex_ingest_storage_reader_last_produced_offset_request_duration_seconds{%s}[$__rate_interval])))' % [$.jobMatcher($._config.job_names[jobName])], + 'histogram_quantile(0.999, sum(rate(cortex_ingest_storage_reader_last_produced_offset_request_duration_seconds{%s}[$__rate_interval])))' % [$.jobMatcher($._config.job_names[jobName])], + 'histogram_quantile(1.0, sum(rate(cortex_ingest_storage_reader_last_produced_offset_request_duration_seconds{%s}[$__rate_interval])))' % [$.jobMatcher($._config.job_names[jobName])], + ], + [ + 'avg', + '99th percentile', + '99.9th percentile', + '100th percentile', + ], + ) + { + fieldConfig+: { + defaults+: { unit: 's' }, + }, + }, + + ingestStorageStrongConsistencyRequestsPanel(jobName):: + // The unit changes whether the metric is exposed from ingesters or other components. In the ingesters it's the + // requests issued by queriers to ingesters, while in other components it's the actual query. + local unit = if jobName == 'ingester' then 'requests' else 'queries'; + local title = '%s with strong read consistency / sec' % (std.asciiUpper(std.substr(unit, 0, 1)) + std.substr(unit, 1, std.length(unit) - 1)); + + $.timeseriesPanel(title) + + $.panelDescription( + title, + 'Shows rate of %(unit)s with strong read consistency, and rate of failed %(unit)s with strong read consistency.' % { + unit: unit, + }, + ) + + $.queryPanel( + [ + ||| + sum(rate(cortex_ingest_storage_strong_consistency_requests_total{%s}[$__rate_interval])) + - + sum(rate(cortex_ingest_storage_strong_consistency_failures_total{%s}[$__rate_interval])) + ||| % [$.jobMatcher($._config.job_names[jobName]), $.jobMatcher($._config.job_names[jobName])], + ||| + sum(rate(cortex_ingest_storage_strong_consistency_failures_total{%s}[$__rate_interval])) + ||| % [$.jobMatcher($._config.job_names[jobName])], + ], + [ + 'successful', + 'failed', + ], + ) + { + fieldConfig+: { + defaults+: { unit: 'reqps' }, + }, + } + + $.aliasColors({ successful: $._colors.success, failed: $._colors.failed }) + + $.stack, + + ingestStorageStrongConsistencyWaitLatencyPanel(jobName):: + $.timeseriesPanel('Strong read consistency queries — wait latency') + + $.panelDescription( + 'Strong read consistency queries — wait latency', + 'How long does the request wait to guarantee strong read consistency.', + ) + + $.queryPanel( + [ + 'histogram_avg(sum(rate(cortex_ingest_storage_strong_consistency_wait_duration_seconds{%s}[$__rate_interval])))' % [$.jobMatcher($._config.job_names[jobName])], + 'histogram_quantile(0.99, sum(rate(cortex_ingest_storage_strong_consistency_wait_duration_seconds{%s}[$__rate_interval])))' % [$.jobMatcher($._config.job_names[jobName])], + 'histogram_quantile(0.999, sum(rate(cortex_ingest_storage_strong_consistency_wait_duration_seconds{%s}[$__rate_interval])))' % [$.jobMatcher($._config.job_names[jobName])], + 'histogram_quantile(1.0, sum(rate(cortex_ingest_storage_strong_consistency_wait_duration_seconds{%s}[$__rate_interval])))' % [$.jobMatcher($._config.job_names[jobName])], + ], + [ + 'avg', + '99th percentile', + '99.9th percentile', + '100th percentile', + ], + ) + { + fieldConfig+: { + defaults+: { unit: 's' }, + }, + }, } diff --git a/operations/mimir-mixin/dashboards/queries.libsonnet b/operations/mimir-mixin/dashboards/queries.libsonnet index 011d6d39e9b..e746ff33267 100644 --- a/operations/mimir-mixin/dashboards/queries.libsonnet +++ b/operations/mimir-mixin/dashboards/queries.libsonnet @@ -153,6 +153,9 @@ local filename = 'mimir-queries.json'; .addRowIf( $._config.show_ingest_storage_panels, $.row('Query-frontend – strong consistency (ingest storage)') + .addPanel( + $.ingestStorageStrongConsistencyRequestsPanel('query_frontend') + ) .addPanel( $.timeseriesPanel('Queries with strong read consistency ratio') + $.panelDescription( @@ -176,6 +179,19 @@ local filename = 'mimir-queries.json'; + { fieldConfig+: { defaults+: { unit: 'percentunit', min: 0, max: 1 } } } + $.stack ) + .addPanel( + $.ingestStorageStrongConsistencyWaitLatencyPanel('query_frontend') + ) + ) + .addRowIf( + $._config.show_ingest_storage_panels, + $.row('') + .addPanel( + $.ingestStorageFetchLastProducedOffsetRequestsPanel('query_frontend') + ) + .addPanel( + $.ingestStorageFetchLastProducedOffsetLatencyPanel('query_frontend') + ) ) .addRow( $.row('Ingester') @@ -199,33 +215,7 @@ local filename = 'mimir-queries.json'; $._config.show_ingest_storage_panels, ($.row('Ingester – strong consistency (ingest storage)')) .addPanel( - $.timeseriesPanel('Requests with strong read consistency / sec') + - $.panelDescription( - 'Requests with strong read consistency / sec', - ||| - Shows rate of requests with strong read consistency, and rate of failed requests with strong read consistency. - ||| - ) + - $.queryPanel( - [ - ||| - sum(rate(cortex_ingest_storage_strong_consistency_requests_total{%s}[$__rate_interval])) - - - sum(rate(cortex_ingest_storage_strong_consistency_failures_total{%s}[$__rate_interval])) - ||| % [$.jobMatcher($._config.job_names.ingester), $.jobMatcher($._config.job_names.ingester)], - ||| - sum(rate(cortex_ingest_storage_strong_consistency_failures_total{%s}[$__rate_interval])) - ||| % [$.jobMatcher($._config.job_names.ingester)], - ], - [ - 'successful', - 'failed', - ], - ) + { - fieldConfig+: { - defaults+: { unit: 'reqps' }, - }, - } + $.aliasColors({ successful: $._colors.success, failed: $._colors.failed }) + $.stack, + $.ingestStorageStrongConsistencyRequestsPanel('ingester') ) .addPanel( $.timeseriesPanel('Requests with strong read consistency ratio') + @@ -263,91 +253,17 @@ local filename = 'mimir-queries.json'; + $.stack ) .addPanel( - $.timeseriesPanel('Strong read consistency queries — wait latency') + - $.panelDescription( - 'Strong read consistency queries — wait latency', - ||| - How long does the request wait to guarantee strong read consistency. - ||| - ) + - $.queryPanel( - [ - 'histogram_avg(sum(rate(cortex_ingest_storage_strong_consistency_wait_duration_seconds{%s}[$__rate_interval])))' % [$.jobMatcher($._config.job_names.ingester)], - 'histogram_quantile(0.99, sum(rate(cortex_ingest_storage_strong_consistency_wait_duration_seconds{%s}[$__rate_interval])))' % [$.jobMatcher($._config.job_names.ingester)], - 'histogram_quantile(0.999, sum(rate(cortex_ingest_storage_strong_consistency_wait_duration_seconds{%s}[$__rate_interval])))' % [$.jobMatcher($._config.job_names.ingester)], - 'histogram_quantile(1.0, sum(rate(cortex_ingest_storage_strong_consistency_wait_duration_seconds{%s}[$__rate_interval])))' % [$.jobMatcher($._config.job_names.ingester)], - ], - [ - 'avg', - '99th percentile', - '99.9th percentile', - '100th percentile', - ], - ) + { - fieldConfig+: { - defaults+: { unit: 's' }, - }, - }, + $.ingestStorageStrongConsistencyWaitLatencyPanel('ingester'), ) ) .addRowIf( $._config.show_ingest_storage_panels, $.row('') .addPanel( - $.timeseriesPanel('Fetch last produced offset requests / sec') + - $.panelDescription( - 'Rate of requests to fetch last produced offset for partition', - ||| - Shows rate of requests to fetch last produced offset for partition, and rate of failed requests. - ||| - ) + - $.queryPanel( - [ - ||| - sum(rate(cortex_ingest_storage_reader_last_produced_offset_requests_total{%s}[$__rate_interval])) - - - sum(rate(cortex_ingest_storage_reader_last_produced_offset_failures_total{%s}[$__rate_interval])) - ||| % [$.jobMatcher($._config.job_names.ingester), $.jobMatcher($._config.job_names.ingester)], - ||| - sum(rate(cortex_ingest_storage_reader_last_produced_offset_failures_total{%s}[$__rate_interval])) - ||| % [$.jobMatcher($._config.job_names.ingester)], - ], - [ - 'successful', - 'failed', - ], - ) + { - fieldConfig+: { - defaults+: { unit: 'reqps' }, - }, - } + $.aliasColors({ successful: $._colors.success, failed: $._colors.failed }) + $.stack, + $.ingestStorageFetchLastProducedOffsetRequestsPanel('ingester'), ) .addPanel( - $.timeseriesPanel('Fetch last produced offset latency') + - $.panelDescription( - 'Latency', - ||| - How long does it take to fetch "last produced offset" of partition. - ||| - ) + - $.queryPanel( - [ - 'histogram_avg(sum(rate(cortex_ingest_storage_reader_last_produced_offset_request_duration_seconds{%s}[$__rate_interval])))' % [$.jobMatcher($._config.job_names.ingester)], - 'histogram_quantile(0.99, sum(rate(cortex_ingest_storage_reader_last_produced_offset_request_duration_seconds{%s}[$__rate_interval])))' % [$.jobMatcher($._config.job_names.ingester)], - 'histogram_quantile(0.999, sum(rate(cortex_ingest_storage_reader_last_produced_offset_request_duration_seconds{%s}[$__rate_interval])))' % [$.jobMatcher($._config.job_names.ingester)], - 'histogram_quantile(1.0, sum(rate(cortex_ingest_storage_reader_last_produced_offset_request_duration_seconds{%s}[$__rate_interval])))' % [$.jobMatcher($._config.job_names.ingester)], - ], - [ - 'avg', - '99th percentile', - '99.9th percentile', - '100th percentile', - ], - ) + { - fieldConfig+: { - defaults+: { unit: 's' }, - }, - }, + $.ingestStorageFetchLastProducedOffsetLatencyPanel('ingester'), ) .addPanel( $.ingestStorageIngesterEndToEndLatencyWhenRunningPanel(),