Skip to content

Commit

Permalink
Dashboards: add strong consistency panels for query-frontend in 'Mimi…
Browse files Browse the repository at this point in the history
…r / Queries' dashboard (#8830)

Signed-off-by: Marco Pracucci <[email protected]>
  • Loading branch information
pracucci authored Jul 27, 2024
1 parent 7e801ef commit d1b83df
Show file tree
Hide file tree
Showing 3 changed files with 138 additions and 105 deletions.
2 changes: 1 addition & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,7 @@
* Queries dashboard: `cortex_request_duration_seconds` metric. #8800
* Remote ruler reads dashboard: `cortex_request_duration_seconds` metric. #8801
* [ENHANCEMENT] Alerts: `MimirRunningIngesterReceiveDelayTooHigh` alert has been tuned to be more reactive to high receive delay. #8538
* [ENHANCEMENT] Dashboards: improve end-to-end latency and strong read consistency panels when experimental ingest storage is enabled. #8543
* [ENHANCEMENT] Dashboards: improve end-to-end latency and strong read consistency panels when experimental ingest storage is enabled. #8543 #8830
* [ENHANCEMENT] Dashboards: Add panels for monitoring ingester autoscaling when not using ingest-storage. These panels are disabled by default, but can be enabled using the `autoscaling.ingester.enabled: true` config option. #8484
* [ENHANCEMENT] Dashboards: Add panels for monitoring store-gateway autoscaling. These panels are disabled by default, but can be enabled using the `autoscaling.store_gateway.enabled: true` config option. #8824
* [ENHANCEMENT] Dashboards: add panels to show writes to experimental ingest storage backend in the "Mimir / Ruler" dashboard, when `_config.show_ingest_storage_panels` is enabled. #8732
Expand Down
117 changes: 117 additions & 0 deletions operations/mimir-mixin/dashboards/dashboard-utils.libsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -1836,4 +1836,121 @@ local utils = import 'mixin-utils/utils.libsonnet';
defaults+: { unit: 's' },
},
},

// Builds a timeseries panel with the per-second rate of successful and failed requests
// to fetch the last produced offset(s), for the component whose job names are stored
// under `$._config.job_names[jobName]`.
// The "successful" series is computed as total requests minus failures; `$.stack` is
// applied so the two series add up to the total request rate.
ingestStorageFetchLastProducedOffsetRequestsPanel(jobName)::
  local title = 'Fetch last produced offset requests / sec';
  local matcher = $.jobMatcher($._config.job_names[jobName]);

  // Successful = all requests - failed requests.
  local successfulQuery = |||
    sum(rate(cortex_ingest_storage_reader_last_produced_offset_requests_total{%s}[$__rate_interval]))
    -
    sum(rate(cortex_ingest_storage_reader_last_produced_offset_failures_total{%s}[$__rate_interval]))
  ||| % [matcher, matcher];
  local failedQuery = |||
    sum(rate(cortex_ingest_storage_reader_last_produced_offset_failures_total{%s}[$__rate_interval]))
  ||| % [matcher];

  // Field config override setting the panel unit to requests per second.
  local unitOverride = {
    fieldConfig+: {
      defaults+: { unit: 'reqps' },
    },
  };

  $.timeseriesPanel(title) +
  $.panelDescription(
    title,
    'Shows rate of successful and failed requests to fetch last produced offset(s).',
  ) +
  $.queryPanel(
    [successfulQuery, failedQuery],
    ['successful', 'failed'],
  ) +
  unitOverride +
  $.aliasColors({ successful: $._colors.success, failed: $._colors.failed }) +
  $.stack,

// Builds a timeseries panel with the latency of "fetch last produced offset" requests,
// for the component whose job names are stored under `$._config.job_names[jobName]`.
// Four series are plotted (average, 99th, 99.9th and 100th percentile) and the panel
// unit is set to seconds.
ingestStorageFetchLastProducedOffsetLatencyPanel(jobName)::
  local title = 'Fetch last produced offset latency';
  // Single-element format arguments shared by every query below.
  local fmtArgs = [$.jobMatcher($._config.job_names[jobName])];

  local description = |||
    How long does it take to fetch "last produced offset" of partition(s).
  |||;

  local queries = [
    'histogram_avg(sum(rate(cortex_ingest_storage_reader_last_produced_offset_request_duration_seconds{%s}[$__rate_interval])))' % fmtArgs,
    'histogram_quantile(0.99, sum(rate(cortex_ingest_storage_reader_last_produced_offset_request_duration_seconds{%s}[$__rate_interval])))' % fmtArgs,
    'histogram_quantile(0.999, sum(rate(cortex_ingest_storage_reader_last_produced_offset_request_duration_seconds{%s}[$__rate_interval])))' % fmtArgs,
    'histogram_quantile(1.0, sum(rate(cortex_ingest_storage_reader_last_produced_offset_request_duration_seconds{%s}[$__rate_interval])))' % fmtArgs,
  ];
  // Legends, in the same order as the queries above.
  local legends = ['avg', '99th percentile', '99.9th percentile', '100th percentile'];

  $.timeseriesPanel(title) +
  $.panelDescription(title, description) +
  $.queryPanel(queries, legends) +
  {
    fieldConfig+: {
      defaults+: { unit: 's' },
    },
  },

// Builds a timeseries panel with the per-second rate of successful and failed
// requests/queries with strong read consistency, for the component whose job names are
// stored under `$._config.job_names[jobName]`.
// The "successful" series is computed as total minus failures, and `$.stack` is applied.
ingestStorageStrongConsistencyRequestsPanel(jobName)::
  // The wording depends on which component exposes the metric: in the ingesters it counts
  // the requests issued by queriers to ingesters, while in other components it counts the
  // actual queries.
  local unit = if jobName == 'ingester' then 'requests' else 'queries';

  // Upper-case the first letter of the unit to build the panel title.
  local firstLetter = std.asciiUpper(std.substr(unit, 0, 1));
  local remainder = std.substr(unit, 1, std.length(unit) - 1);
  local title = '%s with strong read consistency / sec' % (firstLetter + remainder);

  local matcher = $.jobMatcher($._config.job_names[jobName]);

  // Successful = all requests - failed requests.
  local successfulQuery = |||
    sum(rate(cortex_ingest_storage_strong_consistency_requests_total{%s}[$__rate_interval]))
    -
    sum(rate(cortex_ingest_storage_strong_consistency_failures_total{%s}[$__rate_interval]))
  ||| % [matcher, matcher];
  local failedQuery = |||
    sum(rate(cortex_ingest_storage_strong_consistency_failures_total{%s}[$__rate_interval]))
  ||| % [matcher];

  // Field config override setting the panel unit to requests per second.
  local unitOverride = {
    fieldConfig+: {
      defaults+: { unit: 'reqps' },
    },
  };

  $.timeseriesPanel(title) +
  $.panelDescription(
    title,
    'Shows rate of %(unit)s with strong read consistency, and rate of failed %(unit)s with strong read consistency.' % {
      unit: unit,
    },
  ) +
  $.queryPanel(
    [successfulQuery, failedQuery],
    ['successful', 'failed'],
  ) +
  unitOverride +
  $.aliasColors({ successful: $._colors.success, failed: $._colors.failed }) +
  $.stack,

// Builds a timeseries panel with the time spent waiting to guarantee strong read
// consistency, for the component whose job names are stored under
// `$._config.job_names[jobName]`.
// Four series are plotted (average, 99th, 99.9th and 100th percentile) and the panel
// unit is set to seconds.
ingestStorageStrongConsistencyWaitLatencyPanel(jobName)::
  local title = 'Strong read consistency queries — wait latency';
  // Single-element format arguments shared by every query below.
  local fmtArgs = [$.jobMatcher($._config.job_names[jobName])];

  local queries = [
    'histogram_avg(sum(rate(cortex_ingest_storage_strong_consistency_wait_duration_seconds{%s}[$__rate_interval])))' % fmtArgs,
    'histogram_quantile(0.99, sum(rate(cortex_ingest_storage_strong_consistency_wait_duration_seconds{%s}[$__rate_interval])))' % fmtArgs,
    'histogram_quantile(0.999, sum(rate(cortex_ingest_storage_strong_consistency_wait_duration_seconds{%s}[$__rate_interval])))' % fmtArgs,
    'histogram_quantile(1.0, sum(rate(cortex_ingest_storage_strong_consistency_wait_duration_seconds{%s}[$__rate_interval])))' % fmtArgs,
  ];
  // Legends, in the same order as the queries above.
  local legends = ['avg', '99th percentile', '99.9th percentile', '100th percentile'];

  $.timeseriesPanel(title) +
  $.panelDescription(
    title,
    'How long does the request wait to guarantee strong read consistency.',
  ) +
  $.queryPanel(queries, legends) +
  {
    fieldConfig+: {
      defaults+: { unit: 's' },
    },
  },
}
124 changes: 20 additions & 104 deletions operations/mimir-mixin/dashboards/queries.libsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -153,6 +153,9 @@ local filename = 'mimir-queries.json';
.addRowIf(
$._config.show_ingest_storage_panels,
$.row('Query-frontend – strong consistency (ingest storage)')
.addPanel(
$.ingestStorageStrongConsistencyRequestsPanel('query_frontend')
)
.addPanel(
$.timeseriesPanel('Queries with strong read consistency ratio') +
$.panelDescription(
Expand All @@ -176,6 +179,19 @@ local filename = 'mimir-queries.json';
+ { fieldConfig+: { defaults+: { unit: 'percentunit', min: 0, max: 1 } } }
+ $.stack
)
.addPanel(
$.ingestStorageStrongConsistencyWaitLatencyPanel('query_frontend')
)
)
.addRowIf(
$._config.show_ingest_storage_panels,
$.row('')
.addPanel(
$.ingestStorageFetchLastProducedOffsetRequestsPanel('query_frontend')
)
.addPanel(
$.ingestStorageFetchLastProducedOffsetLatencyPanel('query_frontend')
)
)
.addRow(
$.row('Ingester')
Expand All @@ -199,33 +215,7 @@ local filename = 'mimir-queries.json';
$._config.show_ingest_storage_panels,
($.row('Ingester – strong consistency (ingest storage)'))
.addPanel(
$.timeseriesPanel('Requests with strong read consistency / sec') +
$.panelDescription(
'Requests with strong read consistency / sec',
|||
Shows rate of requests with strong read consistency, and rate of failed requests with strong read consistency.
|||
) +
$.queryPanel(
[
|||
sum(rate(cortex_ingest_storage_strong_consistency_requests_total{%s}[$__rate_interval]))
-
sum(rate(cortex_ingest_storage_strong_consistency_failures_total{%s}[$__rate_interval]))
||| % [$.jobMatcher($._config.job_names.ingester), $.jobMatcher($._config.job_names.ingester)],
|||
sum(rate(cortex_ingest_storage_strong_consistency_failures_total{%s}[$__rate_interval]))
||| % [$.jobMatcher($._config.job_names.ingester)],
],
[
'successful',
'failed',
],
) + {
fieldConfig+: {
defaults+: { unit: 'reqps' },
},
} + $.aliasColors({ successful: $._colors.success, failed: $._colors.failed }) + $.stack,
$.ingestStorageStrongConsistencyRequestsPanel('ingester')
)
.addPanel(
$.timeseriesPanel('Requests with strong read consistency ratio') +
Expand Down Expand Up @@ -263,91 +253,17 @@ local filename = 'mimir-queries.json';
+ $.stack
)
.addPanel(
$.timeseriesPanel('Strong read consistency queries — wait latency') +
$.panelDescription(
'Strong read consistency queries — wait latency',
|||
How long does the request wait to guarantee strong read consistency.
|||
) +
$.queryPanel(
[
'histogram_avg(sum(rate(cortex_ingest_storage_strong_consistency_wait_duration_seconds{%s}[$__rate_interval])))' % [$.jobMatcher($._config.job_names.ingester)],
'histogram_quantile(0.99, sum(rate(cortex_ingest_storage_strong_consistency_wait_duration_seconds{%s}[$__rate_interval])))' % [$.jobMatcher($._config.job_names.ingester)],
'histogram_quantile(0.999, sum(rate(cortex_ingest_storage_strong_consistency_wait_duration_seconds{%s}[$__rate_interval])))' % [$.jobMatcher($._config.job_names.ingester)],
'histogram_quantile(1.0, sum(rate(cortex_ingest_storage_strong_consistency_wait_duration_seconds{%s}[$__rate_interval])))' % [$.jobMatcher($._config.job_names.ingester)],
],
[
'avg',
'99th percentile',
'99.9th percentile',
'100th percentile',
],
) + {
fieldConfig+: {
defaults+: { unit: 's' },
},
},
$.ingestStorageStrongConsistencyWaitLatencyPanel('ingester'),
)
)
.addRowIf(
$._config.show_ingest_storage_panels,
$.row('')
.addPanel(
$.timeseriesPanel('Fetch last produced offset requests / sec') +
$.panelDescription(
'Rate of requests to fetch last produced offset for partition',
|||
Shows rate of requests to fetch last produced offset for partition, and rate of failed requests.
|||
) +
$.queryPanel(
[
|||
sum(rate(cortex_ingest_storage_reader_last_produced_offset_requests_total{%s}[$__rate_interval]))
-
sum(rate(cortex_ingest_storage_reader_last_produced_offset_failures_total{%s}[$__rate_interval]))
||| % [$.jobMatcher($._config.job_names.ingester), $.jobMatcher($._config.job_names.ingester)],
|||
sum(rate(cortex_ingest_storage_reader_last_produced_offset_failures_total{%s}[$__rate_interval]))
||| % [$.jobMatcher($._config.job_names.ingester)],
],
[
'successful',
'failed',
],
) + {
fieldConfig+: {
defaults+: { unit: 'reqps' },
},
} + $.aliasColors({ successful: $._colors.success, failed: $._colors.failed }) + $.stack,
$.ingestStorageFetchLastProducedOffsetRequestsPanel('ingester'),
)
.addPanel(
$.timeseriesPanel('Fetch last produced offset latency') +
$.panelDescription(
'Latency',
|||
How long does it take to fetch "last produced offset" of partition.
|||
) +
$.queryPanel(
[
'histogram_avg(sum(rate(cortex_ingest_storage_reader_last_produced_offset_request_duration_seconds{%s}[$__rate_interval])))' % [$.jobMatcher($._config.job_names.ingester)],
'histogram_quantile(0.99, sum(rate(cortex_ingest_storage_reader_last_produced_offset_request_duration_seconds{%s}[$__rate_interval])))' % [$.jobMatcher($._config.job_names.ingester)],
'histogram_quantile(0.999, sum(rate(cortex_ingest_storage_reader_last_produced_offset_request_duration_seconds{%s}[$__rate_interval])))' % [$.jobMatcher($._config.job_names.ingester)],
'histogram_quantile(1.0, sum(rate(cortex_ingest_storage_reader_last_produced_offset_request_duration_seconds{%s}[$__rate_interval])))' % [$.jobMatcher($._config.job_names.ingester)],
],
[
'avg',
'99th percentile',
'99.9th percentile',
'100th percentile',
],
) + {
fieldConfig+: {
defaults+: { unit: 's' },
},
},
$.ingestStorageFetchLastProducedOffsetLatencyPanel('ingester'),
)
.addPanel(
$.ingestStorageIngesterEndToEndLatencyWhenRunningPanel(),
Expand Down

0 comments on commit d1b83df

Please sign in to comment.