From 5455b8cd3aecc1d6a0e5004d1cb89414f7489390 Mon Sep 17 00:00:00 2001 From: Marco Pracucci Date: Tue, 27 Aug 2024 16:44:56 +0200 Subject: [PATCH] Dashboards: add 'Read path' selector to 'Mimir / Queries' dashboard (#8878) * Dashboards: add 'Read path' selector to 'Mimir / Queries' dashboard Signed-off-by: Marco Pracucci * Updated CHANGELOG Signed-off-by: Marco Pracucci * Add mixin linter exclusion Signed-off-by: Marco Pracucci * Rename 'Standard' to 'Main' Signed-off-by: Marco Pracucci --------- Signed-off-by: Marco Pracucci --- CHANGELOG.md | 1 + .../metamonitoring/grafana-dashboards.yaml | 117 ++++++++++++------ .../dashboards/mimir-queries.json | 105 ++++++++++------ .../dashboards/mimir-tenants.json | 6 +- .../dashboards/mimir-top-tenants.json | 6 +- .../dashboards/mimir-queries.json | 105 ++++++++++------ .../dashboards/mimir-tenants.json | 6 +- .../dashboards/mimir-top-tenants.json | 6 +- operations/mimir-mixin-tools/serve/run.sh | 2 +- operations/mimir-mixin/.lint | 3 +- operations/mimir-mixin/config.libsonnet | 4 + .../dashboards/dashboard-utils.libsonnet | 94 +++++++------- .../mimir-mixin/dashboards/queries.libsonnet | 101 ++++++++------- .../mimir-mixin/dashboards/tenants.libsonnet | 10 +- .../dashboards/top-tenants.libsonnet | 8 +- 15 files changed, 366 insertions(+), 208 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 000b683fa44..2a0edd09263 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -123,6 +123,7 @@ * [ENHANCEMENT] Dashboards: add Kafka end-to-end latency outliers panel in the "Mimir / Writes" dashboard. #8948 * [ENHANCEMENT] Dashboards: add "Out-of-order samples appended" panel to "Mimir / Tenants" dashboard. #8939 * [ENHANCEMENT] Alerts: `RequestErrors` and `RulerRemoteEvaluationFailing` have been enriched with a native histogram version. #9004 +* [ENHANCEMENT] Dashboards: add 'Read path' selector to 'Mimir / Queries' dashboard. #8878 * [BUGFIX] Dashboards: fix "current replicas" in autoscaling panels when HPA is not active. #8566 * [BUGFIX] Alerts: do not fire `MimirRingMembersMismatch` during the migration to experimental ingest storage. #8727 diff --git a/operations/helm/tests/metamonitoring-values-generated/mimir-distributed/templates/metamonitoring/grafana-dashboards.yaml b/operations/helm/tests/metamonitoring-values-generated/mimir-distributed/templates/metamonitoring/grafana-dashboards.yaml index fd8ba946a59..7915b1eaccb 100644 --- a/operations/helm/tests/metamonitoring-values-generated/mimir-distributed/templates/metamonitoring/grafana-dashboards.yaml +++ b/operations/helm/tests/metamonitoring-values-generated/mimir-distributed/templates/metamonitoring/grafana-dashboards.yaml @@ -11214,19 +11214,19 @@ data: "span": 3, "targets": [ { - "expr": "histogram_quantile(0.99, sum(rate(cortex_query_frontend_queue_duration_seconds_bucket{cluster=~\"$cluster\", job=~\"($namespace)/((query-frontend.*|cortex|mimir|mimir-read.*))\"}[$__rate_interval])) by (le)) * 1e3", + "expr": "histogram_quantile(0.99, sum(rate(cortex_query_frontend_queue_duration_seconds_bucket{$read_path_matcher}[$__rate_interval])) by (le)) * 1e3", "format": "time_series", "legendFormat": "99th Percentile", "refId": "A" }, { - "expr": "histogram_quantile(0.50, sum(rate(cortex_query_frontend_queue_duration_seconds_bucket{cluster=~\"$cluster\", job=~\"($namespace)/((query-frontend.*|cortex|mimir|mimir-read.*))\"}[$__rate_interval])) by (le)) * 1e3", + "expr": "histogram_quantile(0.50, sum(rate(cortex_query_frontend_queue_duration_seconds_bucket{$read_path_matcher}[$__rate_interval])) by (le)) * 1e3", "format": "time_series", "legendFormat": "50th Percentile", "refId": "B" }, { - "expr": "sum(rate(cortex_query_frontend_queue_duration_seconds_sum{cluster=~\"$cluster\", job=~\"($namespace)/((query-frontend.*|cortex|mimir|mimir-read.*))\"}[$__rate_interval])) * 1e3 / sum(rate(cortex_query_frontend_queue_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((query-frontend.*|cortex|mimir|mimir-read.*))\"}[$__rate_interval]))", + "expr": "sum(rate(cortex_query_frontend_queue_duration_seconds_sum{$read_path_matcher}[$__rate_interval])) * 1e3 / sum(rate(cortex_query_frontend_queue_duration_seconds_count{$read_path_matcher}[$__rate_interval]))", "format": "time_series", "legendFormat": "Average", "refId": "C" @@ -11293,19 +11293,19 @@ data: "span": 3, "targets": [ { - "expr": "histogram_quantile(0.99, sum(rate(cortex_query_frontend_retries_bucket{cluster=~\"$cluster\", job=~\"($namespace)/((query-frontend.*|cortex|mimir|mimir-read.*))\"}[$__rate_interval])) by (le)) * 1", + "expr": "histogram_quantile(0.99, sum(rate(cortex_query_frontend_retries_bucket{$read_path_matcher}[$__rate_interval])) by (le)) * 1", "format": "time_series", "legendFormat": "99th Percentile", "refId": "A" }, { - "expr": "histogram_quantile(0.50, sum(rate(cortex_query_frontend_retries_bucket{cluster=~\"$cluster\", job=~\"($namespace)/((query-frontend.*|cortex|mimir|mimir-read.*))\"}[$__rate_interval])) by (le)) * 1", + "expr": "histogram_quantile(0.50, sum(rate(cortex_query_frontend_retries_bucket{$read_path_matcher}[$__rate_interval])) by (le)) * 1", "format": "time_series", "legendFormat": "50th Percentile", "refId": "B" }, { - "expr": "sum(rate(cortex_query_frontend_retries_sum{cluster=~\"$cluster\", job=~\"($namespace)/((query-frontend.*|cortex|mimir|mimir-read.*))\"}[$__rate_interval])) * 1 / sum(rate(cortex_query_frontend_retries_count{cluster=~\"$cluster\", job=~\"($namespace)/((query-frontend.*|cortex|mimir|mimir-read.*))\"}[$__rate_interval]))", + "expr": "sum(rate(cortex_query_frontend_retries_sum{$read_path_matcher}[$__rate_interval])) * 1 / sum(rate(cortex_query_frontend_retries_count{$read_path_matcher}[$__rate_interval]))", "format": "time_series", "legendFormat": "Average", "refId": "C" @@ -11371,7 +11371,7 @@ data: "span": 3, "targets": [ { - "expr": "sum by(pod) (cortex_query_frontend_queue_length{cluster=~\"$cluster\", job=~\"($namespace)/((query-frontend.*|cortex|mimir|mimir-read.*))\"})", + "expr": "sum by(pod) (cortex_query_frontend_queue_length{$read_path_matcher})", "format": "time_series", "legendFormat": "{{pod}}", "legendLink": null @@ -11420,7 +11420,7 @@ data: "span": 3, "targets": [ { - "expr": "sum by(user) (cortex_query_frontend_queue_length{cluster=~\"$cluster\", job=~\"($namespace)/((query-frontend.*|cortex|mimir|mimir-read.*))\"}) > 0", + "expr": "sum by(user) (cortex_query_frontend_queue_length{$read_path_matcher}) > 0", "format": "time_series", "legendFormat": "{{user}}", "legendLink": null @@ -11481,19 +11481,19 @@ data: "span": 4, "targets": [ { - "expr": "histogram_quantile(0.99, sum(rate(cortex_query_scheduler_queue_duration_seconds_bucket{cluster=~\"$cluster\", job=~\"($namespace)/((query-scheduler.*|mimir-backend.*))\"}[$__rate_interval])) by (le)) * 1e3", + "expr": "histogram_quantile(0.99, sum(rate(cortex_query_scheduler_queue_duration_seconds_bucket{$read_path_matcher}[$__rate_interval])) by (le)) * 1e3", "format": "time_series", "legendFormat": "99th Percentile", "refId": "A" }, { - "expr": "histogram_quantile(0.50, sum(rate(cortex_query_scheduler_queue_duration_seconds_bucket{cluster=~\"$cluster\", job=~\"($namespace)/((query-scheduler.*|mimir-backend.*))\"}[$__rate_interval])) by (le)) * 1e3", + "expr": "histogram_quantile(0.50, sum(rate(cortex_query_scheduler_queue_duration_seconds_bucket{$read_path_matcher}[$__rate_interval])) by (le)) * 1e3", "format": "time_series", "legendFormat": "50th Percentile", "refId": "B" }, { - "expr": "sum(rate(cortex_query_scheduler_queue_duration_seconds_sum{cluster=~\"$cluster\", job=~\"($namespace)/((query-scheduler.*|mimir-backend.*))\"}[$__rate_interval])) * 1e3 / sum(rate(cortex_query_scheduler_queue_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((query-scheduler.*|mimir-backend.*))\"}[$__rate_interval]))", + "expr": "sum(rate(cortex_query_scheduler_queue_duration_seconds_sum{$read_path_matcher}[$__rate_interval])) * 1e3 / sum(rate(cortex_query_scheduler_queue_duration_seconds_count{$read_path_matcher}[$__rate_interval]))", "format": "time_series", "legendFormat": "Average", "refId": "C" @@ -11559,7 +11559,7 @@ data: "span": 4, "targets": [ { - "expr": "sum by(pod) (cortex_query_scheduler_queue_length{cluster=~\"$cluster\", job=~\"($namespace)/((query-scheduler.*|mimir-backend.*))\"})", + "expr": "sum by(pod) (cortex_query_scheduler_queue_length{$read_path_matcher})", "format": "time_series", "legendFormat": "{{pod}}", "legendLink": null @@ -11608,7 +11608,7 @@ data: "span": 4, "targets": [ { - "expr": "sum by(user) (cortex_query_scheduler_queue_length{cluster=~\"$cluster\", job=~\"($namespace)/((query-scheduler.*|mimir-backend.*))\"}) > 0", + "expr": "sum by(user) (cortex_query_scheduler_queue_length{$read_path_matcher}) > 0", "format": "time_series", "legendFormat": "{{user}}", "legendLink": null @@ -11669,7 +11669,7 @@ data: "span": 4, "targets": [ { - "expr": "sum(rate(cortex_frontend_split_queries_total{cluster=~\"$cluster\", job=~\"($namespace)/((query-frontend.*|cortex|mimir|mimir-read.*))\"}[$__rate_interval])) / sum(rate(cortex_frontend_query_range_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((query-frontend.*|cortex|mimir|mimir-read.*))\", method=\"split_by_interval_and_results_cache\"}[$__rate_interval]))", + "expr": "sum(rate(cortex_frontend_split_queries_total{$read_path_matcher}[$__rate_interval])) / sum(rate(cortex_frontend_query_range_duration_seconds_count{$read_path_matcher, method=\"split_by_interval_and_results_cache\"}[$__rate_interval]))", "format": "time_series", "legendFormat": "splitting rate", "legendLink": null @@ -11718,7 +11718,7 @@ data: "span": 4, "targets": [ { - "expr": "# Query the new metric introduced in Mimir 2.10.\n(\n sum by(request_type) (rate(cortex_frontend_query_result_cache_hits_total{cluster=~\"$cluster\", job=~\"($namespace)/((query-frontend.*|cortex|mimir|mimir-read.*))\"}[$__rate_interval]))\n /\n sum by(request_type) (rate(cortex_frontend_query_result_cache_requests_total{cluster=~\"$cluster\", job=~\"($namespace)/((query-frontend.*|cortex|mimir|mimir-read.*))\"}[$__rate_interval]))\n)\n# Otherwise fallback to the previous general-purpose metrics.\nor\n(\n label_replace(\n # Query metrics before and after dskit cache refactor.\n sum (\n rate(thanos_cache_memcached_hits_total{name=\"frontend-cache\", cluster=~\"$cluster\", job=~\"($namespace)/((query-frontend.*|cortex|mimir|mimir-read.*))\"}[$__rate_interval])\n or ignoring(backend)\n rate(thanos_cache_hits_total{name=\"frontend-cache\", cluster=~\"$cluster\", job=~\"($namespace)/((query-frontend.*|cortex|mimir|mimir-read.*))\"}[$__rate_interval])\n )\n /\n sum (\n rate(thanos_cache_memcached_requests_total{name=~\"frontend-cache\", cluster=~\"$cluster\", job=~\"($namespace)/((query-frontend.*|cortex|mimir|mimir-read.*))\"}[$__rate_interval])\n or ignoring(backend)\n rate(thanos_cache_requests_total{name=~\"frontend-cache\", cluster=~\"$cluster\", job=~\"($namespace)/((query-frontend.*|cortex|mimir|mimir-read.*))\"}[$__rate_interval])\n ),\n \"request_type\", \"query_range\", \"\", \"\")\n)\n", + "expr": "# Query the new metric introduced in Mimir 2.10.\n(\n sum by(request_type) (rate(cortex_frontend_query_result_cache_hits_total{$read_path_matcher}[$__rate_interval]))\n /\n sum by(request_type) (rate(cortex_frontend_query_result_cache_requests_total{$read_path_matcher}[$__rate_interval]))\n)\n# Otherwise fallback to the previous general-purpose metrics.\nor\n(\n label_replace(\n # Query metrics before and after dskit cache refactor.\n sum (\n rate(thanos_cache_memcached_hits_total{name=\"frontend-cache\", $read_path_matcher}[$__rate_interval])\n or ignoring(backend)\n rate(thanos_cache_hits_total{name=\"frontend-cache\", $read_path_matcher}[$__rate_interval])\n )\n /\n sum (\n rate(thanos_cache_memcached_requests_total{name=~\"frontend-cache\", $read_path_matcher}[$__rate_interval])\n or ignoring(backend)\n rate(thanos_cache_requests_total{name=~\"frontend-cache\", $read_path_matcher}[$__rate_interval])\n ),\n \"request_type\", \"query_range\", \"\", \"\")\n)\n", "format": "time_series", "legendFormat": "{{request_type}}", "legendLink": null @@ -11768,7 +11768,7 @@ data: "span": 4, "targets": [ { - "expr": "sum(rate(cortex_frontend_query_result_cache_skipped_total{cluster=~\"$cluster\", job=~\"($namespace)/((query-frontend.*|cortex|mimir|mimir-read.*))\"}[$__rate_interval])) by (reason) /\nignoring (reason) group_left sum(rate(cortex_frontend_query_result_cache_attempted_total{cluster=~\"$cluster\", job=~\"($namespace)/((query-frontend.*|cortex|mimir|mimir-read.*))\"}[$__rate_interval]))\n", + "expr": "sum(rate(cortex_frontend_query_result_cache_skipped_total{$read_path_matcher}[$__rate_interval])) by (reason) /\nignoring (reason) group_left sum(rate(cortex_frontend_query_result_cache_attempted_total{$read_path_matcher}[$__rate_interval]))\n", "format": "time_series", "legendFormat": "{{reason}}", "legendLink": null @@ -11830,7 +11830,7 @@ data: "span": 6, "targets": [ { - "expr": "sum(rate(cortex_frontend_query_sharding_rewrites_succeeded_total{cluster=~\"$cluster\", job=~\"($namespace)/((query-frontend.*|cortex|mimir|mimir-read.*))\"}[$__rate_interval])) /\nsum(rate(cortex_frontend_query_sharding_rewrites_attempted_total{cluster=~\"$cluster\", job=~\"($namespace)/((query-frontend.*|cortex|mimir|mimir-read.*))\"}[$__rate_interval]))\n", + "expr": "sum(rate(cortex_frontend_query_sharding_rewrites_succeeded_total{$read_path_matcher}[$__rate_interval])) /\nsum(rate(cortex_frontend_query_sharding_rewrites_attempted_total{$read_path_matcher}[$__rate_interval]))\n", "format": "time_series", "legendFormat": "sharded queries ratio", "legendLink": null @@ -11880,19 +11880,19 @@ data: "span": 6, "targets": [ { - "expr": "histogram_quantile(0.99, sum(rate(cortex_frontend_sharded_queries_per_query_bucket{cluster=~\"$cluster\", job=~\"($namespace)/((query-frontend.*|cortex|mimir|mimir-read.*))\"}[$__rate_interval])) by (le)) * 1", + "expr": "histogram_quantile(0.99, sum(rate(cortex_frontend_sharded_queries_per_query_bucket{$read_path_matcher}[$__rate_interval])) by (le)) * 1", "format": "time_series", "legendFormat": "99th Percentile", "refId": "A" }, { - "expr": "histogram_quantile(0.50, sum(rate(cortex_frontend_sharded_queries_per_query_bucket{cluster=~\"$cluster\", job=~\"($namespace)/((query-frontend.*|cortex|mimir|mimir-read.*))\"}[$__rate_interval])) by (le)) * 1", + "expr": "histogram_quantile(0.50, sum(rate(cortex_frontend_sharded_queries_per_query_bucket{$read_path_matcher}[$__rate_interval])) by (le)) * 1", "format": "time_series", "legendFormat": "50th Percentile", "refId": "B" }, { - "expr": "sum(rate(cortex_frontend_sharded_queries_per_query_sum{cluster=~\"$cluster\", job=~\"($namespace)/((query-frontend.*|cortex|mimir|mimir-read.*))\"}[$__rate_interval])) * 1 / sum(rate(cortex_frontend_sharded_queries_per_query_count{cluster=~\"$cluster\", job=~\"($namespace)/((query-frontend.*|cortex|mimir|mimir-read.*))\"}[$__rate_interval]))", + "expr": "sum(rate(cortex_frontend_sharded_queries_per_query_sum{$read_path_matcher}[$__rate_interval])) * 1 / sum(rate(cortex_frontend_sharded_queries_per_query_count{$read_path_matcher}[$__rate_interval]))", "format": "time_series", "legendFormat": "Average", "refId": "C" @@ -12166,19 +12166,19 @@ data: "span": 3, "targets": [ { - "expr": "histogram_quantile(0.99, sum(rate(cortex_querier_storegateway_instances_hit_per_query_bucket{cluster=~\"$cluster\", job=~\"($namespace)/((querier.*|cortex|mimir|mimir-read.*))\"}[$__rate_interval])) by (le)) * 1", + "expr": "histogram_quantile(0.99, sum(rate(cortex_querier_storegateway_instances_hit_per_query_bucket{$read_path_matcher}[$__rate_interval])) by (le)) * 1", "format": "time_series", "legendFormat": "99th Percentile", "refId": "A" }, { - "expr": "histogram_quantile(0.50, sum(rate(cortex_querier_storegateway_instances_hit_per_query_bucket{cluster=~\"$cluster\", job=~\"($namespace)/((querier.*|cortex|mimir|mimir-read.*))\"}[$__rate_interval])) by (le)) * 1", + "expr": "histogram_quantile(0.50, sum(rate(cortex_querier_storegateway_instances_hit_per_query_bucket{$read_path_matcher}[$__rate_interval])) by (le)) * 1", "format": "time_series", "legendFormat": "50th Percentile", "refId": "B" }, { - "expr": "sum(rate(cortex_querier_storegateway_instances_hit_per_query_sum{cluster=~\"$cluster\", job=~\"($namespace)/((querier.*|cortex|mimir|mimir-read.*))\"}[$__rate_interval])) * 1 / sum(rate(cortex_querier_storegateway_instances_hit_per_query_count{cluster=~\"$cluster\", job=~\"($namespace)/((querier.*|cortex|mimir|mimir-read.*))\"}[$__rate_interval]))", + "expr": "sum(rate(cortex_querier_storegateway_instances_hit_per_query_sum{$read_path_matcher}[$__rate_interval])) * 1 / sum(rate(cortex_querier_storegateway_instances_hit_per_query_count{$read_path_matcher}[$__rate_interval]))", "format": "time_series", "legendFormat": "Average", "refId": "C" @@ -12245,19 +12245,19 @@ data: "span": 3, "targets": [ { - "expr": "histogram_quantile(0.99, sum(rate(cortex_querier_storegateway_refetches_per_query_bucket{cluster=~\"$cluster\", job=~\"($namespace)/((querier.*|cortex|mimir|mimir-read.*))\"}[$__rate_interval])) by (le)) * 1", + "expr": "histogram_quantile(0.99, sum(rate(cortex_querier_storegateway_refetches_per_query_bucket{$read_path_matcher}[$__rate_interval])) by (le)) * 1", "format": "time_series", "legendFormat": "99th Percentile", "refId": "A" }, { - "expr": "histogram_quantile(0.50, sum(rate(cortex_querier_storegateway_refetches_per_query_bucket{cluster=~\"$cluster\", job=~\"($namespace)/((querier.*|cortex|mimir|mimir-read.*))\"}[$__rate_interval])) by (le)) * 1", + "expr": "histogram_quantile(0.50, sum(rate(cortex_querier_storegateway_refetches_per_query_bucket{$read_path_matcher}[$__rate_interval])) by (le)) * 1", "format": "time_series", "legendFormat": "50th Percentile", "refId": "B" }, { - "expr": "sum(rate(cortex_querier_storegateway_refetches_per_query_sum{cluster=~\"$cluster\", job=~\"($namespace)/((querier.*|cortex|mimir|mimir-read.*))\"}[$__rate_interval])) * 1 / sum(rate(cortex_querier_storegateway_refetches_per_query_count{cluster=~\"$cluster\", job=~\"($namespace)/((querier.*|cortex|mimir|mimir-read.*))\"}[$__rate_interval]))", + "expr": "sum(rate(cortex_querier_storegateway_refetches_per_query_sum{$read_path_matcher}[$__rate_interval])) * 1 / sum(rate(cortex_querier_storegateway_refetches_per_query_count{$read_path_matcher}[$__rate_interval]))", "format": "time_series", "legendFormat": "Average", "refId": "C" @@ -12341,7 +12341,7 @@ data: "span": 3, "targets": [ { - "expr": "sum(rate(cortex_querier_blocks_consistency_checks_failed_total{cluster=~\"$cluster\", job=~\"($namespace)/((querier.*|cortex|mimir|mimir-read.*))\"}[$__rate_interval])) / sum(rate(cortex_querier_blocks_consistency_checks_total{cluster=~\"$cluster\", job=~\"($namespace)/((querier.*|cortex|mimir|mimir-read.*))\"}[$__rate_interval]))", + "expr": "sum(rate(cortex_querier_blocks_consistency_checks_failed_total{$read_path_matcher}[$__rate_interval])) / sum(rate(cortex_querier_blocks_consistency_checks_total{$read_path_matcher}[$__rate_interval]))", "format": "time_series", "legendFormat": "Failure Rate", "legendLink": null @@ -12391,7 +12391,7 @@ data: "span": 3, "targets": [ { - "expr": "sum by (reason) (rate(cortex_querier_queries_rejected_total{cluster=~\"$cluster\", job=~\"($namespace)/((querier.*|cortex|mimir|mimir-read.*))\"}[$__rate_interval])) / ignoring (reason) group_left sum(rate(cortex_querier_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((querier.*|cortex|mimir|mimir-read.*))\", route=~\"(prometheus|api_prom)_api_v1_query(_range)?\"}[$__rate_interval]))", + "expr": "sum by (reason) (rate(cortex_querier_queries_rejected_total{$read_path_matcher}[$__rate_interval])) / ignoring (reason) group_left sum(rate(cortex_querier_request_duration_seconds_count{$read_path_matcher, route=~\"(prometheus|api_prom)_api_v1_query(_range)?\"}[$__rate_interval]))", "format": "time_series", "legendFormat": "{{reason}}", "legendLink": null @@ -12451,19 +12451,19 @@ data: "span": 4, "targets": [ { - "expr": "max(cortex_bucket_index_loaded{cluster=~\"$cluster\", job=~\"($namespace)/((querier.*|cortex|mimir|mimir-read.*))\"})", + "expr": "max(cortex_bucket_index_loaded{$read_path_matcher})", "format": "time_series", "legendFormat": "Max", "legendLink": null }, { - "expr": "min(cortex_bucket_index_loaded{cluster=~\"$cluster\", job=~\"($namespace)/((querier.*|cortex|mimir|mimir-read.*))\"})", + "expr": "min(cortex_bucket_index_loaded{$read_path_matcher})", "format": "time_series", "legendFormat": "Min", "legendLink": null }, { - "expr": "avg(cortex_bucket_index_loaded{cluster=~\"$cluster\", job=~\"($namespace)/((querier.*|cortex|mimir|mimir-read.*))\"})", + "expr": "avg(cortex_bucket_index_loaded{$read_path_matcher})", "format": "time_series", "legendFormat": "Average", "legendLink": null @@ -12542,13 +12542,13 @@ data: "span": 4, "targets": [ { - "expr": "sum(rate(cortex_bucket_index_loads_total{cluster=~\"$cluster\", job=~\"($namespace)/((querier.*|cortex|mimir|mimir-read.*))\"}[$__rate_interval])) - sum(rate(cortex_bucket_index_load_failures_total{cluster=~\"$cluster\", job=~\"($namespace)/((querier.*|cortex|mimir|mimir-read.*))\"}[$__rate_interval]))", + "expr": "sum(rate(cortex_bucket_index_loads_total{$read_path_matcher}[$__rate_interval])) - sum(rate(cortex_bucket_index_load_failures_total{$read_path_matcher}[$__rate_interval]))", "format": "time_series", "legendFormat": "successful", "legendLink": null }, { - "expr": "sum(rate(cortex_bucket_index_load_failures_total{cluster=~\"$cluster\", job=~\"($namespace)/((querier.*|cortex|mimir|mimir-read.*))\"}[$__rate_interval]))", + "expr": "sum(rate(cortex_bucket_index_load_failures_total{$read_path_matcher}[$__rate_interval]))", "format": "time_series", "legendFormat": "failed", "legendLink": null @@ -12597,19 +12597,19 @@ data: "span": 4, "targets": [ { - "expr": "histogram_quantile(0.99, sum(rate(cortex_bucket_index_load_duration_seconds_bucket{cluster=~\"$cluster\", job=~\"($namespace)/((querier.*|cortex|mimir|mimir-read.*))\"}[$__rate_interval])) by (le)) * 1e3", + "expr": "histogram_quantile(0.99, sum(rate(cortex_bucket_index_load_duration_seconds_bucket{$read_path_matcher}[$__rate_interval])) by (le)) * 1e3", "format": "time_series", "legendFormat": "99th Percentile", "refId": "A" }, { - "expr": "histogram_quantile(0.50, sum(rate(cortex_bucket_index_load_duration_seconds_bucket{cluster=~\"$cluster\", job=~\"($namespace)/((querier.*|cortex|mimir|mimir-read.*))\"}[$__rate_interval])) by (le)) * 1e3", + "expr": "histogram_quantile(0.50, sum(rate(cortex_bucket_index_load_duration_seconds_bucket{$read_path_matcher}[$__rate_interval])) by (le)) * 1e3", "format": "time_series", "legendFormat": "50th Percentile", "refId": "B" }, { - "expr": "sum(rate(cortex_bucket_index_load_duration_seconds_sum{cluster=~\"$cluster\", job=~\"($namespace)/((querier.*|cortex|mimir|mimir-read.*))\"}[$__rate_interval])) * 1e3 / sum(rate(cortex_bucket_index_load_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((querier.*|cortex|mimir|mimir-read.*))\"}[$__rate_interval]))", + "expr": "sum(rate(cortex_bucket_index_load_duration_seconds_sum{$read_path_matcher}[$__rate_interval])) * 1e3 / sum(rate(cortex_bucket_index_load_duration_seconds_count{$read_path_matcher}[$__rate_interval]))", "format": "time_series", "legendFormat": "Average", "refId": "C" @@ -13665,6 +13665,39 @@ data: "skipUrlSync": false, "type": "custom", "useTags": false + }, + { + "current": { + "selected": true, + "text": "All", + "value": "cluster=~\"$cluster\"\\, job=~\"($namespace)/((cortex|mimir|mimir-backend.*|mimir-read.*|querier.*|query-frontend.*|query-scheduler.*|ruler-querier.*|ruler-query-frontend.*|ruler-query-scheduler.*))\"" + }, + "hide": 0, + "includeAll": false, + "label": "Read path", + "multi": false, + "name": "read_path_matcher", + "options": [ + { + "selected": true, + "text": "All", + "value": "cluster=~\"$cluster\"\\, job=~\"($namespace)/((cortex|mimir|mimir-backend.*|mimir-read.*|querier.*|query-frontend.*|query-scheduler.*|ruler-querier.*|ruler-query-frontend.*|ruler-query-scheduler.*))\"" + }, + { + "selected": false, + "text": "Main", + "value": "cluster=~\"$cluster\"\\, job=~\"($namespace)/((cortex|mimir|mimir-backend.*|mimir-read.*|querier.*|query-frontend.*|query-scheduler.*))\"" + }, + { + "selected": false, + "text": "Remote ruler", + "value": "cluster=~\"$cluster\"\\, job=~\"($namespace)/((ruler-querier.*|ruler-query-frontend.*|ruler-query-scheduler.*))\"" + } + ], + "query": "All : cluster=~\"$cluster\"\\, job=~\"($namespace)/((cortex|mimir|mimir-backend.*|mimir-read.*|querier.*|query-frontend.*|query-scheduler.*|ruler-querier.*|ruler-query-frontend.*|ruler-query-scheduler.*))\",Main : cluster=~\"$cluster\"\\, job=~\"($namespace)/((cortex|mimir|mimir-backend.*|mimir-read.*|querier.*|query-frontend.*|query-scheduler.*))\",Remote ruler : cluster=~\"$cluster\"\\, job=~\"($namespace)/((ruler-querier.*|ruler-query-frontend.*|ruler-query-scheduler.*))\"", + "skipUrlSync": false, + "type": "custom", + "useTags": false } ] }, @@ -35214,6 +35247,7 @@ data: }, "hide": 0, "includeAll": false, + "label": "limit", "multi": false, "name": "limit", "options": [ @@ -35243,7 +35277,10 @@ data: "value": "1000" } ], - "type": "custom" + "query": "10 : 10,50 : 50,100 : 100,500 : 500,1000 : 1000", + "skipUrlSync": false, + "type": "custom", + "useTags": false } ] }, @@ -36978,6 +37015,7 @@ data: }, "hide": 0, "includeAll": false, + "label": "limit", "multi": false, "name": "limit", "options": [ @@ -36997,7 +37035,10 @@ data: "value": "100" } ], - "type": "custom" + "query": "10 : 10,50 : 50,100 : 100", + "skipUrlSync": false, + "type": "custom", + "useTags": false } ] }, diff --git a/operations/mimir-mixin-compiled-baremetal/dashboards/mimir-queries.json b/operations/mimir-mixin-compiled-baremetal/dashboards/mimir-queries.json index 5c89e47f0a2..3f96818784b 100644 --- a/operations/mimir-mixin-compiled-baremetal/dashboards/mimir-queries.json +++ b/operations/mimir-mixin-compiled-baremetal/dashboards/mimir-queries.json @@ -74,19 +74,19 @@ "span": 3, "targets": [ { - "expr": "histogram_quantile(0.99, sum(rate(cortex_query_frontend_queue_duration_seconds_bucket{cluster=~\"$cluster\", job=~\"($namespace)/((query-frontend.*|cortex|mimir|mimir-read.*))\"}[$__rate_interval])) by (le)) * 1e3", + "expr": "histogram_quantile(0.99, sum(rate(cortex_query_frontend_queue_duration_seconds_bucket{$read_path_matcher}[$__rate_interval])) by (le)) * 1e3", "format": "time_series", "legendFormat": "99th Percentile", "refId": "A" }, { - "expr": "histogram_quantile(0.50, sum(rate(cortex_query_frontend_queue_duration_seconds_bucket{cluster=~\"$cluster\", job=~\"($namespace)/((query-frontend.*|cortex|mimir|mimir-read.*))\"}[$__rate_interval])) by (le)) * 1e3", + "expr": "histogram_quantile(0.50, sum(rate(cortex_query_frontend_queue_duration_seconds_bucket{$read_path_matcher}[$__rate_interval])) by (le)) * 1e3", "format": "time_series", "legendFormat": "50th Percentile", "refId": "B" }, { - "expr": "sum(rate(cortex_query_frontend_queue_duration_seconds_sum{cluster=~\"$cluster\", job=~\"($namespace)/((query-frontend.*|cortex|mimir|mimir-read.*))\"}[$__rate_interval])) * 1e3 / sum(rate(cortex_query_frontend_queue_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((query-frontend.*|cortex|mimir|mimir-read.*))\"}[$__rate_interval]))", + "expr": "sum(rate(cortex_query_frontend_queue_duration_seconds_sum{$read_path_matcher}[$__rate_interval])) * 1e3 / sum(rate(cortex_query_frontend_queue_duration_seconds_count{$read_path_matcher}[$__rate_interval]))", "format": "time_series", "legendFormat": "Average", "refId": "C" @@ -153,19 +153,19 @@ "span": 3, "targets": [ { - "expr": "histogram_quantile(0.99, sum(rate(cortex_query_frontend_retries_bucket{cluster=~\"$cluster\", job=~\"($namespace)/((query-frontend.*|cortex|mimir|mimir-read.*))\"}[$__rate_interval])) by (le)) * 1", + "expr": "histogram_quantile(0.99, sum(rate(cortex_query_frontend_retries_bucket{$read_path_matcher}[$__rate_interval])) by (le)) * 1", "format": "time_series", "legendFormat": "99th Percentile", "refId": "A" }, { - "expr": "histogram_quantile(0.50, sum(rate(cortex_query_frontend_retries_bucket{cluster=~\"$cluster\", job=~\"($namespace)/((query-frontend.*|cortex|mimir|mimir-read.*))\"}[$__rate_interval])) by (le)) * 1", + "expr": "histogram_quantile(0.50, sum(rate(cortex_query_frontend_retries_bucket{$read_path_matcher}[$__rate_interval])) by (le)) * 1", "format": "time_series", "legendFormat": "50th Percentile", "refId": "B" }, { - "expr": "sum(rate(cortex_query_frontend_retries_sum{cluster=~\"$cluster\", job=~\"($namespace)/((query-frontend.*|cortex|mimir|mimir-read.*))\"}[$__rate_interval])) * 1 / sum(rate(cortex_query_frontend_retries_count{cluster=~\"$cluster\", job=~\"($namespace)/((query-frontend.*|cortex|mimir|mimir-read.*))\"}[$__rate_interval]))", + "expr": "sum(rate(cortex_query_frontend_retries_sum{$read_path_matcher}[$__rate_interval])) * 1 / sum(rate(cortex_query_frontend_retries_count{$read_path_matcher}[$__rate_interval]))", "format": "time_series", "legendFormat": "Average", "refId": "C" @@ -231,7 +231,7 @@ "span": 3, "targets": [ { - "expr": "sum by(instance) (cortex_query_frontend_queue_length{cluster=~\"$cluster\", job=~\"($namespace)/((query-frontend.*|cortex|mimir|mimir-read.*))\"})", + "expr": "sum by(instance) (cortex_query_frontend_queue_length{$read_path_matcher})", "format": "time_series", "legendFormat": "{{instance}}", "legendLink": null @@ -280,7 +280,7 @@ "span": 3, "targets": [ { - "expr": "sum by(user) (cortex_query_frontend_queue_length{cluster=~\"$cluster\", job=~\"($namespace)/((query-frontend.*|cortex|mimir|mimir-read.*))\"}) > 0", + "expr": "sum by(user) (cortex_query_frontend_queue_length{$read_path_matcher}) > 0", "format": "time_series", "legendFormat": "{{user}}", "legendLink": null @@ -341,19 +341,19 @@ "span": 4, "targets": [ { - "expr": "histogram_quantile(0.99, sum(rate(cortex_query_scheduler_queue_duration_seconds_bucket{cluster=~\"$cluster\", job=~\"($namespace)/((query-scheduler.*|mimir-backend.*))\"}[$__rate_interval])) by (le)) * 1e3", + "expr": "histogram_quantile(0.99, sum(rate(cortex_query_scheduler_queue_duration_seconds_bucket{$read_path_matcher}[$__rate_interval])) by (le)) * 1e3", "format": "time_series", "legendFormat": "99th Percentile", "refId": "A" }, { - "expr": "histogram_quantile(0.50, sum(rate(cortex_query_scheduler_queue_duration_seconds_bucket{cluster=~\"$cluster\", job=~\"($namespace)/((query-scheduler.*|mimir-backend.*))\"}[$__rate_interval])) by (le)) * 1e3", + "expr": "histogram_quantile(0.50, sum(rate(cortex_query_scheduler_queue_duration_seconds_bucket{$read_path_matcher}[$__rate_interval])) by (le)) * 1e3", "format": "time_series", "legendFormat": "50th Percentile", "refId": "B" }, { - "expr": "sum(rate(cortex_query_scheduler_queue_duration_seconds_sum{cluster=~\"$cluster\", job=~\"($namespace)/((query-scheduler.*|mimir-backend.*))\"}[$__rate_interval])) * 1e3 / sum(rate(cortex_query_scheduler_queue_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((query-scheduler.*|mimir-backend.*))\"}[$__rate_interval]))", + "expr": "sum(rate(cortex_query_scheduler_queue_duration_seconds_sum{$read_path_matcher}[$__rate_interval])) * 1e3 / sum(rate(cortex_query_scheduler_queue_duration_seconds_count{$read_path_matcher}[$__rate_interval]))", "format": "time_series", "legendFormat": "Average", "refId": "C" @@ -419,7 +419,7 @@ "span": 4, "targets": [ { - "expr": "sum by(instance) (cortex_query_scheduler_queue_length{cluster=~\"$cluster\", job=~\"($namespace)/((query-scheduler.*|mimir-backend.*))\"})", + "expr": "sum by(instance) (cortex_query_scheduler_queue_length{$read_path_matcher})", "format": "time_series", "legendFormat": "{{instance}}", "legendLink": null @@ -468,7 +468,7 @@ "span": 4, "targets": [ { - "expr": "sum by(user) (cortex_query_scheduler_queue_length{cluster=~\"$cluster\", job=~\"($namespace)/((query-scheduler.*|mimir-backend.*))\"}) > 0", + "expr": "sum by(user) (cortex_query_scheduler_queue_length{$read_path_matcher}) > 0", "format": "time_series", "legendFormat": "{{user}}", "legendLink": null @@ -529,7 +529,7 @@ "span": 4, "targets": [ { - "expr": "sum(rate(cortex_frontend_split_queries_total{cluster=~\"$cluster\", job=~\"($namespace)/((query-frontend.*|cortex|mimir|mimir-read.*))\"}[$__rate_interval])) / sum(rate(cortex_frontend_query_range_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((query-frontend.*|cortex|mimir|mimir-read.*))\", method=\"split_by_interval_and_results_cache\"}[$__rate_interval]))", + "expr": "sum(rate(cortex_frontend_split_queries_total{$read_path_matcher}[$__rate_interval])) / sum(rate(cortex_frontend_query_range_duration_seconds_count{$read_path_matcher, method=\"split_by_interval_and_results_cache\"}[$__rate_interval]))", "format": "time_series", "legendFormat": "splitting rate", "legendLink": null @@ -578,7 +578,7 @@ "span": 4, "targets": [ { - "expr": "# Query the new metric introduced in Mimir 2.10.\n(\n sum by(request_type) (rate(cortex_frontend_query_result_cache_hits_total{cluster=~\"$cluster\", job=~\"($namespace)/((query-frontend.*|cortex|mimir|mimir-read.*))\"}[$__rate_interval]))\n /\n sum by(request_type) (rate(cortex_frontend_query_result_cache_requests_total{cluster=~\"$cluster\", job=~\"($namespace)/((query-frontend.*|cortex|mimir|mimir-read.*))\"}[$__rate_interval]))\n)\n# Otherwise fallback to the previous general-purpose metrics.\nor\n(\n label_replace(\n # Query metrics before and after dskit cache refactor.\n sum (\n rate(thanos_cache_memcached_hits_total{name=\"frontend-cache\", cluster=~\"$cluster\", job=~\"($namespace)/((query-frontend.*|cortex|mimir|mimir-read.*))\"}[$__rate_interval])\n or ignoring(backend)\n rate(thanos_cache_hits_total{name=\"frontend-cache\", cluster=~\"$cluster\", job=~\"($namespace)/((query-frontend.*|cortex|mimir|mimir-read.*))\"}[$__rate_interval])\n )\n /\n sum (\n rate(thanos_cache_memcached_requests_total{name=~\"frontend-cache\", cluster=~\"$cluster\", job=~\"($namespace)/((query-frontend.*|cortex|mimir|mimir-read.*))\"}[$__rate_interval])\n or ignoring(backend)\n rate(thanos_cache_requests_total{name=~\"frontend-cache\", cluster=~\"$cluster\", job=~\"($namespace)/((query-frontend.*|cortex|mimir|mimir-read.*))\"}[$__rate_interval])\n ),\n \"request_type\", \"query_range\", \"\", \"\")\n)\n", + "expr": "# Query the new metric introduced in Mimir 2.10.\n(\n sum by(request_type) (rate(cortex_frontend_query_result_cache_hits_total{$read_path_matcher}[$__rate_interval]))\n /\n sum by(request_type) (rate(cortex_frontend_query_result_cache_requests_total{$read_path_matcher}[$__rate_interval]))\n)\n# Otherwise fallback to the previous general-purpose metrics.\nor\n(\n label_replace(\n # Query metrics before and after dskit cache refactor.\n sum (\n rate(thanos_cache_memcached_hits_total{name=\"frontend-cache\", $read_path_matcher}[$__rate_interval])\n or ignoring(backend)\n rate(thanos_cache_hits_total{name=\"frontend-cache\", $read_path_matcher}[$__rate_interval])\n )\n /\n sum (\n rate(thanos_cache_memcached_requests_total{name=~\"frontend-cache\", $read_path_matcher}[$__rate_interval])\n or ignoring(backend)\n rate(thanos_cache_requests_total{name=~\"frontend-cache\", $read_path_matcher}[$__rate_interval])\n ),\n \"request_type\", \"query_range\", \"\", \"\")\n)\n", "format": "time_series", "legendFormat": "{{request_type}}", "legendLink": null @@ -628,7 +628,7 @@ "span": 4, "targets": [ { - "expr": "sum(rate(cortex_frontend_query_result_cache_skipped_total{cluster=~\"$cluster\", job=~\"($namespace)/((query-frontend.*|cortex|mimir|mimir-read.*))\"}[$__rate_interval])) by (reason) /\nignoring (reason) group_left sum(rate(cortex_frontend_query_result_cache_attempted_total{cluster=~\"$cluster\", job=~\"($namespace)/((query-frontend.*|cortex|mimir|mimir-read.*))\"}[$__rate_interval]))\n", + "expr": "sum(rate(cortex_frontend_query_result_cache_skipped_total{$read_path_matcher}[$__rate_interval])) by (reason) /\nignoring (reason) group_left sum(rate(cortex_frontend_query_result_cache_attempted_total{$read_path_matcher}[$__rate_interval]))\n", "format": "time_series", "legendFormat": "{{reason}}", "legendLink": null @@ -690,7 +690,7 @@ "span": 6, "targets": [ { - "expr": "sum(rate(cortex_frontend_query_sharding_rewrites_succeeded_total{cluster=~\"$cluster\", job=~\"($namespace)/((query-frontend.*|cortex|mimir|mimir-read.*))\"}[$__rate_interval])) /\nsum(rate(cortex_frontend_query_sharding_rewrites_attempted_total{cluster=~\"$cluster\", job=~\"($namespace)/((query-frontend.*|cortex|mimir|mimir-read.*))\"}[$__rate_interval]))\n", + "expr": "sum(rate(cortex_frontend_query_sharding_rewrites_succeeded_total{$read_path_matcher}[$__rate_interval])) /\nsum(rate(cortex_frontend_query_sharding_rewrites_attempted_total{$read_path_matcher}[$__rate_interval]))\n", "format": "time_series", "legendFormat": "sharded queries ratio", "legendLink": null @@ -740,19 +740,19 @@ "span": 6, "targets": [ { - "expr": "histogram_quantile(0.99, sum(rate(cortex_frontend_sharded_queries_per_query_bucket{cluster=~\"$cluster\", job=~\"($namespace)/((query-frontend.*|cortex|mimir|mimir-read.*))\"}[$__rate_interval])) by (le)) * 1", + "expr": "histogram_quantile(0.99, sum(rate(cortex_frontend_sharded_queries_per_query_bucket{$read_path_matcher}[$__rate_interval])) by (le)) * 1", "format": "time_series", "legendFormat": "99th Percentile", "refId": "A" }, { - "expr": "histogram_quantile(0.50, sum(rate(cortex_frontend_sharded_queries_per_query_bucket{cluster=~\"$cluster\", job=~\"($namespace)/((query-frontend.*|cortex|mimir|mimir-read.*))\"}[$__rate_interval])) by (le)) * 1", + "expr": "histogram_quantile(0.50, sum(rate(cortex_frontend_sharded_queries_per_query_bucket{$read_path_matcher}[$__rate_interval])) by (le)) * 1", "format": "time_series", "legendFormat": "50th Percentile", "refId": "B" }, { - "expr": "sum(rate(cortex_frontend_sharded_queries_per_query_sum{cluster=~\"$cluster\", job=~\"($namespace)/((query-frontend.*|cortex|mimir|mimir-read.*))\"}[$__rate_interval])) * 1 / sum(rate(cortex_frontend_sharded_queries_per_query_count{cluster=~\"$cluster\", job=~\"($namespace)/((query-frontend.*|cortex|mimir|mimir-read.*))\"}[$__rate_interval]))", + "expr": "sum(rate(cortex_frontend_sharded_queries_per_query_sum{$read_path_matcher}[$__rate_interval])) * 1 / sum(rate(cortex_frontend_sharded_queries_per_query_count{$read_path_matcher}[$__rate_interval]))", "format": "time_series", "legendFormat": "Average", "refId": "C" @@ -1026,19 +1026,19 @@ "span": 3, "targets": [ { - "expr": "histogram_quantile(0.99, sum(rate(cortex_querier_storegateway_instances_hit_per_query_bucket{cluster=~\"$cluster\", job=~\"($namespace)/((querier.*|cortex|mimir|mimir-read.*))\"}[$__rate_interval])) by (le)) * 1", + "expr": "histogram_quantile(0.99, sum(rate(cortex_querier_storegateway_instances_hit_per_query_bucket{$read_path_matcher}[$__rate_interval])) by (le)) * 1", "format": "time_series", "legendFormat": "99th Percentile", "refId": "A" }, { - "expr": "histogram_quantile(0.50, sum(rate(cortex_querier_storegateway_instances_hit_per_query_bucket{cluster=~\"$cluster\", job=~\"($namespace)/((querier.*|cortex|mimir|mimir-read.*))\"}[$__rate_interval])) by (le)) * 1", + "expr": "histogram_quantile(0.50, sum(rate(cortex_querier_storegateway_instances_hit_per_query_bucket{$read_path_matcher}[$__rate_interval])) by (le)) * 1", "format": "time_series", "legendFormat": "50th Percentile", "refId": "B" }, { - "expr": "sum(rate(cortex_querier_storegateway_instances_hit_per_query_sum{cluster=~\"$cluster\", job=~\"($namespace)/((querier.*|cortex|mimir|mimir-read.*))\"}[$__rate_interval])) * 1 / sum(rate(cortex_querier_storegateway_instances_hit_per_query_count{cluster=~\"$cluster\", job=~\"($namespace)/((querier.*|cortex|mimir|mimir-read.*))\"}[$__rate_interval]))", + "expr": "sum(rate(cortex_querier_storegateway_instances_hit_per_query_sum{$read_path_matcher}[$__rate_interval])) * 1 / sum(rate(cortex_querier_storegateway_instances_hit_per_query_count{$read_path_matcher}[$__rate_interval]))", "format": "time_series", "legendFormat": "Average", "refId": "C" @@ -1105,19 +1105,19 @@ "span": 3, "targets": [ { - "expr": "histogram_quantile(0.99, sum(rate(cortex_querier_storegateway_refetches_per_query_bucket{cluster=~\"$cluster\", job=~\"($namespace)/((querier.*|cortex|mimir|mimir-read.*))\"}[$__rate_interval])) by (le)) * 1", + "expr": "histogram_quantile(0.99, sum(rate(cortex_querier_storegateway_refetches_per_query_bucket{$read_path_matcher}[$__rate_interval])) by (le)) * 1", "format": "time_series", "legendFormat": "99th Percentile", "refId": "A" }, { - "expr": "histogram_quantile(0.50, sum(rate(cortex_querier_storegateway_refetches_per_query_bucket{cluster=~\"$cluster\", job=~\"($namespace)/((querier.*|cortex|mimir|mimir-read.*))\"}[$__rate_interval])) by (le)) * 1", + "expr": "histogram_quantile(0.50, sum(rate(cortex_querier_storegateway_refetches_per_query_bucket{$read_path_matcher}[$__rate_interval])) by (le)) * 1", "format": "time_series", "legendFormat": "50th Percentile", "refId": "B" }, { - "expr": "sum(rate(cortex_querier_storegateway_refetches_per_query_sum{cluster=~\"$cluster\", job=~\"($namespace)/((querier.*|cortex|mimir|mimir-read.*))\"}[$__rate_interval])) * 1 / sum(rate(cortex_querier_storegateway_refetches_per_query_count{cluster=~\"$cluster\", job=~\"($namespace)/((querier.*|cortex|mimir|mimir-read.*))\"}[$__rate_interval]))", + "expr": "sum(rate(cortex_querier_storegateway_refetches_per_query_sum{$read_path_matcher}[$__rate_interval])) * 1 / sum(rate(cortex_querier_storegateway_refetches_per_query_count{$read_path_matcher}[$__rate_interval]))", "format": "time_series", "legendFormat": "Average", "refId": "C" @@ -1201,7 +1201,7 @@ "span": 3, "targets": [ { - "expr": "sum(rate(cortex_querier_blocks_consistency_checks_failed_total{cluster=~\"$cluster\", job=~\"($namespace)/((querier.*|cortex|mimir|mimir-read.*))\"}[$__rate_interval])) / sum(rate(cortex_querier_blocks_consistency_checks_total{cluster=~\"$cluster\", job=~\"($namespace)/((querier.*|cortex|mimir|mimir-read.*))\"}[$__rate_interval]))", + "expr": "sum(rate(cortex_querier_blocks_consistency_checks_failed_total{$read_path_matcher}[$__rate_interval])) / sum(rate(cortex_querier_blocks_consistency_checks_total{$read_path_matcher}[$__rate_interval]))", "format": "time_series", "legendFormat": "Failure Rate", "legendLink": null @@ -1251,7 +1251,7 @@ "span": 3, "targets": [ { - "expr": "sum by (reason) (rate(cortex_querier_queries_rejected_total{cluster=~\"$cluster\", job=~\"($namespace)/((querier.*|cortex|mimir|mimir-read.*))\"}[$__rate_interval])) / ignoring (reason) group_left sum(rate(cortex_querier_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((querier.*|cortex|mimir|mimir-read.*))\", route=~\"(prometheus|api_prom)_api_v1_query(_range)?\"}[$__rate_interval]))", + "expr": "sum by (reason) (rate(cortex_querier_queries_rejected_total{$read_path_matcher}[$__rate_interval])) / ignoring (reason) group_left sum(rate(cortex_querier_request_duration_seconds_count{$read_path_matcher, route=~\"(prometheus|api_prom)_api_v1_query(_range)?\"}[$__rate_interval]))", "format": "time_series", "legendFormat": "{{reason}}", "legendLink": null @@ -1311,19 +1311,19 @@ "span": 4, "targets": [ { - "expr": "max(cortex_bucket_index_loaded{cluster=~\"$cluster\", job=~\"($namespace)/((querier.*|cortex|mimir|mimir-read.*))\"})", + "expr": "max(cortex_bucket_index_loaded{$read_path_matcher})", "format": "time_series", "legendFormat": "Max", "legendLink": null }, { - "expr": "min(cortex_bucket_index_loaded{cluster=~\"$cluster\", job=~\"($namespace)/((querier.*|cortex|mimir|mimir-read.*))\"})", + "expr": "min(cortex_bucket_index_loaded{$read_path_matcher})", "format": "time_series", "legendFormat": "Min", "legendLink": null }, { - "expr": "avg(cortex_bucket_index_loaded{cluster=~\"$cluster\", job=~\"($namespace)/((querier.*|cortex|mimir|mimir-read.*))\"})", + "expr": "avg(cortex_bucket_index_loaded{$read_path_matcher})", "format": "time_series", "legendFormat": "Average", "legendLink": null @@ -1402,13 +1402,13 @@ "span": 4, "targets": [ { - "expr": "sum(rate(cortex_bucket_index_loads_total{cluster=~\"$cluster\", job=~\"($namespace)/((querier.*|cortex|mimir|mimir-read.*))\"}[$__rate_interval])) - sum(rate(cortex_bucket_index_load_failures_total{cluster=~\"$cluster\", job=~\"($namespace)/((querier.*|cortex|mimir|mimir-read.*))\"}[$__rate_interval]))", + "expr": "sum(rate(cortex_bucket_index_loads_total{$read_path_matcher}[$__rate_interval])) - sum(rate(cortex_bucket_index_load_failures_total{$read_path_matcher}[$__rate_interval]))", "format": "time_series", "legendFormat": "successful", "legendLink": null }, { - "expr": "sum(rate(cortex_bucket_index_load_failures_total{cluster=~\"$cluster\", job=~\"($namespace)/((querier.*|cortex|mimir|mimir-read.*))\"}[$__rate_interval]))", + "expr": "sum(rate(cortex_bucket_index_load_failures_total{$read_path_matcher}[$__rate_interval]))", "format": "time_series", "legendFormat": "failed", "legendLink": null @@ -1457,19 +1457,19 @@ "span": 4, "targets": [ { - "expr": "histogram_quantile(0.99, sum(rate(cortex_bucket_index_load_duration_seconds_bucket{cluster=~\"$cluster\", job=~\"($namespace)/((querier.*|cortex|mimir|mimir-read.*))\"}[$__rate_interval])) by (le)) * 1e3", + "expr": "histogram_quantile(0.99, sum(rate(cortex_bucket_index_load_duration_seconds_bucket{$read_path_matcher}[$__rate_interval])) by (le)) * 1e3", "format": "time_series", "legendFormat": "99th Percentile", "refId": "A" }, { - "expr": "histogram_quantile(0.50, sum(rate(cortex_bucket_index_load_duration_seconds_bucket{cluster=~\"$cluster\", job=~\"($namespace)/((querier.*|cortex|mimir|mimir-read.*))\"}[$__rate_interval])) by (le)) * 1e3", + "expr": "histogram_quantile(0.50, sum(rate(cortex_bucket_index_load_duration_seconds_bucket{$read_path_matcher}[$__rate_interval])) by (le)) * 1e3", "format": "time_series", "legendFormat": "50th Percentile", "refId": "B" }, { - "expr": "sum(rate(cortex_bucket_index_load_duration_seconds_sum{cluster=~\"$cluster\", job=~\"($namespace)/((querier.*|cortex|mimir|mimir-read.*))\"}[$__rate_interval])) * 1e3 / sum(rate(cortex_bucket_index_load_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((querier.*|cortex|mimir|mimir-read.*))\"}[$__rate_interval]))", + "expr": "sum(rate(cortex_bucket_index_load_duration_seconds_sum{$read_path_matcher}[$__rate_interval])) * 1e3 / sum(rate(cortex_bucket_index_load_duration_seconds_count{$read_path_matcher}[$__rate_interval]))", "format": "time_series", "legendFormat": "Average", "refId": "C" @@ -2525,6 +2525,39 @@ "skipUrlSync": false, "type": "custom", "useTags": false + }, + { + "current": { + "selected": true, + "text": "All", + "value": "cluster=~\"$cluster\"\\, job=~\"($namespace)/((cortex|mimir|mimir-backend.*|mimir-read.*|querier.*|query-frontend.*|query-scheduler.*|ruler-querier.*|ruler-query-frontend.*|ruler-query-scheduler.*))\"" + }, + "hide": 0, + "includeAll": false, + "label": "Read path", + "multi": false, + "name": "read_path_matcher", + "options": [ + { + "selected": true, + "text": "All", + "value": "cluster=~\"$cluster\"\\, job=~\"($namespace)/((cortex|mimir|mimir-backend.*|mimir-read.*|querier.*|query-frontend.*|query-scheduler.*|ruler-querier.*|ruler-query-frontend.*|ruler-query-scheduler.*))\"" + }, + { + "selected": false, + "text": "Main", + "value": "cluster=~\"$cluster\"\\, job=~\"($namespace)/((cortex|mimir|mimir-backend.*|mimir-read.*|querier.*|query-frontend.*|query-scheduler.*))\"" + }, + { + "selected": false, + "text": "Remote ruler", + "value": "cluster=~\"$cluster\"\\, job=~\"($namespace)/((ruler-querier.*|ruler-query-frontend.*|ruler-query-scheduler.*))\"" + } + ], + "query": "All : cluster=~\"$cluster\"\\, job=~\"($namespace)/((cortex|mimir|mimir-backend.*|mimir-read.*|querier.*|query-frontend.*|query-scheduler.*|ruler-querier.*|ruler-query-frontend.*|ruler-query-scheduler.*))\",Main : cluster=~\"$cluster\"\\, job=~\"($namespace)/((cortex|mimir|mimir-backend.*|mimir-read.*|querier.*|query-frontend.*|query-scheduler.*))\",Remote ruler : cluster=~\"$cluster\"\\, job=~\"($namespace)/((ruler-querier.*|ruler-query-frontend.*|ruler-query-scheduler.*))\"", + "skipUrlSync": false, + "type": "custom", + "useTags": false } ] }, diff --git a/operations/mimir-mixin-compiled-baremetal/dashboards/mimir-tenants.json b/operations/mimir-mixin-compiled-baremetal/dashboards/mimir-tenants.json index 604c0e9e02f..3e23c26db2f 100644 --- a/operations/mimir-mixin-compiled-baremetal/dashboards/mimir-tenants.json +++ b/operations/mimir-mixin-compiled-baremetal/dashboards/mimir-tenants.json @@ -2653,6 +2653,7 @@ }, "hide": 0, "includeAll": false, + "label": "limit", "multi": false, "name": "limit", "options": [ @@ -2682,7 +2683,10 @@ "value": "1000" } ], - "type": "custom" + "query": "10 : 10,50 : 50,100 : 100,500 : 500,1000 : 1000", + "skipUrlSync": false, + "type": "custom", + "useTags": false } ] }, diff --git a/operations/mimir-mixin-compiled-baremetal/dashboards/mimir-top-tenants.json b/operations/mimir-mixin-compiled-baremetal/dashboards/mimir-top-tenants.json index e649a9d3b54..83ca6a7bd5f 100644 --- a/operations/mimir-mixin-compiled-baremetal/dashboards/mimir-top-tenants.json +++ b/operations/mimir-mixin-compiled-baremetal/dashboards/mimir-top-tenants.json @@ -1679,6 +1679,7 @@ }, "hide": 0, "includeAll": false, + "label": "limit", "multi": false, "name": "limit", "options": [ @@ -1698,7 +1699,10 @@ "value": "100" } ], - "type": "custom" + "query": "10 : 10,50 : 50,100 : 100", + "skipUrlSync": false, + "type": "custom", + "useTags": false } ] }, diff --git a/operations/mimir-mixin-compiled/dashboards/mimir-queries.json b/operations/mimir-mixin-compiled/dashboards/mimir-queries.json index db27c339497..bd71fcabe50 100644 --- a/operations/mimir-mixin-compiled/dashboards/mimir-queries.json +++ b/operations/mimir-mixin-compiled/dashboards/mimir-queries.json @@ -74,19 +74,19 @@ "span": 3, "targets": [ { - "expr": "histogram_quantile(0.99, sum(rate(cortex_query_frontend_queue_duration_seconds_bucket{cluster=~\"$cluster\", job=~\"($namespace)/((query-frontend.*|cortex|mimir|mimir-read.*))\"}[$__rate_interval])) by (le)) * 1e3", + "expr": "histogram_quantile(0.99, sum(rate(cortex_query_frontend_queue_duration_seconds_bucket{$read_path_matcher}[$__rate_interval])) by (le)) * 1e3", "format": "time_series", "legendFormat": "99th Percentile", "refId": "A" }, { - "expr": "histogram_quantile(0.50, sum(rate(cortex_query_frontend_queue_duration_seconds_bucket{cluster=~\"$cluster\", job=~\"($namespace)/((query-frontend.*|cortex|mimir|mimir-read.*))\"}[$__rate_interval])) by (le)) * 1e3", + "expr": "histogram_quantile(0.50, sum(rate(cortex_query_frontend_queue_duration_seconds_bucket{$read_path_matcher}[$__rate_interval])) by (le)) * 1e3", "format": "time_series", "legendFormat": "50th Percentile", "refId": "B" }, { - "expr": "sum(rate(cortex_query_frontend_queue_duration_seconds_sum{cluster=~\"$cluster\", job=~\"($namespace)/((query-frontend.*|cortex|mimir|mimir-read.*))\"}[$__rate_interval])) * 1e3 / sum(rate(cortex_query_frontend_queue_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((query-frontend.*|cortex|mimir|mimir-read.*))\"}[$__rate_interval]))", + "expr": "sum(rate(cortex_query_frontend_queue_duration_seconds_sum{$read_path_matcher}[$__rate_interval])) * 1e3 / sum(rate(cortex_query_frontend_queue_duration_seconds_count{$read_path_matcher}[$__rate_interval]))", "format": "time_series", "legendFormat": "Average", "refId": "C" @@ -153,19 +153,19 @@ "span": 3, "targets": [ { - "expr": "histogram_quantile(0.99, sum(rate(cortex_query_frontend_retries_bucket{cluster=~\"$cluster\", job=~\"($namespace)/((query-frontend.*|cortex|mimir|mimir-read.*))\"}[$__rate_interval])) by (le)) * 1", + "expr": "histogram_quantile(0.99, sum(rate(cortex_query_frontend_retries_bucket{$read_path_matcher}[$__rate_interval])) by (le)) * 1", "format": "time_series", "legendFormat": "99th Percentile", "refId": "A" }, { - "expr": "histogram_quantile(0.50, sum(rate(cortex_query_frontend_retries_bucket{cluster=~\"$cluster\", job=~\"($namespace)/((query-frontend.*|cortex|mimir|mimir-read.*))\"}[$__rate_interval])) by (le)) * 1", + "expr": "histogram_quantile(0.50, sum(rate(cortex_query_frontend_retries_bucket{$read_path_matcher}[$__rate_interval])) by (le)) * 1", "format": "time_series", "legendFormat": "50th Percentile", "refId": "B" }, { - "expr": "sum(rate(cortex_query_frontend_retries_sum{cluster=~\"$cluster\", job=~\"($namespace)/((query-frontend.*|cortex|mimir|mimir-read.*))\"}[$__rate_interval])) * 1 / sum(rate(cortex_query_frontend_retries_count{cluster=~\"$cluster\", job=~\"($namespace)/((query-frontend.*|cortex|mimir|mimir-read.*))\"}[$__rate_interval]))", + "expr": "sum(rate(cortex_query_frontend_retries_sum{$read_path_matcher}[$__rate_interval])) * 1 / sum(rate(cortex_query_frontend_retries_count{$read_path_matcher}[$__rate_interval]))", "format": "time_series", "legendFormat": "Average", "refId": "C" @@ -231,7 +231,7 @@ "span": 3, "targets": [ { - "expr": "sum by(pod) (cortex_query_frontend_queue_length{cluster=~\"$cluster\", job=~\"($namespace)/((query-frontend.*|cortex|mimir|mimir-read.*))\"})", + "expr": "sum by(pod) (cortex_query_frontend_queue_length{$read_path_matcher})", "format": "time_series", "legendFormat": "{{pod}}", "legendLink": null @@ -280,7 +280,7 @@ "span": 3, "targets": [ { - "expr": "sum by(user) (cortex_query_frontend_queue_length{cluster=~\"$cluster\", job=~\"($namespace)/((query-frontend.*|cortex|mimir|mimir-read.*))\"}) > 0", + "expr": "sum by(user) (cortex_query_frontend_queue_length{$read_path_matcher}) > 0", "format": "time_series", "legendFormat": "{{user}}", "legendLink": null @@ -341,19 +341,19 @@ "span": 4, "targets": [ { - "expr": "histogram_quantile(0.99, sum(rate(cortex_query_scheduler_queue_duration_seconds_bucket{cluster=~\"$cluster\", job=~\"($namespace)/((query-scheduler.*|mimir-backend.*))\"}[$__rate_interval])) by (le)) * 1e3", + "expr": "histogram_quantile(0.99, sum(rate(cortex_query_scheduler_queue_duration_seconds_bucket{$read_path_matcher}[$__rate_interval])) by (le)) * 1e3", "format": "time_series", "legendFormat": "99th Percentile", "refId": "A" }, { - "expr": "histogram_quantile(0.50, sum(rate(cortex_query_scheduler_queue_duration_seconds_bucket{cluster=~\"$cluster\", job=~\"($namespace)/((query-scheduler.*|mimir-backend.*))\"}[$__rate_interval])) by (le)) * 1e3", + "expr": "histogram_quantile(0.50, sum(rate(cortex_query_scheduler_queue_duration_seconds_bucket{$read_path_matcher}[$__rate_interval])) by (le)) * 1e3", "format": "time_series", "legendFormat": "50th Percentile", "refId": "B" }, { - "expr": "sum(rate(cortex_query_scheduler_queue_duration_seconds_sum{cluster=~\"$cluster\", job=~\"($namespace)/((query-scheduler.*|mimir-backend.*))\"}[$__rate_interval])) * 1e3 / sum(rate(cortex_query_scheduler_queue_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((query-scheduler.*|mimir-backend.*))\"}[$__rate_interval]))", + "expr": "sum(rate(cortex_query_scheduler_queue_duration_seconds_sum{$read_path_matcher}[$__rate_interval])) * 1e3 / sum(rate(cortex_query_scheduler_queue_duration_seconds_count{$read_path_matcher}[$__rate_interval]))", "format": "time_series", "legendFormat": "Average", "refId": "C" @@ -419,7 +419,7 @@ "span": 4, "targets": [ { - "expr": "sum by(pod) (cortex_query_scheduler_queue_length{cluster=~\"$cluster\", job=~\"($namespace)/((query-scheduler.*|mimir-backend.*))\"})", + "expr": "sum by(pod) (cortex_query_scheduler_queue_length{$read_path_matcher})", "format": "time_series", "legendFormat": "{{pod}}", "legendLink": null @@ -468,7 +468,7 @@ "span": 4, "targets": [ { - "expr": "sum by(user) (cortex_query_scheduler_queue_length{cluster=~\"$cluster\", job=~\"($namespace)/((query-scheduler.*|mimir-backend.*))\"}) > 0", + "expr": "sum by(user) (cortex_query_scheduler_queue_length{$read_path_matcher}) > 0", "format": "time_series", "legendFormat": "{{user}}", "legendLink": null @@ -529,7 +529,7 @@ "span": 4, "targets": [ { - "expr": "sum(rate(cortex_frontend_split_queries_total{cluster=~\"$cluster\", job=~\"($namespace)/((query-frontend.*|cortex|mimir|mimir-read.*))\"}[$__rate_interval])) / sum(rate(cortex_frontend_query_range_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((query-frontend.*|cortex|mimir|mimir-read.*))\", method=\"split_by_interval_and_results_cache\"}[$__rate_interval]))", + "expr": "sum(rate(cortex_frontend_split_queries_total{$read_path_matcher}[$__rate_interval])) / sum(rate(cortex_frontend_query_range_duration_seconds_count{$read_path_matcher, method=\"split_by_interval_and_results_cache\"}[$__rate_interval]))", "format": "time_series", "legendFormat": "splitting rate", "legendLink": null @@ -578,7 +578,7 @@ "span": 4, "targets": [ { - "expr": "# Query the new metric introduced in Mimir 2.10.\n(\n sum by(request_type) (rate(cortex_frontend_query_result_cache_hits_total{cluster=~\"$cluster\", job=~\"($namespace)/((query-frontend.*|cortex|mimir|mimir-read.*))\"}[$__rate_interval]))\n /\n sum by(request_type) (rate(cortex_frontend_query_result_cache_requests_total{cluster=~\"$cluster\", job=~\"($namespace)/((query-frontend.*|cortex|mimir|mimir-read.*))\"}[$__rate_interval]))\n)\n# Otherwise fallback to the previous general-purpose metrics.\nor\n(\n label_replace(\n # Query metrics before and after dskit cache refactor.\n sum (\n rate(thanos_cache_memcached_hits_total{name=\"frontend-cache\", cluster=~\"$cluster\", job=~\"($namespace)/((query-frontend.*|cortex|mimir|mimir-read.*))\"}[$__rate_interval])\n or ignoring(backend)\n rate(thanos_cache_hits_total{name=\"frontend-cache\", cluster=~\"$cluster\", job=~\"($namespace)/((query-frontend.*|cortex|mimir|mimir-read.*))\"}[$__rate_interval])\n )\n /\n sum (\n rate(thanos_cache_memcached_requests_total{name=~\"frontend-cache\", cluster=~\"$cluster\", job=~\"($namespace)/((query-frontend.*|cortex|mimir|mimir-read.*))\"}[$__rate_interval])\n or ignoring(backend)\n rate(thanos_cache_requests_total{name=~\"frontend-cache\", cluster=~\"$cluster\", job=~\"($namespace)/((query-frontend.*|cortex|mimir|mimir-read.*))\"}[$__rate_interval])\n ),\n \"request_type\", \"query_range\", \"\", \"\")\n)\n", + "expr": "# Query the new metric introduced in Mimir 2.10.\n(\n sum by(request_type) (rate(cortex_frontend_query_result_cache_hits_total{$read_path_matcher}[$__rate_interval]))\n /\n sum by(request_type) (rate(cortex_frontend_query_result_cache_requests_total{$read_path_matcher}[$__rate_interval]))\n)\n# Otherwise fallback to the previous general-purpose metrics.\nor\n(\n label_replace(\n # Query metrics before and after dskit cache refactor.\n sum (\n rate(thanos_cache_memcached_hits_total{name=\"frontend-cache\", $read_path_matcher}[$__rate_interval])\n or ignoring(backend)\n rate(thanos_cache_hits_total{name=\"frontend-cache\", $read_path_matcher}[$__rate_interval])\n )\n /\n sum (\n rate(thanos_cache_memcached_requests_total{name=~\"frontend-cache\", $read_path_matcher}[$__rate_interval])\n or ignoring(backend)\n rate(thanos_cache_requests_total{name=~\"frontend-cache\", $read_path_matcher}[$__rate_interval])\n ),\n \"request_type\", \"query_range\", \"\", \"\")\n)\n", "format": "time_series", "legendFormat": "{{request_type}}", "legendLink": null @@ -628,7 +628,7 @@ "span": 4, "targets": [ { - "expr": "sum(rate(cortex_frontend_query_result_cache_skipped_total{cluster=~\"$cluster\", job=~\"($namespace)/((query-frontend.*|cortex|mimir|mimir-read.*))\"}[$__rate_interval])) by (reason) /\nignoring (reason) group_left sum(rate(cortex_frontend_query_result_cache_attempted_total{cluster=~\"$cluster\", job=~\"($namespace)/((query-frontend.*|cortex|mimir|mimir-read.*))\"}[$__rate_interval]))\n", + "expr": "sum(rate(cortex_frontend_query_result_cache_skipped_total{$read_path_matcher}[$__rate_interval])) by (reason) /\nignoring (reason) group_left sum(rate(cortex_frontend_query_result_cache_attempted_total{$read_path_matcher}[$__rate_interval]))\n", "format": "time_series", "legendFormat": "{{reason}}", "legendLink": null @@ -690,7 +690,7 @@ "span": 6, "targets": [ { - "expr": "sum(rate(cortex_frontend_query_sharding_rewrites_succeeded_total{cluster=~\"$cluster\", job=~\"($namespace)/((query-frontend.*|cortex|mimir|mimir-read.*))\"}[$__rate_interval])) /\nsum(rate(cortex_frontend_query_sharding_rewrites_attempted_total{cluster=~\"$cluster\", job=~\"($namespace)/((query-frontend.*|cortex|mimir|mimir-read.*))\"}[$__rate_interval]))\n", + "expr": "sum(rate(cortex_frontend_query_sharding_rewrites_succeeded_total{$read_path_matcher}[$__rate_interval])) /\nsum(rate(cortex_frontend_query_sharding_rewrites_attempted_total{$read_path_matcher}[$__rate_interval]))\n", "format": "time_series", "legendFormat": "sharded queries ratio", "legendLink": null @@ -740,19 +740,19 @@ "span": 6, "targets": [ { - "expr": "histogram_quantile(0.99, sum(rate(cortex_frontend_sharded_queries_per_query_bucket{cluster=~\"$cluster\", job=~\"($namespace)/((query-frontend.*|cortex|mimir|mimir-read.*))\"}[$__rate_interval])) by (le)) * 1", + "expr": "histogram_quantile(0.99, sum(rate(cortex_frontend_sharded_queries_per_query_bucket{$read_path_matcher}[$__rate_interval])) by (le)) * 1", "format": "time_series", "legendFormat": "99th Percentile", "refId": "A" }, { - "expr": "histogram_quantile(0.50, sum(rate(cortex_frontend_sharded_queries_per_query_bucket{cluster=~\"$cluster\", job=~\"($namespace)/((query-frontend.*|cortex|mimir|mimir-read.*))\"}[$__rate_interval])) by (le)) * 1", + "expr": "histogram_quantile(0.50, sum(rate(cortex_frontend_sharded_queries_per_query_bucket{$read_path_matcher}[$__rate_interval])) by (le)) * 1", "format": "time_series", "legendFormat": "50th Percentile", "refId": "B" }, { - "expr": "sum(rate(cortex_frontend_sharded_queries_per_query_sum{cluster=~\"$cluster\", job=~\"($namespace)/((query-frontend.*|cortex|mimir|mimir-read.*))\"}[$__rate_interval])) * 1 / sum(rate(cortex_frontend_sharded_queries_per_query_count{cluster=~\"$cluster\", job=~\"($namespace)/((query-frontend.*|cortex|mimir|mimir-read.*))\"}[$__rate_interval]))", + "expr": "sum(rate(cortex_frontend_sharded_queries_per_query_sum{$read_path_matcher}[$__rate_interval])) * 1 / sum(rate(cortex_frontend_sharded_queries_per_query_count{$read_path_matcher}[$__rate_interval]))", "format": "time_series", "legendFormat": "Average", "refId": "C" @@ -1026,19 +1026,19 @@ "span": 3, "targets": [ { - "expr": "histogram_quantile(0.99, sum(rate(cortex_querier_storegateway_instances_hit_per_query_bucket{cluster=~\"$cluster\", job=~\"($namespace)/((querier.*|cortex|mimir|mimir-read.*))\"}[$__rate_interval])) by (le)) * 1", + "expr": "histogram_quantile(0.99, sum(rate(cortex_querier_storegateway_instances_hit_per_query_bucket{$read_path_matcher}[$__rate_interval])) by (le)) * 1", "format": "time_series", "legendFormat": "99th Percentile", "refId": "A" }, { - "expr": "histogram_quantile(0.50, sum(rate(cortex_querier_storegateway_instances_hit_per_query_bucket{cluster=~\"$cluster\", job=~\"($namespace)/((querier.*|cortex|mimir|mimir-read.*))\"}[$__rate_interval])) by (le)) * 1", + "expr": "histogram_quantile(0.50, sum(rate(cortex_querier_storegateway_instances_hit_per_query_bucket{$read_path_matcher}[$__rate_interval])) by (le)) * 1", "format": "time_series", "legendFormat": "50th Percentile", "refId": "B" }, { - "expr": "sum(rate(cortex_querier_storegateway_instances_hit_per_query_sum{cluster=~\"$cluster\", job=~\"($namespace)/((querier.*|cortex|mimir|mimir-read.*))\"}[$__rate_interval])) * 1 / sum(rate(cortex_querier_storegateway_instances_hit_per_query_count{cluster=~\"$cluster\", job=~\"($namespace)/((querier.*|cortex|mimir|mimir-read.*))\"}[$__rate_interval]))", + "expr": "sum(rate(cortex_querier_storegateway_instances_hit_per_query_sum{$read_path_matcher}[$__rate_interval])) * 1 / sum(rate(cortex_querier_storegateway_instances_hit_per_query_count{$read_path_matcher}[$__rate_interval]))", "format": "time_series", "legendFormat": "Average", "refId": "C" @@ -1105,19 +1105,19 @@ "span": 3, "targets": [ { - "expr": "histogram_quantile(0.99, sum(rate(cortex_querier_storegateway_refetches_per_query_bucket{cluster=~\"$cluster\", job=~\"($namespace)/((querier.*|cortex|mimir|mimir-read.*))\"}[$__rate_interval])) by (le)) * 1", + "expr": "histogram_quantile(0.99, sum(rate(cortex_querier_storegateway_refetches_per_query_bucket{$read_path_matcher}[$__rate_interval])) by (le)) * 1", "format": "time_series", "legendFormat": "99th Percentile", "refId": "A" }, { - "expr": "histogram_quantile(0.50, sum(rate(cortex_querier_storegateway_refetches_per_query_bucket{cluster=~\"$cluster\", job=~\"($namespace)/((querier.*|cortex|mimir|mimir-read.*))\"}[$__rate_interval])) by (le)) * 1", + "expr": "histogram_quantile(0.50, sum(rate(cortex_querier_storegateway_refetches_per_query_bucket{$read_path_matcher}[$__rate_interval])) by (le)) * 1", "format": "time_series", "legendFormat": "50th Percentile", "refId": "B" }, { - "expr": "sum(rate(cortex_querier_storegateway_refetches_per_query_sum{cluster=~\"$cluster\", job=~\"($namespace)/((querier.*|cortex|mimir|mimir-read.*))\"}[$__rate_interval])) * 1 / sum(rate(cortex_querier_storegateway_refetches_per_query_count{cluster=~\"$cluster\", job=~\"($namespace)/((querier.*|cortex|mimir|mimir-read.*))\"}[$__rate_interval]))", + "expr": "sum(rate(cortex_querier_storegateway_refetches_per_query_sum{$read_path_matcher}[$__rate_interval])) * 1 / sum(rate(cortex_querier_storegateway_refetches_per_query_count{$read_path_matcher}[$__rate_interval]))", "format": "time_series", "legendFormat": "Average", "refId": "C" @@ -1201,7 +1201,7 @@ "span": 3, "targets": [ { - "expr": "sum(rate(cortex_querier_blocks_consistency_checks_failed_total{cluster=~\"$cluster\", job=~\"($namespace)/((querier.*|cortex|mimir|mimir-read.*))\"}[$__rate_interval])) / sum(rate(cortex_querier_blocks_consistency_checks_total{cluster=~\"$cluster\", job=~\"($namespace)/((querier.*|cortex|mimir|mimir-read.*))\"}[$__rate_interval]))", + "expr": "sum(rate(cortex_querier_blocks_consistency_checks_failed_total{$read_path_matcher}[$__rate_interval])) / sum(rate(cortex_querier_blocks_consistency_checks_total{$read_path_matcher}[$__rate_interval]))", "format": "time_series", "legendFormat": "Failure Rate", "legendLink": null @@ -1251,7 +1251,7 @@ "span": 3, "targets": [ { - "expr": "sum by (reason) (rate(cortex_querier_queries_rejected_total{cluster=~\"$cluster\", job=~\"($namespace)/((querier.*|cortex|mimir|mimir-read.*))\"}[$__rate_interval])) / ignoring (reason) group_left sum(rate(cortex_querier_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((querier.*|cortex|mimir|mimir-read.*))\", route=~\"(prometheus|api_prom)_api_v1_query(_range)?\"}[$__rate_interval]))", + "expr": "sum by (reason) (rate(cortex_querier_queries_rejected_total{$read_path_matcher}[$__rate_interval])) / ignoring (reason) group_left sum(rate(cortex_querier_request_duration_seconds_count{$read_path_matcher, route=~\"(prometheus|api_prom)_api_v1_query(_range)?\"}[$__rate_interval]))", "format": "time_series", "legendFormat": "{{reason}}", "legendLink": null @@ -1311,19 +1311,19 @@ "span": 4, "targets": [ { - "expr": "max(cortex_bucket_index_loaded{cluster=~\"$cluster\", job=~\"($namespace)/((querier.*|cortex|mimir|mimir-read.*))\"})", + "expr": "max(cortex_bucket_index_loaded{$read_path_matcher})", "format": "time_series", "legendFormat": "Max", "legendLink": null }, { - "expr": "min(cortex_bucket_index_loaded{cluster=~\"$cluster\", job=~\"($namespace)/((querier.*|cortex|mimir|mimir-read.*))\"})", + "expr": "min(cortex_bucket_index_loaded{$read_path_matcher})", "format": "time_series", "legendFormat": "Min", "legendLink": null }, { - "expr": "avg(cortex_bucket_index_loaded{cluster=~\"$cluster\", job=~\"($namespace)/((querier.*|cortex|mimir|mimir-read.*))\"})", + "expr": "avg(cortex_bucket_index_loaded{$read_path_matcher})", "format": "time_series", "legendFormat": "Average", "legendLink": null @@ -1402,13 +1402,13 @@ "span": 4, "targets": [ { - "expr": "sum(rate(cortex_bucket_index_loads_total{cluster=~\"$cluster\", job=~\"($namespace)/((querier.*|cortex|mimir|mimir-read.*))\"}[$__rate_interval])) - sum(rate(cortex_bucket_index_load_failures_total{cluster=~\"$cluster\", job=~\"($namespace)/((querier.*|cortex|mimir|mimir-read.*))\"}[$__rate_interval]))", + "expr": "sum(rate(cortex_bucket_index_loads_total{$read_path_matcher}[$__rate_interval])) - sum(rate(cortex_bucket_index_load_failures_total{$read_path_matcher}[$__rate_interval]))", "format": "time_series", "legendFormat": "successful", "legendLink": null }, { - "expr": "sum(rate(cortex_bucket_index_load_failures_total{cluster=~\"$cluster\", job=~\"($namespace)/((querier.*|cortex|mimir|mimir-read.*))\"}[$__rate_interval]))", + "expr": "sum(rate(cortex_bucket_index_load_failures_total{$read_path_matcher}[$__rate_interval]))", "format": "time_series", "legendFormat": "failed", "legendLink": null @@ -1457,19 +1457,19 @@ "span": 4, "targets": [ { - "expr": "histogram_quantile(0.99, sum(rate(cortex_bucket_index_load_duration_seconds_bucket{cluster=~\"$cluster\", job=~\"($namespace)/((querier.*|cortex|mimir|mimir-read.*))\"}[$__rate_interval])) by (le)) * 1e3", + "expr": "histogram_quantile(0.99, sum(rate(cortex_bucket_index_load_duration_seconds_bucket{$read_path_matcher}[$__rate_interval])) by (le)) * 1e3", "format": "time_series", "legendFormat": "99th Percentile", "refId": "A" }, { - "expr": "histogram_quantile(0.50, sum(rate(cortex_bucket_index_load_duration_seconds_bucket{cluster=~\"$cluster\", job=~\"($namespace)/((querier.*|cortex|mimir|mimir-read.*))\"}[$__rate_interval])) by (le)) * 1e3", + "expr": "histogram_quantile(0.50, sum(rate(cortex_bucket_index_load_duration_seconds_bucket{$read_path_matcher}[$__rate_interval])) by (le)) * 1e3", "format": "time_series", "legendFormat": "50th Percentile", "refId": "B" }, { - "expr": "sum(rate(cortex_bucket_index_load_duration_seconds_sum{cluster=~\"$cluster\", job=~\"($namespace)/((querier.*|cortex|mimir|mimir-read.*))\"}[$__rate_interval])) * 1e3 / sum(rate(cortex_bucket_index_load_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((querier.*|cortex|mimir|mimir-read.*))\"}[$__rate_interval]))", + "expr": "sum(rate(cortex_bucket_index_load_duration_seconds_sum{$read_path_matcher}[$__rate_interval])) * 1e3 / sum(rate(cortex_bucket_index_load_duration_seconds_count{$read_path_matcher}[$__rate_interval]))", "format": "time_series", "legendFormat": "Average", "refId": "C" @@ -2525,6 +2525,39 @@ "skipUrlSync": false, "type": "custom", "useTags": false + }, + { + "current": { + "selected": true, + "text": "All", + "value": "cluster=~\"$cluster\"\\, job=~\"($namespace)/((cortex|mimir|mimir-backend.*|mimir-read.*|querier.*|query-frontend.*|query-scheduler.*|ruler-querier.*|ruler-query-frontend.*|ruler-query-scheduler.*))\"" + }, + "hide": 0, + "includeAll": false, + "label": "Read path", + "multi": false, + "name": "read_path_matcher", + "options": [ + { + "selected": true, + "text": "All", + "value": "cluster=~\"$cluster\"\\, job=~\"($namespace)/((cortex|mimir|mimir-backend.*|mimir-read.*|querier.*|query-frontend.*|query-scheduler.*|ruler-querier.*|ruler-query-frontend.*|ruler-query-scheduler.*))\"" + }, + { + "selected": false, + "text": "Main", + "value": "cluster=~\"$cluster\"\\, job=~\"($namespace)/((cortex|mimir|mimir-backend.*|mimir-read.*|querier.*|query-frontend.*|query-scheduler.*))\"" + }, + { + "selected": false, + "text": "Remote ruler", + "value": "cluster=~\"$cluster\"\\, job=~\"($namespace)/((ruler-querier.*|ruler-query-frontend.*|ruler-query-scheduler.*))\"" + } + ], + "query": "All : cluster=~\"$cluster\"\\, job=~\"($namespace)/((cortex|mimir|mimir-backend.*|mimir-read.*|querier.*|query-frontend.*|query-scheduler.*|ruler-querier.*|ruler-query-frontend.*|ruler-query-scheduler.*))\",Main : cluster=~\"$cluster\"\\, job=~\"($namespace)/((cortex|mimir|mimir-backend.*|mimir-read.*|querier.*|query-frontend.*|query-scheduler.*))\",Remote ruler : cluster=~\"$cluster\"\\, job=~\"($namespace)/((ruler-querier.*|ruler-query-frontend.*|ruler-query-scheduler.*))\"", + "skipUrlSync": false, + "type": "custom", + "useTags": false } ] }, diff --git a/operations/mimir-mixin-compiled/dashboards/mimir-tenants.json b/operations/mimir-mixin-compiled/dashboards/mimir-tenants.json index b042c098269..40101675ad2 100644 --- a/operations/mimir-mixin-compiled/dashboards/mimir-tenants.json +++ b/operations/mimir-mixin-compiled/dashboards/mimir-tenants.json @@ -2653,6 +2653,7 @@ }, "hide": 0, "includeAll": false, + "label": "limit", "multi": false, "name": "limit", "options": [ @@ -2682,7 +2683,10 @@ "value": "1000" } ], - "type": "custom" + "query": "10 : 10,50 : 50,100 : 100,500 : 500,1000 : 1000", + "skipUrlSync": false, + "type": "custom", + "useTags": false } ] }, diff --git a/operations/mimir-mixin-compiled/dashboards/mimir-top-tenants.json b/operations/mimir-mixin-compiled/dashboards/mimir-top-tenants.json index 282085397ad..ee8b2e6d155 100644 --- a/operations/mimir-mixin-compiled/dashboards/mimir-top-tenants.json +++ b/operations/mimir-mixin-compiled/dashboards/mimir-top-tenants.json @@ -1679,6 +1679,7 @@ }, "hide": 0, "includeAll": false, + "label": "limit", "multi": false, "name": "limit", "options": [ @@ -1698,7 +1699,10 @@ "value": "100" } ], - "type": "custom" + "query": "10 : 10,50 : 50,100 : 100", + "skipUrlSync": false, + "type": "custom", + "useTags": false } ] }, diff --git a/operations/mimir-mixin-tools/serve/run.sh b/operations/mimir-mixin-tools/serve/run.sh index a94eb3b23e9..549feef6400 100755 --- a/operations/mimir-mixin-tools/serve/run.sh +++ b/operations/mimir-mixin-tools/serve/run.sh @@ -5,7 +5,7 @@ set -e SCRIPT_DIR=$(cd `dirname $0` && pwd) # Ensure we run recent Grafana. -GRAFANA_VERSION=11.0.0 +GRAFANA_VERSION=11.1.3 DOCKER_CONTAINER_NAME="mixin-serve-grafana" DOCKER_OPTS="" diff --git a/operations/mimir-mixin/.lint b/operations/mimir-mixin/.lint index ba1a8492541..95d6d206cee 100644 --- a/operations/mimir-mixin/.lint +++ b/operations/mimir-mixin/.lint @@ -17,9 +17,10 @@ exclusions: - dashboard: Mimir / Top tenants panel: Top $limit users by received exemplars rate in last 5m target-promql-rule: - reason: Skipping in dashboards where the linter parses a Loki query as Prometheus one. + reason: Skipping in dashboards where the linter parses a Loki query as Prometheus one, or we define label matchers as template variables. entries: - dashboard: Mimir / Slow queries + - dashboard: Mimir / Queries template-datasource-rule: reason: We prefer to keep calling "datasource" the Prometheus datasource to keep consistency between dashboards. entries: diff --git a/operations/mimir-mixin/config.libsonnet b/operations/mimir-mixin/config.libsonnet index 6f37127a970..9542769a68f 100644 --- a/operations/mimir-mixin/config.libsonnet +++ b/operations/mimir-mixin/config.libsonnet @@ -88,6 +88,10 @@ alertmanager: ['alertmanager', 'cortex', 'mimir', 'mimir-backend.*'], overrides_exporter: ['overrides-exporter', 'mimir-backend.*'], + // The following are job matchers used to select all components in the read path. + main_read_path: std.uniq(std.sort(self.query_frontend + self.query_scheduler + self.querier)), + remote_ruler_read_path: std.uniq(std.sort(self.ruler_query_frontend + self.ruler_query_scheduler + self.ruler_querier)), + // The following are job matchers used to select all components in a given "path". write: ['distributor.*', 'ingester.*', 'mimir-write.*'], read: ['query-frontend.*', 'querier.*', 'ruler-query-frontend.*', 'ruler-querier.*', 'mimir-read.*'], diff --git a/operations/mimir-mixin/dashboards/dashboard-utils.libsonnet b/operations/mimir-mixin/dashboards/dashboard-utils.libsonnet index 20a316009bb..a63b5a4b8ef 100644 --- a/operations/mimir-mixin/dashboards/dashboard-utils.libsonnet +++ b/operations/mimir-mixin/dashboards/dashboard-utils.libsonnet @@ -126,30 +126,38 @@ local utils = import 'mixin-utils/utils.libsonnet'; addActiveUserSelectorTemplates():: self.addTemplate('user', 'cortex_ingester_active_series{%s=~"$cluster", %s=~"$namespace"}' % [$._config.per_cluster_label, $._config.per_namespace_label], 'user', sort=sortAscending), - addCustomTemplate(name, values, defaultIndex=0):: self { + addCustomTemplate(label, name, options, defaultIndex=0):: self { + // Escape the comma because it's used a separator in the options list. + local escapeValue(v) = std.strReplace(v, ',', '\\,'), + templating+: { - list+: [ - { - name: name, - options: [ - { - selected: v == values[defaultIndex], - text: v, - value: v, - } - for v in values - ], - current: { - selected: true, - text: values[defaultIndex], - value: values[defaultIndex], - }, - type: 'custom', - hide: 0, - includeAll: false, - multi: false, + list+: [{ + current: { + selected: true, + text: options[defaultIndex].label, + value: escapeValue(options[defaultIndex].value), }, - ], + hide: 0, + includeAll: false, + label: label, + multi: false, + name: name, + query: std.join(',', [ + '%s : %s' % [option.label, escapeValue(option.value)] + for option in options + ]), + options: [ + { + selected: option.label == options[defaultIndex].label, + text: option.label, + value: escapeValue(option.value), + } + for option in options + ], + skipUrlSync: false, + type: 'custom', + useTags: false, + }], }, }, }, @@ -1884,7 +1892,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; }, }, - ingestStorageFetchLastProducedOffsetRequestsPanel(jobName):: + ingestStorageFetchLastProducedOffsetRequestsPanel(jobMatcher):: $.timeseriesPanel('Fetch last produced offset requests / sec') + $.panelDescription( 'Fetch last produced offset requests / sec', @@ -1896,10 +1904,10 @@ local utils = import 'mixin-utils/utils.libsonnet'; sum(rate(cortex_ingest_storage_reader_last_produced_offset_requests_total{%s}[$__rate_interval])) - sum(rate(cortex_ingest_storage_reader_last_produced_offset_failures_total{%s}[$__rate_interval])) - ||| % [$.jobMatcher($._config.job_names[jobName]), $.jobMatcher($._config.job_names[jobName])], + ||| % [jobMatcher, jobMatcher], ||| sum(rate(cortex_ingest_storage_reader_last_produced_offset_failures_total{%s}[$__rate_interval])) - ||| % [$.jobMatcher($._config.job_names[jobName])], + ||| % [jobMatcher], ], [ 'successful', @@ -1913,7 +1921,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.aliasColors({ successful: $._colors.success, failed: $._colors.failed }) + $.stack, - ingestStorageFetchLastProducedOffsetLatencyPanel(jobName):: + ingestStorageFetchLastProducedOffsetLatencyPanel(jobMatcher):: $.timeseriesPanel('Fetch last produced offset latency') + $.panelDescription( 'Fetch last produced offset latency', @@ -1923,10 +1931,10 @@ local utils = import 'mixin-utils/utils.libsonnet'; ) + $.queryPanel( [ - 'histogram_avg(sum(rate(cortex_ingest_storage_reader_last_produced_offset_request_duration_seconds{%s}[$__rate_interval])))' % [$.jobMatcher($._config.job_names[jobName])], - 'histogram_quantile(0.99, sum(rate(cortex_ingest_storage_reader_last_produced_offset_request_duration_seconds{%s}[$__rate_interval])))' % [$.jobMatcher($._config.job_names[jobName])], - 'histogram_quantile(0.999, sum(rate(cortex_ingest_storage_reader_last_produced_offset_request_duration_seconds{%s}[$__rate_interval])))' % [$.jobMatcher($._config.job_names[jobName])], - 'histogram_quantile(1.0, sum(rate(cortex_ingest_storage_reader_last_produced_offset_request_duration_seconds{%s}[$__rate_interval])))' % [$.jobMatcher($._config.job_names[jobName])], + 'histogram_avg(sum(rate(cortex_ingest_storage_reader_last_produced_offset_request_duration_seconds{%s}[$__rate_interval])))' % [jobMatcher], + 'histogram_quantile(0.99, sum(rate(cortex_ingest_storage_reader_last_produced_offset_request_duration_seconds{%s}[$__rate_interval])))' % [jobMatcher], + 'histogram_quantile(0.999, sum(rate(cortex_ingest_storage_reader_last_produced_offset_request_duration_seconds{%s}[$__rate_interval])))' % [jobMatcher], + 'histogram_quantile(1.0, sum(rate(cortex_ingest_storage_reader_last_produced_offset_request_duration_seconds{%s}[$__rate_interval])))' % [jobMatcher], ], [ 'avg', @@ -1940,10 +1948,10 @@ local utils = import 'mixin-utils/utils.libsonnet'; }, }, - ingestStorageStrongConsistencyRequestsPanel(jobName):: - // The unit changes whether the metric is exposed from ingesters or other components. In the ingesters it's the + ingestStorageStrongConsistencyRequestsPanel(component, jobMatcher):: + // The unit changes whether the metric is exposed from ingesters (partition-reader) or other components. In the ingesters it's the // requests issued by queriers to ingesters, while in other components it's the actual query. - local unit = if jobName == 'ingester' then 'requests' else 'queries'; + local unit = if component == 'partition-reader' then 'requests' else 'queries'; local title = '%s with strong read consistency / sec' % (std.asciiUpper(std.substr(unit, 0, 1)) + std.substr(unit, 1, std.length(unit) - 1)); $.timeseriesPanel(title) + @@ -1956,13 +1964,13 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.queryPanel( [ ||| - sum(rate(cortex_ingest_storage_strong_consistency_requests_total{%s}[$__rate_interval])) + sum(rate(cortex_ingest_storage_strong_consistency_requests_total{component="%(component)s", %(jobMatcher)s}[$__rate_interval])) - - sum(rate(cortex_ingest_storage_strong_consistency_failures_total{%s}[$__rate_interval])) - ||| % [$.jobMatcher($._config.job_names[jobName]), $.jobMatcher($._config.job_names[jobName])], + sum(rate(cortex_ingest_storage_strong_consistency_failures_total{component="%(component)s", %(jobMatcher)s}[$__rate_interval])) + ||| % { jobMatcher: jobMatcher, component: component }, ||| - sum(rate(cortex_ingest_storage_strong_consistency_failures_total{%s}[$__rate_interval])) - ||| % [$.jobMatcher($._config.job_names[jobName])], + sum(rate(cortex_ingest_storage_strong_consistency_failures_total{component="%(component)s", %(jobMatcher)s}[$__rate_interval])) + ||| % { jobMatcher: jobMatcher, component: component }, ], [ 'successful', @@ -1976,7 +1984,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.aliasColors({ successful: $._colors.success, failed: $._colors.failed }) + $.stack, - ingestStorageStrongConsistencyWaitLatencyPanel(jobName):: + ingestStorageStrongConsistencyWaitLatencyPanel(component, jobMatcher):: $.timeseriesPanel('Strong read consistency queries — wait latency') + $.panelDescription( 'Strong read consistency queries — wait latency', @@ -1984,10 +1992,10 @@ local utils = import 'mixin-utils/utils.libsonnet'; ) + $.queryPanel( [ - 'histogram_avg(sum(rate(cortex_ingest_storage_strong_consistency_wait_duration_seconds{%s}[$__rate_interval])))' % [$.jobMatcher($._config.job_names[jobName])], - 'histogram_quantile(0.99, sum(rate(cortex_ingest_storage_strong_consistency_wait_duration_seconds{%s}[$__rate_interval])))' % [$.jobMatcher($._config.job_names[jobName])], - 'histogram_quantile(0.999, sum(rate(cortex_ingest_storage_strong_consistency_wait_duration_seconds{%s}[$__rate_interval])))' % [$.jobMatcher($._config.job_names[jobName])], - 'histogram_quantile(1.0, sum(rate(cortex_ingest_storage_strong_consistency_wait_duration_seconds{%s}[$__rate_interval])))' % [$.jobMatcher($._config.job_names[jobName])], + 'histogram_avg(sum(rate(cortex_ingest_storage_strong_consistency_wait_duration_seconds{component="%(component)s", %(jobMatcher)s}[$__rate_interval])))' % { component: component, jobMatcher: jobMatcher }, + 'histogram_quantile(0.99, sum(rate(cortex_ingest_storage_strong_consistency_wait_duration_seconds{component="%(component)s", %(jobMatcher)s}[$__rate_interval])))' % { component: component, jobMatcher: jobMatcher }, + 'histogram_quantile(0.999, sum(rate(cortex_ingest_storage_strong_consistency_wait_duration_seconds{component="%(component)s", %(jobMatcher)s}[$__rate_interval])))' % { component: component, jobMatcher: jobMatcher }, + 'histogram_quantile(1.0, sum(rate(cortex_ingest_storage_strong_consistency_wait_duration_seconds{component="%(component)s", %(jobMatcher)s}[$__rate_interval])))' % { component: component, jobMatcher: jobMatcher }, ], [ 'avg', diff --git a/operations/mimir-mixin/dashboards/queries.libsonnet b/operations/mimir-mixin/dashboards/queries.libsonnet index e746ff33267..f20a318e32d 100644 --- a/operations/mimir-mixin/dashboards/queries.libsonnet +++ b/operations/mimir-mixin/dashboards/queries.libsonnet @@ -7,28 +7,39 @@ local filename = 'mimir-queries.json'; ($.dashboard('Queries') + { uid: std.md5(filename) }) .addClusterSelectorTemplates() .addShowNativeLatencyVariable() + // This selector allows to switch the read path components queried in this dashboard. + // Since the labels matcher used by this selector is very wide (includes multiple components) + // whenever you want to show panels for a specific component (e.g. query-frontend) you can safely + // use this selector only on metrics that are exposed by the specific component itself + // (e.g. cortex_query_frontend_* metrics are only exposed by query-frontend) or on metrics + // that have other labels that allow to distinguish the component (e.g. component="query-frontend"). + .addCustomTemplate('Read path', 'read_path_matcher', [ + { label: 'All', value: $.jobMatcher(std.uniq(std.sort($._config.job_names.main_read_path + $._config.job_names.remote_ruler_read_path))) }, + { label: 'Main', value: $.jobMatcher($._config.job_names.main_read_path) }, + { label: 'Remote ruler', value: $.jobMatcher($._config.job_names.remote_ruler_read_path) }, + ]) .addRow( $.row('Query-frontend') .addPanel( $.timeseriesPanel('Queue duration') + - $.latencyPanel('cortex_query_frontend_queue_duration_seconds', '{%s}' % $.jobMatcher($._config.job_names.query_frontend)), + $.latencyPanel('cortex_query_frontend_queue_duration_seconds', '{$read_path_matcher}'), ) .addPanel( $.timeseriesPanel('Retries') + - $.latencyPanel('cortex_query_frontend_retries', '{%s}' % $.jobMatcher($._config.job_names.query_frontend), multiplier=1) + + $.latencyPanel('cortex_query_frontend_retries', '{$read_path_matcher}', multiplier=1) + { yaxes: $.yaxes('short') }, ) .addPanel( $.timeseriesPanel('Queue length (per %s)' % $._config.per_instance_label) + $.queryPanel( - 'sum by(%s) (cortex_query_frontend_queue_length{%s})' % [$._config.per_instance_label, $.jobMatcher($._config.job_names.query_frontend)], + 'sum by(%s) (cortex_query_frontend_queue_length{$read_path_matcher})' % [$._config.per_instance_label], '{{%s}}' % $._config.per_instance_label ), ) .addPanel( $.timeseriesPanel('Queue length (per user)') + $.queryPanel( - 'sum by(user) (cortex_query_frontend_queue_length{%s}) > 0' % [$.jobMatcher($._config.job_names.query_frontend)], + 'sum by(user) (cortex_query_frontend_queue_length{$read_path_matcher}) > 0', '{{user}}' ) + { fieldConfig+: { defaults+: { noValue: '0' } } } @@ -38,19 +49,19 @@ local filename = 'mimir-queries.json'; $.row('Query-scheduler') .addPanel( $.timeseriesPanel('Queue duration') + - $.latencyPanel('cortex_query_scheduler_queue_duration_seconds', '{%s}' % $.jobMatcher($._config.job_names.query_scheduler)), + $.latencyPanel('cortex_query_scheduler_queue_duration_seconds', '{$read_path_matcher}'), ) .addPanel( $.timeseriesPanel('Queue length (per %s)' % $._config.per_instance_label) + $.queryPanel( - 'sum by(%s) (cortex_query_scheduler_queue_length{%s})' % [$._config.per_instance_label, $.jobMatcher($._config.job_names.query_scheduler)], + 'sum by(%s) (cortex_query_scheduler_queue_length{$read_path_matcher})' % [$._config.per_instance_label], '{{%s}}' % $._config.per_instance_label ), ) .addPanel( $.timeseriesPanel('Queue length (per user)') + $.queryPanel( - 'sum by(user) (cortex_query_scheduler_queue_length{%s}) > 0' % [$.jobMatcher($._config.job_names.query_scheduler)], + 'sum by(user) (cortex_query_scheduler_queue_length{$read_path_matcher}) > 0', '{{user}}' ) + { fieldConfig+: { defaults+: { noValue: '0' } } } @@ -60,7 +71,7 @@ local filename = 'mimir-queries.json'; $.row('Query-frontend – query splitting and results cache') .addPanel( $.timeseriesPanel('Intervals per query') + - $.queryPanel('sum(rate(cortex_frontend_split_queries_total{%s}[$__rate_interval])) / sum(rate(cortex_frontend_query_range_duration_seconds_count{%s, method="split_by_interval_and_results_cache"}[$__rate_interval]))' % [$.jobMatcher($._config.job_names.query_frontend), $.jobMatcher($._config.job_names.query_frontend)], 'splitting rate') + + $.queryPanel('sum(rate(cortex_frontend_split_queries_total{$read_path_matcher}[$__rate_interval])) / sum(rate(cortex_frontend_query_range_duration_seconds_count{$read_path_matcher, method="split_by_interval_and_results_cache"}[$__rate_interval]))', 'splitting rate') + $.panelDescription( 'Intervals per query', ||| @@ -74,9 +85,9 @@ local filename = 'mimir-queries.json'; ||| # Query the new metric introduced in Mimir 2.10. ( - sum by(request_type) (rate(cortex_frontend_query_result_cache_hits_total{%(frontend)s}[$__rate_interval])) + sum by(request_type) (rate(cortex_frontend_query_result_cache_hits_total{$read_path_matcher}[$__rate_interval])) / - sum by(request_type) (rate(cortex_frontend_query_result_cache_requests_total{%(frontend)s}[$__rate_interval])) + sum by(request_type) (rate(cortex_frontend_query_result_cache_requests_total{$read_path_matcher}[$__rate_interval])) ) # Otherwise fallback to the previous general-purpose metrics. or @@ -84,21 +95,19 @@ local filename = 'mimir-queries.json'; label_replace( # Query metrics before and after dskit cache refactor. sum ( - rate(thanos_cache_memcached_hits_total{name="frontend-cache", %(frontend)s}[$__rate_interval]) + rate(thanos_cache_memcached_hits_total{name="frontend-cache", $read_path_matcher}[$__rate_interval]) or ignoring(backend) - rate(thanos_cache_hits_total{name="frontend-cache", %(frontend)s}[$__rate_interval]) + rate(thanos_cache_hits_total{name="frontend-cache", $read_path_matcher}[$__rate_interval]) ) / sum ( - rate(thanos_cache_memcached_requests_total{name=~"frontend-cache", %(frontend)s}[$__rate_interval]) + rate(thanos_cache_memcached_requests_total{name=~"frontend-cache", $read_path_matcher}[$__rate_interval]) or ignoring(backend) - rate(thanos_cache_requests_total{name=~"frontend-cache", %(frontend)s}[$__rate_interval]) + rate(thanos_cache_requests_total{name=~"frontend-cache", $read_path_matcher}[$__rate_interval]) ), "request_type", "query_range", "", "") ) - ||| % { - frontend: $.jobMatcher($._config.job_names.query_frontend), - }, + |||, '{{request_type}}', ) + { fieldConfig+: { defaults+: { unit: 'percentunit', min: 0, max: 1 } } } @@ -106,9 +115,9 @@ local filename = 'mimir-queries.json'; .addPanel( $.timeseriesPanel('Query results cache skipped') + $.queryPanel(||| - sum(rate(cortex_frontend_query_result_cache_skipped_total{%s}[$__rate_interval])) by (reason) / - ignoring (reason) group_left sum(rate(cortex_frontend_query_result_cache_attempted_total{%s}[$__rate_interval])) - ||| % [$.jobMatcher($._config.job_names.query_frontend), $.jobMatcher($._config.job_names.query_frontend)], '{{reason}}') + + sum(rate(cortex_frontend_query_result_cache_skipped_total{$read_path_matcher}[$__rate_interval])) by (reason) / + ignoring (reason) group_left sum(rate(cortex_frontend_query_result_cache_attempted_total{$read_path_matcher}[$__rate_interval])) + |||, '{{reason}}') + $.stack + { fieldConfig+: { defaults+: { unit: 'percentunit', min: 0, max: 1 } } } + $.panelDescription( @@ -125,9 +134,9 @@ local filename = 'mimir-queries.json'; .addPanel( $.timeseriesPanel('Sharded queries ratio') + $.queryPanel(||| - sum(rate(cortex_frontend_query_sharding_rewrites_succeeded_total{%s}[$__rate_interval])) / - sum(rate(cortex_frontend_query_sharding_rewrites_attempted_total{%s}[$__rate_interval])) - ||| % [$.jobMatcher($._config.job_names.query_frontend), $.jobMatcher($._config.job_names.query_frontend)], 'sharded queries ratio') + + sum(rate(cortex_frontend_query_sharding_rewrites_succeeded_total{$read_path_matcher}[$__rate_interval])) / + sum(rate(cortex_frontend_query_sharding_rewrites_attempted_total{$read_path_matcher}[$__rate_interval])) + |||, 'sharded queries ratio') + { fieldConfig+: { defaults+: { unit: 'percentunit', min: 0, max: 1 } } } + $.panelDescription( 'Sharded queries ratio', @@ -139,7 +148,7 @@ local filename = 'mimir-queries.json'; ) .addPanel( $.timeseriesPanel('Number of sharded queries per query') + - $.latencyPanel('cortex_frontend_sharded_queries_per_query', '{%s}' % $.jobMatcher($._config.job_names.query_frontend), multiplier=1) + + $.latencyPanel('cortex_frontend_sharded_queries_per_query', '{$read_path_matcher}', multiplier=1) + { fieldConfig+: { defaults+: { unit: 'short' } } } + $.panelDescription( 'Number of sharded queries per query', @@ -154,7 +163,7 @@ local filename = 'mimir-queries.json'; $._config.show_ingest_storage_panels, $.row('Query-frontend – strong consistency (ingest storage)') .addPanel( - $.ingestStorageStrongConsistencyRequestsPanel('query_frontend') + $.ingestStorageStrongConsistencyRequestsPanel('query-frontend', '$read_path_matcher') ) .addPanel( $.timeseriesPanel('Queries with strong read consistency ratio') + @@ -167,12 +176,10 @@ local filename = 'mimir-queries.json'; $.queryPanel( [ ||| - # Display the ratio by container so that it gives a quick visual clue whether requests are coming - # from user queries (query-frontend) or rule evaluations (ruler-query-frontend). - sum by(container) (rate(cortex_query_frontend_queries_consistency_total{%s,consistency="strong"}[$__rate_interval])) + sum by(container) (rate(cortex_query_frontend_queries_consistency_total{$read_path_matcher,consistency="strong"}[$__rate_interval])) / - sum by(container) (rate(cortex_query_frontend_queries_total{%s}[$__rate_interval])) - ||| % [$.namespaceMatcher(), $.namespaceMatcher()], + sum by(container) (rate(cortex_query_frontend_queries_total{$read_path_matcher}[$__rate_interval])) + |||, ], ['{{container}}'], ) @@ -180,17 +187,17 @@ local filename = 'mimir-queries.json'; + $.stack ) .addPanel( - $.ingestStorageStrongConsistencyWaitLatencyPanel('query_frontend') + $.ingestStorageStrongConsistencyWaitLatencyPanel('query-frontend', '$read_path_matcher') ) ) .addRowIf( $._config.show_ingest_storage_panels, $.row('') .addPanel( - $.ingestStorageFetchLastProducedOffsetRequestsPanel('query_frontend') + $.ingestStorageFetchLastProducedOffsetRequestsPanel('$read_path_matcher') ) .addPanel( - $.ingestStorageFetchLastProducedOffsetLatencyPanel('query_frontend') + $.ingestStorageFetchLastProducedOffsetLatencyPanel('$read_path_matcher') ) ) .addRow( @@ -215,7 +222,7 @@ local filename = 'mimir-queries.json'; $._config.show_ingest_storage_panels, ($.row('Ingester – strong consistency (ingest storage)')) .addPanel( - $.ingestStorageStrongConsistencyRequestsPanel('ingester') + $.ingestStorageStrongConsistencyRequestsPanel('partition-reader', $.jobMatcher($._config.job_names.ingester)) ) .addPanel( $.timeseriesPanel('Requests with strong read consistency ratio') + @@ -253,17 +260,17 @@ local filename = 'mimir-queries.json'; + $.stack ) .addPanel( - $.ingestStorageStrongConsistencyWaitLatencyPanel('ingester'), + $.ingestStorageStrongConsistencyWaitLatencyPanel('partition-reader', $.jobMatcher($._config.job_names.ingester)), ) ) .addRowIf( $._config.show_ingest_storage_panels, $.row('') .addPanel( - $.ingestStorageFetchLastProducedOffsetRequestsPanel('ingester'), + $.ingestStorageFetchLastProducedOffsetRequestsPanel($.jobMatcher($._config.job_names.ingester)), ) .addPanel( - $.ingestStorageFetchLastProducedOffsetLatencyPanel('ingester'), + $.ingestStorageFetchLastProducedOffsetLatencyPanel($.jobMatcher($._config.job_names.ingester)), ) .addPanel( $.ingestStorageIngesterEndToEndLatencyWhenRunningPanel(), @@ -273,17 +280,17 @@ local filename = 'mimir-queries.json'; $.row('Querier') .addPanel( $.timeseriesPanel('Number of store-gateways hit per query') + - $.latencyPanel('cortex_querier_storegateway_instances_hit_per_query', '{%s}' % $.jobMatcher($._config.job_names.querier), multiplier=1) + + $.latencyPanel('cortex_querier_storegateway_instances_hit_per_query', '{$read_path_matcher}', multiplier=1) + { fieldConfig+: { defaults+: { unit: 'short' } } }, ) .addPanel( $.timeseriesPanel('Refetches of missing blocks per query') + - $.latencyPanel('cortex_querier_storegateway_refetches_per_query', '{%s}' % $.jobMatcher($._config.job_names.querier), multiplier=1) + + $.latencyPanel('cortex_querier_storegateway_refetches_per_query', '{$read_path_matcher}', multiplier=1) + { fieldConfig+: { defaults+: { unit: 'short' } } }, ) .addPanel( $.timeseriesPanel('Consistency checks failed') + - $.failurePanel('sum(rate(cortex_querier_blocks_consistency_checks_failed_total{%s}[$__rate_interval])) / sum(rate(cortex_querier_blocks_consistency_checks_total{%s}[$__rate_interval]))' % [$.jobMatcher($._config.job_names.querier), $.jobMatcher($._config.job_names.querier)], 'Failure Rate') + + $.failurePanel('sum(rate(cortex_querier_blocks_consistency_checks_failed_total{$read_path_matcher}[$__rate_interval])) / sum(rate(cortex_querier_blocks_consistency_checks_total{$read_path_matcher}[$__rate_interval]))', 'Failure Rate') + { fieldConfig+: { defaults+: { unit: 'percentunit', min: 0, max: 1 } } } + $.panelDescription( 'Consistency checks failed', @@ -294,7 +301,7 @@ local filename = 'mimir-queries.json'; ) .addPanel( $.timeseriesPanel('Rejected queries') + - $.queryPanel('sum by (reason) (rate(cortex_querier_queries_rejected_total{%(job_matcher)s}[$__rate_interval])) / ignoring (reason) group_left sum(rate(cortex_querier_request_duration_seconds_count{%(job_matcher)s, route=~"%(routes_regex)s"}[$__rate_interval]))' % { job_matcher: $.jobMatcher($._config.job_names.querier), routes_regex: $.queries.query_http_routes_regex }, '{{reason}}') + + $.queryPanel('sum by (reason) (rate(cortex_querier_queries_rejected_total{$read_path_matcher}[$__rate_interval])) / ignoring (reason) group_left sum(rate(cortex_querier_request_duration_seconds_count{$read_path_matcher, route=~"%(routes_regex)s"}[$__rate_interval]))' % { routes_regex: $.queries.query_http_routes_regex }, '{{reason}}') + { fieldConfig+: { defaults+: { unit: 'percentunit', min: 0, max: 1 } } } + $.panelDescription( 'Rejected queries', @@ -309,23 +316,23 @@ local filename = 'mimir-queries.json'; .addPanel( $.timeseriesPanel('Bucket indexes loaded (per querier)') + $.queryPanel([ - 'max(cortex_bucket_index_loaded{%s})' % $.jobMatcher($._config.job_names.querier), - 'min(cortex_bucket_index_loaded{%s})' % $.jobMatcher($._config.job_names.querier), - 'avg(cortex_bucket_index_loaded{%s})' % $.jobMatcher($._config.job_names.querier), + 'max(cortex_bucket_index_loaded{$read_path_matcher})', + 'min(cortex_bucket_index_loaded{$read_path_matcher})', + 'avg(cortex_bucket_index_loaded{$read_path_matcher})', ], ['Max', 'Min', 'Average']) + { fieldConfig+: { defaults+: { unit: 'short' } } }, ) .addPanel( $.timeseriesPanel('Bucket indexes load / sec') + $.successFailurePanel( - 'sum(rate(cortex_bucket_index_loads_total{%s}[$__rate_interval])) - sum(rate(cortex_bucket_index_load_failures_total{%s}[$__rate_interval]))' % [$.jobMatcher($._config.job_names.querier), $.jobMatcher($._config.job_names.querier)], - 'sum(rate(cortex_bucket_index_load_failures_total{%s}[$__rate_interval]))' % $.jobMatcher($._config.job_names.querier), + 'sum(rate(cortex_bucket_index_loads_total{$read_path_matcher}[$__rate_interval])) - sum(rate(cortex_bucket_index_load_failures_total{$read_path_matcher}[$__rate_interval]))', + 'sum(rate(cortex_bucket_index_load_failures_total{$read_path_matcher}[$__rate_interval]))', ) + $.stack ) .addPanel( $.timeseriesPanel('Bucket indexes load latency') + - $.latencyPanel('cortex_bucket_index_load_duration_seconds', '{%s}' % $.jobMatcher($._config.job_names.querier)), + $.latencyPanel('cortex_bucket_index_load_duration_seconds', '{$read_path_matcher}'), ) ) .addRow( diff --git a/operations/mimir-mixin/dashboards/tenants.libsonnet b/operations/mimir-mixin/dashboards/tenants.libsonnet index 180d1bae810..daccca2315e 100644 --- a/operations/mimir-mixin/dashboards/tenants.libsonnet +++ b/operations/mimir-mixin/dashboards/tenants.libsonnet @@ -22,7 +22,15 @@ local filename = 'mimir-tenants.json'; ($.dashboard('Tenants') + { uid: std.md5(filename) }) .addClusterSelectorTemplates() .addActiveUserSelectorTemplates() - .addCustomTemplate('limit', ['10', '50', '100', '500', '1000']) + .addCustomTemplate( + 'limit', 'limit', [ + { label: '10', value: '10' }, + { label: '50', value: '50' }, + { label: '100', value: '100' }, + { label: '500', value: '500' }, + { label: '1000', value: '1000' }, + ] + ) .addRowIf( $._config.show_dashboard_descriptions.tenants, ($.row('Tenants dashboard description') { height: '25px', showTitle: false }) diff --git a/operations/mimir-mixin/dashboards/top-tenants.libsonnet b/operations/mimir-mixin/dashboards/top-tenants.libsonnet index 676e0de394d..d8552d7b462 100644 --- a/operations/mimir-mixin/dashboards/top-tenants.libsonnet +++ b/operations/mimir-mixin/dashboards/top-tenants.libsonnet @@ -21,7 +21,13 @@ local filename = 'mimir-top-tenants.json'; assert std.md5(filename) == 'bc6e12d4fe540e4a1785b9d3ca0ffdd9' : 'UID of the dashboard has changed, please update references to dashboard.'; ($.dashboard('Top tenants') + { uid: std.md5(filename) }) .addClusterSelectorTemplates() - .addCustomTemplate('limit', ['10', '50', '100']) + .addCustomTemplate( + 'limit', 'limit', [ + { label: '10', value: '10' }, + { label: '50', value: '50' }, + { label: '100', value: '100' }, + ] + ) .addRowIf( $._config.show_dashboard_descriptions.top_tenants, ($.row('Top tenants dashboard description') { height: '25px', showTitle: false })