From bb914f6a95b50d9e0b23a7d669e31998ecd33f79 Mon Sep 17 00:00:00 2001 From: Joshua Hesketh Date: Wed, 3 Apr 2024 13:52:52 +1100 Subject: [PATCH 01/10] Querytee: Add metric to measure relative backend latency This metric gives us a measurement of how individual queries compare between two backends in terms of latency (or duration). --- tools/querytee/proxy_endpoint.go | 2 + tools/querytee/proxy_endpoint_test.go | 132 ++++++++++++++++++++++++++ tools/querytee/proxy_metrics.go | 7 ++ 3 files changed, 141 insertions(+) diff --git a/tools/querytee/proxy_endpoint.go b/tools/querytee/proxy_endpoint.go index e81a893b99f..4d89ad56fb8 100644 --- a/tools/querytee/proxy_endpoint.go +++ b/tools/querytee/proxy_endpoint.go @@ -224,6 +224,8 @@ func (p *ProxyEndpoint) executeBackendRequests(req *http.Request, resCh chan *ba ) } + relativeDuration := expectedResponse.elapsedTime - actualResponse.elapsedTime + p.metrics.relativeDuration.WithLabelValues(req.Method, p.routeName).Observe(relativeDuration.Seconds()) p.metrics.responsesComparedTotal.WithLabelValues(p.routeName, string(result)).Inc() } } diff --git a/tools/querytee/proxy_endpoint_test.go b/tools/querytee/proxy_endpoint_test.go index 0b549f41b9e..cd5aac61412 100644 --- a/tools/querytee/proxy_endpoint_test.go +++ b/tools/querytee/proxy_endpoint_test.go @@ -446,6 +446,138 @@ func Test_ProxyEndpoint_LogSlowQueries(t *testing.T) { } } +func Test_ProxyEndpoint_RelativeDurationMetric(t *testing.T) { + scenarios := map[string]struct { + preferredResponseLatency time.Duration + secondaryResponseLatency time.Duration + expectedMetric string + }{ + "secondary backend is 2 seconds faster than preferred": { + preferredResponseLatency: 3 * time.Second, + secondaryResponseLatency: 1 * time.Second, + expectedMetric: (` + # HELP cortex_querytee_backend_request_relative_duration_seconds Time (in seconds) of preferred backend less secondary backend. 
+ # TYPE cortex_querytee_backend_request_relative_duration_seconds histogram + cortex_querytee_backend_request_relative_duration_seconds_bucket{method="GET",route="test",le="-100"} 0 + cortex_querytee_backend_request_relative_duration_seconds_bucket{method="GET",route="test",le="-50"} 0 + cortex_querytee_backend_request_relative_duration_seconds_bucket{method="GET",route="test",le="-25"} 0 + cortex_querytee_backend_request_relative_duration_seconds_bucket{method="GET",route="test",le="-10"} 0 + cortex_querytee_backend_request_relative_duration_seconds_bucket{method="GET",route="test",le="-5"} 0 + cortex_querytee_backend_request_relative_duration_seconds_bucket{method="GET",route="test",le="-4"} 0 + cortex_querytee_backend_request_relative_duration_seconds_bucket{method="GET",route="test",le="-3"} 0 + cortex_querytee_backend_request_relative_duration_seconds_bucket{method="GET",route="test",le="-2"} 0 + cortex_querytee_backend_request_relative_duration_seconds_bucket{method="GET",route="test",le="-1.5"} 0 + cortex_querytee_backend_request_relative_duration_seconds_bucket{method="GET",route="test",le="-1"} 0 + cortex_querytee_backend_request_relative_duration_seconds_bucket{method="GET",route="test",le="-0.75"} 0 + cortex_querytee_backend_request_relative_duration_seconds_bucket{method="GET",route="test",le="-0.5"} 0 + cortex_querytee_backend_request_relative_duration_seconds_bucket{method="GET",route="test",le="-0.25"} 0 + cortex_querytee_backend_request_relative_duration_seconds_bucket{method="GET",route="test",le="-0.1"} 0 + cortex_querytee_backend_request_relative_duration_seconds_bucket{method="GET",route="test",le="-0.05"} 0 + cortex_querytee_backend_request_relative_duration_seconds_bucket{method="GET",route="test",le="-0.025"} 0 + cortex_querytee_backend_request_relative_duration_seconds_bucket{method="GET",route="test",le="-0.01"} 0 + cortex_querytee_backend_request_relative_duration_seconds_bucket{method="GET",route="test",le="-0.005"} 0 + 
cortex_querytee_backend_request_relative_duration_seconds_bucket{method="GET",route="test",le="0.005"} 0 + cortex_querytee_backend_request_relative_duration_seconds_bucket{method="GET",route="test",le="0.01"} 0 + cortex_querytee_backend_request_relative_duration_seconds_bucket{method="GET",route="test",le="0.025"} 0 + cortex_querytee_backend_request_relative_duration_seconds_bucket{method="GET",route="test",le="0.05"} 0 + cortex_querytee_backend_request_relative_duration_seconds_bucket{method="GET",route="test",le="0.1"} 0 + cortex_querytee_backend_request_relative_duration_seconds_bucket{method="GET",route="test",le="0.25"} 0 + cortex_querytee_backend_request_relative_duration_seconds_bucket{method="GET",route="test",le="0.5"} 0 + cortex_querytee_backend_request_relative_duration_seconds_bucket{method="GET",route="test",le="0.75"} 0 + cortex_querytee_backend_request_relative_duration_seconds_bucket{method="GET",route="test",le="1"} 0 + cortex_querytee_backend_request_relative_duration_seconds_bucket{method="GET",route="test",le="1.5"} 0 + cortex_querytee_backend_request_relative_duration_seconds_bucket{method="GET",route="test",le="2"} 1 + cortex_querytee_backend_request_relative_duration_seconds_bucket{method="GET",route="test",le="3"} 1 + cortex_querytee_backend_request_relative_duration_seconds_bucket{method="GET",route="test",le="4"} 1 + cortex_querytee_backend_request_relative_duration_seconds_bucket{method="GET",route="test",le="5"} 1 + cortex_querytee_backend_request_relative_duration_seconds_bucket{method="GET",route="test",le="10"} 1 + cortex_querytee_backend_request_relative_duration_seconds_bucket{method="GET",route="test",le="25"} 1 + cortex_querytee_backend_request_relative_duration_seconds_bucket{method="GET",route="test",le="50"} 1 + cortex_querytee_backend_request_relative_duration_seconds_bucket{method="GET",route="test",le="100"} 1 + cortex_querytee_backend_request_relative_duration_seconds_bucket{method="GET",route="test",le="+Inf"} 1 + 
cortex_querytee_backend_request_relative_duration_seconds_sum{method="GET",route="test"} 2 + cortex_querytee_backend_request_relative_duration_seconds_count{method="GET",route="test"} 1 +`), + }, + "preferred backend is 5 seconds faster than secondary": { + preferredResponseLatency: 2 * time.Second, + secondaryResponseLatency: 7 * time.Second, + expectedMetric: (` + # HELP cortex_querytee_backend_request_relative_duration_seconds Time (in seconds) of preferred backend less secondary backend. + # TYPE cortex_querytee_backend_request_relative_duration_seconds histogram + cortex_querytee_backend_request_relative_duration_seconds_bucket{method="GET",route="test",le="-100"} 0 + cortex_querytee_backend_request_relative_duration_seconds_bucket{method="GET",route="test",le="-50"} 0 + cortex_querytee_backend_request_relative_duration_seconds_bucket{method="GET",route="test",le="-25"} 0 + cortex_querytee_backend_request_relative_duration_seconds_bucket{method="GET",route="test",le="-10"} 0 + cortex_querytee_backend_request_relative_duration_seconds_bucket{method="GET",route="test",le="-5"} 1 + cortex_querytee_backend_request_relative_duration_seconds_bucket{method="GET",route="test",le="-4"} 1 + cortex_querytee_backend_request_relative_duration_seconds_bucket{method="GET",route="test",le="-3"} 1 + cortex_querytee_backend_request_relative_duration_seconds_bucket{method="GET",route="test",le="-2"} 1 + cortex_querytee_backend_request_relative_duration_seconds_bucket{method="GET",route="test",le="-1.5"} 1 + cortex_querytee_backend_request_relative_duration_seconds_bucket{method="GET",route="test",le="-1"} 1 + cortex_querytee_backend_request_relative_duration_seconds_bucket{method="GET",route="test",le="-0.75"} 1 + cortex_querytee_backend_request_relative_duration_seconds_bucket{method="GET",route="test",le="-0.5"} 1 + cortex_querytee_backend_request_relative_duration_seconds_bucket{method="GET",route="test",le="-0.25"} 1 + 
cortex_querytee_backend_request_relative_duration_seconds_bucket{method="GET",route="test",le="-0.1"} 1 + cortex_querytee_backend_request_relative_duration_seconds_bucket{method="GET",route="test",le="-0.05"} 1 + cortex_querytee_backend_request_relative_duration_seconds_bucket{method="GET",route="test",le="-0.025"} 1 + cortex_querytee_backend_request_relative_duration_seconds_bucket{method="GET",route="test",le="-0.01"} 1 + cortex_querytee_backend_request_relative_duration_seconds_bucket{method="GET",route="test",le="-0.005"} 1 + cortex_querytee_backend_request_relative_duration_seconds_bucket{method="GET",route="test",le="0.005"} 1 + cortex_querytee_backend_request_relative_duration_seconds_bucket{method="GET",route="test",le="0.01"} 1 + cortex_querytee_backend_request_relative_duration_seconds_bucket{method="GET",route="test",le="0.025"} 1 + cortex_querytee_backend_request_relative_duration_seconds_bucket{method="GET",route="test",le="0.05"} 1 + cortex_querytee_backend_request_relative_duration_seconds_bucket{method="GET",route="test",le="0.1"} 1 + cortex_querytee_backend_request_relative_duration_seconds_bucket{method="GET",route="test",le="0.25"} 1 + cortex_querytee_backend_request_relative_duration_seconds_bucket{method="GET",route="test",le="0.5"} 1 + cortex_querytee_backend_request_relative_duration_seconds_bucket{method="GET",route="test",le="0.75"} 1 + cortex_querytee_backend_request_relative_duration_seconds_bucket{method="GET",route="test",le="1"} 1 + cortex_querytee_backend_request_relative_duration_seconds_bucket{method="GET",route="test",le="1.5"} 1 + cortex_querytee_backend_request_relative_duration_seconds_bucket{method="GET",route="test",le="2"} 1 + cortex_querytee_backend_request_relative_duration_seconds_bucket{method="GET",route="test",le="3"} 1 + cortex_querytee_backend_request_relative_duration_seconds_bucket{method="GET",route="test",le="4"} 1 + cortex_querytee_backend_request_relative_duration_seconds_bucket{method="GET",route="test",le="5"} 
1 + cortex_querytee_backend_request_relative_duration_seconds_bucket{method="GET",route="test",le="10"} 1 + cortex_querytee_backend_request_relative_duration_seconds_bucket{method="GET",route="test",le="25"} 1 + cortex_querytee_backend_request_relative_duration_seconds_bucket{method="GET",route="test",le="50"} 1 + cortex_querytee_backend_request_relative_duration_seconds_bucket{method="GET",route="test",le="100"} 1 + cortex_querytee_backend_request_relative_duration_seconds_bucket{method="GET",route="test",le="+Inf"} 1 + cortex_querytee_backend_request_relative_duration_seconds_sum{method="GET",route="test"} -5 + cortex_querytee_backend_request_relative_duration_seconds_count{method="GET",route="test"} 1 +`), + }, + } + + for name, scenario := range scenarios { + t.Run(name, func(t *testing.T) { + backends := []ProxyBackendInterface{ + newMockProxyBackend("preferred-backend", time.Second, true, scenario.preferredResponseLatency), + newMockProxyBackend("secondary-backend", time.Second, false, scenario.secondaryResponseLatency), + } + + logger := newMockLogger() + reg := prometheus.NewPedanticRegistry() + comparator := &mockComparator{ + comparisonResult: ComparisonSuccess, + } + + endpoint := NewProxyEndpoint(backends, "test", NewProxyMetrics(reg), logger, comparator, 0) + + resp := httptest.NewRecorder() + req, err := http.NewRequest("GET", "http://test/api/v1/test", nil) + require.NoError(t, err) + endpoint.ServeHTTP(resp, req) + + // The HTTP request above will return as soon as the primary response is received, but this doesn't guarantee that the response comparison has been completed. + // Wait for the response comparison to complete before checking the logged messages. 
+ waitForResponseComparisonMetric(t, reg, ComparisonSuccess) + + require.NoError(t, testutil.GatherAndCompare(reg, bytes.NewBufferString(scenario.expectedMetric), "cortex_querytee_backend_request_relative_duration_seconds")) + + }) + } +} + func waitForResponseComparisonMetric(t *testing.T, g prometheus.Gatherer, expectedResult ComparisonResult) { started := time.Now() timeoutAt := started.Add(2 * time.Second) diff --git a/tools/querytee/proxy_metrics.go b/tools/querytee/proxy_metrics.go index 019c9dea1d0..de2c53262e8 100644 --- a/tools/querytee/proxy_metrics.go +++ b/tools/querytee/proxy_metrics.go @@ -23,6 +23,7 @@ type ProxyMetrics struct { requestDuration *prometheus.HistogramVec responsesTotal *prometheus.CounterVec responsesComparedTotal *prometheus.CounterVec + relativeDuration *prometheus.HistogramVec } func NewProxyMetrics(registerer prometheus.Registerer) *ProxyMetrics { @@ -43,6 +44,12 @@ func NewProxyMetrics(registerer prometheus.Registerer) *ProxyMetrics { Name: "responses_compared_total", Help: "Total number of responses compared per route name by result.", }, []string{"route", "result"}), + relativeDuration: promauto.With(registerer).NewHistogramVec(prometheus.HistogramOpts{ + Namespace: queryTeeMetricsNamespace, + Name: "backend_request_relative_duration_seconds", + Help: "Time (in seconds) of preferred backend less secondary backend.", + Buckets: []float64{-100, -50, -25, -10, -5, -4, -3, -2, -1.5, -1, -.75, -.5, -.25, -.1, -.05, -.025, -.01, -.005, .005, .01, .025, .05, .1, .25, .5, 0.75, 1, 1.5, 2, 3, 4, 5, 10, 25, 50, 100}, + }, []string{"method", "route"}), } return m From c0c91f8237aa90a045940662c9c53451d96daa33 Mon Sep 17 00:00:00 2001 From: Joshua Hesketh Date: Thu, 4 Apr 2024 17:34:15 +1100 Subject: [PATCH 02/10] Add proportional metric, and change to native histograms --- tools/querytee/proxy_endpoint.go | 4 +- tools/querytee/proxy_endpoint_test.go | 111 ++++++-------------------- tools/querytee/proxy_metrics.go | 17 ++-- 3 files changed, 
38 insertions(+), 94 deletions(-) diff --git a/tools/querytee/proxy_endpoint.go b/tools/querytee/proxy_endpoint.go index 4d89ad56fb8..62615f3457b 100644 --- a/tools/querytee/proxy_endpoint.go +++ b/tools/querytee/proxy_endpoint.go @@ -225,7 +225,9 @@ func (p *ProxyEndpoint) executeBackendRequests(req *http.Request, resCh chan *ba } relativeDuration := expectedResponse.elapsedTime - actualResponse.elapsedTime - p.metrics.relativeDuration.WithLabelValues(req.Method, p.routeName).Observe(relativeDuration.Seconds()) + proportionalDuration := expectedResponse.elapsedTime.Seconds() / actualResponse.elapsedTime.Seconds() + p.metrics.relativeDuration.WithLabelValues(p.routeName).Observe(relativeDuration.Seconds()) + p.metrics.proportionalDuration.WithLabelValues(p.routeName).Observe(proportionalDuration) p.metrics.responsesComparedTotal.WithLabelValues(p.routeName, string(result)).Inc() } } diff --git a/tools/querytee/proxy_endpoint_test.go b/tools/querytee/proxy_endpoint_test.go index cd5aac61412..b78a90689c8 100644 --- a/tools/querytee/proxy_endpoint_test.go +++ b/tools/querytee/proxy_endpoint_test.go @@ -21,6 +21,7 @@ import ( "github.com/pkg/errors" "github.com/prometheus/client_golang/prometheus" "github.com/prometheus/client_golang/prometheus/testutil" + dto "github.com/prometheus/client_model/go" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" "go.uber.org/atomic" @@ -455,96 +456,12 @@ func Test_ProxyEndpoint_RelativeDurationMetric(t *testing.T) { "secondary backend is 2 seconds faster than preferred": { preferredResponseLatency: 3 * time.Second, secondaryResponseLatency: 1 * time.Second, - expectedMetric: (` - # HELP cortex_querytee_backend_request_relative_duration_seconds Time (in seconds) of preferred backend less secondary backend. 
- # TYPE cortex_querytee_backend_request_relative_duration_seconds histogram - cortex_querytee_backend_request_relative_duration_seconds_bucket{method="GET",route="test",le="-100"} 0 - cortex_querytee_backend_request_relative_duration_seconds_bucket{method="GET",route="test",le="-50"} 0 - cortex_querytee_backend_request_relative_duration_seconds_bucket{method="GET",route="test",le="-25"} 0 - cortex_querytee_backend_request_relative_duration_seconds_bucket{method="GET",route="test",le="-10"} 0 - cortex_querytee_backend_request_relative_duration_seconds_bucket{method="GET",route="test",le="-5"} 0 - cortex_querytee_backend_request_relative_duration_seconds_bucket{method="GET",route="test",le="-4"} 0 - cortex_querytee_backend_request_relative_duration_seconds_bucket{method="GET",route="test",le="-3"} 0 - cortex_querytee_backend_request_relative_duration_seconds_bucket{method="GET",route="test",le="-2"} 0 - cortex_querytee_backend_request_relative_duration_seconds_bucket{method="GET",route="test",le="-1.5"} 0 - cortex_querytee_backend_request_relative_duration_seconds_bucket{method="GET",route="test",le="-1"} 0 - cortex_querytee_backend_request_relative_duration_seconds_bucket{method="GET",route="test",le="-0.75"} 0 - cortex_querytee_backend_request_relative_duration_seconds_bucket{method="GET",route="test",le="-0.5"} 0 - cortex_querytee_backend_request_relative_duration_seconds_bucket{method="GET",route="test",le="-0.25"} 0 - cortex_querytee_backend_request_relative_duration_seconds_bucket{method="GET",route="test",le="-0.1"} 0 - cortex_querytee_backend_request_relative_duration_seconds_bucket{method="GET",route="test",le="-0.05"} 0 - cortex_querytee_backend_request_relative_duration_seconds_bucket{method="GET",route="test",le="-0.025"} 0 - cortex_querytee_backend_request_relative_duration_seconds_bucket{method="GET",route="test",le="-0.01"} 0 - cortex_querytee_backend_request_relative_duration_seconds_bucket{method="GET",route="test",le="-0.005"} 0 - 
cortex_querytee_backend_request_relative_duration_seconds_bucket{method="GET",route="test",le="0.005"} 0 - cortex_querytee_backend_request_relative_duration_seconds_bucket{method="GET",route="test",le="0.01"} 0 - cortex_querytee_backend_request_relative_duration_seconds_bucket{method="GET",route="test",le="0.025"} 0 - cortex_querytee_backend_request_relative_duration_seconds_bucket{method="GET",route="test",le="0.05"} 0 - cortex_querytee_backend_request_relative_duration_seconds_bucket{method="GET",route="test",le="0.1"} 0 - cortex_querytee_backend_request_relative_duration_seconds_bucket{method="GET",route="test",le="0.25"} 0 - cortex_querytee_backend_request_relative_duration_seconds_bucket{method="GET",route="test",le="0.5"} 0 - cortex_querytee_backend_request_relative_duration_seconds_bucket{method="GET",route="test",le="0.75"} 0 - cortex_querytee_backend_request_relative_duration_seconds_bucket{method="GET",route="test",le="1"} 0 - cortex_querytee_backend_request_relative_duration_seconds_bucket{method="GET",route="test",le="1.5"} 0 - cortex_querytee_backend_request_relative_duration_seconds_bucket{method="GET",route="test",le="2"} 1 - cortex_querytee_backend_request_relative_duration_seconds_bucket{method="GET",route="test",le="3"} 1 - cortex_querytee_backend_request_relative_duration_seconds_bucket{method="GET",route="test",le="4"} 1 - cortex_querytee_backend_request_relative_duration_seconds_bucket{method="GET",route="test",le="5"} 1 - cortex_querytee_backend_request_relative_duration_seconds_bucket{method="GET",route="test",le="10"} 1 - cortex_querytee_backend_request_relative_duration_seconds_bucket{method="GET",route="test",le="25"} 1 - cortex_querytee_backend_request_relative_duration_seconds_bucket{method="GET",route="test",le="50"} 1 - cortex_querytee_backend_request_relative_duration_seconds_bucket{method="GET",route="test",le="100"} 1 - cortex_querytee_backend_request_relative_duration_seconds_bucket{method="GET",route="test",le="+Inf"} 1 - 
cortex_querytee_backend_request_relative_duration_seconds_sum{method="GET",route="test"} 2 - cortex_querytee_backend_request_relative_duration_seconds_count{method="GET",route="test"} 1 -`), + expectedMetric: (`todo`), }, "preferred backend is 5 seconds faster than secondary": { preferredResponseLatency: 2 * time.Second, secondaryResponseLatency: 7 * time.Second, - expectedMetric: (` - # HELP cortex_querytee_backend_request_relative_duration_seconds Time (in seconds) of preferred backend less secondary backend. - # TYPE cortex_querytee_backend_request_relative_duration_seconds histogram - cortex_querytee_backend_request_relative_duration_seconds_bucket{method="GET",route="test",le="-100"} 0 - cortex_querytee_backend_request_relative_duration_seconds_bucket{method="GET",route="test",le="-50"} 0 - cortex_querytee_backend_request_relative_duration_seconds_bucket{method="GET",route="test",le="-25"} 0 - cortex_querytee_backend_request_relative_duration_seconds_bucket{method="GET",route="test",le="-10"} 0 - cortex_querytee_backend_request_relative_duration_seconds_bucket{method="GET",route="test",le="-5"} 1 - cortex_querytee_backend_request_relative_duration_seconds_bucket{method="GET",route="test",le="-4"} 1 - cortex_querytee_backend_request_relative_duration_seconds_bucket{method="GET",route="test",le="-3"} 1 - cortex_querytee_backend_request_relative_duration_seconds_bucket{method="GET",route="test",le="-2"} 1 - cortex_querytee_backend_request_relative_duration_seconds_bucket{method="GET",route="test",le="-1.5"} 1 - cortex_querytee_backend_request_relative_duration_seconds_bucket{method="GET",route="test",le="-1"} 1 - cortex_querytee_backend_request_relative_duration_seconds_bucket{method="GET",route="test",le="-0.75"} 1 - cortex_querytee_backend_request_relative_duration_seconds_bucket{method="GET",route="test",le="-0.5"} 1 - cortex_querytee_backend_request_relative_duration_seconds_bucket{method="GET",route="test",le="-0.25"} 1 - 
cortex_querytee_backend_request_relative_duration_seconds_bucket{method="GET",route="test",le="-0.1"} 1 - cortex_querytee_backend_request_relative_duration_seconds_bucket{method="GET",route="test",le="-0.05"} 1 - cortex_querytee_backend_request_relative_duration_seconds_bucket{method="GET",route="test",le="-0.025"} 1 - cortex_querytee_backend_request_relative_duration_seconds_bucket{method="GET",route="test",le="-0.01"} 1 - cortex_querytee_backend_request_relative_duration_seconds_bucket{method="GET",route="test",le="-0.005"} 1 - cortex_querytee_backend_request_relative_duration_seconds_bucket{method="GET",route="test",le="0.005"} 1 - cortex_querytee_backend_request_relative_duration_seconds_bucket{method="GET",route="test",le="0.01"} 1 - cortex_querytee_backend_request_relative_duration_seconds_bucket{method="GET",route="test",le="0.025"} 1 - cortex_querytee_backend_request_relative_duration_seconds_bucket{method="GET",route="test",le="0.05"} 1 - cortex_querytee_backend_request_relative_duration_seconds_bucket{method="GET",route="test",le="0.1"} 1 - cortex_querytee_backend_request_relative_duration_seconds_bucket{method="GET",route="test",le="0.25"} 1 - cortex_querytee_backend_request_relative_duration_seconds_bucket{method="GET",route="test",le="0.5"} 1 - cortex_querytee_backend_request_relative_duration_seconds_bucket{method="GET",route="test",le="0.75"} 1 - cortex_querytee_backend_request_relative_duration_seconds_bucket{method="GET",route="test",le="1"} 1 - cortex_querytee_backend_request_relative_duration_seconds_bucket{method="GET",route="test",le="1.5"} 1 - cortex_querytee_backend_request_relative_duration_seconds_bucket{method="GET",route="test",le="2"} 1 - cortex_querytee_backend_request_relative_duration_seconds_bucket{method="GET",route="test",le="3"} 1 - cortex_querytee_backend_request_relative_duration_seconds_bucket{method="GET",route="test",le="4"} 1 - cortex_querytee_backend_request_relative_duration_seconds_bucket{method="GET",route="test",le="5"} 
1 - cortex_querytee_backend_request_relative_duration_seconds_bucket{method="GET",route="test",le="10"} 1 - cortex_querytee_backend_request_relative_duration_seconds_bucket{method="GET",route="test",le="25"} 1 - cortex_querytee_backend_request_relative_duration_seconds_bucket{method="GET",route="test",le="50"} 1 - cortex_querytee_backend_request_relative_duration_seconds_bucket{method="GET",route="test",le="100"} 1 - cortex_querytee_backend_request_relative_duration_seconds_bucket{method="GET",route="test",le="+Inf"} 1 - cortex_querytee_backend_request_relative_duration_seconds_sum{method="GET",route="test"} -5 - cortex_querytee_backend_request_relative_duration_seconds_count{method="GET",route="test"} 1 -`), + expectedMetric: (`todo`), }, } @@ -572,12 +489,30 @@ func Test_ProxyEndpoint_RelativeDurationMetric(t *testing.T) { // Wait for the response comparison to complete before checking the logged messages. waitForResponseComparisonMetric(t, reg, ComparisonSuccess) - require.NoError(t, testutil.GatherAndCompare(reg, bytes.NewBufferString(scenario.expectedMetric), "cortex_querytee_backend_request_relative_duration_seconds")) - + //require.NoError(t, testutil.GatherAndCompare(reg, bytes.NewBufferString(scenario.expectedMetric), "cortex_querytee_backend_request_relative_duration_seconds")) + got, done, err := prometheus.ToTransactionalGatherer(reg).Gather() + defer done() + require.NoError(t, err, "Failed to gather metrics from registry") + got = filterMetrics(got, []string{"cortex_querytee_backend_request_relative_duration_seconds"}) + require.Equal(t, 1, len(got), "Expect only one metric after filtering") + require.Equal(t, uint64(1), got[0].Metric[0].Histogram.GetSampleCount(), "Expect only 1 sample") }) } } +func filterMetrics(metrics []*dto.MetricFamily, names []string) []*dto.MetricFamily { + var filtered []*dto.MetricFamily + for _, m := range metrics { + for _, name := range names { + if m.GetName() == name { + filtered = append(filtered, m) + break + } + } + 
} + return filtered +} + func waitForResponseComparisonMetric(t *testing.T, g prometheus.Gatherer, expectedResult ComparisonResult) { started := time.Now() timeoutAt := started.Add(2 * time.Second) diff --git a/tools/querytee/proxy_metrics.go b/tools/querytee/proxy_metrics.go index de2c53262e8..47b1b7afba0 100644 --- a/tools/querytee/proxy_metrics.go +++ b/tools/querytee/proxy_metrics.go @@ -24,6 +24,7 @@ type ProxyMetrics struct { responsesTotal *prometheus.CounterVec responsesComparedTotal *prometheus.CounterVec relativeDuration *prometheus.HistogramVec + proportionalDuration *prometheus.HistogramVec } func NewProxyMetrics(registerer prometheus.Registerer) *ProxyMetrics { @@ -45,11 +46,17 @@ func NewProxyMetrics(registerer prometheus.Registerer) *ProxyMetrics { Help: "Total number of responses compared per route name by result.", }, []string{"route", "result"}), relativeDuration: promauto.With(registerer).NewHistogramVec(prometheus.HistogramOpts{ - Namespace: queryTeeMetricsNamespace, - Name: "backend_request_relative_duration_seconds", - Help: "Time (in seconds) of preferred backend less secondary backend.", - Buckets: []float64{-100, -50, -25, -10, -5, -4, -3, -2, -1.5, -1, -.75, -.5, -.25, -.1, -.05, -.025, -.01, -.005, .005, .01, .025, .05, .1, .25, .5, 0.75, 1, 1.5, 2, 3, 4, 5, 10, 25, 50, 100}, - }, []string{"method", "route"}), + Namespace: queryTeeMetricsNamespace, + Name: "backend_request_relative_duration_seconds", + Help: "Time (in seconds) of preferred backend less secondary backend.", + NativeHistogramBucketFactor: 2, + }, []string{"route"}), + proportionalDuration: promauto.With(registerer).NewHistogramVec(prometheus.HistogramOpts{ + Namespace: queryTeeMetricsNamespace, + Name: "backend_request_relative_duration_proportional", + Help: "Proportional time (%) of preferred backend vs secondary backend.", + NativeHistogramBucketFactor: 2, + }, []string{"route"}), } return m From 79fd1e9ea4d28d8690e3bd623bb8035b7f0b241f Mon Sep 17 00:00:00 2001 From: 
Joshua Hesketh Date: Thu, 4 Apr 2024 20:25:04 +1100 Subject: [PATCH 03/10] Improve histogram unit tests Also test the proportional metric --- tools/querytee/proxy_endpoint_test.go | 181 +++++++++++++++++--------- 1 file changed, 121 insertions(+), 60 deletions(-) diff --git a/tools/querytee/proxy_endpoint_test.go b/tools/querytee/proxy_endpoint_test.go index b78a90689c8..20ad46239dc 100644 --- a/tools/querytee/proxy_endpoint_test.go +++ b/tools/querytee/proxy_endpoint_test.go @@ -335,7 +335,7 @@ func Test_ProxyEndpoint_Comparison(t *testing.T) { // The HTTP request above will return as soon as the primary response is received, but this doesn't guarantee that the response comparison has been completed. // Wait for the response comparison to complete before checking the logged messages. - waitForResponseComparisonMetric(t, reg, scenario.expectedComparisonResult) + waitForResponseComparisonMetric(t, reg, scenario.expectedComparisonResult, 1) switch scenario.expectedComparisonResult { case ComparisonSuccess: @@ -354,67 +354,81 @@ func Test_ProxyEndpoint_Comparison(t *testing.T) { func Test_ProxyEndpoint_LogSlowQueries(t *testing.T) { scenarios := map[string]struct { slowResponseThreshold time.Duration - preferredResponseLatency time.Duration - secondaryResponseLatency time.Duration + latencyPairs []latencyPair expectLatencyExceedsThreshold bool fastestBackend string slowestBackend string }{ "responses are below threshold": { - slowResponseThreshold: 100 * time.Millisecond, - preferredResponseLatency: 1 * time.Millisecond, - secondaryResponseLatency: 1 * time.Millisecond, + slowResponseThreshold: 100 * time.Millisecond, + latencyPairs: []latencyPair{{ + preferredResponseLatency: 1 * time.Millisecond, + secondaryResponseLatency: 1 * time.Millisecond, + }}, expectLatencyExceedsThreshold: false, }, "one response above threshold": { - slowResponseThreshold: 50 * time.Millisecond, - preferredResponseLatency: 1 * time.Millisecond, - secondaryResponseLatency: 70 * 
time.Millisecond, + slowResponseThreshold: 50 * time.Millisecond, + latencyPairs: []latencyPair{{ + preferredResponseLatency: 1 * time.Millisecond, + secondaryResponseLatency: 70 * time.Millisecond, + }}, expectLatencyExceedsThreshold: true, fastestBackend: "preferred-backend", slowestBackend: "secondary-backend", }, "responses are both above threshold, but lower than threshold between themselves": { - slowResponseThreshold: 50 * time.Millisecond, - preferredResponseLatency: 51 * time.Millisecond, - secondaryResponseLatency: 62 * time.Millisecond, + slowResponseThreshold: 50 * time.Millisecond, + latencyPairs: []latencyPair{{ + preferredResponseLatency: 51 * time.Millisecond, + secondaryResponseLatency: 62 * time.Millisecond, + }}, expectLatencyExceedsThreshold: false, }, "responses are both above threshold, and above threshold between themselves": { - slowResponseThreshold: 10 * time.Millisecond, - preferredResponseLatency: 11 * time.Millisecond, - secondaryResponseLatency: 52 * time.Millisecond, + slowResponseThreshold: 10 * time.Millisecond, + latencyPairs: []latencyPair{{ + preferredResponseLatency: 11 * time.Millisecond, + secondaryResponseLatency: 52 * time.Millisecond, + }}, expectLatencyExceedsThreshold: true, fastestBackend: "preferred-backend", slowestBackend: "secondary-backend", }, "secondary latency is faster than primary, and difference is below threshold": { - slowResponseThreshold: 50 * time.Millisecond, - preferredResponseLatency: 10 * time.Millisecond, - secondaryResponseLatency: 1 * time.Millisecond, + slowResponseThreshold: 50 * time.Millisecond, + latencyPairs: []latencyPair{{ + preferredResponseLatency: 10 * time.Millisecond, + secondaryResponseLatency: 1 * time.Millisecond, + }}, expectLatencyExceedsThreshold: false, }, "secondary latency is faster than primary, and difference is above threshold": { - slowResponseThreshold: 50 * time.Millisecond, - preferredResponseLatency: 71 * time.Millisecond, - secondaryResponseLatency: 1 * 
time.Millisecond, + slowResponseThreshold: 50 * time.Millisecond, + latencyPairs: []latencyPair{{ + preferredResponseLatency: 71 * time.Millisecond, + secondaryResponseLatency: 1 * time.Millisecond, + }}, expectLatencyExceedsThreshold: true, fastestBackend: "secondary-backend", slowestBackend: "preferred-backend", }, "slowest response threshold is disabled (0)": { - slowResponseThreshold: 0 * time.Millisecond, - preferredResponseLatency: 200 * time.Millisecond, - secondaryResponseLatency: 100 * time.Millisecond, + slowResponseThreshold: 0 * time.Millisecond, + latencyPairs: []latencyPair{{ + preferredResponseLatency: 200 * time.Millisecond, + secondaryResponseLatency: 100 * time.Millisecond, + }}, expectLatencyExceedsThreshold: false, }, } for name, scenario := range scenarios { t.Run(name, func(t *testing.T) { + preferredLatencies, secondaryLatencies := splitLatencyPairs(scenario.latencyPairs) backends := []ProxyBackendInterface{ - newMockProxyBackend("preferred-backend", time.Second, true, scenario.preferredResponseLatency), - newMockProxyBackend("secondary-backend", time.Second, false, scenario.secondaryResponseLatency), + newMockProxyBackend("preferred-backend", time.Second, true, preferredLatencies), + newMockProxyBackend("secondary-backend", time.Second, false, secondaryLatencies), } logger := newMockLogger() @@ -432,7 +446,7 @@ func Test_ProxyEndpoint_LogSlowQueries(t *testing.T) { // The HTTP request above will return as soon as the primary response is received, but this doesn't guarantee that the response comparison has been completed. // Wait for the response comparison to complete before checking the logged messages. 
- waitForResponseComparisonMetric(t, reg, ComparisonSuccess) + waitForResponseComparisonMetric(t, reg, ComparisonSuccess, uint64(len(scenario.latencyPairs))) if scenario.expectLatencyExceedsThreshold { requireLogKeyValues(t, logger.messages, map[string]string{ @@ -449,27 +463,42 @@ func Test_ProxyEndpoint_LogSlowQueries(t *testing.T) { func Test_ProxyEndpoint_RelativeDurationMetric(t *testing.T) { scenarios := map[string]struct { - preferredResponseLatency time.Duration - secondaryResponseLatency time.Duration - expectedMetric string + latencyPairs []latencyPair + expectedSampleCount uint64 + expectedDurationSampleSum float64 + expectedProportionalSampleSum float64 }{ - "secondary backend is 2 seconds faster than preferred": { - preferredResponseLatency: 3 * time.Second, - secondaryResponseLatency: 1 * time.Second, - expectedMetric: (`todo`), + "secondary backend is faster than preferred": { + latencyPairs: []latencyPair{{ + preferredResponseLatency: 3 * time.Second, + secondaryResponseLatency: 1 * time.Second, + }, { + preferredResponseLatency: 5 * time.Second, + secondaryResponseLatency: 2 * time.Second, + }, + }, + expectedSampleCount: 2, + expectedDurationSampleSum: 5, + expectedProportionalSampleSum: (3.0/1 + 5.0/2), }, "preferred backend is 5 seconds faster than secondary": { - preferredResponseLatency: 2 * time.Second, - secondaryResponseLatency: 7 * time.Second, - expectedMetric: (`todo`), + latencyPairs: []latencyPair{{ + preferredResponseLatency: 2 * time.Second, + secondaryResponseLatency: 7 * time.Second, + }, + }, + expectedSampleCount: 1, + expectedDurationSampleSum: -5, + expectedProportionalSampleSum: (2.0 / 7), }, } for name, scenario := range scenarios { t.Run(name, func(t *testing.T) { + preferredLatencies, secondaryLatencies := splitLatencyPairs(scenario.latencyPairs) backends := []ProxyBackendInterface{ - newMockProxyBackend("preferred-backend", time.Second, true, scenario.preferredResponseLatency), - newMockProxyBackend("secondary-backend", 
time.Second, false, scenario.secondaryResponseLatency), + newMockProxyBackend("preferred-backend", time.Second, true, preferredLatencies), + newMockProxyBackend("secondary-backend", time.Second, false, secondaryLatencies), } logger := newMockLogger() @@ -483,19 +512,28 @@ func Test_ProxyEndpoint_RelativeDurationMetric(t *testing.T) { resp := httptest.NewRecorder() req, err := http.NewRequest("GET", "http://test/api/v1/test", nil) require.NoError(t, err) - endpoint.ServeHTTP(resp, req) + for range scenario.latencyPairs { + // This is just done in serial to keep the tests simple + endpoint.ServeHTTP(resp, req) + } // The HTTP request above will return as soon as the primary response is received, but this doesn't guarantee that the response comparison has been completed. - // Wait for the response comparison to complete before checking the logged messages. - waitForResponseComparisonMetric(t, reg, ComparisonSuccess) + // Wait for the response comparison to complete for the number of requests we have + waitForResponseComparisonMetric(t, reg, ComparisonSuccess, scenario.expectedSampleCount) - //require.NoError(t, testutil.GatherAndCompare(reg, bytes.NewBufferString(scenario.expectedMetric), "cortex_querytee_backend_request_relative_duration_seconds")) got, done, err := prometheus.ToTransactionalGatherer(reg).Gather() defer done() require.NoError(t, err, "Failed to gather metrics from registry") + + gotDuration := filterMetrics(got, []string{"cortex_querytee_backend_request_relative_duration_seconds"}) + require.Equal(t, 1, len(gotDuration), "Expect only one metric after filtering") + require.Equal(t, scenario.expectedSampleCount, gotDuration[0].Metric[0].Histogram.GetSampleCount()) + require.Equal(t, 
scenario.expectedDurationSampleSum, gotDuration[0].Metric[0].Histogram.GetSampleSum()) + + gotProportional := filterMetrics(got, []string{"cortex_querytee_backend_request_relative_duration_proportional"}) + require.Equal(t, 1, len(gotProportional), "Expect only one metric after filtering") + require.Equal(t, scenario.expectedSampleCount, gotProportional[0].Metric[0].Histogram.GetSampleCount()) + require.Equal(t, scenario.expectedProportionalSampleSum, gotProportional[0].Metric[0].Histogram.GetSampleSum()) }) } } @@ -513,7 +551,7 @@ func filterMetrics(metrics []*dto.MetricFamily, names []string) []*dto.MetricFam return filtered } -func waitForResponseComparisonMetric(t *testing.T, g prometheus.Gatherer, expectedResult ComparisonResult) { +func waitForResponseComparisonMetric(t *testing.T, g prometheus.Gatherer, expectedResult ComparisonResult, expectedCount uint64) { started := time.Now() timeoutAt := started.Add(2 * time.Second) @@ -521,8 +559,8 @@ func waitForResponseComparisonMetric(t *testing.T, g prometheus.Gatherer, expect expected := fmt.Sprintf(` # HELP cortex_querytee_responses_compared_total Total number of responses compared per route name by result. 
# TYPE cortex_querytee_responses_compared_total counter - cortex_querytee_responses_compared_total{result="%v",route="test"} 1 -`, expectedResult) + cortex_querytee_responses_compared_total{result="%v",route="test"} %d +`, expectedResult, expectedCount) err := testutil.GatherAndCompare(g, bytes.NewBufferString(expected), "cortex_querytee_responses_compared_total") if err == nil { @@ -699,19 +737,33 @@ func (m *mockLogger) Log(keyvals ...interface{}) error { return nil } +type latencyPair struct { + preferredResponseLatency time.Duration + secondaryResponseLatency time.Duration +} + +func splitLatencyPairs(latencyPairs []latencyPair) (preferredLatencies []time.Duration, secondaryLatencies []time.Duration) { + for _, pair := range latencyPairs { + preferredLatencies = append(preferredLatencies, pair.preferredResponseLatency) + secondaryLatencies = append(secondaryLatencies, pair.secondaryResponseLatency) + } + return +} + type mockProxyBackend struct { - name string - timeout time.Duration - preferred bool - fakeResponseLatency time.Duration + name string + timeout time.Duration + preferred bool + fakeResponseLatencies []time.Duration + responseIndex int } -func newMockProxyBackend(name string, timeout time.Duration, preferred bool, fakeResponseLatency time.Duration) ProxyBackendInterface { +func newMockProxyBackend(name string, timeout time.Duration, preferred bool, fakeResponseLatencies []time.Duration) ProxyBackendInterface { return &mockProxyBackend{ - name: name, - timeout: timeout, - preferred: preferred, - fakeResponseLatency: fakeResponseLatency, + name: name, + timeout: timeout, + preferred: preferred, + fakeResponseLatencies: fakeResponseLatencies, } } @@ -728,11 +780,20 @@ func (b *mockProxyBackend) Preferred() bool { } func (b *mockProxyBackend) ForwardRequest(_ *http.Request, _ io.ReadCloser) (time.Duration, int, []byte, *http.Response, error) { + statusCode := 200 + if b.responseIndex >= len(b.fakeResponseLatencies) { + statusCode = 500 + } resp := 
&http.Response{ - StatusCode: 200, + StatusCode: statusCode, Header: make(http.Header), Body: io.NopCloser(bytes.NewBufferString(`{}`)), } resp.Header.Set("Content-Type", "application/json") - return time.Duration(b.fakeResponseLatency), 200, []byte("{}"), resp, nil + if b.responseIndex >= len(b.fakeResponseLatencies) { + return 0, 500, []byte("{}"), resp, errors.New("no more latencies available") + } + latency := b.fakeResponseLatencies[b.responseIndex] + b.responseIndex++ + return latency, 200, []byte("{}"), resp, nil } From 9230e43d3b4acd70a021403f28f534a97d36a397 Mon Sep 17 00:00:00 2001 From: Joshua Hesketh Date: Tue, 9 Apr 2024 14:56:19 +1000 Subject: [PATCH 04/10] Update tools/querytee/proxy_metrics.go Co-authored-by: Charles Korn --- tools/querytee/proxy_metrics.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/querytee/proxy_metrics.go b/tools/querytee/proxy_metrics.go index 47b1b7afba0..5bc297d5d14 100644 --- a/tools/querytee/proxy_metrics.go +++ b/tools/querytee/proxy_metrics.go @@ -54,7 +54,7 @@ func NewProxyMetrics(registerer prometheus.Registerer) *ProxyMetrics { proportionalDuration: promauto.With(registerer).NewHistogramVec(prometheus.HistogramOpts{ Namespace: queryTeeMetricsNamespace, Name: "backend_request_relative_duration_proportional", - Help: "Proportional time (%) of preferred backend vs secondary backend.", + Help: "Response time of preferred backend, as a proportion of secondary backend response time.", NativeHistogramBucketFactor: 2, }, []string{"route"}), } From 5b437ac0e4bfe8270812ee1bdd629f52450f1da6 Mon Sep 17 00:00:00 2001 From: Joshua Hesketh Date: Tue, 9 Apr 2024 14:58:45 +1000 Subject: [PATCH 05/10] Tidy up test --- tools/querytee/proxy_endpoint_test.go | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/tools/querytee/proxy_endpoint_test.go b/tools/querytee/proxy_endpoint_test.go index 20ad46239dc..47514676f82 100644 --- a/tools/querytee/proxy_endpoint_test.go +++ 
b/tools/querytee/proxy_endpoint_test.go @@ -780,17 +780,14 @@ func (b *mockProxyBackend) Preferred() bool { } func (b *mockProxyBackend) ForwardRequest(_ *http.Request, _ io.ReadCloser) (time.Duration, int, []byte, *http.Response, error) { - statusCode := 200 - if b.responseIndex >= len(b.fakeResponseLatencies) { - statusCode = 500 - } resp := &http.Response{ - StatusCode: statusCode, + StatusCode: 200, Header: make(http.Header), Body: io.NopCloser(bytes.NewBufferString(`{}`)), } resp.Header.Set("Content-Type", "application/json") if b.responseIndex >= len(b.fakeResponseLatencies) { + resp.StatusCode = 500 return 0, 500, []byte("{}"), resp, errors.New("no more latencies available") } latency := b.fakeResponseLatencies[b.responseIndex] From acf0cc0ff9e89c865e33caeedbf92d9e0f2f5b4f Mon Sep 17 00:00:00 2001 From: Joshua Hesketh Date: Tue, 9 Apr 2024 15:14:41 +1000 Subject: [PATCH 06/10] Add changelog --- CHANGELOG.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 7539640db68..337f13b68f6 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -43,6 +43,9 @@ ### Query-tee * [ENHANCEMENT] Log queries that take longer than `proxy.log-slow-query-response-threshold` when compared to other backends. 
#7346 +* [ENHANCEMENT] Add two new metrics for measuring the relative duration between backends: #7782 + * `cortex_querytee_backend_request_relative_duration_seconds` + * `cortex_querytee_backend_request_relative_duration_proportional` ### Documentation From 2b8da7063e8c5d7b2bef22ec6d7998411179ff39 Mon Sep 17 00:00:00 2001 From: Joshua Hesketh Date: Tue, 9 Apr 2024 15:14:49 +1000 Subject: [PATCH 07/10] Add docs --- docs/sources/mimir/manage/tools/query-tee.md | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/docs/sources/mimir/manage/tools/query-tee.md b/docs/sources/mimir/manage/tools/query-tee.md index 8ca1bb2e5b6..33dbd94adb9 100644 --- a/docs/sources/mimir/manage/tools/query-tee.md +++ b/docs/sources/mimir/manage/tools/query-tee.md @@ -163,6 +163,11 @@ cortex_querytee_responses_total{backend="",method="",route="}}) you can use the query-tee to compare rule evaluations too. From 55353dcd52088eb737d8974206f36250eee34e5f Mon Sep 17 00:00:00 2001 From: Joshua Hesketh Date: Sat, 20 Apr 2024 20:42:08 +0200 Subject: [PATCH 08/10] Generate doc --- docs/sources/mimir/manage/tools/query-tee.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/sources/mimir/manage/tools/query-tee.md b/docs/sources/mimir/manage/tools/query-tee.md index 33dbd94adb9..565082ef50a 100644 --- a/docs/sources/mimir/manage/tools/query-tee.md +++ b/docs/sources/mimir/manage/tools/query-tee.md @@ -165,8 +165,8 @@ cortex_querytee_responses_compared_total{route="",result="" Additionally, if backend results comparison is configured, two native-histograms are available: -* `cortex_querytee_backend_request_relative_duration_seconds`: Time (in seconds) of preferred backend less secondary backend. -* `cortex_querytee_backend_request_relative_duration_proportional`: Response time of preferred backend, as a proportion of secondary backend response time. +- `cortex_querytee_backend_request_relative_duration_seconds`: Time (in seconds) of preferred backend less secondary backend. 
+- `cortex_querytee_backend_request_relative_duration_proportional`: Response time of preferred backend, as a proportion of secondary backend response time. ### Ruler remote operational mode test From 36962be88b11b8ab2941663b3264dee227983e22 Mon Sep 17 00:00:00 2001 From: Joshua Hesketh Date: Sat, 20 Apr 2024 21:21:39 +0200 Subject: [PATCH 09/10] Rename metric to be "response" --- CHANGELOG.md | 4 ++-- docs/sources/mimir/manage/tools/query-tee.md | 6 +++--- tools/querytee/proxy_endpoint_test.go | 4 ++-- tools/querytee/proxy_metrics.go | 4 ++-- 4 files changed, 9 insertions(+), 9 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 337f13b68f6..852fd4038f7 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -44,8 +44,8 @@ * [ENHANCEMENT] Log queries that take longer than `proxy.log-slow-query-response-threshold` when compared to other backends. #7346 * [ENHANCEMENT] Add two new metrics for measuring the relative duration between backends: #7782 - * `cortex_querytee_backend_request_relative_duration_seconds` - * `cortex_querytee_backend_request_relative_duration_proportional` + * `cortex_querytee_backend_response_relative_duration_seconds` + * `cortex_querytee_backend_response_relative_duration_proportional` ### Documentation diff --git a/docs/sources/mimir/manage/tools/query-tee.md b/docs/sources/mimir/manage/tools/query-tee.md index 565082ef50a..471576c94b3 100644 --- a/docs/sources/mimir/manage/tools/query-tee.md +++ b/docs/sources/mimir/manage/tools/query-tee.md @@ -163,10 +163,10 @@ cortex_querytee_responses_total{backend="",method="",route=" Date: Sat, 20 Apr 2024 21:22:01 +0200 Subject: [PATCH 10/10] Don't use latencyPair in Test_ProxyEndpoint_LogSlowQueries --- tools/querytee/proxy_endpoint_test.go | 66 +++++++++++---------------- 1 file changed, 26 insertions(+), 40 deletions(-) diff --git a/tools/querytee/proxy_endpoint_test.go b/tools/querytee/proxy_endpoint_test.go index ea97d245cff..0ce1af5b379 100644 --- a/tools/querytee/proxy_endpoint_test.go +++ 
b/tools/querytee/proxy_endpoint_test.go @@ -354,81 +354,67 @@ func Test_ProxyEndpoint_Comparison(t *testing.T) { func Test_ProxyEndpoint_LogSlowQueries(t *testing.T) { scenarios := map[string]struct { slowResponseThreshold time.Duration - latencyPairs []latencyPair + preferredResponseLatency time.Duration + secondaryResponseLatency time.Duration expectLatencyExceedsThreshold bool fastestBackend string slowestBackend string }{ "responses are below threshold": { - slowResponseThreshold: 100 * time.Millisecond, - latencyPairs: []latencyPair{{ - preferredResponseLatency: 1 * time.Millisecond, - secondaryResponseLatency: 1 * time.Millisecond, - }}, + slowResponseThreshold: 100 * time.Millisecond, + preferredResponseLatency: 1 * time.Millisecond, + secondaryResponseLatency: 1 * time.Millisecond, expectLatencyExceedsThreshold: false, }, "one response above threshold": { - slowResponseThreshold: 50 * time.Millisecond, - latencyPairs: []latencyPair{{ - preferredResponseLatency: 1 * time.Millisecond, - secondaryResponseLatency: 70 * time.Millisecond, - }}, + slowResponseThreshold: 50 * time.Millisecond, + preferredResponseLatency: 1 * time.Millisecond, + secondaryResponseLatency: 70 * time.Millisecond, expectLatencyExceedsThreshold: true, fastestBackend: "preferred-backend", slowestBackend: "secondary-backend", }, "responses are both above threshold, but lower than threshold between themselves": { - slowResponseThreshold: 50 * time.Millisecond, - latencyPairs: []latencyPair{{ - preferredResponseLatency: 51 * time.Millisecond, - secondaryResponseLatency: 62 * time.Millisecond, - }}, + slowResponseThreshold: 50 * time.Millisecond, + preferredResponseLatency: 51 * time.Millisecond, + secondaryResponseLatency: 62 * time.Millisecond, expectLatencyExceedsThreshold: false, }, "responses are both above threshold, and above threshold between themselves": { - slowResponseThreshold: 10 * time.Millisecond, - latencyPairs: []latencyPair{{ - preferredResponseLatency: 11 * 
time.Millisecond, - secondaryResponseLatency: 52 * time.Millisecond, - }}, + slowResponseThreshold: 10 * time.Millisecond, + preferredResponseLatency: 11 * time.Millisecond, + secondaryResponseLatency: 52 * time.Millisecond, expectLatencyExceedsThreshold: true, fastestBackend: "preferred-backend", slowestBackend: "secondary-backend", }, "secondary latency is faster than primary, and difference is below threshold": { - slowResponseThreshold: 50 * time.Millisecond, - latencyPairs: []latencyPair{{ - preferredResponseLatency: 10 * time.Millisecond, - secondaryResponseLatency: 1 * time.Millisecond, - }}, + slowResponseThreshold: 50 * time.Millisecond, + preferredResponseLatency: 10 * time.Millisecond, + secondaryResponseLatency: 1 * time.Millisecond, expectLatencyExceedsThreshold: false, }, "secondary latency is faster than primary, and difference is above threshold": { - slowResponseThreshold: 50 * time.Millisecond, - latencyPairs: []latencyPair{{ - preferredResponseLatency: 71 * time.Millisecond, - secondaryResponseLatency: 1 * time.Millisecond, - }}, + slowResponseThreshold: 50 * time.Millisecond, + preferredResponseLatency: 71 * time.Millisecond, + secondaryResponseLatency: 1 * time.Millisecond, expectLatencyExceedsThreshold: true, fastestBackend: "secondary-backend", slowestBackend: "preferred-backend", }, "slowest response threshold is disabled (0)": { - slowResponseThreshold: 0 * time.Millisecond, - latencyPairs: []latencyPair{{ - preferredResponseLatency: 200 * time.Millisecond, - secondaryResponseLatency: 100 * time.Millisecond, - }}, + slowResponseThreshold: 0 * time.Millisecond, + preferredResponseLatency: 200 * time.Millisecond, + secondaryResponseLatency: 100 * time.Millisecond, expectLatencyExceedsThreshold: false, }, } for name, scenario := range scenarios { t.Run(name, func(t *testing.T) { - preferredLatencies, secondaryLatencies := splitLatencyPairs(scenario.latencyPairs) backends := []ProxyBackendInterface{ - newMockProxyBackend("preferred-backend", 
time.Second, true, preferredLatencies), - newMockProxyBackend("secondary-backend", time.Second, false, secondaryLatencies), + newMockProxyBackend("preferred-backend", time.Second, true, []time.Duration{scenario.preferredResponseLatency}), + newMockProxyBackend("secondary-backend", time.Second, false, []time.Duration{scenario.secondaryResponseLatency}), } logger := newMockLogger() @@ -446,7 +432,7 @@ func Test_ProxyEndpoint_LogSlowQueries(t *testing.T) { // The HTTP request above will return as soon as the primary response is received, but this doesn't guarantee that the response comparison has been completed. // Wait for the response comparison to complete before checking the logged messages. - waitForResponseComparisonMetric(t, reg, ComparisonSuccess, uint64(len(scenario.latencyPairs))) + waitForResponseComparisonMetric(t, reg, ComparisonSuccess, 1) if scenario.expectLatencyExceedsThreshold { requireLogKeyValues(t, logger.messages, map[string]string{