From 56603198ba1fa47b5cb7fedbdea0d6a6d32e0be3 Mon Sep 17 00:00:00 2001 From: Joshua Hesketh Date: Mon, 29 Apr 2024 12:42:17 +1000 Subject: [PATCH] Querytee: Add metric to measure relative backend latency (#7782) * Querytee: Add metric to measure relative backend latency This metric gives us a measurement of how individual queries compare between two backends in terms of latency (or duration). * Add proportional metric, and change to native histograms * Improve histogram unit tests Also test the proportional metric * Update tools/querytee/proxy_metrics.go Co-authored-by: Charles Korn * Tidy up test * Add changelog * Add docs * Generate doc * Rename metric to be "response" * Don't use latencyPair in Test_ProxyEndpoint_LogSlowQueries --------- Co-authored-by: Charles Korn --- CHANGELOG.md | 3 + docs/sources/mimir/manage/tools/query-tee.md | 5 + tools/querytee/proxy_endpoint.go | 4 + tools/querytee/proxy_endpoint_test.go | 145 ++++++++++++++++--- tools/querytee/proxy_metrics.go | 14 ++ 5 files changed, 154 insertions(+), 17 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index c1c81946147..d1944bc74e6 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -94,6 +94,9 @@ ### Query-tee * [ENHANCEMENT] Log queries that take longer than `proxy.log-slow-query-response-threshold` when compared to other backends. #7346 +* [ENHANCEMENT] Add two new metrics for measuring the relative duration between backends: #7782 + * `cortex_querytee_backend_response_relative_duration_seconds` + * `cortex_querytee_backend_response_relative_duration_proportional` ### Documentation diff --git a/docs/sources/mimir/manage/tools/query-tee.md b/docs/sources/mimir/manage/tools/query-tee.md index 8ca1bb2e5b6..471576c94b3 100644 --- a/docs/sources/mimir/manage/tools/query-tee.md +++ b/docs/sources/mimir/manage/tools/query-tee.md @@ -163,6 +163,11 @@ cortex_querytee_responses_total{backend="",method="",route="}}) you can use the query-tee to compare rule evaluations too. 
diff --git a/tools/querytee/proxy_endpoint.go b/tools/querytee/proxy_endpoint.go index e81a893b99f..62615f3457b 100644 --- a/tools/querytee/proxy_endpoint.go +++ b/tools/querytee/proxy_endpoint.go @@ -224,6 +224,10 @@ func (p *ProxyEndpoint) executeBackendRequests(req *http.Request, resCh chan *ba ) } + relativeDuration := expectedResponse.elapsedTime - actualResponse.elapsedTime + proportionalDuration := expectedResponse.elapsedTime.Seconds() / actualResponse.elapsedTime.Seconds() + p.metrics.relativeDuration.WithLabelValues(p.routeName).Observe(relativeDuration.Seconds()) + p.metrics.proportionalDuration.WithLabelValues(p.routeName).Observe(proportionalDuration) p.metrics.responsesComparedTotal.WithLabelValues(p.routeName, string(result)).Inc() } } diff --git a/tools/querytee/proxy_endpoint_test.go b/tools/querytee/proxy_endpoint_test.go index 95b5ec0a579..06e4c4f5e06 100644 --- a/tools/querytee/proxy_endpoint_test.go +++ b/tools/querytee/proxy_endpoint_test.go @@ -21,6 +21,7 @@ import ( "github.com/pkg/errors" "github.com/prometheus/client_golang/prometheus" "github.com/prometheus/client_golang/prometheus/testutil" + dto "github.com/prometheus/client_model/go" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" "go.uber.org/atomic" @@ -334,7 +335,7 @@ func Test_ProxyEndpoint_Comparison(t *testing.T) { // The HTTP request above will return as soon as the primary response is received, but this doesn't guarantee that the response comparison has been completed. // Wait for the response comparison to complete before checking the logged messages. 
- waitForResponseComparisonMetric(t, reg, scenario.expectedComparisonResult) + waitForResponseComparisonMetric(t, reg, scenario.expectedComparisonResult, 1) switch scenario.expectedComparisonResult { case ComparisonSuccess: @@ -412,8 +413,8 @@ func Test_ProxyEndpoint_LogSlowQueries(t *testing.T) { for name, scenario := range scenarios { t.Run(name, func(t *testing.T) { backends := []ProxyBackendInterface{ - newMockProxyBackend("preferred-backend", time.Second, true, scenario.preferredResponseLatency), - newMockProxyBackend("secondary-backend", time.Second, false, scenario.secondaryResponseLatency), + newMockProxyBackend("preferred-backend", time.Second, true, []time.Duration{scenario.preferredResponseLatency}), + newMockProxyBackend("secondary-backend", time.Second, false, []time.Duration{scenario.secondaryResponseLatency}), } logger := newMockLogger() @@ -431,7 +432,7 @@ func Test_ProxyEndpoint_LogSlowQueries(t *testing.T) { // The HTTP request above will return as soon as the primary response is received, but this doesn't guarantee that the response comparison has been completed. // Wait for the response comparison to complete before checking the logged messages. 
- waitForResponseComparisonMetric(t, reg, ComparisonSuccess) + waitForResponseComparisonMetric(t, reg, ComparisonSuccess, 1) if scenario.expectLatencyExceedsThreshold { requireLogKeyValues(t, logger.messages, map[string]string{ @@ -446,7 +447,97 @@ func Test_ProxyEndpoint_LogSlowQueries(t *testing.T) { } } -func waitForResponseComparisonMetric(t *testing.T, g prometheus.Gatherer, expectedResult ComparisonResult) { +func Test_ProxyEndpoint_RelativeDurationMetric(t *testing.T) { + scenarios := map[string]struct { + latencyPairs []latencyPair + expectedSampleCount uint64 + expectedDurationSampleSum float64 + expectedProportionalSampleSum float64 + }{ + "secondary backend is faster than preferred": { + latencyPairs: []latencyPair{{ + preferredResponseLatency: 3 * time.Second, + secondaryResponseLatency: 1 * time.Second, + }, { + preferredResponseLatency: 5 * time.Second, + secondaryResponseLatency: 2 * time.Second, + }, + }, + expectedSampleCount: 2, + expectedDurationSampleSum: 5, + expectedProportionalSampleSum: (3.0/1 + 5.0/2), + }, + "preferred backend is 5 seconds faster than secondary": { + latencyPairs: []latencyPair{{ + preferredResponseLatency: 2 * time.Second, + secondaryResponseLatency: 7 * time.Second, + }, + }, + expectedSampleCount: 1, + expectedDurationSampleSum: -5, + expectedProportionalSampleSum: (2.0 / 7), + }, + } + + for name, scenario := range scenarios { + t.Run(name, func(t *testing.T) { + preferredLatencies, secondaryLatencies := splitLatencyPairs(scenario.latencyPairs) + backends := []ProxyBackendInterface{ + newMockProxyBackend("preferred-backend", time.Second, true, preferredLatencies), + newMockProxyBackend("secondary-backend", time.Second, false, secondaryLatencies), + } + + logger := newMockLogger() + reg := prometheus.NewPedanticRegistry() + comparator := &mockComparator{ + comparisonResult: ComparisonSuccess, + } + + endpoint := NewProxyEndpoint(backends, "test", NewProxyMetrics(reg), logger, comparator, 0) + + resp := 
httptest.NewRecorder() + req, err := http.NewRequest("GET", "http://test/api/v1/test", nil) + require.NoError(t, err) + for range scenario.latencyPairs { + // This is just in serial to keep the tests simple + endpoint.ServeHTTP(resp, req) + } + + // The HTTP request above will return as soon as the primary response is received, but this doesn't guarantee that the response comparison has been completed. + // Wait for the response comparisons to complete for the number of requests we have made + waitForResponseComparisonMetric(t, reg, ComparisonSuccess, scenario.expectedSampleCount) + + got, done, err := prometheus.ToTransactionalGatherer(reg).Gather() + defer done() + require.NoError(t, err, "Failed to gather metrics from registry") + + gotDuration := filterMetrics(got, []string{"cortex_querytee_backend_response_relative_duration_seconds"}) + require.Equal(t, 1, len(gotDuration), "Expect only one metric after filtering") + require.Equal(t, scenario.expectedSampleCount, gotDuration[0].Metric[0].Histogram.GetSampleCount()) + require.Equal(t, scenario.expectedDurationSampleSum, gotDuration[0].Metric[0].Histogram.GetSampleSum()) + + gotProportional := filterMetrics(got, []string{"cortex_querytee_backend_response_relative_duration_proportional"}) + require.Equal(t, 1, len(gotProportional), "Expect only one metric after filtering") + require.Equal(t, scenario.expectedSampleCount, gotProportional[0].Metric[0].Histogram.GetSampleCount()) + require.Equal(t, scenario.expectedProportionalSampleSum, gotProportional[0].Metric[0].Histogram.GetSampleSum()) + }) + } +} + +func filterMetrics(metrics []*dto.MetricFamily, names []string) []*dto.MetricFamily { + var filtered []*dto.MetricFamily + for _, m := range metrics { + for _, name := range names { + if m.GetName() == name { + filtered = append(filtered, m) + break + } + } + } + return filtered +} + +func waitForResponseComparisonMetric(t *testing.T, g prometheus.Gatherer, expectedResult ComparisonResult, expectedCount uint64) { started 
time.Now() timeoutAt := started.Add(2 * time.Second) @@ -454,8 +545,8 @@ func waitForResponseComparisonMetric(t *testing.T, g prometheus.Gatherer, expect expected := fmt.Sprintf(` # HELP cortex_querytee_responses_compared_total Total number of responses compared per route name by result. # TYPE cortex_querytee_responses_compared_total counter - cortex_querytee_responses_compared_total{result="%v",route="test"} 1 -`, expectedResult) + cortex_querytee_responses_compared_total{result="%v",route="test"} %d +`, expectedResult, expectedCount) err := testutil.GatherAndCompare(g, bytes.NewBufferString(expected), "cortex_querytee_responses_compared_total") if err == nil { @@ -632,19 +723,33 @@ func (m *mockLogger) Log(keyvals ...interface{}) error { return nil } +type latencyPair struct { + preferredResponseLatency time.Duration + secondaryResponseLatency time.Duration +} + +func splitLatencyPairs(latencyPairs []latencyPair) (preferredLatencies []time.Duration, secondaryLatencies []time.Duration) { + for _, pair := range latencyPairs { + preferredLatencies = append(preferredLatencies, pair.preferredResponseLatency) + secondaryLatencies = append(secondaryLatencies, pair.secondaryResponseLatency) + } + return +} + type mockProxyBackend struct { - name string - timeout time.Duration - preferred bool - fakeResponseLatency time.Duration + name string + timeout time.Duration + preferred bool + fakeResponseLatencies []time.Duration + responseIndex int } -func newMockProxyBackend(name string, timeout time.Duration, preferred bool, fakeResponseLatency time.Duration) ProxyBackendInterface { +func newMockProxyBackend(name string, timeout time.Duration, preferred bool, fakeResponseLatencies []time.Duration) ProxyBackendInterface { return &mockProxyBackend{ - name: name, - timeout: timeout, - preferred: preferred, - fakeResponseLatency: fakeResponseLatency, + name: name, + timeout: timeout, + preferred: preferred, + fakeResponseLatencies: fakeResponseLatencies, } } @@ -667,5 +772,11 @@ 
func (b *mockProxyBackend) ForwardRequest(_ *http.Request, _ io.ReadCloser) (tim Body: io.NopCloser(bytes.NewBufferString(`{}`)), } resp.Header.Set("Content-Type", "application/json") - return time.Duration(b.fakeResponseLatency), 200, []byte("{}"), resp, nil + if b.responseIndex >= len(b.fakeResponseLatencies) { + resp.StatusCode = 500 + return 0, 500, []byte("{}"), resp, errors.New("no more latencies available") + } + latency := b.fakeResponseLatencies[b.responseIndex] + b.responseIndex++ + return latency, 200, []byte("{}"), resp, nil } diff --git a/tools/querytee/proxy_metrics.go b/tools/querytee/proxy_metrics.go index 019c9dea1d0..a022b4fcda0 100644 --- a/tools/querytee/proxy_metrics.go +++ b/tools/querytee/proxy_metrics.go @@ -23,6 +23,8 @@ type ProxyMetrics struct { requestDuration *prometheus.HistogramVec responsesTotal *prometheus.CounterVec responsesComparedTotal *prometheus.CounterVec + relativeDuration *prometheus.HistogramVec + proportionalDuration *prometheus.HistogramVec } func NewProxyMetrics(registerer prometheus.Registerer) *ProxyMetrics { @@ -43,6 +45,18 @@ func NewProxyMetrics(registerer prometheus.Registerer) *ProxyMetrics { Name: "responses_compared_total", Help: "Total number of responses compared per route name by result.", }, []string{"route", "result"}), + relativeDuration: promauto.With(registerer).NewHistogramVec(prometheus.HistogramOpts{ + Namespace: queryTeeMetricsNamespace, + Name: "backend_response_relative_duration_seconds", + Help: "Time (in seconds) of preferred backend less secondary backend.", + NativeHistogramBucketFactor: 2, + }, []string{"route"}), + proportionalDuration: promauto.With(registerer).NewHistogramVec(prometheus.HistogramOpts{ + Namespace: queryTeeMetricsNamespace, + Name: "backend_response_relative_duration_proportional", + Help: "Response time of preferred backend, as a proportion of secondary backend response time.", + NativeHistogramBucketFactor: 2, + }, []string{"route"}), } return m