From 56603198ba1fa47b5cb7fedbdea0d6a6d32e0be3 Mon Sep 17 00:00:00 2001 From: Joshua Hesketh Date: Mon, 29 Apr 2024 12:42:17 +1000 Subject: [PATCH] Querytee: Add metric to measure relative backend latency (#7782) * Querytee: Add metric to measure relative backend latency This metric gives us a measurement of how individual queries compare between two backends in terms of latency (or duration). * Add proportional metric, and change to native histograms * Improve histogram unit tests Also test the proportional metric * Update tools/querytee/proxy_metrics.go Co-authored-by: Charles Korn * Tidy up test * Add changelog * Add docs * Generate doc * Rename metric to be "response" * Don't use latencyPair in Test_ProxyEndpoint_LogSlowQueries --------- Co-authored-by: Charles Korn --- CHANGELOG.md | 3 + docs/sources/mimir/manage/tools/query-tee.md | 5 + tools/querytee/proxy_endpoint.go | 4 + tools/querytee/proxy_endpoint_test.go | 145 ++++++++++++++++--- tools/querytee/proxy_metrics.go | 14 ++ 5 files changed, 154 insertions(+), 17 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index c1c81946147..d1944bc74e6 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -94,6 +94,9 @@ ### Query-tee * [ENHANCEMENT] Log queries that take longer than `proxy.log-slow-query-response-threshold` when compared to other backends. #7346 +* [ENHANCEMENT] Add two new metrics for measuring the relative duration between backends: #7782 + * `cortex_querytee_backend_response_relative_duration_seconds` + * `cortex_querytee_backend_response_relative_duration_proportional` ### Documentation diff --git a/docs/sources/mimir/manage/tools/query-tee.md b/docs/sources/mimir/manage/tools/query-tee.md index 8ca1bb2e5b6..471576c94b3 100644 --- a/docs/sources/mimir/manage/tools/query-tee.md +++ b/docs/sources/mimir/manage/tools/query-tee.md @@ -163,6 +163,11 @@ cortex_querytee_responses_total{backend="",method="",route="}}) you can use the query-tee to compare rule evaluations too. 
diff --git a/tools/querytee/proxy_endpoint.go b/tools/querytee/proxy_endpoint.go index e81a893b99f..62615f3457b 100644 --- a/tools/querytee/proxy_endpoint.go +++ b/tools/querytee/proxy_endpoint.go @@ -224,6 +224,10 @@ func (p *ProxyEndpoint) executeBackendRequests(req *http.Request, resCh chan *ba ) } + relativeDuration := expectedResponse.elapsedTime - actualResponse.elapsedTime + proportionalDuration := expectedResponse.elapsedTime.Seconds() / actualResponse.elapsedTime.Seconds() + p.metrics.relativeDuration.WithLabelValues(p.routeName).Observe(relativeDuration.Seconds()) + p.metrics.proportionalDuration.WithLabelValues(p.routeName).Observe(proportionalDuration) p.metrics.responsesComparedTotal.WithLabelValues(p.routeName, string(result)).Inc() } } diff --git a/tools/querytee/proxy_endpoint_test.go b/tools/querytee/proxy_endpoint_test.go index 95b5ec0a579..06e4c4f5e06 100644 --- a/tools/querytee/proxy_endpoint_test.go +++ b/tools/querytee/proxy_endpoint_test.go @@ -21,6 +21,7 @@ import ( "github.com/pkg/errors" "github.com/prometheus/client_golang/prometheus" "github.com/prometheus/client_golang/prometheus/testutil" + dto "github.com/prometheus/client_model/go" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" "go.uber.org/atomic" @@ -334,7 +335,7 @@ func Test_ProxyEndpoint_Comparison(t *testing.T) { // The HTTP request above will return as soon as the primary response is received, but this doesn't guarantee that the response comparison has been completed. // Wait for the response comparison to complete before checking the logged messages. 
- waitForResponseComparisonMetric(t, reg, scenario.expectedComparisonResult) + waitForResponseComparisonMetric(t, reg, scenario.expectedComparisonResult, 1) switch scenario.expectedComparisonResult { case ComparisonSuccess: @@ -412,8 +413,8 @@ func Test_ProxyEndpoint_LogSlowQueries(t *testing.T) { for name, scenario := range scenarios { t.Run(name, func(t *testing.T) { backends := []ProxyBackendInterface{ - newMockProxyBackend("preferred-backend", time.Second, true, scenario.preferredResponseLatency), - newMockProxyBackend("secondary-backend", time.Second, false, scenario.secondaryResponseLatency), + newMockProxyBackend("preferred-backend", time.Second, true, []time.Duration{scenario.preferredResponseLatency}), + newMockProxyBackend("secondary-backend", time.Second, false, []time.Duration{scenario.secondaryResponseLatency}), } logger := newMockLogger() @@ -431,7 +432,7 @@ func Test_ProxyEndpoint_LogSlowQueries(t *testing.T) { // The HTTP request above will return as soon as the primary response is received, but this doesn't guarantee that the response comparison has been completed. // Wait for the response comparison to complete before checking the logged messages. 
- waitForResponseComparisonMetric(t, reg, ComparisonSuccess) + waitForResponseComparisonMetric(t, reg, ComparisonSuccess, 1) if scenario.expectLatencyExceedsThreshold { requireLogKeyValues(t, logger.messages, map[string]string{ @@ -446,7 +447,97 @@ func Test_ProxyEndpoint_LogSlowQueries(t *testing.T) { } } -func waitForResponseComparisonMetric(t *testing.T, g prometheus.Gatherer, expectedResult ComparisonResult) { +func Test_ProxyEndpoint_RelativeDurationMetric(t *testing.T) { + scenarios := map[string]struct { + latencyPairs []latencyPair + expectedSampleCount uint64 + expectedDurationSampleSum float64 + expectedProportionalSampleSum float64 + }{ + "secondary backend is faster than preferred": { + latencyPairs: []latencyPair{{ + preferredResponseLatency: 3 * time.Second, + secondaryResponseLatency: 1 * time.Second, + }, { + preferredResponseLatency: 5 * time.Second, + secondaryResponseLatency: 2 * time.Second, + }, + }, + expectedSampleCount: 2, + expectedDurationSampleSum: 5, + expectedProportionalSampleSum: (3.0/1 + 5.0/2), + }, + "preferred backend is 5 seconds faster than secondary": { + latencyPairs: []latencyPair{{ + preferredResponseLatency: 2 * time.Second, + secondaryResponseLatency: 7 * time.Second, + }, + }, + expectedSampleCount: 1, + expectedDurationSampleSum: -5, + expectedProportionalSampleSum: (2.0 / 7), + }, + } + + for name, scenario := range scenarios { + t.Run(name, func(t *testing.T) { + preferredLatencies, secondaryLatencies := splitLatencyPairs(scenario.latencyPairs) + backends := []ProxyBackendInterface{ + newMockProxyBackend("preferred-backend", time.Second, true, preferredLatencies), + newMockProxyBackend("secondary-backend", time.Second, false, secondaryLatencies), + } + + logger := newMockLogger() + reg := prometheus.NewPedanticRegistry() + comparator := &mockComparator{ + comparisonResult: ComparisonSuccess, + } + + endpoint := NewProxyEndpoint(backends, "test", NewProxyMetrics(reg), logger, comparator, 0) + + resp := 
httptest.NewRecorder() + req, err := http.NewRequest("GET", "http://test/api/v1/test", nil) + require.NoError(t, err) + for range scenario.latencyPairs { + // This is just in serial to keep the tests simple + endpoint.ServeHTTP(resp, req) + } + + // The HTTP request above will return as soon as the primary response is received, but this doesn't guarantee that the response comparison has been completed. + // Wait for the response comparisons to complete for the number of requests we have made + waitForResponseComparisonMetric(t, reg, ComparisonSuccess, scenario.expectedSampleCount) + + got, done, err := prometheus.ToTransactionalGatherer(reg).Gather() + defer done() + require.NoError(t, err, "Failed to gather metrics from registry") + + gotDuration := filterMetrics(got, []string{"cortex_querytee_backend_response_relative_duration_seconds"}) + require.Equal(t, 1, len(gotDuration), "Expect only one metric after filtering") + require.Equal(t, scenario.expectedSampleCount, gotDuration[0].Metric[0].Histogram.GetSampleCount()) + require.Equal(t, scenario.expectedDurationSampleSum, gotDuration[0].Metric[0].Histogram.GetSampleSum()) + + gotProportional := filterMetrics(got, []string{"cortex_querytee_backend_response_relative_duration_proportional"}) + require.Equal(t, 1, len(gotProportional), "Expect only one metric after filtering") + require.Equal(t, scenario.expectedSampleCount, gotProportional[0].Metric[0].Histogram.GetSampleCount()) + require.Equal(t, scenario.expectedProportionalSampleSum, gotProportional[0].Metric[0].Histogram.GetSampleSum()) + }) + } +} + +func filterMetrics(metrics []*dto.MetricFamily, names []string) []*dto.MetricFamily { + var filtered []*dto.MetricFamily + for _, m := range metrics { + for _, name := range names { + if m.GetName() == name { + filtered = append(filtered, m) + break + } + } + } + return filtered +} + +func waitForResponseComparisonMetric(t *testing.T, g prometheus.Gatherer, expectedResult ComparisonResult, expectedCount uint64) { started 
time.Now() timeoutAt := started.Add(2 * time.Second) @@ -454,8 +545,8 @@ func waitForResponseComparisonMetric(t *testing.T, g prometheus.Gatherer, expect expected := fmt.Sprintf(` # HELP cortex_querytee_responses_compared_total Total number of responses compared per route name by result. # TYPE cortex_querytee_responses_compared_total counter - cortex_querytee_responses_compared_total{result="%v",route="test"} 1 -`, expectedResult) + cortex_querytee_responses_compared_total{result="%v",route="test"} %d +`, expectedResult, expectedCount) err := testutil.GatherAndCompare(g, bytes.NewBufferString(expected), "cortex_querytee_responses_compared_total") if err == nil { @@ -632,19 +723,33 @@ func (m *mockLogger) Log(keyvals ...interface{}) error { return nil } +type latencyPair struct { + preferredResponseLatency time.Duration + secondaryResponseLatency time.Duration +} + +func splitLatencyPairs(latencyPairs []latencyPair) (preferredLatencies []time.Duration, secondaryLatencies []time.Duration) { + for _, pair := range latencyPairs { + preferredLatencies = append(preferredLatencies, pair.preferredResponseLatency) + secondaryLatencies = append(secondaryLatencies, pair.secondaryResponseLatency) + } + return +} + type mockProxyBackend struct { - name string - timeout time.Duration - preferred bool - fakeResponseLatency time.Duration + name string + timeout time.Duration + preferred bool + fakeResponseLatencies []time.Duration + responseIndex int } -func newMockProxyBackend(name string, timeout time.Duration, preferred bool, fakeResponseLatency time.Duration) ProxyBackendInterface { +func newMockProxyBackend(name string, timeout time.Duration, preferred bool, fakeResponseLatencies []time.Duration) ProxyBackendInterface { return &mockProxyBackend{ - name: name, - timeout: timeout, - preferred: preferred, - fakeResponseLatency: fakeResponseLatency, + name: name, + timeout: timeout, + preferred: preferred, + fakeResponseLatencies: fakeResponseLatencies, } } @@ -667,5 +772,11 @@ 
func (b *mockProxyBackend) ForwardRequest(_ *http.Request, _ io.ReadCloser) (tim Body: io.NopCloser(bytes.NewBufferString(`{}`)), } resp.Header.Set("Content-Type", "application/json") - return time.Duration(b.fakeResponseLatency), 200, []byte("{}"), resp, nil + if b.responseIndex >= len(b.fakeResponseLatencies) { + resp.StatusCode = 500 + return 0, 500, []byte("{}"), resp, errors.New("no more latencies available") + } + latency := b.fakeResponseLatencies[b.responseIndex] + b.responseIndex++ + return latency, 200, []byte("{}"), resp, nil } diff --git a/tools/querytee/proxy_metrics.go b/tools/querytee/proxy_metrics.go index 019c9dea1d0..a022b4fcda0 100644 --- a/tools/querytee/proxy_metrics.go +++ b/tools/querytee/proxy_metrics.go @@ -23,6 +23,8 @@ type ProxyMetrics struct { requestDuration *prometheus.HistogramVec responsesTotal *prometheus.CounterVec responsesComparedTotal *prometheus.CounterVec + relativeDuration *prometheus.HistogramVec + proportionalDuration *prometheus.HistogramVec } func NewProxyMetrics(registerer prometheus.Registerer) *ProxyMetrics { @@ -43,6 +45,18 @@ func NewProxyMetrics(registerer prometheus.Registerer) *ProxyMetrics { Name: "responses_compared_total", Help: "Total number of responses compared per route name by result.", }, []string{"route", "result"}), + relativeDuration: promauto.With(registerer).NewHistogramVec(prometheus.HistogramOpts{ + Namespace: queryTeeMetricsNamespace, + Name: "backend_response_relative_duration_seconds", + Help: "Time (in seconds) of preferred backend less secondary backend.", + NativeHistogramBucketFactor: 2, + }, []string{"route"}), + proportionalDuration: promauto.With(registerer).NewHistogramVec(prometheus.HistogramOpts{ + Namespace: queryTeeMetricsNamespace, + Name: "backend_response_relative_duration_proportional", + Help: "Response time of preferred backend, as a proportion of secondary backend response time.", + NativeHistogramBucketFactor: 2, + }, []string{"route"}), } return m